Commit dba5764289827a82201dafa605d531411cb5b24f
1 parent
47452e1d
bayes调参计划
Showing
13 changed files
with
1682 additions
and
112 deletions
Show diff stats
artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.cmd
0 → 100644
| @@ -0,0 +1 @@ | @@ -0,0 +1 @@ | ||
| 1 | +python scripts/evaluation/tune_fusion.py --mode optimize --run-name coarse_fusion_long_001 --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md --tenant-id 163 --queries-file scripts/evaluation/queries/queries.txt --top-k 100 --language en --search-base-url http://127.0.0.1:6002 --eval-web-base-url http://127.0.0.1:6010 --max-evals 400 --batch-size 3 --candidate-pool-size 512 --random-seed 20260416 |
artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.pid
0 → 100644
| @@ -0,0 +1 @@ | @@ -0,0 +1 @@ | ||
| 1 | +2218620 |
config/config.yaml
| @@ -256,9 +256,11 @@ coarse_rank: | @@ -256,9 +256,11 @@ coarse_rank: | ||
| 256 | knn_text_weight: 1.0 | 256 | knn_text_weight: 1.0 |
| 257 | knn_image_weight: 2.0 | 257 | knn_image_weight: 2.0 |
| 258 | knn_tie_breaker: 0.3 | 258 | knn_tie_breaker: 0.3 |
| 259 | - knn_bias: 0.0 | 259 | + knn_bias: 0.2 |
| 260 | knn_exponent: 5.6 | 260 | knn_exponent: 5.6 |
| 261 | + knn_text_bias: 0.2 | ||
| 261 | knn_text_exponent: 0.0 | 262 | knn_text_exponent: 0.0 |
| 263 | + knn_image_bias: 0.2 | ||
| 262 | knn_image_exponent: 0.0 | 264 | knn_image_exponent: 0.0 |
| 263 | fine_rank: | 265 | fine_rank: |
| 264 | enabled: false # false 时保序透传 | 266 | enabled: false # false 时保序透传 |
| @@ -649,4 +651,4 @@ tenant_config: | @@ -649,4 +651,4 @@ tenant_config: | ||
| 649 | primary_language: en | 651 | primary_language: en |
| 650 | index_languages: | 652 | index_languages: |
| 651 | - en | 653 | - en |
| 652 | - - zh | 654 | - - zh |
| 655 | + - zh | ||
| 653 | \ No newline at end of file | 656 | \ No newline at end of file |
docs/caches-inventory.md
| @@ -96,9 +96,22 @@ | @@ -96,9 +96,22 @@ | ||
| 96 | | `scripts/redis/redis_cache_prefix_stats.py` | 按前缀统计 key 数量与 **MEMORY USAGE**(可多 DB) | | 96 | | `scripts/redis/redis_cache_prefix_stats.py` | 按前缀统计 key 数量与 **MEMORY USAGE**(可多 DB) | |
| 97 | | `scripts/redis/redis_memory_heavy_keys.py` | 扫描占用内存最大的 key,辅助排查「统计与总内存不一致」 | | 97 | | `scripts/redis/redis_memory_heavy_keys.py` | 扫描占用内存最大的 key,辅助排查「统计与总内存不一致」 | |
| 98 | | `scripts/redis/monitor_eviction.py` | 实时监控 **eviction** 相关事件,用于容量与驱逐策略排查 | | 98 | | `scripts/redis/monitor_eviction.py` | 实时监控 **eviction** 相关事件,用于容量与驱逐策略排查 | |
| 99 | +| `scripts/redis/purge_caches.py` | 一键清空业务缓存:embedding(含 `:image:` / `:clip_text:`)、anchors、translation;**默认跳过 `trans:deepl*`**(可 dry-run 预览) | | ||
| 99 | 100 | ||
| 100 | 使用前需加载项目配置(如 `source activate.sh`)以保证 `REDIS_CONFIG` 与生产一致。脚本注释中给出了 **`redis-cli` 手工统计**示例(按前缀 `wc -l`、`MEMORY STATS` 等)。 | 101 | 使用前需加载项目配置(如 `source activate.sh`)以保证 `REDIS_CONFIG` 与生产一致。脚本注释中给出了 **`redis-cli` 手工统计**示例(按前缀 `wc -l`、`MEMORY STATS` 等)。 |
| 101 | 102 | ||
| 103 | +### 快速清空(排除 `trans:deepl*`) | ||
| 104 | + | ||
| 105 | +```bash | ||
| 106 | +source activate.sh | ||
| 107 | + | ||
| 108 | +# 先预览会删多少 key(推荐) | ||
| 109 | +python scripts/redis/purge_caches.py --dry-run | ||
| 110 | + | ||
| 111 | +# 真正删除(默认 db=0) | ||
| 112 | +python scripts/redis/purge_caches.py | ||
| 113 | +``` | ||
| 114 | + | ||
| 102 | --- | 115 | --- |
| 103 | 116 | ||
| 104 | ## 六、总表(Redis 与各层缓存) | 117 | ## 六、总表(Redis 与各层缓存) |
| @@ -106,8 +119,8 @@ | @@ -106,8 +119,8 @@ | ||
| 106 | | 缓存名称 | 业务模块 | 存储 | Key 前缀 / 命名模式 | 过期时间 | 过期策略 | 值摘要 | 配置键 / 环境变量 | | 119 | | 缓存名称 | 业务模块 | 存储 | Key 前缀 / 命名模式 | 过期时间 | 过期策略 | 值摘要 | 配置键 / 环境变量 | |
| 107 | |----------|----------|------|---------------------|----------|----------|--------|-------------------| | 120 | |----------|----------|------|---------------------|----------|----------|--------|-------------------| |
| 108 | | 文本向量 | 检索 / 索引 / Embedding 服务 | Redis db≈0 | `{embedding_cache_prefix}:*`(逻辑键以 `embed:norm…` 开头) | `cache_expire_days`(默认 720 天) | 写入 TTL + 命中滑动续期 | BF16 字节向量 | `infrastructure.redis.*`;`REDIS_EMBEDDING_CACHE_PREFIX`、`REDIS_CACHE_EXPIRE_DAYS` | | 121 | | 文本向量 | 检索 / 索引 / Embedding 服务 | Redis db≈0 | `{embedding_cache_prefix}:*`(逻辑键以 `embed:norm…` 开头) | `cache_expire_days`(默认 720 天) | 写入 TTL + 命中滑动续期 | BF16 字节向量 | `infrastructure.redis.*`;`REDIS_EMBEDDING_CACHE_PREFIX`、`REDIS_CACHE_EXPIRE_DAYS` | |
| 109 | -| 图像向量(CLIP 图) | 图搜 / 多模态 | 同上 | `{prefix}:image:*` | 同上 | 同上 | BF16 字节 | 同上 | | ||
| 110 | -| CLIP 文本塔向量 | 图搜文本侧 | 同上 | `{prefix}:clip_text:*` | 同上 | 同上 | BF16 字节 | 同上 | | 122 | +| 图像向量(CLIP 图) | 图搜 / 多模态 | 同上 | `{embedding_cache_prefix}:image:*`(其中 `{embedding_cache_prefix}` 默认 `embedding`) | 同上 | 同上 | BF16 字节 | 同上 | |
| 123 | +| CLIP 文本塔向量 | 图搜文本侧 | 同上 | `{embedding_cache_prefix}:clip_text:*`(其中 `{embedding_cache_prefix}` 默认 `embedding`) | 同上 | 同上 | BF16 字节 | 同上 | | ||
| 111 | | 翻译译文 | 查询翻译、翻译服务 | 同上 | `trans:{model}:{lang}:*` | `services.translation.cache.ttl_seconds`(默认 720 天) | 可配置滑动(`sliding_expiration`) | UTF-8 字符串 | `services.translation.cache.*`;各能力 `use_cache` | | 124 | | 翻译译文 | 查询翻译、翻译服务 | 同上 | `trans:{model}:{lang}:*` | `services.translation.cache.ttl_seconds`(默认 720 天) | 可配置滑动(`sliding_expiration`) | UTF-8 字符串 | `services.translation.cache.*`;各能力 `use_cache` | |
| 112 | | 商品分析 / Anchors | 索引富化、LLM 内容理解 | 同上 | `{anchor_cache_prefix}:{kind}:{hash}:{lang}:*` | `anchor_cache_expire_days`(默认 30 天) | 固定 TTL,不滑动 | JSON 字符串 | `anchor_cache_prefix`、`anchor_cache_expire_days`;`REDIS_ANCHOR_*` | | 125 | | 商品分析 / Anchors | 索引富化、LLM 内容理解 | 同上 | `{anchor_cache_prefix}:{kind}:{hash}:{lang}:*` | `anchor_cache_expire_days`(默认 30 天) | 固定 TTL,不滑动 | JSON 字符串 | `anchor_cache_prefix`、`anchor_cache_expire_days`;`REDIS_ANCHOR_*` | |
| 113 | | 应用配置 | 全栈 | 进程内存 | N/A(单例) | 进程生命周期 | `reload_app_config` 清除 | `AppConfig` 对象 | `config/loader.py` | | 126 | | 应用配置 | 全栈 | 进程内存 | N/A(单例) | 进程生命周期 | `reload_app_config` 清除 | `AppConfig` 对象 | `config/loader.py` | |
docs/issues/issue-2026-04-14-粗排流程放入ES-TODO-env renamed to docs/issues/issue-2026-04-14-粗排流程放入ES-TODO-env.md
| @@ -0,0 +1,136 @@ | @@ -0,0 +1,136 @@ | ||
| 1 | + | ||
| 2 | +我以前经历过一轮调参,是基于54个评测样本(queries.txt),过程中发现的最优的参数是这一组: | ||
| 3 | +0.641241 {'es_bias': '7.214', 'es_exponent': '0.2025', 'text_bias': '4.0', 'text_exponent': '1.584', 'text_translation_weight': '1.4441', 'knn_text_weight': '0.1', 'knn_image_weight': '5.6232', 'knn_tie_breaker': | ||
| 4 | + '0.021', 'knn_bias': '0.0019', 'knn_exponent': '11.8477', 'knn_text_bias': '2.3125', 'knn_text_exponent': '1.1547', 'knn_image_bias': '0.9641', 'knn_image_exponent': '5.8671'} | ||
| 5 | + | ||
| 6 | +这一组参数分布比较极端,text_bias太大(文本项得分是0~1的,加上4被稀释得很大),图片的exponent太大,不过在这个数据集上面确实是最好的,我觉得有过拟合的可能,因此要扩大数据集,先扩展标注集,然后使用扩展的标注集,继续进行寻参。 | ||
| 7 | +因为标注任务和寻参任务耗时都比较长,请你写好一个脚本,内部先启动标注任务,然后再启动寻参任务,把任务跑起来,程序已经正常跑起来了、运转正常,你才可以退出,以后等程序跑完了应该能拿到寻参结果,下次你可以结合程序执行的结果进行结论分析。 | ||
| 8 | + | ||
| 9 | + | ||
| 10 | +以前的一轮调参: | ||
| 11 | +我当时的调参需求: | ||
| 12 | + | ||
| 13 | +请对coarse_rank fusion公式进行调参: | ||
| 14 | + 目前的baseline是这一组,Primary_Metric_Score: 0.637642: | ||
| 15 | + coarse_rank: | ||
| 16 | + ... | ||
| 17 | + fusion: | ||
| 18 | + es_bias: 10.0 | ||
| 19 | + es_exponent: 0.05 | ||
| 20 | + text_bias: 0.1 | ||
| 21 | + text_exponent: 0.35 | ||
| 22 | + text_translation_weight: 1.0 | ||
| 23 | + knn_text_weight: 1.0 | ||
| 24 | + knn_image_weight: 2.0 | ||
| 25 | + knn_tie_breaker: 0.3 | ||
| 26 | + knn_bias: 0.2 | ||
| 27 | + knn_exponent: 5.6 | ||
| 28 | + knn_text_bias: 0.2 | ||
| 29 | + knn_text_exponent: 0.0 | ||
| 30 | + knn_image_bias: 0.2 | ||
| 31 | + knn_image_exponent: 0.0 | ||
| 32 | + 评估指标在/data/saas-search/artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md | ||
| 33 | + 请以这个为基准,发散思维,进行宽一点的范围调参。因为每次重启、评测都需要几分钟,请写一个调参的框架,基于框架、设定好多组参数、写好脚本,每轮收集效果自动的调整参数分布(因为笛卡尔积、多种参数调参成本太高,因此考虑贝叶斯调参等方法通 | ||
| 34 | + 过多轮迭代通过脚本自动收敛) | ||
| 35 | + 每次调整参数后需要重启backend (有时候重启backend后eval-web好像也挂了,如果有可以追查原因并修复) | ||
| 36 | + ./restart.sh backend | ||
| 37 | + 注意:请你调试脚本、进行一轮分析,最终要沉淀一套调参脚本,我下次可以重新跑(还是针对这组参数调参),能自动迭代(调整参数分布)、收集每组参数的指标、调优到最优的参数组合。 | ||
| 38 | + | ||
| 39 | + | ||
| 40 | + | ||
| 41 | +你给出的调参脚本: | ||
| 42 | + | ||
| 43 | + 长时间自动调参脚本已经配好,核心文件是: | ||
| 44 | + | ||
| 45 | + - 调参主程序:scripts/evaluation/tune_fusion.py | ||
| 46 | + - 宽范围搜索空间:scripts/evaluation/tuning/coarse_rank_fusion_space.yaml | ||
| 47 | + - 后台长跑启动脚本:scripts/evaluation/start_coarse_fusion_tuning_long.sh | ||
| 48 | + - 续跑脚本:scripts/evaluation/resume_coarse_fusion_tuning_long.sh | ||
| 49 | + - 使用说明:scripts/evaluation/tuning/README.md | ||
| 50 | + | ||
| 51 | + 这套方案不是笛卡尔积穷举,而是“种子实验 + 随机探索 + 基于历史最优分布的自适应采样”反复迭代。搜索范围已经放宽,新增了多组偏离 baseline 很远的 seed,也保留了 baseline 附近抖动,适合长时间慢慢收敛。 | ||
| 52 | + | ||
| 53 | + 直接启动一轮: | ||
| 54 | + | ||
| 55 | + ./scripts/evaluation/start_coarse_fusion_tuning_long.sh | ||
| 56 | + | ||
| 57 | + 如果想跑更久一点: | ||
| 58 | + | ||
| 59 | + MAX_EVALS=48 BATCH_SIZE=3 CANDIDATE_POOL_SIZE=512 RUN_NAME=coarse_fusion_long_001 \ | ||
| 60 | + ./scripts/evaluation/start_coarse_fusion_tuning_long.sh | ||
| 61 | + | ||
| 62 | + 看日志: | ||
| 63 | + | ||
| 64 | + tail -f artifacts/search_evaluation/tuning_launches/<run_name>.log | ||
| 65 | + | ||
| 66 | + 看结果目录: | ||
| 67 | + | ||
| 68 | + ls artifacts/search_evaluation/tuning_runs/<run_name>/ | ||
| 69 | + | ||
| 70 | + 续跑: | ||
| 71 | + | ||
| 72 | + ./scripts/evaluation/resume_coarse_fusion_tuning_long.sh <run_name> | ||
| 73 | + | ||
| 74 | + 每轮会自动做这些事: | ||
| 75 | + | ||
| 76 | + - 生成一批候选 coarse_rank.fusion 参数 | ||
| 77 | + - 写入配置并重启 backend | ||
| 78 | + - 检查 eval-web,必要时自愈拉起 | ||
| 79 | + - 跑评测并收集 Primary_Metric_Score | ||
| 80 | + - 更新 trials.jsonl、leaderboard.csv、summary.md | ||
| 81 | + - 基于历史结果调整下一轮采样分布 | ||
| 82 | + | ||
| 83 | + 你先跑一轮长任务。跑完后把 run_name 或 scripts/evaluation/tuning/README.md 里对应实验目录发我,我再基于结果做下一步分析和收缩搜索空间。 | ||
| 84 | + | ||
| 85 | + | ||
| 86 | + | ||
| 87 | + | ||
| 88 | +› 我运行了:MAX_EVALS=400 BATCH_SIZE=3 CANDIDATE_POOL_SIZE=512 RUN_NAME=coarse_fusion_long_001 ./scripts/evaluation/start_coarse_fusion_tuning_long.sh | ||
| 89 | + | ||
| 90 | + run_name=coarse_fusion_long_001 | ||
| 91 | + pid=2218620 | ||
| 92 | + log=artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.log | ||
| 93 | + pid_file=artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.pid | ||
| 94 | + cmd_file=artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.cmd | ||
| 95 | + run_dir=artifacts/search_evaluation/tuning_runs/coarse_fusion_long_001 | ||
| 96 | + | ||
| 97 | + tail -f artifacts/search_evaluation/tuning_launches/coarse_fusion_long_001.log | ||
| 98 | + cat artifacts/search_evaluation/tuning_runs/coarse_fusion_long_001/leaderboard.csv | ||
| 99 | + | ||
| 100 | + | ||
| 101 | +• 我先看这次长跑的日志和当前排行榜,确认任务是否正常推进、是否有异常中断,以及目前最优参数落在哪个方向。 | ||
| 102 | + | ||
| 103 | +• Explored | ||
| 104 | + └ Read coarse_fusion_long_001.log, leaderboard.csv | ||
| 105 | + List coarse_fusion_long_001 | ||
| 106 | + Read summary.md | ||
| 107 | + | ||
| 108 | +─── | ||
| 109 | + | ||
| 110 | +• 结果已经足够看出明显趋势了。我再补一层聚合分析,看看 top 区间里哪些参数方向反复出现,避免只盯着单个最优点。 | ||
| 111 | + | ||
| 112 | +• Ran ps -p 2218620 -o pid=,etimes=,cmd= | ||
| 113 | + └ (no output) | ||
| 114 | + | ||
| 115 | +• Ran python - <<'PY' | ||
| 116 | + │ import csv, statistics, math | ||
| 117 | + │ from pathlib import Path | ||
| 118 | + │ … +15 lines | ||
| 119 | + └ rows 385 | ||
| 120 | + | ||
| 121 | + … +61 lines (ctrl + t to view transcript) | ||
| 122 | + knn_image_bias: mean=0.5631 median=0.2000 min=0.0010 max=4.0000 | ||
| 123 | + knn_image_exponent: mean=4.6316 median=5.8680 min=0.0000 max=6.0000 | ||
| 124 | + | ||
| 125 | +• Ran python - <<'PY' | ||
| 126 | + │ import csv | ||
| 127 | + │ from pathlib import Path | ||
| 128 | + │ … +19 lines | ||
| 129 | + └ baseline_score 0.637642 | ||
| 130 | + | ||
| 131 | + … +30 lines (ctrl + t to view transcript) | ||
| 132 | + 10 bo_234 0.641241 {'es_bias': '7.214', 'es_exponent': '0.2025', 'text_bias': '4.0', 'text_exponent': '1.584', 'text_translation_weight': '1.4441', 'knn_text_weight': '0.1', 'knn_image_weight': '5.6232', 'knn_tie_breaker': | ||
| 133 | + '0.021', 'knn_bias': '0.0019', 'knn_exponent': '11.8477', 'knn_text_bias': '2.3125', 'knn_text_exponent': '1.1547', 'knn_image_bias': '0.9641', 'knn_image_exponent': '5.8671'} | ||
| 134 | + | ||
| 135 | +这一次因为外部原因(磁盘满)终止了,以上是最好的一组参数。 | ||
| 136 | + |
scripts/evaluation/resume_coarse_fusion_tuning_long.sh
0 → 100755
| @@ -0,0 +1,76 @@ | @@ -0,0 +1,76 @@ | ||
| 1 | +#!/bin/bash | ||
| 2 | + | ||
| 3 | +set -euo pipefail | ||
| 4 | + | ||
| 5 | +if [ "$#" -lt 1 ]; then | ||
| 6 | + echo "usage: $0 <run_dir_or_name> [extra tune_fusion args...]" >&2 | ||
| 7 | + exit 1 | ||
| 8 | +fi | ||
| 9 | + | ||
| 10 | +cd "$(dirname "$0")/../.." | ||
| 11 | +source ./activate.sh | ||
| 12 | + | ||
| 13 | +TARGET="$1" | ||
| 14 | +shift | ||
| 15 | + | ||
| 16 | +if [ -d "${TARGET}" ]; then | ||
| 17 | + RUN_DIR="${TARGET}" | ||
| 18 | + RUN_NAME="$(basename "${RUN_DIR}")" | ||
| 19 | +else | ||
| 20 | + RUN_NAME="${TARGET}" | ||
| 21 | + RUN_DIR="artifacts/search_evaluation/tuning_runs/${RUN_NAME}" | ||
| 22 | +fi | ||
| 23 | + | ||
| 24 | +if [ ! -d "${RUN_DIR}" ]; then | ||
| 25 | + echo "run dir not found: ${RUN_DIR}" >&2 | ||
| 26 | + exit 1 | ||
| 27 | +fi | ||
| 28 | + | ||
| 29 | +MAX_EVALS="${MAX_EVALS:-36}" | ||
| 30 | +BATCH_SIZE="${BATCH_SIZE:-3}" | ||
| 31 | +CANDIDATE_POOL_SIZE="${CANDIDATE_POOL_SIZE:-512}" | ||
| 32 | + | ||
| 33 | +LAUNCH_DIR="artifacts/search_evaluation/tuning_launches" | ||
| 34 | +mkdir -p "${LAUNCH_DIR}" | ||
| 35 | +LOG_PATH="${LAUNCH_DIR}/${RUN_NAME}.resume.log" | ||
| 36 | +PID_PATH="${LAUNCH_DIR}/${RUN_NAME}.resume.pid" | ||
| 37 | +CMD_PATH="${LAUNCH_DIR}/${RUN_NAME}.resume.cmd" | ||
| 38 | + | ||
| 39 | +CMD=( | ||
| 40 | + python | ||
| 41 | + scripts/evaluation/tune_fusion.py | ||
| 42 | + --mode optimize | ||
| 43 | + --resume-run "${RUN_DIR}" | ||
| 44 | + --search-space "${RUN_DIR}/search_space.yaml" | ||
| 45 | + --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md | ||
| 46 | + --tenant-id 163 | ||
| 47 | + --queries-file scripts/evaluation/queries/queries.txt | ||
| 48 | + --top-k 100 | ||
| 49 | + --language en | ||
| 50 | + --search-base-url http://127.0.0.1:6002 | ||
| 51 | + --eval-web-base-url http://127.0.0.1:6010 | ||
| 52 | + --max-evals "${MAX_EVALS}" | ||
| 53 | + --batch-size "${BATCH_SIZE}" | ||
| 54 | + --candidate-pool-size "${CANDIDATE_POOL_SIZE}" | ||
| 55 | +) | ||
| 56 | + | ||
| 57 | +if [ "$#" -gt 0 ]; then | ||
| 58 | + CMD+=("$@") | ||
| 59 | +fi | ||
| 60 | + | ||
| 61 | +printf '%q ' "${CMD[@]}" > "${CMD_PATH}" | ||
| 62 | +printf '\n' >> "${CMD_PATH}" | ||
| 63 | + | ||
| 64 | +nohup "${CMD[@]}" > "${LOG_PATH}" 2>&1 & | ||
| 65 | +PID=$! | ||
| 66 | +echo "${PID}" > "${PID_PATH}" | ||
| 67 | + | ||
| 68 | +echo "run_name=${RUN_NAME}" | ||
| 69 | +echo "pid=${PID}" | ||
| 70 | +echo "log=${LOG_PATH}" | ||
| 71 | +echo "pid_file=${PID_PATH}" | ||
| 72 | +echo "cmd_file=${CMD_PATH}" | ||
| 73 | +echo "run_dir=${RUN_DIR}" | ||
| 74 | +echo | ||
| 75 | +echo "tail -f ${LOG_PATH}" | ||
| 76 | +echo "cat ${RUN_DIR}/leaderboard.csv" |
| @@ -0,0 +1,18 @@ | @@ -0,0 +1,18 @@ | ||
| 1 | +#!/bin/bash | ||
| 2 | + | ||
| 3 | +set -euo pipefail | ||
| 4 | + | ||
| 5 | +cd "$(dirname "$0")/../.." | ||
| 6 | +source ./activate.sh | ||
| 7 | + | ||
| 8 | +python scripts/evaluation/tune_fusion.py \ | ||
| 9 | + --mode optimize \ | ||
| 10 | + --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml \ | ||
| 11 | + --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md \ | ||
| 12 | + --tenant-id 163 \ | ||
| 13 | + --queries-file scripts/evaluation/queries/queries.txt \ | ||
| 14 | + --top-k 100 \ | ||
| 15 | + --language en \ | ||
| 16 | + --search-base-url http://127.0.0.1:6002 \ | ||
| 17 | + --eval-web-base-url http://127.0.0.1:6010 \ | ||
| 18 | + "$@" |
scripts/evaluation/start_coarse_fusion_tuning_long.sh
0 → 100755
| @@ -0,0 +1,58 @@ | @@ -0,0 +1,58 @@ | ||
| 1 | +#!/bin/bash | ||
| 2 | + | ||
| 3 | +set -euo pipefail | ||
| 4 | + | ||
| 5 | +cd "$(dirname "$0")/../.." | ||
| 6 | +source ./activate.sh | ||
| 7 | + | ||
| 8 | +RUN_NAME="${RUN_NAME:-coarse_fusion_long_$(date -u +%Y%m%dT%H%M%SZ)}" | ||
| 9 | +MAX_EVALS="${MAX_EVALS:-36}" | ||
| 10 | +BATCH_SIZE="${BATCH_SIZE:-3}" | ||
| 11 | +CANDIDATE_POOL_SIZE="${CANDIDATE_POOL_SIZE:-512}" | ||
| 12 | +RANDOM_SEED="${RANDOM_SEED:-20260416}" | ||
| 13 | + | ||
| 14 | +LAUNCH_DIR="artifacts/search_evaluation/tuning_launches" | ||
| 15 | +mkdir -p "${LAUNCH_DIR}" | ||
| 16 | +LOG_PATH="${LAUNCH_DIR}/${RUN_NAME}.log" | ||
| 17 | +PID_PATH="${LAUNCH_DIR}/${RUN_NAME}.pid" | ||
| 18 | +CMD_PATH="${LAUNCH_DIR}/${RUN_NAME}.cmd" | ||
| 19 | + | ||
| 20 | +CMD=( | ||
| 21 | + python | ||
| 22 | + scripts/evaluation/tune_fusion.py | ||
| 23 | + --mode optimize | ||
| 24 | + --run-name "${RUN_NAME}" | ||
| 25 | + --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml | ||
| 26 | + --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md | ||
| 27 | + --tenant-id 163 | ||
| 28 | + --queries-file scripts/evaluation/queries/queries.txt | ||
| 29 | + --top-k 100 | ||
| 30 | + --language en | ||
| 31 | + --search-base-url http://127.0.0.1:6002 | ||
| 32 | + --eval-web-base-url http://127.0.0.1:6010 | ||
| 33 | + --max-evals "${MAX_EVALS}" | ||
| 34 | + --batch-size "${BATCH_SIZE}" | ||
| 35 | + --candidate-pool-size "${CANDIDATE_POOL_SIZE}" | ||
| 36 | + --random-seed "${RANDOM_SEED}" | ||
| 37 | +) | ||
| 38 | + | ||
| 39 | +if [ "$#" -gt 0 ]; then | ||
| 40 | + CMD+=("$@") | ||
| 41 | +fi | ||
| 42 | + | ||
| 43 | +printf '%q ' "${CMD[@]}" > "${CMD_PATH}" | ||
| 44 | +printf '\n' >> "${CMD_PATH}" | ||
| 45 | + | ||
| 46 | +nohup "${CMD[@]}" > "${LOG_PATH}" 2>&1 & | ||
| 47 | +PID=$! | ||
| 48 | +echo "${PID}" > "${PID_PATH}" | ||
| 49 | + | ||
| 50 | +echo "run_name=${RUN_NAME}" | ||
| 51 | +echo "pid=${PID}" | ||
| 52 | +echo "log=${LOG_PATH}" | ||
| 53 | +echo "pid_file=${PID_PATH}" | ||
| 54 | +echo "cmd_file=${CMD_PATH}" | ||
| 55 | +echo "run_dir=artifacts/search_evaluation/tuning_runs/${RUN_NAME}" | ||
| 56 | +echo | ||
| 57 | +echo "tail -f ${LOG_PATH}" | ||
| 58 | +echo "cat artifacts/search_evaluation/tuning_runs/${RUN_NAME}/leaderboard.csv" |
scripts/evaluation/tune_fusion.py
| @@ -4,23 +4,37 @@ from __future__ import annotations | @@ -4,23 +4,37 @@ from __future__ import annotations | ||
| 4 | 4 | ||
| 5 | import argparse | 5 | import argparse |
| 6 | import copy | 6 | import copy |
| 7 | +import csv | ||
| 7 | import json | 8 | import json |
| 9 | +import math | ||
| 10 | +import random | ||
| 8 | import re | 11 | import re |
| 12 | +import shutil | ||
| 9 | import subprocess | 13 | import subprocess |
| 10 | import sys | 14 | import sys |
| 11 | import time | 15 | import time |
| 12 | from dataclasses import dataclass | 16 | from dataclasses import dataclass |
| 13 | from pathlib import Path | 17 | from pathlib import Path |
| 14 | -from typing import Any, Dict, List | 18 | +from typing import Any, Dict, List, Sequence |
| 15 | 19 | ||
| 20 | +import numpy as np | ||
| 16 | import requests | 21 | import requests |
| 17 | import yaml | 22 | import yaml |
| 18 | 23 | ||
| 24 | +try: | ||
| 25 | + from sklearn.gaussian_process import GaussianProcessRegressor | ||
| 26 | + from sklearn.gaussian_process.kernels import ConstantKernel, Matern, WhiteKernel | ||
| 27 | +except Exception: # noqa: BLE001 | ||
| 28 | + GaussianProcessRegressor = None # type: ignore[assignment] | ||
| 29 | + ConstantKernel = None # type: ignore[assignment] | ||
| 30 | + Matern = None # type: ignore[assignment] | ||
| 31 | + WhiteKernel = None # type: ignore[assignment] | ||
| 32 | + | ||
| 19 | PROJECT_ROOT = Path(__file__).resolve().parents[2] | 33 | PROJECT_ROOT = Path(__file__).resolve().parents[2] |
| 20 | if str(PROJECT_ROOT) not in sys.path: | 34 | if str(PROJECT_ROOT) not in sys.path: |
| 21 | sys.path.insert(0, str(PROJECT_ROOT)) | 35 | sys.path.insert(0, str(PROJECT_ROOT)) |
| 22 | 36 | ||
| 23 | -from scripts.evaluation.eval_framework import ( | 37 | +from scripts.evaluation.eval_framework import ( # noqa: E402 |
| 24 | DEFAULT_ARTIFACT_ROOT, | 38 | DEFAULT_ARTIFACT_ROOT, |
| 25 | DEFAULT_QUERY_FILE, | 39 | DEFAULT_QUERY_FILE, |
| 26 | ensure_dir, | 40 | ensure_dir, |
| @@ -30,6 +44,7 @@ from scripts.evaluation.eval_framework import ( | @@ -30,6 +44,7 @@ from scripts.evaluation.eval_framework import ( | ||
| 30 | 44 | ||
| 31 | 45 | ||
| 32 | CONFIG_PATH = PROJECT_ROOT / "config" / "config.yaml" | 46 | CONFIG_PATH = PROJECT_ROOT / "config" / "config.yaml" |
| 47 | +LOG_DIR = PROJECT_ROOT / "logs" | ||
| 33 | 48 | ||
| 34 | 49 | ||
| 35 | @dataclass | 50 | @dataclass |
| @@ -39,6 +54,108 @@ class ExperimentSpec: | @@ -39,6 +54,108 @@ class ExperimentSpec: | ||
| 39 | params: Dict[str, Any] | 54 | params: Dict[str, Any] |
| 40 | 55 | ||
| 41 | 56 | ||
| 57 | +@dataclass | ||
| 58 | +class ParameterSpec: | ||
| 59 | + name: str | ||
| 60 | + lower: float | ||
| 61 | + upper: float | ||
| 62 | + scale: str = "linear" | ||
| 63 | + round_digits: int = 6 | ||
| 64 | + | ||
| 65 | + def __post_init__(self) -> None: | ||
| 66 | + if self.lower >= self.upper: | ||
| 67 | + raise ValueError(f"invalid bounds for {self.name}: {self.lower} >= {self.upper}") | ||
| 68 | + if self.scale not in {"linear", "log"}: | ||
| 69 | + raise ValueError(f"unsupported scale={self.scale!r} for {self.name}") | ||
| 70 | + if self.scale == "log" and (self.lower <= 0 or self.upper <= 0): | ||
| 71 | + raise ValueError(f"log-scaled parameter {self.name} must have positive bounds") | ||
| 72 | + | ||
| 73 | + @property | ||
| 74 | + def transformed_lower(self) -> float: | ||
| 75 | + return math.log10(self.lower) if self.scale == "log" else self.lower | ||
| 76 | + | ||
| 77 | + @property | ||
| 78 | + def transformed_upper(self) -> float: | ||
| 79 | + return math.log10(self.upper) if self.scale == "log" else self.upper | ||
| 80 | + | ||
| 81 | + @property | ||
| 82 | + def transformed_span(self) -> float: | ||
| 83 | + return self.transformed_upper - self.transformed_lower | ||
| 84 | + | ||
| 85 | + def transform(self, value: float) -> float: | ||
| 86 | + clipped = min(max(float(value), self.lower), self.upper) | ||
| 87 | + return math.log10(clipped) if self.scale == "log" else clipped | ||
| 88 | + | ||
| 89 | + def inverse_transform(self, value: float) -> float: | ||
| 90 | + raw = (10 ** value) if self.scale == "log" else value | ||
| 91 | + raw = min(max(float(raw), self.lower), self.upper) | ||
| 92 | + return round(raw, self.round_digits) | ||
| 93 | + | ||
| 94 | + def sample_uniform(self, rng: random.Random) -> float: | ||
| 95 | + draw = rng.uniform(self.transformed_lower, self.transformed_upper) | ||
| 96 | + return self.inverse_transform(draw) | ||
| 97 | + | ||
| 98 | + | ||
| 99 | +@dataclass | ||
| 100 | +class SearchSpace: | ||
| 101 | + target_path: str | ||
| 102 | + baseline: Dict[str, float] | ||
| 103 | + parameters: List[ParameterSpec] | ||
| 104 | + seed_experiments: List[ExperimentSpec] | ||
| 105 | + init_random: int = 6 | ||
| 106 | + candidate_pool_size: int = 256 | ||
| 107 | + explore_probability: float = 0.25 | ||
| 108 | + local_jitter_probability: float = 0.45 | ||
| 109 | + elite_fraction: float = 0.35 | ||
| 110 | + min_normalized_distance: float = 0.14 | ||
| 111 | + | ||
| 112 | + @property | ||
| 113 | + def parameter_names(self) -> List[str]: | ||
| 114 | + return [item.name for item in self.parameters] | ||
| 115 | + | ||
| 116 | + def fill_params(self, params: Dict[str, Any]) -> Dict[str, float]: | ||
| 117 | + merged = {name: float(self.baseline[name]) for name in self.parameter_names} | ||
| 118 | + for name, value in params.items(): | ||
| 119 | + if name not in merged: | ||
| 120 | + raise KeyError(f"unknown parameter in search space: {name}") | ||
| 121 | + merged[name] = float(value) | ||
| 122 | + return { | ||
| 123 | + spec.name: spec.inverse_transform(spec.transform(float(merged[spec.name]))) | ||
| 124 | + for spec in self.parameters | ||
| 125 | + } | ||
| 126 | + | ||
| 127 | + def sample_random(self, rng: random.Random) -> Dict[str, float]: | ||
| 128 | + return {spec.name: spec.sample_uniform(rng) for spec in self.parameters} | ||
| 129 | + | ||
| 130 | + def vectorize(self, params: Dict[str, Any]) -> np.ndarray: | ||
| 131 | + merged = self.fill_params(params) | ||
| 132 | + return np.array([spec.transform(float(merged[spec.name])) for spec in self.parameters], dtype=float) | ||
| 133 | + | ||
| 134 | + def from_vector(self, vector: Sequence[float]) -> Dict[str, float]: | ||
| 135 | + return { | ||
| 136 | + spec.name: spec.inverse_transform(float(vector[idx])) | ||
| 137 | + for idx, spec in enumerate(self.parameters) | ||
| 138 | + } | ||
| 139 | + | ||
| 140 | + def normalized_vector(self, params: Dict[str, Any]) -> np.ndarray: | ||
| 141 | + vector = self.vectorize(params) | ||
| 142 | + parts: List[float] = [] | ||
| 143 | + for idx, spec in enumerate(self.parameters): | ||
| 144 | + parts.append((vector[idx] - spec.transformed_lower) / max(spec.transformed_span, 1e-9)) | ||
| 145 | + return np.array(parts, dtype=float) | ||
| 146 | + | ||
| 147 | + def canonical_key(self, params: Dict[str, Any]) -> str: | ||
| 148 | + return json.dumps(self.fill_params(params), ensure_ascii=False, sort_keys=True) | ||
| 149 | + | ||
| 150 | + | ||
| 151 | +@dataclass | ||
| 152 | +class CandidateProposal: | ||
| 153 | + name: str | ||
| 154 | + description: str | ||
| 155 | + params: Dict[str, float] | ||
| 156 | + source: str | ||
| 157 | + | ||
| 158 | + | ||
| 42 | def load_yaml(path: Path) -> Dict[str, Any]: | 159 | def load_yaml(path: Path) -> Dict[str, Any]: |
| 43 | return yaml.safe_load(path.read_text(encoding="utf-8")) | 160 | return yaml.safe_load(path.read_text(encoding="utf-8")) |
| 44 | 161 | ||
| @@ -50,6 +167,13 @@ def write_yaml(path: Path, payload: Dict[str, Any]) -> None: | @@ -50,6 +167,13 @@ def write_yaml(path: Path, payload: Dict[str, Any]) -> None: | ||
| 50 | ) | 167 | ) |
| 51 | 168 | ||
| 52 | 169 | ||
| 170 | +def get_nested_value(payload: Dict[str, Any], dotted_path: str) -> Any: | ||
| 171 | + current: Any = payload | ||
| 172 | + for part in dotted_path.split("."): | ||
| 173 | + current = current[part] | ||
| 174 | + return current | ||
| 175 | + | ||
| 176 | + | ||
| 53 | def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> None: | 177 | def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> None: |
| 54 | current = payload | 178 | current = payload |
| 55 | parts = dotted_path.split(".") | 179 | parts = dotted_path.split(".") |
| @@ -58,16 +182,115 @@ def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> N | @@ -58,16 +182,115 @@ def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> N | ||
| 58 | current[parts[-1]] = value | 182 | current[parts[-1]] = value |
| 59 | 183 | ||
| 60 | 184 | ||
| 61 | -def apply_params(base_config: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]: | 185 | +def apply_target_params(base_config: Dict[str, Any], target_path: str, params: Dict[str, Any]) -> Dict[str, Any]: |
| 62 | candidate = copy.deepcopy(base_config) | 186 | candidate = copy.deepcopy(base_config) |
| 63 | - for dotted_path, value in params.items(): | ||
| 64 | - set_nested_value(candidate, dotted_path, value) | 187 | + for key, value in params.items(): |
| 188 | + set_nested_value(candidate, f"{target_path}.{key}", value) | ||
| 65 | return candidate | 189 | return candidate |
| 66 | 190 | ||
| 67 | 191 | ||
| 192 | +def read_queries(path: Path) -> List[str]: | ||
| 193 | + return [ | ||
| 194 | + line.strip() | ||
| 195 | + for line in path.read_text(encoding="utf-8").splitlines() | ||
| 196 | + if line.strip() and not line.strip().startswith("#") | ||
| 197 | + ] | ||
| 198 | + | ||
| 199 | + | ||
| 200 | +def run_restart(targets: Sequence[str]) -> None: | ||
| 201 | + cmd = ["./restart.sh", *targets] | ||
| 202 | + subprocess.run(cmd, cwd=PROJECT_ROOT, check=True, timeout=900) | ||
| 203 | + | ||
| 204 | + | ||
| 205 | +def bytes_to_gib(value: int) -> float: | ||
| 206 | + return float(value) / float(1024 ** 3) | ||
| 207 | + | ||
| 208 | + | ||
| 209 | +def get_free_disk_bytes(path: Path) -> int: | ||
| 210 | + return int(shutil.disk_usage(path).free) | ||
| 211 | + | ||
| 212 | + | ||
| 213 | +def iter_log_cleanup_candidates() -> List[Path]: | ||
| 214 | + if not LOG_DIR.is_dir(): | ||
| 215 | + return [] | ||
| 216 | + items: List[Path] = [] | ||
| 217 | + seen: set[str] = set() | ||
| 218 | + for path in LOG_DIR.rglob("*"): | ||
| 219 | + try: | ||
| 220 | + if not path.is_file(): | ||
| 221 | + continue | ||
| 222 | + resolved = path.resolve() | ||
| 223 | + key = str(resolved) | ||
| 224 | + if key in seen: | ||
| 225 | + continue | ||
| 226 | + seen.add(key) | ||
| 227 | + items.append(resolved) | ||
| 228 | + except FileNotFoundError: | ||
| 229 | + continue | ||
| 230 | + items.sort(key=lambda item: item.stat().st_size if item.exists() else 0, reverse=True) | ||
| 231 | + return items | ||
| 232 | + | ||
| 233 | + | ||
| 234 | +def truncate_file(path: Path) -> int: | ||
| 235 | + if not path.exists() or not path.is_file(): | ||
| 236 | + return 0 | ||
| 237 | + size = int(path.stat().st_size) | ||
| 238 | + if size <= 0: | ||
| 239 | + return 0 | ||
| 240 | + with path.open("w", encoding="utf-8"): | ||
| 241 | + pass | ||
| 242 | + return size | ||
| 243 | + | ||
| 244 | + | ||
| 245 | +def ensure_disk_headroom( | ||
| 246 | + *, | ||
| 247 | + min_free_gb: float, | ||
| 248 | + auto_truncate_logs: bool, | ||
| 249 | + context: str, | ||
| 250 | +) -> None: | ||
| 251 | + required_bytes = int(min_free_gb * (1024 ** 3)) | ||
| 252 | + free_bytes = get_free_disk_bytes(PROJECT_ROOT) | ||
| 253 | + if free_bytes >= required_bytes: | ||
| 254 | + return | ||
| 255 | + | ||
| 256 | + print( | ||
| 257 | + f"[disk] low free space before {context}: " | ||
| 258 | + f"free={bytes_to_gib(free_bytes):.2f}GiB required={min_free_gb:.2f}GiB" | ||
| 259 | + ) | ||
| 260 | + if not auto_truncate_logs: | ||
| 261 | + raise RuntimeError( | ||
| 262 | + f"insufficient disk headroom before {context}: " | ||
| 263 | + f"free={bytes_to_gib(free_bytes):.2f}GiB required={min_free_gb:.2f}GiB" | ||
| 264 | + ) | ||
| 265 | + | ||
| 266 | + reclaimed_bytes = 0 | ||
| 267 | + for candidate in iter_log_cleanup_candidates(): | ||
| 268 | + try: | ||
| 269 | + reclaimed = truncate_file(candidate) | ||
| 270 | + except Exception as exc: # noqa: BLE001 | ||
| 271 | + print(f"[disk] skip truncate {candidate}: {exc}") | ||
| 272 | + continue | ||
| 273 | + if reclaimed <= 0: | ||
| 274 | + continue | ||
| 275 | + reclaimed_bytes += reclaimed | ||
| 276 | + free_bytes = get_free_disk_bytes(PROJECT_ROOT) | ||
| 277 | + print( | ||
| 278 | + f"[disk] truncated {candidate} reclaimed={bytes_to_gib(reclaimed):.2f}GiB " | ||
| 279 | + f"free_now={bytes_to_gib(free_bytes):.2f}GiB" | ||
| 280 | + ) | ||
| 281 | + if free_bytes >= required_bytes: | ||
| 282 | + return | ||
| 283 | + | ||
| 284 | + raise RuntimeError( | ||
| 285 | + f"insufficient disk headroom after log truncation before {context}: " | ||
| 286 | + f"free={bytes_to_gib(free_bytes):.2f}GiB required={min_free_gb:.2f}GiB " | ||
| 287 | + f"reclaimed={bytes_to_gib(reclaimed_bytes):.2f}GiB" | ||
| 288 | + ) | ||
| 289 | + | ||
| 290 | + | ||
| 68 | def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any]: | 291 | def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any]: |
| 69 | deadline = time.time() + timeout_sec | 292 | deadline = time.time() + timeout_sec |
| 70 | - last_error = None | 293 | + last_error: Any = None |
| 71 | while time.time() < deadline: | 294 | while time.time() < deadline: |
| 72 | try: | 295 | try: |
| 73 | response = requests.get(f"{base_url.rstrip('/')}/health", timeout=10) | 296 | response = requests.get(f"{base_url.rstrip('/')}/health", timeout=10) |
| @@ -82,16 +305,69 @@ def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any | @@ -82,16 +305,69 @@ def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any | ||
| 82 | raise RuntimeError(f"backend did not become healthy: {last_error}") | 305 | raise RuntimeError(f"backend did not become healthy: {last_error}") |
| 83 | 306 | ||
| 84 | 307 | ||
| 85 | -def run_restart() -> None: | ||
| 86 | - subprocess.run(["./restart.sh", "backend"], cwd=PROJECT_ROOT, check=True, timeout=600) | 308 | +def wait_for_eval_web(base_url: str, timeout_sec: float = 90.0) -> Dict[str, Any]: |
| 309 | + url = f"{base_url.rstrip('/')}/api/history" | ||
| 310 | + deadline = time.time() + timeout_sec | ||
| 311 | + last_error: Any = None | ||
| 312 | + while time.time() < deadline: | ||
| 313 | + try: | ||
| 314 | + response = requests.get(url, timeout=10) | ||
| 315 | + response.raise_for_status() | ||
| 316 | + payload = response.json() | ||
| 317 | + if isinstance(payload, dict) and "history" in payload: | ||
| 318 | + return payload | ||
| 319 | + last_error = payload | ||
| 320 | + except Exception as exc: # noqa: BLE001 | ||
| 321 | + last_error = str(exc) | ||
| 322 | + time.sleep(2.0) | ||
| 323 | + raise RuntimeError(f"eval-web did not become healthy: {last_error}") | ||
| 324 | + | ||
| 325 | + | ||
| 326 | +def ensure_eval_web(eval_web_base_url: str) -> Dict[str, Any]: | ||
| 327 | + try: | ||
| 328 | + return wait_for_eval_web(eval_web_base_url, timeout_sec=20.0) | ||
| 329 | + except Exception: # noqa: BLE001 | ||
| 330 | + run_restart(["eval-web"]) | ||
| 331 | + return wait_for_eval_web(eval_web_base_url, timeout_sec=120.0) | ||
| 87 | 332 | ||
| 88 | 333 | ||
| 89 | -def read_queries(path: Path) -> List[str]: | ||
| 90 | - return [ | ||
| 91 | - line.strip() | ||
| 92 | - for line in path.read_text(encoding="utf-8").splitlines() | ||
| 93 | - if line.strip() and not line.strip().startswith("#") | ||
| 94 | - ] | 334 | +def verify_backend_config(base_url: str, target_path: str, expected: Dict[str, Any], tol: float = 1e-6) -> bool: |
| 335 | + response = requests.get(f"{base_url.rstrip('/')}/admin/config", timeout=20) | ||
| 336 | + response.raise_for_status() | ||
| 337 | + payload = response.json() | ||
| 338 | + candidate_paths = [target_path] | ||
| 339 | + if not target_path.startswith("search."): | ||
| 340 | + candidate_paths.append(f"search.{target_path}") | ||
| 341 | + if target_path.startswith("search."): | ||
| 342 | + candidate_paths.append(target_path[len("search."):]) | ||
| 343 | + | ||
| 344 | + live_block = None | ||
| 345 | + for path in candidate_paths: | ||
| 346 | + try: | ||
| 347 | + maybe_block = get_nested_value(payload, path) | ||
| 348 | + except Exception: # noqa: BLE001 | ||
| 349 | + continue | ||
| 350 | + if isinstance(maybe_block, dict): | ||
| 351 | + live_block = maybe_block | ||
| 352 | + break | ||
| 353 | + if live_block is None: | ||
| 354 | + raise RuntimeError( | ||
| 355 | + f"unable to resolve backend config path {target_path!r}; " | ||
| 356 | + f"tried={candidate_paths!r} top_level_keys={sorted(payload.keys())[:20]!r}" | ||
| 357 | + ) | ||
| 358 | + for key, expected_value in expected.items(): | ||
| 359 | + live_value = live_block[key] | ||
| 360 | + if isinstance(expected_value, (int, float)): | ||
| 361 | + if abs(float(live_value) - float(expected_value)) > tol: | ||
| 362 | + raise RuntimeError( | ||
| 363 | + f"backend config mismatch for {target_path}.{key}: " | ||
| 364 | + f"expected={expected_value} live={live_value}" | ||
| 365 | + ) | ||
| 366 | + elif live_value != expected_value: | ||
| 367 | + raise RuntimeError( | ||
| 368 | + f"backend config mismatch for {target_path}.{key}: expected={expected_value!r} live={live_value!r}" | ||
| 369 | + ) | ||
| 370 | + return True | ||
| 95 | 371 | ||
| 96 | 372 | ||
| 97 | def run_batch_eval( | 373 | def run_batch_eval( |
| @@ -126,95 +402,580 @@ def run_batch_eval( | @@ -126,95 +402,580 @@ def run_batch_eval( | ||
| 126 | timeout=7200, | 402 | timeout=7200, |
| 127 | ) | 403 | ) |
| 128 | output = (completed.stdout or "") + "\n" + (completed.stderr or "") | 404 | output = (completed.stdout or "") + "\n" + (completed.stderr or "") |
| 129 | - match = re.search(r"batch_id=([A-Za-z0-9_]+)\s+aggregate_metrics=(\{.*\})", output) | ||
| 130 | - if not match: | 405 | + batch_ids = re.findall(r"batch_id=([A-Za-z0-9_]+)", output) |
| 406 | + if not batch_ids: | ||
| 131 | raise RuntimeError(f"failed to parse batch output: {output[-2000:]}") | 407 | raise RuntimeError(f"failed to parse batch output: {output[-2000:]}") |
| 132 | - batch_id = match.group(1) | ||
| 133 | - aggregate_metrics = json.loads(match.group(2).replace("'", '"')) | 408 | + batch_id = batch_ids[-1] |
| 409 | + batch_json_path = DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_id}.json" | ||
| 410 | + if not batch_json_path.is_file(): | ||
| 411 | + raise RuntimeError(f"batch json not found after eval: {batch_json_path}") | ||
| 412 | + payload = json.loads(batch_json_path.read_text(encoding="utf-8")) | ||
| 134 | return { | 413 | return { |
| 135 | "batch_id": batch_id, | 414 | "batch_id": batch_id, |
| 136 | - "aggregate_metrics": aggregate_metrics, | 415 | + "payload": payload, |
| 137 | "raw_output": output, | 416 | "raw_output": output, |
| 417 | + "batch_json_path": str(batch_json_path), | ||
| 418 | + "batch_report_path": str(DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_id}.md"), | ||
| 419 | + } | ||
| 420 | + | ||
| 421 | + | ||
| 422 | +def resolve_batch_json_path(path_like: str) -> Path: | ||
| 423 | + path = Path(path_like) | ||
| 424 | + if not path.is_absolute(): | ||
| 425 | + path = (PROJECT_ROOT / path).resolve() | ||
| 426 | + if path.suffix == ".json": | ||
| 427 | + return path | ||
| 428 | + if path.suffix == ".md": | ||
| 429 | + candidate = path.with_suffix(".json") | ||
| 430 | + if candidate.is_file(): | ||
| 431 | + return candidate | ||
| 432 | + if path.is_file(): | ||
| 433 | + return path | ||
| 434 | + candidate = path.parent / f"{path.name}.json" | ||
| 435 | + if candidate.is_file(): | ||
| 436 | + return candidate | ||
| 437 | + raise FileNotFoundError(f"cannot resolve batch json from: {path_like}") | ||
| 438 | + | ||
| 439 | + | ||
| 440 | +def load_batch_payload(path_like: str) -> Dict[str, Any]: | ||
| 441 | + path = resolve_batch_json_path(path_like) | ||
| 442 | + return json.loads(path.read_text(encoding="utf-8")) | ||
| 443 | + | ||
| 444 | + | ||
| 445 | +def load_experiments(path: Path) -> List[ExperimentSpec]: | ||
| 446 | + payload = json.loads(path.read_text(encoding="utf-8")) | ||
| 447 | + items = payload["experiments"] if isinstance(payload, dict) else payload | ||
| 448 | + experiments: List[ExperimentSpec] = [] | ||
| 449 | + for item in items: | ||
| 450 | + experiments.append( | ||
| 451 | + ExperimentSpec( | ||
| 452 | + name=str(item["name"]), | ||
| 453 | + description=str(item.get("description") or ""), | ||
| 454 | + params=dict(item.get("params") or {}), | ||
| 455 | + ) | ||
| 456 | + ) | ||
| 457 | + return experiments | ||
| 458 | + | ||
| 459 | + | ||
| 460 | +def load_search_space(path: Path) -> SearchSpace: | ||
| 461 | + payload = load_yaml(path) | ||
| 462 | + parameters = [ | ||
| 463 | + ParameterSpec( | ||
| 464 | + name=str(name), | ||
| 465 | + lower=float(spec["min"]), | ||
| 466 | + upper=float(spec["max"]), | ||
| 467 | + scale=str(spec.get("scale", "linear")), | ||
| 468 | + round_digits=int(spec.get("round", 6)), | ||
| 469 | + ) | ||
| 470 | + for name, spec in dict(payload["parameters"]).items() | ||
| 471 | + ] | ||
| 472 | + baseline = {str(key): float(value) for key, value in dict(payload["baseline"]).items()} | ||
| 473 | + seed_experiments = [ | ||
| 474 | + ExperimentSpec( | ||
| 475 | + name=str(item["name"]), | ||
| 476 | + description=str(item.get("description") or ""), | ||
| 477 | + params={str(k): float(v) for k, v in dict(item.get("params") or {}).items()}, | ||
| 478 | + ) | ||
| 479 | + for item in list(payload.get("seed_experiments") or []) | ||
| 480 | + ] | ||
| 481 | + optimizer = dict(payload.get("optimizer") or {}) | ||
| 482 | + return SearchSpace( | ||
| 483 | + target_path=str(payload["target_path"]), | ||
| 484 | + baseline=baseline, | ||
| 485 | + parameters=parameters, | ||
| 486 | + seed_experiments=seed_experiments, | ||
| 487 | + init_random=int(optimizer.get("init_random", 6)), | ||
| 488 | + candidate_pool_size=int(optimizer.get("candidate_pool_size", 256)), | ||
| 489 | + explore_probability=float(optimizer.get("explore_probability", 0.25)), | ||
| 490 | + local_jitter_probability=float(optimizer.get("local_jitter_probability", 0.45)), | ||
| 491 | + elite_fraction=float(optimizer.get("elite_fraction", 0.35)), | ||
| 492 | + min_normalized_distance=float(optimizer.get("min_normalized_distance", 0.14)), | ||
| 493 | + ) | ||
| 494 | + | ||
| 495 | + | ||
| 496 | +def load_existing_trials(run_dir: Path) -> List[Dict[str, Any]]: | ||
| 497 | + path = run_dir / "trials.jsonl" | ||
| 498 | + if not path.is_file(): | ||
| 499 | + return [] | ||
| 500 | + trials: List[Dict[str, Any]] = [] | ||
| 501 | + for line in path.read_text(encoding="utf-8").splitlines(): | ||
| 502 | + line = line.strip() | ||
| 503 | + if line: | ||
| 504 | + trials.append(json.loads(line)) | ||
| 505 | + return trials | ||
| 506 | + | ||
| 507 | + | ||
| 508 | +def append_trial(run_dir: Path, trial: Dict[str, Any]) -> None: | ||
| 509 | + path = run_dir / "trials.jsonl" | ||
| 510 | + with path.open("a", encoding="utf-8") as handle: | ||
| 511 | + handle.write(json.dumps(trial, ensure_ascii=False) + "\n") | ||
| 512 | + | ||
| 513 | + | ||
| 514 | +def live_success_trials(trials: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]: | ||
| 515 | + return [ | ||
| 516 | + item | ||
| 517 | + for item in trials | ||
| 518 | + if item.get("status") == "ok" and not bool(item.get("is_seed")) | ||
| 519 | + ] | ||
| 520 | + | ||
| 521 | + | ||
| 522 | +def all_success_trials(trials: Sequence[Dict[str, Any]]) -> List[Dict[str, Any]]: | ||
| 523 | + return [item for item in trials if item.get("status") == "ok"] | ||
| 524 | + | ||
| 525 | + | ||
| 526 | +def score_of(trial: Dict[str, Any], metric: str) -> float: | ||
| 527 | + return float((trial.get("aggregate_metrics") or {}).get(metric, trial.get("score", 0.0)) or 0.0) | ||
| 528 | + | ||
| 529 | + | ||
| 530 | +def next_trial_name(trials: Sequence[Dict[str, Any]], prefix: str) -> str: | ||
| 531 | + return f"{prefix}_{len(trials) + 1:03d}" | ||
| 532 | + | ||
| 533 | + | ||
| 534 | +def normal_pdf(x: float) -> float: | ||
| 535 | + return math.exp(-0.5 * x * x) / math.sqrt(2.0 * math.pi) | ||
| 536 | + | ||
| 537 | + | ||
| 538 | +def normal_cdf(x: float) -> float: | ||
| 539 | + return 0.5 * (1.0 + math.erf(x / math.sqrt(2.0))) | ||
| 540 | + | ||
| 541 | + | ||
| 542 | +def expected_improvement(mu: float, sigma: float, best: float, xi: float = 0.002) -> float: | ||
| 543 | + if sigma <= 1e-12: | ||
| 544 | + return max(mu - best - xi, 0.0) | ||
| 545 | + z = (mu - best - xi) / sigma | ||
| 546 | + return (mu - best - xi) * normal_cdf(z) + sigma * normal_pdf(z) | ||
| 547 | + | ||
| 548 | + | ||
| 549 | +def normalized_distance(space: SearchSpace, left: Dict[str, Any], right: Dict[str, Any]) -> float: | ||
| 550 | + lv = space.normalized_vector(left) | ||
| 551 | + rv = space.normalized_vector(right) | ||
| 552 | + return float(np.linalg.norm(lv - rv) / math.sqrt(len(space.parameters))) | ||
| 553 | + | ||
| 554 | + | ||
| 555 | +def fit_surrogate(space: SearchSpace, trials: Sequence[Dict[str, Any]], metric: str, seed: int) -> Any: | ||
| 556 | + if GaussianProcessRegressor is None or len(trials) < 4: | ||
| 557 | + return None | ||
| 558 | + X = np.array([space.vectorize(item["params"]) for item in trials], dtype=float) | ||
| 559 | + y = np.array([score_of(item, metric) for item in trials], dtype=float) | ||
| 560 | + if len(np.unique(np.round(y, 8))) < 2: | ||
| 561 | + return None | ||
| 562 | + try: | ||
| 563 | + kernel = ( | ||
| 564 | + ConstantKernel(1.0, (1e-3, 1e3)) | ||
| 565 | + * Matern(length_scale=np.ones(len(space.parameters)), length_scale_bounds=(1e-2, 1e2), nu=2.5) | ||
| 566 | + + WhiteKernel(noise_level=1e-5, noise_level_bounds=(1e-8, 1e-1)) | ||
| 567 | + ) | ||
| 568 | + gp = GaussianProcessRegressor( | ||
| 569 | + kernel=kernel, | ||
| 570 | + normalize_y=True, | ||
| 571 | + n_restarts_optimizer=2, | ||
| 572 | + random_state=seed, | ||
| 573 | + ) | ||
| 574 | + gp.fit(X, y) | ||
| 575 | + return gp | ||
| 576 | + except Exception: # noqa: BLE001 | ||
| 577 | + return None | ||
| 578 | + | ||
| 579 | + | ||
| 580 | +def build_sampling_spread(space: SearchSpace, elite_vectors: np.ndarray) -> np.ndarray: | ||
| 581 | + spans = np.array([spec.transformed_span for spec in space.parameters], dtype=float) | ||
| 582 | + floor = np.maximum(spans * 0.05, 0.015) | ||
| 583 | + ceiling = np.maximum(spans * 0.5, floor) | ||
| 584 | + if elite_vectors.shape[0] <= 1: | ||
| 585 | + return np.minimum(np.maximum(spans * 0.18, floor), ceiling) | ||
| 586 | + elite_std = elite_vectors.std(axis=0) | ||
| 587 | + elite_range = elite_vectors.max(axis=0) - elite_vectors.min(axis=0) | ||
| 588 | + spread = np.maximum(elite_std * 1.8, elite_range * 0.5) | ||
| 589 | + return np.minimum(np.maximum(spread, floor), ceiling) | ||
| 590 | + | ||
| 591 | + | ||
| 592 | +def sample_local_candidate( | ||
| 593 | + space: SearchSpace, | ||
| 594 | + rng: random.Random, | ||
| 595 | + center: np.ndarray, | ||
| 596 | + spread: np.ndarray, | ||
| 597 | +) -> Dict[str, float]: | ||
| 598 | + draw = [] | ||
| 599 | + for idx, spec in enumerate(space.parameters): | ||
| 600 | + value = rng.gauss(float(center[idx]), float(spread[idx])) | ||
| 601 | + value = min(max(value, spec.transformed_lower), spec.transformed_upper) | ||
| 602 | + draw.append(value) | ||
| 603 | + return space.from_vector(draw) | ||
| 604 | + | ||
| 605 | + | ||
| 606 | +def sample_crossover_candidate( | ||
| 607 | + space: SearchSpace, | ||
| 608 | + rng: random.Random, | ||
| 609 | + left: np.ndarray, | ||
| 610 | + right: np.ndarray, | ||
| 611 | +) -> Dict[str, float]: | ||
| 612 | + draw = [] | ||
| 613 | + for idx, spec in enumerate(space.parameters): | ||
| 614 | + mix = rng.random() | ||
| 615 | + value = float(left[idx]) * mix + float(right[idx]) * (1.0 - mix) | ||
| 616 | + jitter = spec.transformed_span * 0.04 | ||
| 617 | + value += rng.uniform(-jitter, jitter) | ||
| 618 | + value = min(max(value, spec.transformed_lower), spec.transformed_upper) | ||
| 619 | + draw.append(value) | ||
| 620 | + return space.from_vector(draw) | ||
| 621 | + | ||
| 622 | + | ||
| 623 | +def propose_candidates( | ||
| 624 | + *, | ||
| 625 | + space: SearchSpace, | ||
| 626 | + trials: Sequence[Dict[str, Any]], | ||
| 627 | + metric: str, | ||
| 628 | + batch_size: int, | ||
| 629 | + rng: random.Random, | ||
| 630 | + init_random: int, | ||
| 631 | + candidate_pool_size: int, | ||
| 632 | +) -> List[CandidateProposal]: | ||
| 633 | + existing_keys = {space.canonical_key(item["params"]) for item in trials if item.get("params")} | ||
| 634 | + proposals: List[CandidateProposal] = [] | ||
| 635 | + | ||
| 636 | + for seed in space.seed_experiments: | ||
| 637 | + params = space.fill_params(seed.params) | ||
| 638 | + key = space.canonical_key(params) | ||
| 639 | + if key not in existing_keys: | ||
| 640 | + proposals.append( | ||
| 641 | + CandidateProposal( | ||
| 642 | + name=seed.name, | ||
| 643 | + description=seed.description, | ||
| 644 | + params=params, | ||
| 645 | + source="seed_experiment", | ||
| 646 | + ) | ||
| 647 | + ) | ||
| 648 | + existing_keys.add(key) | ||
| 649 | + if len(proposals) >= batch_size: | ||
| 650 | + return proposals | ||
| 651 | + | ||
| 652 | + successes = live_success_trials(trials) | ||
| 653 | + if len(successes) < init_random: | ||
| 654 | + while len(proposals) < batch_size: | ||
| 655 | + params = space.sample_random(rng) | ||
| 656 | + key = space.canonical_key(params) | ||
| 657 | + if key in existing_keys: | ||
| 658 | + continue | ||
| 659 | + proposals.append( | ||
| 660 | + CandidateProposal( | ||
| 661 | + name=f"random_{len(successes) + len(proposals) + 1:03d}", | ||
| 662 | + description="global random exploration", | ||
| 663 | + params=params, | ||
| 664 | + source="random", | ||
| 665 | + ) | ||
| 666 | + ) | ||
| 667 | + existing_keys.add(key) | ||
| 668 | + return proposals | ||
| 669 | + | ||
| 670 | + ranked = sorted(successes, key=lambda item: score_of(item, metric), reverse=True) | ||
| 671 | + elite_count = max(2, min(len(ranked), int(math.ceil(len(ranked) * space.elite_fraction)))) | ||
| 672 | + elites = ranked[:elite_count] | ||
| 673 | + elite_vectors = np.array([space.vectorize(item["params"]) for item in elites], dtype=float) | ||
| 674 | + spread = build_sampling_spread(space, elite_vectors) | ||
| 675 | + gp = fit_surrogate(space, successes, metric, seed=rng.randint(1, 10_000_000)) | ||
| 676 | + best_score = score_of(ranked[0], metric) | ||
| 677 | + best_vector = space.vectorize(ranked[0]["params"]) | ||
| 678 | + | ||
| 679 | + pool: List[Dict[str, Any]] = [] | ||
| 680 | + pool_keys = set(existing_keys) | ||
| 681 | + attempts = 0 | ||
| 682 | + max_attempts = max(candidate_pool_size * 12, 200) | ||
| 683 | + while len(pool) < candidate_pool_size and attempts < max_attempts: | ||
| 684 | + attempts += 1 | ||
| 685 | + roll = rng.random() | ||
| 686 | + if roll < space.explore_probability: | ||
| 687 | + params = space.sample_random(rng) | ||
| 688 | + source = "global_explore" | ||
| 689 | + elif roll < space.explore_probability + space.local_jitter_probability: | ||
| 690 | + center = elite_vectors[rng.randrange(len(elite_vectors))] | ||
| 691 | + params = sample_local_candidate(space, rng, center=center, spread=spread) | ||
| 692 | + source = "elite_jitter" | ||
| 693 | + else: | ||
| 694 | + if len(elite_vectors) >= 2: | ||
| 695 | + left = elite_vectors[rng.randrange(len(elite_vectors))] | ||
| 696 | + right = elite_vectors[rng.randrange(len(elite_vectors))] | ||
| 697 | + params = sample_crossover_candidate(space, rng, left=left, right=right) | ||
| 698 | + source = "elite_crossover" | ||
| 699 | + else: | ||
| 700 | + params = sample_local_candidate(space, rng, center=best_vector, spread=spread) | ||
| 701 | + source = "best_jitter" | ||
| 702 | + key = space.canonical_key(params) | ||
| 703 | + if key in pool_keys: | ||
| 704 | + continue | ||
| 705 | + pool_keys.add(key) | ||
| 706 | + pool.append({"params": params, "source": source}) | ||
| 707 | + | ||
| 708 | + if not pool: | ||
| 709 | + return proposals | ||
| 710 | + | ||
| 711 | + if gp is not None: | ||
| 712 | + X = np.array([space.vectorize(item["params"]) for item in pool], dtype=float) | ||
| 713 | + mu, sigma = gp.predict(X, return_std=True) | ||
| 714 | + for idx, item in enumerate(pool): | ||
| 715 | + item["acquisition"] = expected_improvement(float(mu[idx]), float(sigma[idx]), best_score) | ||
| 716 | + item["uncertainty"] = float(sigma[idx]) | ||
| 717 | + item["predicted_score"] = float(mu[idx]) | ||
| 718 | + pool.sort( | ||
| 719 | + key=lambda item: ( | ||
| 720 | + float(item.get("acquisition") or 0.0), | ||
| 721 | + float(item.get("uncertainty") or 0.0), | ||
| 722 | + float(item.get("predicted_score") or 0.0), | ||
| 723 | + ), | ||
| 724 | + reverse=True, | ||
| 725 | + ) | ||
| 726 | + else: | ||
| 727 | + rng.shuffle(pool) | ||
| 728 | + | ||
| 729 | + chosen_params = [item.params for item in proposals] | ||
| 730 | + chosen: List[CandidateProposal] = [] | ||
| 731 | + for item in pool: | ||
| 732 | + params = item["params"] | ||
| 733 | + if any(normalized_distance(space, params, other) < space.min_normalized_distance for other in chosen_params): | ||
| 734 | + continue | ||
| 735 | + chosen_params.append(params) | ||
| 736 | + chosen.append( | ||
| 737 | + CandidateProposal( | ||
| 738 | + name=f"bo_{len(successes) + len(proposals) + len(chosen) + 1:03d}", | ||
| 739 | + description=( | ||
| 740 | + f"{item['source']} predicted={item.get('predicted_score', 'n/a')} " | ||
| 741 | + f"ei={item.get('acquisition', 'n/a')}" | ||
| 742 | + ), | ||
| 743 | + params=params, | ||
| 744 | + source=str(item["source"]), | ||
| 745 | + ) | ||
| 746 | + ) | ||
| 747 | + if len(proposals) + len(chosen) >= batch_size: | ||
| 748 | + break | ||
| 749 | + | ||
| 750 | + proposals.extend(chosen) | ||
| 751 | + if len(proposals) < batch_size: | ||
| 752 | + while len(proposals) < batch_size: | ||
| 753 | + params = space.sample_random(rng) | ||
| 754 | + key = space.canonical_key(params) | ||
| 755 | + if key in existing_keys: | ||
| 756 | + continue | ||
| 757 | + proposals.append( | ||
| 758 | + CandidateProposal( | ||
| 759 | + name=f"fallback_{len(successes) + len(proposals) + 1:03d}", | ||
| 760 | + description="fallback random exploration", | ||
| 761 | + params=params, | ||
| 762 | + source="fallback_random", | ||
| 763 | + ) | ||
| 764 | + ) | ||
| 765 | + existing_keys.add(key) | ||
| 766 | + return proposals | ||
| 767 | + | ||
| 768 | + | ||
| 769 | +def compare_query_deltas( | ||
| 770 | + baseline_payload: Dict[str, Any] | None, | ||
| 771 | + best_payload: Dict[str, Any] | None, | ||
| 772 | + metric: str, | ||
| 773 | + limit: int = 8, | ||
| 774 | +) -> Dict[str, List[Dict[str, Any]]]: | ||
| 775 | + if not baseline_payload or not best_payload: | ||
| 776 | + return {"gains": [], "losses": []} | ||
| 777 | + base = { | ||
| 778 | + str(item["query"]): float(item["metrics"].get(metric, 0.0)) | ||
| 779 | + for item in baseline_payload.get("per_query") or [] | ||
| 780 | + } | ||
| 781 | + cur = { | ||
| 782 | + str(item["query"]): float(item["metrics"].get(metric, 0.0)) | ||
| 783 | + for item in best_payload.get("per_query") or [] | ||
| 138 | } | 784 | } |
| 785 | + rows: List[Dict[str, Any]] = [] | ||
| 786 | + for query, score in cur.items(): | ||
| 787 | + if query not in base: | ||
| 788 | + continue | ||
| 789 | + rows.append( | ||
| 790 | + { | ||
| 791 | + "query": query, | ||
| 792 | + "baseline": round(base[query], 6), | ||
| 793 | + "current": round(score, 6), | ||
| 794 | + "delta": round(score - base[query], 6), | ||
| 795 | + } | ||
| 796 | + ) | ||
| 797 | + rows.sort(key=lambda item: item["delta"], reverse=True) | ||
| 798 | + gains = [item for item in rows[:limit] if item["delta"] > 0] | ||
| 799 | + losses = [item for item in rows[-limit:] if item["delta"] < 0] | ||
| 800 | + losses.sort(key=lambda item: item["delta"]) | ||
| 801 | + return {"gains": gains, "losses": losses} | ||
| 802 | + | ||
| 139 | 803 | ||
| 804 | +def render_markdown( | ||
| 805 | + *, | ||
| 806 | + run_id: str, | ||
| 807 | + created_at: str, | ||
| 808 | + tenant_id: str, | ||
| 809 | + query_count: int, | ||
| 810 | + top_k: int, | ||
| 811 | + metric: str, | ||
| 812 | + trials: Sequence[Dict[str, Any]], | ||
| 813 | +) -> str: | ||
| 814 | + successes = sorted(all_success_trials(trials), key=lambda item: score_of(item, metric), reverse=True) | ||
| 815 | + live_successes = sorted(live_success_trials(trials), key=lambda item: score_of(item, metric), reverse=True) | ||
| 816 | + best = successes[0] if successes else None | ||
| 817 | + baseline = next((item for item in successes if item.get("is_seed")), None) | ||
| 818 | + best_payload = load_batch_payload(best["batch_json_path"]) if best and best.get("batch_json_path") else None | ||
| 819 | + baseline_payload = ( | ||
| 820 | + load_batch_payload(baseline["batch_json_path"]) | ||
| 821 | + if baseline and baseline.get("batch_json_path") | ||
| 822 | + else None | ||
| 823 | + ) | ||
| 824 | + delta_summary = compare_query_deltas(baseline_payload, best_payload, metric) if best else {"gains": [], "losses": []} | ||
| 140 | 825 | ||
| 141 | -def render_markdown(summary: Dict[str, Any]) -> str: | ||
| 142 | lines = [ | 826 | lines = [ |
| 143 | "# Fusion Tuning Report", | 827 | "# Fusion Tuning Report", |
| 144 | "", | 828 | "", |
| 145 | - f"- Created at: {summary['created_at']}", | ||
| 146 | - f"- Tenant ID: {summary['tenant_id']}", | ||
| 147 | - f"- Query count: {summary['query_count']}", | ||
| 148 | - f"- Top K: {summary['top_k']}", | ||
| 149 | - f"- Score metric: {summary['score_metric']}", | 829 | + f"- Run ID: {run_id}", |
| 830 | + f"- Created at: {created_at}", | ||
| 831 | + f"- Tenant ID: {tenant_id}", | ||
| 832 | + f"- Query count: {query_count}", | ||
| 833 | + f"- Top K: {top_k}", | ||
| 834 | + f"- Score metric: {metric}", | ||
| 835 | + f"- Successful live evals: {len(live_successes)}", | ||
| 150 | "", | 836 | "", |
| 151 | - "## Experiments", | 837 | + "## Leaderboard", |
| 152 | "", | 838 | "", |
| 153 | - "| Rank | Name | Score | Primary | NDCG@20 | ERR@10 | Strong@10 | Gain Recall@20 | Config |", | ||
| 154 | - "|---|---|---:|---:|---:|---:|---:|---:|---|", | 839 | + "| Rank | Name | Source | Score | Primary | NDCG@20 | ERR@10 | Gain Recall@20 | Batch |", |
| 840 | + "|---|---|---|---:|---:|---:|---:|---:|---|", | ||
| 155 | ] | 841 | ] |
| 156 | - for idx, item in enumerate(summary["experiments"], start=1): | ||
| 157 | - metrics = item["aggregate_metrics"] | 842 | + for idx, item in enumerate(successes, start=1): |
| 843 | + metrics = item.get("aggregate_metrics") or {} | ||
| 158 | lines.append( | 844 | lines.append( |
| 159 | "| " | 845 | "| " |
| 160 | + " | ".join( | 846 | + " | ".join( |
| 161 | [ | 847 | [ |
| 162 | str(idx), | 848 | str(idx), |
| 163 | - item["name"], | ||
| 164 | - str(item["score"]), | 849 | + str(item.get("name") or ""), |
| 850 | + str(item.get("source") or ""), | ||
| 851 | + f"{score_of(item, metric):.6f}", | ||
| 165 | str(metrics.get("Primary_Metric_Score", "")), | 852 | str(metrics.get("Primary_Metric_Score", "")), |
| 166 | str(metrics.get("NDCG@20", "")), | 853 | str(metrics.get("NDCG@20", "")), |
| 167 | str(metrics.get("ERR@10", "")), | 854 | str(metrics.get("ERR@10", "")), |
| 168 | - str(metrics.get("Strong_Precision@10", "")), | ||
| 169 | str(metrics.get("Gain_Recall@20", "")), | 855 | str(metrics.get("Gain_Recall@20", "")), |
| 170 | - item["config_snapshot_path"], | 856 | + str(item.get("batch_id") or ""), |
| 171 | ] | 857 | ] |
| 172 | ) | 858 | ) |
| 173 | + " |" | 859 | + " |" |
| 174 | ) | 860 | ) |
| 175 | - lines.extend(["", "## Details", ""]) | ||
| 176 | - for item in summary["experiments"]: | ||
| 177 | - lines.append(f"### {item['name']}") | ||
| 178 | - lines.append("") | ||
| 179 | - lines.append(f"- Description: {item['description']}") | ||
| 180 | - lines.append(f"- Score: {item['score']}") | ||
| 181 | - lines.append(f"- Params: `{json.dumps(item['params'], ensure_ascii=False, sort_keys=True)}`") | ||
| 182 | - lines.append(f"- Batch report: {item['batch_report_path']}") | ||
| 183 | - lines.append("") | ||
| 184 | - return "\n".join(lines) | ||
| 185 | 861 | ||
| 862 | + if best: | ||
| 863 | + lines.extend( | ||
| 864 | + [ | ||
| 865 | + "", | ||
| 866 | + "## Best Params", | ||
| 867 | + "", | ||
| 868 | + f"- Name: {best['name']}", | ||
| 869 | + f"- Source: {best['source']}", | ||
| 870 | + f"- Score: {score_of(best, metric):.6f}", | ||
| 871 | + f"- Params: `{json.dumps(best['params'], ensure_ascii=False, sort_keys=True)}`", | ||
| 872 | + f"- Batch report: {best.get('batch_report_path') or ''}", | ||
| 873 | + ] | ||
| 874 | + ) | ||
| 186 | 875 | ||
| 187 | -def load_experiments(path: Path) -> List[ExperimentSpec]: | ||
| 188 | - payload = json.loads(path.read_text(encoding="utf-8")) | ||
| 189 | - items = payload["experiments"] if isinstance(payload, dict) else payload | ||
| 190 | - experiments: List[ExperimentSpec] = [] | ||
| 191 | - for item in items: | ||
| 192 | - experiments.append( | ||
| 193 | - ExperimentSpec( | ||
| 194 | - name=str(item["name"]), | ||
| 195 | - description=str(item.get("description") or ""), | ||
| 196 | - params=dict(item.get("params") or {}), | ||
| 197 | - ) | 876 | + if delta_summary["gains"] or delta_summary["losses"]: |
| 877 | + lines.extend(["", "## Best vs Baseline", ""]) | ||
| 878 | + if delta_summary["gains"]: | ||
| 879 | + lines.append("### Top Gains") | ||
| 880 | + lines.append("") | ||
| 881 | + for item in delta_summary["gains"]: | ||
| 882 | + lines.append( | ||
| 883 | + f"- {item['query']}: {item['baseline']:.6f} -> {item['current']:.6f} ({item['delta']:+.6f})" | ||
| 884 | + ) | ||
| 885 | + if delta_summary["losses"]: | ||
| 886 | + lines.append("") | ||
| 887 | + lines.append("### Top Losses") | ||
| 888 | + lines.append("") | ||
| 889 | + for item in delta_summary["losses"]: | ||
| 890 | + lines.append( | ||
| 891 | + f"- {item['query']}: {item['baseline']:.6f} -> {item['current']:.6f} ({item['delta']:+.6f})" | ||
| 892 | + ) | ||
| 893 | + | ||
| 894 | + failures = [item for item in trials if item.get("status") != "ok"] | ||
| 895 | + if failures: | ||
| 896 | + lines.extend(["", "## Failures", ""]) | ||
| 897 | + for item in failures: | ||
| 898 | + lines.append(f"- {item.get('name')}: {item.get('error')}") | ||
| 899 | + | ||
| 900 | + return "\n".join(lines) + "\n" | ||
| 901 | + | ||
| 902 | + | ||
| 903 | +def write_leaderboard_csv(run_dir: Path, metric: str, trials: Sequence[Dict[str, Any]], parameter_names: Sequence[str]) -> None: | ||
| 904 | + path = run_dir / "leaderboard.csv" | ||
| 905 | + successes = sorted(all_success_trials(trials), key=lambda item: score_of(item, metric), reverse=True) | ||
| 906 | + with path.open("w", encoding="utf-8", newline="") as handle: | ||
| 907 | + writer = csv.writer(handle) | ||
| 908 | + writer.writerow( | ||
| 909 | + [ | ||
| 910 | + "rank", | ||
| 911 | + "name", | ||
| 912 | + "source", | ||
| 913 | + "score", | ||
| 914 | + "Primary_Metric_Score", | ||
| 915 | + "NDCG@20", | ||
| 916 | + "ERR@10", | ||
| 917 | + "Gain_Recall@20", | ||
| 918 | + "batch_id", | ||
| 919 | + *parameter_names, | ||
| 920 | + ] | ||
| 198 | ) | 921 | ) |
| 199 | - return experiments | 922 | + for idx, item in enumerate(successes, start=1): |
| 923 | + metrics = item.get("aggregate_metrics") or {} | ||
| 924 | + row = [ | ||
| 925 | + idx, | ||
| 926 | + item.get("name") or "", | ||
| 927 | + item.get("source") or "", | ||
| 928 | + f"{score_of(item, metric):.6f}", | ||
| 929 | + metrics.get("Primary_Metric_Score", ""), | ||
| 930 | + metrics.get("NDCG@20", ""), | ||
| 931 | + metrics.get("ERR@10", ""), | ||
| 932 | + metrics.get("Gain_Recall@20", ""), | ||
| 933 | + item.get("batch_id") or "", | ||
| 934 | + ] | ||
| 935 | + row.extend(item.get("params", {}).get(name, "") for name in parameter_names) | ||
| 936 | + writer.writerow(row) | ||
| 200 | 937 | ||
| 201 | 938 | ||
| 202 | -def build_parser() -> argparse.ArgumentParser: | ||
| 203 | - parser = argparse.ArgumentParser(description="Run fusion tuning experiments against the live backend") | ||
| 204 | - parser.add_argument("--tenant-id", default="163") | ||
| 205 | - parser.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | ||
| 206 | - parser.add_argument("--top-k", type=int, default=100) | ||
| 207 | - parser.add_argument("--language", default="en") | ||
| 208 | - parser.add_argument("--experiments-file", required=True) | ||
| 209 | - parser.add_argument("--search-base-url", default="http://127.0.0.1:6002") | ||
| 210 | - parser.add_argument("--score-metric", default="Primary_Metric_Score") | ||
| 211 | - parser.add_argument("--apply-best", action="store_true") | ||
| 212 | - parser.add_argument("--force-refresh-labels-first-pass", action="store_true") | ||
| 213 | - return parser | 939 | +def persist_run_summary( |
| 940 | + *, | ||
| 941 | + run_dir: Path, | ||
| 942 | + run_id: str, | ||
| 943 | + tenant_id: str, | ||
| 944 | + query_count: int, | ||
| 945 | + top_k: int, | ||
| 946 | + metric: str, | ||
| 947 | + trials: Sequence[Dict[str, Any]], | ||
| 948 | + parameter_names: Sequence[str], | ||
| 949 | +) -> None: | ||
| 950 | + summary = { | ||
| 951 | + "run_id": run_id, | ||
| 952 | + "created_at": utc_now_iso(), | ||
| 953 | + "tenant_id": tenant_id, | ||
| 954 | + "query_count": query_count, | ||
| 955 | + "top_k": top_k, | ||
| 956 | + "score_metric": metric, | ||
| 957 | + "trials": list(trials), | ||
| 958 | + } | ||
| 959 | + (run_dir / "summary.json").write_text( | ||
| 960 | + json.dumps(summary, ensure_ascii=False, indent=2), | ||
| 961 | + encoding="utf-8", | ||
| 962 | + ) | ||
| 963 | + (run_dir / "summary.md").write_text( | ||
| 964 | + render_markdown( | ||
| 965 | + run_id=run_id, | ||
| 966 | + created_at=summary["created_at"], | ||
| 967 | + tenant_id=tenant_id, | ||
| 968 | + query_count=query_count, | ||
| 969 | + top_k=top_k, | ||
| 970 | + metric=metric, | ||
| 971 | + trials=trials, | ||
| 972 | + ), | ||
| 973 | + encoding="utf-8", | ||
| 974 | + ) | ||
| 975 | + write_leaderboard_csv(run_dir, metric, trials, parameter_names) | ||
| 214 | 976 | ||
| 215 | 977 | ||
| 216 | -def main() -> None: | ||
| 217 | - args = build_parser().parse_args() | 978 | +def run_experiment_mode(args: argparse.Namespace) -> None: |
| 218 | queries_file = Path(args.queries_file) | 979 | queries_file = Path(args.queries_file) |
| 219 | queries = read_queries(queries_file) | 980 | queries = read_queries(queries_file) |
| 220 | base_config_text = CONFIG_PATH.read_text(encoding="utf-8") | 981 | base_config_text = CONFIG_PATH.read_text(encoding="utf-8") |
| @@ -222,19 +983,33 @@ def main() -> None: | @@ -222,19 +983,33 @@ def main() -> None: | ||
| 222 | experiments = load_experiments(Path(args.experiments_file)) | 983 | experiments = load_experiments(Path(args.experiments_file)) |
| 223 | 984 | ||
| 224 | tuning_dir = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs") | 985 | tuning_dir = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs") |
| 225 | - run_id = f"tuning_{utc_timestamp()}" | 986 | + run_id = args.run_name or f"tuning_{utc_timestamp()}" |
| 226 | run_dir = ensure_dir(tuning_dir / run_id) | 987 | run_dir = ensure_dir(tuning_dir / run_id) |
| 227 | results: List[Dict[str, Any]] = [] | 988 | results: List[Dict[str, Any]] = [] |
| 228 | 989 | ||
| 229 | try: | 990 | try: |
| 230 | for experiment in experiments: | 991 | for experiment in experiments: |
| 231 | - candidate = apply_params(base_config, experiment.params) | 992 | + params = dict(experiment.params) |
| 993 | + target_path = args.target_path or "coarse_rank.fusion" | ||
| 994 | + candidate = apply_target_params(base_config, target_path, params) | ||
| 232 | write_yaml(CONFIG_PATH, candidate) | 995 | write_yaml(CONFIG_PATH, candidate) |
| 233 | - candidate_config_path = run_dir / f"{experiment.name}_config.yaml" | 996 | + candidate_config_path = ensure_dir(run_dir / "configs") / f"{experiment.name}_config.yaml" |
| 234 | write_yaml(candidate_config_path, candidate) | 997 | write_yaml(candidate_config_path, candidate) |
| 235 | 998 | ||
| 236 | - run_restart() | 999 | + ensure_disk_headroom( |
| 1000 | + min_free_gb=args.min_free_gb, | ||
| 1001 | + auto_truncate_logs=args.auto_truncate_logs, | ||
| 1002 | + context=f"restart {experiment.name}", | ||
| 1003 | + ) | ||
| 1004 | + run_restart(args.restart_targets) | ||
| 237 | health = wait_for_backend(args.search_base_url) | 1005 | health = wait_for_backend(args.search_base_url) |
| 1006 | + if args.heal_eval_web: | ||
| 1007 | + ensure_eval_web(args.eval_web_base_url) | ||
| 1008 | + ensure_disk_headroom( | ||
| 1009 | + min_free_gb=args.min_free_gb, | ||
| 1010 | + auto_truncate_logs=args.auto_truncate_logs, | ||
| 1011 | + context=f"batch eval {experiment.name}", | ||
| 1012 | + ) | ||
| 238 | batch_result = run_batch_eval( | 1013 | batch_result = run_batch_eval( |
| 239 | tenant_id=args.tenant_id, | 1014 | tenant_id=args.tenant_id, |
| 240 | queries_file=queries_file, | 1015 | queries_file=queries_file, |
| @@ -242,21 +1017,27 @@ def main() -> None: | @@ -242,21 +1017,27 @@ def main() -> None: | ||
| 242 | language=args.language, | 1017 | language=args.language, |
| 243 | force_refresh_labels=bool(args.force_refresh_labels_first_pass and not results), | 1018 | force_refresh_labels=bool(args.force_refresh_labels_first_pass and not results), |
| 244 | ) | 1019 | ) |
| 245 | - aggregate_metrics = dict(batch_result["aggregate_metrics"]) | 1020 | + ensure_disk_headroom( |
| 1021 | + min_free_gb=args.min_free_gb, | ||
| 1022 | + auto_truncate_logs=args.auto_truncate_logs, | ||
| 1023 | + context=f"persist {experiment.name}", | ||
| 1024 | + ) | ||
| 1025 | + payload = batch_result["payload"] | ||
| 1026 | + aggregate_metrics = dict(payload["aggregate_metrics"]) | ||
| 246 | results.append( | 1027 | results.append( |
| 247 | { | 1028 | { |
| 248 | "name": experiment.name, | 1029 | "name": experiment.name, |
| 249 | "description": experiment.description, | 1030 | "description": experiment.description, |
| 250 | - "params": experiment.params, | 1031 | + "params": params, |
| 251 | "aggregate_metrics": aggregate_metrics, | 1032 | "aggregate_metrics": aggregate_metrics, |
| 252 | "score": float(aggregate_metrics.get(args.score_metric, 0.0)), | 1033 | "score": float(aggregate_metrics.get(args.score_metric, 0.0)), |
| 253 | "batch_id": batch_result["batch_id"], | 1034 | "batch_id": batch_result["batch_id"], |
| 254 | - "batch_report_path": str( | ||
| 255 | - DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_result['batch_id']}.md" | ||
| 256 | - ), | ||
| 257 | - "config_snapshot_path": str(candidate_config_path), | 1035 | + "batch_json_path": batch_result["batch_json_path"], |
| 1036 | + "batch_report_path": batch_result["batch_report_path"], | ||
| 1037 | + "candidate_config_path": str(candidate_config_path), | ||
| 258 | "backend_health": health, | 1038 | "backend_health": health, |
| 259 | - "batch_stdout": batch_result["raw_output"], | 1039 | + "status": "ok", |
| 1040 | + "source": "experiments_file", | ||
| 260 | } | 1041 | } |
| 261 | ) | 1042 | ) |
| 262 | print( | 1043 | print( |
| @@ -265,32 +1046,285 @@ def main() -> None: | @@ -265,32 +1046,285 @@ def main() -> None: | ||
| 265 | ) | 1046 | ) |
| 266 | finally: | 1047 | finally: |
| 267 | if args.apply_best and results: | 1048 | if args.apply_best and results: |
| 268 | - best = max(results, key=lambda item: item["score"]) | ||
| 269 | - best_config = apply_params(base_config, best["params"]) | 1049 | + best = max(results, key=lambda item: score_of(item, args.score_metric)) |
| 1050 | + best_config = apply_target_params(base_config, args.target_path or "coarse_rank.fusion", best["params"]) | ||
| 270 | write_yaml(CONFIG_PATH, best_config) | 1051 | write_yaml(CONFIG_PATH, best_config) |
| 271 | - run_restart() | 1052 | + run_restart(args.restart_targets) |
| 272 | wait_for_backend(args.search_base_url) | 1053 | wait_for_backend(args.search_base_url) |
| 1054 | + if args.heal_eval_web: | ||
| 1055 | + ensure_eval_web(args.eval_web_base_url) | ||
| 273 | else: | 1056 | else: |
| 274 | CONFIG_PATH.write_text(base_config_text, encoding="utf-8") | 1057 | CONFIG_PATH.write_text(base_config_text, encoding="utf-8") |
| 275 | - run_restart() | 1058 | + run_restart(args.restart_targets) |
| 276 | wait_for_backend(args.search_base_url) | 1059 | wait_for_backend(args.search_base_url) |
| 1060 | + if args.heal_eval_web: | ||
| 1061 | + ensure_eval_web(args.eval_web_base_url) | ||
| 277 | 1062 | ||
| 278 | - results.sort(key=lambda item: item["score"], reverse=True) | ||
| 279 | - summary = { | ||
| 280 | - "run_id": run_id, | ||
| 281 | - "created_at": utc_now_iso(), | ||
| 282 | - "tenant_id": args.tenant_id, | ||
| 283 | - "query_count": len(queries), | ||
| 284 | - "top_k": args.top_k, | ||
| 285 | - "score_metric": args.score_metric, | ||
| 286 | - "experiments": results, | ||
| 287 | - } | ||
| 288 | - summary_json_path = run_dir / "summary.json" | ||
| 289 | - summary_md_path = run_dir / "summary.md" | ||
| 290 | - summary_json_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") | ||
| 291 | - summary_md_path.write_text(render_markdown(summary), encoding="utf-8") | ||
| 292 | - print(f"[done] summary_json={summary_json_path}") | ||
| 293 | - print(f"[done] summary_md={summary_md_path}") | 1063 | + persist_run_summary( |
| 1064 | + run_dir=run_dir, | ||
| 1065 | + run_id=run_id, | ||
| 1066 | + tenant_id=str(args.tenant_id), | ||
| 1067 | + query_count=len(queries), | ||
| 1068 | + top_k=args.top_k, | ||
| 1069 | + metric=args.score_metric, | ||
| 1070 | + trials=results, | ||
| 1071 | + parameter_names=list(results[0]["params"].keys()) if results else [], | ||
| 1072 | + ) | ||
| 1073 | + print(f"[done] summary_json={run_dir / 'summary.json'}") | ||
| 1074 | + print(f"[done] summary_md={run_dir / 'summary.md'}") | ||
| 1075 | + | ||
| 1076 | + | ||
| 1077 | +def run_optimize_mode(args: argparse.Namespace) -> None: | ||
| 1078 | + queries_file = Path(args.queries_file) | ||
| 1079 | + queries = read_queries(queries_file) | ||
| 1080 | + base_config_text = CONFIG_PATH.read_text(encoding="utf-8") | ||
| 1081 | + base_config = load_yaml(CONFIG_PATH) | ||
| 1082 | + search_space_path = Path(args.search_space) | ||
| 1083 | + space = load_search_space(search_space_path) | ||
| 1084 | + rng = random.Random(args.random_seed) | ||
| 1085 | + | ||
| 1086 | + tuning_dir = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs") | ||
| 1087 | + run_dir = ( | ||
| 1088 | + Path(args.resume_run).resolve() | ||
| 1089 | + if args.resume_run | ||
| 1090 | + else ensure_dir(tuning_dir / (args.run_name or f"coarse_fusion_bo_{utc_timestamp()}")) | ||
| 1091 | + ) | ||
| 1092 | + run_id = run_dir.name | ||
| 1093 | + ensure_dir(run_dir / "configs") | ||
| 1094 | + ensure_dir(run_dir / "logs") | ||
| 1095 | + if not (run_dir / "search_space.yaml").exists(): | ||
| 1096 | + (run_dir / "search_space.yaml").write_text(search_space_path.read_text(encoding="utf-8"), encoding="utf-8") | ||
| 1097 | + | ||
| 1098 | + trials = load_existing_trials(run_dir) | ||
| 1099 | + if args.seed_report: | ||
| 1100 | + baseline_params = space.fill_params(space.baseline) | ||
| 1101 | + baseline_key = space.canonical_key(baseline_params) | ||
| 1102 | + if baseline_key not in {space.canonical_key(item["params"]) for item in trials if item.get("params")}: | ||
| 1103 | + payload = load_batch_payload(args.seed_report) | ||
| 1104 | + trial = { | ||
| 1105 | + "trial_id": next_trial_name(trials, "trial"), | ||
| 1106 | + "name": "seed_baseline", | ||
| 1107 | + "description": f"seeded from {args.seed_report}", | ||
| 1108 | + "source": "seed_report", | ||
| 1109 | + "is_seed": True, | ||
| 1110 | + "status": "ok", | ||
| 1111 | + "created_at": utc_now_iso(), | ||
| 1112 | + "params": baseline_params, | ||
| 1113 | + "score": float(payload["aggregate_metrics"].get(args.score_metric, 0.0)), | ||
| 1114 | + "aggregate_metrics": dict(payload["aggregate_metrics"]), | ||
| 1115 | + "batch_id": payload["batch_id"], | ||
| 1116 | + "batch_json_path": str(resolve_batch_json_path(args.seed_report)), | ||
| 1117 | + "batch_report_path": str(resolve_batch_json_path(args.seed_report).with_suffix(".md")), | ||
| 1118 | + } | ||
| 1119 | + append_trial(run_dir, trial) | ||
| 1120 | + trials.append(trial) | ||
| 1121 | + | ||
| 1122 | + init_random = args.init_random if args.init_random is not None else space.init_random | ||
| 1123 | + candidate_pool_size = args.candidate_pool_size if args.candidate_pool_size is not None else space.candidate_pool_size | ||
| 1124 | + | ||
| 1125 | + try: | ||
| 1126 | + live_done = len(live_success_trials(trials)) | ||
| 1127 | + while live_done < args.max_evals: | ||
| 1128 | + remaining = args.max_evals - live_done | ||
| 1129 | + current_batch_size = min(args.batch_size, remaining) | ||
| 1130 | + proposals = propose_candidates( | ||
| 1131 | + space=space, | ||
| 1132 | + trials=trials, | ||
| 1133 | + metric=args.score_metric, | ||
| 1134 | + batch_size=current_batch_size, | ||
| 1135 | + rng=rng, | ||
| 1136 | + init_random=init_random, | ||
| 1137 | + candidate_pool_size=candidate_pool_size, | ||
| 1138 | + ) | ||
| 1139 | + if not proposals: | ||
| 1140 | + raise RuntimeError("optimizer failed to produce new candidate proposals") | ||
| 1141 | + | ||
| 1142 | + for proposal in proposals: | ||
| 1143 | + force_refresh_labels = bool(args.force_refresh_labels_first_pass and live_done == 0 and not any(t.get("is_seed") for t in trials)) | ||
| 1144 | + trial_id = next_trial_name(trials, "trial") | ||
| 1145 | + candidate_config = apply_target_params(base_config, space.target_path, proposal.params) | ||
| 1146 | + candidate_config_path = run_dir / "configs" / f"{trial_id}_{proposal.name}.yaml" | ||
| 1147 | + trial_log_path = run_dir / "logs" / f"{trial_id}_{proposal.name}.log" | ||
| 1148 | + write_yaml(CONFIG_PATH, candidate_config) | ||
| 1149 | + write_yaml(candidate_config_path, candidate_config) | ||
| 1150 | + print( | ||
| 1151 | + f"[tune] start {proposal.name} source={proposal.source} " | ||
| 1152 | + f"params={json.dumps(proposal.params, ensure_ascii=False, sort_keys=True)}" | ||
| 1153 | + ) | ||
| 1154 | + try: | ||
| 1155 | + ensure_disk_headroom( | ||
| 1156 | + min_free_gb=args.min_free_gb, | ||
| 1157 | + auto_truncate_logs=args.auto_truncate_logs, | ||
| 1158 | + context=f"restart {proposal.name}", | ||
| 1159 | + ) | ||
| 1160 | + run_restart(args.restart_targets) | ||
| 1161 | + backend_health = wait_for_backend(args.search_base_url) | ||
| 1162 | + verify_backend_config(args.search_base_url, space.target_path, proposal.params) | ||
| 1163 | + if args.heal_eval_web: | ||
| 1164 | + ensure_eval_web(args.eval_web_base_url) | ||
| 1165 | + ensure_disk_headroom( | ||
| 1166 | + min_free_gb=args.min_free_gb, | ||
| 1167 | + auto_truncate_logs=args.auto_truncate_logs, | ||
| 1168 | + context=f"batch eval {proposal.name}", | ||
| 1169 | + ) | ||
| 1170 | + batch_result = run_batch_eval( | ||
| 1171 | + tenant_id=args.tenant_id, | ||
| 1172 | + queries_file=queries_file, | ||
| 1173 | + top_k=args.top_k, | ||
| 1174 | + language=args.language, | ||
| 1175 | + force_refresh_labels=force_refresh_labels, | ||
| 1176 | + ) | ||
| 1177 | + ensure_disk_headroom( | ||
| 1178 | + min_free_gb=args.min_free_gb, | ||
| 1179 | + auto_truncate_logs=args.auto_truncate_logs, | ||
| 1180 | + context=f"persist {proposal.name}", | ||
| 1181 | + ) | ||
| 1182 | + payload = batch_result["payload"] | ||
| 1183 | + trial_log_path.write_text(batch_result["raw_output"], encoding="utf-8") | ||
| 1184 | + aggregate_metrics = dict(payload["aggregate_metrics"]) | ||
| 1185 | + trial = { | ||
| 1186 | + "trial_id": trial_id, | ||
| 1187 | + "name": proposal.name, | ||
| 1188 | + "description": proposal.description, | ||
| 1189 | + "source": proposal.source, | ||
| 1190 | + "is_seed": False, | ||
| 1191 | + "status": "ok", | ||
| 1192 | + "created_at": utc_now_iso(), | ||
| 1193 | + "params": proposal.params, | ||
| 1194 | + "score": float(aggregate_metrics.get(args.score_metric, 0.0)), | ||
| 1195 | + "aggregate_metrics": aggregate_metrics, | ||
| 1196 | + "batch_id": batch_result["batch_id"], | ||
| 1197 | + "batch_json_path": batch_result["batch_json_path"], | ||
| 1198 | + "batch_report_path": batch_result["batch_report_path"], | ||
| 1199 | + "candidate_config_path": str(candidate_config_path), | ||
| 1200 | + "trial_log_path": str(trial_log_path), | ||
| 1201 | + "backend_health": backend_health, | ||
| 1202 | + } | ||
| 1203 | + print( | ||
| 1204 | + f"[tune] done {proposal.name} " | ||
| 1205 | + f"{args.score_metric}={trial['score']:.6f} " | ||
| 1206 | + f"Primary={aggregate_metrics.get('Primary_Metric_Score')}" | ||
| 1207 | + ) | ||
| 1208 | + except Exception as exc: # noqa: BLE001 | ||
| 1209 | + trial = { | ||
| 1210 | + "trial_id": trial_id, | ||
| 1211 | + "name": proposal.name, | ||
| 1212 | + "description": proposal.description, | ||
| 1213 | + "source": proposal.source, | ||
| 1214 | + "is_seed": False, | ||
| 1215 | + "status": "error", | ||
| 1216 | + "created_at": utc_now_iso(), | ||
| 1217 | + "params": proposal.params, | ||
| 1218 | + "error": str(exc), | ||
| 1219 | + "candidate_config_path": str(candidate_config_path), | ||
| 1220 | + "trial_log_path": str(trial_log_path), | ||
| 1221 | + } | ||
| 1222 | + print(f"[tune] error {proposal.name}: {exc}") | ||
| 1223 | + ensure_disk_headroom( | ||
| 1224 | + min_free_gb=args.min_free_gb, | ||
| 1225 | + auto_truncate_logs=args.auto_truncate_logs, | ||
| 1226 | + context=f"error-persist {proposal.name}", | ||
| 1227 | + ) | ||
| 1228 | + append_trial(run_dir, trial) | ||
| 1229 | + trials.append(trial) | ||
| 1230 | + ensure_disk_headroom( | ||
| 1231 | + min_free_gb=args.min_free_gb, | ||
| 1232 | + auto_truncate_logs=args.auto_truncate_logs, | ||
| 1233 | + context=f"summary {proposal.name}", | ||
| 1234 | + ) | ||
| 1235 | + persist_run_summary( | ||
| 1236 | + run_dir=run_dir, | ||
| 1237 | + run_id=run_id, | ||
| 1238 | + tenant_id=str(args.tenant_id), | ||
| 1239 | + query_count=len(queries), | ||
| 1240 | + top_k=args.top_k, | ||
| 1241 | + metric=args.score_metric, | ||
| 1242 | + trials=trials, | ||
| 1243 | + parameter_names=space.parameter_names, | ||
| 1244 | + ) | ||
| 1245 | + if trial.get("status") == "ok": | ||
| 1246 | + live_done += 1 | ||
| 1247 | + if live_done >= args.max_evals: | ||
| 1248 | + break | ||
| 1249 | + finally: | ||
| 1250 | + if args.apply_best: | ||
| 1251 | + successes = all_success_trials(trials) | ||
| 1252 | + best_live = max(successes, key=lambda item: score_of(item, args.score_metric)) if successes else None | ||
| 1253 | + if best_live: | ||
| 1254 | + best_config = apply_target_params(base_config, space.target_path, best_live["params"]) | ||
| 1255 | + write_yaml(CONFIG_PATH, best_config) | ||
| 1256 | + run_restart(args.restart_targets) | ||
| 1257 | + wait_for_backend(args.search_base_url) | ||
| 1258 | + if args.heal_eval_web: | ||
| 1259 | + ensure_eval_web(args.eval_web_base_url) | ||
| 1260 | + else: | ||
| 1261 | + CONFIG_PATH.write_text(base_config_text, encoding="utf-8") | ||
| 1262 | + run_restart(args.restart_targets) | ||
| 1263 | + wait_for_backend(args.search_base_url) | ||
| 1264 | + if args.heal_eval_web: | ||
| 1265 | + ensure_eval_web(args.eval_web_base_url) | ||
| 1266 | + | ||
| 1267 | + persist_run_summary( | ||
| 1268 | + run_dir=run_dir, | ||
| 1269 | + run_id=run_id, | ||
| 1270 | + tenant_id=str(args.tenant_id), | ||
| 1271 | + query_count=len(queries), | ||
| 1272 | + top_k=args.top_k, | ||
| 1273 | + metric=args.score_metric, | ||
| 1274 | + trials=trials, | ||
| 1275 | + parameter_names=space.parameter_names, | ||
| 1276 | + ) | ||
| 1277 | + print(f"[done] run_dir={run_dir}") | ||
| 1278 | + print(f"[done] summary_json={run_dir / 'summary.json'}") | ||
| 1279 | + print(f"[done] summary_md={run_dir / 'summary.md'}") | ||
| 1280 | + print(f"[done] leaderboard_csv={run_dir / 'leaderboard.csv'}") | ||
| 1281 | + | ||
| 1282 | + | ||
| 1283 | +def build_parser() -> argparse.ArgumentParser: | ||
| 1284 | + parser = argparse.ArgumentParser( | ||
| 1285 | + description="Tune coarse/fusion params against the live backend with adaptive Bayesian-style search." | ||
| 1286 | + ) | ||
| 1287 | + parser.add_argument("--mode", choices=["optimize", "experiments"], default="optimize") | ||
| 1288 | + parser.add_argument("--tenant-id", default="163") | ||
| 1289 | + parser.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | ||
| 1290 | + parser.add_argument("--top-k", type=int, default=100) | ||
| 1291 | + parser.add_argument("--language", default="en") | ||
| 1292 | + parser.add_argument("--search-base-url", default="http://127.0.0.1:6002") | ||
| 1293 | + parser.add_argument("--eval-web-base-url", default="http://127.0.0.1:6010") | ||
| 1294 | + parser.add_argument("--score-metric", default="Primary_Metric_Score") | ||
| 1295 | + parser.add_argument("--restart-targets", nargs="+", default=["backend"]) | ||
| 1296 | + parser.add_argument("--heal-eval-web", action=argparse.BooleanOptionalAction, default=True) | ||
| 1297 | + parser.add_argument("--force-refresh-labels-first-pass", action="store_true") | ||
| 1298 | + parser.add_argument("--apply-best", action="store_true") | ||
| 1299 | + parser.add_argument("--run-name", default=None) | ||
| 1300 | + | ||
| 1301 | + parser.add_argument("--experiments-file") | ||
| 1302 | + parser.add_argument("--target-path", default="coarse_rank.fusion") | ||
| 1303 | + | ||
| 1304 | + parser.add_argument( | ||
| 1305 | + "--search-space", | ||
| 1306 | + default=str(PROJECT_ROOT / "scripts" / "evaluation" / "tuning" / "coarse_rank_fusion_space.yaml"), | ||
| 1307 | + ) | ||
| 1308 | + parser.add_argument("--seed-report", default=None) | ||
| 1309 | + parser.add_argument("--resume-run", default=None) | ||
| 1310 | + parser.add_argument("--max-evals", type=int, default=12) | ||
| 1311 | + parser.add_argument("--batch-size", type=int, default=3) | ||
| 1312 | + parser.add_argument("--init-random", type=int, default=None) | ||
| 1313 | + parser.add_argument("--candidate-pool-size", type=int, default=None) | ||
| 1314 | + parser.add_argument("--random-seed", type=int, default=20260415) | ||
| 1315 | + parser.add_argument("--min-free-gb", type=float, default=5.0) | ||
| 1316 | + parser.add_argument("--auto-truncate-logs", action=argparse.BooleanOptionalAction, default=True) | ||
| 1317 | + return parser | ||
| 1318 | + | ||
| 1319 | + | ||
| 1320 | +def main() -> None: | ||
| 1321 | + args = build_parser().parse_args() | ||
| 1322 | + if args.mode == "experiments": | ||
| 1323 | + if not args.experiments_file: | ||
| 1324 | + raise SystemExit("--experiments-file is required when --mode=experiments") | ||
| 1325 | + run_experiment_mode(args) | ||
| 1326 | + return | ||
| 1327 | + run_optimize_mode(args) | ||
| 294 | 1328 | ||
| 295 | 1329 | ||
| 296 | if __name__ == "__main__": | 1330 | if __name__ == "__main__": |
| @@ -0,0 +1,71 @@ | @@ -0,0 +1,71 @@ | ||
| 1 | +# Coarse Fusion 长跑调参 | ||
| 2 | + | ||
| 3 | +## 启动一轮长跑 | ||
| 4 | + | ||
| 5 | +```bash | ||
| 6 | +./scripts/evaluation/start_coarse_fusion_tuning_long.sh | ||
| 7 | +``` | ||
| 8 | + | ||
| 9 | +可用环境变量: | ||
| 10 | + | ||
| 11 | +```bash | ||
| 12 | +MAX_EVALS=48 BATCH_SIZE=3 CANDIDATE_POOL_SIZE=512 \ | ||
| 13 | +RUN_NAME=coarse_fusion_long_001 \ | ||
| 14 | +./scripts/evaluation/start_coarse_fusion_tuning_long.sh | ||
| 15 | +``` | ||
| 16 | + | ||
| 17 | +启动后会打印: | ||
| 18 | + | ||
| 19 | +- `run_name` | ||
| 20 | +- `pid` | ||
| 21 | +- `log` | ||
| 22 | +- `run_dir` | ||
| 23 | + | ||
| 24 | +默认搜索空间: | ||
| 25 | + | ||
| 26 | +- `scripts/evaluation/tuning/coarse_rank_fusion_space.yaml` | ||
| 27 | + | ||
| 28 | +默认 baseline seed: | ||
| 29 | + | ||
| 30 | +- `artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md` | ||
| 31 | + | ||
| 32 | +## 查看进度 | ||
| 33 | + | ||
| 34 | +```bash | ||
| 35 | +tail -f artifacts/search_evaluation/tuning_launches/<run_name>.log | ||
| 36 | +cat artifacts/search_evaluation/tuning_runs/<run_name>/leaderboard.csv | ||
| 37 | +sed -n '1,200p' artifacts/search_evaluation/tuning_runs/<run_name>/summary.md | ||
| 38 | +``` | ||
| 39 | + | ||
| 40 | +实时记录文件: | ||
| 41 | + | ||
| 42 | +- `trials.jsonl` | ||
| 43 | +- `leaderboard.csv` | ||
| 44 | +- `summary.json` | ||
| 45 | +- `summary.md` | ||
| 46 | + | ||
| 47 | +## 续跑 | ||
| 48 | + | ||
| 49 | +```bash | ||
| 50 | +./scripts/evaluation/resume_coarse_fusion_tuning_long.sh <run_name> | ||
| 51 | +``` | ||
| 52 | + | ||
| 53 | +也可直接传完整目录: | ||
| 54 | + | ||
| 55 | +```bash | ||
| 56 | +./scripts/evaluation/resume_coarse_fusion_tuning_long.sh \ | ||
| 57 | + artifacts/search_evaluation/tuning_runs/<run_name> | ||
| 58 | +``` | ||
| 59 | + | ||
| 60 | +## 停止 | ||
| 61 | + | ||
| 62 | +```bash | ||
| 63 | +kill "$(cat artifacts/search_evaluation/tuning_launches/<run_name>.pid)" | ||
| 64 | +``` | ||
| 65 | + | ||
| 66 | +## 说明 | ||
| 67 | + | ||
| 68 | +- 每轮会自动写入 `config/config.yaml` | ||
| 69 | +- 每轮会自动执行 `./restart.sh backend` | ||
| 70 | +- 如果 `eval-web` 因 backend 重启不可用,调参器会尝试补拉起 `eval-web` | ||
| 71 | +- 默认不 `apply-best`,跑完后会恢复 baseline 配置 |
scripts/evaluation/tuning/coarse_rank_fusion_space.yaml
0 → 100644
| @@ -0,0 +1,153 @@ | @@ -0,0 +1,153 @@ | ||
| 1 | +target_path: coarse_rank.fusion | ||
| 2 | + | ||
| 3 | +baseline: | ||
| 4 | + es_bias: 10.0 | ||
| 5 | + es_exponent: 0.05 | ||
| 6 | + text_bias: 0.1 | ||
| 7 | + text_exponent: 0.35 | ||
| 8 | + text_translation_weight: 1.0 | ||
| 9 | + knn_text_weight: 1.0 | ||
| 10 | + knn_image_weight: 2.0 | ||
| 11 | + knn_tie_breaker: 0.3 | ||
| 12 | + knn_bias: 0.2 | ||
| 13 | + knn_exponent: 5.6 | ||
| 14 | + knn_text_bias: 0.2 | ||
| 15 | + knn_text_exponent: 0.0 | ||
| 16 | + knn_image_bias: 0.2 | ||
| 17 | + knn_image_exponent: 0.0 | ||
| 18 | + | ||
| 19 | +parameters: | ||
| 20 | + es_bias: {min: 0.3, max: 80.0, scale: log, round: 4} | ||
| 21 | + es_exponent: {min: 0.0, max: 0.4, scale: linear, round: 4} | ||
| 22 | + text_bias: {min: 0.001, max: 4.0, scale: log, round: 4} | ||
| 23 | + text_exponent: {min: 0.02, max: 1.6, scale: linear, round: 4} | ||
| 24 | + text_translation_weight: {min: 0.1, max: 2.5, scale: linear, round: 4} | ||
| 25 | + knn_text_weight: {min: 0.1, max: 4.0, scale: linear, round: 4} | ||
| 26 | + knn_image_weight: {min: 0.1, max: 6.0, scale: linear, round: 4} | ||
| 27 | + knn_tie_breaker: {min: 0.0, max: 1.0, scale: linear, round: 4} | ||
| 28 | + knn_bias: {min: 0.001, max: 4.0, scale: log, round: 4} | ||
| 29 | + knn_exponent: {min: 0.05, max: 12.0, scale: log, round: 4} | ||
| 30 | + knn_text_bias: {min: 0.001, max: 4.0, scale: log, round: 4} | ||
| 31 | + knn_text_exponent: {min: 0.0, max: 6.0, scale: linear, round: 4} | ||
| 32 | + knn_image_bias: {min: 0.001, max: 4.0, scale: log, round: 4} | ||
| 33 | + knn_image_exponent: {min: 0.0, max: 6.0, scale: linear, round: 4} | ||
| 34 | + | ||
| 35 | +seed_experiments: | ||
| 36 | + - name: seed_knn_soften | ||
| 37 | + description: 压低 knn 全局指数,先验证当前 5.6 是否过猛 | ||
| 38 | + params: | ||
| 39 | + text_exponent: 0.42 | ||
| 40 | + knn_image_weight: 1.2 | ||
| 41 | + knn_bias: 0.35 | ||
| 42 | + knn_exponent: 1.4 | ||
| 43 | + - name: seed_text_guard | ||
| 44 | + description: 提升 lexical 稳定性,抑制翻译与 image knn 过度主导 | ||
| 45 | + params: | ||
| 46 | + text_exponent: 0.62 | ||
| 47 | + text_translation_weight: 0.75 | ||
| 48 | + knn_image_weight: 1.0 | ||
| 49 | + knn_tie_breaker: 0.15 | ||
| 50 | + knn_exponent: 2.2 | ||
| 51 | + - name: seed_semantic_balanced | ||
| 52 | + description: 让 text/image knn 都参与,但降低 image 偏置和总指数 | ||
| 53 | + params: | ||
| 54 | + text_exponent: 0.32 | ||
| 55 | + knn_text_weight: 1.4 | ||
| 56 | + knn_image_weight: 1.8 | ||
| 57 | + knn_tie_breaker: 0.45 | ||
| 58 | + knn_bias: 0.18 | ||
| 59 | + knn_exponent: 3.0 | ||
| 60 | + - name: seed_component_exp | ||
| 61 | + description: 打开 knn_text/image 子项指数,观察全局 knn_exponent 是否可下放 | ||
| 62 | + params: | ||
| 63 | + knn_bias: 0.15 | ||
| 64 | + knn_exponent: 1.6 | ||
| 65 | + knn_text_exponent: 0.8 | ||
| 66 | + knn_image_exponent: 0.4 | ||
| 67 | + - name: seed_es_relax | ||
| 68 | + description: 增强 es 因子的区分度,验证 coarse 是否过分压平 lexical 分数 | ||
| 69 | + params: | ||
| 70 | + es_bias: 3.0 | ||
| 71 | + es_exponent: 0.11 | ||
| 72 | + text_exponent: 0.48 | ||
| 73 | + knn_exponent: 2.6 | ||
| 74 | + - name: seed_image_heavy | ||
| 75 | + description: 刻意放大 image knn 做对照,看哪些 query 会明显受损 | ||
| 76 | + params: | ||
| 77 | + text_exponent: 0.22 | ||
| 78 | + knn_text_weight: 0.9 | ||
| 79 | + knn_image_weight: 3.4 | ||
| 80 | + knn_tie_breaker: 0.55 | ||
| 81 | + knn_bias: 0.12 | ||
| 82 | + knn_exponent: 3.8 | ||
| 83 | + - name: seed_high_knn_global | ||
| 84 | + description: 沿着 baseline 继续上探更强 knn 全局指数,验证 5.6 是否仍偏保守 | ||
| 85 | + params: | ||
| 86 | + text_exponent: 0.28 | ||
| 87 | + knn_text_weight: 1.1 | ||
| 88 | + knn_image_weight: 2.6 | ||
| 89 | + knn_tie_breaker: 0.4 | ||
| 90 | + knn_bias: 0.12 | ||
| 91 | + knn_exponent: 7.2 | ||
| 92 | + - name: seed_text_knn_split | ||
| 93 | + description: 提高 text knn,压低 image knn,同时打开 text/image 子项指数 | ||
| 94 | + params: | ||
| 95 | + text_exponent: 0.38 | ||
| 96 | + knn_text_weight: 2.0 | ||
| 97 | + knn_image_weight: 0.8 | ||
| 98 | + knn_tie_breaker: 0.2 | ||
| 99 | + knn_bias: 0.08 | ||
| 100 | + knn_exponent: 4.8 | ||
| 101 | + knn_text_exponent: 1.1 | ||
| 102 | + knn_image_exponent: 0.15 | ||
| 103 | + - name: seed_image_split | ||
| 104 | + description: 保持较高 image 权重,但把非线性拆到 image 子项而不是全局 knn | ||
| 105 | + params: | ||
| 106 | + text_exponent: 0.26 | ||
| 107 | + knn_text_weight: 0.9 | ||
| 108 | + knn_image_weight: 3.0 | ||
| 109 | + knn_tie_breaker: 0.35 | ||
| 110 | + knn_bias: 0.08 | ||
| 111 | + knn_exponent: 3.4 | ||
| 112 | + knn_text_exponent: 0.2 | ||
| 113 | + knn_image_exponent: 1.0 | ||
| 114 | + - name: seed_es_text_sharpen | ||
| 115 | + description: 提升 es 与 lexical 区分度,测试 coarse 是否需要更强文本排序稳定性 | ||
| 116 | + params: | ||
| 117 | + es_bias: 2.0 | ||
| 118 | + es_exponent: 0.16 | ||
| 119 | + text_bias: 0.03 | ||
| 120 | + text_exponent: 0.78 | ||
| 121 | + text_translation_weight: 0.9 | ||
| 122 | + knn_bias: 0.1 | ||
| 123 | + knn_exponent: 5.0 | ||
| 124 | + - name: seed_translation_discount | ||
| 125 | + description: 明显削弱 translation 命中,验证抽象 query 是否过度依赖翻译通路 | ||
| 126 | + params: | ||
| 127 | + text_exponent: 0.44 | ||
| 128 | + text_translation_weight: 0.45 | ||
| 129 | + knn_text_weight: 1.2 | ||
| 130 | + knn_image_weight: 1.7 | ||
| 131 | + knn_tie_breaker: 0.25 | ||
| 132 | + knn_exponent: 5.4 | ||
| 133 | + - name: seed_near_baseline_jitter | ||
| 134 | + description: 贴近 baseline 做小扰动,优先寻找可行增益而不是只测极端方向 | ||
| 135 | + params: | ||
| 136 | + es_bias: 8.0 | ||
| 137 | + es_exponent: 0.06 | ||
| 138 | + text_bias: 0.06 | ||
| 139 | + text_exponent: 0.31 | ||
| 140 | + text_translation_weight: 1.1 | ||
| 141 | + knn_text_weight: 1.1 | ||
| 142 | + knn_image_weight: 2.2 | ||
| 143 | + knn_tie_breaker: 0.34 | ||
| 144 | + knn_bias: 0.16 | ||
| 145 | + knn_exponent: 5.9 | ||
| 146 | + | ||
| 147 | +optimizer: | ||
| 148 | + init_random: 8 | ||
| 149 | + candidate_pool_size: 512 | ||
| 150 | + explore_probability: 0.28 | ||
| 151 | + local_jitter_probability: 0.42 | ||
| 152 | + elite_fraction: 0.35 | ||
| 153 | + min_normalized_distance: 0.12 |
scripts/service_ctl.sh
| @@ -213,6 +213,7 @@ health_path_for_service() { | @@ -213,6 +213,7 @@ health_path_for_service() { | ||
| 213 | local service="$1" | 213 | local service="$1" |
| 214 | case "${service}" in | 214 | case "${service}" in |
| 215 | backend|indexer|embedding|embedding-image|translator|reranker|reranker-fine|tei) echo "/health" ;; | 215 | backend|indexer|embedding|embedding-image|translator|reranker|reranker-fine|tei) echo "/health" ;; |
| 216 | + eval-web) echo "/api/history" ;; | ||
| 216 | *) echo "" ;; | 217 | *) echo "" ;; |
| 217 | esac | 218 | esac |
| 218 | } | 219 | } |
| @@ -469,7 +470,7 @@ monitor_services() { | @@ -469,7 +470,7 @@ monitor_services() { | ||
| 469 | if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then | 470 | if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then |
| 470 | monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)" | 471 | monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)" |
| 471 | if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then | 472 | if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then |
| 472 | - python "${wechat_alert_py}" \ | 473 | + "$(config_python_bin)" "${wechat_alert_py}" \ |
| 473 | --service "${svc}" \ | 474 | --service "${svc}" \ |
| 474 | --level "error" \ | 475 | --level "error" \ |
| 475 | --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。" | 476 | --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。" |
| @@ -479,7 +480,7 @@ monitor_services() { | @@ -479,7 +480,7 @@ monitor_services() { | ||
| 479 | 480 | ||
| 480 | monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures" | 481 | monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures" |
| 481 | if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then | 482 | if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then |
| 482 | - python "${wechat_alert_py}" \ | 483 | + "$(config_python_bin)" "${wechat_alert_py}" \ |
| 483 | --service "${svc}" \ | 484 | --service "${svc}" \ |
| 484 | --level "error" \ | 485 | --level "error" \ |
| 485 | --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。" | 486 | --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。" |
| @@ -494,7 +495,7 @@ monitor_services() { | @@ -494,7 +495,7 @@ monitor_services() { | ||
| 494 | restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" | 495 | restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" |
| 495 | monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")" | 496 | monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")" |
| 496 | if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then | 497 | if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then |
| 497 | - python "${wechat_alert_py}" \ | 498 | + "$(config_python_bin)" "${wechat_alert_py}" \ |
| 498 | --service "${svc}" \ | 499 | --service "${svc}" \ |
| 499 | --level "error" \ | 500 | --level "error" \ |
| 500 | --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")." | 501 | --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")." |
| @@ -609,7 +610,13 @@ is_running_by_port() { | @@ -609,7 +610,13 @@ is_running_by_port() { | ||
| 609 | local service="$1" | 610 | local service="$1" |
| 610 | local port | 611 | local port |
| 611 | port="$(get_port "${service}")" | 612 | port="$(get_port "${service}")" |
| 612 | - [ -n "${port}" ] && lsof -ti:"${port}" >/dev/null 2>&1 | 613 | + [ -n "${port}" ] && lsof -nP -iTCP:"${port}" -sTCP:LISTEN -t >/dev/null 2>&1 |
| 614 | +} | ||
| 615 | + | ||
| 616 | +list_listen_pids_by_port() { | ||
| 617 | + local port="$1" | ||
| 618 | + [ -n "${port}" ] || return 0 | ||
| 619 | + lsof -nP -iTCP:"${port}" -sTCP:LISTEN -t 2>/dev/null || true | ||
| 613 | } | 620 | } |
| 614 | 621 | ||
| 615 | is_running_tei_container() { | 622 | is_running_tei_container() { |
| @@ -794,14 +801,14 @@ stop_one() { | @@ -794,14 +801,14 @@ stop_one() { | ||
| 794 | port="$(get_port "${service}")" | 801 | port="$(get_port "${service}")" |
| 795 | if [ -n "${port}" ]; then | 802 | if [ -n "${port}" ]; then |
| 796 | local pids | 803 | local pids |
| 797 | - pids="$(lsof -ti:${port} 2>/dev/null || true)" | 804 | + pids="$(list_listen_pids_by_port "${port}")" |
| 798 | if [ -n "${pids}" ]; then | 805 | if [ -n "${pids}" ]; then |
| 799 | echo "[stop] ${service} port=${port} pids=${pids}" | 806 | echo "[stop] ${service} port=${port} pids=${pids}" |
| 800 | for pid in ${pids}; do | 807 | for pid in ${pids}; do |
| 801 | kill -TERM "${pid}" 2>/dev/null || true | 808 | kill -TERM "${pid}" 2>/dev/null || true |
| 802 | done | 809 | done |
| 803 | sleep 1 | 810 | sleep 1 |
| 804 | - pids="$(lsof -ti:${port} 2>/dev/null || true)" | 811 | + pids="$(list_listen_pids_by_port "${port}")" |
| 805 | for pid in ${pids}; do | 812 | for pid in ${pids}; do |
| 806 | kill -KILL "${pid}" 2>/dev/null || true | 813 | kill -KILL "${pid}" 2>/dev/null || true |
| 807 | done | 814 | done |
| @@ -854,7 +861,7 @@ status_one() { | @@ -854,7 +861,7 @@ status_one() { | ||
| 854 | pid_info="$(cat "$(pid_file "${service}")" 2>/dev/null || echo "-")" | 861 | pid_info="$(cat "$(pid_file "${service}")" 2>/dev/null || echo "-")" |
| 855 | elif is_running_by_port "${service}"; then | 862 | elif is_running_by_port "${service}"; then |
| 856 | running="yes" | 863 | running="yes" |
| 857 | - pid_info="$(lsof -ti:${port} 2>/dev/null | tr '\n' ',' | sed 's/,$//' || echo "-")" | 864 | + pid_info="$(list_listen_pids_by_port "${port}" | tr '\n' ',' | sed 's/,$//' || echo "-")" |
| 858 | fi | 865 | fi |
| 859 | 866 | ||
| 860 | if [ "${running}" = "yes" ]; then | 867 | if [ "${running}" = "yes" ]; then |