Commit d172c2591757cdbb0104c872cc9e04ee79bee724
1 parent
3ac1f8d1
eval框架
Showing
7 changed files
with
584 additions
and
17 deletions
Show diff stats
| ... | ... | @@ -0,0 +1,151 @@ |
| 1 | + | |
| 2 | + | |
| 3 | +参考资料: | |
| 4 | + | |
| 5 | +1. 搜索接口: | |
| 6 | + | |
| 7 | +```bash | |
| 8 | +export BASE_URL="${BASE_URL:-http://localhost:6002}" | |
| 9 | +export TENANT_ID="${TENANT_ID:-163}" # 改成你的租户ID | |
| 10 | +``` | |
| 11 | +```bash | |
| 12 | +curl -sS "$BASE_URL/search/" \ | |
| 13 | + -H "Content-Type: application/json" \ | |
| 14 | + -H "X-Tenant-ID: $TENANT_ID" \ | |
| 15 | + -d '{ | |
| 16 | + "query": "芭比娃娃", | |
| 17 | + "size": 20, | |
| 18 | + "from": 0, | |
| 19 | + "language": "zh" | |
| 20 | + }' | |
| 21 | +``` | |
| 22 | + | |
| 23 | +response: | |
| 24 | +{ | |
| 25 | + "results": [ | |
| 26 | + { | |
| 27 | + "spu_id": "12345", | |
| 28 | + "title": "芭比时尚娃娃", | |
| 29 | + "image_url": "https://example.com/image.jpg", | |
| 30 | + "specifications":[], | |
| 31 | + "skus":[{"sku_id":" ... | |
| 32 | +... | |
| 33 | + | |
| 34 | +2. 重排服务: | |
| 35 | +curl -X POST "http://localhost:6007/rerank" \ | |
| 36 | + -H "Content-Type: application/json" \ | |
| 37 | + -d '{ | |
| 38 | + "query": "玩具 芭比", | |
| 39 | + "docs": ["12PCS 6 Types of Dolls with Bottles", "纯棉T恤 短袖"], | |
| 40 | + "top_n":386, | |
| 41 | + "normalize": true | |
| 42 | + }' | |
| 43 | + | |
| 44 | + | |
| 45 | +3. 基于指定字段查询:es_debug_search.py | |
| 46 | + | |
| 47 | + | |
| 48 | +主要任务: | |
| 49 | +1. 评估工具的建立: | |
| 50 | +注意判断结果好坏,要用统一的评估工具,不要对每个query设定关键词匹配的规则来判断是否符合要求,这样不可扩展,这种方式既容易误判又实现复杂,并且不好扩展到其他搜索词。 | |
| 51 | +因此要做一个搜索结果评估工具、多个结果对比的工具,供后面的标注集合构建工具调用。工具内部实现可以是调用大模型来判断,说清楚什么叫高相关、基本相关、不相关: | |
| 52 | + | |
| 53 | +prompt: | |
| 54 | +```bash | |
| 55 | +你是一个电商搜索结果相关性评估助手。请根据用户查询(query)和每个商品的信息,输出该商品的相关性等级。 | |
| 56 | + | |
| 57 | +## 相关性等级标准 | |
| 58 | +Exact 完全相关 — 完全匹配用户搜索需求。 | |
| 59 | +Partial 部分相关 — 主意图满足(同品类或相近用途,基本上符合搜索意图),但次要属性(如颜色、风格、尺码等)跟用户需求有偏差或无法确认。 | |
| 60 | +Irrelevant 不相关 — 品类或用途不符,主诉求未满足。 | |
| 61 | + | |
| 62 | +1. {title1} {option1_value1} {option2_value1} {option3_value1} | |
| 63 | +2. {title2} {option1_value2} {option2_value2}, {option3_value2} | |
| 64 | +... | |
| 65 | +50. {title50} {option1_value50} {option2_value50} {option3_value50} | |
| 66 | + | |
| 67 | +## 输出格式 | |
| 68 | +严格输出 {input_nums} 行,每行仅Exact / Partial / Irrelevant三者之一。按顺序对应上述 {input_nums} 个商品。不要输出任何其他信息 | |
| 69 | +``` | |
| 70 | + | |
| 71 | + | |
| 72 | +2. 测试集(结果标注)建立: | |
| 73 | +@queries/queries.txt | |
| 74 | + | |
| 75 | +对其中每一个query: | |
| 76 | +1. 召回: | |
| 77 | +1)参考搜索接口 召回结果。搜索结果的top500,纳入召回池,打分全部标记为1 | |
| 78 | +2)调用重排模型,扫描全库(tenant_id=163),如果已经在召回池(打分已经是1了),则跳过,其余的全部过reranker模型接口调用。每80个doc做一次请求。注意重排模型打分一定要做缓存(本地文件缓存即可。query+title->rerank_score)。 | |
| 79 | +3)对reranker打分超过0.5的结果数大于1000条的query,则打印一行日志,跳过这个query,表示相关结果太多、容易被满足 | |
| 80 | + | |
| 81 | + | |
| 82 | +2. 对如上召回的内容,进行全排序,然后逐批进行llm评判标注(50个一批),每一批都记录exact比例和不相关比例,打印日志。 | |
| 83 | +直到连续三批不相关比例都大于92%。 | |
| 84 | +最少要跑15批,最多跑40批 | |
| 85 | + | |
| 86 | +3. 请你思考如何存储结果、并利于以后的对比、使用、展示。 | |
| 87 | + | |
| 88 | + | |
| 89 | + | |
| 90 | + | |
| 91 | +3. 评估工具页面: | |
| 92 | +请你设计一个搜索评估交互页面。端口6010。 | |
| 93 | +页面主题:上方是搜索框,如果发起搜索,那么下方给出本次结果的总体指标以及top100结果(允许翻页) | |
| 94 | + | |
| 95 | +总体指标: | |
| 96 | +| 指标 | 含义 | | |
| 97 | +|------|------| | |
| 98 | +| **P@5, P@10, P@20, P@50** | 前 K 个结果中「仅 3 相关」的精确率 | | |
| 99 | +| **P@5_2_3 ~ P@50_2_3** | 前 K 个结果中「2 和 3 都算相关」的精确率 | | |
| 100 | +| **MAP_3** | 仅 3 相关时的 Average Precision(单 query) | | |
| 101 | +| **MAP_2_3** | 2 和 3 都相关时的 Average Precision | | |
| 102 | + | |
| 103 | +结果列表: | |
| 104 | +按行列下来,每行左侧给每个结果找到标注值(三个等级。对结果也可以颜色标记),展示图片,title.en+title.en+首个sku的option1/2/3_value(分三行展示,这三行和左侧的图片并列) | |
| 105 | + | |
| 106 | + | |
| 107 | +评测页面最左侧: | |
| 108 | +queries默认是queries/queries.txt,填入左侧列表框,点击其中任何一个发起搜索。 | |
| 109 | + | |
| 110 | +4. 批量评估工具 | |
| 111 | + | |
| 112 | +给一个批量执行脚本, | |
| 113 | + | |
| 114 | +这里要新增一个批量评估的页面。点击批量评估的按钮,对所有搜索词依次发起搜索,最后汇总总体的评估指标,生成报告,报告名称带上时间标记和一些关键信息。并且记录当时的主搜索程序的config.yaml。 | |
| 115 | +你需要精心地设计如何切换两种模式,通过同一个端口承载这两种不同交互的内容。 | |
| 116 | +批量评估关注的是所有搜索词总体的评估指标。 | |
| 117 | +需要记录测试环境时间以及当时的配置文件,以及对应的结果。要保存历次的评估记录,并能查到每一次评估结果对应的配置文件及相关的指标 | |
| 118 | + | |
| 119 | +以上是我的总体设计,但有不周全的地方。你要站在更高的层次理解我的需求,你有足够的自由可以适当调整设计,基于你所了解的自动化搜索评估框架的最佳实践,做出更优秀的设计和更好的实现。 | |
| 120 | + | |
| 121 | + | |
| 122 | + | |
| 123 | + | |
| 124 | + | |
| 125 | + | |
| 126 | +1. 请仔细检验这个标注集的质量,如果质量不符合要求,那么你要优化工具,迭代直至标注集的结果质量足够高,可以以此为自动化工具来评估检索效果,对检索效果形成指导性意见。 | |
| 127 | +2. 在结果标注集的质量足够好,批量评估工具足够好用,并且经过你的试用,能判断出搜索质量好坏的情况下,开始真正的动手检索效果调优:基于这个50条query的结果标注集和批量评估工具,对融合公式进行调参。请你先精心地设计实验,设计几组参数,对几组参数分别修改config.yaml、重启(./restart.sh backend)、跑批量评估、收集结果。 | |
| 128 | +注意评估的过程中,如果发现工具不好用,发现日志不全,发现可以通过修改工具或者日志来提高效率,都可以先做这些改进,逐步完善。 | |
| 129 | +注意你是代码的总负责人,你有任何权限来满足你进行检索效果调优的需要。你如果发现有其他可能带来更大提升的点,也可以进行实验,你甚至可以修改融合、重排漏斗的代码,来进行实验,以追求更好的结果指标。 | |
| 130 | +但是注意,因为受到性能和耗时的约束,不要调大reranker模型的输入条数、不要打开精排,耗时方面无法承受两轮reranker模型的调用。 | |
| 131 | + | |
| 132 | + | |
| 133 | + | |
| 134 | + | |
| 135 | + | |
| 136 | + | |
| 137 | + | |
| 138 | + | |
| 139 | + | |
| 140 | +@scripts/evaluation/README.md @scripts/evaluation/eval_framework/framework.py | |
| 141 | +@quick_start_eval.sh (29-35) | |
| 142 | +请以如下流程为准,进行改造: | |
| 143 | +如果重建的话,对每个query: | |
| 144 | +每个query都应该扫描全库, | |
| 145 | +1. 搜索结果的top500,纳入召回池,打分全部标记为1 | |
| 146 | +2. 调用重排模型,扫描全库(tenant_id=163),如果已经在召回池(打分已经是1了),则跳过,其余的全部过 | |
| 147 | +3. 对reranker打分超过0.5的大于1000条,则打印一行日志,跳过这个query,表示相关结果太多、容易被满足 | |
| 148 | + | |
| 149 | +对如上召回的内容,进行全排序,然后逐批进行llm评判标注(50个一批),每一批都记录exact比例和不相关比例,打印日志。 | |
| 150 | +直到连续三批不相关比例都大于92%。 | |
| 151 | +最少要跑15批,最多跑40批 | ... | ... |
scripts/evaluation/README.md
| ... | ... | @@ -23,7 +23,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, |
| 23 | 23 | | `fusion_experiments_round1.json` | Broader first-round experiments | |
| 24 | 24 | | `queries/queries.txt` | Canonical evaluation queries | |
| 25 | 25 | | `README_Requirement.md` | Product/requirements reference | |
| 26 | -| `quick_start_eval.sh` | Wrapper: `batch`, `batch-rebuild`, or `serve` | | |
| 26 | +| `quick_start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | | |
| 27 | 27 | | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. | |
| 28 | 28 | |
| 29 | 29 | ## Quick start (repo root) |
| ... | ... | @@ -34,7 +34,7 @@ Set tenant if needed (`export TENANT_ID=163`). You need a live search API, DashS |
| 34 | 34 | # Batch: live search for every query; only uncached (query, spu_id) pairs hit the LLM |
| 35 | 35 | ./scripts/evaluation/quick_start_eval.sh batch |
| 36 | 36 | |
| 37 | -# Full re-label of current top_k recall (expensive) | |
| 37 | +# Deep rebuild: search recall top-500 (score 1) + full-corpus rerank outside pool + batched LLM (early stop; expensive) | |
| 38 | 38 | ./scripts/evaluation/quick_start_eval.sh batch-rebuild |
| 39 | 39 | |
| 40 | 40 | # UI: http://127.0.0.1:6010/ |
| ... | ... | @@ -52,9 +52,15 @@ Explicit equivalents: |
| 52 | 52 | --language en \ |
| 53 | 53 | --labeler-mode simple |
| 54 | 54 | |
| 55 | -./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ | |
| 56 | - ... same args ... \ | |
| 57 | - --force-refresh-labels | |
| 55 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py build \ | |
| 56 | + --tenant-id "${TENANT_ID:-163}" \ | |
| 57 | + --queries-file scripts/evaluation/queries/queries.txt \ | |
| 58 | + --search-depth 500 \ | |
| 59 | + --rerank-depth 10000 \ | |
| 60 | + --force-refresh-rerank \ | |
| 61 | + --force-refresh-labels \ | |
| 62 | + --language en \ | |
| 63 | + --labeler-mode simple | |
| 58 | 64 | |
| 59 | 65 | ./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \ |
| 60 | 66 | --tenant-id "${TENANT_ID:-163}" \ |
| ... | ... | @@ -63,7 +69,9 @@ Explicit equivalents: |
| 63 | 69 | --port 6010 |
| 64 | 70 | ``` |
| 65 | 71 | |
| 66 | -Each batch run walks the full queries file. With `--force-refresh-labels`, every recalled `spu_id` in the window is re-sent to the LLM and upserted. Without it, only missing labels are filled. | |
| 72 | +Each `batch` run walks the full queries file. With `batch --force-refresh-labels`, every live top-`k` hit is re-judged by the LLM. | |
| 73 | + | |
| 74 | +**Rebuild (`build --force-refresh-labels`):** For each query: take search top **500** as the recall pool (treated as rerank score **1**; those SKUs are not sent to the reranker). Rerank the rest of the tenant corpus; if more than **1000** non-pool docs have rerank score **> 0.5**, the query is **skipped** (logged as too easy / tail too relevant). Otherwise merge pool (search order) + non-pool (rerank score descending), then LLM-judge in batches of **50**, logging **exact_ratio** and **irrelevant_ratio** per batch. Stop after **3** consecutive batches with irrelevant_ratio **> 92%**, but only after at least **15** batches and at most **40** batches. | |
| 67 | 75 | |
| 68 | 76 | ## Artifacts |
| 69 | 77 | |
| ... | ... | @@ -87,7 +95,7 @@ Default root: `artifacts/search_evaluation/` |
| 87 | 95 | |
| 88 | 96 | **Standard:** Run `batch` without `--force-refresh-labels` to extend coverage, then use the UI or batch in cached mode. Single-query evaluation defaults to **no** auto-annotation: recall still hits the live API; scoring uses SQLite only, and unlabeled hits count as `Irrelevant`. |
| 89 | 97 | |
| 90 | -**Deeper pool:** `build_annotation_set.py build` merges deep search and full-corpus rerank windows before labeling (see CLI `--search-depth`, `--rerank-depth`, `--annotate-*-top-k`). | |
| 98 | +**Incremental pool (no full rebuild):** `build_annotation_set.py build` without `--force-refresh-labels` merges search and full-corpus rerank windows before labeling (CLI `--search-depth`, `--rerank-depth`, `--annotate-*-top-k`). **Full rebuild** uses the recall-pool + rerank-skip + batched early-stop flow above; tune thresholds via `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-*` flags on `build`. | |
| 91 | 99 | |
| 92 | 100 | **Fusion tuning:** `tune_fusion.py` writes experiment configs, restarts the backend, runs batch evaluation, and optionally applies the best variant (see `--experiments-file`, `--score-metric`, `--apply-best`). |
| 93 | 101 | ... | ... |
scripts/evaluation/README_Requirement_zh.md
| ... | ... | @@ -72,12 +72,20 @@ Irrelevant 不相关 — 品类或用途不符,主诉求未满足。 |
| 72 | 72 | |
| 73 | 73 | 对其中每一个query: |
| 74 | 74 | 1. 召回: |
| 75 | -1)参考搜索接口 召回1k结果。 | |
| 76 | -2)遍历全库,得到每个spu的title,请求重排模型,进行全排序,得到top1w结果。注意重排模型打分一定要做缓存(本地文件缓存即可。query+title->rerank_score)。 | |
| 77 | -2. 对以上结果,拆分batch请求llm,进行结果标注。 | |
| 75 | +1)参考搜索接口 召回结果。搜索结果的top500,纳入召回池,打分全部标记为1 | |
| 76 | +2)调用重排模型,扫描全库(tenant_id=163),如果已经在召回池(打分已经是1了),则跳过,其余的全部过reranker模型接口调用。每80个doc做一次请求。注意重排模型打分一定要做缓存(本地文件缓存即可。query+title->rerank_score)。 | |
| 77 | +3)对reranker打分超过0.5的结果数大于1000条的query,则打印一行日志,跳过这个query,表示相关结果太多、容易被满足 | |
| 78 | + | |
| 79 | + | |
| 80 | +2. 对如上召回的内容,进行全排序,然后逐批进行llm评判标注(50个一批),每一批都记录exact比例和不相关比例,打印日志。 | |
| 81 | +直到连续三批不相关比例都大于92%。 | |
| 82 | +最少要跑15批,最多跑40批 | |
| 83 | + | |
| 78 | 84 | 3. 请你思考如何存储结果、并利于以后的对比、使用、展示。 |
| 79 | 85 | |
| 80 | 86 | |
| 87 | + | |
| 88 | + | |
| 81 | 89 | 3. 评估工具页面: |
| 82 | 90 | 请你设计一个搜索评估交互页面。端口6010。 |
| 83 | 91 | 页面主题:上方是搜索框,如果发起搜索,那么下方给出本次结果的总体指标以及top100结果(允许翻页) | ... | ... |
scripts/evaluation/eval_framework/cli.py
| ... | ... | @@ -6,7 +6,18 @@ import argparse |
| 6 | 6 | import json |
| 7 | 7 | from pathlib import Path |
| 8 | 8 | |
| 9 | -from .constants import DEFAULT_LABELER_MODE, DEFAULT_QUERY_FILE | |
| 9 | +from .constants import ( | |
| 10 | + DEFAULT_LABELER_MODE, | |
| 11 | + DEFAULT_QUERY_FILE, | |
| 12 | + DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, | |
| 13 | + DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, | |
| 14 | + DEFAULT_REBUILD_LLM_BATCH_SIZE, | |
| 15 | + DEFAULT_REBUILD_MAX_LLM_BATCHES, | |
| 16 | + DEFAULT_REBUILD_MIN_LLM_BATCHES, | |
| 17 | + DEFAULT_RERANK_HIGH_SKIP_COUNT, | |
| 18 | + DEFAULT_RERANK_HIGH_THRESHOLD, | |
| 19 | + DEFAULT_SEARCH_RECALL_TOP_K, | |
| 20 | +) | |
| 10 | 21 | from .framework import SearchEvaluationFramework |
| 11 | 22 | from .utils import ensure_dir, utc_now_iso, utc_timestamp |
| 12 | 23 | from .web_app import create_web_app |
| ... | ... | @@ -23,6 +34,39 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 23 | 34 | build.add_argument("--rerank-depth", type=int, default=10000) |
| 24 | 35 | build.add_argument("--annotate-search-top-k", type=int, default=120) |
| 25 | 36 | build.add_argument("--annotate-rerank-top-k", type=int, default=200) |
| 37 | + build.add_argument( | |
| 38 | + "--search-recall-top-k", | |
| 39 | + type=int, | |
| 40 | + default=None, | |
| 41 | + help="Rebuild mode only: top-K search hits enter recall pool with score 1 (default when --force-refresh-labels: 500).", | |
| 42 | + ) | |
| 43 | + build.add_argument( | |
| 44 | + "--rerank-high-threshold", | |
| 45 | + type=float, | |
| 46 | + default=None, | |
| 47 | + help="Rebuild only: count rerank scores above this on non-pool docs (default 0.5).", | |
| 48 | + ) | |
| 49 | + build.add_argument( | |
| 50 | + "--rerank-high-skip-count", | |
| 51 | + type=int, | |
| 52 | + default=None, | |
| 53 | + help="Rebuild only: skip query if more than this many non-pool docs have rerank score > threshold (default 1000).", | |
| 54 | + ) | |
| 55 | + build.add_argument("--rebuild-llm-batch-size", type=int, default=None, help="Rebuild only: LLM batch size (default 50).") | |
| 56 | + build.add_argument("--rebuild-min-batches", type=int, default=None, help="Rebuild only: min LLM batches before early stop (default 15).") | |
| 57 | + build.add_argument("--rebuild-max-batches", type=int, default=None, help="Rebuild only: max LLM batches (default 40).") | |
| 58 | + build.add_argument( | |
| 59 | + "--rebuild-irrelevant-stop-ratio", | |
| 60 | + type=float, | |
| 61 | + default=None, | |
| 62 | + help="Rebuild only: irrelevant ratio above this counts toward early-stop streak (default 0.92).", | |
| 63 | + ) | |
| 64 | + build.add_argument( | |
| 65 | + "--rebuild-irrelevant-stop-streak", | |
| 66 | + type=int, | |
| 67 | + default=None, | |
| 68 | + help="Rebuild only: stop after this many consecutive batches above irrelevant ratio (default 3).", | |
| 69 | + ) | |
| 26 | 70 | build.add_argument("--language", default="en") |
| 27 | 71 | build.add_argument("--force-refresh-rerank", action="store_true") |
| 28 | 72 | build.add_argument("--force-refresh-labels", action="store_true") |
| ... | ... | @@ -59,6 +103,22 @@ def run_build(args: argparse.Namespace) -> None: |
| 59 | 103 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) |
| 60 | 104 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 61 | 105 | summary = [] |
| 106 | + rebuild_kwargs = {} | |
| 107 | + if args.force_refresh_labels: | |
| 108 | + rebuild_kwargs = { | |
| 109 | + "search_recall_top_k": args.search_recall_top_k if args.search_recall_top_k is not None else DEFAULT_SEARCH_RECALL_TOP_K, | |
| 110 | + "rerank_high_threshold": args.rerank_high_threshold if args.rerank_high_threshold is not None else DEFAULT_RERANK_HIGH_THRESHOLD, | |
| 111 | + "rerank_high_skip_count": args.rerank_high_skip_count if args.rerank_high_skip_count is not None else DEFAULT_RERANK_HIGH_SKIP_COUNT, | |
| 112 | + "rebuild_llm_batch_size": args.rebuild_llm_batch_size if args.rebuild_llm_batch_size is not None else DEFAULT_REBUILD_LLM_BATCH_SIZE, | |
| 113 | + "rebuild_min_batches": args.rebuild_min_batches if args.rebuild_min_batches is not None else DEFAULT_REBUILD_MIN_LLM_BATCHES, | |
| 114 | + "rebuild_max_batches": args.rebuild_max_batches if args.rebuild_max_batches is not None else DEFAULT_REBUILD_MAX_LLM_BATCHES, | |
| 115 | + "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio | |
| 116 | + if args.rebuild_irrelevant_stop_ratio is not None | |
| 117 | + else DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, | |
| 118 | + "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak | |
| 119 | + if args.rebuild_irrelevant_stop_streak is not None | |
| 120 | + else DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, | |
| 121 | + } | |
| 62 | 122 | for query in queries: |
| 63 | 123 | result = framework.build_query_annotation_set( |
| 64 | 124 | query=query, |
| ... | ... | @@ -69,6 +129,7 @@ def run_build(args: argparse.Namespace) -> None: |
| 69 | 129 | language=args.language, |
| 70 | 130 | force_refresh_rerank=args.force_refresh_rerank, |
| 71 | 131 | force_refresh_labels=args.force_refresh_labels, |
| 132 | + **rebuild_kwargs, | |
| 72 | 133 | ) |
| 73 | 134 | summary.append( |
| 74 | 135 | { | ... | ... |
scripts/evaluation/eval_framework/constants.py
| ... | ... | @@ -17,3 +17,13 @@ DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" |
| 17 | 17 | JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" |
| 18 | 18 | JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" |
| 19 | 19 | DEFAULT_LABELER_MODE = "simple" |
| 20 | + | |
| 21 | +# Rebuild annotation pool (build --force-refresh-labels): search recall + full-corpus rerank + LLM batches | |
| 22 | +DEFAULT_SEARCH_RECALL_TOP_K = 500 | |
| 23 | +DEFAULT_RERANK_HIGH_THRESHOLD = 0.5 | |
| 24 | +DEFAULT_RERANK_HIGH_SKIP_COUNT = 1000 | |
| 25 | +DEFAULT_REBUILD_LLM_BATCH_SIZE = 50 | |
| 26 | +DEFAULT_REBUILD_MIN_LLM_BATCHES = 15 | |
| 27 | +DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 | |
| 28 | +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.92 | |
| 29 | +DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3 | ... | ... |
scripts/evaluation/eval_framework/framework.py
| ... | ... | @@ -17,6 +17,14 @@ from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceCli |
| 17 | 17 | from .constants import ( |
| 18 | 18 | DEFAULT_ARTIFACT_ROOT, |
| 19 | 19 | DEFAULT_LABELER_MODE, |
| 20 | + DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, | |
| 21 | + DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, | |
| 22 | + DEFAULT_REBUILD_LLM_BATCH_SIZE, | |
| 23 | + DEFAULT_REBUILD_MAX_LLM_BATCHES, | |
| 24 | + DEFAULT_REBUILD_MIN_LLM_BATCHES, | |
| 25 | + DEFAULT_RERANK_HIGH_SKIP_COUNT, | |
| 26 | + DEFAULT_RERANK_HIGH_THRESHOLD, | |
| 27 | + DEFAULT_SEARCH_RECALL_TOP_K, | |
| 20 | 28 | JUDGE_PROMPT_VERSION_COMPLEX, |
| 21 | 29 | RELEVANCE_EXACT, |
| 22 | 30 | RELEVANCE_IRRELEVANT, |
| ... | ... | @@ -345,7 +353,7 @@ class SearchEvaluationFramework: |
| 345 | 353 | self, |
| 346 | 354 | query: str, |
| 347 | 355 | docs: Sequence[Dict[str, Any]], |
| 348 | - batch_size: int = 24, | |
| 356 | + batch_size: int = 80, | |
| 349 | 357 | force_refresh: bool = False, |
| 350 | 358 | ) -> List[Dict[str, Any]]: |
| 351 | 359 | cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query) |
| ... | ... | @@ -374,6 +382,52 @@ class SearchEvaluationFramework: |
| 374 | 382 | ranked.sort(key=lambda item: item["score"], reverse=True) |
| 375 | 383 | return ranked |
| 376 | 384 | |
| 385 | + def full_corpus_rerank_outside_exclude( | |
| 386 | + self, | |
| 387 | + query: str, | |
| 388 | + docs: Sequence[Dict[str, Any]], | |
| 389 | + exclude_spu_ids: set[str], | |
| 390 | + batch_size: int = 80, | |
| 391 | + force_refresh: bool = False, | |
| 392 | + ) -> List[Dict[str, Any]]: | |
| 393 | + """Rerank all corpus docs whose spu_id is not in ``exclude_spu_ids``; excluded IDs are not scored via API.""" | |
| 394 | + exclude_spu_ids = {str(x) for x in exclude_spu_ids} | |
| 395 | + cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query) | |
| 396 | + pending: List[Dict[str, Any]] = [ | |
| 397 | + doc | |
| 398 | + for doc in docs | |
| 399 | + if str(doc.get("spu_id")) not in exclude_spu_ids | |
| 400 | + and str(doc.get("spu_id")) | |
| 401 | + and (force_refresh or str(doc.get("spu_id")) not in cached) | |
| 402 | + ] | |
| 403 | + if pending: | |
| 404 | + new_scores: Dict[str, float] = {} | |
| 405 | + for start in range(0, len(pending), batch_size): | |
| 406 | + batch = pending[start : start + batch_size] | |
| 407 | + scores = self._rerank_batch_with_retry(query=query, docs=batch) | |
| 408 | + if len(scores) != len(batch): | |
| 409 | + raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs") | |
| 410 | + for doc, score in zip(batch, scores): | |
| 411 | + new_scores[str(doc.get("spu_id"))] = float(score) | |
| 412 | + self.store.upsert_rerank_scores( | |
| 413 | + self.tenant_id, | |
| 414 | + query, | |
| 415 | + new_scores, | |
| 416 | + model_name="qwen3_vllm_score", | |
| 417 | + ) | |
| 418 | + cached.update(new_scores) | |
| 419 | + | |
| 420 | + ranked: List[Dict[str, Any]] = [] | |
| 421 | + for doc in docs: | |
| 422 | + spu_id = str(doc.get("spu_id") or "") | |
| 423 | + if not spu_id or spu_id in exclude_spu_ids: | |
| 424 | + continue | |
| 425 | + ranked.append( | |
| 426 | + {"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc} | |
| 427 | + ) | |
| 428 | + ranked.sort(key=lambda item: item["score"], reverse=True) | |
| 429 | + return ranked | |
| 430 | + | |
| 377 | 431 | def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]: |
| 378 | 432 | if not docs: |
| 379 | 433 | return [] |
| ... | ... | @@ -447,6 +501,78 @@ class SearchEvaluationFramework: |
| 447 | 501 | mid = len(docs) // 2 |
| 448 | 502 | return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh) |
| 449 | 503 | |
| 504 | + def _annotate_rebuild_batches( | |
| 505 | + self, | |
| 506 | + query: str, | |
| 507 | + ordered_docs: Sequence[Dict[str, Any]], | |
| 508 | + *, | |
| 509 | + batch_size: int = DEFAULT_REBUILD_LLM_BATCH_SIZE, | |
| 510 | + min_batches: int = DEFAULT_REBUILD_MIN_LLM_BATCHES, | |
| 511 | + max_batches: int = DEFAULT_REBUILD_MAX_LLM_BATCHES, | |
| 512 | + irrelevant_stop_ratio: float = DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, | |
| 513 | + stop_streak: int = DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, | |
| 514 | + force_refresh: bool = True, | |
| 515 | + ) -> Tuple[Dict[str, str], List[Dict[str, Any]]]: | |
| 516 | + """LLM-label ``ordered_docs`` in fixed-size batches with early stop after enough irrelevant-heavy batches.""" | |
| 517 | + batch_logs: List[Dict[str, Any]] = [] | |
| 518 | + streak = 0 | |
| 519 | + labels: Dict[str, str] = dict(self.store.get_labels(self.tenant_id, query)) | |
| 520 | + total_ordered = len(ordered_docs) | |
| 521 | + | |
| 522 | + for batch_idx in range(max_batches): | |
| 523 | + start = batch_idx * batch_size | |
| 524 | + batch_docs = list(ordered_docs[start : start + batch_size]) | |
| 525 | + if not batch_docs: | |
| 526 | + break | |
| 527 | + | |
| 528 | + batch_pairs = self._classify_with_retry(query, batch_docs, force_refresh=force_refresh) | |
| 529 | + for sub_labels, raw_response, sub_batch in batch_pairs: | |
| 530 | + to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)} | |
| 531 | + self.store.upsert_labels( | |
| 532 | + self.tenant_id, | |
| 533 | + query, | |
| 534 | + to_store, | |
| 535 | + judge_model=self.label_client.model, | |
| 536 | + raw_response=raw_response, | |
| 537 | + ) | |
| 538 | + labels.update(to_store) | |
| 539 | + time.sleep(0.1) | |
| 540 | + | |
| 541 | + n = len(batch_docs) | |
| 542 | + exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_EXACT) | |
| 543 | + irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT) | |
| 544 | + exact_ratio = exact_n / n if n else 0.0 | |
| 545 | + irrelevant_ratio = irrel_n / n if n else 0.0 | |
| 546 | + log_entry = { | |
| 547 | + "batch_index": batch_idx + 1, | |
| 548 | + "size": n, | |
| 549 | + "exact_ratio": round(exact_ratio, 6), | |
| 550 | + "irrelevant_ratio": round(irrelevant_ratio, 6), | |
| 551 | + "offset_start": start, | |
| 552 | + "offset_end": min(start + n, total_ordered), | |
| 553 | + } | |
| 554 | + batch_logs.append(log_entry) | |
| 555 | + print( | |
| 556 | + f"[eval-rebuild] query={query!r} llm_batch={batch_idx + 1}/{max_batches} " | |
| 557 | + f"size={n} exact_ratio={exact_ratio:.4f} irrelevant_ratio={irrelevant_ratio:.4f}", | |
| 558 | + flush=True, | |
| 559 | + ) | |
| 560 | + | |
| 561 | + if batch_idx + 1 >= min_batches: | |
| 562 | + if irrelevant_ratio > irrelevant_stop_ratio: | |
| 563 | + streak += 1 | |
| 564 | + else: | |
| 565 | + streak = 0 | |
| 566 | + if streak >= stop_streak: | |
| 567 | + print( | |
| 568 | + f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches " | |
| 569 | + f"({stop_streak} consecutive batches with irrelevant_ratio > {irrelevant_stop_ratio})", | |
| 570 | + flush=True, | |
| 571 | + ) | |
| 572 | + break | |
| 573 | + | |
| 574 | + return labels, batch_logs | |
| 575 | + | |
| 450 | 576 | def build_query_annotation_set( |
| 451 | 577 | self, |
| 452 | 578 | query: str, |
| ... | ... | @@ -458,7 +584,32 @@ class SearchEvaluationFramework: |
| 458 | 584 | language: str = "en", |
| 459 | 585 | force_refresh_rerank: bool = False, |
| 460 | 586 | force_refresh_labels: bool = False, |
| 587 | + search_recall_top_k: int = DEFAULT_SEARCH_RECALL_TOP_K, | |
| 588 | + rerank_high_threshold: float = DEFAULT_RERANK_HIGH_THRESHOLD, | |
| 589 | + rerank_high_skip_count: int = DEFAULT_RERANK_HIGH_SKIP_COUNT, | |
| 590 | + rebuild_llm_batch_size: int = DEFAULT_REBUILD_LLM_BATCH_SIZE, | |
| 591 | + rebuild_min_batches: int = DEFAULT_REBUILD_MIN_LLM_BATCHES, | |
| 592 | + rebuild_max_batches: int = DEFAULT_REBUILD_MAX_LLM_BATCHES, | |
| 593 | + rebuild_irrelevant_stop_ratio: float = DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, | |
| 594 | + rebuild_irrelevant_stop_streak: int = DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, | |
| 461 | 595 | ) -> QueryBuildResult: |
| 596 | + if force_refresh_labels: | |
| 597 | + return self._build_query_annotation_set_rebuild( | |
| 598 | + query=query, | |
| 599 | + search_depth=search_depth, | |
| 600 | + rerank_depth=rerank_depth, | |
| 601 | + language=language, | |
| 602 | + force_refresh_rerank=force_refresh_rerank, | |
| 603 | + search_recall_top_k=search_recall_top_k, | |
| 604 | + rerank_high_threshold=rerank_high_threshold, | |
| 605 | + rerank_high_skip_count=rerank_high_skip_count, | |
| 606 | + rebuild_llm_batch_size=rebuild_llm_batch_size, | |
| 607 | + rebuild_min_batches=rebuild_min_batches, | |
| 608 | + rebuild_max_batches=rebuild_max_batches, | |
| 609 | + rebuild_irrelevant_stop_ratio=rebuild_irrelevant_stop_ratio, | |
| 610 | + rebuild_irrelevant_stop_streak=rebuild_irrelevant_stop_streak, | |
| 611 | + ) | |
| 612 | + | |
| 462 | 613 | search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language) |
| 463 | 614 | search_results = list(search_payload.get("results") or []) |
| 464 | 615 | corpus = self.corpus_docs(refresh=False) |
| ... | ... | @@ -558,6 +709,182 @@ class SearchEvaluationFramework: |
| 558 | 709 | output_json_path=output_json_path, |
| 559 | 710 | ) |
| 560 | 711 | |
| 712 | + def _build_query_annotation_set_rebuild( | |
| 713 | + self, | |
| 714 | + query: str, | |
| 715 | + *, | |
| 716 | + search_depth: int, | |
| 717 | + rerank_depth: int, | |
| 718 | + language: str, | |
| 719 | + force_refresh_rerank: bool, | |
| 720 | + search_recall_top_k: int, | |
| 721 | + rerank_high_threshold: float, | |
| 722 | + rerank_high_skip_count: int, | |
| 723 | + rebuild_llm_batch_size: int, | |
| 724 | + rebuild_min_batches: int, | |
| 725 | + rebuild_max_batches: int, | |
| 726 | + rebuild_irrelevant_stop_ratio: float, | |
| 727 | + rebuild_irrelevant_stop_streak: int, | |
| 728 | + ) -> QueryBuildResult: | |
| 729 | + search_size = max(int(search_depth), int(search_recall_top_k)) | |
| 730 | + search_payload = self.search_client.search(query=query, size=search_size, from_=0, language=language) | |
| 731 | + search_results = list(search_payload.get("results") or []) | |
| 732 | + recall_n = min(int(search_recall_top_k), len(search_results)) | |
| 733 | + pool_search_docs = search_results[:recall_n] | |
| 734 | + pool_spu_ids = {str(d.get("spu_id")) for d in pool_search_docs if str(d.get("spu_id") or "").strip()} | |
| 735 | + | |
| 736 | + corpus = self.corpus_docs(refresh=False) | |
| 737 | + corpus_by_id = {str(d.get("spu_id")): d for d in corpus if str(d.get("spu_id") or "").strip()} | |
| 738 | + | |
| 739 | + ranked_outside = self.full_corpus_rerank_outside_exclude( | |
| 740 | + query=query, | |
| 741 | + docs=corpus, | |
| 742 | + exclude_spu_ids=pool_spu_ids, | |
| 743 | + force_refresh=force_refresh_rerank, | |
| 744 | + ) | |
| 745 | + rerank_high_n = sum(1 for item in ranked_outside if float(item["score"]) > float(rerank_high_threshold)) | |
| 746 | + | |
| 747 | + rebuild_meta: Dict[str, Any] = { | |
| 748 | + "mode": "rebuild_v1", | |
| 749 | + "search_recall_top_k": search_recall_top_k, | |
| 750 | + "recall_pool_size": len(pool_spu_ids), | |
| 751 | + "pool_rerank_score_assigned": 1.0, | |
| 752 | + "rerank_high_threshold": rerank_high_threshold, | |
| 753 | + "rerank_high_count_outside_pool": rerank_high_n, | |
| 754 | + "rerank_high_skip_count": rerank_high_skip_count, | |
| 755 | + "rebuild_llm_batch_size": rebuild_llm_batch_size, | |
| 756 | + "rebuild_min_batches": rebuild_min_batches, | |
| 757 | + "rebuild_max_batches": rebuild_max_batches, | |
| 758 | + "rebuild_irrelevant_stop_ratio": rebuild_irrelevant_stop_ratio, | |
| 759 | + "rebuild_irrelevant_stop_streak": rebuild_irrelevant_stop_streak, | |
| 760 | + } | |
| 761 | + | |
| 762 | + batch_logs: List[Dict[str, Any]] = [] | |
| 763 | + skipped = False | |
| 764 | + skip_reason: str | None = None | |
| 765 | + labels: Dict[str, str] = dict(self.store.get_labels(self.tenant_id, query)) | |
| 766 | + llm_labeled_total = 0 | |
| 767 | + | |
| 768 | + if rerank_high_n > int(rerank_high_skip_count): | |
| 769 | + skipped = True | |
| 770 | + skip_reason = "too_many_high_rerank_scores" | |
| 771 | + print( | |
| 772 | + f"[eval-rebuild] query={query!r} skip: rerank_score>{rerank_high_threshold} " | |
| 773 | + f"outside recall pool count={rerank_high_n} > {rerank_high_skip_count} " | |
| 774 | + f"(relevant tail too large / query too easy to satisfy)", | |
| 775 | + flush=True, | |
| 776 | + ) | |
| 777 | + else: | |
| 778 | + ordered_docs: List[Dict[str, Any]] = [] | |
| 779 | + seen_ordered: set[str] = set() | |
| 780 | + for doc in pool_search_docs: | |
| 781 | + sid = str(doc.get("spu_id") or "") | |
| 782 | + if not sid or sid in seen_ordered: | |
| 783 | + continue | |
| 784 | + seen_ordered.add(sid) | |
| 785 | + ordered_docs.append(corpus_by_id.get(sid, doc)) | |
| 786 | + for item in ranked_outside: | |
| 787 | + sid = str(item["spu_id"]) | |
| 788 | + if sid in seen_ordered: | |
| 789 | + continue | |
| 790 | + seen_ordered.add(sid) | |
| 791 | + ordered_docs.append(item["doc"]) | |
| 792 | + | |
| 793 | + labels, batch_logs = self._annotate_rebuild_batches( | |
| 794 | + query, | |
| 795 | + ordered_docs, | |
| 796 | + batch_size=rebuild_llm_batch_size, | |
| 797 | + min_batches=rebuild_min_batches, | |
| 798 | + max_batches=rebuild_max_batches, | |
| 799 | + irrelevant_stop_ratio=rebuild_irrelevant_stop_ratio, | |
| 800 | + stop_streak=rebuild_irrelevant_stop_streak, | |
| 801 | + force_refresh=True, | |
| 802 | + ) | |
| 803 | + llm_labeled_total = sum(int(entry.get("size") or 0) for entry in batch_logs) | |
| 804 | + | |
| 805 | + rebuild_meta["skipped"] = skipped | |
| 806 | + rebuild_meta["skip_reason"] = skip_reason | |
| 807 | + rebuild_meta["llm_batch_logs"] = batch_logs | |
| 808 | + rebuild_meta["llm_labeled_total"] = llm_labeled_total | |
| 809 | + | |
| 810 | + rerank_depth_effective = min(int(rerank_depth), len(ranked_outside)) | |
| 811 | + search_labeled_results: List[Dict[str, Any]] = [] | |
| 812 | + for rank, doc in enumerate(search_results, start=1): | |
| 813 | + spu_id = str(doc.get("spu_id")) | |
| 814 | + in_pool = rank <= recall_n | |
| 815 | + search_labeled_results.append( | |
| 816 | + { | |
| 817 | + "rank": rank, | |
| 818 | + "spu_id": spu_id, | |
| 819 | + "title": build_display_title(doc), | |
| 820 | + "image_url": doc.get("image_url"), | |
| 821 | + "rerank_score": 1.0 if in_pool else None, | |
| 822 | + "label": labels.get(spu_id), | |
| 823 | + "option_values": list(compact_option_values(doc.get("skus") or [])), | |
| 824 | + "product": compact_product_payload(doc), | |
| 825 | + } | |
| 826 | + ) | |
| 827 | + | |
| 828 | + rerank_top_results: List[Dict[str, Any]] = [] | |
| 829 | + for rank, item in enumerate(ranked_outside[:rerank_depth_effective], start=1): | |
| 830 | + doc = item["doc"] | |
| 831 | + spu_id = str(item["spu_id"]) | |
| 832 | + rerank_top_results.append( | |
| 833 | + { | |
| 834 | + "rank": rank, | |
| 835 | + "spu_id": spu_id, | |
| 836 | + "title": build_display_title(doc), | |
| 837 | + "image_url": doc.get("image_url"), | |
| 838 | + "rerank_score": round(float(item["score"]), 8), | |
| 839 | + "label": labels.get(spu_id), | |
| 840 | + "option_values": list(compact_option_values(doc.get("skus") or [])), | |
| 841 | + "product": compact_product_payload(doc), | |
| 842 | + } | |
| 843 | + ) | |
| 844 | + | |
| 845 | + top100_labels = [ | |
| 846 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 847 | + for item in search_labeled_results[:100] | |
| 848 | + ] | |
| 849 | + metrics = compute_query_metrics(top100_labels) | |
| 850 | + output_dir = ensure_dir(self.artifact_root / "query_builds") | |
| 851 | + run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}" | |
| 852 | + output_json_path = output_dir / f"{run_id}.json" | |
| 853 | + pool_docs_count = len(pool_spu_ids) + len(ranked_outside) | |
| 854 | + payload = { | |
| 855 | + "run_id": run_id, | |
| 856 | + "created_at": utc_now_iso(), | |
| 857 | + "tenant_id": self.tenant_id, | |
| 858 | + "query": query, | |
| 859 | + "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(), | |
| 860 | + "search_total": int(search_payload.get("total") or 0), | |
| 861 | + "search_depth_requested": search_depth, | |
| 862 | + "search_depth_effective": len(search_results), | |
| 863 | + "rerank_depth_requested": rerank_depth, | |
| 864 | + "rerank_depth_effective": rerank_depth_effective, | |
| 865 | + "corpus_size": len(corpus), | |
| 866 | + "annotation_pool": { | |
| 867 | + "rebuild": rebuild_meta, | |
| 868 | + "ordered_union_size": pool_docs_count, | |
| 869 | + }, | |
| 870 | + "labeler_mode": self.labeler_mode, | |
| 871 | + "query_profile": self.get_query_profile(query, force_refresh=False) if self.labeler_mode == "complex" else None, | |
| 872 | + "metrics_top100": metrics, | |
| 873 | + "search_results": search_labeled_results, | |
| 874 | + "full_rerank_top": rerank_top_results, | |
| 875 | + } | |
| 876 | + output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") | |
| 877 | + self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"]) | |
| 878 | + return QueryBuildResult( | |
| 879 | + query=query, | |
| 880 | + tenant_id=self.tenant_id, | |
| 881 | + search_total=int(search_payload.get("total") or 0), | |
| 882 | + search_depth=len(search_results), | |
| 883 | + rerank_corpus_size=len(corpus), | |
| 884 | + annotated_count=llm_labeled_total if not skipped else 0, | |
| 885 | + output_json_path=output_json_path, | |
| 886 | + ) | |
| 887 | + | |
| 561 | 888 | def evaluate_live_query( |
| 562 | 889 | self, |
| 563 | 890 | query: str, | ... | ... |
scripts/evaluation/quick_start_eval.sh
| ... | ... | @@ -11,7 +11,7 @@ QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" |
| 11 | 11 | usage() { |
| 12 | 12 | echo "Usage: $0 batch|batch-rebuild|serve" |
| 13 | 13 | echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50, simple)" |
| 14 | - echo " batch-rebuild — same as batch but --force-refresh-labels (re-LLM all top_k hits; expensive, overwrites cache)" | |
| 14 | + echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" | |
| 15 | 15 | echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" |
| 16 | 16 | echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" |
| 17 | 17 | } |
| ... | ... | @@ -26,13 +26,15 @@ case "${1:-}" in |
| 26 | 26 | --labeler-mode simple |
| 27 | 27 | ;; |
| 28 | 28 | batch-rebuild) |
| 29 | - exec "$PY" scripts/evaluation/build_annotation_set.py batch \ | |
| 29 | + exec "$PY" scripts/evaluation/build_annotation_set.py build \ | |
| 30 | 30 | --tenant-id "$TENANT_ID" \ |
| 31 | 31 | --queries-file "$QUERIES" \ |
| 32 | - --top-k 50 \ | |
| 32 | + --search-depth 500 \ | |
| 33 | + --rerank-depth 10000 \ | |
| 34 | + --force-refresh-rerank \ | |
| 35 | + --force-refresh-labels \ | |
| 33 | 36 | --language en \ |
| 34 | - --labeler-mode simple \ | |
| 35 | - --force-refresh-labels | |
| 37 | + --labeler-mode simple | |
| 36 | 38 | ;; |
| 37 | 39 | serve) |
| 38 | 40 | EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" | ... | ... |