Commit 35ae3b29029f0253836699d1f94c121c5933aa5f
1 parent
dedd31c5
批量评估框架,召回参数修改和llm评估终止条件优化
Showing
4 changed files
with
27 additions
and
30 deletions
Show diff stats
scripts/evaluation/README.md
| ... | ... | @@ -87,14 +87,14 @@ For **each** query in `queries.txt`, in order: |
| 87 | 87 | **Early stop** (defaults in `eval_framework.constants`; overridable via CLI): |
| 88 | 88 | |
| 89 | 89 | - Run **at least** `--rebuild-min-batches` batches (**10** by default) before any early stop is allowed. |
| 90 | - - After that, define a **bad batch** as one where the batch has **no** **Exact Match** label **and** either: | |
| 91 | - - **Irrelevant** proportion **≥ 0.94** (`--rebuild-irrelevant-stop-ratio`), or | |
| 92 | - - **(Irrelevant + Low Relevant)** proportion **≥ 0.96** (`--rebuild-irrel-low-combined-stop-ratio`). | |
| 93 | - (“Low Relevant” is the weak tier; **High Relevant** does not count toward this combined ratio.) | |
| 90 | + - After that, a **bad batch** is one where **both** are true (strict **>**): | |
| 91 | + - **Irrelevant** proportion **> 93.9%** (`--rebuild-irrelevant-stop-ratio`, default `0.939`), and | |
| 92 | + - **(Irrelevant + Low Relevant)** proportion **> 95.9%** (`--rebuild-irrel-low-combined-stop-ratio`, default `0.959`). | |
| 93 | + (“Low Relevant” is the weak tier; **High Relevant** and **Exact Match** do not enter this sum.) | |
| 94 | 94 | - Count **consecutive** bad batches. **Reset** the count to 0 on any batch that is not bad. |
| 95 | - - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**2** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size). | |
| 95 | + - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**3** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size). | |
| 96 | 96 | |
| 97 | - So labeling follows best-first order but **stops early** when the model sees two consecutive “dead” batches; the tail may never be judged. | |
| 97 | + So labeling follows best-first order but **stops early** after **three** consecutive bad batches (the default streak), i.e. batches in which both the Irrelevant share and the combined Irrelevant + Low Relevant share exceed their thresholds; the tail may never be judged. | |
| 98 | 98 | |
| 99 | 99 | **Incremental pool (no full rebuild):** `build_annotation_set.py build` **without** `--force-refresh-labels` uses the older windowed pool (`--annotate-search-top-k`, `--annotate-rerank-top-k`) and fills missing labels in one pass—no rerank-skip rule and no LLM early-stop loop. |
| 100 | 100 | ... | ... |
scripts/evaluation/eval_framework/cli.py
| ... | ... | @@ -92,19 +92,19 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 92 | 92 | "--rebuild-irrelevant-stop-ratio", |
| 93 | 93 | type=float, |
| 94 | 94 | default=None, |
| 95 | - help="Rebuild only: irrelevant-only branch threshold (>=) for early-stop streak, requires no Exact (default 0.94).", | |
| 95 | + help="Rebuild only: bad batch requires irrelevant_ratio > this (default 0.939).", | |
| 96 | 96 | ) |
| 97 | 97 | build.add_argument( |
| 98 | 98 | "--rebuild-irrel-low-combined-stop-ratio", |
| 99 | 99 | type=float, |
| 100 | 100 | default=None, |
| 101 | - help="Rebuild only: (irrelevant+low)/n threshold (>=) for early-stop streak, requires no Exact (default 0.96).", | |
| 101 | + help="Rebuild only: bad batch requires (irrelevant+low)/n > this (default 0.959).", | |
| 102 | 102 | ) |
| 103 | 103 | build.add_argument( |
| 104 | 104 | "--rebuild-irrelevant-stop-streak", |
| 105 | 105 | type=int, |
| 106 | 106 | default=None, |
| 107 | - help="Rebuild only: consecutive bad batches before early stop (default 2).", | |
| 107 | + help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).", | |
| 108 | 108 | ) |
| 109 | 109 | build.add_argument("--language", default="en") |
| 110 | 110 | build.add_argument("--force-refresh-rerank", action="store_true") | ... | ... |
scripts/evaluation/eval_framework/constants.py
| ... | ... | @@ -54,12 +54,12 @@ DEFAULT_REBUILD_MIN_LLM_BATCHES = 10 |
| 54 | 54 | DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 |
| 55 | 55 | |
| 56 | 56 | # LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed): |
| 57 | -# A batch is "bad" when it has **no** ``Exact Match`` label AND either: | |
| 58 | -# - irrelevant_ratio >= DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, or | |
| 59 | -# - (Irrelevant + Low Relevant) / n >= DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO. | |
| 57 | +# A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``): | |
| 58 | +# - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%), | |
| 59 | +# - (Irrelevant + Low Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%). | |
| 60 | 60 | # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant"). |
| 61 | -# If a batch is bad, increment a streak; otherwise reset streak to 0. Stop when streak reaches | |
| 62 | -# ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (consecutive bad batches). | |
| 63 | -DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.94 | |
| 64 | -DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.96 | |
| 65 | -DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 2 | |
| 61 | +# Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak | |
| 62 | +# reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). | |
| 63 | +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939 | |
| 64 | +DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959 | |
| 65 | +DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3 | ... | ... |
scripts/evaluation/eval_framework/framework.py
| ... | ... | @@ -337,14 +337,14 @@ class SearchEvaluationFramework: |
| 337 | 337 | |
| 338 | 338 | Per batch, let *n* = batch size, and count labels among docs in that batch only. |
| 339 | 339 | |
| 340 | - - *bad batch* iff there is **no** ``Exact Match`` in the batch **and** at least one of: | |
| 340 | + - *bad batch* iff **both** (strict ``>``): | |
| 341 | 341 | |
| 342 | - - ``irrelevant_ratio = #(Irrelevant)/n >= irrelevant_stop_ratio`` (default 0.94), or | |
| 343 | - - ``( #(Irrelevant) + #(Low Relevant) ) / n >= irrelevant_low_combined_stop_ratio`` | |
| 344 | - (default 0.96; weak relevance = ``RELEVANCE_LOW``). | |
| 342 | + - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and | |
| 343 | + - ``( #(Irrelevant) + #(Low Relevant) ) / n > irrelevant_low_combined_stop_ratio`` | |
| 344 | + (default 0.959; weak relevance = ``RELEVANCE_LOW``). | |
| 345 | 345 | |
| 346 | 346 | Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. |
| 347 | - Stop labeling when ``streak >= stop_streak`` (default 2) or when ``max_batches`` is reached | |
| 347 | + Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached | |
| 348 | 348 | or the ordered list is exhausted. |
| 349 | 349 | |
| 350 | 350 | Constants for defaults: ``eval_framework.constants`` (``DEFAULT_REBUILD_*``). |
| ... | ... | @@ -401,12 +401,9 @@ class SearchEvaluationFramework: |
| 401 | 401 | |
| 402 | 402 | # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality). |
| 403 | 403 | if batch_idx + 1 >= min_batches: |
| 404 | - no_exact = exact_n == 0 | |
| 405 | - # Branch 1: high Irrelevant share, no Exact in this batch. | |
| 406 | - heavy_irrel = irrelevant_ratio >= irrelevant_stop_ratio | |
| 407 | - # Branch 2: Irrelevant + Low Relevant combined share, still no Exact. | |
| 408 | - heavy_irrel_low = irrel_low_ratio >= irrelevant_low_combined_stop_ratio | |
| 409 | - bad_batch = no_exact and (heavy_irrel or heavy_irrel_low) | |
| 404 | + bad_batch = (irrelevant_ratio > irrelevant_stop_ratio) and ( | |
| 405 | + irrel_low_ratio > irrelevant_low_combined_stop_ratio | |
| 406 | + ) | |
| 410 | 407 | if bad_batch: |
| 411 | 408 | streak += 1 |
| 412 | 409 | else: |
| ... | ... | @@ -414,8 +411,8 @@ class SearchEvaluationFramework: |
| 414 | 411 | if streak >= stop_streak: |
| 415 | 412 | print( |
| 416 | 413 | f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches " |
| 417 | - f"({stop_streak} consecutive batches: no Exact and " | |
| 418 | - f"(irrelevant>={irrelevant_stop_ratio} or irrel+low>={irrelevant_low_combined_stop_ratio}))", | |
| 414 | + f"({stop_streak} consecutive batches: irrelevant>{irrelevant_stop_ratio} " | |
| 415 | + f"and irrel+low>{irrelevant_low_combined_stop_ratio})", | |
| 419 | 416 | flush=True, |
| 420 | 417 | ) |
| 421 | 418 | break | ... | ... |