Commit 35ae3b29029f0253836699d1f94c121c5933aa5f
1 parent
dedd31c5
批量评估框架,召回参数修改和llm评估终止条件优化
Showing
4 changed files
with
27 additions
and
30 deletions
Show diff stats
scripts/evaluation/README.md
| @@ -87,14 +87,14 @@ For **each** query in `queries.txt`, in order: | @@ -87,14 +87,14 @@ For **each** query in `queries.txt`, in order: | ||
| 87 | **Early stop** (defaults in `eval_framework.constants`; overridable via CLI): | 87 | **Early stop** (defaults in `eval_framework.constants`; overridable via CLI): |
| 88 | 88 | ||
| 89 | - Run **at least** `--rebuild-min-batches` batches (**10** by default) before any early stop is allowed. | 89 | - Run **at least** `--rebuild-min-batches` batches (**10** by default) before any early stop is allowed. |
| 90 | - - After that, define a **bad batch** as one where the batch has **no** **Exact Match** label **and** either: | ||
| 91 | - - **Irrelevant** proportion **≥ 0.94** (`--rebuild-irrelevant-stop-ratio`), or | ||
| 92 | - - **(Irrelevant + Low Relevant)** proportion **≥ 0.96** (`--rebuild-irrel-low-combined-stop-ratio`). | ||
| 93 | - (“Low Relevant” is the weak tier; **High Relevant** does not count toward this combined ratio.) | 90 | + - After that, a **bad batch** is one where **both** are true (strict **>**): |
| 91 | + - **Irrelevant** proportion **> 93.9%** (`--rebuild-irrelevant-stop-ratio`, default `0.939`), and | ||
| 92 | + - **(Irrelevant + Low Relevant)** proportion **> 95.9%** (`--rebuild-irrel-low-combined-stop-ratio`, default `0.959`). | ||
| 93 | + (“Low Relevant” is the weak tier; **High Relevant** and **Exact Match** do not count toward this sum.) | ||
| 94 | - Count **consecutive** bad batches. **Reset** the count to 0 on any batch that is not bad. | 94 | - Count **consecutive** bad batches. **Reset** the count to 0 on any batch that is not bad. |
| 95 | - - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**2** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size). | 95 | + - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**3** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size). |
| 96 | 96 | ||
| 97 | - So labeling follows best-first order but **stops early** when the model sees two consecutive “dead” batches; the tail may never be judged. | 97 | + So labeling follows best-first order but **stops early** after **three** consecutive bad batches (each exceeding both the Irrelevant and the Irrelevant + Low Relevant thresholds); the tail may never be judged. |
| 98 | 98 | ||
| 99 | **Incremental pool (no full rebuild):** `build_annotation_set.py build` **without** `--force-refresh-labels` uses the older windowed pool (`--annotate-search-top-k`, `--annotate-rerank-top-k`) and fills missing labels in one pass—no rerank-skip rule and no LLM early-stop loop. | 99 | **Incremental pool (no full rebuild):** `build_annotation_set.py build` **without** `--force-refresh-labels` uses the older windowed pool (`--annotate-search-top-k`, `--annotate-rerank-top-k`) and fills missing labels in one pass—no rerank-skip rule and no LLM early-stop loop. |
| 100 | 100 |
scripts/evaluation/eval_framework/cli.py
| @@ -92,19 +92,19 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -92,19 +92,19 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 92 | "--rebuild-irrelevant-stop-ratio", | 92 | "--rebuild-irrelevant-stop-ratio", |
| 93 | type=float, | 93 | type=float, |
| 94 | default=None, | 94 | default=None, |
| 95 | - help="Rebuild only: irrelevant-only branch threshold (>=) for early-stop streak, requires no Exact (default 0.94).", | 95 | + help="Rebuild only: bad batch requires irrelevant_ratio > this (default 0.939).", |
| 96 | ) | 96 | ) |
| 97 | build.add_argument( | 97 | build.add_argument( |
| 98 | "--rebuild-irrel-low-combined-stop-ratio", | 98 | "--rebuild-irrel-low-combined-stop-ratio", |
| 99 | type=float, | 99 | type=float, |
| 100 | default=None, | 100 | default=None, |
| 101 | - help="Rebuild only: (irrelevant+low)/n threshold (>=) for early-stop streak, requires no Exact (default 0.96).", | 101 | + help="Rebuild only: bad batch requires (irrelevant+low)/n > this (default 0.959).", |
| 102 | ) | 102 | ) |
| 103 | build.add_argument( | 103 | build.add_argument( |
| 104 | "--rebuild-irrelevant-stop-streak", | 104 | "--rebuild-irrelevant-stop-streak", |
| 105 | type=int, | 105 | type=int, |
| 106 | default=None, | 106 | default=None, |
| 107 | - help="Rebuild only: consecutive bad batches before early stop (default 2).", | 107 | + help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).", |
| 108 | ) | 108 | ) |
| 109 | build.add_argument("--language", default="en") | 109 | build.add_argument("--language", default="en") |
| 110 | build.add_argument("--force-refresh-rerank", action="store_true") | 110 | build.add_argument("--force-refresh-rerank", action="store_true") |
scripts/evaluation/eval_framework/constants.py
| @@ -54,12 +54,12 @@ DEFAULT_REBUILD_MIN_LLM_BATCHES = 10 | @@ -54,12 +54,12 @@ DEFAULT_REBUILD_MIN_LLM_BATCHES = 10 | ||
| 54 | DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 | 54 | DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 |
| 55 | 55 | ||
| 56 | # LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed): | 56 | # LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed): |
| 57 | -# A batch is "bad" when it has **no** ``Exact Match`` label AND either: | ||
| 58 | -# - irrelevant_ratio >= DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, or | ||
| 59 | -# - (Irrelevant + Low Relevant) / n >= DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO. | 57 | +# A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``): |
| 58 | +# - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%), | ||
| 59 | +# - (Irrelevant + Low Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%). | ||
| 60 | # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant"). | 60 | # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant"). |
| 61 | -# If a batch is bad, increment a streak; otherwise reset streak to 0. Stop when streak reaches | ||
| 62 | -# ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (consecutive bad batches). | ||
| 63 | -DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.94 | ||
| 64 | -DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.96 | ||
| 65 | -DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 2 | 61 | +# Each bad batch increments the streak; any non-bad batch resets it to 0. Stop when streak |
| 62 | +# reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). | ||
| 63 | +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939 | ||
| 64 | +DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959 | ||
| 65 | +DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3 |
scripts/evaluation/eval_framework/framework.py
| @@ -337,14 +337,14 @@ class SearchEvaluationFramework: | @@ -337,14 +337,14 @@ class SearchEvaluationFramework: | ||
| 337 | 337 | ||
| 338 | Per batch, let *n* = batch size, and count labels among docs in that batch only. | 338 | Per batch, let *n* = batch size, and count labels among docs in that batch only. |
| 339 | 339 | ||
| 340 | - - *bad batch* iff there is **no** ``Exact Match`` in the batch **and** at least one of: | 340 | + - *bad batch* iff **both** (strict ``>``): |
| 341 | 341 | ||
| 342 | - - ``irrelevant_ratio = #(Irrelevant)/n >= irrelevant_stop_ratio`` (default 0.94), or | ||
| 343 | - - ``( #(Irrelevant) + #(Low Relevant) ) / n >= irrelevant_low_combined_stop_ratio`` | ||
| 344 | - (default 0.96; weak relevance = ``RELEVANCE_LOW``). | 342 | + - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and |
| 343 | + - ``( #(Irrelevant) + #(Low Relevant) ) / n > irrelevant_low_combined_stop_ratio`` | ||
| 344 | + (default 0.959; weak relevance = ``RELEVANCE_LOW``). | ||
| 345 | 345 | ||
| 346 | Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. | 346 | Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. |
| 347 | - Stop labeling when ``streak >= stop_streak`` (default 2) or when ``max_batches`` is reached | 347 | + Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached |
| 348 | or the ordered list is exhausted. | 348 | or the ordered list is exhausted. |
| 349 | 349 | ||
| 350 | Constants for defaults: ``eval_framework.constants`` (``DEFAULT_REBUILD_*``). | 350 | Constants for defaults: ``eval_framework.constants`` (``DEFAULT_REBUILD_*``). |
| @@ -401,12 +401,9 @@ class SearchEvaluationFramework: | @@ -401,12 +401,9 @@ class SearchEvaluationFramework: | ||
| 401 | 401 | ||
| 402 | # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality). | 402 | # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality). |
| 403 | if batch_idx + 1 >= min_batches: | 403 | if batch_idx + 1 >= min_batches: |
| 404 | - no_exact = exact_n == 0 | ||
| 405 | - # Branch 1: high Irrelevant share, no Exact in this batch. | ||
| 406 | - heavy_irrel = irrelevant_ratio >= irrelevant_stop_ratio | ||
| 407 | - # Branch 2: Irrelevant + Low Relevant combined share, still no Exact. | ||
| 408 | - heavy_irrel_low = irrel_low_ratio >= irrelevant_low_combined_stop_ratio | ||
| 409 | - bad_batch = no_exact and (heavy_irrel or heavy_irrel_low) | 404 | + bad_batch = (irrelevant_ratio > irrelevant_stop_ratio) and ( |
| 405 | + irrel_low_ratio > irrelevant_low_combined_stop_ratio | ||
| 406 | + ) | ||
| 410 | if bad_batch: | 407 | if bad_batch: |
| 411 | streak += 1 | 408 | streak += 1 |
| 412 | else: | 409 | else: |
| @@ -414,8 +411,8 @@ class SearchEvaluationFramework: | @@ -414,8 +411,8 @@ class SearchEvaluationFramework: | ||
| 414 | if streak >= stop_streak: | 411 | if streak >= stop_streak: |
| 415 | print( | 412 | print( |
| 416 | f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches " | 413 | f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches " |
| 417 | - f"({stop_streak} consecutive batches: no Exact and " | ||
| 418 | - f"(irrelevant>={irrelevant_stop_ratio} or irrel+low>={irrelevant_low_combined_stop_ratio}))", | 414 | + f"({stop_streak} consecutive batches: irrelevant>{irrelevant_stop_ratio} " |
| 415 | + f"and irrel+low>{irrelevant_low_combined_stop_ratio})", | ||
| 419 | flush=True, | 416 | flush=True, |
| 420 | ) | 417 | ) |
| 421 | break | 418 | break |