Commit 35ae3b29029f0253836699d1f94c121c5933aa5f

Authored by tangwang
1 parent dedd31c5

批量评估框架,召回参数修改和llm评估终止条件优化

scripts/evaluation/README.md
... ... @@ -87,14 +87,14 @@ For **each** query in `queries.txt`, in order:
87 87 **Early stop** (defaults in `eval_framework.constants`; overridable via CLI):
88 88  
89 89 - Run **at least** `--rebuild-min-batches` batches (**10** by default) before any early stop is allowed.
90   - - After that, define a **bad batch** as one where the batch has **no** **Exact Match** label **and** either:
91   - - **Irrelevant** proportion **≥ 0.94** (`--rebuild-irrelevant-stop-ratio`), or
92   - - **(Irrelevant + Low Relevant)** proportion **≥ 0.96** (`--rebuild-irrel-low-combined-stop-ratio`).
93   - (“Low Relevant” is the weak tier; **High Relevant** does not count toward this combined ratio.)
  90 + - After that, a **bad batch** is one where **both** are true (strict **>**):
  91 + - **Irrelevant** proportion **> 93.9%** (`--rebuild-irrelevant-stop-ratio`, default `0.939`), and
  92 + - **(Irrelevant + Low Relevant)** proportion **> 95.9%** (`--rebuild-irrel-low-combined-stop-ratio`, default `0.959`).
  93 + (“Low Relevant” is the weak tier; **High Relevant** and **Exact Match** labels do not count toward this combined ratio.)
94 94 - Count **consecutive** bad batches. **Reset** the count to 0 on any batch that is not bad.
95   - - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**2** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size).
  95 + - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**3** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size).
96 96  
97   - So labeling follows best-first order but **stops early** when the model sees two consecutive “dead” batches; the tail may never be judged.
  97 + So labeling follows best-first order but **stops early** after **three** consecutive bad batches (by default), i.e. batches that exceed **both** the Irrelevant threshold and the combined Irrelevant + Low Relevant threshold; the tail may never be judged.
98 98  
99 99 **Incremental pool (no full rebuild):** `build_annotation_set.py build` **without** `--force-refresh-labels` uses the older windowed pool (`--annotate-search-top-k`, `--annotate-rerank-top-k`) and fills missing labels in one pass—no rerank-skip rule and no LLM early-stop loop.
100 100  
... ...
scripts/evaluation/eval_framework/cli.py
... ... @@ -92,19 +92,19 @@ def build_cli_parser() -> argparse.ArgumentParser:
92 92 "--rebuild-irrelevant-stop-ratio",
93 93 type=float,
94 94 default=None,
95   - help="Rebuild only: irrelevant-only branch threshold (>=) for early-stop streak, requires no Exact (default 0.94).",
  95 + help="Rebuild only: bad batch requires irrelevant_ratio > this (default 0.939).",
96 96 )
97 97 build.add_argument(
98 98 "--rebuild-irrel-low-combined-stop-ratio",
99 99 type=float,
100 100 default=None,
101   - help="Rebuild only: (irrelevant+low)/n threshold (>=) for early-stop streak, requires no Exact (default 0.96).",
  101 + help="Rebuild only: bad batch requires (irrelevant+low)/n > this (default 0.959).",
102 102 )
103 103 build.add_argument(
104 104 "--rebuild-irrelevant-stop-streak",
105 105 type=int,
106 106 default=None,
107   - help="Rebuild only: consecutive bad batches before early stop (default 2).",
  107 + help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).",
108 108 )
109 109 build.add_argument("--language", default="en")
110 110 build.add_argument("--force-refresh-rerank", action="store_true")
... ...
scripts/evaluation/eval_framework/constants.py
... ... @@ -54,12 +54,12 @@ DEFAULT_REBUILD_MIN_LLM_BATCHES = 10
54 54 DEFAULT_REBUILD_MAX_LLM_BATCHES = 40
55 55  
56 56 # LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed):
57   -# A batch is "bad" when it has **no** ``Exact Match`` label AND either:
58   -# - irrelevant_ratio >= DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, or
59   -# - (Irrelevant + Low Relevant) / n >= DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO.
  57 +# A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``):
  58 +# - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%),
  59 +# - (Irrelevant + Low Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%).
60 60 # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant").
61   -# If a batch is bad, increment a streak; otherwise reset streak to 0. Stop when streak reaches
62   -# ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (consecutive bad batches).
63   -DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.94
64   -DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.96
65   -DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 2
  61 +# Each bad batch increments the streak; any non-bad batch resets it to 0. Stop when streak
  62 +# reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
  63 +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939
  64 +DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959
  65 +DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3
... ...
scripts/evaluation/eval_framework/framework.py
... ... @@ -337,14 +337,14 @@ class SearchEvaluationFramework:
337 337  
338 338 Per batch, let *n* = batch size, and count labels among docs in that batch only.
339 339  
340   - - *bad batch* iff there is **no** ``Exact Match`` in the batch **and** at least one of:
  340 + - *bad batch* iff **both** (strict ``>``):
341 341  
342   - - ``irrelevant_ratio = #(Irrelevant)/n >= irrelevant_stop_ratio`` (default 0.94), or
343   - - ``( #(Irrelevant) + #(Low Relevant) ) / n >= irrelevant_low_combined_stop_ratio``
344   - (default 0.96; weak relevance = ``RELEVANCE_LOW``).
  342 + - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and
  343 + - ``( #(Irrelevant) + #(Low Relevant) ) / n > irrelevant_low_combined_stop_ratio``
  344 + (default 0.959; weak relevance = ``RELEVANCE_LOW``).
345 345  
346 346 Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0.
347   - Stop labeling when ``streak >= stop_streak`` (default 2) or when ``max_batches`` is reached
  347 + Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached
348 348 or the ordered list is exhausted.
349 349  
350 350 Constants for defaults: ``eval_framework.constants`` (``DEFAULT_REBUILD_*``).
... ... @@ -401,12 +401,9 @@ class SearchEvaluationFramework:
401 401  
402 402 # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality).
403 403 if batch_idx + 1 >= min_batches:
404   - no_exact = exact_n == 0
405   - # Branch 1: high Irrelevant share, no Exact in this batch.
406   - heavy_irrel = irrelevant_ratio >= irrelevant_stop_ratio
407   - # Branch 2: Irrelevant + Low Relevant combined share, still no Exact.
408   - heavy_irrel_low = irrel_low_ratio >= irrelevant_low_combined_stop_ratio
409   - bad_batch = no_exact and (heavy_irrel or heavy_irrel_low)
  404 + bad_batch = (irrelevant_ratio > irrelevant_stop_ratio) and (
  405 + irrel_low_ratio > irrelevant_low_combined_stop_ratio
  406 + )
410 407 if bad_batch:
411 408 streak += 1
412 409 else:
... ... @@ -414,8 +411,8 @@ class SearchEvaluationFramework:
414 411 if streak >= stop_streak:
415 412 print(
416 413 f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches "
417   - f"({stop_streak} consecutive batches: no Exact and "
418   - f"(irrelevant>={irrelevant_stop_ratio} or irrel+low>={irrelevant_low_combined_stop_ratio}))",
  414 + f"({stop_streak} consecutive batches: irrelevant>{irrelevant_stop_ratio} "
  415 + f"and irrel+low>{irrelevant_low_combined_stop_ratio})",
419 416 flush=True,
420 417 )
421 418 break
... ...