Commit 35ae3b29029f0253836699d1f94c121c5933aa5f

Authored by tangwang
1 parent dedd31c5

批量评估框架,召回参数修改和llm评估终止条件优化

scripts/evaluation/README.md
@@ -87,14 +87,14 @@ For **each** query in `queries.txt`, in order: @@ -87,14 +87,14 @@ For **each** query in `queries.txt`, in order:
87 **Early stop** (defaults in `eval_framework.constants`; overridable via CLI): 87 **Early stop** (defaults in `eval_framework.constants`; overridable via CLI):
88 88
89 - Run **at least** `--rebuild-min-batches` batches (**10** by default) before any early stop is allowed. 89 - Run **at least** `--rebuild-min-batches` batches (**10** by default) before any early stop is allowed.
90 - - After that, define a **bad batch** as one where the batch has **no** **Exact Match** label **and** either:  
91 - - **Irrelevant** proportion **≥ 0.94** (`--rebuild-irrelevant-stop-ratio`), or  
92 - - **(Irrelevant + Low Relevant)** proportion **≥ 0.96** (`--rebuild-irrel-low-combined-stop-ratio`).  
93 - (“Low Relevant” is the weak tier; **High Relevant** does not count toward this combined ratio.) 90 + - After that, a **bad batch** is one where **both** are true (strict **>**):
  91 + - **Irrelevant** proportion **> 93.9%** (`--rebuild-irrelevant-stop-ratio`, default `0.939`), and
  92 + - **(Irrelevant + Low Relevant)** proportion **> 95.9%** (`--rebuild-irrel-low-combined-stop-ratio`, default `0.959`).
  93 + (“Low Relevant” is the weak tier; **High Relevant** and **Exact Match** do not enter this sum.)
94 - Count **consecutive** bad batches. **Reset** the count to 0 on any batch that is not bad. 94 - Count **consecutive** bad batches. **Reset** the count to 0 on any batch that is not bad.
95 - - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**2** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size). 95 + - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**3** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size).
96 96
97 - So labeling follows best-first order but **stops early** when the model sees two consecutive “dead” batches; the tail may never be judged. 97 + So labeling follows best-first order but **stops early** after **three** (by default) consecutive batches that each exceed both the Irrelevant and the Irrelevant+Low thresholds; the tail may never be judged.
98 98
99 **Incremental pool (no full rebuild):** `build_annotation_set.py build` **without** `--force-refresh-labels` uses the older windowed pool (`--annotate-search-top-k`, `--annotate-rerank-top-k`) and fills missing labels in one pass—no rerank-skip rule and no LLM early-stop loop. 99 **Incremental pool (no full rebuild):** `build_annotation_set.py build` **without** `--force-refresh-labels` uses the older windowed pool (`--annotate-search-top-k`, `--annotate-rerank-top-k`) and fills missing labels in one pass—no rerank-skip rule and no LLM early-stop loop.
100 100
scripts/evaluation/eval_framework/cli.py
@@ -92,19 +92,19 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -92,19 +92,19 @@ def build_cli_parser() -> argparse.ArgumentParser:
92 "--rebuild-irrelevant-stop-ratio", 92 "--rebuild-irrelevant-stop-ratio",
93 type=float, 93 type=float,
94 default=None, 94 default=None,
95 - help="Rebuild only: irrelevant-only branch threshold (>=) for early-stop streak, requires no Exact (default 0.94).", 95 + help="Rebuild only: bad batch requires irrelevant_ratio > this (default 0.939).",
96 ) 96 )
97 build.add_argument( 97 build.add_argument(
98 "--rebuild-irrel-low-combined-stop-ratio", 98 "--rebuild-irrel-low-combined-stop-ratio",
99 type=float, 99 type=float,
100 default=None, 100 default=None,
101 - help="Rebuild only: (irrelevant+low)/n threshold (>=) for early-stop streak, requires no Exact (default 0.96).", 101 + help="Rebuild only: bad batch requires (irrelevant+low)/n > this (default 0.959).",
102 ) 102 )
103 build.add_argument( 103 build.add_argument(
104 "--rebuild-irrelevant-stop-streak", 104 "--rebuild-irrelevant-stop-streak",
105 type=int, 105 type=int,
106 default=None, 106 default=None,
107 - help="Rebuild only: consecutive bad batches before early stop (default 2).", 107 + help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).",
108 ) 108 )
109 build.add_argument("--language", default="en") 109 build.add_argument("--language", default="en")
110 build.add_argument("--force-refresh-rerank", action="store_true") 110 build.add_argument("--force-refresh-rerank", action="store_true")
scripts/evaluation/eval_framework/constants.py
@@ -54,12 +54,12 @@ DEFAULT_REBUILD_MIN_LLM_BATCHES = 10 @@ -54,12 +54,12 @@ DEFAULT_REBUILD_MIN_LLM_BATCHES = 10
54 DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 54 DEFAULT_REBUILD_MAX_LLM_BATCHES = 40
55 55
56 # LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed): 56 # LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed):
57 -# A batch is "bad" when it has **no** ``Exact Match`` label AND either:  
58 -# - irrelevant_ratio >= DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, or  
59 -# - (Irrelevant + Low Relevant) / n >= DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO. 57 +# A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``):
  58 +# - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%),
  59 +# - (Irrelevant + Low Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%).
60 # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant"). 60 # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant").
61 -# If a batch is bad, increment a streak; otherwise reset streak to 0. Stop when streak reaches  
62 -# ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (consecutive bad batches).  
63 -DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.94  
64 -DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.96  
65 -DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 2 61 +# Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak
  62 +# reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
  63 +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939
  64 +DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959
  65 +DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3
scripts/evaluation/eval_framework/framework.py
@@ -337,14 +337,14 @@ class SearchEvaluationFramework: @@ -337,14 +337,14 @@ class SearchEvaluationFramework:
337 337
338 Per batch, let *n* = batch size, and count labels among docs in that batch only. 338 Per batch, let *n* = batch size, and count labels among docs in that batch only.
339 339
340 - - *bad batch* iff there is **no** ``Exact Match`` in the batch **and** at least one of: 340 + - *bad batch* iff **both** (strict ``>``):
341 341
342 - - ``irrelevant_ratio = #(Irrelevant)/n >= irrelevant_stop_ratio`` (default 0.94), or  
343 - - ``( #(Irrelevant) + #(Low Relevant) ) / n >= irrelevant_low_combined_stop_ratio``  
344 - (default 0.96; weak relevance = ``RELEVANCE_LOW``). 342 + - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and
  343 + - ``( #(Irrelevant) + #(Low Relevant) ) / n > irrelevant_low_combined_stop_ratio``
  344 + (default 0.959; weak relevance = ``RELEVANCE_LOW``).
345 345
346 Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. 346 Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0.
347 - Stop labeling when ``streak >= stop_streak`` (default 2) or when ``max_batches`` is reached 347 + Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached
348 or the ordered list is exhausted. 348 or the ordered list is exhausted.
349 349
350 Constants for defaults: ``eval_framework.constants`` (``DEFAULT_REBUILD_*``). 350 Constants for defaults: ``eval_framework.constants`` (``DEFAULT_REBUILD_*``).
@@ -401,12 +401,9 @@ class SearchEvaluationFramework: @@ -401,12 +401,9 @@ class SearchEvaluationFramework:
401 401
402 # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality). 402 # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality).
403 if batch_idx + 1 >= min_batches: 403 if batch_idx + 1 >= min_batches:
404 - no_exact = exact_n == 0  
405 - # Branch 1: high Irrelevant share, no Exact in this batch.  
406 - heavy_irrel = irrelevant_ratio >= irrelevant_stop_ratio  
407 - # Branch 2: Irrelevant + Low Relevant combined share, still no Exact.  
408 - heavy_irrel_low = irrel_low_ratio >= irrelevant_low_combined_stop_ratio  
409 - bad_batch = no_exact and (heavy_irrel or heavy_irrel_low) 404 + bad_batch = (irrelevant_ratio > irrelevant_stop_ratio) and (
  405 + irrel_low_ratio > irrelevant_low_combined_stop_ratio
  406 + )
410 if bad_batch: 407 if bad_batch:
411 streak += 1 408 streak += 1
412 else: 409 else:
@@ -414,8 +411,8 @@ class SearchEvaluationFramework: @@ -414,8 +411,8 @@ class SearchEvaluationFramework:
414 if streak >= stop_streak: 411 if streak >= stop_streak:
415 print( 412 print(
416 f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches " 413 f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches "
417 - f"({stop_streak} consecutive batches: no Exact and "  
418 - f"(irrelevant>={irrelevant_stop_ratio} or irrel+low>={irrelevant_low_combined_stop_ratio}))", 414 + f"({stop_streak} consecutive batches: irrelevant>{irrelevant_stop_ratio} "
  415 + f"and irrel+low>{irrelevant_low_combined_stop_ratio})",
419 flush=True, 416 flush=True,
420 ) 417 )
421 break 418 break