From 35ae3b29029f0253836699d1f94c121c5933aa5f Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 1 Apr 2026 11:47:33 +0800 Subject: [PATCH] 批量评估框架,召回参数修改和llm评估终止条件优化 --- scripts/evaluation/README.md | 12 ++++++------ scripts/evaluation/eval_framework/cli.py | 6 +++--- scripts/evaluation/eval_framework/constants.py | 16 ++++++++-------- scripts/evaluation/eval_framework/framework.py | 23 ++++++++++------------- 4 files changed, 27 insertions(+), 30 deletions(-) diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index 411936e..9beb859 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -87,14 +87,14 @@ For **each** query in `queries.txt`, in order: **Early stop** (defaults in `eval_framework.constants`; overridable via CLI): - Run **at least** `--rebuild-min-batches` batches (**10** by default) before any early stop is allowed. - - After that, define a **bad batch** as one where the batch has **no** **Exact Match** label **and** either: - - **Irrelevant** proportion **≥ 0.94** (`--rebuild-irrelevant-stop-ratio`), or - - **(Irrelevant + Low Relevant)** proportion **≥ 0.96** (`--rebuild-irrel-low-combined-stop-ratio`). - (“Low Relevant” is the weak tier; **High Relevant** does not count toward this combined ratio.) + - After that, a **bad batch** is one where **both** are true (strict **>**): + - **Irrelevant** proportion **> 93.9%** (`--rebuild-irrelevant-stop-ratio`, default `0.939`), and + - **(Irrelevant + Low Relevant)** proportion **> 95.9%** (`--rebuild-irrel-low-combined-stop-ratio`, default `0.959`). + (“Low Relevant” is the weak tier; **High Relevant** and **Exact** do not enter this sum.) - Count **consecutive** bad batches. **Reset** the count to 0 on any batch that is not bad. - - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**2** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size). + - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**3** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size). - So labeling follows best-first order but **stops early** when the model sees two consecutive “dead” batches; the tail may never be judged. + So labeling follows best-first order but **stops early** after **three** consecutive batches that are overwhelmingly Irrelevant and Irrelevant+Low; the tail may never be judged. **Incremental pool (no full rebuild):** `build_annotation_set.py build` **without** `--force-refresh-labels` uses the older windowed pool (`--annotate-search-top-k`, `--annotate-rerank-top-k`) and fills missing labels in one pass—no rerank-skip rule and no LLM early-stop loop. diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py index 6113beb..b5a3486 100644 --- a/scripts/evaluation/eval_framework/cli.py +++ b/scripts/evaluation/eval_framework/cli.py @@ -92,19 +92,19 @@ def build_cli_parser() -> argparse.ArgumentParser: "--rebuild-irrelevant-stop-ratio", type=float, default=None, - help="Rebuild only: irrelevant-only branch threshold (>=) for early-stop streak, requires no Exact (default 0.94).", + help="Rebuild only: bad batch requires irrelevant_ratio > this (default 0.939).", ) build.add_argument( "--rebuild-irrel-low-combined-stop-ratio", type=float, default=None, - help="Rebuild only: (irrelevant+low)/n threshold (>=) for early-stop streak, requires no Exact (default 0.96).", + help="Rebuild only: bad batch requires (irrelevant+low)/n > this (default 0.959).", ) build.add_argument( "--rebuild-irrelevant-stop-streak", type=int, default=None, - help="Rebuild only: consecutive bad batches before early stop (default 2).", + help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).", ) build.add_argument("--language", default="en") build.add_argument("--force-refresh-rerank", action="store_true") diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py index a701a47..4c54b38 100644 --- a/scripts/evaluation/eval_framework/constants.py +++ b/scripts/evaluation/eval_framework/constants.py @@ -54,12 +54,12 @@ DEFAULT_REBUILD_MIN_LLM_BATCHES = 10 DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 # LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed): -# A batch is "bad" when it has **no** ``Exact Match`` label AND either: -# - irrelevant_ratio >= DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, or -# - (Irrelevant + Low Relevant) / n >= DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO. +# A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``): +# - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%), +# - (Irrelevant + Low Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%). # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant"). -# If a batch is bad, increment a streak; otherwise reset streak to 0. Stop when streak reaches -# ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (consecutive bad batches). -DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.94 -DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.96 -DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 2 +# Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak +# reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939 +DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959 +DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3 diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index f7973a9..975f7dd 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -337,14 +337,14 @@ class SearchEvaluationFramework: Per batch, let *n* = batch size, and count labels among docs in that batch only. - - *bad batch* iff there is **no** ``Exact Match`` in the batch **and** at least one of: + - *bad batch* iff **both** (strict ``>``): - - ``irrelevant_ratio = #(Irrelevant)/n >= irrelevant_stop_ratio`` (default 0.94), or - - ``( #(Irrelevant) + #(Low Relevant) ) / n >= irrelevant_low_combined_stop_ratio`` - (default 0.96; weak relevance = ``RELEVANCE_LOW``). + - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and + - ``( #(Irrelevant) + #(Low Relevant) ) / n > irrelevant_low_combined_stop_ratio`` + (default 0.959; weak relevance = ``RELEVANCE_LOW``). Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. - Stop labeling when ``streak >= stop_streak`` (default 2) or when ``max_batches`` is reached + Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached or the ordered list is exhausted. Constants for defaults: ``eval_framework.constants`` (``DEFAULT_REBUILD_*``). @@ -401,12 +401,9 @@ class SearchEvaluationFramework: # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality). if batch_idx + 1 >= min_batches: - no_exact = exact_n == 0 - # Branch 1: high Irrelevant share, no Exact in this batch. - heavy_irrel = irrelevant_ratio >= irrelevant_stop_ratio - # Branch 2: Irrelevant + Low Relevant combined share, still no Exact. - heavy_irrel_low = irrel_low_ratio >= irrelevant_low_combined_stop_ratio - bad_batch = no_exact and (heavy_irrel or heavy_irrel_low) + bad_batch = (irrelevant_ratio > irrelevant_stop_ratio) and ( + irrel_low_ratio > irrelevant_low_combined_stop_ratio + ) if bad_batch: streak += 1 else: @@ -414,8 +411,8 @@ class SearchEvaluationFramework: if streak >= stop_streak: print( f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches " - f"({stop_streak} consecutive batches: no Exact and " - f"(irrelevant>={irrelevant_stop_ratio} or irrel+low>={irrelevant_low_combined_stop_ratio}))", + f"({stop_streak} consecutive batches: irrelevant>{irrelevant_stop_ratio} " + f"and irrel+low>{irrelevant_low_combined_stop_ratio})", flush=True, ) break -- libgit2 0.21.2