批量评估框架，召回参数修改和llm评估终止条件优化

tangwang
1 parent dedd31c5
Showing 4 changed files with 27 additions and 30 deletions Show diff stats
scripts/evaluation/README.md
scripts/evaluation/eval_framework/cli.py
scripts/evaluation/eval_framework/constants.py
scripts/evaluation/eval_framework/framework.py
@@ -87,14 +87,14 @@ For **each** query in `queries.txt`, in order:
    **Early stop** (defaults in `eval_framework.constants`; overridable via CLI):
    - Run **at least** `--rebuild-min-batches` batches (**10** by default) before any early stop is allowed.
-   - After that, define a **bad batch** as one where the batch has **no** **Exact Match** label **and** either:
-     - **Irrelevant** proportion **≥ 0.94** (`--rebuild-irrelevant-stop-ratio`), or
-     - **(Irrelevant + Low Relevant)** proportion **≥ 0.96** (`--rebuild-irrel-low-combined-stop-ratio`).  
-       (“Low Relevant” is the weak tier; **High Relevant** does not count toward this combined ratio.)
+   - After that, a **bad batch** is one where **both** are true (strict **>**):
+     - **Irrelevant** proportion **> 93.9%** (`--rebuild-irrelevant-stop-ratio`, default `0.939`), and
+     - **(Irrelevant + Low Relevant)** proportion **> 95.9%** (`--rebuild-irrel-low-combined-stop-ratio`, default `0.959`).  
+       (“Low Relevant” is the weak tier; **High Relevant** and **Exact Match** do not count toward this sum.)
    - Count **consecutive** bad batches. **Reset** the count to 0 on any batch that is not bad.
-   - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**2** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size).
+   - **Stop** when the consecutive bad count reaches **`--rebuild-irrelevant-stop-streak`** (**3** by default), or when **`--rebuild-max-batches`** (**40**) is reached—whichever comes first (up to **2000** docs per query at default batch size).
-   So labeling follows best-first order but **stops early** when the model sees two consecutive “dead” batches; the tail may never be judged.
+   So labeling follows best-first order but **stops early** after **three** consecutive bad batches (each exceeding both the Irrelevant and the Irrelevant + Low Relevant thresholds); the tail may never be judged.
 **Incremental pool (no full rebuild):** `build_annotation_set.py build` **without** `--force-refresh-labels` uses the older windowed pool (`--annotate-search-top-k`, `--annotate-rerank-top-k`) and fills missing labels in one pass—no rerank-skip rule and no LLM early-stop loop.
@@ -92,19 +92,19 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
         "--rebuild-irrelevant-stop-ratio",
         type=float,
         default=None,
-        help="Rebuild only: irrelevant-only branch threshold (>=) for early-stop streak, requires no Exact (default 0.94).",
+        help="Rebuild only: bad batch requires irrelevant_ratio > this (default 0.939).",
     )
     build.add_argument(
         "--rebuild-irrel-low-combined-stop-ratio",
         type=float,
         default=None,
-        help="Rebuild only: (irrelevant+low)/n threshold (>=) for early-stop streak, requires no Exact (default 0.96).",
+        help="Rebuild only: bad batch requires (irrelevant+low)/n > this (default 0.959).",
     )
     build.add_argument(
         "--rebuild-irrelevant-stop-streak",
         type=int,
         default=None,
-        help="Rebuild only: consecutive bad batches before early stop (default 2).",
+        help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).",
     )
     build.add_argument("--language", default="en")
     build.add_argument("--force-refresh-rerank", action="store_true")
@@ -54,12 +54,12 @@ DEFAULT_REBUILD_MIN_LLM_BATCHES = 10
 DEFAULT_REBUILD_MAX_LLM_BATCHES = 40
 # LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed):
-# A batch is "bad" when it has **no** ``Exact Match`` label AND either:
-#   - irrelevant_ratio >= DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, or
-#   - (Irrelevant + Low Relevant) / n >= DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO.
+# A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``):
+#   - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO  (default 93.9%),
+#   - (Irrelevant + Low Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO  (default 95.9%).
 # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant").
-# If a batch is bad, increment a streak; otherwise reset streak to 0. Stop when streak reaches
-# ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (consecutive bad batches).
-DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.94
-DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.96
-DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 2
+# Increment the streak by one for each bad batch; reset it to 0 on any non-bad batch. Stop when the
+# streak reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
+DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939
+DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959
+DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3
@@ -337,14 +337,14 @@ class SearchEvaluationFramework:
         Per batch, let *n* = batch size, and count labels among docs in that batch only.
-        - *bad batch* iff there is **no** ``Exact Match`` in the batch **and** at least one of:
+        - *bad batch* iff **both** (strict ``>``):
-          - ``irrelevant_ratio = #(Irrelevant)/n >= irrelevant_stop_ratio`` (default 0.94), or
-          - ``( #(Irrelevant) + #(Low Relevant) ) / n >= irrelevant_low_combined_stop_ratio``
-            (default 0.96; weak relevance = ``RELEVANCE_LOW``).
+          - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and
+          - ``( #(Irrelevant) + #(Low Relevant) ) / n > irrelevant_low_combined_stop_ratio``
+            (default 0.959; weak relevance = ``RELEVANCE_LOW``).
         Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0.
-        Stop labeling when ``streak >= stop_streak`` (default 2) or when ``max_batches`` is reached
+        Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached
         or the ordered list is exhausted.
         Constants for defaults: ``eval_framework.constants`` (``DEFAULT_REBUILD_*``).
@@ -401,12 +401,9 @@ class SearchEvaluationFramework:
             # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality).
             if batch_idx + 1 >= min_batches:
-                no_exact = exact_n == 0
-                # Branch 1: high Irrelevant share, no Exact in this batch.
-                heavy_irrel = irrelevant_ratio >= irrelevant_stop_ratio
-                # Branch 2: Irrelevant + Low Relevant combined share, still no Exact.
-                heavy_irrel_low = irrel_low_ratio >= irrelevant_low_combined_stop_ratio
-                bad_batch = no_exact and (heavy_irrel or heavy_irrel_low)
+                bad_batch = (irrelevant_ratio > irrelevant_stop_ratio) and (
+                    irrel_low_ratio > irrelevant_low_combined_stop_ratio
+                )
                 if bad_batch:
                     streak += 1
                 else:
@@ -414,8 +411,8 @@ class SearchEvaluationFramework:
                 if streak >= stop_streak:
                     print(
                         f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches "
-                        f"({stop_streak} consecutive batches: no Exact and "
-                        f"(irrelevant>={irrelevant_stop_ratio} or irrel+low>={irrelevant_low_combined_stop_ratio}))",
+                        f"({stop_streak} consecutive batches: irrelevant>{irrelevant_stop_ratio} "
+                        f"and irrel+low>{irrelevant_low_combined_stop_ratio})",
                         flush=True,
                     )
                     break