From d73ca84a48afc0945a533707c77ba3bbfaac9621 Mon Sep 17 00:00:00 2001
From: tangwang <tangwang@essa.top>
Date: Wed, 8 Apr 2026 15:39:47 +0800
Subject: [PATCH] refine eval case snapshots and rename relevance levels

---
 scripts/evaluation/README.md                         | 20 ++++++++++++++++++--
 scripts/evaluation/eval_framework/__init__.py        | 16 ++++++++--------
 scripts/evaluation/eval_framework/clients.py         | 11 +++++++++--
 scripts/evaluation/eval_framework/constants.py       | 32 ++++++++++++++++----------------
 scripts/evaluation/eval_framework/framework.py       | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
 scripts/evaluation/eval_framework/metrics.py         | 20 ++++++++++----------
 scripts/evaluation/eval_framework/reports.py         | 51 ++++++++++++++++++++++++++++++++++++++++++---------
 scripts/evaluation/eval_framework/static/eval_web.js |  2 +-
 scripts/evaluation/eval_framework/store.py           | 15 ++++++++++++++-
 scripts/evaluation/offline_ltr_fit.py                | 16 ++++++++--------
 10 files changed, 180 insertions(+), 83 deletions(-)

diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md
index c0fc339..c6233ed 100644
--- a/scripts/evaluation/README.md
+++ b/scripts/evaluation/README.md
@@ -127,8 +127,8 @@ This framework now follows graded ranking evaluation closer to e-commerce best p
 - **Composite tuning score: `Primary_Metric_Score`**
   For experiment ranking we compute the mean of the primary scorecard after normalizing `Avg_Grade@10` by the max grade (`3`).
 - **Gain scheme**
-  `Fully Relevant=7`, `Mostly Relevant=3`, `Weakly Relevant=1`, `Irrelevant=0`
-  The gains come from rel grades `3/2/1/0` with `gain = 2^rel - 1`, a standard `NDCG` setup.
+  `Fully Relevant=3`, `Mostly Relevant=2`, `Weakly Relevant=1`, `Irrelevant=0`
+  We keep the rel grades `3/2/1/0`, but the current implementation uses the grade values directly as gains (instead of `2^rel - 1`), so the gap between `Fully Relevant` and `Mostly Relevant` is less aggressive.
 - **Why this is better**
   `NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping an `Fully Relevant` with a `Weakly Relevant` item is penalized more than swapping `Mostly Relevant` with `Weakly Relevant`.
 
@@ -174,6 +174,22 @@ Features: query list from `queries.txt`, single-query and batch evaluation, batc
 
 Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`.
 
+To make later case analysis reproducible without digging through backend logs, each per-query record in the batch JSON now also includes:
+
+- `request_id` — the exact `X-Request-ID` sent by the evaluator for that live search call
+- `top_label_sequence_top10` / `top_label_sequence_top20` — compact label sequence strings in `rank:L<grade>` form, such as `1:L3 | 2:L1 | 3:L2` (a result with an unrecognized label is written as `rank:?`)
+- `top_results` — a lightweight snapshot of the first `min(top_k, 20)` results with `rank`, `spu_id`, `label`, title fields, and `relevance_score`
+
+The Markdown report now surfaces the same case context in a lighter human-readable form:
+
+- request id
+- top-10 / top-20 label sequence
+- top 5 result snapshot for quick scanning
+
+This means a bad case can usually be reconstructed directly from the batch artifact itself, without replaying logs or joining SQLite tables by hand.
+
+The web history endpoint intentionally returns a compact summary only (aggregate metrics plus query count), so adding richer per-query snapshots to the batch payload does not bloat the history list UI.
+
 ## Ranking debug and LTR prep
 
 `debug_info` now exposes two extra layers that are useful for tuning and future learning-to-rank work:
diff --git a/scripts/evaluation/eval_framework/__init__.py b/scripts/evaluation/eval_framework/__init__.py
index 074e558..c4335f4 100644
--- a/scripts/evaluation/eval_framework/__init__.py
+++ b/scripts/evaluation/eval_framework/__init__.py
@@ -14,10 +14,10 @@ from .constants import (  # noqa: E402
     DEFAULT_ARTIFACT_ROOT,
     DEFAULT_QUERY_FILE,
     PROJECT_ROOT,
-    RELEVANCE_EXACT,
-    RELEVANCE_HIGH,
-    RELEVANCE_IRRELEVANT,
-    RELEVANCE_LOW,
+    RELEVANCE_LV0,
+    RELEVANCE_LV1,
+    RELEVANCE_LV2,
+    RELEVANCE_LV3,
     RELEVANCE_NON_IRRELEVANT,
     VALID_LABELS,
 )
@@ -39,10 +39,10 @@ __all__ = [
     "EvalStore",
     "PROJECT_ROOT",
     "QueryBuildResult",
-    "RELEVANCE_EXACT",
-    "RELEVANCE_HIGH",
-    "RELEVANCE_IRRELEVANT",
-    "RELEVANCE_LOW",
+    "RELEVANCE_LV0",
+    "RELEVANCE_LV1",
+    "RELEVANCE_LV2",
+    "RELEVANCE_LV3",
     "RELEVANCE_NON_IRRELEVANT",
     "SearchEvaluationFramework",
     "VALID_LABELS",
diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py
index 95b230e..3ec4056 100644
--- a/scripts/evaluation/eval_framework/clients.py
+++ b/scripts/evaluation/eval_framework/clients.py
@@ -157,6 +157,7 @@ class SearchServiceClient:
         return self._request_json("GET", path, timeout=timeout)
 
     def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]:
+        request_id = uuid.uuid4().hex[:8]
         payload: Dict[str, Any] = {
             "query": query,
             "size": size,
@@ -165,13 +166,19 @@ class SearchServiceClient:
         }
         if debug:
             payload["debug"] = True
-        return self._request_json(
+        response = self._request_json(
             "POST",
             "/search/",
             timeout=120,
-            headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},
+            headers={
+                "Content-Type": "application/json",
+                "X-Tenant-ID": self.tenant_id,
+                "X-Request-ID": request_id,
+            },
             json_payload=payload,
         )
+        response["_eval_request_id"] = request_id
+        return response
 
 
 class RerankServiceClient:
diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py
index 3d1379e..04de982 100644
--- a/scripts/evaluation/eval_framework/constants.py
+++ b/scripts/evaluation/eval_framework/constants.py
@@ -7,24 +7,24 @@ _SCRIPTS_EVAL_DIR = _PKG_DIR.parent
 PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
 
 # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN)
-RELEVANCE_EXACT = "Fully Relevant"
-RELEVANCE_HIGH = "Mostly Relevant"
-RELEVANCE_LOW = "Weakly Relevant"
-RELEVANCE_IRRELEVANT = "Irrelevant"
+RELEVANCE_LV3 = "Fully Relevant"
+RELEVANCE_LV2 = "Mostly Relevant"
+RELEVANCE_LV1 = "Weakly Relevant"
+RELEVANCE_LV0 = "Irrelevant"
 
-VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT})
+VALID_LABELS = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1, RELEVANCE_LV0})
 
 # Useful label sets for binary diagnostic slices layered on top of graded ranking metrics.
-RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW})
-RELEVANCE_STRONG = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH})
+RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1})
+RELEVANCE_STRONG = frozenset({RELEVANCE_LV3, RELEVANCE_LV2})
 
 # Graded relevance for ranking evaluation.
 # We use rel grades 3/2/1/0 and gain = 2^rel - 1, which is standard for NDCG-style metrics.
 RELEVANCE_GRADE_MAP = {
-    RELEVANCE_EXACT: 3,
-    RELEVANCE_HIGH: 2,
-    RELEVANCE_LOW: 1,
-    RELEVANCE_IRRELEVANT: 0,
+    RELEVANCE_LV3: 3,
+    RELEVANCE_LV2: 2,
+    RELEVANCE_LV1: 1,
+    RELEVANCE_LV0: 0,
 }
 # 标准的gain计算方法：2^rel - 1
 # 但是是因为标注质量不是特别精确，因此适当降低 exact 和 high 的区分度
@@ -36,10 +36,10 @@ RELEVANCE_GAIN_MAP = {
 
 # P(stop | relevance) for ERR (Expected Reciprocal Rank); cascade model (Chapelle et al., 2009).
 STOP_PROB_MAP = {
-    RELEVANCE_EXACT: 0.99,
-    RELEVANCE_HIGH: 0.8,
-    RELEVANCE_LOW: 0.1,
-    RELEVANCE_IRRELEVANT: 0.0,
+    RELEVANCE_LV3: 0.99,
+    RELEVANCE_LV2: 0.8,
+    RELEVANCE_LV1: 0.1,
+    RELEVANCE_LV0: 0.0,
 }
 
 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
@@ -78,7 +78,7 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40
 # A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``):
 #   - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO  (default 93.9%),
 #   - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO  (default 95.9%).
-# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Weakly Relevant").
+# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LV1`` ("Weakly Relevant").
 # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak
 # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799
diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py
index 6b4688f..b68a60f 100644
--- a/scripts/evaluation/eval_framework/framework.py
+++ b/scripts/evaluation/eval_framework/framework.py
@@ -25,14 +25,14 @@ from .constants import (
     DEFAULT_RERANK_HIGH_SKIP_COUNT,
     DEFAULT_RERANK_HIGH_THRESHOLD,
     DEFAULT_SEARCH_RECALL_TOP_K,
-    RELEVANCE_EXACT,
     RELEVANCE_GAIN_MAP,
-    RELEVANCE_HIGH,
-    STOP_PROB_MAP,
-    RELEVANCE_IRRELEVANT,
-    RELEVANCE_LOW,
+    RELEVANCE_LV0,
+    RELEVANCE_LV1,
+    RELEVANCE_LV2,
+    RELEVANCE_LV3,
     RELEVANCE_NON_IRRELEVANT,
     VALID_LABELS,
+    STOP_PROB_MAP,
 )
 from .metrics import (
     PRIMARY_METRIC_GRADE_NORMALIZER,
@@ -96,6 +96,16 @@ def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]:
     return out
 
 
+def _encode_label_sequence(items: Sequence[Dict[str, Any]], limit: int) -> str:
+    """Encode the first ``limit`` items as ``rank:L<grade>`` tokens (``rank:?`` when the label is unknown)."""
+    from .constants import RELEVANCE_GRADE_MAP  # level code is the rel grade; the NDCG gain may diverge from it
+    parts: List[str] = []
+    for item in items[:limit]:
+        rank, grade = int(item.get("rank") or 0), RELEVANCE_GRADE_MAP.get(str(item.get("label") or ""))
+        parts.append(f"{rank}:L{grade}" if grade is not None else f"{rank}:?")
+    return " | ".join(parts)
+
+
 class SearchEvaluationFramework:
     def __init__(
         self,
@@ -168,7 +178,7 @@ class SearchEvaluationFramework:
     ) -> Dict[str, Any]:
         live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
         labels = [
-            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
+            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
             for item in live["results"]
         ]
         return {
@@ -432,7 +442,7 @@ class SearchEvaluationFramework:
 
           - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and
           - ``( #(Irrelevant) + #(Weakly Relevant) ) / n > irrelevant_low_combined_stop_ratio``
-            (default 0.959; weak relevance = ``RELEVANCE_LOW``).
+            (default 0.959; weak relevance = ``RELEVANCE_LV1``).
 
         Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0.
         Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached
@@ -474,9 +484,9 @@ class SearchEvaluationFramework:
             time.sleep(0.1)
 
             n = len(batch_docs)
-            exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_EXACT)
-            irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT)
-            low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LOW)
+            exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV3)
+            irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV0)
+            low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV1)
             exact_ratio = exact_n / n if n else 0.0
             irrelevant_ratio = irrel_n / n if n else 0.0
             low_ratio = low_n / n if n else 0.0
@@ -633,7 +643,7 @@ class SearchEvaluationFramework:
             )
 
         top100_labels = [
-            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
+            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
             for item in search_labeled_results[:100]
         ]
         metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values()))
@@ -843,7 +853,7 @@ class SearchEvaluationFramework:
             )
 
         top100_labels = [
-            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
+            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
             for item in search_labeled_results[:100]
         ]
         metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values()))
@@ -920,16 +930,17 @@ class SearchEvaluationFramework:
                     "title_zh": title_zh if title_zh and title_zh != primary_title else "",
                     "image_url": doc.get("image_url"),
                     "label": label,
+                    "relevance_score": doc.get("relevance_score"),
                     "option_values": list(compact_option_values(doc.get("skus") or [])),
                     "product": compact_product_payload(doc),
                 }
             )
         metric_labels = [
-            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
+            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
             for item in labeled
         ]
         ideal_labels = [
-            label if label in VALID_LABELS else RELEVANCE_IRRELEVANT
+            label if label in VALID_LABELS else RELEVANCE_LV0
             for label in labels.values()
         ]
         label_stats = self.store.get_query_label_stats(self.tenant_id, query)
@@ -960,10 +971,10 @@ class SearchEvaluationFramework:
                 }
             )
         label_order = {
-            RELEVANCE_EXACT: 0,
-            RELEVANCE_HIGH: 1,
-            RELEVANCE_LOW: 2,
-            RELEVANCE_IRRELEVANT: 3,
+            RELEVANCE_LV3: 0,
+            RELEVANCE_LV2: 1,
+            RELEVANCE_LV1: 2,
+            RELEVANCE_LV0: 3,
         }
         missing_relevant.sort(
             key=lambda item: (
@@ -989,6 +1000,7 @@ class SearchEvaluationFramework:
             "top_k": top_k,
             "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels),
             "metric_context": _metric_context_payload(),
+            "request_id": str(search_payload.get("_eval_request_id") or ""),
             "results": labeled,
             "missing_relevant": missing_relevant,
             "label_stats": {
@@ -996,9 +1008,9 @@ class SearchEvaluationFramework:
                 "unlabeled_hits_treated_irrelevant": unlabeled_hits,
                 "recalled_hits": len(labeled),
                 "missing_relevant_count": len(missing_relevant),
-                "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
-                "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH),
-                "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW),
+                "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV3),
+                "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV2),
+                "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV1),
             },
             "tips": tips,
             "total": int(search_payload.get("total") or 0),
@@ -1014,6 +1026,7 @@ class SearchEvaluationFramework:
         force_refresh_labels: bool = False,
     ) -> Dict[str, Any]:
         per_query = []
+        case_snapshot_top_n = min(max(int(top_k), 1), 20)
         total_q = len(queries)
         _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate)
         for q_index, query in enumerate(queries, start=1):
@@ -1025,7 +1038,7 @@ class SearchEvaluationFramework:
                 force_refresh_labels=force_refresh_labels,
             )
             labels = [
-                item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
+                item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
                 for item in live["results"]
             ]
             per_query.append(
@@ -1036,6 +1049,21 @@ class SearchEvaluationFramework:
                     "metrics": live["metrics"],
                     "distribution": label_distribution(labels),
                     "total": live["total"],
+                    "request_id": live.get("request_id") or "",
+                    "case_snapshot_top_n": case_snapshot_top_n,
+                    "top_label_sequence_top10": _encode_label_sequence(live["results"], 10),
+                    "top_label_sequence_top20": _encode_label_sequence(live["results"], case_snapshot_top_n),
+                    "top_results": [
+                        {
+                            "rank": int(item.get("rank") or 0),
+                            "spu_id": str(item.get("spu_id") or ""),
+                            "label": item.get("label"),
+                            "title": item.get("title"),
+                            "title_zh": item.get("title_zh"),
+                            "relevance_score": item.get("relevance_score"),
+                        }
+                        for item in live["results"][:case_snapshot_top_n]
+                    ],
                 }
             )
             m = live["metrics"]
@@ -1055,10 +1083,10 @@ class SearchEvaluationFramework:
             )
         aggregate = aggregate_metrics([item["metrics"] for item in per_query])
         aggregate_distribution = {
-            RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
-            RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query),
-            RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query),
-            RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
+            RELEVANCE_LV3: sum(item["distribution"][RELEVANCE_LV3] for item in per_query),
+            RELEVANCE_LV2: sum(item["distribution"][RELEVANCE_LV2] for item in per_query),
+            RELEVANCE_LV1: sum(item["distribution"][RELEVANCE_LV1] for item in per_query),
+            RELEVANCE_LV0: sum(item["distribution"][RELEVANCE_LV0] for item in per_query),
         }
         batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
         report_dir = ensure_dir(self.artifact_root / "batch_reports")
diff --git a/scripts/evaluation/eval_framework/metrics.py b/scripts/evaluation/eval_framework/metrics.py
index bc7d45a..93c77b6 100644
--- a/scripts/evaluation/eval_framework/metrics.py
+++ b/scripts/evaluation/eval_framework/metrics.py
@@ -6,12 +6,12 @@ import math
 from typing import Dict, Iterable, Sequence
 
 from .constants import (
-    RELEVANCE_EXACT,
     RELEVANCE_GAIN_MAP,
     RELEVANCE_GRADE_MAP,
-    RELEVANCE_HIGH,
-    RELEVANCE_IRRELEVANT,
-    RELEVANCE_LOW,
+    RELEVANCE_LV0,
+    RELEVANCE_LV1,
+    RELEVANCE_LV2,
+    RELEVANCE_LV3,
     RELEVANCE_NON_IRRELEVANT,
     RELEVANCE_STRONG,
     STOP_PROB_MAP,
@@ -33,7 +33,7 @@ PRIMARY_METRIC_GRADE_NORMALIZER = float(max(RELEVANCE_GRADE_MAP.values()) or 1.0
 def _normalize_label(label: str) -> str:
     if label in RELEVANCE_GRADE_MAP:
         return label
-    return RELEVANCE_IRRELEVANT
+    return RELEVANCE_LV0
 
 
 def _gains_for_labels(labels: Sequence[str]) -> list[float]:
@@ -135,7 +135,7 @@ def compute_query_metrics(
     ideal = list(ideal_labels) if ideal_labels is not None else list(labels)
     metrics: Dict[str, float] = {}
 
-    exact_hits = _binary_hits(labels, [RELEVANCE_EXACT])
+    exact_hits = _binary_hits(labels, [RELEVANCE_LV3])
     strong_hits = _binary_hits(labels, RELEVANCE_STRONG)
     useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT)
 
@@ -183,8 +183,8 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo
 
 def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
     return {
-        RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT),
-        RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH),
-        RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW),
-        RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT),
+        RELEVANCE_LV3: sum(1 for label in labels if label == RELEVANCE_LV3),
+        RELEVANCE_LV2: sum(1 for label in labels if label == RELEVANCE_LV2),
+        RELEVANCE_LV1: sum(1 for label in labels if label == RELEVANCE_LV1),
+        RELEVANCE_LV0: sum(1 for label in labels if label == RELEVANCE_LV0),
     }
diff --git a/scripts/evaluation/eval_framework/reports.py b/scripts/evaluation/eval_framework/reports.py
index 3c53352..7db2f0c 100644
--- a/scripts/evaluation/eval_framework/reports.py
+++ b/scripts/evaluation/eval_framework/reports.py
@@ -4,7 +4,7 @@ from __future__ import annotations
 
 from typing import Any, Dict
 
-from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW
+from .constants import RELEVANCE_GAIN_MAP, RELEVANCE_LV0, RELEVANCE_LV1, RELEVANCE_LV2, RELEVANCE_LV3
 from .metrics import PRIMARY_METRIC_KEYS
 
 
@@ -25,6 +25,38 @@ def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None:
         lines.append(f"- {key}: {value}")
 
 
+def _label_level_code(label: str) -> str:  # "L<grade>" for a known relevance label, "?" otherwise
+    from .constants import RELEVANCE_GRADE_MAP  # level code is the rel grade; the NDCG gain may diverge from it
+    return f"L{RELEVANCE_GRADE_MAP[label]}" if label in RELEVANCE_GRADE_MAP else "?"
+
+
+def _append_case_snapshot(lines: list[str], item: Dict[str, Any]) -> None:  # appends one per-query record's case context to ``lines`` in place
+    request_id = str(item.get("request_id") or "").strip()
+    if request_id:  # no bullet at all when the record carries no request id
+        lines.append(f"- Request ID: `{request_id}`")
+    seq10 = str(item.get("top_label_sequence_top10") or "").strip()
+    if seq10:
+        lines.append(f"- Top-10 Labels: `{seq10}`")
+    seq20 = str(item.get("top_label_sequence_top20") or "").strip()
+    if seq20 and seq20 != seq10:  # do not print the same sequence twice
+        lines.append(f"- Top-20 Labels: `{seq20}`")
+    top_results = item.get("top_results") or []
+    if not top_results:  # nothing to snapshot: leave out the "Case Snapshot" heading too
+        return
+    lines.append("- Case Snapshot:")
+    for result in top_results[:5]:  # the Markdown view shows at most the first five stored entries
+        rank = int(result.get("rank") or 0)
+        label = _label_level_code(str(result.get("label") or ""))  # compact level code instead of the full label text
+        spu_id = str(result.get("spu_id") or "")
+        title = str(result.get("title") or "")
+        title_zh = str(result.get("title_zh") or "")
+        relevance_score = result.get("relevance_score")
+        score_suffix = f" (rel={relevance_score})" if relevance_score not in (None, "") else ""  # a score of 0 is still printed; only None/"" hide it
+        lines.append(f"  - #{rank} [{label}] spu={spu_id} {title}{score_suffix}")
+        if title_zh:  # second, indented line only when a Chinese title is present
+            lines.append(f"    zh: {title_zh}")
+
+
 def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
     lines = [
         "# Search Batch Evaluation",
@@ -56,10 +88,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
                 "",
                 "## Label Distribution",
                 "",
-                f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}",
-                f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}",
-                f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}",
-                f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}",
+                f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}",
+                f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}",
+                f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}",
+                f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}",
             ]
         )
     lines.extend(["", "## Per Query", ""])
@@ -68,9 +100,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
         lines.append("")
         _append_metric_block(lines, item.get("metrics") or {})
         distribution = item.get("distribution") or {}
-        lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}")
-        lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}")
-        lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}")
-        lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}")
+        lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}")
+        lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}")
+        lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}")
+        lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}")
+        _append_case_snapshot(lines, item)
         lines.append("")
     return "\n".join(lines)
diff --git a/scripts/evaluation/eval_framework/static/eval_web.js b/scripts/evaluation/eval_framework/static/eval_web.js
index beaa4fa..3d298cd 100644
--- a/scripts/evaluation/eval_framework/static/eval_web.js
+++ b/scripts/evaluation/eval_framework/static/eval_web.js
@@ -190,7 +190,7 @@ async function loadQueries() {
 
 function historySummaryHtml(meta) {
   const m = meta && meta.aggregate_metrics;
-  const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
+  const nq = (meta && meta.query_count) || (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
   const parts = [];
   if (nq != null) parts.push(`<span>Queries</span> ${nq}`);
   if (m && m["Primary_Metric_Score"] != null) parts.push(`<span>Primary</span> ${fmtNumber(m["Primary_Metric_Score"])}`);
diff --git a/scripts/evaluation/eval_framework/store.py b/scripts/evaluation/eval_framework/store.py
index da030f4..ceac809 100644
--- a/scripts/evaluation/eval_framework/store.py
+++ b/scripts/evaluation/eval_framework/store.py
@@ -23,6 +23,18 @@ class QueryBuildResult:
     output_json_path: Path
 
 
+def _compact_batch_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
+    """Reduce stored batch metadata to the summary fields returned in the history listing.
+
+    ``query_count`` falls back to ``len(per_query)`` when ``queries`` is missing or empty.
+    """
+    compact = {key: metadata.get(key) for key in ("batch_id", "created_at", "tenant_id", "top_k")}
+    compact["query_count"] = len(metadata.get("queries") or metadata.get("per_query") or [])
+    compact["aggregate_metrics"] = dict(metadata.get("aggregate_metrics") or {})
+    compact["metric_context"] = dict(metadata.get("metric_context") or {})
+    return compact
+
+
 class EvalStore:
     def __init__(self, db_path: Path):
         self.db_path = db_path
@@ -339,6 +351,7 @@ class EvalStore:
         ).fetchall()
         items: List[Dict[str, Any]] = []
         for row in rows:
+            metadata = json.loads(row["metadata_json"])
             items.append(
                 {
                     "batch_id": row["batch_id"],
@@ -346,7 +359,7 @@ class EvalStore:
                     "output_json_path": row["output_json_path"],
                     "report_markdown_path": row["report_markdown_path"],
                     "config_snapshot_path": row["config_snapshot_path"],
-                    "metadata": json.loads(row["metadata_json"]),
+                    "metadata": _compact_batch_metadata(metadata),
                     "created_at": row["created_at"],
                 }
             )
diff --git a/scripts/evaluation/offline_ltr_fit.py b/scripts/evaluation/offline_ltr_fit.py
index 351f4f7..d8436ca 100644
--- a/scripts/evaluation/offline_ltr_fit.py
+++ b/scripts/evaluation/offline_ltr_fit.py
@@ -23,11 +23,11 @@ if str(PROJECT_ROOT) not in sys.path:
 
 from scripts.evaluation.eval_framework.constants import (
     DEFAULT_ARTIFACT_ROOT,
-    RELEVANCE_EXACT,
     RELEVANCE_GRADE_MAP,
-    RELEVANCE_HIGH,
-    RELEVANCE_IRRELEVANT,
-    RELEVANCE_LOW,
+    RELEVANCE_LV0,
+    RELEVANCE_LV1,
+    RELEVANCE_LV2,
+    RELEVANCE_LV3,
 )
 from scripts.evaluation.eval_framework.metrics import aggregate_metrics, compute_query_metrics
 from scripts.evaluation.eval_framework.store import EvalStore
@@ -35,10 +35,10 @@ from scripts.evaluation.eval_framework.utils import ensure_dir, utc_timestamp
 
 
 LABELS_BY_GRADE = {
-    3: RELEVANCE_EXACT,
-    2: RELEVANCE_HIGH,
-    1: RELEVANCE_LOW,
-    0: RELEVANCE_IRRELEVANT,
+    3: RELEVANCE_LV3,
+    2: RELEVANCE_LV2,
+    1: RELEVANCE_LV1,
+    0: RELEVANCE_LV0,
 }
 
 
--
libgit2 0.21.2