diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md
index c0fc339..c6233ed 100644
--- a/scripts/evaluation/README.md
+++ b/scripts/evaluation/README.md
@@ -127,8 +127,8 @@ This framework now follows graded ranking evaluation closer to e-commerce best p
- **Composite tuning score: `Primary_Metric_Score`**
For experiment ranking we compute the mean of the primary scorecard after normalizing `Avg_Grade@10` by the max grade (`3`).
- **Gain scheme**
- `Fully Relevant=7`, `Mostly Relevant=3`, `Weakly Relevant=1`, `Irrelevant=0`
- The gains come from rel grades `3/2/1/0` with `gain = 2^rel - 1`, a standard `NDCG` setup.
+ `Fully Relevant=3`, `Mostly Relevant=2`, `Weakly Relevant=1`, `Irrelevant=0`
+ We keep the rel grades `3/2/1/0`, but the current implementation uses the grade values directly as gains so the exact/high gap is less aggressive.
- **Why this is better**
`NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping an `Fully Relevant` with a `Weakly Relevant` item is penalized more than swapping `Mostly Relevant` with `Weakly Relevant`.
@@ -174,6 +174,22 @@ Features: query list from `queries.txt`, single-query and batch evaluation, batc
Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`.
+To make later case analysis reproducible without digging through backend logs, each per-query record in the batch JSON now also includes:
+
+- `request_id` — the exact `X-Request-ID` sent by the evaluator for that live search call
+- `top_label_sequence_top10` / `top_label_sequence_top20` — compact label sequence strings such as `1:L3 | 2:L1 | 3:L2` (the top-20 sequence is truncated to `top_k` when fewer results are requested)
+- `top_results` — a lightweight snapshot of the top `min(top_k, 20)` results with `rank`, `spu_id`, `label`, title fields, and `relevance_score`
+
+The Markdown report now surfaces the same case context in a lighter human-readable form:
+
+- request id
+- top-10 / top-20 label sequence
+- top 5 result snapshot for quick scanning
+
+This means a bad case can usually be reconstructed directly from the batch artifact itself, without replaying logs or joining SQLite tables by hand.
+
+The web history endpoint intentionally returns a compact summary only (batch metadata such as id, timestamp, tenant and `top_k`, plus aggregate metrics, metric context, and query count), so adding richer per-query snapshots to the batch payload does not bloat the history list UI.
+
## Ranking debug and LTR prep
`debug_info` now exposes two extra layers that are useful for tuning and future learning-to-rank work:
diff --git a/scripts/evaluation/eval_framework/__init__.py b/scripts/evaluation/eval_framework/__init__.py
index 074e558..c4335f4 100644
--- a/scripts/evaluation/eval_framework/__init__.py
+++ b/scripts/evaluation/eval_framework/__init__.py
@@ -14,10 +14,10 @@ from .constants import ( # noqa: E402
DEFAULT_ARTIFACT_ROOT,
DEFAULT_QUERY_FILE,
PROJECT_ROOT,
- RELEVANCE_EXACT,
- RELEVANCE_HIGH,
- RELEVANCE_IRRELEVANT,
- RELEVANCE_LOW,
+ RELEVANCE_LV0,
+ RELEVANCE_LV1,
+ RELEVANCE_LV2,
+ RELEVANCE_LV3,
RELEVANCE_NON_IRRELEVANT,
VALID_LABELS,
)
@@ -39,10 +39,10 @@ __all__ = [
"EvalStore",
"PROJECT_ROOT",
"QueryBuildResult",
- "RELEVANCE_EXACT",
- "RELEVANCE_HIGH",
- "RELEVANCE_IRRELEVANT",
- "RELEVANCE_LOW",
+ "RELEVANCE_LV0",
+ "RELEVANCE_LV1",
+ "RELEVANCE_LV2",
+ "RELEVANCE_LV3",
"RELEVANCE_NON_IRRELEVANT",
"SearchEvaluationFramework",
"VALID_LABELS",
diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py
index 95b230e..3ec4056 100644
--- a/scripts/evaluation/eval_framework/clients.py
+++ b/scripts/evaluation/eval_framework/clients.py
@@ -157,6 +157,7 @@ class SearchServiceClient:
return self._request_json("GET", path, timeout=timeout)
def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]:
+ request_id = uuid.uuid4().hex[:8]
payload: Dict[str, Any] = {
"query": query,
"size": size,
@@ -165,13 +166,19 @@ class SearchServiceClient:
}
if debug:
payload["debug"] = True
- return self._request_json(
+ response = self._request_json(
"POST",
"/search/",
timeout=120,
- headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},
+ headers={
+ "Content-Type": "application/json",
+ "X-Tenant-ID": self.tenant_id,
+ "X-Request-ID": request_id,
+ },
json_payload=payload,
)
+ response["_eval_request_id"] = request_id
+ return response
class RerankServiceClient:
diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py
index 3d1379e..04de982 100644
--- a/scripts/evaluation/eval_framework/constants.py
+++ b/scripts/evaluation/eval_framework/constants.py
@@ -7,24 +7,24 @@ _SCRIPTS_EVAL_DIR = _PKG_DIR.parent
PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
# Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN)
-RELEVANCE_EXACT = "Fully Relevant"
-RELEVANCE_HIGH = "Mostly Relevant"
-RELEVANCE_LOW = "Weakly Relevant"
-RELEVANCE_IRRELEVANT = "Irrelevant"
+RELEVANCE_LV3 = "Fully Relevant"
+RELEVANCE_LV2 = "Mostly Relevant"
+RELEVANCE_LV1 = "Weakly Relevant"
+RELEVANCE_LV0 = "Irrelevant"
-VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT})
+VALID_LABELS = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1, RELEVANCE_LV0})
# Useful label sets for binary diagnostic slices layered on top of graded ranking metrics.
-RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW})
-RELEVANCE_STRONG = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH})
+RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1})
+RELEVANCE_STRONG = frozenset({RELEVANCE_LV3, RELEVANCE_LV2})
# Graded relevance for ranking evaluation.
# We use rel grades 3/2/1/0 and gain = 2^rel - 1, which is standard for NDCG-style metrics.
RELEVANCE_GRADE_MAP = {
- RELEVANCE_EXACT: 3,
- RELEVANCE_HIGH: 2,
- RELEVANCE_LOW: 1,
- RELEVANCE_IRRELEVANT: 0,
+ RELEVANCE_LV3: 3,
+ RELEVANCE_LV2: 2,
+ RELEVANCE_LV1: 1,
+ RELEVANCE_LV0: 0,
}
# 标准的gain计算方法:2^rel - 1
# 但是是因为标注质量不是特别精确,因此适当降低 exact 和 high 的区分度
@@ -36,10 +36,10 @@ RELEVANCE_GAIN_MAP = {
# P(stop | relevance) for ERR (Expected Reciprocal Rank); cascade model (Chapelle et al., 2009).
STOP_PROB_MAP = {
- RELEVANCE_EXACT: 0.99,
- RELEVANCE_HIGH: 0.8,
- RELEVANCE_LOW: 0.1,
- RELEVANCE_IRRELEVANT: 0.0,
+ RELEVANCE_LV3: 0.99,
+ RELEVANCE_LV2: 0.8,
+ RELEVANCE_LV1: 0.1,
+ RELEVANCE_LV0: 0.0,
}
DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
@@ -78,7 +78,7 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40
# A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``):
# - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%),
# - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%).
-# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Weakly Relevant").
+# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LV1`` ("Weakly Relevant").
# Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak
# reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799
diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py
index 6b4688f..b68a60f 100644
--- a/scripts/evaluation/eval_framework/framework.py
+++ b/scripts/evaluation/eval_framework/framework.py
@@ -25,14 +25,14 @@ from .constants import (
DEFAULT_RERANK_HIGH_SKIP_COUNT,
DEFAULT_RERANK_HIGH_THRESHOLD,
DEFAULT_SEARCH_RECALL_TOP_K,
- RELEVANCE_EXACT,
RELEVANCE_GAIN_MAP,
- RELEVANCE_HIGH,
- STOP_PROB_MAP,
- RELEVANCE_IRRELEVANT,
- RELEVANCE_LOW,
+ RELEVANCE_LV0,
+ RELEVANCE_LV1,
+ RELEVANCE_LV2,
+ RELEVANCE_LV3,
RELEVANCE_NON_IRRELEVANT,
VALID_LABELS,
+ STOP_PROB_MAP,
)
from .metrics import (
PRIMARY_METRIC_GRADE_NORMALIZER,
@@ -96,6 +96,16 @@ def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]:
return out
+def _encode_label_sequence(items: Sequence[Dict[str, Any]], limit: int) -> str:
+ parts: List[str] = []
+ for item in items[:limit]:
+ rank = int(item.get("rank") or 0)
+ label = str(item.get("label") or "")
+ grade = RELEVANCE_GAIN_MAP.get(label)
+ parts.append(f"{rank}:L{grade}" if grade is not None else f"{rank}:?")
+ return " | ".join(parts)
+
+
class SearchEvaluationFramework:
def __init__(
self,
@@ -168,7 +178,7 @@ class SearchEvaluationFramework:
) -> Dict[str, Any]:
live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
labels = [
- item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
+ item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
for item in live["results"]
]
return {
@@ -432,7 +442,7 @@ class SearchEvaluationFramework:
- ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and
- ``( #(Irrelevant) + #(Weakly Relevant) ) / n > irrelevant_low_combined_stop_ratio``
- (default 0.959; weak relevance = ``RELEVANCE_LOW``).
+ (default 0.959; weak relevance = ``RELEVANCE_LV1``).
Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0.
Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached
@@ -474,9 +484,9 @@ class SearchEvaluationFramework:
time.sleep(0.1)
n = len(batch_docs)
- exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_EXACT)
- irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT)
- low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LOW)
+ exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV3)
+ irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV0)
+ low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV1)
exact_ratio = exact_n / n if n else 0.0
irrelevant_ratio = irrel_n / n if n else 0.0
low_ratio = low_n / n if n else 0.0
@@ -633,7 +643,7 @@ class SearchEvaluationFramework:
)
top100_labels = [
- item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
+ item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
for item in search_labeled_results[:100]
]
metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values()))
@@ -843,7 +853,7 @@ class SearchEvaluationFramework:
)
top100_labels = [
- item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
+ item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
for item in search_labeled_results[:100]
]
metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values()))
@@ -920,16 +930,17 @@ class SearchEvaluationFramework:
"title_zh": title_zh if title_zh and title_zh != primary_title else "",
"image_url": doc.get("image_url"),
"label": label,
+ "relevance_score": doc.get("relevance_score"),
"option_values": list(compact_option_values(doc.get("skus") or [])),
"product": compact_product_payload(doc),
}
)
metric_labels = [
- item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
+ item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
for item in labeled
]
ideal_labels = [
- label if label in VALID_LABELS else RELEVANCE_IRRELEVANT
+ label if label in VALID_LABELS else RELEVANCE_LV0
for label in labels.values()
]
label_stats = self.store.get_query_label_stats(self.tenant_id, query)
@@ -960,10 +971,10 @@ class SearchEvaluationFramework:
}
)
label_order = {
- RELEVANCE_EXACT: 0,
- RELEVANCE_HIGH: 1,
- RELEVANCE_LOW: 2,
- RELEVANCE_IRRELEVANT: 3,
+ RELEVANCE_LV3: 0,
+ RELEVANCE_LV2: 1,
+ RELEVANCE_LV1: 2,
+ RELEVANCE_LV0: 3,
}
missing_relevant.sort(
key=lambda item: (
@@ -989,6 +1000,7 @@ class SearchEvaluationFramework:
"top_k": top_k,
"metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels),
"metric_context": _metric_context_payload(),
+ "request_id": str(search_payload.get("_eval_request_id") or ""),
"results": labeled,
"missing_relevant": missing_relevant,
"label_stats": {
@@ -996,9 +1008,9 @@ class SearchEvaluationFramework:
"unlabeled_hits_treated_irrelevant": unlabeled_hits,
"recalled_hits": len(labeled),
"missing_relevant_count": len(missing_relevant),
- "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
- "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH),
- "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW),
+ "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV3),
+ "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV2),
+ "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV1),
},
"tips": tips,
"total": int(search_payload.get("total") or 0),
@@ -1014,6 +1026,7 @@ class SearchEvaluationFramework:
force_refresh_labels: bool = False,
) -> Dict[str, Any]:
per_query = []
+ case_snapshot_top_n = min(max(int(top_k), 1), 20)
total_q = len(queries)
_log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate)
for q_index, query in enumerate(queries, start=1):
@@ -1025,7 +1038,7 @@ class SearchEvaluationFramework:
force_refresh_labels=force_refresh_labels,
)
labels = [
- item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
+ item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
for item in live["results"]
]
per_query.append(
@@ -1036,6 +1049,21 @@ class SearchEvaluationFramework:
"metrics": live["metrics"],
"distribution": label_distribution(labels),
"total": live["total"],
+ "request_id": live.get("request_id") or "",
+ "case_snapshot_top_n": case_snapshot_top_n,
+ "top_label_sequence_top10": _encode_label_sequence(live["results"], 10),
+ "top_label_sequence_top20": _encode_label_sequence(live["results"], case_snapshot_top_n),
+ "top_results": [
+ {
+ "rank": int(item.get("rank") or 0),
+ "spu_id": str(item.get("spu_id") or ""),
+ "label": item.get("label"),
+ "title": item.get("title"),
+ "title_zh": item.get("title_zh"),
+ "relevance_score": item.get("relevance_score"),
+ }
+ for item in live["results"][:case_snapshot_top_n]
+ ],
}
)
m = live["metrics"]
@@ -1055,10 +1083,10 @@ class SearchEvaluationFramework:
)
aggregate = aggregate_metrics([item["metrics"] for item in per_query])
aggregate_distribution = {
- RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
- RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query),
- RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query),
- RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
+ RELEVANCE_LV3: sum(item["distribution"][RELEVANCE_LV3] for item in per_query),
+ RELEVANCE_LV2: sum(item["distribution"][RELEVANCE_LV2] for item in per_query),
+ RELEVANCE_LV1: sum(item["distribution"][RELEVANCE_LV1] for item in per_query),
+ RELEVANCE_LV0: sum(item["distribution"][RELEVANCE_LV0] for item in per_query),
}
batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
report_dir = ensure_dir(self.artifact_root / "batch_reports")
diff --git a/scripts/evaluation/eval_framework/metrics.py b/scripts/evaluation/eval_framework/metrics.py
index bc7d45a..93c77b6 100644
--- a/scripts/evaluation/eval_framework/metrics.py
+++ b/scripts/evaluation/eval_framework/metrics.py
@@ -6,12 +6,12 @@ import math
from typing import Dict, Iterable, Sequence
from .constants import (
- RELEVANCE_EXACT,
RELEVANCE_GAIN_MAP,
RELEVANCE_GRADE_MAP,
- RELEVANCE_HIGH,
- RELEVANCE_IRRELEVANT,
- RELEVANCE_LOW,
+ RELEVANCE_LV0,
+ RELEVANCE_LV1,
+ RELEVANCE_LV2,
+ RELEVANCE_LV3,
RELEVANCE_NON_IRRELEVANT,
RELEVANCE_STRONG,
STOP_PROB_MAP,
@@ -33,7 +33,7 @@ PRIMARY_METRIC_GRADE_NORMALIZER = float(max(RELEVANCE_GRADE_MAP.values()) or 1.0
def _normalize_label(label: str) -> str:
if label in RELEVANCE_GRADE_MAP:
return label
- return RELEVANCE_IRRELEVANT
+ return RELEVANCE_LV0
def _gains_for_labels(labels: Sequence[str]) -> list[float]:
@@ -135,7 +135,7 @@ def compute_query_metrics(
ideal = list(ideal_labels) if ideal_labels is not None else list(labels)
metrics: Dict[str, float] = {}
- exact_hits = _binary_hits(labels, [RELEVANCE_EXACT])
+ exact_hits = _binary_hits(labels, [RELEVANCE_LV3])
strong_hits = _binary_hits(labels, RELEVANCE_STRONG)
useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT)
@@ -183,8 +183,8 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo
def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
return {
- RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT),
- RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH),
- RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW),
- RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT),
+ RELEVANCE_LV3: sum(1 for label in labels if label == RELEVANCE_LV3),
+ RELEVANCE_LV2: sum(1 for label in labels if label == RELEVANCE_LV2),
+ RELEVANCE_LV1: sum(1 for label in labels if label == RELEVANCE_LV1),
+ RELEVANCE_LV0: sum(1 for label in labels if label == RELEVANCE_LV0),
}
diff --git a/scripts/evaluation/eval_framework/reports.py b/scripts/evaluation/eval_framework/reports.py
index 3c53352..7db2f0c 100644
--- a/scripts/evaluation/eval_framework/reports.py
+++ b/scripts/evaluation/eval_framework/reports.py
@@ -4,7 +4,7 @@ from __future__ import annotations
from typing import Any, Dict
-from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW
+from .constants import RELEVANCE_GAIN_MAP, RELEVANCE_LV0, RELEVANCE_LV1, RELEVANCE_LV2, RELEVANCE_LV3
from .metrics import PRIMARY_METRIC_KEYS
@@ -25,6 +25,38 @@ def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None:
lines.append(f"- {key}: {value}")
+def _label_level_code(label: str) -> str:
+ grade = RELEVANCE_GAIN_MAP.get(label)
+ return f"L{grade}" if grade is not None else "?"
+
+
+def _append_case_snapshot(lines: list[str], item: Dict[str, Any]) -> None:
+ request_id = str(item.get("request_id") or "").strip()
+ if request_id:
+ lines.append(f"- Request ID: `{request_id}`")
+ seq10 = str(item.get("top_label_sequence_top10") or "").strip()
+ if seq10:
+ lines.append(f"- Top-10 Labels: `{seq10}`")
+ seq20 = str(item.get("top_label_sequence_top20") or "").strip()
+ if seq20 and seq20 != seq10:
+ lines.append(f"- Top-20 Labels: `{seq20}`")
+ top_results = item.get("top_results") or []
+ if not top_results:
+ return
+ lines.append("- Case Snapshot:")
+ for result in top_results[:5]:
+ rank = int(result.get("rank") or 0)
+ label = _label_level_code(str(result.get("label") or ""))
+ spu_id = str(result.get("spu_id") or "")
+ title = str(result.get("title") or "")
+ title_zh = str(result.get("title_zh") or "")
+ relevance_score = result.get("relevance_score")
+ score_suffix = f" (rel={relevance_score})" if relevance_score not in (None, "") else ""
+ lines.append(f" - #{rank} [{label}] spu={spu_id} {title}{score_suffix}")
+ if title_zh:
+ lines.append(f" zh: {title_zh}")
+
+
def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
lines = [
"# Search Batch Evaluation",
@@ -56,10 +88,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
"",
"## Label Distribution",
"",
- f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}",
- f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}",
- f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}",
- f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}",
+ f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}",
+ f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}",
+ f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}",
+ f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}",
]
)
lines.extend(["", "## Per Query", ""])
@@ -68,9 +100,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
lines.append("")
_append_metric_block(lines, item.get("metrics") or {})
distribution = item.get("distribution") or {}
- lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}")
- lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}")
- lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}")
- lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}")
+ lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}")
+ lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}")
+ lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}")
+ lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}")
+ _append_case_snapshot(lines, item)
lines.append("")
return "\n".join(lines)
diff --git a/scripts/evaluation/eval_framework/static/eval_web.js b/scripts/evaluation/eval_framework/static/eval_web.js
index beaa4fa..3d298cd 100644
--- a/scripts/evaluation/eval_framework/static/eval_web.js
+++ b/scripts/evaluation/eval_framework/static/eval_web.js
@@ -190,7 +190,7 @@ async function loadQueries() {
function historySummaryHtml(meta) {
const m = meta && meta.aggregate_metrics;
- const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
+ const nq = (meta && meta.query_count) || (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
const parts = [];
if (nq != null) parts.push(`Queries ${nq}`);
if (m && m["Primary_Metric_Score"] != null) parts.push(`Primary ${fmtNumber(m["Primary_Metric_Score"])}`);
diff --git a/scripts/evaluation/eval_framework/store.py b/scripts/evaluation/eval_framework/store.py
index da030f4..ceac809 100644
--- a/scripts/evaluation/eval_framework/store.py
+++ b/scripts/evaluation/eval_framework/store.py
@@ -23,6 +23,18 @@ class QueryBuildResult:
output_json_path: Path
+def _compact_batch_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
+ return {
+ "batch_id": metadata.get("batch_id"),
+ "created_at": metadata.get("created_at"),
+ "tenant_id": metadata.get("tenant_id"),
+ "top_k": metadata.get("top_k"),
+ "query_count": len(metadata.get("queries") or []),
+ "aggregate_metrics": dict(metadata.get("aggregate_metrics") or {}),
+ "metric_context": dict(metadata.get("metric_context") or {}),
+ }
+
+
class EvalStore:
def __init__(self, db_path: Path):
self.db_path = db_path
@@ -339,6 +351,7 @@ class EvalStore:
).fetchall()
items: List[Dict[str, Any]] = []
for row in rows:
+ metadata = json.loads(row["metadata_json"])
items.append(
{
"batch_id": row["batch_id"],
@@ -346,7 +359,7 @@ class EvalStore:
"output_json_path": row["output_json_path"],
"report_markdown_path": row["report_markdown_path"],
"config_snapshot_path": row["config_snapshot_path"],
- "metadata": json.loads(row["metadata_json"]),
+ "metadata": _compact_batch_metadata(metadata),
"created_at": row["created_at"],
}
)
diff --git a/scripts/evaluation/offline_ltr_fit.py b/scripts/evaluation/offline_ltr_fit.py
index 351f4f7..d8436ca 100644
--- a/scripts/evaluation/offline_ltr_fit.py
+++ b/scripts/evaluation/offline_ltr_fit.py
@@ -23,11 +23,11 @@ if str(PROJECT_ROOT) not in sys.path:
from scripts.evaluation.eval_framework.constants import (
DEFAULT_ARTIFACT_ROOT,
- RELEVANCE_EXACT,
RELEVANCE_GRADE_MAP,
- RELEVANCE_HIGH,
- RELEVANCE_IRRELEVANT,
- RELEVANCE_LOW,
+ RELEVANCE_LV0,
+ RELEVANCE_LV1,
+ RELEVANCE_LV2,
+ RELEVANCE_LV3,
)
from scripts.evaluation.eval_framework.metrics import aggregate_metrics, compute_query_metrics
from scripts.evaluation.eval_framework.store import EvalStore
@@ -35,10 +35,10 @@ from scripts.evaluation.eval_framework.utils import ensure_dir, utc_timestamp
LABELS_BY_GRADE = {
- 3: RELEVANCE_EXACT,
- 2: RELEVANCE_HIGH,
- 1: RELEVANCE_LOW,
- 0: RELEVANCE_IRRELEVANT,
+ 3: RELEVANCE_LV3,
+ 2: RELEVANCE_LV2,
+ 1: RELEVANCE_LV1,
+ 0: RELEVANCE_LV0,
}
--
libgit2 0.21.2