From d73ca84a48afc0945a533707c77ba3bbfaac9621 Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 8 Apr 2026 15:39:47 +0800 Subject: [PATCH] refine eval case snapshots and rename relevance levels --- scripts/evaluation/README.md | 20 ++++++++++++++++++-- scripts/evaluation/eval_framework/__init__.py | 16 ++++++++-------- scripts/evaluation/eval_framework/clients.py | 11 +++++++++-- scripts/evaluation/eval_framework/constants.py | 32 ++++++++++++++++---------------- scripts/evaluation/eval_framework/framework.py | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------- scripts/evaluation/eval_framework/metrics.py | 20 ++++++++++---------- scripts/evaluation/eval_framework/reports.py | 51 ++++++++++++++++++++++++++++++++++++++++++--------- scripts/evaluation/eval_framework/static/eval_web.js | 2 +- scripts/evaluation/eval_framework/store.py | 15 ++++++++++++++- scripts/evaluation/offline_ltr_fit.py | 16 ++++++++-------- 10 files changed, 180 insertions(+), 83 deletions(-) diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index c0fc339..c6233ed 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -127,8 +127,8 @@ This framework now follows graded ranking evaluation closer to e-commerce best p - **Composite tuning score: `Primary_Metric_Score`** For experiment ranking we compute the mean of the primary scorecard after normalizing `Avg_Grade@10` by the max grade (`3`). - **Gain scheme** - `Fully Relevant=7`, `Mostly Relevant=3`, `Weakly Relevant=1`, `Irrelevant=0` - The gains come from rel grades `3/2/1/0` with `gain = 2^rel - 1`, a standard `NDCG` setup. + `Fully Relevant=3`, `Mostly Relevant=2`, `Weakly Relevant=1`, `Irrelevant=0` + We keep the rel grades `3/2/1/0`, but the current implementation uses the grade values directly as gains so the exact/high gap is less aggressive. - **Why this is better** `NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping an `Fully Relevant` with a `Weakly Relevant` item is penalized more than swapping `Mostly Relevant` with `Weakly Relevant`. @@ -174,6 +174,22 @@ Features: query list from `queries.txt`, single-query and batch evaluation, batc Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`. +To make later case analysis reproducible without digging through backend logs, each per-query record in the batch JSON now also includes: + +- `request_id` — the exact `X-Request-ID` sent by the evaluator for that live search call +- `top_label_sequence_top10` / `top_label_sequence_top20` — compact label sequence strings such as `1:L3 | 2:L1 | 3:L2` +- `top_results` — a lightweight top-20 snapshot with `rank`, `spu_id`, `label`, title fields, and `relevance_score` + +The Markdown report now surfaces the same case context in a lighter human-readable form: + +- request id +- top-10 / top-20 label sequence +- top 5 result snapshot for quick scanning + +This means a bad case can usually be reconstructed directly from the batch artifact itself, without replaying logs or joining SQLite tables by hand. + +The web history endpoint intentionally returns a compact summary only (aggregate metrics plus query count), so adding richer per-query snapshots to the batch payload does not bloat the history list UI. + ## Ranking debug and LTR prep `debug_info` now exposes two extra layers that are useful for tuning and future learning-to-rank work: diff --git a/scripts/evaluation/eval_framework/__init__.py b/scripts/evaluation/eval_framework/__init__.py index 074e558..c4335f4 100644 --- a/scripts/evaluation/eval_framework/__init__.py +++ b/scripts/evaluation/eval_framework/__init__.py @@ -14,10 +14,10 @@ from .constants import ( # noqa: E402 DEFAULT_ARTIFACT_ROOT, DEFAULT_QUERY_FILE, PROJECT_ROOT, - RELEVANCE_EXACT, - RELEVANCE_HIGH, - RELEVANCE_IRRELEVANT, - RELEVANCE_LOW, + RELEVANCE_LV0, + RELEVANCE_LV1, + RELEVANCE_LV2, + RELEVANCE_LV3, RELEVANCE_NON_IRRELEVANT, VALID_LABELS, ) @@ -39,10 +39,10 @@ __all__ = [ "EvalStore", "PROJECT_ROOT", "QueryBuildResult", - "RELEVANCE_EXACT", - "RELEVANCE_HIGH", - "RELEVANCE_IRRELEVANT", - "RELEVANCE_LOW", + "RELEVANCE_LV0", + "RELEVANCE_LV1", + "RELEVANCE_LV2", + "RELEVANCE_LV3", "RELEVANCE_NON_IRRELEVANT", "SearchEvaluationFramework", "VALID_LABELS", diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py index 95b230e..3ec4056 100644 --- a/scripts/evaluation/eval_framework/clients.py +++ b/scripts/evaluation/eval_framework/clients.py @@ -157,6 +157,7 @@ class SearchServiceClient: return self._request_json("GET", path, timeout=timeout) def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]: + request_id = uuid.uuid4().hex[:8] payload: Dict[str, Any] = { "query": query, "size": size, @@ -165,13 +166,19 @@ class SearchServiceClient: } if debug: payload["debug"] = True - return self._request_json( + response = self._request_json( "POST", "/search/", timeout=120, - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, + headers={ + "Content-Type": "application/json", + "X-Tenant-ID": self.tenant_id, + "X-Request-ID": request_id, + }, json_payload=payload, ) + response["_eval_request_id"] = request_id + return response class RerankServiceClient: diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py index 3d1379e..04de982 100644 --- a/scripts/evaluation/eval_framework/constants.py +++ b/scripts/evaluation/eval_framework/constants.py @@ -7,24 +7,24 @@ _SCRIPTS_EVAL_DIR = _PKG_DIR.parent PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN) -RELEVANCE_EXACT = "Fully Relevant" -RELEVANCE_HIGH = "Mostly Relevant" -RELEVANCE_LOW = "Weakly Relevant" -RELEVANCE_IRRELEVANT = "Irrelevant" +RELEVANCE_LV3 = "Fully Relevant" +RELEVANCE_LV2 = "Mostly Relevant" +RELEVANCE_LV1 = "Weakly Relevant" +RELEVANCE_LV0 = "Irrelevant" -VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) +VALID_LABELS = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1, RELEVANCE_LV0}) # Useful label sets for binary diagnostic slices layered on top of graded ranking metrics. -RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW}) -RELEVANCE_STRONG = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH}) +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1}) +RELEVANCE_STRONG = frozenset({RELEVANCE_LV3, RELEVANCE_LV2}) # Graded relevance for ranking evaluation. # We use rel grades 3/2/1/0 and gain = 2^rel - 1, which is standard for NDCG-style metrics. RELEVANCE_GRADE_MAP = { - RELEVANCE_EXACT: 3, - RELEVANCE_HIGH: 2, - RELEVANCE_LOW: 1, - RELEVANCE_IRRELEVANT: 0, + RELEVANCE_LV3: 3, + RELEVANCE_LV2: 2, + RELEVANCE_LV1: 1, + RELEVANCE_LV0: 0, } # 标准的gain计算方法:2^rel - 1 # 但是是因为标注质量不是特别精确,因此适当降低 exact 和 high 的区分度 @@ -36,10 +36,10 @@ RELEVANCE_GAIN_MAP = { # P(stop | relevance) for ERR (Expected Reciprocal Rank); cascade model (Chapelle et al., 2009). STOP_PROB_MAP = { - RELEVANCE_EXACT: 0.99, - RELEVANCE_HIGH: 0.8, - RELEVANCE_LOW: 0.1, - RELEVANCE_IRRELEVANT: 0.0, + RELEVANCE_LV3: 0.99, + RELEVANCE_LV2: 0.8, + RELEVANCE_LV1: 0.1, + RELEVANCE_LV0: 0.0, } DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" @@ -78,7 +78,7 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 # A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``): # - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%), # - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%). -# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Weakly Relevant"). +# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LV1`` ("Weakly Relevant"). # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799 diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index 6b4688f..b68a60f 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -25,14 +25,14 @@ from .constants import ( DEFAULT_RERANK_HIGH_SKIP_COUNT, DEFAULT_RERANK_HIGH_THRESHOLD, DEFAULT_SEARCH_RECALL_TOP_K, - RELEVANCE_EXACT, RELEVANCE_GAIN_MAP, - RELEVANCE_HIGH, - STOP_PROB_MAP, - RELEVANCE_IRRELEVANT, - RELEVANCE_LOW, + RELEVANCE_LV0, + RELEVANCE_LV1, + RELEVANCE_LV2, + RELEVANCE_LV3, RELEVANCE_NON_IRRELEVANT, VALID_LABELS, + STOP_PROB_MAP, ) from .metrics import ( PRIMARY_METRIC_GRADE_NORMALIZER, @@ -96,6 +96,16 @@ def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: return out +def _encode_label_sequence(items: Sequence[Dict[str, Any]], limit: int) -> str: + parts: List[str] = [] + for item in items[:limit]: + rank = int(item.get("rank") or 0) + label = str(item.get("label") or "") + grade = RELEVANCE_GAIN_MAP.get(label) + parts.append(f"{rank}:L{grade}" if grade is not None else f"{rank}:?") + return " | ".join(parts) + + class SearchEvaluationFramework: def __init__( self, @@ -168,7 +178,7 @@ class SearchEvaluationFramework: ) -> Dict[str, Any]: live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) labels = [ - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 for item in live["results"] ] return { @@ -432,7 +442,7 @@ class SearchEvaluationFramework: - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and - ``( #(Irrelevant) + #(Weakly Relevant) ) / n > irrelevant_low_combined_stop_ratio`` - (default 0.959; weak relevance = ``RELEVANCE_LOW``). + (default 0.959; weak relevance = ``RELEVANCE_LV1``). Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached @@ -474,9 +484,9 @@ class SearchEvaluationFramework: time.sleep(0.1) n = len(batch_docs) - exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_EXACT) - irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT) - low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LOW) + exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV3) + irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV0) + low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV1) exact_ratio = exact_n / n if n else 0.0 irrelevant_ratio = irrel_n / n if n else 0.0 low_ratio = low_n / n if n else 0.0 @@ -633,7 +643,7 @@ class SearchEvaluationFramework: ) top100_labels = [ - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 for item in search_labeled_results[:100] ] metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) @@ -843,7 +853,7 @@ class SearchEvaluationFramework: ) top100_labels = [ - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 for item in search_labeled_results[:100] ] metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) @@ -920,16 +930,17 @@ class SearchEvaluationFramework: "title_zh": title_zh if title_zh and title_zh != primary_title else "", "image_url": doc.get("image_url"), "label": label, + "relevance_score": doc.get("relevance_score"), "option_values": list(compact_option_values(doc.get("skus") or [])), "product": compact_product_payload(doc), } ) metric_labels = [ - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 for item in labeled ] ideal_labels = [ - label if label in VALID_LABELS else RELEVANCE_IRRELEVANT + label if label in VALID_LABELS else RELEVANCE_LV0 for label in labels.values() ] label_stats = self.store.get_query_label_stats(self.tenant_id, query) @@ -960,10 +971,10 @@ class SearchEvaluationFramework: } ) label_order = { - RELEVANCE_EXACT: 0, - RELEVANCE_HIGH: 1, - RELEVANCE_LOW: 2, - RELEVANCE_IRRELEVANT: 3, + RELEVANCE_LV3: 0, + RELEVANCE_LV2: 1, + RELEVANCE_LV1: 2, + RELEVANCE_LV0: 3, } missing_relevant.sort( key=lambda item: ( @@ -989,6 +1000,7 @@ class SearchEvaluationFramework: "top_k": top_k, "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels), "metric_context": _metric_context_payload(), + "request_id": str(search_payload.get("_eval_request_id") or ""), "results": labeled, "missing_relevant": missing_relevant, "label_stats": { @@ -996,9 +1008,9 @@ class SearchEvaluationFramework: "unlabeled_hits_treated_irrelevant": unlabeled_hits, "recalled_hits": len(labeled), "missing_relevant_count": len(missing_relevant), - "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT), - "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH), - "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW), + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV3), + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV2), + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV1), }, "tips": tips, "total": int(search_payload.get("total") or 0), @@ -1014,6 +1026,7 @@ class SearchEvaluationFramework: force_refresh_labels: bool = False, ) -> Dict[str, Any]: per_query = [] + case_snapshot_top_n = min(max(int(top_k), 1), 20) total_q = len(queries) _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate) for q_index, query in enumerate(queries, start=1): @@ -1025,7 +1038,7 @@ class SearchEvaluationFramework: force_refresh_labels=force_refresh_labels, ) labels = [ - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 for item in live["results"] ] per_query.append( @@ -1036,6 +1049,21 @@ class SearchEvaluationFramework: "metrics": live["metrics"], "distribution": label_distribution(labels), "total": live["total"], + "request_id": live.get("request_id") or "", + "case_snapshot_top_n": case_snapshot_top_n, + "top_label_sequence_top10": _encode_label_sequence(live["results"], 10), + "top_label_sequence_top20": _encode_label_sequence(live["results"], case_snapshot_top_n), + "top_results": [ + { + "rank": int(item.get("rank") or 0), + "spu_id": str(item.get("spu_id") or ""), + "label": item.get("label"), + "title": item.get("title"), + "title_zh": item.get("title_zh"), + "relevance_score": item.get("relevance_score"), + } + for item in live["results"][:case_snapshot_top_n] + ], } ) m = live["metrics"] @@ -1055,10 +1083,10 @@ class SearchEvaluationFramework: ) aggregate = aggregate_metrics([item["metrics"] for item in per_query]) aggregate_distribution = { - RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), - RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query), - RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query), - RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query), + RELEVANCE_LV3: sum(item["distribution"][RELEVANCE_LV3] for item in per_query), + RELEVANCE_LV2: sum(item["distribution"][RELEVANCE_LV2] for item in per_query), + RELEVANCE_LV1: sum(item["distribution"][RELEVANCE_LV1] for item in per_query), + RELEVANCE_LV0: sum(item["distribution"][RELEVANCE_LV0] for item in per_query), } batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" report_dir = ensure_dir(self.artifact_root / "batch_reports") diff --git a/scripts/evaluation/eval_framework/metrics.py b/scripts/evaluation/eval_framework/metrics.py index bc7d45a..93c77b6 100644 --- a/scripts/evaluation/eval_framework/metrics.py +++ b/scripts/evaluation/eval_framework/metrics.py @@ -6,12 +6,12 @@ import math from typing import Dict, Iterable, Sequence from .constants import ( - RELEVANCE_EXACT, RELEVANCE_GAIN_MAP, RELEVANCE_GRADE_MAP, - RELEVANCE_HIGH, - RELEVANCE_IRRELEVANT, - RELEVANCE_LOW, + RELEVANCE_LV0, + RELEVANCE_LV1, + RELEVANCE_LV2, + RELEVANCE_LV3, RELEVANCE_NON_IRRELEVANT, RELEVANCE_STRONG, STOP_PROB_MAP, @@ -33,7 +33,7 @@ PRIMARY_METRIC_GRADE_NORMALIZER = float(max(RELEVANCE_GRADE_MAP.values()) or 1.0 def _normalize_label(label: str) -> str: if label in RELEVANCE_GRADE_MAP: return label - return RELEVANCE_IRRELEVANT + return RELEVANCE_LV0 def _gains_for_labels(labels: Sequence[str]) -> list[float]: @@ -135,7 +135,7 @@ def compute_query_metrics( ideal = list(ideal_labels) if ideal_labels is not None else list(labels) metrics: Dict[str, float] = {} - exact_hits = _binary_hits(labels, [RELEVANCE_EXACT]) + exact_hits = _binary_hits(labels, [RELEVANCE_LV3]) strong_hits = _binary_hits(labels, RELEVANCE_STRONG) useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT) @@ -183,8 +183,8 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo def label_distribution(labels: Sequence[str]) -> Dict[str, int]: return { - RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT), - RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH), - RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW), - RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT), + RELEVANCE_LV3: sum(1 for label in labels if label == RELEVANCE_LV3), + RELEVANCE_LV2: sum(1 for label in labels if label == RELEVANCE_LV2), + RELEVANCE_LV1: sum(1 for label in labels if label == RELEVANCE_LV1), + RELEVANCE_LV0: sum(1 for label in labels if label == RELEVANCE_LV0), } diff --git a/scripts/evaluation/eval_framework/reports.py b/scripts/evaluation/eval_framework/reports.py index 3c53352..7db2f0c 100644 --- a/scripts/evaluation/eval_framework/reports.py +++ b/scripts/evaluation/eval_framework/reports.py @@ -4,7 +4,7 @@ from __future__ import annotations from typing import Any, Dict -from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW +from .constants import RELEVANCE_GAIN_MAP, RELEVANCE_LV0, RELEVANCE_LV1, RELEVANCE_LV2, RELEVANCE_LV3 from .metrics import PRIMARY_METRIC_KEYS @@ -25,6 +25,38 @@ def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None: lines.append(f"- {key}: {value}") +def _label_level_code(label: str) -> str: + grade = RELEVANCE_GAIN_MAP.get(label) + return f"L{grade}" if grade is not None else "?" + + +def _append_case_snapshot(lines: list[str], item: Dict[str, Any]) -> None: + request_id = str(item.get("request_id") or "").strip() + if request_id: + lines.append(f"- Request ID: `{request_id}`") + seq10 = str(item.get("top_label_sequence_top10") or "").strip() + if seq10: + lines.append(f"- Top-10 Labels: `{seq10}`") + seq20 = str(item.get("top_label_sequence_top20") or "").strip() + if seq20 and seq20 != seq10: + lines.append(f"- Top-20 Labels: `{seq20}`") + top_results = item.get("top_results") or [] + if not top_results: + return + lines.append("- Case Snapshot:") + for result in top_results[:5]: + rank = int(result.get("rank") or 0) + label = _label_level_code(str(result.get("label") or "")) + spu_id = str(result.get("spu_id") or "") + title = str(result.get("title") or "") + title_zh = str(result.get("title_zh") or "") + relevance_score = result.get("relevance_score") + score_suffix = f" (rel={relevance_score})" if relevance_score not in (None, "") else "" + lines.append(f" - #{rank} [{label}] spu={spu_id} {title}{score_suffix}") + if title_zh: + lines.append(f" zh: {title_zh}") + + def render_batch_report_markdown(payload: Dict[str, Any]) -> str: lines = [ "# Search Batch Evaluation", @@ -56,10 +88,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: "", "## Label Distribution", "", - f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}", - f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}", - f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}", - f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}", + f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}", + f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}", + f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}", + f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}", ] ) lines.extend(["", "## Per Query", ""]) @@ -68,9 +100,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: lines.append("") _append_metric_block(lines, item.get("metrics") or {}) distribution = item.get("distribution") or {} - lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}") - lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}") - lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}") - lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}") + lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}") + lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}") + lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}") + lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}") + _append_case_snapshot(lines, item) lines.append("") return "\n".join(lines) diff --git a/scripts/evaluation/eval_framework/static/eval_web.js b/scripts/evaluation/eval_framework/static/eval_web.js index beaa4fa..3d298cd 100644 --- a/scripts/evaluation/eval_framework/static/eval_web.js +++ b/scripts/evaluation/eval_framework/static/eval_web.js @@ -190,7 +190,7 @@ async function loadQueries() { function historySummaryHtml(meta) { const m = meta && meta.aggregate_metrics; - const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null; + const nq = (meta && meta.query_count) || (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null; const parts = []; if (nq != null) parts.push(`Queries ${nq}`); if (m && m["Primary_Metric_Score"] != null) parts.push(`Primary ${fmtNumber(m["Primary_Metric_Score"])}`); diff --git a/scripts/evaluation/eval_framework/store.py b/scripts/evaluation/eval_framework/store.py index da030f4..ceac809 100644 --- a/scripts/evaluation/eval_framework/store.py +++ b/scripts/evaluation/eval_framework/store.py @@ -23,6 +23,18 @@ class QueryBuildResult: output_json_path: Path +def _compact_batch_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]: + return { + "batch_id": metadata.get("batch_id"), + "created_at": metadata.get("created_at"), + "tenant_id": metadata.get("tenant_id"), + "top_k": metadata.get("top_k"), + "query_count": len(metadata.get("queries") or []), + "aggregate_metrics": dict(metadata.get("aggregate_metrics") or {}), + "metric_context": dict(metadata.get("metric_context") or {}), + } + + class EvalStore: def __init__(self, db_path: Path): self.db_path = db_path @@ -339,6 +351,7 @@ class EvalStore: ).fetchall() items: List[Dict[str, Any]] = [] for row in rows: + metadata = json.loads(row["metadata_json"]) items.append( { "batch_id": row["batch_id"], @@ -346,7 +359,7 @@ class EvalStore: "output_json_path": row["output_json_path"], "report_markdown_path": row["report_markdown_path"], "config_snapshot_path": row["config_snapshot_path"], - "metadata": json.loads(row["metadata_json"]), + "metadata": _compact_batch_metadata(metadata), "created_at": row["created_at"], } ) diff --git a/scripts/evaluation/offline_ltr_fit.py b/scripts/evaluation/offline_ltr_fit.py index 351f4f7..d8436ca 100644 --- a/scripts/evaluation/offline_ltr_fit.py +++ b/scripts/evaluation/offline_ltr_fit.py @@ -23,11 +23,11 @@ if str(PROJECT_ROOT) not in sys.path: from scripts.evaluation.eval_framework.constants import ( DEFAULT_ARTIFACT_ROOT, - RELEVANCE_EXACT, RELEVANCE_GRADE_MAP, - RELEVANCE_HIGH, - RELEVANCE_IRRELEVANT, - RELEVANCE_LOW, + RELEVANCE_LV0, + RELEVANCE_LV1, + RELEVANCE_LV2, + RELEVANCE_LV3, ) from scripts.evaluation.eval_framework.metrics import aggregate_metrics, compute_query_metrics from scripts.evaluation.eval_framework.store import EvalStore @@ -35,10 +35,10 @@ from scripts.evaluation.eval_framework.utils import ensure_dir, utc_timestamp LABELS_BY_GRADE = { - 3: RELEVANCE_EXACT, - 2: RELEVANCE_HIGH, - 1: RELEVANCE_LOW, - 0: RELEVANCE_IRRELEVANT, + 3: RELEVANCE_LV3, + 2: RELEVANCE_LV2, + 1: RELEVANCE_LV1, + 0: RELEVANCE_LV0, } -- libgit2 0.21.2