From 7ddd4cb3acf5e2e0b748467448c83348c87eff20 Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 1 Apr 2026 21:35:57 +0800 Subject: [PATCH] 评估体系从三等级->四等级 Exact Match / High Relevant / Low Relevant / Irrelevant --- scripts/evaluation/README.md | 36 ++++++++++++++++++++++++++++++------ scripts/evaluation/eval_framework/constants.py | 16 +++++++++++++++- scripts/evaluation/eval_framework/framework.py | 35 ++++++++++++++++++++++++++++------- scripts/evaluation/eval_framework/metrics.py | 140 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------- scripts/evaluation/eval_framework/reports.py | 28 ++++++++++++++++++++++++---- scripts/evaluation/eval_framework/static/eval_web.css | 23 ++++++++++++++++++++--- scripts/evaluation/eval_framework/static/eval_web.js | 448 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scripts/evaluation/eval_framework/static/index.html | 5 +++-- scripts/evaluation/tune_fusion.py | 12 ++++++------ 9 files changed, 502 insertions(+), 241 deletions(-) diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index 9beb859..532ef59 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -2,7 +2,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, audit tooling, and the fusion-tuning runner for retrieval quality. -**Design:** Build labels offline for a fixed query set (`queries/queries.txt`). Single-query and batch evaluation map recalled `spu_id` values to the SQLite cache. Items without cached labels are scored as `Irrelevant`, and the UI/API surfaces tips when coverage is incomplete. +**Design:** Build labels offline for a fixed query set (`queries/queries.txt`). Single-query and batch evaluation map recalled `spu_id` values to the SQLite cache. Items without cached labels are scored as `Irrelevant`, and the UI/API surfaces tips when judged coverage is incomplete. Evaluation now uses a graded four-tier relevance system and ranking metrics centered on `NDCG`. ## What it does @@ -112,9 +112,33 @@ Default root: `artifacts/search_evaluation/` ## Labels -- **Exact** — Matches intended product type and all explicit required attributes. -- **Partial** — Main intent matches; attributes missing, approximate, or weaker. -- **Irrelevant** — Type mismatch or conflicting required attributes. +- **Exact Match** — Matches intended product type and all explicit required attributes. +- **High Relevant** — Main intent matches and is a strong substitute, but some attributes are missing, weaker, or slightly off. +- **Low Relevant** — Only a weak substitute; may share scenario, style, or broad category but is no longer a strong match. +- **Irrelevant** — Type mismatch or important conflicts make it a poor search result. + +## Metric design + +This framework now follows graded ranking evaluation closer to e-commerce best practice instead of collapsing everything into binary relevance. + +- **Primary metric: `NDCG@10`** + Uses the four labels as graded gains and rewards both relevance and early placement. +- **Gain scheme** + `Exact Match=7`, `High Relevant=3`, `Low Relevant=1`, `Irrelevant=0` + The gains come from rel grades `3/2/1/0` with `gain = 2^rel - 1`, a standard `NDCG` setup. +- **Why this is better** + `NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping an `Exact Match` with a `Low Relevant` item is penalized more than swapping `High Relevant` with `Low Relevant`. + +The reported metrics are: + +- **`NDCG@5`, `NDCG@10`, `NDCG@20`, `NDCG@50`** — Primary graded ranking quality. +- **`Exact_Precision@K`** — Strict top-slot quality when only `Exact Match` counts. +- **`Strong_Precision@K`** — Business-facing top-slot quality where `Exact Match + High Relevant` count as strong positives. +- **`Useful_Precision@K`** — Broader usefulness where any non-irrelevant result counts. +- **`Gain_Recall@K`** — Gain captured in the returned list versus the judged label pool for the query. +- **`Exact_Success@K` / `Strong_Success@K`** — Whether at least one exact or strong result appears in the first K. +- **`MRR_Exact@10` / `MRR_Strong@10`** — How early the first exact or strong result appears. +- **`Avg_Grade@10`** — Average relevance grade of the visible first page. **Labeler modes:** `simple` (default): one judging pass per batch with the standard relevance prompt. `complex`: query-profile extraction plus extra guardrails (for structured experiments). @@ -139,11 +163,11 @@ Default root: `artifacts/search_evaluation/` ## Web UI -Features: query list from `queries.txt`, single-query and batch evaluation, batch report history, top recalls, missed Exact/Partial, and coverage tips for unlabeled hits. +Features: query list from `queries.txt`, single-query and batch evaluation, batch report history, grouped graded-metric cards, top recalls, missed judged useful results, and coverage tips for unlabeled hits. ## Batch reports -Each run stores aggregate and per-query metrics, label distribution, timestamp, and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`. +Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`. ## Caveats diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py index 2fdd865..19e9194 100644 --- a/scripts/evaluation/eval_framework/constants.py +++ b/scripts/evaluation/eval_framework/constants.py @@ -14,8 +14,22 @@ RELEVANCE_IRRELEVANT = "Irrelevant" VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) -# Precision / MAP "positive" set (all non-irrelevant tiers) +# Useful label sets for binary diagnostic slices layered on top of graded ranking metrics. RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW}) +RELEVANCE_STRONG = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH}) + +# Graded relevance for ranking evaluation. +# We use rel grades 3/2/1/0 and gain = 2^rel - 1, which is standard for NDCG-style metrics. +RELEVANCE_GRADE_MAP = { + RELEVANCE_EXACT: 3, + RELEVANCE_HIGH: 2, + RELEVANCE_LOW: 1, + RELEVANCE_IRRELEVANT: 0, +} +RELEVANCE_GAIN_MAP = { + label: (2 ** grade) - 1 + for label, grade in RELEVANCE_GRADE_MAP.items() +} _LEGACY_LABEL_MAP = { "Exact": RELEVANCE_EXACT, diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index 5c8fcc6..d71a5d7 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -26,6 +26,7 @@ from .constants import ( DEFAULT_RERANK_HIGH_THRESHOLD, DEFAULT_SEARCH_RECALL_TOP_K, RELEVANCE_EXACT, + RELEVANCE_GAIN_MAP, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW, @@ -50,6 +51,18 @@ from .utils import ( _log = logging.getLogger("search_eval.framework") +def _metric_context_payload() -> Dict[str, Any]: + return { + "primary_metric": "NDCG@10", + "gain_scheme": dict(RELEVANCE_GAIN_MAP), + "notes": [ + "NDCG uses graded gains derived from the four relevance labels.", + "Strong metrics treat Exact Match and High Relevant as strong business positives.", + "Useful metrics treat any non-irrelevant item as useful recall coverage.", + ], + } + + def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: """Map ``spu_id`` -> Chinese title from ``debug_info.per_result[].title_multilingual``.""" out: Dict[str, str] = {} @@ -607,7 +620,7 @@ class SearchEvaluationFramework: item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT for item in search_labeled_results[:100] ] - metrics = compute_query_metrics(top100_labels) + metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) output_dir = ensure_dir(self.artifact_root / "query_builds") run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}" output_json_path = output_dir / f"{run_id}.json" @@ -629,6 +642,7 @@ class SearchEvaluationFramework: "pool_size": len(pool_docs), }, "metrics_top100": metrics, + "metric_context": _metric_context_payload(), "search_results": search_labeled_results, "full_rerank_top": rerank_top_results, } @@ -816,7 +830,7 @@ class SearchEvaluationFramework: item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT for item in search_labeled_results[:100] ] - metrics = compute_query_metrics(top100_labels) + metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) output_dir = ensure_dir(self.artifact_root / "query_builds") run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}" output_json_path = output_dir / f"{run_id}.json" @@ -838,6 +852,7 @@ class SearchEvaluationFramework: "ordered_union_size": pool_docs_count, }, "metrics_top100": metrics, + "metric_context": _metric_context_payload(), "search_results": search_labeled_results, "full_rerank_top": rerank_top_results, } @@ -897,6 +912,10 @@ class SearchEvaluationFramework: item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT for item in labeled ] + ideal_labels = [ + label if label in VALID_LABELS else RELEVANCE_IRRELEVANT + for label in labels.values() + ] label_stats = self.store.get_query_label_stats(self.tenant_id, query) rerank_scores = self.store.get_rerank_scores(self.tenant_id, query) relevant_missing_ids = [ @@ -947,12 +966,13 @@ class SearchEvaluationFramework: if unlabeled_hits: tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.") if not missing_relevant: - tips.append("No cached non-irrelevant products were missed by this recall set.") + tips.append("No cached judged useful products were missed by this recall set.") return { "query": query, "tenant_id": self.tenant_id, "top_k": top_k, - "metrics": compute_query_metrics(metric_labels), + "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels), + "metric_context": _metric_context_payload(), "results": labeled, "missing_relevant": missing_relevant, "label_stats": { @@ -1004,12 +1024,12 @@ class SearchEvaluationFramework: ) m = live["metrics"] _log.info( - "[batch-eval] (%s/%s) query=%r P@10=%s MAP_3=%s total_hits=%s", + "[batch-eval] (%s/%s) query=%r NDCG@10=%s Strong_Precision@10=%s total_hits=%s", q_index, total_q, query, - m.get("P@10"), - m.get("MAP_3"), + m.get("NDCG@10"), + m.get("Strong_Precision@10"), live.get("total"), ) aggregate = aggregate_metrics([item["metrics"] for item in per_query]) @@ -1033,6 +1053,7 @@ class SearchEvaluationFramework: "queries": list(queries), "top_k": top_k, "aggregate_metrics": aggregate, + "metric_context": _metric_context_payload(), "aggregate_distribution": aggregate_distribution, "per_query": per_query, "config_snapshot_path": str(config_snapshot_path), diff --git a/scripts/evaluation/eval_framework/metrics.py b/scripts/evaluation/eval_framework/metrics.py index 542a993..7848023 100644 --- a/scripts/evaluation/eval_framework/metrics.py +++ b/scripts/evaluation/eval_framework/metrics.py @@ -1,56 +1,142 @@ -"""IR metrics for labeled result lists.""" +"""Ranking metrics for graded e-commerce relevance labels.""" from __future__ import annotations -from typing import Dict, Sequence +import math +from typing import Dict, Iterable, Sequence -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_NON_IRRELEVANT +from .constants import ( + RELEVANCE_EXACT, + RELEVANCE_GAIN_MAP, + RELEVANCE_GRADE_MAP, + RELEVANCE_HIGH, + RELEVANCE_IRRELEVANT, + RELEVANCE_LOW, + RELEVANCE_NON_IRRELEVANT, + RELEVANCE_STRONG, +) -def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float: +def _normalize_label(label: str) -> str: + if label in RELEVANCE_GRADE_MAP: + return label + return RELEVANCE_IRRELEVANT + + +def _gains_for_labels(labels: Sequence[str]) -> list[float]: + return [float(RELEVANCE_GAIN_MAP.get(_normalize_label(label), 0.0)) for label in labels] + + +def _binary_hits(labels: Sequence[str], relevant: Iterable[str]) -> list[int]: + relevant_set = set(relevant) + return [1 if _normalize_label(label) in relevant_set else 0 for label in labels] + + +def _precision_at_k_from_hits(hits: Sequence[int], k: int) -> float: if k <= 0: return 0.0 - sliced = list(labels[:k]) + sliced = list(hits[:k]) if not sliced: return 0.0 - rel = set(relevant) - hits = sum(1 for label in sliced if label in rel) - return hits / float(min(k, len(sliced))) + return sum(sliced) / float(len(sliced)) + + +def _success_at_k_from_hits(hits: Sequence[int], k: int) -> float: + if k <= 0: + return 0.0 + return 1.0 if any(hits[:k]) else 0.0 + + +def _reciprocal_rank_from_hits(hits: Sequence[int], k: int) -> float: + if k <= 0: + return 0.0 + for idx, hit in enumerate(hits[:k], start=1): + if hit: + return 1.0 / float(idx) + return 0.0 -def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float: - rel = set(relevant) - hit_count = 0 - precision_sum = 0.0 - for idx, label in enumerate(labels, start=1): - if label not in rel: +def _dcg_at_k(gains: Sequence[float], k: int) -> float: + if k <= 0: + return 0.0 + total = 0.0 + for idx, gain in enumerate(gains[:k], start=1): + if gain <= 0.0: continue - hit_count += 1 - precision_sum += hit_count / idx - if hit_count == 0: + total += gain / math.log2(idx + 1.0) + return total + + +def _ndcg_at_k(labels: Sequence[str], ideal_labels: Sequence[str], k: int) -> float: + actual_gains = _gains_for_labels(labels) + ideal_gains = sorted(_gains_for_labels(ideal_labels), reverse=True) + dcg = _dcg_at_k(actual_gains, k) + idcg = _dcg_at_k(ideal_gains, k) + if idcg <= 0.0: + return 0.0 + return dcg / idcg + + +def _gain_recall_at_k(labels: Sequence[str], ideal_labels: Sequence[str], k: int) -> float: + ideal_total_gain = sum(_gains_for_labels(ideal_labels)) + if ideal_total_gain <= 0.0: return 0.0 - return precision_sum / hit_count + actual_gain = sum(_gains_for_labels(labels[:k])) + return actual_gain / ideal_total_gain -def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]: - """P@k / MAP_3: Exact Match only. P@k_2_3 / MAP_2_3: any non-irrelevant tier (legacy metric names).""" +def _grade_avg_at_k(labels: Sequence[str], k: int) -> float: + if k <= 0: + return 0.0 + sliced = [_normalize_label(label) for label in labels[:k]] + if not sliced: + return 0.0 + return sum(float(RELEVANCE_GRADE_MAP.get(label, 0)) for label in sliced) / float(len(sliced)) + + +def compute_query_metrics( + labels: Sequence[str], + *, + ideal_labels: Sequence[str] | None = None, +) -> Dict[str, float]: + """Compute graded ranking metrics plus binary diagnostic slices. + + `labels` are the ranked results returned by search. + `ideal_labels` is the judged label pool for the same query; when omitted we fall back + to the retrieved labels, which still keeps the metrics well-defined. + """ + + ideal = list(ideal_labels) if ideal_labels is not None else list(labels) metrics: Dict[str, float] = {} - non_irrel = list(RELEVANCE_NON_IRRELEVANT) + + exact_hits = _binary_hits(labels, [RELEVANCE_EXACT]) + strong_hits = _binary_hits(labels, RELEVANCE_STRONG) + useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT) + for k in (5, 10, 20, 50): - metrics[f"P@{k}"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT]), 6) - metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, non_irrel), 6) - metrics["MAP_3"] = round(average_precision(labels, [RELEVANCE_EXACT]), 6) - metrics["MAP_2_3"] = round(average_precision(labels, non_irrel), 6) + metrics[f"NDCG@{k}"] = round(_ndcg_at_k(labels, ideal, k), 6) + for k in (5, 10, 20): + metrics[f"Exact_Precision@{k}"] = round(_precision_at_k_from_hits(exact_hits, k), 6) + metrics[f"Strong_Precision@{k}"] = round(_precision_at_k_from_hits(strong_hits, k), 6) + for k in (10, 20, 50): + metrics[f"Useful_Precision@{k}"] = round(_precision_at_k_from_hits(useful_hits, k), 6) + metrics[f"Gain_Recall@{k}"] = round(_gain_recall_at_k(labels, ideal, k), 6) + for k in (5, 10): + metrics[f"Exact_Success@{k}"] = round(_success_at_k_from_hits(exact_hits, k), 6) + metrics[f"Strong_Success@{k}"] = round(_success_at_k_from_hits(strong_hits, k), 6) + metrics["MRR_Exact@10"] = round(_reciprocal_rank_from_hits(exact_hits, 10), 6) + metrics["MRR_Strong@10"] = round(_reciprocal_rank_from_hits(strong_hits, 10), 6) + metrics["Avg_Grade@10"] = round(_grade_avg_at_k(labels, 10), 6) return metrics def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]: if not metric_items: return {} - keys = sorted(metric_items[0].keys()) + all_keys = sorted({key for item in metric_items for key in item.keys()}) return { key: round(sum(float(item.get(key, 0.0)) for item in metric_items) / len(metric_items), 6) - for key in keys + for key in all_keys } diff --git a/scripts/evaluation/eval_framework/reports.py b/scripts/evaluation/eval_framework/reports.py index 7587b57..2df34d3 100644 --- a/scripts/evaluation/eval_framework/reports.py +++ b/scripts/evaluation/eval_framework/reports.py @@ -7,6 +7,19 @@ from typing import Any, Dict from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW +def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None: + primary_keys = ("NDCG@5", "NDCG@10", "NDCG@20", "Exact_Precision@10", "Strong_Precision@10", "Gain_Recall@50") + included = set() + for key in primary_keys: + if key in metrics: + lines.append(f"- {key}: {metrics[key]}") + included.add(key) + for key, value in sorted(metrics.items()): + if key in included: + continue + lines.append(f"- {key}: {value}") + + def render_batch_report_markdown(payload: Dict[str, Any]) -> str: lines = [ "# Search Batch Evaluation", @@ -20,8 +33,16 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: "## Aggregate Metrics", "", ] - for key, value in sorted((payload.get("aggregate_metrics") or {}).items()): - lines.append(f"- {key}: {value}") + metric_context = payload.get("metric_context") or {} + if metric_context: + lines.extend( + [ + f"- Primary metric: {metric_context.get('primary_metric', 'N/A')}", + f"- Gain scheme: {metric_context.get('gain_scheme', {})}", + "", + ] + ) + _append_metric_block(lines, payload.get("aggregate_metrics") or {}) distribution = payload.get("aggregate_distribution") or {} if distribution: lines.extend( @@ -39,8 +60,7 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: for item in payload.get("per_query") or []: lines.append(f"### {item['query']}") lines.append("") - for key, value in sorted((item.get("metrics") or {}).items()): - lines.append(f"- {key}: {value}") + _append_metric_block(lines, item.get("metrics") or {}) distribution = item.get("distribution") or {} lines.append(f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}") lines.append(f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}") diff --git a/scripts/evaluation/eval_framework/static/eval_web.css b/scripts/evaluation/eval_framework/static/eval_web.css index e8472ba..2123d40 100644 --- a/scripts/evaluation/eval_framework/static/eval_web.css +++ b/scripts/evaluation/eval_framework/static/eval_web.css @@ -6,7 +6,8 @@ --line: #ddd4c6; --accent: #0f766e; --exact: #0f766e; - --partial: #b7791f; + --high: #b7791f; + --low: #3b82a0; --irrelevant: #b42318; } body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background: @@ -29,6 +30,12 @@ button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; } button.secondary { background: #d9e6e3; color: #12433d; } .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; } + .metric-context { margin: 0 0 12px; line-height: 1.5; } + .metric-section { margin-bottom: 18px; } + .metric-section-head { display: flex; align-items: baseline; justify-content: space-between; gap: 12px; margin-bottom: 10px; } + .metric-section-head h3 { margin: 0; font-size: 14px; color: #12433d; } + .metric-section-head p { margin: 0; color: var(--muted); font-size: 12px; } + .metric-grid { margin-bottom: 0; } .metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; } .metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; } .metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; } @@ -36,8 +43,8 @@ .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; } .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; } .label-exact-match { background: var(--exact); } - .label-high-relevant { background: var(--partial); } - .label-low-relevant { background: #6b5b95; } + .label-high-relevant { background: var(--high); } + .label-low-relevant { background: var(--low); } .label-irrelevant { background: var(--irrelevant); } .badge-unknown { background: #637381; } .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; } @@ -91,3 +98,13 @@ .report-modal-body.report-modal-loading, .report-modal-body.report-modal-error { color: var(--muted); font-style: italic; } .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; } .tip { margin-bottom: 6px; color: var(--muted); } + @media (max-width: 960px) { + .app { grid-template-columns: 1fr; } + .sidebar { border-right: 0; border-bottom: 1px solid var(--line); } + .metric-section-head { flex-direction: column; align-items: flex-start; } + } + @media (max-width: 640px) { + .main, .sidebar { padding: 16px; } + .result { grid-template-columns: 1fr; } + .thumb { width: 100%; max-width: 180px; height: auto; aspect-ratio: 1 / 1; } + } diff --git a/scripts/evaluation/eval_framework/static/eval_web.js b/scripts/evaluation/eval_framework/static/eval_web.js index 33411b2..ec93f38 100644 --- a/scripts/evaluation/eval_framework/static/eval_web.js +++ b/scripts/evaluation/eval_framework/static/eval_web.js @@ -1,186 +1,264 @@ - async function fetchJSON(url, options) { - const res = await fetch(url, options); - if (!res.ok) throw new Error(await res.text()); - return await res.json(); - } - function renderMetrics(metrics) { - const root = document.getElementById('metrics'); - root.innerHTML = ''; - Object.entries(metrics || {}).forEach(([key, value]) => { - const card = document.createElement('div'); - card.className = 'metric'; - card.innerHTML = `
${key}
${value}
`; - root.appendChild(card); - }); - } - function labelBadgeClass(label) { - if (!label || label === 'Unknown') return 'badge-unknown'; - return 'label-' + String(label).toLowerCase().replace(/\s+/g, '-'); - } - function renderResults(results, rootId='results', showRank=true) { - const mount = document.getElementById(rootId); - mount.innerHTML = ''; - (results || []).forEach(item => { - const label = item.label || 'Unknown'; - const box = document.createElement('div'); - box.className = 'result'; - box.innerHTML = ` -
${label}
${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}
- -
-
${item.title || ''}
- ${item.title_zh ? `
${item.title_zh}
` : ''} -
-
${(item.option_values || [])[0] || ''}
-
${(item.option_values || [])[1] || ''}
-
${(item.option_values || [])[2] || ''}
-
-
`; - mount.appendChild(box); - }); - if (!(results || []).length) { - mount.innerHTML = '
None.
'; - } - } - function renderTips(data) { - const root = document.getElementById('tips'); - const tips = [...(data.tips || [])]; - const stats = data.label_stats || {}; - tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed (non-irrelevant): ${stats.missing_relevant_count || 0} — Exact: ${stats.missing_exact_count || 0}, High: ${stats.missing_high_count || 0}, Low: ${stats.missing_low_count || 0}.`); - root.innerHTML = tips.map(text => `
${text}
`).join(''); - } - async function loadQueries() { - const data = await fetchJSON('/api/queries'); - const root = document.getElementById('queryList'); - root.innerHTML = ''; - data.queries.forEach(query => { - const btn = document.createElement('button'); - btn.className = 'query-item'; - btn.textContent = query; - btn.onclick = () => { - document.getElementById('queryInput').value = query; - runSingle(); - }; - root.appendChild(btn); - }); - } - function fmtMetric(m, key, digits) { - const v = m && m[key]; - if (v == null || Number.isNaN(Number(v))) return null; - const n = Number(v); - return n.toFixed(digits); - } - function historySummaryHtml(meta) { - const m = meta && meta.aggregate_metrics; - const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null; - const parts = []; - if (nq != null) parts.push(`Queries ${nq}`); - const p10 = fmtMetric(m, 'P@10', 3); - const p52 = fmtMetric(m, 'P@5_2_3', 3); - const map3 = fmtMetric(m, 'MAP_3', 3); - if (p10) parts.push(`P@10 ${p10}`); - if (p52) parts.push(`P@5_2_3 ${p52}`); - if (map3) parts.push(`MAP_3 ${map3}`); - if (!parts.length) return ''; - return `
${parts.join(' · ')}
`; - } - async function loadHistory() { - const data = await fetchJSON('/api/history'); - const root = document.getElementById('history'); - root.classList.remove('muted'); - const items = data.history || []; - if (!items.length) { - root.innerHTML = 'No history yet.'; - return; - } - root.innerHTML = `
`; - const list = root.querySelector('.history-list'); - items.forEach(item => { - const btn = document.createElement('button'); - btn.type = 'button'; - btn.className = 'history-item'; - btn.setAttribute('aria-label', `Open report ${item.batch_id}`); - const sum = historySummaryHtml(item.metadata); - btn.innerHTML = `
${item.batch_id}
-
${item.created_at} · tenant ${item.tenant_id}
${sum}`; - btn.onclick = () => openBatchReport(item.batch_id); - list.appendChild(btn); - }); - } - let _lastReportPath = ''; - function closeReportModal() { - const el = document.getElementById('reportModal'); - el.classList.remove('is-open'); - el.setAttribute('aria-hidden', 'true'); - document.getElementById('reportModalBody').innerHTML = ''; - document.getElementById('reportModalMeta').textContent = ''; - } - async function openBatchReport(batchId) { - const el = document.getElementById('reportModal'); - const body = document.getElementById('reportModalBody'); - const metaEl = document.getElementById('reportModalMeta'); - const titleEl = document.getElementById('reportModalTitle'); - el.classList.add('is-open'); - el.setAttribute('aria-hidden', 'false'); - titleEl.textContent = batchId; - metaEl.textContent = ''; - body.className = 'report-modal-body batch-report-md report-modal-loading'; - body.textContent = 'Loading report…'; - try { - const rep = await fetchJSON('/api/history/' + encodeURIComponent(batchId) + '/report'); - _lastReportPath = rep.report_markdown_path || ''; - metaEl.textContent = rep.report_markdown_path || ''; - const raw = marked.parse(rep.markdown || '', { gfm: true }); - const safe = DOMPurify.sanitize(raw, { USE_PROFILES: { html: true } }); - body.className = 'report-modal-body batch-report-md'; - body.innerHTML = safe; - } catch (e) { - body.className = 'report-modal-body report-modal-error'; - body.textContent = (e && e.message) ? e.message : String(e); - } - } - document.getElementById('reportModal').addEventListener('click', (ev) => { - if (ev.target && ev.target.getAttribute('data-close-report') === '1') closeReportModal(); +async function fetchJSON(url, options) { + const res = await fetch(url, options); + if (!res.ok) throw new Error(await res.text()); + return await res.json(); +} + +function fmtNumber(value, digits = 3) { + if (value == null || Number.isNaN(Number(value))) return "-"; + return Number(value).toFixed(digits); +} + +function metricSections(metrics) { + const groups = [ + { + title: "Primary Ranking", + keys: ["NDCG@5", "NDCG@10", "NDCG@20", "NDCG@50"], + description: "Graded ranking quality across the four relevance tiers.", + }, + { + title: "Top Slot Quality", + keys: ["Exact_Precision@5", "Exact_Precision@10", "Strong_Precision@5", "Strong_Precision@10", "Strong_Precision@20"], + description: "How much of the visible top rank is exact or strong business relevance.", + }, + { + title: "Recall Coverage", + keys: ["Useful_Precision@10", "Useful_Precision@20", "Useful_Precision@50", "Gain_Recall@10", "Gain_Recall@20", "Gain_Recall@50"], + description: "How much judged relevance is captured in the returned list.", + }, + { + title: "First Good Result", + keys: ["Exact_Success@5", "Exact_Success@10", "Strong_Success@5", "Strong_Success@10", "MRR_Exact@10", "MRR_Strong@10", "Avg_Grade@10"], + description: "Whether users see a good result early and how good the top page feels overall.", + }, + ]; + const seen = new Set(); + return groups + .map((group) => { + const items = group.keys + .filter((key) => metrics && Object.prototype.hasOwnProperty.call(metrics, key)) + .map((key) => { + seen.add(key); + return [key, metrics[key]]; + }); + return { ...group, items }; + }) + .filter((group) => group.items.length) + .concat( + (() => { + const rest = Object.entries(metrics || {}).filter(([key]) => !seen.has(key)); + return rest.length + ? [{ title: "Other Metrics", description: "", items: rest }] + : []; + })() + ); +} + +function renderMetrics(metrics, metricContext) { + const root = document.getElementById("metrics"); + root.innerHTML = ""; + const ctx = document.getElementById("metricContext"); + const gainScheme = metricContext && metricContext.gain_scheme; + const primary = metricContext && metricContext.primary_metric; + ctx.textContent = primary + ? `Primary metric: ${primary}. Gain scheme: ${Object.entries(gainScheme || {}).map(([label, gain]) => `${label}=${gain}`).join(", ")}.` + : ""; + + metricSections(metrics || {}).forEach((section) => { + const wrap = document.createElement("section"); + wrap.className = "metric-section"; + wrap.innerHTML = ` +
+

${section.title}

+ ${section.description ? `

${section.description}

` : ""} +
+
+ `; + const grid = wrap.querySelector(".metric-grid"); + section.items.forEach(([key, value]) => { + const card = document.createElement("div"); + card.className = "metric"; + card.innerHTML = `
${key}
${fmtNumber(value)}
`; + grid.appendChild(card); }); - document.addEventListener('keydown', (ev) => { - if (ev.key === 'Escape') closeReportModal(); - }); - document.getElementById('reportCopyPath').addEventListener('click', async () => { - if (!_lastReportPath) return; - try { - await navigator.clipboard.writeText(_lastReportPath); - } catch (_) {} - }); - async function runSingle() { - const query = document.getElementById('queryInput').value.trim(); - if (!query) return; - document.getElementById('status').textContent = `Evaluating "${query}"...`; - const data = await fetchJSON('/api/search-eval', { - method: 'POST', - headers: {'Content-Type': 'application/json'}, - body: JSON.stringify({query, top_k: 100, auto_annotate: false}) - }); - document.getElementById('status').textContent = `Done. total=${data.total}`; - renderMetrics(data.metrics); - renderResults(data.results, 'results', true); - renderResults(data.missing_relevant, 'missingRelevant', false); - renderTips(data); - loadHistory(); - } - async function runBatch() { - document.getElementById('status').textContent = 'Running batch evaluation...'; - const data = await fetchJSON('/api/batch-eval', { - method: 'POST', - headers: {'Content-Type': 'application/json'}, - body: JSON.stringify({top_k: 100, auto_annotate: false}) - }); - document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`; - renderMetrics(data.aggregate_metrics); - renderResults([], 'results', true); - renderResults([], 'missingRelevant', false); - document.getElementById('tips').innerHTML = '
Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.
'; - loadHistory(); - } - loadQueries(); - loadHistory(); - + root.appendChild(wrap); + }); +} + +function labelBadgeClass(label) { + if (!label || label === "Unknown") return "badge-unknown"; + return "label-" + String(label).toLowerCase().replace(/\s+/g, "-"); +} + +function renderResults(results, rootId = "results", showRank = true) { + const mount = document.getElementById(rootId); + mount.innerHTML = ""; + (results || []).forEach((item) => { + const label = item.label || "Unknown"; + const box = document.createElement("div"); + box.className = "result"; + box.innerHTML = ` +
${label}
${showRank ? `#${item.rank || "-"}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : "not recalled")}
+ +
+
${item.title || ""}
+ ${item.title_zh ? `
${item.title_zh}
` : ""} +
+
${(item.option_values || [])[0] || ""}
+
${(item.option_values || [])[1] || ""}
+
${(item.option_values || [])[2] || ""}
+
+
`; + mount.appendChild(box); + }); + if (!(results || []).length) { + mount.innerHTML = '
None.
'; + } +} + +function renderTips(data) { + const root = document.getElementById("tips"); + const tips = [...(data.tips || [])]; + const stats = data.label_stats || {}; + tips.unshift( + `Cached labels: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed judged useful results: ${stats.missing_relevant_count || 0} (Exact ${stats.missing_exact_count || 0}, High ${stats.missing_high_count || 0}, Low ${stats.missing_low_count || 0}).` + ); + root.innerHTML = tips.map((text) => `
${text}
`).join(""); +} + +async function loadQueries() { + const data = await fetchJSON("/api/queries"); + const root = document.getElementById("queryList"); + root.innerHTML = ""; + data.queries.forEach((query) => { + const btn = document.createElement("button"); + btn.className = "query-item"; + btn.textContent = query; + btn.onclick = () => { + document.getElementById("queryInput").value = query; + runSingle(); + }; + root.appendChild(btn); + }); +} + +function historySummaryHtml(meta) { + const m = meta && meta.aggregate_metrics; + const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null; + const parts = []; + if (nq != null) parts.push(`Queries ${nq}`); + if (m && m["NDCG@10"] != null) parts.push(`NDCG@10 ${fmtNumber(m["NDCG@10"])}`); + if (m && m["Strong_Precision@10"] != null) parts.push(`Strong@10 ${fmtNumber(m["Strong_Precision@10"])}`); + if (m && m["Gain_Recall@50"] != null) parts.push(`Gain Recall@50 ${fmtNumber(m["Gain_Recall@50"])}`); + if (!parts.length) return ""; + return `
${parts.join(" · ")}
`; +} + +async function loadHistory() { + const data = await fetchJSON("/api/history"); + const root = document.getElementById("history"); + root.classList.remove("muted"); + const items = data.history || []; + if (!items.length) { + root.innerHTML = 'No history yet.'; + return; + } + root.innerHTML = `
`; + const list = root.querySelector(".history-list"); + items.forEach((item) => { + const btn = document.createElement("button"); + btn.type = "button"; + btn.className = "history-item"; + btn.setAttribute("aria-label", `Open report ${item.batch_id}`); + const sum = historySummaryHtml(item.metadata); + btn.innerHTML = `
${item.batch_id}
+
${item.created_at} · tenant ${item.tenant_id}
${sum}`; + btn.onclick = () => openBatchReport(item.batch_id); + list.appendChild(btn); + }); +} + +let _lastReportPath = ""; + +function closeReportModal() { + const el = document.getElementById("reportModal"); + el.classList.remove("is-open"); + el.setAttribute("aria-hidden", "true"); + document.getElementById("reportModalBody").innerHTML = ""; + document.getElementById("reportModalMeta").textContent = ""; +} + +async function openBatchReport(batchId) { + const el = document.getElementById("reportModal"); + const body = document.getElementById("reportModalBody"); + const metaEl = document.getElementById("reportModalMeta"); + const titleEl = document.getElementById("reportModalTitle"); + el.classList.add("is-open"); + el.setAttribute("aria-hidden", "false"); + titleEl.textContent = batchId; + metaEl.textContent = ""; + body.className = "report-modal-body batch-report-md report-modal-loading"; + body.textContent = "Loading report…"; + try { + const rep = await fetchJSON("/api/history/" + encodeURIComponent(batchId) + "/report"); + _lastReportPath = rep.report_markdown_path || ""; + metaEl.textContent = rep.report_markdown_path || ""; + const raw = marked.parse(rep.markdown || "", { gfm: true }); + const safe = DOMPurify.sanitize(raw, { USE_PROFILES: { html: true } }); + body.className = "report-modal-body batch-report-md"; + body.innerHTML = safe; + } catch (e) { + body.className = "report-modal-body report-modal-error"; + body.textContent = e && e.message ? e.message : String(e); + } +} + +document.getElementById("reportModal").addEventListener("click", (ev) => { + if (ev.target && ev.target.getAttribute("data-close-report") === "1") closeReportModal(); +}); + +document.addEventListener("keydown", (ev) => { + if (ev.key === "Escape") closeReportModal(); +}); + +document.getElementById("reportCopyPath").addEventListener("click", async () => { + if (!_lastReportPath) return; + try { + await navigator.clipboard.writeText(_lastReportPath); + } catch (_) {} +}); + +async function runSingle() { + const query = document.getElementById("queryInput").value.trim(); + if (!query) return; + document.getElementById("status").textContent = `Evaluating "${query}"...`; + const data = await fetchJSON("/api/search-eval", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ query, top_k: 100, auto_annotate: false }), + }); + document.getElementById("status").textContent = `Done. total=${data.total}`; + renderMetrics(data.metrics, data.metric_context); + renderResults(data.results, "results", true); + renderResults(data.missing_relevant, "missingRelevant", false); + renderTips(data); + loadHistory(); +} + +async function runBatch() { + document.getElementById("status").textContent = "Running batch evaluation..."; + const data = await fetchJSON("/api/batch-eval", { + method: "POST", + headers: { "Content-Type": "application/json" }, + body: JSON.stringify({ top_k: 100, auto_annotate: false }), + }); + document.getElementById("status").textContent = `Batch done. report=${data.batch_id}`; + renderMetrics(data.aggregate_metrics, data.metric_context); + renderResults([], "results", true); + renderResults([], "missingRelevant", false); + document.getElementById("tips").innerHTML = '
Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.
'; + loadHistory(); +} + +loadQueries(); +loadHistory(); diff --git a/scripts/evaluation/eval_framework/static/index.html b/scripts/evaluation/eval_framework/static/index.html index 3333781..974945d 100644 --- a/scripts/evaluation/eval_framework/static/index.html +++ b/scripts/evaluation/eval_framework/static/index.html @@ -30,6 +30,7 @@

Metrics

+

@@ -37,7 +38,7 @@
-

Missed non-irrelevant (cached)

+

Missed judged useful results

@@ -67,4 +68,4 @@ - \ No newline at end of file + diff --git a/scripts/evaluation/tune_fusion.py b/scripts/evaluation/tune_fusion.py index de40a49..23b0bb4 100644 --- a/scripts/evaluation/tune_fusion.py +++ b/scripts/evaluation/tune_fusion.py @@ -150,7 +150,7 @@ def render_markdown(summary: Dict[str, Any]) -> str: "", "## Experiments", "", - "| Rank | Name | Score | MAP_3 | MAP_2_3 | P@5 | P@10 | Config |", + "| Rank | Name | Score | NDCG@10 | NDCG@20 | Strong@10 | Gain Recall@50 | Config |", "|---|---|---:|---:|---:|---:|---:|---|", ] for idx, item in enumerate(summary["experiments"], start=1): @@ -162,10 +162,10 @@ def render_markdown(summary: Dict[str, Any]) -> str: str(idx), item["name"], str(item["score"]), - str(metrics.get("MAP_3", "")), - str(metrics.get("MAP_2_3", "")), - str(metrics.get("P@5", "")), - str(metrics.get("P@10", "")), + str(metrics.get("NDCG@10", "")), + str(metrics.get("NDCG@20", "")), + str(metrics.get("Strong_Precision@10", "")), + str(metrics.get("Gain_Recall@50", "")), item["config_snapshot_path"], ] ) @@ -206,7 +206,7 @@ def build_parser() -> argparse.ArgumentParser: parser.add_argument("--language", default="en") parser.add_argument("--experiments-file", required=True) parser.add_argument("--search-base-url", default="http://127.0.0.1:6002") - parser.add_argument("--score-metric", default="MAP_3") + parser.add_argument("--score-metric", default="NDCG@10") parser.add_argument("--apply-best", action="store_true") parser.add_argument("--force-refresh-labels-first-pass", action="store_true") return parser -- libgit2 0.21.2