diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index 9beb859..532ef59 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -2,7 +2,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, audit tooling, and the fusion-tuning runner for retrieval quality. -**Design:** Build labels offline for a fixed query set (`queries/queries.txt`). Single-query and batch evaluation map recalled `spu_id` values to the SQLite cache. Items without cached labels are scored as `Irrelevant`, and the UI/API surfaces tips when coverage is incomplete. +**Design:** Build labels offline for a fixed query set (`queries/queries.txt`). Single-query and batch evaluation map recalled `spu_id` values to the SQLite cache. Items without cached labels are scored as `Irrelevant`, and the UI/API surfaces tips when judged coverage is incomplete. Evaluation now uses a graded four-tier relevance system and ranking metrics centered on `NDCG`. ## What it does @@ -112,9 +112,33 @@ Default root: `artifacts/search_evaluation/` ## Labels -- **Exact** — Matches intended product type and all explicit required attributes. -- **Partial** — Main intent matches; attributes missing, approximate, or weaker. -- **Irrelevant** — Type mismatch or conflicting required attributes. +- **Exact Match** — Matches intended product type and all explicit required attributes. +- **High Relevant** — Main intent matches and is a strong substitute, but some attributes are missing, weaker, or slightly off. +- **Low Relevant** — Only a weak substitute; may share scenario, style, or broad category but is no longer a strong match. +- **Irrelevant** — Type mismatch or important conflicts make it a poor search result. + +## Metric design + +This framework now follows graded ranking evaluation closer to e-commerce best practice instead of collapsing everything into binary relevance. 
+ +- **Primary metric: `NDCG@10`** + Uses the four labels as graded gains and rewards both relevance and early placement. +- **Gain scheme** + `Exact Match=7`, `High Relevant=3`, `Low Relevant=1`, `Irrelevant=0` + The gains come from relevance grades (`rel`) `3/2/1/0` with `gain = 2^rel - 1`, a standard `NDCG` setup. +- **Why this is better** + `NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping an `Exact Match` with a `Low Relevant` item is penalized more than swapping `High Relevant` with `Low Relevant`. + +The reported metrics are: + +- **`NDCG@5`, `NDCG@10`, `NDCG@20`, `NDCG@50`** — Primary graded ranking quality. +- **`Exact_Precision@K`** — Strict top-slot quality when only `Exact Match` counts. +- **`Strong_Precision@K`** — Business-facing top-slot quality where `Exact Match` and `High Relevant` both count as strong positives. +- **`Useful_Precision@K`** — Broader usefulness where any non-irrelevant result counts. +- **`Gain_Recall@K`** — Gain captured in the first K returned results as a fraction of the total gain in the judged label pool for the query. +- **`Exact_Success@K` / `Strong_Success@K`** — Whether at least one exact or strong result appears in the first K. +- **`MRR_Exact@10` / `MRR_Strong@10`** — How early the first exact or strong result appears within the first 10 results (reciprocal rank; 0 if none appears). +- **`Avg_Grade@10`** — Average relevance grade (0–3) of the visible first page. **Labeler modes:** `simple` (default): one judging pass per batch with the standard relevance prompt. `complex`: query-profile extraction plus extra guardrails (for structured experiments). @@ -139,11 +163,11 @@ Default root: `artifacts/search_evaluation/` ## Web UI -Features: query list from `queries.txt`, single-query and batch evaluation, batch report history, top recalls, missed Exact/Partial, and coverage tips for unlabeled hits. +Features: query list from `queries.txt`, single-query and batch evaluation, batch report history, grouped graded-metric cards, top recalls, judged-useful results missed by the recall set, and coverage tips for unlabeled hits. 
## Batch reports -Each run stores aggregate and per-query metrics, label distribution, timestamp, and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`. +Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`. ## Caveats diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py index 2fdd865..19e9194 100644 --- a/scripts/evaluation/eval_framework/constants.py +++ b/scripts/evaluation/eval_framework/constants.py @@ -14,8 +14,22 @@ RELEVANCE_IRRELEVANT = "Irrelevant" VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) -# Precision / MAP "positive" set (all non-irrelevant tiers) +# Useful label sets for binary diagnostic slices layered on top of graded ranking metrics. RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW}) +RELEVANCE_STRONG = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH}) + +# Graded relevance for ranking evaluation. +# We use rel grades 3/2/1/0 and gain = 2^rel - 1, which is standard for NDCG-style metrics. 
+RELEVANCE_GRADE_MAP = { + RELEVANCE_EXACT: 3, + RELEVANCE_HIGH: 2, + RELEVANCE_LOW: 1, + RELEVANCE_IRRELEVANT: 0, +} +RELEVANCE_GAIN_MAP = { + label: (2 ** grade) - 1 + for label, grade in RELEVANCE_GRADE_MAP.items() +} _LEGACY_LABEL_MAP = { "Exact": RELEVANCE_EXACT, diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index 5c8fcc6..d71a5d7 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -26,6 +26,7 @@ from .constants import ( DEFAULT_RERANK_HIGH_THRESHOLD, DEFAULT_SEARCH_RECALL_TOP_K, RELEVANCE_EXACT, + RELEVANCE_GAIN_MAP, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW, @@ -50,6 +51,18 @@ from .utils import ( _log = logging.getLogger("search_eval.framework") +def _metric_context_payload() -> Dict[str, Any]: + return { + "primary_metric": "NDCG@10", + "gain_scheme": dict(RELEVANCE_GAIN_MAP), + "notes": [ + "NDCG uses graded gains derived from the four relevance labels.", + "Strong metrics treat Exact Match and High Relevant as strong business positives.", + "Useful metrics treat any non-irrelevant item as useful recall coverage.", + ], + } + + def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: """Map ``spu_id`` -> Chinese title from ``debug_info.per_result[].title_multilingual``.""" out: Dict[str, str] = {} @@ -607,7 +620,7 @@ class SearchEvaluationFramework: item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT for item in search_labeled_results[:100] ] - metrics = compute_query_metrics(top100_labels) + metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) output_dir = ensure_dir(self.artifact_root / "query_builds") run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}" output_json_path = output_dir / f"{run_id}.json" @@ -629,6 +642,7 @@ class SearchEvaluationFramework: "pool_size": len(pool_docs), }, "metrics_top100": metrics, + 
"metric_context": _metric_context_payload(), "search_results": search_labeled_results, "full_rerank_top": rerank_top_results, } @@ -816,7 +830,7 @@ class SearchEvaluationFramework: item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT for item in search_labeled_results[:100] ] - metrics = compute_query_metrics(top100_labels) + metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) output_dir = ensure_dir(self.artifact_root / "query_builds") run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}" output_json_path = output_dir / f"{run_id}.json" @@ -838,6 +852,7 @@ class SearchEvaluationFramework: "ordered_union_size": pool_docs_count, }, "metrics_top100": metrics, + "metric_context": _metric_context_payload(), "search_results": search_labeled_results, "full_rerank_top": rerank_top_results, } @@ -897,6 +912,10 @@ class SearchEvaluationFramework: item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT for item in labeled ] + ideal_labels = [ + label if label in VALID_LABELS else RELEVANCE_IRRELEVANT + for label in labels.values() + ] label_stats = self.store.get_query_label_stats(self.tenant_id, query) rerank_scores = self.store.get_rerank_scores(self.tenant_id, query) relevant_missing_ids = [ @@ -947,12 +966,13 @@ class SearchEvaluationFramework: if unlabeled_hits: tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.") if not missing_relevant: - tips.append("No cached non-irrelevant products were missed by this recall set.") + tips.append("No cached judged useful products were missed by this recall set.") return { "query": query, "tenant_id": self.tenant_id, "top_k": top_k, - "metrics": compute_query_metrics(metric_labels), + "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels), + "metric_context": _metric_context_payload(), "results": labeled, "missing_relevant": missing_relevant, "label_stats": { @@ 
-1004,12 +1024,12 @@ class SearchEvaluationFramework: ) m = live["metrics"] _log.info( - "[batch-eval] (%s/%s) query=%r P@10=%s MAP_3=%s total_hits=%s", + "[batch-eval] (%s/%s) query=%r NDCG@10=%s Strong_Precision@10=%s total_hits=%s", q_index, total_q, query, - m.get("P@10"), - m.get("MAP_3"), + m.get("NDCG@10"), + m.get("Strong_Precision@10"), live.get("total"), ) aggregate = aggregate_metrics([item["metrics"] for item in per_query]) @@ -1033,6 +1053,7 @@ class SearchEvaluationFramework: "queries": list(queries), "top_k": top_k, "aggregate_metrics": aggregate, + "metric_context": _metric_context_payload(), "aggregate_distribution": aggregate_distribution, "per_query": per_query, "config_snapshot_path": str(config_snapshot_path), diff --git a/scripts/evaluation/eval_framework/metrics.py b/scripts/evaluation/eval_framework/metrics.py index 542a993..7848023 100644 --- a/scripts/evaluation/eval_framework/metrics.py +++ b/scripts/evaluation/eval_framework/metrics.py @@ -1,56 +1,142 @@ -"""IR metrics for labeled result lists.""" +"""Ranking metrics for graded e-commerce relevance labels.""" from __future__ import annotations -from typing import Dict, Sequence +import math +from typing import Dict, Iterable, Sequence -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_NON_IRRELEVANT +from .constants import ( + RELEVANCE_EXACT, + RELEVANCE_GAIN_MAP, + RELEVANCE_GRADE_MAP, + RELEVANCE_HIGH, + RELEVANCE_IRRELEVANT, + RELEVANCE_LOW, + RELEVANCE_NON_IRRELEVANT, + RELEVANCE_STRONG, +) -def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float: +def _normalize_label(label: str) -> str: + if label in RELEVANCE_GRADE_MAP: + return label + return RELEVANCE_IRRELEVANT + + +def _gains_for_labels(labels: Sequence[str]) -> list[float]: + return [float(RELEVANCE_GAIN_MAP.get(_normalize_label(label), 0.0)) for label in labels] + + +def _binary_hits(labels: Sequence[str], relevant: Iterable[str]) -> list[int]: 
+ relevant_set = set(relevant) + return [1 if _normalize_label(label) in relevant_set else 0 for label in labels] + + +def _precision_at_k_from_hits(hits: Sequence[int], k: int) -> float: if k <= 0: return 0.0 - sliced = list(labels[:k]) + sliced = list(hits[:k]) if not sliced: return 0.0 - rel = set(relevant) - hits = sum(1 for label in sliced if label in rel) - return hits / float(min(k, len(sliced))) + return sum(sliced) / float(len(sliced)) + + +def _success_at_k_from_hits(hits: Sequence[int], k: int) -> float: + if k <= 0: + return 0.0 + return 1.0 if any(hits[:k]) else 0.0 + + +def _reciprocal_rank_from_hits(hits: Sequence[int], k: int) -> float: + if k <= 0: + return 0.0 + for idx, hit in enumerate(hits[:k], start=1): + if hit: + return 1.0 / float(idx) + return 0.0 -def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float: - rel = set(relevant) - hit_count = 0 - precision_sum = 0.0 - for idx, label in enumerate(labels, start=1): - if label not in rel: +def _dcg_at_k(gains: Sequence[float], k: int) -> float: + if k <= 0: + return 0.0 + total = 0.0 + for idx, gain in enumerate(gains[:k], start=1): + if gain <= 0.0: continue - hit_count += 1 - precision_sum += hit_count / idx - if hit_count == 0: + total += gain / math.log2(idx + 1.0) + return total + + +def _ndcg_at_k(labels: Sequence[str], ideal_labels: Sequence[str], k: int) -> float: + actual_gains = _gains_for_labels(labels) + ideal_gains = sorted(_gains_for_labels(ideal_labels), reverse=True) + dcg = _dcg_at_k(actual_gains, k) + idcg = _dcg_at_k(ideal_gains, k) + if idcg <= 0.0: + return 0.0 + return dcg / idcg + + +def _gain_recall_at_k(labels: Sequence[str], ideal_labels: Sequence[str], k: int) -> float: + ideal_total_gain = sum(_gains_for_labels(ideal_labels)) + if ideal_total_gain <= 0.0: return 0.0 - return precision_sum / hit_count + actual_gain = sum(_gains_for_labels(labels[:k])) + return actual_gain / ideal_total_gain -def compute_query_metrics(labels: Sequence[str]) -> 
Dict[str, float]: - """P@k / MAP_3: Exact Match only. P@k_2_3 / MAP_2_3: any non-irrelevant tier (legacy metric names).""" +def _grade_avg_at_k(labels: Sequence[str], k: int) -> float: + if k <= 0: + return 0.0 + sliced = [_normalize_label(label) for label in labels[:k]] + if not sliced: + return 0.0 + return sum(float(RELEVANCE_GRADE_MAP.get(label, 0)) for label in sliced) / float(len(sliced)) + + +def compute_query_metrics( + labels: Sequence[str], + *, + ideal_labels: Sequence[str] | None = None, +) -> Dict[str, float]: + """Compute graded ranking metrics plus binary diagnostic slices. + + `labels` are the ranked results returned by search. + `ideal_labels` is the judged label pool for the same query; when omitted we fall back + to the retrieved labels, which still keeps the metrics well-defined. + """ + + ideal = list(ideal_labels) if ideal_labels is not None else list(labels) metrics: Dict[str, float] = {} - non_irrel = list(RELEVANCE_NON_IRRELEVANT) + + exact_hits = _binary_hits(labels, [RELEVANCE_EXACT]) + strong_hits = _binary_hits(labels, RELEVANCE_STRONG) + useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT) + for k in (5, 10, 20, 50): - metrics[f"P@{k}"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT]), 6) - metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, non_irrel), 6) - metrics["MAP_3"] = round(average_precision(labels, [RELEVANCE_EXACT]), 6) - metrics["MAP_2_3"] = round(average_precision(labels, non_irrel), 6) + metrics[f"NDCG@{k}"] = round(_ndcg_at_k(labels, ideal, k), 6) + for k in (5, 10, 20): + metrics[f"Exact_Precision@{k}"] = round(_precision_at_k_from_hits(exact_hits, k), 6) + metrics[f"Strong_Precision@{k}"] = round(_precision_at_k_from_hits(strong_hits, k), 6) + for k in (10, 20, 50): + metrics[f"Useful_Precision@{k}"] = round(_precision_at_k_from_hits(useful_hits, k), 6) + metrics[f"Gain_Recall@{k}"] = round(_gain_recall_at_k(labels, ideal, k), 6) + for k in (5, 10): + metrics[f"Exact_Success@{k}"] = 
round(_success_at_k_from_hits(exact_hits, k), 6) + metrics[f"Strong_Success@{k}"] = round(_success_at_k_from_hits(strong_hits, k), 6) + metrics["MRR_Exact@10"] = round(_reciprocal_rank_from_hits(exact_hits, 10), 6) + metrics["MRR_Strong@10"] = round(_reciprocal_rank_from_hits(strong_hits, 10), 6) + metrics["Avg_Grade@10"] = round(_grade_avg_at_k(labels, 10), 6) return metrics def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]: if not metric_items: return {} - keys = sorted(metric_items[0].keys()) + all_keys = sorted({key for item in metric_items for key in item.keys()}) return { key: round(sum(float(item.get(key, 0.0)) for item in metric_items) / len(metric_items), 6) - for key in keys + for key in all_keys } diff --git a/scripts/evaluation/eval_framework/reports.py b/scripts/evaluation/eval_framework/reports.py index 7587b57..2df34d3 100644 --- a/scripts/evaluation/eval_framework/reports.py +++ b/scripts/evaluation/eval_framework/reports.py @@ -7,6 +7,19 @@ from typing import Any, Dict from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW +def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None: + primary_keys = ("NDCG@5", "NDCG@10", "NDCG@20", "Exact_Precision@10", "Strong_Precision@10", "Gain_Recall@50") + included = set() + for key in primary_keys: + if key in metrics: + lines.append(f"- {key}: {metrics[key]}") + included.add(key) + for key, value in sorted(metrics.items()): + if key in included: + continue + lines.append(f"- {key}: {value}") + + def render_batch_report_markdown(payload: Dict[str, Any]) -> str: lines = [ "# Search Batch Evaluation", @@ -20,8 +33,16 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: "## Aggregate Metrics", "", ] - for key, value in sorted((payload.get("aggregate_metrics") or {}).items()): - lines.append(f"- {key}: {value}") + metric_context = payload.get("metric_context") or {} + if metric_context: + lines.extend( + [ + 
f"- Primary metric: {metric_context.get('primary_metric', 'N/A')}", + f"- Gain scheme: {metric_context.get('gain_scheme', {})}", + "", + ] + ) + _append_metric_block(lines, payload.get("aggregate_metrics") or {}) distribution = payload.get("aggregate_distribution") or {} if distribution: lines.extend( @@ -39,8 +60,7 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: for item in payload.get("per_query") or []: lines.append(f"### {item['query']}") lines.append("") - for key, value in sorted((item.get("metrics") or {}).items()): - lines.append(f"- {key}: {value}") + _append_metric_block(lines, item.get("metrics") or {}) distribution = item.get("distribution") or {} lines.append(f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}") lines.append(f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}") diff --git a/scripts/evaluation/eval_framework/static/eval_web.css b/scripts/evaluation/eval_framework/static/eval_web.css index e8472ba..2123d40 100644 --- a/scripts/evaluation/eval_framework/static/eval_web.css +++ b/scripts/evaluation/eval_framework/static/eval_web.css @@ -6,7 +6,8 @@ --line: #ddd4c6; --accent: #0f766e; --exact: #0f766e; - --partial: #b7791f; + --high: #b7791f; + --low: #3b82a0; --irrelevant: #b42318; } body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background: @@ -29,6 +30,12 @@ button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; } button.secondary { background: #d9e6e3; color: #12433d; } .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; } + .metric-context { margin: 0 0 12px; line-height: 1.5; } + .metric-section { margin-bottom: 18px; } + .metric-section-head { display: flex; align-items: baseline; justify-content: space-between; gap: 12px; margin-bottom: 10px; } + .metric-section-head h3 { margin: 0; font-size: 14px; color: #12433d; } 
+ .metric-section-head p { margin: 0; color: var(--muted); font-size: 12px; } + .metric-grid { margin-bottom: 0; } .metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; } .metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; } .metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; } @@ -36,8 +43,8 @@ .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; } .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; } .label-exact-match { background: var(--exact); } - .label-high-relevant { background: var(--partial); } - .label-low-relevant { background: #6b5b95; } + .label-high-relevant { background: var(--high); } + .label-low-relevant { background: var(--low); } .label-irrelevant { background: var(--irrelevant); } .badge-unknown { background: #637381; } .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; } @@ -91,3 +98,13 @@ .report-modal-body.report-modal-loading, .report-modal-body.report-modal-error { color: var(--muted); font-style: italic; } .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; } .tip { margin-bottom: 6px; color: var(--muted); } + @media (max-width: 960px) { + .app { grid-template-columns: 1fr; } + .sidebar { border-right: 0; border-bottom: 1px solid var(--line); } + .metric-section-head { flex-direction: column; align-items: flex-start; } + } + @media (max-width: 640px) { + .main, .sidebar { padding: 16px; } + .result { grid-template-columns: 1fr; } + .thumb { width: 100%; max-width: 180px; height: auto; aspect-ratio: 1 / 1; } + } diff --git a/scripts/evaluation/eval_framework/static/eval_web.js 
b/scripts/evaluation/eval_framework/static/eval_web.js index 33411b2..ec93f38 100644 --- a/scripts/evaluation/eval_framework/static/eval_web.js +++ b/scripts/evaluation/eval_framework/static/eval_web.js @@ -1,186 +1,264 @@ - async function fetchJSON(url, options) { - const res = await fetch(url, options); - if (!res.ok) throw new Error(await res.text()); - return await res.json(); - } - function renderMetrics(metrics) { - const root = document.getElementById('metrics'); - root.innerHTML = ''; - Object.entries(metrics || {}).forEach(([key, value]) => { - const card = document.createElement('div'); - card.className = 'metric'; - card.innerHTML = `
${section.description}
` : ""} +