Commit d73ca84a48afc0945a533707c77ba3bbfaac9621

Authored by tangwang
1 parent 1fdab52d

refine eval case snapshots and rename relevance levels

scripts/evaluation/README.md
... ... @@ -127,8 +127,8 @@ This framework now follows graded ranking evaluation closer to e-commerce best p
127 127 - **Composite tuning score: `Primary_Metric_Score`**
128 128 For experiment ranking we compute the mean of the primary scorecard after normalizing `Avg_Grade@10` by the max grade (`3`).
129 129 - **Gain scheme**
130   - `Fully Relevant=7`, `Mostly Relevant=3`, `Weakly Relevant=1`, `Irrelevant=0`
131   - The gains come from rel grades `3/2/1/0` with `gain = 2^rel - 1`, a standard `NDCG` setup.
  130 + `Fully Relevant=3`, `Mostly Relevant=2`, `Weakly Relevant=1`, `Irrelevant=0`
  131 + We keep the rel grades `3/2/1/0`, but the current implementation uses the grade values directly as gains, so the exact/high gap is less aggressive.
132 132 - **Why this is better**
133 133 `NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping an `Fully Relevant` with a `Weakly Relevant` item is penalized more than swapping `Mostly Relevant` with `Weakly Relevant`.
134 134  
... ... @@ -174,6 +174,22 @@ Features: query list from `queries.txt`, single-query and batch evaluation, batc
174 174  
175 175 Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`.
176 176  
  177 +To make later case analysis reproducible without digging through backend logs, each per-query record in the batch JSON now also includes:
  178 +
  179 +- `request_id` — the exact `X-Request-ID` sent by the evaluator for that live search call
  180 +- `top_label_sequence_top10` / `top_label_sequence_top20` — compact label sequence strings such as `1:L3 | 2:L1 | 3:L2`
  181 +- `top_results` — a lightweight snapshot of up to the top 20 results (capped at the run's `top_k`) with `rank`, `spu_id`, `label`, title fields, and `relevance_score`
  182 +
  183 +The Markdown report now surfaces the same case context in a lighter human-readable form:
  184 +
  185 +- request id
  186 +- top-10 / top-20 label sequence
  187 +- top 5 result snapshot for quick scanning
  188 +
  189 +This means a bad case can usually be reconstructed directly from the batch artifact itself, without replaying logs or joining SQLite tables by hand.
  190 +
  191 +The web history endpoint intentionally returns a compact summary only (aggregate metrics plus query count), so adding richer per-query snapshots to the batch payload does not bloat the history list UI.
  192 +
177 193 ## Ranking debug and LTR prep
178 194  
179 195 `debug_info` now exposes two extra layers that are useful for tuning and future learning-to-rank work:
... ...
scripts/evaluation/eval_framework/__init__.py
... ... @@ -14,10 +14,10 @@ from .constants import ( # noqa: E402
14 14 DEFAULT_ARTIFACT_ROOT,
15 15 DEFAULT_QUERY_FILE,
16 16 PROJECT_ROOT,
17   - RELEVANCE_EXACT,
18   - RELEVANCE_HIGH,
19   - RELEVANCE_IRRELEVANT,
20   - RELEVANCE_LOW,
  17 + RELEVANCE_LV0,
  18 + RELEVANCE_LV1,
  19 + RELEVANCE_LV2,
  20 + RELEVANCE_LV3,
21 21 RELEVANCE_NON_IRRELEVANT,
22 22 VALID_LABELS,
23 23 )
... ... @@ -39,10 +39,10 @@ __all__ = [
39 39 "EvalStore",
40 40 "PROJECT_ROOT",
41 41 "QueryBuildResult",
42   - "RELEVANCE_EXACT",
43   - "RELEVANCE_HIGH",
44   - "RELEVANCE_IRRELEVANT",
45   - "RELEVANCE_LOW",
  42 + "RELEVANCE_LV0",
  43 + "RELEVANCE_LV1",
  44 + "RELEVANCE_LV2",
  45 + "RELEVANCE_LV3",
46 46 "RELEVANCE_NON_IRRELEVANT",
47 47 "SearchEvaluationFramework",
48 48 "VALID_LABELS",
... ...
scripts/evaluation/eval_framework/clients.py
... ... @@ -157,6 +157,7 @@ class SearchServiceClient:
157 157 return self._request_json("GET", path, timeout=timeout)
158 158  
159 159 def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]:
  160 + request_id = uuid.uuid4().hex[:8]
     +    # NOTE(review): requires `import uuid` at the top of clients.py — the import is not
     +    # visible anywhere in this diff; verify it exists or was added. Also, 8 hex chars
     +    # (~32 bits) is fine for log correlation but is not globally unique.
160 161 payload: Dict[str, Any] = {
161 162 "query": query,
162 163 "size": size,
... ... @@ -165,13 +166,19 @@
165 166 }
166 167 if debug:
167 168 payload["debug"] = True
168   - return self._request_json(
  169 + response = self._request_json(
169 170 "POST",
170 171 "/search/",
171 172 timeout=120,
172   - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},
  173 + headers={
  174 + "Content-Type": "application/json",
  175 + "X-Tenant-ID": self.tenant_id,
  176 + "X-Request-ID": request_id,
  177 + },
173 178 json_payload=payload,
174 179 )
     +    # NOTE(review): assumes the /search/ JSON body is a dict; a non-dict response
     +    # (e.g. a bare list) would raise TypeError on this key assignment — confirm.
  180 + response["_eval_request_id"] = request_id
  181 + return response
175 182 
176 183  
177 184 class RerankServiceClient:
... ...
scripts/evaluation/eval_framework/constants.py
... ... @@ -7,24 +7,24 @@ _SCRIPTS_EVAL_DIR = _PKG_DIR.parent
7 7 PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
8 8  
9 9 # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN)
10   -RELEVANCE_EXACT = "Fully Relevant"
11   -RELEVANCE_HIGH = "Mostly Relevant"
12   -RELEVANCE_LOW = "Weakly Relevant"
13   -RELEVANCE_IRRELEVANT = "Irrelevant"
  10 +RELEVANCE_LV3 = "Fully Relevant"
  11 +RELEVANCE_LV2 = "Mostly Relevant"
  12 +RELEVANCE_LV1 = "Weakly Relevant"
  13 +RELEVANCE_LV0 = "Irrelevant"
14 14  
15   -VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT})
  15 +VALID_LABELS = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1, RELEVANCE_LV0})
16 16  
17 17 # Useful label sets for binary diagnostic slices layered on top of graded ranking metrics.
18   -RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW})
19   -RELEVANCE_STRONG = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH})
  18 +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1})
  19 +RELEVANCE_STRONG = frozenset({RELEVANCE_LV3, RELEVANCE_LV2})
20 20  
21 21 # Graded relevance for ranking evaluation.
22 22 # We use rel grades 3/2/1/0 and gain = 2^rel - 1, which is standard for NDCG-style metrics.
23 23 RELEVANCE_GRADE_MAP = {
24   - RELEVANCE_EXACT: 3,
25   - RELEVANCE_HIGH: 2,
26   - RELEVANCE_LOW: 1,
27   - RELEVANCE_IRRELEVANT: 0,
  24 + RELEVANCE_LV3: 3,
  25 + RELEVANCE_LV2: 2,
  26 + RELEVANCE_LV1: 1,
  27 + RELEVANCE_LV0: 0,
28 28 }
29 29 # 标准的gain计算方法:2^rel - 1
30 30 # 但是是因为标注质量不是特别精确,因此适当降低 exact 和 high 的区分度
... ... @@ -36,10 +36,10 @@ RELEVANCE_GAIN_MAP = {
36 36  
37 37 # P(stop | relevance) for ERR (Expected Reciprocal Rank); cascade model (Chapelle et al., 2009).
38 38 STOP_PROB_MAP = {
39   - RELEVANCE_EXACT: 0.99,
40   - RELEVANCE_HIGH: 0.8,
41   - RELEVANCE_LOW: 0.1,
42   - RELEVANCE_IRRELEVANT: 0.0,
  39 + RELEVANCE_LV3: 0.99,
  40 + RELEVANCE_LV2: 0.8,
  41 + RELEVANCE_LV1: 0.1,
  42 + RELEVANCE_LV0: 0.0,
43 43 }
44 44  
45 45 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
... ... @@ -78,7 +78,7 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40
78 78 # A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``):
79 79 # - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%),
80 80 # - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%).
81   -# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Weakly Relevant").
  81 +# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LV1`` ("Weakly Relevant").
82 82 # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak
83 83 # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
84 84 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799
... ...
scripts/evaluation/eval_framework/framework.py
... ... @@ -25,14 +25,14 @@ from .constants import (
25 25 DEFAULT_RERANK_HIGH_SKIP_COUNT,
26 26 DEFAULT_RERANK_HIGH_THRESHOLD,
27 27 DEFAULT_SEARCH_RECALL_TOP_K,
28   - RELEVANCE_EXACT,
29 28 RELEVANCE_GAIN_MAP,
30   - RELEVANCE_HIGH,
31   - STOP_PROB_MAP,
32   - RELEVANCE_IRRELEVANT,
33   - RELEVANCE_LOW,
  29 + RELEVANCE_LV0,
  30 + RELEVANCE_LV1,
  31 + RELEVANCE_LV2,
  32 + RELEVANCE_LV3,
34 33 RELEVANCE_NON_IRRELEVANT,
35 34 VALID_LABELS,
  35 + STOP_PROB_MAP,
36 36 )
37 37 from .metrics import (
38 38 PRIMARY_METRIC_GRADE_NORMALIZER,
... ... @@ -96,6 +96,16 @@ def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]:
96 96 return out
97 97  
98 98  
  99 +def _encode_label_sequence(items: Sequence[Dict[str, Any]], limit: int) -> str:
     +    # Render the first `limit` results as a compact "rank:Lx" sequence string,
     +    # e.g. "1:L3 | 2:L1 | 3:L2", for the batch JSON / Markdown case snapshots.
  100 + parts: List[str] = []
  101 + for item in items[:limit]:
  102 + rank = int(item.get("rank") or 0)
  103 + label = str(item.get("label") or "")
     +    # NOTE(review): the "L{n}" code is looked up in RELEVANCE_GAIN_MAP. Today the gain
     +    # map mirrors the 3/2/1/0 grades, but if the gain scheme ever reverts to 2^rel - 1
     +    # these codes silently become L7/L3/L1/L0 — RELEVANCE_GRADE_MAP is the semantically
     +    # correct lookup for a level code. TODO confirm and switch.
  104 + grade = RELEVANCE_GAIN_MAP.get(label)
     +    # Unknown/invalid labels render as "rank:?" rather than being dropped.
  105 + parts.append(f"{rank}:L{grade}" if grade is not None else f"{rank}:?")
  106 + return " | ".join(parts)
  107 +
  108 +
99 109 class SearchEvaluationFramework:
100 110 def __init__(
101 111 self,
... ... @@ -168,7 +178,7 @@ class SearchEvaluationFramework:
168 178 ) -> Dict[str, Any]:
169 179 live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
170 180 labels = [
171   - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  181 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
172 182 for item in live["results"]
173 183 ]
174 184 return {
... ... @@ -432,7 +442,7 @@ class SearchEvaluationFramework:
432 442  
433 443 - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and
434 444 - ``( #(Irrelevant) + #(Weakly Relevant) ) / n > irrelevant_low_combined_stop_ratio``
435   - (default 0.959; weak relevance = ``RELEVANCE_LOW``).
  445 + (default 0.959; weak relevance = ``RELEVANCE_LV1``).
436 446  
437 447 Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0.
438 448 Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached
... ... @@ -474,9 +484,9 @@ class SearchEvaluationFramework:
474 484 time.sleep(0.1)
475 485  
476 486 n = len(batch_docs)
477   - exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_EXACT)
478   - irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT)
479   - low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LOW)
  487 + exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV3)
  488 + irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV0)
  489 + low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV1)
480 490 exact_ratio = exact_n / n if n else 0.0
481 491 irrelevant_ratio = irrel_n / n if n else 0.0
482 492 low_ratio = low_n / n if n else 0.0
... ... @@ -633,7 +643,7 @@ class SearchEvaluationFramework:
633 643 )
634 644  
635 645 top100_labels = [
636   - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  646 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
637 647 for item in search_labeled_results[:100]
638 648 ]
639 649 metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values()))
... ... @@ -843,7 +853,7 @@ class SearchEvaluationFramework:
843 853 )
844 854  
845 855 top100_labels = [
846   - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  856 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
847 857 for item in search_labeled_results[:100]
848 858 ]
849 859 metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values()))
... ... @@ -920,16 +930,17 @@ class SearchEvaluationFramework:
920 930 "title_zh": title_zh if title_zh and title_zh != primary_title else "",
921 931 "image_url": doc.get("image_url"),
922 932 "label": label,
  933 + "relevance_score": doc.get("relevance_score"),
923 934 "option_values": list(compact_option_values(doc.get("skus") or [])),
924 935 "product": compact_product_payload(doc),
925 936 }
926 937 )
927 938 metric_labels = [
928   - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  939 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
929 940 for item in labeled
930 941 ]
931 942 ideal_labels = [
932   - label if label in VALID_LABELS else RELEVANCE_IRRELEVANT
  943 + label if label in VALID_LABELS else RELEVANCE_LV0
933 944 for label in labels.values()
934 945 ]
935 946 label_stats = self.store.get_query_label_stats(self.tenant_id, query)
... ... @@ -960,10 +971,10 @@ class SearchEvaluationFramework:
960 971 }
961 972 )
962 973 label_order = {
963   - RELEVANCE_EXACT: 0,
964   - RELEVANCE_HIGH: 1,
965   - RELEVANCE_LOW: 2,
966   - RELEVANCE_IRRELEVANT: 3,
  974 + RELEVANCE_LV3: 0,
  975 + RELEVANCE_LV2: 1,
  976 + RELEVANCE_LV1: 2,
  977 + RELEVANCE_LV0: 3,
967 978 }
968 979 missing_relevant.sort(
969 980 key=lambda item: (
... ... @@ -989,6 +1000,7 @@ class SearchEvaluationFramework:
989 1000 "top_k": top_k,
990 1001 "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels),
991 1002 "metric_context": _metric_context_payload(),
  1003 + "request_id": str(search_payload.get("_eval_request_id") or ""),
992 1004 "results": labeled,
993 1005 "missing_relevant": missing_relevant,
994 1006 "label_stats": {
... ... @@ -996,9 +1008,9 @@ class SearchEvaluationFramework:
996 1008 "unlabeled_hits_treated_irrelevant": unlabeled_hits,
997 1009 "recalled_hits": len(labeled),
998 1010 "missing_relevant_count": len(missing_relevant),
999   - "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
1000   - "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH),
1001   - "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW),
  1011 + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV3),
  1012 + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV2),
  1013 + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV1),
1002 1014 },
1003 1015 "tips": tips,
1004 1016 "total": int(search_payload.get("total") or 0),
... ... @@ -1014,6 +1026,7 @@ class SearchEvaluationFramework:
1014 1026 force_refresh_labels: bool = False,
1015 1027 ) -> Dict[str, Any]:
1016 1028 per_query = []
  1029 + case_snapshot_top_n = min(max(int(top_k), 1), 20)
1017 1030 total_q = len(queries)
1018 1031 _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate)
1019 1032 for q_index, query in enumerate(queries, start=1):
... ... @@ -1025,7 +1038,7 @@ class SearchEvaluationFramework:
1025 1038 force_refresh_labels=force_refresh_labels,
1026 1039 )
1027 1040 labels = [
1028   - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  1041 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
1029 1042 for item in live["results"]
1030 1043 ]
1031 1044 per_query.append(
... ... @@ -1036,6 +1049,21 @@ class SearchEvaluationFramework:
1036 1049 "metrics": live["metrics"],
1037 1050 "distribution": label_distribution(labels),
1038 1051 "total": live["total"],
  1052 + "request_id": live.get("request_id") or "",
  1053 + "case_snapshot_top_n": case_snapshot_top_n,
  1054 + "top_label_sequence_top10": _encode_label_sequence(live["results"], 10),
  1055 + "top_label_sequence_top20": _encode_label_sequence(live["results"], case_snapshot_top_n),
  1056 + "top_results": [
  1057 + {
  1058 + "rank": int(item.get("rank") or 0),
  1059 + "spu_id": str(item.get("spu_id") or ""),
  1060 + "label": item.get("label"),
  1061 + "title": item.get("title"),
  1062 + "title_zh": item.get("title_zh"),
  1063 + "relevance_score": item.get("relevance_score"),
  1064 + }
  1065 + for item in live["results"][:case_snapshot_top_n]
  1066 + ],
1039 1067 }
1040 1068 )
1041 1069 m = live["metrics"]
... ... @@ -1055,10 +1083,10 @@ class SearchEvaluationFramework:
1055 1083 )
1056 1084 aggregate = aggregate_metrics([item["metrics"] for item in per_query])
1057 1085 aggregate_distribution = {
1058   - RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
1059   - RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query),
1060   - RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query),
1061   - RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
  1086 + RELEVANCE_LV3: sum(item["distribution"][RELEVANCE_LV3] for item in per_query),
  1087 + RELEVANCE_LV2: sum(item["distribution"][RELEVANCE_LV2] for item in per_query),
  1088 + RELEVANCE_LV1: sum(item["distribution"][RELEVANCE_LV1] for item in per_query),
  1089 + RELEVANCE_LV0: sum(item["distribution"][RELEVANCE_LV0] for item in per_query),
1062 1090 }
1063 1091 batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
1064 1092 report_dir = ensure_dir(self.artifact_root / "batch_reports")
... ...
scripts/evaluation/eval_framework/metrics.py
... ... @@ -6,12 +6,12 @@ import math
6 6 from typing import Dict, Iterable, Sequence
7 7  
8 8 from .constants import (
9   - RELEVANCE_EXACT,
10 9 RELEVANCE_GAIN_MAP,
11 10 RELEVANCE_GRADE_MAP,
12   - RELEVANCE_HIGH,
13   - RELEVANCE_IRRELEVANT,
14   - RELEVANCE_LOW,
  11 + RELEVANCE_LV0,
  12 + RELEVANCE_LV1,
  13 + RELEVANCE_LV2,
  14 + RELEVANCE_LV3,
15 15 RELEVANCE_NON_IRRELEVANT,
16 16 RELEVANCE_STRONG,
17 17 STOP_PROB_MAP,
... ... @@ -33,7 +33,7 @@ PRIMARY_METRIC_GRADE_NORMALIZER = float(max(RELEVANCE_GRADE_MAP.values()) or 1.0
33 33 def _normalize_label(label: str) -> str:
34 34 if label in RELEVANCE_GRADE_MAP:
35 35 return label
36   - return RELEVANCE_IRRELEVANT
  36 + return RELEVANCE_LV0
37 37  
38 38  
39 39 def _gains_for_labels(labels: Sequence[str]) -> list[float]:
... ... @@ -135,7 +135,7 @@ def compute_query_metrics(
135 135 ideal = list(ideal_labels) if ideal_labels is not None else list(labels)
136 136 metrics: Dict[str, float] = {}
137 137  
138   - exact_hits = _binary_hits(labels, [RELEVANCE_EXACT])
  138 + exact_hits = _binary_hits(labels, [RELEVANCE_LV3])
139 139 strong_hits = _binary_hits(labels, RELEVANCE_STRONG)
140 140 useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT)
141 141  
... ... @@ -183,8 +183,8 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo
183 183  
184 184 def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
185 185 return {
186   - RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT),
187   - RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH),
188   - RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW),
189   - RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT),
  186 + RELEVANCE_LV3: sum(1 for label in labels if label == RELEVANCE_LV3),
  187 + RELEVANCE_LV2: sum(1 for label in labels if label == RELEVANCE_LV2),
  188 + RELEVANCE_LV1: sum(1 for label in labels if label == RELEVANCE_LV1),
  189 + RELEVANCE_LV0: sum(1 for label in labels if label == RELEVANCE_LV0),
190 190 }
... ...
scripts/evaluation/eval_framework/reports.py
... ... @@ -4,7 +4,7 @@ from __future__ import annotations
4 4  
5 5 from typing import Any, Dict
6 6  
7   -from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW
  7 +from .constants import RELEVANCE_GAIN_MAP, RELEVANCE_LV0, RELEVANCE_LV1, RELEVANCE_LV2, RELEVANCE_LV3
8 8 from .metrics import PRIMARY_METRIC_KEYS
9 9  
10 10  
... ... @@ -25,6 +25,38 @@ def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None:
25 25 lines.append(f"- {key}: {value}")
26 26  
27 27  
  28 +def _label_level_code(label: str) -> str:
     +    # Map a label string to its compact "Lx" code for the Markdown snapshot ("?" if unknown).
     +    # NOTE(review): uses RELEVANCE_GAIN_MAP; equals the grade today, but would drift to
     +    # L7/L3/L1/L0 if the 2^rel - 1 gain scheme returns — RELEVANCE_GRADE_MAP is the safer
     +    # lookup for a level code. TODO confirm.
  29 + grade = RELEVANCE_GAIN_MAP.get(label)
  30 + return f"L{grade}" if grade is not None else "?"
  31 +
  32 +
  33 +def _append_case_snapshot(lines: list[str], item: Dict[str, Any]) -> None:
     +    # Append the per-query case context (request id, label sequences, top-result
     +    # snapshot) to the Markdown report lines. Every field is optional and skipped
     +    # when absent, so older batch payloads without snapshots still render cleanly.
  34 + request_id = str(item.get("request_id") or "").strip()
  35 + if request_id:
  36 + lines.append(f"- Request ID: `{request_id}`")
  37 + seq10 = str(item.get("top_label_sequence_top10") or "").strip()
  38 + if seq10:
  39 + lines.append(f"- Top-10 Labels: `{seq10}`")
  40 + seq20 = str(item.get("top_label_sequence_top20") or "").strip()
     +    # Only print the top-20 sequence when it adds information beyond the top-10 one.
  41 + if seq20 and seq20 != seq10:
  42 + lines.append(f"- Top-20 Labels: `{seq20}`")
  43 + top_results = item.get("top_results") or []
  44 + if not top_results:
  45 + return
  46 + lines.append("- Case Snapshot:")
     +    # Markdown deliberately shows only the first 5 of the (up to 20) stored results,
     +    # matching the "top 5 result snapshot for quick scanning" README contract.
  47 + for result in top_results[:5]:
  48 + rank = int(result.get("rank") or 0)
  49 + label = _label_level_code(str(result.get("label") or ""))
  50 + spu_id = str(result.get("spu_id") or "")
  51 + title = str(result.get("title") or "")
  52 + title_zh = str(result.get("title_zh") or "")
  53 + relevance_score = result.get("relevance_score")
     +    # Checking against ("", None) keeps a legitimate 0 / 0.0 score visible.
  54 + score_suffix = f" (rel={relevance_score})" if relevance_score not in (None, "") else ""
  55 + lines.append(f" - #{rank} [{label}] spu={spu_id} {title}{score_suffix}")
  56 + if title_zh:
  57 + lines.append(f" zh: {title_zh}")
  58 +
  59 +
28 60 def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
29 61 lines = [
30 62 "# Search Batch Evaluation",
... ... @@ -56,10 +88,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
56 88 "",
57 89 "## Label Distribution",
58 90 "",
59   - f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}",
60   - f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}",
61   - f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}",
62   - f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}",
  91 + f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}",
  92 + f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}",
  93 + f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}",
  94 + f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}",
63 95 ]
64 96 )
65 97 lines.extend(["", "## Per Query", ""])
... ... @@ -68,9 +100,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
68 100 lines.append("")
69 101 _append_metric_block(lines, item.get("metrics") or {})
70 102 distribution = item.get("distribution") or {}
71   - lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}")
72   - lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}")
73   - lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}")
74   - lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}")
  103 + lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}")
  104 + lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}")
  105 + lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}")
  106 + lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}")
  107 + _append_case_snapshot(lines, item)
75 108 lines.append("")
76 109 return "\n".join(lines)
... ...
scripts/evaluation/eval_framework/static/eval_web.js
... ... @@ -190,7 +190,7 @@ async function loadQueries() {
190 190  
191 191 function historySummaryHtml(meta) {
192 192 const m = meta && meta.aggregate_metrics;
193   - const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
  193 + const nq = (meta && meta.query_count) || (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
194 194 const parts = [];
195 195 if (nq != null) parts.push(`<span>Queries</span> ${nq}`);
196 196 if (m && m["Primary_Metric_Score"] != null) parts.push(`<span>Primary</span> ${fmtNumber(m["Primary_Metric_Score"])}`);
... ...
scripts/evaluation/eval_framework/store.py
... ... @@ -23,6 +23,18 @@ class QueryBuildResult:
23 23 output_json_path: Path
24 24  
25 25  
  26 +def _compact_batch_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
     +    # Reduce a full batch-metadata payload to the summary fields the history list
     +    # needs (ids, timestamps, top_k, aggregate metrics, query count) so the richer
     +    # per-query snapshots stored in the batch JSON do not bloat the history response.
     +    # NOTE(review): `query_count` replaces the full `queries` list — the web UI falls
     +    # back through query_count -> queries.length, so this stays backward-compatible.
  27 + return {
  28 + "batch_id": metadata.get("batch_id"),
  29 + "created_at": metadata.get("created_at"),
  30 + "tenant_id": metadata.get("tenant_id"),
  31 + "top_k": metadata.get("top_k"),
  32 + "query_count": len(metadata.get("queries") or []),
  33 + "aggregate_metrics": dict(metadata.get("aggregate_metrics") or {}),
  34 + "metric_context": dict(metadata.get("metric_context") or {}),
  35 + }
  36 +
  37 +
26 38 class EvalStore:
27 39 def __init__(self, db_path: Path):
28 40 self.db_path = db_path
... ... @@ -339,6 +351,7 @@ class EvalStore:
339 351 ).fetchall()
340 352 items: List[Dict[str, Any]] = []
341 353 for row in rows:
  354 + metadata = json.loads(row["metadata_json"])
342 355 items.append(
343 356 {
344 357 "batch_id": row["batch_id"],
... ... @@ -346,7 +359,7 @@ class EvalStore:
346 359 "output_json_path": row["output_json_path"],
347 360 "report_markdown_path": row["report_markdown_path"],
348 361 "config_snapshot_path": row["config_snapshot_path"],
349   - "metadata": json.loads(row["metadata_json"]),
  362 + "metadata": _compact_batch_metadata(metadata),
350 363 "created_at": row["created_at"],
351 364 }
352 365 )
... ...
scripts/evaluation/offline_ltr_fit.py
... ... @@ -23,11 +23,11 @@ if str(PROJECT_ROOT) not in sys.path:
23 23  
24 24 from scripts.evaluation.eval_framework.constants import (
25 25 DEFAULT_ARTIFACT_ROOT,
26   - RELEVANCE_EXACT,
27 26 RELEVANCE_GRADE_MAP,
28   - RELEVANCE_HIGH,
29   - RELEVANCE_IRRELEVANT,
30   - RELEVANCE_LOW,
  27 + RELEVANCE_LV0,
  28 + RELEVANCE_LV1,
  29 + RELEVANCE_LV2,
  30 + RELEVANCE_LV3,
31 31 )
32 32 from scripts.evaluation.eval_framework.metrics import aggregate_metrics, compute_query_metrics
33 33 from scripts.evaluation.eval_framework.store import EvalStore
... ... @@ -35,10 +35,10 @@ from scripts.evaluation.eval_framework.utils import ensure_dir, utc_timestamp
35 35  
36 36  
37 37 LABELS_BY_GRADE = {
38   - 3: RELEVANCE_EXACT,
39   - 2: RELEVANCE_HIGH,
40   - 1: RELEVANCE_LOW,
41   - 0: RELEVANCE_IRRELEVANT,
  38 + 3: RELEVANCE_LV3,
  39 + 2: RELEVANCE_LV2,
  40 + 1: RELEVANCE_LV1,
  41 + 0: RELEVANCE_LV0,
42 42 }
43 43  
44 44  
... ...