Commit d73ca84a48afc0945a533707c77ba3bbfaac9621

Authored by tangwang
1 parent 1fdab52d

refine eval case snapshots and rename relevance levels

scripts/evaluation/README.md
@@ -127,8 +127,8 @@ This framework now follows graded ranking evaluation closer to e-commerce best p @@ -127,8 +127,8 @@ This framework now follows graded ranking evaluation closer to e-commerce best p
127 - **Composite tuning score: `Primary_Metric_Score`** 127 - **Composite tuning score: `Primary_Metric_Score`**
128 For experiment ranking we compute the mean of the primary scorecard after normalizing `Avg_Grade@10` by the max grade (`3`). 128 For experiment ranking we compute the mean of the primary scorecard after normalizing `Avg_Grade@10` by the max grade (`3`).
129 - **Gain scheme** 129 - **Gain scheme**
130 - `Fully Relevant=7`, `Mostly Relevant=3`, `Weakly Relevant=1`, `Irrelevant=0`  
131 - The gains come from rel grades `3/2/1/0` with `gain = 2^rel - 1`, a standard `NDCG` setup. 130 + `Fully Relevant=3`, `Mostly Relevant=2`, `Weakly Relevant=1`, `Irrelevant=0`
  131 + We keep the rel grades `3/2/1/0`, but the current implementation uses the grade values directly as gains, so the exact/high gap is less aggressive.
132 - **Why this is better** 132 - **Why this is better**
133 `NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping a `Fully Relevant` with a `Weakly Relevant` item is penalized more than swapping `Mostly Relevant` with `Weakly Relevant`. 133 `NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping a `Fully Relevant` with a `Weakly Relevant` item is penalized more than swapping `Mostly Relevant` with `Weakly Relevant`.
134 134
@@ -174,6 +174,22 @@ Features: query list from `queries.txt`, single-query and batch evaluation, batc @@ -174,6 +174,22 @@ Features: query list from `queries.txt`, single-query and batch evaluation, batc
174 174
175 Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`. 175 Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`.
176 176
  177 +To make later case analysis reproducible without digging through backend logs, each per-query record in the batch JSON now also includes:
  178 +
  179 +- `request_id` — the exact `X-Request-ID` sent by the evaluator for that live search call
  180 +- `top_label_sequence_top10` / `top_label_sequence_top20` — compact label sequence strings such as `1:L3 | 2:L1 | 3:L2`
  181 +- `top_results` — a lightweight top-20 snapshot with `rank`, `spu_id`, `label`, title fields, and `relevance_score`
  182 +
  183 +The Markdown report now surfaces the same case context in a lighter human-readable form:
  184 +
  185 +- request id
  186 +- top-10 / top-20 label sequence
  187 + - top-5 result snapshot for quick scanning
  188 +
  189 +This means a bad case can usually be reconstructed directly from the batch artifact itself, without replaying logs or joining SQLite tables by hand.
  190 +
  191 +The web history endpoint intentionally returns a compact summary only (aggregate metrics plus query count), so adding richer per-query snapshots to the batch payload does not bloat the history list UI.
  192 +
177 ## Ranking debug and LTR prep 193 ## Ranking debug and LTR prep
178 194
179 `debug_info` now exposes two extra layers that are useful for tuning and future learning-to-rank work: 195 `debug_info` now exposes two extra layers that are useful for tuning and future learning-to-rank work:
scripts/evaluation/eval_framework/__init__.py
@@ -14,10 +14,10 @@ from .constants import ( # noqa: E402 @@ -14,10 +14,10 @@ from .constants import ( # noqa: E402
14 DEFAULT_ARTIFACT_ROOT, 14 DEFAULT_ARTIFACT_ROOT,
15 DEFAULT_QUERY_FILE, 15 DEFAULT_QUERY_FILE,
16 PROJECT_ROOT, 16 PROJECT_ROOT,
17 - RELEVANCE_EXACT,  
18 - RELEVANCE_HIGH,  
19 - RELEVANCE_IRRELEVANT,  
20 - RELEVANCE_LOW, 17 + RELEVANCE_LV0,
  18 + RELEVANCE_LV1,
  19 + RELEVANCE_LV2,
  20 + RELEVANCE_LV3,
21 RELEVANCE_NON_IRRELEVANT, 21 RELEVANCE_NON_IRRELEVANT,
22 VALID_LABELS, 22 VALID_LABELS,
23 ) 23 )
@@ -39,10 +39,10 @@ __all__ = [ @@ -39,10 +39,10 @@ __all__ = [
39 "EvalStore", 39 "EvalStore",
40 "PROJECT_ROOT", 40 "PROJECT_ROOT",
41 "QueryBuildResult", 41 "QueryBuildResult",
42 - "RELEVANCE_EXACT",  
43 - "RELEVANCE_HIGH",  
44 - "RELEVANCE_IRRELEVANT",  
45 - "RELEVANCE_LOW", 42 + "RELEVANCE_LV0",
  43 + "RELEVANCE_LV1",
  44 + "RELEVANCE_LV2",
  45 + "RELEVANCE_LV3",
46 "RELEVANCE_NON_IRRELEVANT", 46 "RELEVANCE_NON_IRRELEVANT",
47 "SearchEvaluationFramework", 47 "SearchEvaluationFramework",
48 "VALID_LABELS", 48 "VALID_LABELS",
scripts/evaluation/eval_framework/clients.py
@@ -157,6 +157,7 @@ class SearchServiceClient: @@ -157,6 +157,7 @@ class SearchServiceClient:
157 return self._request_json("GET", path, timeout=timeout) 157 return self._request_json("GET", path, timeout=timeout)
158 158
159 def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]: 159 def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]:
  160 + request_id = uuid.uuid4().hex[:8]
160 payload: Dict[str, Any] = { 161 payload: Dict[str, Any] = {
161 "query": query, 162 "query": query,
162 "size": size, 163 "size": size,
@@ -165,13 +166,19 @@ class SearchServiceClient: @@ -165,13 +166,19 @@ class SearchServiceClient:
165 } 166 }
166 if debug: 167 if debug:
167 payload["debug"] = True 168 payload["debug"] = True
168 - return self._request_json( 169 + response = self._request_json(
169 "POST", 170 "POST",
170 "/search/", 171 "/search/",
171 timeout=120, 172 timeout=120,
172 - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, 173 + headers={
  174 + "Content-Type": "application/json",
  175 + "X-Tenant-ID": self.tenant_id,
  176 + "X-Request-ID": request_id,
  177 + },
173 json_payload=payload, 178 json_payload=payload,
174 ) 179 )
  180 + response["_eval_request_id"] = request_id
  181 + return response
175 182
176 183
177 class RerankServiceClient: 184 class RerankServiceClient:
scripts/evaluation/eval_framework/constants.py
@@ -7,24 +7,24 @@ _SCRIPTS_EVAL_DIR = _PKG_DIR.parent @@ -7,24 +7,24 @@ _SCRIPTS_EVAL_DIR = _PKG_DIR.parent
7 PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] 7 PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
8 8
9 # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN) 9 # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN)
10 -RELEVANCE_EXACT = "Fully Relevant"  
11 -RELEVANCE_HIGH = "Mostly Relevant"  
12 -RELEVANCE_LOW = "Weakly Relevant"  
13 -RELEVANCE_IRRELEVANT = "Irrelevant" 10 +RELEVANCE_LV3 = "Fully Relevant"
  11 +RELEVANCE_LV2 = "Mostly Relevant"
  12 +RELEVANCE_LV1 = "Weakly Relevant"
  13 +RELEVANCE_LV0 = "Irrelevant"
14 14
15 -VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) 15 +VALID_LABELS = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1, RELEVANCE_LV0})
16 16
17 # Useful label sets for binary diagnostic slices layered on top of graded ranking metrics. 17 # Useful label sets for binary diagnostic slices layered on top of graded ranking metrics.
18 -RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW})  
19 -RELEVANCE_STRONG = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH}) 18 +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1})
  19 +RELEVANCE_STRONG = frozenset({RELEVANCE_LV3, RELEVANCE_LV2})
20 20
21 # Graded relevance for ranking evaluation. 21 # Graded relevance for ranking evaluation.
22 # We use rel grades 3/2/1/0 and gain = 2^rel - 1, which is standard for NDCG-style metrics. 22 # We use rel grades 3/2/1/0 and gain = 2^rel - 1, which is standard for NDCG-style metrics.
23 RELEVANCE_GRADE_MAP = { 23 RELEVANCE_GRADE_MAP = {
24 - RELEVANCE_EXACT: 3,  
25 - RELEVANCE_HIGH: 2,  
26 - RELEVANCE_LOW: 1,  
27 - RELEVANCE_IRRELEVANT: 0, 24 + RELEVANCE_LV3: 3,
  25 + RELEVANCE_LV2: 2,
  26 + RELEVANCE_LV1: 1,
  27 + RELEVANCE_LV0: 0,
28 } 28 }
29 # 标准的gain计算方法:2^rel - 1 29 # 标准的gain计算方法:2^rel - 1
30 # 但是是因为标注质量不是特别精确,因此适当降低 exact 和 high 的区分度 30 # 但是是因为标注质量不是特别精确,因此适当降低 exact 和 high 的区分度
@@ -36,10 +36,10 @@ RELEVANCE_GAIN_MAP = { @@ -36,10 +36,10 @@ RELEVANCE_GAIN_MAP = {
36 36
37 # P(stop | relevance) for ERR (Expected Reciprocal Rank); cascade model (Chapelle et al., 2009). 37 # P(stop | relevance) for ERR (Expected Reciprocal Rank); cascade model (Chapelle et al., 2009).
38 STOP_PROB_MAP = { 38 STOP_PROB_MAP = {
39 - RELEVANCE_EXACT: 0.99,  
40 - RELEVANCE_HIGH: 0.8,  
41 - RELEVANCE_LOW: 0.1,  
42 - RELEVANCE_IRRELEVANT: 0.0, 39 + RELEVANCE_LV3: 0.99,
  40 + RELEVANCE_LV2: 0.8,
  41 + RELEVANCE_LV1: 0.1,
  42 + RELEVANCE_LV0: 0.0,
43 } 43 }
44 44
45 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" 45 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
@@ -78,7 +78,7 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 @@ -78,7 +78,7 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40
78 # A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``): 78 # A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``):
79 # - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%), 79 # - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%),
80 # - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%). 80 # - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%).
81 -# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Weakly Relevant"). 81 +# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LV1`` ("Weakly Relevant").
82 # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak 82 # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak
83 # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). 83 # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
84 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799 84 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799
scripts/evaluation/eval_framework/framework.py
@@ -25,14 +25,14 @@ from .constants import ( @@ -25,14 +25,14 @@ from .constants import (
25 DEFAULT_RERANK_HIGH_SKIP_COUNT, 25 DEFAULT_RERANK_HIGH_SKIP_COUNT,
26 DEFAULT_RERANK_HIGH_THRESHOLD, 26 DEFAULT_RERANK_HIGH_THRESHOLD,
27 DEFAULT_SEARCH_RECALL_TOP_K, 27 DEFAULT_SEARCH_RECALL_TOP_K,
28 - RELEVANCE_EXACT,  
29 RELEVANCE_GAIN_MAP, 28 RELEVANCE_GAIN_MAP,
30 - RELEVANCE_HIGH,  
31 - STOP_PROB_MAP,  
32 - RELEVANCE_IRRELEVANT,  
33 - RELEVANCE_LOW, 29 + RELEVANCE_LV0,
  30 + RELEVANCE_LV1,
  31 + RELEVANCE_LV2,
  32 + RELEVANCE_LV3,
34 RELEVANCE_NON_IRRELEVANT, 33 RELEVANCE_NON_IRRELEVANT,
35 VALID_LABELS, 34 VALID_LABELS,
  35 + STOP_PROB_MAP,
36 ) 36 )
37 from .metrics import ( 37 from .metrics import (
38 PRIMARY_METRIC_GRADE_NORMALIZER, 38 PRIMARY_METRIC_GRADE_NORMALIZER,
@@ -96,6 +96,16 @@ def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: @@ -96,6 +96,16 @@ def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]:
96 return out 96 return out
97 97
98 98
  99 +def _encode_label_sequence(items: Sequence[Dict[str, Any]], limit: int) -> str:
  100 + parts: List[str] = []
  101 + for item in items[:limit]:
  102 + rank = int(item.get("rank") or 0)
  103 + label = str(item.get("label") or "")
  104 + grade = RELEVANCE_GAIN_MAP.get(label)
  105 + parts.append(f"{rank}:L{grade}" if grade is not None else f"{rank}:?")
  106 + return " | ".join(parts)
  107 +
  108 +
99 class SearchEvaluationFramework: 109 class SearchEvaluationFramework:
100 def __init__( 110 def __init__(
101 self, 111 self,
@@ -168,7 +178,7 @@ class SearchEvaluationFramework: @@ -168,7 +178,7 @@ class SearchEvaluationFramework:
168 ) -> Dict[str, Any]: 178 ) -> Dict[str, Any]:
169 live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) 179 live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
170 labels = [ 180 labels = [
171 - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT 181 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
172 for item in live["results"] 182 for item in live["results"]
173 ] 183 ]
174 return { 184 return {
@@ -432,7 +442,7 @@ class SearchEvaluationFramework: @@ -432,7 +442,7 @@ class SearchEvaluationFramework:
432 442
433 - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and 443 - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and
434 - ``( #(Irrelevant) + #(Weakly Relevant) ) / n > irrelevant_low_combined_stop_ratio`` 444 - ``( #(Irrelevant) + #(Weakly Relevant) ) / n > irrelevant_low_combined_stop_ratio``
435 - (default 0.959; weak relevance = ``RELEVANCE_LOW``). 445 + (default 0.959; weak relevance = ``RELEVANCE_LV1``).
436 446
437 Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. 447 Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0.
438 Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached 448 Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached
@@ -474,9 +484,9 @@ class SearchEvaluationFramework: @@ -474,9 +484,9 @@ class SearchEvaluationFramework:
474 time.sleep(0.1) 484 time.sleep(0.1)
475 485
476 n = len(batch_docs) 486 n = len(batch_docs)
477 - exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_EXACT)  
478 - irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT)  
479 - low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LOW) 487 + exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV3)
  488 + irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV0)
  489 + low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV1)
480 exact_ratio = exact_n / n if n else 0.0 490 exact_ratio = exact_n / n if n else 0.0
481 irrelevant_ratio = irrel_n / n if n else 0.0 491 irrelevant_ratio = irrel_n / n if n else 0.0
482 low_ratio = low_n / n if n else 0.0 492 low_ratio = low_n / n if n else 0.0
@@ -633,7 +643,7 @@ class SearchEvaluationFramework: @@ -633,7 +643,7 @@ class SearchEvaluationFramework:
633 ) 643 )
634 644
635 top100_labels = [ 645 top100_labels = [
636 - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT 646 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
637 for item in search_labeled_results[:100] 647 for item in search_labeled_results[:100]
638 ] 648 ]
639 metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) 649 metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values()))
@@ -843,7 +853,7 @@ class SearchEvaluationFramework: @@ -843,7 +853,7 @@ class SearchEvaluationFramework:
843 ) 853 )
844 854
845 top100_labels = [ 855 top100_labels = [
846 - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT 856 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
847 for item in search_labeled_results[:100] 857 for item in search_labeled_results[:100]
848 ] 858 ]
849 metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) 859 metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values()))
@@ -920,16 +930,17 @@ class SearchEvaluationFramework: @@ -920,16 +930,17 @@ class SearchEvaluationFramework:
920 "title_zh": title_zh if title_zh and title_zh != primary_title else "", 930 "title_zh": title_zh if title_zh and title_zh != primary_title else "",
921 "image_url": doc.get("image_url"), 931 "image_url": doc.get("image_url"),
922 "label": label, 932 "label": label,
  933 + "relevance_score": doc.get("relevance_score"),
923 "option_values": list(compact_option_values(doc.get("skus") or [])), 934 "option_values": list(compact_option_values(doc.get("skus") or [])),
924 "product": compact_product_payload(doc), 935 "product": compact_product_payload(doc),
925 } 936 }
926 ) 937 )
927 metric_labels = [ 938 metric_labels = [
928 - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT 939 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
929 for item in labeled 940 for item in labeled
930 ] 941 ]
931 ideal_labels = [ 942 ideal_labels = [
932 - label if label in VALID_LABELS else RELEVANCE_IRRELEVANT 943 + label if label in VALID_LABELS else RELEVANCE_LV0
933 for label in labels.values() 944 for label in labels.values()
934 ] 945 ]
935 label_stats = self.store.get_query_label_stats(self.tenant_id, query) 946 label_stats = self.store.get_query_label_stats(self.tenant_id, query)
@@ -960,10 +971,10 @@ class SearchEvaluationFramework: @@ -960,10 +971,10 @@ class SearchEvaluationFramework:
960 } 971 }
961 ) 972 )
962 label_order = { 973 label_order = {
963 - RELEVANCE_EXACT: 0,  
964 - RELEVANCE_HIGH: 1,  
965 - RELEVANCE_LOW: 2,  
966 - RELEVANCE_IRRELEVANT: 3, 974 + RELEVANCE_LV3: 0,
  975 + RELEVANCE_LV2: 1,
  976 + RELEVANCE_LV1: 2,
  977 + RELEVANCE_LV0: 3,
967 } 978 }
968 missing_relevant.sort( 979 missing_relevant.sort(
969 key=lambda item: ( 980 key=lambda item: (
@@ -989,6 +1000,7 @@ class SearchEvaluationFramework: @@ -989,6 +1000,7 @@ class SearchEvaluationFramework:
989 "top_k": top_k, 1000 "top_k": top_k,
990 "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels), 1001 "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels),
991 "metric_context": _metric_context_payload(), 1002 "metric_context": _metric_context_payload(),
  1003 + "request_id": str(search_payload.get("_eval_request_id") or ""),
992 "results": labeled, 1004 "results": labeled,
993 "missing_relevant": missing_relevant, 1005 "missing_relevant": missing_relevant,
994 "label_stats": { 1006 "label_stats": {
@@ -996,9 +1008,9 @@ class SearchEvaluationFramework: @@ -996,9 +1008,9 @@ class SearchEvaluationFramework:
996 "unlabeled_hits_treated_irrelevant": unlabeled_hits, 1008 "unlabeled_hits_treated_irrelevant": unlabeled_hits,
997 "recalled_hits": len(labeled), 1009 "recalled_hits": len(labeled),
998 "missing_relevant_count": len(missing_relevant), 1010 "missing_relevant_count": len(missing_relevant),
999 - "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),  
1000 - "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH),  
1001 - "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW), 1011 + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV3),
  1012 + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV2),
  1013 + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV1),
1002 }, 1014 },
1003 "tips": tips, 1015 "tips": tips,
1004 "total": int(search_payload.get("total") or 0), 1016 "total": int(search_payload.get("total") or 0),
@@ -1014,6 +1026,7 @@ class SearchEvaluationFramework: @@ -1014,6 +1026,7 @@ class SearchEvaluationFramework:
1014 force_refresh_labels: bool = False, 1026 force_refresh_labels: bool = False,
1015 ) -> Dict[str, Any]: 1027 ) -> Dict[str, Any]:
1016 per_query = [] 1028 per_query = []
  1029 + case_snapshot_top_n = min(max(int(top_k), 1), 20)
1017 total_q = len(queries) 1030 total_q = len(queries)
1018 _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate) 1031 _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate)
1019 for q_index, query in enumerate(queries, start=1): 1032 for q_index, query in enumerate(queries, start=1):
@@ -1025,7 +1038,7 @@ class SearchEvaluationFramework: @@ -1025,7 +1038,7 @@ class SearchEvaluationFramework:
1025 force_refresh_labels=force_refresh_labels, 1038 force_refresh_labels=force_refresh_labels,
1026 ) 1039 )
1027 labels = [ 1040 labels = [
1028 - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT 1041 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
1029 for item in live["results"] 1042 for item in live["results"]
1030 ] 1043 ]
1031 per_query.append( 1044 per_query.append(
@@ -1036,6 +1049,21 @@ class SearchEvaluationFramework: @@ -1036,6 +1049,21 @@ class SearchEvaluationFramework:
1036 "metrics": live["metrics"], 1049 "metrics": live["metrics"],
1037 "distribution": label_distribution(labels), 1050 "distribution": label_distribution(labels),
1038 "total": live["total"], 1051 "total": live["total"],
  1052 + "request_id": live.get("request_id") or "",
  1053 + "case_snapshot_top_n": case_snapshot_top_n,
  1054 + "top_label_sequence_top10": _encode_label_sequence(live["results"], 10),
  1055 + "top_label_sequence_top20": _encode_label_sequence(live["results"], case_snapshot_top_n),
  1056 + "top_results": [
  1057 + {
  1058 + "rank": int(item.get("rank") or 0),
  1059 + "spu_id": str(item.get("spu_id") or ""),
  1060 + "label": item.get("label"),
  1061 + "title": item.get("title"),
  1062 + "title_zh": item.get("title_zh"),
  1063 + "relevance_score": item.get("relevance_score"),
  1064 + }
  1065 + for item in live["results"][:case_snapshot_top_n]
  1066 + ],
1039 } 1067 }
1040 ) 1068 )
1041 m = live["metrics"] 1069 m = live["metrics"]
@@ -1055,10 +1083,10 @@ class SearchEvaluationFramework: @@ -1055,10 +1083,10 @@ class SearchEvaluationFramework:
1055 ) 1083 )
1056 aggregate = aggregate_metrics([item["metrics"] for item in per_query]) 1084 aggregate = aggregate_metrics([item["metrics"] for item in per_query])
1057 aggregate_distribution = { 1085 aggregate_distribution = {
1058 - RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),  
1059 - RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query),  
1060 - RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query),  
1061 - RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query), 1086 + RELEVANCE_LV3: sum(item["distribution"][RELEVANCE_LV3] for item in per_query),
  1087 + RELEVANCE_LV2: sum(item["distribution"][RELEVANCE_LV2] for item in per_query),
  1088 + RELEVANCE_LV1: sum(item["distribution"][RELEVANCE_LV1] for item in per_query),
  1089 + RELEVANCE_LV0: sum(item["distribution"][RELEVANCE_LV0] for item in per_query),
1062 } 1090 }
1063 batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" 1091 batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
1064 report_dir = ensure_dir(self.artifact_root / "batch_reports") 1092 report_dir = ensure_dir(self.artifact_root / "batch_reports")
scripts/evaluation/eval_framework/metrics.py
@@ -6,12 +6,12 @@ import math @@ -6,12 +6,12 @@ import math
6 from typing import Dict, Iterable, Sequence 6 from typing import Dict, Iterable, Sequence
7 7
8 from .constants import ( 8 from .constants import (
9 - RELEVANCE_EXACT,  
10 RELEVANCE_GAIN_MAP, 9 RELEVANCE_GAIN_MAP,
11 RELEVANCE_GRADE_MAP, 10 RELEVANCE_GRADE_MAP,
12 - RELEVANCE_HIGH,  
13 - RELEVANCE_IRRELEVANT,  
14 - RELEVANCE_LOW, 11 + RELEVANCE_LV0,
  12 + RELEVANCE_LV1,
  13 + RELEVANCE_LV2,
  14 + RELEVANCE_LV3,
15 RELEVANCE_NON_IRRELEVANT, 15 RELEVANCE_NON_IRRELEVANT,
16 RELEVANCE_STRONG, 16 RELEVANCE_STRONG,
17 STOP_PROB_MAP, 17 STOP_PROB_MAP,
@@ -33,7 +33,7 @@ PRIMARY_METRIC_GRADE_NORMALIZER = float(max(RELEVANCE_GRADE_MAP.values()) or 1.0 @@ -33,7 +33,7 @@ PRIMARY_METRIC_GRADE_NORMALIZER = float(max(RELEVANCE_GRADE_MAP.values()) or 1.0
33 def _normalize_label(label: str) -> str: 33 def _normalize_label(label: str) -> str:
34 if label in RELEVANCE_GRADE_MAP: 34 if label in RELEVANCE_GRADE_MAP:
35 return label 35 return label
36 - return RELEVANCE_IRRELEVANT 36 + return RELEVANCE_LV0
37 37
38 38
39 def _gains_for_labels(labels: Sequence[str]) -> list[float]: 39 def _gains_for_labels(labels: Sequence[str]) -> list[float]:
@@ -135,7 +135,7 @@ def compute_query_metrics( @@ -135,7 +135,7 @@ def compute_query_metrics(
135 ideal = list(ideal_labels) if ideal_labels is not None else list(labels) 135 ideal = list(ideal_labels) if ideal_labels is not None else list(labels)
136 metrics: Dict[str, float] = {} 136 metrics: Dict[str, float] = {}
137 137
138 - exact_hits = _binary_hits(labels, [RELEVANCE_EXACT]) 138 + exact_hits = _binary_hits(labels, [RELEVANCE_LV3])
139 strong_hits = _binary_hits(labels, RELEVANCE_STRONG) 139 strong_hits = _binary_hits(labels, RELEVANCE_STRONG)
140 useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT) 140 useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT)
141 141
@@ -183,8 +183,8 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo @@ -183,8 +183,8 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo
183 183
184 def label_distribution(labels: Sequence[str]) -> Dict[str, int]: 184 def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
185 return { 185 return {
186 - RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT),  
187 - RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH),  
188 - RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW),  
189 - RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT), 186 + RELEVANCE_LV3: sum(1 for label in labels if label == RELEVANCE_LV3),
  187 + RELEVANCE_LV2: sum(1 for label in labels if label == RELEVANCE_LV2),
  188 + RELEVANCE_LV1: sum(1 for label in labels if label == RELEVANCE_LV1),
  189 + RELEVANCE_LV0: sum(1 for label in labels if label == RELEVANCE_LV0),
190 } 190 }
scripts/evaluation/eval_framework/reports.py
@@ -4,7 +4,7 @@ from __future__ import annotations @@ -4,7 +4,7 @@ from __future__ import annotations
4 4
5 from typing import Any, Dict 5 from typing import Any, Dict
6 6
7 -from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW 7 +from .constants import RELEVANCE_GAIN_MAP, RELEVANCE_LV0, RELEVANCE_LV1, RELEVANCE_LV2, RELEVANCE_LV3
8 from .metrics import PRIMARY_METRIC_KEYS 8 from .metrics import PRIMARY_METRIC_KEYS
9 9
10 10
@@ -25,6 +25,38 @@ def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None: @@ -25,6 +25,38 @@ def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None:
25 lines.append(f"- {key}: {value}") 25 lines.append(f"- {key}: {value}")
26 26
27 27
  28 +def _label_level_code(label: str) -> str:
  29 + grade = RELEVANCE_GAIN_MAP.get(label)
  30 + return f"L{grade}" if grade is not None else "?"
  31 +
  32 +
  33 +def _append_case_snapshot(lines: list[str], item: Dict[str, Any]) -> None:
  34 + request_id = str(item.get("request_id") or "").strip()
  35 + if request_id:
  36 + lines.append(f"- Request ID: `{request_id}`")
  37 + seq10 = str(item.get("top_label_sequence_top10") or "").strip()
  38 + if seq10:
  39 + lines.append(f"- Top-10 Labels: `{seq10}`")
  40 + seq20 = str(item.get("top_label_sequence_top20") or "").strip()
  41 + if seq20 and seq20 != seq10:
  42 + lines.append(f"- Top-20 Labels: `{seq20}`")
  43 + top_results = item.get("top_results") or []
  44 + if not top_results:
  45 + return
  46 + lines.append("- Case Snapshot:")
  47 + for result in top_results[:5]:
  48 + rank = int(result.get("rank") or 0)
  49 + label = _label_level_code(str(result.get("label") or ""))
  50 + spu_id = str(result.get("spu_id") or "")
  51 + title = str(result.get("title") or "")
  52 + title_zh = str(result.get("title_zh") or "")
  53 + relevance_score = result.get("relevance_score")
  54 + score_suffix = f" (rel={relevance_score})" if relevance_score not in (None, "") else ""
  55 + lines.append(f" - #{rank} [{label}] spu={spu_id} {title}{score_suffix}")
  56 + if title_zh:
  57 + lines.append(f" zh: {title_zh}")
  58 +
  59 +
28 def render_batch_report_markdown(payload: Dict[str, Any]) -> str: 60 def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
29 lines = [ 61 lines = [
30 "# Search Batch Evaluation", 62 "# Search Batch Evaluation",
@@ -56,10 +88,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: @@ -56,10 +88,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
56 "", 88 "",
57 "## Label Distribution", 89 "## Label Distribution",
58 "", 90 "",
59 - f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}",  
60 - f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}",  
61 - f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}",  
62 - f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}", 91 + f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}",
  92 + f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}",
  93 + f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}",
  94 + f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}",
63 ] 95 ]
64 ) 96 )
65 lines.extend(["", "## Per Query", ""]) 97 lines.extend(["", "## Per Query", ""])
@@ -68,9 +100,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: @@ -68,9 +100,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
68 lines.append("") 100 lines.append("")
69 _append_metric_block(lines, item.get("metrics") or {}) 101 _append_metric_block(lines, item.get("metrics") or {})
70 distribution = item.get("distribution") or {} 102 distribution = item.get("distribution") or {}
71 - lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}")  
72 - lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}")  
73 - lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}")  
74 - lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}") 103 + lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}")
  104 + lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}")
  105 + lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}")
  106 + lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}")
  107 + _append_case_snapshot(lines, item)
75 lines.append("") 108 lines.append("")
76 return "\n".join(lines) 109 return "\n".join(lines)
scripts/evaluation/eval_framework/static/eval_web.js
@@ -190,7 +190,7 @@ async function loadQueries() { @@ -190,7 +190,7 @@ async function loadQueries() {
190 190
191 function historySummaryHtml(meta) { 191 function historySummaryHtml(meta) {
192 const m = meta && meta.aggregate_metrics; 192 const m = meta && meta.aggregate_metrics;
193 - const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null; 193 + const nq = (meta && meta.query_count) || (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
194 const parts = []; 194 const parts = [];
195 if (nq != null) parts.push(`<span>Queries</span> ${nq}`); 195 if (nq != null) parts.push(`<span>Queries</span> ${nq}`);
196 if (m && m["Primary_Metric_Score"] != null) parts.push(`<span>Primary</span> ${fmtNumber(m["Primary_Metric_Score"])}`); 196 if (m && m["Primary_Metric_Score"] != null) parts.push(`<span>Primary</span> ${fmtNumber(m["Primary_Metric_Score"])}`);
scripts/evaluation/eval_framework/store.py
@@ -23,6 +23,18 @@ class QueryBuildResult: @@ -23,6 +23,18 @@ class QueryBuildResult:
23 output_json_path: Path 23 output_json_path: Path
24 24
25 25
  26 +def _compact_batch_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
  27 + return {
  28 + "batch_id": metadata.get("batch_id"),
  29 + "created_at": metadata.get("created_at"),
  30 + "tenant_id": metadata.get("tenant_id"),
  31 + "top_k": metadata.get("top_k"),
  32 + "query_count": len(metadata.get("queries") or []),
  33 + "aggregate_metrics": dict(metadata.get("aggregate_metrics") or {}),
  34 + "metric_context": dict(metadata.get("metric_context") or {}),
  35 + }
  36 +
  37 +
26 class EvalStore: 38 class EvalStore:
27 def __init__(self, db_path: Path): 39 def __init__(self, db_path: Path):
28 self.db_path = db_path 40 self.db_path = db_path
@@ -339,6 +351,7 @@ class EvalStore: @@ -339,6 +351,7 @@ class EvalStore:
339 ).fetchall() 351 ).fetchall()
340 items: List[Dict[str, Any]] = [] 352 items: List[Dict[str, Any]] = []
341 for row in rows: 353 for row in rows:
  354 + metadata = json.loads(row["metadata_json"])
342 items.append( 355 items.append(
343 { 356 {
344 "batch_id": row["batch_id"], 357 "batch_id": row["batch_id"],
@@ -346,7 +359,7 @@ class EvalStore: @@ -346,7 +359,7 @@ class EvalStore:
346 "output_json_path": row["output_json_path"], 359 "output_json_path": row["output_json_path"],
347 "report_markdown_path": row["report_markdown_path"], 360 "report_markdown_path": row["report_markdown_path"],
348 "config_snapshot_path": row["config_snapshot_path"], 361 "config_snapshot_path": row["config_snapshot_path"],
349 - "metadata": json.loads(row["metadata_json"]), 362 + "metadata": _compact_batch_metadata(metadata),
350 "created_at": row["created_at"], 363 "created_at": row["created_at"],
351 } 364 }
352 ) 365 )
scripts/evaluation/offline_ltr_fit.py
@@ -23,11 +23,11 @@ if str(PROJECT_ROOT) not in sys.path: @@ -23,11 +23,11 @@ if str(PROJECT_ROOT) not in sys.path:
23 23
24 from scripts.evaluation.eval_framework.constants import ( 24 from scripts.evaluation.eval_framework.constants import (
25 DEFAULT_ARTIFACT_ROOT, 25 DEFAULT_ARTIFACT_ROOT,
26 - RELEVANCE_EXACT,  
27 RELEVANCE_GRADE_MAP, 26 RELEVANCE_GRADE_MAP,
28 - RELEVANCE_HIGH,  
29 - RELEVANCE_IRRELEVANT,  
30 - RELEVANCE_LOW, 27 + RELEVANCE_LV0,
  28 + RELEVANCE_LV1,
  29 + RELEVANCE_LV2,
  30 + RELEVANCE_LV3,
31 ) 31 )
32 from scripts.evaluation.eval_framework.metrics import aggregate_metrics, compute_query_metrics 32 from scripts.evaluation.eval_framework.metrics import aggregate_metrics, compute_query_metrics
33 from scripts.evaluation.eval_framework.store import EvalStore 33 from scripts.evaluation.eval_framework.store import EvalStore
@@ -35,10 +35,10 @@ from scripts.evaluation.eval_framework.utils import ensure_dir, utc_timestamp @@ -35,10 +35,10 @@ from scripts.evaluation.eval_framework.utils import ensure_dir, utc_timestamp
35 35
36 36
37 LABELS_BY_GRADE = { 37 LABELS_BY_GRADE = {
38 - 3: RELEVANCE_EXACT,  
39 - 2: RELEVANCE_HIGH,  
40 - 1: RELEVANCE_LOW,  
41 - 0: RELEVANCE_IRRELEVANT, 38 + 3: RELEVANCE_LV3,
  39 + 2: RELEVANCE_LV2,
  40 + 1: RELEVANCE_LV1,
  41 + 0: RELEVANCE_LV0,
42 } 42 }
43 43
44 44