Blame view

scripts/evaluation/eval_framework/reports.py 4.14 KB
c81b0fc1   tangwang   scripts/evaluatio...
1
2
3
4
5
6
  """Markdown and text reports for batch evaluation."""
  
  from __future__ import annotations
  
  from typing import Any, Dict
  
d73ca84a   tangwang   refine eval case ...
7
  from .constants import RELEVANCE_GAIN_MAP, RELEVANCE_LV0, RELEVANCE_LV1, RELEVANCE_LV2, RELEVANCE_LV3
465f90e1   tangwang   添加LTR数据收集
8
  from .metrics import PRIMARY_METRIC_KEYS
c81b0fc1   tangwang   scripts/evaluatio...
9
10
  
  
7ddd4cb3   tangwang   评估体系从三等级->四等级 Exa...
11
  def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None:
      """Append one ``- key: value`` Markdown bullet per metric to *lines*.

      Headline metrics come first, in a fixed order: ``Primary_Metric_Score``,
      then every entry of ``PRIMARY_METRIC_KEYS``, then ``ERR@10``. Headline
      keys that are absent from *metrics* are skipped. All remaining metrics
      follow, sorted by key.

      Each key is emitted at most once, even if ``PRIMARY_METRIC_KEYS`` repeats
      a key or overlaps with the two literal headline keys.

      Args:
          lines: Output buffer of report lines; mutated in place.
          metrics: Mapping of metric name to the value to print.
      """
      primary_keys = (
          "Primary_Metric_Score",
          *PRIMARY_METRIC_KEYS,
          "ERR@10",
      )
      included = set()
      for key in primary_keys:
          # Guard against a headline key listed twice (for example "ERR@10"
          # also being part of PRIMARY_METRIC_KEYS): without this check the
          # same bullet would be written twice.
          if key in included or key not in metrics:
              continue
          lines.append(f"- {key}: {metrics[key]}")
          included.add(key)
      # Everything that is not a headline metric follows in sorted key order.
      for key, value in sorted(metrics.items()):
          if key not in included:
              lines.append(f"- {key}: {value}")
  
  
d73ca84a   tangwang   refine eval case ...
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
  def _label_level_code(label: str) -> str:
      """Return the short level code for a relevance *label*.

      The label is looked up in ``RELEVANCE_GAIN_MAP``. A hit yields ``"L"``
      followed by the mapped value; a label that is missing from the map (or
      mapped to ``None``) yields ``"?"``.
      """
      gain = RELEVANCE_GAIN_MAP.get(label)
      if gain is None:
          return "?"
      return f"L{gain}"
  
  
  def _append_case_snapshot(lines: list[str], item: Dict[str, Any]) -> None:
      request_id = str(item.get("request_id") or "").strip()
      if request_id:
          lines.append(f"- Request ID: `{request_id}`")
      seq10 = str(item.get("top_label_sequence_top10") or "").strip()
      if seq10:
          lines.append(f"- Top-10 Labels: `{seq10}`")
      seq20 = str(item.get("top_label_sequence_top20") or "").strip()
      if seq20 and seq20 != seq10:
          lines.append(f"- Top-20 Labels: `{seq20}`")
      top_results = item.get("top_results") or []
      if not top_results:
          return
      lines.append("- Case Snapshot:")
      for result in top_results[:5]:
          rank = int(result.get("rank") or 0)
          label = _label_level_code(str(result.get("label") or ""))
          spu_id = str(result.get("spu_id") or "")
          title = str(result.get("title") or "")
          title_zh = str(result.get("title_zh") or "")
          relevance_score = result.get("relevance_score")
          score_suffix = f" (rel={relevance_score})" if relevance_score not in (None, "") else ""
          lines.append(f"  - #{rank} [{label}] spu={spu_id} {title}{score_suffix}")
          if title_zh:
              lines.append(f"    zh: {title_zh}")
  
  
c81b0fc1   tangwang   scripts/evaluatio...
60
61
62
63
64
65
66
67
68
69
70
71
72
  def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
      """Render a batch-evaluation payload as a Markdown report.

      Sections, in order: a header with batch metadata; "Aggregate Metrics"
      (preceded by the metric context when one is present); "Label
      Distribution" (only when an aggregate distribution is present); and
      "Per Query", with one subsection per entry of ``payload["per_query"]``.

      Args:
          payload: Report data. ``batch_id``, ``created_at``, ``tenant_id``,
              ``queries`` and ``top_k`` are read with subscript access and so
              must be present. ``metric_context``, ``aggregate_metrics``,
              ``aggregate_distribution`` and ``per_query`` are optional. Each
              per-query entry must contain ``query``.

      Returns:
          The report lines joined with newlines.

      Raises:
          KeyError: If one of the required keys is missing.
      """

      def _distribution_bullets(counts: Dict[Any, Any]) -> list[str]:
          # One bullet per relevance grade; grades absent from counts show 0.
          return [
              f"- Fully Relevant: {counts.get(RELEVANCE_LV3, 0)}",
              f"- Mostly Relevant: {counts.get(RELEVANCE_LV2, 0)}",
              f"- Weakly Relevant: {counts.get(RELEVANCE_LV1, 0)}",
              f"- Irrelevant: {counts.get(RELEVANCE_LV0, 0)}",
          ]

      report = [
          "# Search Batch Evaluation",
          "",
          f"- Batch ID: {payload['batch_id']}",
          f"- Created at: {payload['created_at']}",
          f"- Tenant ID: {payload['tenant_id']}",
          f"- Query count: {len(payload['queries'])}",
          f"- Top K: {payload['top_k']}",
          "",
          "## Aggregate Metrics",
          "",
      ]

      # The metric context is optional; when present it precedes the numbers.
      metric_context = payload.get("metric_context")
      if metric_context:
          report += [
              f"- Primary metric: {metric_context.get('primary_metric', 'N/A')}",
              f"- Gain scheme (NDCG): {metric_context.get('gain_scheme', {})}",
              f"- Stop probabilities (ERR): {metric_context.get('stop_prob_scheme', {})}",
              "",
          ]
      _append_metric_block(report, payload.get("aggregate_metrics") or {})

      # The aggregate distribution section is omitted entirely when empty.
      overall_counts = payload.get("aggregate_distribution")
      if overall_counts:
          report += ["", "## Label Distribution", ""]
          report += _distribution_bullets(overall_counts)

      report += ["", "## Per Query", ""]
      for entry in payload.get("per_query") or []:
          report += [f"### {entry['query']}", ""]
          _append_metric_block(report, entry.get("metrics") or {})
          # Per-query counts are always printed, falling back to zeros.
          report += _distribution_bullets(entry.get("distribution") or {})
          _append_case_snapshot(report, entry)
          report.append("")
      return "\n".join(report)