Blame view

scripts/evaluation/eval_framework/reports.py 4.66 KB
c81b0fc1   tangwang   scripts/evaluatio...
1
2
3
4
5
6
  """Markdown and text reports for batch evaluation."""
  
  from __future__ import annotations
  
  from typing import Any, Dict
  
d73ca84a   tangwang   refine eval case ...
7
  from .constants import RELEVANCE_GAIN_MAP, RELEVANCE_LV0, RELEVANCE_LV1, RELEVANCE_LV2, RELEVANCE_LV3
465f90e1   tangwang   添加LTR数据收集
8
  from .metrics import PRIMARY_METRIC_KEYS
c81b0fc1   tangwang   scripts/evaluatio...
9
10
  
  
7ddd4cb3   tangwang   评估体系从三等级->四等级 Exa...
11
  def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None:
      """Append one ``- name: value`` bullet per metric to *lines*, in place.

      Headline metrics come first, in this order and only when present in
      *metrics*: ``Primary_Metric_Score``, every name in
      ``PRIMARY_METRIC_KEYS``, then ``ERR@10``. All remaining metrics follow,
      sorted by name.

      Args:
          lines: Report lines accumulated so far; mutated in place.
          metrics: Mapping of metric name to its value.
      """
      # dict.fromkeys drops repeated names while keeping first-seen order, so a
      # name that is listed explicitly here and also appears in
      # PRIMARY_METRIC_KEYS is emitted exactly once.
      primary_keys = dict.fromkeys(
          ("Primary_Metric_Score", *PRIMARY_METRIC_KEYS, "ERR@10")
      )
      included = set()
      for key in primary_keys:
          if key in metrics:
              lines.append(f"- {key}: {metrics[key]}")
              included.add(key)
      # Everything not already printed as a headline metric.
      for key, value in sorted(metrics.items()):
          if key in included:
              continue
          lines.append(f"- {key}: {value}")
  
  
d73ca84a   tangwang   refine eval case ...
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
  def _label_level_code(label: str) -> str:
      """Return ``L<gain>`` for *label* per ``RELEVANCE_GAIN_MAP``, or ``"?"``.

      ``"?"`` is returned when the label is absent from the map or maps to
      ``None``.
      """
      gain = RELEVANCE_GAIN_MAP.get(label)
      if gain is None:
          return "?"
      return f"L{gain}"
  
  
  def _append_case_snapshot(lines: list[str], item: Dict[str, Any]) -> None:
      """Append request/label context and the first five results of *item*.

      Emits, each only when non-empty: the request ID, the top-10 label
      sequence, and the top-20 label sequence (skipped when identical to the
      top-10 one). If ``top_results`` is non-empty, a "Case Snapshot" list of
      at most five results follows, each with rank, level code, SPU id, title,
      an optional relevance score, and an optional ``zh`` title line.

      Args:
          lines: Report lines accumulated so far; mutated in place.
          item: One per-query record.
      """

      def _clean(field: str) -> str:
          # Missing/falsy values collapse to "" before stripping whitespace.
          return str(item.get(field) or "").strip()

      req = _clean("request_id")
      if req:
          lines.append(f"- Request ID: `{req}`")

      labels_top10 = _clean("top_label_sequence_top10")
      if labels_top10:
          lines.append(f"- Top-10 Labels: `{labels_top10}`")

      labels_top20 = _clean("top_label_sequence_top20")
      if labels_top20 and labels_top20 != labels_top10:
          lines.append(f"- Top-20 Labels: `{labels_top20}`")

      hits = item.get("top_results") or []
      if hits:
          lines.append("- Case Snapshot:")
          for hit in hits[:5]:
              position = int(hit.get("rank") or 0)
              level = _label_level_code(str(hit.get("label") or ""))
              product = str(hit.get("spu_id") or "")
              name = str(hit.get("title") or "")
              name_zh = str(hit.get("title_zh") or "")
              score = hit.get("relevance_score")
              entry = f"  - #{position} [{level}] spu={product} {name}"
              # Only show the score when one was actually recorded.
              if score not in (None, ""):
                  entry += f" (rel={score})"
              lines.append(entry)
              if name_zh:
                  lines.append(f"    zh: {name_zh}")
  
  
c81b0fc1   tangwang   scripts/evaluatio...
60
61
62
63
64
65
66
67
68
69
  def _distribution_lines(distribution: Dict[str, Any]) -> list[str]:
      """Return the four label-count bullets (Fully, Mostly, Weakly, Irrelevant).

      Levels missing from *distribution* are reported as ``0``.
      """
      return [
          f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}",
          f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}",
          f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}",
          f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}",
      ]


  def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
      """Render a batch evaluation payload as a Markdown report.

      Sections, in order: header, optional "Dataset", "Aggregate Metrics"
      (with optional metric context), optional "Label Distribution", and
      "Per Query" with one subsection per entry of ``per_query``.

      Args:
          payload: Batch result. ``batch_id``, ``created_at``, ``tenant_id``,
              ``queries`` and ``top_k`` are read with ``[]`` and are therefore
              required. ``dataset``, ``metric_context``, ``aggregate_metrics``,
              ``aggregate_distribution`` and ``per_query`` are optional. Every
              ``per_query`` item must contain ``query``.

      Returns:
          The report as a single newline-joined string.

      Raises:
          KeyError: If a required key is missing from *payload* or from a
              ``per_query`` item.
      """
      lines = [
          "# Search Batch Evaluation",
          "",
          f"- Batch ID: {payload['batch_id']}",
          f"- Created at: {payload['created_at']}",
          f"- Tenant ID: {payload['tenant_id']}",
          f"- Query count: {len(payload['queries'])}",
          f"- Top K: {payload['top_k']}",
          "",
      ]

      dataset = payload.get("dataset") or {}
      if dataset:
          lines.extend(
              [
                  "## Dataset",
                  "",
                  f"- Dataset ID: {dataset.get('dataset_id', '')}",
                  f"- Display Name: {dataset.get('display_name', '')}",
                  f"- Query File: {dataset.get('query_file', '')}",
                  f"- Query Count: {dataset.get('query_count', '')}",
                  f"- Query SHA1: {dataset.get('query_sha1', '')}",
                  "",
              ]
          )

      lines.extend(["## Aggregate Metrics", ""])
      metric_context = payload.get("metric_context") or {}
      if metric_context:
          lines.extend(
              [
                  f"- Primary metric: {metric_context.get('primary_metric', 'N/A')}",
                  f"- Gain scheme (NDCG): {metric_context.get('gain_scheme', {})}",
                  f"- Stop probabilities (ERR): {metric_context.get('stop_prob_scheme', {})}",
                  "",
              ]
          )
      _append_metric_block(lines, payload.get("aggregate_metrics") or {})

      aggregate_distribution = payload.get("aggregate_distribution") or {}
      if aggregate_distribution:
          lines.extend(["", "## Label Distribution", ""])
          lines.extend(_distribution_lines(aggregate_distribution))

      lines.extend(["", "## Per Query", ""])
      for item in payload.get("per_query") or []:
          lines.append(f"### {item['query']}")
          lines.append("")
          _append_metric_block(lines, item.get("metrics") or {})
          # Unlike the aggregate section, per-query counts are always printed,
          # even when the distribution is empty (all zeros).
          lines.extend(_distribution_lines(item.get("distribution") or {}))
          _append_case_snapshot(lines, item)
          lines.append("")
      return "\n".join(lines)