Blame view

scripts/evaluation/eval_framework/reports.py 2.78 KB
c81b0fc1   tangwang   scripts/evaluatio...
1
2
3
4
5
6
  """Markdown and text reports for batch evaluation."""
  
  from __future__ import annotations
  
  from typing import Any, Dict
  
a345b01f   tangwang   eval framework
7
  from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW
465f90e1   tangwang   添加LTR数据收集
8
  from .metrics import PRIMARY_METRIC_KEYS
c81b0fc1   tangwang   scripts/evaluatio...
9
10
  
  
7ddd4cb3   tangwang   评估体系从三等级->四等级 Exa...
11
  def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None:
      """Append one ``- key: value`` bullet per metric to *lines*, in place.

      Headline metrics are listed first, in a fixed order:
      ``Primary_Metric_Score``, then every key in ``PRIMARY_METRIC_KEYS``,
      then ``ERR@10``. Headline keys missing from *metrics* are skipped.
      All remaining metrics follow, sorted by key.

      Args:
          lines: Output buffer of report lines; extended in place.
          metrics: Mapping of metric name to the value to print.
      """
      # dict.fromkeys de-duplicates while keeping first-seen order, so a key
      # that occurs more than once in the headline sequence (for example if
      # PRIMARY_METRIC_KEYS itself lists "ERR@10") is still printed only once.
      headline_keys = dict.fromkeys(
          ("Primary_Metric_Score", *PRIMARY_METRIC_KEYS, "ERR@10")
      )
      shown = [key for key in headline_keys if key in metrics]
      lines.extend(f"- {key}: {metrics[key]}" for key in shown)

      # Everything that was not a headline metric, in alphabetical key order.
      already_listed = set(shown)
      lines.extend(
          f"- {key}: {metrics[key]}"
          for key in sorted(metrics)
          if key not in already_listed
      )
  
  
c81b0fc1   tangwang   scripts/evaluatio...
28
29
30
31
32
33
34
35
36
37
38
39
40
  def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
      """Render a batch-evaluation payload as a Markdown report.

      The report contains, in order: a title with the batch header bullets,
      an "Aggregate Metrics" section (preceded by metric-context bullets when
      ``metric_context`` is non-empty), a "Label Distribution" section when
      ``aggregate_distribution`` is non-empty, and a "Per Query" section with
      one ``###`` subsection per entry of ``per_query``.

      Args:
          payload: Batch result mapping. ``batch_id``, ``created_at``,
              ``tenant_id``, ``queries`` and ``top_k`` are read by direct
              indexing and must be present. ``metric_context``,
              ``aggregate_metrics``, ``aggregate_distribution`` and
              ``per_query`` are optional; missing or falsy values are
              treated as empty. Each ``per_query`` entry must have a
              ``query`` key; its ``metrics`` and ``distribution`` are
              optional.

      Returns:
          The report lines joined with newline characters.

      Raises:
          KeyError: If one of the directly indexed keys is missing.
      """

      def label_bullets(counts: Dict[str, Any]) -> list[str]:
          # One bullet per relevance label; labels absent from *counts*
          # are reported as 0.
          return [
              f"- Fully Relevant: {counts.get(RELEVANCE_EXACT, 0)}",
              f"- Mostly Relevant: {counts.get(RELEVANCE_HIGH, 0)}",
              f"- Weakly Relevant: {counts.get(RELEVANCE_LOW, 0)}",
              f"- Irrelevant: {counts.get(RELEVANCE_IRRELEVANT, 0)}",
          ]

      report = [
          "# Search Batch Evaluation",
          "",
          f"- Batch ID: {payload['batch_id']}",
          f"- Created at: {payload['created_at']}",
          f"- Tenant ID: {payload['tenant_id']}",
          f"- Query count: {len(payload['queries'])}",
          f"- Top K: {payload['top_k']}",
          "",
          "## Aggregate Metrics",
          "",
      ]

      # Metric context (primary metric plus NDCG / ERR schemes) is printed
      # only when the payload carries a non-empty one.
      context = payload.get("metric_context")
      if context:
          report.append(f"- Primary metric: {context.get('primary_metric', 'N/A')}")
          report.append(f"- Gain scheme (NDCG): {context.get('gain_scheme', {})}")
          report.append(
              f"- Stop probabilities (ERR): {context.get('stop_prob_scheme', {})}"
          )
          report.append("")
      _append_metric_block(report, payload.get("aggregate_metrics") or {})

      totals = payload.get("aggregate_distribution")
      if totals:
          report += ["", "## Label Distribution", ""]
          report += label_bullets(totals)

      report += ["", "## Per Query", ""]
      for entry in payload.get("per_query") or []:
          report += [f"### {entry['query']}", ""]
          _append_metric_block(report, entry.get("metrics") or {})
          # Unlike the aggregate section, per-query label counts are always
          # printed, falling back to zeros when no distribution is present.
          report += label_bullets(entry.get("distribution") or {})
          report.append("")
      return "\n".join(report)