c81b0fc1
tangwang
scripts/evaluatio...
|
1
2
3
4
5
6
|
"""Markdown and text reports for batch evaluation."""
from __future__ import annotations
from typing import Any, Dict
|
a345b01f
tangwang
eval framework
|
7
|
from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW
|
465f90e1
tangwang
添加LTR数据收集
|
8
|
from .metrics import PRIMARY_METRIC_KEYS
|
c81b0fc1
tangwang
scripts/evaluatio...
|
9
10
|
|
7ddd4cb3
tangwang
评估体系从三等级->四等级 Exa...
|
11
|
def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None:
    """Append one ``- key: value`` bullet per metric to *lines*, in place.

    Headline metrics are written first, in a fixed order:
    ``Primary_Metric_Score``, then the entries of ``PRIMARY_METRIC_KEYS``,
    then ``ERR@10``. Headline keys that are absent from *metrics* are
    skipped. Every remaining metric follows, ordered by sorting the
    ``(key, value)`` items.

    Args:
        lines: Report lines accumulated so far; mutated by this call.
        metrics: Mapping of metric name to the value to print.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so a key
    # that is both in PRIMARY_METRIC_KEYS and among the literals here (or is
    # repeated inside PRIMARY_METRIC_KEYS) is reported only once.
    headline_keys = dict.fromkeys(
        ("Primary_Metric_Score", *PRIMARY_METRIC_KEYS, "ERR@10")
    )
    emitted = set()
    for key in headline_keys:
        if key in metrics:
            lines.append(f"- {key}: {metrics[key]}")
            emitted.add(key)
    # Everything not already printed as a headline metric.
    for key, value in sorted(metrics.items()):
        if key in emitted:
            continue
        lines.append(f"- {key}: {value}")
|
c81b0fc1
tangwang
scripts/evaluatio...
|
28
29
30
31
32
33
34
35
36
37
38
39
40
|
def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
    """Render a batch-evaluation payload as a Markdown document.

    The report has a header, an "Aggregate Metrics" section, an optional
    "Label Distribution" section, and a "Per Query" section with one
    sub-heading per entry of ``payload["per_query"]``.

    ``batch_id``, ``created_at``, ``tenant_id``, ``queries`` and ``top_k``
    are indexed directly, so a missing one raises ``KeyError``. The keys
    ``metric_context``, ``aggregate_metrics``, ``aggregate_distribution``
    and ``per_query`` are optional and treated as empty when missing or
    falsy.

    Args:
        payload: The batch evaluation result to render.

    Returns:
        The report as a single newline-joined string.
    """
    # Display label -> relevance grade, in the order the bullets are printed.
    label_rows = (
        ("Fully Relevant", RELEVANCE_EXACT),
        ("Mostly Relevant", RELEVANCE_HIGH),
        ("Weakly Relevant", RELEVANCE_LOW),
        ("Irrelevant", RELEVANCE_IRRELEVANT),
    )

    def label_bullets(counts):
        # A grade missing from the distribution is reported as 0.
        return [f"- {label}: {counts.get(grade, 0)}" for label, grade in label_rows]

    report = ["# Search Batch Evaluation", ""]
    report += [
        f"- Batch ID: {payload['batch_id']}",
        f"- Created at: {payload['created_at']}",
        f"- Tenant ID: {payload['tenant_id']}",
        f"- Query count: {len(payload['queries'])}",
        f"- Top K: {payload['top_k']}",
    ]
    report += ["", "## Aggregate Metrics", ""]

    # The metric configuration is shown only when the payload carries one.
    context = payload.get("metric_context") or {}
    if context:
        report.append(f"- Primary metric: {context.get('primary_metric', 'N/A')}")
        report.append(f"- Gain scheme (NDCG): {context.get('gain_scheme', {})}")
        report.append(
            f"- Stop probabilities (ERR): {context.get('stop_prob_scheme', {})}"
        )
        report.append("")
    _append_metric_block(report, payload.get("aggregate_metrics") or {})

    # The aggregate section is omitted entirely when there is no distribution.
    overall_counts = payload.get("aggregate_distribution") or {}
    if overall_counts:
        report += ["", "## Label Distribution", ""]
        report += label_bullets(overall_counts)

    report += ["", "## Per Query", ""]
    for entry in payload.get("per_query") or []:
        report += [f"### {entry['query']}", ""]
        _append_metric_block(report, entry.get("metrics") or {})
        # Unlike the aggregate section, the per-query label counts are
        # always printed, falling back to zeros.
        report += label_bullets(entry.get("distribution") or {})
        report.append("")

    return "\n".join(report)
|