# scripts/evaluation/eval_framework/reports.py

"""Markdown and text reports for batch evaluation."""
from __future__ import annotations
from typing import Any, Dict
from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW
from .metrics import PRIMARY_METRIC_KEYS
def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None:
    """Append one ``- key: value`` bullet per metric to ``lines`` in place.

    Priority keys (``Primary_Metric_Score``, every entry of
    ``PRIMARY_METRIC_KEYS``, then ``ERR@10``) are emitted first, in that
    order, when they are present in ``metrics``. Every remaining key follows
    in sorted order. Each metric is emitted exactly once.

    Args:
        lines: Output buffer of report lines; mutated in place.
        metrics: Mapping of metric name to value. Values are interpolated
            with their default string formatting.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, so a key
    # that occurs more than once in the priority sequence (for example if
    # PRIMARY_METRIC_KEYS already lists "ERR@10") is rendered only once.
    priority_keys = dict.fromkeys(
        ("Primary_Metric_Score", *PRIMARY_METRIC_KEYS, "ERR@10")
    )
    leading = [key for key in priority_keys if key in metrics]
    lines.extend(f"- {key}: {metrics[key]}" for key in leading)

    # Everything not already listed follows alphabetically.
    already_listed = set(leading)
    lines.extend(
        f"- {key}: {value}"
        for key, value in sorted(metrics.items())
        if key not in already_listed
    )
def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
    """Render a batch-evaluation payload as a Markdown document.

    The report consists of a header with batch metadata, an "Aggregate
    Metrics" section (preceded by metric-context bullets when a context is
    supplied), an optional "Label Distribution" section, and a "Per Query"
    section with one sub-heading per entry.

    Args:
        payload: Batch result mapping. ``batch_id``, ``created_at``,
            ``tenant_id``, ``queries`` and ``top_k`` are read by subscript
            and must be present. ``metric_context``, ``aggregate_metrics``,
            ``aggregate_distribution`` and ``per_query`` are optional and
            treated as empty when missing or falsy. Each ``per_query`` entry
            must provide ``query``; its ``metrics`` and ``distribution`` are
            optional.

    Returns:
        The report lines joined with ``"\\n"``.

    Raises:
        KeyError: If a required key is missing from ``payload`` or from a
            ``per_query`` entry.
    """
    # Display label paired with the relevance grade it counts, in report order.
    label_rows = (
        ("Fully Relevant", RELEVANCE_EXACT),
        ("Mostly Relevant", RELEVANCE_HIGH),
        ("Weakly Relevant", RELEVANCE_LOW),
        ("Irrelevant", RELEVANCE_IRRELEVANT),
    )

    def distribution_bullets(counts: Dict[Any, Any]) -> list[str]:
        # Grades absent from the mapping are reported as 0.
        return [f"- {label}: {counts.get(grade, 0)}" for label, grade in label_rows]

    # Header: title plus batch metadata.
    report: list[str] = ["# Search Batch Evaluation", ""]
    report.append(f"- Batch ID: {payload['batch_id']}")
    report.append(f"- Created at: {payload['created_at']}")
    report.append(f"- Tenant ID: {payload['tenant_id']}")
    report.append(f"- Query count: {len(payload['queries'])}")
    report.append(f"- Top K: {payload['top_k']}")
    report += ["", "## Aggregate Metrics", ""]

    # Optional description of how the metrics were configured.
    context = payload.get("metric_context") or {}
    if context:
        primary = context.get('primary_metric', 'N/A')
        gains = context.get('gain_scheme', {})
        stop_probs = context.get('stop_prob_scheme', {})
        report += [
            f"- Primary metric: {primary}",
            f"- Gain scheme (NDCG): {gains}",
            f"- Stop probabilities (ERR): {stop_probs}",
            "",
        ]

    _append_metric_block(report, payload.get("aggregate_metrics") or {})

    # The aggregate label section is omitted entirely when there is no data.
    overall = payload.get("aggregate_distribution") or {}
    if overall:
        report += ["", "## Label Distribution", "", *distribution_bullets(overall)]

    report += ["", "## Per Query", ""]
    for entry in payload.get("per_query") or []:
        report += [f"### {entry['query']}", ""]
        _append_metric_block(report, entry.get("metrics") or {})
        # Per-query label counts are always printed, defaulting to zeros.
        report += distribution_bullets(entry.get("distribution") or {})
        report.append("")

    return "\n".join(report)