"""Tests for search evaluation ranking metrics (NDCG, ERR).""" from scripts.evaluation.eval_framework.constants import ( RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW, ) from scripts.evaluation.eval_framework.metrics import compute_query_metrics def test_err_matches_documented_three_item_examples(): # Model A: [Exact, Irrelevant, High] -> ERR ≈ 0.992667 m_a = compute_query_metrics( [RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_HIGH], ideal_labels=[RELEVANCE_EXACT], ) assert abs(m_a["ERR@5"] - (0.99 + (1.0 / 3.0) * 0.8 * 0.01)) < 1e-5 # Model B: [High, Low, Exact] -> ERR ≈ 0.8694 m_b = compute_query_metrics( [RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_EXACT], ideal_labels=[RELEVANCE_EXACT], ) expected_b = 0.8 + 0.5 * 0.1 * 0.2 + (1.0 / 3.0) * 0.99 * 0.18 assert abs(m_b["ERR@5"] - expected_b) < 1e-5