# scripts/evaluation/eval_framework/framework.py

"""Core orchestration: corpus, rerank, LLM labels, live/batch evaluation."""
from __future__ import annotations
import json
import time
from pathlib import Path
from typing import Any, Dict, List, Sequence, Tuple
import requests
from elasticsearch.helpers import scan
from api.app import get_app_config, get_es_client, get_query_parser, init_service
from indexer.mapping_generator import get_tenant_index_name
from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient
from .constants import (
    DEFAULT_ARTIFACT_ROOT,
    DEFAULT_LABELER_MODE,
    JUDGE_PROMPT_VERSION_COMPLEX,
    RELEVANCE_EXACT,
    RELEVANCE_IRRELEVANT,
    RELEVANCE_PARTIAL,
    VALID_LABELS,
)
from .metrics import aggregate_metrics, compute_query_metrics, label_distribution
from .reports import render_batch_report_markdown
from .store import EvalStore, QueryBuildResult
from .utils import (
    build_display_title,
    build_rerank_doc,
    compact_option_values,
    compact_product_payload,
    ensure_dir,
    normalize_text,
    pick_text,
    sha1_text,
    utc_now_iso,
    utc_timestamp,
)
class SearchEvaluationFramework:
    def __init__(
        self,
        tenant_id: str,
        artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
        search_base_url: str = "http://localhost:6002",
        labeler_mode: str = DEFAULT_LABELER_MODE,
    ):
        """Wire up the evaluation store and the search, rerank and label clients.

        Args:
            tenant_id: Tenant under evaluation; stored as ``str``.
            artifact_root: Directory (passed through ``ensure_dir``) that holds
                ``search_eval.sqlite3`` and the generated artifacts.
            search_base_url: Base URL of the search service under evaluation.
            labeler_mode: Labelling mode; stripped and lower-cased, falling back
                to ``DEFAULT_LABELER_MODE`` when it normalises to an empty string.

        Raises:
            RuntimeError: If the app config carries no DashScope API key.
        """
        init_service(get_app_config().infrastructure.elasticsearch.host)
        self.tenant_id = str(tenant_id)
        self.artifact_root = ensure_dir(artifact_root)
        self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
        # Keep the base URL so other requests against the search service can
        # target the same host instead of relying on a hard-coded default.
        self.search_base_url = str(search_base_url).rstrip("/")
        self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
        self.search_client = SearchServiceClient(search_base_url, self.tenant_id)

        app_cfg = get_app_config()
        rerank_service_url = str(
            app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
        )
        self.rerank_client = RerankServiceClient(rerank_service_url)

        # The label client reuses the model/base_url of the translation "llm" capability.
        llm_cfg = app_cfg.services.translation.capabilities["llm"]
        api_key = app_cfg.infrastructure.secrets.dashscope_api_key
        if not api_key:
            raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
        self.label_client = DashScopeLabelClient(
            model=str(llm_cfg["model"]),
            base_url=str(llm_cfg["base_url"]),
            api_key=str(api_key),
        )
        # Created lazily by _get_query_parser().
        self.query_parser = None
    def _get_query_parser(self):
        if self.query_parser is None:
            self.query_parser = get_query_parser()
        return self.query_parser
    def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
        parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"])
        payload = parsed.to_dict()
        payload["text_for_rerank"] = parsed.text_for_rerank()
        return payload
    def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
        if self.labeler_mode != "complex":
            raise RuntimeError("query profiles are only used in complex labeler mode")
        if not force_refresh:
            cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)
            if cached is not None:
                return cached
        parser_hints = self.build_query_parser_hints(query)
        profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)
        profile["parser_hints"] = parser_hints
        self.store.upsert_query_profile(
            self.tenant_id,
            query,
            JUDGE_PROMPT_VERSION_COMPLEX,
            self.label_client.model,
            profile,
            raw_response,
        )
        return profile
    @staticmethod
    def _doc_evidence_text(doc: Dict[str, Any]) -> str:
        """Flatten a product doc into one normalised string used for term matching.

        The string joins, with ``" | "``, the display title, the English
        vendor/category texts, every SKU's three option values and all tags;
        empty pieces are dropped before ``normalize_text`` is applied.
        """
        parts: List[str] = [build_display_title(doc)]
        parts.extend(
            pick_text(doc.get(field), "en")
            for field in ("vendor", "category_path", "category_name")
        )
        for sku in doc.get("skus") or []:
            parts.extend(
                str(sku.get(option_key) or "")
                for option_key in ("option1_value", "option2_value", "option3_value")
            )
        parts.extend(str(tag) for tag in doc.get("tags") or [])
        return normalize_text(" | ".join(part for part in parts if part))
    def _apply_rule_based_label_guardrails(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> str:
        """Downgrade a label when the doc's text evidence contradicts the query profile.

        Labels outside ``VALID_LABELS`` are returned untouched. Otherwise:

        * Exact without primary-category evidence becomes Partial when an allowed
          category matches, else Irrelevant.
        * Any conflicting term found for a checked attribute yields Irrelevant.
        * Exact with none of an attribute's required terms becomes Partial.
        * Partial with neither primary nor allowed category evidence becomes
          Irrelevant.

        All matching is substring containment against ``_doc_evidence_text(doc)``.
        """
        if label not in VALID_LABELS:
            return label

        checked_attributes = {
            "color", "fit", "length", "type", "product_type", "material",
            "size", "gender", "style", "waist_style", "rise",
        }
        evidence = self._doc_evidence_text(doc)
        primary = normalize_text(query_profile.get("primary_category"))
        allowed = [
            normalize_text(entry)
            for entry in query_profile.get("allowed_categories") or []
            if str(entry).strip()
        ]
        # A missing constraint counts as satisfied.
        primary_ok = not primary or primary in evidence
        allowed_ok = not allowed or any(name in evidence for name in allowed)

        if label == RELEVANCE_EXACT and not primary_ok:
            if not allowed_ok:
                return RELEVANCE_IRRELEVANT
            label = RELEVANCE_PARTIAL

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            if attr_name not in checked_attributes:
                continue
            required = [term for term in map(normalize_text, attr.get("required_terms") or []) if term]
            conflicting = [term for term in map(normalize_text, attr.get("conflicting_terms") or []) if term]
            if attr_name == "fit":
                # Opposite fit families conflict even when the profile does not list them.
                if not {"oversized", "oversize"}.isdisjoint(required):
                    conflicting.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
                if not {"fitted", "slim fit", "tight"}.isdisjoint(required):
                    conflicting.extend(["oversized", "oversize", "loose", "relaxed"])
            if any(term in evidence for term in conflicting):
                return RELEVANCE_IRRELEVANT
            if label == RELEVANCE_EXACT and required and not any(term in evidence for term in required):
                label = RELEVANCE_PARTIAL

        if label == RELEVANCE_PARTIAL and not (primary_ok or allowed_ok):
            return RELEVANCE_IRRELEVANT
        return label
    @staticmethod
    def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]:
        option_values = list(item.get("option_values") or [])
        while len(option_values) < 3:
            option_values.append("")
        product = dict(item.get("product") or {})
        return {
            "spu_id": item.get("spu_id"),
            "title": product.get("title") or item.get("title"),
            "vendor": product.get("vendor"),
            "category_path": product.get("category"),
            "category_name": product.get("category"),
            "image_url": item.get("image_url") or product.get("image_url"),
            "tags": product.get("tags") or [],
            "skus": [
                {
                    "option1_value": option_values[0],
                    "option2_value": option_values[1],
                    "option3_value": option_values[2],
                }
            ],
        }
    def _collect_label_issues(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> List[str]:
        """List human-readable reasons why ``label`` looks inconsistent with the doc.

        Category issues come first, followed by per-attribute issues in profile
        order. Matching is substring containment against
        ``_doc_evidence_text(doc)``.
        """
        # NOTE(review): _apply_rule_based_label_guardrails additionally checks
        # "waist_style" and "rise"; confirm whether leaving them out here is intended.
        audited_attributes = {
            "color", "fit", "length", "type", "product_type", "material",
            "size", "gender", "style",
        }
        evidence = self._doc_evidence_text(doc)
        found: List[str] = []

        primary = normalize_text(query_profile.get("primary_category"))
        allowed = [
            normalize_text(entry)
            for entry in query_profile.get("allowed_categories") or []
            if str(entry).strip()
        ]
        primary_ok = not primary or primary in evidence
        # Without an allowed list the primary-category result stands in for it.
        allowed_ok = any(name in evidence for name in allowed) if allowed else primary_ok

        if not primary_ok:
            if label == RELEVANCE_EXACT:
                found.append(
                    "Exact missing primary category evidence"
                    if allowed_ok
                    else "Exact has category mismatch"
                )
            if label == RELEVANCE_PARTIAL and not allowed_ok:
                found.append("Partial has category mismatch")

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            if attr_name not in audited_attributes:
                continue
            required = [term for term in map(normalize_text, attr.get("required_terms") or []) if term]
            conflicting = [term for term in map(normalize_text, attr.get("conflicting_terms") or []) if term]
            if label != RELEVANCE_IRRELEVANT and any(term in evidence for term in conflicting):
                found.append(f"{label} conflicts on {attr_name}")
            if label == RELEVANCE_EXACT and required and not any(term in evidence for term in required):
                found.append(f"Exact missing {attr_name}")
        return found
    def audit_live_query(
        self,
        query: str,
        *,
        top_k: int = 100,
        language: str = "en",
        auto_annotate: bool = False,
    ) -> Dict[str, Any]:
        """Evaluate ``query`` live and flag results whose labels look suspicious.

        Outside ``complex`` labeler mode no profile is loaded and ``suspicious``
        is empty. In complex mode every result is checked with
        ``_collect_label_issues`` and ``_apply_rule_based_label_guardrails``;
        results with at least one issue (including a differing suggested label)
        are reported.

        Returns:
            A dict with ``query``, ``tenant_id``, ``top_k``, ``metrics``,
            ``distribution``, ``query_profile``, ``suspicious`` and ``results``.
        """
        live = self.evaluate_live_query(
            query=query,
            top_k=top_k,
            auto_annotate=auto_annotate,
            language=language,
        )
        results = live["results"]

        query_profile = None
        suspicious: List[Dict[str, Any]] = []
        if self.labeler_mode == "complex":
            query_profile = self.get_query_profile(query, force_refresh=False)
            for item in results:
                current_label = item["label"] or ""
                doc = self._result_item_to_doc(item)
                issues = self._collect_label_issues(current_label, query_profile, doc)
                suggested_label = self._apply_rule_based_label_guardrails(current_label, query_profile, doc)
                if suggested_label != current_label:
                    issues = [*issues, f"Suggested relabel: {item['label']} -> {suggested_label}"]
                if not issues:
                    continue
                suspicious.append(
                    {
                        "rank": item["rank"],
                        "spu_id": item["spu_id"],
                        "title": item["title"],
                        "label": item["label"],
                        "suggested_label": suggested_label,
                        "issues": issues,
                    }
                )

        # Missing or unknown labels count as Irrelevant in the distribution.
        effective_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in results
        ]
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": live["metrics"],
            "distribution": label_distribution(effective_labels),
            "query_profile": query_profile,
            "suspicious": suspicious,
            "results": results,
        }
    def queries_from_file(self, path: Path) -> List[str]:
        return [
            line.strip()
            for line in path.read_text(encoding="utf-8").splitlines()
            if line.strip() and not line.strip().startswith("#")
        ]
    def corpus_docs(self, refresh: bool = False) -> List[Dict[str, Any]]:
        """Return the tenant's corpus docs, loading them from Elasticsearch if needed.

        The stored corpus is returned unless ``refresh`` is true or the store has
        none for this tenant. Otherwise the tenant index is scanned with a
        ``match_all`` query, each hit's ``spu_id`` is coerced to ``str`` (falling
        back to the hit ``_id``), and the docs are written to the store.
        """
        if not refresh and self.store.has_corpus(self.tenant_id):
            return self.store.get_corpus_docs(self.tenant_id)

        source_fields = [
            "spu_id",
            "title",
            "vendor",
            "category_path",
            "category_name",
            "image_url",
            "skus",
            "tags",
        ]
        es_client = get_es_client().client
        index_name = get_tenant_index_name(self.tenant_id)
        hits = scan(
            client=es_client,
            index=index_name,
            query={"_source": source_fields, "query": {"match_all": {}}},
            size=500,
            preserve_order=False,
            clear_scroll=True,
        )

        collected: List[Dict[str, Any]] = []
        for hit in hits:
            fields = dict(hit.get("_source") or {})
            fields["spu_id"] = str(fields.get("spu_id") or hit.get("_id") or "")
            collected.append(fields)
        self.store.upsert_corpus_docs(self.tenant_id, collected)
        return collected
    def full_corpus_rerank(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        batch_size: int = 24,
        force_refresh: bool = False,
    ) -> List[Dict[str, Any]]:
        cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query)
        pending: List[Dict[str, Any]] = [doc for doc in docs if str(doc.get("spu_id")) not in cached]
        if pending:
            new_scores: Dict[str, float] = {}
            for start in range(0, len(pending), batch_size):
                batch = pending[start : start + batch_size]
                scores = self._rerank_batch_with_retry(query=query, docs=batch)
                if len(scores) != len(batch):
                    raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs")
                for doc, score in zip(batch, scores):
                    new_scores[str(doc.get("spu_id"))] = float(score)
            self.store.upsert_rerank_scores(
                self.tenant_id,
                query,
                new_scores,
                model_name="qwen3_vllm_score",
            )
            cached.update(new_scores)
        ranked = []
        for doc in docs:
            spu_id = str(doc.get("spu_id"))
            ranked.append({"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc})
        ranked.sort(key=lambda item: item["score"], reverse=True)
        return ranked
    def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]:
        if not docs:
            return []
        doc_texts = [build_rerank_doc(doc) for doc in docs]
        try:
            scores, _meta = self.rerank_client.rerank(query=query, docs=doc_texts, normalize=False)
            return scores
        except Exception:
            if len(docs) == 1:
                return [-1.0]
            if len(docs) <= 6:
                scores: List[float] = []
                for doc in docs:
                    scores.extend(self._rerank_batch_with_retry(query, [doc]))
                return scores
            mid = len(docs) // 2
            left = self._rerank_batch_with_retry(query, docs[:mid])
            right = self._rerank_batch_with_retry(query, docs[mid:])
            return left + right
    def annotate_missing_labels(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        force_refresh: bool = False,
    ) -> Dict[str, str]:
        labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
        missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
        if not missing_docs:
            return labels
        for start in range(0, len(missing_docs), self.label_client.batch_size):
            batch = missing_docs[start : start + self.label_client.batch_size]
            batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh)
            for sub_labels, raw_response, sub_batch in batch_pairs:
                to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
                self.store.upsert_labels(
                    self.tenant_id,
                    query,
                    to_store,
                    judge_model=self.label_client.model,
                    raw_response=raw_response,
                )
                labels.update(to_store)
            time.sleep(0.1)
        return labels
    def _classify_with_retry(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        *,
        force_refresh: bool = False,
    ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
        if not docs:
            return []
        try:
            if self.labeler_mode == "complex":
                query_profile = self.get_query_profile(query, force_refresh=force_refresh)
                labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)
                labels = [
                    self._apply_rule_based_label_guardrails(label, query_profile, doc)
                    for doc, label in zip(docs, labels)
                ]
            else:
                labels, raw_response = self.label_client.classify_batch_simple(query, docs)
            return [(labels, raw_response, docs)]
        except Exception:
            if len(docs) == 1:
                raise
            mid = len(docs) // 2
            return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh)
    def build_query_annotation_set(
        self,
        query: str,
        *,
        search_depth: int = 1000,
        rerank_depth: int = 10000,
        annotate_search_top_k: int = 120,
        annotate_rerank_top_k: int = 200,
        language: str = "en",
        force_refresh_rerank: bool = False,
        force_refresh_labels: bool = False,
    ) -> QueryBuildResult:
        """Build and persist the offline annotation set for one query.

        The labelling pool is the union of the top search hits and the top docs
        of a full-corpus rerank. Pool docs without a label are annotated, and a
        JSON artifact with labelled search results, the reranked list and top-100
        metrics is written under ``<artifact_root>/query_builds``.

        Args:
            query: Query text.
            search_depth: Number of hits requested from the search service.
            rerank_depth: Maximum number of reranked docs written to the artifact.
            annotate_search_top_k: Number of top search hits added to the pool.
            annotate_rerank_top_k: Number of top reranked docs added to the pool.
            language: Language passed to the search service.
            force_refresh_rerank: Recompute rerank scores instead of using stored ones.
            force_refresh_labels: Relabel the pool instead of using stored labels.

        Returns:
            A ``QueryBuildResult`` summarising the run and pointing at the JSON file.
        """
        search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language)
        search_results = list(search_payload.get("results") or [])
        search_total = int(search_payload.get("total") or 0)
        corpus = self.corpus_docs(refresh=False)
        full_rerank = self.full_corpus_rerank(
            query=query,
            docs=corpus,
            force_refresh=force_refresh_rerank,
        )
        rerank_depth_effective = min(rerank_depth, len(full_rerank))

        # Pool = top search hits, then top reranked docs (the corpus doc wins on overlap).
        pool_docs: Dict[str, Dict[str, Any]] = {}
        for doc in search_results[:annotate_search_top_k]:
            pool_docs[str(doc.get("spu_id"))] = doc
        for item in full_rerank[:annotate_rerank_top_k]:
            pool_docs[str(item["spu_id"])] = item["doc"]
        labels = self.annotate_missing_labels(
            query=query,
            docs=list(pool_docs.values()),
            force_refresh=force_refresh_labels,
        )

        def result_row(rank: int, spu_id: str, doc: Dict[str, Any], rerank_score: Any) -> Dict[str, Any]:
            # Shared row layout for the search list and the reranked list.
            return {
                "rank": rank,
                "spu_id": spu_id,
                "title": build_display_title(doc),
                "image_url": doc.get("image_url"),
                "rerank_score": rerank_score,
                "label": labels.get(spu_id),
                "option_values": list(compact_option_values(doc.get("skus") or [])),
                "product": compact_product_payload(doc),
            }

        search_labeled_results: List[Dict[str, Any]] = [
            result_row(rank, str(doc.get("spu_id")), doc, None)
            for rank, doc in enumerate(search_results, start=1)
        ]
        rerank_top_results: List[Dict[str, Any]] = [
            result_row(rank, str(item["spu_id"]), item["doc"], round(float(item["score"]), 8))
            for rank, item in enumerate(full_rerank[:rerank_depth_effective], start=1)
        ]

        # Hits without a valid label count as Irrelevant for the metrics.
        top100_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in search_labeled_results[:100]
        ]
        metrics = compute_query_metrics(top100_labels)

        query_profile = None
        if self.labeler_mode == "complex":
            # With a forced relabel of a non-empty pool the profile was already
            # refreshed while annotating; forcing another refresh here would cost
            # an extra LLM call and could record a profile that differs from the
            # one the labels were judged against.
            query_profile = self.get_query_profile(
                query,
                force_refresh=force_refresh_labels and not pool_docs,
            )

        # Query the configured search service; fall back to the historical
        # default when the instance carries no base URL.
        base_url = str(getattr(self, "search_base_url", "http://localhost:6002")).rstrip("/")
        config_meta = requests.get(f"{base_url}/admin/config/meta", timeout=20).json()

        output_dir = ensure_dir(self.artifact_root / "query_builds")
        run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
        output_json_path = output_dir / f"{run_id}.json"
        payload = {
            "run_id": run_id,
            "created_at": utc_now_iso(),
            "tenant_id": self.tenant_id,
            "query": query,
            "config_meta": config_meta,
            "search_total": search_total,
            "search_depth_requested": search_depth,
            "search_depth_effective": len(search_results),
            "rerank_depth_requested": rerank_depth,
            "rerank_depth_effective": rerank_depth_effective,
            "corpus_size": len(corpus),
            "annotation_pool": {
                "annotate_search_top_k": annotate_search_top_k,
                "annotate_rerank_top_k": annotate_rerank_top_k,
                "pool_size": len(pool_docs),
            },
            "labeler_mode": self.labeler_mode,
            "query_profile": query_profile,
            "metrics_top100": metrics,
            "search_results": search_labeled_results,
            "full_rerank_top": rerank_top_results,
        }
        output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
        return QueryBuildResult(
            query=query,
            tenant_id=self.tenant_id,
            search_total=search_total,
            search_depth=len(search_results),
            rerank_corpus_size=len(corpus),
            annotated_count=len(pool_docs),
            output_json_path=output_json_path,
        )
    def evaluate_live_query(
        self,
        query: str,
        top_k: int = 100,
        auto_annotate: bool = False,
        language: str = "en",
        force_refresh_labels: bool = False,
    ) -> Dict[str, Any]:
        """Run a live search and score its top ``top_k`` hits against stored labels.

        Args:
            query: Query text.
            top_k: Number of leading hits to evaluate (at least 100 are requested).
            auto_annotate: Label hits that have no stored label before scoring.
            language: Language passed to the search service.
            force_refresh_labels: With ``auto_annotate``, relabel the hits.

        Returns:
            A dict with the metrics, the labelled hits, stored Exact/Partial
            products that were not recalled (``missing_relevant``), label
            statistics, explanatory ``tips`` and the search ``total``.
        """
        search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)
        top_hits = list(search_payload.get("results") or [])[:top_k]
        if auto_annotate:
            self.annotate_missing_labels(query=query, docs=top_hits, force_refresh=force_refresh_labels)
        labels = self.store.get_labels(self.tenant_id, query)

        labeled: List[Dict[str, Any]] = []
        for position, hit in enumerate(top_hits, start=1):
            hit_id = str(hit.get("spu_id"))
            labeled.append(
                {
                    "rank": position,
                    "spu_id": hit_id,
                    "title": build_display_title(hit),
                    "image_url": hit.get("image_url"),
                    "label": labels.get(hit_id),
                    "option_values": list(compact_option_values(hit.get("skus") or [])),
                    "product": compact_product_payload(hit),
                }
            )
        recalled_spu_ids = {row["spu_id"] for row in labeled}
        unlabeled_hits = sum(1 for row in labeled if row["label"] not in VALID_LABELS)
        # Hits without a valid label are scored as Irrelevant.
        metric_labels = [
            row["label"] if row["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for row in labeled
        ]

        label_stats = self.store.get_query_label_stats(self.tenant_id, query)
        rerank_scores = self.store.get_rerank_scores(self.tenant_id, query)

        # Stored relevant products that this recall set did not return.
        relevant_labels = {RELEVANCE_EXACT, RELEVANCE_PARTIAL}
        relevant_missing_ids = [
            spu_id
            for spu_id, label in labels.items()
            if label in relevant_labels and spu_id not in recalled_spu_ids
        ]
        missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
        missing_relevant: List[Dict[str, Any]] = []
        for spu_id in relevant_missing_ids:
            corpus_doc = missing_docs_map.get(spu_id)
            if not corpus_doc:
                continue
            missing_relevant.append(
                {
                    "spu_id": spu_id,
                    "label": labels[spu_id],
                    "rerank_score": rerank_scores.get(spu_id),
                    "title": build_display_title(corpus_doc),
                    "image_url": corpus_doc.get("image_url"),
                    "option_values": list(compact_option_values(corpus_doc.get("skus") or [])),
                    "product": compact_product_payload(corpus_doc),
                }
            )

        label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}

        def missing_sort_key(entry: Dict[str, Any]) -> Tuple[int, float, str]:
            # Exact before Partial, then higher rerank score first (entries
            # without a score go last), then title.
            score = entry.get("rerank_score")
            numeric_score = float(score) if score is not None else float("-inf")
            return (
                label_order.get(str(entry.get("label")), 9),
                -numeric_score,
                str(entry.get("title") or ""),
            )

        missing_relevant = sorted(missing_relevant, key=missing_sort_key)

        tips: List[str] = [
            "Single-query evaluation used cached labels and refreshed missing labels for recalled results."
            if auto_annotate
            else "Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant."
        ]
        if label_stats["total"] == 0:
            tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.")
        if unlabeled_hits:
            tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
        if not missing_relevant:
            tips.append("No cached Exact/Partial products were missed by this recall set.")

        missing_exact = sum(1 for entry in missing_relevant if entry["label"] == RELEVANCE_EXACT)
        missing_partial = sum(1 for entry in missing_relevant if entry["label"] == RELEVANCE_PARTIAL)
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": compute_query_metrics(metric_labels),
            "results": labeled,
            "missing_relevant": missing_relevant,
            "label_stats": {
                **label_stats,
                "unlabeled_hits_treated_irrelevant": unlabeled_hits,
                "recalled_hits": len(labeled),
                "missing_relevant_count": len(missing_relevant),
                "missing_exact_count": missing_exact,
                "missing_partial_count": missing_partial,
            },
            "tips": tips,
            "total": int(search_payload.get("total") or 0),
        }
    def batch_evaluate(
        self,
        queries: Sequence[str],
        *,
        top_k: int = 100,
        auto_annotate: bool = True,
        language: str = "en",
        force_refresh_labels: bool = False,
    ) -> Dict[str, Any]:
        """Evaluate several queries live and write a batch report.

        Each query goes through ``evaluate_live_query``. The per-query metrics
        are aggregated, and a JSON payload, a Markdown report and a snapshot of
        the search service configuration are written under
        ``<artifact_root>/batch_reports`` and registered in the store.

        Args:
            queries: Queries to evaluate, in report order.
            top_k: Number of leading hits evaluated per query.
            auto_annotate: Label hits that have no stored label before scoring.
            language: Language passed to the search service.
            force_refresh_labels: With ``auto_annotate``, relabel the hits.

        Returns:
            The batch payload that was written to the JSON report.
        """
        per_query = []
        for query in queries:
            live = self.evaluate_live_query(
                query,
                top_k=top_k,
                auto_annotate=auto_annotate,
                language=language,
                force_refresh_labels=force_refresh_labels,
            )
            # Hits without a valid label count as Irrelevant in the distribution.
            labels = [
                item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
                for item in live["results"]
            ]
            per_query.append(
                {
                    "query": live["query"],
                    "tenant_id": live["tenant_id"],
                    "top_k": live["top_k"],
                    "metrics": live["metrics"],
                    "distribution": label_distribution(labels),
                    "total": live["total"],
                }
            )

        aggregate = aggregate_metrics([item["metrics"] for item in per_query])
        aggregate_distribution = {
            grade: sum(item["distribution"][grade] for item in per_query)
            for grade in (RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT)
        }

        batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
        report_dir = ensure_dir(self.artifact_root / "batch_reports")

        # Snapshot the configuration of the search service that was actually
        # evaluated; fall back to the historical default when the instance
        # carries no base URL.
        base_url = str(getattr(self, "search_base_url", "http://localhost:6002")).rstrip("/")
        config_snapshot_path = report_dir / f"{batch_id}_config.json"
        config_snapshot = requests.get(f"{base_url}/admin/config", timeout=20).json()
        config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")

        output_json_path = report_dir / f"{batch_id}.json"
        report_md_path = report_dir / f"{batch_id}.md"
        payload = {
            "batch_id": batch_id,
            "created_at": utc_now_iso(),
            "tenant_id": self.tenant_id,
            "queries": list(queries),
            "top_k": top_k,
            "aggregate_metrics": aggregate,
            "aggregate_distribution": aggregate_distribution,
            "per_query": per_query,
            "config_snapshot_path": str(config_snapshot_path),
        }
        output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
        self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
        return payload