# eval_search_quality.py (8.1 KB) — stray file-listing header from extraction; commented out so the module parses
#!/usr/bin/env python3
"""
Run search quality evaluation against real tenant indexes and emit JSON/Markdown reports.

Usage:
  source activate.sh
  python scripts/eval_search_quality.py
"""

from __future__ import annotations

import json
import sys
import zlib
from dataclasses import asdict, dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List

# Make the repository root importable when this script is run directly
# (e.g. `python scripts/eval_search_quality.py`), so the project-local
# `api` and `context` packages below resolve without installation.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from api.app import get_searcher, init_service
from context import create_request_context


# Default evaluation query sets, keyed by tenant id. The queries mix
# Chinese-only, English-only, and bilingual phrasings to exercise
# cross-lingual retrieval.
# NOTE(review): tenant "0" looks like a general apparel catalog and "162"
# like a doll/toy apparel catalog — inferred from query wording; confirm
# against the actual tenant indexes.
DEFAULT_QUERIES_BY_TENANT: Dict[str, List[str]] = {
    "0": [
        "连衣裙",
        "dress",
        "dress 连衣裙",
        "maxi dress 长裙",
        "波西米亚连衣裙",
        "T恤",
        "graphic tee 图案T恤",
        "shirt",
        "礼服衬衫",
        "hoodie 卫衣",
        "连帽卫衣",
        "sweatshirt",
        "牛仔裤",
        "jeans",
        "阔腿牛仔裤",
        "毛衣 sweater",
        "cardigan 开衫",
        "jacket 外套",
        "puffer jacket 羽绒服",
        "飞行员夹克",
    ],
    "162": [
        "连衣裙",
        "dress",
        "dress 连衣裙",
        "T恤",
        "shirt",
        "hoodie 卫衣",
        "牛仔裤",
        "jeans",
        "毛衣 sweater",
        "jacket 外套",
        "娃娃衣服",
        "芭比裙子",
        "连衣短裙芭比",
        "公主大裙",
        "晚礼服芭比",
        "毛衣熊",
        "服饰饰品",
        "鞋子",
        "军人套",
        "陆军套",
    ],
}


@dataclass
class RankedItem:
    """One ranked search hit joined with its per-stage debug scores.

    Score fields are taken from the searcher's per-result debug payload
    and are None when that stage emitted no value for the hit.
    """

    rank: int  # 1-based position in the returned result list
    spu_id: str
    title: str  # display text, zh preferred (see _pick_text)
    vendor: str
    es_score: float | None
    rerank_score: float | None
    text_score: float | None
    text_source_score: float | None
    text_translation_score: float | None
    text_fallback_score: float | None
    text_primary_score: float | None
    text_support_score: float | None
    knn_score: float | None
    fused_score: float | None
    matched_queries: Any  # raw debug payload, shape unknown here — passed through as-is


def _pick_text(value: Any, language: str = "zh") -> str:
    if value is None:
        return ""
    if isinstance(value, dict):
        return str(value.get(language) or value.get("zh") or value.get("en") or "").strip()
    return str(value).strip()


def _to_float(value: Any) -> float | None:
    try:
        if value is None:
            return None
        return float(value)
    except (TypeError, ValueError):
        return None


def _evaluate_query(searcher, tenant_id: str, query: str) -> Dict[str, Any]:
    """Run one search with debug enabled and package the top-20 for reporting.

    Args:
        searcher: Service searcher exposing ``search(...)``.
        tenant_id: Tenant whose index is queried.
        query: Raw query text.

    Returns:
        A JSON-serializable dict with totals, timings, the query-analysis
        debug block, and the top-20 hits joined with their per-stage
        debug scores.
    """
    # zlib.crc32 is stable across processes, unlike built-in hash() which
    # is salted by PYTHONHASHSEED — this keeps reqids reproducible between
    # eval runs so the same query maps to the same reqid in service logs.
    query_digest = zlib.crc32(query.encode("utf-8")) % 1000000
    context = create_request_context(
        reqid=f"eval-{tenant_id}-{query_digest}",
        uid="codex",
    )
    result = searcher.search(
        query=query,
        tenant_id=tenant_id,
        size=20,
        from_=0,
        context=context,
        debug=True,
        language="zh",
        enable_rerank=True,
    )

    # Index the per-result debug payload by spu_id so each hit can be
    # joined with its stage scores below.
    per_result_debug = ((result.debug_info or {}).get("per_result") or [])
    debug_by_spu_id = {
        str(item.get("spu_id")): item
        for item in per_result_debug
        if isinstance(item, dict) and item.get("spu_id") is not None
    }

    ranked_items: List[RankedItem] = []
    for rank, spu in enumerate(result.results[:20], 1):
        spu_id = str(getattr(spu, "spu_id", ""))
        # Missing debug entry degrades to all-None scores rather than failing.
        debug_item = debug_by_spu_id.get(spu_id, {})
        ranked_items.append(
            RankedItem(
                rank=rank,
                spu_id=spu_id,
                title=_pick_text(getattr(spu, "title", None), language="zh"),
                vendor=_pick_text(getattr(spu, "vendor", None), language="zh"),
                es_score=_to_float(debug_item.get("es_score")),
                rerank_score=_to_float(debug_item.get("rerank_score")),
                text_score=_to_float(debug_item.get("text_score")),
                text_source_score=_to_float(debug_item.get("text_source_score")),
                text_translation_score=_to_float(debug_item.get("text_translation_score")),
                text_fallback_score=_to_float(debug_item.get("text_fallback_score")),
                text_primary_score=_to_float(debug_item.get("text_primary_score")),
                text_support_score=_to_float(debug_item.get("text_support_score")),
                knn_score=_to_float(debug_item.get("knn_score")),
                fused_score=_to_float(debug_item.get("fused_score")),
                matched_queries=debug_item.get("matched_queries"),
            )
        )

    return {
        "query": query,
        "tenant_id": tenant_id,
        "total": result.total,
        "max_score": result.max_score,
        "took_ms": result.took_ms,
        "query_analysis": ((result.debug_info or {}).get("query_analysis") or {}),
        "stage_timings": ((result.debug_info or {}).get("stage_timings") or {}),
        "top20": [asdict(item) for item in ranked_items],
    }


def _render_markdown(report: Dict[str, Any]) -> str:
    lines: List[str] = []
    lines.append(f"# Search Quality Evaluation")
    lines.append("")
    lines.append(f"- Generated at: {report['generated_at']}")
    lines.append(f"- Queries per tenant: {report['queries_per_tenant']}")
    lines.append("")
    for tenant_id, entries in report["tenants"].items():
        lines.append(f"## Tenant {tenant_id}")
        lines.append("")
        for entry in entries:
            qa = entry.get("query_analysis") or {}
            lines.append(f"### Query: {entry['query']}")
            lines.append("")
            lines.append(
                f"- total={entry['total']} max_score={entry['max_score']:.6f} took_ms={entry['took_ms']}"
            )
            lines.append(
                f"- detected_language={qa.get('detected_language')} search_langs={qa.get('search_langs')} supplemental_search_langs={qa.get('supplemental_search_langs')}"
            )
            lines.append(f"- query_text_by_lang={qa.get('query_text_by_lang')}")
            lines.append("")
            lines.append("| rank | spu_id | title | fused | rerank | text | text_src | text_trans | text_fb | knn | es | matched_queries |")
            lines.append("| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |")
            for item in entry.get("top20", []):
                title = str(item.get("title", "")).replace("|", "/")
                matched = json.dumps(item.get("matched_queries"), ensure_ascii=False)
                matched = matched.replace("|", "/")
                lines.append(
                    f"| {item.get('rank')} | {item.get('spu_id')} | {title} | "
                    f"{item.get('fused_score')} | {item.get('rerank_score')} | {item.get('text_score')} | "
                    f"{item.get('text_source_score')} | {item.get('text_translation_score')} | "
                    f"{item.get('text_fallback_score')} | {item.get('knn_score')} | {item.get('es_score')} | {matched} |"
                )
            lines.append("")
    return "\n".join(lines)


def main() -> None:
    """Evaluate every configured tenant/query pair and write report files.

    Initializes the service against the local Elasticsearch endpoint, runs
    all queries in DEFAULT_QUERIES_BY_TENANT, then writes a timestamped
    JSON + Markdown report pair under artifacts/search_eval/.
    """
    init_service("http://localhost:9200")
    searcher = get_searcher()

    tenants_report: Dict[str, List[Dict[str, Any]]] = {}
    for tenant_id, queries in DEFAULT_QUERIES_BY_TENANT.items():
        collected: List[Dict[str, Any]] = []
        for query in queries:
            # Progress line so long runs show which query is in flight.
            print(f"[eval] tenant={tenant_id} query={query}")
            collected.append(_evaluate_query(searcher, tenant_id, query))
        tenants_report[tenant_id] = collected

    report = {
        "generated_at": datetime.now(timezone.utc).isoformat(),
        "queries_per_tenant": {
            tenant: len(tenant_queries)
            for tenant, tenant_queries in DEFAULT_QUERIES_BY_TENANT.items()
        },
        "tenants": tenants_report,
    }

    out_dir = Path("artifacts/search_eval")
    out_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
    json_path = out_dir / f"search_eval_{stamp}.json"
    md_path = out_dir / f"search_eval_{stamp}.md"
    json_path.write_text(json.dumps(report, ensure_ascii=False, indent=2), encoding="utf-8")
    md_path.write_text(_render_markdown(report), encoding="utf-8")
    print(f"[done] json={json_path}")
    print(f"[done] md={md_path}")


if __name__ == "__main__":
    # Run only when executed as a script, not on import.
    main()