diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index 83adc0d..5621dbd 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -36,16 +36,19 @@ The framework supports four related tasks: - `README_Requirement.md` Requirement reference document. - `quick_start_eval.sh` - Optional wrapper to run the batch refresh or the web UI from repo root (uses `./.venv/bin/python`). + Optional wrapper: `batch` (fill missing labels only), `batch-rebuild` (force full re-label), or `serve` (web UI). ## Quick start (from repo root) -Set tenant if needed (`export TENANT_ID=163`). Requires live search API, DashScope key for LLM when labeling, and for batch refresh a working backend. +Set tenant if needed (`export TENANT_ID=163`). Requires live search API, DashScope key when the batch step needs new LLM labels, and a working backend. ```bash -# 1) Refresh offline labels for every line in the queries file, then write batch metrics under artifacts/ +# 1) Batch evaluation: every query in the file gets a live search; only uncached (query, spu_id) pairs call the LLM ./scripts/evaluation/quick_start_eval.sh batch +# Optional: full re-label of current top_k recall (expensive; use only when you intentionally rebuild the cache) +./scripts/evaluation/quick_start_eval.sh batch-rebuild + # 2) Evaluation UI on http://127.0.0.1:6010/ ./scripts/evaluation/quick_start_eval.sh serve ``` @@ -53,6 +56,15 @@ Set tenant if needed (`export TENANT_ID=163`). Requires live search API, DashSco Equivalent explicit commands: ```bash +# Safe default: no --force-refresh-labels +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ + --tenant-id "${TENANT_ID:-163}" \ + --queries-file scripts/evaluation/queries/queries.txt \ + --top-k 50 \ + --language en \ + --labeler-mode simple + +# Rebuild all labels for recalled top_k (same as quick_start_eval.sh batch-rebuild) ./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ --tenant-id "${TENANT_ID:-163}" \ --queries-file scripts/evaluation/queries/queries.txt \ @@ -135,7 +147,7 @@ There are now two labeler modes: For practical evaluation, the most important offline step is to pre-label the result window you plan to score. For the current metrics (`P@5`, `P@10`, `P@20`, `P@50`, `MAP_3`, `MAP_2_3`), a `top_k=50` cached label set is sufficient. -Example: +Example (fills missing labels only; recommended default): ```bash ./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ @@ -143,10 +155,11 @@ Example: --queries-file scripts/evaluation/queries/queries.txt \ --top-k 50 \ --language en \ - --labeler-mode simple \ - --force-refresh-labels + --labeler-mode simple ``` +To **rebuild** every label for the current `top_k` recall window (all queries, all hits re-sent to the LLM), add `--force-refresh-labels` or run `./scripts/evaluation/quick_start_eval.sh batch-rebuild`. + This command does two things: - runs **every** query in the file against the live backend (no skip list) diff --git a/scripts/evaluation/eval_framework.py b/scripts/evaluation/eval_framework.py index af0be96..834b566 100644 --- a/scripts/evaluation/eval_framework.py +++ b/scripts/evaluation/eval_framework.py @@ -510,6 +510,27 @@ class EvalStore: ) return items + def get_batch_run(self, batch_id: str) -> Optional[Dict[str, Any]]: + row = self.conn.execute( + """ + SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at + FROM batch_runs + WHERE batch_id = ? + """, + (batch_id,), + ).fetchone() + if row is None: + return None + return { + "batch_id": row["batch_id"], + "tenant_id": row["tenant_id"], + "output_json_path": row["output_json_path"], + "report_markdown_path": row["report_markdown_path"], + "config_snapshot_path": row["config_snapshot_path"], + "metadata": json.loads(row["metadata_json"]), + "created_at": row["created_at"], + } + def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]: rows = self.conn.execute( """ @@ -1581,6 +1602,27 @@ def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFA def api_history() -> Dict[str, Any]: return {"history": framework.store.list_batch_runs(limit=20)} + @app.get("/api/history/{batch_id}/report") + def api_history_report(batch_id: str) -> Dict[str, Any]: + row = framework.store.get_batch_run(batch_id) + if row is None: + raise HTTPException(status_code=404, detail="Unknown batch_id") + report_path = Path(row["report_markdown_path"]).resolve() + root = framework.artifact_root.resolve() + try: + report_path.relative_to(root) + except ValueError: + raise HTTPException(status_code=403, detail="Report path is outside artifact root") + if not report_path.is_file(): + raise HTTPException(status_code=404, detail="Report file not found") + return { + "batch_id": row["batch_id"], + "created_at": row["created_at"], + "tenant_id": row["tenant_id"], + "report_markdown_path": str(report_path), + "markdown": report_path.read_text(encoding="utf-8"), + } + return app @@ -1612,7 +1654,11 @@ WEB_APP_HTML = """ h1, h2 { margin: 0 0 12px; } .muted { color: var(--muted); } .query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; } - .query-item { display: block; width: 100%; border: 0; background: transparent; text-align: left; padding: 10px 12px; border-radius: 10px; cursor: pointer; } + .query-item { + display: block; width: 100%; border: 0; background: transparent; text-align: left; + padding: 10px 12px; border-radius: 10px; cursor: pointer; + color: var(--ink); font-size: 15px; font-weight: 500; + } .query-item:hover { background: #eef6f4; } .toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; } input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; } @@ -1634,6 +1680,49 @@ WEB_APP_HTML = """ .options { color: var(--muted); line-height: 1.5; font-size: 14px; } .section { margin-bottom: 28px; } .history { font-size: 13px; line-height: 1.5; } + .history-list { max-height: 42vh; overflow: auto; display: flex; flex-direction: column; gap: 8px; margin-top: 8px; } + .history-item { + display: block; width: 100%; border: 1px solid var(--line); background: var(--panel); + text-align: left; padding: 10px 12px; border-radius: 12px; cursor: pointer; + color: var(--ink); font-size: 13px; transition: background 0.15s, border-color 0.15s, box-shadow 0.15s; + } + .history-item:hover { background: #eef6f4; border-color: #b8d4cd; } + .history-item:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; } + .history-item .hid { font-weight: 700; font-size: 12px; word-break: break-all; color: #12433d; } + .history-item .hmeta { color: var(--muted); font-size: 11px; margin-top: 4px; } + .history-item .hstats { margin-top: 6px; font-size: 12px; color: var(--ink); line-height: 1.45; } + .history-item .hstats span { color: var(--muted); } + .report-modal-root { + position: fixed; inset: 0; z-index: 200; display: none; align-items: center; justify-content: center; + padding: 16px; box-sizing: border-box; + } + .report-modal-root.is-open { display: flex; } + .report-modal-backdrop { position: absolute; inset: 0; background: rgba(31, 42, 36, 0.45); backdrop-filter: blur(4px); } + .report-modal-dialog { + position: relative; z-index: 1; width: min(920px, 100%); max-height: min(92vh, 900px); display: flex; flex-direction: column; + background: var(--panel); border: 1px solid var(--line); border-radius: 18px; + box-shadow: 0 24px 48px rgba(31, 42, 36, 0.18); + } + .report-modal-head { + flex: 0 0 auto; display: flex; align-items: flex-start; justify-content: space-between; gap: 12px; + padding: 16px 18px; border-bottom: 1px solid var(--line); + } + .report-modal-head h3 { margin: 0; font-size: 15px; font-weight: 700; word-break: break-all; } + .report-modal-head .head-actions { display: flex; gap: 8px; flex-shrink: 0; } + .report-modal-head button { padding: 8px 12px; font-size: 13px; border-radius: 10px; } + .report-modal-meta { flex: 0 0 auto; padding: 10px 18px; font-size: 12px; border-bottom: 1px solid var(--line); background: rgba(255,253,248,0.9); } + .report-modal-body { + flex: 1 1 auto; overflow: auto; padding: 18px 22px 22px; + font-size: 14px; line-height: 1.55; + } + .batch-report-md h1 { font-size: 1.35rem; margin: 0 0 0.75rem; color: #12433d; } + .batch-report-md h2 { font-size: 1.05rem; margin: 1.35rem 0 0.6rem; padding-bottom: 0.35rem; border-bottom: 1px solid var(--line); color: #1a5249; } + .batch-report-md h2:first-of-type { margin-top: 0; } + .batch-report-md h3 { font-size: 0.95rem; margin: 1rem 0 0.4rem; color: var(--ink); font-weight: 700; } + .batch-report-md ul { margin: 0.35rem 0 0.5rem; padding-left: 1.25rem; } + .batch-report-md li { margin: 0.2rem 0; } + .batch-report-md code { font-size: 0.88em; background: #e8e4d8; padding: 0.12em 0.35em; border-radius: 4px; } + .report-modal-body.report-modal-loading, .report-modal-body.report-modal-error { color: var(--muted); font-style: italic; } .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; } .tip { margin-bottom: 6px; color: var(--muted); } @@ -1646,6 +1735,7 @@ WEB_APP_HTML = """

History

+

Click a run to open the batch markdown report.

Loading...
@@ -1676,6 +1766,22 @@ WEB_APP_HTML = """ + + +