Commit f8e7cb97c9ece410767366f5f328166bf353a270
1 parent
881d338b
evaluation framework
Showing
3 changed files
with
223 additions
and
15 deletions
Show diff stats
scripts/evaluation/README.md
| @@ -36,16 +36,19 @@ The framework supports four related tasks: | @@ -36,16 +36,19 @@ The framework supports four related tasks: | ||
| 36 | - `README_Requirement.md` | 36 | - `README_Requirement.md` |
| 37 | Requirement reference document. | 37 | Requirement reference document. |
| 38 | - `quick_start_eval.sh` | 38 | - `quick_start_eval.sh` |
| 39 | - Optional wrapper to run the batch refresh or the web UI from repo root (uses `./.venv/bin/python`). | 39 | + Optional wrapper: `batch` (fill missing labels only), `batch-rebuild` (force full re-label), or `serve` (web UI). |
| 40 | 40 | ||
| 41 | ## Quick start (from repo root) | 41 | ## Quick start (from repo root) |
| 42 | 42 | ||
| 43 | -Set tenant if needed (`export TENANT_ID=163`). Requires live search API, DashScope key for LLM when labeling, and for batch refresh a working backend. | 43 | +Set tenant if needed (`export TENANT_ID=163`). Requires live search API, DashScope key when the batch step needs new LLM labels, and a working backend. |
| 44 | 44 | ||
| 45 | ```bash | 45 | ```bash |
| 46 | -# 1) Refresh offline labels for every line in the queries file, then write batch metrics under artifacts/ | 46 | +# 1) Batch evaluation: every query in the file gets a live search; only uncached (query, spu_id) pairs call the LLM |
| 47 | ./scripts/evaluation/quick_start_eval.sh batch | 47 | ./scripts/evaluation/quick_start_eval.sh batch |
| 48 | 48 | ||
| 49 | +# Optional: full re-label of current top_k recall (expensive; use only when you intentionally rebuild the cache) | ||
| 50 | +./scripts/evaluation/quick_start_eval.sh batch-rebuild | ||
| 51 | + | ||
| 49 | # 2) Evaluation UI on http://127.0.0.1:6010/ | 52 | # 2) Evaluation UI on http://127.0.0.1:6010/ |
| 50 | ./scripts/evaluation/quick_start_eval.sh serve | 53 | ./scripts/evaluation/quick_start_eval.sh serve |
| 51 | ``` | 54 | ``` |
| @@ -53,6 +56,15 @@ Set tenant if needed (`export TENANT_ID=163`). Requires live search API, DashSco | @@ -53,6 +56,15 @@ Set tenant if needed (`export TENANT_ID=163`). Requires live search API, DashSco | ||
| 53 | Equivalent explicit commands: | 56 | Equivalent explicit commands: |
| 54 | 57 | ||
| 55 | ```bash | 58 | ```bash |
| 59 | +# Safe default: no --force-refresh-labels | ||
| 60 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ | ||
| 61 | + --tenant-id "${TENANT_ID:-163}" \ | ||
| 62 | + --queries-file scripts/evaluation/queries/queries.txt \ | ||
| 63 | + --top-k 50 \ | ||
| 64 | + --language en \ | ||
| 65 | + --labeler-mode simple | ||
| 66 | + | ||
| 67 | +# Rebuild all labels for recalled top_k (same as quick_start_eval.sh batch-rebuild) | ||
| 56 | ./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ | 68 | ./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ |
| 57 | --tenant-id "${TENANT_ID:-163}" \ | 69 | --tenant-id "${TENANT_ID:-163}" \ |
| 58 | --queries-file scripts/evaluation/queries/queries.txt \ | 70 | --queries-file scripts/evaluation/queries/queries.txt \ |
| @@ -135,7 +147,7 @@ There are now two labeler modes: | @@ -135,7 +147,7 @@ There are now two labeler modes: | ||
| 135 | 147 | ||
| 136 | For practical evaluation, the most important offline step is to pre-label the result window you plan to score. For the current metrics (`P@5`, `P@10`, `P@20`, `P@50`, `MAP_3`, `MAP_2_3`), a `top_k=50` cached label set is sufficient. | 148 | For practical evaluation, the most important offline step is to pre-label the result window you plan to score. For the current metrics (`P@5`, `P@10`, `P@20`, `P@50`, `MAP_3`, `MAP_2_3`), a `top_k=50` cached label set is sufficient. |
| 137 | 149 | ||
| 138 | -Example: | 150 | +Example (fills missing labels only; recommended default): |
| 139 | 151 | ||
| 140 | ```bash | 152 | ```bash |
| 141 | ./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ | 153 | ./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ |
| @@ -143,10 +155,11 @@ Example: | @@ -143,10 +155,11 @@ Example: | ||
| 143 | --queries-file scripts/evaluation/queries/queries.txt \ | 155 | --queries-file scripts/evaluation/queries/queries.txt \ |
| 144 | --top-k 50 \ | 156 | --top-k 50 \ |
| 145 | --language en \ | 157 | --language en \ |
| 146 | - --labeler-mode simple \ | ||
| 147 | - --force-refresh-labels | 158 | + --labeler-mode simple |
| 148 | ``` | 159 | ``` |
| 149 | 160 | ||
| 161 | +To **rebuild** every label for the current `top_k` recall window (all queries, all hits re-sent to the LLM), add `--force-refresh-labels` or run `./scripts/evaluation/quick_start_eval.sh batch-rebuild`. | ||
| 162 | + | ||
| 150 | This command does two things: | 163 | This command does two things: |
| 151 | 164 | ||
| 152 | - runs **every** query in the file against the live backend (no skip list) | 165 | - runs **every** query in the file against the live backend (no skip list) |
scripts/evaluation/eval_framework.py
| @@ -510,6 +510,27 @@ class EvalStore: | @@ -510,6 +510,27 @@ class EvalStore: | ||
| 510 | ) | 510 | ) |
| 511 | return items | 511 | return items |
| 512 | 512 | ||
| 513 | + def get_batch_run(self, batch_id: str) -> Optional[Dict[str, Any]]: | ||
| 514 | + row = self.conn.execute( | ||
| 515 | + """ | ||
| 516 | + SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at | ||
| 517 | + FROM batch_runs | ||
| 518 | + WHERE batch_id = ? | ||
| 519 | + """, | ||
| 520 | + (batch_id,), | ||
| 521 | + ).fetchone() | ||
| 522 | + if row is None: | ||
| 523 | + return None | ||
| 524 | + return { | ||
| 525 | + "batch_id": row["batch_id"], | ||
| 526 | + "tenant_id": row["tenant_id"], | ||
| 527 | + "output_json_path": row["output_json_path"], | ||
| 528 | + "report_markdown_path": row["report_markdown_path"], | ||
| 529 | + "config_snapshot_path": row["config_snapshot_path"], | ||
| 530 | + "metadata": json.loads(row["metadata_json"]), | ||
| 531 | + "created_at": row["created_at"], | ||
| 532 | + } | ||
| 533 | + | ||
| 513 | def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]: | 534 | def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]: |
| 514 | rows = self.conn.execute( | 535 | rows = self.conn.execute( |
| 515 | """ | 536 | """ |
| @@ -1581,6 +1602,27 @@ def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFA | @@ -1581,6 +1602,27 @@ def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFA | ||
| 1581 | def api_history() -> Dict[str, Any]: | 1602 | def api_history() -> Dict[str, Any]: |
| 1582 | return {"history": framework.store.list_batch_runs(limit=20)} | 1603 | return {"history": framework.store.list_batch_runs(limit=20)} |
| 1583 | 1604 | ||
| 1605 | + @app.get("/api/history/{batch_id}/report") | ||
| 1606 | + def api_history_report(batch_id: str) -> Dict[str, Any]: | ||
| 1607 | + row = framework.store.get_batch_run(batch_id) | ||
| 1608 | + if row is None: | ||
| 1609 | + raise HTTPException(status_code=404, detail="Unknown batch_id") | ||
| 1610 | + report_path = Path(row["report_markdown_path"]).resolve() | ||
| 1611 | + root = framework.artifact_root.resolve() | ||
| 1612 | + try: | ||
| 1613 | + report_path.relative_to(root) | ||
| 1614 | + except ValueError: | ||
| 1615 | + raise HTTPException(status_code=403, detail="Report path is outside artifact root") | ||
| 1616 | + if not report_path.is_file(): | ||
| 1617 | + raise HTTPException(status_code=404, detail="Report file not found") | ||
| 1618 | + return { | ||
| 1619 | + "batch_id": row["batch_id"], | ||
| 1620 | + "created_at": row["created_at"], | ||
| 1621 | + "tenant_id": row["tenant_id"], | ||
| 1622 | + "report_markdown_path": str(report_path), | ||
| 1623 | + "markdown": report_path.read_text(encoding="utf-8"), | ||
| 1624 | + } | ||
| 1625 | + | ||
| 1584 | return app | 1626 | return app |
| 1585 | 1627 | ||
| 1586 | 1628 | ||
| @@ -1612,7 +1654,11 @@ WEB_APP_HTML = """ | @@ -1612,7 +1654,11 @@ WEB_APP_HTML = """ | ||
| 1612 | h1, h2 { margin: 0 0 12px; } | 1654 | h1, h2 { margin: 0 0 12px; } |
| 1613 | .muted { color: var(--muted); } | 1655 | .muted { color: var(--muted); } |
| 1614 | .query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; } | 1656 | .query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; } |
| 1615 | - .query-item { display: block; width: 100%; border: 0; background: transparent; text-align: left; padding: 10px 12px; border-radius: 10px; cursor: pointer; } | 1657 | + .query-item { |
| 1658 | + display: block; width: 100%; border: 0; background: transparent; text-align: left; | ||
| 1659 | + padding: 10px 12px; border-radius: 10px; cursor: pointer; | ||
| 1660 | + color: var(--ink); font-size: 15px; font-weight: 500; | ||
| 1661 | + } | ||
| 1616 | .query-item:hover { background: #eef6f4; } | 1662 | .query-item:hover { background: #eef6f4; } |
| 1617 | .toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; } | 1663 | .toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; } |
| 1618 | input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; } | 1664 | input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; } |
| @@ -1634,6 +1680,49 @@ WEB_APP_HTML = """ | @@ -1634,6 +1680,49 @@ WEB_APP_HTML = """ | ||
| 1634 | .options { color: var(--muted); line-height: 1.5; font-size: 14px; } | 1680 | .options { color: var(--muted); line-height: 1.5; font-size: 14px; } |
| 1635 | .section { margin-bottom: 28px; } | 1681 | .section { margin-bottom: 28px; } |
| 1636 | .history { font-size: 13px; line-height: 1.5; } | 1682 | .history { font-size: 13px; line-height: 1.5; } |
| 1683 | + .history-list { max-height: 42vh; overflow: auto; display: flex; flex-direction: column; gap: 8px; margin-top: 8px; } | ||
| 1684 | + .history-item { | ||
| 1685 | + display: block; width: 100%; border: 1px solid var(--line); background: var(--panel); | ||
| 1686 | + text-align: left; padding: 10px 12px; border-radius: 12px; cursor: pointer; | ||
| 1687 | + color: var(--ink); font-size: 13px; transition: background 0.15s, border-color 0.15s, box-shadow 0.15s; | ||
| 1688 | + } | ||
| 1689 | + .history-item:hover { background: #eef6f4; border-color: #b8d4cd; } | ||
| 1690 | + .history-item:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; } | ||
| 1691 | + .history-item .hid { font-weight: 700; font-size: 12px; word-break: break-all; color: #12433d; } | ||
| 1692 | + .history-item .hmeta { color: var(--muted); font-size: 11px; margin-top: 4px; } | ||
| 1693 | + .history-item .hstats { margin-top: 6px; font-size: 12px; color: var(--ink); line-height: 1.45; } | ||
| 1694 | + .history-item .hstats span { color: var(--muted); } | ||
| 1695 | + .report-modal-root { | ||
| 1696 | + position: fixed; inset: 0; z-index: 200; display: none; align-items: center; justify-content: center; | ||
| 1697 | + padding: 16px; box-sizing: border-box; | ||
| 1698 | + } | ||
| 1699 | + .report-modal-root.is-open { display: flex; } | ||
| 1700 | + .report-modal-backdrop { position: absolute; inset: 0; background: rgba(31, 42, 36, 0.45); backdrop-filter: blur(4px); } | ||
| 1701 | + .report-modal-dialog { | ||
| 1702 | + position: relative; z-index: 1; width: min(920px, 100%); max-height: min(92vh, 900px); display: flex; flex-direction: column; | ||
| 1703 | + background: var(--panel); border: 1px solid var(--line); border-radius: 18px; | ||
| 1704 | + box-shadow: 0 24px 48px rgba(31, 42, 36, 0.18); | ||
| 1705 | + } | ||
| 1706 | + .report-modal-head { | ||
| 1707 | + flex: 0 0 auto; display: flex; align-items: flex-start; justify-content: space-between; gap: 12px; | ||
| 1708 | + padding: 16px 18px; border-bottom: 1px solid var(--line); | ||
| 1709 | + } | ||
| 1710 | + .report-modal-head h3 { margin: 0; font-size: 15px; font-weight: 700; word-break: break-all; } | ||
| 1711 | + .report-modal-head .head-actions { display: flex; gap: 8px; flex-shrink: 0; } | ||
| 1712 | + .report-modal-head button { padding: 8px 12px; font-size: 13px; border-radius: 10px; } | ||
| 1713 | + .report-modal-meta { flex: 0 0 auto; padding: 10px 18px; font-size: 12px; border-bottom: 1px solid var(--line); background: rgba(255,253,248,0.9); } | ||
| 1714 | + .report-modal-body { | ||
| 1715 | + flex: 1 1 auto; overflow: auto; padding: 18px 22px 22px; | ||
| 1716 | + font-size: 14px; line-height: 1.55; | ||
| 1717 | + } | ||
| 1718 | + .batch-report-md h1 { font-size: 1.35rem; margin: 0 0 0.75rem; color: #12433d; } | ||
| 1719 | + .batch-report-md h2 { font-size: 1.05rem; margin: 1.35rem 0 0.6rem; padding-bottom: 0.35rem; border-bottom: 1px solid var(--line); color: #1a5249; } | ||
| 1720 | + .batch-report-md h2:first-of-type { margin-top: 0; } | ||
| 1721 | + .batch-report-md h3 { font-size: 0.95rem; margin: 1rem 0 0.4rem; color: var(--ink); font-weight: 700; } | ||
| 1722 | + .batch-report-md ul { margin: 0.35rem 0 0.5rem; padding-left: 1.25rem; } | ||
| 1723 | + .batch-report-md li { margin: 0.2rem 0; } | ||
| 1724 | + .batch-report-md code { font-size: 0.88em; background: #e8e4d8; padding: 0.12em 0.35em; border-radius: 4px; } | ||
| 1725 | + .report-modal-body.report-modal-loading, .report-modal-body.report-modal-error { color: var(--muted); font-style: italic; } | ||
| 1637 | .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; } | 1726 | .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; } |
| 1638 | .tip { margin-bottom: 6px; color: var(--muted); } | 1727 | .tip { margin-bottom: 6px; color: var(--muted); } |
| 1639 | </style> | 1728 | </style> |
| @@ -1646,6 +1735,7 @@ WEB_APP_HTML = """ | @@ -1646,6 +1735,7 @@ WEB_APP_HTML = """ | ||
| 1646 | <div id="queryList" class="query-list"></div> | 1735 | <div id="queryList" class="query-list"></div> |
| 1647 | <div class="section"> | 1736 | <div class="section"> |
| 1648 | <h2>History</h2> | 1737 | <h2>History</h2> |
| 1738 | + <p class="muted" style="font-size:12px;margin:0 0 4px">Click a run to open the batch markdown report.</p> | ||
| 1649 | <div id="history" class="history muted">Loading...</div> | 1739 | <div id="history" class="history muted">Loading...</div> |
| 1650 | </div> | 1740 | </div> |
| 1651 | </aside> | 1741 | </aside> |
| @@ -1676,6 +1766,22 @@ WEB_APP_HTML = """ | @@ -1676,6 +1766,22 @@ WEB_APP_HTML = """ | ||
| 1676 | </section> | 1766 | </section> |
| 1677 | </main> | 1767 | </main> |
| 1678 | </div> | 1768 | </div> |
| 1769 | + <div id="reportModal" class="report-modal-root" aria-hidden="true"> | ||
| 1770 | + <div class="report-modal-backdrop" data-close-report="1"></div> | ||
| 1771 | + <div class="report-modal-dialog" role="dialog" aria-modal="true" aria-labelledby="reportModalTitle"> | ||
| 1772 | + <div class="report-modal-head"> | ||
| 1773 | + <h3 id="reportModalTitle">Batch report</h3> | ||
| 1774 | + <div class="head-actions"> | ||
| 1775 | + <button type="button" class="secondary" id="reportCopyPath">Copy path</button> | ||
| 1776 | + <button type="button" onclick="closeReportModal()">Close</button> | ||
| 1777 | + </div> | ||
| 1778 | + </div> | ||
| 1779 | + <div id="reportModalMeta" class="report-modal-meta muted"></div> | ||
| 1780 | + <div id="reportModalBody" class="report-modal-body batch-report-md"></div> | ||
| 1781 | + </div> | ||
| 1782 | + </div> | ||
| 1783 | + <script src="https://cdn.jsdelivr.net/npm/marked@12.0.2/marked.min.js"></script> | ||
| 1784 | + <script src="https://cdn.jsdelivr.net/npm/dompurify@3.1.6/dist/purify.min.js"></script> | ||
| 1679 | <script> | 1785 | <script> |
| 1680 | async function fetchJSON(url, options) { | 1786 | async function fetchJSON(url, options) { |
| 1681 | const res = await fetch(url, options); | 1787 | const res = await fetch(url, options); |
| @@ -1738,13 +1844,93 @@ WEB_APP_HTML = """ | @@ -1738,13 +1844,93 @@ WEB_APP_HTML = """ | ||
| 1738 | root.appendChild(btn); | 1844 | root.appendChild(btn); |
| 1739 | }); | 1845 | }); |
| 1740 | } | 1846 | } |
| 1847 | + function fmtMetric(m, key, digits) { | ||
| 1848 | + const v = m && m[key]; | ||
| 1849 | + if (v == null || Number.isNaN(Number(v))) return null; | ||
| 1850 | + const n = Number(v); | ||
| 1851 | + return n.toFixed(digits); | ||
| 1852 | + } | ||
| 1853 | + function historySummaryHtml(meta) { | ||
| 1854 | + const m = meta && meta.aggregate_metrics; | ||
| 1855 | + const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null; | ||
| 1856 | + const parts = []; | ||
| 1857 | + if (nq != null) parts.push(`<span>Queries</span> ${nq}`); | ||
| 1858 | + const p10 = fmtMetric(m, 'P@10', 3); | ||
| 1859 | + const p52 = fmtMetric(m, 'P@5_2_3', 3); | ||
| 1860 | + const map3 = fmtMetric(m, 'MAP_3', 3); | ||
| 1861 | + if (p10) parts.push(`<span>P@10</span> ${p10}`); | ||
| 1862 | + if (p52) parts.push(`<span>P@5_2_3</span> ${p52}`); | ||
| 1863 | + if (map3) parts.push(`<span>MAP_3</span> ${map3}`); | ||
| 1864 | + if (!parts.length) return ''; | ||
| 1865 | + return `<div class="hstats">${parts.join(' · ')}</div>`; | ||
| 1866 | + } | ||
| 1741 | async function loadHistory() { | 1867 | async function loadHistory() { |
| 1742 | const data = await fetchJSON('/api/history'); | 1868 | const data = await fetchJSON('/api/history'); |
| 1743 | const root = document.getElementById('history'); | 1869 | const root = document.getElementById('history'); |
| 1744 | - root.innerHTML = (data.history || []).map(item => | ||
| 1745 | - `<div><strong>${item.batch_id}</strong><br/>${item.created_at}<br/>${item.report_markdown_path}</div><br/>` | ||
| 1746 | - ).join('') || 'No history yet.'; | 1870 | + root.classList.remove('muted'); |
| 1871 | + const items = data.history || []; | ||
| 1872 | + if (!items.length) { | ||
| 1873 | + root.innerHTML = '<span class="muted">No history yet.</span>'; | ||
| 1874 | + return; | ||
| 1875 | + } | ||
| 1876 | + root.innerHTML = `<div class="history-list"></div>`; | ||
| 1877 | + const list = root.querySelector('.history-list'); | ||
| 1878 | + items.forEach(item => { | ||
| 1879 | + const btn = document.createElement('button'); | ||
| 1880 | + btn.type = 'button'; | ||
| 1881 | + btn.className = 'history-item'; | ||
| 1882 | + btn.setAttribute('aria-label', `Open report ${item.batch_id}`); | ||
| 1883 | + const sum = historySummaryHtml(item.metadata); | ||
| 1884 | + btn.innerHTML = `<div class="hid">${item.batch_id}</div> | ||
| 1885 | + <div class="hmeta">${item.created_at} · tenant ${item.tenant_id}</div>${sum}`; | ||
| 1886 | + btn.onclick = () => openBatchReport(item.batch_id); | ||
| 1887 | + list.appendChild(btn); | ||
| 1888 | + }); | ||
| 1889 | + } | ||
| 1890 | + let _lastReportPath = ''; | ||
| 1891 | + function closeReportModal() { | ||
| 1892 | + const el = document.getElementById('reportModal'); | ||
| 1893 | + el.classList.remove('is-open'); | ||
| 1894 | + el.setAttribute('aria-hidden', 'true'); | ||
| 1895 | + document.getElementById('reportModalBody').innerHTML = ''; | ||
| 1896 | + document.getElementById('reportModalMeta').textContent = ''; | ||
| 1897 | + } | ||
| 1898 | + async function openBatchReport(batchId) { | ||
| 1899 | + const el = document.getElementById('reportModal'); | ||
| 1900 | + const body = document.getElementById('reportModalBody'); | ||
| 1901 | + const metaEl = document.getElementById('reportModalMeta'); | ||
| 1902 | + const titleEl = document.getElementById('reportModalTitle'); | ||
| 1903 | + el.classList.add('is-open'); | ||
| 1904 | + el.setAttribute('aria-hidden', 'false'); | ||
| 1905 | + titleEl.textContent = batchId; | ||
| 1906 | + metaEl.textContent = ''; | ||
| 1907 | + body.className = 'report-modal-body batch-report-md report-modal-loading'; | ||
| 1908 | + body.textContent = 'Loading report…'; | ||
| 1909 | + try { | ||
| 1910 | + const rep = await fetchJSON('/api/history/' + encodeURIComponent(batchId) + '/report'); | ||
| 1911 | + _lastReportPath = rep.report_markdown_path || ''; | ||
| 1912 | + metaEl.textContent = rep.report_markdown_path || ''; | ||
| 1913 | + const raw = marked.parse(rep.markdown || '', { gfm: true }); | ||
| 1914 | + const safe = DOMPurify.sanitize(raw, { USE_PROFILES: { html: true } }); | ||
| 1915 | + body.className = 'report-modal-body batch-report-md'; | ||
| 1916 | + body.innerHTML = safe; | ||
| 1917 | + } catch (e) { | ||
| 1918 | + body.className = 'report-modal-body report-modal-error'; | ||
| 1919 | + body.textContent = (e && e.message) ? e.message : String(e); | ||
| 1920 | + } | ||
| 1747 | } | 1921 | } |
| 1922 | + document.getElementById('reportModal').addEventListener('click', (ev) => { | ||
| 1923 | + if (ev.target && ev.target.getAttribute('data-close-report') === '1') closeReportModal(); | ||
| 1924 | + }); | ||
| 1925 | + document.addEventListener('keydown', (ev) => { | ||
| 1926 | + if (ev.key === 'Escape') closeReportModal(); | ||
| 1927 | + }); | ||
| 1928 | + document.getElementById('reportCopyPath').addEventListener('click', async () => { | ||
| 1929 | + if (!_lastReportPath) return; | ||
| 1930 | + try { | ||
| 1931 | + await navigator.clipboard.writeText(_lastReportPath); | ||
| 1932 | + } catch (_) {} | ||
| 1933 | + }); | ||
| 1748 | async function runSingle() { | 1934 | async function runSingle() { |
| 1749 | const query = document.getElementById('queryInput').value.trim(); | 1935 | const query = document.getElementById('queryInput').value.trim(); |
| 1750 | if (!query) return; | 1936 | if (!query) return; |
scripts/evaluation/quick_start_eval.sh
| @@ -9,10 +9,11 @@ TENANT_ID="${TENANT_ID:-163}" | @@ -9,10 +9,11 @@ TENANT_ID="${TENANT_ID:-163}" | ||
| 9 | QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" | 9 | QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" |
| 10 | 10 | ||
| 11 | usage() { | 11 | usage() { |
| 12 | - echo "Usage: $0 batch|serve" | ||
| 13 | - echo " batch — refresh labels + batch metrics (default: top_k=50, simple labeler, force-refresh)" | ||
| 14 | - echo " serve — eval UI on http://127.0.0.1:6010/" | ||
| 15 | - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES (default $QUERIES)" | 12 | + echo "Usage: $0 batch|batch-rebuild|serve" |
| 13 | + echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50, simple)" | ||
| 14 | + echo " batch-rebuild — same as batch but --force-refresh-labels (re-LLM all top_k hits; expensive, overwrites cache)" | ||
| 15 | + echo " serve — eval UI on http://0.0.0.0:6010/" | ||
| 16 | + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES (path to queries file)" | ||
| 16 | } | 17 | } |
| 17 | 18 | ||
| 18 | case "${1:-}" in | 19 | case "${1:-}" in |
| @@ -22,6 +23,14 @@ case "${1:-}" in | @@ -22,6 +23,14 @@ case "${1:-}" in | ||
| 22 | --queries-file "$QUERIES" \ | 23 | --queries-file "$QUERIES" \ |
| 23 | --top-k 50 \ | 24 | --top-k 50 \ |
| 24 | --language en \ | 25 | --language en \ |
| 26 | + --labeler-mode simple | ||
| 27 | + ;; | ||
| 28 | + batch-rebuild) | ||
| 29 | + exec "$PY" scripts/evaluation/build_annotation_set.py batch \ | ||
| 30 | + --tenant-id "$TENANT_ID" \ | ||
| 31 | + --queries-file "$QUERIES" \ | ||
| 32 | + --top-k 50 \ | ||
| 33 | + --language en \ | ||
| 25 | --labeler-mode simple \ | 34 | --labeler-mode simple \ |
| 26 | --force-refresh-labels | 35 | --force-refresh-labels |
| 27 | ;; | 36 | ;; |
| @@ -29,7 +38,7 @@ case "${1:-}" in | @@ -29,7 +38,7 @@ case "${1:-}" in | ||
| 29 | exec "$PY" scripts/evaluation/serve_eval_web.py serve \ | 38 | exec "$PY" scripts/evaluation/serve_eval_web.py serve \ |
| 30 | --tenant-id "$TENANT_ID" \ | 39 | --tenant-id "$TENANT_ID" \ |
| 31 | --queries-file "$QUERIES" \ | 40 | --queries-file "$QUERIES" \ |
| 32 | - --host 127.0.0.1 \ | 41 | + --host 0.0.0.0 \ |
| 33 | --port 6010 | 42 | --port 6010 |
| 34 | ;; | 43 | ;; |
| 35 | *) | 44 | *) |