Commit c81b0fc12093cca9a8eef590e545a71a3fc2cd1c

Authored by tangwang
1 parent 7b8d9e1a

scripts/evaluation/eval_framework

scripts/evaluation/README.md
@@ -19,12 +19,12 @@ The framework supports four related tasks:
19
20   ## Files
21
22 - - `eval_framework.py`
23 -   Search evaluation core implementation, CLI, FastAPI app, SQLite store, audit logic, and report generation.
22 + - `eval_framework/` (Python package)
23 +   Modular layout: `framework.py` (orchestration), `store.py` (SQLite), `clients.py` (search/rerank/LLM), `prompts.py` (judge templates), `metrics.py`, `reports.py`, `web_app.py`, `cli.py`, and `static/` (evaluation UI HTML/CSS/JS).
24   - `build_annotation_set.py`
25 -   Thin CLI entrypoint into `eval_framework.py`.
25 +   Thin CLI entrypoint into `eval_framework`.
26   - `serve_eval_web.py`
27 -   Thin web entrypoint into `eval_framework.py`.
27 +   Thin web entrypoint into `eval_framework`.
28   - `tune_fusion.py`
29     Fusion experiment runner. It applies config variants, restarts backend, runs batch evaluation, and stores experiment reports.
30   - `fusion_experiments_shortlist.json`
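For orientation, a minimal sketch of what a thin entrypoint such as `build_annotation_set.py` can look like once the logic lives in the `eval_framework` package. The module path and symbol (`eval_framework.cli.main`) are assumptions inferred from the README layout above, not code taken from this commit.

```python
#!/usr/bin/env python3
"""Sketch of a thin CLI entrypoint that delegates all work to the package."""
import sys

# Assumption: eval_framework/cli.py exposes a main() that parses argv and
# dispatches into the framework; this commit does not show its signature.
from eval_framework.cli import main

if __name__ == "__main__":
    sys.exit(main())
```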
scripts/evaluation/eval_framework.py deleted
@@ -1,2140 +0,0 @@ @@ -1,2140 +0,0 @@
1 -#!/usr/bin/env python3  
2 -"""  
3 -Search evaluation framework for pooled relevance annotation, live metrics, and reports.  
4 -"""  
5 -  
6 -from __future__ import annotations  
7 -  
8 -import argparse  
9 -import hashlib  
10 -import json  
11 -import math  
12 -import os  
13 -import re  
14 -import sqlite3  
15 -import sys  
16 -import time  
17 -from dataclasses import dataclass  
18 -from datetime import datetime, timezone  
19 -from pathlib import Path  
20 -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple  
21 -  
22 -import requests  
23 -from elasticsearch.helpers import scan  
24 -from fastapi import FastAPI, HTTPException  
25 -from fastapi.responses import HTMLResponse  
26 -from pydantic import BaseModel, Field  
27 -  
28 -PROJECT_ROOT = Path(__file__).resolve().parents[2]  
29 -if str(PROJECT_ROOT) not in sys.path:  
30 - sys.path.insert(0, str(PROJECT_ROOT))  
31 -  
32 -from api.app import get_app_config, get_es_client, get_query_parser, init_service  
33 -from indexer.mapping_generator import get_tenant_index_name  
34 -  
35 -  
36 -RELEVANCE_EXACT = "Exact"  
37 -RELEVANCE_PARTIAL = "Partial"  
38 -RELEVANCE_IRRELEVANT = "Irrelevant"  
39 -VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}  
40 -DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"  
41 -DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt"  
42 -JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331"  
43 -JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331"  
44 -DEFAULT_LABELER_MODE = "simple"  
45 -  
46 -  
47 -def utc_now_iso() -> str:  
48 - return datetime.now(timezone.utc).isoformat()  
49 -  
50 -  
51 -def utc_timestamp() -> str:  
52 - return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")  
53 -  
54 -  
55 -def ensure_dir(path: Path) -> Path:  
56 - path.mkdir(parents=True, exist_ok=True)  
57 - return path  
58 -  
59 -  
60 -def sha1_text(text: str) -> str:  
61 - return hashlib.sha1(text.encode("utf-8")).hexdigest()  
62 -  
63 -  
64 -def pick_text(value: Any, preferred_lang: str = "en") -> str:  
65 - if value is None:  
66 - return ""  
67 - if isinstance(value, dict):  
68 - return str(  
69 - value.get(preferred_lang)  
70 - or value.get("en")  
71 - or value.get("zh")  
72 - or next((v for v in value.values() if v), "")  
73 - ).strip()  
74 - return str(value).strip()  
75 -  
76 -  
77 -def safe_json_dumps(data: Any) -> str:  
78 - return json.dumps(data, ensure_ascii=False, separators=(",", ":"))  
79 -  
80 -  
81 -def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:  
82 - if not skus:  
83 - return "", "", ""  
84 - first = skus[0] or {}  
85 - return (  
86 - str(first.get("option1_value") or "").strip(),  
87 - str(first.get("option2_value") or "").strip(),  
88 - str(first.get("option3_value") or "").strip(),  
89 - )  
90 -  
91 -  
92 -def build_display_title(doc: Dict[str, Any]) -> str:  
93 - title = doc.get("title")  
94 - en = pick_text(title, "en")  
95 - zh = pick_text(title, "zh")  
96 - if en and zh and en != zh:  
97 - return f"{en} / {zh}"  
98 - return en or zh  
99 -  
100 -  
101 -def build_rerank_doc(doc: Dict[str, Any]) -> str:  
102 - title = build_display_title(doc)  
103 - return title[:400]  
104 -  
105 -  
106 -def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:  
107 - title = build_display_title(doc)  
108 - option1, option2, option3 = compact_option_values(doc.get("skus") or [])  
109 - vendor = pick_text(doc.get("vendor"), "en")  
110 - category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")  
111 - tags = doc.get("tags") or []  
112 - tags_text = ", ".join(str(tag) for tag in tags[:4] if tag)  
113 - parts = [title]  
114 - if option1:  
115 - parts.append(f"option1={option1}")  
116 - if option2:  
117 - parts.append(f"option2={option2}")  
118 - if option3:  
119 - parts.append(f"option3={option3}")  
120 - if vendor:  
121 - parts.append(f"vendor={vendor}")  
122 - if category:  
123 - parts.append(f"category={category}")  
124 - if tags_text:  
125 - parts.append(f"tags={tags_text}")  
126 - return f"{idx}. " + " | ".join(part for part in parts if part)  
127 -  
128 -  
129 -def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:  
130 - return {  
131 - "spu_id": str(doc.get("spu_id") or ""),  
132 - "title": build_display_title(doc),  
133 - "image_url": doc.get("image_url"),  
134 - "vendor": pick_text(doc.get("vendor"), "en"),  
135 - "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"),  
136 - "option_values": list(compact_option_values(doc.get("skus") or [])),  
137 - "tags": list((doc.get("tags") or [])[:6]),  
138 - }  
139 -  
140 -  
141 -def normalize_text(text: Any) -> str:  
142 - value = str(text or "").strip().lower()  
143 - value = re.sub(r"\s+", " ", value)  
144 - return value  
145 -  
146 -  
147 -def _extract_json_blob(text: str) -> Any:  
148 - cleaned = str(text or "").strip()  
149 - candidates: List[str] = [cleaned]  
150 - fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I)  
151 - candidates.extend(match.strip() for match in fence_matches if match.strip())  
152 -  
153 - for candidate in candidates:  
154 - try:  
155 - return json.loads(candidate)  
156 - except Exception:  
157 - pass  
158 -  
159 - starts = [idx for idx, ch in enumerate(cleaned) if ch in "[{"]  
160 - ends = [idx for idx, ch in enumerate(cleaned) if ch in "]}"]  
161 - for start in starts:  
162 - for end in reversed(ends):  
163 - if end <= start:  
164 - continue  
165 - fragment = cleaned[start : end + 1]  
166 - try:  
167 - return json.loads(fragment)  
168 - except Exception:  
169 - continue  
170 - raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")  
171 -  
172 -  
173 -@dataclass  
174 -class QueryBuildResult:  
175 - query: str  
176 - tenant_id: str  
177 - search_total: int  
178 - search_depth: int  
179 - rerank_corpus_size: int  
180 - annotated_count: int  
181 - output_json_path: Path  
182 -  
183 -  
184 -class EvalStore:  
185 - def __init__(self, db_path: Path):  
186 - self.db_path = db_path  
187 - ensure_dir(db_path.parent)  
188 - self.conn = sqlite3.connect(str(db_path), check_same_thread=False)  
189 - self.conn.row_factory = sqlite3.Row  
190 - self._init_schema()  
191 -  
192 - def _init_schema(self) -> None:  
193 - self.conn.executescript(  
194 - """  
195 - CREATE TABLE IF NOT EXISTS corpus_docs (  
196 - tenant_id TEXT NOT NULL,  
197 - spu_id TEXT NOT NULL,  
198 - title_json TEXT,  
199 - vendor_json TEXT,  
200 - category_path_json TEXT,  
201 - category_name_json TEXT,  
202 - image_url TEXT,  
203 - skus_json TEXT,  
204 - tags_json TEXT,  
205 - raw_json TEXT NOT NULL,  
206 - updated_at TEXT NOT NULL,  
207 - PRIMARY KEY (tenant_id, spu_id)  
208 - );  
209 -  
210 - CREATE TABLE IF NOT EXISTS rerank_scores (  
211 - tenant_id TEXT NOT NULL,  
212 - query_text TEXT NOT NULL,  
213 - spu_id TEXT NOT NULL,  
214 - score REAL NOT NULL,  
215 - model_name TEXT,  
216 - updated_at TEXT NOT NULL,  
217 - PRIMARY KEY (tenant_id, query_text, spu_id)  
218 - );  
219 -  
220 - CREATE TABLE IF NOT EXISTS relevance_labels (  
221 - tenant_id TEXT NOT NULL,  
222 - query_text TEXT NOT NULL,  
223 - spu_id TEXT NOT NULL,  
224 - label TEXT NOT NULL,  
225 - judge_model TEXT,  
226 - raw_response TEXT,  
227 - updated_at TEXT NOT NULL,  
228 - PRIMARY KEY (tenant_id, query_text, spu_id)  
229 - );  
230 -  
231 - CREATE TABLE IF NOT EXISTS build_runs (  
232 - run_id TEXT PRIMARY KEY,  
233 - tenant_id TEXT NOT NULL,  
234 - query_text TEXT NOT NULL,  
235 - output_json_path TEXT NOT NULL,  
236 - metadata_json TEXT NOT NULL,  
237 - created_at TEXT NOT NULL  
238 - );  
239 -  
240 - CREATE TABLE IF NOT EXISTS batch_runs (  
241 - batch_id TEXT PRIMARY KEY,  
242 - tenant_id TEXT NOT NULL,  
243 - output_json_path TEXT NOT NULL,  
244 - report_markdown_path TEXT NOT NULL,  
245 - config_snapshot_path TEXT NOT NULL,  
246 - metadata_json TEXT NOT NULL,  
247 - created_at TEXT NOT NULL  
248 - );  
249 -  
250 - CREATE TABLE IF NOT EXISTS query_profiles (  
251 - tenant_id TEXT NOT NULL,  
252 - query_text TEXT NOT NULL,  
253 - prompt_version TEXT NOT NULL,  
254 - judge_model TEXT,  
255 - profile_json TEXT NOT NULL,  
256 - raw_response TEXT NOT NULL,  
257 - updated_at TEXT NOT NULL,  
258 - PRIMARY KEY (tenant_id, query_text, prompt_version)  
259 - );  
260 - """  
261 - )  
262 - self.conn.commit()  
263 -  
264 - def upsert_corpus_docs(self, tenant_id: str, docs: Sequence[Dict[str, Any]]) -> None:  
265 - now = utc_now_iso()  
266 - rows = []  
267 - for doc in docs:  
268 - rows.append(  
269 - (  
270 - tenant_id,  
271 - str(doc.get("spu_id") or ""),  
272 - safe_json_dumps(doc.get("title")),  
273 - safe_json_dumps(doc.get("vendor")),  
274 - safe_json_dumps(doc.get("category_path")),  
275 - safe_json_dumps(doc.get("category_name")),  
276 - str(doc.get("image_url") or ""),  
277 - safe_json_dumps(doc.get("skus") or []),  
278 - safe_json_dumps(doc.get("tags") or []),  
279 - safe_json_dumps(doc),  
280 - now,  
281 - )  
282 - )  
283 - self.conn.executemany(  
284 - """  
285 - INSERT INTO corpus_docs (  
286 - tenant_id, spu_id, title_json, vendor_json, category_path_json, category_name_json,  
287 - image_url, skus_json, tags_json, raw_json, updated_at  
288 - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)  
289 - ON CONFLICT(tenant_id, spu_id) DO UPDATE SET  
290 - title_json=excluded.title_json,  
291 - vendor_json=excluded.vendor_json,  
292 - category_path_json=excluded.category_path_json,  
293 - category_name_json=excluded.category_name_json,  
294 - image_url=excluded.image_url,  
295 - skus_json=excluded.skus_json,  
296 - tags_json=excluded.tags_json,  
297 - raw_json=excluded.raw_json,  
298 - updated_at=excluded.updated_at  
299 - """,  
300 - rows,  
301 - )  
302 - self.conn.commit()  
303 -  
304 - def get_corpus_docs(self, tenant_id: str) -> List[Dict[str, Any]]:  
305 - rows = self.conn.execute(  
306 - "SELECT raw_json FROM corpus_docs WHERE tenant_id=? ORDER BY spu_id",  
307 - (tenant_id,),  
308 - ).fetchall()  
309 - return [json.loads(row["raw_json"]) for row in rows]  
310 -  
311 - def get_corpus_docs_by_spu_ids(self, tenant_id: str, spu_ids: Sequence[str]) -> Dict[str, Dict[str, Any]]:  
312 - keys = [str(spu_id) for spu_id in spu_ids if str(spu_id).strip()]  
313 - if not keys:  
314 - return {}  
315 - placeholders = ",".join("?" for _ in keys)  
316 - rows = self.conn.execute(  
317 - f"""  
318 - SELECT spu_id, raw_json  
319 - FROM corpus_docs  
320 - WHERE tenant_id=? AND spu_id IN ({placeholders})  
321 - """,  
322 - [tenant_id, *keys],  
323 - ).fetchall()  
324 - return {  
325 - str(row["spu_id"]): json.loads(row["raw_json"])  
326 - for row in rows  
327 - }  
328 -  
329 - def has_corpus(self, tenant_id: str) -> bool:  
330 - row = self.conn.execute(  
331 - "SELECT COUNT(1) AS n FROM corpus_docs WHERE tenant_id=?",  
332 - (tenant_id,),  
333 - ).fetchone()  
334 - return bool(row and row["n"] > 0)  
335 -  
336 - def get_rerank_scores(self, tenant_id: str, query_text: str) -> Dict[str, float]:  
337 - rows = self.conn.execute(  
338 - """  
339 - SELECT spu_id, score  
340 - FROM rerank_scores  
341 - WHERE tenant_id=? AND query_text=?  
342 - """,  
343 - (tenant_id, query_text),  
344 - ).fetchall()  
345 - return {str(row["spu_id"]): float(row["score"]) for row in rows}  
346 -  
347 - def upsert_rerank_scores(  
348 - self,  
349 - tenant_id: str,  
350 - query_text: str,  
351 - scores: Dict[str, float],  
352 - model_name: str,  
353 - ) -> None:  
354 - now = utc_now_iso()  
355 - rows = [  
356 - (tenant_id, query_text, spu_id, float(score), model_name, now)  
357 - for spu_id, score in scores.items()  
358 - ]  
359 - self.conn.executemany(  
360 - """  
361 - INSERT INTO rerank_scores (tenant_id, query_text, spu_id, score, model_name, updated_at)  
362 - VALUES (?, ?, ?, ?, ?, ?)  
363 - ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET  
364 - score=excluded.score,  
365 - model_name=excluded.model_name,  
366 - updated_at=excluded.updated_at  
367 - """,  
368 - rows,  
369 - )  
370 - self.conn.commit()  
371 -  
372 - def get_labels(self, tenant_id: str, query_text: str) -> Dict[str, str]:  
373 - rows = self.conn.execute(  
374 - """  
375 - SELECT spu_id, label  
376 - FROM relevance_labels  
377 - WHERE tenant_id=? AND query_text=?  
378 - """,  
379 - (tenant_id, query_text),  
380 - ).fetchall()  
381 - return {str(row["spu_id"]): str(row["label"]) for row in rows}  
382 -  
383 - def upsert_labels(  
384 - self,  
385 - tenant_id: str,  
386 - query_text: str,  
387 - labels: Dict[str, str],  
388 - judge_model: str,  
389 - raw_response: str,  
390 - ) -> None:  
391 - now = utc_now_iso()  
392 - rows = []  
393 - for spu_id, label in labels.items():  
394 - if label not in VALID_LABELS:  
395 - raise ValueError(f"invalid label: {label}")  
396 - rows.append((tenant_id, query_text, spu_id, label, judge_model, raw_response, now))  
397 - self.conn.executemany(  
398 - """  
399 - INSERT INTO relevance_labels (tenant_id, query_text, spu_id, label, judge_model, raw_response, updated_at)  
400 - VALUES (?, ?, ?, ?, ?, ?, ?)  
401 - ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET  
402 - label=excluded.label,  
403 - judge_model=excluded.judge_model,  
404 - raw_response=excluded.raw_response,  
405 - updated_at=excluded.updated_at  
406 - """,  
407 - rows,  
408 - )  
409 - self.conn.commit()  
410 -  
411 - def get_query_profile(self, tenant_id: str, query_text: str, prompt_version: str) -> Optional[Dict[str, Any]]:  
412 - row = self.conn.execute(  
413 - """  
414 - SELECT profile_json  
415 - FROM query_profiles  
416 - WHERE tenant_id=? AND query_text=? AND prompt_version=?  
417 - """,  
418 - (tenant_id, query_text, prompt_version),  
419 - ).fetchone()  
420 - if not row:  
421 - return None  
422 - return json.loads(row["profile_json"])  
423 -  
424 - def upsert_query_profile(  
425 - self,  
426 - tenant_id: str,  
427 - query_text: str,  
428 - prompt_version: str,  
429 - judge_model: str,  
430 - profile: Dict[str, Any],  
431 - raw_response: str,  
432 - ) -> None:  
433 - self.conn.execute(  
434 - """  
435 - INSERT OR REPLACE INTO query_profiles  
436 - (tenant_id, query_text, prompt_version, judge_model, profile_json, raw_response, updated_at)  
437 - VALUES (?, ?, ?, ?, ?, ?, ?)  
438 - """,  
439 - (  
440 - tenant_id,  
441 - query_text,  
442 - prompt_version,  
443 - judge_model,  
444 - safe_json_dumps(profile),  
445 - raw_response,  
446 - utc_now_iso(),  
447 - ),  
448 - )  
449 - self.conn.commit()  
450 -  
451 - def insert_build_run(self, run_id: str, tenant_id: str, query_text: str, output_json_path: Path, metadata: Dict[str, Any]) -> None:  
452 - self.conn.execute(  
453 - """  
454 - INSERT OR REPLACE INTO build_runs (run_id, tenant_id, query_text, output_json_path, metadata_json, created_at)  
455 - VALUES (?, ?, ?, ?, ?, ?)  
456 - """,  
457 - (run_id, tenant_id, query_text, str(output_json_path), safe_json_dumps(metadata), utc_now_iso()),  
458 - )  
459 - self.conn.commit()  
460 -  
461 - def insert_batch_run(  
462 - self,  
463 - batch_id: str,  
464 - tenant_id: str,  
465 - output_json_path: Path,  
466 - report_markdown_path: Path,  
467 - config_snapshot_path: Path,  
468 - metadata: Dict[str, Any],  
469 - ) -> None:  
470 - self.conn.execute(  
471 - """  
472 - INSERT OR REPLACE INTO batch_runs  
473 - (batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at)  
474 - VALUES (?, ?, ?, ?, ?, ?, ?)  
475 - """,  
476 - (  
477 - batch_id,  
478 - tenant_id,  
479 - str(output_json_path),  
480 - str(report_markdown_path),  
481 - str(config_snapshot_path),  
482 - safe_json_dumps(metadata),  
483 - utc_now_iso(),  
484 - ),  
485 - )  
486 - self.conn.commit()  
487 -  
488 - def list_batch_runs(self, limit: int = 20) -> List[Dict[str, Any]]:  
489 - rows = self.conn.execute(  
490 - """  
491 - SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at  
492 - FROM batch_runs  
493 - ORDER BY created_at DESC  
494 - LIMIT ?  
495 - """,  
496 - (limit,),  
497 - ).fetchall()  
498 - items: List[Dict[str, Any]] = []  
499 - for row in rows:  
500 - items.append(  
501 - {  
502 - "batch_id": row["batch_id"],  
503 - "tenant_id": row["tenant_id"],  
504 - "output_json_path": row["output_json_path"],  
505 - "report_markdown_path": row["report_markdown_path"],  
506 - "config_snapshot_path": row["config_snapshot_path"],  
507 - "metadata": json.loads(row["metadata_json"]),  
508 - "created_at": row["created_at"],  
509 - }  
510 - )  
511 - return items  
512 -  
513 - def get_batch_run(self, batch_id: str) -> Optional[Dict[str, Any]]:  
514 - row = self.conn.execute(  
515 - """  
516 - SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at  
517 - FROM batch_runs  
518 - WHERE batch_id = ?  
519 - """,  
520 - (batch_id,),  
521 - ).fetchone()  
522 - if row is None:  
523 - return None  
524 - return {  
525 - "batch_id": row["batch_id"],  
526 - "tenant_id": row["tenant_id"],  
527 - "output_json_path": row["output_json_path"],  
528 - "report_markdown_path": row["report_markdown_path"],  
529 - "config_snapshot_path": row["config_snapshot_path"],  
530 - "metadata": json.loads(row["metadata_json"]),  
531 - "created_at": row["created_at"],  
532 - }  
533 -  
534 - def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]:  
535 - rows = self.conn.execute(  
536 - """  
537 - SELECT  
538 - query_text,  
539 - COUNT(*) AS total,  
540 - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,  
541 - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,  
542 - SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,  
543 - MAX(updated_at) AS updated_at  
544 - FROM relevance_labels  
545 - WHERE tenant_id=?  
546 - GROUP BY query_text  
547 - ORDER BY query_text  
548 - """,  
549 - (tenant_id,),  
550 - ).fetchall()  
551 - return [  
552 - {  
553 - "query": str(row["query_text"]),  
554 - "total": int(row["total"]),  
555 - "exact_count": int(row["exact_count"] or 0),  
556 - "partial_count": int(row["partial_count"] or 0),  
557 - "irrelevant_count": int(row["irrelevant_count"] or 0),  
558 - "updated_at": row["updated_at"],  
559 - }  
560 - for row in rows  
561 - ]  
562 -  
563 - def get_query_label_stats(self, tenant_id: str, query_text: str) -> Dict[str, Any]:  
564 - row = self.conn.execute(  
565 - """  
566 - SELECT  
567 - COUNT(*) AS total,  
568 - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,  
569 - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,  
570 - SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,  
571 - MAX(updated_at) AS updated_at  
572 - FROM relevance_labels  
573 - WHERE tenant_id=? AND query_text=?  
574 - """,  
575 - (tenant_id, query_text),  
576 - ).fetchone()  
577 - return {  
578 - "query": query_text,  
579 - "total": int((row["total"] or 0) if row else 0),  
580 - "exact_count": int((row["exact_count"] or 0) if row else 0),  
581 - "partial_count": int((row["partial_count"] or 0) if row else 0),  
582 - "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0),  
583 - "updated_at": row["updated_at"] if row else None,  
584 - }  
585 -  
586 -  
587 -class SearchServiceClient:  
588 - def __init__(self, base_url: str, tenant_id: str):  
589 - self.base_url = base_url.rstrip("/")  
590 - self.tenant_id = str(tenant_id)  
591 - self.session = requests.Session()  
592 -  
593 - def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]:  
594 - response = self.session.post(  
595 - f"{self.base_url}/search/",  
596 - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},  
597 - json={"query": query, "size": size, "from": from_, "language": language},  
598 - timeout=120,  
599 - )  
600 - response.raise_for_status()  
601 - return response.json()  
602 -  
603 -  
604 -class RerankServiceClient:  
605 - def __init__(self, service_url: str):  
606 - self.service_url = service_url.rstrip("/")  
607 - self.session = requests.Session()  
608 -  
609 - def rerank(self, query: str, docs: Sequence[str], normalize: bool = False, top_n: Optional[int] = None) -> Tuple[List[float], Dict[str, Any]]:  
610 - payload: Dict[str, Any] = {  
611 - "query": query,  
612 - "docs": list(docs),  
613 - "normalize": normalize,  
614 - }  
615 - if top_n is not None:  
616 - payload["top_n"] = int(top_n)  
617 - response = self.session.post(self.service_url, json=payload, timeout=180)  
618 - response.raise_for_status()  
619 - data = response.json()  
620 - return list(data.get("scores") or []), dict(data.get("meta") or {})  
621 -  
622 -  
623 -class DashScopeLabelClient:  
624 - def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40):  
625 - self.model = model  
626 - self.base_url = base_url.rstrip("/")  
627 - self.api_key = api_key  
628 - self.batch_size = int(batch_size)  
629 - self.session = requests.Session()  
630 -  
631 - def _chat(self, prompt: str) -> Tuple[str, str]:  
632 - response = self.session.post(  
633 - f"{self.base_url}/chat/completions",  
634 - headers={  
635 - "Authorization": f"Bearer {self.api_key}",  
636 - "Content-Type": "application/json",  
637 - },  
638 - json={  
639 - "model": self.model,  
640 - "messages": [{"role": "user", "content": prompt}],  
641 - "temperature": 0,  
642 - "top_p": 0.1,  
643 - },  
644 - timeout=180,  
645 - )  
646 - response.raise_for_status()  
647 - data = response.json()  
648 - content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()  
649 - return content, safe_json_dumps(data)  
650 -  
651 - def classify_batch_simple(  
652 - self,  
653 - query: str,  
654 - docs: Sequence[Dict[str, Any]],  
655 - ) -> Tuple[List[str], str]:  
656 - numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]  
657 - prompt = (  
658 - "You are an e-commerce search result relevance evaluation assistant. "  
659 - "Based on the user query and each product's information, output the relevance level for each product.\n\n"  
660 - "## Relevance Level Criteria\n"  
661 - "Exact โ€” Fully matches the user's search intent.\n"  
662 - "Partial โ€” Primary intent satisfied (same category or similar use, basically aligns with search intent), "  
663 - "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n"  
664 - "Irrelevant โ€” Category or use case mismatched, primary intent not satisfied.\n\n"  
665 - "Additional judging guidance:\n"  
666 - "- If the query clearly names a product type, product type matching has the highest priority. "  
667 - "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, "  
668 - "bra vs top, backpack vs bag are not interchangeable.\n"  
669 - "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n"  
670 - "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n"  
671 - "- Do not guess missing attributes.\n"  
672 - "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n"  
673 - "- Be conservative with Exact.\n\n"  
674 - f"Query: {query}\n\n"  
675 - "Products:\n"  
676 - + "\n".join(numbered_docs)  
677 - + "\n\n## Output Format\n"  
678 - f"Strictly output {len(docs)} lines, each line containing exactly one of Exact / Partial / Irrelevant. "  
679 - "They must correspond sequentially to the products above. Do not output any other information.\n"  
680 - )  
681 - content, raw_response = self._chat(prompt)  
682 - labels = []  
683 - for line in str(content or "").splitlines():  
684 - label = line.strip()  
685 - if label in VALID_LABELS:  
686 - labels.append(label)  
687 - if len(labels) != len(docs):  
688 - payload = _extract_json_blob(content)  
689 - if isinstance(payload, dict) and isinstance(payload.get("labels"), list):  
690 - labels = []  
691 - for item in payload["labels"][: len(docs)]:  
692 - if isinstance(item, dict):  
693 - label = str(item.get("label") or "").strip()  
694 - else:  
695 - label = str(item).strip()  
696 - if label in VALID_LABELS:  
697 - labels.append(label)  
698 - if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):  
699 - raise ValueError(f"unexpected simple label output: {content!r}")  
700 - return labels, raw_response  
701 -  
702 - def extract_query_profile(  
703 - self,  
704 - query: str,  
705 - parser_hints: Dict[str, Any],  
706 - ) -> Tuple[Dict[str, Any], str]:  
707 - prompt = (  
708 - "You are building a structured intent profile for e-commerce relevance judging.\n"  
709 - "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n"  
710 - "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n"  
711 - "Return JSON with this schema:\n"  
712 - "{\n"  
713 - ' "normalized_query_en": string,\n'  
714 - ' "primary_category": string,\n'  
715 - ' "allowed_categories": [string],\n'  
716 - ' "required_attributes": [\n'  
717 - ' {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n'  
718 - " ],\n"  
719 - ' "notes": [string]\n'  
720 - "}\n\n"  
721 - "Guidelines:\n"  
722 - "- Exact later will require explicit evidence for all required attributes.\n"  
723 - "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n"  
724 - "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n"  
725 - "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n"  
726 - "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n"  
727 - "- For color, include conflicting colors only when clear from the query.\n\n"  
728 - f"Original query: {query}\n"  
729 - f"Parser hints JSON: {json.dumps(parser_hints, ensure_ascii=False)}\n"  
730 - )  
731 - content, raw_response = self._chat(prompt)  
732 - payload = _extract_json_blob(content)  
733 - if not isinstance(payload, dict):  
734 - raise ValueError(f"unexpected query profile payload: {content!r}")  
735 - payload.setdefault("normalized_query_en", query)  
736 - payload.setdefault("primary_category", "")  
737 - payload.setdefault("allowed_categories", [])  
738 - payload.setdefault("required_attributes", [])  
739 - payload.setdefault("notes", [])  
740 - return payload, raw_response  
741 -  
742 - def classify_batch_complex(  
743 - self,  
744 - query: str,  
745 - query_profile: Dict[str, Any],  
746 - docs: Sequence[Dict[str, Any]],  
747 - ) -> Tuple[List[str], str]:  
748 - numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]  
749 - prompt = (  
750 - "You are an e-commerce search relevance judge.\n"  
751 - "Judge each product against the structured query profile below.\n\n"  
752 - "Relevance rules:\n"  
753 - "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n"  
754 - "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n"  
755 - "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n"  
756 - "- Be conservative with Exact.\n"  
757 - "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n"  
758 - "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n"  
759 - f"Original query: {query}\n"  
760 - f"Structured query profile JSON: {json.dumps(query_profile, ensure_ascii=False)}\n\n"  
761 - "Products:\n"  
762 - + "\n".join(numbered_docs)  
763 - + "\n\nReturn JSON only, with schema:\n"  
764 - '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n'  
765 - )  
766 - content, raw_response = self._chat(prompt)  
767 - payload = _extract_json_blob(content)  
768 - if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):  
769 - raise ValueError(f"unexpected label payload: {content!r}")  
770 - labels_payload = payload["labels"]  
771 - labels: List[str] = []  
772 - for item in labels_payload[: len(docs)]:  
773 - if not isinstance(item, dict):  
774 - continue  
775 - label = str(item.get("label") or "").strip()  
776 - if label in VALID_LABELS:  
777 - labels.append(label)  
778 - if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):  
779 - raise ValueError(f"unexpected label output: {content!r}")  
780 - return labels, raw_response  
781 -  
782 -  
783 -def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:  
784 - if k <= 0:  
785 - return 0.0  
786 - sliced = list(labels[:k])  
787 - if not sliced:  
788 - return 0.0  
789 - hits = sum(1 for label in sliced if label in relevant)  
790 - return hits / float(min(k, len(sliced)))  
791 -  
792 -  
793 -def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:  
794 - hit_count = 0  
795 - precision_sum = 0.0  
796 - for idx, label in enumerate(labels, start=1):  
797 - if label not in relevant:  
798 - continue  
799 - hit_count += 1  
800 - precision_sum += hit_count / idx  
801 - if hit_count == 0:  
802 - return 0.0  
803 - return precision_sum / hit_count  
804 -  
805 -  
806 -def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:  
807 - metrics: Dict[str, float] = {}  
808 - for k in (5, 10, 20, 50):  
809 - metrics[f"P@{k}"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT]), 6)  
810 - metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6)  
811 - metrics["MAP_3"] = round(average_precision(labels, [RELEVANCE_EXACT]), 6)  
812 - metrics["MAP_2_3"] = round(average_precision(labels, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6)  
813 - return metrics  
814 -  
815 -  
816 -def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]:  
817 - if not metric_items:  
818 - return {}  
819 - keys = sorted(metric_items[0].keys())  
820 - return {  
821 - key: round(sum(float(item.get(key, 0.0)) for item in metric_items) / len(metric_items), 6)  
822 - for key in keys  
823 - }  
824 -  
825 -  
826 -def label_distribution(labels: Sequence[str]) -> Dict[str, int]:  
827 - return {  
828 - RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT),  
829 - RELEVANCE_PARTIAL: sum(1 for label in labels if label == RELEVANCE_PARTIAL),  
830 - RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT),  
831 - }  
832 -  
833 -  
834 -class SearchEvaluationFramework:  
835 - def __init__(  
836 - self,  
837 - tenant_id: str,  
838 - artifact_root: Path = DEFAULT_ARTIFACT_ROOT,  
839 - search_base_url: str = "http://localhost:6002",  
840 - labeler_mode: str = DEFAULT_LABELER_MODE,  
841 - ):  
842 - init_service(get_app_config().infrastructure.elasticsearch.host)  
843 - self.tenant_id = str(tenant_id)  
844 - self.artifact_root = ensure_dir(artifact_root)  
845 - self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE  
846 - self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")  
847 - self.search_client = SearchServiceClient(search_base_url, self.tenant_id)  
848 - app_cfg = get_app_config()  
849 - rerank_service_url = str(  
850 - app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]  
851 - )  
852 - self.rerank_client = RerankServiceClient(rerank_service_url)  
853 - llm_cfg = app_cfg.services.translation.capabilities["llm"]  
854 - api_key = app_cfg.infrastructure.secrets.dashscope_api_key  
855 - if not api_key:  
856 - raise RuntimeError("dashscope_api_key is required for search evaluation annotation")  
857 - self.label_client = DashScopeLabelClient(  
858 - model=str(llm_cfg["model"]),  
859 - base_url=str(llm_cfg["base_url"]),  
860 - api_key=str(api_key),  
861 - )  
862 - self.query_parser = None  
863 -  
864 - def _get_query_parser(self):  
865 - if self.query_parser is None:  
866 - self.query_parser = get_query_parser()  
867 - return self.query_parser  
868 -  
869 - def build_query_parser_hints(self, query: str) -> Dict[str, Any]:  
870 - parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"])  
871 - payload = parsed.to_dict()  
872 - payload["text_for_rerank"] = parsed.text_for_rerank()  
873 - return payload  
874 -  
875 - def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:  
876 - if self.labeler_mode != "complex":  
877 - raise RuntimeError("query profiles are only used in complex labeler mode")  
878 - if not force_refresh:  
879 - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)  
880 - if cached is not None:  
881 - return cached  
882 - parser_hints = self.build_query_parser_hints(query)  
883 - profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)  
884 - profile["parser_hints"] = parser_hints  
885 - self.store.upsert_query_profile(  
886 - self.tenant_id,  
887 - query,  
888 - JUDGE_PROMPT_VERSION_COMPLEX,  
889 - self.label_client.model,  
890 - profile,  
891 - raw_response,  
892 - )  
893 - return profile  
894 -  
895 - @staticmethod  
896 - def _doc_evidence_text(doc: Dict[str, Any]) -> str:  
897 - pieces: List[str] = [  
898 - build_display_title(doc),  
899 - pick_text(doc.get("vendor"), "en"),  
900 - pick_text(doc.get("category_path"), "en"),  
901 - pick_text(doc.get("category_name"), "en"),  
902 - ]  
903 - for sku in doc.get("skus") or []:  
904 - pieces.extend(  
905 - [  
906 - str(sku.get("option1_value") or ""),  
907 - str(sku.get("option2_value") or ""),  
908 - str(sku.get("option3_value") or ""),  
909 - ]  
910 - )  
911 - for tag in doc.get("tags") or []:  
912 - pieces.append(str(tag))  
913 - return normalize_text(" | ".join(piece for piece in pieces if piece))  
914 -  
915 - def _apply_rule_based_label_guardrails(  
916 - self,  
917 - label: str,  
918 - query_profile: Dict[str, Any],  
919 - doc: Dict[str, Any],  
920 - ) -> str:  
921 - if label not in VALID_LABELS:  
922 - return label  
923 - evidence = self._doc_evidence_text(doc)  
924 - category = normalize_text(query_profile.get("primary_category"))  
925 - allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]  
926 -  
927 - primary_category_match = True  
928 - if category:  
929 - primary_category_match = category in evidence  
930 - allowed_category_match = True  
931 - if allowed_categories:  
932 - allowed_category_match = any(signal in evidence for signal in allowed_categories)  
933 -  
934 - if label == RELEVANCE_EXACT and not primary_category_match:  
935 - if allowed_category_match:  
936 - label = RELEVANCE_PARTIAL  
937 - else:  
938 - return RELEVANCE_IRRELEVANT  
939 -  
940 - for attr in query_profile.get("required_attributes") or []:  
941 - if not isinstance(attr, dict):  
942 - continue  
943 - attr_name = normalize_text(attr.get("name"))  
944 - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:  
945 - continue  
946 - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]  
947 - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]  
948 - if attr_name == "fit":  
949 - if any(term in {"oversized", "oversize"} for term in required_terms):  
950 - conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])  
951 - if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):  
952 - conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])  
953 - has_required = any(term in evidence for term in required_terms) if required_terms else True  
954 - has_conflict = any(term in evidence for term in conflicting_terms)  
955 -  
956 - if has_conflict:  
957 - return RELEVANCE_IRRELEVANT  
958 - if label == RELEVANCE_EXACT and not has_required:  
959 - label = RELEVANCE_PARTIAL  
960 -  
961 - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:  
962 - return RELEVANCE_IRRELEVANT  
963 -  
964 - return label  
965 -  
966 - @staticmethod  
967 - def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]:  
968 - option_values = list(item.get("option_values") or [])  
969 - while len(option_values) < 3:  
970 - option_values.append("")  
971 - product = dict(item.get("product") or {})  
972 - return {  
973 - "spu_id": item.get("spu_id"),  
974 - "title": product.get("title") or item.get("title"),  
975 - "vendor": product.get("vendor"),  
976 - "category_path": product.get("category"),  
977 - "category_name": product.get("category"),  
978 - "image_url": item.get("image_url") or product.get("image_url"),  
979 - "tags": product.get("tags") or [],  
980 - "skus": [  
981 - {  
982 - "option1_value": option_values[0],  
983 - "option2_value": option_values[1],  
984 - "option3_value": option_values[2],  
985 - }  
986 - ],  
987 - }  
988 -  
989 - def _collect_label_issues(  
990 - self,  
991 - label: str,  
992 - query_profile: Dict[str, Any],  
993 - doc: Dict[str, Any],  
994 - ) -> List[str]:  
995 - evidence = self._doc_evidence_text(doc)  
996 - issues: List[str] = []  
997 - category = normalize_text(query_profile.get("primary_category"))  
998 - allowed_categories = [  
999 - normalize_text(item)  
1000 - for item in query_profile.get("allowed_categories") or []  
1001 - if str(item).strip()  
1002 - ]  
1003 -  
1004 - primary_category_match = True if not category else category in evidence  
1005 - allowed_category_match = False if allowed_categories else primary_category_match  
1006 - if allowed_categories:  
1007 - allowed_category_match = any(signal in evidence for signal in allowed_categories)  
1008 -  
1009 - if label == RELEVANCE_EXACT and not primary_category_match:  
1010 - if allowed_category_match:  
1011 - issues.append("Exact missing primary category evidence")  
1012 - else:  
1013 - issues.append("Exact has category mismatch")  
1014 -  
1015 - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:  
1016 - issues.append("Partial has category mismatch")  
1017 -  
1018 - for attr in query_profile.get("required_attributes") or []:  
1019 - if not isinstance(attr, dict):  
1020 - continue  
1021 - attr_name = normalize_text(attr.get("name"))  
1022 - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:  
1023 - continue  
1024 - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]  
1025 - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]  
1026 - has_required = any(term in evidence for term in required_terms) if required_terms else True  
1027 - has_conflict = any(term in evidence for term in conflicting_terms)  
1028 -  
1029 - if has_conflict and label != RELEVANCE_IRRELEVANT:  
1030 - issues.append(f"{label} conflicts on {attr_name}")  
1031 - if label == RELEVANCE_EXACT and not has_required:  
1032 - issues.append(f"Exact missing {attr_name}")  
1033 - return issues  
1034 -  
1035 - def audit_live_query(  
1036 - self,  
1037 - query: str,  
1038 - *,  
1039 - top_k: int = 100,  
1040 - language: str = "en",  
1041 - auto_annotate: bool = False,  
1042 - ) -> Dict[str, Any]:  
1043 - live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)  
1044 - if self.labeler_mode != "complex":  
1045 - labels = [  
1046 - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT  
1047 - for item in live["results"]  
1048 - ]  
1049 - return {  
1050 - "query": query,  
1051 - "tenant_id": self.tenant_id,  
1052 - "top_k": top_k,  
1053 - "metrics": live["metrics"],  
1054 - "distribution": label_distribution(labels),  
1055 - "query_profile": None,  
1056 - "suspicious": [],  
1057 - "results": live["results"],  
1058 - }  
1059 - query_profile = self.get_query_profile(query, force_refresh=False)  
1060 - suspicious: List[Dict[str, Any]] = []  
1061 -  
1062 - for item in live["results"]:  
1063 - doc = self._result_item_to_doc(item)  
1064 - issues = self._collect_label_issues(item["label"] or "", query_profile, doc)  
1065 - suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc)  
1066 - if suggested_label != (item["label"] or ""):  
1067 - issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"]  
1068 - if issues:  
1069 - suspicious.append(  
1070 - {  
1071 - "rank": item["rank"],  
1072 - "spu_id": item["spu_id"],  
1073 - "title": item["title"],  
1074 - "label": item["label"],  
1075 - "suggested_label": suggested_label,  
1076 - "issues": issues,  
1077 - }  
1078 - )  
1079 -  
1080 - labels = [  
1081 - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT  
1082 - for item in live["results"]  
1083 - ]  
1084 - return {  
1085 - "query": query,  
1086 - "tenant_id": self.tenant_id,  
1087 - "top_k": top_k,  
1088 - "metrics": live["metrics"],  
1089 - "distribution": label_distribution(labels),  
1090 - "query_profile": query_profile,  
1091 - "suspicious": suspicious,  
1092 - "results": live["results"],  
1093 - }  
1094 -  
1095 - def queries_from_file(self, path: Path) -> List[str]:  
1096 - return [  
1097 - line.strip()  
1098 - for line in path.read_text(encoding="utf-8").splitlines()  
1099 - if line.strip() and not line.strip().startswith("#")  
1100 - ]  
1101 -  
1102 - def corpus_docs(self, refresh: bool = False) -> List[Dict[str, Any]]:  
1103 - if not refresh and self.store.has_corpus(self.tenant_id):  
1104 - return self.store.get_corpus_docs(self.tenant_id)  
1105 -  
1106 - es_client = get_es_client().client  
1107 - index_name = get_tenant_index_name(self.tenant_id)  
1108 - docs: List[Dict[str, Any]] = []  
1109 - for hit in scan(  
1110 - client=es_client,  
1111 - index=index_name,  
1112 - query={  
1113 - "_source": [  
1114 - "spu_id",  
1115 - "title",  
1116 - "vendor",  
1117 - "category_path",  
1118 - "category_name",  
1119 - "image_url",  
1120 - "skus",  
1121 - "tags",  
1122 - ],  
1123 - "query": {"match_all": {}},  
1124 - },  
1125 - size=500,  
1126 - preserve_order=False,  
1127 - clear_scroll=True,  
1128 - ):  
1129 - source = dict(hit.get("_source") or {})  
1130 - source["spu_id"] = str(source.get("spu_id") or hit.get("_id") or "")  
1131 - docs.append(source)  
1132 - self.store.upsert_corpus_docs(self.tenant_id, docs)  
1133 - return docs  
1134 -  
1135 - def full_corpus_rerank(  
1136 - self,  
1137 - query: str,  
1138 - docs: Sequence[Dict[str, Any]],  
1139 - batch_size: int = 24,  
1140 - force_refresh: bool = False,  
1141 - ) -> List[Dict[str, Any]]:  
1142 - cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query)  
1143 - pending: List[Dict[str, Any]] = [doc for doc in docs if str(doc.get("spu_id")) not in cached]  
1144 - if pending:  
1145 - new_scores: Dict[str, float] = {}  
1146 - for start in range(0, len(pending), batch_size):  
1147 - batch = pending[start : start + batch_size]  
1148 - scores = self._rerank_batch_with_retry(query=query, docs=batch)  
1149 - if len(scores) != len(batch):  
1150 - raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs")  
1151 - for doc, score in zip(batch, scores):  
1152 - new_scores[str(doc.get("spu_id"))] = float(score)  
1153 - self.store.upsert_rerank_scores(  
1154 - self.tenant_id,  
1155 - query,  
1156 - new_scores,  
1157 - model_name="qwen3_vllm_score",  
1158 - )  
1159 - cached.update(new_scores)  
1160 -  
1161 - ranked = []  
1162 - for doc in docs:  
1163 - spu_id = str(doc.get("spu_id"))  
1164 - ranked.append({"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc})  
1165 - ranked.sort(key=lambda item: item["score"], reverse=True)  
1166 - return ranked  
1167 -  
1168 - def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]:  
1169 - if not docs:  
1170 - return []  
1171 - doc_texts = [build_rerank_doc(doc) for doc in docs]  
1172 - try:  
1173 - scores, _meta = self.rerank_client.rerank(query=query, docs=doc_texts, normalize=False)  
1174 - return scores  
1175 - except Exception:  
1176 - if len(docs) == 1:  
1177 - return [-1.0]  
1178 - if len(docs) <= 6:  
1179 - scores: List[float] = []  
1180 - for doc in docs:  
1181 - scores.extend(self._rerank_batch_with_retry(query, [doc]))  
1182 - return scores  
1183 - mid = len(docs) // 2  
1184 - left = self._rerank_batch_with_retry(query, docs[:mid])  
1185 - right = self._rerank_batch_with_retry(query, docs[mid:])  
1186 - return left + right  
1187 -  
1188 - def annotate_missing_labels(  
1189 - self,  
1190 - query: str,  
1191 - docs: Sequence[Dict[str, Any]],  
1192 - force_refresh: bool = False,  
1193 - ) -> Dict[str, str]:  
1194 - labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)  
1195 - missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]  
1196 - if not missing_docs:  
1197 - return labels  
1198 -  
1199 - for start in range(0, len(missing_docs), self.label_client.batch_size):  
1200 - batch = missing_docs[start : start + self.label_client.batch_size]  
1201 - batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh)  
1202 - for sub_labels, raw_response, sub_batch in batch_pairs:  
1203 - to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}  
1204 - self.store.upsert_labels(  
1205 - self.tenant_id,  
1206 - query,  
1207 - to_store,  
1208 - judge_model=self.label_client.model,  
1209 - raw_response=raw_response,  
1210 - )  
1211 - labels.update(to_store)  
1212 - time.sleep(0.1)  
1213 - return labels  
1214 -  
1215 - def _classify_with_retry(  
1216 - self,  
1217 - query: str,  
1218 - docs: Sequence[Dict[str, Any]],  
1219 - *,  
1220 - force_refresh: bool = False,  
1221 - ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:  
1222 - if not docs:  
1223 - return []  
1224 - try:  
1225 - if self.labeler_mode == "complex":  
1226 - query_profile = self.get_query_profile(query, force_refresh=force_refresh)  
1227 - labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)  
1228 - labels = [  
1229 - self._apply_rule_based_label_guardrails(label, query_profile, doc)  
1230 - for doc, label in zip(docs, labels)  
1231 - ]  
1232 - else:  
1233 - labels, raw_response = self.label_client.classify_batch_simple(query, docs)  
1234 - return [(labels, raw_response, docs)]  
1235 - except Exception:  
1236 - if len(docs) == 1:  
1237 - raise  
1238 - mid = len(docs) // 2  
1239 - return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh)  
1240 -  
1241 - def build_query_annotation_set(  
1242 - self,  
1243 - query: str,  
1244 - *,  
1245 - search_depth: int = 1000,  
1246 - rerank_depth: int = 10000,  
1247 - annotate_search_top_k: int = 120,  
1248 - annotate_rerank_top_k: int = 200,  
1249 - language: str = "en",  
1250 - force_refresh_rerank: bool = False,  
1251 - force_refresh_labels: bool = False,  
1252 - ) -> QueryBuildResult:  
1253 - search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language)  
1254 - search_results = list(search_payload.get("results") or [])  
1255 - corpus = self.corpus_docs(refresh=False)  
1256 - full_rerank = self.full_corpus_rerank(  
1257 - query=query,  
1258 - docs=corpus,  
1259 - force_refresh=force_refresh_rerank,  
1260 - )  
1261 - rerank_depth_effective = min(rerank_depth, len(full_rerank))  
1262 -  
1263 - pool_docs: Dict[str, Dict[str, Any]] = {}  
1264 - for doc in search_results[:annotate_search_top_k]:  
1265 - pool_docs[str(doc.get("spu_id"))] = doc  
1266 - for item in full_rerank[:annotate_rerank_top_k]:  
1267 - pool_docs[str(item["spu_id"])] = item["doc"]  
1268 -  
1269 - labels = self.annotate_missing_labels(  
1270 - query=query,  
1271 - docs=list(pool_docs.values()),  
1272 - force_refresh=force_refresh_labels,  
1273 - )  
1274 -  
1275 - search_labeled_results: List[Dict[str, Any]] = []  
1276 - for rank, doc in enumerate(search_results, start=1):  
1277 - spu_id = str(doc.get("spu_id"))  
1278 - label = labels.get(spu_id)  
1279 - search_labeled_results.append(  
1280 - {  
1281 - "rank": rank,  
1282 - "spu_id": spu_id,  
1283 - "title": build_display_title(doc),  
1284 - "image_url": doc.get("image_url"),  
1285 - "rerank_score": None,  
1286 - "label": label,  
1287 - "option_values": list(compact_option_values(doc.get("skus") or [])),  
1288 - "product": compact_product_payload(doc),  
1289 - }  
1290 - )  
1291 -  
1292 - rerank_top_results: List[Dict[str, Any]] = []  
1293 - for rank, item in enumerate(full_rerank[:rerank_depth_effective], start=1):  
1294 - doc = item["doc"]  
1295 - spu_id = str(item["spu_id"])  
1296 - rerank_top_results.append(  
1297 - {  
1298 - "rank": rank,  
1299 - "spu_id": spu_id,  
1300 - "title": build_display_title(doc),  
1301 - "image_url": doc.get("image_url"),  
1302 - "rerank_score": round(float(item["score"]), 8),  
1303 - "label": labels.get(spu_id),  
1304 - "option_values": list(compact_option_values(doc.get("skus") or [])),  
1305 - "product": compact_product_payload(doc),  
1306 - }  
1307 - )  
1308 -  
1309 - top100_labels = [  
1310 - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT  
1311 - for item in search_labeled_results[:100]  
1312 - ]  
1313 - metrics = compute_query_metrics(top100_labels)  
1314 - output_dir = ensure_dir(self.artifact_root / "query_builds")  
1315 - run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"  
1316 - output_json_path = output_dir / f"{run_id}.json"  
1317 - payload = {  
1318 - "run_id": run_id,  
1319 - "created_at": utc_now_iso(),  
1320 - "tenant_id": self.tenant_id,  
1321 - "query": query,  
1322 - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),  
1323 - "search_total": int(search_payload.get("total") or 0),  
1324 - "search_depth_requested": search_depth,  
1325 - "search_depth_effective": len(search_results),  
1326 - "rerank_depth_requested": rerank_depth,  
1327 - "rerank_depth_effective": rerank_depth_effective,  
1328 - "corpus_size": len(corpus),  
1329 - "annotation_pool": {  
1330 - "annotate_search_top_k": annotate_search_top_k,  
1331 - "annotate_rerank_top_k": annotate_rerank_top_k,  
1332 - "pool_size": len(pool_docs),  
1333 - },  
1334 - "labeler_mode": self.labeler_mode,  
1335 - "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,  
1336 - "metrics_top100": metrics,  
1337 - "search_results": search_labeled_results,  
1338 - "full_rerank_top": rerank_top_results,  
1339 - }  
1340 - output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")  
1341 - self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])  
1342 - return QueryBuildResult(  
1343 - query=query,  
1344 - tenant_id=self.tenant_id,  
1345 - search_total=int(search_payload.get("total") or 0),  
1346 - search_depth=len(search_results),  
1347 - rerank_corpus_size=len(corpus),  
1348 - annotated_count=len(pool_docs),  
1349 - output_json_path=output_json_path,  
1350 - )  
1351 -  
1352 - def evaluate_live_query(  
1353 - self,  
1354 - query: str,  
1355 - top_k: int = 100,  
1356 - auto_annotate: bool = False,  
1357 - language: str = "en",  
1358 - force_refresh_labels: bool = False,  
1359 - ) -> Dict[str, Any]:  
1360 - search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)  
1361 - results = list(search_payload.get("results") or [])  
1362 - if auto_annotate:  
1363 - self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)  
1364 - labels = self.store.get_labels(self.tenant_id, query)  
1365 - recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]}  
1366 - labeled = []  
1367 - unlabeled_hits = 0  
1368 - for rank, doc in enumerate(results[:top_k], start=1):  
1369 - spu_id = str(doc.get("spu_id"))  
1370 - label = labels.get(spu_id)  
1371 - if label not in VALID_LABELS:  
1372 - unlabeled_hits += 1  
1373 - labeled.append(  
1374 - {  
1375 - "rank": rank,  
1376 - "spu_id": spu_id,  
1377 - "title": build_display_title(doc),  
1378 - "image_url": doc.get("image_url"),  
1379 - "label": label,  
1380 - "option_values": list(compact_option_values(doc.get("skus") or [])),  
1381 - "product": compact_product_payload(doc),  
1382 - }  
1383 - )  
1384 - metric_labels = [  
1385 - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT  
1386 - for item in labeled  
1387 - ]  
1388 - label_stats = self.store.get_query_label_stats(self.tenant_id, query)  
1389 - rerank_scores = self.store.get_rerank_scores(self.tenant_id, query)  
1390 - relevant_missing_ids = [  
1391 - spu_id  
1392 - for spu_id, label in labels.items()  
1393 - if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids  
1394 - ]  
1395 - missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)  
1396 - missing_relevant = []  
1397 - for spu_id in relevant_missing_ids:  
1398 - doc = missing_docs_map.get(spu_id)  
1399 - if not doc:  
1400 - continue  
1401 - missing_relevant.append(  
1402 - {  
1403 - "spu_id": spu_id,  
1404 - "label": labels[spu_id],  
1405 - "rerank_score": rerank_scores.get(spu_id),  
1406 - "title": build_display_title(doc),  
1407 - "image_url": doc.get("image_url"),  
1408 - "option_values": list(compact_option_values(doc.get("skus") or [])),  
1409 - "product": compact_product_payload(doc),  
1410 - }  
1411 - )  
1412 - label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}  
1413 - missing_relevant.sort(  
1414 - key=lambda item: (  
1415 - label_order.get(str(item.get("label")), 9),  
1416 - -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")),  
1417 - str(item.get("title") or ""),  
1418 - )  
1419 - )  
1420 - tips: List[str] = []  
1421 - if auto_annotate:  
1422 - tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.")  
1423 - else:  
1424 - tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.")  
1425 - if label_stats["total"] == 0:  
1426 - tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.")  
1427 - if unlabeled_hits:  
1428 - tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")  
1429 - if not missing_relevant:  
1430 - tips.append("No cached Exact/Partial products were missed by this recall set.")  
1431 - return {  
1432 - "query": query,  
1433 - "tenant_id": self.tenant_id,  
1434 - "top_k": top_k,  
1435 - "metrics": compute_query_metrics(metric_labels),  
1436 - "results": labeled,  
1437 - "missing_relevant": missing_relevant,  
1438 - "label_stats": {  
1439 - **label_stats,  
1440 - "unlabeled_hits_treated_irrelevant": unlabeled_hits,  
1441 - "recalled_hits": len(labeled),  
1442 - "missing_relevant_count": len(missing_relevant),  
1443 - "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),  
1444 - "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),  
1445 - },  
1446 - "tips": tips,  
1447 - "total": int(search_payload.get("total") or 0),  
1448 - }  
1449 -  
1450 - def batch_evaluate(  
1451 - self,  
1452 - queries: Sequence[str],  
1453 - *,  
1454 - top_k: int = 100,  
1455 - auto_annotate: bool = True,  
1456 - language: str = "en",  
1457 - force_refresh_labels: bool = False,  
1458 - ) -> Dict[str, Any]:  
1459 - per_query = []  
1460 - for query in queries:  
1461 - live = self.evaluate_live_query(  
1462 - query,  
1463 - top_k=top_k,  
1464 - auto_annotate=auto_annotate,  
1465 - language=language,  
1466 - force_refresh_labels=force_refresh_labels,  
1467 - )  
1468 - labels = [  
1469 - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT  
1470 - for item in live["results"]  
1471 - ]  
1472 - per_query.append(  
1473 - {  
1474 - "query": live["query"],  
1475 - "tenant_id": live["tenant_id"],  
1476 - "top_k": live["top_k"],  
1477 - "metrics": live["metrics"],  
1478 - "distribution": label_distribution(labels),  
1479 - "total": live["total"],  
1480 - }  
1481 - )  
1482 - aggregate = aggregate_metrics([item["metrics"] for item in per_query])  
1483 - aggregate_distribution = {  
1484 - RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),  
1485 - RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query),  
1486 - RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),  
1487 - }  
1488 - batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"  
1489 - report_dir = ensure_dir(self.artifact_root / "batch_reports")  
1490 - config_snapshot_path = report_dir / f"{batch_id}_config.json"  
1491 - config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json()  
1492 - config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")  
1493 - output_json_path = report_dir / f"{batch_id}.json"  
1494 - report_md_path = report_dir / f"{batch_id}.md"  
1495 - payload = {  
1496 - "batch_id": batch_id,  
1497 - "created_at": utc_now_iso(),  
1498 - "tenant_id": self.tenant_id,  
1499 - "queries": list(queries),  
1500 - "top_k": top_k,  
1501 - "aggregate_metrics": aggregate,  
1502 - "aggregate_distribution": aggregate_distribution,  
1503 - "per_query": per_query,  
1504 - "config_snapshot_path": str(config_snapshot_path),  
1505 - }  
1506 - output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")  
1507 - report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")  
1508 - self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)  
1509 - return payload  
1510 -  
1511 -  
1512 -def render_batch_report_markdown(payload: Dict[str, Any]) -> str:  
1513 - lines = [  
1514 - "# Search Batch Evaluation",  
1515 - "",  
1516 - f"- Batch ID: {payload['batch_id']}",  
1517 - f"- Created at: {payload['created_at']}",  
1518 - f"- Tenant ID: {payload['tenant_id']}",  
1519 - f"- Query count: {len(payload['queries'])}",  
1520 - f"- Top K: {payload['top_k']}",  
1521 - "",  
1522 - "## Aggregate Metrics",  
1523 - "",  
1524 - ]  
1525 - for key, value in sorted((payload.get("aggregate_metrics") or {}).items()):  
1526 - lines.append(f"- {key}: {value}")  
1527 - distribution = payload.get("aggregate_distribution") or {}  
1528 - if distribution:  
1529 - lines.extend(  
1530 - [  
1531 - "",  
1532 - "## Label Distribution",  
1533 - "",  
1534 - f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}",  
1535 - f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}",  
1536 - f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}",  
1537 - ]  
1538 - )  
1539 - lines.extend(["", "## Per Query", ""])  
1540 - for item in payload.get("per_query") or []:  
1541 - lines.append(f"### {item['query']}")  
1542 - lines.append("")  
1543 - for key, value in sorted((item.get("metrics") or {}).items()):  
1544 - lines.append(f"- {key}: {value}")  
1545 - distribution = item.get("distribution") or {}  
1546 - lines.append(f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}")  
1547 - lines.append(f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}")  
1548 - lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}")  
1549 - lines.append("")  
1550 - return "\n".join(lines)  
1551 -  
1552 -  
1553 -class SearchEvalRequest(BaseModel):  
1554 - query: str  
1555 - top_k: int = Field(default=100, ge=1, le=500)  
1556 - auto_annotate: bool = False  
1557 - language: str = "en"  
1558 -  
1559 -  
1560 -class BatchEvalRequest(BaseModel):  
1561 - queries: Optional[List[str]] = None  
1562 - top_k: int = Field(default=100, ge=1, le=500)  
1563 - auto_annotate: bool = False  
1564 - language: str = "en"  
1565 - force_refresh_labels: bool = False  
1566 -  
1567 -  
1568 -def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFAULT_QUERY_FILE) -> FastAPI:  
1569 - app = FastAPI(title="Search Evaluation UI", version="1.0.0")  
1570 -  
1571 - @app.get("/", response_class=HTMLResponse)  
1572 - def home() -> str:  
1573 - return WEB_APP_HTML  
1574 -  
1575 - @app.get("/api/queries")  
1576 - def api_queries() -> Dict[str, Any]:  
1577 - return {"queries": framework.queries_from_file(query_file)}  
1578 -  
1579 - @app.post("/api/search-eval")  
1580 - def api_search_eval(request: SearchEvalRequest) -> Dict[str, Any]:  
1581 - return framework.evaluate_live_query(  
1582 - query=request.query,  
1583 - top_k=request.top_k,  
1584 - auto_annotate=request.auto_annotate,  
1585 - language=request.language,  
1586 - )  
1587 -  
1588 - @app.post("/api/batch-eval")  
1589 - def api_batch_eval(request: BatchEvalRequest) -> Dict[str, Any]:  
1590 - queries = request.queries or framework.queries_from_file(query_file)  
1591 - if not queries:  
1592 - raise HTTPException(status_code=400, detail="No queries provided")  
1593 - return framework.batch_evaluate(  
1594 - queries=queries,  
1595 - top_k=request.top_k,  
1596 - auto_annotate=request.auto_annotate,  
1597 - language=request.language,  
1598 - force_refresh_labels=request.force_refresh_labels,  
1599 - )  
1600 -  
1601 - @app.get("/api/history")  
1602 - def api_history() -> Dict[str, Any]:  
1603 - return {"history": framework.store.list_batch_runs(limit=20)}  
1604 -  
1605 - @app.get("/api/history/{batch_id}/report")  
1606 - def api_history_report(batch_id: str) -> Dict[str, Any]:  
1607 - row = framework.store.get_batch_run(batch_id)  
1608 - if row is None:  
1609 - raise HTTPException(status_code=404, detail="Unknown batch_id")  
1610 - report_path = Path(row["report_markdown_path"]).resolve()  
1611 - root = framework.artifact_root.resolve()  
1612 - try:  
1613 - report_path.relative_to(root)  
1614 - except ValueError:  
1615 - raise HTTPException(status_code=403, detail="Report path is outside artifact root")  
1616 - if not report_path.is_file():  
1617 - raise HTTPException(status_code=404, detail="Report file not found")  
1618 - return {  
1619 - "batch_id": row["batch_id"],  
1620 - "created_at": row["created_at"],  
1621 - "tenant_id": row["tenant_id"],  
1622 - "report_markdown_path": str(report_path),  
1623 - "markdown": report_path.read_text(encoding="utf-8"),  
1624 - }  
1625 -  
1626 - return app  
1627 -  
1628 -  
1629 -WEB_APP_HTML = """  
1630 -<!doctype html>  
1631 -<html lang="en">  
1632 -<head>  
1633 - <meta charset="utf-8" />  
1634 - <meta name="viewport" content="width=device-width, initial-scale=1" />  
1635 - <title>Search Evaluation</title>  
1636 - <style>  
1637 - :root {  
1638 - --bg: #f5f3ed;  
1639 - --panel: #fffdf8;  
1640 - --ink: #1f2a24;  
1641 - --muted: #6b756e;  
1642 - --line: #ddd4c6;  
1643 - --accent: #0f766e;  
1644 - --exact: #0f766e;  
1645 - --partial: #b7791f;  
1646 - --irrelevant: #b42318;  
1647 - }  
1648 - body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background:  
1649 - radial-gradient(circle at top left, #f0e6d6 0, transparent 28%),  
1650 - linear-gradient(180deg, #f9f6f0 0%, #f0ece3 100%); }  
1651 - .app { display: grid; grid-template-columns: 280px 1fr; min-height: 100vh; }  
1652 - .sidebar { border-right: 1px solid var(--line); padding: 20px; background: rgba(255,255,255,0.55); backdrop-filter: blur(10px); }  
1653 - .main { padding: 24px; }  
1654 - h1, h2 { margin: 0 0 12px; }  
1655 - .muted { color: var(--muted); }  
1656 - .query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; }  
1657 - .query-item {  
1658 - display: block; width: 100%; border: 0; background: transparent; text-align: left;  
1659 - padding: 10px 12px; border-radius: 10px; cursor: pointer;  
1660 - color: var(--ink); font-size: 15px; font-weight: 500;  
1661 - }  
1662 - .query-item:hover { background: #eef6f4; }  
1663 - .toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; }  
1664 - input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; }  
1665 - button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; }  
1666 - button.secondary { background: #d9e6e3; color: #12433d; }  
1667 - .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; }  
1668 - .metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; }  
1669 - .metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; }  
1670 - .metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; }  
1671 - .results { display: grid; gap: 10px; }  
1672 - .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }  
1673 - .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }  
1674 - .Exact { background: var(--exact); }  
1675 - .Partial { background: var(--partial); }  
1676 - .Irrelevant { background: var(--irrelevant); }  
1677 - .Unknown { background: #637381; }  
1678 - .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }  
1679 - .title { font-size: 16px; font-weight: 700; margin-bottom: 8px; }  
1680 - .options { color: var(--muted); line-height: 1.5; font-size: 14px; }  
1681 - .section { margin-bottom: 28px; }  
1682 - .history { font-size: 13px; line-height: 1.5; }  
1683 - .history-list { max-height: 42vh; overflow: auto; display: flex; flex-direction: column; gap: 8px; margin-top: 8px; }  
1684 - .history-item {  
1685 - display: block; width: 100%; border: 1px solid var(--line); background: var(--panel);  
1686 - text-align: left; padding: 10px 12px; border-radius: 12px; cursor: pointer;  
1687 - color: var(--ink); font-size: 13px; transition: background 0.15s, border-color 0.15s, box-shadow 0.15s;  
1688 - }  
1689 - .history-item:hover { background: #eef6f4; border-color: #b8d4cd; }  
1690 - .history-item:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; }  
1691 - .history-item .hid { font-weight: 700; font-size: 12px; word-break: break-all; color: #12433d; }  
1692 - .history-item .hmeta { color: var(--muted); font-size: 11px; margin-top: 4px; }  
1693 - .history-item .hstats { margin-top: 6px; font-size: 12px; color: var(--ink); line-height: 1.45; }  
1694 - .history-item .hstats span { color: var(--muted); }  
1695 - .report-modal-root {  
1696 - position: fixed; inset: 0; z-index: 200; display: none; align-items: center; justify-content: center;  
1697 - padding: 16px; box-sizing: border-box;  
1698 - }  
1699 - .report-modal-root.is-open { display: flex; }  
1700 - .report-modal-backdrop { position: absolute; inset: 0; background: rgba(31, 42, 36, 0.45); backdrop-filter: blur(4px); }  
1701 - .report-modal-dialog {  
1702 - position: relative; z-index: 1; width: min(920px, 100%); max-height: min(92vh, 900px); display: flex; flex-direction: column;  
1703 - background: var(--panel); border: 1px solid var(--line); border-radius: 18px;  
1704 - box-shadow: 0 24px 48px rgba(31, 42, 36, 0.18);  
1705 - }  
1706 - .report-modal-head {  
1707 - flex: 0 0 auto; display: flex; align-items: flex-start; justify-content: space-between; gap: 12px;  
1708 - padding: 16px 18px; border-bottom: 1px solid var(--line);  
1709 - }  
1710 - .report-modal-head h3 { margin: 0; font-size: 15px; font-weight: 700; word-break: break-all; }  
1711 - .report-modal-head .head-actions { display: flex; gap: 8px; flex-shrink: 0; }  
1712 - .report-modal-head button { padding: 8px 12px; font-size: 13px; border-radius: 10px; }  
1713 - .report-modal-meta { flex: 0 0 auto; padding: 10px 18px; font-size: 12px; border-bottom: 1px solid var(--line); background: rgba(255,253,248,0.9); }  
1714 - .report-modal-body {  
1715 - flex: 1 1 auto; overflow: auto; padding: 18px 22px 22px;  
1716 - font-size: 14px; line-height: 1.55;  
1717 - }  
1718 - .batch-report-md h1 { font-size: 1.35rem; margin: 0 0 0.75rem; color: #12433d; }  
1719 - .batch-report-md h2 { font-size: 1.05rem; margin: 1.35rem 0 0.6rem; padding-bottom: 0.35rem; border-bottom: 1px solid var(--line); color: #1a5249; }  
1720 - .batch-report-md h2:first-of-type { margin-top: 0; }  
1721 - .batch-report-md h3 { font-size: 0.95rem; margin: 1rem 0 0.4rem; color: var(--ink); font-weight: 700; }  
1722 - .batch-report-md ul { margin: 0.35rem 0 0.5rem; padding-left: 1.25rem; }  
1723 - .batch-report-md li { margin: 0.2rem 0; }  
1724 - .batch-report-md code { font-size: 0.88em; background: #e8e4d8; padding: 0.12em 0.35em; border-radius: 4px; }  
1725 - .report-modal-body.report-modal-loading, .report-modal-body.report-modal-error { color: var(--muted); font-style: italic; }  
1726 - .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; }  
1727 - .tip { margin-bottom: 6px; color: var(--muted); }  
1728 - </style>  
1729 -</head>  
1730 -<body>  
1731 - <div class="app">  
1732 - <aside class="sidebar">  
1733 - <h2>Queries</h2>  
1734 - <p class="muted">Loaded from <code>scripts/evaluation/queries/queries.txt</code></p>  
1735 - <div id="queryList" class="query-list"></div>  
1736 - <div class="section">  
1737 - <h2>History</h2>  
1738 - <p class="muted" style="font-size:12px;margin:0 0 4px">Click a run to open the batch markdown report.</p>  
1739 - <div id="history" class="history muted">Loading...</div>  
1740 - </div>  
1741 - </aside>  
1742 - <main class="main">  
1743 - <h1>Search Evaluation</h1>  
1744 - <p class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>  
1745 - <div class="toolbar">  
1746 - <input id="queryInput" type="text" placeholder="Search query" />  
1747 - <button onclick="runSingle()">Evaluate Query</button>  
1748 - <button class="secondary" onclick="runBatch()">Batch Evaluation</button>  
1749 - </div>  
1750 - <div id="status" class="muted section"></div>  
1751 - <section class="section">  
1752 - <h2>Metrics</h2>  
1753 - <div id="metrics" class="grid"></div>  
1754 - </section>  
1755 - <section class="section">  
1756 - <h2>Top Results</h2>  
1757 - <div id="results" class="results"></div>  
1758 - </section>  
1759 - <section class="section">  
1760 - <h2>Missed Exact / Partial</h2>  
1761 - <div id="missingRelevant" class="results"></div>  
1762 - </section>  
1763 - <section class="section">  
1764 - <h2>Notes</h2>  
1765 - <div id="tips" class="tips muted"></div>  
1766 - </section>  
1767 - </main>  
1768 - </div>  
1769 - <div id="reportModal" class="report-modal-root" aria-hidden="true">  
1770 - <div class="report-modal-backdrop" data-close-report="1"></div>  
1771 - <div class="report-modal-dialog" role="dialog" aria-modal="true" aria-labelledby="reportModalTitle">  
1772 - <div class="report-modal-head">  
1773 - <h3 id="reportModalTitle">Batch report</h3>  
1774 - <div class="head-actions">  
1775 - <button type="button" class="secondary" id="reportCopyPath">Copy path</button>  
1776 - <button type="button" onclick="closeReportModal()">Close</button>  
1777 - </div>  
1778 - </div>  
1779 - <div id="reportModalMeta" class="report-modal-meta muted"></div>  
1780 - <div id="reportModalBody" class="report-modal-body batch-report-md"></div>  
1781 - </div>  
1782 - </div>  
1783 - <script src="https://cdn.jsdelivr.net/npm/marked@12.0.2/marked.min.js"></script>  
1784 - <script src="https://cdn.jsdelivr.net/npm/dompurify@3.1.6/dist/purify.min.js"></script>  
1785 - <script>  
1786 - async function fetchJSON(url, options) {  
1787 - const res = await fetch(url, options);  
1788 - if (!res.ok) throw new Error(await res.text());  
1789 - return await res.json();  
1790 - }  
1791 - function renderMetrics(metrics) {  
1792 - const root = document.getElementById('metrics');  
1793 - root.innerHTML = '';  
1794 - Object.entries(metrics || {}).forEach(([key, value]) => {  
1795 - const card = document.createElement('div');  
1796 - card.className = 'metric';  
1797 - card.innerHTML = `<div class="label">${key}</div><div class="value">${value}</div>`;  
1798 - root.appendChild(card);  
1799 - });  
1800 - }  
1801 - function renderResults(results, rootId='results', showRank=true) {  
1802 - const mount = document.getElementById(rootId);  
1803 - mount.innerHTML = '';  
1804 - (results || []).forEach(item => {  
1805 - const label = item.label || 'Unknown';  
1806 - const box = document.createElement('div');  
1807 - box.className = 'result';  
1808 - box.innerHTML = `  
1809 - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>  
1810 - <img class="thumb" src="${item.image_url || ''}" alt="" />  
1811 - <div>  
1812 - <div class="title">${item.title || ''}</div>  
1813 - <div class="options">  
1814 - <div>${(item.option_values || [])[0] || ''}</div>  
1815 - <div>${(item.option_values || [])[1] || ''}</div>  
1816 - <div>${(item.option_values || [])[2] || ''}</div>  
1817 - </div>  
1818 - </div>`;  
1819 - mount.appendChild(box);  
1820 - });  
1821 - if (!(results || []).length) {  
1822 - mount.innerHTML = '<div class="muted">None.</div>';  
1823 - }  
1824 - }  
1825 - function renderTips(data) {  
1826 - const root = document.getElementById('tips');  
1827 - const tips = [...(data.tips || [])];  
1828 - const stats = data.label_stats || {};  
1829 - tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`);  
1830 - root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');  
1831 - }  
1832 - async function loadQueries() {  
1833 - const data = await fetchJSON('/api/queries');  
1834 - const root = document.getElementById('queryList');  
1835 - root.innerHTML = '';  
1836 - data.queries.forEach(query => {  
1837 - const btn = document.createElement('button');  
1838 - btn.className = 'query-item';  
1839 - btn.textContent = query;  
1840 - btn.onclick = () => {  
1841 - document.getElementById('queryInput').value = query;  
1842 - runSingle();  
1843 - };  
1844 - root.appendChild(btn);  
1845 - });  
1846 - }  
1847 - function fmtMetric(m, key, digits) {  
1848 - const v = m && m[key];  
1849 - if (v == null || Number.isNaN(Number(v))) return null;  
1850 - const n = Number(v);  
1851 - return n.toFixed(digits);  
1852 - }  
1853 - function historySummaryHtml(meta) {  
1854 - const m = meta && meta.aggregate_metrics;  
1855 - const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;  
1856 - const parts = [];  
1857 - if (nq != null) parts.push(`<span>Queries</span> ${nq}`);  
1858 - const p10 = fmtMetric(m, 'P@10', 3);  
1859 - const p52 = fmtMetric(m, 'P@5_2_3', 3);  
1860 - const map3 = fmtMetric(m, 'MAP_3', 3);  
1861 - if (p10) parts.push(`<span>P@10</span> ${p10}`);  
1862 - if (p52) parts.push(`<span>P@5_2_3</span> ${p52}`);  
1863 - if (map3) parts.push(`<span>MAP_3</span> ${map3}`);  
1864 - if (!parts.length) return '';  
1865 - return `<div class="hstats">${parts.join(' · ')}</div>`;
1866 - }  
1867 - async function loadHistory() {  
1868 - const data = await fetchJSON('/api/history');  
1869 - const root = document.getElementById('history');  
1870 - root.classList.remove('muted');  
1871 - const items = data.history || [];  
1872 - if (!items.length) {  
1873 - root.innerHTML = '<span class="muted">No history yet.</span>';  
1874 - return;  
1875 - }  
1876 - root.innerHTML = `<div class="history-list"></div>`;  
1877 - const list = root.querySelector('.history-list');  
1878 - items.forEach(item => {  
1879 - const btn = document.createElement('button');  
1880 - btn.type = 'button';  
1881 - btn.className = 'history-item';  
1882 - btn.setAttribute('aria-label', `Open report ${item.batch_id}`);  
1883 - const sum = historySummaryHtml(item.metadata);  
1884 - btn.innerHTML = `<div class="hid">${item.batch_id}</div>  
1885 - <div class="hmeta">${item.created_at} · tenant ${item.tenant_id}</div>${sum}`;
1886 - btn.onclick = () => openBatchReport(item.batch_id);  
1887 - list.appendChild(btn);  
1888 - });  
1889 - }  
1890 - let _lastReportPath = '';  
1891 - function closeReportModal() {  
1892 - const el = document.getElementById('reportModal');  
1893 - el.classList.remove('is-open');  
1894 - el.setAttribute('aria-hidden', 'true');  
1895 - document.getElementById('reportModalBody').innerHTML = '';  
1896 - document.getElementById('reportModalMeta').textContent = '';  
1897 - }  
1898 - async function openBatchReport(batchId) {  
1899 - const el = document.getElementById('reportModal');  
1900 - const body = document.getElementById('reportModalBody');  
1901 - const metaEl = document.getElementById('reportModalMeta');  
1902 - const titleEl = document.getElementById('reportModalTitle');  
1903 - el.classList.add('is-open');  
1904 - el.setAttribute('aria-hidden', 'false');  
1905 - titleEl.textContent = batchId;  
1906 - metaEl.textContent = '';  
1907 - body.className = 'report-modal-body batch-report-md report-modal-loading';  
1908 - body.textContent = 'Loading report…';
1909 - try {  
1910 - const rep = await fetchJSON('/api/history/' + encodeURIComponent(batchId) + '/report');  
1911 - _lastReportPath = rep.report_markdown_path || '';  
1912 - metaEl.textContent = rep.report_markdown_path || '';  
1913 - const raw = marked.parse(rep.markdown || '', { gfm: true });  
1914 - const safe = DOMPurify.sanitize(raw, { USE_PROFILES: { html: true } });  
1915 - body.className = 'report-modal-body batch-report-md';  
1916 - body.innerHTML = safe;  
1917 - } catch (e) {  
1918 - body.className = 'report-modal-body report-modal-error';  
1919 - body.textContent = (e && e.message) ? e.message : String(e);  
1920 - }  
1921 - }  
1922 - document.getElementById('reportModal').addEventListener('click', (ev) => {  
1923 - if (ev.target && ev.target.getAttribute('data-close-report') === '1') closeReportModal();  
1924 - });  
1925 - document.addEventListener('keydown', (ev) => {  
1926 - if (ev.key === 'Escape') closeReportModal();  
1927 - });  
1928 - document.getElementById('reportCopyPath').addEventListener('click', async () => {  
1929 - if (!_lastReportPath) return;  
1930 - try {  
1931 - await navigator.clipboard.writeText(_lastReportPath);  
1932 - } catch (_) {}  
1933 - });  
1934 - async function runSingle() {  
1935 - const query = document.getElementById('queryInput').value.trim();  
1936 - if (!query) return;  
1937 - document.getElementById('status').textContent = `Evaluating "${query}"...`;  
1938 - const data = await fetchJSON('/api/search-eval', {  
1939 - method: 'POST',  
1940 - headers: {'Content-Type': 'application/json'},  
1941 - body: JSON.stringify({query, top_k: 100, auto_annotate: false})  
1942 - });  
1943 - document.getElementById('status').textContent = `Done. total=${data.total}`;  
1944 - renderMetrics(data.metrics);  
1945 - renderResults(data.results, 'results', true);  
1946 - renderResults(data.missing_relevant, 'missingRelevant', false);  
1947 - renderTips(data);  
1948 - loadHistory();  
1949 - }  
1950 - async function runBatch() {  
1951 - document.getElementById('status').textContent = 'Running batch evaluation...';  
1952 - const data = await fetchJSON('/api/batch-eval', {  
1953 - method: 'POST',  
1954 - headers: {'Content-Type': 'application/json'},  
1955 - body: JSON.stringify({top_k: 100, auto_annotate: false})  
1956 - });  
1957 - document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`;  
1958 - renderMetrics(data.aggregate_metrics);  
1959 - renderResults([], 'results', true);  
1960 - renderResults([], 'missingRelevant', false);  
1961 - document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>';  
1962 - loadHistory();  
1963 - }  
1964 - loadQueries();  
1965 - loadHistory();  
1966 - </script>  
1967 -</body>  
1968 -</html>  
1969 -"""  
1970 -  
1971 -  
1972 -def build_cli_parser() -> argparse.ArgumentParser:  
1973 - parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")  
1974 - sub = parser.add_subparsers(dest="command", required=True)  
1975 -  
1976 - build = sub.add_parser("build", help="Build pooled annotation set for queries")  
1977 - build.add_argument("--tenant-id", default="163")  
1978 - build.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))  
1979 - build.add_argument("--search-depth", type=int, default=1000)  
1980 - build.add_argument("--rerank-depth", type=int, default=10000)  
1981 - build.add_argument("--annotate-search-top-k", type=int, default=120)  
1982 - build.add_argument("--annotate-rerank-top-k", type=int, default=200)  
1983 - build.add_argument("--language", default="en")  
1984 - build.add_argument("--force-refresh-rerank", action="store_true")  
1985 - build.add_argument("--force-refresh-labels", action="store_true")  
1986 - build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])  
1987 -  
1988 - batch = sub.add_parser("batch", help="Run batch evaluation against live search")  
1989 - batch.add_argument("--tenant-id", default="163")  
1990 - batch.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))  
1991 - batch.add_argument("--top-k", type=int, default=100)  
1992 - batch.add_argument("--language", default="en")  
1993 - batch.add_argument("--force-refresh-labels", action="store_true")  
1994 - batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])  
1995 -  
1996 - audit = sub.add_parser("audit", help="Audit annotation quality for queries")  
1997 - audit.add_argument("--tenant-id", default="163")  
1998 - audit.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))  
1999 - audit.add_argument("--top-k", type=int, default=100)  
2000 - audit.add_argument("--language", default="en")  
2001 - audit.add_argument("--limit-suspicious", type=int, default=5)  
2002 - audit.add_argument("--force-refresh-labels", action="store_true")  
2003 - audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])  
2004 -  
2005 - serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")  
2006 - serve.add_argument("--tenant-id", default="163")  
2007 - serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))  
2008 - serve.add_argument("--host", default="0.0.0.0")  
2009 - serve.add_argument("--port", type=int, default=6010)  
2010 - serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])  
2011 -  
2012 - return parser  
2013 -  
2014 -  
2015 -def run_build(args: argparse.Namespace) -> None:  
2016 - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)  
2017 - queries = framework.queries_from_file(Path(args.queries_file))  
2018 - summary = []  
2019 - for query in queries:  
2020 - result = framework.build_query_annotation_set(  
2021 - query=query,  
2022 - search_depth=args.search_depth,  
2023 - rerank_depth=args.rerank_depth,  
2024 - annotate_search_top_k=args.annotate_search_top_k,  
2025 - annotate_rerank_top_k=args.annotate_rerank_top_k,  
2026 - language=args.language,  
2027 - force_refresh_rerank=args.force_refresh_rerank,  
2028 - force_refresh_labels=args.force_refresh_labels,  
2029 - )  
2030 - summary.append(  
2031 - {  
2032 - "query": result.query,  
2033 - "search_total": result.search_total,  
2034 - "search_depth": result.search_depth,  
2035 - "rerank_corpus_size": result.rerank_corpus_size,  
2036 - "annotated_count": result.annotated_count,  
2037 - "output_json_path": str(result.output_json_path),  
2038 - }  
2039 - )  
2040 - print(  
2041 - f"[build] query={result.query!r} search_total={result.search_total} "  
2042 - f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "  
2043 - f"annotated={result.annotated_count} output={result.output_json_path}"  
2044 - )  
2045 - out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"  
2046 - out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")  
2047 - print(f"[done] summary={out_path}")  
2048 -  
2049 -  
2050 -def run_batch(args: argparse.Namespace) -> None:  
2051 - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)  
2052 - queries = framework.queries_from_file(Path(args.queries_file))  
2053 - payload = framework.batch_evaluate(  
2054 - queries=queries,  
2055 - top_k=args.top_k,  
2056 - auto_annotate=True,  
2057 - language=args.language,  
2058 - force_refresh_labels=args.force_refresh_labels,  
2059 - )  
2060 - print(f"[done] batch_id={payload['batch_id']} aggregate_metrics={payload['aggregate_metrics']}")  
2061 -  
2062 -  
2063 -def run_audit(args: argparse.Namespace) -> None:  
2064 - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)  
2065 - queries = framework.queries_from_file(Path(args.queries_file))  
2066 - audit_items = []  
2067 - for query in queries:  
2068 - item = framework.audit_live_query(  
2069 - query=query,  
2070 - top_k=args.top_k,  
2071 - language=args.language,  
2072 - auto_annotate=not args.force_refresh_labels,  
2073 - )  
2074 - if args.force_refresh_labels:  
2075 - live_payload = framework.search_client.search(query=query, size=max(args.top_k, 100), from_=0, language=args.language)  
2076 - framework.annotate_missing_labels(  
2077 - query=query,  
2078 - docs=list(live_payload.get("results") or [])[: args.top_k],  
2079 - force_refresh=True,  
2080 - )  
2081 - item = framework.audit_live_query(  
2082 - query=query,  
2083 - top_k=args.top_k,  
2084 - language=args.language,  
2085 - auto_annotate=False,  
2086 - )  
2087 - audit_items.append(  
2088 - {  
2089 - "query": query,  
2090 - "metrics": item["metrics"],  
2091 - "distribution": item["distribution"],  
2092 - "suspicious_count": len(item["suspicious"]),  
2093 - "suspicious_examples": item["suspicious"][: args.limit_suspicious],  
2094 - }  
2095 - )  
2096 - print(  
2097 - f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}"  
2098 - )  
2099 -  
2100 - summary = {  
2101 - "created_at": utc_now_iso(),  
2102 - "tenant_id": args.tenant_id,  
2103 - "top_k": args.top_k,  
2104 - "query_count": len(queries),  
2105 - "total_suspicious": sum(item["suspicious_count"] for item in audit_items),  
2106 - "queries": audit_items,  
2107 - }  
2108 - out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"  
2109 - out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")  
2110 - print(f"[done] audit={out_path}")  
2111 -  
2112 -  
2113 -def run_serve(args: argparse.Namespace) -> None:  
2114 - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)  
2115 - app = create_web_app(framework, Path(args.queries_file))  
2116 - import uvicorn  
2117 -  
2118 - uvicorn.run(app, host=args.host, port=args.port, log_level="info")  
2119 -  
2120 -  
2121 -def main() -> None:  
2122 - parser = build_cli_parser()  
2123 - args = parser.parse_args()  
2124 - if args.command == "build":  
2125 - run_build(args)  
2126 - return  
2127 - if args.command == "batch":  
2128 - run_batch(args)  
2129 - return  
2130 - if args.command == "audit":  
2131 - run_audit(args)  
2132 - return  
2133 - if args.command == "serve":  
2134 - run_serve(args)  
2135 - return  
2136 - raise SystemExit(f"unknown command: {args.command}")  
2137 -  
2138 -  
2139 -if __name__ == "__main__":  
2140 - main()  
scripts/evaluation/eval_framework/__init__.py 0 → 100644
@@ -0,0 +1,59 @@
  1 +"""
  2 +Search evaluation framework: pooled relevance annotation, live metrics, batch reports.
  3 +
  4 +Importing this package ensures the project root is on ``sys.path`` (for ``api.*`` imports).
  5 +"""
  6 +
  7 +from __future__ import annotations
  8 +
  9 +from .utils import ensure_project_on_path
  10 +
  11 +ensure_project_on_path()
  12 +
  13 +from .constants import ( # noqa: E402
  14 + DEFAULT_ARTIFACT_ROOT,
  15 + DEFAULT_LABELER_MODE,
  16 + DEFAULT_QUERY_FILE,
  17 + JUDGE_PROMPT_VERSION_COMPLEX,
  18 + JUDGE_PROMPT_VERSION_SIMPLE,
  19 + PROJECT_ROOT,
  20 + RELEVANCE_EXACT,
  21 + RELEVANCE_IRRELEVANT,
  22 + RELEVANCE_PARTIAL,
  23 + VALID_LABELS,
  24 +)
  25 +from .framework import SearchEvaluationFramework # noqa: E402
  26 +from .store import EvalStore, QueryBuildResult # noqa: E402
  27 +from .cli import build_cli_parser, main # noqa: E402
  28 +from .web_app import create_web_app # noqa: E402
  29 +from .reports import render_batch_report_markdown # noqa: E402
  30 +from .utils import ( # noqa: E402
  31 + ensure_dir,
  32 + sha1_text,
  33 + utc_now_iso,
  34 + utc_timestamp,
  35 +)
  36 +
  37 +__all__ = [
  38 + "DEFAULT_ARTIFACT_ROOT",
  39 + "DEFAULT_LABELER_MODE",
  40 + "DEFAULT_QUERY_FILE",
  41 + "EvalStore",
  42 + "JUDGE_PROMPT_VERSION_COMPLEX",
  43 + "JUDGE_PROMPT_VERSION_SIMPLE",
  44 + "PROJECT_ROOT",
  45 + "QueryBuildResult",
  46 + "RELEVANCE_EXACT",
  47 + "RELEVANCE_IRRELEVANT",
  48 + "RELEVANCE_PARTIAL",
  49 + "SearchEvaluationFramework",
  50 + "VALID_LABELS",
  51 + "build_cli_parser",
  52 + "create_web_app",
  53 + "ensure_dir",
  54 + "main",
  55 + "render_batch_report_markdown",
  56 + "sha1_text",
  57 + "utc_now_iso",
  58 + "utc_timestamp",
  59 +]
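A minimal usage sketch of the re-exported package surface (illustrative only: it assumes `scripts/evaluation` is importable, that the search backend, rerank service, and DashScope credentials the framework reads from the application config are available, and that the default query file exists):

    from eval_framework import DEFAULT_QUERY_FILE, SearchEvaluationFramework

    # Hypothetical example; tenant "163" mirrors the CLI default.
    framework = SearchEvaluationFramework(tenant_id="163", labeler_mode="simple")
    queries = framework.queries_from_file(DEFAULT_QUERY_FILE)
    # Evaluate the live top-100 for one query using cached labels only.
    result = framework.evaluate_live_query(queries[0], top_k=100, auto_annotate=False)
    print(result["metrics"], result["label_stats"])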
scripts/evaluation/eval_framework/__main__.py 0 → 100644
@@ -0,0 +1,4 @@
  1 +from .cli import main
  2 +
  3 +if __name__ == "__main__":
  4 + main()
scripts/evaluation/eval_framework/api_models.py 0 → 100644
@@ -0,0 +1,22 @@
  1 +"""Pydantic request bodies for the evaluation FastAPI app."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from typing import List, Optional
  6 +
  7 +from pydantic import BaseModel, Field
  8 +
  9 +
  10 +class SearchEvalRequest(BaseModel):
  11 + query: str
  12 + top_k: int = Field(default=100, ge=1, le=500)
  13 + auto_annotate: bool = False
  14 + language: str = "en"
  15 +
  16 +
  17 +class BatchEvalRequest(BaseModel):
  18 + queries: Optional[List[str]] = None
  19 + top_k: int = Field(default=100, ge=1, le=500)
  20 + auto_annotate: bool = False
  21 + language: str = "en"
  22 + force_refresh_labels: bool = False
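These models mirror the JSON bodies the web UI posts to `/api/search-eval` and `/api/batch-eval`. A sketch of constructing them directly (field values are illustrative):

    from eval_framework.api_models import BatchEvalRequest, SearchEvalRequest

    # Hypothetical payloads using the defaults declared above.
    single = SearchEvalRequest(query="linen shirt", top_k=100, auto_annotate=False)
    batch = BatchEvalRequest(queries=None, top_k=100, force_refresh_labels=False)
    print(single.query, batch.top_k)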
scripts/evaluation/eval_framework/cli.py 0 → 100644
@@ -0,0 +1,179 @@
  1 +"""CLI: build annotations, batch eval, audit, serve web UI."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import argparse
  6 +import json
  7 +from pathlib import Path
  8 +
  9 +from .constants import DEFAULT_LABELER_MODE, DEFAULT_QUERY_FILE
  10 +from .framework import SearchEvaluationFramework
  11 +from .utils import ensure_dir, utc_now_iso, utc_timestamp
  12 +from .web_app import create_web_app
  13 +
  14 +
  15 +def build_cli_parser() -> argparse.ArgumentParser:
  16 + parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
  17 + sub = parser.add_subparsers(dest="command", required=True)
  18 +
  19 + build = sub.add_parser("build", help="Build pooled annotation set for queries")
  20 + build.add_argument("--tenant-id", default="163")
  21 + build.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
  22 + build.add_argument("--search-depth", type=int, default=1000)
  23 + build.add_argument("--rerank-depth", type=int, default=10000)
  24 + build.add_argument("--annotate-search-top-k", type=int, default=120)
  25 + build.add_argument("--annotate-rerank-top-k", type=int, default=200)
  26 + build.add_argument("--language", default="en")
  27 + build.add_argument("--force-refresh-rerank", action="store_true")
  28 + build.add_argument("--force-refresh-labels", action="store_true")
  29 + build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
  30 +
  31 + batch = sub.add_parser("batch", help="Run batch evaluation against live search")
  32 + batch.add_argument("--tenant-id", default="163")
  33 + batch.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
  34 + batch.add_argument("--top-k", type=int, default=100)
  35 + batch.add_argument("--language", default="en")
  36 + batch.add_argument("--force-refresh-labels", action="store_true")
  37 + batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
  38 +
  39 + audit = sub.add_parser("audit", help="Audit annotation quality for queries")
  40 + audit.add_argument("--tenant-id", default="163")
  41 + audit.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
  42 + audit.add_argument("--top-k", type=int, default=100)
  43 + audit.add_argument("--language", default="en")
  44 + audit.add_argument("--limit-suspicious", type=int, default=5)
  45 + audit.add_argument("--force-refresh-labels", action="store_true")
  46 + audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
  47 +
  48 + serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
  49 + serve.add_argument("--tenant-id", default="163")
  50 + serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
  51 + serve.add_argument("--host", default="0.0.0.0")
  52 + serve.add_argument("--port", type=int, default=6010)
  53 + serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
  54 +
  55 + return parser
  56 +
  57 +
  58 +def run_build(args: argparse.Namespace) -> None:
  59 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
  60 + queries = framework.queries_from_file(Path(args.queries_file))
  61 + summary = []
  62 + for query in queries:
  63 + result = framework.build_query_annotation_set(
  64 + query=query,
  65 + search_depth=args.search_depth,
  66 + rerank_depth=args.rerank_depth,
  67 + annotate_search_top_k=args.annotate_search_top_k,
  68 + annotate_rerank_top_k=args.annotate_rerank_top_k,
  69 + language=args.language,
  70 + force_refresh_rerank=args.force_refresh_rerank,
  71 + force_refresh_labels=args.force_refresh_labels,
  72 + )
  73 + summary.append(
  74 + {
  75 + "query": result.query,
  76 + "search_total": result.search_total,
  77 + "search_depth": result.search_depth,
  78 + "rerank_corpus_size": result.rerank_corpus_size,
  79 + "annotated_count": result.annotated_count,
  80 + "output_json_path": str(result.output_json_path),
  81 + }
  82 + )
  83 + print(
  84 + f"[build] query={result.query!r} search_total={result.search_total} "
  85 + f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
  86 + f"annotated={result.annotated_count} output={result.output_json_path}"
  87 + )
  88 + out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
  89 + out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
  90 + print(f"[done] summary={out_path}")
  91 +
  92 +
  93 +def run_batch(args: argparse.Namespace) -> None:
  94 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
  95 + queries = framework.queries_from_file(Path(args.queries_file))
  96 + payload = framework.batch_evaluate(
  97 + queries=queries,
  98 + top_k=args.top_k,
  99 + auto_annotate=True,
  100 + language=args.language,
  101 + force_refresh_labels=args.force_refresh_labels,
  102 + )
  103 + print(f"[done] batch_id={payload['batch_id']} aggregate_metrics={payload['aggregate_metrics']}")
  104 +
  105 +
  106 +def run_audit(args: argparse.Namespace) -> None:
  107 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
  108 + queries = framework.queries_from_file(Path(args.queries_file))
  109 + audit_items = []
  110 + for query in queries:
  111 + item = framework.audit_live_query(
  112 + query=query,
  113 + top_k=args.top_k,
  114 + language=args.language,
  115 + auto_annotate=not args.force_refresh_labels,
  116 + )
  117 + if args.force_refresh_labels:
  118 + live_payload = framework.search_client.search(query=query, size=max(args.top_k, 100), from_=0, language=args.language)
  119 + framework.annotate_missing_labels(
  120 + query=query,
  121 + docs=list(live_payload.get("results") or [])[: args.top_k],
  122 + force_refresh=True,
  123 + )
  124 + item = framework.audit_live_query(
  125 + query=query,
  126 + top_k=args.top_k,
  127 + language=args.language,
  128 + auto_annotate=False,
  129 + )
  130 + audit_items.append(
  131 + {
  132 + "query": query,
  133 + "metrics": item["metrics"],
  134 + "distribution": item["distribution"],
  135 + "suspicious_count": len(item["suspicious"]),
  136 + "suspicious_examples": item["suspicious"][: args.limit_suspicious],
  137 + }
  138 + )
  139 + print(
  140 + f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}"
  141 + )
  142 +
  143 + summary = {
  144 + "created_at": utc_now_iso(),
  145 + "tenant_id": args.tenant_id,
  146 + "top_k": args.top_k,
  147 + "query_count": len(queries),
  148 + "total_suspicious": sum(item["suspicious_count"] for item in audit_items),
  149 + "queries": audit_items,
  150 + }
  151 + out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"
  152 + out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
  153 + print(f"[done] audit={out_path}")
  154 +
  155 +
  156 +def run_serve(args: argparse.Namespace) -> None:
  157 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
  158 + app = create_web_app(framework, Path(args.queries_file))
  159 + import uvicorn
  160 +
  161 + uvicorn.run(app, host=args.host, port=args.port, log_level="info")
  162 +
  163 +
  164 +def main() -> None:
  165 + parser = build_cli_parser()
  166 + args = parser.parse_args()
  167 + if args.command == "build":
  168 + run_build(args)
  169 + return
  170 + if args.command == "batch":
  171 + run_batch(args)
  172 + return
  173 + if args.command == "audit":
  174 + run_audit(args)
  175 + return
  176 + if args.command == "serve":
  177 + run_serve(args)
  178 + return
  179 + raise SystemExit(f"unknown command: {args.command}")
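Each subcommand maps onto one of the `run_*` helpers, so the CLI can also be driven programmatically. A sketch equivalent to `python -m eval_framework batch` with its default options (the argument values shown are the parser defaults and purely illustrative):

    from eval_framework.cli import build_cli_parser, run_batch

    # Hypothetical invocation; same effect as the `batch` subcommand with defaults.
    args = build_cli_parser().parse_args(["batch", "--tenant-id", "163", "--top-k", "100"])
    run_batch(args)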
scripts/evaluation/eval_framework/clients.py 0 → 100644
@@ -0,0 +1,149 @@
  1 +"""HTTP clients for search API, reranker, and DashScope chat (relevance labeling)."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from typing import Any, Dict, List, Optional, Sequence, Tuple
  6 +
  7 +import requests
  8 +
  9 +from .constants import VALID_LABELS
  10 +from .prompts import (
  11 + classify_batch_complex_prompt,
  12 + classify_batch_simple_prompt,
  13 + extract_query_profile_prompt,
  14 +)
  15 +from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
  16 +
  17 +
  18 +class SearchServiceClient:
  19 + def __init__(self, base_url: str, tenant_id: str):
  20 + self.base_url = base_url.rstrip("/")
  21 + self.tenant_id = str(tenant_id)
  22 + self.session = requests.Session()
  23 +
  24 + def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]:
  25 + response = self.session.post(
  26 + f"{self.base_url}/search/",
  27 + headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},
  28 + json={"query": query, "size": size, "from": from_, "language": language},
  29 + timeout=120,
  30 + )
  31 + response.raise_for_status()
  32 + return response.json()
  33 +
  34 +
  35 +class RerankServiceClient:
  36 + def __init__(self, service_url: str):
  37 + self.service_url = service_url.rstrip("/")
  38 + self.session = requests.Session()
  39 +
  40 + def rerank(self, query: str, docs: Sequence[str], normalize: bool = False, top_n: Optional[int] = None) -> Tuple[List[float], Dict[str, Any]]:
  41 + payload: Dict[str, Any] = {
  42 + "query": query,
  43 + "docs": list(docs),
  44 + "normalize": normalize,
  45 + }
  46 + if top_n is not None:
  47 + payload["top_n"] = int(top_n)
  48 + response = self.session.post(self.service_url, json=payload, timeout=180)
  49 + response.raise_for_status()
  50 + data = response.json()
  51 + return list(data.get("scores") or []), dict(data.get("meta") or {})
  52 +
  53 +
  54 +class DashScopeLabelClient:
  55 + def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40):
  56 + self.model = model
  57 + self.base_url = base_url.rstrip("/")
  58 + self.api_key = api_key
  59 + self.batch_size = int(batch_size)
  60 + self.session = requests.Session()
  61 +
  62 + def _chat(self, prompt: str) -> Tuple[str, str]:
  63 + response = self.session.post(
  64 + f"{self.base_url}/chat/completions",
  65 + headers={
  66 + "Authorization": f"Bearer {self.api_key}",
  67 + "Content-Type": "application/json",
  68 + },
  69 + json={
  70 + "model": self.model,
  71 + "messages": [{"role": "user", "content": prompt}],
  72 + "temperature": 0,
  73 + "top_p": 0.1,
  74 + },
  75 + timeout=180,
  76 + )
  77 + response.raise_for_status()
  78 + data = response.json()
  79 + content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
  80 + return content, safe_json_dumps(data)
  81 +
  82 + def classify_batch_simple(
  83 + self,
  84 + query: str,
  85 + docs: Sequence[Dict[str, Any]],
  86 + ) -> Tuple[List[str], str]:
  87 + numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
  88 + prompt = classify_batch_simple_prompt(query, numbered_docs)
  89 + content, raw_response = self._chat(prompt)
  90 + labels = []
  91 + for line in str(content or "").splitlines():
  92 + label = line.strip()
  93 + if label in VALID_LABELS:
  94 + labels.append(label)
  95 + if len(labels) != len(docs):
  96 + payload = extract_json_blob(content)
  97 + if isinstance(payload, dict) and isinstance(payload.get("labels"), list):
  98 + labels = []
  99 + for item in payload["labels"][: len(docs)]:
  100 + if isinstance(item, dict):
  101 + label = str(item.get("label") or "").strip()
  102 + else:
  103 + label = str(item).strip()
  104 + if label in VALID_LABELS:
  105 + labels.append(label)
  106 + if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
  107 + raise ValueError(f"unexpected simple label output: {content!r}")
  108 + return labels, raw_response
  109 +
  110 + def extract_query_profile(
  111 + self,
  112 + query: str,
  113 + parser_hints: Dict[str, Any],
  114 + ) -> Tuple[Dict[str, Any], str]:
  115 + prompt = extract_query_profile_prompt(query, parser_hints)
  116 + content, raw_response = self._chat(prompt)
  117 + payload = extract_json_blob(content)
  118 + if not isinstance(payload, dict):
  119 + raise ValueError(f"unexpected query profile payload: {content!r}")
  120 + payload.setdefault("normalized_query_en", query)
  121 + payload.setdefault("primary_category", "")
  122 + payload.setdefault("allowed_categories", [])
  123 + payload.setdefault("required_attributes", [])
  124 + payload.setdefault("notes", [])
  125 + return payload, raw_response
  126 +
  127 + def classify_batch_complex(
  128 + self,
  129 + query: str,
  130 + query_profile: Dict[str, Any],
  131 + docs: Sequence[Dict[str, Any]],
  132 + ) -> Tuple[List[str], str]:
  133 + numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
  134 + prompt = classify_batch_complex_prompt(query, query_profile, numbered_docs)
  135 + content, raw_response = self._chat(prompt)
  136 + payload = extract_json_blob(content)
  137 + if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):
  138 + raise ValueError(f"unexpected label payload: {content!r}")
  139 + labels_payload = payload["labels"]
  140 + labels: List[str] = []
  141 + for item in labels_payload[: len(docs)]:
  142 + if not isinstance(item, dict):
  143 + continue
  144 + label = str(item.get("label") or "").strip()
  145 + if label in VALID_LABELS:
  146 + labels.append(label)
  147 + if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
  148 + raise ValueError(f"unexpected label output: {content!r}")
  149 + return labels, raw_response
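A sketch of the call shapes these clients expose (base URLs here are placeholders; in normal use `SearchEvaluationFramework` wires them from the application config, and `DashScopeLabelClient` additionally requires an API key):

    from eval_framework.clients import RerankServiceClient, SearchServiceClient

    # Placeholder endpoints for illustration only.
    search = SearchServiceClient("http://localhost:6002", tenant_id="163")
    page = search.search(query="linen shirt", size=100, from_=0, language="en")

    rerank = RerankServiceClient("http://localhost:7001/rerank")
    scores, meta = rerank.rerank(query="linen shirt", docs=["doc one", "doc two"], top_n=2)
    print(int(page.get("total") or 0), scores)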
scripts/evaluation/eval_framework/constants.py 0 → 100644
@@ -0,0 +1,19 @@
  1 +"""Paths and shared constants for search evaluation."""
  2 +
  3 +from pathlib import Path
  4 +
  5 +_PKG_DIR = Path(__file__).resolve().parent
  6 +_SCRIPTS_EVAL_DIR = _PKG_DIR.parent
  7 +PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
  8 +
  9 +RELEVANCE_EXACT = "Exact"
  10 +RELEVANCE_PARTIAL = "Partial"
  11 +RELEVANCE_IRRELEVANT = "Irrelevant"
  12 +VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}
  13 +
  14 +DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
  15 +DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
  16 +
  17 +JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331"
  18 +JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331"
  19 +DEFAULT_LABELER_MODE = "simple"
scripts/evaluation/eval_framework/framework.py 0 → 100644
@@ -0,0 +1,719 @@
  1 +"""Core orchestration: corpus, rerank, LLM labels, live/batch evaluation."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import json
  6 +import time
  7 +from pathlib import Path
  8 +from typing import Any, Dict, List, Sequence, Tuple
  9 +
  10 +import requests
  11 +from elasticsearch.helpers import scan
  12 +
  13 +from api.app import get_app_config, get_es_client, get_query_parser, init_service
  14 +from indexer.mapping_generator import get_tenant_index_name
  15 +
  16 +from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient
  17 +from .constants import (
  18 + DEFAULT_ARTIFACT_ROOT,
  19 + DEFAULT_LABELER_MODE,
  20 + JUDGE_PROMPT_VERSION_COMPLEX,
  21 + RELEVANCE_EXACT,
  22 + RELEVANCE_IRRELEVANT,
  23 + RELEVANCE_PARTIAL,
  24 + VALID_LABELS,
  25 +)
  26 +from .metrics import aggregate_metrics, compute_query_metrics, label_distribution
  27 +from .reports import render_batch_report_markdown
  28 +from .store import EvalStore, QueryBuildResult
  29 +from .utils import (
  30 + build_display_title,
  31 + build_rerank_doc,
  32 + compact_option_values,
  33 + compact_product_payload,
  34 + ensure_dir,
  35 + normalize_text,
  36 + pick_text,
  37 + sha1_text,
  38 + utc_now_iso,
  39 + utc_timestamp,
  40 +)
  41 +
  42 +
  43 +class SearchEvaluationFramework:
  44 + def __init__(
  45 + self,
  46 + tenant_id: str,
  47 + artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
  48 + search_base_url: str = "http://localhost:6002",
  49 + labeler_mode: str = DEFAULT_LABELER_MODE,
  50 + ):
  51 + init_service(get_app_config().infrastructure.elasticsearch.host)
  52 + self.tenant_id = str(tenant_id)
  53 + self.artifact_root = ensure_dir(artifact_root)
  54 + self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
  55 + self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
  56 + self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
  57 + app_cfg = get_app_config()
  58 + rerank_service_url = str(
  59 + app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
  60 + )
  61 + self.rerank_client = RerankServiceClient(rerank_service_url)
  62 + llm_cfg = app_cfg.services.translation.capabilities["llm"]
  63 + api_key = app_cfg.infrastructure.secrets.dashscope_api_key
  64 + if not api_key:
  65 + raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
  66 + self.label_client = DashScopeLabelClient(
  67 + model=str(llm_cfg["model"]),
  68 + base_url=str(llm_cfg["base_url"]),
  69 + api_key=str(api_key),
  70 + )
  71 + self.query_parser = None
  72 +
  73 + def _get_query_parser(self):
  74 + if self.query_parser is None:
  75 + self.query_parser = get_query_parser()
  76 + return self.query_parser
  77 +
  78 + def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
  79 + parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"])
  80 + payload = parsed.to_dict()
  81 + payload["text_for_rerank"] = parsed.text_for_rerank()
  82 + return payload
  83 +
  84 + def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
  85 + if self.labeler_mode != "complex":
  86 + raise RuntimeError("query profiles are only used in complex labeler mode")
  87 + if not force_refresh:
  88 + cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)
  89 + if cached is not None:
  90 + return cached
  91 + parser_hints = self.build_query_parser_hints(query)
  92 + profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)
  93 + profile["parser_hints"] = parser_hints
  94 + self.store.upsert_query_profile(
  95 + self.tenant_id,
  96 + query,
  97 + JUDGE_PROMPT_VERSION_COMPLEX,
  98 + self.label_client.model,
  99 + profile,
  100 + raw_response,
  101 + )
  102 + return profile
  103 +
  104 + @staticmethod
  105 + def _doc_evidence_text(doc: Dict[str, Any]) -> str:
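  + # Flattens title, vendor, category, SKU option values, and tags into one normalized string; the guardrail and audit checks look for required/conflicting terms in this text.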
  106 + pieces: List[str] = [
  107 + build_display_title(doc),
  108 + pick_text(doc.get("vendor"), "en"),
  109 + pick_text(doc.get("category_path"), "en"),
  110 + pick_text(doc.get("category_name"), "en"),
  111 + ]
  112 + for sku in doc.get("skus") or []:
  113 + pieces.extend(
  114 + [
  115 + str(sku.get("option1_value") or ""),
  116 + str(sku.get("option2_value") or ""),
  117 + str(sku.get("option3_value") or ""),
  118 + ]
  119 + )
  120 + for tag in doc.get("tags") or []:
  121 + pieces.append(str(tag))
  122 + return normalize_text(" | ".join(piece for piece in pieces if piece))
  123 +
  124 + def _apply_rule_based_label_guardrails(
  125 + self,
  126 + label: str,
  127 + query_profile: Dict[str, Any],
  128 + doc: Dict[str, Any],
  129 + ) -> str:
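  + # Rule-based post-check on the LLM label: downgrade Exact to Partial when category or required-attribute evidence is missing, and force Irrelevant on explicit conflicts.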
  130 + if label not in VALID_LABELS:
  131 + return label
  132 + evidence = self._doc_evidence_text(doc)
  133 + category = normalize_text(query_profile.get("primary_category"))
  134 + allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]
  135 +
  136 + primary_category_match = True
  137 + if category:
  138 + primary_category_match = category in evidence
  139 + allowed_category_match = True
  140 + if allowed_categories:
  141 + allowed_category_match = any(signal in evidence for signal in allowed_categories)
  142 +
  143 + if label == RELEVANCE_EXACT and not primary_category_match:
  144 + if allowed_category_match:
  145 + label = RELEVANCE_PARTIAL
  146 + else:
  147 + return RELEVANCE_IRRELEVANT
  148 +
  149 + for attr in query_profile.get("required_attributes") or []:
  150 + if not isinstance(attr, dict):
  151 + continue
  152 + attr_name = normalize_text(attr.get("name"))
  153 + if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:
  154 + continue
  155 + required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
  156 + conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
  157 + if attr_name == "fit":
  158 + if any(term in {"oversized", "oversize"} for term in required_terms):
  159 + conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
  160 + if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):
  161 + conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])
  162 + has_required = any(term in evidence for term in required_terms) if required_terms else True
  163 + has_conflict = any(term in evidence for term in conflicting_terms)
  164 +
  165 + if has_conflict:
  166 + return RELEVANCE_IRRELEVANT
  167 + if label == RELEVANCE_EXACT and not has_required:
  168 + label = RELEVANCE_PARTIAL
  169 +
  170 + if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
  171 + return RELEVANCE_IRRELEVANT
  172 +
  173 + return label
  174 +
  175 + @staticmethod
  176 + def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]:
  177 + option_values = list(item.get("option_values") or [])
  178 + while len(option_values) < 3:
  179 + option_values.append("")
  180 + product = dict(item.get("product") or {})
  181 + return {
  182 + "spu_id": item.get("spu_id"),
  183 + "title": product.get("title") or item.get("title"),
  184 + "vendor": product.get("vendor"),
  185 + "category_path": product.get("category"),
  186 + "category_name": product.get("category"),
  187 + "image_url": item.get("image_url") or product.get("image_url"),
  188 + "tags": product.get("tags") or [],
  189 + "skus": [
  190 + {
  191 + "option1_value": option_values[0],
  192 + "option2_value": option_values[1],
  193 + "option3_value": option_values[2],
  194 + }
  195 + ],
  196 + }
  197 +
  198 + def _collect_label_issues(
  199 + self,
  200 + label: str,
  201 + query_profile: Dict[str, Any],
  202 + doc: Dict[str, Any],
  203 + ) -> List[str]:
  204 + evidence = self._doc_evidence_text(doc)
  205 + issues: List[str] = []
  206 + category = normalize_text(query_profile.get("primary_category"))
  207 + allowed_categories = [
  208 + normalize_text(item)
  209 + for item in query_profile.get("allowed_categories") or []
  210 + if str(item).strip()
  211 + ]
  212 +
  213 + primary_category_match = True if not category else category in evidence
  214 + allowed_category_match = False if allowed_categories else primary_category_match
  215 + if allowed_categories:
  216 + allowed_category_match = any(signal in evidence for signal in allowed_categories)
  217 +
  218 + if label == RELEVANCE_EXACT and not primary_category_match:
  219 + if allowed_category_match:
  220 + issues.append("Exact missing primary category evidence")
  221 + else:
  222 + issues.append("Exact has category mismatch")
  223 +
  224 + if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
  225 + issues.append("Partial has category mismatch")
  226 +
  227 + for attr in query_profile.get("required_attributes") or []:
  228 + if not isinstance(attr, dict):
  229 + continue
  230 + attr_name = normalize_text(attr.get("name"))
  231 + if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:
  232 + continue
  233 + required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
  234 + conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
  235 + has_required = any(term in evidence for term in required_terms) if required_terms else True
  236 + has_conflict = any(term in evidence for term in conflicting_terms)
  237 +
  238 + if has_conflict and label != RELEVANCE_IRRELEVANT:
  239 + issues.append(f"{label} conflicts on {attr_name}")
  240 + if label == RELEVANCE_EXACT and not has_required:
  241 + issues.append(f"Exact missing {attr_name}")
  242 + return issues
  243 +
  244 + def audit_live_query(
  245 + self,
  246 + query: str,
  247 + *,
  248 + top_k: int = 100,
  249 + language: str = "en",
  250 + auto_annotate: bool = False,
  251 + ) -> Dict[str, Any]:
  252 + live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
  253 + if self.labeler_mode != "complex":
  254 + labels = [
  255 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  256 + for item in live["results"]
  257 + ]
  258 + return {
  259 + "query": query,
  260 + "tenant_id": self.tenant_id,
  261 + "top_k": top_k,
  262 + "metrics": live["metrics"],
  263 + "distribution": label_distribution(labels),
  264 + "query_profile": None,
  265 + "suspicious": [],
  266 + "results": live["results"],
  267 + }
  268 + query_profile = self.get_query_profile(query, force_refresh=False)
  269 + suspicious: List[Dict[str, Any]] = []
  270 +
  271 + for item in live["results"]:
  272 + doc = self._result_item_to_doc(item)
  273 + issues = self._collect_label_issues(item["label"] or "", query_profile, doc)
  274 + suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc)
  275 + if suggested_label != (item["label"] or ""):
  276 + issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"]
  277 + if issues:
  278 + suspicious.append(
  279 + {
  280 + "rank": item["rank"],
  281 + "spu_id": item["spu_id"],
  282 + "title": item["title"],
  283 + "label": item["label"],
  284 + "suggested_label": suggested_label,
  285 + "issues": issues,
  286 + }
  287 + )
  288 +
  289 + labels = [
  290 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  291 + for item in live["results"]
  292 + ]
  293 + return {
  294 + "query": query,
  295 + "tenant_id": self.tenant_id,
  296 + "top_k": top_k,
  297 + "metrics": live["metrics"],
  298 + "distribution": label_distribution(labels),
  299 + "query_profile": query_profile,
  300 + "suspicious": suspicious,
  301 + "results": live["results"],
  302 + }
  303 +
  304 + def queries_from_file(self, path: Path) -> List[str]:
  305 + return [
  306 + line.strip()
  307 + for line in path.read_text(encoding="utf-8").splitlines()
  308 + if line.strip() and not line.strip().startswith("#")
  309 + ]
  310 +
  311 + def corpus_docs(self, refresh: bool = False) -> List[Dict[str, Any]]:
  312 + if not refresh and self.store.has_corpus(self.tenant_id):
  313 + return self.store.get_corpus_docs(self.tenant_id)
  314 +
  315 + es_client = get_es_client().client
  316 + index_name = get_tenant_index_name(self.tenant_id)
  317 + docs: List[Dict[str, Any]] = []
  318 + for hit in scan(
  319 + client=es_client,
  320 + index=index_name,
  321 + query={
  322 + "_source": [
  323 + "spu_id",
  324 + "title",
  325 + "vendor",
  326 + "category_path",
  327 + "category_name",
  328 + "image_url",
  329 + "skus",
  330 + "tags",
  331 + ],
  332 + "query": {"match_all": {}},
  333 + },
  334 + size=500,
  335 + preserve_order=False,
  336 + clear_scroll=True,
  337 + ):
  338 + source = dict(hit.get("_source") or {})
  339 + source["spu_id"] = str(source.get("spu_id") or hit.get("_id") or "")
  340 + docs.append(source)
  341 + self.store.upsert_corpus_docs(self.tenant_id, docs)
  342 + return docs
  343 +
  344 + def full_corpus_rerank(
  345 + self,
  346 + query: str,
  347 + docs: Sequence[Dict[str, Any]],
  348 + batch_size: int = 24,
  349 + force_refresh: bool = False,
  350 + ) -> List[Dict[str, Any]]:
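  + # Rerank scores are cached per (tenant, query, spu_id) in SQLite; only docs without a cached score are sent to the rerank service before the full corpus is sorted by score.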
  351 + cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query)
  352 + pending: List[Dict[str, Any]] = [doc for doc in docs if str(doc.get("spu_id")) not in cached]
  353 + if pending:
  354 + new_scores: Dict[str, float] = {}
  355 + for start in range(0, len(pending), batch_size):
  356 + batch = pending[start : start + batch_size]
  357 + scores = self._rerank_batch_with_retry(query=query, docs=batch)
  358 + if len(scores) != len(batch):
  359 + raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs")
  360 + for doc, score in zip(batch, scores):
  361 + new_scores[str(doc.get("spu_id"))] = float(score)
  362 + self.store.upsert_rerank_scores(
  363 + self.tenant_id,
  364 + query,
  365 + new_scores,
  366 + model_name="qwen3_vllm_score",
  367 + )
  368 + cached.update(new_scores)
  369 +
  370 + ranked = []
  371 + for doc in docs:
  372 + spu_id = str(doc.get("spu_id"))
  373 + ranked.append({"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc})
  374 + ranked.sort(key=lambda item: item["score"], reverse=True)
  375 + return ranked
  376 +
  377 + def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]:
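  + # On a rerank service error, bisect the batch and retry the halves; a single failing doc falls back to a -1.0 score instead of failing the whole query.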
  378 + if not docs:
  379 + return []
  380 + doc_texts = [build_rerank_doc(doc) for doc in docs]
  381 + try:
  382 + scores, _meta = self.rerank_client.rerank(query=query, docs=doc_texts, normalize=False)
  383 + return scores
  384 + except Exception:
  385 + if len(docs) == 1:
  386 + return [-1.0]
  387 + if len(docs) <= 6:
  388 + scores: List[float] = []
  389 + for doc in docs:
  390 + scores.extend(self._rerank_batch_with_retry(query, [doc]))
  391 + return scores
  392 + mid = len(docs) // 2
  393 + left = self._rerank_batch_with_retry(query, docs[:mid])
  394 + right = self._rerank_batch_with_retry(query, docs[mid:])
  395 + return left + right
  396 +
  397 + def annotate_missing_labels(
  398 + self,
  399 + query: str,
  400 + docs: Sequence[Dict[str, Any]],
  401 + force_refresh: bool = False,
  402 + ) -> Dict[str, str]:
  403 + labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
  404 + missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
  405 + if not missing_docs:
  406 + return labels
  407 +
  408 + for start in range(0, len(missing_docs), self.label_client.batch_size):
  409 + batch = missing_docs[start : start + self.label_client.batch_size]
  410 + batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh)
  411 + for sub_labels, raw_response, sub_batch in batch_pairs:
  412 + to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
  413 + self.store.upsert_labels(
  414 + self.tenant_id,
  415 + query,
  416 + to_store,
  417 + judge_model=self.label_client.model,
  418 + raw_response=raw_response,
  419 + )
  420 + labels.update(to_store)
  421 + time.sleep(0.1)
  422 + return labels
  423 +
  424 + def _classify_with_retry(
  425 + self,
  426 + query: str,
  427 + docs: Sequence[Dict[str, Any]],
  428 + *,
  429 + force_refresh: bool = False,
  430 + ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
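  + # On a judge error, split the doc batch in half and retry recursively; a single-doc failure is re-raised so bad inputs surface instead of being silently labeled.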
  431 + if not docs:
  432 + return []
  433 + try:
  434 + if self.labeler_mode == "complex":
  435 + query_profile = self.get_query_profile(query, force_refresh=force_refresh)
  436 + labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)
  437 + labels = [
  438 + self._apply_rule_based_label_guardrails(label, query_profile, doc)
  439 + for doc, label in zip(docs, labels)
  440 + ]
  441 + else:
  442 + labels, raw_response = self.label_client.classify_batch_simple(query, docs)
  443 + return [(labels, raw_response, docs)]
  444 + except Exception:
  445 + if len(docs) == 1:
  446 + raise
  447 + mid = len(docs) // 2
  448 + return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh)
  449 +
  450 + def build_query_annotation_set(
  451 + self,
  452 + query: str,
  453 + *,
  454 + search_depth: int = 1000,
  455 + rerank_depth: int = 10000,
  456 + annotate_search_top_k: int = 120,
  457 + annotate_rerank_top_k: int = 200,
  458 + language: str = "en",
  459 + force_refresh_rerank: bool = False,
  460 + force_refresh_labels: bool = False,
  461 + ) -> QueryBuildResult:
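  + # Pooled annotation: the pool is the union of the top search results and the top full-corpus rerank results; only pool members without cached labels go to the LLM judge.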
  462 + search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language)
  463 + search_results = list(search_payload.get("results") or [])
  464 + corpus = self.corpus_docs(refresh=False)
  465 + full_rerank = self.full_corpus_rerank(
  466 + query=query,
  467 + docs=corpus,
  468 + force_refresh=force_refresh_rerank,
  469 + )
  470 + rerank_depth_effective = min(rerank_depth, len(full_rerank))
  471 +
  472 + pool_docs: Dict[str, Dict[str, Any]] = {}
  473 + for doc in search_results[:annotate_search_top_k]:
  474 + pool_docs[str(doc.get("spu_id"))] = doc
  475 + for item in full_rerank[:annotate_rerank_top_k]:
  476 + pool_docs[str(item["spu_id"])] = item["doc"]
  477 +
  478 + labels = self.annotate_missing_labels(
  479 + query=query,
  480 + docs=list(pool_docs.values()),
  481 + force_refresh=force_refresh_labels,
  482 + )
  483 +
  484 + search_labeled_results: List[Dict[str, Any]] = []
  485 + for rank, doc in enumerate(search_results, start=1):
  486 + spu_id = str(doc.get("spu_id"))
  487 + label = labels.get(spu_id)
  488 + search_labeled_results.append(
  489 + {
  490 + "rank": rank,
  491 + "spu_id": spu_id,
  492 + "title": build_display_title(doc),
  493 + "image_url": doc.get("image_url"),
  494 + "rerank_score": None,
  495 + "label": label,
  496 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  497 + "product": compact_product_payload(doc),
  498 + }
  499 + )
  500 +
  501 + rerank_top_results: List[Dict[str, Any]] = []
  502 + for rank, item in enumerate(full_rerank[:rerank_depth_effective], start=1):
  503 + doc = item["doc"]
  504 + spu_id = str(item["spu_id"])
  505 + rerank_top_results.append(
  506 + {
  507 + "rank": rank,
  508 + "spu_id": spu_id,
  509 + "title": build_display_title(doc),
  510 + "image_url": doc.get("image_url"),
  511 + "rerank_score": round(float(item["score"]), 8),
  512 + "label": labels.get(spu_id),
  513 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  514 + "product": compact_product_payload(doc),
  515 + }
  516 + )
  517 +
  518 + top100_labels = [
  519 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  520 + for item in search_labeled_results[:100]
  521 + ]
  522 + metrics = compute_query_metrics(top100_labels)
  523 + output_dir = ensure_dir(self.artifact_root / "query_builds")
  524 + run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
  525 + output_json_path = output_dir / f"{run_id}.json"
  526 + payload = {
  527 + "run_id": run_id,
  528 + "created_at": utc_now_iso(),
  529 + "tenant_id": self.tenant_id,
  530 + "query": query,
  531 + "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),
  532 + "search_total": int(search_payload.get("total") or 0),
  533 + "search_depth_requested": search_depth,
  534 + "search_depth_effective": len(search_results),
  535 + "rerank_depth_requested": rerank_depth,
  536 + "rerank_depth_effective": rerank_depth_effective,
  537 + "corpus_size": len(corpus),
  538 + "annotation_pool": {
  539 + "annotate_search_top_k": annotate_search_top_k,
  540 + "annotate_rerank_top_k": annotate_rerank_top_k,
  541 + "pool_size": len(pool_docs),
  542 + },
  543 + "labeler_mode": self.labeler_mode,
  544 + "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,
  545 + "metrics_top100": metrics,
  546 + "search_results": search_labeled_results,
  547 + "full_rerank_top": rerank_top_results,
  548 + }
  549 + output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
  550 + self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
  551 + return QueryBuildResult(
  552 + query=query,
  553 + tenant_id=self.tenant_id,
  554 + search_total=int(search_payload.get("total") or 0),
  555 + search_depth=len(search_results),
  556 + rerank_corpus_size=len(corpus),
  557 + annotated_count=len(pool_docs),
  558 + output_json_path=output_json_path,
  559 + )
  560 +
  561 + def evaluate_live_query(
  562 + self,
  563 + query: str,
  564 + top_k: int = 100,
  565 + auto_annotate: bool = False,
  566 + language: str = "en",
  567 + force_refresh_labels: bool = False,
  568 + ) -> Dict[str, Any]:
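  + # Live evaluation scores against cached labels; recalled items without a cached label count as Irrelevant, and cached Exact/Partial items missing from recall are reported separately.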
  569 + search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)
  570 + results = list(search_payload.get("results") or [])
  571 + if auto_annotate:
  572 + self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
  573 + labels = self.store.get_labels(self.tenant_id, query)
  574 + recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]}
  575 + labeled = []
  576 + unlabeled_hits = 0
  577 + for rank, doc in enumerate(results[:top_k], start=1):
  578 + spu_id = str(doc.get("spu_id"))
  579 + label = labels.get(spu_id)
  580 + if label not in VALID_LABELS:
  581 + unlabeled_hits += 1
  582 + labeled.append(
  583 + {
  584 + "rank": rank,
  585 + "spu_id": spu_id,
  586 + "title": build_display_title(doc),
  587 + "image_url": doc.get("image_url"),
  588 + "label": label,
  589 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  590 + "product": compact_product_payload(doc),
  591 + }
  592 + )
  593 + metric_labels = [
  594 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  595 + for item in labeled
  596 + ]
  597 + label_stats = self.store.get_query_label_stats(self.tenant_id, query)
  598 + rerank_scores = self.store.get_rerank_scores(self.tenant_id, query)
  599 + relevant_missing_ids = [
  600 + spu_id
  601 + for spu_id, label in labels.items()
  602 + if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids
  603 + ]
  604 + missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
  605 + missing_relevant = []
  606 + for spu_id in relevant_missing_ids:
  607 + doc = missing_docs_map.get(spu_id)
  608 + if not doc:
  609 + continue
  610 + missing_relevant.append(
  611 + {
  612 + "spu_id": spu_id,
  613 + "label": labels[spu_id],
  614 + "rerank_score": rerank_scores.get(spu_id),
  615 + "title": build_display_title(doc),
  616 + "image_url": doc.get("image_url"),
  617 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  618 + "product": compact_product_payload(doc),
  619 + }
  620 + )
  621 + label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}
  622 + missing_relevant.sort(
  623 + key=lambda item: (
  624 + label_order.get(str(item.get("label")), 9),
  625 + -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")),
  626 + str(item.get("title") or ""),
  627 + )
  628 + )
  629 + tips: List[str] = []
  630 + if auto_annotate:
  631 + tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.")
  632 + else:
  633 + tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.")
  634 + if label_stats["total"] == 0:
  635 + tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.")
  636 + if unlabeled_hits:
  637 + tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
  638 + if not missing_relevant:
  639 + tips.append("No cached Exact/Partial products were missed by this recall set.")
  640 + return {
  641 + "query": query,
  642 + "tenant_id": self.tenant_id,
  643 + "top_k": top_k,
  644 + "metrics": compute_query_metrics(metric_labels),
  645 + "results": labeled,
  646 + "missing_relevant": missing_relevant,
  647 + "label_stats": {
  648 + **label_stats,
  649 + "unlabeled_hits_treated_irrelevant": unlabeled_hits,
  650 + "recalled_hits": len(labeled),
  651 + "missing_relevant_count": len(missing_relevant),
  652 + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
  653 + "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),
  654 + },
  655 + "tips": tips,
  656 + "total": int(search_payload.get("total") or 0),
  657 + }
  658 +
  659 + def batch_evaluate(
  660 + self,
  661 + queries: Sequence[str],
  662 + *,
  663 + top_k: int = 100,
  664 + auto_annotate: bool = True,
  665 + language: str = "en",
  666 + force_refresh_labels: bool = False,
  667 + ) -> Dict[str, Any]:
  668 + per_query = []
  669 + for query in queries:
  670 + live = self.evaluate_live_query(
  671 + query,
  672 + top_k=top_k,
  673 + auto_annotate=auto_annotate,
  674 + language=language,
  675 + force_refresh_labels=force_refresh_labels,
  676 + )
  677 + labels = [
  678 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  679 + for item in live["results"]
  680 + ]
  681 + per_query.append(
  682 + {
  683 + "query": live["query"],
  684 + "tenant_id": live["tenant_id"],
  685 + "top_k": live["top_k"],
  686 + "metrics": live["metrics"],
  687 + "distribution": label_distribution(labels),
  688 + "total": live["total"],
  689 + }
  690 + )
  691 + aggregate = aggregate_metrics([item["metrics"] for item in per_query])
  692 + aggregate_distribution = {
  693 + RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
  694 + RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query),
  695 + RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
  696 + }
  697 + batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
  698 + report_dir = ensure_dir(self.artifact_root / "batch_reports")
  699 + config_snapshot_path = report_dir / f"{batch_id}_config.json"
  700 + config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json()
  701 + config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
  702 + output_json_path = report_dir / f"{batch_id}.json"
  703 + report_md_path = report_dir / f"{batch_id}.md"
  704 + payload = {
  705 + "batch_id": batch_id,
  706 + "created_at": utc_now_iso(),
  707 + "tenant_id": self.tenant_id,
  708 + "queries": list(queries),
  709 + "top_k": top_k,
  710 + "aggregate_metrics": aggregate,
  711 + "aggregate_distribution": aggregate_distribution,
  712 + "per_query": per_query,
  713 + "config_snapshot_path": str(config_snapshot_path),
  714 + }
  715 + output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
  716 + report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
  717 + self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
  718 + return payload
  719 +
scripts/evaluation/eval_framework/metrics.py 0 → 100644
@@ -0,0 +1,58 @@ @@ -0,0 +1,58 @@
  1 +"""IR metrics for labeled result lists."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from typing import Dict, Sequence
  6 +
  7 +from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL
  8 +
  9 +
  10 +def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
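  + # Denominator is min(k, number of returned labels), so a query that returns fewer than k results is not penalized for the short list.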
  11 + if k <= 0:
  12 + return 0.0
  13 + sliced = list(labels[:k])
  14 + if not sliced:
  15 + return 0.0
  16 + hits = sum(1 for label in sliced if label in relevant)
  17 + return hits / float(min(k, len(sliced)))
  18 +
  19 +
  20 +def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
  21 + hit_count = 0
  22 + precision_sum = 0.0
  23 + for idx, label in enumerate(labels, start=1):
  24 + if label not in relevant:
  25 + continue
  26 + hit_count += 1
  27 + precision_sum += hit_count / idx
  28 + if hit_count == 0:
  29 + return 0.0
  30 + return precision_sum / hit_count
  31 +
  32 +
  33 +def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:
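  + # Metric suffixes: plain P@k / MAP_3 treat only Exact as relevant; the _2_3 variants treat both Exact and Partial as relevant.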
  34 + metrics: Dict[str, float] = {}
  35 + for k in (5, 10, 20, 50):
  36 + metrics[f"P@{k}"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT]), 6)
  37 + metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6)
  38 + metrics["MAP_3"] = round(average_precision(labels, [RELEVANCE_EXACT]), 6)
  39 + metrics["MAP_2_3"] = round(average_precision(labels, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6)
  40 + return metrics
  41 +
  42 +
  43 +def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]:
  44 + if not metric_items:
  45 + return {}
  46 + keys = sorted(metric_items[0].keys())
  47 + return {
  48 + key: round(sum(float(item.get(key, 0.0)) for item in metric_items) / len(metric_items), 6)
  49 + for key in keys
  50 + }
  51 +
  52 +
  53 +def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
  54 + return {
  55 + RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT),
  56 + RELEVANCE_PARTIAL: sum(1 for label in labels if label == RELEVANCE_PARTIAL),
  57 + RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT),
  58 + }
scripts/evaluation/eval_framework/prompts.py 0 → 100644
@@ -0,0 +1,89 @@ @@ -0,0 +1,89 @@
  1 +"""LLM prompt templates for relevance judging (keep wording changes here)."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import json
  6 +from typing import Any, Dict, Sequence
  7 +
  8 +
  9 +def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
  10 + lines = "\n".join(numbered_doc_lines)
  11 + n = len(numbered_doc_lines)
  12 + return (
  13 + "You are an e-commerce search result relevance evaluation assistant. "
  14 + "Based on the user query and each product's information, output the relevance level for each product.\n\n"
  15 + "## Relevance Level Criteria\n"
  16 + "Exact โ€” Fully matches the user's search intent.\n"
  17 + "Partial โ€” Primary intent satisfied (same category or similar use, basically aligns with search intent), "
  18 + "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n"
  19 + "Irrelevant โ€” Category or use case mismatched, primary intent not satisfied.\n\n"
  20 + "Additional judging guidance:\n"
  21 + "- If the query clearly names a product type, product type matching has the highest priority. "
  22 + "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, "
  23 + "bra vs top, backpack vs bag are not interchangeable.\n"
  24 + "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n"
  25 + "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n"
  26 + "- Do not guess missing attributes.\n"
  27 + "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n"
  28 + "- Be conservative with Exact.\n\n"
  29 + f"Query: {query}\n\n"
  30 + "Products:\n"
  31 + f"{lines}\n\n"
  32 + "## Output Format\n"
  33 + f"Strictly output {n} lines, each line containing exactly one of Exact / Partial / Irrelevant. "
  34 + "They must correspond sequentially to the products above. Do not output any other information.\n"
  35 + )
  36 +
  37 +
  38 +def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str:
  39 + hints_json = json.dumps(parser_hints, ensure_ascii=False)
  40 + return (
  41 + "You are building a structured intent profile for e-commerce relevance judging.\n"
  42 + "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n"
  43 + "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n"
  44 + "Return JSON with this schema:\n"
  45 + "{\n"
  46 + ' "normalized_query_en": string,\n'
  47 + ' "primary_category": string,\n'
  48 + ' "allowed_categories": [string],\n'
  49 + ' "required_attributes": [\n'
  50 + ' {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n'
  51 + " ],\n"
  52 + ' "notes": [string]\n'
  53 + "}\n\n"
  54 + "Guidelines:\n"
  55 + "- Exact later will require explicit evidence for all required attributes.\n"
  56 + "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n"
  57 + "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n"
  58 + "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n"
  59 + "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n"
  60 + "- For color, include conflicting colors only when clear from the query.\n\n"
  61 + f"Original query: {query}\n"
  62 + f"Parser hints JSON: {hints_json}\n"
  63 + )
  64 +
  65 +
  66 +def classify_batch_complex_prompt(
  67 + query: str,
  68 + query_profile: Dict[str, Any],
  69 + numbered_doc_lines: Sequence[str],
  70 +) -> str:
  71 + lines = "\n".join(numbered_doc_lines)
  72 + profile_json = json.dumps(query_profile, ensure_ascii=False)
  73 + return (
  74 + "You are an e-commerce search relevance judge.\n"
  75 + "Judge each product against the structured query profile below.\n\n"
  76 + "Relevance rules:\n"
  77 + "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n"
  78 + "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n"
  79 + "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n"
  80 + "- Be conservative with Exact.\n"
  81 + "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n"
  82 + "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n"
  83 + f"Original query: {query}\n"
  84 + f"Structured query profile JSON: {profile_json}\n\n"
  85 + "Products:\n"
  86 + f"{lines}\n\n"
  87 + "Return JSON only, with schema:\n"
  88 + '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n'
  89 + )
scripts/evaluation/eval_framework/reports.py 0 → 100644
@@ -0,0 +1,48 @@ @@ -0,0 +1,48 @@
  1 +"""Markdown and text reports for batch evaluation."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from typing import Any, Dict
  6 +
  7 +from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL
  8 +
  9 +
  10 +def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
  11 + lines = [
  12 + "# Search Batch Evaluation",
  13 + "",
  14 + f"- Batch ID: {payload['batch_id']}",
  15 + f"- Created at: {payload['created_at']}",
  16 + f"- Tenant ID: {payload['tenant_id']}",
  17 + f"- Query count: {len(payload['queries'])}",
  18 + f"- Top K: {payload['top_k']}",
  19 + "",
  20 + "## Aggregate Metrics",
  21 + "",
  22 + ]
  23 + for key, value in sorted((payload.get("aggregate_metrics") or {}).items()):
  24 + lines.append(f"- {key}: {value}")
  25 + distribution = payload.get("aggregate_distribution") or {}
  26 + if distribution:
  27 + lines.extend(
  28 + [
  29 + "",
  30 + "## Label Distribution",
  31 + "",
  32 + f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}",
  33 + f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}",
  34 + f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}",
  35 + ]
  36 + )
  37 + lines.extend(["", "## Per Query", ""])
  38 + for item in payload.get("per_query") or []:
  39 + lines.append(f"### {item['query']}")
  40 + lines.append("")
  41 + for key, value in sorted((item.get("metrics") or {}).items()):
  42 + lines.append(f"- {key}: {value}")
  43 + distribution = item.get("distribution") or {}
  44 + lines.append(f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}")
  45 + lines.append(f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}")
  46 + lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}")
  47 + lines.append("")
  48 + return "\n".join(lines)
scripts/evaluation/eval_framework/static/eval_web.css 0 → 100644
@@ -0,0 +1,91 @@ @@ -0,0 +1,91 @@
  1 +:root {
  2 + --bg: #f5f3ed;
  3 + --panel: #fffdf8;
  4 + --ink: #1f2a24;
  5 + --muted: #6b756e;
  6 + --line: #ddd4c6;
  7 + --accent: #0f766e;
  8 + --exact: #0f766e;
  9 + --partial: #b7791f;
  10 + --irrelevant: #b42318;
  11 + }
  12 + body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background:
  13 + radial-gradient(circle at top left, #f0e6d6 0, transparent 28%),
  14 + linear-gradient(180deg, #f9f6f0 0%, #f0ece3 100%); }
  15 + .app { display: grid; grid-template-columns: 280px 1fr; min-height: 100vh; }
  16 + .sidebar { border-right: 1px solid var(--line); padding: 20px; background: rgba(255,255,255,0.55); backdrop-filter: blur(10px); }
  17 + .main { padding: 24px; }
  18 + h1, h2 { margin: 0 0 12px; }
  19 + .muted { color: var(--muted); }
  20 + .query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; }
  21 + .query-item {
  22 + display: block; width: 100%; border: 0; background: transparent; text-align: left;
  23 + padding: 10px 12px; border-radius: 10px; cursor: pointer;
  24 + color: var(--ink); font-size: 15px; font-weight: 500;
  25 + }
  26 + .query-item:hover { background: #eef6f4; }
  27 + .toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; }
  28 + input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; }
  29 + button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; }
  30 + button.secondary { background: #d9e6e3; color: #12433d; }
  31 + .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; }
  32 + .metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; }
  33 + .metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; }
  34 + .metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; }
  35 + .results { display: grid; gap: 10px; }
  36 + .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }
  37 + .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }
  38 + .Exact { background: var(--exact); }
  39 + .Partial { background: var(--partial); }
  40 + .Irrelevant { background: var(--irrelevant); }
  41 + .Unknown { background: #637381; }
  42 + .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }
  43 + .title { font-size: 16px; font-weight: 700; margin-bottom: 8px; }
  44 + .options { color: var(--muted); line-height: 1.5; font-size: 14px; }
  45 + .section { margin-bottom: 28px; }
  46 + .history { font-size: 13px; line-height: 1.5; }
  47 + .history-list { max-height: 42vh; overflow: auto; display: flex; flex-direction: column; gap: 8px; margin-top: 8px; }
  48 + .history-item {
  49 + display: block; width: 100%; border: 1px solid var(--line); background: var(--panel);
  50 + text-align: left; padding: 10px 12px; border-radius: 12px; cursor: pointer;
  51 + color: var(--ink); font-size: 13px; transition: background 0.15s, border-color 0.15s, box-shadow 0.15s;
  52 + }
  53 + .history-item:hover { background: #eef6f4; border-color: #b8d4cd; }
  54 + .history-item:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; }
  55 + .history-item .hid { font-weight: 700; font-size: 12px; word-break: break-all; color: #12433d; }
  56 + .history-item .hmeta { color: var(--muted); font-size: 11px; margin-top: 4px; }
  57 + .history-item .hstats { margin-top: 6px; font-size: 12px; color: var(--ink); line-height: 1.45; }
  58 + .history-item .hstats span { color: var(--muted); }
  59 + .report-modal-root {
  60 + position: fixed; inset: 0; z-index: 200; display: none; align-items: center; justify-content: center;
  61 + padding: 16px; box-sizing: border-box;
  62 + }
  63 + .report-modal-root.is-open { display: flex; }
  64 + .report-modal-backdrop { position: absolute; inset: 0; background: rgba(31, 42, 36, 0.45); backdrop-filter: blur(4px); }
  65 + .report-modal-dialog {
  66 + position: relative; z-index: 1; width: min(920px, 100%); max-height: min(92vh, 900px); display: flex; flex-direction: column;
  67 + background: var(--panel); border: 1px solid var(--line); border-radius: 18px;
  68 + box-shadow: 0 24px 48px rgba(31, 42, 36, 0.18);
  69 + }
  70 + .report-modal-head {
  71 + flex: 0 0 auto; display: flex; align-items: flex-start; justify-content: space-between; gap: 12px;
  72 + padding: 16px 18px; border-bottom: 1px solid var(--line);
  73 + }
  74 + .report-modal-head h3 { margin: 0; font-size: 15px; font-weight: 700; word-break: break-all; }
  75 + .report-modal-head .head-actions { display: flex; gap: 8px; flex-shrink: 0; }
  76 + .report-modal-head button { padding: 8px 12px; font-size: 13px; border-radius: 10px; }
  77 + .report-modal-meta { flex: 0 0 auto; padding: 10px 18px; font-size: 12px; border-bottom: 1px solid var(--line); background: rgba(255,253,248,0.9); }
  78 + .report-modal-body {
  79 + flex: 1 1 auto; overflow: auto; padding: 18px 22px 22px;
  80 + font-size: 14px; line-height: 1.55;
  81 + }
  82 + .batch-report-md h1 { font-size: 1.35rem; margin: 0 0 0.75rem; color: #12433d; }
  83 + .batch-report-md h2 { font-size: 1.05rem; margin: 1.35rem 0 0.6rem; padding-bottom: 0.35rem; border-bottom: 1px solid var(--line); color: #1a5249; }
  84 + .batch-report-md h2:first-of-type { margin-top: 0; }
  85 + .batch-report-md h3 { font-size: 0.95rem; margin: 1rem 0 0.4rem; color: var(--ink); font-weight: 700; }
  86 + .batch-report-md ul { margin: 0.35rem 0 0.5rem; padding-left: 1.25rem; }
  87 + .batch-report-md li { margin: 0.2rem 0; }
  88 + .batch-report-md code { font-size: 0.88em; background: #e8e4d8; padding: 0.12em 0.35em; border-radius: 4px; }
  89 + .report-modal-body.report-modal-loading, .report-modal-body.report-modal-error { color: var(--muted); font-style: italic; }
  90 + .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; }
  91 + .tip { margin-bottom: 6px; color: var(--muted); }
scripts/evaluation/eval_framework/static/eval_web.js 0 → 100644
@@ -0,0 +1,181 @@ @@ -0,0 +1,181 @@
  1 + async function fetchJSON(url, options) {
  2 + const res = await fetch(url, options);
  3 + if (!res.ok) throw new Error(await res.text());
  4 + return await res.json();
  5 + }
  6 + function renderMetrics(metrics) {
  7 + const root = document.getElementById('metrics');
  8 + root.innerHTML = '';
  9 + Object.entries(metrics || {}).forEach(([key, value]) => {
  10 + const card = document.createElement('div');
  11 + card.className = 'metric';
  12 + card.innerHTML = `<div class="label">${key}</div><div class="value">${value}</div>`;
  13 + root.appendChild(card);
  14 + });
  15 + }
  16 + function renderResults(results, rootId='results', showRank=true) {
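  + // rootId picks the target panel; showRank toggles between showing the live rank and showing the rerank score (or "not recalled") for missed items.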
  17 + const mount = document.getElementById(rootId);
  18 + mount.innerHTML = '';
  19 + (results || []).forEach(item => {
  20 + const label = item.label || 'Unknown';
  21 + const box = document.createElement('div');
  22 + box.className = 'result';
  23 + box.innerHTML = `
  24 + <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
  25 + <img class="thumb" src="${item.image_url || ''}" alt="" />
  26 + <div>
  27 + <div class="title">${item.title || ''}</div>
  28 + <div class="options">
  29 + <div>${(item.option_values || [])[0] || ''}</div>
  30 + <div>${(item.option_values || [])[1] || ''}</div>
  31 + <div>${(item.option_values || [])[2] || ''}</div>
  32 + </div>
  33 + </div>`;
  34 + mount.appendChild(box);
  35 + });
  36 + if (!(results || []).length) {
  37 + mount.innerHTML = '<div class="muted">None.</div>';
  38 + }
  39 + }
  40 + function renderTips(data) {
  41 + const root = document.getElementById('tips');
  42 + const tips = [...(data.tips || [])];
  43 + const stats = data.label_stats || {};
  44 + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`);
  45 + root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');
  46 + }
  47 + async function loadQueries() {
  48 + const data = await fetchJSON('/api/queries');
  49 + const root = document.getElementById('queryList');
  50 + root.innerHTML = '';
  51 + data.queries.forEach(query => {
  52 + const btn = document.createElement('button');
  53 + btn.className = 'query-item';
  54 + btn.textContent = query;
  55 + btn.onclick = () => {
  56 + document.getElementById('queryInput').value = query;
  57 + runSingle();
  58 + };
  59 + root.appendChild(btn);
  60 + });
  61 + }
  62 + function fmtMetric(m, key, digits) {
  63 + const v = m && m[key];
  64 + if (v == null || Number.isNaN(Number(v))) return null;
  65 + const n = Number(v);
  66 + return n.toFixed(digits);
  67 + }
  68 + function historySummaryHtml(meta) {
  69 + const m = meta && meta.aggregate_metrics;
  70 + const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
  71 + const parts = [];
  72 + if (nq != null) parts.push(`<span>Queries</span> ${nq}`);
  73 + const p10 = fmtMetric(m, 'P@10', 3);
  74 + const p52 = fmtMetric(m, 'P@5_2_3', 3);
  75 + const map3 = fmtMetric(m, 'MAP_3', 3);
  76 + if (p10) parts.push(`<span>P@10</span> ${p10}`);
  77 + if (p52) parts.push(`<span>P@5_2_3</span> ${p52}`);
  78 + if (map3) parts.push(`<span>MAP_3</span> ${map3}`);
  79 + if (!parts.length) return '';
  80 + return `<div class="hstats">${parts.join(' · ')}</div>`;
  81 + }
  82 + async function loadHistory() {
  83 + const data = await fetchJSON('/api/history');
  84 + const root = document.getElementById('history');
  85 + root.classList.remove('muted');
  86 + const items = data.history || [];
  87 + if (!items.length) {
  88 + root.innerHTML = '<span class="muted">No history yet.</span>';
  89 + return;
  90 + }
  91 + root.innerHTML = `<div class="history-list"></div>`;
  92 + const list = root.querySelector('.history-list');
  93 + items.forEach(item => {
  94 + const btn = document.createElement('button');
  95 + btn.type = 'button';
  96 + btn.className = 'history-item';
  97 + btn.setAttribute('aria-label', `Open report ${item.batch_id}`);
  98 + const sum = historySummaryHtml(item.metadata);
  99 + btn.innerHTML = `<div class="hid">${item.batch_id}</div>
  100 + <div class="hmeta">${item.created_at} ยท tenant ${item.tenant_id}</div>${sum}`;
  101 + btn.onclick = () => openBatchReport(item.batch_id);
  102 + list.appendChild(btn);
  103 + });
  104 + }
  105 + let _lastReportPath = '';
  106 + function closeReportModal() {
  107 + const el = document.getElementById('reportModal');
  108 + el.classList.remove('is-open');
  109 + el.setAttribute('aria-hidden', 'true');
  110 + document.getElementById('reportModalBody').innerHTML = '';
  111 + document.getElementById('reportModalMeta').textContent = '';
  112 + }
  113 + async function openBatchReport(batchId) {
  114 + const el = document.getElementById('reportModal');
  115 + const body = document.getElementById('reportModalBody');
  116 + const metaEl = document.getElementById('reportModalMeta');
  117 + const titleEl = document.getElementById('reportModalTitle');
  118 + el.classList.add('is-open');
  119 + el.setAttribute('aria-hidden', 'false');
  120 + titleEl.textContent = batchId;
  121 + metaEl.textContent = '';
  122 + body.className = 'report-modal-body batch-report-md report-modal-loading';
  123 + body.textContent = 'Loading report…';
  124 + try {
  125 + const rep = await fetchJSON('/api/history/' + encodeURIComponent(batchId) + '/report');
  126 + _lastReportPath = rep.report_markdown_path || '';
  127 + metaEl.textContent = rep.report_markdown_path || '';
  128 + const raw = marked.parse(rep.markdown || '', { gfm: true });
  129 + const safe = DOMPurify.sanitize(raw, { USE_PROFILES: { html: true } });
  130 + body.className = 'report-modal-body batch-report-md';
  131 + body.innerHTML = safe;
  132 + } catch (e) {
  133 + body.className = 'report-modal-body report-modal-error';
  134 + body.textContent = (e && e.message) ? e.message : String(e);
  135 + }
  136 + }
  137 + document.getElementById('reportModal').addEventListener('click', (ev) => {
  138 + if (ev.target && ev.target.getAttribute('data-close-report') === '1') closeReportModal();
  139 + });
  140 + document.addEventListener('keydown', (ev) => {
  141 + if (ev.key === 'Escape') closeReportModal();
  142 + });
  143 + document.getElementById('reportCopyPath').addEventListener('click', async () => {
  144 + if (!_lastReportPath) return;
  145 + try {
  146 + await navigator.clipboard.writeText(_lastReportPath);
  147 + } catch (_) {}
  148 + });
  149 + async function runSingle() {
  150 + const query = document.getElementById('queryInput').value.trim();
  151 + if (!query) return;
  152 + document.getElementById('status').textContent = `Evaluating "${query}"...`;
  153 + const data = await fetchJSON('/api/search-eval', {
  154 + method: 'POST',
  155 + headers: {'Content-Type': 'application/json'},
  156 + body: JSON.stringify({query, top_k: 100, auto_annotate: false})
  157 + });
  158 + document.getElementById('status').textContent = `Done. total=${data.total}`;
  159 + renderMetrics(data.metrics);
  160 + renderResults(data.results, 'results', true);
  161 + renderResults(data.missing_relevant, 'missingRelevant', false);
  162 + renderTips(data);
  163 + loadHistory();
  164 + }
  165 + async function runBatch() {
  166 + document.getElementById('status').textContent = 'Running batch evaluation...';
  167 + const data = await fetchJSON('/api/batch-eval', {
  168 + method: 'POST',
  169 + headers: {'Content-Type': 'application/json'},
  170 + body: JSON.stringify({top_k: 100, auto_annotate: false})
  171 + });
  172 + document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`;
  173 + renderMetrics(data.aggregate_metrics);
  174 + renderResults([], 'results', true);
  175 + renderResults([], 'missingRelevant', false);
  176 + document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>';
  177 + loadHistory();
  178 + }
  179 + loadQueries();
  180 + loadHistory();
  181 +
scripts/evaluation/eval_framework/static/index.html 0 → 100644
@@ -0,0 +1,70 @@ @@ -0,0 +1,70 @@
  1 +<!doctype html>
  2 +<html lang="en">
  3 +<head>
  4 + <meta charset="utf-8" />
  5 + <meta name="viewport" content="width=device-width, initial-scale=1" />
  6 + <title>Search Evaluation</title>
  7 + <link rel="stylesheet" href="/static/eval_web.css" />
  8 +
  9 +</head>
  10 +<body>
  11 + <div class="app">
  12 + <aside class="sidebar">
  13 + <h2>Queries</h2>
  14 + <p class="muted">Loaded from <code>scripts/evaluation/queries/queries.txt</code></p>
  15 + <div id="queryList" class="query-list"></div>
  16 + <div class="section">
  17 + <h2>History</h2>
  18 + <p class="muted" style="font-size:12px;margin:0 0 4px">Click a run to open the batch markdown report.</p>
  19 + <div id="history" class="history muted">Loading...</div>
  20 + </div>
  21 + </aside>
  22 + <main class="main">
  23 + <h1>Search Evaluation</h1>
  24 + <p class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>
  25 + <div class="toolbar">
  26 + <input id="queryInput" type="text" placeholder="Search query" />
  27 + <button onclick="runSingle()">Evaluate Query</button>
  28 + <button class="secondary" onclick="runBatch()">Batch Evaluation</button>
  29 + </div>
  30 + <div id="status" class="muted section"></div>
  31 + <section class="section">
  32 + <h2>Metrics</h2>
  33 + <div id="metrics" class="grid"></div>
  34 + </section>
  35 + <section class="section">
  36 + <h2>Top Results</h2>
  37 + <div id="results" class="results"></div>
  38 + </section>
  39 + <section class="section">
  40 + <h2>Missed Exact / Partial</h2>
  41 + <div id="missingRelevant" class="results"></div>
  42 + </section>
  43 + <section class="section">
  44 + <h2>Notes</h2>
  45 + <div id="tips" class="tips muted"></div>
  46 + </section>
  47 + </main>
  48 + </div>
  49 + <div id="reportModal" class="report-modal-root" aria-hidden="true">
  50 + <div class="report-modal-backdrop" data-close-report="1"></div>
  51 + <div class="report-modal-dialog" role="dialog" aria-modal="true" aria-labelledby="reportModalTitle">
  52 + <div class="report-modal-head">
  53 + <h3 id="reportModalTitle">Batch report</h3>
  54 + <div class="head-actions">
  55 + <button type="button" class="secondary" id="reportCopyPath">Copy path</button>
  56 + <button type="button" onclick="closeReportModal()">Close</button>
  57 + </div>
  58 + </div>
  59 + <div id="reportModalMeta" class="report-modal-meta muted"></div>
  60 + <div id="reportModalBody" class="report-modal-body batch-report-md"></div>
  61 + </div>
  62 + </div>
  63 +
  64 +
  65 +
  66 + <script src="https://cdn.jsdelivr.net/npm/marked@12.0.2/marked.min.js"></script>
  67 + <script src="https://cdn.jsdelivr.net/npm/dompurify@3.1.6/dist/purify.min.js"></script>
  68 + <script src="/static/eval_web.js"></script>
  69 +</body>
  70 +</html>
0 \ No newline at end of file 71 \ No newline at end of file
scripts/evaluation/eval_framework/store.py 0 → 100644
@@ -0,0 +1,426 @@ @@ -0,0 +1,426 @@
  1 +"""SQLite persistence for evaluation corpus, labels, rerank scores, and run metadata."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import json
  6 +import sqlite3
  7 +from dataclasses import dataclass
  8 +from pathlib import Path
  9 +from typing import Any, Dict, List, Optional, Sequence
  10 +
  11 +from .constants import VALID_LABELS
  12 +from .utils import ensure_dir, safe_json_dumps, utc_now_iso
  13 +
  14 +
  15 +@dataclass
  16 +class QueryBuildResult:
  17 + query: str
  18 + tenant_id: str
  19 + search_total: int
  20 + search_depth: int
  21 + rerank_corpus_size: int
  22 + annotated_count: int
  23 + output_json_path: Path
  24 +
  25 +
  26 +class EvalStore:
  27 + def __init__(self, db_path: Path):
  28 + self.db_path = db_path
  29 + ensure_dir(db_path.parent)
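  + # check_same_thread=False allows the single connection to be shared across threads (e.g. web request handlers); each write method commits immediately.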
  30 + self.conn = sqlite3.connect(str(db_path), check_same_thread=False)
  31 + self.conn.row_factory = sqlite3.Row
  32 + self._init_schema()
  33 +
  34 + def _init_schema(self) -> None:
  35 + self.conn.executescript(
  36 + """
  37 + CREATE TABLE IF NOT EXISTS corpus_docs (
  38 + tenant_id TEXT NOT NULL,
  39 + spu_id TEXT NOT NULL,
  40 + title_json TEXT,
  41 + vendor_json TEXT,
  42 + category_path_json TEXT,
  43 + category_name_json TEXT,
  44 + image_url TEXT,
  45 + skus_json TEXT,
  46 + tags_json TEXT,
  47 + raw_json TEXT NOT NULL,
  48 + updated_at TEXT NOT NULL,
  49 + PRIMARY KEY (tenant_id, spu_id)
  50 + );
  51 +
  52 + CREATE TABLE IF NOT EXISTS rerank_scores (
  53 + tenant_id TEXT NOT NULL,
  54 + query_text TEXT NOT NULL,
  55 + spu_id TEXT NOT NULL,
  56 + score REAL NOT NULL,
  57 + model_name TEXT,
  58 + updated_at TEXT NOT NULL,
  59 + PRIMARY KEY (tenant_id, query_text, spu_id)
  60 + );
  61 +
  62 + CREATE TABLE IF NOT EXISTS relevance_labels (
  63 + tenant_id TEXT NOT NULL,
  64 + query_text TEXT NOT NULL,
  65 + spu_id TEXT NOT NULL,
  66 + label TEXT NOT NULL,
  67 + judge_model TEXT,
  68 + raw_response TEXT,
  69 + updated_at TEXT NOT NULL,
  70 + PRIMARY KEY (tenant_id, query_text, spu_id)
  71 + );
  72 +
  73 + CREATE TABLE IF NOT EXISTS build_runs (
  74 + run_id TEXT PRIMARY KEY,
  75 + tenant_id TEXT NOT NULL,
  76 + query_text TEXT NOT NULL,
  77 + output_json_path TEXT NOT NULL,
  78 + metadata_json TEXT NOT NULL,
  79 + created_at TEXT NOT NULL
  80 + );
  81 +
  82 + CREATE TABLE IF NOT EXISTS batch_runs (
  83 + batch_id TEXT PRIMARY KEY,
  84 + tenant_id TEXT NOT NULL,
  85 + output_json_path TEXT NOT NULL,
  86 + report_markdown_path TEXT NOT NULL,
  87 + config_snapshot_path TEXT NOT NULL,
  88 + metadata_json TEXT NOT NULL,
  89 + created_at TEXT NOT NULL
  90 + );
  91 +
  92 + CREATE TABLE IF NOT EXISTS query_profiles (
  93 + tenant_id TEXT NOT NULL,
  94 + query_text TEXT NOT NULL,
  95 + prompt_version TEXT NOT NULL,
  96 + judge_model TEXT,
  97 + profile_json TEXT NOT NULL,
  98 + raw_response TEXT NOT NULL,
  99 + updated_at TEXT NOT NULL,
  100 + PRIMARY KEY (tenant_id, query_text, prompt_version)
  101 + );
  102 + """
  103 + )
  104 + self.conn.commit()
  105 +
  106 + def upsert_corpus_docs(self, tenant_id: str, docs: Sequence[Dict[str, Any]]) -> None:
  107 + now = utc_now_iso()
  108 + rows = []
  109 + for doc in docs:
  110 + rows.append(
  111 + (
  112 + tenant_id,
  113 + str(doc.get("spu_id") or ""),
  114 + safe_json_dumps(doc.get("title")),
  115 + safe_json_dumps(doc.get("vendor")),
  116 + safe_json_dumps(doc.get("category_path")),
  117 + safe_json_dumps(doc.get("category_name")),
  118 + str(doc.get("image_url") or ""),
  119 + safe_json_dumps(doc.get("skus") or []),
  120 + safe_json_dumps(doc.get("tags") or []),
  121 + safe_json_dumps(doc),
  122 + now,
  123 + )
  124 + )
  125 + self.conn.executemany(
  126 + """
  127 + INSERT INTO corpus_docs (
  128 + tenant_id, spu_id, title_json, vendor_json, category_path_json, category_name_json,
  129 + image_url, skus_json, tags_json, raw_json, updated_at
  130 + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
  131 + ON CONFLICT(tenant_id, spu_id) DO UPDATE SET
  132 + title_json=excluded.title_json,
  133 + vendor_json=excluded.vendor_json,
  134 + category_path_json=excluded.category_path_json,
  135 + category_name_json=excluded.category_name_json,
  136 + image_url=excluded.image_url,
  137 + skus_json=excluded.skus_json,
  138 + tags_json=excluded.tags_json,
  139 + raw_json=excluded.raw_json,
  140 + updated_at=excluded.updated_at
  141 + """,
  142 + rows,
  143 + )
  144 + self.conn.commit()
  145 +
  146 + def get_corpus_docs(self, tenant_id: str) -> List[Dict[str, Any]]:
  147 + rows = self.conn.execute(
  148 + "SELECT raw_json FROM corpus_docs WHERE tenant_id=? ORDER BY spu_id",
  149 + (tenant_id,),
  150 + ).fetchall()
  151 + return [json.loads(row["raw_json"]) for row in rows]
  152 +
  153 + def get_corpus_docs_by_spu_ids(self, tenant_id: str, spu_ids: Sequence[str]) -> Dict[str, Dict[str, Any]]:
  154 + keys = [str(spu_id) for spu_id in spu_ids if str(spu_id).strip()]
  155 + if not keys:
  156 + return {}
  157 + placeholders = ",".join("?" for _ in keys)
  158 + rows = self.conn.execute(
  159 + f"""
  160 + SELECT spu_id, raw_json
  161 + FROM corpus_docs
  162 + WHERE tenant_id=? AND spu_id IN ({placeholders})
  163 + """,
  164 + [tenant_id, *keys],
  165 + ).fetchall()
  166 + return {
  167 + str(row["spu_id"]): json.loads(row["raw_json"])
  168 + for row in rows
  169 + }
  170 +
  171 + def has_corpus(self, tenant_id: str) -> bool:
  172 + row = self.conn.execute(
  173 + "SELECT COUNT(1) AS n FROM corpus_docs WHERE tenant_id=?",
  174 + (tenant_id,),
  175 + ).fetchone()
  176 + return bool(row and row["n"] > 0)
  177 +
  178 + def get_rerank_scores(self, tenant_id: str, query_text: str) -> Dict[str, float]:
  179 + rows = self.conn.execute(
  180 + """
  181 + SELECT spu_id, score
  182 + FROM rerank_scores
  183 + WHERE tenant_id=? AND query_text=?
  184 + """,
  185 + (tenant_id, query_text),
  186 + ).fetchall()
  187 + return {str(row["spu_id"]): float(row["score"]) for row in rows}
  188 +
  189 + def upsert_rerank_scores(
  190 + self,
  191 + tenant_id: str,
  192 + query_text: str,
  193 + scores: Dict[str, float],
  194 + model_name: str,
  195 + ) -> None:
  196 + now = utc_now_iso()
  197 + rows = [
  198 + (tenant_id, query_text, spu_id, float(score), model_name, now)
  199 + for spu_id, score in scores.items()
  200 + ]
  201 + self.conn.executemany(
  202 + """
  203 + INSERT INTO rerank_scores (tenant_id, query_text, spu_id, score, model_name, updated_at)
  204 + VALUES (?, ?, ?, ?, ?, ?)
  205 + ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET
  206 + score=excluded.score,
  207 + model_name=excluded.model_name,
  208 + updated_at=excluded.updated_at
  209 + """,
  210 + rows,
  211 + )
  212 + self.conn.commit()
  213 +
  214 + def get_labels(self, tenant_id: str, query_text: str) -> Dict[str, str]:
  215 + rows = self.conn.execute(
  216 + """
  217 + SELECT spu_id, label
  218 + FROM relevance_labels
  219 + WHERE tenant_id=? AND query_text=?
  220 + """,
  221 + (tenant_id, query_text),
  222 + ).fetchall()
  223 + return {str(row["spu_id"]): str(row["label"]) for row in rows}
  224 +
  225 + def upsert_labels(
  226 + self,
  227 + tenant_id: str,
  228 + query_text: str,
  229 + labels: Dict[str, str],
  230 + judge_model: str,
  231 + raw_response: str,
  232 + ) -> None:
  233 + now = utc_now_iso()
  234 + rows = []
  235 + for spu_id, label in labels.items():
  236 + if label not in VALID_LABELS:
  237 + raise ValueError(f"invalid label: {label}")
  238 + rows.append((tenant_id, query_text, spu_id, label, judge_model, raw_response, now))
  239 + self.conn.executemany(
  240 + """
  241 + INSERT INTO relevance_labels (tenant_id, query_text, spu_id, label, judge_model, raw_response, updated_at)
  242 + VALUES (?, ?, ?, ?, ?, ?, ?)
  243 + ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET
  244 + label=excluded.label,
  245 + judge_model=excluded.judge_model,
  246 + raw_response=excluded.raw_response,
  247 + updated_at=excluded.updated_at
  248 + """,
  249 + rows,
  250 + )
  251 + self.conn.commit()
  252 +
  253 + def get_query_profile(self, tenant_id: str, query_text: str, prompt_version: str) -> Optional[Dict[str, Any]]:
  254 + row = self.conn.execute(
  255 + """
  256 + SELECT profile_json
  257 + FROM query_profiles
  258 + WHERE tenant_id=? AND query_text=? AND prompt_version=?
  259 + """,
  260 + (tenant_id, query_text, prompt_version),
  261 + ).fetchone()
  262 + if not row:
  263 + return None
  264 + return json.loads(row["profile_json"])
  265 +
  266 + def upsert_query_profile(
  267 + self,
  268 + tenant_id: str,
  269 + query_text: str,
  270 + prompt_version: str,
  271 + judge_model: str,
  272 + profile: Dict[str, Any],
  273 + raw_response: str,
  274 + ) -> None:
  275 + self.conn.execute(
  276 + """
  277 + INSERT OR REPLACE INTO query_profiles
  278 + (tenant_id, query_text, prompt_version, judge_model, profile_json, raw_response, updated_at)
  279 + VALUES (?, ?, ?, ?, ?, ?, ?)
  280 + """,
  281 + (
  282 + tenant_id,
  283 + query_text,
  284 + prompt_version,
  285 + judge_model,
  286 + safe_json_dumps(profile),
  287 + raw_response,
  288 + utc_now_iso(),
  289 + ),
  290 + )
  291 + self.conn.commit()
  292 +
  293 + def insert_build_run(self, run_id: str, tenant_id: str, query_text: str, output_json_path: Path, metadata: Dict[str, Any]) -> None:
  294 + self.conn.execute(
  295 + """
  296 + INSERT OR REPLACE INTO build_runs (run_id, tenant_id, query_text, output_json_path, metadata_json, created_at)
  297 + VALUES (?, ?, ?, ?, ?, ?)
  298 + """,
  299 + (run_id, tenant_id, query_text, str(output_json_path), safe_json_dumps(metadata), utc_now_iso()),
  300 + )
  301 + self.conn.commit()
  302 +
  303 + def insert_batch_run(
  304 + self,
  305 + batch_id: str,
  306 + tenant_id: str,
  307 + output_json_path: Path,
  308 + report_markdown_path: Path,
  309 + config_snapshot_path: Path,
  310 + metadata: Dict[str, Any],
  311 + ) -> None:
  312 + self.conn.execute(
  313 + """
  314 + INSERT OR REPLACE INTO batch_runs
  315 + (batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at)
  316 + VALUES (?, ?, ?, ?, ?, ?, ?)
  317 + """,
  318 + (
  319 + batch_id,
  320 + tenant_id,
  321 + str(output_json_path),
  322 + str(report_markdown_path),
  323 + str(config_snapshot_path),
  324 + safe_json_dumps(metadata),
  325 + utc_now_iso(),
  326 + ),
  327 + )
  328 + self.conn.commit()
  329 +
  330 + def list_batch_runs(self, limit: int = 20) -> List[Dict[str, Any]]:
  331 + rows = self.conn.execute(
  332 + """
  333 + SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
  334 + FROM batch_runs
  335 + ORDER BY created_at DESC
  336 + LIMIT ?
  337 + """,
  338 + (limit,),
  339 + ).fetchall()
  340 + items: List[Dict[str, Any]] = []
  341 + for row in rows:
  342 + items.append(
  343 + {
  344 + "batch_id": row["batch_id"],
  345 + "tenant_id": row["tenant_id"],
  346 + "output_json_path": row["output_json_path"],
  347 + "report_markdown_path": row["report_markdown_path"],
  348 + "config_snapshot_path": row["config_snapshot_path"],
  349 + "metadata": json.loads(row["metadata_json"]),
  350 + "created_at": row["created_at"],
  351 + }
  352 + )
  353 + return items
  354 +
  355 + def get_batch_run(self, batch_id: str) -> Optional[Dict[str, Any]]:
  356 + row = self.conn.execute(
  357 + """
  358 + SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
  359 + FROM batch_runs
  360 + WHERE batch_id = ?
  361 + """,
  362 + (batch_id,),
  363 + ).fetchone()
  364 + if row is None:
  365 + return None
  366 + return {
  367 + "batch_id": row["batch_id"],
  368 + "tenant_id": row["tenant_id"],
  369 + "output_json_path": row["output_json_path"],
  370 + "report_markdown_path": row["report_markdown_path"],
  371 + "config_snapshot_path": row["config_snapshot_path"],
  372 + "metadata": json.loads(row["metadata_json"]),
  373 + "created_at": row["created_at"],
  374 + }
  375 +
  376 + def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]:
  377 + rows = self.conn.execute(
  378 + """
  379 + SELECT
  380 + query_text,
  381 + COUNT(*) AS total,
  382 + SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
  383 + SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
  384 + SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
  385 + MAX(updated_at) AS updated_at
  386 + FROM relevance_labels
  387 + WHERE tenant_id=?
  388 + GROUP BY query_text
  389 + ORDER BY query_text
  390 + """,
  391 + (tenant_id,),
  392 + ).fetchall()
  393 + return [
  394 + {
  395 + "query": str(row["query_text"]),
  396 + "total": int(row["total"]),
  397 + "exact_count": int(row["exact_count"] or 0),
  398 + "partial_count": int(row["partial_count"] or 0),
  399 + "irrelevant_count": int(row["irrelevant_count"] or 0),
  400 + "updated_at": row["updated_at"],
  401 + }
  402 + for row in rows
  403 + ]
  404 +
  405 + def get_query_label_stats(self, tenant_id: str, query_text: str) -> Dict[str, Any]:
  406 + row = self.conn.execute(
  407 + """
  408 + SELECT
  409 + COUNT(*) AS total,
  410 + SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
  411 + SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
  412 + SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
  413 + MAX(updated_at) AS updated_at
  414 + FROM relevance_labels
  415 + WHERE tenant_id=? AND query_text=?
  416 + """,
  417 + (tenant_id, query_text),
  418 + ).fetchone()
  419 + return {
  420 + "query": query_text,
  421 + "total": int((row["total"] or 0) if row else 0),
  422 + "exact_count": int((row["exact_count"] or 0) if row else 0),
  423 + "partial_count": int((row["partial_count"] or 0) if row else 0),
  424 + "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0),
  425 + "updated_at": row["updated_at"] if row else None,
  426 + }
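A minimal usage sketch of the `EvalStore` round-trip above. The db path, tenant id, query, and judge model are invented example values, and the import assumes `scripts/evaluation` is on `sys.path` so the `eval_framework` package resolves; labels must be members of `VALID_LABELS` (`Exact`, `Partial`, `Irrelevant`) or `upsert_labels` raises `ValueError`.

```python
# Sketch only: "tenant_demo", the query, and the db path are made-up example
# values, not fixtures from the repository.
from pathlib import Path

from eval_framework.store import EvalStore

store = EvalStore(Path("artifacts/search_evaluation/eval.db"))

# Corpus docs are keyed by (tenant_id, spu_id); re-upserting the same key overwrites.
store.upsert_corpus_docs("tenant_demo", [{"spu_id": "sku-1", "title": {"en": "Red mug"}}])

# Labels are validated against VALID_LABELS before the executemany runs.
store.upsert_labels(
    tenant_id="tenant_demo",
    query_text="red coffee mug",
    labels={"sku-1": "Exact"},
    judge_model="example-judge",
    raw_response="{}",
)

print(store.get_labels("tenant_demo", "red coffee mug"))             # {'sku-1': 'Exact'}
print(store.get_query_label_stats("tenant_demo", "red coffee mug"))  # total=1, exact_count=1, ...
```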
scripts/evaluation/eval_framework/utils.py 0 → 100644
@@ -0,0 +1,145 @@ @@ -0,0 +1,145 @@
  1 +"""Small helpers: time, JSON, document text, LLM output parsing."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import hashlib
  6 +import json
  7 +import re
  8 +from datetime import datetime, timezone
  9 +from pathlib import Path
  10 +from typing import Any, Dict, List, Sequence, Tuple
  11 +
  12 +from .constants import PROJECT_ROOT
  13 +
  14 +
  15 +def utc_now_iso() -> str:
  16 + return datetime.now(timezone.utc).isoformat()
  17 +
  18 +
  19 +def utc_timestamp() -> str:
  20 + return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
  21 +
  22 +
  23 +def ensure_dir(path: Path) -> Path:
  24 + path.mkdir(parents=True, exist_ok=True)
  25 + return path
  26 +
  27 +
  28 +def sha1_text(text: str) -> str:
  29 + return hashlib.sha1(text.encode("utf-8")).hexdigest()
  30 +
  31 +
  32 +def pick_text(value: Any, preferred_lang: str = "en") -> str:
  33 + if value is None:
  34 + return ""
  35 + if isinstance(value, dict):
  36 + return str(
  37 + value.get(preferred_lang)
  38 + or value.get("en")
  39 + or value.get("zh")
  40 + or next((v for v in value.values() if v), "")
  41 + ).strip()
  42 + return str(value).strip()
  43 +
  44 +
  45 +def safe_json_dumps(data: Any) -> str:
  46 + return json.dumps(data, ensure_ascii=False, separators=(",", ":"))
  47 +
  48 +
  49 +def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
  50 + if not skus:
  51 + return "", "", ""
  52 + first = skus[0] or {}
  53 + return (
  54 + str(first.get("option1_value") or "").strip(),
  55 + str(first.get("option2_value") or "").strip(),
  56 + str(first.get("option3_value") or "").strip(),
  57 + )
  58 +
  59 +
  60 +def build_display_title(doc: Dict[str, Any]) -> str:
  61 + title = doc.get("title")
  62 + en = pick_text(title, "en")
  63 + zh = pick_text(title, "zh")
  64 + if en and zh and en != zh:
  65 + return f"{en} / {zh}"
  66 + return en or zh
  67 +
  68 +
  69 +def build_rerank_doc(doc: Dict[str, Any]) -> str:
  70 + title = build_display_title(doc)
  71 + return title[:400]
  72 +
  73 +
  74 +def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
  75 + title = build_display_title(doc)
  76 + option1, option2, option3 = compact_option_values(doc.get("skus") or [])
  77 + vendor = pick_text(doc.get("vendor"), "en")
  78 + category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
  79 + tags = doc.get("tags") or []
  80 + tags_text = ", ".join(str(tag) for tag in tags[:4] if tag)
  81 + parts = [title]
  82 + if option1:
  83 + parts.append(f"option1={option1}")
  84 + if option2:
  85 + parts.append(f"option2={option2}")
  86 + if option3:
  87 + parts.append(f"option3={option3}")
  88 + if vendor:
  89 + parts.append(f"vendor={vendor}")
  90 + if category:
  91 + parts.append(f"category={category}")
  92 + if tags_text:
  93 + parts.append(f"tags={tags_text}")
  94 + return f"{idx}. " + " | ".join(part for part in parts if part)
  95 +
  96 +
  97 +def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
  98 + return {
  99 + "spu_id": str(doc.get("spu_id") or ""),
  100 + "title": build_display_title(doc),
  101 + "image_url": doc.get("image_url"),
  102 + "vendor": pick_text(doc.get("vendor"), "en"),
  103 + "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"),
  104 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  105 + "tags": list((doc.get("tags") or [])[:6]),
  106 + }
  107 +
  108 +
  109 +def normalize_text(text: Any) -> str:
  110 + value = str(text or "").strip().lower()
  111 + value = re.sub(r"\s+", " ", value)
  112 + return value
  113 +
  114 +
  115 +def extract_json_blob(text: str) -> Any:
  116 + cleaned = str(text or "").strip()
  117 + candidates: List[str] = [cleaned]
  118 + fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I)
  119 + candidates.extend(match.strip() for match in fence_matches if match.strip())
  120 +
  121 + for candidate in candidates:
  122 + try:
  123 + return json.loads(candidate)
  124 + except Exception:
  125 + pass
  126 +
  127 + starts = [idx for idx, ch in enumerate(cleaned) if ch in "[{"]
  128 + ends = [idx for idx, ch in enumerate(cleaned) if ch in "]}"]
  129 + for start in starts:
  130 + for end in reversed(ends):
  131 + if end <= start:
  132 + continue
  133 + fragment = cleaned[start : end + 1]
  134 + try:
  135 + return json.loads(fragment)
  136 + except Exception:
  137 + continue
  138 + raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")
  139 +
  140 +
  141 +def ensure_project_on_path() -> None:
  142 + import sys
  143 +
  144 + if str(PROJECT_ROOT) not in sys.path:
  145 + sys.path.insert(0, str(PROJECT_ROOT))
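A short sketch of how these helpers compose when formatting a judge prompt line and parsing a fenced LLM reply. The product dict is an invented example, and the import again assumes the `eval_framework` package is importable.

```python
# Sketch only: the product document below is an invented example.
from eval_framework.utils import build_label_doc_line, extract_json_blob

doc = {
    "title": {"en": "Ceramic mug", "zh": "陶瓷杯"},
    "vendor": {"en": "Acme"},
    "category_path": {"en": "Kitchen > Drinkware"},
    "skus": [{"option1_value": "Red", "option2_value": "350ml"}],
    "tags": ["mug", "gift"],
}

# -> "1. Ceramic mug / 陶瓷杯 | option1=Red | option2=350ml | vendor=Acme | category=Kitchen > Drinkware | tags=mug, gift"
print(build_label_doc_line(1, doc))

# extract_json_blob tolerates fenced ```json replies and surrounding prose before
# falling back to scanning for the outermost {...} / [...] fragment.
raw = 'Here are the labels:\n```json\n{"sku-1": "Exact"}\n```'
print(extract_json_blob(raw))  # {'sku-1': 'Exact'}
```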
scripts/evaluation/eval_framework/web_app.py 0 → 100644
@@ -0,0 +1,85 @@ @@ -0,0 +1,85 @@
  1 +"""FastAPI app for the search evaluation UI (static frontend + JSON APIs)."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from pathlib import Path
  6 +from typing import Any, Dict
  7 +
  8 +from fastapi import FastAPI, HTTPException
  9 +from fastapi.responses import HTMLResponse
  10 +from fastapi.staticfiles import StaticFiles
  11 +
  12 +from .api_models import BatchEvalRequest, SearchEvalRequest
  13 +from .constants import DEFAULT_QUERY_FILE
  14 +from .framework import SearchEvaluationFramework
  15 +
  16 +_STATIC_DIR = Path(__file__).resolve().parent / "static"
  17 +
  18 +
  19 +def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFAULT_QUERY_FILE) -> FastAPI:
  20 + app = FastAPI(title="Search Evaluation UI", version="1.0.0")
  21 +
  22 + app.mount(
  23 + "/static",
  24 + StaticFiles(directory=str(_STATIC_DIR)),
  25 + name="static",
  26 + )
  27 +
  28 + index_path = _STATIC_DIR / "index.html"
  29 +
  30 + @app.get("/", response_class=HTMLResponse)
  31 + def home() -> str:
  32 + return index_path.read_text(encoding="utf-8")
  33 +
  34 + @app.get("/api/queries")
  35 + def api_queries() -> Dict[str, Any]:
  36 + return {"queries": framework.queries_from_file(query_file)}
  37 +
  38 + @app.post("/api/search-eval")
  39 + def api_search_eval(request: SearchEvalRequest) -> Dict[str, Any]:
  40 + return framework.evaluate_live_query(
  41 + query=request.query,
  42 + top_k=request.top_k,
  43 + auto_annotate=request.auto_annotate,
  44 + language=request.language,
  45 + )
  46 +
  47 + @app.post("/api/batch-eval")
  48 + def api_batch_eval(request: BatchEvalRequest) -> Dict[str, Any]:
  49 + queries = request.queries or framework.queries_from_file(query_file)
  50 + if not queries:
  51 + raise HTTPException(status_code=400, detail="No queries provided")
  52 + return framework.batch_evaluate(
  53 + queries=queries,
  54 + top_k=request.top_k,
  55 + auto_annotate=request.auto_annotate,
  56 + language=request.language,
  57 + force_refresh_labels=request.force_refresh_labels,
  58 + )
  59 +
  60 + @app.get("/api/history")
  61 + def api_history() -> Dict[str, Any]:
  62 + return {"history": framework.store.list_batch_runs(limit=20)}
  63 +
  64 + @app.get("/api/history/{batch_id}/report")
  65 + def api_history_report(batch_id: str) -> Dict[str, Any]:
  66 + row = framework.store.get_batch_run(batch_id)
  67 + if row is None:
  68 + raise HTTPException(status_code=404, detail="Unknown batch_id")
  69 + report_path = Path(row["report_markdown_path"]).resolve()
  70 + root = framework.artifact_root.resolve()
  71 + try:
  72 + report_path.relative_to(root)
  73 + except ValueError:
  74 + raise HTTPException(status_code=403, detail="Report path is outside artifact root")
  75 + if not report_path.is_file():
  76 + raise HTTPException(status_code=404, detail="Report file not found")
  77 + return {
  78 + "batch_id": row["batch_id"],
  79 + "created_at": row["created_at"],
  80 + "tenant_id": row["tenant_id"],
  81 + "report_markdown_path": str(report_path),
  82 + "markdown": report_path.read_text(encoding="utf-8"),
  83 + }
  84 +
  85 + return app
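A hedged sketch of wiring `create_web_app` into a server, roughly what the thin `serve_eval_web.py` entrypoint does. The `SearchEvaluationFramework` constructor arguments are not shown in this hunk, so the no-argument construction below is an assumption, as are the host and port.

```python
# Sketch only: serve_eval_web.py is the real entrypoint. The no-arg framework
# construction and the host/port values are assumptions, not the actual signature.
import uvicorn

from eval_framework.framework import SearchEvaluationFramework
from eval_framework.web_app import create_web_app

framework = SearchEvaluationFramework()  # assumed default construction
app = create_web_app(framework)          # "/" serves static/index.html; JSON APIs live under /api/*

if __name__ == "__main__":
    uvicorn.run(app, host="127.0.0.1", port=8080)
```

Note that the `/api/history/{batch_id}/report` handler resolves the stored markdown path and rejects anything outside `framework.artifact_root`, so report reads are confined to the artifact directory.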