Commit c81b0fc12093cca9a8eef590e545a71a3fc2cd1c

Authored by tangwang
1 parent 7b8d9e1a

scripts/evaluation/eval_framework

scripts/evaluation/README.md
... ... @@ -19,12 +19,12 @@ The framework supports four related tasks:
19 19  
20 20 ## Files
21 21  
22   -- `eval_framework.py`
23   - Search evaluation core implementation, CLI, FastAPI app, SQLite store, audit logic, and report generation.
  22 +- `eval_framework/` (Python package)
  23 + Modular layout: `framework.py` (orchestration), `store.py` (SQLite), `clients.py` (search/rerank/LLM), `prompts.py` (judge templates), `metrics.py`, `reports.py`, `web_app.py`, `cli.py`, and `static/` (evaluation UI HTML/CSS/JS).
24 24 - `build_annotation_set.py`
25   - Thin CLI entrypoint into `eval_framework.py`.
  25 + Thin CLI entrypoint into `eval_framework`.
26 26 - `serve_eval_web.py`
27   - Thin web entrypoint into `eval_framework.py`.
  27 + Thin web entrypoint into `eval_framework`.
28 28 - `tune_fusion.py`
29 29 Fusion experiment runner. It applies config variants, restarts backend, runs batch evaluation, and stores experiment reports.
30 30 - `fusion_experiments_shortlist.json`
... ...
scripts/evaluation/eval_framework.py deleted
... ... @@ -1,2140 +0,0 @@
1   -#!/usr/bin/env python3
2   -"""
3   -Search evaluation framework for pooled relevance annotation, live metrics, and reports.
4   -"""
5   -
6   -from __future__ import annotations
7   -
8   -import argparse
9   -import hashlib
10   -import json
11   -import math
12   -import os
13   -import re
14   -import sqlite3
15   -import sys
16   -import time
17   -from dataclasses import dataclass
18   -from datetime import datetime, timezone
19   -from pathlib import Path
20   -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
21   -
22   -import requests
23   -from elasticsearch.helpers import scan
24   -from fastapi import FastAPI, HTTPException
25   -from fastapi.responses import HTMLResponse
26   -from pydantic import BaseModel, Field
27   -
# Resolve the repository root (two directories above scripts/evaluation/) and
# prepend it to sys.path so the project-local packages imported below resolve
# when this file is executed directly as a script.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Project-local imports; must come after the sys.path fix-up above.
from api.app import get_app_config, get_es_client, get_query_parser, init_service
from indexer.mapping_generator import get_tenant_index_name
34   -
35   -
# Canonical three-level relevance scale used by labeling, storage, and metrics.
RELEVANCE_EXACT = "Exact"
RELEVANCE_PARTIAL = "Partial"
RELEVANCE_IRRELEVANT = "Irrelevant"
VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}
# Default on-disk locations for evaluation artifacts and the seed query list.
DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt"
# Judge prompt version tags. NOTE(review): the complex version matches the
# query_profiles primary key (tenant, query, prompt_version), so bumping it
# presumably invalidates cached profiles — confirm with the labeling flow.
JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331"
JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331"
DEFAULT_LABELER_MODE = "simple"
45   -
46   -
def utc_now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    now = datetime.now(timezone.utc)
    return now.isoformat()
49   -
50   -
def utc_timestamp() -> str:
    """Return a compact UTC stamp (e.g. 20240131T235959Z) for artifact names."""
    return format(datetime.now(timezone.utc), "%Y%m%dT%H%M%SZ")
53   -
54   -
def ensure_dir(path: Path) -> Path:
    """Create directory *path* (including parents) if absent; return *path*."""
    if not path.is_dir():
        path.mkdir(parents=True, exist_ok=True)
    return path
58   -
59   -
def sha1_text(text: str) -> str:
    """Return the hex SHA-1 digest of *text* (UTF-8 encoded)."""
    digest = hashlib.sha1()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
62   -
63   -
def pick_text(value: Any, preferred_lang: str = "en") -> str:
    """Extract a display string from a possibly multilingual value.

    Dicts are treated as {lang: text}: try *preferred_lang*, then "en",
    then "zh", then the first truthy value. Non-dict values are
    stringified. Returns "" for None. Result is stripped.
    """
    if value is None:
        return ""
    if not isinstance(value, dict):
        return str(value).strip()
    chosen = value.get(preferred_lang) or value.get("en") or value.get("zh")
    if not chosen:
        chosen = next((v for v in value.values() if v), "")
    return str(chosen).strip()
75   -
76   -
def safe_json_dumps(data: Any) -> str:
    """Serialize *data* to compact JSON, preserving non-ASCII characters."""
    return json.dumps(data, separators=(",", ":"), ensure_ascii=False)
79   -
80   -
def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
    """Return (option1, option2, option3) from the first SKU as stripped strings.

    Empty or missing values map to ""; an empty *skus* yields ("", "", "").
    """
    if not skus:
        return "", "", ""
    first = skus[0] or {}
    values = [
        str(first.get(key) or "").strip()
        for key in ("option1_value", "option2_value", "option3_value")
    ]
    return values[0], values[1], values[2]
90   -
91   -
def build_display_title(doc: Dict[str, Any]) -> str:
    """Build a bilingual "EN / ZH" title when both differ, else whichever exists."""
    raw_title = doc.get("title")
    english = pick_text(raw_title, "en")
    chinese = pick_text(raw_title, "zh")
    if english and chinese and english != chinese:
        return f"{english} / {chinese}"
    return english or chinese
99   -
100   -
def build_rerank_doc(doc: Dict[str, Any]) -> str:
    """Compact document text for the reranker: display title capped at 400 chars."""
    return build_display_title(doc)[:400]
104   -
105   -
def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """Render one numbered, pipe-delimited product line for the judge prompt.

    Includes title plus any non-empty option values, vendor, category,
    and up to four tags.
    """
    option1, option2, option3 = compact_option_values(doc.get("skus") or [])
    vendor = pick_text(doc.get("vendor"), "en")
    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    tags = doc.get("tags") or []
    tags_text = ", ".join(str(tag) for tag in tags[:4] if tag)

    fields = [build_display_title(doc)]
    for label, value in (
        ("option1", option1),
        ("option2", option2),
        ("option3", option3),
        ("vendor", vendor),
        ("category", category),
        ("tags", tags_text),
    ):
        if value:
            fields.append(f"{label}={value}")
    return f"{idx}. " + " | ".join(field for field in fields if field)
127   -
128   -
def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Trim a raw product doc to the fields the evaluation UI needs."""
    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    return {
        "spu_id": str(doc.get("spu_id") or ""),
        "title": build_display_title(doc),
        "image_url": doc.get("image_url"),
        "vendor": pick_text(doc.get("vendor"), "en"),
        "category": category,
        "option_values": list(compact_option_values(doc.get("skus") or [])),
        "tags": list((doc.get("tags") or [])[:6]),
    }
139   -
140   -
def normalize_text(text: Any) -> str:
    """Lowercase, trim, and collapse internal whitespace to single spaces."""
    collapsed = re.sub(r"\s+", " ", str(text or ""))
    return collapsed.strip().lower()
145   -
146   -
147   -def _extract_json_blob(text: str) -> Any:
148   - cleaned = str(text or "").strip()
149   - candidates: List[str] = [cleaned]
150   - fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I)
151   - candidates.extend(match.strip() for match in fence_matches if match.strip())
152   -
153   - for candidate in candidates:
154   - try:
155   - return json.loads(candidate)
156   - except Exception:
157   - pass
158   -
159   - starts = [idx for idx, ch in enumerate(cleaned) if ch in "[{"]
160   - ends = [idx for idx, ch in enumerate(cleaned) if ch in "]}"]
161   - for start in starts:
162   - for end in reversed(ends):
163   - if end <= start:
164   - continue
165   - fragment = cleaned[start : end + 1]
166   - try:
167   - return json.loads(fragment)
168   - except Exception:
169   - continue
170   - raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")
171   -
172   -
@dataclass
class QueryBuildResult:
    """Result record for building one query's pooled annotation set."""

    query: str
    tenant_id: str
    # Pipeline counts. NOTE(review): exact semantics are set by the build
    # routine (not shown in this chunk) — presumably search hit total,
    # retrieval depth, rerank pool size, and number of labeled docs.
    search_total: int
    search_depth: int
    rerank_corpus_size: int
    annotated_count: int
    # Where the per-query annotation JSON artifact was written.
    output_json_path: Path
182   -
183   -
class EvalStore:
    """SQLite-backed persistence for the evaluation framework.

    Tables: corpus_docs (per-tenant product snapshot), rerank_scores and
    relevance_labels (cached per tenant+query+spu), query_profiles (cached
    judge intent profiles per prompt version), and build_runs / batch_runs
    (run bookkeeping). All writes are idempotent upserts keyed on the
    natural primary key of each table.
    """

    def __init__(self, db_path: Path):
        self.db_path = db_path
        ensure_dir(db_path.parent)
        # check_same_thread=False allows the single connection to be used
        # from other threads (e.g. web handlers). NOTE(review): no locking
        # here — confirm writers do not actually run concurrently.
        self.conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self.conn.row_factory = sqlite3.Row
        self._init_schema()

    def _init_schema(self) -> None:
        """Create all tables if they do not exist (idempotent)."""
        self.conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS corpus_docs (
                tenant_id TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                title_json TEXT,
                vendor_json TEXT,
                category_path_json TEXT,
                category_name_json TEXT,
                image_url TEXT,
                skus_json TEXT,
                tags_json TEXT,
                raw_json TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, spu_id)
            );

            CREATE TABLE IF NOT EXISTS rerank_scores (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                score REAL NOT NULL,
                model_name TEXT,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, spu_id)
            );

            CREATE TABLE IF NOT EXISTS relevance_labels (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                label TEXT NOT NULL,
                judge_model TEXT,
                raw_response TEXT,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, spu_id)
            );

            CREATE TABLE IF NOT EXISTS build_runs (
                run_id TEXT PRIMARY KEY,
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                output_json_path TEXT NOT NULL,
                metadata_json TEXT NOT NULL,
                created_at TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS batch_runs (
                batch_id TEXT PRIMARY KEY,
                tenant_id TEXT NOT NULL,
                output_json_path TEXT NOT NULL,
                report_markdown_path TEXT NOT NULL,
                config_snapshot_path TEXT NOT NULL,
                metadata_json TEXT NOT NULL,
                created_at TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS query_profiles (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                prompt_version TEXT NOT NULL,
                judge_model TEXT,
                profile_json TEXT NOT NULL,
                raw_response TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, prompt_version)
            );
            """
        )
        self.conn.commit()

    def upsert_corpus_docs(self, tenant_id: str, docs: Sequence[Dict[str, Any]]) -> None:
        """Insert or refresh corpus documents for a tenant, keyed by spu_id."""
        now = utc_now_iso()
        rows = []
        for doc in docs:
            rows.append(
                (
                    tenant_id,
                    str(doc.get("spu_id") or ""),
                    safe_json_dumps(doc.get("title")),
                    safe_json_dumps(doc.get("vendor")),
                    safe_json_dumps(doc.get("category_path")),
                    safe_json_dumps(doc.get("category_name")),
                    str(doc.get("image_url") or ""),
                    safe_json_dumps(doc.get("skus") or []),
                    safe_json_dumps(doc.get("tags") or []),
                    safe_json_dumps(doc),
                    now,
                )
            )
        self.conn.executemany(
            """
            INSERT INTO corpus_docs (
                tenant_id, spu_id, title_json, vendor_json, category_path_json, category_name_json,
                image_url, skus_json, tags_json, raw_json, updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, spu_id) DO UPDATE SET
                title_json=excluded.title_json,
                vendor_json=excluded.vendor_json,
                category_path_json=excluded.category_path_json,
                category_name_json=excluded.category_name_json,
                image_url=excluded.image_url,
                skus_json=excluded.skus_json,
                tags_json=excluded.tags_json,
                raw_json=excluded.raw_json,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_corpus_docs(self, tenant_id: str) -> List[Dict[str, Any]]:
        """Return all stored raw docs for *tenant_id*, ordered by spu_id."""
        rows = self.conn.execute(
            "SELECT raw_json FROM corpus_docs WHERE tenant_id=? ORDER BY spu_id",
            (tenant_id,),
        ).fetchall()
        return [json.loads(row["raw_json"]) for row in rows]

    def get_corpus_docs_by_spu_ids(self, tenant_id: str, spu_ids: Sequence[str]) -> Dict[str, Dict[str, Any]]:
        """Fetch raw docs for the given spu_ids; returns {spu_id: doc}.

        Blank ids are dropped; missing ids are simply absent from the result.
        NOTE(review): the IN-clause is built with one bound parameter per id —
        very large id lists may exceed SQLite's host-parameter limit; confirm
        callers keep the list bounded.
        """
        keys = [str(spu_id) for spu_id in spu_ids if str(spu_id).strip()]
        if not keys:
            return {}
        placeholders = ",".join("?" for _ in keys)
        rows = self.conn.execute(
            f"""
            SELECT spu_id, raw_json
            FROM corpus_docs
            WHERE tenant_id=? AND spu_id IN ({placeholders})
            """,
            [tenant_id, *keys],
        ).fetchall()
        return {
            str(row["spu_id"]): json.loads(row["raw_json"])
            for row in rows
        }

    def has_corpus(self, tenant_id: str) -> bool:
        """True when at least one corpus doc is stored for *tenant_id*."""
        row = self.conn.execute(
            "SELECT COUNT(1) AS n FROM corpus_docs WHERE tenant_id=?",
            (tenant_id,),
        ).fetchone()
        return bool(row and row["n"] > 0)

    def get_rerank_scores(self, tenant_id: str, query_text: str) -> Dict[str, float]:
        """Return cached rerank scores for one query as {spu_id: score}."""
        rows = self.conn.execute(
            """
            SELECT spu_id, score
            FROM rerank_scores
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchall()
        return {str(row["spu_id"]): float(row["score"]) for row in rows}

    def upsert_rerank_scores(
        self,
        tenant_id: str,
        query_text: str,
        scores: Dict[str, float],
        model_name: str,
    ) -> None:
        """Insert or refresh rerank scores for one query (keyed per spu_id)."""
        now = utc_now_iso()
        rows = [
            (tenant_id, query_text, spu_id, float(score), model_name, now)
            for spu_id, score in scores.items()
        ]
        self.conn.executemany(
            """
            INSERT INTO rerank_scores (tenant_id, query_text, spu_id, score, model_name, updated_at)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET
                score=excluded.score,
                model_name=excluded.model_name,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_labels(self, tenant_id: str, query_text: str) -> Dict[str, str]:
        """Return stored relevance labels for one query as {spu_id: label}."""
        rows = self.conn.execute(
            """
            SELECT spu_id, label
            FROM relevance_labels
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchall()
        return {str(row["spu_id"]): str(row["label"]) for row in rows}

    def upsert_labels(
        self,
        tenant_id: str,
        query_text: str,
        labels: Dict[str, str],
        judge_model: str,
        raw_response: str,
    ) -> None:
        """Insert or refresh relevance labels for one query.

        Raises ValueError if any label is not in VALID_LABELS (validated
        before any row is written).
        """
        now = utc_now_iso()
        rows = []
        for spu_id, label in labels.items():
            if label not in VALID_LABELS:
                raise ValueError(f"invalid label: {label}")
            rows.append((tenant_id, query_text, spu_id, label, judge_model, raw_response, now))
        self.conn.executemany(
            """
            INSERT INTO relevance_labels (tenant_id, query_text, spu_id, label, judge_model, raw_response, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET
                label=excluded.label,
                judge_model=excluded.judge_model,
                raw_response=excluded.raw_response,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_query_profile(self, tenant_id: str, query_text: str, prompt_version: str) -> Optional[Dict[str, Any]]:
        """Return the cached judge profile for (query, prompt_version), or None."""
        row = self.conn.execute(
            """
            SELECT profile_json
            FROM query_profiles
            WHERE tenant_id=? AND query_text=? AND prompt_version=?
            """,
            (tenant_id, query_text, prompt_version),
        ).fetchone()
        if not row:
            return None
        return json.loads(row["profile_json"])

    def upsert_query_profile(
        self,
        tenant_id: str,
        query_text: str,
        prompt_version: str,
        judge_model: str,
        profile: Dict[str, Any],
        raw_response: str,
    ) -> None:
        """Store (replace) a judge query profile for one prompt version."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO query_profiles
            (tenant_id, query_text, prompt_version, judge_model, profile_json, raw_response, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                tenant_id,
                query_text,
                prompt_version,
                judge_model,
                safe_json_dumps(profile),
                raw_response,
                utc_now_iso(),
            ),
        )
        self.conn.commit()

    def insert_build_run(self, run_id: str, tenant_id: str, query_text: str, output_json_path: Path, metadata: Dict[str, Any]) -> None:
        """Record one single-query build run (replaces an existing run_id)."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO build_runs (run_id, tenant_id, query_text, output_json_path, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (run_id, tenant_id, query_text, str(output_json_path), safe_json_dumps(metadata), utc_now_iso()),
        )
        self.conn.commit()

    def insert_batch_run(
        self,
        batch_id: str,
        tenant_id: str,
        output_json_path: Path,
        report_markdown_path: Path,
        config_snapshot_path: Path,
        metadata: Dict[str, Any],
    ) -> None:
        """Record one batch evaluation run and its artifact paths."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO batch_runs
            (batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                batch_id,
                tenant_id,
                str(output_json_path),
                str(report_markdown_path),
                str(config_snapshot_path),
                safe_json_dumps(metadata),
                utc_now_iso(),
            ),
        )
        self.conn.commit()

    def list_batch_runs(self, limit: int = 20) -> List[Dict[str, Any]]:
        """Return the most recent batch runs (newest first), up to *limit*."""
        rows = self.conn.execute(
            """
            SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
            FROM batch_runs
            ORDER BY created_at DESC
            LIMIT ?
            """,
            (limit,),
        ).fetchall()
        items: List[Dict[str, Any]] = []
        for row in rows:
            items.append(
                {
                    "batch_id": row["batch_id"],
                    "tenant_id": row["tenant_id"],
                    "output_json_path": row["output_json_path"],
                    "report_markdown_path": row["report_markdown_path"],
                    "config_snapshot_path": row["config_snapshot_path"],
                    "metadata": json.loads(row["metadata_json"]),
                    "created_at": row["created_at"],
                }
            )
        return items

    def get_batch_run(self, batch_id: str) -> Optional[Dict[str, Any]]:
        """Return one batch run by id with metadata decoded, or None."""
        row = self.conn.execute(
            """
            SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
            FROM batch_runs
            WHERE batch_id = ?
            """,
            (batch_id,),
        ).fetchone()
        if row is None:
            return None
        return {
            "batch_id": row["batch_id"],
            "tenant_id": row["tenant_id"],
            "output_json_path": row["output_json_path"],
            "report_markdown_path": row["report_markdown_path"],
            "config_snapshot_path": row["config_snapshot_path"],
            "metadata": json.loads(row["metadata_json"]),
            "created_at": row["created_at"],
        }

    def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]:
        """Per-query label counts (total/Exact/Partial/Irrelevant) for a tenant."""
        rows = self.conn.execute(
            """
            SELECT
                query_text,
                COUNT(*) AS total,
                SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
                SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
                SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
                MAX(updated_at) AS updated_at
            FROM relevance_labels
            WHERE tenant_id=?
            GROUP BY query_text
            ORDER BY query_text
            """,
            (tenant_id,),
        ).fetchall()
        return [
            {
                "query": str(row["query_text"]),
                "total": int(row["total"]),
                "exact_count": int(row["exact_count"] or 0),
                "partial_count": int(row["partial_count"] or 0),
                "irrelevant_count": int(row["irrelevant_count"] or 0),
                "updated_at": row["updated_at"],
            }
            for row in rows
        ]

    def get_query_label_stats(self, tenant_id: str, query_text: str) -> Dict[str, Any]:
        """Label counts for one query; zero counts when nothing is labeled."""
        row = self.conn.execute(
            """
            SELECT
                COUNT(*) AS total,
                SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
                SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
                SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
                MAX(updated_at) AS updated_at
            FROM relevance_labels
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchone()
        return {
            "query": query_text,
            "total": int((row["total"] or 0) if row else 0),
            "exact_count": int((row["exact_count"] or 0) if row else 0),
            "partial_count": int((row["partial_count"] or 0) if row else 0),
            "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0),
            "updated_at": row["updated_at"] if row else None,
        }
585   -
586   -
class SearchServiceClient:
    """Thin HTTP client for the search service's /search/ endpoint."""

    def __init__(self, base_url: str, tenant_id: str):
        self.base_url = base_url.rstrip("/")
        self.tenant_id = str(tenant_id)
        self.session = requests.Session()

    def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]:
        """POST a search request and return the decoded JSON body.

        Raises requests.HTTPError on a non-2xx response.
        """
        request_headers = {
            "Content-Type": "application/json",
            "X-Tenant-ID": self.tenant_id,
        }
        request_body = {"query": query, "size": size, "from": from_, "language": language}
        response = self.session.post(
            f"{self.base_url}/search/",
            headers=request_headers,
            json=request_body,
            timeout=120,
        )
        response.raise_for_status()
        return response.json()
602   -
603   -
class RerankServiceClient:
    """Thin HTTP client for the external rerank scoring service."""

    def __init__(self, service_url: str):
        self.service_url = service_url.rstrip("/")
        self.session = requests.Session()

    def rerank(self, query: str, docs: Sequence[str], normalize: bool = False, top_n: Optional[int] = None) -> Tuple[List[float], Dict[str, Any]]:
        """Score *docs* against *query*; return (scores, meta) from the service.

        Raises requests.HTTPError on a non-2xx response.
        """
        request_body: Dict[str, Any] = {
            "query": query,
            "docs": list(docs),
            "normalize": normalize,
        }
        if top_n is not None:
            request_body["top_n"] = int(top_n)
        response = self.session.post(self.service_url, json=request_body, timeout=180)
        response.raise_for_status()
        body = response.json()
        scores = list(body.get("scores") or [])
        meta = dict(body.get("meta") or {})
        return scores, meta
621   -
622   -
class DashScopeLabelClient:
    """LLM judge client (OpenAI-compatible chat API) for relevance labeling.

    Supports batch labeling in a "simple" flat-prompt mode and a "complex"
    mode driven by a structured query profile, plus query-profile extraction.
    """

    def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40):
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        self.batch_size = int(batch_size)
        self.session = requests.Session()

    def _chat(self, prompt: str) -> Tuple[str, str]:
        """Send one user message; return (content_text, full_response_json).

        Uses temperature 0 / top_p 0.1 for near-deterministic judging.
        Raises requests.HTTPError on a non-2xx response.
        """
        response = self.session.post(
            f"{self.base_url}/chat/completions",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0,
                "top_p": 0.1,
            },
            timeout=180,
        )
        response.raise_for_status()
        data = response.json()
        content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
        return content, safe_json_dumps(data)

    def classify_batch_simple(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Label *docs* against *query* with a flat line-per-product prompt.

        Expects one label per line; falls back to parsing a JSON "labels"
        payload. Raises ValueError when the label count or values are wrong.
        NOTE(review): the "โ€”" sequences below look like a mojibake em dash
        ("—") — confirm the source file's encoding; kept byte-identical here.
        """
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = (
            "You are an e-commerce search result relevance evaluation assistant. "
            "Based on the user query and each product's information, output the relevance level for each product.\n\n"
            "## Relevance Level Criteria\n"
            "Exact โ€” Fully matches the user's search intent.\n"
            "Partial โ€” Primary intent satisfied (same category or similar use, basically aligns with search intent), "
            "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n"
            "Irrelevant โ€” Category or use case mismatched, primary intent not satisfied.\n\n"
            "Additional judging guidance:\n"
            "- If the query clearly names a product type, product type matching has the highest priority. "
            "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, "
            "bra vs top, backpack vs bag are not interchangeable.\n"
            "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n"
            "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n"
            "- Do not guess missing attributes.\n"
            "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n"
            "- Be conservative with Exact.\n\n"
            f"Query: {query}\n\n"
            "Products:\n"
            + "\n".join(numbered_docs)
            + "\n\n## Output Format\n"
            f"Strictly output {len(docs)} lines, each line containing exactly one of Exact / Partial / Irrelevant. "
            "They must correspond sequentially to the products above. Do not output any other information.\n"
        )
        content, raw_response = self._chat(prompt)
        labels = []
        for line in str(content or "").splitlines():
            label = line.strip()
            if label in VALID_LABELS:
                labels.append(label)
        # Fallback: some models reply with JSON despite the line-format ask.
        if len(labels) != len(docs):
            payload = _extract_json_blob(content)
            if isinstance(payload, dict) and isinstance(payload.get("labels"), list):
                labels = []
                for item in payload["labels"][: len(docs)]:
                    if isinstance(item, dict):
                        label = str(item.get("label") or "").strip()
                    else:
                        label = str(item).strip()
                    if label in VALID_LABELS:
                        labels.append(label)
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected simple label output: {content!r}")
        return labels, raw_response

    def extract_query_profile(
        self,
        query: str,
        parser_hints: Dict[str, Any],
    ) -> Tuple[Dict[str, Any], str]:
        """Ask the judge for a structured intent profile of *query*.

        Returns (profile_dict, raw_response); missing schema keys are filled
        with defaults. Raises ValueError if the reply is not a JSON object.
        """
        prompt = (
            "You are building a structured intent profile for e-commerce relevance judging.\n"
            "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n"
            "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n"
            "Return JSON with this schema:\n"
            "{\n"
            '  "normalized_query_en": string,\n'
            '  "primary_category": string,\n'
            '  "allowed_categories": [string],\n'
            '  "required_attributes": [\n'
            '    {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n'
            "  ],\n"
            '  "notes": [string]\n'
            "}\n\n"
            "Guidelines:\n"
            "- Exact later will require explicit evidence for all required attributes.\n"
            "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n"
            "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n"
            "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n"
            "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n"
            "- For color, include conflicting colors only when clear from the query.\n\n"
            f"Original query: {query}\n"
            f"Parser hints JSON: {json.dumps(parser_hints, ensure_ascii=False)}\n"
        )
        content, raw_response = self._chat(prompt)
        payload = _extract_json_blob(content)
        if not isinstance(payload, dict):
            raise ValueError(f"unexpected query profile payload: {content!r}")
        payload.setdefault("normalized_query_en", query)
        payload.setdefault("primary_category", "")
        payload.setdefault("allowed_categories", [])
        payload.setdefault("required_attributes", [])
        payload.setdefault("notes", [])
        return payload, raw_response

    def classify_batch_complex(
        self,
        query: str,
        query_profile: Dict[str, Any],
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Label *docs* against a structured query profile; JSON-only reply.

        Raises ValueError when the reply is not the expected JSON shape or
        the label count/values are wrong.
        """
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = (
            "You are an e-commerce search relevance judge.\n"
            "Judge each product against the structured query profile below.\n\n"
            "Relevance rules:\n"
            "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n"
            "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n"
            "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n"
            "- Be conservative with Exact.\n"
            "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n"
            "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n"
            f"Original query: {query}\n"
            f"Structured query profile JSON: {json.dumps(query_profile, ensure_ascii=False)}\n\n"
            "Products:\n"
            + "\n".join(numbered_docs)
            + "\n\nReturn JSON only, with schema:\n"
            '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n'
        )
        content, raw_response = self._chat(prompt)
        payload = _extract_json_blob(content)
        if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):
            raise ValueError(f"unexpected label payload: {content!r}")
        labels_payload = payload["labels"]
        labels: List[str] = []
        for item in labels_payload[: len(docs)]:
            if not isinstance(item, dict):
                continue
            label = str(item.get("label") or "").strip()
            if label in VALID_LABELS:
                labels.append(label)
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected label output: {content!r}")
        return labels, raw_response
781   -
782   -
def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
    """Precision over the top-*k* labels.

    The denominator is the number of entries actually available in the
    slice (which may be fewer than k). Returns 0.0 when k <= 0 or the
    slice is empty.
    """
    if k <= 0:
        return 0.0
    top = list(labels[:k])
    if not top:
        return 0.0
    relevant_hits = len([label for label in top if label in relevant])
    return relevant_hits / float(len(top))
791   -
792   -
def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
    """Average precision: mean of precision-at-each-hit over relevant ranks.

    Returns 0.0 when no label appears in *relevant*.
    """
    per_hit: List[float] = []
    for rank, label in enumerate(labels, start=1):
        if label in relevant:
            per_hit.append((len(per_hit) + 1) / rank)
    if not per_hit:
        return 0.0
    return sum(per_hit) / len(per_hit)
804   -
805   -
def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:
    """Per-query metric bundle: P@k and MAP for Exact-only ("_3") and
    Exact+Partial ("_2_3") relevance sets, rounded to 6 decimals."""
    exact_only = [RELEVANCE_EXACT]
    exact_or_partial = [RELEVANCE_EXACT, RELEVANCE_PARTIAL]
    metrics: Dict[str, float] = {}
    for k in (5, 10, 20, 50):
        metrics[f"P@{k}"] = round(precision_at_k(labels, k, exact_only), 6)
        metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, exact_or_partial), 6)
    metrics["MAP_3"] = round(average_precision(labels, exact_only), 6)
    metrics["MAP_2_3"] = round(average_precision(labels, exact_or_partial), 6)
    return metrics
814   -
815   -
def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]:
    """Average per-query metric dicts into one dict, keyed by the first item's keys.

    Missing keys in later items contribute 0.0. Returns {} for empty input.
    """
    if not metric_items:
        return {}
    count = len(metric_items)
    averaged: Dict[str, float] = {}
    for key in sorted(metric_items[0]):
        total = sum(float(entry.get(key, 0.0)) for entry in metric_items)
        averaged[key] = round(total / count, 6)
    return averaged
824   -
825   -
def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
    """Count how many labels fall into each of the three relevance buckets.

    Labels outside the three known values are ignored.
    """
    counts = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 0, RELEVANCE_IRRELEVANT: 0}
    for label in labels:
        if label in counts:
            counts[label] += 1
    return counts
832   -
833   -
class SearchEvaluationFramework:
    """End-to-end search evaluation for one tenant.

    Wires together the SQLite evaluation store, the live search service,
    the rerank service, and the DashScope LLM judge used for pooled
    relevance annotation.
    """

    def __init__(
        self,
        tenant_id: str,
        artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
        search_base_url: str = "http://localhost:6002",
        labeler_mode: str = DEFAULT_LABELER_MODE,
    ):
        # Initialize ES-backed services before anything touches the index.
        init_service(get_app_config().infrastructure.elasticsearch.host)
        self.tenant_id = str(tenant_id)
        self.artifact_root = ensure_dir(artifact_root)
        # Normalize the mode string; blank input falls back to the default mode.
        self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
        self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
        self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
        app_cfg = get_app_config()
        # Rerank endpoint is taken from the default http provider instance.
        rerank_service_url = str(
            app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
        )
        self.rerank_client = RerankServiceClient(rerank_service_url)
        # The LLM judge reuses the translation capability's model/base_url config.
        llm_cfg = app_cfg.services.translation.capabilities["llm"]
        api_key = app_cfg.infrastructure.secrets.dashscope_api_key
        if not api_key:
            raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
        self.label_client = DashScopeLabelClient(
            model=str(llm_cfg["model"]),
            base_url=str(llm_cfg["base_url"]),
            api_key=str(api_key),
        )
        # Created lazily by _get_query_parser() on first use.
        self.query_parser = None
863   -
864   - def _get_query_parser(self):
865   - if self.query_parser is None:
866   - self.query_parser = get_query_parser()
867   - return self.query_parser
868   -
869   - def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
870   - parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"])
871   - payload = parsed.to_dict()
872   - payload["text_for_rerank"] = parsed.text_for_rerank()
873   - return payload
874   -
    def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
        """Return the cached or freshly extracted LLM query profile.

        Only valid in "complex" labeler mode. Unless force_refresh is set,
        a profile already stored for the current prompt version is returned
        without calling the LLM. Fresh profiles are persisted before return.
        """
        if self.labeler_mode != "complex":
            raise RuntimeError("query profiles are only used in complex labeler mode")
        if not force_refresh:
            cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)
            if cached is not None:
                return cached
        parser_hints = self.build_query_parser_hints(query)
        profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)
        # Keep the parser hints alongside the profile so audits can inspect them.
        profile["parser_hints"] = parser_hints
        self.store.upsert_query_profile(
            self.tenant_id,
            query,
            JUDGE_PROMPT_VERSION_COMPLEX,
            self.label_client.model,
            profile,
            raw_response,
        )
        return profile
894   -
895   - @staticmethod
896   - def _doc_evidence_text(doc: Dict[str, Any]) -> str:
897   - pieces: List[str] = [
898   - build_display_title(doc),
899   - pick_text(doc.get("vendor"), "en"),
900   - pick_text(doc.get("category_path"), "en"),
901   - pick_text(doc.get("category_name"), "en"),
902   - ]
903   - for sku in doc.get("skus") or []:
904   - pieces.extend(
905   - [
906   - str(sku.get("option1_value") or ""),
907   - str(sku.get("option2_value") or ""),
908   - str(sku.get("option3_value") or ""),
909   - ]
910   - )
911   - for tag in doc.get("tags") or []:
912   - pieces.append(str(tag))
913   - return normalize_text(" | ".join(piece for piece in pieces if piece))
914   -
    def _apply_rule_based_label_guardrails(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> str:
        """Demote or reject an LLM label that contradicts the query profile.

        Rules (in order): Exact without primary-category evidence becomes
        Partial (or Irrelevant if no allowed category matches either); any
        conflicting attribute term forces Irrelevant; Exact missing a required
        attribute term becomes Partial; Partial with no category evidence at
        all becomes Irrelevant. Unknown labels are returned unchanged.
        """
        if label not in VALID_LABELS:
            return label
        evidence = self._doc_evidence_text(doc)
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]

        # Category checks default to True when the profile specifies nothing.
        primary_category_match = True
        if category:
            primary_category_match = category in evidence
        allowed_category_match = True
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                label = RELEVANCE_PARTIAL
            else:
                return RELEVANCE_IRRELEVANT

        # NOTE(review): this attribute-name whitelist includes waist_style/rise,
        # while _collect_label_issues uses a smaller set — confirm whether the
        # two are meant to diverge.
        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            # Fit attributes get implied conflicts: oversized vs fitted families.
            if attr_name == "fit":
                if any(term in {"oversized", "oversize"} for term in required_terms):
                    conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
                if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):
                    conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])
            # No required terms means the requirement is vacuously satisfied.
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict:
                return RELEVANCE_IRRELEVANT
            if label == RELEVANCE_EXACT and not has_required:
                label = RELEVANCE_PARTIAL

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            return RELEVANCE_IRRELEVANT

        return label
965   -
966   - @staticmethod
967   - def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]:
968   - option_values = list(item.get("option_values") or [])
969   - while len(option_values) < 3:
970   - option_values.append("")
971   - product = dict(item.get("product") or {})
972   - return {
973   - "spu_id": item.get("spu_id"),
974   - "title": product.get("title") or item.get("title"),
975   - "vendor": product.get("vendor"),
976   - "category_path": product.get("category"),
977   - "category_name": product.get("category"),
978   - "image_url": item.get("image_url") or product.get("image_url"),
979   - "tags": product.get("tags") or [],
980   - "skus": [
981   - {
982   - "option1_value": option_values[0],
983   - "option2_value": option_values[1],
984   - "option3_value": option_values[2],
985   - }
986   - ],
987   - }
988   -
    def _collect_label_issues(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> List[str]:
        """Return human-readable audit issues for a label, without relabeling.

        Mirrors the guardrail checks but only reports findings; an empty list
        means the label is consistent with the query profile.
        """
        evidence = self._doc_evidence_text(doc)
        issues: List[str] = []
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [
            normalize_text(item)
            for item in query_profile.get("allowed_categories") or []
            if str(item).strip()
        ]

        primary_category_match = True if not category else category in evidence
        # When no allowed list exists, the allowed match mirrors the primary match.
        allowed_category_match = False if allowed_categories else primary_category_match
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                issues.append("Exact missing primary category evidence")
            else:
                issues.append("Exact has category mismatch")

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            issues.append("Partial has category mismatch")

        # NOTE(review): attribute whitelist here omits waist_style/rise that the
        # guardrail method checks — confirm whether that is intentional.
        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict and label != RELEVANCE_IRRELEVANT:
                issues.append(f"{label} conflicts on {attr_name}")
            if label == RELEVANCE_EXACT and not has_required:
                issues.append(f"Exact missing {attr_name}")
        return issues
1034   -
    def audit_live_query(
        self,
        query: str,
        *,
        top_k: int = 100,
        language: str = "en",
        auto_annotate: bool = False,
    ) -> Dict[str, Any]:
        """Evaluate a live query and flag labels that violate profile rules.

        In non-complex labeler mode this is a plain evaluation with an empty
        audit; in complex mode each labeled result is checked against the
        cached query profile and a suggested relabel is reported when the
        guardrails disagree with the stored label.
        """
        live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
        if self.labeler_mode != "complex":
            # No query profile available: unlabeled results count as Irrelevant.
            labels = [
                item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
                for item in live["results"]
            ]
            return {
                "query": query,
                "tenant_id": self.tenant_id,
                "top_k": top_k,
                "metrics": live["metrics"],
                "distribution": label_distribution(labels),
                "query_profile": None,
                "suspicious": [],
                "results": live["results"],
            }
        query_profile = self.get_query_profile(query, force_refresh=False)
        suspicious: List[Dict[str, Any]] = []

        for item in live["results"]:
            doc = self._result_item_to_doc(item)
            issues = self._collect_label_issues(item["label"] or "", query_profile, doc)
            suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc)
            if suggested_label != (item["label"] or ""):
                issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"]
            if issues:
                suspicious.append(
                    {
                        "rank": item["rank"],
                        "spu_id": item["spu_id"],
                        "title": item["title"],
                        "label": item["label"],
                        "suggested_label": suggested_label,
                        "issues": issues,
                    }
                )

        labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in live["results"]
        ]
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": live["metrics"],
            "distribution": label_distribution(labels),
            "query_profile": query_profile,
            "suspicious": suspicious,
            "results": live["results"],
        }
1094   -
1095   - def queries_from_file(self, path: Path) -> List[str]:
1096   - return [
1097   - line.strip()
1098   - for line in path.read_text(encoding="utf-8").splitlines()
1099   - if line.strip() and not line.strip().startswith("#")
1100   - ]
1101   -
    def corpus_docs(self, refresh: bool = False) -> List[Dict[str, Any]]:
        """Return all tenant corpus documents, from cache unless refresh is set.

        On a cache miss (or refresh) the full tenant index is scanned from
        Elasticsearch with a minimal _source projection and the result is
        persisted back into the store.
        """
        if not refresh and self.store.has_corpus(self.tenant_id):
            return self.store.get_corpus_docs(self.tenant_id)

        es_client = get_es_client().client
        index_name = get_tenant_index_name(self.tenant_id)
        docs: List[Dict[str, Any]] = []
        for hit in scan(
            client=es_client,
            index=index_name,
            query={
                "_source": [
                    "spu_id",
                    "title",
                    "vendor",
                    "category_path",
                    "category_name",
                    "image_url",
                    "skus",
                    "tags",
                ],
                "query": {"match_all": {}},
            },
            size=500,
            preserve_order=False,
            clear_scroll=True,
        ):
            source = dict(hit.get("_source") or {})
            # Fall back to the ES _id when the document lacks an spu_id field.
            source["spu_id"] = str(source.get("spu_id") or hit.get("_id") or "")
            docs.append(source)
        self.store.upsert_corpus_docs(self.tenant_id, docs)
        return docs
1134   -
    def full_corpus_rerank(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        batch_size: int = 24,
        force_refresh: bool = False,
    ) -> List[Dict[str, Any]]:
        """Score every doc against *query* and return them sorted by score desc.

        Scores are cached per (tenant, query, spu_id); only uncached docs are
        sent to the rerank service, in batches of *batch_size*. Docs that end
        up with no score sort last via -inf.
        """
        cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query)
        pending: List[Dict[str, Any]] = [doc for doc in docs if str(doc.get("spu_id")) not in cached]
        if pending:
            new_scores: Dict[str, float] = {}
            for start in range(0, len(pending), batch_size):
                batch = pending[start : start + batch_size]
                scores = self._rerank_batch_with_retry(query=query, docs=batch)
                # A length mismatch would silently misalign doc/score pairs.
                if len(scores) != len(batch):
                    raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs")
                for doc, score in zip(batch, scores):
                    new_scores[str(doc.get("spu_id"))] = float(score)
            self.store.upsert_rerank_scores(
                self.tenant_id,
                query,
                new_scores,
                model_name="qwen3_vllm_score",
            )
            cached.update(new_scores)

        ranked = []
        for doc in docs:
            spu_id = str(doc.get("spu_id"))
            ranked.append({"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc})
        ranked.sort(key=lambda item: item["score"], reverse=True)
        return ranked
1167   -
    def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]:
        """Score *docs* against *query*, bisecting the batch on failure.

        Any rerank error splits the batch and retries the halves; batches of
        six or fewer retry doc-by-doc, and a single failing doc receives the
        sentinel score -1.0 so one bad document cannot sink the whole batch.
        """
        if not docs:
            return []
        doc_texts = [build_rerank_doc(doc) for doc in docs]
        try:
            scores, _meta = self.rerank_client.rerank(query=query, docs=doc_texts, normalize=False)
            return scores
        except Exception:
            if len(docs) == 1:
                # Irrecoverable single doc: sentinel score keeps alignment.
                return [-1.0]
            if len(docs) <= 6:
                scores: List[float] = []
                for doc in docs:
                    scores.extend(self._rerank_batch_with_retry(query, [doc]))
                return scores
            mid = len(docs) // 2
            left = self._rerank_batch_with_retry(query, docs[:mid])
            right = self._rerank_batch_with_retry(query, docs[mid:])
            return left + right
1187   -
    def annotate_missing_labels(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        force_refresh: bool = False,
    ) -> Dict[str, str]:
        """Label any unlabeled docs for *query* and return the full label map.

        Cached labels are reused unless force_refresh is set. New labels are
        produced in judge-sized batches and persisted incrementally, so a
        partial failure keeps the labels completed so far.
        """
        labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
        missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
        if not missing_docs:
            return labels

        for start in range(0, len(missing_docs), self.label_client.batch_size):
            batch = missing_docs[start : start + self.label_client.batch_size]
            batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh)
            for sub_labels, raw_response, sub_batch in batch_pairs:
                to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
                self.store.upsert_labels(
                    self.tenant_id,
                    query,
                    to_store,
                    judge_model=self.label_client.model,
                    raw_response=raw_response,
                )
                labels.update(to_store)
            # Small pause between judge batches to avoid hammering the API.
            time.sleep(0.1)
        return labels
1214   -
    def _classify_with_retry(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        *,
        force_refresh: bool = False,
    ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
        """Classify *docs*, bisecting the batch on judge failure.

        Returns (labels, raw_response, docs) triples, one per successful
        sub-batch. In complex mode the LLM labels are post-processed through
        the rule-based guardrails. A single failing doc re-raises.
        """
        if not docs:
            return []
        try:
            if self.labeler_mode == "complex":
                query_profile = self.get_query_profile(query, force_refresh=force_refresh)
                labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)
                labels = [
                    self._apply_rule_based_label_guardrails(label, query_profile, doc)
                    for doc, label in zip(docs, labels)
                ]
            else:
                labels, raw_response = self.label_client.classify_batch_simple(query, docs)
            return [(labels, raw_response, docs)]
        except Exception:
            if len(docs) == 1:
                raise
            mid = len(docs) // 2
            return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh)
1240   -
    def build_query_annotation_set(
        self,
        query: str,
        *,
        search_depth: int = 1000,
        rerank_depth: int = 10000,
        annotate_search_top_k: int = 120,
        annotate_rerank_top_k: int = 200,
        language: str = "en",
        force_refresh_rerank: bool = False,
        force_refresh_labels: bool = False,
    ) -> QueryBuildResult:
        """Build a pooled annotation set for one query and persist it as JSON.

        The annotation pool is the union of the top search results and the
        top full-corpus rerank results. Missing labels are annotated, metrics
        are computed over the search top-100, and the build artifact is
        written under artifact_root/query_builds and registered in the store.
        """
        search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language)
        search_results = list(search_payload.get("results") or [])
        corpus = self.corpus_docs(refresh=False)
        full_rerank = self.full_corpus_rerank(
            query=query,
            docs=corpus,
            force_refresh=force_refresh_rerank,
        )
        rerank_depth_effective = min(rerank_depth, len(full_rerank))

        # Pool keyed by spu_id: search top-k union rerank top-k (dedup by key).
        pool_docs: Dict[str, Dict[str, Any]] = {}
        for doc in search_results[:annotate_search_top_k]:
            pool_docs[str(doc.get("spu_id"))] = doc
        for item in full_rerank[:annotate_rerank_top_k]:
            pool_docs[str(item["spu_id"])] = item["doc"]

        labels = self.annotate_missing_labels(
            query=query,
            docs=list(pool_docs.values()),
            force_refresh=force_refresh_labels,
        )

        # Full search ranking, each result decorated with its cached label.
        search_labeled_results: List[Dict[str, Any]] = []
        for rank, doc in enumerate(search_results, start=1):
            spu_id = str(doc.get("spu_id"))
            label = labels.get(spu_id)
            search_labeled_results.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "rerank_score": None,
                    "label": label,
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )

        # Top of the full-corpus rerank, with scores rounded for the artifact.
        rerank_top_results: List[Dict[str, Any]] = []
        for rank, item in enumerate(full_rerank[:rerank_depth_effective], start=1):
            doc = item["doc"]
            spu_id = str(item["spu_id"])
            rerank_top_results.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "rerank_score": round(float(item["score"]), 8),
                    "label": labels.get(spu_id),
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )

        # Metrics over the search top-100; unlabeled hits count as Irrelevant.
        top100_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in search_labeled_results[:100]
        ]
        metrics = compute_query_metrics(top100_labels)
        output_dir = ensure_dir(self.artifact_root / "query_builds")
        run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
        output_json_path = output_dir / f"{run_id}.json"
        payload = {
            "run_id": run_id,
            "created_at": utc_now_iso(),
            "tenant_id": self.tenant_id,
            "query": query,
            # Snapshot of the backend config identity at build time (network call).
            "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),
            "search_total": int(search_payload.get("total") or 0),
            "search_depth_requested": search_depth,
            "search_depth_effective": len(search_results),
            "rerank_depth_requested": rerank_depth,
            "rerank_depth_effective": rerank_depth_effective,
            "corpus_size": len(corpus),
            "annotation_pool": {
                "annotate_search_top_k": annotate_search_top_k,
                "annotate_rerank_top_k": annotate_rerank_top_k,
                "pool_size": len(pool_docs),
            },
            "labeler_mode": self.labeler_mode,
            "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,
            "metrics_top100": metrics,
            "search_results": search_labeled_results,
            "full_rerank_top": rerank_top_results,
        }
        output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
        return QueryBuildResult(
            query=query,
            tenant_id=self.tenant_id,
            search_total=int(search_payload.get("total") or 0),
            search_depth=len(search_results),
            rerank_corpus_size=len(corpus),
            annotated_count=len(pool_docs),
            output_json_path=output_json_path,
        )
1351   -
    def evaluate_live_query(
        self,
        query: str,
        top_k: int = 100,
        auto_annotate: bool = False,
        language: str = "en",
        force_refresh_labels: bool = False,
    ) -> Dict[str, Any]:
        """Run a live search and score the top-k results with cached labels.

        When auto_annotate is set, recalled results missing a label are
        annotated first; otherwise unlabeled hits are counted as Irrelevant.
        Also reports labeled Exact/Partial products that the recall missed,
        plus human-readable tips describing how the numbers were produced.
        """
        # Fetch at least 100 so the missing-relevant comparison has depth.
        search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)
        results = list(search_payload.get("results") or [])
        if auto_annotate:
            self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
        labels = self.store.get_labels(self.tenant_id, query)
        recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]}
        labeled = []
        unlabeled_hits = 0
        for rank, doc in enumerate(results[:top_k], start=1):
            spu_id = str(doc.get("spu_id"))
            label = labels.get(spu_id)
            if label not in VALID_LABELS:
                unlabeled_hits += 1
            labeled.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "label": label,
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )
        # Unlabeled hits are scored as Irrelevant for metric purposes.
        metric_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in labeled
        ]
        label_stats = self.store.get_query_label_stats(self.tenant_id, query)
        rerank_scores = self.store.get_rerank_scores(self.tenant_id, query)
        # Labeled relevant products the recall set failed to return.
        relevant_missing_ids = [
            spu_id
            for spu_id, label in labels.items()
            if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids
        ]
        missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
        missing_relevant = []
        for spu_id in relevant_missing_ids:
            doc = missing_docs_map.get(spu_id)
            if not doc:
                continue
            missing_relevant.append(
                {
                    "spu_id": spu_id,
                    "label": labels[spu_id],
                    "rerank_score": rerank_scores.get(spu_id),
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )
        # Sort misses: Exact before Partial, then by score desc, then title.
        label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}
        missing_relevant.sort(
            key=lambda item: (
                label_order.get(str(item.get("label")), 9),
                -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")),
                str(item.get("title") or ""),
            )
        )
        tips: List[str] = []
        if auto_annotate:
            tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.")
        else:
            tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.")
        if label_stats["total"] == 0:
            tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.")
        if unlabeled_hits:
            tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
        if not missing_relevant:
            tips.append("No cached Exact/Partial products were missed by this recall set.")
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": compute_query_metrics(metric_labels),
            "results": labeled,
            "missing_relevant": missing_relevant,
            "label_stats": {
                **label_stats,
                "unlabeled_hits_treated_irrelevant": unlabeled_hits,
                "recalled_hits": len(labeled),
                "missing_relevant_count": len(missing_relevant),
                "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
                "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),
            },
            "tips": tips,
            "total": int(search_payload.get("total") or 0),
        }
1449   -
    def batch_evaluate(
        self,
        queries: Sequence[str],
        *,
        top_k: int = 100,
        auto_annotate: bool = True,
        language: str = "en",
        force_refresh_labels: bool = False,
    ) -> Dict[str, Any]:
        """Evaluate many queries, aggregate metrics, and persist a batch report.

        Writes three artifacts under artifact_root/batch_reports — the JSON
        payload, a Markdown report, and a backend config snapshot — and
        records the run in the store. Returns the JSON payload.
        """
        per_query = []
        for query in queries:
            live = self.evaluate_live_query(
                query,
                top_k=top_k,
                auto_annotate=auto_annotate,
                language=language,
                force_refresh_labels=force_refresh_labels,
            )
            labels = [
                item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
                for item in live["results"]
            ]
            per_query.append(
                {
                    "query": live["query"],
                    "tenant_id": live["tenant_id"],
                    "top_k": live["top_k"],
                    "metrics": live["metrics"],
                    "distribution": label_distribution(labels),
                    "total": live["total"],
                }
            )
        aggregate = aggregate_metrics([item["metrics"] for item in per_query])
        aggregate_distribution = {
            RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
            RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query),
            RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
        }
        # Batch id hashes tenant + query list, so identical runs are traceable.
        batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
        report_dir = ensure_dir(self.artifact_root / "batch_reports")
        config_snapshot_path = report_dir / f"{batch_id}_config.json"
        # Snapshot the live backend config so the run is reproducible (network call).
        config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json()
        config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
        output_json_path = report_dir / f"{batch_id}.json"
        report_md_path = report_dir / f"{batch_id}.md"
        payload = {
            "batch_id": batch_id,
            "created_at": utc_now_iso(),
            "tenant_id": self.tenant_id,
            "queries": list(queries),
            "top_k": top_k,
            "aggregate_metrics": aggregate,
            "aggregate_distribution": aggregate_distribution,
            "per_query": per_query,
            "config_snapshot_path": str(config_snapshot_path),
        }
        output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
        self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
        return payload
1510   -
1511   -
def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
    """Render a batch evaluation payload as a Markdown report string."""
    out: List[str] = [
        "# Search Batch Evaluation",
        "",
        f"- Batch ID: {payload['batch_id']}",
        f"- Created at: {payload['created_at']}",
        f"- Tenant ID: {payload['tenant_id']}",
        f"- Query count: {len(payload['queries'])}",
        f"- Top K: {payload['top_k']}",
        "",
        "## Aggregate Metrics",
        "",
    ]
    aggregate = payload.get("aggregate_metrics") or {}
    out.extend(f"- {name}: {value}" for name, value in sorted(aggregate.items()))
    overall = payload.get("aggregate_distribution") or {}
    if overall:
        out.append("")
        out.append("## Label Distribution")
        out.append("")
        out.append(f"- Exact: {overall.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {overall.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {overall.get(RELEVANCE_IRRELEVANT, 0)}")
    out.extend(["", "## Per Query", ""])
    for entry in payload.get("per_query") or []:
        out.append(f"### {entry['query']}")
        out.append("")
        for name, value in sorted((entry.get("metrics") or {}).items()):
            out.append(f"- {name}: {value}")
        per_dist = entry.get("distribution") or {}
        out.append(f"- Exact: {per_dist.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {per_dist.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {per_dist.get(RELEVANCE_IRRELEVANT, 0)}")
        out.append("")
    return "\n".join(out)
1551   -
1552   -
class SearchEvalRequest(BaseModel):
    """Request body for single-query live evaluation."""

    query: str
    # How many top results to score; bounded to keep annotation cost sane.
    top_k: int = Field(default=100, ge=1, le=500)
    # When true, missing labels for recalled results are annotated on the fly.
    auto_annotate: bool = False
    language: str = "en"
1558   -
1559   -
class BatchEvalRequest(BaseModel):
    """Request body for batch evaluation over many queries."""

    # When omitted/empty, the server falls back to the configured query file.
    queries: Optional[List[str]] = None
    top_k: int = Field(default=100, ge=1, le=500)
    auto_annotate: bool = False
    language: str = "en"
    # When true, cached labels are discarded and re-annotated.
    force_refresh_labels: bool = False
1566   -
1567   -
def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFAULT_QUERY_FILE) -> FastAPI:
    """Build the FastAPI app exposing the evaluation UI and its JSON API.

    Routes: the HTML UI at "/", query listing, single-query evaluation,
    batch evaluation, and batch-run history with Markdown report retrieval.
    """
    app = FastAPI(title="Search Evaluation UI", version="1.0.0")

    @app.get("/", response_class=HTMLResponse)
    def home() -> str:
        """Serve the single-page evaluation UI."""
        return WEB_APP_HTML

    @app.get("/api/queries")
    def api_queries() -> Dict[str, Any]:
        """List the default query set from the configured query file."""
        return {"queries": framework.queries_from_file(query_file)}

    @app.post("/api/search-eval")
    def api_search_eval(request: SearchEvalRequest) -> Dict[str, Any]:
        """Evaluate one live query against cached annotations."""
        return framework.evaluate_live_query(
            query=request.query,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
        )

    @app.post("/api/batch-eval")
    def api_batch_eval(request: BatchEvalRequest) -> Dict[str, Any]:
        """Run a batch evaluation over the provided or default query list."""
        queries = request.queries or framework.queries_from_file(query_file)
        if not queries:
            raise HTTPException(status_code=400, detail="No queries provided")
        return framework.batch_evaluate(
            queries=queries,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
            force_refresh_labels=request.force_refresh_labels,
        )

    @app.get("/api/history")
    def api_history() -> Dict[str, Any]:
        """Return the 20 most recent batch runs."""
        return {"history": framework.store.list_batch_runs(limit=20)}

    @app.get("/api/history/{batch_id}/report")
    def api_history_report(batch_id: str) -> Dict[str, Any]:
        """Return the Markdown report for a stored batch run."""
        row = framework.store.get_batch_run(batch_id)
        if row is None:
            raise HTTPException(status_code=404, detail="Unknown batch_id")
        report_path = Path(row["report_markdown_path"]).resolve()
        root = framework.artifact_root.resolve()
        # Path-traversal guard: stored paths must stay inside the artifact root.
        try:
            report_path.relative_to(root)
        except ValueError as exc:
            # Chain the cause so the original traversal error is not lost (B904).
            raise HTTPException(status_code=403, detail="Report path is outside artifact root") from exc
        if not report_path.is_file():
            raise HTTPException(status_code=404, detail="Report file not found")
        return {
            "batch_id": row["batch_id"],
            "created_at": row["created_at"],
            "tenant_id": row["tenant_id"],
            "report_markdown_path": str(report_path),
            "markdown": report_path.read_text(encoding="utf-8"),
        }

    return app
1627   -
1628   -
1629   -WEB_APP_HTML = """
1630   -<!doctype html>
1631   -<html lang="en">
1632   -<head>
1633   - <meta charset="utf-8" />
1634   - <meta name="viewport" content="width=device-width, initial-scale=1" />
1635   - <title>Search Evaluation</title>
1636   - <style>
1637   - :root {
1638   - --bg: #f5f3ed;
1639   - --panel: #fffdf8;
1640   - --ink: #1f2a24;
1641   - --muted: #6b756e;
1642   - --line: #ddd4c6;
1643   - --accent: #0f766e;
1644   - --exact: #0f766e;
1645   - --partial: #b7791f;
1646   - --irrelevant: #b42318;
1647   - }
1648   - body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background:
1649   - radial-gradient(circle at top left, #f0e6d6 0, transparent 28%),
1650   - linear-gradient(180deg, #f9f6f0 0%, #f0ece3 100%); }
1651   - .app { display: grid; grid-template-columns: 280px 1fr; min-height: 100vh; }
1652   - .sidebar { border-right: 1px solid var(--line); padding: 20px; background: rgba(255,255,255,0.55); backdrop-filter: blur(10px); }
1653   - .main { padding: 24px; }
1654   - h1, h2 { margin: 0 0 12px; }
1655   - .muted { color: var(--muted); }
1656   - .query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; }
1657   - .query-item {
1658   - display: block; width: 100%; border: 0; background: transparent; text-align: left;
1659   - padding: 10px 12px; border-radius: 10px; cursor: pointer;
1660   - color: var(--ink); font-size: 15px; font-weight: 500;
1661   - }
1662   - .query-item:hover { background: #eef6f4; }
1663   - .toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; }
1664   - input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; }
1665   - button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; }
1666   - button.secondary { background: #d9e6e3; color: #12433d; }
1667   - .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; }
1668   - .metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; }
1669   - .metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; }
1670   - .metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; }
1671   - .results { display: grid; gap: 10px; }
1672   - .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }
1673   - .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }
1674   - .Exact { background: var(--exact); }
1675   - .Partial { background: var(--partial); }
1676   - .Irrelevant { background: var(--irrelevant); }
1677   - .Unknown { background: #637381; }
1678   - .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }
1679   - .title { font-size: 16px; font-weight: 700; margin-bottom: 8px; }
1680   - .options { color: var(--muted); line-height: 1.5; font-size: 14px; }
1681   - .section { margin-bottom: 28px; }
1682   - .history { font-size: 13px; line-height: 1.5; }
1683   - .history-list { max-height: 42vh; overflow: auto; display: flex; flex-direction: column; gap: 8px; margin-top: 8px; }
1684   - .history-item {
1685   - display: block; width: 100%; border: 1px solid var(--line); background: var(--panel);
1686   - text-align: left; padding: 10px 12px; border-radius: 12px; cursor: pointer;
1687   - color: var(--ink); font-size: 13px; transition: background 0.15s, border-color 0.15s, box-shadow 0.15s;
1688   - }
1689   - .history-item:hover { background: #eef6f4; border-color: #b8d4cd; }
1690   - .history-item:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; }
1691   - .history-item .hid { font-weight: 700; font-size: 12px; word-break: break-all; color: #12433d; }
1692   - .history-item .hmeta { color: var(--muted); font-size: 11px; margin-top: 4px; }
1693   - .history-item .hstats { margin-top: 6px; font-size: 12px; color: var(--ink); line-height: 1.45; }
1694   - .history-item .hstats span { color: var(--muted); }
1695   - .report-modal-root {
1696   - position: fixed; inset: 0; z-index: 200; display: none; align-items: center; justify-content: center;
1697   - padding: 16px; box-sizing: border-box;
1698   - }
1699   - .report-modal-root.is-open { display: flex; }
1700   - .report-modal-backdrop { position: absolute; inset: 0; background: rgba(31, 42, 36, 0.45); backdrop-filter: blur(4px); }
1701   - .report-modal-dialog {
1702   - position: relative; z-index: 1; width: min(920px, 100%); max-height: min(92vh, 900px); display: flex; flex-direction: column;
1703   - background: var(--panel); border: 1px solid var(--line); border-radius: 18px;
1704   - box-shadow: 0 24px 48px rgba(31, 42, 36, 0.18);
1705   - }
1706   - .report-modal-head {
1707   - flex: 0 0 auto; display: flex; align-items: flex-start; justify-content: space-between; gap: 12px;
1708   - padding: 16px 18px; border-bottom: 1px solid var(--line);
1709   - }
1710   - .report-modal-head h3 { margin: 0; font-size: 15px; font-weight: 700; word-break: break-all; }
1711   - .report-modal-head .head-actions { display: flex; gap: 8px; flex-shrink: 0; }
1712   - .report-modal-head button { padding: 8px 12px; font-size: 13px; border-radius: 10px; }
1713   - .report-modal-meta { flex: 0 0 auto; padding: 10px 18px; font-size: 12px; border-bottom: 1px solid var(--line); background: rgba(255,253,248,0.9); }
1714   - .report-modal-body {
1715   - flex: 1 1 auto; overflow: auto; padding: 18px 22px 22px;
1716   - font-size: 14px; line-height: 1.55;
1717   - }
1718   - .batch-report-md h1 { font-size: 1.35rem; margin: 0 0 0.75rem; color: #12433d; }
1719   - .batch-report-md h2 { font-size: 1.05rem; margin: 1.35rem 0 0.6rem; padding-bottom: 0.35rem; border-bottom: 1px solid var(--line); color: #1a5249; }
1720   - .batch-report-md h2:first-of-type { margin-top: 0; }
1721   - .batch-report-md h3 { font-size: 0.95rem; margin: 1rem 0 0.4rem; color: var(--ink); font-weight: 700; }
1722   - .batch-report-md ul { margin: 0.35rem 0 0.5rem; padding-left: 1.25rem; }
1723   - .batch-report-md li { margin: 0.2rem 0; }
1724   - .batch-report-md code { font-size: 0.88em; background: #e8e4d8; padding: 0.12em 0.35em; border-radius: 4px; }
1725   - .report-modal-body.report-modal-loading, .report-modal-body.report-modal-error { color: var(--muted); font-style: italic; }
1726   - .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; }
1727   - .tip { margin-bottom: 6px; color: var(--muted); }
1728   - </style>
1729   -</head>
1730   -<body>
1731   - <div class="app">
1732   - <aside class="sidebar">
1733   - <h2>Queries</h2>
1734   - <p class="muted">Loaded from <code>scripts/evaluation/queries/queries.txt</code></p>
1735   - <div id="queryList" class="query-list"></div>
1736   - <div class="section">
1737   - <h2>History</h2>
1738   - <p class="muted" style="font-size:12px;margin:0 0 4px">Click a run to open the batch markdown report.</p>
1739   - <div id="history" class="history muted">Loading...</div>
1740   - </div>
1741   - </aside>
1742   - <main class="main">
1743   - <h1>Search Evaluation</h1>
1744   - <p class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>
1745   - <div class="toolbar">
1746   - <input id="queryInput" type="text" placeholder="Search query" />
1747   - <button onclick="runSingle()">Evaluate Query</button>
1748   - <button class="secondary" onclick="runBatch()">Batch Evaluation</button>
1749   - </div>
1750   - <div id="status" class="muted section"></div>
1751   - <section class="section">
1752   - <h2>Metrics</h2>
1753   - <div id="metrics" class="grid"></div>
1754   - </section>
1755   - <section class="section">
1756   - <h2>Top Results</h2>
1757   - <div id="results" class="results"></div>
1758   - </section>
1759   - <section class="section">
1760   - <h2>Missed Exact / Partial</h2>
1761   - <div id="missingRelevant" class="results"></div>
1762   - </section>
1763   - <section class="section">
1764   - <h2>Notes</h2>
1765   - <div id="tips" class="tips muted"></div>
1766   - </section>
1767   - </main>
1768   - </div>
1769   - <div id="reportModal" class="report-modal-root" aria-hidden="true">
1770   - <div class="report-modal-backdrop" data-close-report="1"></div>
1771   - <div class="report-modal-dialog" role="dialog" aria-modal="true" aria-labelledby="reportModalTitle">
1772   - <div class="report-modal-head">
1773   - <h3 id="reportModalTitle">Batch report</h3>
1774   - <div class="head-actions">
1775   - <button type="button" class="secondary" id="reportCopyPath">Copy path</button>
1776   - <button type="button" onclick="closeReportModal()">Close</button>
1777   - </div>
1778   - </div>
1779   - <div id="reportModalMeta" class="report-modal-meta muted"></div>
1780   - <div id="reportModalBody" class="report-modal-body batch-report-md"></div>
1781   - </div>
1782   - </div>
1783   - <script src="https://cdn.jsdelivr.net/npm/marked@12.0.2/marked.min.js"></script>
1784   - <script src="https://cdn.jsdelivr.net/npm/dompurify@3.1.6/dist/purify.min.js"></script>
1785   - <script>
1786   - async function fetchJSON(url, options) {
1787   - const res = await fetch(url, options);
1788   - if (!res.ok) throw new Error(await res.text());
1789   - return await res.json();
1790   - }
1791   - function renderMetrics(metrics) {
1792   - const root = document.getElementById('metrics');
1793   - root.innerHTML = '';
1794   - Object.entries(metrics || {}).forEach(([key, value]) => {
1795   - const card = document.createElement('div');
1796   - card.className = 'metric';
1797   - card.innerHTML = `<div class="label">${key}</div><div class="value">${value}</div>`;
1798   - root.appendChild(card);
1799   - });
1800   - }
1801   - function renderResults(results, rootId='results', showRank=true) {
1802   - const mount = document.getElementById(rootId);
1803   - mount.innerHTML = '';
1804   - (results || []).forEach(item => {
1805   - const label = item.label || 'Unknown';
1806   - const box = document.createElement('div');
1807   - box.className = 'result';
1808   - box.innerHTML = `
1809   - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
1810   - <img class="thumb" src="${item.image_url || ''}" alt="" />
1811   - <div>
1812   - <div class="title">${item.title || ''}</div>
1813   - <div class="options">
1814   - <div>${(item.option_values || [])[0] || ''}</div>
1815   - <div>${(item.option_values || [])[1] || ''}</div>
1816   - <div>${(item.option_values || [])[2] || ''}</div>
1817   - </div>
1818   - </div>`;
1819   - mount.appendChild(box);
1820   - });
1821   - if (!(results || []).length) {
1822   - mount.innerHTML = '<div class="muted">None.</div>';
1823   - }
1824   - }
1825   - function renderTips(data) {
1826   - const root = document.getElementById('tips');
1827   - const tips = [...(data.tips || [])];
1828   - const stats = data.label_stats || {};
1829   - tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`);
1830   - root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');
1831   - }
1832   - async function loadQueries() {
1833   - const data = await fetchJSON('/api/queries');
1834   - const root = document.getElementById('queryList');
1835   - root.innerHTML = '';
1836   - data.queries.forEach(query => {
1837   - const btn = document.createElement('button');
1838   - btn.className = 'query-item';
1839   - btn.textContent = query;
1840   - btn.onclick = () => {
1841   - document.getElementById('queryInput').value = query;
1842   - runSingle();
1843   - };
1844   - root.appendChild(btn);
1845   - });
1846   - }
1847   - function fmtMetric(m, key, digits) {
1848   - const v = m && m[key];
1849   - if (v == null || Number.isNaN(Number(v))) return null;
1850   - const n = Number(v);
1851   - return n.toFixed(digits);
1852   - }
1853   - function historySummaryHtml(meta) {
1854   - const m = meta && meta.aggregate_metrics;
1855   - const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
1856   - const parts = [];
1857   - if (nq != null) parts.push(`<span>Queries</span> ${nq}`);
1858   - const p10 = fmtMetric(m, 'P@10', 3);
1859   - const p52 = fmtMetric(m, 'P@5_2_3', 3);
1860   - const map3 = fmtMetric(m, 'MAP_3', 3);
1861   - if (p10) parts.push(`<span>P@10</span> ${p10}`);
1862   - if (p52) parts.push(`<span>P@5_2_3</span> ${p52}`);
1863   - if (map3) parts.push(`<span>MAP_3</span> ${map3}`);
1864   - if (!parts.length) return '';
1865   - return `<div class="hstats">${parts.join(' · ')}</div>`;
1866   - }
1867   - async function loadHistory() {
1868   - const data = await fetchJSON('/api/history');
1869   - const root = document.getElementById('history');
1870   - root.classList.remove('muted');
1871   - const items = data.history || [];
1872   - if (!items.length) {
1873   - root.innerHTML = '<span class="muted">No history yet.</span>';
1874   - return;
1875   - }
1876   - root.innerHTML = `<div class="history-list"></div>`;
1877   - const list = root.querySelector('.history-list');
1878   - items.forEach(item => {
1879   - const btn = document.createElement('button');
1880   - btn.type = 'button';
1881   - btn.className = 'history-item';
1882   - btn.setAttribute('aria-label', `Open report ${item.batch_id}`);
1883   - const sum = historySummaryHtml(item.metadata);
1884   - btn.innerHTML = `<div class="hid">${item.batch_id}</div>
1885   - <div class="hmeta">${item.created_at} · tenant ${item.tenant_id}</div>${sum}`;
1886   - btn.onclick = () => openBatchReport(item.batch_id);
1887   - list.appendChild(btn);
1888   - });
1889   - }
1890   - let _lastReportPath = '';
1891   - function closeReportModal() {
1892   - const el = document.getElementById('reportModal');
1893   - el.classList.remove('is-open');
1894   - el.setAttribute('aria-hidden', 'true');
1895   - document.getElementById('reportModalBody').innerHTML = '';
1896   - document.getElementById('reportModalMeta').textContent = '';
1897   - }
1898   - async function openBatchReport(batchId) {
1899   - const el = document.getElementById('reportModal');
1900   - const body = document.getElementById('reportModalBody');
1901   - const metaEl = document.getElementById('reportModalMeta');
1902   - const titleEl = document.getElementById('reportModalTitle');
1903   - el.classList.add('is-open');
1904   - el.setAttribute('aria-hidden', 'false');
1905   - titleEl.textContent = batchId;
1906   - metaEl.textContent = '';
1907   - body.className = 'report-modal-body batch-report-md report-modal-loading';
1908   - body.textContent = 'Loading report…';
1909   - try {
1910   - const rep = await fetchJSON('/api/history/' + encodeURIComponent(batchId) + '/report');
1911   - _lastReportPath = rep.report_markdown_path || '';
1912   - metaEl.textContent = rep.report_markdown_path || '';
1913   - const raw = marked.parse(rep.markdown || '', { gfm: true });
1914   - const safe = DOMPurify.sanitize(raw, { USE_PROFILES: { html: true } });
1915   - body.className = 'report-modal-body batch-report-md';
1916   - body.innerHTML = safe;
1917   - } catch (e) {
1918   - body.className = 'report-modal-body report-modal-error';
1919   - body.textContent = (e && e.message) ? e.message : String(e);
1920   - }
1921   - }
1922   - document.getElementById('reportModal').addEventListener('click', (ev) => {
1923   - if (ev.target && ev.target.getAttribute('data-close-report') === '1') closeReportModal();
1924   - });
1925   - document.addEventListener('keydown', (ev) => {
1926   - if (ev.key === 'Escape') closeReportModal();
1927   - });
1928   - document.getElementById('reportCopyPath').addEventListener('click', async () => {
1929   - if (!_lastReportPath) return;
1930   - try {
1931   - await navigator.clipboard.writeText(_lastReportPath);
1932   - } catch (_) {}
1933   - });
1934   - async function runSingle() {
1935   - const query = document.getElementById('queryInput').value.trim();
1936   - if (!query) return;
1937   - document.getElementById('status').textContent = `Evaluating "${query}"...`;
1938   - const data = await fetchJSON('/api/search-eval', {
1939   - method: 'POST',
1940   - headers: {'Content-Type': 'application/json'},
1941   - body: JSON.stringify({query, top_k: 100, auto_annotate: false})
1942   - });
1943   - document.getElementById('status').textContent = `Done. total=${data.total}`;
1944   - renderMetrics(data.metrics);
1945   - renderResults(data.results, 'results', true);
1946   - renderResults(data.missing_relevant, 'missingRelevant', false);
1947   - renderTips(data);
1948   - loadHistory();
1949   - }
1950   - async function runBatch() {
1951   - document.getElementById('status').textContent = 'Running batch evaluation...';
1952   - const data = await fetchJSON('/api/batch-eval', {
1953   - method: 'POST',
1954   - headers: {'Content-Type': 'application/json'},
1955   - body: JSON.stringify({top_k: 100, auto_annotate: false})
1956   - });
1957   - document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`;
1958   - renderMetrics(data.aggregate_metrics);
1959   - renderResults([], 'results', true);
1960   - renderResults([], 'missingRelevant', false);
1961   - document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>';
1962   - loadHistory();
1963   - }
1964   - loadQueries();
1965   - loadHistory();
1966   - </script>
1967   -</body>
1968   -</html>
1969   -"""
1970   -
1971   -
1972   -def build_cli_parser() -> argparse.ArgumentParser:
1973   - parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
1974   - sub = parser.add_subparsers(dest="command", required=True)
1975   -
1976   - build = sub.add_parser("build", help="Build pooled annotation set for queries")
1977   - build.add_argument("--tenant-id", default="163")
1978   - build.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
1979   - build.add_argument("--search-depth", type=int, default=1000)
1980   - build.add_argument("--rerank-depth", type=int, default=10000)
1981   - build.add_argument("--annotate-search-top-k", type=int, default=120)
1982   - build.add_argument("--annotate-rerank-top-k", type=int, default=200)
1983   - build.add_argument("--language", default="en")
1984   - build.add_argument("--force-refresh-rerank", action="store_true")
1985   - build.add_argument("--force-refresh-labels", action="store_true")
1986   - build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1987   -
1988   - batch = sub.add_parser("batch", help="Run batch evaluation against live search")
1989   - batch.add_argument("--tenant-id", default="163")
1990   - batch.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
1991   - batch.add_argument("--top-k", type=int, default=100)
1992   - batch.add_argument("--language", default="en")
1993   - batch.add_argument("--force-refresh-labels", action="store_true")
1994   - batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1995   -
1996   - audit = sub.add_parser("audit", help="Audit annotation quality for queries")
1997   - audit.add_argument("--tenant-id", default="163")
1998   - audit.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
1999   - audit.add_argument("--top-k", type=int, default=100)
2000   - audit.add_argument("--language", default="en")
2001   - audit.add_argument("--limit-suspicious", type=int, default=5)
2002   - audit.add_argument("--force-refresh-labels", action="store_true")
2003   - audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
2004   -
2005   - serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
2006   - serve.add_argument("--tenant-id", default="163")
2007   - serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
2008   - serve.add_argument("--host", default="0.0.0.0")
2009   - serve.add_argument("--port", type=int, default=6010)
2010   - serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
2011   -
2012   - return parser
2013   -
2014   -
2015   -def run_build(args: argparse.Namespace) -> None:
2016   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
2017   - queries = framework.queries_from_file(Path(args.queries_file))
2018   - summary = []
2019   - for query in queries:
2020   - result = framework.build_query_annotation_set(
2021   - query=query,
2022   - search_depth=args.search_depth,
2023   - rerank_depth=args.rerank_depth,
2024   - annotate_search_top_k=args.annotate_search_top_k,
2025   - annotate_rerank_top_k=args.annotate_rerank_top_k,
2026   - language=args.language,
2027   - force_refresh_rerank=args.force_refresh_rerank,
2028   - force_refresh_labels=args.force_refresh_labels,
2029   - )
2030   - summary.append(
2031   - {
2032   - "query": result.query,
2033   - "search_total": result.search_total,
2034   - "search_depth": result.search_depth,
2035   - "rerank_corpus_size": result.rerank_corpus_size,
2036   - "annotated_count": result.annotated_count,
2037   - "output_json_path": str(result.output_json_path),
2038   - }
2039   - )
2040   - print(
2041   - f"[build] query={result.query!r} search_total={result.search_total} "
2042   - f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
2043   - f"annotated={result.annotated_count} output={result.output_json_path}"
2044   - )
2045   - out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
2046   - out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
2047   - print(f"[done] summary={out_path}")
2048   -
2049   -
2050   -def run_batch(args: argparse.Namespace) -> None:
2051   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
2052   - queries = framework.queries_from_file(Path(args.queries_file))
2053   - payload = framework.batch_evaluate(
2054   - queries=queries,
2055   - top_k=args.top_k,
2056   - auto_annotate=True,
2057   - language=args.language,
2058   - force_refresh_labels=args.force_refresh_labels,
2059   - )
2060   - print(f"[done] batch_id={payload['batch_id']} aggregate_metrics={payload['aggregate_metrics']}")
2061   -
2062   -
2063   -def run_audit(args: argparse.Namespace) -> None:
2064   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
2065   - queries = framework.queries_from_file(Path(args.queries_file))
2066   - audit_items = []
2067   - for query in queries:
2068   - item = framework.audit_live_query(
2069   - query=query,
2070   - top_k=args.top_k,
2071   - language=args.language,
2072   - auto_annotate=not args.force_refresh_labels,
2073   - )
2074   - if args.force_refresh_labels:
2075   - live_payload = framework.search_client.search(query=query, size=max(args.top_k, 100), from_=0, language=args.language)
2076   - framework.annotate_missing_labels(
2077   - query=query,
2078   - docs=list(live_payload.get("results") or [])[: args.top_k],
2079   - force_refresh=True,
2080   - )
2081   - item = framework.audit_live_query(
2082   - query=query,
2083   - top_k=args.top_k,
2084   - language=args.language,
2085   - auto_annotate=False,
2086   - )
2087   - audit_items.append(
2088   - {
2089   - "query": query,
2090   - "metrics": item["metrics"],
2091   - "distribution": item["distribution"],
2092   - "suspicious_count": len(item["suspicious"]),
2093   - "suspicious_examples": item["suspicious"][: args.limit_suspicious],
2094   - }
2095   - )
2096   - print(
2097   - f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}"
2098   - )
2099   -
2100   - summary = {
2101   - "created_at": utc_now_iso(),
2102   - "tenant_id": args.tenant_id,
2103   - "top_k": args.top_k,
2104   - "query_count": len(queries),
2105   - "total_suspicious": sum(item["suspicious_count"] for item in audit_items),
2106   - "queries": audit_items,
2107   - }
2108   - out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"
2109   - out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
2110   - print(f"[done] audit={out_path}")
2111   -
2112   -
2113   -def run_serve(args: argparse.Namespace) -> None:
2114   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
2115   - app = create_web_app(framework, Path(args.queries_file))
2116   - import uvicorn
2117   -
2118   - uvicorn.run(app, host=args.host, port=args.port, log_level="info")
2119   -
2120   -
2121   -def main() -> None:
2122   - parser = build_cli_parser()
2123   - args = parser.parse_args()
2124   - if args.command == "build":
2125   - run_build(args)
2126   - return
2127   - if args.command == "batch":
2128   - run_batch(args)
2129   - return
2130   - if args.command == "audit":
2131   - run_audit(args)
2132   - return
2133   - if args.command == "serve":
2134   - run_serve(args)
2135   - return
2136   - raise SystemExit(f"unknown command: {args.command}")
2137   -
2138   -
2139   -if __name__ == "__main__":
2140   - main()
scripts/evaluation/eval_framework/__init__.py 0 โ†’ 100644
... ... @@ -0,0 +1,59 @@
  1 +"""
  2 +Search evaluation framework: pooled relevance annotation, live metrics, batch reports.
  3 +
  4 +Importing this package ensures the project root is on ``sys.path`` (for ``api.*`` imports).
  5 +"""
  6 +
  7 +from __future__ import annotations
  8 +
  9 +from .utils import ensure_project_on_path
  10 +
  11 +ensure_project_on_path()
  12 +
  13 +from .constants import ( # noqa: E402
  14 + DEFAULT_ARTIFACT_ROOT,
  15 + DEFAULT_LABELER_MODE,
  16 + DEFAULT_QUERY_FILE,
  17 + JUDGE_PROMPT_VERSION_COMPLEX,
  18 + JUDGE_PROMPT_VERSION_SIMPLE,
  19 + PROJECT_ROOT,
  20 + RELEVANCE_EXACT,
  21 + RELEVANCE_IRRELEVANT,
  22 + RELEVANCE_PARTIAL,
  23 + VALID_LABELS,
  24 +)
  25 +from .framework import SearchEvaluationFramework # noqa: E402
  26 +from .store import EvalStore, QueryBuildResult # noqa: E402
  27 +from .cli import build_cli_parser, main # noqa: E402
  28 +from .web_app import create_web_app # noqa: E402
  29 +from .reports import render_batch_report_markdown # noqa: E402
  30 +from .utils import ( # noqa: E402
  31 + ensure_dir,
  32 + sha1_text,
  33 + utc_now_iso,
  34 + utc_timestamp,
  35 +)
  36 +
  37 +__all__ = [
  38 + "DEFAULT_ARTIFACT_ROOT",
  39 + "DEFAULT_LABELER_MODE",
  40 + "DEFAULT_QUERY_FILE",
  41 + "EvalStore",
  42 + "JUDGE_PROMPT_VERSION_COMPLEX",
  43 + "JUDGE_PROMPT_VERSION_SIMPLE",
  44 + "PROJECT_ROOT",
  45 + "QueryBuildResult",
  46 + "RELEVANCE_EXACT",
  47 + "RELEVANCE_IRRELEVANT",
  48 + "RELEVANCE_PARTIAL",
  49 + "SearchEvaluationFramework",
  50 + "VALID_LABELS",
  51 + "build_cli_parser",
  52 + "create_web_app",
  53 + "ensure_dir",
  54 + "main",
  55 + "render_batch_report_markdown",
  56 + "sha1_text",
  57 + "utc_now_iso",
  58 + "utc_timestamp",
  59 +]
... ...
scripts/evaluation/eval_framework/__main__.py 0 โ†’ 100644
... ... @@ -0,0 +1,4 @@
  1 +from .cli import main
  2 +
  3 +if __name__ == "__main__":
  4 + main()
... ...
scripts/evaluation/eval_framework/api_models.py 0 โ†’ 100644
... ... @@ -0,0 +1,22 @@
  1 +"""Pydantic request bodies for the evaluation FastAPI app."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from typing import List, Optional
  6 +
  7 +from pydantic import BaseModel, Field
  8 +
  9 +
  10 +class SearchEvalRequest(BaseModel):
  11 + query: str
  12 + top_k: int = Field(default=100, ge=1, le=500)
  13 + auto_annotate: bool = False
  14 + language: str = "en"
  15 +
  16 +
  17 +class BatchEvalRequest(BaseModel):
  18 + queries: Optional[List[str]] = None
  19 + top_k: int = Field(default=100, ge=1, le=500)
  20 + auto_annotate: bool = False
  21 + language: str = "en"
  22 + force_refresh_labels: bool = False
... ...
scripts/evaluation/eval_framework/cli.py 0 โ†’ 100644
... ... @@ -0,0 +1,179 @@
  1 +"""CLI: build annotations, batch eval, audit, serve web UI."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import argparse
  6 +import json
  7 +from pathlib import Path
  8 +
  9 +from .constants import DEFAULT_LABELER_MODE, DEFAULT_QUERY_FILE
  10 +from .framework import SearchEvaluationFramework
  11 +from .utils import ensure_dir, utc_now_iso, utc_timestamp
  12 +from .web_app import create_web_app
  13 +
  14 +
def build_cli_parser() -> argparse.ArgumentParser:
    """Construct the argparse CLI with build/batch/audit/serve subcommands."""
    labeler_choices = ["simple", "complex"]
    root = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
    commands = root.add_subparsers(dest="command", required=True)

    build_cmd = commands.add_parser("build", help="Build pooled annotation set for queries")
    build_cmd.add_argument("--tenant-id", default="163")
    build_cmd.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
    build_cmd.add_argument("--search-depth", type=int, default=1000)
    build_cmd.add_argument("--rerank-depth", type=int, default=10000)
    build_cmd.add_argument("--annotate-search-top-k", type=int, default=120)
    build_cmd.add_argument("--annotate-rerank-top-k", type=int, default=200)
    build_cmd.add_argument("--language", default="en")
    build_cmd.add_argument("--force-refresh-rerank", action="store_true")
    build_cmd.add_argument("--force-refresh-labels", action="store_true")
    build_cmd.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=labeler_choices)

    batch_cmd = commands.add_parser("batch", help="Run batch evaluation against live search")
    batch_cmd.add_argument("--tenant-id", default="163")
    batch_cmd.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
    batch_cmd.add_argument("--top-k", type=int, default=100)
    batch_cmd.add_argument("--language", default="en")
    batch_cmd.add_argument("--force-refresh-labels", action="store_true")
    batch_cmd.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=labeler_choices)

    audit_cmd = commands.add_parser("audit", help="Audit annotation quality for queries")
    audit_cmd.add_argument("--tenant-id", default="163")
    audit_cmd.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
    audit_cmd.add_argument("--top-k", type=int, default=100)
    audit_cmd.add_argument("--language", default="en")
    audit_cmd.add_argument("--limit-suspicious", type=int, default=5)
    audit_cmd.add_argument("--force-refresh-labels", action="store_true")
    audit_cmd.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=labeler_choices)

    serve_cmd = commands.add_parser("serve", help="Serve evaluation web UI on port 6010")
    serve_cmd.add_argument("--tenant-id", default="163")
    serve_cmd.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
    serve_cmd.add_argument("--host", default="0.0.0.0")
    serve_cmd.add_argument("--port", type=int, default=6010)
    serve_cmd.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=labeler_choices)

    return root
  56 +
  57 +
def run_build(args: argparse.Namespace) -> None:
    """Build a pooled annotation set for every query and write a JSON summary."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
    rows = []
    for query_text in framework.queries_from_file(Path(args.queries_file)):
        built = framework.build_query_annotation_set(
            query=query_text,
            search_depth=args.search_depth,
            rerank_depth=args.rerank_depth,
            annotate_search_top_k=args.annotate_search_top_k,
            annotate_rerank_top_k=args.annotate_rerank_top_k,
            language=args.language,
            force_refresh_rerank=args.force_refresh_rerank,
            force_refresh_labels=args.force_refresh_labels,
        )
        rows.append(
            {
                "query": built.query,
                "search_total": built.search_total,
                "search_depth": built.search_depth,
                "rerank_corpus_size": built.rerank_corpus_size,
                "annotated_count": built.annotated_count,
                "output_json_path": str(built.output_json_path),
            }
        )
        print(
            f"[build] query={built.query!r} search_total={built.search_total} "
            f"search_depth={built.search_depth} corpus={built.rerank_corpus_size} "
            f"annotated={built.annotated_count} output={built.output_json_path}"
        )
    out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
    out_path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] summary={out_path}")
  91 +
  92 +
def run_batch(args: argparse.Namespace) -> None:
    """Run one live batch evaluation (with auto-annotation) and print its id/metrics."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
    query_list = framework.queries_from_file(Path(args.queries_file))
    result = framework.batch_evaluate(
        queries=query_list,
        top_k=args.top_k,
        auto_annotate=True,
        language=args.language,
        force_refresh_labels=args.force_refresh_labels,
    )
    print(f"[done] batch_id={result['batch_id']} aggregate_metrics={result['aggregate_metrics']}")
  104 +
  105 +
def run_audit(args: argparse.Namespace) -> None:
    """Audit annotation quality for each query and write a JSON summary file.

    With --force-refresh-labels the first audit pass skips auto-annotation,
    the live top-k results are relabeled from scratch, and the audit is then
    re-run against the fresh labels.
    """
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
    queries = framework.queries_from_file(Path(args.queries_file))
    audit_items = []
    for query in queries:
        # First pass: do not auto-annotate when labels will be force-refreshed
        # below, so stale labels are not reused.
        item = framework.audit_live_query(
            query=query,
            top_k=args.top_k,
            language=args.language,
            auto_annotate=not args.force_refresh_labels,
        )
        if args.force_refresh_labels:
            # Fetch at least 100 live results so the relabel pool covers top_k.
            live_payload = framework.search_client.search(query=query, size=max(args.top_k, 100), from_=0, language=args.language)
            framework.annotate_missing_labels(
                query=query,
                docs=list(live_payload.get("results") or [])[: args.top_k],
                force_refresh=True,
            )
            # Second pass against the refreshed labels; no further annotation.
            item = framework.audit_live_query(
                query=query,
                top_k=args.top_k,
                language=args.language,
                auto_annotate=False,
            )
        audit_items.append(
            {
                "query": query,
                "metrics": item["metrics"],
                "distribution": item["distribution"],
                "suspicious_count": len(item["suspicious"]),
                "suspicious_examples": item["suspicious"][: args.limit_suspicious],
            }
        )
        print(
            f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}"
        )

    summary = {
        "created_at": utc_now_iso(),
        "tenant_id": args.tenant_id,
        "top_k": args.top_k,
        "query_count": len(queries),
        "total_suspicious": sum(item["suspicious_count"] for item in audit_items),
        "queries": audit_items,
    }
    out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"
    out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] audit={out_path}")
  154 +
  155 +
def run_serve(args: argparse.Namespace) -> None:
    """Launch the evaluation web UI with uvicorn."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
    web_app = create_web_app(framework, Path(args.queries_file))
    # Imported lazily so CLI-only commands do not require uvicorn.
    import uvicorn

    uvicorn.run(web_app, host=args.host, port=args.port, log_level="info")
  162 +
  163 +
def main() -> None:
    """CLI entry point: parse arguments and dispatch to the chosen subcommand."""
    args = build_cli_parser().parse_args()
    handlers = {
        "build": run_build,
        "batch": run_batch,
        "audit": run_audit,
        "serve": run_serve,
    }
    handler = handlers.get(args.command)
    # Unreachable in practice (subparsers are required), kept as a safety net.
    if handler is None:
        raise SystemExit(f"unknown command: {args.command}")
    handler(args)
... ...
scripts/evaluation/eval_framework/clients.py 0 โ†’ 100644
... ... @@ -0,0 +1,149 @@
  1 +"""HTTP clients for search API, reranker, and DashScope chat (relevance labeling)."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from typing import Any, Dict, List, Optional, Sequence, Tuple
  6 +
  7 +import requests
  8 +
  9 +from .constants import VALID_LABELS
  10 +from .prompts import (
  11 + classify_batch_complex_prompt,
  12 + classify_batch_simple_prompt,
  13 + extract_query_profile_prompt,
  14 +)
  15 +from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
  16 +
  17 +
class SearchServiceClient:
    """Thin HTTP wrapper around the live search endpoint (POST /search/)."""

    def __init__(self, base_url: str, tenant_id: str):
        self.base_url = base_url.rstrip("/")
        self.tenant_id = str(tenant_id)
        # One session for connection reuse across queries.
        self.session = requests.Session()

    def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]:
        """POST the query and return the decoded JSON payload; raises on HTTP error."""
        url = f"{self.base_url}/search/"
        headers = {"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}
        body = {"query": query, "size": size, "from": from_, "language": language}
        response = self.session.post(url, headers=headers, json=body, timeout=120)
        response.raise_for_status()
        return response.json()
  33 +
  34 +
class RerankServiceClient:
    """Thin HTTP wrapper around the rerank scoring service."""

    def __init__(self, service_url: str):
        self.service_url = service_url.rstrip("/")
        self.session = requests.Session()

    def rerank(self, query: str, docs: Sequence[str], normalize: bool = False, top_n: Optional[int] = None) -> Tuple[List[float], Dict[str, Any]]:
        """Score docs against the query; returns (scores, meta). Raises on HTTP error."""
        body: Dict[str, Any] = {"query": query, "docs": list(docs), "normalize": normalize}
        if top_n is not None:
            body["top_n"] = int(top_n)
        response = self.session.post(self.service_url, json=body, timeout=180)
        response.raise_for_status()
        payload = response.json()
        scores = list(payload.get("scores") or [])
        meta = dict(payload.get("meta") or {})
        return scores, meta
  52 +
  53 +
class DashScopeLabelClient:
    """OpenAI-compatible chat client used as an LLM relevance judge.

    Labels documents against a query as Exact / Partial / Irrelevant and can
    extract a structured query profile for the complex labeler mode.
    """

    def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40):
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        # Maximum docs sent to the judge per request; callers batch by this.
        self.batch_size = int(batch_size)
        self.session = requests.Session()

    def _chat(self, prompt: str) -> Tuple[str, str]:
        """Send one user prompt; return (message content, raw JSON response string).

        temperature=0 / top_p=0.1 keep the judge output as deterministic as
        the API allows. Raises on HTTP error.
        """
        response = self.session.post(
            f"{self.base_url}/chat/completions",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0,
                "top_p": 0.1,
            },
            timeout=180,
        )
        response.raise_for_status()
        data = response.json()
        # Defensive extraction: tolerates missing choices/message/content keys.
        content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
        return content, safe_json_dumps(data)

    def classify_batch_simple(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Label each doc with the simple prompt; returns (labels, raw response).

        Primary parse expects one valid label per output line. Fallback parse
        accepts a JSON blob with a "labels" list of strings or {"label": ...}
        dicts. Raises ValueError unless exactly one valid label per doc is
        recovered.
        """
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = classify_batch_simple_prompt(query, numbered_docs)
        content, raw_response = self._chat(prompt)
        labels = []
        for line in str(content or "").splitlines():
            label = line.strip()
            if label in VALID_LABELS:
                labels.append(label)
        if len(labels) != len(docs):
            # Line-per-label parse failed; try the JSON fallback shape.
            payload = extract_json_blob(content)
            if isinstance(payload, dict) and isinstance(payload.get("labels"), list):
                labels = []
                for item in payload["labels"][: len(docs)]:
                    if isinstance(item, dict):
                        label = str(item.get("label") or "").strip()
                    else:
                        label = str(item).strip()
                    if label in VALID_LABELS:
                        labels.append(label)
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected simple label output: {content!r}")
        return labels, raw_response

    def extract_query_profile(
        self,
        query: str,
        parser_hints: Dict[str, Any],
    ) -> Tuple[Dict[str, Any], str]:
        """Ask the judge for a structured query profile; returns (profile, raw).

        Missing keys are defaulted so downstream guardrail code can read them
        unconditionally. Raises ValueError when no JSON object is found.
        """
        prompt = extract_query_profile_prompt(query, parser_hints)
        content, raw_response = self._chat(prompt)
        payload = extract_json_blob(content)
        if not isinstance(payload, dict):
            raise ValueError(f"unexpected query profile payload: {content!r}")
        payload.setdefault("normalized_query_en", query)
        payload.setdefault("primary_category", "")
        payload.setdefault("allowed_categories", [])
        payload.setdefault("required_attributes", [])
        payload.setdefault("notes", [])
        return payload, raw_response

    def classify_batch_complex(
        self,
        query: str,
        query_profile: Dict[str, Any],
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Label docs with the structured (profile-aware) prompt.

        Expects a JSON object containing a "labels" list of {"label": ...}
        dicts. Raises ValueError unless exactly one valid label per doc is
        parsed.
        """
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = classify_batch_complex_prompt(query, query_profile, numbered_docs)
        content, raw_response = self._chat(prompt)
        payload = extract_json_blob(content)
        if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):
            raise ValueError(f"unexpected label payload: {content!r}")
        labels_payload = payload["labels"]
        labels: List[str] = []
        for item in labels_payload[: len(docs)]:
            if not isinstance(item, dict):
                continue
            label = str(item.get("label") or "").strip()
            if label in VALID_LABELS:
                labels.append(label)
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected label output: {content!r}")
        return labels, raw_response
... ...
scripts/evaluation/eval_framework/constants.py 0 โ†’ 100644
... ... @@ -0,0 +1,19 @@
  1 +"""Paths and shared constants for search evaluation."""
  2 +
  3 +from pathlib import Path
  4 +
  5 +_PKG_DIR = Path(__file__).resolve().parent
  6 +_SCRIPTS_EVAL_DIR = _PKG_DIR.parent
  7 +PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
  8 +
  9 +RELEVANCE_EXACT = "Exact"
  10 +RELEVANCE_PARTIAL = "Partial"
  11 +RELEVANCE_IRRELEVANT = "Irrelevant"
  12 +VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}
  13 +
  14 +DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
  15 +DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
  16 +
  17 +JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331"
  18 +JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331"
  19 +DEFAULT_LABELER_MODE = "simple"
... ...
scripts/evaluation/eval_framework/framework.py 0 โ†’ 100644
... ... @@ -0,0 +1,719 @@
  1 +"""Core orchestration: corpus, rerank, LLM labels, live/batch evaluation."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import json
  6 +import time
  7 +from pathlib import Path
  8 +from typing import Any, Dict, List, Sequence, Tuple
  9 +
  10 +import requests
  11 +from elasticsearch.helpers import scan
  12 +
  13 +from api.app import get_app_config, get_es_client, get_query_parser, init_service
  14 +from indexer.mapping_generator import get_tenant_index_name
  15 +
  16 +from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient
  17 +from .constants import (
  18 + DEFAULT_ARTIFACT_ROOT,
  19 + DEFAULT_LABELER_MODE,
  20 + JUDGE_PROMPT_VERSION_COMPLEX,
  21 + RELEVANCE_EXACT,
  22 + RELEVANCE_IRRELEVANT,
  23 + RELEVANCE_PARTIAL,
  24 + VALID_LABELS,
  25 +)
  26 +from .metrics import aggregate_metrics, compute_query_metrics, label_distribution
  27 +from .reports import render_batch_report_markdown
  28 +from .store import EvalStore, QueryBuildResult
  29 +from .utils import (
  30 + build_display_title,
  31 + build_rerank_doc,
  32 + compact_option_values,
  33 + compact_product_payload,
  34 + ensure_dir,
  35 + normalize_text,
  36 + pick_text,
  37 + sha1_text,
  38 + utc_now_iso,
  39 + utc_timestamp,
  40 +)
  41 +
  42 +
  43 +class SearchEvaluationFramework:
    def __init__(
        self,
        tenant_id: str,
        artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
        search_base_url: str = "http://localhost:6002",
        labeler_mode: str = DEFAULT_LABELER_MODE,
    ):
        """Wire up the search/rerank/LLM clients and the SQLite store.

        Args:
            tenant_id: Tenant whose index and live search endpoint are evaluated.
            artifact_root: Directory for the SQLite store and report artifacts.
            search_base_url: Base URL of the live search service.
            labeler_mode: "simple" or "complex" judge prompting strategy.

        Raises:
            RuntimeError: If no DashScope API key is configured.
        """
        # Must run before any config / ES access below.
        init_service(get_app_config().infrastructure.elasticsearch.host)
        self.tenant_id = str(tenant_id)
        self.artifact_root = ensure_dir(artifact_root)
        # Blank/whitespace mode falls back to the default.
        self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
        self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
        self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
        app_cfg = get_app_config()
        # Rerank endpoint comes from the default "http" provider instance.
        rerank_service_url = str(
            app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
        )
        self.rerank_client = RerankServiceClient(rerank_service_url)
        # NOTE(review): reuses the translation service's LLM capability config
        # for the judge model -- confirm this is the intended model source.
        llm_cfg = app_cfg.services.translation.capabilities["llm"]
        api_key = app_cfg.infrastructure.secrets.dashscope_api_key
        if not api_key:
            raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
        self.label_client = DashScopeLabelClient(
            model=str(llm_cfg["model"]),
            base_url=str(llm_cfg["base_url"]),
            api_key=str(api_key),
        )
        # Created lazily by _get_query_parser().
        self.query_parser = None
  72 +
  73 + def _get_query_parser(self):
  74 + if self.query_parser is None:
  75 + self.query_parser = get_query_parser()
  76 + return self.query_parser
  77 +
  78 + def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
  79 + parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"])
  80 + payload = parsed.to_dict()
  81 + payload["text_for_rerank"] = parsed.text_for_rerank()
  82 + return payload
  83 +
    def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
        """Return the cached LLM query profile, extracting and storing it on miss.

        Profiles are cached per (tenant, query, complex prompt version).

        Raises:
            RuntimeError: When called while labeler_mode != "complex".
        """
        if self.labeler_mode != "complex":
            raise RuntimeError("query profiles are only used in complex labeler mode")
        if not force_refresh:
            cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)
            if cached is not None:
                return cached
        parser_hints = self.build_query_parser_hints(query)
        profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)
        # Persist the parser hints alongside the extracted profile.
        profile["parser_hints"] = parser_hints
        self.store.upsert_query_profile(
            self.tenant_id,
            query,
            JUDGE_PROMPT_VERSION_COMPLEX,
            self.label_client.model,
            profile,
            raw_response,
        )
        return profile
  103 +
  104 + @staticmethod
  105 + def _doc_evidence_text(doc: Dict[str, Any]) -> str:
  106 + pieces: List[str] = [
  107 + build_display_title(doc),
  108 + pick_text(doc.get("vendor"), "en"),
  109 + pick_text(doc.get("category_path"), "en"),
  110 + pick_text(doc.get("category_name"), "en"),
  111 + ]
  112 + for sku in doc.get("skus") or []:
  113 + pieces.extend(
  114 + [
  115 + str(sku.get("option1_value") or ""),
  116 + str(sku.get("option2_value") or ""),
  117 + str(sku.get("option3_value") or ""),
  118 + ]
  119 + )
  120 + for tag in doc.get("tags") or []:
  121 + pieces.append(str(tag))
  122 + return normalize_text(" | ".join(piece for piece in pieces if piece))
  123 +
    def _apply_rule_based_label_guardrails(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> str:
        """Demote or override an LLM label using category/attribute evidence.

        Rules (evidence = normalized doc text from _doc_evidence_text):
        - Exact without primary-category evidence becomes Partial if an
          allowed category matches, else Irrelevant.
        - Any conflicting attribute term in evidence forces Irrelevant.
        - Exact missing a required attribute term is demoted to Partial.
        - Partial with no category evidence at all becomes Irrelevant.

        Labels outside VALID_LABELS are returned unchanged.
        """
        if label not in VALID_LABELS:
            return label
        evidence = self._doc_evidence_text(doc)
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]

        # Category checks default to True when the profile gives no signal.
        primary_category_match = True
        if category:
            primary_category_match = category in evidence
        allowed_category_match = True
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                label = RELEVANCE_PARTIAL
            else:
                return RELEVANCE_IRRELEVANT

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            # Only attribute families with reliable surface evidence are enforced.
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            if attr_name == "fit":
                # Oversized and fitted are mutually exclusive; expand conflicts.
                if any(term in {"oversized", "oversize"} for term in required_terms):
                    conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
                if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):
                    conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])
            # With no required terms the attribute cannot fail the doc.
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict:
                return RELEVANCE_IRRELEVANT
            if label == RELEVANCE_EXACT and not has_required:
                label = RELEVANCE_PARTIAL

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            return RELEVANCE_IRRELEVANT

        return label
  174 +
  175 + @staticmethod
  176 + def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]:
  177 + option_values = list(item.get("option_values") or [])
  178 + while len(option_values) < 3:
  179 + option_values.append("")
  180 + product = dict(item.get("product") or {})
  181 + return {
  182 + "spu_id": item.get("spu_id"),
  183 + "title": product.get("title") or item.get("title"),
  184 + "vendor": product.get("vendor"),
  185 + "category_path": product.get("category"),
  186 + "category_name": product.get("category"),
  187 + "image_url": item.get("image_url") or product.get("image_url"),
  188 + "tags": product.get("tags") or [],
  189 + "skus": [
  190 + {
  191 + "option1_value": option_values[0],
  192 + "option2_value": option_values[1],
  193 + "option3_value": option_values[2],
  194 + }
  195 + ],
  196 + }
  197 +
    def _collect_label_issues(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> List[str]:
        """Return human-readable reasons a label looks suspicious.

        Mirrors _apply_rule_based_label_guardrails but reports issues instead
        of relabeling.
        """
        evidence = self._doc_evidence_text(doc)
        issues: List[str] = []
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [
            normalize_text(item)
            for item in query_profile.get("allowed_categories") or []
            if str(item).strip()
        ]

        primary_category_match = True if not category else category in evidence
        # NOTE(review): with no allowed_categories this falls back to the
        # primary match, whereas the guardrail method defaults it to True --
        # confirm the stricter behavior here is intentional.
        allowed_category_match = False if allowed_categories else primary_category_match
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                issues.append("Exact missing primary category evidence")
            else:
                issues.append("Exact has category mismatch")

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            issues.append("Partial has category mismatch")

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            # NOTE(review): slightly narrower attribute set than the guardrails
            # (no waist_style/rise) -- confirm whether this is deliberate.
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict and label != RELEVANCE_IRRELEVANT:
                issues.append(f"{label} conflicts on {attr_name}")
            if label == RELEVANCE_EXACT and not has_required:
                issues.append(f"Exact missing {attr_name}")
        return issues
  243 +
    def audit_live_query(
        self,
        query: str,
        *,
        top_k: int = 100,
        language: str = "en",
        auto_annotate: bool = False,
    ) -> Dict[str, Any]:
        """Evaluate live results and, in complex mode, flag suspicious labels.

        Simple mode returns metrics and the label distribution only. Complex
        mode additionally checks each result against the query profile's
        category/attribute rules and collects relabel suggestions.
        """
        live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
        if self.labeler_mode != "complex":
            # Unknown/missing labels count as Irrelevant for the distribution.
            labels = [
                item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
                for item in live["results"]
            ]
            return {
                "query": query,
                "tenant_id": self.tenant_id,
                "top_k": top_k,
                "metrics": live["metrics"],
                "distribution": label_distribution(labels),
                "query_profile": None,
                "suspicious": [],
                "results": live["results"],
            }
        query_profile = self.get_query_profile(query, force_refresh=False)
        suspicious: List[Dict[str, Any]] = []

        for item in live["results"]:
            doc = self._result_item_to_doc(item)
            issues = self._collect_label_issues(item["label"] or "", query_profile, doc)
            suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc)
            # A guardrail relabel counts as an issue even without other flags.
            if suggested_label != (item["label"] or ""):
                issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"]
            if issues:
                suspicious.append(
                    {
                        "rank": item["rank"],
                        "spu_id": item["spu_id"],
                        "title": item["title"],
                        "label": item["label"],
                        "suggested_label": suggested_label,
                        "issues": issues,
                    }
                )

        labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in live["results"]
        ]
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": live["metrics"],
            "distribution": label_distribution(labels),
            "query_profile": query_profile,
            "suspicious": suspicious,
            "results": live["results"],
        }
  303 +
  304 + def queries_from_file(self, path: Path) -> List[str]:
  305 + return [
  306 + line.strip()
  307 + for line in path.read_text(encoding="utf-8").splitlines()
  308 + if line.strip() and not line.strip().startswith("#")
  309 + ]
  310 +
    def corpus_docs(self, refresh: bool = False) -> List[Dict[str, Any]]:
        """Return all tenant docs, scanning Elasticsearch on first use.

        Results are cached in the SQLite store; pass refresh=True to rescan
        the index and overwrite the cache.
        """
        if not refresh and self.store.has_corpus(self.tenant_id):
            return self.store.get_corpus_docs(self.tenant_id)

        es_client = get_es_client().client
        index_name = get_tenant_index_name(self.tenant_id)
        docs: List[Dict[str, Any]] = []
        # scan() pages through the entire index without a result-window cap;
        # only the fields needed for labeling/display are fetched.
        for hit in scan(
            client=es_client,
            index=index_name,
            query={
                "_source": [
                    "spu_id",
                    "title",
                    "vendor",
                    "category_path",
                    "category_name",
                    "image_url",
                    "skus",
                    "tags",
                ],
                "query": {"match_all": {}},
            },
            size=500,
            preserve_order=False,
            clear_scroll=True,
        ):
            source = dict(hit.get("_source") or {})
            # Fall back to the ES _id when the doc carries no spu_id field.
            source["spu_id"] = str(source.get("spu_id") or hit.get("_id") or "")
            docs.append(source)
        self.store.upsert_corpus_docs(self.tenant_id, docs)
        return docs
  343 +
    def full_corpus_rerank(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        batch_size: int = 24,
        force_refresh: bool = False,
    ) -> List[Dict[str, Any]]:
        """Score every doc against the query and return them sorted by score.

        Scores are cached per (tenant, query); only uncached docs are sent to
        the rerank service, and new scores are persisted before merging.
        Returns [{"spu_id", "score", "doc"}, ...] in descending score order;
        docs that still have no score sort last via a -inf sentinel.
        """
        cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query)
        pending: List[Dict[str, Any]] = [doc for doc in docs if str(doc.get("spu_id")) not in cached]
        if pending:
            new_scores: Dict[str, float] = {}
            for start in range(0, len(pending), batch_size):
                batch = pending[start : start + batch_size]
                scores = self._rerank_batch_with_retry(query=query, docs=batch)
                if len(scores) != len(batch):
                    raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs")
                for doc, score in zip(batch, scores):
                    new_scores[str(doc.get("spu_id"))] = float(score)
            self.store.upsert_rerank_scores(
                self.tenant_id,
                query,
                new_scores,
                model_name="qwen3_vllm_score",
            )
            cached.update(new_scores)

        ranked = []
        for doc in docs:
            spu_id = str(doc.get("spu_id"))
            ranked.append({"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc})
        ranked.sort(key=lambda item: item["score"], reverse=True)
        return ranked
  376 +
    def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]:
        """Score docs, bisecting the batch on failure to isolate bad items.

        A single doc that still fails gets a -1.0 sentinel score instead of
        raising, so one poison doc cannot abort a full-corpus rerank.
        """
        if not docs:
            return []
        doc_texts = [build_rerank_doc(doc) for doc in docs]
        try:
            scores, _meta = self.rerank_client.rerank(query=query, docs=doc_texts, normalize=False)
            return scores
        except Exception:
            if len(docs) == 1:
                return [-1.0]
            if len(docs) <= 6:
                # Small batch: score one-by-one rather than keep halving.
                scores: List[float] = []
                for doc in docs:
                    scores.extend(self._rerank_batch_with_retry(query, [doc]))
                return scores
            mid = len(docs) // 2
            left = self._rerank_batch_with_retry(query, docs[:mid])
            right = self._rerank_batch_with_retry(query, docs[mid:])
            return left + right
  396 +
    def annotate_missing_labels(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        force_refresh: bool = False,
    ) -> Dict[str, str]:
        """Ensure every doc has a relevance label; returns {spu_id: label}.

        Cached labels are reused unless force_refresh is set. New labels are
        persisted sub-batch by sub-batch, so partial progress survives an
        interrupted run.
        """
        labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
        missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
        if not missing_docs:
            return labels

        for start in range(0, len(missing_docs), self.label_client.batch_size):
            batch = missing_docs[start : start + self.label_client.batch_size]
            batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh)
            for sub_labels, raw_response, sub_batch in batch_pairs:
                to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
                self.store.upsert_labels(
                    self.tenant_id,
                    query,
                    to_store,
                    judge_model=self.label_client.model,
                    raw_response=raw_response,
                )
                labels.update(to_store)
            # Light pacing between judge batches.
            time.sleep(0.1)
        return labels
  423 +
    def _classify_with_retry(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        *,
        force_refresh: bool = False,
    ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
        """Classify docs, bisecting the batch on judge failure.

        Returns a list of (labels, raw_response, docs) triples, one per
        successfully labeled sub-batch. A single doc that keeps failing
        re-raises -- unlike rerank, labels have no sentinel fallback.
        """
        if not docs:
            return []
        try:
            if self.labeler_mode == "complex":
                query_profile = self.get_query_profile(query, force_refresh=force_refresh)
                labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)
                # Post-correct LLM labels with the deterministic evidence rules.
                labels = [
                    self._apply_rule_based_label_guardrails(label, query_profile, doc)
                    for doc, label in zip(docs, labels)
                ]
            else:
                labels, raw_response = self.label_client.classify_batch_simple(query, docs)
            return [(labels, raw_response, docs)]
        except Exception:
            if len(docs) == 1:
                raise
            mid = len(docs) // 2
            return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh)
  449 +
  450 + def build_query_annotation_set(
  451 + self,
  452 + query: str,
  453 + *,
  454 + search_depth: int = 1000,
  455 + rerank_depth: int = 10000,
  456 + annotate_search_top_k: int = 120,
  457 + annotate_rerank_top_k: int = 200,
  458 + language: str = "en",
  459 + force_refresh_rerank: bool = False,
  460 + force_refresh_labels: bool = False,
  461 + ) -> QueryBuildResult:
  462 + search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language)
  463 + search_results = list(search_payload.get("results") or [])
  464 + corpus = self.corpus_docs(refresh=False)
  465 + full_rerank = self.full_corpus_rerank(
  466 + query=query,
  467 + docs=corpus,
  468 + force_refresh=force_refresh_rerank,
  469 + )
  470 + rerank_depth_effective = min(rerank_depth, len(full_rerank))
  471 +
  472 + pool_docs: Dict[str, Dict[str, Any]] = {}
  473 + for doc in search_results[:annotate_search_top_k]:
  474 + pool_docs[str(doc.get("spu_id"))] = doc
  475 + for item in full_rerank[:annotate_rerank_top_k]:
  476 + pool_docs[str(item["spu_id"])] = item["doc"]
  477 +
  478 + labels = self.annotate_missing_labels(
  479 + query=query,
  480 + docs=list(pool_docs.values()),
  481 + force_refresh=force_refresh_labels,
  482 + )
  483 +
  484 + search_labeled_results: List[Dict[str, Any]] = []
  485 + for rank, doc in enumerate(search_results, start=1):
  486 + spu_id = str(doc.get("spu_id"))
  487 + label = labels.get(spu_id)
  488 + search_labeled_results.append(
  489 + {
  490 + "rank": rank,
  491 + "spu_id": spu_id,
  492 + "title": build_display_title(doc),
  493 + "image_url": doc.get("image_url"),
  494 + "rerank_score": None,
  495 + "label": label,
  496 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  497 + "product": compact_product_payload(doc),
  498 + }
  499 + )
  500 +
  501 + rerank_top_results: List[Dict[str, Any]] = []
  502 + for rank, item in enumerate(full_rerank[:rerank_depth_effective], start=1):
  503 + doc = item["doc"]
  504 + spu_id = str(item["spu_id"])
  505 + rerank_top_results.append(
  506 + {
  507 + "rank": rank,
  508 + "spu_id": spu_id,
  509 + "title": build_display_title(doc),
  510 + "image_url": doc.get("image_url"),
  511 + "rerank_score": round(float(item["score"]), 8),
  512 + "label": labels.get(spu_id),
  513 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  514 + "product": compact_product_payload(doc),
  515 + }
  516 + )
  517 +
  518 + top100_labels = [
  519 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  520 + for item in search_labeled_results[:100]
  521 + ]
  522 + metrics = compute_query_metrics(top100_labels)
  523 + output_dir = ensure_dir(self.artifact_root / "query_builds")
  524 + run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
  525 + output_json_path = output_dir / f"{run_id}.json"
  526 + payload = {
  527 + "run_id": run_id,
  528 + "created_at": utc_now_iso(),
  529 + "tenant_id": self.tenant_id,
  530 + "query": query,
  531 + "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),
  532 + "search_total": int(search_payload.get("total") or 0),
  533 + "search_depth_requested": search_depth,
  534 + "search_depth_effective": len(search_results),
  535 + "rerank_depth_requested": rerank_depth,
  536 + "rerank_depth_effective": rerank_depth_effective,
  537 + "corpus_size": len(corpus),
  538 + "annotation_pool": {
  539 + "annotate_search_top_k": annotate_search_top_k,
  540 + "annotate_rerank_top_k": annotate_rerank_top_k,
  541 + "pool_size": len(pool_docs),
  542 + },
  543 + "labeler_mode": self.labeler_mode,
  544 + "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,
  545 + "metrics_top100": metrics,
  546 + "search_results": search_labeled_results,
  547 + "full_rerank_top": rerank_top_results,
  548 + }
  549 + output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
  550 + self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
  551 + return QueryBuildResult(
  552 + query=query,
  553 + tenant_id=self.tenant_id,
  554 + search_total=int(search_payload.get("total") or 0),
  555 + search_depth=len(search_results),
  556 + rerank_corpus_size=len(corpus),
  557 + annotated_count=len(pool_docs),
  558 + output_json_path=output_json_path,
  559 + )
  560 +
    def evaluate_live_query(
        self,
        query: str,
        top_k: int = 100,
        auto_annotate: bool = False,
        language: str = "en",
        force_refresh_labels: bool = False,
    ) -> Dict[str, Any]:
        """Evaluate one live query against the cached annotation set.

        Runs the live search, optionally judges any unlabeled recalled docs
        (``auto_annotate=True``), then scores the top ``top_k`` hits; recalled
        SPUs with no cached label are treated as Irrelevant. Also reports
        cached Exact/Partial products that live recall missed, ordered by
        label, then rerank score, then title. Returns a JSON-serializable
        dict with metrics, per-rank results, missing-relevant docs, label
        stats, and human-readable tips.
        """
        # Always request at least 100 hits so top-100 metrics stay comparable
        # across different top_k settings.
        search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)
        results = list(search_payload.get("results") or [])
        if auto_annotate:
            # Only judges recalled-but-unlabeled docs; cached labels are reused.
            self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
        labels = self.store.get_labels(self.tenant_id, query)
        recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]}
        labeled = []
        unlabeled_hits = 0
        for rank, doc in enumerate(results[:top_k], start=1):
            spu_id = str(doc.get("spu_id"))
            label = labels.get(spu_id)
            if label not in VALID_LABELS:
                unlabeled_hits += 1
            labeled.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "label": label,
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )
        # Metrics treat any unlabeled recalled hit as Irrelevant (conservative).
        metric_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in labeled
        ]
        label_stats = self.store.get_query_label_stats(self.tenant_id, query)
        rerank_scores = self.store.get_rerank_scores(self.tenant_id, query)
        # Cached relevant (Exact/Partial) SPUs that live recall failed to return.
        relevant_missing_ids = [
            spu_id
            for spu_id, label in labels.items()
            if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids
        ]
        missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
        missing_relevant = []
        for spu_id in relevant_missing_ids:
            doc = missing_docs_map.get(spu_id)
            if not doc:
                # SPU no longer present in the cached corpus snapshot; skip it.
                continue
            missing_relevant.append(
                {
                    "spu_id": spu_id,
                    "label": labels[spu_id],
                    "rerank_score": rerank_scores.get(spu_id),
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )
        # Sort order: Exact before Partial, then higher rerank score first
        # (None scores become -(-inf) = +inf, i.e. sort last), then title.
        label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}
        missing_relevant.sort(
            key=lambda item: (
                label_order.get(str(item.get("label")), 9),
                -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")),
                str(item.get("title") or ""),
            )
        )
        tips: List[str] = []
        if auto_annotate:
            tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.")
        else:
            tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.")
        if label_stats["total"] == 0:
            tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.")
        if unlabeled_hits:
            tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
        if not missing_relevant:
            tips.append("No cached Exact/Partial products were missed by this recall set.")
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": compute_query_metrics(metric_labels),
            "results": labeled,
            "missing_relevant": missing_relevant,
            "label_stats": {
                **label_stats,
                "unlabeled_hits_treated_irrelevant": unlabeled_hits,
                "recalled_hits": len(labeled),
                "missing_relevant_count": len(missing_relevant),
                "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
                "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),
            },
            "tips": tips,
            "total": int(search_payload.get("total") or 0),
        }
  658 +
  659 + def batch_evaluate(
  660 + self,
  661 + queries: Sequence[str],
  662 + *,
  663 + top_k: int = 100,
  664 + auto_annotate: bool = True,
  665 + language: str = "en",
  666 + force_refresh_labels: bool = False,
  667 + ) -> Dict[str, Any]:
  668 + per_query = []
  669 + for query in queries:
  670 + live = self.evaluate_live_query(
  671 + query,
  672 + top_k=top_k,
  673 + auto_annotate=auto_annotate,
  674 + language=language,
  675 + force_refresh_labels=force_refresh_labels,
  676 + )
  677 + labels = [
  678 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  679 + for item in live["results"]
  680 + ]
  681 + per_query.append(
  682 + {
  683 + "query": live["query"],
  684 + "tenant_id": live["tenant_id"],
  685 + "top_k": live["top_k"],
  686 + "metrics": live["metrics"],
  687 + "distribution": label_distribution(labels),
  688 + "total": live["total"],
  689 + }
  690 + )
  691 + aggregate = aggregate_metrics([item["metrics"] for item in per_query])
  692 + aggregate_distribution = {
  693 + RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
  694 + RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query),
  695 + RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
  696 + }
  697 + batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
  698 + report_dir = ensure_dir(self.artifact_root / "batch_reports")
  699 + config_snapshot_path = report_dir / f"{batch_id}_config.json"
  700 + config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json()
  701 + config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
  702 + output_json_path = report_dir / f"{batch_id}.json"
  703 + report_md_path = report_dir / f"{batch_id}.md"
  704 + payload = {
  705 + "batch_id": batch_id,
  706 + "created_at": utc_now_iso(),
  707 + "tenant_id": self.tenant_id,
  708 + "queries": list(queries),
  709 + "top_k": top_k,
  710 + "aggregate_metrics": aggregate,
  711 + "aggregate_distribution": aggregate_distribution,
  712 + "per_query": per_query,
  713 + "config_snapshot_path": str(config_snapshot_path),
  714 + }
  715 + output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
  716 + report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
  717 + self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
  718 + return payload
  719 +
... ...
scripts/evaluation/eval_framework/metrics.py 0 โ†’ 100644
... ... @@ -0,0 +1,58 @@
  1 +"""IR metrics for labeled result lists."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from typing import Dict, Sequence
  6 +
  7 +from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL
  8 +
  9 +
def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
    """Precision over the first ``k`` labels.

    The denominator is the length of the actual prefix (``min(k, len(...))``
    in the original formulation), so a result list shorter than ``k`` is not
    penalized for the missing tail. Returns 0.0 for ``k <= 0`` or an empty
    prefix.
    """
    if k <= 0:
        return 0.0
    prefix = labels[:k]
    if not prefix:
        return 0.0
    relevant_hits = sum(1 for label in prefix if label in relevant)
    return relevant_hits / float(len(prefix))
  18 +
  19 +
def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
    """Mean of precision@rank taken at each relevant position (0.0 if none)."""
    hit_precisions: List[float] = []
    for rank, label in enumerate(labels, start=1):
        if label in relevant:
            # Precision at this rank = hits so far (incl. this one) / rank.
            hit_precisions.append((len(hit_precisions) + 1) / rank)
    if not hit_precisions:
        return 0.0
    return sum(hit_precisions) / len(hit_precisions)
  31 +
  32 +
def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:
    """Compute the standard per-query metric dict from an ordered label list.

    The ``*_2_3`` variants treat both Exact and Partial as relevant; the
    plain variants count Exact only. All values are rounded to 6 decimals.
    """
    exact_only = [RELEVANCE_EXACT]
    exact_or_partial = [RELEVANCE_EXACT, RELEVANCE_PARTIAL]
    metrics: Dict[str, float] = {}
    for k in (5, 10, 20, 50):
        metrics[f"P@{k}"] = round(precision_at_k(labels, k, exact_only), 6)
        metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, exact_or_partial), 6)
    metrics["MAP_3"] = round(average_precision(labels, exact_only), 6)
    metrics["MAP_2_3"] = round(average_precision(labels, exact_or_partial), 6)
    return metrics
  41 +
  42 +
def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]:
    """Average each metric across per-query metric dicts.

    Keys are the union across all items (the previous implementation used
    only the first item's keys, silently dropping metrics absent from it);
    an item missing a key contributes 0.0 for that key. Values are rounded
    to 6 decimals. Returns {} for an empty input.
    """
    if not metric_items:
        return {}
    keys = sorted({key for item in metric_items for key in item})
    count = len(metric_items)
    return {
        key: round(sum(float(item.get(key, 0.0)) for item in metric_items) / count, 6)
        for key in keys
    }
  51 +
  52 +
def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
    """Count occurrences of each known relevance label; unknown labels are ignored."""
    counts = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 0, RELEVANCE_IRRELEVANT: 0}
    for label in labels:
        if label in counts:
            counts[label] += 1
    return counts
... ...
scripts/evaluation/eval_framework/prompts.py 0 โ†’ 100644
... ... @@ -0,0 +1,89 @@
  1 +"""LLM prompt templates for relevance judging (keep wording changes here)."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import json
  6 +from typing import Any, Dict, Sequence
  7 +
  8 +
def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
    """Render the simple (profile-free) batch relevance-judging prompt.

    ``numbered_doc_lines`` are pre-numbered one-line product summaries; the
    judge must answer with exactly one label per line, in the same order.
    """
    doc_block = "\n".join(numbered_doc_lines)
    expected = len(numbered_doc_lines)
    sections = [
        "You are an e-commerce search result relevance evaluation assistant. "
        "Based on the user query and each product's information, output the relevance level for each product.\n\n",
        "## Relevance Level Criteria\n"
        "Exact โ€” Fully matches the user's search intent.\n"
        "Partial โ€” Primary intent satisfied (same category or similar use, basically aligns with search intent), "
        "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n"
        "Irrelevant โ€” Category or use case mismatched, primary intent not satisfied.\n\n",
        "Additional judging guidance:\n"
        "- If the query clearly names a product type, product type matching has the highest priority. "
        "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, "
        "bra vs top, backpack vs bag are not interchangeable.\n"
        "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n"
        "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n"
        "- Do not guess missing attributes.\n"
        "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n"
        "- Be conservative with Exact.\n\n",
        f"Query: {query}\n\n",
        "Products:\n",
        f"{doc_block}\n\n",
        "## Output Format\n",
        f"Strictly output {expected} lines, each line containing exactly one of Exact / Partial / Irrelevant. "
        "They must correspond sequentially to the products above. Do not output any other information.\n",
    ]
    return "".join(sections)
  36 +
  37 +
def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str:
    """Render the prompt that extracts a structured intent profile for a query.

    The resulting JSON profile (category synonyms + explicitly required
    attributes) later drives the complex judging prompt; parser hints are
    advisory and lose to the original query on conflict.
    """
    hints_json = json.dumps(parser_hints, ensure_ascii=False)
    schema = (
        "{\n"
        ' "normalized_query_en": string,\n'
        ' "primary_category": string,\n'
        ' "allowed_categories": [string],\n'
        ' "required_attributes": [\n'
        ' {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n'
        " ],\n"
        ' "notes": [string]\n'
        "}\n\n"
    )
    guidelines = (
        "Guidelines:\n"
        "- Exact later will require explicit evidence for all required attributes.\n"
        "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n"
        "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n"
        "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n"
        "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n"
        "- For color, include conflicting colors only when clear from the query.\n\n"
    )
    return (
        "You are building a structured intent profile for e-commerce relevance judging.\n"
        "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n"
        "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n"
        "Return JSON with this schema:\n"
        + schema
        + guidelines
        + f"Original query: {query}\n"
        + f"Parser hints JSON: {hints_json}\n"
    )
  64 +
  65 +
def classify_batch_complex_prompt(
    query: str,
    query_profile: Dict[str, Any],
    numbered_doc_lines: Sequence[str],
) -> str:
    """Render the profile-aware batch judging prompt (JSON-output variant)."""
    doc_block = "\n".join(numbered_doc_lines)
    profile_json = json.dumps(query_profile, ensure_ascii=False)
    rules = (
        "Relevance rules:\n"
        "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n"
        "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n"
        "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n"
        "- Be conservative with Exact.\n"
        "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n"
        "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n"
    )
    return (
        "You are an e-commerce search relevance judge.\n"
        "Judge each product against the structured query profile below.\n\n"
        + rules
        + f"Original query: {query}\n"
        + f"Structured query profile JSON: {profile_json}\n\n"
        + "Products:\n"
        + f"{doc_block}\n\n"
        + "Return JSON only, with schema:\n"
        + '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n'
    )
... ...
scripts/evaluation/eval_framework/reports.py 0 โ†’ 100644
... ... @@ -0,0 +1,48 @@
  1 +"""Markdown and text reports for batch evaluation."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from typing import Any, Dict
  6 +
  7 +from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL
  8 +
  9 +
def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
    """Render a batch-evaluation payload as a human-readable Markdown report.

    Sections: run metadata, aggregate metrics (sorted by name), aggregate
    label distribution (if present), then one subsection per query.
    """
    out: List[str] = []
    out.append("# Search Batch Evaluation")
    out.append("")
    out.append(f"- Batch ID: {payload['batch_id']}")
    out.append(f"- Created at: {payload['created_at']}")
    out.append(f"- Tenant ID: {payload['tenant_id']}")
    out.append(f"- Query count: {len(payload['queries'])}")
    out.append(f"- Top K: {payload['top_k']}")
    out.append("")
    out.append("## Aggregate Metrics")
    out.append("")
    aggregate = payload.get("aggregate_metrics") or {}
    for name in sorted(aggregate):
        out.append(f"- {name}: {aggregate[name]}")
    overall_dist = payload.get("aggregate_distribution") or {}
    if overall_dist:
        out.append("")
        out.append("## Label Distribution")
        out.append("")
        out.append(f"- Exact: {overall_dist.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {overall_dist.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {overall_dist.get(RELEVANCE_IRRELEVANT, 0)}")
    out.extend(["", "## Per Query", ""])
    for row in payload.get("per_query") or []:
        out.append(f"### {row['query']}")
        out.append("")
        row_metrics = row.get("metrics") or {}
        for name in sorted(row_metrics):
            out.append(f"- {name}: {row_metrics[name]}")
        row_dist = row.get("distribution") or {}
        out.append(f"- Exact: {row_dist.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {row_dist.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {row_dist.get(RELEVANCE_IRRELEVANT, 0)}")
        out.append("")
    return "\n".join(out)
... ...
scripts/evaluation/eval_framework/static/eval_web.css 0 โ†’ 100644
... ... @@ -0,0 +1,91 @@
/* Evaluation UI stylesheet. Warm paper palette with a teal accent. The
   .Exact / .Partial / .Irrelevant badge classes must match the relevance
   label strings that eval_web.js interpolates into class="badge ${label}". */

/* Theme variables. NOTE(review): --bg is declared but the body background
   below uses explicit gradients — confirm whether --bg is still used. */
:root {
  --bg: #f5f3ed;
  --panel: #fffdf8;
  --ink: #1f2a24;
  --muted: #6b756e;
  --line: #ddd4c6;
  --accent: #0f766e;
  --exact: #0f766e;
  --partial: #b7791f;
  --irrelevant: #b42318;
}

/* Page scaffold: full-height grid with fixed 280px sidebar + fluid main. */
body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background:
  radial-gradient(circle at top left, #f0e6d6 0, transparent 28%),
  linear-gradient(180deg, #f9f6f0 0%, #f0ece3 100%); }
.app { display: grid; grid-template-columns: 280px 1fr; min-height: 100vh; }
.sidebar { border-right: 1px solid var(--line); padding: 20px; background: rgba(255,255,255,0.55); backdrop-filter: blur(10px); }
.main { padding: 24px; }
h1, h2 { margin: 0 0 12px; }
.muted { color: var(--muted); }

/* Sidebar query list (buttons populated by loadQueries). */
.query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; }
.query-item {
  display: block; width: 100%; border: 0; background: transparent; text-align: left;
  padding: 10px 12px; border-radius: 10px; cursor: pointer;
  color: var(--ink); font-size: 15px; font-weight: 500;
}
.query-item:hover { background: #eef6f4; }

/* Toolbar: query text input plus action buttons. */
.toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; }
input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; }
button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; }
button.secondary { background: #d9e6e3; color: #12433d; }

/* Metric summary cards rendered by renderMetrics. */
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; }
.metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; }
.metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; }
.metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; }

/* Result cards: badge column | thumbnail | title + option values. */
.results { display: grid; gap: 10px; }
.result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }
.badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }
.Exact { background: var(--exact); }
.Partial { background: var(--partial); }
.Irrelevant { background: var(--irrelevant); }
.Unknown { background: #637381; }
.thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }
.title { font-size: 16px; font-weight: 700; margin-bottom: 8px; }
.options { color: var(--muted); line-height: 1.5; font-size: 14px; }
.section { margin-bottom: 28px; }

/* Batch-run history list (buttons that open the report modal). */
.history { font-size: 13px; line-height: 1.5; }
.history-list { max-height: 42vh; overflow: auto; display: flex; flex-direction: column; gap: 8px; margin-top: 8px; }
.history-item {
  display: block; width: 100%; border: 1px solid var(--line); background: var(--panel);
  text-align: left; padding: 10px 12px; border-radius: 12px; cursor: pointer;
  color: var(--ink); font-size: 13px; transition: background 0.15s, border-color 0.15s, box-shadow 0.15s;
}
.history-item:hover { background: #eef6f4; border-color: #b8d4cd; }
.history-item:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; }
.history-item .hid { font-weight: 700; font-size: 12px; word-break: break-all; color: #12433d; }
.history-item .hmeta { color: var(--muted); font-size: 11px; margin-top: 4px; }
.history-item .hstats { margin-top: 6px; font-size: 12px; color: var(--ink); line-height: 1.45; }
.history-item .hstats span { color: var(--muted); }

/* Batch report modal: hidden by default, shown via the .is-open class. */
.report-modal-root {
  position: fixed; inset: 0; z-index: 200; display: none; align-items: center; justify-content: center;
  padding: 16px; box-sizing: border-box;
}
.report-modal-root.is-open { display: flex; }
.report-modal-backdrop { position: absolute; inset: 0; background: rgba(31, 42, 36, 0.45); backdrop-filter: blur(4px); }
.report-modal-dialog {
  position: relative; z-index: 1; width: min(920px, 100%); max-height: min(92vh, 900px); display: flex; flex-direction: column;
  background: var(--panel); border: 1px solid var(--line); border-radius: 18px;
  box-shadow: 0 24px 48px rgba(31, 42, 36, 0.18);
}
.report-modal-head {
  flex: 0 0 auto; display: flex; align-items: flex-start; justify-content: space-between; gap: 12px;
  padding: 16px 18px; border-bottom: 1px solid var(--line);
}
.report-modal-head h3 { margin: 0; font-size: 15px; font-weight: 700; word-break: break-all; }
.report-modal-head .head-actions { display: flex; gap: 8px; flex-shrink: 0; }
.report-modal-head button { padding: 8px 12px; font-size: 13px; border-radius: 10px; }
.report-modal-meta { flex: 0 0 auto; padding: 10px 18px; font-size: 12px; border-bottom: 1px solid var(--line); background: rgba(255,253,248,0.9); }
.report-modal-body {
  flex: 1 1 auto; overflow: auto; padding: 18px 22px 22px;
  font-size: 14px; line-height: 1.55;
}

/* Typography for the sanitized Markdown injected into the modal body. */
.batch-report-md h1 { font-size: 1.35rem; margin: 0 0 0.75rem; color: #12433d; }
.batch-report-md h2 { font-size: 1.05rem; margin: 1.35rem 0 0.6rem; padding-bottom: 0.35rem; border-bottom: 1px solid var(--line); color: #1a5249; }
.batch-report-md h2:first-of-type { margin-top: 0; }
.batch-report-md h3 { font-size: 0.95rem; margin: 1rem 0 0.4rem; color: var(--ink); font-weight: 700; }
.batch-report-md ul { margin: 0.35rem 0 0.5rem; padding-left: 1.25rem; }
.batch-report-md li { margin: 0.2rem 0; }
.batch-report-md code { font-size: 0.88em; background: #e8e4d8; padding: 0.12em 0.35em; border-radius: 4px; }
.report-modal-body.report-modal-loading, .report-modal-body.report-modal-error { color: var(--muted); font-style: italic; }

/* Tips panel shown under single-query evaluation results. */
.tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; }
.tip { margin-bottom: 6px; color: var(--muted); }
... ...
scripts/evaluation/eval_framework/static/eval_web.js 0 โ†’ 100644
... ... @@ -0,0 +1,181 @@
async function fetchJSON(url, options) {
  // Fetch `url` and parse the JSON body; reject with the response text
  // on any non-2xx status.
  const response = await fetch(url, options);
  if (response.ok) {
    return await response.json();
  }
  throw new Error(await response.text());
}
function renderMetrics(metrics) {
  // Rebuild the metric cards grid (#metrics) from a {name: value} map.
  const host = document.getElementById('metrics');
  host.innerHTML = '';
  for (const [name, value] of Object.entries(metrics || {})) {
    const card = document.createElement('div');
    card.className = 'metric';
    card.innerHTML = `<div class="label">${name}</div><div class="value">${value}</div>`;
    host.appendChild(card);
  }
}
function renderResults(results, rootId='results', showRank=true) {
  // Render labeled result cards into #rootId. All data-derived strings are
  // HTML-escaped before innerHTML interpolation so product titles/options
  // coming back from the backend cannot inject markup (stored XSS).
  const esc = (value) => String(value).replace(/[&<>"']/g, (ch) => ({
    '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;',
  }[ch]));
  const mount = document.getElementById(rootId);
  mount.innerHTML = '';
  (results || []).forEach(item => {
    const label = item.label || 'Unknown';
    // Second line under the badge: rank for live results, rerank score (or
    // "not recalled") for the offline rerank view.
    let meta;
    if (showRank) {
      meta = `#${item.rank || '-'}`;
    } else if (item.rerank_score != null) {
      meta = `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}`;
    } else {
      meta = 'not recalled';
    }
    const box = document.createElement('div');
    box.className = 'result';
    box.innerHTML = `
      <div><span class="badge ${esc(label)}">${esc(label)}</span><div class="muted" style="margin-top:8px">${esc(meta)}</div></div>
      <img class="thumb" src="${esc(item.image_url || '')}" alt="" />
      <div>
        <div class="title">${esc(item.title || '')}</div>
        <div class="options">
          <div>${esc((item.option_values || [])[0] || '')}</div>
          <div>${esc((item.option_values || [])[1] || '')}</div>
          <div>${esc((item.option_values || [])[2] || '')}</div>
        </div>
      </div>`;
    mount.appendChild(box);
  });
  if (!(results || []).length) {
    mount.innerHTML = '<div class="muted">None.</div>';
  }
}
function renderTips(data) {
  // Show server-provided tips plus a leading summary line built from
  // label_stats. NOTE(review): tips come from our own backend and are
  // injected unescaped; confirm they can never carry user-controlled HTML.
  const host = document.getElementById('tips');
  const stats = data.label_stats || {};
  const summary = `Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`;
  const tips = [summary, ...(data.tips || [])];
  host.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');
}
async function loadQueries() {
  // Populate the sidebar query list; clicking an entry fills the input
  // and re-runs the single-query evaluation.
  const payload = await fetchJSON('/api/queries');
  const listEl = document.getElementById('queryList');
  listEl.innerHTML = '';
  for (const query of payload.queries) {
    const entry = document.createElement('button');
    entry.className = 'query-item';
    entry.textContent = query;
    entry.onclick = () => {
      document.getElementById('queryInput').value = query;
      runSingle();
    };
    listEl.appendChild(entry);
  }
}
  62 + function fmtMetric(m, key, digits) {
  63 + const v = m && m[key];
  64 + if (v == null || Number.isNaN(Number(v))) return null;
  65 + const n = Number(v);
  66 + return n.toFixed(digits);
  67 + }
function historySummaryHtml(meta) {
  // One-line metrics summary for a history entry; '' when nothing to show.
  const metrics = meta && meta.aggregate_metrics;
  const queryCount =
    (meta && meta.queries && meta.queries.length) ||
    (meta && meta.per_query && meta.per_query.length) ||
    null;
  const parts = [];
  if (queryCount != null) parts.push(`<span>Queries</span> ${queryCount}`);
  for (const key of ['P@10', 'P@5_2_3', 'MAP_3']) {
    const text = fmtMetric(metrics, key, 3);
    if (text) parts.push(`<span>${key}</span> ${text}`);
  }
  if (!parts.length) return '';
  return `<div class="hstats">${parts.join(' ยท ')}</div>`;
}
async function loadHistory() {
  // Render batch-run history entries as buttons that open the report modal.
  const payload = await fetchJSON('/api/history');
  const host = document.getElementById('history');
  host.classList.remove('muted');
  const entries = payload.history || [];
  if (!entries.length) {
    host.innerHTML = '<span class="muted">No history yet.</span>';
    return;
  }
  host.innerHTML = `<div class="history-list"></div>`;
  const list = host.querySelector('.history-list');
  for (const entry of entries) {
    const btn = document.createElement('button');
    btn.type = 'button';
    btn.className = 'history-item';
    btn.setAttribute('aria-label', `Open report ${entry.batch_id}`);
    const summary = historySummaryHtml(entry.metadata);
    btn.innerHTML = `<div class="hid">${entry.batch_id}</div>
      <div class="hmeta">${entry.created_at} ยท tenant ${entry.tenant_id}</div>${summary}`;
    btn.onclick = () => openBatchReport(entry.batch_id);
    list.appendChild(btn);
  }
}
// Absolute path of the most recently opened report (for copy-to-clipboard).
let _lastReportPath = '';
function closeReportModal() {
  // Hide the report modal and clear its contents for the next open.
  const modal = document.getElementById('reportModal');
  modal.classList.remove('is-open');
  modal.setAttribute('aria-hidden', 'true');
  document.getElementById('reportModalBody').innerHTML = '';
  document.getElementById('reportModalMeta').textContent = '';
}
// Fetch a batch run's markdown report and render it (sanitized) in the modal.
async function openBatchReport(batchId) {
  const modal = document.getElementById('reportModal');
  const body = document.getElementById('reportModalBody');
  const metaEl = document.getElementById('reportModalMeta');
  const titleEl = document.getElementById('reportModalTitle');
  modal.classList.add('is-open');
  modal.setAttribute('aria-hidden', 'false');
  titleEl.textContent = batchId;
  metaEl.textContent = '';
  body.className = 'report-modal-body batch-report-md report-modal-loading';
  body.textContent = 'Loading reportโ€ฆ';
  try {
    const rep = await fetchJSON('/api/history/' + encodeURIComponent(batchId) + '/report');
    const path = rep.report_markdown_path || '';
    _lastReportPath = path;
    metaEl.textContent = path;
    // Render markdown first, then sanitize before touching the DOM.
    const rendered = marked.parse(rep.markdown || '', { gfm: true });
    const sanitized = DOMPurify.sanitize(rendered, { USE_PROFILES: { html: true } });
    body.className = 'report-modal-body batch-report-md';
    body.innerHTML = sanitized;
  } catch (err) {
    body.className = 'report-modal-body report-modal-error';
    body.textContent = (err && err.message) ? err.message : String(err);
  }
}
// Close the modal when the backdrop (marked data-close-report="1") is clicked.
document.getElementById('reportModal').addEventListener('click', (ev) => {
  if (ev.target && ev.target.getAttribute('data-close-report') === '1') closeReportModal();
});
// Escape closes the modal; a no-op when it is already hidden.
document.addEventListener('keydown', (ev) => {
  if (ev.key === 'Escape') closeReportModal();
});
// Copy the last opened report's file path; clipboard failures are ignored on purpose.
document.getElementById('reportCopyPath').addEventListener('click', async () => {
  if (!_lastReportPath) return;
  try {
    await navigator.clipboard.writeText(_lastReportPath);
  } catch (_) {}
});
// Evaluate the query currently in the input box against the live search service
// and render metrics, top results, misses, and tips.
async function runSingle() {
  const query = document.getElementById('queryInput').value.trim();
  if (!query) return;
  const status = document.getElementById('status');
  status.textContent = `Evaluating "${query}"...`;
  const payload = {query, top_k: 100, auto_annotate: false};
  const data = await fetchJSON('/api/search-eval', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify(payload)
  });
  status.textContent = `Done. total=${data.total}`;
  renderMetrics(data.metrics);
  renderResults(data.results, 'results', true);
  renderResults(data.missing_relevant, 'missingRelevant', false);
  renderTips(data);
  loadHistory();
}
// Run batch evaluation over the configured query set and show aggregate metrics.
async function runBatch() {
  const status = document.getElementById('status');
  status.textContent = 'Running batch evaluation...';
  const payload = {top_k: 100, auto_annotate: false};
  const data = await fetchJSON('/api/batch-eval', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify(payload)
  });
  status.textContent = `Batch done. report=${data.batch_id}`;
  renderMetrics(data.aggregate_metrics);
  // Batch mode has no single-query detail view; clear the per-query panels.
  renderResults([], 'results', true);
  renderResults([], 'missingRelevant', false);
  document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>';
  loadHistory();
}
// Initial page load: populate the query sidebar and the run history list.
loadQueries();
loadHistory();
... ...
scripts/evaluation/eval_framework/static/index.html 0 โ†’ 100644
... ... @@ -0,0 +1,70 @@
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Search Evaluation</title>
  <link rel="stylesheet" href="/static/eval_web.css" />

</head>
<body>
  <!-- Two-pane layout: query/history sidebar + evaluation main panel.
       All behavior lives in /static/eval_web.js; the backend serves JSON APIs. -->
  <div class="app">
    <aside class="sidebar">
      <h2>Queries</h2>
      <p class="muted">Loaded from <code>scripts/evaluation/queries/queries.txt</code></p>
      <div id="queryList" class="query-list"></div>
      <div class="section">
        <h2>History</h2>
        <p class="muted" style="font-size:12px;margin:0 0 4px">Click a run to open the batch markdown report.</p>
        <div id="history" class="history muted">Loading...</div>
      </div>
    </aside>
    <main class="main">
      <h1>Search Evaluation</h1>
      <p class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>
      <div class="toolbar">
        <input id="queryInput" type="text" placeholder="Search query" />
        <button onclick="runSingle()">Evaluate Query</button>
        <button class="secondary" onclick="runBatch()">Batch Evaluation</button>
      </div>
      <div id="status" class="muted section"></div>
      <section class="section">
        <h2>Metrics</h2>
        <div id="metrics" class="grid"></div>
      </section>
      <section class="section">
        <h2>Top Results</h2>
        <div id="results" class="results"></div>
      </section>
      <section class="section">
        <h2>Missed Exact / Partial</h2>
        <div id="missingRelevant" class="results"></div>
      </section>
      <section class="section">
        <h2>Notes</h2>
        <div id="tips" class="tips muted"></div>
      </section>
    </main>
  </div>
  <!-- Modal showing a batch run's markdown report, rendered client-side
       via marked + DOMPurify (see eval_web.js openBatchReport). -->
  <div id="reportModal" class="report-modal-root" aria-hidden="true">
    <div class="report-modal-backdrop" data-close-report="1"></div>
    <div class="report-modal-dialog" role="dialog" aria-modal="true" aria-labelledby="reportModalTitle">
      <div class="report-modal-head">
        <h3 id="reportModalTitle">Batch report</h3>
        <div class="head-actions">
          <button type="button" class="secondary" id="reportCopyPath">Copy path</button>
          <button type="button" onclick="closeReportModal()">Close</button>
        </div>
      </div>
      <div id="reportModalMeta" class="report-modal-meta muted"></div>
      <div id="reportModalBody" class="report-modal-body batch-report-md"></div>
    </div>
  </div>



  <script src="https://cdn.jsdelivr.net/npm/marked@12.0.2/marked.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/dompurify@3.1.6/dist/purify.min.js"></script>
  <script src="/static/eval_web.js"></script>
</body>
</html>
0 71 \ No newline at end of file
... ...
scripts/evaluation/eval_framework/store.py 0 โ†’ 100644
... ... @@ -0,0 +1,426 @@
  1 +"""SQLite persistence for evaluation corpus, labels, rerank scores, and run metadata."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import json
  6 +import sqlite3
  7 +from dataclasses import dataclass
  8 +from pathlib import Path
  9 +from typing import Any, Dict, List, Optional, Sequence
  10 +
  11 +from .constants import VALID_LABELS
  12 +from .utils import ensure_dir, safe_json_dumps, utc_now_iso
  13 +
  14 +
@dataclass
class QueryBuildResult:
    """Summary of one per-query annotation-set build run.

    Field meanings are inferred from the names and surrounding module; confirm
    against the call site in framework.py where instances are constructed.
    """

    query: str  # the evaluated search query text
    tenant_id: str  # tenant whose corpus/labels were used
    search_total: int  # total hits reported by the search backend
    search_depth: int  # how many results were pulled into the pool
    rerank_corpus_size: int  # number of candidate docs sent to the reranker
    annotated_count: int  # number of docs that received a relevance label
    output_json_path: Path  # where the per-query annotation JSON was written
  25 +
class EvalStore:
    """SQLite-backed store for evaluation data.

    Holds the pooled corpus, cached rerank scores, relevance labels, query
    profiles, and metadata for build/batch runs. A single connection is
    shared across threads (``check_same_thread=False``) because the web app
    and CLI both call in; every mutating method commits before returning.
    """

    def __init__(self, db_path: Path):
        self.db_path = db_path
        ensure_dir(db_path.parent)
        # NOTE(review): cross-thread use relies on SQLite's internal locking;
        # callers should avoid concurrent writes to the same key ranges.
        self.conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self.conn.row_factory = sqlite3.Row
        self._init_schema()

    def _init_schema(self) -> None:
        """Create all tables if they do not exist (idempotent)."""
        self.conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS corpus_docs (
                tenant_id TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                title_json TEXT,
                vendor_json TEXT,
                category_path_json TEXT,
                category_name_json TEXT,
                image_url TEXT,
                skus_json TEXT,
                tags_json TEXT,
                raw_json TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, spu_id)
            );

            CREATE TABLE IF NOT EXISTS rerank_scores (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                score REAL NOT NULL,
                model_name TEXT,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, spu_id)
            );

            CREATE TABLE IF NOT EXISTS relevance_labels (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                label TEXT NOT NULL,
                judge_model TEXT,
                raw_response TEXT,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, spu_id)
            );

            CREATE TABLE IF NOT EXISTS build_runs (
                run_id TEXT PRIMARY KEY,
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                output_json_path TEXT NOT NULL,
                metadata_json TEXT NOT NULL,
                created_at TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS batch_runs (
                batch_id TEXT PRIMARY KEY,
                tenant_id TEXT NOT NULL,
                output_json_path TEXT NOT NULL,
                report_markdown_path TEXT NOT NULL,
                config_snapshot_path TEXT NOT NULL,
                metadata_json TEXT NOT NULL,
                created_at TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS query_profiles (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                prompt_version TEXT NOT NULL,
                judge_model TEXT,
                profile_json TEXT NOT NULL,
                raw_response TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, prompt_version)
            );
            """
        )
        self.conn.commit()

    def upsert_corpus_docs(self, tenant_id: str, docs: Sequence[Dict[str, Any]]) -> None:
        """Insert or refresh corpus documents.

        The full document is preserved in ``raw_json``; the sibling columns
        are denormalized copies for inspection/debugging.
        """
        if not docs:
            return  # nothing to write; skip the no-op executemany + commit
        now = utc_now_iso()
        rows = [
            (
                tenant_id,
                str(doc.get("spu_id") or ""),
                safe_json_dumps(doc.get("title")),
                safe_json_dumps(doc.get("vendor")),
                safe_json_dumps(doc.get("category_path")),
                safe_json_dumps(doc.get("category_name")),
                str(doc.get("image_url") or ""),
                safe_json_dumps(doc.get("skus") or []),
                safe_json_dumps(doc.get("tags") or []),
                safe_json_dumps(doc),
                now,
            )
            for doc in docs
        ]
        self.conn.executemany(
            """
            INSERT INTO corpus_docs (
                tenant_id, spu_id, title_json, vendor_json, category_path_json, category_name_json,
                image_url, skus_json, tags_json, raw_json, updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, spu_id) DO UPDATE SET
                title_json=excluded.title_json,
                vendor_json=excluded.vendor_json,
                category_path_json=excluded.category_path_json,
                category_name_json=excluded.category_name_json,
                image_url=excluded.image_url,
                skus_json=excluded.skus_json,
                tags_json=excluded.tags_json,
                raw_json=excluded.raw_json,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_corpus_docs(self, tenant_id: str) -> List[Dict[str, Any]]:
        """Return all corpus documents for a tenant, ordered by spu_id."""
        rows = self.conn.execute(
            "SELECT raw_json FROM corpus_docs WHERE tenant_id=? ORDER BY spu_id",
            (tenant_id,),
        ).fetchall()
        return [json.loads(row["raw_json"]) for row in rows]

    def get_corpus_docs_by_spu_ids(self, tenant_id: str, spu_ids: Sequence[str]) -> Dict[str, Dict[str, Any]]:
        """Fetch documents keyed by spu_id; unknown/blank ids are silently skipped."""
        keys = [str(spu_id) for spu_id in spu_ids if str(spu_id).strip()]
        if not keys:
            return {}
        placeholders = ",".join("?" for _ in keys)
        rows = self.conn.execute(
            f"""
            SELECT spu_id, raw_json
            FROM corpus_docs
            WHERE tenant_id=? AND spu_id IN ({placeholders})
            """,
            [tenant_id, *keys],
        ).fetchall()
        return {
            str(row["spu_id"]): json.loads(row["raw_json"])
            for row in rows
        }

    def has_corpus(self, tenant_id: str) -> bool:
        """True when at least one corpus document is stored for the tenant."""
        row = self.conn.execute(
            "SELECT COUNT(1) AS n FROM corpus_docs WHERE tenant_id=?",
            (tenant_id,),
        ).fetchone()
        return bool(row and row["n"] > 0)

    def get_rerank_scores(self, tenant_id: str, query_text: str) -> Dict[str, float]:
        """Return cached reranker scores as {spu_id: score} for one query."""
        rows = self.conn.execute(
            """
            SELECT spu_id, score
            FROM rerank_scores
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchall()
        return {str(row["spu_id"]): float(row["score"]) for row in rows}

    def upsert_rerank_scores(
        self,
        tenant_id: str,
        query_text: str,
        scores: Dict[str, float],
        model_name: str,
    ) -> None:
        """Insert or refresh reranker scores for one (tenant, query) pair."""
        if not scores:
            return  # nothing to write
        now = utc_now_iso()
        rows = [
            (tenant_id, query_text, spu_id, float(score), model_name, now)
            for spu_id, score in scores.items()
        ]
        self.conn.executemany(
            """
            INSERT INTO rerank_scores (tenant_id, query_text, spu_id, score, model_name, updated_at)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET
                score=excluded.score,
                model_name=excluded.model_name,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_labels(self, tenant_id: str, query_text: str) -> Dict[str, str]:
        """Return relevance labels as {spu_id: label} for one query."""
        rows = self.conn.execute(
            """
            SELECT spu_id, label
            FROM relevance_labels
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchall()
        return {str(row["spu_id"]): str(row["label"]) for row in rows}

    def upsert_labels(
        self,
        tenant_id: str,
        query_text: str,
        labels: Dict[str, str],
        judge_model: str,
        raw_response: str,
    ) -> None:
        """Insert or refresh relevance labels for one query.

        Raises:
            ValueError: if any label is not in VALID_LABELS (validated before
                any row is written, so the table is never partially updated).
        """
        now = utc_now_iso()
        rows = []
        for spu_id, label in labels.items():
            if label not in VALID_LABELS:
                raise ValueError(f"invalid label: {label}")
            rows.append((tenant_id, query_text, spu_id, label, judge_model, raw_response, now))
        if not rows:
            return  # nothing to write
        self.conn.executemany(
            """
            INSERT INTO relevance_labels (tenant_id, query_text, spu_id, label, judge_model, raw_response, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET
                label=excluded.label,
                judge_model=excluded.judge_model,
                raw_response=excluded.raw_response,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_query_profile(self, tenant_id: str, query_text: str, prompt_version: str) -> Optional[Dict[str, Any]]:
        """Return the cached query profile for a prompt version, or None."""
        row = self.conn.execute(
            """
            SELECT profile_json
            FROM query_profiles
            WHERE tenant_id=? AND query_text=? AND prompt_version=?
            """,
            (tenant_id, query_text, prompt_version),
        ).fetchone()
        if not row:
            return None
        return json.loads(row["profile_json"])

    def upsert_query_profile(
        self,
        tenant_id: str,
        query_text: str,
        prompt_version: str,
        judge_model: str,
        profile: Dict[str, Any],
        raw_response: str,
    ) -> None:
        """Insert or replace the judge-generated profile for one query."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO query_profiles
            (tenant_id, query_text, prompt_version, judge_model, profile_json, raw_response, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                tenant_id,
                query_text,
                prompt_version,
                judge_model,
                safe_json_dumps(profile),
                raw_response,
                utc_now_iso(),
            ),
        )
        self.conn.commit()

    def insert_build_run(self, run_id: str, tenant_id: str, query_text: str, output_json_path: Path, metadata: Dict[str, Any]) -> None:
        """Record (or overwrite) one per-query annotation build run."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO build_runs (run_id, tenant_id, query_text, output_json_path, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (run_id, tenant_id, query_text, str(output_json_path), safe_json_dumps(metadata), utc_now_iso()),
        )
        self.conn.commit()

    def insert_batch_run(
        self,
        batch_id: str,
        tenant_id: str,
        output_json_path: Path,
        report_markdown_path: Path,
        config_snapshot_path: Path,
        metadata: Dict[str, Any],
    ) -> None:
        """Record (or overwrite) one batch evaluation run and its artifact paths."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO batch_runs
            (batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                batch_id,
                tenant_id,
                str(output_json_path),
                str(report_markdown_path),
                str(config_snapshot_path),
                safe_json_dumps(metadata),
                utc_now_iso(),
            ),
        )
        self.conn.commit()

    @staticmethod
    def _batch_run_to_dict(row: sqlite3.Row) -> Dict[str, Any]:
        """Map a batch_runs row to the public dict shape (shared by list/get)."""
        return {
            "batch_id": row["batch_id"],
            "tenant_id": row["tenant_id"],
            "output_json_path": row["output_json_path"],
            "report_markdown_path": row["report_markdown_path"],
            "config_snapshot_path": row["config_snapshot_path"],
            "metadata": json.loads(row["metadata_json"]),
            "created_at": row["created_at"],
        }

    def list_batch_runs(self, limit: int = 20) -> List[Dict[str, Any]]:
        """Return the most recent batch runs, newest first."""
        rows = self.conn.execute(
            """
            SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
            FROM batch_runs
            ORDER BY created_at DESC
            LIMIT ?
            """,
            (limit,),
        ).fetchall()
        return [self._batch_run_to_dict(row) for row in rows]

    def get_batch_run(self, batch_id: str) -> Optional[Dict[str, Any]]:
        """Return one batch run by id, or None if unknown."""
        row = self.conn.execute(
            """
            SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
            FROM batch_runs
            WHERE batch_id = ?
            """,
            (batch_id,),
        ).fetchone()
        if row is None:
            return None
        return self._batch_run_to_dict(row)

    def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]:
        """Per-query label counts (total/Exact/Partial/Irrelevant) for a tenant."""
        rows = self.conn.execute(
            """
            SELECT
                query_text,
                COUNT(*) AS total,
                SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
                SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
                SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
                MAX(updated_at) AS updated_at
            FROM relevance_labels
            WHERE tenant_id=?
            GROUP BY query_text
            ORDER BY query_text
            """,
            (tenant_id,),
        ).fetchall()
        return [
            {
                "query": str(row["query_text"]),
                "total": int(row["total"]),
                "exact_count": int(row["exact_count"] or 0),
                "partial_count": int(row["partial_count"] or 0),
                "irrelevant_count": int(row["irrelevant_count"] or 0),
                "updated_at": row["updated_at"],
            }
            for row in rows
        ]

    def get_query_label_stats(self, tenant_id: str, query_text: str) -> Dict[str, Any]:
        """Label counts for one query; zeros when the query has no labels."""
        row = self.conn.execute(
            """
            SELECT
                COUNT(*) AS total,
                SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
                SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
                SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
                MAX(updated_at) AS updated_at
            FROM relevance_labels
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchone()
        if row is None:
            # Defensive: an aggregate SELECT normally always yields one row.
            return {
                "query": query_text,
                "total": 0,
                "exact_count": 0,
                "partial_count": 0,
                "irrelevant_count": 0,
                "updated_at": None,
            }
        return {
            "query": query_text,
            "total": int(row["total"] or 0),
            "exact_count": int(row["exact_count"] or 0),
            "partial_count": int(row["partial_count"] or 0),
            "irrelevant_count": int(row["irrelevant_count"] or 0),
            "updated_at": row["updated_at"],
        }
scripts/evaluation/eval_framework/utils.py 0 โ†’ 100644
... ... @@ -0,0 +1,145 @@
  1 +"""Small helpers: time, JSON, document text, LLM output parsing."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import hashlib
  6 +import json
  7 +import re
  8 +from datetime import datetime, timezone
  9 +from pathlib import Path
  10 +from typing import Any, Dict, List, Sequence, Tuple
  11 +
  12 +from .constants import PROJECT_ROOT
  13 +
  14 +
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with offset."""
    now = datetime.now(timezone.utc)
    return now.isoformat()
  17 +
  18 +
def utc_timestamp() -> str:
    """Return a compact, filename-safe UTC stamp like ``20240131T235959Z``."""
    stamp = datetime.now(timezone.utc)
    return stamp.strftime("%Y%m%dT%H%M%SZ")
  21 +
  22 +
def ensure_dir(path: Path) -> Path:
    """Create *path* (with parents) when missing and return it unchanged."""
    if not path.is_dir():
        path.mkdir(parents=True, exist_ok=True)
    return path
  26 +
  27 +
def sha1_text(text: str) -> str:
    """Hex SHA-1 of *text* (UTF-8) — used for stable ids, not security."""
    digest = hashlib.sha1(text.encode("utf-8"))
    return digest.hexdigest()
  30 +
  31 +
def pick_text(value: Any, preferred_lang: str = "en") -> str:
    """Extract a display string from *value*.

    Dicts are treated as language maps: try *preferred_lang*, then "en",
    then "zh", then the first truthy value. Anything else is stringified.
    The result is always stripped; None yields "".
    """
    if value is None:
        return ""
    if not isinstance(value, dict):
        return str(value).strip()
    for candidate in (value.get(preferred_lang), value.get("en"), value.get("zh")):
        if candidate:
            return str(candidate).strip()
    fallback = next((v for v in value.values() if v), "")
    return str(fallback).strip()
  42 + return str(value).strip()
  43 +
  44 +
def safe_json_dumps(data: Any) -> str:
    """Serialize *data* to compact JSON, keeping non-ASCII characters literal."""
    return json.dumps(
        data,
        ensure_ascii=False,
        separators=(",", ":"),
    )
  47 +
  48 +
def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
    """Return stripped (option1, option2, option3) values from the first SKU.

    Empty strings stand in for a missing SKU list or missing option fields.
    """
    if not skus:
        return "", "", ""
    first = skus[0] or {}
    values = [
        str(first.get(key) or "").strip()
        for key in ("option1_value", "option2_value", "option3_value")
    ]
    return values[0], values[1], values[2]
  58 +
  59 +
def build_display_title(doc: Dict[str, Any]) -> str:
    """Combine English and Chinese titles as ``"EN / ZH"`` when both exist and differ."""
    raw_title = doc.get("title")
    english = pick_text(raw_title, "en")
    chinese = pick_text(raw_title, "zh")
    if english and chinese and english != chinese:
        return f"{english} / {chinese}"
    return english or chinese
  67 +
  68 +
def build_rerank_doc(doc: Dict[str, Any]) -> str:
    """Document text sent to the reranker: the display title capped at 400 chars."""
    return build_display_title(doc)[:400]
  72 +
  73 +
def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """Render one numbered ``field=value | ...`` line describing *doc*.

    Used in the judge prompt: title first, then any non-empty option values,
    vendor, category, and up to four tags.
    """
    option1, option2, option3 = compact_option_values(doc.get("skus") or [])
    tags = doc.get("tags") or []
    tag_text = ", ".join(str(tag) for tag in tags[:4] if tag)
    fields = [build_display_title(doc)]
    for label, value in (("option1", option1), ("option2", option2), ("option3", option3)):
        if value:
            fields.append(f"{label}={value}")
    vendor = pick_text(doc.get("vendor"), "en")
    if vendor:
        fields.append(f"vendor={vendor}")
    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    if category:
        fields.append(f"category={category}")
    if tag_text:
        fields.append(f"tags={tag_text}")
    return f"{idx}. " + " | ".join(field for field in fields if field)
  95 +
  96 +
def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Trim a corpus doc down to the fields the UI and judge prompts need."""
    skus = doc.get("skus") or []
    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    payload: Dict[str, Any] = {
        "spu_id": str(doc.get("spu_id") or ""),
        "title": build_display_title(doc),
        "image_url": doc.get("image_url"),
        "vendor": pick_text(doc.get("vendor"), "en"),
    }
    payload["category"] = category
    payload["option_values"] = list(compact_option_values(skus))
    payload["tags"] = list((doc.get("tags") or [])[:6])
    return payload
  107 +
  108 +
def normalize_text(text: Any) -> str:
    """Lowercase, strip, and collapse internal whitespace to single spaces."""
    collapsed = re.sub(r"\s+", " ", str(text or "").strip())
    return collapsed.lower()
  113 +
  114 +
def extract_json_blob(text: str) -> Any:
    """Best-effort JSON extraction from an LLM reply.

    Tries, in order: the whole (stripped) text, each ```json fenced block,
    then every ``{``/``[`` … ``}``/``]`` delimited substring (longest span
    first for each start position).

    Raises:
        ValueError: when nothing in *text* parses as JSON.
    """
    stripped = str(text or "").strip()
    candidates: List[str] = [stripped]
    for fenced in re.findall(r"```(?:json)?\s*(.*?)```", stripped, flags=re.S | re.I):
        fenced = fenced.strip()
        if fenced:
            candidates.append(fenced)

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except Exception:
            continue

    open_positions = [i for i, ch in enumerate(stripped) if ch in "[{"]
    close_positions = [i for i, ch in enumerate(stripped) if ch in "]}"]
    for begin in open_positions:
        for finish in reversed(close_positions):
            if finish <= begin:
                continue
            try:
                return json.loads(stripped[begin : finish + 1])
            except Exception:
                continue
    raise ValueError(f"failed to parse json from: {stripped[:500]!r}")
  139 +
  140 +
def ensure_project_on_path() -> None:
    """Prepend the repository root to ``sys.path`` so project imports resolve."""
    import sys

    root = str(PROJECT_ROOT)
    if root not in sys.path:
        sys.path.insert(0, root)
... ...
scripts/evaluation/eval_framework/web_app.py 0 โ†’ 100644
... ... @@ -0,0 +1,85 @@
  1 +"""FastAPI app for the search evaluation UI (static frontend + JSON APIs)."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from pathlib import Path
  6 +from typing import Any, Dict
  7 +
  8 +from fastapi import FastAPI, HTTPException
  9 +from fastapi.responses import HTMLResponse
  10 +from fastapi.staticfiles import StaticFiles
  11 +
  12 +from .api_models import BatchEvalRequest, SearchEvalRequest
  13 +from .constants import DEFAULT_QUERY_FILE
  14 +from .framework import SearchEvaluationFramework
  15 +
  16 +_STATIC_DIR = Path(__file__).resolve().parent / "static"
  17 +
  18 +
def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFAULT_QUERY_FILE) -> FastAPI:
    """Build the FastAPI app serving the static UI plus the evaluation JSON APIs.

    Args:
        framework: Orchestrator that runs evaluations and owns the SQLite store.
        query_file: Text file of queries used when a request supplies none.

    Returns:
        A configured FastAPI application (not bound to a server).
    """
    app = FastAPI(title="Search Evaluation UI", version="1.0.0")

    app.mount(
        "/static",
        StaticFiles(directory=str(_STATIC_DIR)),
        name="static",
    )

    index_path = _STATIC_DIR / "index.html"

    @app.get("/", response_class=HTMLResponse)
    def home() -> str:
        # Serve the single-page UI shell; assets load via /static.
        return index_path.read_text(encoding="utf-8")

    @app.get("/api/queries")
    def api_queries() -> Dict[str, Any]:
        # Re-read the query file on every call (no caching).
        return {"queries": framework.queries_from_file(query_file)}

    @app.post("/api/search-eval")
    def api_search_eval(request: SearchEvalRequest) -> Dict[str, Any]:
        # Live single-query evaluation against the search backend.
        return framework.evaluate_live_query(
            query=request.query,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
        )

    @app.post("/api/batch-eval")
    def api_batch_eval(request: BatchEvalRequest) -> Dict[str, Any]:
        # Fall back to the query file when the request carries no explicit queries.
        queries = request.queries or framework.queries_from_file(query_file)
        if not queries:
            raise HTTPException(status_code=400, detail="No queries provided")
        return framework.batch_evaluate(
            queries=queries,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
            force_refresh_labels=request.force_refresh_labels,
        )

    @app.get("/api/history")
    def api_history() -> Dict[str, Any]:
        # Most recent batch runs first, capped at 20 entries.
        return {"history": framework.store.list_batch_runs(limit=20)}

    @app.get("/api/history/{batch_id}/report")
    def api_history_report(batch_id: str) -> Dict[str, Any]:
        # Look up the run, then serve its markdown report from disk.
        row = framework.store.get_batch_run(batch_id)
        if row is None:
            raise HTTPException(status_code=404, detail="Unknown batch_id")
        report_path = Path(row["report_markdown_path"]).resolve()
        root = framework.artifact_root.resolve()
        # Path-traversal guard: only files under the artifact root are served.
        try:
            report_path.relative_to(root)
        except ValueError:
            raise HTTPException(status_code=403, detail="Report path is outside artifact root")
        if not report_path.is_file():
            raise HTTPException(status_code=404, detail="Report file not found")
        return {
            "batch_id": row["batch_id"],
            "created_at": row["created_at"],
            "tenant_id": row["tenant_id"],
            "report_markdown_path": str(report_path),
            "markdown": report_path.read_text(encoding="utf-8"),
        }

    return app
... ...