Commit c81b0fc12093cca9a8eef590e545a71a3fc2cd1c
1 parent
7b8d9e1a
scripts/evaluation/eval_framework
Showing
18 changed files
with
2348 additions
and
2144 deletions
Show diff stats
scripts/evaluation/README.md
| ... | ... | @@ -19,12 +19,12 @@ The framework supports four related tasks: |
| 19 | 19 | |
| 20 | 20 | ## Files |
| 21 | 21 | |
| 22 | -- `eval_framework.py` | |
| 23 | - Search evaluation core implementation, CLI, FastAPI app, SQLite store, audit logic, and report generation. | |
| 22 | +- `eval_framework/` (Python package) | |
| 23 | + Modular layout: `framework.py` (orchestration), `store.py` (SQLite), `clients.py` (search/rerank/LLM), `prompts.py` (judge templates), `metrics.py`, `reports.py`, `web_app.py`, `cli.py`, and `static/` (evaluation UI HTML/CSS/JS). | |
| 24 | 24 | - `build_annotation_set.py` |
| 25 | - Thin CLI entrypoint into `eval_framework.py`. | |
| 25 | + Thin CLI entrypoint into `eval_framework`. | |
| 26 | 26 | - `serve_eval_web.py` |
| 27 | - Thin web entrypoint into `eval_framework.py`. | |
| 27 | + Thin web entrypoint into `eval_framework`. | |
| 28 | 28 | - `tune_fusion.py` |
| 29 | 29 | Fusion experiment runner. It applies config variants, restarts backend, runs batch evaluation, and stores experiment reports. |
| 30 | 30 | - `fusion_experiments_shortlist.json` | ... | ... |
scripts/evaluation/eval_framework.py deleted
| ... | ... | @@ -1,2140 +0,0 @@ |
| 1 | -#!/usr/bin/env python3 | |
| 2 | -""" | |
| 3 | -Search evaluation framework for pooled relevance annotation, live metrics, and reports. | |
| 4 | -""" | |
| 5 | - | |
| 6 | -from __future__ import annotations | |
| 7 | - | |
| 8 | -import argparse | |
| 9 | -import hashlib | |
| 10 | -import json | |
| 11 | -import math | |
| 12 | -import os | |
| 13 | -import re | |
| 14 | -import sqlite3 | |
| 15 | -import sys | |
| 16 | -import time | |
| 17 | -from dataclasses import dataclass | |
| 18 | -from datetime import datetime, timezone | |
| 19 | -from pathlib import Path | |
| 20 | -from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple | |
| 21 | - | |
| 22 | -import requests | |
| 23 | -from elasticsearch.helpers import scan | |
| 24 | -from fastapi import FastAPI, HTTPException | |
| 25 | -from fastapi.responses import HTMLResponse | |
| 26 | -from pydantic import BaseModel, Field | |
| 27 | - | |
| 28 | -PROJECT_ROOT = Path(__file__).resolve().parents[2] | |
| 29 | -if str(PROJECT_ROOT) not in sys.path: | |
| 30 | - sys.path.insert(0, str(PROJECT_ROOT)) | |
| 31 | - | |
| 32 | -from api.app import get_app_config, get_es_client, get_query_parser, init_service | |
| 33 | -from indexer.mapping_generator import get_tenant_index_name | |
| 34 | - | |
| 35 | - | |
| 36 | -RELEVANCE_EXACT = "Exact" | |
| 37 | -RELEVANCE_PARTIAL = "Partial" | |
| 38 | -RELEVANCE_IRRELEVANT = "Irrelevant" | |
| 39 | -VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} | |
| 40 | -DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" | |
| 41 | -DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt" | |
| 42 | -JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" | |
| 43 | -JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" | |
| 44 | -DEFAULT_LABELER_MODE = "simple" | |
| 45 | - | |
| 46 | - | |
| 47 | -def utc_now_iso() -> str: | |
| 48 | - return datetime.now(timezone.utc).isoformat() | |
| 49 | - | |
| 50 | - | |
| 51 | -def utc_timestamp() -> str: | |
| 52 | - return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") | |
| 53 | - | |
| 54 | - | |
| 55 | -def ensure_dir(path: Path) -> Path: | |
| 56 | - path.mkdir(parents=True, exist_ok=True) | |
| 57 | - return path | |
| 58 | - | |
| 59 | - | |
| 60 | -def sha1_text(text: str) -> str: | |
| 61 | - return hashlib.sha1(text.encode("utf-8")).hexdigest() | |
| 62 | - | |
| 63 | - | |
| 64 | -def pick_text(value: Any, preferred_lang: str = "en") -> str: | |
| 65 | - if value is None: | |
| 66 | - return "" | |
| 67 | - if isinstance(value, dict): | |
| 68 | - return str( | |
| 69 | - value.get(preferred_lang) | |
| 70 | - or value.get("en") | |
| 71 | - or value.get("zh") | |
| 72 | - or next((v for v in value.values() if v), "") | |
| 73 | - ).strip() | |
| 74 | - return str(value).strip() | |
| 75 | - | |
| 76 | - | |
| 77 | -def safe_json_dumps(data: Any) -> str: | |
| 78 | - return json.dumps(data, ensure_ascii=False, separators=(",", ":")) | |
| 79 | - | |
| 80 | - | |
| 81 | -def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]: | |
| 82 | - if not skus: | |
| 83 | - return "", "", "" | |
| 84 | - first = skus[0] or {} | |
| 85 | - return ( | |
| 86 | - str(first.get("option1_value") or "").strip(), | |
| 87 | - str(first.get("option2_value") or "").strip(), | |
| 88 | - str(first.get("option3_value") or "").strip(), | |
| 89 | - ) | |
| 90 | - | |
| 91 | - | |
| 92 | -def build_display_title(doc: Dict[str, Any]) -> str: | |
| 93 | - title = doc.get("title") | |
| 94 | - en = pick_text(title, "en") | |
| 95 | - zh = pick_text(title, "zh") | |
| 96 | - if en and zh and en != zh: | |
| 97 | - return f"{en} / {zh}" | |
| 98 | - return en or zh | |
| 99 | - | |
| 100 | - | |
| 101 | -def build_rerank_doc(doc: Dict[str, Any]) -> str: | |
| 102 | - title = build_display_title(doc) | |
| 103 | - return title[:400] | |
| 104 | - | |
| 105 | - | |
| 106 | -def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str: | |
| 107 | - title = build_display_title(doc) | |
| 108 | - option1, option2, option3 = compact_option_values(doc.get("skus") or []) | |
| 109 | - vendor = pick_text(doc.get("vendor"), "en") | |
| 110 | - category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en") | |
| 111 | - tags = doc.get("tags") or [] | |
| 112 | - tags_text = ", ".join(str(tag) for tag in tags[:4] if tag) | |
| 113 | - parts = [title] | |
| 114 | - if option1: | |
| 115 | - parts.append(f"option1={option1}") | |
| 116 | - if option2: | |
| 117 | - parts.append(f"option2={option2}") | |
| 118 | - if option3: | |
| 119 | - parts.append(f"option3={option3}") | |
| 120 | - if vendor: | |
| 121 | - parts.append(f"vendor={vendor}") | |
| 122 | - if category: | |
| 123 | - parts.append(f"category={category}") | |
| 124 | - if tags_text: | |
| 125 | - parts.append(f"tags={tags_text}") | |
| 126 | - return f"{idx}. " + " | ".join(part for part in parts if part) | |
| 127 | - | |
| 128 | - | |
| 129 | -def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]: | |
| 130 | - return { | |
| 131 | - "spu_id": str(doc.get("spu_id") or ""), | |
| 132 | - "title": build_display_title(doc), | |
| 133 | - "image_url": doc.get("image_url"), | |
| 134 | - "vendor": pick_text(doc.get("vendor"), "en"), | |
| 135 | - "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"), | |
| 136 | - "option_values": list(compact_option_values(doc.get("skus") or [])), | |
| 137 | - "tags": list((doc.get("tags") or [])[:6]), | |
| 138 | - } | |
| 139 | - | |
| 140 | - | |
| 141 | -def normalize_text(text: Any) -> str: | |
| 142 | - value = str(text or "").strip().lower() | |
| 143 | - value = re.sub(r"\s+", " ", value) | |
| 144 | - return value | |
| 145 | - | |
| 146 | - | |
| 147 | -def _extract_json_blob(text: str) -> Any: | |
| 148 | - cleaned = str(text or "").strip() | |
| 149 | - candidates: List[str] = [cleaned] | |
| 150 | - fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I) | |
| 151 | - candidates.extend(match.strip() for match in fence_matches if match.strip()) | |
| 152 | - | |
| 153 | - for candidate in candidates: | |
| 154 | - try: | |
| 155 | - return json.loads(candidate) | |
| 156 | - except Exception: | |
| 157 | - pass | |
| 158 | - | |
| 159 | - starts = [idx for idx, ch in enumerate(cleaned) if ch in "[{"] | |
| 160 | - ends = [idx for idx, ch in enumerate(cleaned) if ch in "]}"] | |
| 161 | - for start in starts: | |
| 162 | - for end in reversed(ends): | |
| 163 | - if end <= start: | |
| 164 | - continue | |
| 165 | - fragment = cleaned[start : end + 1] | |
| 166 | - try: | |
| 167 | - return json.loads(fragment) | |
| 168 | - except Exception: | |
| 169 | - continue | |
| 170 | - raise ValueError(f"failed to parse json from: {cleaned[:500]!r}") | |
| 171 | - | |
| 172 | - | |
| 173 | -@dataclass | |
| 174 | -class QueryBuildResult: | |
| 175 | - query: str | |
| 176 | - tenant_id: str | |
| 177 | - search_total: int | |
| 178 | - search_depth: int | |
| 179 | - rerank_corpus_size: int | |
| 180 | - annotated_count: int | |
| 181 | - output_json_path: Path | |
| 182 | - | |
| 183 | - | |
| 184 | -class EvalStore: | |
| 185 | - def __init__(self, db_path: Path): | |
| 186 | - self.db_path = db_path | |
| 187 | - ensure_dir(db_path.parent) | |
| 188 | - self.conn = sqlite3.connect(str(db_path), check_same_thread=False) | |
| 189 | - self.conn.row_factory = sqlite3.Row | |
| 190 | - self._init_schema() | |
| 191 | - | |
| 192 | - def _init_schema(self) -> None: | |
| 193 | - self.conn.executescript( | |
| 194 | - """ | |
| 195 | - CREATE TABLE IF NOT EXISTS corpus_docs ( | |
| 196 | - tenant_id TEXT NOT NULL, | |
| 197 | - spu_id TEXT NOT NULL, | |
| 198 | - title_json TEXT, | |
| 199 | - vendor_json TEXT, | |
| 200 | - category_path_json TEXT, | |
| 201 | - category_name_json TEXT, | |
| 202 | - image_url TEXT, | |
| 203 | - skus_json TEXT, | |
| 204 | - tags_json TEXT, | |
| 205 | - raw_json TEXT NOT NULL, | |
| 206 | - updated_at TEXT NOT NULL, | |
| 207 | - PRIMARY KEY (tenant_id, spu_id) | |
| 208 | - ); | |
| 209 | - | |
| 210 | - CREATE TABLE IF NOT EXISTS rerank_scores ( | |
| 211 | - tenant_id TEXT NOT NULL, | |
| 212 | - query_text TEXT NOT NULL, | |
| 213 | - spu_id TEXT NOT NULL, | |
| 214 | - score REAL NOT NULL, | |
| 215 | - model_name TEXT, | |
| 216 | - updated_at TEXT NOT NULL, | |
| 217 | - PRIMARY KEY (tenant_id, query_text, spu_id) | |
| 218 | - ); | |
| 219 | - | |
| 220 | - CREATE TABLE IF NOT EXISTS relevance_labels ( | |
| 221 | - tenant_id TEXT NOT NULL, | |
| 222 | - query_text TEXT NOT NULL, | |
| 223 | - spu_id TEXT NOT NULL, | |
| 224 | - label TEXT NOT NULL, | |
| 225 | - judge_model TEXT, | |
| 226 | - raw_response TEXT, | |
| 227 | - updated_at TEXT NOT NULL, | |
| 228 | - PRIMARY KEY (tenant_id, query_text, spu_id) | |
| 229 | - ); | |
| 230 | - | |
| 231 | - CREATE TABLE IF NOT EXISTS build_runs ( | |
| 232 | - run_id TEXT PRIMARY KEY, | |
| 233 | - tenant_id TEXT NOT NULL, | |
| 234 | - query_text TEXT NOT NULL, | |
| 235 | - output_json_path TEXT NOT NULL, | |
| 236 | - metadata_json TEXT NOT NULL, | |
| 237 | - created_at TEXT NOT NULL | |
| 238 | - ); | |
| 239 | - | |
| 240 | - CREATE TABLE IF NOT EXISTS batch_runs ( | |
| 241 | - batch_id TEXT PRIMARY KEY, | |
| 242 | - tenant_id TEXT NOT NULL, | |
| 243 | - output_json_path TEXT NOT NULL, | |
| 244 | - report_markdown_path TEXT NOT NULL, | |
| 245 | - config_snapshot_path TEXT NOT NULL, | |
| 246 | - metadata_json TEXT NOT NULL, | |
| 247 | - created_at TEXT NOT NULL | |
| 248 | - ); | |
| 249 | - | |
| 250 | - CREATE TABLE IF NOT EXISTS query_profiles ( | |
| 251 | - tenant_id TEXT NOT NULL, | |
| 252 | - query_text TEXT NOT NULL, | |
| 253 | - prompt_version TEXT NOT NULL, | |
| 254 | - judge_model TEXT, | |
| 255 | - profile_json TEXT NOT NULL, | |
| 256 | - raw_response TEXT NOT NULL, | |
| 257 | - updated_at TEXT NOT NULL, | |
| 258 | - PRIMARY KEY (tenant_id, query_text, prompt_version) | |
| 259 | - ); | |
| 260 | - """ | |
| 261 | - ) | |
| 262 | - self.conn.commit() | |
| 263 | - | |
| 264 | - def upsert_corpus_docs(self, tenant_id: str, docs: Sequence[Dict[str, Any]]) -> None: | |
| 265 | - now = utc_now_iso() | |
| 266 | - rows = [] | |
| 267 | - for doc in docs: | |
| 268 | - rows.append( | |
| 269 | - ( | |
| 270 | - tenant_id, | |
| 271 | - str(doc.get("spu_id") or ""), | |
| 272 | - safe_json_dumps(doc.get("title")), | |
| 273 | - safe_json_dumps(doc.get("vendor")), | |
| 274 | - safe_json_dumps(doc.get("category_path")), | |
| 275 | - safe_json_dumps(doc.get("category_name")), | |
| 276 | - str(doc.get("image_url") or ""), | |
| 277 | - safe_json_dumps(doc.get("skus") or []), | |
| 278 | - safe_json_dumps(doc.get("tags") or []), | |
| 279 | - safe_json_dumps(doc), | |
| 280 | - now, | |
| 281 | - ) | |
| 282 | - ) | |
| 283 | - self.conn.executemany( | |
| 284 | - """ | |
| 285 | - INSERT INTO corpus_docs ( | |
| 286 | - tenant_id, spu_id, title_json, vendor_json, category_path_json, category_name_json, | |
| 287 | - image_url, skus_json, tags_json, raw_json, updated_at | |
| 288 | - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) | |
| 289 | - ON CONFLICT(tenant_id, spu_id) DO UPDATE SET | |
| 290 | - title_json=excluded.title_json, | |
| 291 | - vendor_json=excluded.vendor_json, | |
| 292 | - category_path_json=excluded.category_path_json, | |
| 293 | - category_name_json=excluded.category_name_json, | |
| 294 | - image_url=excluded.image_url, | |
| 295 | - skus_json=excluded.skus_json, | |
| 296 | - tags_json=excluded.tags_json, | |
| 297 | - raw_json=excluded.raw_json, | |
| 298 | - updated_at=excluded.updated_at | |
| 299 | - """, | |
| 300 | - rows, | |
| 301 | - ) | |
| 302 | - self.conn.commit() | |
| 303 | - | |
| 304 | - def get_corpus_docs(self, tenant_id: str) -> List[Dict[str, Any]]: | |
| 305 | - rows = self.conn.execute( | |
| 306 | - "SELECT raw_json FROM corpus_docs WHERE tenant_id=? ORDER BY spu_id", | |
| 307 | - (tenant_id,), | |
| 308 | - ).fetchall() | |
| 309 | - return [json.loads(row["raw_json"]) for row in rows] | |
| 310 | - | |
| 311 | - def get_corpus_docs_by_spu_ids(self, tenant_id: str, spu_ids: Sequence[str]) -> Dict[str, Dict[str, Any]]: | |
| 312 | - keys = [str(spu_id) for spu_id in spu_ids if str(spu_id).strip()] | |
| 313 | - if not keys: | |
| 314 | - return {} | |
| 315 | - placeholders = ",".join("?" for _ in keys) | |
| 316 | - rows = self.conn.execute( | |
| 317 | - f""" | |
| 318 | - SELECT spu_id, raw_json | |
| 319 | - FROM corpus_docs | |
| 320 | - WHERE tenant_id=? AND spu_id IN ({placeholders}) | |
| 321 | - """, | |
| 322 | - [tenant_id, *keys], | |
| 323 | - ).fetchall() | |
| 324 | - return { | |
| 325 | - str(row["spu_id"]): json.loads(row["raw_json"]) | |
| 326 | - for row in rows | |
| 327 | - } | |
| 328 | - | |
| 329 | - def has_corpus(self, tenant_id: str) -> bool: | |
| 330 | - row = self.conn.execute( | |
| 331 | - "SELECT COUNT(1) AS n FROM corpus_docs WHERE tenant_id=?", | |
| 332 | - (tenant_id,), | |
| 333 | - ).fetchone() | |
| 334 | - return bool(row and row["n"] > 0) | |
| 335 | - | |
| 336 | - def get_rerank_scores(self, tenant_id: str, query_text: str) -> Dict[str, float]: | |
| 337 | - rows = self.conn.execute( | |
| 338 | - """ | |
| 339 | - SELECT spu_id, score | |
| 340 | - FROM rerank_scores | |
| 341 | - WHERE tenant_id=? AND query_text=? | |
| 342 | - """, | |
| 343 | - (tenant_id, query_text), | |
| 344 | - ).fetchall() | |
| 345 | - return {str(row["spu_id"]): float(row["score"]) for row in rows} | |
| 346 | - | |
| 347 | - def upsert_rerank_scores( | |
| 348 | - self, | |
| 349 | - tenant_id: str, | |
| 350 | - query_text: str, | |
| 351 | - scores: Dict[str, float], | |
| 352 | - model_name: str, | |
| 353 | - ) -> None: | |
| 354 | - now = utc_now_iso() | |
| 355 | - rows = [ | |
| 356 | - (tenant_id, query_text, spu_id, float(score), model_name, now) | |
| 357 | - for spu_id, score in scores.items() | |
| 358 | - ] | |
| 359 | - self.conn.executemany( | |
| 360 | - """ | |
| 361 | - INSERT INTO rerank_scores (tenant_id, query_text, spu_id, score, model_name, updated_at) | |
| 362 | - VALUES (?, ?, ?, ?, ?, ?) | |
| 363 | - ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET | |
| 364 | - score=excluded.score, | |
| 365 | - model_name=excluded.model_name, | |
| 366 | - updated_at=excluded.updated_at | |
| 367 | - """, | |
| 368 | - rows, | |
| 369 | - ) | |
| 370 | - self.conn.commit() | |
| 371 | - | |
| 372 | - def get_labels(self, tenant_id: str, query_text: str) -> Dict[str, str]: | |
| 373 | - rows = self.conn.execute( | |
| 374 | - """ | |
| 375 | - SELECT spu_id, label | |
| 376 | - FROM relevance_labels | |
| 377 | - WHERE tenant_id=? AND query_text=? | |
| 378 | - """, | |
| 379 | - (tenant_id, query_text), | |
| 380 | - ).fetchall() | |
| 381 | - return {str(row["spu_id"]): str(row["label"]) for row in rows} | |
| 382 | - | |
| 383 | - def upsert_labels( | |
| 384 | - self, | |
| 385 | - tenant_id: str, | |
| 386 | - query_text: str, | |
| 387 | - labels: Dict[str, str], | |
| 388 | - judge_model: str, | |
| 389 | - raw_response: str, | |
| 390 | - ) -> None: | |
| 391 | - now = utc_now_iso() | |
| 392 | - rows = [] | |
| 393 | - for spu_id, label in labels.items(): | |
| 394 | - if label not in VALID_LABELS: | |
| 395 | - raise ValueError(f"invalid label: {label}") | |
| 396 | - rows.append((tenant_id, query_text, spu_id, label, judge_model, raw_response, now)) | |
| 397 | - self.conn.executemany( | |
| 398 | - """ | |
| 399 | - INSERT INTO relevance_labels (tenant_id, query_text, spu_id, label, judge_model, raw_response, updated_at) | |
| 400 | - VALUES (?, ?, ?, ?, ?, ?, ?) | |
| 401 | - ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET | |
| 402 | - label=excluded.label, | |
| 403 | - judge_model=excluded.judge_model, | |
| 404 | - raw_response=excluded.raw_response, | |
| 405 | - updated_at=excluded.updated_at | |
| 406 | - """, | |
| 407 | - rows, | |
| 408 | - ) | |
| 409 | - self.conn.commit() | |
| 410 | - | |
| 411 | - def get_query_profile(self, tenant_id: str, query_text: str, prompt_version: str) -> Optional[Dict[str, Any]]: | |
| 412 | - row = self.conn.execute( | |
| 413 | - """ | |
| 414 | - SELECT profile_json | |
| 415 | - FROM query_profiles | |
| 416 | - WHERE tenant_id=? AND query_text=? AND prompt_version=? | |
| 417 | - """, | |
| 418 | - (tenant_id, query_text, prompt_version), | |
| 419 | - ).fetchone() | |
| 420 | - if not row: | |
| 421 | - return None | |
| 422 | - return json.loads(row["profile_json"]) | |
| 423 | - | |
| 424 | - def upsert_query_profile( | |
| 425 | - self, | |
| 426 | - tenant_id: str, | |
| 427 | - query_text: str, | |
| 428 | - prompt_version: str, | |
| 429 | - judge_model: str, | |
| 430 | - profile: Dict[str, Any], | |
| 431 | - raw_response: str, | |
| 432 | - ) -> None: | |
| 433 | - self.conn.execute( | |
| 434 | - """ | |
| 435 | - INSERT OR REPLACE INTO query_profiles | |
| 436 | - (tenant_id, query_text, prompt_version, judge_model, profile_json, raw_response, updated_at) | |
| 437 | - VALUES (?, ?, ?, ?, ?, ?, ?) | |
| 438 | - """, | |
| 439 | - ( | |
| 440 | - tenant_id, | |
| 441 | - query_text, | |
| 442 | - prompt_version, | |
| 443 | - judge_model, | |
| 444 | - safe_json_dumps(profile), | |
| 445 | - raw_response, | |
| 446 | - utc_now_iso(), | |
| 447 | - ), | |
| 448 | - ) | |
| 449 | - self.conn.commit() | |
| 450 | - | |
| 451 | - def insert_build_run(self, run_id: str, tenant_id: str, query_text: str, output_json_path: Path, metadata: Dict[str, Any]) -> None: | |
| 452 | - self.conn.execute( | |
| 453 | - """ | |
| 454 | - INSERT OR REPLACE INTO build_runs (run_id, tenant_id, query_text, output_json_path, metadata_json, created_at) | |
| 455 | - VALUES (?, ?, ?, ?, ?, ?) | |
| 456 | - """, | |
| 457 | - (run_id, tenant_id, query_text, str(output_json_path), safe_json_dumps(metadata), utc_now_iso()), | |
| 458 | - ) | |
| 459 | - self.conn.commit() | |
| 460 | - | |
| 461 | - def insert_batch_run( | |
| 462 | - self, | |
| 463 | - batch_id: str, | |
| 464 | - tenant_id: str, | |
| 465 | - output_json_path: Path, | |
| 466 | - report_markdown_path: Path, | |
| 467 | - config_snapshot_path: Path, | |
| 468 | - metadata: Dict[str, Any], | |
| 469 | - ) -> None: | |
| 470 | - self.conn.execute( | |
| 471 | - """ | |
| 472 | - INSERT OR REPLACE INTO batch_runs | |
| 473 | - (batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at) | |
| 474 | - VALUES (?, ?, ?, ?, ?, ?, ?) | |
| 475 | - """, | |
| 476 | - ( | |
| 477 | - batch_id, | |
| 478 | - tenant_id, | |
| 479 | - str(output_json_path), | |
| 480 | - str(report_markdown_path), | |
| 481 | - str(config_snapshot_path), | |
| 482 | - safe_json_dumps(metadata), | |
| 483 | - utc_now_iso(), | |
| 484 | - ), | |
| 485 | - ) | |
| 486 | - self.conn.commit() | |
| 487 | - | |
| 488 | - def list_batch_runs(self, limit: int = 20) -> List[Dict[str, Any]]: | |
| 489 | - rows = self.conn.execute( | |
| 490 | - """ | |
| 491 | - SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at | |
| 492 | - FROM batch_runs | |
| 493 | - ORDER BY created_at DESC | |
| 494 | - LIMIT ? | |
| 495 | - """, | |
| 496 | - (limit,), | |
| 497 | - ).fetchall() | |
| 498 | - items: List[Dict[str, Any]] = [] | |
| 499 | - for row in rows: | |
| 500 | - items.append( | |
| 501 | - { | |
| 502 | - "batch_id": row["batch_id"], | |
| 503 | - "tenant_id": row["tenant_id"], | |
| 504 | - "output_json_path": row["output_json_path"], | |
| 505 | - "report_markdown_path": row["report_markdown_path"], | |
| 506 | - "config_snapshot_path": row["config_snapshot_path"], | |
| 507 | - "metadata": json.loads(row["metadata_json"]), | |
| 508 | - "created_at": row["created_at"], | |
| 509 | - } | |
| 510 | - ) | |
| 511 | - return items | |
| 512 | - | |
| 513 | - def get_batch_run(self, batch_id: str) -> Optional[Dict[str, Any]]: | |
| 514 | - row = self.conn.execute( | |
| 515 | - """ | |
| 516 | - SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at | |
| 517 | - FROM batch_runs | |
| 518 | - WHERE batch_id = ? | |
| 519 | - """, | |
| 520 | - (batch_id,), | |
| 521 | - ).fetchone() | |
| 522 | - if row is None: | |
| 523 | - return None | |
| 524 | - return { | |
| 525 | - "batch_id": row["batch_id"], | |
| 526 | - "tenant_id": row["tenant_id"], | |
| 527 | - "output_json_path": row["output_json_path"], | |
| 528 | - "report_markdown_path": row["report_markdown_path"], | |
| 529 | - "config_snapshot_path": row["config_snapshot_path"], | |
| 530 | - "metadata": json.loads(row["metadata_json"]), | |
| 531 | - "created_at": row["created_at"], | |
| 532 | - } | |
| 533 | - | |
| 534 | - def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]: | |
| 535 | - rows = self.conn.execute( | |
| 536 | - """ | |
| 537 | - SELECT | |
| 538 | - query_text, | |
| 539 | - COUNT(*) AS total, | |
| 540 | - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, | |
| 541 | - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, | |
| 542 | - SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, | |
| 543 | - MAX(updated_at) AS updated_at | |
| 544 | - FROM relevance_labels | |
| 545 | - WHERE tenant_id=? | |
| 546 | - GROUP BY query_text | |
| 547 | - ORDER BY query_text | |
| 548 | - """, | |
| 549 | - (tenant_id,), | |
| 550 | - ).fetchall() | |
| 551 | - return [ | |
| 552 | - { | |
| 553 | - "query": str(row["query_text"]), | |
| 554 | - "total": int(row["total"]), | |
| 555 | - "exact_count": int(row["exact_count"] or 0), | |
| 556 | - "partial_count": int(row["partial_count"] or 0), | |
| 557 | - "irrelevant_count": int(row["irrelevant_count"] or 0), | |
| 558 | - "updated_at": row["updated_at"], | |
| 559 | - } | |
| 560 | - for row in rows | |
| 561 | - ] | |
| 562 | - | |
| 563 | - def get_query_label_stats(self, tenant_id: str, query_text: str) -> Dict[str, Any]: | |
| 564 | - row = self.conn.execute( | |
| 565 | - """ | |
| 566 | - SELECT | |
| 567 | - COUNT(*) AS total, | |
| 568 | - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, | |
| 569 | - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, | |
| 570 | - SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, | |
| 571 | - MAX(updated_at) AS updated_at | |
| 572 | - FROM relevance_labels | |
| 573 | - WHERE tenant_id=? AND query_text=? | |
| 574 | - """, | |
| 575 | - (tenant_id, query_text), | |
| 576 | - ).fetchone() | |
| 577 | - return { | |
| 578 | - "query": query_text, | |
| 579 | - "total": int((row["total"] or 0) if row else 0), | |
| 580 | - "exact_count": int((row["exact_count"] or 0) if row else 0), | |
| 581 | - "partial_count": int((row["partial_count"] or 0) if row else 0), | |
| 582 | - "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0), | |
| 583 | - "updated_at": row["updated_at"] if row else None, | |
| 584 | - } | |
| 585 | - | |
| 586 | - | |
| 587 | -class SearchServiceClient: | |
| 588 | - def __init__(self, base_url: str, tenant_id: str): | |
| 589 | - self.base_url = base_url.rstrip("/") | |
| 590 | - self.tenant_id = str(tenant_id) | |
| 591 | - self.session = requests.Session() | |
| 592 | - | |
| 593 | - def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]: | |
| 594 | - response = self.session.post( | |
| 595 | - f"{self.base_url}/search/", | |
| 596 | - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, | |
| 597 | - json={"query": query, "size": size, "from": from_, "language": language}, | |
| 598 | - timeout=120, | |
| 599 | - ) | |
| 600 | - response.raise_for_status() | |
| 601 | - return response.json() | |
| 602 | - | |
| 603 | - | |
| 604 | -class RerankServiceClient: | |
| 605 | - def __init__(self, service_url: str): | |
| 606 | - self.service_url = service_url.rstrip("/") | |
| 607 | - self.session = requests.Session() | |
| 608 | - | |
| 609 | - def rerank(self, query: str, docs: Sequence[str], normalize: bool = False, top_n: Optional[int] = None) -> Tuple[List[float], Dict[str, Any]]: | |
| 610 | - payload: Dict[str, Any] = { | |
| 611 | - "query": query, | |
| 612 | - "docs": list(docs), | |
| 613 | - "normalize": normalize, | |
| 614 | - } | |
| 615 | - if top_n is not None: | |
| 616 | - payload["top_n"] = int(top_n) | |
| 617 | - response = self.session.post(self.service_url, json=payload, timeout=180) | |
| 618 | - response.raise_for_status() | |
| 619 | - data = response.json() | |
| 620 | - return list(data.get("scores") or []), dict(data.get("meta") or {}) | |
| 621 | - | |
| 622 | - | |
| 623 | -class DashScopeLabelClient: | |
| 624 | - def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40): | |
| 625 | - self.model = model | |
| 626 | - self.base_url = base_url.rstrip("/") | |
| 627 | - self.api_key = api_key | |
| 628 | - self.batch_size = int(batch_size) | |
| 629 | - self.session = requests.Session() | |
| 630 | - | |
| 631 | - def _chat(self, prompt: str) -> Tuple[str, str]: | |
| 632 | - response = self.session.post( | |
| 633 | - f"{self.base_url}/chat/completions", | |
| 634 | - headers={ | |
| 635 | - "Authorization": f"Bearer {self.api_key}", | |
| 636 | - "Content-Type": "application/json", | |
| 637 | - }, | |
| 638 | - json={ | |
| 639 | - "model": self.model, | |
| 640 | - "messages": [{"role": "user", "content": prompt}], | |
| 641 | - "temperature": 0, | |
| 642 | - "top_p": 0.1, | |
| 643 | - }, | |
| 644 | - timeout=180, | |
| 645 | - ) | |
| 646 | - response.raise_for_status() | |
| 647 | - data = response.json() | |
| 648 | - content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() | |
| 649 | - return content, safe_json_dumps(data) | |
| 650 | - | |
| 651 | - def classify_batch_simple( | |
| 652 | - self, | |
| 653 | - query: str, | |
| 654 | - docs: Sequence[Dict[str, Any]], | |
| 655 | - ) -> Tuple[List[str], str]: | |
| 656 | - numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] | |
| 657 | - prompt = ( | |
| 658 | - "You are an e-commerce search result relevance evaluation assistant. " | |
| 659 | - "Based on the user query and each product's information, output the relevance level for each product.\n\n" | |
| 660 | - "## Relevance Level Criteria\n" | |
| 661 | - "Exact โ Fully matches the user's search intent.\n" | |
| 662 | - "Partial โ Primary intent satisfied (same category or similar use, basically aligns with search intent), " | |
| 663 | - "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n" | |
| 664 | - "Irrelevant โ Category or use case mismatched, primary intent not satisfied.\n\n" | |
| 665 | - "Additional judging guidance:\n" | |
| 666 | - "- If the query clearly names a product type, product type matching has the highest priority. " | |
| 667 | - "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, " | |
| 668 | - "bra vs top, backpack vs bag are not interchangeable.\n" | |
| 669 | - "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n" | |
| 670 | - "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n" | |
| 671 | - "- Do not guess missing attributes.\n" | |
| 672 | - "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n" | |
| 673 | - "- Be conservative with Exact.\n\n" | |
| 674 | - f"Query: {query}\n\n" | |
| 675 | - "Products:\n" | |
| 676 | - + "\n".join(numbered_docs) | |
| 677 | - + "\n\n## Output Format\n" | |
| 678 | - f"Strictly output {len(docs)} lines, each line containing exactly one of Exact / Partial / Irrelevant. " | |
| 679 | - "They must correspond sequentially to the products above. Do not output any other information.\n" | |
| 680 | - ) | |
| 681 | - content, raw_response = self._chat(prompt) | |
| 682 | - labels = [] | |
| 683 | - for line in str(content or "").splitlines(): | |
| 684 | - label = line.strip() | |
| 685 | - if label in VALID_LABELS: | |
| 686 | - labels.append(label) | |
| 687 | - if len(labels) != len(docs): | |
| 688 | - payload = _extract_json_blob(content) | |
| 689 | - if isinstance(payload, dict) and isinstance(payload.get("labels"), list): | |
| 690 | - labels = [] | |
| 691 | - for item in payload["labels"][: len(docs)]: | |
| 692 | - if isinstance(item, dict): | |
| 693 | - label = str(item.get("label") or "").strip() | |
| 694 | - else: | |
| 695 | - label = str(item).strip() | |
| 696 | - if label in VALID_LABELS: | |
| 697 | - labels.append(label) | |
| 698 | - if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): | |
| 699 | - raise ValueError(f"unexpected simple label output: {content!r}") | |
| 700 | - return labels, raw_response | |
| 701 | - | |
    def extract_query_profile(
        self,
        query: str,
        parser_hints: Dict[str, Any],
    ) -> Tuple[Dict[str, Any], str]:
        """Ask the judge LLM to build a structured intent profile for *query*.

        Args:
            query: Original user query (treated as the source of truth).
            parser_hints: Query-parser output passed to the LLM as hints only.

        Returns:
            ``(profile, raw_response)`` where *profile* is a dict that always
            carries the keys ``normalized_query_en``, ``primary_category``,
            ``allowed_categories``, ``required_attributes`` and ``notes``
            (missing keys are backfilled with defaults below).

        Raises:
            ValueError: If the LLM response does not contain a JSON object.
        """
        prompt = (
            "You are building a structured intent profile for e-commerce relevance judging.\n"
            "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n"
            "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n"
            "Return JSON with this schema:\n"
            "{\n"
            ' "normalized_query_en": string,\n'
            ' "primary_category": string,\n'
            ' "allowed_categories": [string],\n'
            ' "required_attributes": [\n'
            ' {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n'
            " ],\n"
            ' "notes": [string]\n'
            "}\n\n"
            "Guidelines:\n"
            "- Exact later will require explicit evidence for all required attributes.\n"
            "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n"
            "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n"
            "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n"
            "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n"
            "- For color, include conflicting colors only when clear from the query.\n\n"
            f"Original query: {query}\n"
            f"Parser hints JSON: {json.dumps(parser_hints, ensure_ascii=False)}\n"
        )
        content, raw_response = self._chat(prompt)
        # The model may wrap JSON in prose/code fences; _extract_json_blob digs it out.
        payload = _extract_json_blob(content)
        if not isinstance(payload, dict):
            raise ValueError(f"unexpected query profile payload: {content!r}")
        # Backfill a stable schema so downstream consumers can index keys directly.
        payload.setdefault("normalized_query_en", query)
        payload.setdefault("primary_category", "")
        payload.setdefault("allowed_categories", [])
        payload.setdefault("required_attributes", [])
        payload.setdefault("notes", [])
        return payload, raw_response
| 741 | - | |
    def classify_batch_complex(
        self,
        query: str,
        query_profile: Dict[str, Any],
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Judge *docs* against a structured query profile in one LLM call.

        Args:
            query: Original user query, shown verbatim to the judge.
            query_profile: Structured intent profile (see extract_query_profile).
            docs: Product documents to label, in order.

        Returns:
            ``(labels, raw_response)`` with exactly one valid label per doc,
            in the same order as *docs*.

        Raises:
            ValueError: If the response is not JSON with a ``labels`` list, or
                does not yield one valid label per document.
        """
        # 1-based indexing so the doc numbers in the prompt match the schema example.
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = (
            "You are an e-commerce search relevance judge.\n"
            "Judge each product against the structured query profile below.\n\n"
            "Relevance rules:\n"
            "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n"
            "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n"
            "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n"
            "- Be conservative with Exact.\n"
            "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n"
            "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n"
            f"Original query: {query}\n"
            f"Structured query profile JSON: {json.dumps(query_profile, ensure_ascii=False)}\n\n"
            "Products:\n"
            + "\n".join(numbered_docs)
            + "\n\nReturn JSON only, with schema:\n"
            '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n'
        )
        content, raw_response = self._chat(prompt)
        payload = _extract_json_blob(content)
        if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):
            raise ValueError(f"unexpected label payload: {content!r}")
        labels_payload = payload["labels"]
        labels: List[str] = []
        # Truncate to len(docs) in case the model hallucinated extra entries;
        # skip malformed items and keep only recognized labels.
        for item in labels_payload[: len(docs)]:
            if not isinstance(item, dict):
                continue
            label = str(item.get("label") or "").strip()
            if label in VALID_LABELS:
                labels.append(label)
        # Strict one-label-per-doc contract; partial output is treated as failure
        # so the caller's retry/bisect logic can take over.
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected label output: {content!r}")
        return labels, raw_response
| 781 | - | |
| 782 | - | |
def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
    """Return the fraction of the first *k* labels that are in *relevant*.

    A non-positive *k* or an empty label window yields ``0.0``.  When fewer
    than *k* labels exist, the denominator is the number actually present.
    """
    if k <= 0:
        return 0.0
    window = list(labels[:k])
    if not window:
        return 0.0
    matched = sum(1 for entry in window if entry in relevant)
    return matched / float(min(k, len(window)))
| 791 | - | |
| 792 | - | |
def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
    """Return average precision: mean of precision@rank over relevant hits.

    Returns ``0.0`` when no label in *labels* appears in *relevant*.
    """
    hits = 0
    acc = 0.0
    rank = 0
    for label in labels:
        rank += 1
        if label in relevant:
            hits += 1
            acc += hits / rank
    return acc / hits if hits else 0.0
| 804 | - | |
| 805 | - | |
def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:
    """Compute P@k (k in 5/10/20/50) and MAP for strict and lenient relevance.

    Strict counts only Exact as relevant; the ``_2_3`` variants also count
    Partial.  All values are rounded to 6 decimal places.
    """
    strict = [RELEVANCE_EXACT]
    lenient = [RELEVANCE_EXACT, RELEVANCE_PARTIAL]
    metrics: Dict[str, float] = {}
    for k in (5, 10, 20, 50):
        metrics[f"P@{k}"] = round(precision_at_k(labels, k, strict), 6)
        metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, lenient), 6)
    metrics["MAP_3"] = round(average_precision(labels, strict), 6)
    metrics["MAP_2_3"] = round(average_precision(labels, lenient), 6)
    return metrics
| 814 | - | |
| 815 | - | |
def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]:
    """Average each metric across *metric_items*, rounded to 6 decimals.

    The key set is taken from the first item (sorted); items missing a key
    contribute ``0.0``.  Returns ``{}`` for empty input.
    """
    if not metric_items:
        return {}
    count = len(metric_items)
    averaged: Dict[str, float] = {}
    for key in sorted(metric_items[0]):
        total = sum(float(item.get(key, 0.0)) for item in metric_items)
        averaged[key] = round(total / count, 6)
    return averaged
| 824 | - | |
| 825 | - | |
def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
    """Count occurrences of each known relevance label.

    Behaviorally identical to counting each label separately, but walks
    *labels* once instead of three times.  Unknown labels are ignored.
    """
    counts = {
        RELEVANCE_EXACT: 0,
        RELEVANCE_PARTIAL: 0,
        RELEVANCE_IRRELEVANT: 0,
    }
    for label in labels:
        if label in counts:
            counts[label] += 1
    return counts
| 832 | - | |
| 833 | - | |
| 834 | -class SearchEvaluationFramework: | |
    def __init__(
        self,
        tenant_id: str,
        artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
        search_base_url: str = "http://localhost:6002",
        labeler_mode: str = DEFAULT_LABELER_MODE,
    ):
        """Wire up storage, search, rerank, and LLM-judge clients for one tenant.

        Args:
            tenant_id: Tenant whose index/corpus/labels are evaluated.
            artifact_root: Directory for the SQLite store and report artifacts
                (created if missing).
            search_base_url: Base URL of the search service under test.
            labeler_mode: Judge prompting mode; normalized to lowercase, falls
                back to DEFAULT_LABELER_MODE when blank.

        Raises:
            RuntimeError: If no DashScope API key is configured.
        """
        # Side effect: initializes the shared ES-backed service layer before
        # any client below is constructed.
        init_service(get_app_config().infrastructure.elasticsearch.host)
        self.tenant_id = str(tenant_id)
        self.artifact_root = ensure_dir(artifact_root)
        self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
        self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
        self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
        app_cfg = get_app_config()
        rerank_service_url = str(
            app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
        )
        self.rerank_client = RerankServiceClient(rerank_service_url)
        # The judge LLM reuses the translation capability's model/base_url config.
        llm_cfg = app_cfg.services.translation.capabilities["llm"]
        api_key = app_cfg.infrastructure.secrets.dashscope_api_key
        if not api_key:
            raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
        self.label_client = DashScopeLabelClient(
            model=str(llm_cfg["model"]),
            base_url=str(llm_cfg["base_url"]),
            api_key=str(api_key),
        )
        # Query parser is created lazily on first use (see _get_query_parser).
        self.query_parser = None
| 863 | - | |
| 864 | - def _get_query_parser(self): | |
| 865 | - if self.query_parser is None: | |
| 866 | - self.query_parser = get_query_parser() | |
| 867 | - return self.query_parser | |
| 868 | - | |
| 869 | - def build_query_parser_hints(self, query: str) -> Dict[str, Any]: | |
| 870 | - parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"]) | |
| 871 | - payload = parsed.to_dict() | |
| 872 | - payload["text_for_rerank"] = parsed.text_for_rerank() | |
| 873 | - return payload | |
| 874 | - | |
| 875 | - def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]: | |
| 876 | - if self.labeler_mode != "complex": | |
| 877 | - raise RuntimeError("query profiles are only used in complex labeler mode") | |
| 878 | - if not force_refresh: | |
| 879 | - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX) | |
| 880 | - if cached is not None: | |
| 881 | - return cached | |
| 882 | - parser_hints = self.build_query_parser_hints(query) | |
| 883 | - profile, raw_response = self.label_client.extract_query_profile(query, parser_hints) | |
| 884 | - profile["parser_hints"] = parser_hints | |
| 885 | - self.store.upsert_query_profile( | |
| 886 | - self.tenant_id, | |
| 887 | - query, | |
| 888 | - JUDGE_PROMPT_VERSION_COMPLEX, | |
| 889 | - self.label_client.model, | |
| 890 | - profile, | |
| 891 | - raw_response, | |
| 892 | - ) | |
| 893 | - return profile | |
| 894 | - | |
| 895 | - @staticmethod | |
| 896 | - def _doc_evidence_text(doc: Dict[str, Any]) -> str: | |
| 897 | - pieces: List[str] = [ | |
| 898 | - build_display_title(doc), | |
| 899 | - pick_text(doc.get("vendor"), "en"), | |
| 900 | - pick_text(doc.get("category_path"), "en"), | |
| 901 | - pick_text(doc.get("category_name"), "en"), | |
| 902 | - ] | |
| 903 | - for sku in doc.get("skus") or []: | |
| 904 | - pieces.extend( | |
| 905 | - [ | |
| 906 | - str(sku.get("option1_value") or ""), | |
| 907 | - str(sku.get("option2_value") or ""), | |
| 908 | - str(sku.get("option3_value") or ""), | |
| 909 | - ] | |
| 910 | - ) | |
| 911 | - for tag in doc.get("tags") or []: | |
| 912 | - pieces.append(str(tag)) | |
| 913 | - return normalize_text(" | ".join(piece for piece in pieces if piece)) | |
| 914 | - | |
    def _apply_rule_based_label_guardrails(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> str:
        """Post-process an LLM label with deterministic, evidence-based rules.

        Demotes Exact to Partial when required evidence is missing, and demotes
        to Irrelevant on category mismatch or an attribute conflict.  The rules
        are order-sensitive (early returns short-circuit later checks).

        Args:
            label: Label proposed by the judge; returned unchanged if invalid.
            query_profile: Structured intent profile for the query.
            doc: Product document whose text fields serve as evidence.

        Returns:
            The (possibly demoted) label.
        """
        if label not in VALID_LABELS:
            return label
        evidence = self._doc_evidence_text(doc)
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]

        # Empty category / empty allowed list means "no constraint" (match=True).
        # NOTE(review): _collect_label_issues initializes allowed_category_match
        # differently when the allowed list is empty — confirm which is intended.
        primary_category_match = True
        if category:
            primary_category_match = category in evidence
        allowed_category_match = True
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                label = RELEVANCE_PARTIAL
            else:
                return RELEVANCE_IRRELEVANT

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            # Only attributes whose names we can match textually; others skipped.
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            # Hard-coded fit antonyms supplement whatever the profile provided.
            if attr_name == "fit":
                if any(term in {"oversized", "oversize"} for term in required_terms):
                    conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
                if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):
                    conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])
            # No required terms => vacuously satisfied.
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            # Any conflicting term is an immediate Irrelevant, regardless of label.
            if has_conflict:
                return RELEVANCE_IRRELEVANT
            if label == RELEVANCE_EXACT and not has_required:
                label = RELEVANCE_PARTIAL

        # Partial still needs at least some category support.
        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            return RELEVANCE_IRRELEVANT

        return label
| 965 | - | |
| 966 | - @staticmethod | |
| 967 | - def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]: | |
| 968 | - option_values = list(item.get("option_values") or []) | |
| 969 | - while len(option_values) < 3: | |
| 970 | - option_values.append("") | |
| 971 | - product = dict(item.get("product") or {}) | |
| 972 | - return { | |
| 973 | - "spu_id": item.get("spu_id"), | |
| 974 | - "title": product.get("title") or item.get("title"), | |
| 975 | - "vendor": product.get("vendor"), | |
| 976 | - "category_path": product.get("category"), | |
| 977 | - "category_name": product.get("category"), | |
| 978 | - "image_url": item.get("image_url") or product.get("image_url"), | |
| 979 | - "tags": product.get("tags") or [], | |
| 980 | - "skus": [ | |
| 981 | - { | |
| 982 | - "option1_value": option_values[0], | |
| 983 | - "option2_value": option_values[1], | |
| 984 | - "option3_value": option_values[2], | |
| 985 | - } | |
| 986 | - ], | |
| 987 | - } | |
| 988 | - | |
    def _collect_label_issues(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> List[str]:
        """Return human-readable audit issues for a label, without changing it.

        Mirrors the checks in _apply_rule_based_label_guardrails but reports
        findings instead of relabeling.
        NOTE(review): this variant omits "waist_style"/"rise" from the attribute
        allowlist and initializes allowed_category_match differently than the
        guardrail method — confirm whether the divergence is intentional.
        """
        evidence = self._doc_evidence_text(doc)
        issues: List[str] = []
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [
            normalize_text(item)
            for item in query_profile.get("allowed_categories") or []
            if str(item).strip()
        ]

        primary_category_match = True if not category else category in evidence
        # With no allowed list, fall back to the primary-category verdict.
        allowed_category_match = False if allowed_categories else primary_category_match
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                issues.append("Exact missing primary category evidence")
            else:
                issues.append("Exact has category mismatch")

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            issues.append("Partial has category mismatch")

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict and label != RELEVANCE_IRRELEVANT:
                issues.append(f"{label} conflicts on {attr_name}")
            if label == RELEVANCE_EXACT and not has_required:
                issues.append(f"Exact missing {attr_name}")
        return issues
| 1034 | - | |
    def audit_live_query(
        self,
        query: str,
        *,
        top_k: int = 100,
        language: str = "en",
        auto_annotate: bool = False,
    ) -> Dict[str, Any]:
        """Evaluate a live query and flag results whose labels look suspicious.

        Runs evaluate_live_query, then (complex mode only) re-checks each
        result against the query profile via the rule-based auditors and
        collects items with issues or suggested relabels.

        Args:
            query: Query text to evaluate.
            top_k: How many results to evaluate.
            language: Search language passed through to the search client.
            auto_annotate: Whether missing labels are refreshed first.

        Returns:
            Dict with metrics, label distribution, the query profile (complex
            mode only), a ``suspicious`` list, and the raw per-result rows.
        """
        live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
        # Simple mode has no query profile, so no rule-based audit is possible.
        if self.labeler_mode != "complex":
            # Unlabeled results are treated as Irrelevant for the distribution.
            labels = [
                item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
                for item in live["results"]
            ]
            return {
                "query": query,
                "tenant_id": self.tenant_id,
                "top_k": top_k,
                "metrics": live["metrics"],
                "distribution": label_distribution(labels),
                "query_profile": None,
                "suspicious": [],
                "results": live["results"],
            }
        query_profile = self.get_query_profile(query, force_refresh=False)
        suspicious: List[Dict[str, Any]] = []

        for item in live["results"]:
            doc = self._result_item_to_doc(item)
            issues = self._collect_label_issues(item["label"] or "", query_profile, doc)
            suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc)
            if suggested_label != (item["label"] or ""):
                issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"]
            if issues:
                suspicious.append(
                    {
                        "rank": item["rank"],
                        "spu_id": item["spu_id"],
                        "title": item["title"],
                        "label": item["label"],
                        "suggested_label": suggested_label,
                        "issues": issues,
                    }
                )

        labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in live["results"]
        ]
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": live["metrics"],
            "distribution": label_distribution(labels),
            "query_profile": query_profile,
            "suspicious": suspicious,
            "results": live["results"],
        }
| 1094 | - | |
| 1095 | - def queries_from_file(self, path: Path) -> List[str]: | |
| 1096 | - return [ | |
| 1097 | - line.strip() | |
| 1098 | - for line in path.read_text(encoding="utf-8").splitlines() | |
| 1099 | - if line.strip() and not line.strip().startswith("#") | |
| 1100 | - ] | |
| 1101 | - | |
    def corpus_docs(self, refresh: bool = False) -> List[Dict[str, Any]]:
        """Return the tenant's full product corpus, cached in the eval store.

        On cache hit (and ``refresh=False``) the SQLite copy is returned.
        Otherwise the whole tenant index is scrolled from Elasticsearch with a
        trimmed ``_source``, re-cached, and returned.

        Args:
            refresh: Force a re-scan of Elasticsearch even if cached.
        """
        if not refresh and self.store.has_corpus(self.tenant_id):
            return self.store.get_corpus_docs(self.tenant_id)

        es_client = get_es_client().client
        index_name = get_tenant_index_name(self.tenant_id)
        docs: List[Dict[str, Any]] = []
        # helpers.scan handles scroll pagination; only the fields needed for
        # judging/display are fetched.
        for hit in scan(
            client=es_client,
            index=index_name,
            query={
                "_source": [
                    "spu_id",
                    "title",
                    "vendor",
                    "category_path",
                    "category_name",
                    "image_url",
                    "skus",
                    "tags",
                ],
                "query": {"match_all": {}},
            },
            size=500,
            preserve_order=False,
            clear_scroll=True,
        ):
            source = dict(hit.get("_source") or {})
            # Fall back to the ES document id when spu_id is absent in _source.
            source["spu_id"] = str(source.get("spu_id") or hit.get("_id") or "")
            docs.append(source)
        self.store.upsert_corpus_docs(self.tenant_id, docs)
        return docs
| 1134 | - | |
    def full_corpus_rerank(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        batch_size: int = 24,
        force_refresh: bool = False,
    ) -> List[Dict[str, Any]]:
        """Score every doc against *query* with the reranker and sort descending.

        Scores are cached per (tenant, query) in the store; only uncached docs
        are sent to the rerank service, in batches of *batch_size*.

        Returns:
            List of ``{"spu_id", "score", "doc"}`` dicts sorted by score
            descending.  Docs with no score sort last (``-inf``).

        Raises:
            RuntimeError: If the reranker returns a mismatched score count.
        """
        cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query)
        pending: List[Dict[str, Any]] = [doc for doc in docs if str(doc.get("spu_id")) not in cached]
        if pending:
            new_scores: Dict[str, float] = {}
            for start in range(0, len(pending), batch_size):
                batch = pending[start : start + batch_size]
                scores = self._rerank_batch_with_retry(query=query, docs=batch)
                if len(scores) != len(batch):
                    raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs")
                for doc, score in zip(batch, scores):
                    new_scores[str(doc.get("spu_id"))] = float(score)
            # Persist all freshly computed scores in one upsert.
            self.store.upsert_rerank_scores(
                self.tenant_id,
                query,
                new_scores,
                model_name="qwen3_vllm_score",
            )
            cached.update(new_scores)

        ranked = []
        for doc in docs:
            spu_id = str(doc.get("spu_id"))
            # -inf keeps unscored docs at the bottom of the ranking.
            ranked.append({"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc})
        ranked.sort(key=lambda item: item["score"], reverse=True)
        return ranked
| 1167 | - | |
    def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]:
        """Rerank *docs*, bisecting the batch on failure to isolate bad docs.

        On an exception: a single doc gets the sentinel score ``-1.0``
        (deliberate best-effort, not a crash); small batches (<= 6) retry each
        doc individually; larger batches are split in half and retried
        recursively.  Always returns exactly one score per doc.
        """
        if not docs:
            return []
        doc_texts = [build_rerank_doc(doc) for doc in docs]
        try:
            scores, _meta = self.rerank_client.rerank(query=query, docs=doc_texts, normalize=False)
            return scores
        except Exception:
            # Base case: the single failing doc absorbs the sentinel score.
            if len(docs) == 1:
                return [-1.0]
            if len(docs) <= 6:
                scores: List[float] = []
                for doc in docs:
                    scores.extend(self._rerank_batch_with_retry(query, [doc]))
                return scores
            mid = len(docs) // 2
            left = self._rerank_batch_with_retry(query, docs[:mid])
            right = self._rerank_batch_with_retry(query, docs[mid:])
            return left + right
| 1187 | - | |
    def annotate_missing_labels(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        force_refresh: bool = False,
    ) -> Dict[str, str]:
        """Ensure every doc has a cached relevance label; judge only the gaps.

        Args:
            query: Query text to judge against.
            docs: Candidate documents (identified by ``spu_id``).
            force_refresh: Ignore the cache and relabel everything.

        Returns:
            Mapping of ``spu_id -> label`` covering cached plus newly judged docs.
        """
        labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
        missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
        if not missing_docs:
            return labels

        for start in range(0, len(missing_docs), self.label_client.batch_size):
            batch = missing_docs[start : start + self.label_client.batch_size]
            # _classify_with_retry may bisect a failing batch, so it returns a
            # list of (labels, raw_response, sub_batch) triples.
            batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh)
            for sub_labels, raw_response, sub_batch in batch_pairs:
                to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
                # Persist each sub-batch immediately so progress survives a crash.
                self.store.upsert_labels(
                    self.tenant_id,
                    query,
                    to_store,
                    judge_model=self.label_client.model,
                    raw_response=raw_response,
                )
                labels.update(to_store)
            # Small pause between LLM batches to be gentle on the API.
            time.sleep(0.1)
        return labels
| 1214 | - | |
    def _classify_with_retry(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        *,
        force_refresh: bool = False,
    ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
        """Classify *docs*, bisecting the batch on failure.

        Returns a list of ``(labels, raw_response, sub_batch)`` triples — one
        per successful LLM call.  A single-doc failure re-raises so genuinely
        broken inputs surface instead of looping forever.
        """
        if not docs:
            return []
        try:
            if self.labeler_mode == "complex":
                query_profile = self.get_query_profile(query, force_refresh=force_refresh)
                labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)
                # Deterministic guardrails may demote LLM labels before storage.
                labels = [
                    self._apply_rule_based_label_guardrails(label, query_profile, doc)
                    for doc, label in zip(docs, labels)
                ]
            else:
                labels, raw_response = self.label_client.classify_batch_simple(query, docs)
            return [(labels, raw_response, docs)]
        except Exception:
            if len(docs) == 1:
                raise
            mid = len(docs) // 2
            return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh)
| 1240 | - | |
    def build_query_annotation_set(
        self,
        query: str,
        *,
        search_depth: int = 1000,
        rerank_depth: int = 10000,
        annotate_search_top_k: int = 120,
        annotate_rerank_top_k: int = 200,
        language: str = "en",
        force_refresh_rerank: bool = False,
        force_refresh_labels: bool = False,
    ) -> QueryBuildResult:
        """Build a pooled annotation set for one query and write it to disk.

        Pipeline: live search to *search_depth*; full-corpus rerank; pool the
        top results of both; label any unlabeled pool docs; compute metrics on
        the search top-100; write a JSON artifact and register the run.

        Args:
            query: Query text.
            search_depth: Result depth requested from the search service.
            rerank_depth: How many reranked docs to include in the artifact.
            annotate_search_top_k: Search results pooled for annotation.
            annotate_rerank_top_k: Rerank results pooled for annotation.
            language: Search language.
            force_refresh_rerank: Recompute cached rerank scores.
            force_refresh_labels: Recompute cached labels (and profile).

        Returns:
            QueryBuildResult summarizing the run and the artifact path.
        """
        search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language)
        search_results = list(search_payload.get("results") or [])
        corpus = self.corpus_docs(refresh=False)
        full_rerank = self.full_corpus_rerank(
            query=query,
            docs=corpus,
            force_refresh=force_refresh_rerank,
        )
        rerank_depth_effective = min(rerank_depth, len(full_rerank))

        # Pool = union of the search head and the rerank head, deduped by spu_id
        # (rerank docs overwrite search docs on collision).
        pool_docs: Dict[str, Dict[str, Any]] = {}
        for doc in search_results[:annotate_search_top_k]:
            pool_docs[str(doc.get("spu_id"))] = doc
        for item in full_rerank[:annotate_rerank_top_k]:
            pool_docs[str(item["spu_id"])] = item["doc"]

        labels = self.annotate_missing_labels(
            query=query,
            docs=list(pool_docs.values()),
            force_refresh=force_refresh_labels,
        )

        search_labeled_results: List[Dict[str, Any]] = []
        for rank, doc in enumerate(search_results, start=1):
            spu_id = str(doc.get("spu_id"))
            # label is None for results outside the annotation pool.
            label = labels.get(spu_id)
            search_labeled_results.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "rerank_score": None,
                    "label": label,
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )

        rerank_top_results: List[Dict[str, Any]] = []
        for rank, item in enumerate(full_rerank[:rerank_depth_effective], start=1):
            doc = item["doc"]
            spu_id = str(item["spu_id"])
            rerank_top_results.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "rerank_score": round(float(item["score"]), 8),
                    "label": labels.get(spu_id),
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )

        # Metrics are computed over the top 100 search hits; unlabeled hits
        # count as Irrelevant.
        top100_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in search_labeled_results[:100]
        ]
        metrics = compute_query_metrics(top100_labels)
        output_dir = ensure_dir(self.artifact_root / "query_builds")
        run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
        output_json_path = output_dir / f"{run_id}.json"
        payload = {
            "run_id": run_id,
            "created_at": utc_now_iso(),
            "tenant_id": self.tenant_id,
            "query": query,
            # NOTE(review): hard-coded localhost URL ignores search_base_url
            # passed to __init__, and a failed fetch aborts the whole build —
            # confirm this is intended.
            "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),
            "search_total": int(search_payload.get("total") or 0),
            "search_depth_requested": search_depth,
            "search_depth_effective": len(search_results),
            "rerank_depth_requested": rerank_depth,
            "rerank_depth_effective": rerank_depth_effective,
            "corpus_size": len(corpus),
            "annotation_pool": {
                "annotate_search_top_k": annotate_search_top_k,
                "annotate_rerank_top_k": annotate_rerank_top_k,
                "pool_size": len(pool_docs),
            },
            "labeler_mode": self.labeler_mode,
            "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,
            "metrics_top100": metrics,
            "search_results": search_labeled_results,
            "full_rerank_top": rerank_top_results,
        }
        output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
        return QueryBuildResult(
            query=query,
            tenant_id=self.tenant_id,
            search_total=int(search_payload.get("total") or 0),
            search_depth=len(search_results),
            rerank_corpus_size=len(corpus),
            annotated_count=len(pool_docs),
            output_json_path=output_json_path,
        )
| 1351 | - | |
    def evaluate_live_query(
        self,
        query: str,
        top_k: int = 100,
        auto_annotate: bool = False,
        language: str = "en",
        force_refresh_labels: bool = False,
    ) -> Dict[str, Any]:
        """Score one live search query against the cached relevance labels.

        Runs the query through the live search client, joins the top-k hits
        with labels cached in the store, computes ranking metrics, and lists
        cached Exact/Partial products that the live recall missed.

        Args:
            query: Search query text.
            top_k: Number of recalled results to score.
            auto_annotate: When True, annotate recalled docs that have no
                cached label before scoring.
            language: Language passed through to the search client.
            force_refresh_labels: Re-annotate even already-labeled docs
                (only has an effect together with ``auto_annotate``).

        Returns:
            Dict with per-query metrics, labeled results, missed relevant
            products, label statistics, human-readable tips, and the total
            hit count reported by search.
        """
        # Request at least 100 hits so depth-100 metrics remain computable
        # even when the caller asks for a smaller top_k.
        search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)
        results = list(search_payload.get("results") or [])
        if auto_annotate:
            # Fill label gaps for the recalled window before reading the cache.
            self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
        labels = self.store.get_labels(self.tenant_id, query)
        recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]}
        labeled = []
        unlabeled_hits = 0  # recalled docs with no cached label
        for rank, doc in enumerate(results[:top_k], start=1):
            spu_id = str(doc.get("spu_id"))
            label = labels.get(spu_id)
            if label not in VALID_LABELS:
                unlabeled_hits += 1
            labeled.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "label": label,
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )
        # For metric computation, unknown labels degrade to Irrelevant; the raw
        # (possibly None) label is preserved in the returned results.
        metric_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in labeled
        ]
        label_stats = self.store.get_query_label_stats(self.tenant_id, query)
        rerank_scores = self.store.get_rerank_scores(self.tenant_id, query)
        # Cached Exact/Partial SPUs that the live recall set missed entirely.
        relevant_missing_ids = [
            spu_id
            for spu_id, label in labels.items()
            if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids
        ]
        missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
        missing_relevant = []
        for spu_id in relevant_missing_ids:
            doc = missing_docs_map.get(spu_id)
            if not doc:
                # SPU not present in the cached corpus anymore; skip it.
                continue
            missing_relevant.append(
                {
                    "spu_id": spu_id,
                    "label": labels[spu_id],
                    "rerank_score": rerank_scores.get(spu_id),
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )
        # Order the misses: Exact before Partial, then by descending rerank
        # score (docs without a score get -(-inf) = +inf and so sort last),
        # then alphabetically by title as a stable tiebreaker.
        label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}
        missing_relevant.sort(
            key=lambda item: (
                label_order.get(str(item.get("label")), 9),
                -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")),
                str(item.get("title") or ""),
            )
        )
        tips: List[str] = []
        if auto_annotate:
            tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.")
        else:
            tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.")
        if label_stats["total"] == 0:
            tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.")
        if unlabeled_hits:
            tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
        if not missing_relevant:
            tips.append("No cached Exact/Partial products were missed by this recall set.")
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": compute_query_metrics(metric_labels),
            "results": labeled,
            "missing_relevant": missing_relevant,
            "label_stats": {
                **label_stats,
                "unlabeled_hits_treated_irrelevant": unlabeled_hits,
                "recalled_hits": len(labeled),
                "missing_relevant_count": len(missing_relevant),
                "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
                "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),
            },
            "tips": tips,
            "total": int(search_payload.get("total") or 0),
        }
| 1449 | - | |
| 1450 | - def batch_evaluate( | |
| 1451 | - self, | |
| 1452 | - queries: Sequence[str], | |
| 1453 | - *, | |
| 1454 | - top_k: int = 100, | |
| 1455 | - auto_annotate: bool = True, | |
| 1456 | - language: str = "en", | |
| 1457 | - force_refresh_labels: bool = False, | |
| 1458 | - ) -> Dict[str, Any]: | |
| 1459 | - per_query = [] | |
| 1460 | - for query in queries: | |
| 1461 | - live = self.evaluate_live_query( | |
| 1462 | - query, | |
| 1463 | - top_k=top_k, | |
| 1464 | - auto_annotate=auto_annotate, | |
| 1465 | - language=language, | |
| 1466 | - force_refresh_labels=force_refresh_labels, | |
| 1467 | - ) | |
| 1468 | - labels = [ | |
| 1469 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 1470 | - for item in live["results"] | |
| 1471 | - ] | |
| 1472 | - per_query.append( | |
| 1473 | - { | |
| 1474 | - "query": live["query"], | |
| 1475 | - "tenant_id": live["tenant_id"], | |
| 1476 | - "top_k": live["top_k"], | |
| 1477 | - "metrics": live["metrics"], | |
| 1478 | - "distribution": label_distribution(labels), | |
| 1479 | - "total": live["total"], | |
| 1480 | - } | |
| 1481 | - ) | |
| 1482 | - aggregate = aggregate_metrics([item["metrics"] for item in per_query]) | |
| 1483 | - aggregate_distribution = { | |
| 1484 | - RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), | |
| 1485 | - RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query), | |
| 1486 | - RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query), | |
| 1487 | - } | |
| 1488 | - batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" | |
| 1489 | - report_dir = ensure_dir(self.artifact_root / "batch_reports") | |
| 1490 | - config_snapshot_path = report_dir / f"{batch_id}_config.json" | |
| 1491 | - config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json() | |
| 1492 | - config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8") | |
| 1493 | - output_json_path = report_dir / f"{batch_id}.json" | |
| 1494 | - report_md_path = report_dir / f"{batch_id}.md" | |
| 1495 | - payload = { | |
| 1496 | - "batch_id": batch_id, | |
| 1497 | - "created_at": utc_now_iso(), | |
| 1498 | - "tenant_id": self.tenant_id, | |
| 1499 | - "queries": list(queries), | |
| 1500 | - "top_k": top_k, | |
| 1501 | - "aggregate_metrics": aggregate, | |
| 1502 | - "aggregate_distribution": aggregate_distribution, | |
| 1503 | - "per_query": per_query, | |
| 1504 | - "config_snapshot_path": str(config_snapshot_path), | |
| 1505 | - } | |
| 1506 | - output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") | |
| 1507 | - report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8") | |
| 1508 | - self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload) | |
| 1509 | - return payload | |
| 1510 | - | |
| 1511 | - | |
def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
    """Render a batch evaluation payload as a human-readable Markdown report.

    Sections: header metadata, aggregate metrics (sorted by name), an optional
    label-distribution summary, and one subsection per evaluated query.
    """
    out: List[str] = [
        "# Search Batch Evaluation",
        "",
        f"- Batch ID: {payload['batch_id']}",
        f"- Created at: {payload['created_at']}",
        f"- Tenant ID: {payload['tenant_id']}",
        f"- Query count: {len(payload['queries'])}",
        f"- Top K: {payload['top_k']}",
        "",
        "## Aggregate Metrics",
        "",
    ]
    out.extend(f"- {name}: {value}" for name, value in sorted((payload.get("aggregate_metrics") or {}).items()))
    overall_dist = payload.get("aggregate_distribution") or {}
    if overall_dist:
        # Only emitted when an aggregate distribution is present and non-empty.
        out += [
            "",
            "## Label Distribution",
            "",
            f"- Exact: {overall_dist.get(RELEVANCE_EXACT, 0)}",
            f"- Partial: {overall_dist.get(RELEVANCE_PARTIAL, 0)}",
            f"- Irrelevant: {overall_dist.get(RELEVANCE_IRRELEVANT, 0)}",
        ]
    out += ["", "## Per Query", ""]
    for entry in payload.get("per_query") or []:
        out.append(f"### {entry['query']}")
        out.append("")
        out.extend(f"- {name}: {value}" for name, value in sorted((entry.get("metrics") or {}).items()))
        per_dist = entry.get("distribution") or {}
        out.append(f"- Exact: {per_dist.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {per_dist.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {per_dist.get(RELEVANCE_IRRELEVANT, 0)}")
        out.append("")
    return "\n".join(out)
| 1551 | - | |
| 1552 | - | |
class SearchEvalRequest(BaseModel):
    """Request body for POST /api/search-eval (single-query live evaluation)."""

    query: str
    # Recall depth to score; bounded to keep the endpoint responsive.
    top_k: int = Field(default=100, ge=1, le=500)
    # When True, label recalled docs missing from the annotation cache.
    auto_annotate: bool = False
    language: str = "en"
| 1558 | - | |
| 1559 | - | |
class BatchEvalRequest(BaseModel):
    """Request body for POST /api/batch-eval (multi-query evaluation)."""

    # When omitted/None, the server falls back to its configured query file.
    queries: Optional[List[str]] = None
    top_k: int = Field(default=100, ge=1, le=500)
    auto_annotate: bool = False
    language: str = "en"
    # Re-annotate even already-labeled docs (only meaningful with auto_annotate).
    force_refresh_labels: bool = False
| 1566 | - | |
| 1567 | - | |
def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFAULT_QUERY_FILE) -> FastAPI:
    """Create the FastAPI app serving the evaluation UI and its JSON API.

    Routes:
        GET  /                               -> inline single-page UI (WEB_APP_HTML)
        GET  /api/queries                    -> query list loaded from ``query_file``
        POST /api/search-eval                -> single-query live evaluation
        POST /api/batch-eval                 -> batch evaluation over many queries
        GET  /api/history                    -> recent batch runs
        GET  /api/history/{batch_id}/report  -> batch report markdown

    Args:
        framework: Evaluation framework instance the endpoints delegate to.
        query_file: Text file providing the default query list.

    Returns:
        The configured FastAPI application.
    """
    app = FastAPI(title="Search Evaluation UI", version="1.0.0")

    @app.get("/", response_class=HTMLResponse)
    def home() -> str:
        # Single-page UI; all assets are inlined in the WEB_APP_HTML constant.
        return WEB_APP_HTML

    @app.get("/api/queries")
    def api_queries() -> Dict[str, Any]:
        return {"queries": framework.queries_from_file(query_file)}

    @app.post("/api/search-eval")
    def api_search_eval(request: SearchEvalRequest) -> Dict[str, Any]:
        return framework.evaluate_live_query(
            query=request.query,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
        )

    @app.post("/api/batch-eval")
    def api_batch_eval(request: BatchEvalRequest) -> Dict[str, Any]:
        # Fall back to the configured query file when the client sends none.
        queries = request.queries or framework.queries_from_file(query_file)
        if not queries:
            raise HTTPException(status_code=400, detail="No queries provided")
        return framework.batch_evaluate(
            queries=queries,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
            force_refresh_labels=request.force_refresh_labels,
        )

    @app.get("/api/history")
    def api_history() -> Dict[str, Any]:
        return {"history": framework.store.list_batch_runs(limit=20)}

    @app.get("/api/history/{batch_id}/report")
    def api_history_report(batch_id: str) -> Dict[str, Any]:
        row = framework.store.get_batch_run(batch_id)
        if row is None:
            raise HTTPException(status_code=404, detail="Unknown batch_id")
        report_path = Path(row["report_markdown_path"]).resolve()
        root = framework.artifact_root.resolve()
        # Path-traversal guard: only serve report files under the artifact root.
        try:
            report_path.relative_to(root)
        except ValueError:
            raise HTTPException(status_code=403, detail="Report path is outside artifact root")
        if not report_path.is_file():
            raise HTTPException(status_code=404, detail="Report file not found")
        return {
            "batch_id": row["batch_id"],
            "created_at": row["created_at"],
            "tenant_id": row["tenant_id"],
            "report_markdown_path": str(report_path),
            "markdown": report_path.read_text(encoding="utf-8"),
        }

    return app
| 1627 | - | |
| 1628 | - | |
# Single-file web UI served at "/" by create_web_app(). All CSS/JS is inlined;
# markdown rendering of batch reports uses CDN-hosted marked + DOMPurify (the
# rendered HTML is sanitized before insertion). This is a runtime string
# literal, so its content must not be edited for style.
# NOTE(review): a few characters below (e.g. "ยท") look like mojibake of UTF-8
# punctuation — confirm encoding against the original file before relying on it.
WEB_APP_HTML = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Search Evaluation</title>
  <style>
    :root {
      --bg: #f5f3ed;
      --panel: #fffdf8;
      --ink: #1f2a24;
      --muted: #6b756e;
      --line: #ddd4c6;
      --accent: #0f766e;
      --exact: #0f766e;
      --partial: #b7791f;
      --irrelevant: #b42318;
    }
    body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background:
      radial-gradient(circle at top left, #f0e6d6 0, transparent 28%),
      linear-gradient(180deg, #f9f6f0 0%, #f0ece3 100%); }
    .app { display: grid; grid-template-columns: 280px 1fr; min-height: 100vh; }
    .sidebar { border-right: 1px solid var(--line); padding: 20px; background: rgba(255,255,255,0.55); backdrop-filter: blur(10px); }
    .main { padding: 24px; }
    h1, h2 { margin: 0 0 12px; }
    .muted { color: var(--muted); }
    .query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; }
    .query-item {
      display: block; width: 100%; border: 0; background: transparent; text-align: left;
      padding: 10px 12px; border-radius: 10px; cursor: pointer;
      color: var(--ink); font-size: 15px; font-weight: 500;
    }
    .query-item:hover { background: #eef6f4; }
    .toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; }
    input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; }
    button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; }
    button.secondary { background: #d9e6e3; color: #12433d; }
    .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; }
    .metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; }
    .metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; }
    .metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; }
    .results { display: grid; gap: 10px; }
    .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }
    .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }
    .Exact { background: var(--exact); }
    .Partial { background: var(--partial); }
    .Irrelevant { background: var(--irrelevant); }
    .Unknown { background: #637381; }
    .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }
    .title { font-size: 16px; font-weight: 700; margin-bottom: 8px; }
    .options { color: var(--muted); line-height: 1.5; font-size: 14px; }
    .section { margin-bottom: 28px; }
    .history { font-size: 13px; line-height: 1.5; }
    .history-list { max-height: 42vh; overflow: auto; display: flex; flex-direction: column; gap: 8px; margin-top: 8px; }
    .history-item {
      display: block; width: 100%; border: 1px solid var(--line); background: var(--panel);
      text-align: left; padding: 10px 12px; border-radius: 12px; cursor: pointer;
      color: var(--ink); font-size: 13px; transition: background 0.15s, border-color 0.15s, box-shadow 0.15s;
    }
    .history-item:hover { background: #eef6f4; border-color: #b8d4cd; }
    .history-item:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; }
    .history-item .hid { font-weight: 700; font-size: 12px; word-break: break-all; color: #12433d; }
    .history-item .hmeta { color: var(--muted); font-size: 11px; margin-top: 4px; }
    .history-item .hstats { margin-top: 6px; font-size: 12px; color: var(--ink); line-height: 1.45; }
    .history-item .hstats span { color: var(--muted); }
    .report-modal-root {
      position: fixed; inset: 0; z-index: 200; display: none; align-items: center; justify-content: center;
      padding: 16px; box-sizing: border-box;
    }
    .report-modal-root.is-open { display: flex; }
    .report-modal-backdrop { position: absolute; inset: 0; background: rgba(31, 42, 36, 0.45); backdrop-filter: blur(4px); }
    .report-modal-dialog {
      position: relative; z-index: 1; width: min(920px, 100%); max-height: min(92vh, 900px); display: flex; flex-direction: column;
      background: var(--panel); border: 1px solid var(--line); border-radius: 18px;
      box-shadow: 0 24px 48px rgba(31, 42, 36, 0.18);
    }
    .report-modal-head {
      flex: 0 0 auto; display: flex; align-items: flex-start; justify-content: space-between; gap: 12px;
      padding: 16px 18px; border-bottom: 1px solid var(--line);
    }
    .report-modal-head h3 { margin: 0; font-size: 15px; font-weight: 700; word-break: break-all; }
    .report-modal-head .head-actions { display: flex; gap: 8px; flex-shrink: 0; }
    .report-modal-head button { padding: 8px 12px; font-size: 13px; border-radius: 10px; }
    .report-modal-meta { flex: 0 0 auto; padding: 10px 18px; font-size: 12px; border-bottom: 1px solid var(--line); background: rgba(255,253,248,0.9); }
    .report-modal-body {
      flex: 1 1 auto; overflow: auto; padding: 18px 22px 22px;
      font-size: 14px; line-height: 1.55;
    }
    .batch-report-md h1 { font-size: 1.35rem; margin: 0 0 0.75rem; color: #12433d; }
    .batch-report-md h2 { font-size: 1.05rem; margin: 1.35rem 0 0.6rem; padding-bottom: 0.35rem; border-bottom: 1px solid var(--line); color: #1a5249; }
    .batch-report-md h2:first-of-type { margin-top: 0; }
    .batch-report-md h3 { font-size: 0.95rem; margin: 1rem 0 0.4rem; color: var(--ink); font-weight: 700; }
    .batch-report-md ul { margin: 0.35rem 0 0.5rem; padding-left: 1.25rem; }
    .batch-report-md li { margin: 0.2rem 0; }
    .batch-report-md code { font-size: 0.88em; background: #e8e4d8; padding: 0.12em 0.35em; border-radius: 4px; }
    .report-modal-body.report-modal-loading, .report-modal-body.report-modal-error { color: var(--muted); font-style: italic; }
    .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; }
    .tip { margin-bottom: 6px; color: var(--muted); }
  </style>
</head>
<body>
  <div class="app">
    <aside class="sidebar">
      <h2>Queries</h2>
      <p class="muted">Loaded from <code>scripts/evaluation/queries/queries.txt</code></p>
      <div id="queryList" class="query-list"></div>
      <div class="section">
        <h2>History</h2>
        <p class="muted" style="font-size:12px;margin:0 0 4px">Click a run to open the batch markdown report.</p>
        <div id="history" class="history muted">Loading...</div>
      </div>
    </aside>
    <main class="main">
      <h1>Search Evaluation</h1>
      <p class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>
      <div class="toolbar">
        <input id="queryInput" type="text" placeholder="Search query" />
        <button onclick="runSingle()">Evaluate Query</button>
        <button class="secondary" onclick="runBatch()">Batch Evaluation</button>
      </div>
      <div id="status" class="muted section"></div>
      <section class="section">
        <h2>Metrics</h2>
        <div id="metrics" class="grid"></div>
      </section>
      <section class="section">
        <h2>Top Results</h2>
        <div id="results" class="results"></div>
      </section>
      <section class="section">
        <h2>Missed Exact / Partial</h2>
        <div id="missingRelevant" class="results"></div>
      </section>
      <section class="section">
        <h2>Notes</h2>
        <div id="tips" class="tips muted"></div>
      </section>
    </main>
  </div>
  <div id="reportModal" class="report-modal-root" aria-hidden="true">
    <div class="report-modal-backdrop" data-close-report="1"></div>
    <div class="report-modal-dialog" role="dialog" aria-modal="true" aria-labelledby="reportModalTitle">
      <div class="report-modal-head">
        <h3 id="reportModalTitle">Batch report</h3>
        <div class="head-actions">
          <button type="button" class="secondary" id="reportCopyPath">Copy path</button>
          <button type="button" onclick="closeReportModal()">Close</button>
        </div>
      </div>
      <div id="reportModalMeta" class="report-modal-meta muted"></div>
      <div id="reportModalBody" class="report-modal-body batch-report-md"></div>
    </div>
  </div>
  <script src="https://cdn.jsdelivr.net/npm/marked@12.0.2/marked.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/dompurify@3.1.6/dist/purify.min.js"></script>
  <script>
    async function fetchJSON(url, options) {
      const res = await fetch(url, options);
      if (!res.ok) throw new Error(await res.text());
      return await res.json();
    }
    function renderMetrics(metrics) {
      const root = document.getElementById('metrics');
      root.innerHTML = '';
      Object.entries(metrics || {}).forEach(([key, value]) => {
        const card = document.createElement('div');
        card.className = 'metric';
        card.innerHTML = `<div class="label">${key}</div><div class="value">${value}</div>`;
        root.appendChild(card);
      });
    }
    function renderResults(results, rootId='results', showRank=true) {
      const mount = document.getElementById(rootId);
      mount.innerHTML = '';
      (results || []).forEach(item => {
        const label = item.label || 'Unknown';
        const box = document.createElement('div');
        box.className = 'result';
        box.innerHTML = `
          <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
          <img class="thumb" src="${item.image_url || ''}" alt="" />
          <div>
            <div class="title">${item.title || ''}</div>
            <div class="options">
              <div>${(item.option_values || [])[0] || ''}</div>
              <div>${(item.option_values || [])[1] || ''}</div>
              <div>${(item.option_values || [])[2] || ''}</div>
            </div>
          </div>`;
        mount.appendChild(box);
      });
      if (!(results || []).length) {
        mount.innerHTML = '<div class="muted">None.</div>';
      }
    }
    function renderTips(data) {
      const root = document.getElementById('tips');
      const tips = [...(data.tips || [])];
      const stats = data.label_stats || {};
      tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`);
      root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');
    }
    async function loadQueries() {
      const data = await fetchJSON('/api/queries');
      const root = document.getElementById('queryList');
      root.innerHTML = '';
      data.queries.forEach(query => {
        const btn = document.createElement('button');
        btn.className = 'query-item';
        btn.textContent = query;
        btn.onclick = () => {
          document.getElementById('queryInput').value = query;
          runSingle();
        };
        root.appendChild(btn);
      });
    }
    function fmtMetric(m, key, digits) {
      const v = m && m[key];
      if (v == null || Number.isNaN(Number(v))) return null;
      const n = Number(v);
      return n.toFixed(digits);
    }
    function historySummaryHtml(meta) {
      const m = meta && meta.aggregate_metrics;
      const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
      const parts = [];
      if (nq != null) parts.push(`<span>Queries</span> ${nq}`);
      const p10 = fmtMetric(m, 'P@10', 3);
      const p52 = fmtMetric(m, 'P@5_2_3', 3);
      const map3 = fmtMetric(m, 'MAP_3', 3);
      if (p10) parts.push(`<span>P@10</span> ${p10}`);
      if (p52) parts.push(`<span>P@5_2_3</span> ${p52}`);
      if (map3) parts.push(`<span>MAP_3</span> ${map3}`);
      if (!parts.length) return '';
      return `<div class="hstats">${parts.join(' ยท ')}</div>`;
    }
    async function loadHistory() {
      const data = await fetchJSON('/api/history');
      const root = document.getElementById('history');
      root.classList.remove('muted');
      const items = data.history || [];
      if (!items.length) {
        root.innerHTML = '<span class="muted">No history yet.</span>';
        return;
      }
      root.innerHTML = `<div class="history-list"></div>`;
      const list = root.querySelector('.history-list');
      items.forEach(item => {
        const btn = document.createElement('button');
        btn.type = 'button';
        btn.className = 'history-item';
        btn.setAttribute('aria-label', `Open report ${item.batch_id}`);
        const sum = historySummaryHtml(item.metadata);
        btn.innerHTML = `<div class="hid">${item.batch_id}</div>
          <div class="hmeta">${item.created_at} ยท tenant ${item.tenant_id}</div>${sum}`;
        btn.onclick = () => openBatchReport(item.batch_id);
        list.appendChild(btn);
      });
    }
    let _lastReportPath = '';
    function closeReportModal() {
      const el = document.getElementById('reportModal');
      el.classList.remove('is-open');
      el.setAttribute('aria-hidden', 'true');
      document.getElementById('reportModalBody').innerHTML = '';
      document.getElementById('reportModalMeta').textContent = '';
    }
    async function openBatchReport(batchId) {
      const el = document.getElementById('reportModal');
      const body = document.getElementById('reportModalBody');
      const metaEl = document.getElementById('reportModalMeta');
      const titleEl = document.getElementById('reportModalTitle');
      el.classList.add('is-open');
      el.setAttribute('aria-hidden', 'false');
      titleEl.textContent = batchId;
      metaEl.textContent = '';
      body.className = 'report-modal-body batch-report-md report-modal-loading';
      body.textContent = 'Loading reportโฆ';
      try {
        const rep = await fetchJSON('/api/history/' + encodeURIComponent(batchId) + '/report');
        _lastReportPath = rep.report_markdown_path || '';
        metaEl.textContent = rep.report_markdown_path || '';
        const raw = marked.parse(rep.markdown || '', { gfm: true });
        const safe = DOMPurify.sanitize(raw, { USE_PROFILES: { html: true } });
        body.className = 'report-modal-body batch-report-md';
        body.innerHTML = safe;
      } catch (e) {
        body.className = 'report-modal-body report-modal-error';
        body.textContent = (e && e.message) ? e.message : String(e);
      }
    }
    document.getElementById('reportModal').addEventListener('click', (ev) => {
      if (ev.target && ev.target.getAttribute('data-close-report') === '1') closeReportModal();
    });
    document.addEventListener('keydown', (ev) => {
      if (ev.key === 'Escape') closeReportModal();
    });
    document.getElementById('reportCopyPath').addEventListener('click', async () => {
      if (!_lastReportPath) return;
      try {
        await navigator.clipboard.writeText(_lastReportPath);
      } catch (_) {}
    });
    async function runSingle() {
      const query = document.getElementById('queryInput').value.trim();
      if (!query) return;
      document.getElementById('status').textContent = `Evaluating "${query}"...`;
      const data = await fetchJSON('/api/search-eval', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({query, top_k: 100, auto_annotate: false})
      });
      document.getElementById('status').textContent = `Done. total=${data.total}`;
      renderMetrics(data.metrics);
      renderResults(data.results, 'results', true);
      renderResults(data.missing_relevant, 'missingRelevant', false);
      renderTips(data);
      loadHistory();
    }
    async function runBatch() {
      document.getElementById('status').textContent = 'Running batch evaluation...';
      const data = await fetchJSON('/api/batch-eval', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({top_k: 100, auto_annotate: false})
      });
      document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`;
      renderMetrics(data.aggregate_metrics);
      renderResults([], 'results', true);
      renderResults([], 'missingRelevant', false);
      document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>';
      loadHistory();
    }
    loadQueries();
    loadHistory();
  </script>
</body>
</html>
"""
| 1970 | - | |
| 1971 | - | |
| 1972 | -def build_cli_parser() -> argparse.ArgumentParser: | |
| 1973 | - parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI") | |
| 1974 | - sub = parser.add_subparsers(dest="command", required=True) | |
| 1975 | - | |
| 1976 | - build = sub.add_parser("build", help="Build pooled annotation set for queries") | |
| 1977 | - build.add_argument("--tenant-id", default="163") | |
| 1978 | - build.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | |
| 1979 | - build.add_argument("--search-depth", type=int, default=1000) | |
| 1980 | - build.add_argument("--rerank-depth", type=int, default=10000) | |
| 1981 | - build.add_argument("--annotate-search-top-k", type=int, default=120) | |
| 1982 | - build.add_argument("--annotate-rerank-top-k", type=int, default=200) | |
| 1983 | - build.add_argument("--language", default="en") | |
| 1984 | - build.add_argument("--force-refresh-rerank", action="store_true") | |
| 1985 | - build.add_argument("--force-refresh-labels", action="store_true") | |
| 1986 | - build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 1987 | - | |
| 1988 | - batch = sub.add_parser("batch", help="Run batch evaluation against live search") | |
| 1989 | - batch.add_argument("--tenant-id", default="163") | |
| 1990 | - batch.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | |
| 1991 | - batch.add_argument("--top-k", type=int, default=100) | |
| 1992 | - batch.add_argument("--language", default="en") | |
| 1993 | - batch.add_argument("--force-refresh-labels", action="store_true") | |
| 1994 | - batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 1995 | - | |
| 1996 | - audit = sub.add_parser("audit", help="Audit annotation quality for queries") | |
| 1997 | - audit.add_argument("--tenant-id", default="163") | |
| 1998 | - audit.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | |
| 1999 | - audit.add_argument("--top-k", type=int, default=100) | |
| 2000 | - audit.add_argument("--language", default="en") | |
| 2001 | - audit.add_argument("--limit-suspicious", type=int, default=5) | |
| 2002 | - audit.add_argument("--force-refresh-labels", action="store_true") | |
| 2003 | - audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 2004 | - | |
| 2005 | - serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") | |
| 2006 | - serve.add_argument("--tenant-id", default="163") | |
| 2007 | - serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | |
| 2008 | - serve.add_argument("--host", default="0.0.0.0") | |
| 2009 | - serve.add_argument("--port", type=int, default=6010) | |
| 2010 | - serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 2011 | - | |
| 2012 | - return parser | |
| 2013 | - | |
| 2014 | - | |
| 2015 | -def run_build(args: argparse.Namespace) -> None: | |
| 2016 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 2017 | - queries = framework.queries_from_file(Path(args.queries_file)) | |
| 2018 | - summary = [] | |
| 2019 | - for query in queries: | |
| 2020 | - result = framework.build_query_annotation_set( | |
| 2021 | - query=query, | |
| 2022 | - search_depth=args.search_depth, | |
| 2023 | - rerank_depth=args.rerank_depth, | |
| 2024 | - annotate_search_top_k=args.annotate_search_top_k, | |
| 2025 | - annotate_rerank_top_k=args.annotate_rerank_top_k, | |
| 2026 | - language=args.language, | |
| 2027 | - force_refresh_rerank=args.force_refresh_rerank, | |
| 2028 | - force_refresh_labels=args.force_refresh_labels, | |
| 2029 | - ) | |
| 2030 | - summary.append( | |
| 2031 | - { | |
| 2032 | - "query": result.query, | |
| 2033 | - "search_total": result.search_total, | |
| 2034 | - "search_depth": result.search_depth, | |
| 2035 | - "rerank_corpus_size": result.rerank_corpus_size, | |
| 2036 | - "annotated_count": result.annotated_count, | |
| 2037 | - "output_json_path": str(result.output_json_path), | |
| 2038 | - } | |
| 2039 | - ) | |
| 2040 | - print( | |
| 2041 | - f"[build] query={result.query!r} search_total={result.search_total} " | |
| 2042 | - f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} " | |
| 2043 | - f"annotated={result.annotated_count} output={result.output_json_path}" | |
| 2044 | - ) | |
| 2045 | - out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json" | |
| 2046 | - out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") | |
| 2047 | - print(f"[done] summary={out_path}") | |
| 2048 | - | |
| 2049 | - | |
| 2050 | -def run_batch(args: argparse.Namespace) -> None: | |
| 2051 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 2052 | - queries = framework.queries_from_file(Path(args.queries_file)) | |
| 2053 | - payload = framework.batch_evaluate( | |
| 2054 | - queries=queries, | |
| 2055 | - top_k=args.top_k, | |
| 2056 | - auto_annotate=True, | |
| 2057 | - language=args.language, | |
| 2058 | - force_refresh_labels=args.force_refresh_labels, | |
| 2059 | - ) | |
| 2060 | - print(f"[done] batch_id={payload['batch_id']} aggregate_metrics={payload['aggregate_metrics']}") | |
| 2061 | - | |
| 2062 | - | |
| 2063 | -def run_audit(args: argparse.Namespace) -> None: | |
| 2064 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 2065 | - queries = framework.queries_from_file(Path(args.queries_file)) | |
| 2066 | - audit_items = [] | |
| 2067 | - for query in queries: | |
| 2068 | - item = framework.audit_live_query( | |
| 2069 | - query=query, | |
| 2070 | - top_k=args.top_k, | |
| 2071 | - language=args.language, | |
| 2072 | - auto_annotate=not args.force_refresh_labels, | |
| 2073 | - ) | |
| 2074 | - if args.force_refresh_labels: | |
| 2075 | - live_payload = framework.search_client.search(query=query, size=max(args.top_k, 100), from_=0, language=args.language) | |
| 2076 | - framework.annotate_missing_labels( | |
| 2077 | - query=query, | |
| 2078 | - docs=list(live_payload.get("results") or [])[: args.top_k], | |
| 2079 | - force_refresh=True, | |
| 2080 | - ) | |
| 2081 | - item = framework.audit_live_query( | |
| 2082 | - query=query, | |
| 2083 | - top_k=args.top_k, | |
| 2084 | - language=args.language, | |
| 2085 | - auto_annotate=False, | |
| 2086 | - ) | |
| 2087 | - audit_items.append( | |
| 2088 | - { | |
| 2089 | - "query": query, | |
| 2090 | - "metrics": item["metrics"], | |
| 2091 | - "distribution": item["distribution"], | |
| 2092 | - "suspicious_count": len(item["suspicious"]), | |
| 2093 | - "suspicious_examples": item["suspicious"][: args.limit_suspicious], | |
| 2094 | - } | |
| 2095 | - ) | |
| 2096 | - print( | |
| 2097 | - f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}" | |
| 2098 | - ) | |
| 2099 | - | |
| 2100 | - summary = { | |
| 2101 | - "created_at": utc_now_iso(), | |
| 2102 | - "tenant_id": args.tenant_id, | |
| 2103 | - "top_k": args.top_k, | |
| 2104 | - "query_count": len(queries), | |
| 2105 | - "total_suspicious": sum(item["suspicious_count"] for item in audit_items), | |
| 2106 | - "queries": audit_items, | |
| 2107 | - } | |
| 2108 | - out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json" | |
| 2109 | - out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") | |
| 2110 | - print(f"[done] audit={out_path}") | |
| 2111 | - | |
| 2112 | - | |
| 2113 | -def run_serve(args: argparse.Namespace) -> None: | |
| 2114 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 2115 | - app = create_web_app(framework, Path(args.queries_file)) | |
| 2116 | - import uvicorn | |
| 2117 | - | |
| 2118 | - uvicorn.run(app, host=args.host, port=args.port, log_level="info") | |
| 2119 | - | |
| 2120 | - | |
| 2121 | -def main() -> None: | |
| 2122 | - parser = build_cli_parser() | |
| 2123 | - args = parser.parse_args() | |
| 2124 | - if args.command == "build": | |
| 2125 | - run_build(args) | |
| 2126 | - return | |
| 2127 | - if args.command == "batch": | |
| 2128 | - run_batch(args) | |
| 2129 | - return | |
| 2130 | - if args.command == "audit": | |
| 2131 | - run_audit(args) | |
| 2132 | - return | |
| 2133 | - if args.command == "serve": | |
| 2134 | - run_serve(args) | |
| 2135 | - return | |
| 2136 | - raise SystemExit(f"unknown command: {args.command}") | |
| 2137 | - | |
| 2138 | - | |
| 2139 | -if __name__ == "__main__": | |
| 2140 | - main() |
| ... | ... | @@ -0,0 +1,59 @@ |
| 1 | +""" | |
| 2 | +Search evaluation framework: pooled relevance annotation, live metrics, batch reports. | |
| 3 | + | |
| 4 | +Importing this package ensures the project root is on ``sys.path`` (for ``api.*`` imports). | |
| 5 | +""" | |
| 6 | + | |
| 7 | +from __future__ import annotations | |
| 8 | + | |
| 9 | +from .utils import ensure_project_on_path | |
| 10 | + | |
| 11 | +ensure_project_on_path() | |
| 12 | + | |
| 13 | +from .constants import ( # noqa: E402 | |
| 14 | + DEFAULT_ARTIFACT_ROOT, | |
| 15 | + DEFAULT_LABELER_MODE, | |
| 16 | + DEFAULT_QUERY_FILE, | |
| 17 | + JUDGE_PROMPT_VERSION_COMPLEX, | |
| 18 | + JUDGE_PROMPT_VERSION_SIMPLE, | |
| 19 | + PROJECT_ROOT, | |
| 20 | + RELEVANCE_EXACT, | |
| 21 | + RELEVANCE_IRRELEVANT, | |
| 22 | + RELEVANCE_PARTIAL, | |
| 23 | + VALID_LABELS, | |
| 24 | +) | |
| 25 | +from .framework import SearchEvaluationFramework # noqa: E402 | |
| 26 | +from .store import EvalStore, QueryBuildResult # noqa: E402 | |
| 27 | +from .cli import build_cli_parser, main # noqa: E402 | |
| 28 | +from .web_app import create_web_app # noqa: E402 | |
| 29 | +from .reports import render_batch_report_markdown # noqa: E402 | |
| 30 | +from .utils import ( # noqa: E402 | |
| 31 | + ensure_dir, | |
| 32 | + sha1_text, | |
| 33 | + utc_now_iso, | |
| 34 | + utc_timestamp, | |
| 35 | +) | |
| 36 | + | |
| 37 | +__all__ = [ | |
| 38 | + "DEFAULT_ARTIFACT_ROOT", | |
| 39 | + "DEFAULT_LABELER_MODE", | |
| 40 | + "DEFAULT_QUERY_FILE", | |
| 41 | + "EvalStore", | |
| 42 | + "JUDGE_PROMPT_VERSION_COMPLEX", | |
| 43 | + "JUDGE_PROMPT_VERSION_SIMPLE", | |
| 44 | + "PROJECT_ROOT", | |
| 45 | + "QueryBuildResult", | |
| 46 | + "RELEVANCE_EXACT", | |
| 47 | + "RELEVANCE_IRRELEVANT", | |
| 48 | + "RELEVANCE_PARTIAL", | |
| 49 | + "SearchEvaluationFramework", | |
| 50 | + "VALID_LABELS", | |
| 51 | + "build_cli_parser", | |
| 52 | + "create_web_app", | |
| 53 | + "ensure_dir", | |
| 54 | + "main", | |
| 55 | + "render_batch_report_markdown", | |
| 56 | + "sha1_text", | |
| 57 | + "utc_now_iso", | |
| 58 | + "utc_timestamp", | |
| 59 | +] | ... | ... |
| ... | ... | @@ -0,0 +1,22 @@ |
| 1 | +"""Pydantic request bodies for the evaluation FastAPI app.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from typing import List, Optional | |
| 6 | + | |
| 7 | +from pydantic import BaseModel, Field | |
| 8 | + | |
| 9 | + | |
class SearchEvalRequest(BaseModel):
    """Request body for single-query live search evaluation (POST /api/search-eval)."""

    # Query text to evaluate against the live search service.
    query: str
    # Number of top results to score; server-enforced bounds 1..500.
    top_k: int = Field(default=100, ge=1, le=500)
    # NOTE(review): presumably triggers on-the-fly LLM labeling of unlabeled
    # docs when True — confirm against the web_app handler.
    auto_annotate: bool = False
    # Query language code passed through to the search service.
    language: str = "en"
| 16 | + | |
class BatchEvalRequest(BaseModel):
    """Request body for batch evaluation (POST /api/batch-eval)."""

    # Explicit query list; None means use the server's configured queries
    # file (presumably — verify against the web_app handler).
    queries: Optional[List[str]] = None
    # Number of top results to score per query; server-enforced bounds 1..500.
    top_k: int = Field(default=100, ge=1, le=500)
    # NOTE(review): presumably allows on-the-fly labeling when True — confirm.
    auto_annotate: bool = False
    # Query language code passed through to the search service.
    language: str = "en"
    # When True, existing cached relevance labels are re-judged.
    force_refresh_labels: bool = False
| ... | ... | @@ -0,0 +1,179 @@ |
| 1 | +"""CLI: build annotations, batch eval, audit, serve web UI.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +import argparse | |
| 6 | +import json | |
| 7 | +from pathlib import Path | |
| 8 | + | |
| 9 | +from .constants import DEFAULT_LABELER_MODE, DEFAULT_QUERY_FILE | |
| 10 | +from .framework import SearchEvaluationFramework | |
| 11 | +from .utils import ensure_dir, utc_now_iso, utc_timestamp | |
| 12 | +from .web_app import create_web_app | |
| 13 | + | |
| 14 | + | |
def build_cli_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with build/batch/audit/serve subcommands.

    All four subcommands accept ``--tenant-id``, ``--queries-file`` and
    ``--labeler-mode`` with identical defaults. Previously each subparser
    repeated those three definitions; they are now added via one local helper
    so the subcommands cannot drift apart. Flag names, defaults and choices
    are unchanged, so the CLI contract is preserved.

    Returns:
        The configured top-level ``argparse.ArgumentParser``.
    """

    def _add_common(p: argparse.ArgumentParser) -> None:
        # Flags shared by every subcommand (same names, defaults, choices).
        p.add_argument("--tenant-id", default="163")
        p.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
        p.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])

    parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
    sub = parser.add_subparsers(dest="command", required=True)

    build = sub.add_parser("build", help="Build pooled annotation set for queries")
    _add_common(build)
    build.add_argument("--search-depth", type=int, default=1000)
    build.add_argument("--rerank-depth", type=int, default=10000)
    build.add_argument("--annotate-search-top-k", type=int, default=120)
    build.add_argument("--annotate-rerank-top-k", type=int, default=200)
    build.add_argument("--language", default="en")
    build.add_argument("--force-refresh-rerank", action="store_true")
    build.add_argument("--force-refresh-labels", action="store_true")

    batch = sub.add_parser("batch", help="Run batch evaluation against live search")
    _add_common(batch)
    batch.add_argument("--top-k", type=int, default=100)
    batch.add_argument("--language", default="en")
    batch.add_argument("--force-refresh-labels", action="store_true")

    audit = sub.add_parser("audit", help="Audit annotation quality for queries")
    _add_common(audit)
    audit.add_argument("--top-k", type=int, default=100)
    audit.add_argument("--language", default="en")
    audit.add_argument("--limit-suspicious", type=int, default=5)
    audit.add_argument("--force-refresh-labels", action="store_true")

    serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
    _add_common(serve)
    serve.add_argument("--host", default="0.0.0.0")
    serve.add_argument("--port", type=int, default=6010)

    return parser
| 56 | + | |
| 57 | + | |
def run_build(args: argparse.Namespace) -> None:
    """Build the pooled annotation set for every query in the queries file.

    Prints one progress line per query and writes a JSON build summary
    under ``<artifact_root>/query_builds/``.
    """
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
    rows = []
    for query in framework.queries_from_file(Path(args.queries_file)):
        result = framework.build_query_annotation_set(
            query=query,
            search_depth=args.search_depth,
            rerank_depth=args.rerank_depth,
            annotate_search_top_k=args.annotate_search_top_k,
            annotate_rerank_top_k=args.annotate_rerank_top_k,
            language=args.language,
            force_refresh_rerank=args.force_refresh_rerank,
            force_refresh_labels=args.force_refresh_labels,
        )
        row = {
            "query": result.query,
            "search_total": result.search_total,
            "search_depth": result.search_depth,
            "rerank_corpus_size": result.rerank_corpus_size,
            "annotated_count": result.annotated_count,
            "output_json_path": str(result.output_json_path),
        }
        rows.append(row)
        print(
            f"[build] query={result.query!r} search_total={result.search_total} "
            f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
            f"annotated={result.annotated_count} output={result.output_json_path}"
        )
    summary_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
    summary_path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] summary={summary_path}")
| 91 | + | |
| 92 | + | |
def run_batch(args: argparse.Namespace) -> None:
    """Run a batch evaluation over the query file and print the report id."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
    query_list = framework.queries_from_file(Path(args.queries_file))
    report = framework.batch_evaluate(
        queries=query_list,
        top_k=args.top_k,
        auto_annotate=True,
        language=args.language,
        force_refresh_labels=args.force_refresh_labels,
    )
    print(f"[done] batch_id={report['batch_id']} aggregate_metrics={report['aggregate_metrics']}")
| 104 | + | |
| 105 | + | |
def run_audit(args: argparse.Namespace) -> None:
    """Audit annotation quality for each query and write a JSON summary.

    With ``--force-refresh-labels``, labels for the live top-k are re-judged
    first and the audit then runs with ``auto_annotate=False``; otherwise a
    single audit pass runs with ``auto_annotate=True``.

    Fix: the previous version always ran a full ``audit_live_query`` pass up
    front and, in the force-refresh path, discarded that result before
    re-labeling and auditing again — the wasted first pass is now skipped.
    """
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
    queries = framework.queries_from_file(Path(args.queries_file))
    audit_items = []
    for query in queries:
        if args.force_refresh_labels:
            # Re-judge labels for the live top-k, then audit with the fresh
            # labels only (no auto-annotation during the audit pass).
            live_payload = framework.search_client.search(query=query, size=max(args.top_k, 100), from_=0, language=args.language)
            framework.annotate_missing_labels(
                query=query,
                docs=list(live_payload.get("results") or [])[: args.top_k],
                force_refresh=True,
            )
            item = framework.audit_live_query(
                query=query,
                top_k=args.top_k,
                language=args.language,
                auto_annotate=False,
            )
        else:
            item = framework.audit_live_query(
                query=query,
                top_k=args.top_k,
                language=args.language,
                auto_annotate=True,
            )
        audit_items.append(
            {
                "query": query,
                "metrics": item["metrics"],
                "distribution": item["distribution"],
                "suspicious_count": len(item["suspicious"]),
                "suspicious_examples": item["suspicious"][: args.limit_suspicious],
            }
        )
        print(
            f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}"
        )

    summary = {
        "created_at": utc_now_iso(),
        "tenant_id": args.tenant_id,
        "top_k": args.top_k,
        "query_count": len(queries),
        "total_suspicious": sum(item["suspicious_count"] for item in audit_items),
        "queries": audit_items,
    }
    out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"
    out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] audit={out_path}")
| 154 | + | |
| 155 | + | |
def run_serve(args: argparse.Namespace) -> None:
    """Serve the evaluation web UI via uvicorn (blocks until shutdown)."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
    web_app = create_web_app(framework, Path(args.queries_file))
    # Imported lazily so the other CLI commands do not require uvicorn.
    import uvicorn

    uvicorn.run(web_app, host=args.host, port=args.port, log_level="info")
| 162 | + | |
| 163 | + | |
def main() -> None:
    """CLI entry point: parse arguments and dispatch to the subcommand handler."""
    args = build_cli_parser().parse_args()
    handlers = {
        "build": run_build,
        "batch": run_batch,
        "audit": run_audit,
        "serve": run_serve,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # Unreachable in practice: add_subparsers(required=True) rejects
        # unknown commands during parsing.
        raise SystemExit(f"unknown command: {args.command}")
    handler(args)
| ... | ... | @@ -0,0 +1,149 @@ |
| 1 | +"""HTTP clients for search API, reranker, and DashScope chat (relevance labeling).""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from typing import Any, Dict, List, Optional, Sequence, Tuple | |
| 6 | + | |
| 7 | +import requests | |
| 8 | + | |
| 9 | +from .constants import VALID_LABELS | |
| 10 | +from .prompts import ( | |
| 11 | + classify_batch_complex_prompt, | |
| 12 | + classify_batch_simple_prompt, | |
| 13 | + extract_query_profile_prompt, | |
| 14 | +) | |
| 15 | +from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps | |
| 16 | + | |
| 17 | + | |
class SearchServiceClient:
    """Thin HTTP client for the search service's ``POST /search/`` endpoint."""

    def __init__(self, base_url: str, tenant_id: str):
        self.base_url = base_url.rstrip("/")
        self.tenant_id = str(tenant_id)
        self.session = requests.Session()

    def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]:
        """POST a search request and return the decoded JSON payload.

        Raises ``requests.HTTPError`` on non-2xx responses.
        """
        url = f"{self.base_url}/search/"
        headers = {"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}
        body = {"query": query, "size": size, "from": from_, "language": language}
        resp = self.session.post(url, headers=headers, json=body, timeout=120)
        resp.raise_for_status()
        return resp.json()
| 33 | + | |
| 34 | + | |
class RerankServiceClient:
    """HTTP client for the external rerank scoring service."""

    def __init__(self, service_url: str):
        self.service_url = service_url.rstrip("/")
        self.session = requests.Session()

    def rerank(self, query: str, docs: Sequence[str], normalize: bool = False, top_n: Optional[int] = None) -> Tuple[List[float], Dict[str, Any]]:
        """Score ``docs`` against ``query``.

        Returns:
            ``(scores, meta)`` — per-doc scores and the service's meta dict;
            both default to empty when absent from the response.
        """
        request_body: Dict[str, Any] = {
            "query": query,
            "docs": list(docs),
            "normalize": normalize,
        }
        if top_n is not None:
            request_body["top_n"] = int(top_n)
        resp = self.session.post(self.service_url, json=request_body, timeout=180)
        resp.raise_for_status()
        payload = resp.json()
        scores = list(payload.get("scores") or [])
        meta = dict(payload.get("meta") or {})
        return scores, meta
| 52 | + | |
| 53 | + | |
class DashScopeLabelClient:
    """Chat-completions client (DashScope, OpenAI-compatible API) used as the LLM relevance judge.

    Exposes three prompt flows: line-oriented batch labeling (``simple``),
    query-profile extraction, and JSON-structured batch labeling (``complex``).
    Every method returns the raw response JSON alongside the parsed result so
    callers can persist it for audit.
    """

    def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40):
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        # Suggested docs-per-request size; this class does not split batches
        # itself — presumably callers chunk by this value. TODO confirm.
        self.batch_size = int(batch_size)
        self.session = requests.Session()

    def _chat(self, prompt: str) -> Tuple[str, str]:
        """Send a single-user-message chat completion.

        Returns:
            ``(content, raw_json)`` — the stripped assistant text and the full
            response serialized via ``safe_json_dumps`` for audit storage.

        Raises:
            requests.HTTPError: on non-2xx responses.
        """
        response = self.session.post(
            f"{self.base_url}/chat/completions",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": prompt}],
                # Near-deterministic decoding for reproducible labels.
                "temperature": 0,
                "top_p": 0.1,
            },
            timeout=180,
        )
        response.raise_for_status()
        data = response.json()
        # Defensive extraction: tolerates missing choices/message/content.
        content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
        return content, safe_json_dumps(data)

    def classify_batch_simple(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Label each doc (Exact/Partial/Irrelevant) via the simple prompt.

        Parse order matters: first try one label per output line; if that
        does not yield exactly ``len(docs)`` labels, fall back to a JSON blob
        with a ``labels`` list (items may be dicts with a ``label`` key or
        bare strings). Labels not in ``VALID_LABELS`` are silently dropped,
        so a count mismatch at the end means malformed judge output.

        Returns:
            ``(labels, raw_response)`` with labels aligned to ``docs`` order.

        Raises:
            ValueError: if neither parse produces exactly one valid label
                per doc.
        """
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = classify_batch_simple_prompt(query, numbered_docs)
        content, raw_response = self._chat(prompt)
        labels: List[str] = []
        for line in str(content or "").splitlines():
            label = line.strip()
            if label in VALID_LABELS:
                labels.append(label)
        if len(labels) != len(docs):
            # Fallback: some models wrap the answer in JSON instead of lines.
            payload = extract_json_blob(content)
            if isinstance(payload, dict) and isinstance(payload.get("labels"), list):
                labels = []
                for item in payload["labels"][: len(docs)]:
                    if isinstance(item, dict):
                        label = str(item.get("label") or "").strip()
                    else:
                        label = str(item).strip()
                    if label in VALID_LABELS:
                        labels.append(label)
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected simple label output: {content!r}")
        return labels, raw_response

    def extract_query_profile(
        self,
        query: str,
        parser_hints: Dict[str, Any],
    ) -> Tuple[Dict[str, Any], str]:
        """Extract a structured query profile (complex-mode prerequisite).

        Returns:
            ``(profile, raw_response)``; the profile dict is normalized so
            the keys used downstream always exist (defaults filled in).

        Raises:
            ValueError: if the model's output contains no JSON object.
        """
        prompt = extract_query_profile_prompt(query, parser_hints)
        content, raw_response = self._chat(prompt)
        payload = extract_json_blob(content)
        if not isinstance(payload, dict):
            raise ValueError(f"unexpected query profile payload: {content!r}")
        # Guarantee the downstream-consumed keys are present.
        payload.setdefault("normalized_query_en", query)
        payload.setdefault("primary_category", "")
        payload.setdefault("allowed_categories", [])
        payload.setdefault("required_attributes", [])
        payload.setdefault("notes", [])
        return payload, raw_response

    def classify_batch_complex(
        self,
        query: str,
        query_profile: Dict[str, Any],
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Label each doc via the structured (profile-aware) prompt.

        Unlike the simple flow, only a JSON payload with a ``labels`` list of
        dicts is accepted; non-dict items and invalid labels are skipped, so
        a final count mismatch signals malformed judge output.

        Returns:
            ``(labels, raw_response)`` with labels aligned to ``docs`` order.

        Raises:
            ValueError: if the payload is not the expected shape or does not
                yield exactly one valid label per doc.
        """
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = classify_batch_complex_prompt(query, query_profile, numbered_docs)
        content, raw_response = self._chat(prompt)
        payload = extract_json_blob(content)
        if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):
            raise ValueError(f"unexpected label payload: {content!r}")
        labels_payload = payload["labels"]
        labels: List[str] = []
        for item in labels_payload[: len(docs)]:
            if not isinstance(item, dict):
                continue
            label = str(item.get("label") or "").strip()
            if label in VALID_LABELS:
                labels.append(label)
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected label output: {content!r}")
        return labels, raw_response
| ... | ... | @@ -0,0 +1,19 @@ |
| 1 | +"""Paths and shared constants for search evaluation.""" | |
| 2 | + | |
| 3 | +from pathlib import Path | |
| 4 | + | |
| 5 | +_PKG_DIR = Path(__file__).resolve().parent | |
| 6 | +_SCRIPTS_EVAL_DIR = _PKG_DIR.parent | |
| 7 | +PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] | |
| 8 | + | |
| 9 | +RELEVANCE_EXACT = "Exact" | |
| 10 | +RELEVANCE_PARTIAL = "Partial" | |
| 11 | +RELEVANCE_IRRELEVANT = "Irrelevant" | |
| 12 | +VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} | |
| 13 | + | |
| 14 | +DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" | |
| 15 | +DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" | |
| 16 | + | |
| 17 | +JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" | |
| 18 | +JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" | |
| 19 | +DEFAULT_LABELER_MODE = "simple" | ... | ... |
| ... | ... | @@ -0,0 +1,719 @@ |
| 1 | +"""Core orchestration: corpus, rerank, LLM labels, live/batch evaluation.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +import json | |
| 6 | +import time | |
| 7 | +from pathlib import Path | |
| 8 | +from typing import Any, Dict, List, Sequence, Tuple | |
| 9 | + | |
| 10 | +import requests | |
| 11 | +from elasticsearch.helpers import scan | |
| 12 | + | |
| 13 | +from api.app import get_app_config, get_es_client, get_query_parser, init_service | |
| 14 | +from indexer.mapping_generator import get_tenant_index_name | |
| 15 | + | |
| 16 | +from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient | |
| 17 | +from .constants import ( | |
| 18 | + DEFAULT_ARTIFACT_ROOT, | |
| 19 | + DEFAULT_LABELER_MODE, | |
| 20 | + JUDGE_PROMPT_VERSION_COMPLEX, | |
| 21 | + RELEVANCE_EXACT, | |
| 22 | + RELEVANCE_IRRELEVANT, | |
| 23 | + RELEVANCE_PARTIAL, | |
| 24 | + VALID_LABELS, | |
| 25 | +) | |
| 26 | +from .metrics import aggregate_metrics, compute_query_metrics, label_distribution | |
| 27 | +from .reports import render_batch_report_markdown | |
| 28 | +from .store import EvalStore, QueryBuildResult | |
| 29 | +from .utils import ( | |
| 30 | + build_display_title, | |
| 31 | + build_rerank_doc, | |
| 32 | + compact_option_values, | |
| 33 | + compact_product_payload, | |
| 34 | + ensure_dir, | |
| 35 | + normalize_text, | |
| 36 | + pick_text, | |
| 37 | + sha1_text, | |
| 38 | + utc_now_iso, | |
| 39 | + utc_timestamp, | |
| 40 | +) | |
| 41 | + | |
| 42 | + | |
| 43 | +class SearchEvaluationFramework: | |
    def __init__(
        self,
        tenant_id: str,
        artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
        search_base_url: str = "http://localhost:6002",
        labeler_mode: str = DEFAULT_LABELER_MODE,
    ):
        """Wire up the store and all service clients for one tenant.

        Args:
            tenant_id: Tenant identifier (coerced to str).
            artifact_root: Directory for SQLite store, reports, and summaries.
            search_base_url: Base URL of the live search service.
            labeler_mode: "simple" or "complex" judge flow.

        Raises:
            RuntimeError: if no DashScope API key is configured — LLM
                labeling is mandatory for annotation.
        """
        # Initialize shared app services (ES wiring) before any config reads.
        init_service(get_app_config().infrastructure.elasticsearch.host)
        self.tenant_id = str(tenant_id)
        self.artifact_root = ensure_dir(artifact_root)
        # Normalize the mode; blank input falls back to the default.
        self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
        self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
        self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
        app_cfg = get_app_config()
        # Rerank endpoint is read from the default "http" provider instance.
        rerank_service_url = str(
            app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
        )
        self.rerank_client = RerankServiceClient(rerank_service_url)
        # The judge reuses the translation service's LLM capability config.
        llm_cfg = app_cfg.services.translation.capabilities["llm"]
        api_key = app_cfg.infrastructure.secrets.dashscope_api_key
        if not api_key:
            raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
        self.label_client = DashScopeLabelClient(
            model=str(llm_cfg["model"]),
            base_url=str(llm_cfg["base_url"]),
            api_key=str(api_key),
        )
        # Built lazily on first use via _get_query_parser().
        self.query_parser = None
| 72 | + | |
| 73 | + def _get_query_parser(self): | |
| 74 | + if self.query_parser is None: | |
| 75 | + self.query_parser = get_query_parser() | |
| 76 | + return self.query_parser | |
| 77 | + | |
| 78 | + def build_query_parser_hints(self, query: str) -> Dict[str, Any]: | |
| 79 | + parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"]) | |
| 80 | + payload = parsed.to_dict() | |
| 81 | + payload["text_for_rerank"] = parsed.text_for_rerank() | |
| 82 | + return payload | |
| 83 | + | |
    def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
        """Return the LLM-extracted query profile for *query*, using the store as a cache.

        Only valid in "complex" labeler mode. Unless *force_refresh* is set, a
        cached profile for the current complex-judge prompt version is returned
        when available; otherwise a fresh profile is extracted via the label
        client, enriched with parser hints, persisted, and returned.

        Raises:
            RuntimeError: If the framework is not in complex labeler mode.
        """
        if self.labeler_mode != "complex":
            raise RuntimeError("query profiles are only used in complex labeler mode")
        if not force_refresh:
            cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)
            if cached is not None:
                return cached
        parser_hints = self.build_query_parser_hints(query)
        profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)
        # Keep the hints alongside the profile so downstream consumers can see
        # exactly what the extractor was shown.
        profile["parser_hints"] = parser_hints
        self.store.upsert_query_profile(
            self.tenant_id,
            query,
            JUDGE_PROMPT_VERSION_COMPLEX,
            self.label_client.model,
            profile,
            raw_response,
        )
        return profile
| 103 | + | |
| 104 | + @staticmethod | |
| 105 | + def _doc_evidence_text(doc: Dict[str, Any]) -> str: | |
| 106 | + pieces: List[str] = [ | |
| 107 | + build_display_title(doc), | |
| 108 | + pick_text(doc.get("vendor"), "en"), | |
| 109 | + pick_text(doc.get("category_path"), "en"), | |
| 110 | + pick_text(doc.get("category_name"), "en"), | |
| 111 | + ] | |
| 112 | + for sku in doc.get("skus") or []: | |
| 113 | + pieces.extend( | |
| 114 | + [ | |
| 115 | + str(sku.get("option1_value") or ""), | |
| 116 | + str(sku.get("option2_value") or ""), | |
| 117 | + str(sku.get("option3_value") or ""), | |
| 118 | + ] | |
| 119 | + ) | |
| 120 | + for tag in doc.get("tags") or []: | |
| 121 | + pieces.append(str(tag)) | |
| 122 | + return normalize_text(" | ".join(piece for piece in pieces if piece)) | |
| 123 | + | |
    def _apply_rule_based_label_guardrails(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> str:
        """Deterministically correct an LLM relevance *label* using the query profile.

        Downgrade rules, applied in order:
          * Exact without primary-category evidence -> Partial (if an allowed
            category matches) or Irrelevant (if none do).
          * Any conflicting attribute term found in the doc evidence -> Irrelevant.
          * Exact missing a required attribute term -> Partial.
          * Partial with no category evidence at all -> Irrelevant.

        Labels outside VALID_LABELS are returned unchanged.
        """
        if label not in VALID_LABELS:
            return label
        evidence = self._doc_evidence_text(doc)
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]

        # An absent constraint counts as a match (it can never trigger a downgrade).
        primary_category_match = True
        if category:
            primary_category_match = category in evidence
        allowed_category_match = True
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                label = RELEVANCE_PARTIAL
            else:
                return RELEVANCE_IRRELEVANT

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            # Only attribute kinds we can reliably detect in evidence text.
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            if attr_name == "fit":
                # Fit is adversarial: an oversized query conflicts with slim-cut
                # evidence and vice versa, even if the profile did not say so.
                if any(term in {"oversized", "oversize"} for term in required_terms):
                    conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
                if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):
                    conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])
            # No required terms means the attribute is treated as satisfied.
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict:
                return RELEVANCE_IRRELEVANT
            if label == RELEVANCE_EXACT and not has_required:
                label = RELEVANCE_PARTIAL

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            return RELEVANCE_IRRELEVANT

        return label
| 174 | + | |
| 175 | + @staticmethod | |
| 176 | + def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]: | |
| 177 | + option_values = list(item.get("option_values") or []) | |
| 178 | + while len(option_values) < 3: | |
| 179 | + option_values.append("") | |
| 180 | + product = dict(item.get("product") or {}) | |
| 181 | + return { | |
| 182 | + "spu_id": item.get("spu_id"), | |
| 183 | + "title": product.get("title") or item.get("title"), | |
| 184 | + "vendor": product.get("vendor"), | |
| 185 | + "category_path": product.get("category"), | |
| 186 | + "category_name": product.get("category"), | |
| 187 | + "image_url": item.get("image_url") or product.get("image_url"), | |
| 188 | + "tags": product.get("tags") or [], | |
| 189 | + "skus": [ | |
| 190 | + { | |
| 191 | + "option1_value": option_values[0], | |
| 192 | + "option2_value": option_values[1], | |
| 193 | + "option3_value": option_values[2], | |
| 194 | + } | |
| 195 | + ], | |
| 196 | + } | |
| 197 | + | |
    def _collect_label_issues(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> List[str]:
        """Describe (but do not fix) rule violations for *label* on *doc*.

        Produces human-readable audit strings for category and attribute
        mismatches; `_apply_rule_based_label_guardrails` is the companion
        method that actually rewrites labels.

        NOTE(review): this checker diverges from the guardrail method — it
        audits a smaller attribute set (no "waist_style"/"rise", no fit-term
        expansion) and, when no allowed_categories are given, falls back to the
        primary-category match instead of True. Confirm whether the two should
        be unified.
        """
        evidence = self._doc_evidence_text(doc)
        issues: List[str] = []
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [
            normalize_text(item)
            for item in query_profile.get("allowed_categories") or []
            if str(item).strip()
        ]

        # Empty primary_category means "no constraint"; otherwise substring match.
        primary_category_match = True if not category else category in evidence
        allowed_category_match = False if allowed_categories else primary_category_match
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                issues.append("Exact missing primary category evidence")
            else:
                issues.append("Exact has category mismatch")

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            issues.append("Partial has category mismatch")

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            # Only attribute kinds we can reliably detect in evidence text.
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            # No required terms means the attribute is treated as satisfied.
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict and label != RELEVANCE_IRRELEVANT:
                issues.append(f"{label} conflicts on {attr_name}")
            if label == RELEVANCE_EXACT and not has_required:
                issues.append(f"Exact missing {attr_name}")
        return issues
| 243 | + | |
| 244 | + def audit_live_query( | |
| 245 | + self, | |
| 246 | + query: str, | |
| 247 | + *, | |
| 248 | + top_k: int = 100, | |
| 249 | + language: str = "en", | |
| 250 | + auto_annotate: bool = False, | |
| 251 | + ) -> Dict[str, Any]: | |
| 252 | + live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) | |
| 253 | + if self.labeler_mode != "complex": | |
| 254 | + labels = [ | |
| 255 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 256 | + for item in live["results"] | |
| 257 | + ] | |
| 258 | + return { | |
| 259 | + "query": query, | |
| 260 | + "tenant_id": self.tenant_id, | |
| 261 | + "top_k": top_k, | |
| 262 | + "metrics": live["metrics"], | |
| 263 | + "distribution": label_distribution(labels), | |
| 264 | + "query_profile": None, | |
| 265 | + "suspicious": [], | |
| 266 | + "results": live["results"], | |
| 267 | + } | |
| 268 | + query_profile = self.get_query_profile(query, force_refresh=False) | |
| 269 | + suspicious: List[Dict[str, Any]] = [] | |
| 270 | + | |
| 271 | + for item in live["results"]: | |
| 272 | + doc = self._result_item_to_doc(item) | |
| 273 | + issues = self._collect_label_issues(item["label"] or "", query_profile, doc) | |
| 274 | + suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc) | |
| 275 | + if suggested_label != (item["label"] or ""): | |
| 276 | + issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"] | |
| 277 | + if issues: | |
| 278 | + suspicious.append( | |
| 279 | + { | |
| 280 | + "rank": item["rank"], | |
| 281 | + "spu_id": item["spu_id"], | |
| 282 | + "title": item["title"], | |
| 283 | + "label": item["label"], | |
| 284 | + "suggested_label": suggested_label, | |
| 285 | + "issues": issues, | |
| 286 | + } | |
| 287 | + ) | |
| 288 | + | |
| 289 | + labels = [ | |
| 290 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 291 | + for item in live["results"] | |
| 292 | + ] | |
| 293 | + return { | |
| 294 | + "query": query, | |
| 295 | + "tenant_id": self.tenant_id, | |
| 296 | + "top_k": top_k, | |
| 297 | + "metrics": live["metrics"], | |
| 298 | + "distribution": label_distribution(labels), | |
| 299 | + "query_profile": query_profile, | |
| 300 | + "suspicious": suspicious, | |
| 301 | + "results": live["results"], | |
| 302 | + } | |
| 303 | + | |
| 304 | + def queries_from_file(self, path: Path) -> List[str]: | |
| 305 | + return [ | |
| 306 | + line.strip() | |
| 307 | + for line in path.read_text(encoding="utf-8").splitlines() | |
| 308 | + if line.strip() and not line.strip().startswith("#") | |
| 309 | + ] | |
| 310 | + | |
    def corpus_docs(self, refresh: bool = False) -> List[Dict[str, Any]]:
        """Return the tenant's full product corpus, caching it in the store.

        When *refresh* is False and the store already holds a corpus for this
        tenant, the cached docs are returned. Otherwise every document is
        scrolled out of the tenant's ES index (selected fields only) and the
        cache is rewritten.
        """
        if not refresh and self.store.has_corpus(self.tenant_id):
            return self.store.get_corpus_docs(self.tenant_id)

        es_client = get_es_client().client
        index_name = get_tenant_index_name(self.tenant_id)
        docs: List[Dict[str, Any]] = []
        # scan() scrolls the whole index; _source trims each hit to the fields
        # the evaluation pipeline actually needs.
        for hit in scan(
            client=es_client,
            index=index_name,
            query={
                "_source": [
                    "spu_id",
                    "title",
                    "vendor",
                    "category_path",
                    "category_name",
                    "image_url",
                    "skus",
                    "tags",
                ],
                "query": {"match_all": {}},
            },
            size=500,
            preserve_order=False,
            clear_scroll=True,
        ):
            source = dict(hit.get("_source") or {})
            # Guarantee a string spu_id, falling back to the ES document _id.
            source["spu_id"] = str(source.get("spu_id") or hit.get("_id") or "")
            docs.append(source)
        self.store.upsert_corpus_docs(self.tenant_id, docs)
        return docs
| 343 | + | |
| 344 | + def full_corpus_rerank( | |
| 345 | + self, | |
| 346 | + query: str, | |
| 347 | + docs: Sequence[Dict[str, Any]], | |
| 348 | + batch_size: int = 24, | |
| 349 | + force_refresh: bool = False, | |
| 350 | + ) -> List[Dict[str, Any]]: | |
| 351 | + cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query) | |
| 352 | + pending: List[Dict[str, Any]] = [doc for doc in docs if str(doc.get("spu_id")) not in cached] | |
| 353 | + if pending: | |
| 354 | + new_scores: Dict[str, float] = {} | |
| 355 | + for start in range(0, len(pending), batch_size): | |
| 356 | + batch = pending[start : start + batch_size] | |
| 357 | + scores = self._rerank_batch_with_retry(query=query, docs=batch) | |
| 358 | + if len(scores) != len(batch): | |
| 359 | + raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs") | |
| 360 | + for doc, score in zip(batch, scores): | |
| 361 | + new_scores[str(doc.get("spu_id"))] = float(score) | |
| 362 | + self.store.upsert_rerank_scores( | |
| 363 | + self.tenant_id, | |
| 364 | + query, | |
| 365 | + new_scores, | |
| 366 | + model_name="qwen3_vllm_score", | |
| 367 | + ) | |
| 368 | + cached.update(new_scores) | |
| 369 | + | |
| 370 | + ranked = [] | |
| 371 | + for doc in docs: | |
| 372 | + spu_id = str(doc.get("spu_id")) | |
| 373 | + ranked.append({"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc}) | |
| 374 | + ranked.sort(key=lambda item: item["score"], reverse=True) | |
| 375 | + return ranked | |
| 376 | + | |
| 377 | + def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]: | |
| 378 | + if not docs: | |
| 379 | + return [] | |
| 380 | + doc_texts = [build_rerank_doc(doc) for doc in docs] | |
| 381 | + try: | |
| 382 | + scores, _meta = self.rerank_client.rerank(query=query, docs=doc_texts, normalize=False) | |
| 383 | + return scores | |
| 384 | + except Exception: | |
| 385 | + if len(docs) == 1: | |
| 386 | + return [-1.0] | |
| 387 | + if len(docs) <= 6: | |
| 388 | + scores: List[float] = [] | |
| 389 | + for doc in docs: | |
| 390 | + scores.extend(self._rerank_batch_with_retry(query, [doc])) | |
| 391 | + return scores | |
| 392 | + mid = len(docs) // 2 | |
| 393 | + left = self._rerank_batch_with_retry(query, docs[:mid]) | |
| 394 | + right = self._rerank_batch_with_retry(query, docs[mid:]) | |
| 395 | + return left + right | |
| 396 | + | |
    def annotate_missing_labels(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        force_refresh: bool = False,
    ) -> Dict[str, str]:
        """Ensure every doc in *docs* has a cached relevance label for *query*.

        Existing labels are reused unless *force_refresh* is set. Missing docs
        are judged in label-client-sized batches; each successfully judged
        sub-batch is persisted immediately so progress survives interruption.

        Returns:
            Mapping of spu_id -> label covering all previously cached labels
            plus the newly judged ones.
        """
        labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
        missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
        if not missing_docs:
            return labels

        for start in range(0, len(missing_docs), self.label_client.batch_size):
            batch = missing_docs[start : start + self.label_client.batch_size]
            # _classify_with_retry may split the batch; it yields one
            # (labels, raw_response, docs) triple per successful sub-batch.
            batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh)
            for sub_labels, raw_response, sub_batch in batch_pairs:
                to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
                self.store.upsert_labels(
                    self.tenant_id,
                    query,
                    to_store,
                    judge_model=self.label_client.model,
                    raw_response=raw_response,
                )
                labels.update(to_store)
            # Light pacing between batches to avoid hammering the LLM endpoint.
            time.sleep(0.1)
        return labels
| 423 | + | |
| 424 | + def _classify_with_retry( | |
| 425 | + self, | |
| 426 | + query: str, | |
| 427 | + docs: Sequence[Dict[str, Any]], | |
| 428 | + *, | |
| 429 | + force_refresh: bool = False, | |
| 430 | + ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]: | |
| 431 | + if not docs: | |
| 432 | + return [] | |
| 433 | + try: | |
| 434 | + if self.labeler_mode == "complex": | |
| 435 | + query_profile = self.get_query_profile(query, force_refresh=force_refresh) | |
| 436 | + labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs) | |
| 437 | + labels = [ | |
| 438 | + self._apply_rule_based_label_guardrails(label, query_profile, doc) | |
| 439 | + for doc, label in zip(docs, labels) | |
| 440 | + ] | |
| 441 | + else: | |
| 442 | + labels, raw_response = self.label_client.classify_batch_simple(query, docs) | |
| 443 | + return [(labels, raw_response, docs)] | |
| 444 | + except Exception: | |
| 445 | + if len(docs) == 1: | |
| 446 | + raise | |
| 447 | + mid = len(docs) // 2 | |
| 448 | + return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh) | |
| 449 | + | |
    def build_query_annotation_set(
        self,
        query: str,
        *,
        search_depth: int = 1000,
        rerank_depth: int = 10000,
        annotate_search_top_k: int = 120,
        annotate_rerank_top_k: int = 200,
        language: str = "en",
        force_refresh_rerank: bool = False,
        force_refresh_labels: bool = False,
    ) -> QueryBuildResult:
        """Build a pooled annotation set for *query* and persist it as a JSON artifact.

        Pipeline: run a deep live search, rerank the full cached corpus, pool
        the heads of both result lists, LLM-label every pooled doc, then write
        a JSON artifact (and a store record) containing the labeled search
        results, the labeled rerank head, and top-100 metrics.

        Returns:
            QueryBuildResult summarizing the run and the artifact path.
        """
        search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language)
        search_results = list(search_payload.get("results") or [])
        corpus = self.corpus_docs(refresh=False)
        full_rerank = self.full_corpus_rerank(
            query=query,
            docs=corpus,
            force_refresh=force_refresh_rerank,
        )
        rerank_depth_effective = min(rerank_depth, len(full_rerank))

        # Annotation pool = union of the search head and the rerank head,
        # deduplicated by spu_id (rerank docs overwrite search docs).
        pool_docs: Dict[str, Dict[str, Any]] = {}
        for doc in search_results[:annotate_search_top_k]:
            pool_docs[str(doc.get("spu_id"))] = doc
        for item in full_rerank[:annotate_rerank_top_k]:
            pool_docs[str(item["spu_id"])] = item["doc"]

        labels = self.annotate_missing_labels(
            query=query,
            docs=list(pool_docs.values()),
            force_refresh=force_refresh_labels,
        )

        search_labeled_results: List[Dict[str, Any]] = []
        for rank, doc in enumerate(search_results, start=1):
            spu_id = str(doc.get("spu_id"))
            label = labels.get(spu_id)  # None when outside the annotation pool
            search_labeled_results.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "rerank_score": None,
                    "label": label,
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )

        rerank_top_results: List[Dict[str, Any]] = []
        for rank, item in enumerate(full_rerank[:rerank_depth_effective], start=1):
            doc = item["doc"]
            spu_id = str(item["spu_id"])
            rerank_top_results.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "rerank_score": round(float(item["score"]), 8),
                    "label": labels.get(spu_id),
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )

        # Metrics treat unlabeled search hits as Irrelevant.
        top100_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in search_labeled_results[:100]
        ]
        metrics = compute_query_metrics(top100_labels)
        output_dir = ensure_dir(self.artifact_root / "query_builds")
        run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
        output_json_path = output_dir / f"{run_id}.json"
        payload = {
            "run_id": run_id,
            "created_at": utc_now_iso(),
            "tenant_id": self.tenant_id,
            "query": query,
            # NOTE(review): config metadata is fetched from a hardcoded
            # localhost URL rather than the configurable search_base_url —
            # confirm this is intentional for non-default deployments.
            "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),
            "search_total": int(search_payload.get("total") or 0),
            "search_depth_requested": search_depth,
            "search_depth_effective": len(search_results),
            "rerank_depth_requested": rerank_depth,
            "rerank_depth_effective": rerank_depth_effective,
            "corpus_size": len(corpus),
            "annotation_pool": {
                "annotate_search_top_k": annotate_search_top_k,
                "annotate_rerank_top_k": annotate_rerank_top_k,
                "pool_size": len(pool_docs),
            },
            "labeler_mode": self.labeler_mode,
            "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,
            "metrics_top100": metrics,
            "search_results": search_labeled_results,
            "full_rerank_top": rerank_top_results,
        }
        output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
        return QueryBuildResult(
            query=query,
            tenant_id=self.tenant_id,
            search_total=int(search_payload.get("total") or 0),
            search_depth=len(search_results),
            rerank_corpus_size=len(corpus),
            annotated_count=len(pool_docs),
            output_json_path=output_json_path,
        )
| 560 | + | |
    def evaluate_live_query(
        self,
        query: str,
        top_k: int = 100,
        auto_annotate: bool = False,
        language: str = "en",
        force_refresh_labels: bool = False,
    ) -> Dict[str, Any]:
        """Run a live search for *query* and score its top-*top_k* results
        against the cached annotation set.

        With *auto_annotate* set, missing labels for recalled results are
        judged first; otherwise only cached labels are used and unlabeled hits
        count as Irrelevant for the metrics. Also reports cached Exact/Partial
        products the recall set missed, plus human-readable tips.
        """
        # Always fetch at least 100 so downstream consumers see a stable depth.
        search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)
        results = list(search_payload.get("results") or [])
        if auto_annotate:
            self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
        labels = self.store.get_labels(self.tenant_id, query)
        recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]}
        labeled = []
        unlabeled_hits = 0  # recalled results with no cached label
        for rank, doc in enumerate(results[:top_k], start=1):
            spu_id = str(doc.get("spu_id"))
            label = labels.get(spu_id)
            if label not in VALID_LABELS:
                unlabeled_hits += 1
            labeled.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "label": label,
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )
        # Unlabeled hits are scored as Irrelevant.
        metric_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in labeled
        ]
        label_stats = self.store.get_query_label_stats(self.tenant_id, query)
        rerank_scores = self.store.get_rerank_scores(self.tenant_id, query)
        # Cached relevant products that this recall set failed to surface.
        relevant_missing_ids = [
            spu_id
            for spu_id, label in labels.items()
            if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids
        ]
        missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
        missing_relevant = []
        for spu_id in relevant_missing_ids:
            doc = missing_docs_map.get(spu_id)
            if not doc:
                continue
            missing_relevant.append(
                {
                    "spu_id": spu_id,
                    "label": labels[spu_id],
                    "rerank_score": rerank_scores.get(spu_id),
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )
        # Sort misses: Exact first, then higher rerank score, then title.
        label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}
        missing_relevant.sort(
            key=lambda item: (
                label_order.get(str(item.get("label")), 9),
                -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")),
                str(item.get("title") or ""),
            )
        )
        tips: List[str] = []
        if auto_annotate:
            tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.")
        else:
            tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.")
        if label_stats["total"] == 0:
            tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.")
        if unlabeled_hits:
            tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
        if not missing_relevant:
            tips.append("No cached Exact/Partial products were missed by this recall set.")
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": compute_query_metrics(metric_labels),
            "results": labeled,
            "missing_relevant": missing_relevant,
            "label_stats": {
                **label_stats,
                "unlabeled_hits_treated_irrelevant": unlabeled_hits,
                "recalled_hits": len(labeled),
                "missing_relevant_count": len(missing_relevant),
                "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
                "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),
            },
            "tips": tips,
            "total": int(search_payload.get("total") or 0),
        }
| 658 | + | |
    def batch_evaluate(
        self,
        queries: Sequence[str],
        *,
        top_k: int = 100,
        auto_annotate: bool = True,
        language: str = "en",
        force_refresh_labels: bool = False,
    ) -> Dict[str, Any]:
        """Evaluate every query in *queries* and persist a batch report.

        Writes three artifacts under `<artifact_root>/batch_reports/`: a config
        snapshot, the JSON payload (per-query plus aggregate metrics and label
        distribution), and a Markdown report; the run is also recorded in the
        store. Returns the JSON payload.
        """
        per_query = []
        for query in queries:
            live = self.evaluate_live_query(
                query,
                top_k=top_k,
                auto_annotate=auto_annotate,
                language=language,
                force_refresh_labels=force_refresh_labels,
            )
            # Unlabeled hits count as Irrelevant, mirroring the metric inputs.
            labels = [
                item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
                for item in live["results"]
            ]
            per_query.append(
                {
                    "query": live["query"],
                    "tenant_id": live["tenant_id"],
                    "top_k": live["top_k"],
                    "metrics": live["metrics"],
                    "distribution": label_distribution(labels),
                    "total": live["total"],
                }
            )
        aggregate = aggregate_metrics([item["metrics"] for item in per_query])
        aggregate_distribution = {
            RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
            RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query),
            RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
        }
        batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
        report_dir = ensure_dir(self.artifact_root / "batch_reports")
        config_snapshot_path = report_dir / f"{batch_id}_config.json"
        # NOTE(review): config snapshot is fetched from a hardcoded localhost
        # URL rather than the configurable search_base_url — confirm intent.
        config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json()
        config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
        output_json_path = report_dir / f"{batch_id}.json"
        report_md_path = report_dir / f"{batch_id}.md"
        payload = {
            "batch_id": batch_id,
            "created_at": utc_now_iso(),
            "tenant_id": self.tenant_id,
            "queries": list(queries),
            "top_k": top_k,
            "aggregate_metrics": aggregate,
            "aggregate_distribution": aggregate_distribution,
            "per_query": per_query,
            "config_snapshot_path": str(config_snapshot_path),
        }
        output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
        self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
        return payload
| 719 | + | ... | ... |
| ... | ... | @@ -0,0 +1,58 @@ |
| 1 | +"""IR metrics for labeled result lists.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from typing import Dict, Sequence | |
| 6 | + | |
| 7 | +from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL | |
| 8 | + | |
| 9 | + | |
def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
    """Precision over the first *k* labels.

    Counts labels that appear in *relevant* among the first ``k`` entries and
    divides by the number of entries actually present, so a ranking shorter
    than ``k`` is not penalized for missing tail positions.

    Args:
        labels: Ranked relevance labels, best rank first.
        k: Cutoff; non-positive cutoffs yield 0.0.
        relevant: Labels that count as relevant.

    Returns:
        Precision in [0, 1]; 0.0 for an empty slice or ``k <= 0``.
    """
    if k <= 0:
        return 0.0
    head = list(labels[:k])
    if not head:
        return 0.0
    relevant_set = set(relevant)  # O(1) membership instead of scanning a list per label
    hits = sum(1 for label in head if label in relevant_set)
    # len(head) <= k by construction, so the original min(k, len(head)) is redundant.
    return hits / float(len(head))
| 18 | + | |
| 19 | + | |
def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
    """Average precision (AP) of a ranked label list.

    For each relevant hit at 1-based rank ``i``, adds precision@i (hits so far
    divided by ``i``), then averages over the number of hits.

    Args:
        labels: Ranked relevance labels, best rank first.
        relevant: Labels that count as relevant.

    Returns:
        AP in [0, 1]; 0.0 when no label is relevant (including empty input).
    """
    relevant_set = set(relevant)  # O(1) membership instead of scanning a list per label
    hit_count = 0
    precision_sum = 0.0
    for idx, label in enumerate(labels, start=1):
        if label in relevant_set:
            hit_count += 1
            precision_sum += hit_count / idx
    if hit_count == 0:
        return 0.0
    return precision_sum / hit_count
| 31 | + | |
| 32 | + | |
def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:
    """Standard per-query metric bundle: P@{5,10,20,50} for Exact-only and
    Exact+Partial relevance, plus MAP over both relevance sets."""
    exact_only = [RELEVANCE_EXACT]
    exact_or_partial = [RELEVANCE_EXACT, RELEVANCE_PARTIAL]
    metrics: Dict[str, float] = {}
    for cutoff in (5, 10, 20, 50):
        metrics[f"P@{cutoff}"] = round(precision_at_k(labels, cutoff, exact_only), 6)
        metrics[f"P@{cutoff}_2_3"] = round(precision_at_k(labels, cutoff, exact_or_partial), 6)
    metrics["MAP_3"] = round(average_precision(labels, exact_only), 6)
    metrics["MAP_2_3"] = round(average_precision(labels, exact_or_partial), 6)
    return metrics
| 41 | + | |
| 42 | + | |
def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]:
    """Macro-average a sequence of per-query metric dicts.

    The key set is taken from the first item (sorted); keys missing from later
    items contribute 0.0. Returns {} for an empty input. Values are rounded to
    6 decimal places.
    """
    if not metric_items:
        return {}
    averaged: Dict[str, float] = {}
    count = len(metric_items)
    for key in sorted(metric_items[0]):
        total = sum(float(item.get(key, 0.0)) for item in metric_items)
        averaged[key] = round(total / count, 6)
    return averaged
| 51 | + | |
| 52 | + | |
def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
    """Count how many labels are Exact / Partial / Irrelevant (others ignored)."""
    counts = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 0, RELEVANCE_IRRELEVANT: 0}
    for label in labels:
        if label in counts:
            counts[label] += 1
    return counts
| ... | ... | @@ -0,0 +1,89 @@ |
| 1 | +"""LLM prompt templates for relevance judging (keep wording changes here).""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +import json | |
| 6 | +from typing import Any, Dict, Sequence | |
| 7 | + | |
| 8 | + | |
def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
    """Build the single-shot relevance-judging prompt.

    The model receives the query plus one numbered line per product and must
    answer with exactly one label per product line, in order.
    """
    doc_block = "\n".join(numbered_doc_lines)
    expected_lines = len(numbered_doc_lines)
    # NOTE(review): "โ" below looks like a mojibake separator (likely an em
    # dash or arrow) — confirm the intended bytes before editing these strings.
    segments = [
        "You are an e-commerce search result relevance evaluation assistant. ",
        "Based on the user query and each product's information, output the relevance level for each product.\n\n",
        "## Relevance Level Criteria\n",
        "Exact โ Fully matches the user's search intent.\n",
        "Partial โ Primary intent satisfied (same category or similar use, basically aligns with search intent), ",
        "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n",
        "Irrelevant โ Category or use case mismatched, primary intent not satisfied.\n\n",
        "Additional judging guidance:\n",
        "- If the query clearly names a product type, product type matching has the highest priority. ",
        "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, ",
        "bra vs top, backpack vs bag are not interchangeable.\n",
        "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n",
        "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n",
        "- Do not guess missing attributes.\n",
        "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n",
        "- Be conservative with Exact.\n\n",
        f"Query: {query}\n\n",
        "Products:\n",
        f"{doc_block}\n\n",
        "## Output Format\n",
        f"Strictly output {expected_lines} lines, each line containing exactly one of Exact / Partial / Irrelevant. ",
        "They must correspond sequentially to the products above. Do not output any other information.\n",
    ]
    return "".join(segments)
| 36 | + | |
| 37 | + | |
def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str:
    """Build the prompt that extracts a structured intent profile.

    The model turns the raw query (with optional parser hints) into JSON with
    a primary category, allowed near-synonym categories, and explicitly
    requested attributes. The original query always wins over the hints.
    """
    hints_blob = json.dumps(parser_hints, ensure_ascii=False)
    segments = [
        "You are building a structured intent profile for e-commerce relevance judging.\n",
        "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n",
        "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n",
        "Return JSON with this schema:\n",
        "{\n",
        ' "normalized_query_en": string,\n',
        ' "primary_category": string,\n',
        ' "allowed_categories": [string],\n',
        ' "required_attributes": [\n',
        ' {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n',
        " ],\n",
        ' "notes": [string]\n',
        "}\n\n",
        "Guidelines:\n",
        "- Exact later will require explicit evidence for all required attributes.\n",
        "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n",
        "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n",
        "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n",
        "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n",
        "- For color, include conflicting colors only when clear from the query.\n\n",
        f"Original query: {query}\n",
        f"Parser hints JSON: {hints_blob}\n",
    ]
    return "".join(segments)
| 64 | + | |
| 65 | + | |
def classify_batch_complex_prompt(
    query: str,
    query_profile: Dict[str, Any],
    numbered_doc_lines: Sequence[str],
) -> str:
    """Build the profile-aware judging prompt (second stage).

    Unlike the simple prompt, the model judges each product against a
    structured query profile and must answer with a JSON ``labels`` array
    (index, label, short reason) instead of bare label lines.
    """
    doc_block = "\n".join(numbered_doc_lines)
    profile_blob = json.dumps(query_profile, ensure_ascii=False)
    segments = [
        "You are an e-commerce search relevance judge.\n",
        "Judge each product against the structured query profile below.\n\n",
        "Relevance rules:\n",
        "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n",
        "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n",
        "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n",
        "- Be conservative with Exact.\n",
        "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n",
        "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n",
        f"Original query: {query}\n",
        f"Structured query profile JSON: {profile_blob}\n\n",
        "Products:\n",
        f"{doc_block}\n\n",
        "Return JSON only, with schema:\n",
        '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n',
    ]
    return "".join(segments)
| ... | ... | @@ -0,0 +1,48 @@ |
| 1 | +"""Markdown and text reports for batch evaluation.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from typing import Any, Dict | |
| 6 | + | |
| 7 | +from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL | |
| 8 | + | |
| 9 | + | |
def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
    """Render a finished batch-evaluation payload as a Markdown report.

    Sections, in order: run header, macro-averaged metrics (sorted by name),
    an optional overall label distribution, then one subsection per evaluated
    query with its metrics and label counts.
    """
    out = [
        "# Search Batch Evaluation",
        "",
        f"- Batch ID: {payload['batch_id']}",
        f"- Created at: {payload['created_at']}",
        f"- Tenant ID: {payload['tenant_id']}",
        f"- Query count: {len(payload['queries'])}",
        f"- Top K: {payload['top_k']}",
        "",
        "## Aggregate Metrics",
        "",
    ]
    aggregate = payload.get("aggregate_metrics") or {}
    out.extend(f"- {name}: {score}" for name, score in sorted(aggregate.items()))
    overall = payload.get("aggregate_distribution") or {}
    if overall:
        out.append("")
        out.append("## Label Distribution")
        out.append("")
        out.append(f"- Exact: {overall.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {overall.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {overall.get(RELEVANCE_IRRELEVANT, 0)}")
    out.extend(["", "## Per Query", ""])
    for entry in payload.get("per_query") or []:
        out.append(f"### {entry['query']}")
        out.append("")
        per_metrics = entry.get("metrics") or {}
        out.extend(f"- {name}: {score}" for name, score in sorted(per_metrics.items()))
        counts = entry.get("distribution") or {}
        out.append(f"- Exact: {counts.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {counts.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {counts.get(RELEVANCE_IRRELEVANT, 0)}")
        out.append("")
    return "\n".join(out)
scripts/evaluation/eval_framework/static/eval_web.css
0 โ 100644
| ... | ... | @@ -0,0 +1,91 @@ |
/* Evaluation UI theme: warm paper background with a teal accent; the three
   label colors map to the Exact / Partial / Irrelevant relevance levels. */
:root {
  --bg: #f5f3ed;
  --panel: #fffdf8;
  --ink: #1f2a24;
  --muted: #6b756e;
  --line: #ddd4c6;
  --accent: #0f766e;
  --exact: #0f766e;
  --partial: #b7791f;
  --irrelevant: #b42318;
}
body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background:
  radial-gradient(circle at top left, #f0e6d6 0, transparent 28%),
  linear-gradient(180deg, #f9f6f0 0%, #f0ece3 100%); }
/* Two-column layout: fixed-width sidebar plus fluid main panel. */
.app { display: grid; grid-template-columns: 280px 1fr; min-height: 100vh; }
.sidebar { border-right: 1px solid var(--line); padding: 20px; background: rgba(255,255,255,0.55); backdrop-filter: blur(10px); }
.main { padding: 24px; }
h1, h2 { margin: 0 0 12px; }
.muted { color: var(--muted); }
/* Sidebar query list (entries are <button class="query-item">). */
.query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; }
.query-item {
  display: block; width: 100%; border: 0; background: transparent; text-align: left;
  padding: 10px 12px; border-radius: 10px; cursor: pointer;
  color: var(--ink); font-size: 15px; font-weight: 500;
}
.query-item:hover { background: #eef6f4; }
/* Toolbar: query input plus the two action buttons. */
.toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; }
input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; }
button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; }
button.secondary { background: #d9e6e3; color: #12433d; }
/* Metric cards grid. */
.grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; }
.metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; }
.metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; }
.metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; }
/* Result rows: badge | thumbnail | text. The badge class name equals the
   relevance label string rendered by eval_web.js. */
.results { display: grid; gap: 10px; }
.result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }
.badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }
.Exact { background: var(--exact); }
.Partial { background: var(--partial); }
.Irrelevant { background: var(--irrelevant); }
.Unknown { background: #637381; }
.thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }
.title { font-size: 16px; font-weight: 700; margin-bottom: 8px; }
.options { color: var(--muted); line-height: 1.5; font-size: 14px; }
.section { margin-bottom: 28px; }
/* Batch-run history entries in the sidebar. */
.history { font-size: 13px; line-height: 1.5; }
.history-list { max-height: 42vh; overflow: auto; display: flex; flex-direction: column; gap: 8px; margin-top: 8px; }
.history-item {
  display: block; width: 100%; border: 1px solid var(--line); background: var(--panel);
  text-align: left; padding: 10px 12px; border-radius: 12px; cursor: pointer;
  color: var(--ink); font-size: 13px; transition: background 0.15s, border-color 0.15s, box-shadow 0.15s;
}
.history-item:hover { background: #eef6f4; border-color: #b8d4cd; }
.history-item:focus-visible { outline: 2px solid var(--accent); outline-offset: 2px; }
.history-item .hid { font-weight: 700; font-size: 12px; word-break: break-all; color: #12433d; }
.history-item .hmeta { color: var(--muted); font-size: 11px; margin-top: 4px; }
.history-item .hstats { margin-top: 6px; font-size: 12px; color: var(--ink); line-height: 1.45; }
.history-item .hstats span { color: var(--muted); }
/* Batch report modal: hidden by default, shown when .is-open is present. */
.report-modal-root {
  position: fixed; inset: 0; z-index: 200; display: none; align-items: center; justify-content: center;
  padding: 16px; box-sizing: border-box;
}
.report-modal-root.is-open { display: flex; }
.report-modal-backdrop { position: absolute; inset: 0; background: rgba(31, 42, 36, 0.45); backdrop-filter: blur(4px); }
.report-modal-dialog {
  position: relative; z-index: 1; width: min(920px, 100%); max-height: min(92vh, 900px); display: flex; flex-direction: column;
  background: var(--panel); border: 1px solid var(--line); border-radius: 18px;
  box-shadow: 0 24px 48px rgba(31, 42, 36, 0.18);
}
.report-modal-head {
  flex: 0 0 auto; display: flex; align-items: flex-start; justify-content: space-between; gap: 12px;
  padding: 16px 18px; border-bottom: 1px solid var(--line);
}
.report-modal-head h3 { margin: 0; font-size: 15px; font-weight: 700; word-break: break-all; }
.report-modal-head .head-actions { display: flex; gap: 8px; flex-shrink: 0; }
.report-modal-head button { padding: 8px 12px; font-size: 13px; border-radius: 10px; }
.report-modal-meta { flex: 0 0 auto; padding: 10px 18px; font-size: 12px; border-bottom: 1px solid var(--line); background: rgba(255,253,248,0.9); }
.report-modal-body {
  flex: 1 1 auto; overflow: auto; padding: 18px 22px 22px;
  font-size: 14px; line-height: 1.55;
}
/* Markdown styling for the rendered batch report inside the modal. */
.batch-report-md h1 { font-size: 1.35rem; margin: 0 0 0.75rem; color: #12433d; }
.batch-report-md h2 { font-size: 1.05rem; margin: 1.35rem 0 0.6rem; padding-bottom: 0.35rem; border-bottom: 1px solid var(--line); color: #1a5249; }
.batch-report-md h2:first-of-type { margin-top: 0; }
.batch-report-md h3 { font-size: 0.95rem; margin: 1rem 0 0.4rem; color: var(--ink); font-weight: 700; }
.batch-report-md ul { margin: 0.35rem 0 0.5rem; padding-left: 1.25rem; }
.batch-report-md li { margin: 0.2rem 0; }
.batch-report-md code { font-size: 0.88em; background: #e8e4d8; padding: 0.12em 0.35em; border-radius: 4px; }
.report-modal-body.report-modal-loading, .report-modal-body.report-modal-error { color: var(--muted); font-style: italic; }
/* Notes panel under the results. */
.tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; }
.tip { margin-bottom: 6px; color: var(--muted); }
scripts/evaluation/eval_framework/static/eval_web.js
0 โ 100644
| ... | ... | @@ -0,0 +1,181 @@ |
// Small fetch wrapper: resolve to parsed JSON, or throw an Error whose
// message is the raw response body text.
async function fetchJSON(url, options) {
  const res = await fetch(url, options);
  if (!res.ok) {
    throw new Error(await res.text());
  }
  return res.json();
}
// Rebuild the #metrics card grid from a {metricName: value} object.
function renderMetrics(metrics) {
  const root = document.getElementById('metrics');
  root.innerHTML = '';
  for (const [key, value] of Object.entries(metrics || {})) {
    const card = document.createElement('div');
    card.className = 'metric';
    card.innerHTML = `<div class="label">${key}</div><div class="value">${value}</div>`;
    root.appendChild(card);
  }
}
// Render labeled result cards into #rootId. With showRank=true the card shows
// the result's rank; otherwise it shows the rerank score (or "not recalled"),
// which is how the "Missed Exact / Partial" panel uses it.
// NOTE(review): title/option strings are interpolated into innerHTML
// unescaped — assumes backend-provided values are safe; confirm.
function renderResults(results, rootId='results', showRank=true) {
  const mount = document.getElementById(rootId);
  mount.innerHTML = '';
  (results || []).forEach(item => {
    const label = item.label || 'Unknown';  // label doubles as the badge CSS class
    const box = document.createElement('div');
    box.className = 'result';
    box.innerHTML = `
      <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
      <img class="thumb" src="${item.image_url || ''}" alt="" />
      <div>
        <div class="title">${item.title || ''}</div>
        <div class="options">
          <div>${(item.option_values || [])[0] || ''}</div>
          <div>${(item.option_values || [])[1] || ''}</div>
          <div>${(item.option_values || [])[2] || ''}</div>
        </div>
      </div>`;
    mount.appendChild(box);
  });
  if (!(results || []).length) {
    mount.innerHTML = '<div class="muted">None.</div>';
  }
}
// Show server-provided tips in #tips, prefixed with a one-line summary of the
// cached-label statistics for the current query.
function renderTips(data) {
  const root = document.getElementById('tips');
  const stats = data.label_stats || {};
  const summary = `Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`;
  const tips = [summary, ...(data.tips || [])];
  root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');
}
// Populate the sidebar query list from /api/queries; clicking an entry fills
// the input box and immediately evaluates that query.
async function loadQueries() {
  const data = await fetchJSON('/api/queries');
  const root = document.getElementById('queryList');
  root.innerHTML = '';
  for (const query of data.queries) {
    const btn = document.createElement('button');
    btn.className = 'query-item';
    btn.textContent = query;
    btn.onclick = () => {
      document.getElementById('queryInput').value = query;
      runSingle();
    };
    root.appendChild(btn);
  }
}
// Format metric m[key] with a fixed number of decimals; return null when the
// container or value is absent, or the value is not numeric.
function fmtMetric(m, key, digits) {
  if (!m) return null;
  const raw = m[key];
  if (raw == null) return null;
  const num = Number(raw);
  if (Number.isNaN(num)) return null;
  return num.toFixed(digits);
}
// Build the one-line HTML stats summary (query count + headline metrics) for
// a history entry; returns '' when nothing is available.
function historySummaryHtml(meta) {
  const metrics = meta && meta.aggregate_metrics;
  const queryCount =
    (meta && meta.queries && meta.queries.length) ||
    (meta && meta.per_query && meta.per_query.length) ||
    null;
  const parts = [];
  if (queryCount != null) parts.push(`<span>Queries</span> ${queryCount}`);
  const p10 = fmtMetric(metrics, 'P@10', 3);
  if (p10) parts.push(`<span>P@10</span> ${p10}`);
  const p523 = fmtMetric(metrics, 'P@5_2_3', 3);
  if (p523) parts.push(`<span>P@5_2_3</span> ${p523}`);
  const map3 = fmtMetric(metrics, 'MAP_3', 3);
  if (map3) parts.push(`<span>MAP_3</span> ${map3}`);
  return parts.length ? `<div class="hstats">${parts.join(' ยท ')}</div>` : '';
}
// Load batch-run history into the sidebar. Each entry is a button that opens
// the corresponding batch markdown report in the modal.
// NOTE(review): "ยท" in the template below looks like a mojibake middle dot —
// confirm the intended character before touching this string.
async function loadHistory() {
  const data = await fetchJSON('/api/history');
  const root = document.getElementById('history');
  root.classList.remove('muted');
  const items = data.history || [];
  if (!items.length) {
    root.innerHTML = '<span class="muted">No history yet.</span>';
    return;
  }
  root.innerHTML = `<div class="history-list"></div>`;
  const list = root.querySelector('.history-list');
  items.forEach(item => {
    const btn = document.createElement('button');
    btn.type = 'button';
    btn.className = 'history-item';
    btn.setAttribute('aria-label', `Open report ${item.batch_id}`);
    const sum = historySummaryHtml(item.metadata);
    btn.innerHTML = `<div class="hid">${item.batch_id}</div>
      <div class="hmeta">${item.created_at} ยท tenant ${item.tenant_id}</div>${sum}`;
    btn.onclick = () => openBatchReport(item.batch_id);
    list.appendChild(btn);
  });
}
// Path of the most recently opened report (used by the "Copy path" button).
let _lastReportPath = '';
// Hide the report modal and clear its body and meta line.
function closeReportModal() {
  const modal = document.getElementById('reportModal');
  modal.classList.remove('is-open');
  modal.setAttribute('aria-hidden', 'true');
  document.getElementById('reportModalBody').innerHTML = '';
  document.getElementById('reportModalMeta').textContent = '';
}
// Fetch a batch run's markdown report and show it in the modal. The markdown
// is rendered with marked and sanitized with DOMPurify before being injected
// as HTML; on failure the error text is shown in the modal body instead.
// NOTE(review): 'Loading reportโฆ' ends in what looks like a mojibake
// ellipsis — confirm intended bytes before editing the string.
async function openBatchReport(batchId) {
  const el = document.getElementById('reportModal');
  const body = document.getElementById('reportModalBody');
  const metaEl = document.getElementById('reportModalMeta');
  const titleEl = document.getElementById('reportModalTitle');
  el.classList.add('is-open');
  el.setAttribute('aria-hidden', 'false');
  titleEl.textContent = batchId;
  metaEl.textContent = '';
  body.className = 'report-modal-body batch-report-md report-modal-loading';
  body.textContent = 'Loading reportโฆ';
  try {
    const rep = await fetchJSON('/api/history/' + encodeURIComponent(batchId) + '/report');
    _lastReportPath = rep.report_markdown_path || '';
    metaEl.textContent = rep.report_markdown_path || '';
    const raw = marked.parse(rep.markdown || '', { gfm: true });
    const safe = DOMPurify.sanitize(raw, { USE_PROFILES: { html: true } });
    body.className = 'report-modal-body batch-report-md';
    body.innerHTML = safe;
  } catch (e) {
    body.className = 'report-modal-body report-modal-error';
    body.textContent = (e && e.message) ? e.message : String(e);
  }
}
// Close the modal when the backdrop (any element marked data-close-report)
// is clicked, or when Escape is pressed anywhere on the page.
document.getElementById('reportModal').addEventListener('click', (ev) => {
  if (ev.target && ev.target.getAttribute('data-close-report') === '1') closeReportModal();
});
document.addEventListener('keydown', (ev) => {
  if (ev.key === 'Escape') closeReportModal();
});
// Copy the last opened report's filesystem path to the clipboard
// (best effort — clipboard failures are silently ignored).
document.getElementById('reportCopyPath').addEventListener('click', async () => {
  if (!_lastReportPath) return;
  try {
    await navigator.clipboard.writeText(_lastReportPath);
  } catch (_) {}
});
// Evaluate the query currently in the input box (top_k=100, cached labels
// only) and render metrics, ranked results, misses, and tips.
async function runSingle() {
  const query = document.getElementById('queryInput').value.trim();
  if (!query) return;
  const status = document.getElementById('status');
  status.textContent = `Evaluating "${query}"...`;
  const data = await fetchJSON('/api/search-eval', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({query, top_k: 100, auto_annotate: false})
  });
  status.textContent = `Done. total=${data.total}`;
  renderMetrics(data.metrics);
  renderResults(data.results, 'results', true);
  renderResults(data.missing_relevant, 'missingRelevant', false);
  renderTips(data);
  loadHistory();
}
// Run a batch evaluation over all queries, then show the aggregate metrics
// and clear the per-query result panels.
async function runBatch() {
  const status = document.getElementById('status');
  status.textContent = 'Running batch evaluation...';
  const data = await fetchJSON('/api/batch-eval', {
    method: 'POST',
    headers: {'Content-Type': 'application/json'},
    body: JSON.stringify({top_k: 100, auto_annotate: false})
  });
  status.textContent = `Batch done. report=${data.batch_id}`;
  renderMetrics(data.aggregate_metrics);
  renderResults([], 'results', true);
  renderResults([], 'missingRelevant', false);
  document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>';
  loadHistory();
}
// Initial page load: populate the query list and the run history.
loadQueries();
loadHistory();
| 181 | + | ... | ... |
scripts/evaluation/eval_framework/static/index.html
0 โ 100644
| ... | ... | @@ -0,0 +1,70 @@ |
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Search Evaluation</title>
  <link rel="stylesheet" href="/static/eval_web.css" />

</head>
<body>
  <div class="app">
    <!-- Left column: saved query list plus batch-run history. -->
    <aside class="sidebar">
      <h2>Queries</h2>
      <p class="muted">Loaded from <code>scripts/evaluation/queries/queries.txt</code></p>
      <div id="queryList" class="query-list"></div>
      <div class="section">
        <h2>History</h2>
        <p class="muted" style="font-size:12px;margin:0 0 4px">Click a run to open the batch markdown report.</p>
        <div id="history" class="history muted">Loading...</div>
      </div>
    </aside>
    <!-- Main column: evaluation controls, metrics, result panels, notes. -->
    <main class="main">
      <h1>Search Evaluation</h1>
      <p class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>
      <div class="toolbar">
        <input id="queryInput" type="text" placeholder="Search query" />
        <button onclick="runSingle()">Evaluate Query</button>
        <button class="secondary" onclick="runBatch()">Batch Evaluation</button>
      </div>
      <div id="status" class="muted section"></div>
      <section class="section">
        <h2>Metrics</h2>
        <div id="metrics" class="grid"></div>
      </section>
      <section class="section">
        <h2>Top Results</h2>
        <div id="results" class="results"></div>
      </section>
      <section class="section">
        <h2>Missed Exact / Partial</h2>
        <div id="missingRelevant" class="results"></div>
      </section>
      <section class="section">
        <h2>Notes</h2>
        <div id="tips" class="tips muted"></div>
      </section>
    </main>
  </div>
  <!-- Modal shell for rendered batch markdown reports (opened from History). -->
  <div id="reportModal" class="report-modal-root" aria-hidden="true">
    <div class="report-modal-backdrop" data-close-report="1"></div>
    <div class="report-modal-dialog" role="dialog" aria-modal="true" aria-labelledby="reportModalTitle">
      <div class="report-modal-head">
        <h3 id="reportModalTitle">Batch report</h3>
        <div class="head-actions">
          <button type="button" class="secondary" id="reportCopyPath">Copy path</button>
          <button type="button" onclick="closeReportModal()">Close</button>
        </div>
      </div>
      <div id="reportModalMeta" class="report-modal-meta muted"></div>
      <div id="reportModalBody" class="report-modal-body batch-report-md"></div>
    </div>
  </div>


  <!-- marked renders the report markdown; DOMPurify sanitizes it before
       eval_web.js injects the result as HTML. -->
  <script src="https://cdn.jsdelivr.net/npm/marked@12.0.2/marked.min.js"></script>
  <script src="https://cdn.jsdelivr.net/npm/dompurify@3.1.6/dist/purify.min.js"></script>
  <script src="/static/eval_web.js"></script>
</body>
</html>
| 0 | 71 | \ No newline at end of file | ... | ... |
| ... | ... | @@ -0,0 +1,426 @@ |
| 1 | +"""SQLite persistence for evaluation corpus, labels, rerank scores, and run metadata.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +import json | |
| 6 | +import sqlite3 | |
| 7 | +from dataclasses import dataclass | |
| 8 | +from pathlib import Path | |
| 9 | +from typing import Any, Dict, List, Optional, Sequence | |
| 10 | + | |
| 11 | +from .constants import VALID_LABELS | |
| 12 | +from .utils import ensure_dir, safe_json_dumps, utc_now_iso | |
| 13 | + | |
| 14 | + | |
@dataclass
class QueryBuildResult:
    """Summary returned after building the annotation set for one query."""

    query: str  # the search query that was evaluated
    tenant_id: str  # tenant the search ran against
    search_total: int  # total hit count reported by search (presumably backend total — confirm)
    search_depth: int  # number of results actually pulled for pooling — TODO confirm semantics
    rerank_corpus_size: int  # size of the corpus scored by the reranker
    annotated_count: int  # how many docs ended up with a relevance label
    output_json_path: Path  # where the per-query annotation JSON was written
| 24 | + | |
| 25 | + | |
| 26 | +class EvalStore: | |
| 27 | + def __init__(self, db_path: Path): | |
| 28 | + self.db_path = db_path | |
| 29 | + ensure_dir(db_path.parent) | |
| 30 | + self.conn = sqlite3.connect(str(db_path), check_same_thread=False) | |
| 31 | + self.conn.row_factory = sqlite3.Row | |
| 32 | + self._init_schema() | |
| 33 | + | |
    def _init_schema(self) -> None:
        """Create all evaluation tables if they do not already exist.

        Idempotent: every statement uses IF NOT EXISTS, so this is safe to run
        on every construction of the store.
        """
        # Tables created here:
        #   corpus_docs      - cached product documents per (tenant, spu)
        #   rerank_scores    - cached reranker score per (tenant, query, spu)
        #   relevance_labels - relevance label per (tenant, query, spu)
        #   build_runs       - one row per annotation-set build run
        #   batch_runs       - one row per batch evaluation (report/config paths)
        #   query_profiles   - structured intent profiles keyed by prompt version
        self.conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS corpus_docs (
                tenant_id TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                title_json TEXT,
                vendor_json TEXT,
                category_path_json TEXT,
                category_name_json TEXT,
                image_url TEXT,
                skus_json TEXT,
                tags_json TEXT,
                raw_json TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, spu_id)
            );

            CREATE TABLE IF NOT EXISTS rerank_scores (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                score REAL NOT NULL,
                model_name TEXT,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, spu_id)
            );

            CREATE TABLE IF NOT EXISTS relevance_labels (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                label TEXT NOT NULL,
                judge_model TEXT,
                raw_response TEXT,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, spu_id)
            );

            CREATE TABLE IF NOT EXISTS build_runs (
                run_id TEXT PRIMARY KEY,
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                output_json_path TEXT NOT NULL,
                metadata_json TEXT NOT NULL,
                created_at TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS batch_runs (
                batch_id TEXT PRIMARY KEY,
                tenant_id TEXT NOT NULL,
                output_json_path TEXT NOT NULL,
                report_markdown_path TEXT NOT NULL,
                config_snapshot_path TEXT NOT NULL,
                metadata_json TEXT NOT NULL,
                created_at TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS query_profiles (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                prompt_version TEXT NOT NULL,
                judge_model TEXT,
                profile_json TEXT NOT NULL,
                raw_response TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, prompt_version)
            );
            """
        )
        self.conn.commit()
| 106 | + def upsert_corpus_docs(self, tenant_id: str, docs: Sequence[Dict[str, Any]]) -> None: | |
| 107 | + now = utc_now_iso() | |
| 108 | + rows = [] | |
| 109 | + for doc in docs: | |
| 110 | + rows.append( | |
| 111 | + ( | |
| 112 | + tenant_id, | |
| 113 | + str(doc.get("spu_id") or ""), | |
| 114 | + safe_json_dumps(doc.get("title")), | |
| 115 | + safe_json_dumps(doc.get("vendor")), | |
| 116 | + safe_json_dumps(doc.get("category_path")), | |
| 117 | + safe_json_dumps(doc.get("category_name")), | |
| 118 | + str(doc.get("image_url") or ""), | |
| 119 | + safe_json_dumps(doc.get("skus") or []), | |
| 120 | + safe_json_dumps(doc.get("tags") or []), | |
| 121 | + safe_json_dumps(doc), | |
| 122 | + now, | |
| 123 | + ) | |
| 124 | + ) | |
| 125 | + self.conn.executemany( | |
| 126 | + """ | |
| 127 | + INSERT INTO corpus_docs ( | |
| 128 | + tenant_id, spu_id, title_json, vendor_json, category_path_json, category_name_json, | |
| 129 | + image_url, skus_json, tags_json, raw_json, updated_at | |
| 130 | + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) | |
| 131 | + ON CONFLICT(tenant_id, spu_id) DO UPDATE SET | |
| 132 | + title_json=excluded.title_json, | |
| 133 | + vendor_json=excluded.vendor_json, | |
| 134 | + category_path_json=excluded.category_path_json, | |
| 135 | + category_name_json=excluded.category_name_json, | |
| 136 | + image_url=excluded.image_url, | |
| 137 | + skus_json=excluded.skus_json, | |
| 138 | + tags_json=excluded.tags_json, | |
| 139 | + raw_json=excluded.raw_json, | |
| 140 | + updated_at=excluded.updated_at | |
| 141 | + """, | |
| 142 | + rows, | |
| 143 | + ) | |
| 144 | + self.conn.commit() | |
| 145 | + | |
| 146 | + def get_corpus_docs(self, tenant_id: str) -> List[Dict[str, Any]]: | |
| 147 | + rows = self.conn.execute( | |
| 148 | + "SELECT raw_json FROM corpus_docs WHERE tenant_id=? ORDER BY spu_id", | |
| 149 | + (tenant_id,), | |
| 150 | + ).fetchall() | |
| 151 | + return [json.loads(row["raw_json"]) for row in rows] | |
| 152 | + | |
| 153 | + def get_corpus_docs_by_spu_ids(self, tenant_id: str, spu_ids: Sequence[str]) -> Dict[str, Dict[str, Any]]: | |
| 154 | + keys = [str(spu_id) for spu_id in spu_ids if str(spu_id).strip()] | |
| 155 | + if not keys: | |
| 156 | + return {} | |
| 157 | + placeholders = ",".join("?" for _ in keys) | |
| 158 | + rows = self.conn.execute( | |
| 159 | + f""" | |
| 160 | + SELECT spu_id, raw_json | |
| 161 | + FROM corpus_docs | |
| 162 | + WHERE tenant_id=? AND spu_id IN ({placeholders}) | |
| 163 | + """, | |
| 164 | + [tenant_id, *keys], | |
| 165 | + ).fetchall() | |
| 166 | + return { | |
| 167 | + str(row["spu_id"]): json.loads(row["raw_json"]) | |
| 168 | + for row in rows | |
| 169 | + } | |
| 170 | + | |
| 171 | + def has_corpus(self, tenant_id: str) -> bool: | |
| 172 | + row = self.conn.execute( | |
| 173 | + "SELECT COUNT(1) AS n FROM corpus_docs WHERE tenant_id=?", | |
| 174 | + (tenant_id,), | |
| 175 | + ).fetchone() | |
| 176 | + return bool(row and row["n"] > 0) | |
| 177 | + | |
| 178 | + def get_rerank_scores(self, tenant_id: str, query_text: str) -> Dict[str, float]: | |
| 179 | + rows = self.conn.execute( | |
| 180 | + """ | |
| 181 | + SELECT spu_id, score | |
| 182 | + FROM rerank_scores | |
| 183 | + WHERE tenant_id=? AND query_text=? | |
| 184 | + """, | |
| 185 | + (tenant_id, query_text), | |
| 186 | + ).fetchall() | |
| 187 | + return {str(row["spu_id"]): float(row["score"]) for row in rows} | |
| 188 | + | |
| 189 | + def upsert_rerank_scores( | |
| 190 | + self, | |
| 191 | + tenant_id: str, | |
| 192 | + query_text: str, | |
| 193 | + scores: Dict[str, float], | |
| 194 | + model_name: str, | |
| 195 | + ) -> None: | |
| 196 | + now = utc_now_iso() | |
| 197 | + rows = [ | |
| 198 | + (tenant_id, query_text, spu_id, float(score), model_name, now) | |
| 199 | + for spu_id, score in scores.items() | |
| 200 | + ] | |
| 201 | + self.conn.executemany( | |
| 202 | + """ | |
| 203 | + INSERT INTO rerank_scores (tenant_id, query_text, spu_id, score, model_name, updated_at) | |
| 204 | + VALUES (?, ?, ?, ?, ?, ?) | |
| 205 | + ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET | |
| 206 | + score=excluded.score, | |
| 207 | + model_name=excluded.model_name, | |
| 208 | + updated_at=excluded.updated_at | |
| 209 | + """, | |
| 210 | + rows, | |
| 211 | + ) | |
| 212 | + self.conn.commit() | |
| 213 | + | |
| 214 | + def get_labels(self, tenant_id: str, query_text: str) -> Dict[str, str]: | |
| 215 | + rows = self.conn.execute( | |
| 216 | + """ | |
| 217 | + SELECT spu_id, label | |
| 218 | + FROM relevance_labels | |
| 219 | + WHERE tenant_id=? AND query_text=? | |
| 220 | + """, | |
| 221 | + (tenant_id, query_text), | |
| 222 | + ).fetchall() | |
| 223 | + return {str(row["spu_id"]): str(row["label"]) for row in rows} | |
| 224 | + | |
| 225 | + def upsert_labels( | |
| 226 | + self, | |
| 227 | + tenant_id: str, | |
| 228 | + query_text: str, | |
| 229 | + labels: Dict[str, str], | |
| 230 | + judge_model: str, | |
| 231 | + raw_response: str, | |
| 232 | + ) -> None: | |
| 233 | + now = utc_now_iso() | |
| 234 | + rows = [] | |
| 235 | + for spu_id, label in labels.items(): | |
| 236 | + if label not in VALID_LABELS: | |
| 237 | + raise ValueError(f"invalid label: {label}") | |
| 238 | + rows.append((tenant_id, query_text, spu_id, label, judge_model, raw_response, now)) | |
| 239 | + self.conn.executemany( | |
| 240 | + """ | |
| 241 | + INSERT INTO relevance_labels (tenant_id, query_text, spu_id, label, judge_model, raw_response, updated_at) | |
| 242 | + VALUES (?, ?, ?, ?, ?, ?, ?) | |
| 243 | + ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET | |
| 244 | + label=excluded.label, | |
| 245 | + judge_model=excluded.judge_model, | |
| 246 | + raw_response=excluded.raw_response, | |
| 247 | + updated_at=excluded.updated_at | |
| 248 | + """, | |
| 249 | + rows, | |
| 250 | + ) | |
| 251 | + self.conn.commit() | |
| 252 | + | |
| 253 | + def get_query_profile(self, tenant_id: str, query_text: str, prompt_version: str) -> Optional[Dict[str, Any]]: | |
| 254 | + row = self.conn.execute( | |
| 255 | + """ | |
| 256 | + SELECT profile_json | |
| 257 | + FROM query_profiles | |
| 258 | + WHERE tenant_id=? AND query_text=? AND prompt_version=? | |
| 259 | + """, | |
| 260 | + (tenant_id, query_text, prompt_version), | |
| 261 | + ).fetchone() | |
| 262 | + if not row: | |
| 263 | + return None | |
| 264 | + return json.loads(row["profile_json"]) | |
| 265 | + | |
| 266 | + def upsert_query_profile( | |
| 267 | + self, | |
| 268 | + tenant_id: str, | |
| 269 | + query_text: str, | |
| 270 | + prompt_version: str, | |
| 271 | + judge_model: str, | |
| 272 | + profile: Dict[str, Any], | |
| 273 | + raw_response: str, | |
| 274 | + ) -> None: | |
| 275 | + self.conn.execute( | |
| 276 | + """ | |
| 277 | + INSERT OR REPLACE INTO query_profiles | |
| 278 | + (tenant_id, query_text, prompt_version, judge_model, profile_json, raw_response, updated_at) | |
| 279 | + VALUES (?, ?, ?, ?, ?, ?, ?) | |
| 280 | + """, | |
| 281 | + ( | |
| 282 | + tenant_id, | |
| 283 | + query_text, | |
| 284 | + prompt_version, | |
| 285 | + judge_model, | |
| 286 | + safe_json_dumps(profile), | |
| 287 | + raw_response, | |
| 288 | + utc_now_iso(), | |
| 289 | + ), | |
| 290 | + ) | |
| 291 | + self.conn.commit() | |
| 292 | + | |
| 293 | + def insert_build_run(self, run_id: str, tenant_id: str, query_text: str, output_json_path: Path, metadata: Dict[str, Any]) -> None: | |
| 294 | + self.conn.execute( | |
| 295 | + """ | |
| 296 | + INSERT OR REPLACE INTO build_runs (run_id, tenant_id, query_text, output_json_path, metadata_json, created_at) | |
| 297 | + VALUES (?, ?, ?, ?, ?, ?) | |
| 298 | + """, | |
| 299 | + (run_id, tenant_id, query_text, str(output_json_path), safe_json_dumps(metadata), utc_now_iso()), | |
| 300 | + ) | |
| 301 | + self.conn.commit() | |
| 302 | + | |
| 303 | + def insert_batch_run( | |
| 304 | + self, | |
| 305 | + batch_id: str, | |
| 306 | + tenant_id: str, | |
| 307 | + output_json_path: Path, | |
| 308 | + report_markdown_path: Path, | |
| 309 | + config_snapshot_path: Path, | |
| 310 | + metadata: Dict[str, Any], | |
| 311 | + ) -> None: | |
| 312 | + self.conn.execute( | |
| 313 | + """ | |
| 314 | + INSERT OR REPLACE INTO batch_runs | |
| 315 | + (batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at) | |
| 316 | + VALUES (?, ?, ?, ?, ?, ?, ?) | |
| 317 | + """, | |
| 318 | + ( | |
| 319 | + batch_id, | |
| 320 | + tenant_id, | |
| 321 | + str(output_json_path), | |
| 322 | + str(report_markdown_path), | |
| 323 | + str(config_snapshot_path), | |
| 324 | + safe_json_dumps(metadata), | |
| 325 | + utc_now_iso(), | |
| 326 | + ), | |
| 327 | + ) | |
| 328 | + self.conn.commit() | |
| 329 | + | |
| 330 | + def list_batch_runs(self, limit: int = 20) -> List[Dict[str, Any]]: | |
| 331 | + rows = self.conn.execute( | |
| 332 | + """ | |
| 333 | + SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at | |
| 334 | + FROM batch_runs | |
| 335 | + ORDER BY created_at DESC | |
| 336 | + LIMIT ? | |
| 337 | + """, | |
| 338 | + (limit,), | |
| 339 | + ).fetchall() | |
| 340 | + items: List[Dict[str, Any]] = [] | |
| 341 | + for row in rows: | |
| 342 | + items.append( | |
| 343 | + { | |
| 344 | + "batch_id": row["batch_id"], | |
| 345 | + "tenant_id": row["tenant_id"], | |
| 346 | + "output_json_path": row["output_json_path"], | |
| 347 | + "report_markdown_path": row["report_markdown_path"], | |
| 348 | + "config_snapshot_path": row["config_snapshot_path"], | |
| 349 | + "metadata": json.loads(row["metadata_json"]), | |
| 350 | + "created_at": row["created_at"], | |
| 351 | + } | |
| 352 | + ) | |
| 353 | + return items | |
| 354 | + | |
| 355 | + def get_batch_run(self, batch_id: str) -> Optional[Dict[str, Any]]: | |
| 356 | + row = self.conn.execute( | |
| 357 | + """ | |
| 358 | + SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at | |
| 359 | + FROM batch_runs | |
| 360 | + WHERE batch_id = ? | |
| 361 | + """, | |
| 362 | + (batch_id,), | |
| 363 | + ).fetchone() | |
| 364 | + if row is None: | |
| 365 | + return None | |
| 366 | + return { | |
| 367 | + "batch_id": row["batch_id"], | |
| 368 | + "tenant_id": row["tenant_id"], | |
| 369 | + "output_json_path": row["output_json_path"], | |
| 370 | + "report_markdown_path": row["report_markdown_path"], | |
| 371 | + "config_snapshot_path": row["config_snapshot_path"], | |
| 372 | + "metadata": json.loads(row["metadata_json"]), | |
| 373 | + "created_at": row["created_at"], | |
| 374 | + } | |
| 375 | + | |
| 376 | + def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]: | |
| 377 | + rows = self.conn.execute( | |
| 378 | + """ | |
| 379 | + SELECT | |
| 380 | + query_text, | |
| 381 | + COUNT(*) AS total, | |
| 382 | + SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, | |
| 383 | + SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, | |
| 384 | + SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, | |
| 385 | + MAX(updated_at) AS updated_at | |
| 386 | + FROM relevance_labels | |
| 387 | + WHERE tenant_id=? | |
| 388 | + GROUP BY query_text | |
| 389 | + ORDER BY query_text | |
| 390 | + """, | |
| 391 | + (tenant_id,), | |
| 392 | + ).fetchall() | |
| 393 | + return [ | |
| 394 | + { | |
| 395 | + "query": str(row["query_text"]), | |
| 396 | + "total": int(row["total"]), | |
| 397 | + "exact_count": int(row["exact_count"] or 0), | |
| 398 | + "partial_count": int(row["partial_count"] or 0), | |
| 399 | + "irrelevant_count": int(row["irrelevant_count"] or 0), | |
| 400 | + "updated_at": row["updated_at"], | |
| 401 | + } | |
| 402 | + for row in rows | |
| 403 | + ] | |
| 404 | + | |
| 405 | + def get_query_label_stats(self, tenant_id: str, query_text: str) -> Dict[str, Any]: | |
| 406 | + row = self.conn.execute( | |
| 407 | + """ | |
| 408 | + SELECT | |
| 409 | + COUNT(*) AS total, | |
| 410 | + SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, | |
| 411 | + SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, | |
| 412 | + SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, | |
| 413 | + MAX(updated_at) AS updated_at | |
| 414 | + FROM relevance_labels | |
| 415 | + WHERE tenant_id=? AND query_text=? | |
| 416 | + """, | |
| 417 | + (tenant_id, query_text), | |
| 418 | + ).fetchone() | |
| 419 | + return { | |
| 420 | + "query": query_text, | |
| 421 | + "total": int((row["total"] or 0) if row else 0), | |
| 422 | + "exact_count": int((row["exact_count"] or 0) if row else 0), | |
| 423 | + "partial_count": int((row["partial_count"] or 0) if row else 0), | |
| 424 | + "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0), | |
| 425 | + "updated_at": row["updated_at"] if row else None, | |
| 426 | + } | ... | ... |
| ... | ... | @@ -0,0 +1,145 @@ |
| 1 | +"""Small helpers: time, JSON, document text, LLM output parsing.""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +import hashlib | |
| 6 | +import json | |
| 7 | +import re | |
| 8 | +from datetime import datetime, timezone | |
| 9 | +from pathlib import Path | |
| 10 | +from typing import Any, Dict, List, Sequence, Tuple | |
| 11 | + | |
| 12 | +from .constants import PROJECT_ROOT | |
| 13 | + | |
| 14 | + | |
def utc_now_iso() -> str:
    """Current UTC time as a timezone-aware ISO-8601 string."""
    now = datetime.now(timezone.utc)
    return now.isoformat()
| 17 | + | |
| 18 | + | |
def utc_timestamp() -> str:
    """Compact UTC timestamp (YYYYMMDDTHHMMSSZ), suitable for file names."""
    return format(datetime.now(timezone.utc), "%Y%m%dT%H%M%SZ")
| 21 | + | |
| 22 | + | |
def ensure_dir(path: Path) -> Path:
    """Create *path* (including parents) if missing; return it for chaining."""
    path.mkdir(exist_ok=True, parents=True)
    return path
| 26 | + | |
| 27 | + | |
def sha1_text(text: str) -> str:
    """Hex SHA-1 digest of *text* (UTF-8). Used as a stable key, not for security."""
    digest = hashlib.sha1()
    digest.update(text.encode("utf-8"))
    return digest.hexdigest()
| 30 | + | |
| 31 | + | |
def pick_text(value: Any, preferred_lang: str = "en") -> str:
    """Best-effort plain string from a possibly multilingual value.

    Dicts are treated as {lang: text}: preferred language first, then "en",
    then "zh", then the first truthy value. Anything else is stringified.
    The result is always stripped.
    """
    if value is None:
        return ""
    if not isinstance(value, dict):
        return str(value).strip()
    chosen = value.get(preferred_lang) or value.get("en") or value.get("zh")
    if not chosen:
        chosen = next((item for item in value.values() if item), "")
    return str(chosen).strip()
| 43 | + | |
| 44 | + | |
def safe_json_dumps(data: Any) -> str:
    """Compact JSON: no whitespace separators, non-ASCII kept verbatim."""
    return json.dumps(data, separators=(",", ":"), ensure_ascii=False)
| 47 | + | |
| 48 | + | |
def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
    """(option1, option2, option3) values of the first SKU, stripped.

    Missing or falsy values (including a None first SKU) become empty strings.
    """
    if not skus:
        return "", "", ""
    first = skus[0] or {}
    option1, option2, option3 = (
        str(first.get(key) or "").strip()
        for key in ("option1_value", "option2_value", "option3_value")
    )
    return option1, option2, option3
| 58 | + | |
| 59 | + | |
def build_display_title(doc: Dict[str, Any]) -> str:
    """Human-readable title: "EN / ZH" when both exist and differ, else whichever is set."""
    raw_title = doc.get("title")
    english = pick_text(raw_title, "en")
    chinese = pick_text(raw_title, "zh")
    if english and chinese and english != chinese:
        return f"{english} / {chinese}"
    return english or chinese
| 67 | + | |
| 68 | + | |
def build_rerank_doc(doc: Dict[str, Any]) -> str:
    """Short document text fed to the reranker: the display title, capped at 400 chars."""
    return build_display_title(doc)[:400]
| 72 | + | |
| 73 | + | |
def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """One numbered, pipe-separated summary line for the judge prompt.

    Format: "<idx>. <title> | option1=... | vendor=... | category=... | tags=..."
    with empty fields omitted. At most the first 4 truthy tags are included.
    """
    option1, option2, option3 = compact_option_values(doc.get("skus") or [])
    tag_items = doc.get("tags") or []
    fields = [
        ("", build_display_title(doc)),
        ("option1=", option1),
        ("option2=", option2),
        ("option3=", option3),
        ("vendor=", pick_text(doc.get("vendor"), "en")),
        ("category=", pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")),
        ("tags=", ", ".join(str(tag) for tag in tag_items[:4] if tag)),
    ]
    rendered = [prefix + value for prefix, value in fields if value]
    return f"{idx}. " + " | ".join(rendered)
| 95 | + | |
| 96 | + | |
def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Trimmed product dict (title, image, vendor, category, options, first 6 tags) for API/UI use."""
    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    payload: Dict[str, Any] = {}
    payload["spu_id"] = str(doc.get("spu_id") or "")
    payload["title"] = build_display_title(doc)
    payload["image_url"] = doc.get("image_url")
    payload["vendor"] = pick_text(doc.get("vendor"), "en")
    payload["category"] = category
    payload["option_values"] = list(compact_option_values(doc.get("skus") or []))
    payload["tags"] = list((doc.get("tags") or [])[:6])
    return payload
| 107 | + | |
| 108 | + | |
def normalize_text(text: Any) -> str:
    """Lowercase, trim, and collapse internal whitespace runs to single spaces."""
    collapsed = re.sub(r"\s+", " ", str(text or ""))
    return collapsed.strip().lower()
| 113 | + | |
| 114 | + | |
def extract_json_blob(text: str) -> Any:
    """Parse the first JSON value found in an LLM reply.

    Candidates are tried in order: the whole (stripped) text, each ```json
    fenced block, then every substring between an opening '{'/'[' and a later
    '}'/']' (earliest start, longest end first).

    Raises:
        ValueError: when no candidate parses as JSON.
    """
    cleaned = str(text or "").strip()

    candidates: List[str] = [cleaned]
    for fenced in re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I):
        fenced = fenced.strip()
        if fenced:
            candidates.append(fenced)

    for candidate in candidates:
        try:
            return json.loads(candidate)
        except Exception:
            continue

    # Brute-force bracket scan as a last resort.
    open_positions = [pos for pos, ch in enumerate(cleaned) if ch in "[{"]
    close_positions = [pos for pos, ch in enumerate(cleaned) if ch in "]}"]
    for start in open_positions:
        for end in reversed(close_positions):
            if end > start:
                try:
                    return json.loads(cleaned[start : end + 1])
                except Exception:
                    pass
    raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")
| 139 | + | |
| 140 | + | |
def ensure_project_on_path() -> None:
    """Prepend PROJECT_ROOT to sys.path (once) so project-absolute imports resolve."""
    import sys

    root = str(PROJECT_ROOT)
    if root not in sys.path:
        sys.path.insert(0, root)
| ... | ... | @@ -0,0 +1,85 @@ |
| 1 | +"""FastAPI app for the search evaluation UI (static frontend + JSON APIs).""" | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +from pathlib import Path | |
| 6 | +from typing import Any, Dict | |
| 7 | + | |
| 8 | +from fastapi import FastAPI, HTTPException | |
| 9 | +from fastapi.responses import HTMLResponse | |
| 10 | +from fastapi.staticfiles import StaticFiles | |
| 11 | + | |
| 12 | +from .api_models import BatchEvalRequest, SearchEvalRequest | |
| 13 | +from .constants import DEFAULT_QUERY_FILE | |
| 14 | +from .framework import SearchEvaluationFramework | |
| 15 | + | |
| 16 | +_STATIC_DIR = Path(__file__).resolve().parent / "static" | |
| 17 | + | |
| 18 | + | |
def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFAULT_QUERY_FILE) -> FastAPI:
    """Build the FastAPI app serving the static eval UI and its JSON APIs.

    Args:
        framework: evaluation orchestrator; provides live/batch evaluation,
            the default query list, and the SQLite-backed run history store.
        query_file: query list used when a request does not supply queries.

    Returns:
        A configured FastAPI application (handlers close over *framework*).
    """
    app = FastAPI(title="Search Evaluation UI", version="1.0.0")

    # Frontend assets (HTML/CSS/JS) live next to this module under static/.
    app.mount(
        "/static",
        StaticFiles(directory=str(_STATIC_DIR)),
        name="static",
    )

    index_path = _STATIC_DIR / "index.html"

    @app.get("/", response_class=HTMLResponse)
    def home() -> str:
        # Read per request (not cached at startup), so edits to index.html
        # show up without restarting the server.
        return index_path.read_text(encoding="utf-8")

    @app.get("/api/queries")
    def api_queries() -> Dict[str, Any]:
        # Default query list from the configured query file.
        return {"queries": framework.queries_from_file(query_file)}

    @app.post("/api/search-eval")
    def api_search_eval(request: SearchEvalRequest) -> Dict[str, Any]:
        # Single-query live evaluation.
        return framework.evaluate_live_query(
            query=request.query,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
        )

    @app.post("/api/batch-eval")
    def api_batch_eval(request: BatchEvalRequest) -> Dict[str, Any]:
        # Batch evaluation; falls back to the query file when the request
        # carries no queries of its own.
        queries = request.queries or framework.queries_from_file(query_file)
        if not queries:
            raise HTTPException(status_code=400, detail="No queries provided")
        return framework.batch_evaluate(
            queries=queries,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
            force_refresh_labels=request.force_refresh_labels,
        )

    @app.get("/api/history")
    def api_history() -> Dict[str, Any]:
        # Most recent 20 batch runs from the store.
        return {"history": framework.store.list_batch_runs(limit=20)}

    @app.get("/api/history/{batch_id}/report")
    def api_history_report(batch_id: str) -> Dict[str, Any]:
        # Return the stored markdown report for one batch run.
        row = framework.store.get_batch_run(batch_id)
        if row is None:
            raise HTTPException(status_code=404, detail="Unknown batch_id")
        report_path = Path(row["report_markdown_path"]).resolve()
        root = framework.artifact_root.resolve()
        # Path-containment check: refuse to serve files outside the artifact
        # root even if the DB row points elsewhere.
        try:
            report_path.relative_to(root)
        except ValueError:
            raise HTTPException(status_code=403, detail="Report path is outside artifact root")
        if not report_path.is_file():
            raise HTTPException(status_code=404, detail="Report file not found")
        return {
            "batch_id": row["batch_id"],
            "created_at": row["created_at"],
            "tenant_id": row["tenant_id"],
            "report_markdown_path": str(report_path),
            "markdown": report_path.read_text(encoding="utf-8"),
        }

    return app