From a345b01f79e0926df2b7acc0ca9d3bb31715a654 Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 1 Apr 2026 10:00:45 +0800 Subject: [PATCH] eval framework --- docs/Usage-Guide.md | 2 +- docs/issue-2026-03-31-评估框架-done-0331.md | 2 +- docs/相关性检索优化说明.md | 2 +- scripts/evaluation/README.md | 10 +++++----- scripts/evaluation/eval_framework/__init__.py | 16 ++++++++-------- scripts/evaluation/eval_framework/cli.py | 21 ++++----------------- scripts/evaluation/eval_framework/clients.py | 81 ++++++++++++++++++++++++--------------------------------------------------------- scripts/evaluation/eval_framework/constants.py | 29 +++++++++++++++++++++++------ scripts/evaluation/eval_framework/framework.py | 255 +++++++++++++++++++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scripts/evaluation/eval_framework/metrics.py | 17 +++++++++++------ scripts/evaluation/eval_framework/prompts.py | 80 +++++--------------------------------------------------------------------------- scripts/evaluation/eval_framework/reports.py | 12 +++++++----- scripts/evaluation/eval_framework/static/eval_web.css | 9 +++++---- scripts/evaluation/eval_framework/static/eval_web.js | 8 ++++++-- scripts/evaluation/eval_framework/static/index.html | 2 +- scripts/evaluation/eval_framework/store.py | 20 ++++++++++++-------- scripts/evaluation/quick_start_eval.sh | 52 ---------------------------------------------------- scripts/evaluation/start_eval.sh | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 18 files changed, 183 insertions(+), 485 deletions(-) delete mode 100755 scripts/evaluation/quick_start_eval.sh create mode 100755 scripts/evaluation/start_eval.sh diff --git a/docs/Usage-Guide.md b/docs/Usage-Guide.md index ae627c9..9686d11 100644 --- a/docs/Usage-Guide.md +++ b/docs/Usage-Guide.md @@ -202,7 +202,7 @@ python 
-m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t ./scripts/service_ctl.sh restart backend sleep 3 ./scripts/service_ctl.sh status backend -./scripts/evaluation/quick_start_eval.sh batch +./scripts/evaluation/start_eval.sh batch ``` 离线批量评估会把标注与报表写到 `artifacts/search_evaluation/`(SQLite、`batch_reports/` 下的 JSON/Markdown 等)。说明与命令见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 diff --git a/docs/issue-2026-03-31-评估框架-done-0331.md b/docs/issue-2026-03-31-评估框架-done-0331.md index 3783695..257ce6a 100644 --- a/docs/issue-2026-03-31-评估框架-done-0331.md +++ b/docs/issue-2026-03-31-评估框架-done-0331.md @@ -138,7 +138,7 @@ queries默认是queries/queries.txt,填入左侧列表框,点击其中任何 @scripts/evaluation/README.md @scripts/evaluation/eval_framework/framework.py -@quick_start_eval.sh (29-35) +@start_eval.sh (29-35) 请以如下流程为准,进行改造: 如果重建的话,对每个query: 每个搜索结果应该会扫描全库, diff --git a/docs/相关性检索优化说明.md b/docs/相关性检索优化说明.md index 6b0e7ce..29d158c 100644 --- a/docs/相关性检索优化说明.md +++ b/docs/相关性检索优化说明.md @@ -240,7 +240,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t ./scripts/service_ctl.sh restart backend sleep 3 ./scripts/service_ctl.sh status backend -./scripts/evaluation/quick_start_eval.sh batch +./scripts/evaluation/start_eval.sh batch ``` 评估产物在 `artifacts/search_evaluation/`(如 `search_eval.sqlite3`、`batch_reports/` 下的 JSON/Markdown)。流程与参数说明见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index 9366411..59b43a8 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -23,7 +23,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, | `fusion_experiments_round1.json` | Broader first-round experiments | | `queries/queries.txt` | Canonical evaluation queries | | `README_Requirement.md` | Product/requirements reference | -| `quick_start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep
`build` + `--force-refresh-labels`), or `serve` | +| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. | ## Quick start (repo root) @@ -32,13 +32,13 @@ Set tenant if needed (`export TENANT_ID=163`). You need a live search API, DashS ```bash # Batch: live search for every query; only uncached (query, spu_id) pairs hit the LLM -./scripts/evaluation/quick_start_eval.sh batch +./scripts/evaluation/start_eval.sh batch # Deep rebuild: per-query full corpus rerank (outside search top-500 pool) + LLM in 50-doc batches along global sort order (early stop; expensive) -./scripts/evaluation/quick_start_eval.sh batch-rebuild +./scripts/evaluation/start_eval.sh batch-rebuild # UI: http://127.0.0.1:6010/ -./scripts/evaluation/quick_start_eval.sh serve +./scripts/evaluation/start_eval.sh serve # or: ./scripts/service_ctl.sh start eval-web ``` @@ -71,7 +71,7 @@ Explicit equivalents: Each `batch` run walks the full queries file and writes a **batch report** under `batch_reports/`. With `batch --force-refresh-labels`, every live top-`k` hit is re-judged by the LLM (still only those hits—not the deep rebuild pipeline). -### `quick_start_eval.sh batch-rebuild` (deep annotation rebuild) +### `start_eval.sh batch-rebuild` (deep annotation rebuild) This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`. 
diff --git a/scripts/evaluation/eval_framework/__init__.py b/scripts/evaluation/eval_framework/__init__.py index acbcaff..236fb67 100644 --- a/scripts/evaluation/eval_framework/__init__.py +++ b/scripts/evaluation/eval_framework/__init__.py @@ -12,15 +12,15 @@ ensure_project_on_path() from .constants import ( # noqa: E402 DEFAULT_ARTIFACT_ROOT, - DEFAULT_LABELER_MODE, DEFAULT_QUERY_FILE, - JUDGE_PROMPT_VERSION_COMPLEX, - JUDGE_PROMPT_VERSION_SIMPLE, PROJECT_ROOT, RELEVANCE_EXACT, + RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, - RELEVANCE_PARTIAL, + RELEVANCE_LOW, + RELEVANCE_NON_IRRELEVANT, VALID_LABELS, + normalize_stored_label, ) from .framework import SearchEvaluationFramework # noqa: E402 from .store import EvalStore, QueryBuildResult # noqa: E402 @@ -36,22 +36,22 @@ from .utils import ( # noqa: E402 __all__ = [ "DEFAULT_ARTIFACT_ROOT", - "DEFAULT_LABELER_MODE", "DEFAULT_QUERY_FILE", "EvalStore", - "JUDGE_PROMPT_VERSION_COMPLEX", - "JUDGE_PROMPT_VERSION_SIMPLE", "PROJECT_ROOT", "QueryBuildResult", "RELEVANCE_EXACT", + "RELEVANCE_HIGH", "RELEVANCE_IRRELEVANT", - "RELEVANCE_PARTIAL", + "RELEVANCE_LOW", + "RELEVANCE_NON_IRRELEVANT", "SearchEvaluationFramework", "VALID_LABELS", "build_cli_parser", "create_web_app", "ensure_dir", "main", + "normalize_stored_label", "render_batch_report_markdown", "sha1_text", "utc_now_iso", diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py index 6421a0a..9417776 100644 --- a/scripts/evaluation/eval_framework/cli.py +++ b/scripts/evaluation/eval_framework/cli.py @@ -8,7 +8,6 @@ from pathlib import Path from typing import Any, Dict from .constants import ( - DEFAULT_LABELER_MODE, DEFAULT_QUERY_FILE, DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, @@ -103,7 +102,6 @@ def build_cli_parser() -> argparse.ArgumentParser: build.add_argument("--language", default="en") build.add_argument("--force-refresh-rerank", action="store_true") 
build.add_argument("--force-refresh-labels", action="store_true") - build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) add_judge_llm_args(build) batch = sub.add_parser("batch", help="Run batch evaluation against live search") @@ -112,7 +110,6 @@ def build_cli_parser() -> argparse.ArgumentParser: batch.add_argument("--top-k", type=int, default=100) batch.add_argument("--language", default="en") batch.add_argument("--force-refresh-labels", action="store_true") - batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) add_judge_llm_args(batch) audit = sub.add_parser("audit", help="Audit annotation quality for queries") @@ -122,7 +119,6 @@ def build_cli_parser() -> argparse.ArgumentParser: audit.add_argument("--language", default="en") audit.add_argument("--limit-suspicious", type=int, default=5) audit.add_argument("--force-refresh-labels", action="store_true") - audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) add_judge_llm_args(audit) serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") @@ -130,16 +126,13 @@ def build_cli_parser() -> argparse.ArgumentParser: serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) serve.add_argument("--host", default="0.0.0.0") serve.add_argument("--port", type=int, default=6010) - serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) add_judge_llm_args(serve) return parser def run_build(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework( - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) - ) + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = framework.queries_from_file(Path(args.queries_file)) summary = [] rebuild_kwargs = {} @@ -191,9 +184,7 @@ def run_build(args: argparse.Namespace) -> 
None: def run_batch(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework( - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) - ) + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = framework.queries_from_file(Path(args.queries_file)) payload = framework.batch_evaluate( queries=queries, @@ -206,9 +197,7 @@ def run_batch(args: argparse.Namespace) -> None: def run_audit(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework( - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) - ) + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = framework.queries_from_file(Path(args.queries_file)) audit_items = [] for query in queries: @@ -258,9 +247,7 @@ def run_audit(args: argparse.Namespace) -> None: def run_serve(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework( - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) - ) + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) app = create_web_app(framework, Path(args.queries_file)) import uvicorn diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py index d228e42..77edd20 100644 --- a/scripts/evaluation/eval_framework/clients.py +++ b/scripts/evaluation/eval_framework/clients.py @@ -11,14 +11,21 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple import requests from .constants import VALID_LABELS -from .prompts import ( - classify_batch_complex_prompt, - classify_batch_simple_prompt, - extract_query_profile_prompt, -) +from .prompts import classify_prompt from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps +def _canonicalize_judge_label(raw: str) -> str | None: + s = str(raw or 
"").strip().strip('"').strip("'") + if s in VALID_LABELS: + return s + low = s.lower() + for v in VALID_LABELS: + if v.lower() == low: + return v + return None + + class SearchServiceClient: def __init__(self, base_url: str, tenant_id: str): self.base_url = base_url.rstrip("/") @@ -224,71 +231,31 @@ class DashScopeLabelClient: return obj return None - def classify_batch_simple( + def classify_batch( self, query: str, docs: Sequence[Dict[str, Any]], ) -> Tuple[List[str], str]: numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] - prompt = classify_batch_simple_prompt(query, numbered_docs) + prompt = classify_prompt(query, numbered_docs) content, raw_response = self._chat(prompt) - labels = [] + labels: List[str] = [] for line in str(content or "").splitlines(): - label = line.strip() - if label in VALID_LABELS: - labels.append(label) + canon = _canonicalize_judge_label(line) + if canon is not None: + labels.append(canon) if len(labels) != len(docs): payload = extract_json_blob(content) if isinstance(payload, dict) and isinstance(payload.get("labels"), list): labels = [] for item in payload["labels"][: len(docs)]: if isinstance(item, dict): - label = str(item.get("label") or "").strip() + raw_l = str(item.get("label") or "").strip() else: - label = str(item).strip() - if label in VALID_LABELS: - labels.append(label) - if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): - raise ValueError(f"unexpected simple label output: {content!r}") - return labels, raw_response - - def extract_query_profile( - self, - query: str, - parser_hints: Dict[str, Any], - ) -> Tuple[Dict[str, Any], str]: - prompt = extract_query_profile_prompt(query, parser_hints) - content, raw_response = self._chat(prompt) - payload = extract_json_blob(content) - if not isinstance(payload, dict): - raise ValueError(f"unexpected query profile payload: {content!r}") - payload.setdefault("normalized_query_en", query) - 
payload.setdefault("primary_category", "") - payload.setdefault("allowed_categories", []) - payload.setdefault("required_attributes", []) - payload.setdefault("notes", []) - return payload, raw_response - - def classify_batch_complex( - self, - query: str, - query_profile: Dict[str, Any], - docs: Sequence[Dict[str, Any]], - ) -> Tuple[List[str], str]: - numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] - prompt = classify_batch_complex_prompt(query, query_profile, numbered_docs) - content, raw_response = self._chat(prompt) - payload = extract_json_blob(content) - if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list): - raise ValueError(f"unexpected label payload: {content!r}") - labels_payload = payload["labels"] - labels: List[str] = [] - for item in labels_payload[: len(docs)]: - if not isinstance(item, dict): - continue - label = str(item.get("label") or "").strip() - if label in VALID_LABELS: - labels.append(label) + raw_l = str(item).strip() + canon = _canonicalize_judge_label(raw_l) + if canon is not None: + labels.append(canon) if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): - raise ValueError(f"unexpected label output: {content!r}") + raise ValueError(f"unexpected classify output: {content!r}") return labels, raw_response diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py index eea6182..4dacc1d 100644 --- a/scripts/evaluation/eval_framework/constants.py +++ b/scripts/evaluation/eval_framework/constants.py @@ -6,17 +6,34 @@ _PKG_DIR = Path(__file__).resolve().parent _SCRIPTS_EVAL_DIR = _PKG_DIR.parent PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] -RELEVANCE_EXACT = "Exact" -RELEVANCE_PARTIAL = "Partial" +# Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN) +RELEVANCE_EXACT = "Exact Match" +RELEVANCE_HIGH = "High Relevant" +RELEVANCE_LOW = "Low Relevant" 
RELEVANCE_IRRELEVANT = "Irrelevant" -VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} + +VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) + +# Precision / MAP "positive" set (all non-irrelevant tiers) +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW}) + +_LEGACY_LABEL_MAP = { + "Exact": RELEVANCE_EXACT, + "Partial": RELEVANCE_HIGH, +} + + +def normalize_stored_label(label: str) -> str: + """Map legacy 3-way SQLite labels to current 4-way strings; pass through canonical labels.""" + s = str(label).strip() + if s in VALID_LABELS: + return s + return _LEGACY_LABEL_MAP.get(s, s) + DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" -JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" -JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" -DEFAULT_LABELER_MODE = "simple" # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs) DEFAULT_JUDGE_MODEL = "qwen3.5-flash" DEFAULT_JUDGE_ENABLE_THINKING = True diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index ab6260c..32c815a 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -10,7 +10,7 @@ from typing import Any, Dict, List, Sequence, Tuple import requests from elasticsearch.helpers import scan -from api.app import get_app_config, get_es_client, get_query_parser, init_service +from api.app import get_app_config, get_es_client, init_service from indexer.mapping_generator import get_tenant_index_name from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient @@ -21,7 +21,6 @@ from .constants import ( DEFAULT_JUDGE_DASHSCOPE_BATCH, DEFAULT_JUDGE_ENABLE_THINKING, DEFAULT_JUDGE_MODEL, - DEFAULT_LABELER_MODE, DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, 
DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, DEFAULT_REBUILD_LLM_BATCH_SIZE, @@ -30,10 +29,11 @@ from .constants import ( DEFAULT_RERANK_HIGH_SKIP_COUNT, DEFAULT_RERANK_HIGH_THRESHOLD, DEFAULT_SEARCH_RECALL_TOP_K, - JUDGE_PROMPT_VERSION_COMPLEX, RELEVANCE_EXACT, + RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, - RELEVANCE_PARTIAL, + RELEVANCE_LOW, + RELEVANCE_NON_IRRELEVANT, VALID_LABELS, ) from .metrics import aggregate_metrics, compute_query_metrics, label_distribution @@ -45,8 +45,6 @@ from .utils import ( compact_option_values, compact_product_payload, ensure_dir, - normalize_text, - pick_text, sha1_text, utc_now_iso, utc_timestamp, @@ -77,7 +75,6 @@ class SearchEvaluationFramework: tenant_id: str, artifact_root: Path = DEFAULT_ARTIFACT_ROOT, search_base_url: str = "http://localhost:6002", - labeler_mode: str = DEFAULT_LABELER_MODE, *, judge_model: str | None = None, enable_thinking: bool | None = None, @@ -86,7 +83,6 @@ class SearchEvaluationFramework: init_service(get_app_config().infrastructure.elasticsearch.host) self.tenant_id = str(tenant_id) self.artifact_root = ensure_dir(artifact_root) - self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") self.search_client = SearchServiceClient(search_base_url, self.tenant_id) app_cfg = get_app_config() @@ -112,178 +108,6 @@ class SearchEvaluationFramework: enable_thinking=et, use_batch=use_batch, ) - self.query_parser = None - - def _get_query_parser(self): - if self.query_parser is None: - self.query_parser = get_query_parser() - return self.query_parser - - def build_query_parser_hints(self, query: str) -> Dict[str, Any]: - parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"]) - payload = parsed.to_dict() - payload["text_for_rerank"] = parsed.text_for_rerank() - return payload - - def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]: - if self.labeler_mode 
!= "complex": - raise RuntimeError("query profiles are only used in complex labeler mode") - if not force_refresh: - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX) - if cached is not None: - return cached - parser_hints = self.build_query_parser_hints(query) - profile, raw_response = self.label_client.extract_query_profile(query, parser_hints) - profile["parser_hints"] = parser_hints - self.store.upsert_query_profile( - self.tenant_id, - query, - JUDGE_PROMPT_VERSION_COMPLEX, - self.label_client.model, - profile, - raw_response, - ) - return profile - - @staticmethod - def _doc_evidence_text(doc: Dict[str, Any]) -> str: - pieces: List[str] = [ - build_display_title(doc), - pick_text(doc.get("vendor"), "en"), - pick_text(doc.get("category_path"), "en"), - pick_text(doc.get("category_name"), "en"), - ] - for sku in doc.get("skus") or []: - pieces.extend( - [ - str(sku.get("option1_value") or ""), - str(sku.get("option2_value") or ""), - str(sku.get("option3_value") or ""), - ] - ) - for tag in doc.get("tags") or []: - pieces.append(str(tag)) - return normalize_text(" | ".join(piece for piece in pieces if piece)) - - def _apply_rule_based_label_guardrails( - self, - label: str, - query_profile: Dict[str, Any], - doc: Dict[str, Any], - ) -> str: - if label not in VALID_LABELS: - return label - evidence = self._doc_evidence_text(doc) - category = normalize_text(query_profile.get("primary_category")) - allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()] - - primary_category_match = True - if category: - primary_category_match = category in evidence - allowed_category_match = True - if allowed_categories: - allowed_category_match = any(signal in evidence for signal in allowed_categories) - - if label == RELEVANCE_EXACT and not primary_category_match: - if allowed_category_match: - label = RELEVANCE_PARTIAL - else: - return RELEVANCE_IRRELEVANT - - for attr in 
query_profile.get("required_attributes") or []: - if not isinstance(attr, dict): - continue - attr_name = normalize_text(attr.get("name")) - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}: - continue - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)] - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)] - if attr_name == "fit": - if any(term in {"oversized", "oversize"} for term in required_terms): - conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"]) - if any(term in {"fitted", "slim fit", "tight"} for term in required_terms): - conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"]) - has_required = any(term in evidence for term in required_terms) if required_terms else True - has_conflict = any(term in evidence for term in conflicting_terms) - - if has_conflict: - return RELEVANCE_IRRELEVANT - if label == RELEVANCE_EXACT and not has_required: - label = RELEVANCE_PARTIAL - - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match: - return RELEVANCE_IRRELEVANT - - return label - - @staticmethod - def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]: - option_values = list(item.get("option_values") or []) - while len(option_values) < 3: - option_values.append("") - product = dict(item.get("product") or {}) - return { - "spu_id": item.get("spu_id"), - "title": product.get("title") or item.get("title"), - "vendor": product.get("vendor"), - "category_path": product.get("category"), - "category_name": product.get("category"), - "image_url": item.get("image_url") or product.get("image_url"), - "tags": product.get("tags") or [], - "skus": [ - { - "option1_value": option_values[0], - "option2_value": option_values[1], - "option3_value": option_values[2], - } - ], - } - - 
def _collect_label_issues( - self, - label: str, - query_profile: Dict[str, Any], - doc: Dict[str, Any], - ) -> List[str]: - evidence = self._doc_evidence_text(doc) - issues: List[str] = [] - category = normalize_text(query_profile.get("primary_category")) - allowed_categories = [ - normalize_text(item) - for item in query_profile.get("allowed_categories") or [] - if str(item).strip() - ] - - primary_category_match = True if not category else category in evidence - allowed_category_match = False if allowed_categories else primary_category_match - if allowed_categories: - allowed_category_match = any(signal in evidence for signal in allowed_categories) - - if label == RELEVANCE_EXACT and not primary_category_match: - if allowed_category_match: - issues.append("Exact missing primary category evidence") - else: - issues.append("Exact has category mismatch") - - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match: - issues.append("Partial has category mismatch") - - for attr in query_profile.get("required_attributes") or []: - if not isinstance(attr, dict): - continue - attr_name = normalize_text(attr.get("name")) - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}: - continue - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)] - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)] - has_required = any(term in evidence for term in required_terms) if required_terms else True - has_conflict = any(term in evidence for term in conflicting_terms) - - if has_conflict and label != RELEVANCE_IRRELEVANT: - issues.append(f"{label} conflicts on {attr_name}") - if label == RELEVANCE_EXACT and not has_required: - issues.append(f"Exact missing {attr_name}") - return issues def audit_live_query( self, @@ -294,42 +118,6 @@ class SearchEvaluationFramework: 
auto_annotate: bool = False, ) -> Dict[str, Any]: live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) - if self.labeler_mode != "complex": - labels = [ - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT - for item in live["results"] - ] - return { - "query": query, - "tenant_id": self.tenant_id, - "top_k": top_k, - "metrics": live["metrics"], - "distribution": label_distribution(labels), - "query_profile": None, - "suspicious": [], - "results": live["results"], - } - query_profile = self.get_query_profile(query, force_refresh=False) - suspicious: List[Dict[str, Any]] = [] - - for item in live["results"]: - doc = self._result_item_to_doc(item) - issues = self._collect_label_issues(item["label"] or "", query_profile, doc) - suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc) - if suggested_label != (item["label"] or ""): - issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"] - if issues: - suspicious.append( - { - "rank": item["rank"], - "spu_id": item["spu_id"], - "title": item["title"], - "label": item["label"], - "suggested_label": suggested_label, - "issues": issues, - } - ) - labels = [ item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT for item in live["results"] @@ -340,8 +128,8 @@ class SearchEvaluationFramework: "top_k": top_k, "metrics": live["metrics"], "distribution": label_distribution(labels), - "query_profile": query_profile, - "suspicious": suspicious, + "query_profile": None, + "suspicious": [], "results": live["results"], } @@ -521,15 +309,7 @@ class SearchEvaluationFramework: if not docs: return [] try: - if self.labeler_mode == "complex": - query_profile = self.get_query_profile(query, force_refresh=force_refresh) - labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs) - labels = [ - self._apply_rule_based_label_guardrails(label, 
query_profile, doc) - for doc, label in zip(docs, labels) - ] - else: - labels, raw_response = self.label_client.classify_batch_simple(query, docs) + labels, raw_response = self.label_client.classify_batch(query, docs) return [(labels, raw_response, docs)] except Exception: if len(docs) == 1: @@ -727,8 +507,6 @@ class SearchEvaluationFramework: "annotate_rerank_top_k": annotate_rerank_top_k, "pool_size": len(pool_docs), }, - "labeler_mode": self.labeler_mode, - "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None, "metrics_top100": metrics, "search_results": search_labeled_results, "full_rerank_top": rerank_top_results, @@ -903,8 +681,6 @@ class SearchEvaluationFramework: "rebuild": rebuild_meta, "ordered_union_size": pool_docs_count, }, - "labeler_mode": self.labeler_mode, - "query_profile": self.get_query_profile(query, force_refresh=False) if self.labeler_mode == "complex" else None, "metrics_top100": metrics, "search_results": search_labeled_results, "full_rerank_top": rerank_top_results, @@ -970,7 +746,7 @@ class SearchEvaluationFramework: relevant_missing_ids = [ spu_id for spu_id, label in labels.items() - if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids + if label in RELEVANCE_NON_IRRELEVANT and spu_id not in recalled_spu_ids ] missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids) missing_relevant = [] @@ -992,7 +768,12 @@ class SearchEvaluationFramework: "product": compact_product_payload(doc), } ) - label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2} + label_order = { + RELEVANCE_EXACT: 0, + RELEVANCE_HIGH: 1, + RELEVANCE_LOW: 2, + RELEVANCE_IRRELEVANT: 3, + } missing_relevant.sort( key=lambda item: ( label_order.get(str(item.get("label")), 9), @@ -1010,7 +791,7 @@ class SearchEvaluationFramework: if unlabeled_hits: tips.append(f"{unlabeled_hits} recalled results were not in the 
annotation set and were counted as Irrelevant.") if not missing_relevant: - tips.append("No cached Exact/Partial products were missed by this recall set.") + tips.append("No cached non-irrelevant products were missed by this recall set.") return { "query": query, "tenant_id": self.tenant_id, @@ -1024,7 +805,8 @@ class SearchEvaluationFramework: "recalled_hits": len(labeled), "missing_relevant_count": len(missing_relevant), "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT), - "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL), + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH), + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW), }, "tips": tips, "total": int(search_payload.get("total") or 0), @@ -1065,7 +847,8 @@ class SearchEvaluationFramework: aggregate = aggregate_metrics([item["metrics"] for item in per_query]) aggregate_distribution = { RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), - RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query), + RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query), + RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query), RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query), } batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" diff --git a/scripts/evaluation/eval_framework/metrics.py b/scripts/evaluation/eval_framework/metrics.py index b6f5681..542a993 100644 --- a/scripts/evaluation/eval_framework/metrics.py +++ b/scripts/evaluation/eval_framework/metrics.py @@ -4,7 +4,7 @@ from __future__ import annotations from typing import Dict, Sequence -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL +from .constants import 
RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_NON_IRRELEVANT def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float: @@ -13,15 +13,17 @@ def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> fl sliced = list(labels[:k]) if not sliced: return 0.0 - hits = sum(1 for label in sliced if label in relevant) + rel = set(relevant) + hits = sum(1 for label in sliced if label in rel) return hits / float(min(k, len(sliced))) def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float: + rel = set(relevant) hit_count = 0 precision_sum = 0.0 for idx, label in enumerate(labels, start=1): - if label not in relevant: + if label not in rel: continue hit_count += 1 precision_sum += hit_count / idx @@ -31,12 +33,14 @@ def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float: def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]: + """P@k / MAP_3: Exact Match only. P@k_2_3 / MAP_2_3: any non-irrelevant tier (legacy metric names).""" metrics: Dict[str, float] = {} + non_irrel = list(RELEVANCE_NON_IRRELEVANT) for k in (5, 10, 20, 50): metrics[f"P@{k}"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT]), 6) - metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6) + metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, non_irrel), 6) metrics["MAP_3"] = round(average_precision(labels, [RELEVANCE_EXACT]), 6) - metrics["MAP_2_3"] = round(average_precision(labels, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6) + metrics["MAP_2_3"] = round(average_precision(labels, non_irrel), 6) return metrics @@ -53,6 +57,7 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo def label_distribution(labels: Sequence[str]) -> Dict[str, int]: return { RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT), - RELEVANCE_PARTIAL: sum(1 for label in labels if label == 
RELEVANCE_PARTIAL), + RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH), + RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW), RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT), } diff --git a/scripts/evaluation/eval_framework/prompts.py b/scripts/evaluation/eval_framework/prompts.py index c1225d8..5fc9201 100644 --- a/scripts/evaluation/eval_framework/prompts.py +++ b/scripts/evaluation/eval_framework/prompts.py @@ -2,10 +2,9 @@ from __future__ import annotations -import json -from typing import Any, Dict, Sequence +from typing import Sequence -_CLASSIFY_BATCH_SIMPLE_TEMPLATE = """You are a relevance judgment assistant for a fashion e-commerce search system. +_CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system. Given a user query and the information for each product, assign a relevance label to each product. Your goal is to judge relevance from the perspective of e-commerce search ranking. @@ -154,7 +153,7 @@ The output lines must correspond to the products above in the same order. Do not output anything else. """ -_CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。 +_CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。 给定用户查询词以及每个商品的信息,请为每个商品分配一个相关性标签。 你的目标是从电商搜索排序的角度,判断商品是否满足用户的购物意图。 @@ -294,76 +293,7 @@ _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中 """ -def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: +def classify_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: lines = "\n".join(numbered_doc_lines) n = len(numbered_doc_lines) - return _CLASSIFY_BATCH_SIMPLE_TEMPLATE.format(query=query, lines=lines, n=n) - - -_EXTRACT_QUERY_PROFILE_TEMPLATE = """You are building a structured intent profile for e-commerce relevance judging. -Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query. 
-Be conservative: only mark an attribute as required if the user explicitly asked for it. - -Return JSON with this schema: -{{ - "normalized_query_en": string, - "primary_category": string, - "allowed_categories": [string], - "required_attributes": [ - {{"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}} - ], - "notes": [string] -}} - -Guidelines: -- Exact later will require explicit evidence for all required attributes. -- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them. -- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact. -- If the query includes color, fit, silhouette, or length, include them as required_attributes. -- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight. -- For color, include conflicting colors only when clear from the query. - -Original query: {query} -Parser hints JSON: {hints_json} -""" - - -def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str: - hints_json = json.dumps(parser_hints, ensure_ascii=False) - return _EXTRACT_QUERY_PROFILE_TEMPLATE.format(query=query, hints_json=hints_json) - - -_CLASSIFY_BATCH_COMPLEX_TEMPLATE = """You are an e-commerce search relevance judge. -Judge each product against the structured query profile below. - -Relevance rules: -- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact. -- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched. 
-- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts. -- Be conservative with Exact. -- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested. -- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries. - -Original query: {query} -Structured query profile JSON: {profile_json} - -Products: -{lines} - -Return JSON only, with schema: -{{"labels":[{{"index":1,"label":"Exact","reason":"short phrase"}}]}} -""" - - -def classify_batch_complex_prompt( - query: str, - query_profile: Dict[str, Any], - numbered_doc_lines: Sequence[str], -) -> str: - lines = "\n".join(numbered_doc_lines) - profile_json = json.dumps(query_profile, ensure_ascii=False) - return _CLASSIFY_BATCH_COMPLEX_TEMPLATE.format( - query=query, - profile_json=profile_json, - lines=lines, - ) + return _CLASSIFY_TEMPLATE_EN.format(query=query, lines=lines, n=n) diff --git a/scripts/evaluation/eval_framework/reports.py b/scripts/evaluation/eval_framework/reports.py index 3fe4908..7587b57 100644 --- a/scripts/evaluation/eval_framework/reports.py +++ b/scripts/evaluation/eval_framework/reports.py @@ -4,7 +4,7 @@ from __future__ import annotations from typing import Any, Dict -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL +from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW def render_batch_report_markdown(payload: Dict[str, Any]) -> str: @@ -29,8 +29,9 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: "", "## Label Distribution", "", - f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}", - f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}", + f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}", + f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}", + f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}", f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}", ] ) @@ -41,8 +42,9 @@ def 
render_batch_report_markdown(payload: Dict[str, Any]) -> str: for key, value in sorted((item.get("metrics") or {}).items()): lines.append(f"- {key}: {value}") distribution = item.get("distribution") or {} - lines.append(f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}") - lines.append(f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}") + lines.append(f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}") + lines.append(f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}") + lines.append(f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}") lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}") lines.append("") return "\n".join(lines) diff --git a/scripts/evaluation/eval_framework/static/eval_web.css b/scripts/evaluation/eval_framework/static/eval_web.css index ece16ed..e8472ba 100644 --- a/scripts/evaluation/eval_framework/static/eval_web.css +++ b/scripts/evaluation/eval_framework/static/eval_web.css @@ -35,10 +35,11 @@ .results { display: grid; gap: 10px; } .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; } .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; } - .Exact { background: var(--exact); } - .Partial { background: var(--partial); } - .Irrelevant { background: var(--irrelevant); } - .Unknown { background: #637381; } + .label-exact-match { background: var(--exact); } + .label-high-relevant { background: var(--partial); } + .label-low-relevant { background: #6b5b95; } + .label-irrelevant { background: var(--irrelevant); } + .badge-unknown { background: #637381; } .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; } .title { font-size: 16px; font-weight: 700; margin-bottom: 4px; } .title-zh { font-size: 14px; font-weight: 500; color: var(--muted); margin-bottom: 
8px; line-height: 1.4; } diff --git a/scripts/evaluation/eval_framework/static/eval_web.js b/scripts/evaluation/eval_framework/static/eval_web.js index 4d63e68..33411b2 100644 --- a/scripts/evaluation/eval_framework/static/eval_web.js +++ b/scripts/evaluation/eval_framework/static/eval_web.js @@ -13,6 +13,10 @@ root.appendChild(card); }); } + function labelBadgeClass(label) { + if (!label || label === 'Unknown') return 'badge-unknown'; + return 'label-' + String(label).toLowerCase().replace(/\s+/g, '-'); + } function renderResults(results, rootId='results', showRank=true) { const mount = document.getElementById(rootId); mount.innerHTML = ''; @@ -21,7 +25,7 @@ const box = document.createElement('div'); box.className = 'result'; box.innerHTML = ` -
${label}
${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}
+
${label}
${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}
${item.title || ''}
@@ -42,7 +46,7 @@ const root = document.getElementById('tips'); const tips = [...(data.tips || [])]; const stats = data.label_stats || {}; - tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`); + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed (non-irrelevant): ${stats.missing_relevant_count || 0} — Exact: ${stats.missing_exact_count || 0}, High: ${stats.missing_high_count || 0}, Low: ${stats.missing_low_count || 0}.`); root.innerHTML = tips.map(text => `
${text}
`).join(''); } async function loadQueries() { diff --git a/scripts/evaluation/eval_framework/static/index.html b/scripts/evaluation/eval_framework/static/index.html index 42273f2..3333781 100644 --- a/scripts/evaluation/eval_framework/static/index.html +++ b/scripts/evaluation/eval_framework/static/index.html @@ -37,7 +37,7 @@
-

Missed Exact / Partial

+

Missed non-irrelevant (cached)

diff --git a/scripts/evaluation/eval_framework/store.py b/scripts/evaluation/eval_framework/store.py index 8c16787..bc9dea8 100644 --- a/scripts/evaluation/eval_framework/store.py +++ b/scripts/evaluation/eval_framework/store.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional, Sequence -from .constants import VALID_LABELS +from .constants import VALID_LABELS, normalize_stored_label from .utils import ensure_dir, safe_json_dumps, utc_now_iso @@ -220,7 +220,7 @@ class EvalStore: """, (tenant_id, query_text), ).fetchall() - return {str(row["spu_id"]): str(row["label"]) for row in rows} + return {str(row["spu_id"]): normalize_stored_label(str(row["label"])) for row in rows} def upsert_labels( self, @@ -379,8 +379,9 @@ class EvalStore: SELECT query_text, COUNT(*) AS total, - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, + SUM(CASE WHEN label IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count, + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count, + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count, SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, MAX(updated_at) AS updated_at FROM relevance_labels @@ -395,7 +396,8 @@ class EvalStore: "query": str(row["query_text"]), "total": int(row["total"]), "exact_count": int(row["exact_count"] or 0), - "partial_count": int(row["partial_count"] or 0), + "high_relevant_count": int(row["high_relevant_count"] or 0), + "low_relevant_count": int(row["low_relevant_count"] or 0), "irrelevant_count": int(row["irrelevant_count"] or 0), "updated_at": row["updated_at"], } @@ -407,8 +409,9 @@ class EvalStore: """ SELECT COUNT(*) AS total, - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, + SUM(CASE WHEN label 
IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count, + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count, + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count, SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, MAX(updated_at) AS updated_at FROM relevance_labels @@ -420,7 +423,8 @@ class EvalStore: "query": query_text, "total": int((row["total"] or 0) if row else 0), "exact_count": int((row["exact_count"] or 0) if row else 0), - "partial_count": int((row["partial_count"] or 0) if row else 0), + "high_relevant_count": int((row["high_relevant_count"] or 0) if row else 0), + "low_relevant_count": int((row["low_relevant_count"] or 0) if row else 0), "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0), "updated_at": row["updated_at"] if row else None, } diff --git a/scripts/evaluation/quick_start_eval.sh b/scripts/evaluation/quick_start_eval.sh deleted file mode 100755 index 0bcbf32..0000000 --- a/scripts/evaluation/quick_start_eval.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash -# Search evaluation quick entrypoints. Run from any cwd; resolves repo root. -set -euo pipefail - -ROOT="$(cd "$(dirname "$0")/../.." 
&& pwd)" -cd "$ROOT" -PY="${ROOT}/.venv/bin/python" -TENANT_ID="${TENANT_ID:-163}" -QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" - -usage() { - echo "Usage: $0 batch|batch-rebuild|serve" - echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50, simple)" - echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" - echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" -} - -case "${1:-}" in - batch) - exec "$PY" scripts/evaluation/build_annotation_set.py batch \ - --tenant-id "$TENANT_ID" \ - --queries-file "$QUERIES" \ - --top-k 50 \ - --language en \ - --labeler-mode simple - ;; - batch-rebuild) - exec "$PY" scripts/evaluation/build_annotation_set.py build \ - --tenant-id "$TENANT_ID" \ - --queries-file "$QUERIES" \ - --search-depth 500 \ - --rerank-depth 10000 \ - --force-refresh-rerank \ - --force-refresh-labels \ - --language en \ - --labeler-mode simple - ;; - serve) - EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" - EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}" - exec "$PY" scripts/evaluation/serve_eval_web.py serve \ - --tenant-id "$TENANT_ID" \ - --queries-file "$QUERIES" \ - --host "$EVAL_WEB_HOST" \ - --port "$EVAL_WEB_PORT" - ;; - *) - usage - exit 1 - ;; -esac diff --git a/scripts/evaluation/start_eval.sh b/scripts/evaluation/start_eval.sh new file mode 100755 index 0000000..dc097c3 --- /dev/null +++ b/scripts/evaluation/start_eval.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Search evaluation quick entrypoints. Run from any cwd; resolves repo root. +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/../.." 
&& pwd)" +cd "$ROOT" +PY="${ROOT}/.venv/bin/python" +TENANT_ID="${TENANT_ID:-163}" +QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" + +usage() { + echo "Usage: $0 batch|batch-rebuild|serve" + echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" + echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" + echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" +} + +case "${1:-}" in + batch) + exec "$PY" scripts/evaluation/build_annotation_set.py batch \ + --tenant-id "$TENANT_ID" \ + --queries-file "$QUERIES" \ + --top-k 50 \ + --language en + ;; + batch-rebuild) + exec "$PY" scripts/evaluation/build_annotation_set.py build \ + --tenant-id "$TENANT_ID" \ + --queries-file "$QUERIES" \ + --search-depth 500 \ + --rerank-depth 10000 \ + --force-refresh-rerank \ + --force-refresh-labels \ + --language en + ;; + serve) + EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" + EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}" + exec "$PY" scripts/evaluation/serve_eval_web.py serve \ + --tenant-id "$TENANT_ID" \ + --queries-file "$QUERIES" \ + --host "$EVAL_WEB_HOST" \ + --port "$EVAL_WEB_PORT" + ;; + *) + usage + exit 1 + ;; +esac -- libgit2 0.21.2