From a345b01f79e0926df2b7acc0ca9d3bb31715a654 Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 1 Apr 2026 10:00:45 +0800 Subject: [PATCH] eval framework --- docs/Usage-Guide.md | 2 +- docs/issue-2026-03-31-评估框架-done-0331.md | 2 +- docs/相关性检索优化说明.md | 2 +- scripts/evaluation/README.md | 10 +++++----- scripts/evaluation/eval_framework/__init__.py | 16 ++++++++-------- scripts/evaluation/eval_framework/cli.py | 21 ++++----------------- scripts/evaluation/eval_framework/clients.py | 81 ++++++++++++++++++++++++--------------------------------------------------------- scripts/evaluation/eval_framework/constants.py | 29 +++++++++++++++++++++++------ scripts/evaluation/eval_framework/framework.py | 255 +++++++++++++++++++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- scripts/evaluation/eval_framework/metrics.py | 17 +++++++++++------ scripts/evaluation/eval_framework/prompts.py | 80 +++++--------------------------------------------------------------------------- scripts/evaluation/eval_framework/reports.py | 12 +++++++----- scripts/evaluation/eval_framework/static/eval_web.css | 9 +++++---- scripts/evaluation/eval_framework/static/eval_web.js | 8 ++++++-- scripts/evaluation/eval_framework/static/index.html | 2 +- scripts/evaluation/eval_framework/store.py | 20 ++++++++++++-------- scripts/evaluation/quick_start_eval.sh | 52 ---------------------------------------------------- scripts/evaluation/start_eval.sh | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 18 files changed, 183 insertions(+), 485 deletions(-) delete mode 100755 scripts/evaluation/quick_start_eval.sh create mode 100755 scripts/evaluation/start_eval.sh diff --git a/docs/Usage-Guide.md b/docs/Usage-Guide.md index ae627c9..9686d11 100644 --- a/docs/Usage-Guide.md +++ b/docs/Usage-Guide.md @@ -202,7 +202,7 @@ python 
-m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t ./scripts/service_ctl.sh restart backend sleep 3 ./scripts/service_ctl.sh status backend -./scripts/evaluation/quick_start_eval.sh batch +./scripts/evaluation/start_eval.sh batch ``` 离线批量评估会把标注与报表写到 `artifacts/search_evaluation/`(SQLite、`batch_reports/` 下的 JSON/Markdown 等)。说明与命令见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 diff --git a/docs/issue-2026-03-31-评估框架-done-0331.md b/docs/issue-2026-03-31-评估框架-done-0331.md index 3783695..257ce6a 100644 --- a/docs/issue-2026-03-31-评估框架-done-0331.md +++ b/docs/issue-2026-03-31-评估框架-done-0331.md @@ -138,7 +138,7 @@ queries默认是queries/queries.txt,填入左侧列表框,点击其中任何 @scripts/evaluation/README.md @scripts/evaluation/eval_framework/framework.py -@quick_start_eval.sh (29-35) +@start_eval.sh (29-35) 请以如下流程为准,进行改造: 如果重建的话,对每个query: 每个搜索结果应该会扫描全库, diff --git a/docs/相关性检索优化说明.md b/docs/相关性检索优化说明.md index 6b0e7ce..29d158c 100644 --- a/docs/相关性检索优化说明.md +++ b/docs/相关性检索优化说明.md @@ -240,7 +240,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t ./scripts/service_ctl.sh restart backend sleep 3 ./scripts/service_ctl.sh status backend -./scripts/evaluation/quick_start_eval.sh batch +./scripts/evaluation/start_eval.sh batch ``` 评估产物在 `artifacts/search_evaluation/`(如 `search_eval.sqlite3`、`batch_reports/` 下的 JSON/Markdown)。流程与参数说明见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index 9366411..59b43a8 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -23,7 +23,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, | `fusion_experiments_round1.json` | Broader first-round experiments | | `queries/queries.txt` | Canonical evaluation queries | | `README_Requirement.md` | Product/requirements reference | -| `quick_start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep
`build` + `--force-refresh-labels`), or `serve` | +| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. | ## Quick start (repo root) @@ -32,13 +32,13 @@ Set tenant if needed (`export TENANT_ID=163`). You need a live search API, DashS ```bash # Batch: live search for every query; only uncached (query, spu_id) pairs hit the LLM -./scripts/evaluation/quick_start_eval.sh batch +./scripts/evaluation/start_eval.sh batch # Deep rebuild: per-query full corpus rerank (outside search top-500 pool) + LLM in 50-doc batches along global sort order (early stop; expensive) -./scripts/evaluation/quick_start_eval.sh batch-rebuild +./scripts/evaluation/start_eval.sh batch-rebuild # UI: http://127.0.0.1:6010/ -./scripts/evaluation/quick_start_eval.sh serve +./scripts/evaluation/start_eval.sh serve # or: ./scripts/service_ctl.sh start eval-web ``` @@ -71,7 +71,7 @@ Explicit equivalents: Each `batch` run walks the full queries file and writes a **batch report** under `batch_reports/`. With `batch --force-refresh-labels`, every live top-`k` hit is re-judged by the LLM (still only those hits—not the deep rebuild pipeline). -### `quick_start_eval.sh batch-rebuild` (deep annotation rebuild) +### `start_eval.sh batch-rebuild` (deep annotation rebuild) This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`. 
diff --git a/scripts/evaluation/eval_framework/__init__.py b/scripts/evaluation/eval_framework/__init__.py index acbcaff..236fb67 100644 --- a/scripts/evaluation/eval_framework/__init__.py +++ b/scripts/evaluation/eval_framework/__init__.py @@ -12,15 +12,15 @@ ensure_project_on_path() from .constants import ( # noqa: E402 DEFAULT_ARTIFACT_ROOT, - DEFAULT_LABELER_MODE, DEFAULT_QUERY_FILE, - JUDGE_PROMPT_VERSION_COMPLEX, - JUDGE_PROMPT_VERSION_SIMPLE, PROJECT_ROOT, RELEVANCE_EXACT, + RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, - RELEVANCE_PARTIAL, + RELEVANCE_LOW, + RELEVANCE_NON_IRRELEVANT, VALID_LABELS, + normalize_stored_label, ) from .framework import SearchEvaluationFramework # noqa: E402 from .store import EvalStore, QueryBuildResult # noqa: E402 @@ -36,22 +36,22 @@ from .utils import ( # noqa: E402 __all__ = [ "DEFAULT_ARTIFACT_ROOT", - "DEFAULT_LABELER_MODE", "DEFAULT_QUERY_FILE", "EvalStore", - "JUDGE_PROMPT_VERSION_COMPLEX", - "JUDGE_PROMPT_VERSION_SIMPLE", "PROJECT_ROOT", "QueryBuildResult", "RELEVANCE_EXACT", + "RELEVANCE_HIGH", "RELEVANCE_IRRELEVANT", - "RELEVANCE_PARTIAL", + "RELEVANCE_LOW", + "RELEVANCE_NON_IRRELEVANT", "SearchEvaluationFramework", "VALID_LABELS", "build_cli_parser", "create_web_app", "ensure_dir", "main", + "normalize_stored_label", "render_batch_report_markdown", "sha1_text", "utc_now_iso", diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py index 6421a0a..9417776 100644 --- a/scripts/evaluation/eval_framework/cli.py +++ b/scripts/evaluation/eval_framework/cli.py @@ -8,7 +8,6 @@ from pathlib import Path from typing import Any, Dict from .constants import ( - DEFAULT_LABELER_MODE, DEFAULT_QUERY_FILE, DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, @@ -103,7 +102,6 @@ def build_cli_parser() -> argparse.ArgumentParser: build.add_argument("--language", default="en") build.add_argument("--force-refresh-rerank", action="store_true") 
build.add_argument("--force-refresh-labels", action="store_true") - build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) add_judge_llm_args(build) batch = sub.add_parser("batch", help="Run batch evaluation against live search") @@ -112,7 +110,6 @@ def build_cli_parser() -> argparse.ArgumentParser: batch.add_argument("--top-k", type=int, default=100) batch.add_argument("--language", default="en") batch.add_argument("--force-refresh-labels", action="store_true") - batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) add_judge_llm_args(batch) audit = sub.add_parser("audit", help="Audit annotation quality for queries") @@ -122,7 +119,6 @@ def build_cli_parser() -> argparse.ArgumentParser: audit.add_argument("--language", default="en") audit.add_argument("--limit-suspicious", type=int, default=5) audit.add_argument("--force-refresh-labels", action="store_true") - audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) add_judge_llm_args(audit) serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") @@ -130,16 +126,13 @@ def build_cli_parser() -> argparse.ArgumentParser: serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) serve.add_argument("--host", default="0.0.0.0") serve.add_argument("--port", type=int, default=6010) - serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) add_judge_llm_args(serve) return parser def run_build(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework( - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) - ) + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = framework.queries_from_file(Path(args.queries_file)) summary = [] rebuild_kwargs = {} @@ -191,9 +184,7 @@ def run_build(args: argparse.Namespace) -> 
None: def run_batch(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework( - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) - ) + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = framework.queries_from_file(Path(args.queries_file)) payload = framework.batch_evaluate( queries=queries, @@ -206,9 +197,7 @@ def run_batch(args: argparse.Namespace) -> None: def run_audit(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework( - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) - ) + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = framework.queries_from_file(Path(args.queries_file)) audit_items = [] for query in queries: @@ -258,9 +247,7 @@ def run_audit(args: argparse.Namespace) -> None: def run_serve(args: argparse.Namespace) -> None: - framework = SearchEvaluationFramework( - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) - ) + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) app = create_web_app(framework, Path(args.queries_file)) import uvicorn diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py index d228e42..77edd20 100644 --- a/scripts/evaluation/eval_framework/clients.py +++ b/scripts/evaluation/eval_framework/clients.py @@ -11,14 +11,21 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple import requests from .constants import VALID_LABELS -from .prompts import ( - classify_batch_complex_prompt, - classify_batch_simple_prompt, - extract_query_profile_prompt, -) +from .prompts import classify_prompt from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps +def _canonicalize_judge_label(raw: str) -> str | None: + s = str(raw or 
"").strip().strip('"').strip("'") + if s in VALID_LABELS: + return s + low = s.lower() + for v in VALID_LABELS: + if v.lower() == low: + return v + return None + + class SearchServiceClient: def __init__(self, base_url: str, tenant_id: str): self.base_url = base_url.rstrip("/") @@ -224,71 +231,31 @@ class DashScopeLabelClient: return obj return None - def classify_batch_simple( + def classify_batch( self, query: str, docs: Sequence[Dict[str, Any]], ) -> Tuple[List[str], str]: numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] - prompt = classify_batch_simple_prompt(query, numbered_docs) + prompt = classify_prompt(query, numbered_docs) content, raw_response = self._chat(prompt) - labels = [] + labels: List[str] = [] for line in str(content or "").splitlines(): - label = line.strip() - if label in VALID_LABELS: - labels.append(label) + canon = _canonicalize_judge_label(line) + if canon is not None: + labels.append(canon) if len(labels) != len(docs): payload = extract_json_blob(content) if isinstance(payload, dict) and isinstance(payload.get("labels"), list): labels = [] for item in payload["labels"][: len(docs)]: if isinstance(item, dict): - label = str(item.get("label") or "").strip() + raw_l = str(item.get("label") or "").strip() else: - label = str(item).strip() - if label in VALID_LABELS: - labels.append(label) - if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): - raise ValueError(f"unexpected simple label output: {content!r}") - return labels, raw_response - - def extract_query_profile( - self, - query: str, - parser_hints: Dict[str, Any], - ) -> Tuple[Dict[str, Any], str]: - prompt = extract_query_profile_prompt(query, parser_hints) - content, raw_response = self._chat(prompt) - payload = extract_json_blob(content) - if not isinstance(payload, dict): - raise ValueError(f"unexpected query profile payload: {content!r}") - payload.setdefault("normalized_query_en", query) - 
payload.setdefault("primary_category", "") - payload.setdefault("allowed_categories", []) - payload.setdefault("required_attributes", []) - payload.setdefault("notes", []) - return payload, raw_response - - def classify_batch_complex( - self, - query: str, - query_profile: Dict[str, Any], - docs: Sequence[Dict[str, Any]], - ) -> Tuple[List[str], str]: - numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] - prompt = classify_batch_complex_prompt(query, query_profile, numbered_docs) - content, raw_response = self._chat(prompt) - payload = extract_json_blob(content) - if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list): - raise ValueError(f"unexpected label payload: {content!r}") - labels_payload = payload["labels"] - labels: List[str] = [] - for item in labels_payload[: len(docs)]: - if not isinstance(item, dict): - continue - label = str(item.get("label") or "").strip() - if label in VALID_LABELS: - labels.append(label) + raw_l = str(item).strip() + canon = _canonicalize_judge_label(raw_l) + if canon is not None: + labels.append(canon) if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): - raise ValueError(f"unexpected label output: {content!r}") + raise ValueError(f"unexpected classify output: {content!r}") return labels, raw_response diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py index eea6182..4dacc1d 100644 --- a/scripts/evaluation/eval_framework/constants.py +++ b/scripts/evaluation/eval_framework/constants.py @@ -6,17 +6,34 @@ _PKG_DIR = Path(__file__).resolve().parent _SCRIPTS_EVAL_DIR = _PKG_DIR.parent PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] -RELEVANCE_EXACT = "Exact" -RELEVANCE_PARTIAL = "Partial" +# Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN) +RELEVANCE_EXACT = "Exact Match" +RELEVANCE_HIGH = "High Relevant" +RELEVANCE_LOW = "Low Relevant" 
RELEVANCE_IRRELEVANT = "Irrelevant" -VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} + +VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) + +# Precision / MAP "positive" set (all non-irrelevant tiers) +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW}) + +_LEGACY_LABEL_MAP = { + "Exact": RELEVANCE_EXACT, + "Partial": RELEVANCE_HIGH, +} + + +def normalize_stored_label(label: str) -> str: + """Map legacy 3-way SQLite labels to current 4-way strings; pass through canonical labels.""" + s = str(label).strip() + if s in VALID_LABELS: + return s + return _LEGACY_LABEL_MAP.get(s, s) + DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" -JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" -JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" -DEFAULT_LABELER_MODE = "simple" # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs) DEFAULT_JUDGE_MODEL = "qwen3.5-flash" DEFAULT_JUDGE_ENABLE_THINKING = True diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index ab6260c..32c815a 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -10,7 +10,7 @@ from typing import Any, Dict, List, Sequence, Tuple import requests from elasticsearch.helpers import scan -from api.app import get_app_config, get_es_client, get_query_parser, init_service +from api.app import get_app_config, get_es_client, init_service from indexer.mapping_generator import get_tenant_index_name from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient @@ -21,7 +21,6 @@ from .constants import ( DEFAULT_JUDGE_DASHSCOPE_BATCH, DEFAULT_JUDGE_ENABLE_THINKING, DEFAULT_JUDGE_MODEL, - DEFAULT_LABELER_MODE, DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, 
DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, DEFAULT_REBUILD_LLM_BATCH_SIZE, @@ -30,10 +29,11 @@ from .constants import ( DEFAULT_RERANK_HIGH_SKIP_COUNT, DEFAULT_RERANK_HIGH_THRESHOLD, DEFAULT_SEARCH_RECALL_TOP_K, - JUDGE_PROMPT_VERSION_COMPLEX, RELEVANCE_EXACT, + RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, - RELEVANCE_PARTIAL, + RELEVANCE_LOW, + RELEVANCE_NON_IRRELEVANT, VALID_LABELS, ) from .metrics import aggregate_metrics, compute_query_metrics, label_distribution @@ -45,8 +45,6 @@ from .utils import ( compact_option_values, compact_product_payload, ensure_dir, - normalize_text, - pick_text, sha1_text, utc_now_iso, utc_timestamp, @@ -77,7 +75,6 @@ class SearchEvaluationFramework: tenant_id: str, artifact_root: Path = DEFAULT_ARTIFACT_ROOT, search_base_url: str = "http://localhost:6002", - labeler_mode: str = DEFAULT_LABELER_MODE, *, judge_model: str | None = None, enable_thinking: bool | None = None, @@ -86,7 +83,6 @@ class SearchEvaluationFramework: init_service(get_app_config().infrastructure.elasticsearch.host) self.tenant_id = str(tenant_id) self.artifact_root = ensure_dir(artifact_root) - self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") self.search_client = SearchServiceClient(search_base_url, self.tenant_id) app_cfg = get_app_config() @@ -112,178 +108,6 @@ class SearchEvaluationFramework: enable_thinking=et, use_batch=use_batch, ) - self.query_parser = None - - def _get_query_parser(self): - if self.query_parser is None: - self.query_parser = get_query_parser() - return self.query_parser - - def build_query_parser_hints(self, query: str) -> Dict[str, Any]: - parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"]) - payload = parsed.to_dict() - payload["text_for_rerank"] = parsed.text_for_rerank() - return payload - - def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]: - if self.labeler_mode 
!= "complex": - raise RuntimeError("query profiles are only used in complex labeler mode") - if not force_refresh: - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX) - if cached is not None: - return cached - parser_hints = self.build_query_parser_hints(query) - profile, raw_response = self.label_client.extract_query_profile(query, parser_hints) - profile["parser_hints"] = parser_hints - self.store.upsert_query_profile( - self.tenant_id, - query, - JUDGE_PROMPT_VERSION_COMPLEX, - self.label_client.model, - profile, - raw_response, - ) - return profile - - @staticmethod - def _doc_evidence_text(doc: Dict[str, Any]) -> str: - pieces: List[str] = [ - build_display_title(doc), - pick_text(doc.get("vendor"), "en"), - pick_text(doc.get("category_path"), "en"), - pick_text(doc.get("category_name"), "en"), - ] - for sku in doc.get("skus") or []: - pieces.extend( - [ - str(sku.get("option1_value") or ""), - str(sku.get("option2_value") or ""), - str(sku.get("option3_value") or ""), - ] - ) - for tag in doc.get("tags") or []: - pieces.append(str(tag)) - return normalize_text(" | ".join(piece for piece in pieces if piece)) - - def _apply_rule_based_label_guardrails( - self, - label: str, - query_profile: Dict[str, Any], - doc: Dict[str, Any], - ) -> str: - if label not in VALID_LABELS: - return label - evidence = self._doc_evidence_text(doc) - category = normalize_text(query_profile.get("primary_category")) - allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()] - - primary_category_match = True - if category: - primary_category_match = category in evidence - allowed_category_match = True - if allowed_categories: - allowed_category_match = any(signal in evidence for signal in allowed_categories) - - if label == RELEVANCE_EXACT and not primary_category_match: - if allowed_category_match: - label = RELEVANCE_PARTIAL - else: - return RELEVANCE_IRRELEVANT - - for attr in 
query_profile.get("required_attributes") or []: - if not isinstance(attr, dict): - continue - attr_name = normalize_text(attr.get("name")) - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}: - continue - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)] - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)] - if attr_name == "fit": - if any(term in {"oversized", "oversize"} for term in required_terms): - conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"]) - if any(term in {"fitted", "slim fit", "tight"} for term in required_terms): - conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"]) - has_required = any(term in evidence for term in required_terms) if required_terms else True - has_conflict = any(term in evidence for term in conflicting_terms) - - if has_conflict: - return RELEVANCE_IRRELEVANT - if label == RELEVANCE_EXACT and not has_required: - label = RELEVANCE_PARTIAL - - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match: - return RELEVANCE_IRRELEVANT - - return label - - @staticmethod - def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]: - option_values = list(item.get("option_values") or []) - while len(option_values) < 3: - option_values.append("") - product = dict(item.get("product") or {}) - return { - "spu_id": item.get("spu_id"), - "title": product.get("title") or item.get("title"), - "vendor": product.get("vendor"), - "category_path": product.get("category"), - "category_name": product.get("category"), - "image_url": item.get("image_url") or product.get("image_url"), - "tags": product.get("tags") or [], - "skus": [ - { - "option1_value": option_values[0], - "option2_value": option_values[1], - "option3_value": option_values[2], - } - ], - } - - 
def _collect_label_issues( - self, - label: str, - query_profile: Dict[str, Any], - doc: Dict[str, Any], - ) -> List[str]: - evidence = self._doc_evidence_text(doc) - issues: List[str] = [] - category = normalize_text(query_profile.get("primary_category")) - allowed_categories = [ - normalize_text(item) - for item in query_profile.get("allowed_categories") or [] - if str(item).strip() - ] - - primary_category_match = True if not category else category in evidence - allowed_category_match = False if allowed_categories else primary_category_match - if allowed_categories: - allowed_category_match = any(signal in evidence for signal in allowed_categories) - - if label == RELEVANCE_EXACT and not primary_category_match: - if allowed_category_match: - issues.append("Exact missing primary category evidence") - else: - issues.append("Exact has category mismatch") - - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match: - issues.append("Partial has category mismatch") - - for attr in query_profile.get("required_attributes") or []: - if not isinstance(attr, dict): - continue - attr_name = normalize_text(attr.get("name")) - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}: - continue - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)] - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)] - has_required = any(term in evidence for term in required_terms) if required_terms else True - has_conflict = any(term in evidence for term in conflicting_terms) - - if has_conflict and label != RELEVANCE_IRRELEVANT: - issues.append(f"{label} conflicts on {attr_name}") - if label == RELEVANCE_EXACT and not has_required: - issues.append(f"Exact missing {attr_name}") - return issues def audit_live_query( self, @@ -294,42 +118,6 @@ class SearchEvaluationFramework: 
auto_annotate: bool = False, ) -> Dict[str, Any]: live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) - if self.labeler_mode != "complex": - labels = [ - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT - for item in live["results"] - ] - return { - "query": query, - "tenant_id": self.tenant_id, - "top_k": top_k, - "metrics": live["metrics"], - "distribution": label_distribution(labels), - "query_profile": None, - "suspicious": [], - "results": live["results"], - } - query_profile = self.get_query_profile(query, force_refresh=False) - suspicious: List[Dict[str, Any]] = [] - - for item in live["results"]: - doc = self._result_item_to_doc(item) - issues = self._collect_label_issues(item["label"] or "", query_profile, doc) - suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc) - if suggested_label != (item["label"] or ""): - issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"] - if issues: - suspicious.append( - { - "rank": item["rank"], - "spu_id": item["spu_id"], - "title": item["title"], - "label": item["label"], - "suggested_label": suggested_label, - "issues": issues, - } - ) - labels = [ item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT for item in live["results"] @@ -340,8 +128,8 @@ class SearchEvaluationFramework: "top_k": top_k, "metrics": live["metrics"], "distribution": label_distribution(labels), - "query_profile": query_profile, - "suspicious": suspicious, + "query_profile": None, + "suspicious": [], "results": live["results"], } @@ -521,15 +309,7 @@ class SearchEvaluationFramework: if not docs: return [] try: - if self.labeler_mode == "complex": - query_profile = self.get_query_profile(query, force_refresh=force_refresh) - labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs) - labels = [ - self._apply_rule_based_label_guardrails(label, 
query_profile, doc) - for doc, label in zip(docs, labels) - ] - else: - labels, raw_response = self.label_client.classify_batch_simple(query, docs) + labels, raw_response = self.label_client.classify_batch(query, docs) return [(labels, raw_response, docs)] except Exception: if len(docs) == 1: @@ -727,8 +507,6 @@ class SearchEvaluationFramework: "annotate_rerank_top_k": annotate_rerank_top_k, "pool_size": len(pool_docs), }, - "labeler_mode": self.labeler_mode, - "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None, "metrics_top100": metrics, "search_results": search_labeled_results, "full_rerank_top": rerank_top_results, @@ -903,8 +681,6 @@ class SearchEvaluationFramework: "rebuild": rebuild_meta, "ordered_union_size": pool_docs_count, }, - "labeler_mode": self.labeler_mode, - "query_profile": self.get_query_profile(query, force_refresh=False) if self.labeler_mode == "complex" else None, "metrics_top100": metrics, "search_results": search_labeled_results, "full_rerank_top": rerank_top_results, @@ -970,7 +746,7 @@ class SearchEvaluationFramework: relevant_missing_ids = [ spu_id for spu_id, label in labels.items() - if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids + if label in RELEVANCE_NON_IRRELEVANT and spu_id not in recalled_spu_ids ] missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids) missing_relevant = [] @@ -992,7 +768,12 @@ class SearchEvaluationFramework: "product": compact_product_payload(doc), } ) - label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2} + label_order = { + RELEVANCE_EXACT: 0, + RELEVANCE_HIGH: 1, + RELEVANCE_LOW: 2, + RELEVANCE_IRRELEVANT: 3, + } missing_relevant.sort( key=lambda item: ( label_order.get(str(item.get("label")), 9), @@ -1010,7 +791,7 @@ class SearchEvaluationFramework: if unlabeled_hits: tips.append(f"{unlabeled_hits} recalled results were not in the 
annotation set and were counted as Irrelevant.") if not missing_relevant: - tips.append("No cached Exact/Partial products were missed by this recall set.") + tips.append("No cached non-irrelevant products were missed by this recall set.") return { "query": query, "tenant_id": self.tenant_id, @@ -1024,7 +805,8 @@ class SearchEvaluationFramework: "recalled_hits": len(labeled), "missing_relevant_count": len(missing_relevant), "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT), - "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL), + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH), + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW), }, "tips": tips, "total": int(search_payload.get("total") or 0), @@ -1065,7 +847,8 @@ class SearchEvaluationFramework: aggregate = aggregate_metrics([item["metrics"] for item in per_query]) aggregate_distribution = { RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), - RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query), + RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query), + RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query), RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query), } batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" diff --git a/scripts/evaluation/eval_framework/metrics.py b/scripts/evaluation/eval_framework/metrics.py index b6f5681..542a993 100644 --- a/scripts/evaluation/eval_framework/metrics.py +++ b/scripts/evaluation/eval_framework/metrics.py @@ -4,7 +4,7 @@ from __future__ import annotations from typing import Dict, Sequence -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL +from .constants import 
RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_NON_IRRELEVANT def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float: @@ -13,15 +13,17 @@ def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> fl sliced = list(labels[:k]) if not sliced: return 0.0 - hits = sum(1 for label in sliced if label in relevant) + rel = set(relevant) + hits = sum(1 for label in sliced if label in rel) return hits / float(min(k, len(sliced))) def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float: + rel = set(relevant) hit_count = 0 precision_sum = 0.0 for idx, label in enumerate(labels, start=1): - if label not in relevant: + if label not in rel: continue hit_count += 1 precision_sum += hit_count / idx @@ -31,12 +33,14 @@ def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float: def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]: + """P@k / MAP_3: Exact Match only. P@k_2_3 / MAP_2_3: any non-irrelevant tier (legacy metric names).""" metrics: Dict[str, float] = {} + non_irrel = list(RELEVANCE_NON_IRRELEVANT) for k in (5, 10, 20, 50): metrics[f"P@{k}"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT]), 6) - metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6) + metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, non_irrel), 6) metrics["MAP_3"] = round(average_precision(labels, [RELEVANCE_EXACT]), 6) - metrics["MAP_2_3"] = round(average_precision(labels, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6) + metrics["MAP_2_3"] = round(average_precision(labels, non_irrel), 6) return metrics @@ -53,6 +57,7 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo def label_distribution(labels: Sequence[str]) -> Dict[str, int]: return { RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT), - RELEVANCE_PARTIAL: sum(1 for label in labels if label == 
RELEVANCE_PARTIAL), + RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH), + RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW), RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT), } diff --git a/scripts/evaluation/eval_framework/prompts.py b/scripts/evaluation/eval_framework/prompts.py index c1225d8..5fc9201 100644 --- a/scripts/evaluation/eval_framework/prompts.py +++ b/scripts/evaluation/eval_framework/prompts.py @@ -2,10 +2,9 @@ from __future__ import annotations -import json -from typing import Any, Dict, Sequence +from typing import Sequence -_CLASSIFY_BATCH_SIMPLE_TEMPLATE = """You are a relevance judgment assistant for a fashion e-commerce search system. +_CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system. Given a user query and the information for each product, assign a relevance label to each product. Your goal is to judge relevance from the perspective of e-commerce search ranking. @@ -154,7 +153,7 @@ The output lines must correspond to the products above in the same order. Do not output anything else. """ -_CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。 +_CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。 给定用户查询词以及每个商品的信息,请为每个商品分配一个相关性标签。 你的目标是从电商搜索排序的角度,判断商品是否满足用户的购物意图。 @@ -294,76 +293,7 @@ _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中 """ -def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: +def classify_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: lines = "\n".join(numbered_doc_lines) n = len(numbered_doc_lines) - return _CLASSIFY_BATCH_SIMPLE_TEMPLATE.format(query=query, lines=lines, n=n) - - -_EXTRACT_QUERY_PROFILE_TEMPLATE = """You are building a structured intent profile for e-commerce relevance judging. -Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query. 
-Be conservative: only mark an attribute as required if the user explicitly asked for it. - -Return JSON with this schema: -{{ - "normalized_query_en": string, - "primary_category": string, - "allowed_categories": [string], - "required_attributes": [ - {{"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}} - ], - "notes": [string] -}} - -Guidelines: -- Exact later will require explicit evidence for all required attributes. -- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them. -- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact. -- If the query includes color, fit, silhouette, or length, include them as required_attributes. -- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight. -- For color, include conflicting colors only when clear from the query. - -Original query: {query} -Parser hints JSON: {hints_json} -""" - - -def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str: - hints_json = json.dumps(parser_hints, ensure_ascii=False) - return _EXTRACT_QUERY_PROFILE_TEMPLATE.format(query=query, hints_json=hints_json) - - -_CLASSIFY_BATCH_COMPLEX_TEMPLATE = """You are an e-commerce search relevance judge. -Judge each product against the structured query profile below. - -Relevance rules: -- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact. -- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched. 
-- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts. -- Be conservative with Exact. -- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested. -- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries. - -Original query: {query} -Structured query profile JSON: {profile_json} - -Products: -{lines} - -Return JSON only, with schema: -{{"labels":[{{"index":1,"label":"Exact","reason":"short phrase"}}]}} -""" - - -def classify_batch_complex_prompt( - query: str, - query_profile: Dict[str, Any], - numbered_doc_lines: Sequence[str], -) -> str: - lines = "\n".join(numbered_doc_lines) - profile_json = json.dumps(query_profile, ensure_ascii=False) - return _CLASSIFY_BATCH_COMPLEX_TEMPLATE.format( - query=query, - profile_json=profile_json, - lines=lines, - ) + return _CLASSIFY_TEMPLATE_EN.format(query=query, lines=lines, n=n) diff --git a/scripts/evaluation/eval_framework/reports.py b/scripts/evaluation/eval_framework/reports.py index 3fe4908..7587b57 100644 --- a/scripts/evaluation/eval_framework/reports.py +++ b/scripts/evaluation/eval_framework/reports.py @@ -4,7 +4,7 @@ from __future__ import annotations from typing import Any, Dict -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL +from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW def render_batch_report_markdown(payload: Dict[str, Any]) -> str: @@ -29,8 +29,9 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: "", "## Label Distribution", "", - f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}", - f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}", + f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}", + f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}", + f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}", f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}", ] ) @@ -41,8 +42,9 @@ def 
render_batch_report_markdown(payload: Dict[str, Any]) -> str: for key, value in sorted((item.get("metrics") or {}).items()): lines.append(f"- {key}: {value}") distribution = item.get("distribution") or {} - lines.append(f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}") - lines.append(f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}") + lines.append(f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}") + lines.append(f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}") + lines.append(f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}") lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}") lines.append("") return "\n".join(lines) diff --git a/scripts/evaluation/eval_framework/static/eval_web.css b/scripts/evaluation/eval_framework/static/eval_web.css index ece16ed..e8472ba 100644 --- a/scripts/evaluation/eval_framework/static/eval_web.css +++ b/scripts/evaluation/eval_framework/static/eval_web.css @@ -35,10 +35,11 @@ .results { display: grid; gap: 10px; } .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; } .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; } - .Exact { background: var(--exact); } - .Partial { background: var(--partial); } - .Irrelevant { background: var(--irrelevant); } - .Unknown { background: #637381; } + .label-exact-match { background: var(--exact); } + .label-high-relevant { background: var(--partial); } + .label-low-relevant { background: #6b5b95; } + .label-irrelevant { background: var(--irrelevant); } + .badge-unknown { background: #637381; } .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; } .title { font-size: 16px; font-weight: 700; margin-bottom: 4px; } .title-zh { font-size: 14px; font-weight: 500; color: var(--muted); margin-bottom: 
8px; line-height: 1.4; } diff --git a/scripts/evaluation/eval_framework/static/eval_web.js b/scripts/evaluation/eval_framework/static/eval_web.js index 4d63e68..33411b2 100644 --- a/scripts/evaluation/eval_framework/static/eval_web.js +++ b/scripts/evaluation/eval_framework/static/eval_web.js @@ -13,6 +13,10 @@ root.appendChild(card); }); } + function labelBadgeClass(label) { + if (!label || label === 'Unknown') return 'badge-unknown'; + return 'label-' + String(label).toLowerCase().replace(/\s+/g, '-'); + } function renderResults(results, rootId='results', showRank=true) { const mount = document.getElementById(rootId); mount.innerHTML = ''; @@ -21,7 +25,7 @@ const box = document.createElement('div'); box.className = 'result'; box.innerHTML = ` -
${label}
${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}
+
${label}
${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}
${item.title || ''}
@@ -42,7 +46,7 @@ const root = document.getElementById('tips'); const tips = [...(data.tips || [])]; const stats = data.label_stats || {}; - tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`); + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed (non-irrelevant): ${stats.missing_relevant_count || 0} — Exact: ${stats.missing_exact_count || 0}, High: ${stats.missing_high_count || 0}, Low: ${stats.missing_low_count || 0}.`); root.innerHTML = tips.map(text => `
${text}
`).join(''); } async function loadQueries() { diff --git a/scripts/evaluation/eval_framework/static/index.html b/scripts/evaluation/eval_framework/static/index.html index 42273f2..3333781 100644 --- a/scripts/evaluation/eval_framework/static/index.html +++ b/scripts/evaluation/eval_framework/static/index.html @@ -37,7 +37,7 @@
-

Missed Exact / Partial

+

Missed non-irrelevant (cached)

diff --git a/scripts/evaluation/eval_framework/store.py b/scripts/evaluation/eval_framework/store.py index 8c16787..bc9dea8 100644 --- a/scripts/evaluation/eval_framework/store.py +++ b/scripts/evaluation/eval_framework/store.py @@ -8,7 +8,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Any, Dict, List, Optional, Sequence -from .constants import VALID_LABELS +from .constants import VALID_LABELS, normalize_stored_label from .utils import ensure_dir, safe_json_dumps, utc_now_iso @@ -220,7 +220,7 @@ class EvalStore: """, (tenant_id, query_text), ).fetchall() - return {str(row["spu_id"]): str(row["label"]) for row in rows} + return {str(row["spu_id"]): normalize_stored_label(str(row["label"])) for row in rows} def upsert_labels( self, @@ -379,8 +379,9 @@ class EvalStore: SELECT query_text, COUNT(*) AS total, - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, + SUM(CASE WHEN label IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count, + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count, + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count, SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, MAX(updated_at) AS updated_at FROM relevance_labels @@ -395,7 +396,8 @@ class EvalStore: "query": str(row["query_text"]), "total": int(row["total"]), "exact_count": int(row["exact_count"] or 0), - "partial_count": int(row["partial_count"] or 0), + "high_relevant_count": int(row["high_relevant_count"] or 0), + "low_relevant_count": int(row["low_relevant_count"] or 0), "irrelevant_count": int(row["irrelevant_count"] or 0), "updated_at": row["updated_at"], } @@ -407,8 +409,9 @@ class EvalStore: """ SELECT COUNT(*) AS total, - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, + SUM(CASE WHEN label 
IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count, + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count, + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count, SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, MAX(updated_at) AS updated_at FROM relevance_labels @@ -420,7 +423,8 @@ class EvalStore: "query": query_text, "total": int((row["total"] or 0) if row else 0), "exact_count": int((row["exact_count"] or 0) if row else 0), - "partial_count": int((row["partial_count"] or 0) if row else 0), + "high_relevant_count": int((row["high_relevant_count"] or 0) if row else 0), + "low_relevant_count": int((row["low_relevant_count"] or 0) if row else 0), "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0), "updated_at": row["updated_at"] if row else None, } diff --git a/scripts/evaluation/quick_start_eval.sh b/scripts/evaluation/quick_start_eval.sh deleted file mode 100755 index 0bcbf32..0000000 --- a/scripts/evaluation/quick_start_eval.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash -# Search evaluation quick entrypoints. Run from any cwd; resolves repo root. -set -euo pipefail - -ROOT="$(cd "$(dirname "$0")/../.." 
&& pwd)" -cd "$ROOT" -PY="${ROOT}/.venv/bin/python" -TENANT_ID="${TENANT_ID:-163}" -QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" - -usage() { - echo "Usage: $0 batch|batch-rebuild|serve" - echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50, simple)" - echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" - echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" -} - -case "${1:-}" in - batch) - exec "$PY" scripts/evaluation/build_annotation_set.py batch \ - --tenant-id "$TENANT_ID" \ - --queries-file "$QUERIES" \ - --top-k 50 \ - --language en \ - --labeler-mode simple - ;; - batch-rebuild) - exec "$PY" scripts/evaluation/build_annotation_set.py build \ - --tenant-id "$TENANT_ID" \ - --queries-file "$QUERIES" \ - --search-depth 500 \ - --rerank-depth 10000 \ - --force-refresh-rerank \ - --force-refresh-labels \ - --language en \ - --labeler-mode simple - ;; - serve) - EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" - EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}" - exec "$PY" scripts/evaluation/serve_eval_web.py serve \ - --tenant-id "$TENANT_ID" \ - --queries-file "$QUERIES" \ - --host "$EVAL_WEB_HOST" \ - --port "$EVAL_WEB_PORT" - ;; - *) - usage - exit 1 - ;; -esac diff --git a/scripts/evaluation/start_eval.sh b/scripts/evaluation/start_eval.sh new file mode 100755 index 0000000..dc097c3 --- /dev/null +++ b/scripts/evaluation/start_eval.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Search evaluation quick entrypoints. Run from any cwd; resolves repo root. +set -euo pipefail + +ROOT="$(cd "$(dirname "$0")/../.." 
&& pwd)" +cd "$ROOT" +PY="${ROOT}/.venv/bin/python" +TENANT_ID="${TENANT_ID:-163}" +QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" + +usage() { + echo "Usage: $0 batch|batch-rebuild|serve" + echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" + echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" + echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" +} + +case "${1:-}" in + batch) + exec "$PY" scripts/evaluation/build_annotation_set.py batch \ + --tenant-id "$TENANT_ID" \ + --queries-file "$QUERIES" \ + --top-k 50 \ + --language en + ;; + batch-rebuild) + exec "$PY" scripts/evaluation/build_annotation_set.py build \ + --tenant-id "$TENANT_ID" \ + --queries-file "$QUERIES" \ + --search-depth 500 \ + --rerank-depth 10000 \ + --force-refresh-rerank \ + --force-refresh-labels \ + --language en + ;; + serve) + EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" + EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}" + exec "$PY" scripts/evaluation/serve_eval_web.py serve \ + --tenant-id "$TENANT_ID" \ + --queries-file "$QUERIES" \ + --host "$EVAL_WEB_HOST" \ + --port "$EVAL_WEB_PORT" + ;; + *) + usage + exit 1 + ;; +esac -- libgit2 0.21.2