Commit a345b01f79e0926df2b7acc0ca9d3bb31715a654

Authored by tangwang
1 parent 46d94a05

eval framework

docs/Usage-Guide.md
... ... @@ -202,7 +202,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t
202 202 ./scripts/service_ctl.sh restart backend
203 203 sleep 3
204 204 ./scripts/service_ctl.sh status backend
205   -./scripts/evaluation/quick_start_eval.sh batch
  205 +./scripts/evaluation/start_eval.sh batch
206 206 ```
207 207  
208 208 离线批量评估会把标注与报表写到 `artifacts/search_evaluation/`(SQLite、`batch_reports/` 下的 JSON/Markdown 等)。说明与命令见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。
... ...
docs/issue-2026-03-31-评估框架-done-0331.md
... ... @@ -138,7 +138,7 @@ queries默认是queries/queries.txt,填入左侧列表框,点击其中任何
138 138  
139 139  
140 140 @scripts/evaluation/README.md @scripts/evaluation/eval_framework/framework.py
141   -@quick_start_eval.sh (29-35)
  141 +@start_eval.sh (29-35)
142 142 请以如下流程为准,进行改造:
143 143 如果重建的话,对每个query:
144 144 每个搜索结果应该会扫描全库,
... ...
docs/相关性检索优化说明.md
... ... @@ -240,7 +240,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t
240 240 ./scripts/service_ctl.sh restart backend
241 241 sleep 3
242 242 ./scripts/service_ctl.sh status backend
243   -./scripts/evaluation/quick_start_eval.sh batch
  243 +./scripts/evaluation/start_eval.sh batch
244 244 ```
245 245  
246 246 评估产物在 `artifacts/search_evaluation/`(如 `search_eval.sqlite3`、`batch_reports/` 下的 JSON/Markdown)。流程与参数说明见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。
... ...
scripts/evaluation/README.md
... ... @@ -23,7 +23,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API,
23 23 | `fusion_experiments_round1.json` | Broader first-round experiments |
24 24 | `queries/queries.txt` | Canonical evaluation queries |
25 25 | `README_Requirement.md` | Product/requirements reference |
26   -| `quick_start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` |
  26 +| `start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` |
27 27 | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. |
28 28  
29 29 ## Quick start (repo root)
... ... @@ -32,13 +32,13 @@ Set tenant if needed (`export TENANT_ID=163`). You need a live search API, DashS
32 32  
33 33 ```bash
34 34 # Batch: live search for every query; only uncached (query, spu_id) pairs hit the LLM
35   -./scripts/evaluation/quick_start_eval.sh batch
  35 +./scripts/evaluation/start_eval.sh batch
36 36  
37 37 # Deep rebuild: per-query full corpus rerank (outside search top-500 pool) + LLM in 50-doc batches along global sort order (early stop; expensive)
38   -./scripts/evaluation/quick_start_eval.sh batch-rebuild
  38 +./scripts/evaluation/start_eval.sh batch-rebuild
39 39  
40 40 # UI: http://127.0.0.1:6010/
41   -./scripts/evaluation/quick_start_eval.sh serve
  41 +./scripts/evaluation/start_eval.sh serve
42 42 # or: ./scripts/service_ctl.sh start eval-web
43 43 ```
44 44  
... ... @@ -71,7 +71,7 @@ Explicit equivalents:
71 71  
72 72 Each `batch` run walks the full queries file and writes a **batch report** under `batch_reports/`. With `batch --force-refresh-labels`, every live top-`k` hit is re-judged by the LLM (still only those hits—not the deep rebuild pipeline).
73 73  
74   -### `quick_start_eval.sh batch-rebuild` (deep annotation rebuild)
  74 +### `start_eval.sh batch-rebuild` (deep annotation rebuild)
75 75  
76 76 This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`.
77 77  
... ...
scripts/evaluation/eval_framework/__init__.py
... ... @@ -12,15 +12,15 @@ ensure_project_on_path()
12 12  
13 13 from .constants import ( # noqa: E402
14 14 DEFAULT_ARTIFACT_ROOT,
15   - DEFAULT_LABELER_MODE,
16 15 DEFAULT_QUERY_FILE,
17   - JUDGE_PROMPT_VERSION_COMPLEX,
18   - JUDGE_PROMPT_VERSION_SIMPLE,
19 16 PROJECT_ROOT,
20 17 RELEVANCE_EXACT,
  18 + RELEVANCE_HIGH,
21 19 RELEVANCE_IRRELEVANT,
22   - RELEVANCE_PARTIAL,
  20 + RELEVANCE_LOW,
  21 + RELEVANCE_NON_IRRELEVANT,
23 22 VALID_LABELS,
  23 + normalize_stored_label,
24 24 )
25 25 from .framework import SearchEvaluationFramework # noqa: E402
26 26 from .store import EvalStore, QueryBuildResult # noqa: E402
... ... @@ -36,22 +36,22 @@ from .utils import ( # noqa: E402
36 36  
37 37 __all__ = [
38 38 "DEFAULT_ARTIFACT_ROOT",
39   - "DEFAULT_LABELER_MODE",
40 39 "DEFAULT_QUERY_FILE",
41 40 "EvalStore",
42   - "JUDGE_PROMPT_VERSION_COMPLEX",
43   - "JUDGE_PROMPT_VERSION_SIMPLE",
44 41 "PROJECT_ROOT",
45 42 "QueryBuildResult",
46 43 "RELEVANCE_EXACT",
  44 + "RELEVANCE_HIGH",
47 45 "RELEVANCE_IRRELEVANT",
48   - "RELEVANCE_PARTIAL",
  46 + "RELEVANCE_LOW",
  47 + "RELEVANCE_NON_IRRELEVANT",
49 48 "SearchEvaluationFramework",
50 49 "VALID_LABELS",
51 50 "build_cli_parser",
52 51 "create_web_app",
53 52 "ensure_dir",
54 53 "main",
  54 + "normalize_stored_label",
55 55 "render_batch_report_markdown",
56 56 "sha1_text",
57 57 "utc_now_iso",
... ...
scripts/evaluation/eval_framework/cli.py
... ... @@ -8,7 +8,6 @@ from pathlib import Path
8 8 from typing import Any, Dict
9 9  
10 10 from .constants import (
11   - DEFAULT_LABELER_MODE,
12 11 DEFAULT_QUERY_FILE,
13 12 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
14 13 DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
... ... @@ -103,7 +102,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
103 102 build.add_argument("--language", default="en")
104 103 build.add_argument("--force-refresh-rerank", action="store_true")
105 104 build.add_argument("--force-refresh-labels", action="store_true")
106   - build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
107 105 add_judge_llm_args(build)
108 106  
109 107 batch = sub.add_parser("batch", help="Run batch evaluation against live search")
... ... @@ -112,7 +110,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
112 110 batch.add_argument("--top-k", type=int, default=100)
113 111 batch.add_argument("--language", default="en")
114 112 batch.add_argument("--force-refresh-labels", action="store_true")
115   - batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
116 113 add_judge_llm_args(batch)
117 114  
118 115 audit = sub.add_parser("audit", help="Audit annotation quality for queries")
... ... @@ -122,7 +119,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
122 119 audit.add_argument("--language", default="en")
123 120 audit.add_argument("--limit-suspicious", type=int, default=5)
124 121 audit.add_argument("--force-refresh-labels", action="store_true")
125   - audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
126 122 add_judge_llm_args(audit)
127 123  
128 124 serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
... ... @@ -130,16 +126,13 @@ def build_cli_parser() -> argparse.ArgumentParser:
130 126 serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
131 127 serve.add_argument("--host", default="0.0.0.0")
132 128 serve.add_argument("--port", type=int, default=6010)
133   - serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
134 129 add_judge_llm_args(serve)
135 130  
136 131 return parser
137 132  
138 133  
139 134 def run_build(args: argparse.Namespace) -> None:
140   - framework = SearchEvaluationFramework(
141   - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args)
142   - )
  135 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
143 136 queries = framework.queries_from_file(Path(args.queries_file))
144 137 summary = []
145 138 rebuild_kwargs = {}
... ... @@ -191,9 +184,7 @@ def run_build(args: argparse.Namespace) -> None:
191 184  
192 185  
193 186 def run_batch(args: argparse.Namespace) -> None:
194   - framework = SearchEvaluationFramework(
195   - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args)
196   - )
  187 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
197 188 queries = framework.queries_from_file(Path(args.queries_file))
198 189 payload = framework.batch_evaluate(
199 190 queries=queries,
... ... @@ -206,9 +197,7 @@ def run_batch(args: argparse.Namespace) -> None:
206 197  
207 198  
208 199 def run_audit(args: argparse.Namespace) -> None:
209   - framework = SearchEvaluationFramework(
210   - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args)
211   - )
  200 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
212 201 queries = framework.queries_from_file(Path(args.queries_file))
213 202 audit_items = []
214 203 for query in queries:
... ... @@ -258,9 +247,7 @@ def run_audit(args: argparse.Namespace) -> None:
258 247  
259 248  
260 249 def run_serve(args: argparse.Namespace) -> None:
261   - framework = SearchEvaluationFramework(
262   - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args)
263   - )
  250 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
264 251 app = create_web_app(framework, Path(args.queries_file))
265 252 import uvicorn
266 253  
... ...
scripts/evaluation/eval_framework/clients.py
... ... @@ -11,14 +11,21 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple
11 11 import requests
12 12  
13 13 from .constants import VALID_LABELS
14   -from .prompts import (
15   - classify_batch_complex_prompt,
16   - classify_batch_simple_prompt,
17   - extract_query_profile_prompt,
18   -)
  14 +from .prompts import classify_prompt
19 15 from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
20 16  
21 17  
  18 +def _canonicalize_judge_label(raw: str) -> str | None:
  19 + s = str(raw or "").strip().strip('"').strip("'")
  20 + if s in VALID_LABELS:
  21 + return s
  22 + low = s.lower()
  23 + for v in VALID_LABELS:
  24 + if v.lower() == low:
  25 + return v
  26 + return None
  27 +
  28 +
22 29 class SearchServiceClient:
23 30 def __init__(self, base_url: str, tenant_id: str):
24 31 self.base_url = base_url.rstrip("/")
... ... @@ -224,71 +231,31 @@ class DashScopeLabelClient:
224 231 return obj
225 232 return None
226 233  
227   - def classify_batch_simple(
  234 + def classify_batch(
228 235 self,
229 236 query: str,
230 237 docs: Sequence[Dict[str, Any]],
231 238 ) -> Tuple[List[str], str]:
232 239 numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
233   - prompt = classify_batch_simple_prompt(query, numbered_docs)
  240 + prompt = classify_prompt(query, numbered_docs)
234 241 content, raw_response = self._chat(prompt)
235   - labels = []
  242 + labels: List[str] = []
236 243 for line in str(content or "").splitlines():
237   - label = line.strip()
238   - if label in VALID_LABELS:
239   - labels.append(label)
  244 + canon = _canonicalize_judge_label(line)
  245 + if canon is not None:
  246 + labels.append(canon)
240 247 if len(labels) != len(docs):
241 248 payload = extract_json_blob(content)
242 249 if isinstance(payload, dict) and isinstance(payload.get("labels"), list):
243 250 labels = []
244 251 for item in payload["labels"][: len(docs)]:
245 252 if isinstance(item, dict):
246   - label = str(item.get("label") or "").strip()
  253 + raw_l = str(item.get("label") or "").strip()
247 254 else:
248   - label = str(item).strip()
249   - if label in VALID_LABELS:
250   - labels.append(label)
251   - if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
252   - raise ValueError(f"unexpected simple label output: {content!r}")
253   - return labels, raw_response
254   -
255   - def extract_query_profile(
256   - self,
257   - query: str,
258   - parser_hints: Dict[str, Any],
259   - ) -> Tuple[Dict[str, Any], str]:
260   - prompt = extract_query_profile_prompt(query, parser_hints)
261   - content, raw_response = self._chat(prompt)
262   - payload = extract_json_blob(content)
263   - if not isinstance(payload, dict):
264   - raise ValueError(f"unexpected query profile payload: {content!r}")
265   - payload.setdefault("normalized_query_en", query)
266   - payload.setdefault("primary_category", "")
267   - payload.setdefault("allowed_categories", [])
268   - payload.setdefault("required_attributes", [])
269   - payload.setdefault("notes", [])
270   - return payload, raw_response
271   -
272   - def classify_batch_complex(
273   - self,
274   - query: str,
275   - query_profile: Dict[str, Any],
276   - docs: Sequence[Dict[str, Any]],
277   - ) -> Tuple[List[str], str]:
278   - numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
279   - prompt = classify_batch_complex_prompt(query, query_profile, numbered_docs)
280   - content, raw_response = self._chat(prompt)
281   - payload = extract_json_blob(content)
282   - if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):
283   - raise ValueError(f"unexpected label payload: {content!r}")
284   - labels_payload = payload["labels"]
285   - labels: List[str] = []
286   - for item in labels_payload[: len(docs)]:
287   - if not isinstance(item, dict):
288   - continue
289   - label = str(item.get("label") or "").strip()
290   - if label in VALID_LABELS:
291   - labels.append(label)
  255 + raw_l = str(item).strip()
  256 + canon = _canonicalize_judge_label(raw_l)
  257 + if canon is not None:
  258 + labels.append(canon)
292 259 if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
293   - raise ValueError(f"unexpected label output: {content!r}")
  260 + raise ValueError(f"unexpected classify output: {content!r}")
294 261 return labels, raw_response
... ...
scripts/evaluation/eval_framework/constants.py
... ... @@ -6,17 +6,34 @@ _PKG_DIR = Path(__file__).resolve().parent
6 6 _SCRIPTS_EVAL_DIR = _PKG_DIR.parent
7 7 PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
8 8  
9   -RELEVANCE_EXACT = "Exact"
10   -RELEVANCE_PARTIAL = "Partial"
  9 +# Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN)
  10 +RELEVANCE_EXACT = "Exact Match"
  11 +RELEVANCE_HIGH = "High Relevant"
  12 +RELEVANCE_LOW = "Low Relevant"
11 13 RELEVANCE_IRRELEVANT = "Irrelevant"
12   -VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}
  14 +
  15 +VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT})
  16 +
  17 +# Precision / MAP "positive" set (all non-irrelevant tiers)
  18 +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW})
  19 +
  20 +_LEGACY_LABEL_MAP = {
  21 + "Exact": RELEVANCE_EXACT,
  22 + "Partial": RELEVANCE_HIGH,
  23 +}
  24 +
  25 +
  26 +def normalize_stored_label(label: str) -> str:
  27 + """Map legacy 3-way SQLite labels to current 4-way strings; pass through canonical labels."""
  28 + s = str(label).strip()
  29 + if s in VALID_LABELS:
  30 + return s
  31 + return _LEGACY_LABEL_MAP.get(s, s)
  32 +
13 33  
14 34 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
15 35 DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
16 36  
17   -JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331"
18   -JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331"
19   -DEFAULT_LABELER_MODE = "simple"
20 37 # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
21 38 DEFAULT_JUDGE_MODEL = "qwen3.5-flash"
22 39 DEFAULT_JUDGE_ENABLE_THINKING = True
... ...
scripts/evaluation/eval_framework/framework.py
... ... @@ -10,7 +10,7 @@ from typing import Any, Dict, List, Sequence, Tuple
10 10 import requests
11 11 from elasticsearch.helpers import scan
12 12  
13   -from api.app import get_app_config, get_es_client, get_query_parser, init_service
  13 +from api.app import get_app_config, get_es_client, init_service
14 14 from indexer.mapping_generator import get_tenant_index_name
15 15  
16 16 from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient
... ... @@ -21,7 +21,6 @@ from .constants import (
21 21 DEFAULT_JUDGE_DASHSCOPE_BATCH,
22 22 DEFAULT_JUDGE_ENABLE_THINKING,
23 23 DEFAULT_JUDGE_MODEL,
24   - DEFAULT_LABELER_MODE,
25 24 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
26 25 DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
27 26 DEFAULT_REBUILD_LLM_BATCH_SIZE,
... ... @@ -30,10 +29,11 @@ from .constants import (
30 29 DEFAULT_RERANK_HIGH_SKIP_COUNT,
31 30 DEFAULT_RERANK_HIGH_THRESHOLD,
32 31 DEFAULT_SEARCH_RECALL_TOP_K,
33   - JUDGE_PROMPT_VERSION_COMPLEX,
34 32 RELEVANCE_EXACT,
  33 + RELEVANCE_HIGH,
35 34 RELEVANCE_IRRELEVANT,
36   - RELEVANCE_PARTIAL,
  35 + RELEVANCE_LOW,
  36 + RELEVANCE_NON_IRRELEVANT,
37 37 VALID_LABELS,
38 38 )
39 39 from .metrics import aggregate_metrics, compute_query_metrics, label_distribution
... ... @@ -45,8 +45,6 @@ from .utils import (
45 45 compact_option_values,
46 46 compact_product_payload,
47 47 ensure_dir,
48   - normalize_text,
49   - pick_text,
50 48 sha1_text,
51 49 utc_now_iso,
52 50 utc_timestamp,
... ... @@ -77,7 +75,6 @@ class SearchEvaluationFramework:
77 75 tenant_id: str,
78 76 artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
79 77 search_base_url: str = "http://localhost:6002",
80   - labeler_mode: str = DEFAULT_LABELER_MODE,
81 78 *,
82 79 judge_model: str | None = None,
83 80 enable_thinking: bool | None = None,
... ... @@ -86,7 +83,6 @@ class SearchEvaluationFramework:
86 83 init_service(get_app_config().infrastructure.elasticsearch.host)
87 84 self.tenant_id = str(tenant_id)
88 85 self.artifact_root = ensure_dir(artifact_root)
89   - self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
90 86 self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
91 87 self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
92 88 app_cfg = get_app_config()
... ... @@ -112,178 +108,6 @@ class SearchEvaluationFramework:
112 108 enable_thinking=et,
113 109 use_batch=use_batch,
114 110 )
115   - self.query_parser = None
116   -
117   - def _get_query_parser(self):
118   - if self.query_parser is None:
119   - self.query_parser = get_query_parser()
120   - return self.query_parser
121   -
122   - def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
123   - parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"])
124   - payload = parsed.to_dict()
125   - payload["text_for_rerank"] = parsed.text_for_rerank()
126   - return payload
127   -
128   - def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
129   - if self.labeler_mode != "complex":
130   - raise RuntimeError("query profiles are only used in complex labeler mode")
131   - if not force_refresh:
132   - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)
133   - if cached is not None:
134   - return cached
135   - parser_hints = self.build_query_parser_hints(query)
136   - profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)
137   - profile["parser_hints"] = parser_hints
138   - self.store.upsert_query_profile(
139   - self.tenant_id,
140   - query,
141   - JUDGE_PROMPT_VERSION_COMPLEX,
142   - self.label_client.model,
143   - profile,
144   - raw_response,
145   - )
146   - return profile
147   -
148   - @staticmethod
149   - def _doc_evidence_text(doc: Dict[str, Any]) -> str:
150   - pieces: List[str] = [
151   - build_display_title(doc),
152   - pick_text(doc.get("vendor"), "en"),
153   - pick_text(doc.get("category_path"), "en"),
154   - pick_text(doc.get("category_name"), "en"),
155   - ]
156   - for sku in doc.get("skus") or []:
157   - pieces.extend(
158   - [
159   - str(sku.get("option1_value") or ""),
160   - str(sku.get("option2_value") or ""),
161   - str(sku.get("option3_value") or ""),
162   - ]
163   - )
164   - for tag in doc.get("tags") or []:
165   - pieces.append(str(tag))
166   - return normalize_text(" | ".join(piece for piece in pieces if piece))
167   -
168   - def _apply_rule_based_label_guardrails(
169   - self,
170   - label: str,
171   - query_profile: Dict[str, Any],
172   - doc: Dict[str, Any],
173   - ) -> str:
174   - if label not in VALID_LABELS:
175   - return label
176   - evidence = self._doc_evidence_text(doc)
177   - category = normalize_text(query_profile.get("primary_category"))
178   - allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]
179   -
180   - primary_category_match = True
181   - if category:
182   - primary_category_match = category in evidence
183   - allowed_category_match = True
184   - if allowed_categories:
185   - allowed_category_match = any(signal in evidence for signal in allowed_categories)
186   -
187   - if label == RELEVANCE_EXACT and not primary_category_match:
188   - if allowed_category_match:
189   - label = RELEVANCE_PARTIAL
190   - else:
191   - return RELEVANCE_IRRELEVANT
192   -
193   - for attr in query_profile.get("required_attributes") or []:
194   - if not isinstance(attr, dict):
195   - continue
196   - attr_name = normalize_text(attr.get("name"))
197   - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:
198   - continue
199   - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
200   - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
201   - if attr_name == "fit":
202   - if any(term in {"oversized", "oversize"} for term in required_terms):
203   - conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
204   - if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):
205   - conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])
206   - has_required = any(term in evidence for term in required_terms) if required_terms else True
207   - has_conflict = any(term in evidence for term in conflicting_terms)
208   -
209   - if has_conflict:
210   - return RELEVANCE_IRRELEVANT
211   - if label == RELEVANCE_EXACT and not has_required:
212   - label = RELEVANCE_PARTIAL
213   -
214   - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
215   - return RELEVANCE_IRRELEVANT
216   -
217   - return label
218   -
219   - @staticmethod
220   - def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]:
221   - option_values = list(item.get("option_values") or [])
222   - while len(option_values) < 3:
223   - option_values.append("")
224   - product = dict(item.get("product") or {})
225   - return {
226   - "spu_id": item.get("spu_id"),
227   - "title": product.get("title") or item.get("title"),
228   - "vendor": product.get("vendor"),
229   - "category_path": product.get("category"),
230   - "category_name": product.get("category"),
231   - "image_url": item.get("image_url") or product.get("image_url"),
232   - "tags": product.get("tags") or [],
233   - "skus": [
234   - {
235   - "option1_value": option_values[0],
236   - "option2_value": option_values[1],
237   - "option3_value": option_values[2],
238   - }
239   - ],
240   - }
241   -
242   - def _collect_label_issues(
243   - self,
244   - label: str,
245   - query_profile: Dict[str, Any],
246   - doc: Dict[str, Any],
247   - ) -> List[str]:
248   - evidence = self._doc_evidence_text(doc)
249   - issues: List[str] = []
250   - category = normalize_text(query_profile.get("primary_category"))
251   - allowed_categories = [
252   - normalize_text(item)
253   - for item in query_profile.get("allowed_categories") or []
254   - if str(item).strip()
255   - ]
256   -
257   - primary_category_match = True if not category else category in evidence
258   - allowed_category_match = False if allowed_categories else primary_category_match
259   - if allowed_categories:
260   - allowed_category_match = any(signal in evidence for signal in allowed_categories)
261   -
262   - if label == RELEVANCE_EXACT and not primary_category_match:
263   - if allowed_category_match:
264   - issues.append("Exact missing primary category evidence")
265   - else:
266   - issues.append("Exact has category mismatch")
267   -
268   - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
269   - issues.append("Partial has category mismatch")
270   -
271   - for attr in query_profile.get("required_attributes") or []:
272   - if not isinstance(attr, dict):
273   - continue
274   - attr_name = normalize_text(attr.get("name"))
275   - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:
276   - continue
277   - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
278   - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
279   - has_required = any(term in evidence for term in required_terms) if required_terms else True
280   - has_conflict = any(term in evidence for term in conflicting_terms)
281   -
282   - if has_conflict and label != RELEVANCE_IRRELEVANT:
283   - issues.append(f"{label} conflicts on {attr_name}")
284   - if label == RELEVANCE_EXACT and not has_required:
285   - issues.append(f"Exact missing {attr_name}")
286   - return issues
287 111  
288 112 def audit_live_query(
289 113 self,
... ... @@ -294,42 +118,6 @@ class SearchEvaluationFramework:
294 118 auto_annotate: bool = False,
295 119 ) -> Dict[str, Any]:
296 120 live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
297   - if self.labeler_mode != "complex":
298   - labels = [
299   - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
300   - for item in live["results"]
301   - ]
302   - return {
303   - "query": query,
304   - "tenant_id": self.tenant_id,
305   - "top_k": top_k,
306   - "metrics": live["metrics"],
307   - "distribution": label_distribution(labels),
308   - "query_profile": None,
309   - "suspicious": [],
310   - "results": live["results"],
311   - }
312   - query_profile = self.get_query_profile(query, force_refresh=False)
313   - suspicious: List[Dict[str, Any]] = []
314   -
315   - for item in live["results"]:
316   - doc = self._result_item_to_doc(item)
317   - issues = self._collect_label_issues(item["label"] or "", query_profile, doc)
318   - suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc)
319   - if suggested_label != (item["label"] or ""):
320   - issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"]
321   - if issues:
322   - suspicious.append(
323   - {
324   - "rank": item["rank"],
325   - "spu_id": item["spu_id"],
326   - "title": item["title"],
327   - "label": item["label"],
328   - "suggested_label": suggested_label,
329   - "issues": issues,
330   - }
331   - )
332   -
333 121 labels = [
334 122 item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
335 123 for item in live["results"]
... ... @@ -340,8 +128,8 @@ class SearchEvaluationFramework:
340 128 "top_k": top_k,
341 129 "metrics": live["metrics"],
342 130 "distribution": label_distribution(labels),
343   - "query_profile": query_profile,
344   - "suspicious": suspicious,
  131 + "query_profile": None,
  132 + "suspicious": [],
345 133 "results": live["results"],
346 134 }
347 135  
... ... @@ -521,15 +309,7 @@ class SearchEvaluationFramework:
521 309 if not docs:
522 310 return []
523 311 try:
524   - if self.labeler_mode == "complex":
525   - query_profile = self.get_query_profile(query, force_refresh=force_refresh)
526   - labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)
527   - labels = [
528   - self._apply_rule_based_label_guardrails(label, query_profile, doc)
529   - for doc, label in zip(docs, labels)
530   - ]
531   - else:
532   - labels, raw_response = self.label_client.classify_batch_simple(query, docs)
  312 + labels, raw_response = self.label_client.classify_batch(query, docs)
533 313 return [(labels, raw_response, docs)]
534 314 except Exception:
535 315 if len(docs) == 1:
... ... @@ -727,8 +507,6 @@ class SearchEvaluationFramework:
727 507 "annotate_rerank_top_k": annotate_rerank_top_k,
728 508 "pool_size": len(pool_docs),
729 509 },
730   - "labeler_mode": self.labeler_mode,
731   - "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,
732 510 "metrics_top100": metrics,
733 511 "search_results": search_labeled_results,
734 512 "full_rerank_top": rerank_top_results,
... ... @@ -903,8 +681,6 @@ class SearchEvaluationFramework:
903 681 "rebuild": rebuild_meta,
904 682 "ordered_union_size": pool_docs_count,
905 683 },
906   - "labeler_mode": self.labeler_mode,
907   - "query_profile": self.get_query_profile(query, force_refresh=False) if self.labeler_mode == "complex" else None,
908 684 "metrics_top100": metrics,
909 685 "search_results": search_labeled_results,
910 686 "full_rerank_top": rerank_top_results,
... ... @@ -970,7 +746,7 @@ class SearchEvaluationFramework:
970 746 relevant_missing_ids = [
971 747 spu_id
972 748 for spu_id, label in labels.items()
973   - if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids
  749 + if label in RELEVANCE_NON_IRRELEVANT and spu_id not in recalled_spu_ids
974 750 ]
975 751 missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
976 752 missing_relevant = []
... ... @@ -992,7 +768,12 @@ class SearchEvaluationFramework:
992 768 "product": compact_product_payload(doc),
993 769 }
994 770 )
995   - label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}
  771 + label_order = {
  772 + RELEVANCE_EXACT: 0,
  773 + RELEVANCE_HIGH: 1,
  774 + RELEVANCE_LOW: 2,
  775 + RELEVANCE_IRRELEVANT: 3,
  776 + }
996 777 missing_relevant.sort(
997 778 key=lambda item: (
998 779 label_order.get(str(item.get("label")), 9),
... ... @@ -1010,7 +791,7 @@ class SearchEvaluationFramework:
1010 791 if unlabeled_hits:
1011 792 tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
1012 793 if not missing_relevant:
1013   - tips.append("No cached Exact/Partial products were missed by this recall set.")
  794 + tips.append("No cached non-irrelevant products were missed by this recall set.")
1014 795 return {
1015 796 "query": query,
1016 797 "tenant_id": self.tenant_id,
... ... @@ -1024,7 +805,8 @@ class SearchEvaluationFramework:
1024 805 "recalled_hits": len(labeled),
1025 806 "missing_relevant_count": len(missing_relevant),
1026 807 "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
1027   - "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),
  808 + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH),
  809 + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW),
1028 810 },
1029 811 "tips": tips,
1030 812 "total": int(search_payload.get("total") or 0),
... ... @@ -1065,7 +847,8 @@ class SearchEvaluationFramework:
1065 847 aggregate = aggregate_metrics([item["metrics"] for item in per_query])
1066 848 aggregate_distribution = {
1067 849 RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
1068   - RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query),
  850 + RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query),
  851 + RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query),
1069 852 RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
1070 853 }
1071 854 batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
... ...
scripts/evaluation/eval_framework/metrics.py
... ... @@ -4,7 +4,7 @@ from __future__ import annotations
4 4  
5 5 from typing import Dict, Sequence
6 6  
7   -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL
  7 +from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_NON_IRRELEVANT
8 8  
9 9  
10 10 def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
... ... @@ -13,15 +13,17 @@ def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -&gt; fl
13 13 sliced = list(labels[:k])
14 14 if not sliced:
15 15 return 0.0
16   - hits = sum(1 for label in sliced if label in relevant)
  16 + rel = set(relevant)
  17 + hits = sum(1 for label in sliced if label in rel)
17 18 return hits / float(min(k, len(sliced)))
18 19  
19 20  
20 21 def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
  22 + rel = set(relevant)
21 23 hit_count = 0
22 24 precision_sum = 0.0
23 25 for idx, label in enumerate(labels, start=1):
24   - if label not in relevant:
  26 + if label not in rel:
25 27 continue
26 28 hit_count += 1
27 29 precision_sum += hit_count / idx
... ... @@ -31,12 +33,14 @@ def average_precision(labels: Sequence[str], relevant: Sequence[str]) -&gt; float:
31 33  
32 34  
33 35 def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:
  36 + """P@k / MAP_3: Exact Match only. P@k_2_3 / MAP_2_3: any non-irrelevant tier (legacy metric names)."""
34 37 metrics: Dict[str, float] = {}
  38 + non_irrel = list(RELEVANCE_NON_IRRELEVANT)
35 39 for k in (5, 10, 20, 50):
36 40 metrics[f"P@{k}"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT]), 6)
37   - metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6)
  41 + metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, non_irrel), 6)
38 42 metrics["MAP_3"] = round(average_precision(labels, [RELEVANCE_EXACT]), 6)
39   - metrics["MAP_2_3"] = round(average_precision(labels, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6)
  43 + metrics["MAP_2_3"] = round(average_precision(labels, non_irrel), 6)
40 44 return metrics
41 45  
42 46  
... ... @@ -53,6 +57,7 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -&gt; Dict[str, flo
53 57 def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
54 58 return {
55 59 RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT),
56   - RELEVANCE_PARTIAL: sum(1 for label in labels if label == RELEVANCE_PARTIAL),
  60 + RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH),
  61 + RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW),
57 62 RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT),
58 63 }
... ...
scripts/evaluation/eval_framework/prompts.py
... ... @@ -2,10 +2,9 @@
2 2  
3 3 from __future__ import annotations
4 4  
5   -import json
6   -from typing import Any, Dict, Sequence
  5 +from typing import Sequence
7 6  
8   -_CLASSIFY_BATCH_SIMPLE_TEMPLATE = """You are a relevance judgment assistant for a fashion e-commerce search system.
  7 +_CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system.
9 8 Given a user query and the information for each product, assign a relevance label to each product.
10 9  
11 10 Your goal is to judge relevance from the perspective of e-commerce search ranking.
... ... @@ -154,7 +153,7 @@ The output lines must correspond to the products above in the same order.
154 153 Do not output anything else.
155 154 """
156 155  
157   -_CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。
  156 +_CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。
158 157 给定用户查询词以及每个商品的信息,请为每个商品分配一个相关性标签。
159 158  
160 159 你的目标是从电商搜索排序的角度,判断商品是否满足用户的购物意图。
... ... @@ -294,76 +293,7 @@ _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中
294 293 """
295 294  
296 295  
297   -def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
  296 +def classify_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
298 297 lines = "\n".join(numbered_doc_lines)
299 298 n = len(numbered_doc_lines)
300   - return _CLASSIFY_BATCH_SIMPLE_TEMPLATE.format(query=query, lines=lines, n=n)
301   -
302   -
303   -_EXTRACT_QUERY_PROFILE_TEMPLATE = """You are building a structured intent profile for e-commerce relevance judging.
304   -Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.
305   -Be conservative: only mark an attribute as required if the user explicitly asked for it.
306   -
307   -Return JSON with this schema:
308   -{{
309   - "normalized_query_en": string,
310   - "primary_category": string,
311   - "allowed_categories": [string],
312   - "required_attributes": [
313   - {{"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}}
314   - ],
315   - "notes": [string]
316   -}}
317   -
318   -Guidelines:
319   -- Exact later will require explicit evidence for all required attributes.
320   -- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.
321   -- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.
322   -- If the query includes color, fit, silhouette, or length, include them as required_attributes.
323   -- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.
324   -- For color, include conflicting colors only when clear from the query.
325   -
326   -Original query: {query}
327   -Parser hints JSON: {hints_json}
328   -"""
329   -
330   -
331   -def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str:
332   - hints_json = json.dumps(parser_hints, ensure_ascii=False)
333   - return _EXTRACT_QUERY_PROFILE_TEMPLATE.format(query=query, hints_json=hints_json)
334   -
335   -
336   -_CLASSIFY_BATCH_COMPLEX_TEMPLATE = """You are an e-commerce search relevance judge.
337   -Judge each product against the structured query profile below.
338   -
339   -Relevance rules:
340   -- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.
341   -- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.
342   -- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.
343   -- Be conservative with Exact.
344   -- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.
345   -- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.
346   -
347   -Original query: {query}
348   -Structured query profile JSON: {profile_json}
349   -
350   -Products:
351   -{lines}
352   -
353   -Return JSON only, with schema:
354   -{{"labels":[{{"index":1,"label":"Exact","reason":"short phrase"}}]}}
355   -"""
356   -
357   -
358   -def classify_batch_complex_prompt(
359   - query: str,
360   - query_profile: Dict[str, Any],
361   - numbered_doc_lines: Sequence[str],
362   -) -> str:
363   - lines = "\n".join(numbered_doc_lines)
364   - profile_json = json.dumps(query_profile, ensure_ascii=False)
365   - return _CLASSIFY_BATCH_COMPLEX_TEMPLATE.format(
366   - query=query,
367   - profile_json=profile_json,
368   - lines=lines,
369   - )
  299 + return _CLASSIFY_TEMPLATE_EN.format(query=query, lines=lines, n=n)
... ...
scripts/evaluation/eval_framework/reports.py
... ... @@ -4,7 +4,7 @@ from __future__ import annotations
4 4  
5 5 from typing import Any, Dict
6 6  
7   -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL
  7 +from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW
8 8  
9 9  
10 10 def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
... ... @@ -29,8 +29,9 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -&gt; str:
29 29 "",
30 30 "## Label Distribution",
31 31 "",
32   - f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}",
33   - f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}",
  32 + f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}",
  33 + f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}",
  34 + f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}",
34 35 f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}",
35 36 ]
36 37 )
... ... @@ -41,8 +42,9 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -&gt; str:
41 42 for key, value in sorted((item.get("metrics") or {}).items()):
42 43 lines.append(f"- {key}: {value}")
43 44 distribution = item.get("distribution") or {}
44   - lines.append(f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}")
45   - lines.append(f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}")
  45 + lines.append(f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}")
  46 + lines.append(f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}")
  47 + lines.append(f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}")
46 48 lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}")
47 49 lines.append("")
48 50 return "\n".join(lines)
... ...
scripts/evaluation/eval_framework/static/eval_web.css
... ... @@ -35,10 +35,11 @@
35 35 .results { display: grid; gap: 10px; }
36 36 .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }
37 37 .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }
38   - .Exact { background: var(--exact); }
39   - .Partial { background: var(--partial); }
40   - .Irrelevant { background: var(--irrelevant); }
41   - .Unknown { background: #637381; }
  38 + .label-exact-match { background: var(--exact); }
  39 + .label-high-relevant { background: var(--partial); }
  40 + .label-low-relevant { background: #6b5b95; }
  41 + .label-irrelevant { background: var(--irrelevant); }
  42 + .badge-unknown { background: #637381; }
42 43 .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }
43 44 .title { font-size: 16px; font-weight: 700; margin-bottom: 4px; }
44 45 .title-zh { font-size: 14px; font-weight: 500; color: var(--muted); margin-bottom: 8px; line-height: 1.4; }
... ...
scripts/evaluation/eval_framework/static/eval_web.js
... ... @@ -13,6 +13,10 @@
13 13 root.appendChild(card);
14 14 });
15 15 }
  16 + function labelBadgeClass(label) {
  17 + if (!label || label === 'Unknown') return 'badge-unknown';
  18 + return 'label-' + String(label).toLowerCase().replace(/\s+/g, '-');
  19 + }
16 20 function renderResults(results, rootId='results', showRank=true) {
17 21 const mount = document.getElementById(rootId);
18 22 mount.innerHTML = '';
... ... @@ -21,7 +25,7 @@
21 25 const box = document.createElement('div');
22 26 box.className = 'result';
23 27 box.innerHTML = `
24   - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
  28 + <div><span class="badge ${labelBadgeClass(label)}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
25 29 <img class="thumb" src="${item.image_url || ''}" alt="" />
26 30 <div>
27 31 <div class="title">${item.title || ''}</div>
... ... @@ -42,7 +46,7 @@
42 46 const root = document.getElementById('tips');
43 47 const tips = [...(data.tips || [])];
44 48 const stats = data.label_stats || {};
45   - tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`);
  49 + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed (non-irrelevant): ${stats.missing_relevant_count || 0} — Exact: ${stats.missing_exact_count || 0}, High: ${stats.missing_high_count || 0}, Low: ${stats.missing_low_count || 0}.`);
46 50 root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');
47 51 }
48 52 async function loadQueries() {
... ...
scripts/evaluation/eval_framework/static/index.html
... ... @@ -37,7 +37,7 @@
37 37 <div id="results" class="results"></div>
38 38 </section>
39 39 <section class="section">
40   - <h2>Missed Exact / Partial</h2>
  40 + <h2>Missed non-irrelevant (cached)</h2>
41 41 <div id="missingRelevant" class="results"></div>
42 42 </section>
43 43 <section class="section">
... ...
scripts/evaluation/eval_framework/store.py
... ... @@ -8,7 +8,7 @@ from dataclasses import dataclass
8 8 from pathlib import Path
9 9 from typing import Any, Dict, List, Optional, Sequence
10 10  
11   -from .constants import VALID_LABELS
  11 +from .constants import VALID_LABELS, normalize_stored_label
12 12 from .utils import ensure_dir, safe_json_dumps, utc_now_iso
13 13  
14 14  
... ... @@ -220,7 +220,7 @@ class EvalStore:
220 220 """,
221 221 (tenant_id, query_text),
222 222 ).fetchall()
223   - return {str(row["spu_id"]): str(row["label"]) for row in rows}
  223 + return {str(row["spu_id"]): normalize_stored_label(str(row["label"])) for row in rows}
224 224  
225 225 def upsert_labels(
226 226 self,
... ... @@ -379,8 +379,9 @@ class EvalStore:
379 379 SELECT
380 380 query_text,
381 381 COUNT(*) AS total,
382   - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
383   - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
  382 + SUM(CASE WHEN label IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count,
  383 + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count,
  384 + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count,
384 385 SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
385 386 MAX(updated_at) AS updated_at
386 387 FROM relevance_labels
... ... @@ -395,7 +396,8 @@ class EvalStore:
395 396 "query": str(row["query_text"]),
396 397 "total": int(row["total"]),
397 398 "exact_count": int(row["exact_count"] or 0),
398   - "partial_count": int(row["partial_count"] or 0),
  399 + "high_relevant_count": int(row["high_relevant_count"] or 0),
  400 + "low_relevant_count": int(row["low_relevant_count"] or 0),
399 401 "irrelevant_count": int(row["irrelevant_count"] or 0),
400 402 "updated_at": row["updated_at"],
401 403 }
... ... @@ -407,8 +409,9 @@ class EvalStore:
407 409 """
408 410 SELECT
409 411 COUNT(*) AS total,
410   - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
411   - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
  412 + SUM(CASE WHEN label IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count,
  413 + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count,
  414 + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count,
412 415 SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
413 416 MAX(updated_at) AS updated_at
414 417 FROM relevance_labels
... ... @@ -420,7 +423,8 @@ class EvalStore:
420 423 "query": query_text,
421 424 "total": int((row["total"] or 0) if row else 0),
422 425 "exact_count": int((row["exact_count"] or 0) if row else 0),
423   - "partial_count": int((row["partial_count"] or 0) if row else 0),
  426 + "high_relevant_count": int((row["high_relevant_count"] or 0) if row else 0),
  427 + "low_relevant_count": int((row["low_relevant_count"] or 0) if row else 0),
424 428 "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0),
425 429 "updated_at": row["updated_at"] if row else None,
426 430 }
... ...
scripts/evaluation/quick_start_eval.sh renamed to scripts/evaluation/start_eval.sh
... ... @@ -10,7 +10,7 @@ QUERIES=&quot;${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}&quot;
10 10  
11 11 usage() {
12 12 echo "Usage: $0 batch|batch-rebuild|serve"
13   - echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50, simple)"
  13 + echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)"
14 14 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)"
15 15 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)"
16 16 echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
... ... @@ -22,8 +22,7 @@ case &quot;${1:-}&quot; in
22 22 --tenant-id "$TENANT_ID" \
23 23 --queries-file "$QUERIES" \
24 24 --top-k 50 \
25   - --language en \
26   - --labeler-mode simple
  25 + --language en
27 26 ;;
28 27 batch-rebuild)
29 28 exec "$PY" scripts/evaluation/build_annotation_set.py build \
... ... @@ -33,8 +32,7 @@ case &quot;${1:-}&quot; in
33 32 --rerank-depth 10000 \
34 33 --force-refresh-rerank \
35 34 --force-refresh-labels \
36   - --language en \
37   - --labeler-mode simple
  35 + --language en
38 36 ;;
39 37 serve)
40 38 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}"
... ...