Commit a345b01f79e0926df2b7acc0ca9d3bb31715a654 (1 parent: 46d94a05) — eval framework
Showing 17 changed files with 136 additions and 438 deletions
Show diff stats
docs/Usage-Guide.md
| ... | ... | @@ -202,7 +202,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t |
| 202 | 202 | ./scripts/service_ctl.sh restart backend |
| 203 | 203 | sleep 3 |
| 204 | 204 | ./scripts/service_ctl.sh status backend |
| 205 | -./scripts/evaluation/quick_start_eval.sh batch | |
| 205 | +./scripts/evaluation/start_eval.sh.sh batch | |
| 206 | 206 | ``` |
| 207 | 207 | |
| 208 | 208 | 离线批量评估会把标注与报表写到 `artifacts/search_evaluation/`(SQLite、`batch_reports/` 下的 JSON/Markdown 等)。说明与命令见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 | ... | ... |
docs/issue-2026-03-31-评估框架-done-0331.md
| ... | ... | @@ -138,7 +138,7 @@ queries默认是queries/queries.txt,填入左侧列表框,点击其中任何 |
| 138 | 138 | |
| 139 | 139 | |
| 140 | 140 | @scripts/evaluation/README.md @scripts/evaluation/eval_framework/framework.py |
| 141 | -@quick_start_eval.sh (29-35) | |
| 141 | +@start_eval.sh.sh (29-35) | |
| 142 | 142 | 请以如下流程为准,进行改造: |
| 143 | 143 | 如果重建的话,对每个query: |
| 144 | 144 | 每个搜索结果应该会扫描全库, | ... | ... |
docs/相关性检索优化说明.md
| ... | ... | @@ -240,7 +240,7 @@ python -m pytest -q tests/test_rerank_client.py tests/test_es_query_builder.py t |
| 240 | 240 | ./scripts/service_ctl.sh restart backend |
| 241 | 241 | sleep 3 |
| 242 | 242 | ./scripts/service_ctl.sh status backend |
| 243 | -./scripts/evaluation/quick_start_eval.sh batch | |
| 243 | +./scripts/evaluation/start_eval.sh.sh batch | |
| 244 | 244 | ``` |
| 245 | 245 | |
| 246 | 246 | 评估产物在 `artifacts/search_evaluation/`(如 `search_eval.sqlite3`、`batch_reports/` 下的 JSON/Markdown)。流程与参数说明见 [scripts/evaluation/README.md](../scripts/evaluation/README.md)。 | ... | ... |
scripts/evaluation/README.md
| ... | ... | @@ -23,7 +23,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, |
| 23 | 23 | | `fusion_experiments_round1.json` | Broader first-round experiments | |
| 24 | 24 | | `queries/queries.txt` | Canonical evaluation queries | |
| 25 | 25 | | `README_Requirement.md` | Product/requirements reference | |
| 26 | -| `quick_start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | | |
| 26 | +| `start_eval.sh.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | | |
| 27 | 27 | | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. | |
| 28 | 28 | |
| 29 | 29 | ## Quick start (repo root) |
| ... | ... | @@ -32,13 +32,13 @@ Set tenant if needed (`export TENANT_ID=163`). You need a live search API, DashS |
| 32 | 32 | |
| 33 | 33 | ```bash |
| 34 | 34 | # Batch: live search for every query; only uncached (query, spu_id) pairs hit the LLM |
| 35 | -./scripts/evaluation/quick_start_eval.sh batch | |
| 35 | +./scripts/evaluation/start_eval.sh.sh batch | |
| 36 | 36 | |
| 37 | 37 | # Deep rebuild: per-query full corpus rerank (outside search top-500 pool) + LLM in 50-doc batches along global sort order (early stop; expensive) |
| 38 | -./scripts/evaluation/quick_start_eval.sh batch-rebuild | |
| 38 | +./scripts/evaluation/start_eval.sh.sh batch-rebuild | |
| 39 | 39 | |
| 40 | 40 | # UI: http://127.0.0.1:6010/ |
| 41 | -./scripts/evaluation/quick_start_eval.sh serve | |
| 41 | +./scripts/evaluation/start_eval.sh.sh serve | |
| 42 | 42 | # or: ./scripts/service_ctl.sh start eval-web |
| 43 | 43 | ``` |
| 44 | 44 | |
| ... | ... | @@ -71,7 +71,7 @@ Explicit equivalents: |
| 71 | 71 | |
| 72 | 72 | Each `batch` run walks the full queries file and writes a **batch report** under `batch_reports/`. With `batch --force-refresh-labels`, every live top-`k` hit is re-judged by the LLM (still only those hits—not the deep rebuild pipeline). |
| 73 | 73 | |
| 74 | -### `quick_start_eval.sh batch-rebuild` (deep annotation rebuild) | |
| 74 | +### `start_eval.sh.sh batch-rebuild` (deep annotation rebuild) | |
| 75 | 75 | |
| 76 | 76 | This runs `build_annotation_set.py build` with **`--force-refresh-labels`** and **`--force-refresh-rerank`** (see the explicit command block below). It does **not** run the `batch` subcommand: there is **no** aggregate batch report for this step; outputs are per-query JSON under `query_builds/` plus updates in `search_eval.sqlite3`. |
| 77 | 77 | ... | ... |
scripts/evaluation/eval_framework/__init__.py
| ... | ... | @@ -12,15 +12,15 @@ ensure_project_on_path() |
| 12 | 12 | |
| 13 | 13 | from .constants import ( # noqa: E402 |
| 14 | 14 | DEFAULT_ARTIFACT_ROOT, |
| 15 | - DEFAULT_LABELER_MODE, | |
| 16 | 15 | DEFAULT_QUERY_FILE, |
| 17 | - JUDGE_PROMPT_VERSION_COMPLEX, | |
| 18 | - JUDGE_PROMPT_VERSION_SIMPLE, | |
| 19 | 16 | PROJECT_ROOT, |
| 20 | 17 | RELEVANCE_EXACT, |
| 18 | + RELEVANCE_HIGH, | |
| 21 | 19 | RELEVANCE_IRRELEVANT, |
| 22 | - RELEVANCE_PARTIAL, | |
| 20 | + RELEVANCE_LOW, | |
| 21 | + RELEVANCE_NON_IRRELEVANT, | |
| 23 | 22 | VALID_LABELS, |
| 23 | + normalize_stored_label, | |
| 24 | 24 | ) |
| 25 | 25 | from .framework import SearchEvaluationFramework # noqa: E402 |
| 26 | 26 | from .store import EvalStore, QueryBuildResult # noqa: E402 |
| ... | ... | @@ -36,22 +36,22 @@ from .utils import ( # noqa: E402 |
| 36 | 36 | |
| 37 | 37 | __all__ = [ |
| 38 | 38 | "DEFAULT_ARTIFACT_ROOT", |
| 39 | - "DEFAULT_LABELER_MODE", | |
| 40 | 39 | "DEFAULT_QUERY_FILE", |
| 41 | 40 | "EvalStore", |
| 42 | - "JUDGE_PROMPT_VERSION_COMPLEX", | |
| 43 | - "JUDGE_PROMPT_VERSION_SIMPLE", | |
| 44 | 41 | "PROJECT_ROOT", |
| 45 | 42 | "QueryBuildResult", |
| 46 | 43 | "RELEVANCE_EXACT", |
| 44 | + "RELEVANCE_HIGH", | |
| 47 | 45 | "RELEVANCE_IRRELEVANT", |
| 48 | - "RELEVANCE_PARTIAL", | |
| 46 | + "RELEVANCE_LOW", | |
| 47 | + "RELEVANCE_NON_IRRELEVANT", | |
| 49 | 48 | "SearchEvaluationFramework", |
| 50 | 49 | "VALID_LABELS", |
| 51 | 50 | "build_cli_parser", |
| 52 | 51 | "create_web_app", |
| 53 | 52 | "ensure_dir", |
| 54 | 53 | "main", |
| 54 | + "normalize_stored_label", | |
| 55 | 55 | "render_batch_report_markdown", |
| 56 | 56 | "sha1_text", |
| 57 | 57 | "utc_now_iso", | ... | ... |
scripts/evaluation/eval_framework/cli.py
| ... | ... | @@ -8,7 +8,6 @@ from pathlib import Path |
| 8 | 8 | from typing import Any, Dict |
| 9 | 9 | |
| 10 | 10 | from .constants import ( |
| 11 | - DEFAULT_LABELER_MODE, | |
| 12 | 11 | DEFAULT_QUERY_FILE, |
| 13 | 12 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, |
| 14 | 13 | DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, |
| ... | ... | @@ -103,7 +102,6 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 103 | 102 | build.add_argument("--language", default="en") |
| 104 | 103 | build.add_argument("--force-refresh-rerank", action="store_true") |
| 105 | 104 | build.add_argument("--force-refresh-labels", action="store_true") |
| 106 | - build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 107 | 105 | add_judge_llm_args(build) |
| 108 | 106 | |
| 109 | 107 | batch = sub.add_parser("batch", help="Run batch evaluation against live search") |
| ... | ... | @@ -112,7 +110,6 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 112 | 110 | batch.add_argument("--top-k", type=int, default=100) |
| 113 | 111 | batch.add_argument("--language", default="en") |
| 114 | 112 | batch.add_argument("--force-refresh-labels", action="store_true") |
| 115 | - batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 116 | 113 | add_judge_llm_args(batch) |
| 117 | 114 | |
| 118 | 115 | audit = sub.add_parser("audit", help="Audit annotation quality for queries") |
| ... | ... | @@ -122,7 +119,6 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 122 | 119 | audit.add_argument("--language", default="en") |
| 123 | 120 | audit.add_argument("--limit-suspicious", type=int, default=5) |
| 124 | 121 | audit.add_argument("--force-refresh-labels", action="store_true") |
| 125 | - audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 126 | 122 | add_judge_llm_args(audit) |
| 127 | 123 | |
| 128 | 124 | serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") |
| ... | ... | @@ -130,16 +126,13 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 130 | 126 | serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) |
| 131 | 127 | serve.add_argument("--host", default="0.0.0.0") |
| 132 | 128 | serve.add_argument("--port", type=int, default=6010) |
| 133 | - serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 134 | 129 | add_judge_llm_args(serve) |
| 135 | 130 | |
| 136 | 131 | return parser |
| 137 | 132 | |
| 138 | 133 | |
| 139 | 134 | def run_build(args: argparse.Namespace) -> None: |
| 140 | - framework = SearchEvaluationFramework( | |
| 141 | - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) | |
| 142 | - ) | |
| 135 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) | |
| 143 | 136 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 144 | 137 | summary = [] |
| 145 | 138 | rebuild_kwargs = {} |
| ... | ... | @@ -191,9 +184,7 @@ def run_build(args: argparse.Namespace) -> None: |
| 191 | 184 | |
| 192 | 185 | |
| 193 | 186 | def run_batch(args: argparse.Namespace) -> None: |
| 194 | - framework = SearchEvaluationFramework( | |
| 195 | - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) | |
| 196 | - ) | |
| 187 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) | |
| 197 | 188 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 198 | 189 | payload = framework.batch_evaluate( |
| 199 | 190 | queries=queries, |
| ... | ... | @@ -206,9 +197,7 @@ def run_batch(args: argparse.Namespace) -> None: |
| 206 | 197 | |
| 207 | 198 | |
| 208 | 199 | def run_audit(args: argparse.Namespace) -> None: |
| 209 | - framework = SearchEvaluationFramework( | |
| 210 | - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) | |
| 211 | - ) | |
| 200 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) | |
| 212 | 201 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 213 | 202 | audit_items = [] |
| 214 | 203 | for query in queries: |
| ... | ... | @@ -258,9 +247,7 @@ def run_audit(args: argparse.Namespace) -> None: |
| 258 | 247 | |
| 259 | 248 | |
| 260 | 249 | def run_serve(args: argparse.Namespace) -> None: |
| 261 | - framework = SearchEvaluationFramework( | |
| 262 | - tenant_id=args.tenant_id, labeler_mode=args.labeler_mode, **framework_kwargs_from_args(args) | |
| 263 | - ) | |
| 250 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) | |
| 264 | 251 | app = create_web_app(framework, Path(args.queries_file)) |
| 265 | 252 | import uvicorn |
| 266 | 253 | ... | ... |
scripts/evaluation/eval_framework/clients.py
| ... | ... | @@ -11,14 +11,21 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple |
| 11 | 11 | import requests |
| 12 | 12 | |
| 13 | 13 | from .constants import VALID_LABELS |
| 14 | -from .prompts import ( | |
| 15 | - classify_batch_complex_prompt, | |
| 16 | - classify_batch_simple_prompt, | |
| 17 | - extract_query_profile_prompt, | |
| 18 | -) | |
| 14 | +from .prompts import classify_prompt | |
| 19 | 15 | from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps |
| 20 | 16 | |
| 21 | 17 | |
| 18 | +def _canonicalize_judge_label(raw: str) -> str | None: | |
| 19 | + s = str(raw or "").strip().strip('"').strip("'") | |
| 20 | + if s in VALID_LABELS: | |
| 21 | + return s | |
| 22 | + low = s.lower() | |
| 23 | + for v in VALID_LABELS: | |
| 24 | + if v.lower() == low: | |
| 25 | + return v | |
| 26 | + return None | |
| 27 | + | |
| 28 | + | |
| 22 | 29 | class SearchServiceClient: |
| 23 | 30 | def __init__(self, base_url: str, tenant_id: str): |
| 24 | 31 | self.base_url = base_url.rstrip("/") |
| ... | ... | @@ -224,71 +231,31 @@ class DashScopeLabelClient: |
| 224 | 231 | return obj |
| 225 | 232 | return None |
| 226 | 233 | |
| 227 | - def classify_batch_simple( | |
| 234 | + def classify_batch( | |
| 228 | 235 | self, |
| 229 | 236 | query: str, |
| 230 | 237 | docs: Sequence[Dict[str, Any]], |
| 231 | 238 | ) -> Tuple[List[str], str]: |
| 232 | 239 | numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] |
| 233 | - prompt = classify_batch_simple_prompt(query, numbered_docs) | |
| 240 | + prompt = classify_prompt(query, numbered_docs) | |
| 234 | 241 | content, raw_response = self._chat(prompt) |
| 235 | - labels = [] | |
| 242 | + labels: List[str] = [] | |
| 236 | 243 | for line in str(content or "").splitlines(): |
| 237 | - label = line.strip() | |
| 238 | - if label in VALID_LABELS: | |
| 239 | - labels.append(label) | |
| 244 | + canon = _canonicalize_judge_label(line) | |
| 245 | + if canon is not None: | |
| 246 | + labels.append(canon) | |
| 240 | 247 | if len(labels) != len(docs): |
| 241 | 248 | payload = extract_json_blob(content) |
| 242 | 249 | if isinstance(payload, dict) and isinstance(payload.get("labels"), list): |
| 243 | 250 | labels = [] |
| 244 | 251 | for item in payload["labels"][: len(docs)]: |
| 245 | 252 | if isinstance(item, dict): |
| 246 | - label = str(item.get("label") or "").strip() | |
| 253 | + raw_l = str(item.get("label") or "").strip() | |
| 247 | 254 | else: |
| 248 | - label = str(item).strip() | |
| 249 | - if label in VALID_LABELS: | |
| 250 | - labels.append(label) | |
| 251 | - if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): | |
| 252 | - raise ValueError(f"unexpected simple label output: {content!r}") | |
| 253 | - return labels, raw_response | |
| 254 | - | |
| 255 | - def extract_query_profile( | |
| 256 | - self, | |
| 257 | - query: str, | |
| 258 | - parser_hints: Dict[str, Any], | |
| 259 | - ) -> Tuple[Dict[str, Any], str]: | |
| 260 | - prompt = extract_query_profile_prompt(query, parser_hints) | |
| 261 | - content, raw_response = self._chat(prompt) | |
| 262 | - payload = extract_json_blob(content) | |
| 263 | - if not isinstance(payload, dict): | |
| 264 | - raise ValueError(f"unexpected query profile payload: {content!r}") | |
| 265 | - payload.setdefault("normalized_query_en", query) | |
| 266 | - payload.setdefault("primary_category", "") | |
| 267 | - payload.setdefault("allowed_categories", []) | |
| 268 | - payload.setdefault("required_attributes", []) | |
| 269 | - payload.setdefault("notes", []) | |
| 270 | - return payload, raw_response | |
| 271 | - | |
| 272 | - def classify_batch_complex( | |
| 273 | - self, | |
| 274 | - query: str, | |
| 275 | - query_profile: Dict[str, Any], | |
| 276 | - docs: Sequence[Dict[str, Any]], | |
| 277 | - ) -> Tuple[List[str], str]: | |
| 278 | - numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] | |
| 279 | - prompt = classify_batch_complex_prompt(query, query_profile, numbered_docs) | |
| 280 | - content, raw_response = self._chat(prompt) | |
| 281 | - payload = extract_json_blob(content) | |
| 282 | - if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list): | |
| 283 | - raise ValueError(f"unexpected label payload: {content!r}") | |
| 284 | - labels_payload = payload["labels"] | |
| 285 | - labels: List[str] = [] | |
| 286 | - for item in labels_payload[: len(docs)]: | |
| 287 | - if not isinstance(item, dict): | |
| 288 | - continue | |
| 289 | - label = str(item.get("label") or "").strip() | |
| 290 | - if label in VALID_LABELS: | |
| 291 | - labels.append(label) | |
| 255 | + raw_l = str(item).strip() | |
| 256 | + canon = _canonicalize_judge_label(raw_l) | |
| 257 | + if canon is not None: | |
| 258 | + labels.append(canon) | |
| 292 | 259 | if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): |
| 293 | - raise ValueError(f"unexpected label output: {content!r}") | |
| 260 | + raise ValueError(f"unexpected classify output: {content!r}") | |
| 294 | 261 | return labels, raw_response | ... | ... |
scripts/evaluation/eval_framework/constants.py
| ... | ... | @@ -6,17 +6,34 @@ _PKG_DIR = Path(__file__).resolve().parent |
| 6 | 6 | _SCRIPTS_EVAL_DIR = _PKG_DIR.parent |
| 7 | 7 | PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] |
| 8 | 8 | |
| 9 | -RELEVANCE_EXACT = "Exact" | |
| 10 | -RELEVANCE_PARTIAL = "Partial" | |
| 9 | +# Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN) | |
| 10 | +RELEVANCE_EXACT = "Exact Match" | |
| 11 | +RELEVANCE_HIGH = "High Relevant" | |
| 12 | +RELEVANCE_LOW = "Low Relevant" | |
| 11 | 13 | RELEVANCE_IRRELEVANT = "Irrelevant" |
| 12 | -VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} | |
| 14 | + | |
| 15 | +VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) | |
| 16 | + | |
| 17 | +# Precision / MAP "positive" set (all non-irrelevant tiers) | |
| 18 | +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW}) | |
| 19 | + | |
| 20 | +_LEGACY_LABEL_MAP = { | |
| 21 | + "Exact": RELEVANCE_EXACT, | |
| 22 | + "Partial": RELEVANCE_HIGH, | |
| 23 | +} | |
| 24 | + | |
| 25 | + | |
| 26 | +def normalize_stored_label(label: str) -> str: | |
| 27 | + """Map legacy 3-way SQLite labels to current 4-way strings; pass through canonical labels.""" | |
| 28 | + s = str(label).strip() | |
| 29 | + if s in VALID_LABELS: | |
| 30 | + return s | |
| 31 | + return _LEGACY_LABEL_MAP.get(s, s) | |
| 32 | + | |
| 13 | 33 | |
| 14 | 34 | DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" |
| 15 | 35 | DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" |
| 16 | 36 | |
| 17 | -JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" | |
| 18 | -JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" | |
| 19 | -DEFAULT_LABELER_MODE = "simple" | |
| 20 | 37 | # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs) |
| 21 | 38 | DEFAULT_JUDGE_MODEL = "qwen3.5-flash" |
| 22 | 39 | DEFAULT_JUDGE_ENABLE_THINKING = True | ... | ... |
scripts/evaluation/eval_framework/framework.py
| ... | ... | @@ -10,7 +10,7 @@ from typing import Any, Dict, List, Sequence, Tuple |
| 10 | 10 | import requests |
| 11 | 11 | from elasticsearch.helpers import scan |
| 12 | 12 | |
| 13 | -from api.app import get_app_config, get_es_client, get_query_parser, init_service | |
| 13 | +from api.app import get_app_config, get_es_client, init_service | |
| 14 | 14 | from indexer.mapping_generator import get_tenant_index_name |
| 15 | 15 | |
| 16 | 16 | from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient |
| ... | ... | @@ -21,7 +21,6 @@ from .constants import ( |
| 21 | 21 | DEFAULT_JUDGE_DASHSCOPE_BATCH, |
| 22 | 22 | DEFAULT_JUDGE_ENABLE_THINKING, |
| 23 | 23 | DEFAULT_JUDGE_MODEL, |
| 24 | - DEFAULT_LABELER_MODE, | |
| 25 | 24 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, |
| 26 | 25 | DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, |
| 27 | 26 | DEFAULT_REBUILD_LLM_BATCH_SIZE, |
| ... | ... | @@ -30,10 +29,11 @@ from .constants import ( |
| 30 | 29 | DEFAULT_RERANK_HIGH_SKIP_COUNT, |
| 31 | 30 | DEFAULT_RERANK_HIGH_THRESHOLD, |
| 32 | 31 | DEFAULT_SEARCH_RECALL_TOP_K, |
| 33 | - JUDGE_PROMPT_VERSION_COMPLEX, | |
| 34 | 32 | RELEVANCE_EXACT, |
| 33 | + RELEVANCE_HIGH, | |
| 35 | 34 | RELEVANCE_IRRELEVANT, |
| 36 | - RELEVANCE_PARTIAL, | |
| 35 | + RELEVANCE_LOW, | |
| 36 | + RELEVANCE_NON_IRRELEVANT, | |
| 37 | 37 | VALID_LABELS, |
| 38 | 38 | ) |
| 39 | 39 | from .metrics import aggregate_metrics, compute_query_metrics, label_distribution |
| ... | ... | @@ -45,8 +45,6 @@ from .utils import ( |
| 45 | 45 | compact_option_values, |
| 46 | 46 | compact_product_payload, |
| 47 | 47 | ensure_dir, |
| 48 | - normalize_text, | |
| 49 | - pick_text, | |
| 50 | 48 | sha1_text, |
| 51 | 49 | utc_now_iso, |
| 52 | 50 | utc_timestamp, |
| ... | ... | @@ -77,7 +75,6 @@ class SearchEvaluationFramework: |
| 77 | 75 | tenant_id: str, |
| 78 | 76 | artifact_root: Path = DEFAULT_ARTIFACT_ROOT, |
| 79 | 77 | search_base_url: str = "http://localhost:6002", |
| 80 | - labeler_mode: str = DEFAULT_LABELER_MODE, | |
| 81 | 78 | *, |
| 82 | 79 | judge_model: str | None = None, |
| 83 | 80 | enable_thinking: bool | None = None, |
| ... | ... | @@ -86,7 +83,6 @@ class SearchEvaluationFramework: |
| 86 | 83 | init_service(get_app_config().infrastructure.elasticsearch.host) |
| 87 | 84 | self.tenant_id = str(tenant_id) |
| 88 | 85 | self.artifact_root = ensure_dir(artifact_root) |
| 89 | - self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE | |
| 90 | 86 | self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") |
| 91 | 87 | self.search_client = SearchServiceClient(search_base_url, self.tenant_id) |
| 92 | 88 | app_cfg = get_app_config() |
| ... | ... | @@ -112,178 +108,6 @@ class SearchEvaluationFramework: |
| 112 | 108 | enable_thinking=et, |
| 113 | 109 | use_batch=use_batch, |
| 114 | 110 | ) |
| 115 | - self.query_parser = None | |
| 116 | - | |
| 117 | - def _get_query_parser(self): | |
| 118 | - if self.query_parser is None: | |
| 119 | - self.query_parser = get_query_parser() | |
| 120 | - return self.query_parser | |
| 121 | - | |
| 122 | - def build_query_parser_hints(self, query: str) -> Dict[str, Any]: | |
| 123 | - parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"]) | |
| 124 | - payload = parsed.to_dict() | |
| 125 | - payload["text_for_rerank"] = parsed.text_for_rerank() | |
| 126 | - return payload | |
| 127 | - | |
| 128 | - def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]: | |
| 129 | - if self.labeler_mode != "complex": | |
| 130 | - raise RuntimeError("query profiles are only used in complex labeler mode") | |
| 131 | - if not force_refresh: | |
| 132 | - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX) | |
| 133 | - if cached is not None: | |
| 134 | - return cached | |
| 135 | - parser_hints = self.build_query_parser_hints(query) | |
| 136 | - profile, raw_response = self.label_client.extract_query_profile(query, parser_hints) | |
| 137 | - profile["parser_hints"] = parser_hints | |
| 138 | - self.store.upsert_query_profile( | |
| 139 | - self.tenant_id, | |
| 140 | - query, | |
| 141 | - JUDGE_PROMPT_VERSION_COMPLEX, | |
| 142 | - self.label_client.model, | |
| 143 | - profile, | |
| 144 | - raw_response, | |
| 145 | - ) | |
| 146 | - return profile | |
| 147 | - | |
| 148 | - @staticmethod | |
| 149 | - def _doc_evidence_text(doc: Dict[str, Any]) -> str: | |
| 150 | - pieces: List[str] = [ | |
| 151 | - build_display_title(doc), | |
| 152 | - pick_text(doc.get("vendor"), "en"), | |
| 153 | - pick_text(doc.get("category_path"), "en"), | |
| 154 | - pick_text(doc.get("category_name"), "en"), | |
| 155 | - ] | |
| 156 | - for sku in doc.get("skus") or []: | |
| 157 | - pieces.extend( | |
| 158 | - [ | |
| 159 | - str(sku.get("option1_value") or ""), | |
| 160 | - str(sku.get("option2_value") or ""), | |
| 161 | - str(sku.get("option3_value") or ""), | |
| 162 | - ] | |
| 163 | - ) | |
| 164 | - for tag in doc.get("tags") or []: | |
| 165 | - pieces.append(str(tag)) | |
| 166 | - return normalize_text(" | ".join(piece for piece in pieces if piece)) | |
| 167 | - | |
| 168 | - def _apply_rule_based_label_guardrails( | |
| 169 | - self, | |
| 170 | - label: str, | |
| 171 | - query_profile: Dict[str, Any], | |
| 172 | - doc: Dict[str, Any], | |
| 173 | - ) -> str: | |
| 174 | - if label not in VALID_LABELS: | |
| 175 | - return label | |
| 176 | - evidence = self._doc_evidence_text(doc) | |
| 177 | - category = normalize_text(query_profile.get("primary_category")) | |
| 178 | - allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()] | |
| 179 | - | |
| 180 | - primary_category_match = True | |
| 181 | - if category: | |
| 182 | - primary_category_match = category in evidence | |
| 183 | - allowed_category_match = True | |
| 184 | - if allowed_categories: | |
| 185 | - allowed_category_match = any(signal in evidence for signal in allowed_categories) | |
| 186 | - | |
| 187 | - if label == RELEVANCE_EXACT and not primary_category_match: | |
| 188 | - if allowed_category_match: | |
| 189 | - label = RELEVANCE_PARTIAL | |
| 190 | - else: | |
| 191 | - return RELEVANCE_IRRELEVANT | |
| 192 | - | |
| 193 | - for attr in query_profile.get("required_attributes") or []: | |
| 194 | - if not isinstance(attr, dict): | |
| 195 | - continue | |
| 196 | - attr_name = normalize_text(attr.get("name")) | |
| 197 | - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}: | |
| 198 | - continue | |
| 199 | - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)] | |
| 200 | - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)] | |
| 201 | - if attr_name == "fit": | |
| 202 | - if any(term in {"oversized", "oversize"} for term in required_terms): | |
| 203 | - conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"]) | |
| 204 | - if any(term in {"fitted", "slim fit", "tight"} for term in required_terms): | |
| 205 | - conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"]) | |
| 206 | - has_required = any(term in evidence for term in required_terms) if required_terms else True | |
| 207 | - has_conflict = any(term in evidence for term in conflicting_terms) | |
| 208 | - | |
| 209 | - if has_conflict: | |
| 210 | - return RELEVANCE_IRRELEVANT | |
| 211 | - if label == RELEVANCE_EXACT and not has_required: | |
| 212 | - label = RELEVANCE_PARTIAL | |
| 213 | - | |
| 214 | - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match: | |
| 215 | - return RELEVANCE_IRRELEVANT | |
| 216 | - | |
| 217 | - return label | |
| 218 | - | |
| 219 | - @staticmethod | |
| 220 | - def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]: | |
| 221 | - option_values = list(item.get("option_values") or []) | |
| 222 | - while len(option_values) < 3: | |
| 223 | - option_values.append("") | |
| 224 | - product = dict(item.get("product") or {}) | |
| 225 | - return { | |
| 226 | - "spu_id": item.get("spu_id"), | |
| 227 | - "title": product.get("title") or item.get("title"), | |
| 228 | - "vendor": product.get("vendor"), | |
| 229 | - "category_path": product.get("category"), | |
| 230 | - "category_name": product.get("category"), | |
| 231 | - "image_url": item.get("image_url") or product.get("image_url"), | |
| 232 | - "tags": product.get("tags") or [], | |
| 233 | - "skus": [ | |
| 234 | - { | |
| 235 | - "option1_value": option_values[0], | |
| 236 | - "option2_value": option_values[1], | |
| 237 | - "option3_value": option_values[2], | |
| 238 | - } | |
| 239 | - ], | |
| 240 | - } | |
| 241 | - | |
| 242 | - def _collect_label_issues( | |
| 243 | - self, | |
| 244 | - label: str, | |
| 245 | - query_profile: Dict[str, Any], | |
| 246 | - doc: Dict[str, Any], | |
| 247 | - ) -> List[str]: | |
| 248 | - evidence = self._doc_evidence_text(doc) | |
| 249 | - issues: List[str] = [] | |
| 250 | - category = normalize_text(query_profile.get("primary_category")) | |
| 251 | - allowed_categories = [ | |
| 252 | - normalize_text(item) | |
| 253 | - for item in query_profile.get("allowed_categories") or [] | |
| 254 | - if str(item).strip() | |
| 255 | - ] | |
| 256 | - | |
| 257 | - primary_category_match = True if not category else category in evidence | |
| 258 | - allowed_category_match = False if allowed_categories else primary_category_match | |
| 259 | - if allowed_categories: | |
| 260 | - allowed_category_match = any(signal in evidence for signal in allowed_categories) | |
| 261 | - | |
| 262 | - if label == RELEVANCE_EXACT and not primary_category_match: | |
| 263 | - if allowed_category_match: | |
| 264 | - issues.append("Exact missing primary category evidence") | |
| 265 | - else: | |
| 266 | - issues.append("Exact has category mismatch") | |
| 267 | - | |
| 268 | - if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match: | |
| 269 | - issues.append("Partial has category mismatch") | |
| 270 | - | |
| 271 | - for attr in query_profile.get("required_attributes") or []: | |
| 272 | - if not isinstance(attr, dict): | |
| 273 | - continue | |
| 274 | - attr_name = normalize_text(attr.get("name")) | |
| 275 | - if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}: | |
| 276 | - continue | |
| 277 | - required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)] | |
| 278 | - conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)] | |
| 279 | - has_required = any(term in evidence for term in required_terms) if required_terms else True | |
| 280 | - has_conflict = any(term in evidence for term in conflicting_terms) | |
| 281 | - | |
| 282 | - if has_conflict and label != RELEVANCE_IRRELEVANT: | |
| 283 | - issues.append(f"{label} conflicts on {attr_name}") | |
| 284 | - if label == RELEVANCE_EXACT and not has_required: | |
| 285 | - issues.append(f"Exact missing {attr_name}") | |
| 286 | - return issues | |
| 287 | 111 | |
| 288 | 112 | def audit_live_query( |
| 289 | 113 | self, |
| ... | ... | @@ -294,42 +118,6 @@ class SearchEvaluationFramework: |
| 294 | 118 | auto_annotate: bool = False, |
| 295 | 119 | ) -> Dict[str, Any]: |
| 296 | 120 | live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) |
| 297 | - if self.labeler_mode != "complex": | |
| 298 | - labels = [ | |
| 299 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 300 | - for item in live["results"] | |
| 301 | - ] | |
| 302 | - return { | |
| 303 | - "query": query, | |
| 304 | - "tenant_id": self.tenant_id, | |
| 305 | - "top_k": top_k, | |
| 306 | - "metrics": live["metrics"], | |
| 307 | - "distribution": label_distribution(labels), | |
| 308 | - "query_profile": None, | |
| 309 | - "suspicious": [], | |
| 310 | - "results": live["results"], | |
| 311 | - } | |
| 312 | - query_profile = self.get_query_profile(query, force_refresh=False) | |
| 313 | - suspicious: List[Dict[str, Any]] = [] | |
| 314 | - | |
| 315 | - for item in live["results"]: | |
| 316 | - doc = self._result_item_to_doc(item) | |
| 317 | - issues = self._collect_label_issues(item["label"] or "", query_profile, doc) | |
| 318 | - suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc) | |
| 319 | - if suggested_label != (item["label"] or ""): | |
| 320 | - issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"] | |
| 321 | - if issues: | |
| 322 | - suspicious.append( | |
| 323 | - { | |
| 324 | - "rank": item["rank"], | |
| 325 | - "spu_id": item["spu_id"], | |
| 326 | - "title": item["title"], | |
| 327 | - "label": item["label"], | |
| 328 | - "suggested_label": suggested_label, | |
| 329 | - "issues": issues, | |
| 330 | - } | |
| 331 | - ) | |
| 332 | - | |
| 333 | 121 | labels = [ |
| 334 | 122 | item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT |
| 335 | 123 | for item in live["results"] |
| ... | ... | @@ -340,8 +128,8 @@ class SearchEvaluationFramework: |
| 340 | 128 | "top_k": top_k, |
| 341 | 129 | "metrics": live["metrics"], |
| 342 | 130 | "distribution": label_distribution(labels), |
| 343 | - "query_profile": query_profile, | |
| 344 | - "suspicious": suspicious, | |
| 131 | + "query_profile": None, | |
| 132 | + "suspicious": [], | |
| 345 | 133 | "results": live["results"], |
| 346 | 134 | } |
| 347 | 135 | |
| ... | ... | @@ -521,15 +309,7 @@ class SearchEvaluationFramework: |
| 521 | 309 | if not docs: |
| 522 | 310 | return [] |
| 523 | 311 | try: |
| 524 | - if self.labeler_mode == "complex": | |
| 525 | - query_profile = self.get_query_profile(query, force_refresh=force_refresh) | |
| 526 | - labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs) | |
| 527 | - labels = [ | |
| 528 | - self._apply_rule_based_label_guardrails(label, query_profile, doc) | |
| 529 | - for doc, label in zip(docs, labels) | |
| 530 | - ] | |
| 531 | - else: | |
| 532 | - labels, raw_response = self.label_client.classify_batch_simple(query, docs) | |
| 312 | + labels, raw_response = self.label_client.classify_batch(query, docs) | |
| 533 | 313 | return [(labels, raw_response, docs)] |
| 534 | 314 | except Exception: |
| 535 | 315 | if len(docs) == 1: |
| ... | ... | @@ -727,8 +507,6 @@ class SearchEvaluationFramework: |
| 727 | 507 | "annotate_rerank_top_k": annotate_rerank_top_k, |
| 728 | 508 | "pool_size": len(pool_docs), |
| 729 | 509 | }, |
| 730 | - "labeler_mode": self.labeler_mode, | |
| 731 | - "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None, | |
| 732 | 510 | "metrics_top100": metrics, |
| 733 | 511 | "search_results": search_labeled_results, |
| 734 | 512 | "full_rerank_top": rerank_top_results, |
| ... | ... | @@ -903,8 +681,6 @@ class SearchEvaluationFramework: |
| 903 | 681 | "rebuild": rebuild_meta, |
| 904 | 682 | "ordered_union_size": pool_docs_count, |
| 905 | 683 | }, |
| 906 | - "labeler_mode": self.labeler_mode, | |
| 907 | - "query_profile": self.get_query_profile(query, force_refresh=False) if self.labeler_mode == "complex" else None, | |
| 908 | 684 | "metrics_top100": metrics, |
| 909 | 685 | "search_results": search_labeled_results, |
| 910 | 686 | "full_rerank_top": rerank_top_results, |
| ... | ... | @@ -970,7 +746,7 @@ class SearchEvaluationFramework: |
| 970 | 746 | relevant_missing_ids = [ |
| 971 | 747 | spu_id |
| 972 | 748 | for spu_id, label in labels.items() |
| 973 | - if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids | |
| 749 | + if label in RELEVANCE_NON_IRRELEVANT and spu_id not in recalled_spu_ids | |
| 974 | 750 | ] |
| 975 | 751 | missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids) |
| 976 | 752 | missing_relevant = [] |
| ... | ... | @@ -992,7 +768,12 @@ class SearchEvaluationFramework: |
| 992 | 768 | "product": compact_product_payload(doc), |
| 993 | 769 | } |
| 994 | 770 | ) |
| 995 | - label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2} | |
| 771 | + label_order = { | |
| 772 | + RELEVANCE_EXACT: 0, | |
| 773 | + RELEVANCE_HIGH: 1, | |
| 774 | + RELEVANCE_LOW: 2, | |
| 775 | + RELEVANCE_IRRELEVANT: 3, | |
| 776 | + } | |
| 996 | 777 | missing_relevant.sort( |
| 997 | 778 | key=lambda item: ( |
| 998 | 779 | label_order.get(str(item.get("label")), 9), |
| ... | ... | @@ -1010,7 +791,7 @@ class SearchEvaluationFramework: |
| 1010 | 791 | if unlabeled_hits: |
| 1011 | 792 | tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.") |
| 1012 | 793 | if not missing_relevant: |
| 1013 | - tips.append("No cached Exact/Partial products were missed by this recall set.") | |
| 794 | + tips.append("No cached non-irrelevant products were missed by this recall set.") | |
| 1014 | 795 | return { |
| 1015 | 796 | "query": query, |
| 1016 | 797 | "tenant_id": self.tenant_id, |
| ... | ... | @@ -1024,7 +805,8 @@ class SearchEvaluationFramework: |
| 1024 | 805 | "recalled_hits": len(labeled), |
| 1025 | 806 | "missing_relevant_count": len(missing_relevant), |
| 1026 | 807 | "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT), |
| 1027 | - "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL), | |
| 808 | + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH), | |
| 809 | + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW), | |
| 1028 | 810 | }, |
| 1029 | 811 | "tips": tips, |
| 1030 | 812 | "total": int(search_payload.get("total") or 0), |
| ... | ... | @@ -1065,7 +847,8 @@ class SearchEvaluationFramework: |
| 1065 | 847 | aggregate = aggregate_metrics([item["metrics"] for item in per_query]) |
| 1066 | 848 | aggregate_distribution = { |
| 1067 | 849 | RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), |
| 1068 | - RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query), | |
| 850 | + RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query), | |
| 851 | + RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query), | |
| 1069 | 852 | RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query), |
| 1070 | 853 | } |
| 1071 | 854 | batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" | ... | ... |
scripts/evaluation/eval_framework/metrics.py
| ... | ... | @@ -4,7 +4,7 @@ from __future__ import annotations |
| 4 | 4 | |
| 5 | 5 | from typing import Dict, Sequence |
| 6 | 6 | |
| 7 | -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL | |
| 7 | +from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_NON_IRRELEVANT | |
| 8 | 8 | |
| 9 | 9 | |
| 10 | 10 | def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float: |
| ... | ... | @@ -13,15 +13,17 @@ def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> fl |
| 13 | 13 | sliced = list(labels[:k]) |
| 14 | 14 | if not sliced: |
| 15 | 15 | return 0.0 |
| 16 | - hits = sum(1 for label in sliced if label in relevant) | |
| 16 | + rel = set(relevant) | |
| 17 | + hits = sum(1 for label in sliced if label in rel) | |
| 17 | 18 | return hits / float(min(k, len(sliced))) |
| 18 | 19 | |
| 19 | 20 | |
| 20 | 21 | def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float: |
| 22 | + rel = set(relevant) | |
| 21 | 23 | hit_count = 0 |
| 22 | 24 | precision_sum = 0.0 |
| 23 | 25 | for idx, label in enumerate(labels, start=1): |
| 24 | - if label not in relevant: | |
| 26 | + if label not in rel: | |
| 25 | 27 | continue |
| 26 | 28 | hit_count += 1 |
| 27 | 29 | precision_sum += hit_count / idx |
| ... | ... | @@ -31,12 +33,14 @@ def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float: |
| 31 | 33 | |
| 32 | 34 | |
| 33 | 35 | def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]: |
| 36 | + """P@k / MAP_3: Exact Match only. P@k_2_3 / MAP_2_3: any non-irrelevant tier (legacy metric names).""" | |
| 34 | 37 | metrics: Dict[str, float] = {} |
| 38 | + non_irrel = list(RELEVANCE_NON_IRRELEVANT) | |
| 35 | 39 | for k in (5, 10, 20, 50): |
| 36 | 40 | metrics[f"P@{k}"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT]), 6) |
| 37 | - metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6) | |
| 41 | + metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, non_irrel), 6) | |
| 38 | 42 | metrics["MAP_3"] = round(average_precision(labels, [RELEVANCE_EXACT]), 6) |
| 39 | - metrics["MAP_2_3"] = round(average_precision(labels, [RELEVANCE_EXACT, RELEVANCE_PARTIAL]), 6) | |
| 43 | + metrics["MAP_2_3"] = round(average_precision(labels, non_irrel), 6) | |
| 40 | 44 | return metrics |
| 41 | 45 | |
| 42 | 46 | |
| ... | ... | @@ -53,6 +57,7 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo |
| 53 | 57 | def label_distribution(labels: Sequence[str]) -> Dict[str, int]: |
| 54 | 58 | return { |
| 55 | 59 | RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT), |
| 56 | - RELEVANCE_PARTIAL: sum(1 for label in labels if label == RELEVANCE_PARTIAL), | |
| 60 | + RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH), | |
| 61 | + RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW), | |
| 57 | 62 | RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT), |
| 58 | 63 | } | ... | ... |
scripts/evaluation/eval_framework/prompts.py
| ... | ... | @@ -2,10 +2,9 @@ |
| 2 | 2 | |
| 3 | 3 | from __future__ import annotations |
| 4 | 4 | |
| 5 | -import json | |
| 6 | -from typing import Any, Dict, Sequence | |
| 5 | +from typing import Sequence | |
| 7 | 6 | |
| 8 | -_CLASSIFY_BATCH_SIMPLE_TEMPLATE = """You are a relevance judgment assistant for a fashion e-commerce search system. | |
| 7 | +_CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system. | |
| 9 | 8 | Given a user query and the information for each product, assign a relevance label to each product. |
| 10 | 9 | |
| 11 | 10 | Your goal is to judge relevance from the perspective of e-commerce search ranking. |
| ... | ... | @@ -154,7 +153,7 @@ The output lines must correspond to the products above in the same order. |
| 154 | 153 | Do not output anything else. |
| 155 | 154 | """ |
| 156 | 155 | |
| 157 | -_CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。 | |
| 156 | +_CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判断助手。 |
| 158 | 157 | 给定用户查询词以及每个商品的信息,请为每个商品分配一个相关性标签。 |
| 159 | 158 | |
| 160 | 159 | 你的目标是从电商搜索排序的角度,判断商品是否满足用户的购物意图。 |
| ... | ... | @@ -294,76 +293,7 @@ _CLASSIFY_BATCH_SIMPLE_TEMPLATE_ZH = """你是一个服饰电商搜索系统中 |
| 294 | 293 | """ |
| 295 | 294 | |
| 296 | 295 | |
| 297 | -def classify_batch_simple_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: | |
| 296 | +def classify_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: | |
| 298 | 297 | lines = "\n".join(numbered_doc_lines) |
| 299 | 298 | n = len(numbered_doc_lines) |
| 300 | - return _CLASSIFY_BATCH_SIMPLE_TEMPLATE.format(query=query, lines=lines, n=n) | |
| 301 | - | |
| 302 | - | |
| 303 | -_EXTRACT_QUERY_PROFILE_TEMPLATE = """You are building a structured intent profile for e-commerce relevance judging. | |
| 304 | -Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query. | |
| 305 | -Be conservative: only mark an attribute as required if the user explicitly asked for it. | |
| 306 | - | |
| 307 | -Return JSON with this schema: | |
| 308 | -{{ | |
| 309 | - "normalized_query_en": string, | |
| 310 | - "primary_category": string, | |
| 311 | - "allowed_categories": [string], | |
| 312 | - "required_attributes": [ | |
| 313 | - {{"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}} | |
| 314 | - ], | |
| 315 | - "notes": [string] | |
| 316 | -}} | |
| 317 | - | |
| 318 | -Guidelines: | |
| 319 | -- Exact later will require explicit evidence for all required attributes. | |
| 320 | -- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them. | |
| 321 | -- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact. | |
| 322 | -- If the query includes color, fit, silhouette, or length, include them as required_attributes. | |
| 323 | -- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight. | |
| 324 | -- For color, include conflicting colors only when clear from the query. | |
| 325 | - | |
| 326 | -Original query: {query} | |
| 327 | -Parser hints JSON: {hints_json} | |
| 328 | -""" | |
| 329 | - | |
| 330 | - | |
| 331 | -def extract_query_profile_prompt(query: str, parser_hints: Dict[str, Any]) -> str: | |
| 332 | - hints_json = json.dumps(parser_hints, ensure_ascii=False) | |
| 333 | - return _EXTRACT_QUERY_PROFILE_TEMPLATE.format(query=query, hints_json=hints_json) | |
| 334 | - | |
| 335 | - | |
| 336 | -_CLASSIFY_BATCH_COMPLEX_TEMPLATE = """You are an e-commerce search relevance judge. | |
| 337 | -Judge each product against the structured query profile below. | |
| 338 | - | |
| 339 | -Relevance rules: | |
| 340 | -- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact. | |
| 341 | -- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched. | |
| 342 | -- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts. | |
| 343 | -- Be conservative with Exact. | |
| 344 | -- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested. | |
| 345 | -- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries. | |
| 346 | - | |
| 347 | -Original query: {query} | |
| 348 | -Structured query profile JSON: {profile_json} | |
| 349 | - | |
| 350 | -Products: | |
| 351 | -{lines} | |
| 352 | - | |
| 353 | -Return JSON only, with schema: | |
| 354 | -{{"labels":[{{"index":1,"label":"Exact","reason":"short phrase"}}]}} | |
| 355 | -""" | |
| 356 | - | |
| 357 | - | |
| 358 | -def classify_batch_complex_prompt( | |
| 359 | - query: str, | |
| 360 | - query_profile: Dict[str, Any], | |
| 361 | - numbered_doc_lines: Sequence[str], | |
| 362 | -) -> str: | |
| 363 | - lines = "\n".join(numbered_doc_lines) | |
| 364 | - profile_json = json.dumps(query_profile, ensure_ascii=False) | |
| 365 | - return _CLASSIFY_BATCH_COMPLEX_TEMPLATE.format( | |
| 366 | - query=query, | |
| 367 | - profile_json=profile_json, | |
| 368 | - lines=lines, | |
| 369 | - ) | |
| 299 | + return _CLASSIFY_TEMPLATE_EN.format(query=query, lines=lines, n=n) | ... | ... |
scripts/evaluation/eval_framework/reports.py
| ... | ... | @@ -4,7 +4,7 @@ from __future__ import annotations |
| 4 | 4 | |
| 5 | 5 | from typing import Any, Dict |
| 6 | 6 | |
| 7 | -from .constants import RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, RELEVANCE_PARTIAL | |
| 7 | +from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW | |
| 8 | 8 | |
| 9 | 9 | |
| 10 | 10 | def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| ... | ... | @@ -29,8 +29,9 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| 29 | 29 | "", |
| 30 | 30 | "## Label Distribution", |
| 31 | 31 | "", |
| 32 | - f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}", | |
| 33 | - f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}", | |
| 32 | + f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}", | |
| 33 | + f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}", | |
| 34 | + f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}", | |
| 34 | 35 | f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}", |
| 35 | 36 | ] |
| 36 | 37 | ) |
| ... | ... | @@ -41,8 +42,9 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| 41 | 42 | for key, value in sorted((item.get("metrics") or {}).items()): |
| 42 | 43 | lines.append(f"- {key}: {value}") |
| 43 | 44 | distribution = item.get("distribution") or {} |
| 44 | - lines.append(f"- Exact: {distribution.get(RELEVANCE_EXACT, 0)}") | |
| 45 | - lines.append(f"- Partial: {distribution.get(RELEVANCE_PARTIAL, 0)}") | |
| 45 | + lines.append(f"- Exact Match: {distribution.get(RELEVANCE_EXACT, 0)}") | |
| 46 | + lines.append(f"- High Relevant: {distribution.get(RELEVANCE_HIGH, 0)}") | |
| 47 | + lines.append(f"- Low Relevant: {distribution.get(RELEVANCE_LOW, 0)}") | |
| 46 | 48 | lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}") |
| 47 | 49 | lines.append("") |
| 48 | 50 | return "\n".join(lines) | ... | ... |
scripts/evaluation/eval_framework/static/eval_web.css
| ... | ... | @@ -35,10 +35,11 @@ |
| 35 | 35 | .results { display: grid; gap: 10px; } |
| 36 | 36 | .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; } |
| 37 | 37 | .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; } |
| 38 | - .Exact { background: var(--exact); } | |
| 39 | - .Partial { background: var(--partial); } | |
| 40 | - .Irrelevant { background: var(--irrelevant); } | |
| 41 | - .Unknown { background: #637381; } | |
| 38 | + .label-exact-match { background: var(--exact); } | |
| 39 | + .label-high-relevant { background: var(--partial); } | |
| 40 | + .label-low-relevant { background: #6b5b95; } | |
| 41 | + .label-irrelevant { background: var(--irrelevant); } | |
| 42 | + .badge-unknown { background: #637381; } | |
| 42 | 43 | .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; } |
| 43 | 44 | .title { font-size: 16px; font-weight: 700; margin-bottom: 4px; } |
| 44 | 45 | .title-zh { font-size: 14px; font-weight: 500; color: var(--muted); margin-bottom: 8px; line-height: 1.4; } | ... | ... |
scripts/evaluation/eval_framework/static/eval_web.js
| ... | ... | @@ -13,6 +13,10 @@ |
| 13 | 13 | root.appendChild(card); |
| 14 | 14 | }); |
| 15 | 15 | } |
| 16 | + function labelBadgeClass(label) { | |
| 17 | + if (!label || label === 'Unknown') return 'badge-unknown'; | |
| 18 | + return 'label-' + String(label).toLowerCase().replace(/\s+/g, '-'); | |
| 19 | + } | |
| 16 | 20 | function renderResults(results, rootId='results', showRank=true) { |
| 17 | 21 | const mount = document.getElementById(rootId); |
| 18 | 22 | mount.innerHTML = ''; |
| ... | ... | @@ -21,7 +25,7 @@ |
| 21 | 25 | const box = document.createElement('div'); |
| 22 | 26 | box.className = 'result'; |
| 23 | 27 | box.innerHTML = ` |
| 24 | - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div> | |
| 28 | + <div><span class="badge ${labelBadgeClass(label)}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div> | |
| 25 | 29 | <img class="thumb" src="${item.image_url || ''}" alt="" /> |
| 26 | 30 | <div> |
| 27 | 31 | <div class="title">${item.title || ''}</div> |
| ... | ... | @@ -42,7 +46,7 @@ |
| 42 | 46 | const root = document.getElementById('tips'); |
| 43 | 47 | const tips = [...(data.tips || [])]; |
| 44 | 48 | const stats = data.label_stats || {}; |
| 45 | - tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`); | |
| 49 | + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed (non-irrelevant): ${stats.missing_relevant_count || 0} — Exact: ${stats.missing_exact_count || 0}, High: ${stats.missing_high_count || 0}, Low: ${stats.missing_low_count || 0}.`); | |
| 46 | 50 | root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join(''); |
| 47 | 51 | } |
| 48 | 52 | async function loadQueries() { | ... | ... |
scripts/evaluation/eval_framework/static/index.html
| ... | ... | @@ -37,7 +37,7 @@ |
| 37 | 37 | <div id="results" class="results"></div> |
| 38 | 38 | </section> |
| 39 | 39 | <section class="section"> |
| 40 | - <h2>Missed Exact / Partial</h2> | |
| 40 | + <h2>Missed non-irrelevant (cached)</h2> | |
| 41 | 41 | <div id="missingRelevant" class="results"></div> |
| 42 | 42 | </section> |
| 43 | 43 | <section class="section"> | ... | ... |
scripts/evaluation/eval_framework/store.py
| ... | ... | @@ -8,7 +8,7 @@ from dataclasses import dataclass |
| 8 | 8 | from pathlib import Path |
| 9 | 9 | from typing import Any, Dict, List, Optional, Sequence |
| 10 | 10 | |
| 11 | -from .constants import VALID_LABELS | |
| 11 | +from .constants import VALID_LABELS, normalize_stored_label | |
| 12 | 12 | from .utils import ensure_dir, safe_json_dumps, utc_now_iso |
| 13 | 13 | |
| 14 | 14 | |
| ... | ... | @@ -220,7 +220,7 @@ class EvalStore: |
| 220 | 220 | """, |
| 221 | 221 | (tenant_id, query_text), |
| 222 | 222 | ).fetchall() |
| 223 | - return {str(row["spu_id"]): str(row["label"]) for row in rows} | |
| 223 | + return {str(row["spu_id"]): normalize_stored_label(str(row["label"])) for row in rows} | |
| 224 | 224 | |
| 225 | 225 | def upsert_labels( |
| 226 | 226 | self, |
| ... | ... | @@ -379,8 +379,9 @@ class EvalStore: |
| 379 | 379 | SELECT |
| 380 | 380 | query_text, |
| 381 | 381 | COUNT(*) AS total, |
| 382 | - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, | |
| 383 | - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, | |
| 382 | + SUM(CASE WHEN label IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count, | |
| 383 | + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count, | |
| 384 | + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count, | |
| 384 | 385 | SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, |
| 385 | 386 | MAX(updated_at) AS updated_at |
| 386 | 387 | FROM relevance_labels |
| ... | ... | @@ -395,7 +396,8 @@ class EvalStore: |
| 395 | 396 | "query": str(row["query_text"]), |
| 396 | 397 | "total": int(row["total"]), |
| 397 | 398 | "exact_count": int(row["exact_count"] or 0), |
| 398 | - "partial_count": int(row["partial_count"] or 0), | |
| 399 | + "high_relevant_count": int(row["high_relevant_count"] or 0), | |
| 400 | + "low_relevant_count": int(row["low_relevant_count"] or 0), | |
| 399 | 401 | "irrelevant_count": int(row["irrelevant_count"] or 0), |
| 400 | 402 | "updated_at": row["updated_at"], |
| 401 | 403 | } |
| ... | ... | @@ -407,8 +409,9 @@ class EvalStore: |
| 407 | 409 | """ |
| 408 | 410 | SELECT |
| 409 | 411 | COUNT(*) AS total, |
| 410 | - SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, | |
| 411 | - SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, | |
| 412 | + SUM(CASE WHEN label IN ('Exact Match','Exact') THEN 1 ELSE 0 END) AS exact_count, | |
| 413 | + SUM(CASE WHEN label IN ('High Relevant','Partial') THEN 1 ELSE 0 END) AS high_relevant_count, | |
| 414 | + SUM(CASE WHEN label='Low Relevant' THEN 1 ELSE 0 END) AS low_relevant_count, | |
| 412 | 415 | SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, |
| 413 | 416 | MAX(updated_at) AS updated_at |
| 414 | 417 | FROM relevance_labels |
| ... | ... | @@ -420,7 +423,8 @@ class EvalStore: |
| 420 | 423 | "query": query_text, |
| 421 | 424 | "total": int((row["total"] or 0) if row else 0), |
| 422 | 425 | "exact_count": int((row["exact_count"] or 0) if row else 0), |
| 423 | - "partial_count": int((row["partial_count"] or 0) if row else 0), | |
| 426 | + "high_relevant_count": int((row["high_relevant_count"] or 0) if row else 0), | |
| 427 | + "low_relevant_count": int((row["low_relevant_count"] or 0) if row else 0), | |
| 424 | 428 | "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0), |
| 425 | 429 | "updated_at": row["updated_at"] if row else None, |
| 426 | 430 | } | ... | ... |
scripts/evaluation/quick_start_eval.sh renamed to scripts/evaluation/start_eval.sh
| ... | ... | @@ -10,7 +10,7 @@ QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" |
| 10 | 10 | |
| 11 | 11 | usage() { |
| 12 | 12 | echo "Usage: $0 batch|batch-rebuild|serve" |
| 13 | - echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50, simple)" | |
| 13 | + echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" | |
| 14 | 14 | echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" |
| 15 | 15 | echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" |
| 16 | 16 | echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" |
| ... | ... | @@ -22,8 +22,7 @@ case "${1:-}" in |
| 22 | 22 | --tenant-id "$TENANT_ID" \ |
| 23 | 23 | --queries-file "$QUERIES" \ |
| 24 | 24 | --top-k 50 \ |
| 25 | - --language en \ | |
| 26 | - --labeler-mode simple | |
| 25 | + --language en | |
| 27 | 26 | ;; |
| 28 | 27 | batch-rebuild) |
| 29 | 28 | exec "$PY" scripts/evaluation/build_annotation_set.py build \ |
| ... | ... | @@ -33,8 +32,7 @@ case "${1:-}" in |
| 33 | 32 | --rerank-depth 10000 \ |
| 34 | 33 | --force-refresh-rerank \ |
| 35 | 34 | --force-refresh-labels \ |
| 36 | - --language en \ | |
| 37 | - --labeler-mode simple | |
| 35 | + --language en | |
| 38 | 36 | ;; |
| 39 | 37 | serve) |
| 40 | 38 | EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" | ... | ... |