Commit d73ca84a48afc0945a533707c77ba3bbfaac9621
1 parent 1fdab52d
refine eval case snapshots and rename relevance levels
Showing 10 changed files with 180 additions and 83 deletions
scripts/evaluation/README.md
| ... | ... | @@ -127,8 +127,8 @@ This framework now follows graded ranking evaluation closer to e-commerce best p |
| 127 | 127 | - **Composite tuning score: `Primary_Metric_Score`** |
| 128 | 128 | For experiment ranking we compute the mean of the primary scorecard after normalizing `Avg_Grade@10` by the max grade (`3`). |
| 129 | 129 | - **Gain scheme** |
| 130 | - `Fully Relevant=7`, `Mostly Relevant=3`, `Weakly Relevant=1`, `Irrelevant=0` | |
| 131 | - The gains come from rel grades `3/2/1/0` with `gain = 2^rel - 1`, a standard `NDCG` setup. | |
| 130 | + `Fully Relevant=3`, `Mostly Relevant=2`, `Weakly Relevant=1`, `Irrelevant=0` | |
| 131 | + We keep the rel grades `3/2/1/0`, but the current implementation uses the grade values directly as gains, so the exact/high gap is less aggressive. | |
| 132 | 132 | - **Why this is better** |
| 133 | 133 | `NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping a `Fully Relevant` with a `Weakly Relevant` item is penalized more than swapping a `Mostly Relevant` with a `Weakly Relevant` item. |
| 134 | 134 | |
| ... | ... | @@ -174,6 +174,22 @@ Features: query list from `queries.txt`, single-query and batch evaluation, batc |
| 174 | 174 | |
| 175 | 175 | Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`. |
| 176 | 176 | |
| 177 | +To make later case analysis reproducible without digging through backend logs, each per-query record in the batch JSON now also includes: | |
| 178 | + | |
| 179 | +- `request_id` — the exact `X-Request-ID` sent by the evaluator for that live search call | |
| 180 | +- `top_label_sequence_top10` / `top_label_sequence_top20` — compact label sequence strings such as `1:L3 | 2:L1 | 3:L2` | |
| 181 | +- `top_results` — a lightweight top-20 snapshot with `rank`, `spu_id`, `label`, title fields, and `relevance_score` | |
| 182 | + | |
| 183 | +The Markdown report now surfaces the same case context in a lighter human-readable form: | |
| 184 | + | |
| 185 | +- request id | |
| 186 | +- top-10 / top-20 label sequence | |
| 187 | +- top-5 result snapshot for quick scanning | |
| 188 | + | |
| 189 | +This means a bad case can usually be reconstructed directly from the batch artifact itself, without replaying logs or joining SQLite tables by hand. | |
| 190 | + | |
| 191 | +The web history endpoint intentionally returns a compact summary only (aggregate metrics plus query count), so adding richer per-query snapshots to the batch payload does not bloat the history list UI. | |
| 192 | + | |
| 177 | 193 | ## Ranking debug and LTR prep |
| 178 | 194 | |
| 179 | 195 | `debug_info` now exposes two extra layers that are useful for tuning and future learning-to-rank work: | ... | ... |
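A quick worked example of the revised gain scheme described in the README hunk above — the helpers here are an illustrative, self-contained reimplementation (not the framework's `compute_query_metrics`), using the grade values `3/2/1/0` directly as gains:

```python
import math

# Grades as in RELEVANCE_GRADE_MAP; under the new scheme the gain IS the grade.
GRADES = {"Fully Relevant": 3, "Mostly Relevant": 2, "Weakly Relevant": 1, "Irrelevant": 0}

def dcg(labels):
    # Standard log2 position discount; gain = grade (not 2**grade - 1).
    return sum(GRADES[l] / math.log2(rank + 1) for rank, l in enumerate(labels, start=1))

def ndcg(labels):
    ideal = sorted(labels, key=GRADES.get, reverse=True)
    denom = dcg(ideal)
    return dcg(labels) / denom if denom else 0.0

ranking = ["Mostly Relevant", "Fully Relevant", "Weakly Relevant", "Irrelevant"]
print(round(ndcg(ranking), 4))  # ~0.92 for this ordering

# Swapping a Fully Relevant item with a Weakly Relevant one costs more NDCG
# than swapping Mostly Relevant with Weakly Relevant, which is the behaviour
# the README section argues for.
```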
scripts/evaluation/eval_framework/__init__.py
| ... | ... | @@ -14,10 +14,10 @@ from .constants import ( # noqa: E402 |
| 14 | 14 | DEFAULT_ARTIFACT_ROOT, |
| 15 | 15 | DEFAULT_QUERY_FILE, |
| 16 | 16 | PROJECT_ROOT, |
| 17 | - RELEVANCE_EXACT, | |
| 18 | - RELEVANCE_HIGH, | |
| 19 | - RELEVANCE_IRRELEVANT, | |
| 20 | - RELEVANCE_LOW, | |
| 17 | + RELEVANCE_LV0, | |
| 18 | + RELEVANCE_LV1, | |
| 19 | + RELEVANCE_LV2, | |
| 20 | + RELEVANCE_LV3, | |
| 21 | 21 | RELEVANCE_NON_IRRELEVANT, |
| 22 | 22 | VALID_LABELS, |
| 23 | 23 | ) |
| ... | ... | @@ -39,10 +39,10 @@ __all__ = [ |
| 39 | 39 | "EvalStore", |
| 40 | 40 | "PROJECT_ROOT", |
| 41 | 41 | "QueryBuildResult", |
| 42 | - "RELEVANCE_EXACT", | |
| 43 | - "RELEVANCE_HIGH", | |
| 44 | - "RELEVANCE_IRRELEVANT", | |
| 45 | - "RELEVANCE_LOW", | |
| 42 | + "RELEVANCE_LV0", | |
| 43 | + "RELEVANCE_LV1", | |
| 44 | + "RELEVANCE_LV2", | |
| 45 | + "RELEVANCE_LV3", | |
| 46 | 46 | "RELEVANCE_NON_IRRELEVANT", |
| 47 | 47 | "SearchEvaluationFramework", |
| 48 | 48 | "VALID_LABELS", | ... | ... |
scripts/evaluation/eval_framework/clients.py
| ... | ... | @@ -157,6 +157,7 @@ class SearchServiceClient: |
| 157 | 157 | return self._request_json("GET", path, timeout=timeout) |
| 158 | 158 | |
| 159 | 159 | def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]: |
| 160 | + request_id = uuid.uuid4().hex[:8] | |
| 160 | 161 | payload: Dict[str, Any] = { |
| 161 | 162 | "query": query, |
| 162 | 163 | "size": size, |
| ... | ... | @@ -165,13 +166,19 @@ class SearchServiceClient: |
| 165 | 166 | } |
| 166 | 167 | if debug: |
| 167 | 168 | payload["debug"] = True |
| 168 | - return self._request_json( | |
| 169 | + response = self._request_json( | |
| 169 | 170 | "POST", |
| 170 | 171 | "/search/", |
| 171 | 172 | timeout=120, |
| 172 | - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, | |
| 173 | + headers={ | |
| 174 | + "Content-Type": "application/json", | |
| 175 | + "X-Tenant-ID": self.tenant_id, | |
| 176 | + "X-Request-ID": request_id, | |
| 177 | + }, | |
| 173 | 178 | json_payload=payload, |
| 174 | 179 | ) |
| 180 | + response["_eval_request_id"] = request_id | |
| 181 | + return response | |
| 175 | 182 | |
| 176 | 183 | |
| 177 | 184 | class RerankServiceClient: | ... | ... |
scripts/evaluation/eval_framework/constants.py
| ... | ... | @@ -7,24 +7,24 @@ _SCRIPTS_EVAL_DIR = _PKG_DIR.parent |
| 7 | 7 | PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] |
| 8 | 8 | |
| 9 | 9 | # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN) |
| 10 | -RELEVANCE_EXACT = "Fully Relevant" | |
| 11 | -RELEVANCE_HIGH = "Mostly Relevant" | |
| 12 | -RELEVANCE_LOW = "Weakly Relevant" | |
| 13 | -RELEVANCE_IRRELEVANT = "Irrelevant" | |
| 10 | +RELEVANCE_LV3 = "Fully Relevant" | |
| 11 | +RELEVANCE_LV2 = "Mostly Relevant" | |
| 12 | +RELEVANCE_LV1 = "Weakly Relevant" | |
| 13 | +RELEVANCE_LV0 = "Irrelevant" | |
| 14 | 14 | |
| 15 | -VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) | |
| 15 | +VALID_LABELS = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1, RELEVANCE_LV0}) | |
| 16 | 16 | |
| 17 | 17 | # Useful label sets for binary diagnostic slices layered on top of graded ranking metrics. |
| 18 | -RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW}) | |
| 19 | -RELEVANCE_STRONG = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH}) | |
| 18 | +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1}) | |
| 19 | +RELEVANCE_STRONG = frozenset({RELEVANCE_LV3, RELEVANCE_LV2}) | |
| 20 | 20 | |
| 21 | 21 | # Graded relevance for ranking evaluation. |
| 22 | 22 | # We use rel grades 3/2/1/0 and gain = 2^rel - 1, which is standard for NDCG-style metrics. |
| 23 | 23 | RELEVANCE_GRADE_MAP = { |
| 24 | - RELEVANCE_EXACT: 3, | |
| 25 | - RELEVANCE_HIGH: 2, | |
| 26 | - RELEVANCE_LOW: 1, | |
| 27 | - RELEVANCE_IRRELEVANT: 0, | |
| 24 | + RELEVANCE_LV3: 3, | |
| 25 | + RELEVANCE_LV2: 2, | |
| 26 | + RELEVANCE_LV1: 1, | |
| 27 | + RELEVANCE_LV0: 0, | |
| 28 | 28 | } |
| 29 | 29 | # The standard gain formula is 2^rel - 1, |
| 30 | 30 | # but because annotation quality is not especially precise, we deliberately soften the exact/high distinction |
| ... | ... | @@ -36,10 +36,10 @@ RELEVANCE_GAIN_MAP = { |
| 36 | 36 | |
| 37 | 37 | # P(stop | relevance) for ERR (Expected Reciprocal Rank); cascade model (Chapelle et al., 2009). |
| 38 | 38 | STOP_PROB_MAP = { |
| 39 | - RELEVANCE_EXACT: 0.99, | |
| 40 | - RELEVANCE_HIGH: 0.8, | |
| 41 | - RELEVANCE_LOW: 0.1, | |
| 42 | - RELEVANCE_IRRELEVANT: 0.0, | |
| 39 | + RELEVANCE_LV3: 0.99, | |
| 40 | + RELEVANCE_LV2: 0.8, | |
| 41 | + RELEVANCE_LV1: 0.1, | |
| 42 | + RELEVANCE_LV0: 0.0, | |
| 43 | 43 | } |
| 44 | 44 | |
| 45 | 45 | DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" |
| ... | ... | @@ -78,7 +78,7 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 |
| 78 | 78 | # A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``): |
| 79 | 79 | # - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%), |
| 80 | 80 | # - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%). |
| 81 | -# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Weakly Relevant"). | |
| 81 | +# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LV1`` ("Weakly Relevant"). | |
| 82 | 82 | # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak |
| 83 | 83 | # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). |
| 84 | 84 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799 | ... | ... |
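For context on `STOP_PROB_MAP`, this is a minimal sketch of the cascade-model ERR it feeds (Chapelle et al., 2009). It is an independent reimplementation for illustration only; the framework's own metric lives in `metrics.py`:

```python
# P(stop | relevance), mirroring STOP_PROB_MAP above.
STOP_PROB = {"Fully Relevant": 0.99, "Mostly Relevant": 0.8,
             "Weakly Relevant": 0.1, "Irrelevant": 0.0}

def err(labels):
    score, p_continue = 0.0, 1.0
    for rank, label in enumerate(labels, start=1):
        r = STOP_PROB.get(label, 0.0)    # chance the user is satisfied and stops here
        score += p_continue * r / rank   # reciprocal-rank credit, weighted by reaching this rank
        p_continue *= (1.0 - r)          # chance the user keeps scanning past this rank
    return score

print(round(err(["Mostly Relevant", "Fully Relevant", "Irrelevant"]), 4))  # -> 0.899
```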
scripts/evaluation/eval_framework/framework.py
| ... | ... | @@ -25,14 +25,14 @@ from .constants import ( |
| 25 | 25 | DEFAULT_RERANK_HIGH_SKIP_COUNT, |
| 26 | 26 | DEFAULT_RERANK_HIGH_THRESHOLD, |
| 27 | 27 | DEFAULT_SEARCH_RECALL_TOP_K, |
| 28 | - RELEVANCE_EXACT, | |
| 29 | 28 | RELEVANCE_GAIN_MAP, |
| 30 | - RELEVANCE_HIGH, | |
| 31 | - STOP_PROB_MAP, | |
| 32 | - RELEVANCE_IRRELEVANT, | |
| 33 | - RELEVANCE_LOW, | |
| 29 | + RELEVANCE_LV0, | |
| 30 | + RELEVANCE_LV1, | |
| 31 | + RELEVANCE_LV2, | |
| 32 | + RELEVANCE_LV3, | |
| 34 | 33 | RELEVANCE_NON_IRRELEVANT, |
| 35 | 34 | VALID_LABELS, |
| 35 | + STOP_PROB_MAP, | |
| 36 | 36 | ) |
| 37 | 37 | from .metrics import ( |
| 38 | 38 | PRIMARY_METRIC_GRADE_NORMALIZER, |
| ... | ... | @@ -96,6 +96,16 @@ def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: |
| 96 | 96 | return out |
| 97 | 97 | |
| 98 | 98 | |
| 99 | +def _encode_label_sequence(items: Sequence[Dict[str, Any]], limit: int) -> str: | |
| 100 | + parts: List[str] = [] | |
| 101 | + for item in items[:limit]: | |
| 102 | + rank = int(item.get("rank") or 0) | |
| 103 | + label = str(item.get("label") or "") | |
| 104 | + grade = RELEVANCE_GAIN_MAP.get(label) | |
| 105 | + parts.append(f"{rank}:L{grade}" if grade is not None else f"{rank}:?") | |
| 106 | + return " | ".join(parts) | |
| 107 | + | |
| 108 | + | |
| 99 | 109 | class SearchEvaluationFramework: |
| 100 | 110 | def __init__( |
| 101 | 111 | self, |
| ... | ... | @@ -168,7 +178,7 @@ class SearchEvaluationFramework: |
| 168 | 178 | ) -> Dict[str, Any]: |
| 169 | 179 | live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) |
| 170 | 180 | labels = [ |
| 171 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 181 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 | |
| 172 | 182 | for item in live["results"] |
| 173 | 183 | ] |
| 174 | 184 | return { |
| ... | ... | @@ -432,7 +442,7 @@ class SearchEvaluationFramework: |
| 432 | 442 | |
| 433 | 443 | - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and |
| 434 | 444 | - ``( #(Irrelevant) + #(Weakly Relevant) ) / n > irrelevant_low_combined_stop_ratio`` |
| 435 | - (default 0.959; weak relevance = ``RELEVANCE_LOW``). | |
| 445 | + (default 0.959; weak relevance = ``RELEVANCE_LV1``). | |
| 436 | 446 | |
| 437 | 447 | Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. |
| 438 | 448 | Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached |
| ... | ... | @@ -474,9 +484,9 @@ class SearchEvaluationFramework: |
| 474 | 484 | time.sleep(0.1) |
| 475 | 485 | |
| 476 | 486 | n = len(batch_docs) |
| 477 | - exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_EXACT) | |
| 478 | - irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT) | |
| 479 | - low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LOW) | |
| 487 | + exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV3) | |
| 488 | + irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV0) | |
| 489 | + low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV1) | |
| 480 | 490 | exact_ratio = exact_n / n if n else 0.0 |
| 481 | 491 | irrelevant_ratio = irrel_n / n if n else 0.0 |
| 482 | 492 | low_ratio = low_n / n if n else 0.0 |
| ... | ... | @@ -633,7 +643,7 @@ class SearchEvaluationFramework: |
| 633 | 643 | ) |
| 634 | 644 | |
| 635 | 645 | top100_labels = [ |
| 636 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 646 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 | |
| 637 | 647 | for item in search_labeled_results[:100] |
| 638 | 648 | ] |
| 639 | 649 | metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) |
| ... | ... | @@ -843,7 +853,7 @@ class SearchEvaluationFramework: |
| 843 | 853 | ) |
| 844 | 854 | |
| 845 | 855 | top100_labels = [ |
| 846 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 856 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 | |
| 847 | 857 | for item in search_labeled_results[:100] |
| 848 | 858 | ] |
| 849 | 859 | metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) |
| ... | ... | @@ -920,16 +930,17 @@ class SearchEvaluationFramework: |
| 920 | 930 | "title_zh": title_zh if title_zh and title_zh != primary_title else "", |
| 921 | 931 | "image_url": doc.get("image_url"), |
| 922 | 932 | "label": label, |
| 933 | + "relevance_score": doc.get("relevance_score"), | |
| 923 | 934 | "option_values": list(compact_option_values(doc.get("skus") or [])), |
| 924 | 935 | "product": compact_product_payload(doc), |
| 925 | 936 | } |
| 926 | 937 | ) |
| 927 | 938 | metric_labels = [ |
| 928 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 939 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 | |
| 929 | 940 | for item in labeled |
| 930 | 941 | ] |
| 931 | 942 | ideal_labels = [ |
| 932 | - label if label in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 943 | + label if label in VALID_LABELS else RELEVANCE_LV0 | |
| 933 | 944 | for label in labels.values() |
| 934 | 945 | ] |
| 935 | 946 | label_stats = self.store.get_query_label_stats(self.tenant_id, query) |
| ... | ... | @@ -960,10 +971,10 @@ class SearchEvaluationFramework: |
| 960 | 971 | } |
| 961 | 972 | ) |
| 962 | 973 | label_order = { |
| 963 | - RELEVANCE_EXACT: 0, | |
| 964 | - RELEVANCE_HIGH: 1, | |
| 965 | - RELEVANCE_LOW: 2, | |
| 966 | - RELEVANCE_IRRELEVANT: 3, | |
| 974 | + RELEVANCE_LV3: 0, | |
| 975 | + RELEVANCE_LV2: 1, | |
| 976 | + RELEVANCE_LV1: 2, | |
| 977 | + RELEVANCE_LV0: 3, | |
| 967 | 978 | } |
| 968 | 979 | missing_relevant.sort( |
| 969 | 980 | key=lambda item: ( |
| ... | ... | @@ -989,6 +1000,7 @@ class SearchEvaluationFramework: |
| 989 | 1000 | "top_k": top_k, |
| 990 | 1001 | "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels), |
| 991 | 1002 | "metric_context": _metric_context_payload(), |
| 1003 | + "request_id": str(search_payload.get("_eval_request_id") or ""), | |
| 992 | 1004 | "results": labeled, |
| 993 | 1005 | "missing_relevant": missing_relevant, |
| 994 | 1006 | "label_stats": { |
| ... | ... | @@ -996,9 +1008,9 @@ class SearchEvaluationFramework: |
| 996 | 1008 | "unlabeled_hits_treated_irrelevant": unlabeled_hits, |
| 997 | 1009 | "recalled_hits": len(labeled), |
| 998 | 1010 | "missing_relevant_count": len(missing_relevant), |
| 999 | - "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT), | |
| 1000 | - "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH), | |
| 1001 | - "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW), | |
| 1011 | + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV3), | |
| 1012 | + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV2), | |
| 1013 | + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV1), | |
| 1002 | 1014 | }, |
| 1003 | 1015 | "tips": tips, |
| 1004 | 1016 | "total": int(search_payload.get("total") or 0), |
| ... | ... | @@ -1014,6 +1026,7 @@ class SearchEvaluationFramework: |
| 1014 | 1026 | force_refresh_labels: bool = False, |
| 1015 | 1027 | ) -> Dict[str, Any]: |
| 1016 | 1028 | per_query = [] |
| 1029 | + case_snapshot_top_n = min(max(int(top_k), 1), 20) | |
| 1017 | 1030 | total_q = len(queries) |
| 1018 | 1031 | _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate) |
| 1019 | 1032 | for q_index, query in enumerate(queries, start=1): |
| ... | ... | @@ -1025,7 +1038,7 @@ class SearchEvaluationFramework: |
| 1025 | 1038 | force_refresh_labels=force_refresh_labels, |
| 1026 | 1039 | ) |
| 1027 | 1040 | labels = [ |
| 1028 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 1041 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 | |
| 1029 | 1042 | for item in live["results"] |
| 1030 | 1043 | ] |
| 1031 | 1044 | per_query.append( |
| ... | ... | @@ -1036,6 +1049,21 @@ class SearchEvaluationFramework: |
| 1036 | 1049 | "metrics": live["metrics"], |
| 1037 | 1050 | "distribution": label_distribution(labels), |
| 1038 | 1051 | "total": live["total"], |
| 1052 | + "request_id": live.get("request_id") or "", | |
| 1053 | + "case_snapshot_top_n": case_snapshot_top_n, | |
| 1054 | + "top_label_sequence_top10": _encode_label_sequence(live["results"], 10), | |
| 1055 | + "top_label_sequence_top20": _encode_label_sequence(live["results"], case_snapshot_top_n), | |
| 1056 | + "top_results": [ | |
| 1057 | + { | |
| 1058 | + "rank": int(item.get("rank") or 0), | |
| 1059 | + "spu_id": str(item.get("spu_id") or ""), | |
| 1060 | + "label": item.get("label"), | |
| 1061 | + "title": item.get("title"), | |
| 1062 | + "title_zh": item.get("title_zh"), | |
| 1063 | + "relevance_score": item.get("relevance_score"), | |
| 1064 | + } | |
| 1065 | + for item in live["results"][:case_snapshot_top_n] | |
| 1066 | + ], | |
| 1039 | 1067 | } |
| 1040 | 1068 | ) |
| 1041 | 1069 | m = live["metrics"] |
| ... | ... | @@ -1055,10 +1083,10 @@ class SearchEvaluationFramework: |
| 1055 | 1083 | ) |
| 1056 | 1084 | aggregate = aggregate_metrics([item["metrics"] for item in per_query]) |
| 1057 | 1085 | aggregate_distribution = { |
| 1058 | - RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), | |
| 1059 | - RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query), | |
| 1060 | - RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query), | |
| 1061 | - RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query), | |
| 1086 | + RELEVANCE_LV3: sum(item["distribution"][RELEVANCE_LV3] for item in per_query), | |
| 1087 | + RELEVANCE_LV2: sum(item["distribution"][RELEVANCE_LV2] for item in per_query), | |
| 1088 | + RELEVANCE_LV1: sum(item["distribution"][RELEVANCE_LV1] for item in per_query), | |
| 1089 | + RELEVANCE_LV0: sum(item["distribution"][RELEVANCE_LV0] for item in per_query), | |
| 1062 | 1090 | } |
| 1063 | 1091 | batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" |
| 1064 | 1092 | report_dir = ensure_dir(self.artifact_root / "batch_reports") | ... | ... |
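The compact label-sequence strings stored on each per-query record can be reproduced with a standalone version of the new `_encode_label_sequence` helper; the snippet below reimplements it (with an inlined gain map) purely for illustration:

```python
# Mirrors framework._encode_label_sequence; the gain map is inlined so the
# snippet runs without importing the package.
GAIN = {"Fully Relevant": 3, "Mostly Relevant": 2, "Weakly Relevant": 1, "Irrelevant": 0}

def encode_label_sequence(items, limit):
    parts = []
    for item in items[:limit]:
        rank = int(item.get("rank") or 0)
        grade = GAIN.get(str(item.get("label") or ""))
        parts.append(f"{rank}:L{grade}" if grade is not None else f"{rank}:?")
    return " | ".join(parts)

results = [
    {"rank": 1, "label": "Fully Relevant"},
    {"rank": 2, "label": "Weakly Relevant"},
    {"rank": 3, "label": "Mostly Relevant"},
]
print(encode_label_sequence(results, 10))  # -> 1:L3 | 2:L1 | 3:L2
```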
scripts/evaluation/eval_framework/metrics.py
| ... | ... | @@ -6,12 +6,12 @@ import math |
| 6 | 6 | from typing import Dict, Iterable, Sequence |
| 7 | 7 | |
| 8 | 8 | from .constants import ( |
| 9 | - RELEVANCE_EXACT, | |
| 10 | 9 | RELEVANCE_GAIN_MAP, |
| 11 | 10 | RELEVANCE_GRADE_MAP, |
| 12 | - RELEVANCE_HIGH, | |
| 13 | - RELEVANCE_IRRELEVANT, | |
| 14 | - RELEVANCE_LOW, | |
| 11 | + RELEVANCE_LV0, | |
| 12 | + RELEVANCE_LV1, | |
| 13 | + RELEVANCE_LV2, | |
| 14 | + RELEVANCE_LV3, | |
| 15 | 15 | RELEVANCE_NON_IRRELEVANT, |
| 16 | 16 | RELEVANCE_STRONG, |
| 17 | 17 | STOP_PROB_MAP, |
| ... | ... | @@ -33,7 +33,7 @@ PRIMARY_METRIC_GRADE_NORMALIZER = float(max(RELEVANCE_GRADE_MAP.values()) or 1.0 |
| 33 | 33 | def _normalize_label(label: str) -> str: |
| 34 | 34 | if label in RELEVANCE_GRADE_MAP: |
| 35 | 35 | return label |
| 36 | - return RELEVANCE_IRRELEVANT | |
| 36 | + return RELEVANCE_LV0 | |
| 37 | 37 | |
| 38 | 38 | |
| 39 | 39 | def _gains_for_labels(labels: Sequence[str]) -> list[float]: |
| ... | ... | @@ -135,7 +135,7 @@ def compute_query_metrics( |
| 135 | 135 | ideal = list(ideal_labels) if ideal_labels is not None else list(labels) |
| 136 | 136 | metrics: Dict[str, float] = {} |
| 137 | 137 | |
| 138 | - exact_hits = _binary_hits(labels, [RELEVANCE_EXACT]) | |
| 138 | + exact_hits = _binary_hits(labels, [RELEVANCE_LV3]) | |
| 139 | 139 | strong_hits = _binary_hits(labels, RELEVANCE_STRONG) |
| 140 | 140 | useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT) |
| 141 | 141 | |
| ... | ... | @@ -183,8 +183,8 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo |
| 183 | 183 | |
| 184 | 184 | def label_distribution(labels: Sequence[str]) -> Dict[str, int]: |
| 185 | 185 | return { |
| 186 | - RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT), | |
| 187 | - RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH), | |
| 188 | - RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW), | |
| 189 | - RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT), | |
| 186 | + RELEVANCE_LV3: sum(1 for label in labels if label == RELEVANCE_LV3), | |
| 187 | + RELEVANCE_LV2: sum(1 for label in labels if label == RELEVANCE_LV2), | |
| 188 | + RELEVANCE_LV1: sum(1 for label in labels if label == RELEVANCE_LV1), | |
| 189 | + RELEVANCE_LV0: sum(1 for label in labels if label == RELEVANCE_LV0), | |
| 190 | 190 | } | ... | ... |
scripts/evaluation/eval_framework/reports.py
| ... | ... | @@ -4,7 +4,7 @@ from __future__ import annotations |
| 4 | 4 | |
| 5 | 5 | from typing import Any, Dict |
| 6 | 6 | |
| 7 | -from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW | |
| 7 | +from .constants import RELEVANCE_GAIN_MAP, RELEVANCE_LV0, RELEVANCE_LV1, RELEVANCE_LV2, RELEVANCE_LV3 | |
| 8 | 8 | from .metrics import PRIMARY_METRIC_KEYS |
| 9 | 9 | |
| 10 | 10 | |
| ... | ... | @@ -25,6 +25,38 @@ def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None: |
| 25 | 25 | lines.append(f"- {key}: {value}") |
| 26 | 26 | |
| 27 | 27 | |
| 28 | +def _label_level_code(label: str) -> str: | |
| 29 | + grade = RELEVANCE_GAIN_MAP.get(label) | |
| 30 | + return f"L{grade}" if grade is not None else "?" | |
| 31 | + | |
| 32 | + | |
| 33 | +def _append_case_snapshot(lines: list[str], item: Dict[str, Any]) -> None: | |
| 34 | + request_id = str(item.get("request_id") or "").strip() | |
| 35 | + if request_id: | |
| 36 | + lines.append(f"- Request ID: `{request_id}`") | |
| 37 | + seq10 = str(item.get("top_label_sequence_top10") or "").strip() | |
| 38 | + if seq10: | |
| 39 | + lines.append(f"- Top-10 Labels: `{seq10}`") | |
| 40 | + seq20 = str(item.get("top_label_sequence_top20") or "").strip() | |
| 41 | + if seq20 and seq20 != seq10: | |
| 42 | + lines.append(f"- Top-20 Labels: `{seq20}`") | |
| 43 | + top_results = item.get("top_results") or [] | |
| 44 | + if not top_results: | |
| 45 | + return | |
| 46 | + lines.append("- Case Snapshot:") | |
| 47 | + for result in top_results[:5]: | |
| 48 | + rank = int(result.get("rank") or 0) | |
| 49 | + label = _label_level_code(str(result.get("label") or "")) | |
| 50 | + spu_id = str(result.get("spu_id") or "") | |
| 51 | + title = str(result.get("title") or "") | |
| 52 | + title_zh = str(result.get("title_zh") or "") | |
| 53 | + relevance_score = result.get("relevance_score") | |
| 54 | + score_suffix = f" (rel={relevance_score})" if relevance_score not in (None, "") else "" | |
| 55 | + lines.append(f" - #{rank} [{label}] spu={spu_id} {title}{score_suffix}") | |
| 56 | + if title_zh: | |
| 57 | + lines.append(f" zh: {title_zh}") | |
| 58 | + | |
| 59 | + | |
| 28 | 60 | def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| 29 | 61 | lines = [ |
| 30 | 62 | "# Search Batch Evaluation", |
| ... | ... | @@ -56,10 +88,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| 56 | 88 | "", |
| 57 | 89 | "## Label Distribution", |
| 58 | 90 | "", |
| 59 | - f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}", | |
| 60 | - f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}", | |
| 61 | - f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}", | |
| 62 | - f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}", | |
| 91 | + f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}", | |
| 92 | + f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}", | |
| 93 | + f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}", | |
| 94 | + f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}", | |
| 63 | 95 | ] |
| 64 | 96 | ) |
| 65 | 97 | lines.extend(["", "## Per Query", ""]) |
| ... | ... | @@ -68,9 +100,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| 68 | 100 | lines.append("") |
| 69 | 101 | _append_metric_block(lines, item.get("metrics") or {}) |
| 70 | 102 | distribution = item.get("distribution") or {} |
| 71 | - lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}") | |
| 72 | - lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}") | |
| 73 | - lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}") | |
| 74 | - lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}") | |
| 103 | + lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}") | |
| 104 | + lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}") | |
| 105 | + lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}") | |
| 106 | + lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}") | |
| 107 | + _append_case_snapshot(lines, item) | |
| 75 | 108 | lines.append("") |
| 76 | 109 | return "\n".join(lines) | ... | ... |
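To see what the per-query case block in the Markdown report looks like, the new `_append_case_snapshot` can be exercised directly. This assumes the repo root is importable (the same setup `offline_ltr_fit.py` uses); the item payload below is illustrative sample data, not real output:

```python
from scripts.evaluation.eval_framework.reports import _append_case_snapshot

lines: list[str] = []
_append_case_snapshot(
    lines,
    {
        "request_id": "a1b2c3d4",                        # illustrative 8-char id
        "top_label_sequence_top10": "1:L3 | 2:L1 | 3:L2",
        "top_results": [
            {"rank": 1, "spu_id": "12345", "label": "Fully Relevant",
             "title": "Example product", "title_zh": "", "relevance_score": 0.91},
        ],
    },
)
print("\n".join(lines))
# - Request ID: `a1b2c3d4`
# - Top-10 Labels: `1:L3 | 2:L1 | 3:L2`
# - Case Snapshot:
#   - #1 [L3] spu=12345 Example product (rel=0.91)
```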
scripts/evaluation/eval_framework/static/eval_web.js
| ... | ... | @@ -190,7 +190,7 @@ async function loadQueries() { |
| 190 | 190 | |
| 191 | 191 | function historySummaryHtml(meta) { |
| 192 | 192 | const m = meta && meta.aggregate_metrics; |
| 193 | - const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null; | |
| 193 | + const nq = (meta && meta.query_count) || (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null; | |
| 194 | 194 | const parts = []; |
| 195 | 195 | if (nq != null) parts.push(`<span>Queries</span> ${nq}`); |
| 196 | 196 | if (m && m["Primary_Metric_Score"] != null) parts.push(`<span>Primary</span> ${fmtNumber(m["Primary_Metric_Score"])}`); | ... | ... |
scripts/evaluation/eval_framework/store.py
| ... | ... | @@ -23,6 +23,18 @@ class QueryBuildResult: |
| 23 | 23 | output_json_path: Path |
| 24 | 24 | |
| 25 | 25 | |
| 26 | +def _compact_batch_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]: | |
| 27 | + return { | |
| 28 | + "batch_id": metadata.get("batch_id"), | |
| 29 | + "created_at": metadata.get("created_at"), | |
| 30 | + "tenant_id": metadata.get("tenant_id"), | |
| 31 | + "top_k": metadata.get("top_k"), | |
| 32 | + "query_count": len(metadata.get("queries") or []), | |
| 33 | + "aggregate_metrics": dict(metadata.get("aggregate_metrics") or {}), | |
| 34 | + "metric_context": dict(metadata.get("metric_context") or {}), | |
| 35 | + } | |
| 36 | + | |
| 37 | + | |
| 26 | 38 | class EvalStore: |
| 27 | 39 | def __init__(self, db_path: Path): |
| 28 | 40 | self.db_path = db_path |
| ... | ... | @@ -339,6 +351,7 @@ class EvalStore: |
| 339 | 351 | ).fetchall() |
| 340 | 352 | items: List[Dict[str, Any]] = [] |
| 341 | 353 | for row in rows: |
| 354 | + metadata = json.loads(row["metadata_json"]) | |
| 342 | 355 | items.append( |
| 343 | 356 | { |
| 344 | 357 | "batch_id": row["batch_id"], |
| ... | ... | @@ -346,7 +359,7 @@ class EvalStore: |
| 346 | 359 | "output_json_path": row["output_json_path"], |
| 347 | 360 | "report_markdown_path": row["report_markdown_path"], |
| 348 | 361 | "config_snapshot_path": row["config_snapshot_path"], |
| 349 | - "metadata": json.loads(row["metadata_json"]), | |
| 362 | + "metadata": _compact_batch_metadata(metadata), | |
| 350 | 363 | "created_at": row["created_at"], |
| 351 | 364 | } |
| 352 | 365 | ) | ... | ... |
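The history endpoint's compact metadata can be checked the same way; `_compact_batch_metadata` keeps only the summary fields, so the heavy per-query snapshots never reach the history list. The metadata dict below is illustrative, and only its top-level keys mirror the real batch payload:

```python
from scripts.evaluation.eval_framework.store import _compact_batch_metadata

metadata = {
    "batch_id": "batch_example",
    "created_at": "2024-01-01T00:00:00Z",
    "tenant_id": "demo",
    "top_k": 20,
    "queries": ["red dress", "running shoes"],
    "aggregate_metrics": {"Primary_Metric_Score": 0.71},
    "metric_context": {"primary_metric": "Primary_Metric_Score"},
    "per_query": [{"query": "red dress", "top_results": ["...heavy snapshot..."]}],
}

compact = _compact_batch_metadata(metadata)
print(compact["query_count"])       # 2 — the history UI gets counts, not snapshots
assert "per_query" not in compact   # per-query detail stays in the batch JSON on disk
```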
scripts/evaluation/offline_ltr_fit.py
| ... | ... | @@ -23,11 +23,11 @@ if str(PROJECT_ROOT) not in sys.path: |
| 23 | 23 | |
| 24 | 24 | from scripts.evaluation.eval_framework.constants import ( |
| 25 | 25 | DEFAULT_ARTIFACT_ROOT, |
| 26 | - RELEVANCE_EXACT, | |
| 27 | 26 | RELEVANCE_GRADE_MAP, |
| 28 | - RELEVANCE_HIGH, | |
| 29 | - RELEVANCE_IRRELEVANT, | |
| 30 | - RELEVANCE_LOW, | |
| 27 | + RELEVANCE_LV0, | |
| 28 | + RELEVANCE_LV1, | |
| 29 | + RELEVANCE_LV2, | |
| 30 | + RELEVANCE_LV3, | |
| 31 | 31 | ) |
| 32 | 32 | from scripts.evaluation.eval_framework.metrics import aggregate_metrics, compute_query_metrics |
| 33 | 33 | from scripts.evaluation.eval_framework.store import EvalStore |
| ... | ... | @@ -35,10 +35,10 @@ from scripts.evaluation.eval_framework.utils import ensure_dir, utc_timestamp |
| 35 | 35 | |
| 36 | 36 | |
| 37 | 37 | LABELS_BY_GRADE = { |
| 38 | - 3: RELEVANCE_EXACT, | |
| 39 | - 2: RELEVANCE_HIGH, | |
| 40 | - 1: RELEVANCE_LOW, | |
| 41 | - 0: RELEVANCE_IRRELEVANT, | |
| 38 | + 3: RELEVANCE_LV3, | |
| 39 | + 2: RELEVANCE_LV2, | |
| 40 | + 1: RELEVANCE_LV1, | |
| 41 | + 0: RELEVANCE_LV0, | |
| 42 | 42 | } |
| 43 | 43 | |
| 44 | 44 | ... | ... |