Commit d73ca84a48afc0945a533707c77ba3bbfaac9621
1 parent
1fdab52d
refine eval case snapshots and rename relevance levels
Showing
10 changed files
with
180 additions
and
83 deletions
Show diff stats
scripts/evaluation/README.md
| @@ -127,8 +127,8 @@ This framework now follows graded ranking evaluation closer to e-commerce best p | @@ -127,8 +127,8 @@ This framework now follows graded ranking evaluation closer to e-commerce best p | ||
| 127 | - **Composite tuning score: `Primary_Metric_Score`** | 127 | - **Composite tuning score: `Primary_Metric_Score`** |
| 128 | For experiment ranking we compute the mean of the primary scorecard after normalizing `Avg_Grade@10` by the max grade (`3`). | 128 | For experiment ranking we compute the mean of the primary scorecard after normalizing `Avg_Grade@10` by the max grade (`3`). |
| 129 | - **Gain scheme** | 129 | - **Gain scheme** |
| 130 | - `Fully Relevant=7`, `Mostly Relevant=3`, `Weakly Relevant=1`, `Irrelevant=0` | ||
| 131 | - The gains come from rel grades `3/2/1/0` with `gain = 2^rel - 1`, a standard `NDCG` setup. | 130 | + `Fully Relevant=3`, `Mostly Relevant=2`, `Weakly Relevant=1`, `Irrelevant=0` |
| 131 | + We keep the rel grades `3/2/1/0`, but the current implementation uses the grade values directly as gains so the exact/high gap is less aggressive. | ||
| 132 | - **Why this is better** | 132 | - **Why this is better** |
| 133 | `NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping a `Fully Relevant` with a `Weakly Relevant` item is penalized more than swapping `Mostly Relevant` with `Weakly Relevant`. | 133 | `NDCG` differentiates “exact”, “strong substitute”, and “weak substitute”, so swapping a `Fully Relevant` with a `Weakly Relevant` item is penalized more than swapping `Mostly Relevant` with `Weakly Relevant`. |
| 134 | 134 | ||
| @@ -174,6 +174,22 @@ Features: query list from `queries.txt`, single-query and batch evaluation, batc | @@ -174,6 +174,22 @@ Features: query list from `queries.txt`, single-query and batch evaluation, batc | ||
| 174 | 174 | ||
| 175 | Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`. | 175 | Each run stores aggregate and per-query metrics, label distribution, timestamp, metric context (including gain scheme and primary metric), and an `/admin/config` snapshot, as Markdown and JSON under `batch_reports/`. |
| 176 | 176 | ||
| 177 | +To make later case analysis reproducible without digging through backend logs, each per-query record in the batch JSON now also includes: | ||
| 178 | + | ||
| 179 | +- `request_id` — the exact `X-Request-ID` sent by the evaluator for that live search call | ||
| 180 | +- `top_label_sequence_top10` / `top_label_sequence_top20` — compact label sequence strings such as `1:L3 | 2:L1 | 3:L2` | ||
| 181 | +- `top_results` — a lightweight top-20 snapshot with `rank`, `spu_id`, `label`, title fields, and `relevance_score` | ||
| 182 | + | ||
| 183 | +The Markdown report now surfaces the same case context in a lighter human-readable form: | ||
| 184 | + | ||
| 185 | +- request id | ||
| 186 | +- top-10 / top-20 label sequence | ||
| 187 | +- top 5 result snapshot for quick scanning | ||
| 188 | + | ||
| 189 | +This means a bad case can usually be reconstructed directly from the batch artifact itself, without replaying logs or joining SQLite tables by hand. | ||
| 190 | + | ||
| 191 | +The web history endpoint intentionally returns a compact summary only (aggregate metrics plus query count), so adding richer per-query snapshots to the batch payload does not bloat the history list UI. | ||
| 192 | + | ||
| 177 | ## Ranking debug and LTR prep | 193 | ## Ranking debug and LTR prep |
| 178 | 194 | ||
| 179 | `debug_info` now exposes two extra layers that are useful for tuning and future learning-to-rank work: | 195 | `debug_info` now exposes two extra layers that are useful for tuning and future learning-to-rank work: |
scripts/evaluation/eval_framework/__init__.py
| @@ -14,10 +14,10 @@ from .constants import ( # noqa: E402 | @@ -14,10 +14,10 @@ from .constants import ( # noqa: E402 | ||
| 14 | DEFAULT_ARTIFACT_ROOT, | 14 | DEFAULT_ARTIFACT_ROOT, |
| 15 | DEFAULT_QUERY_FILE, | 15 | DEFAULT_QUERY_FILE, |
| 16 | PROJECT_ROOT, | 16 | PROJECT_ROOT, |
| 17 | - RELEVANCE_EXACT, | ||
| 18 | - RELEVANCE_HIGH, | ||
| 19 | - RELEVANCE_IRRELEVANT, | ||
| 20 | - RELEVANCE_LOW, | 17 | + RELEVANCE_LV0, |
| 18 | + RELEVANCE_LV1, | ||
| 19 | + RELEVANCE_LV2, | ||
| 20 | + RELEVANCE_LV3, | ||
| 21 | RELEVANCE_NON_IRRELEVANT, | 21 | RELEVANCE_NON_IRRELEVANT, |
| 22 | VALID_LABELS, | 22 | VALID_LABELS, |
| 23 | ) | 23 | ) |
| @@ -39,10 +39,10 @@ __all__ = [ | @@ -39,10 +39,10 @@ __all__ = [ | ||
| 39 | "EvalStore", | 39 | "EvalStore", |
| 40 | "PROJECT_ROOT", | 40 | "PROJECT_ROOT", |
| 41 | "QueryBuildResult", | 41 | "QueryBuildResult", |
| 42 | - "RELEVANCE_EXACT", | ||
| 43 | - "RELEVANCE_HIGH", | ||
| 44 | - "RELEVANCE_IRRELEVANT", | ||
| 45 | - "RELEVANCE_LOW", | 42 | + "RELEVANCE_LV0", |
| 43 | + "RELEVANCE_LV1", | ||
| 44 | + "RELEVANCE_LV2", | ||
| 45 | + "RELEVANCE_LV3", | ||
| 46 | "RELEVANCE_NON_IRRELEVANT", | 46 | "RELEVANCE_NON_IRRELEVANT", |
| 47 | "SearchEvaluationFramework", | 47 | "SearchEvaluationFramework", |
| 48 | "VALID_LABELS", | 48 | "VALID_LABELS", |
scripts/evaluation/eval_framework/clients.py
| @@ -157,6 +157,7 @@ class SearchServiceClient: | @@ -157,6 +157,7 @@ class SearchServiceClient: | ||
| 157 | return self._request_json("GET", path, timeout=timeout) | 157 | return self._request_json("GET", path, timeout=timeout) |
| 158 | 158 | ||
| 159 | def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]: | 159 | def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]: |
| 160 | + request_id = uuid.uuid4().hex[:8] | ||
| 160 | payload: Dict[str, Any] = { | 161 | payload: Dict[str, Any] = { |
| 161 | "query": query, | 162 | "query": query, |
| 162 | "size": size, | 163 | "size": size, |
| @@ -165,13 +166,19 @@ class SearchServiceClient: | @@ -165,13 +166,19 @@ class SearchServiceClient: | ||
| 165 | } | 166 | } |
| 166 | if debug: | 167 | if debug: |
| 167 | payload["debug"] = True | 168 | payload["debug"] = True |
| 168 | - return self._request_json( | 169 | + response = self._request_json( |
| 169 | "POST", | 170 | "POST", |
| 170 | "/search/", | 171 | "/search/", |
| 171 | timeout=120, | 172 | timeout=120, |
| 172 | - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, | 173 | + headers={ |
| 174 | + "Content-Type": "application/json", | ||
| 175 | + "X-Tenant-ID": self.tenant_id, | ||
| 176 | + "X-Request-ID": request_id, | ||
| 177 | + }, | ||
| 173 | json_payload=payload, | 178 | json_payload=payload, |
| 174 | ) | 179 | ) |
| 180 | + response["_eval_request_id"] = request_id | ||
| 181 | + return response | ||
| 175 | 182 | ||
| 176 | 183 | ||
| 177 | class RerankServiceClient: | 184 | class RerankServiceClient: |
scripts/evaluation/eval_framework/constants.py
| @@ -7,24 +7,24 @@ _SCRIPTS_EVAL_DIR = _PKG_DIR.parent | @@ -7,24 +7,24 @@ _SCRIPTS_EVAL_DIR = _PKG_DIR.parent | ||
| 7 | PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] | 7 | PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] |
| 8 | 8 | ||
| 9 | # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN) | 9 | # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN) |
| 10 | -RELEVANCE_EXACT = "Fully Relevant" | ||
| 11 | -RELEVANCE_HIGH = "Mostly Relevant" | ||
| 12 | -RELEVANCE_LOW = "Weakly Relevant" | ||
| 13 | -RELEVANCE_IRRELEVANT = "Irrelevant" | 10 | +RELEVANCE_LV3 = "Fully Relevant" |
| 11 | +RELEVANCE_LV2 = "Mostly Relevant" | ||
| 12 | +RELEVANCE_LV1 = "Weakly Relevant" | ||
| 13 | +RELEVANCE_LV0 = "Irrelevant" | ||
| 14 | 14 | ||
| 15 | -VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) | 15 | +VALID_LABELS = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1, RELEVANCE_LV0}) |
| 16 | 16 | ||
| 17 | # Useful label sets for binary diagnostic slices layered on top of graded ranking metrics. | 17 | # Useful label sets for binary diagnostic slices layered on top of graded ranking metrics. |
| 18 | -RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW}) | ||
| 19 | -RELEVANCE_STRONG = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH}) | 18 | +RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1}) |
| 19 | +RELEVANCE_STRONG = frozenset({RELEVANCE_LV3, RELEVANCE_LV2}) | ||
| 20 | 20 | ||
| 21 | # Graded relevance for ranking evaluation. | 21 | # Graded relevance for ranking evaluation. |
| 22 | # We use rel grades 3/2/1/0. The standard NDCG gain is 2^rel - 1, but RELEVANCE_GAIN_MAP currently uses the grade values directly as gains. | 22 | # We use rel grades 3/2/1/0. The standard NDCG gain is 2^rel - 1, but RELEVANCE_GAIN_MAP currently uses the grade values directly as gains. |
| 23 | RELEVANCE_GRADE_MAP = { | 23 | RELEVANCE_GRADE_MAP = { |
| 24 | - RELEVANCE_EXACT: 3, | ||
| 25 | - RELEVANCE_HIGH: 2, | ||
| 26 | - RELEVANCE_LOW: 1, | ||
| 27 | - RELEVANCE_IRRELEVANT: 0, | 24 | + RELEVANCE_LV3: 3, |
| 25 | + RELEVANCE_LV2: 2, | ||
| 26 | + RELEVANCE_LV1: 1, | ||
| 27 | + RELEVANCE_LV0: 0, | ||
| 28 | } | 28 | } |
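| 29 | # Standard gain formula: 2^rel - 1 | 29 | # Standard gain formula: 2^rel - 1 |
| 30 | # However, because annotation quality is not very precise, we moderately reduce the gap between exact and high | 30 | # However, because annotation quality is not very precise, we moderately reduce the gap between exact and high |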
| @@ -36,10 +36,10 @@ RELEVANCE_GAIN_MAP = { | @@ -36,10 +36,10 @@ RELEVANCE_GAIN_MAP = { | ||
| 36 | 36 | ||
| 37 | # P(stop | relevance) for ERR (Expected Reciprocal Rank); cascade model (Chapelle et al., 2009). | 37 | # P(stop | relevance) for ERR (Expected Reciprocal Rank); cascade model (Chapelle et al., 2009). |
| 38 | STOP_PROB_MAP = { | 38 | STOP_PROB_MAP = { |
| 39 | - RELEVANCE_EXACT: 0.99, | ||
| 40 | - RELEVANCE_HIGH: 0.8, | ||
| 41 | - RELEVANCE_LOW: 0.1, | ||
| 42 | - RELEVANCE_IRRELEVANT: 0.0, | 39 | + RELEVANCE_LV3: 0.99, |
| 40 | + RELEVANCE_LV2: 0.8, | ||
| 41 | + RELEVANCE_LV1: 0.1, | ||
| 42 | + RELEVANCE_LV0: 0.0, | ||
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" | 45 | DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" |
| @@ -78,7 +78,7 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 | @@ -78,7 +78,7 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 | ||
| 78 | # A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``): | 78 | # A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``): |
| 79 | # - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 79.9%), | 79 | # - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 79.9%), |
| 80 | # - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%). | 80 | # - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%). |
| 81 | -# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Weakly Relevant"). | 81 | +# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LV1`` ("Weakly Relevant"). |
| 82 | # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak | 82 | # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak |
| 83 | # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). | 83 | # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). |
| 84 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799 | 84 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799 |
scripts/evaluation/eval_framework/framework.py
| @@ -25,14 +25,14 @@ from .constants import ( | @@ -25,14 +25,14 @@ from .constants import ( | ||
| 25 | DEFAULT_RERANK_HIGH_SKIP_COUNT, | 25 | DEFAULT_RERANK_HIGH_SKIP_COUNT, |
| 26 | DEFAULT_RERANK_HIGH_THRESHOLD, | 26 | DEFAULT_RERANK_HIGH_THRESHOLD, |
| 27 | DEFAULT_SEARCH_RECALL_TOP_K, | 27 | DEFAULT_SEARCH_RECALL_TOP_K, |
| 28 | - RELEVANCE_EXACT, | ||
| 29 | RELEVANCE_GAIN_MAP, | 28 | RELEVANCE_GAIN_MAP, |
| 30 | - RELEVANCE_HIGH, | ||
| 31 | - STOP_PROB_MAP, | ||
| 32 | - RELEVANCE_IRRELEVANT, | ||
| 33 | - RELEVANCE_LOW, | 29 | + RELEVANCE_LV0, |
| 30 | + RELEVANCE_LV1, | ||
| 31 | + RELEVANCE_LV2, | ||
| 32 | + RELEVANCE_LV3, | ||
| 34 | RELEVANCE_NON_IRRELEVANT, | 33 | RELEVANCE_NON_IRRELEVANT, |
| 35 | VALID_LABELS, | 34 | VALID_LABELS, |
| 35 | + STOP_PROB_MAP, | ||
| 36 | ) | 36 | ) |
| 37 | from .metrics import ( | 37 | from .metrics import ( |
| 38 | PRIMARY_METRIC_GRADE_NORMALIZER, | 38 | PRIMARY_METRIC_GRADE_NORMALIZER, |
| @@ -96,6 +96,16 @@ def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: | @@ -96,6 +96,16 @@ def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: | ||
| 96 | return out | 96 | return out |
| 97 | 97 | ||
| 98 | 98 | ||
| 99 | +def _encode_label_sequence(items: Sequence[Dict[str, Any]], limit: int) -> str: | ||
| 100 | + parts: List[str] = [] | ||
| 101 | + for item in items[:limit]: | ||
| 102 | + rank = int(item.get("rank") or 0) | ||
| 103 | + label = str(item.get("label") or "") | ||
| 104 | + grade = RELEVANCE_GAIN_MAP.get(label) | ||
| 105 | + parts.append(f"{rank}:L{grade}" if grade is not None else f"{rank}:?") | ||
| 106 | + return " | ".join(parts) | ||
| 107 | + | ||
| 108 | + | ||
| 99 | class SearchEvaluationFramework: | 109 | class SearchEvaluationFramework: |
| 100 | def __init__( | 110 | def __init__( |
| 101 | self, | 111 | self, |
| @@ -168,7 +178,7 @@ class SearchEvaluationFramework: | @@ -168,7 +178,7 @@ class SearchEvaluationFramework: | ||
| 168 | ) -> Dict[str, Any]: | 178 | ) -> Dict[str, Any]: |
| 169 | live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) | 179 | live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) |
| 170 | labels = [ | 180 | labels = [ |
| 171 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | 181 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 |
| 172 | for item in live["results"] | 182 | for item in live["results"] |
| 173 | ] | 183 | ] |
| 174 | return { | 184 | return { |
| @@ -432,7 +442,7 @@ class SearchEvaluationFramework: | @@ -432,7 +442,7 @@ class SearchEvaluationFramework: | ||
| 432 | 442 | ||
| 433 | - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and | 443 | - ``#(Irrelevant)/n > irrelevant_stop_ratio`` (default 0.939), and |
| 434 | - ``( #(Irrelevant) + #(Weakly Relevant) ) / n > irrelevant_low_combined_stop_ratio`` | 444 | - ``( #(Irrelevant) + #(Weakly Relevant) ) / n > irrelevant_low_combined_stop_ratio`` |
| 435 | - (default 0.959; weak relevance = ``RELEVANCE_LOW``). | 445 | + (default 0.959; weak relevance = ``RELEVANCE_LV1``). |
| 436 | 446 | ||
| 437 | Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. | 447 | Maintain a streak of consecutive *bad* batches; any non-bad batch resets the streak to 0. |
| 438 | Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached | 448 | Stop labeling when ``streak >= stop_streak`` (default 3) or when ``max_batches`` is reached |
| @@ -474,9 +484,9 @@ class SearchEvaluationFramework: | @@ -474,9 +484,9 @@ class SearchEvaluationFramework: | ||
| 474 | time.sleep(0.1) | 484 | time.sleep(0.1) |
| 475 | 485 | ||
| 476 | n = len(batch_docs) | 486 | n = len(batch_docs) |
| 477 | - exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_EXACT) | ||
| 478 | - irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT) | ||
| 479 | - low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LOW) | 487 | + exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV3) |
| 488 | + irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV0) | ||
| 489 | + low_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_LV1) | ||
| 480 | exact_ratio = exact_n / n if n else 0.0 | 490 | exact_ratio = exact_n / n if n else 0.0 |
| 481 | irrelevant_ratio = irrel_n / n if n else 0.0 | 491 | irrelevant_ratio = irrel_n / n if n else 0.0 |
| 482 | low_ratio = low_n / n if n else 0.0 | 492 | low_ratio = low_n / n if n else 0.0 |
| @@ -633,7 +643,7 @@ class SearchEvaluationFramework: | @@ -633,7 +643,7 @@ class SearchEvaluationFramework: | ||
| 633 | ) | 643 | ) |
| 634 | 644 | ||
| 635 | top100_labels = [ | 645 | top100_labels = [ |
| 636 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | 646 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 |
| 637 | for item in search_labeled_results[:100] | 647 | for item in search_labeled_results[:100] |
| 638 | ] | 648 | ] |
| 639 | metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) | 649 | metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) |
| @@ -843,7 +853,7 @@ class SearchEvaluationFramework: | @@ -843,7 +853,7 @@ class SearchEvaluationFramework: | ||
| 843 | ) | 853 | ) |
| 844 | 854 | ||
| 845 | top100_labels = [ | 855 | top100_labels = [ |
| 846 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | 856 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 |
| 847 | for item in search_labeled_results[:100] | 857 | for item in search_labeled_results[:100] |
| 848 | ] | 858 | ] |
| 849 | metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) | 859 | metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values())) |
| @@ -920,16 +930,17 @@ class SearchEvaluationFramework: | @@ -920,16 +930,17 @@ class SearchEvaluationFramework: | ||
| 920 | "title_zh": title_zh if title_zh and title_zh != primary_title else "", | 930 | "title_zh": title_zh if title_zh and title_zh != primary_title else "", |
| 921 | "image_url": doc.get("image_url"), | 931 | "image_url": doc.get("image_url"), |
| 922 | "label": label, | 932 | "label": label, |
| 933 | + "relevance_score": doc.get("relevance_score"), | ||
| 923 | "option_values": list(compact_option_values(doc.get("skus") or [])), | 934 | "option_values": list(compact_option_values(doc.get("skus") or [])), |
| 924 | "product": compact_product_payload(doc), | 935 | "product": compact_product_payload(doc), |
| 925 | } | 936 | } |
| 926 | ) | 937 | ) |
| 927 | metric_labels = [ | 938 | metric_labels = [ |
| 928 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | 939 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 |
| 929 | for item in labeled | 940 | for item in labeled |
| 930 | ] | 941 | ] |
| 931 | ideal_labels = [ | 942 | ideal_labels = [ |
| 932 | - label if label in VALID_LABELS else RELEVANCE_IRRELEVANT | 943 | + label if label in VALID_LABELS else RELEVANCE_LV0 |
| 933 | for label in labels.values() | 944 | for label in labels.values() |
| 934 | ] | 945 | ] |
| 935 | label_stats = self.store.get_query_label_stats(self.tenant_id, query) | 946 | label_stats = self.store.get_query_label_stats(self.tenant_id, query) |
| @@ -960,10 +971,10 @@ class SearchEvaluationFramework: | @@ -960,10 +971,10 @@ class SearchEvaluationFramework: | ||
| 960 | } | 971 | } |
| 961 | ) | 972 | ) |
| 962 | label_order = { | 973 | label_order = { |
| 963 | - RELEVANCE_EXACT: 0, | ||
| 964 | - RELEVANCE_HIGH: 1, | ||
| 965 | - RELEVANCE_LOW: 2, | ||
| 966 | - RELEVANCE_IRRELEVANT: 3, | 974 | + RELEVANCE_LV3: 0, |
| 975 | + RELEVANCE_LV2: 1, | ||
| 976 | + RELEVANCE_LV1: 2, | ||
| 977 | + RELEVANCE_LV0: 3, | ||
| 967 | } | 978 | } |
| 968 | missing_relevant.sort( | 979 | missing_relevant.sort( |
| 969 | key=lambda item: ( | 980 | key=lambda item: ( |
| @@ -989,6 +1000,7 @@ class SearchEvaluationFramework: | @@ -989,6 +1000,7 @@ class SearchEvaluationFramework: | ||
| 989 | "top_k": top_k, | 1000 | "top_k": top_k, |
| 990 | "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels), | 1001 | "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels), |
| 991 | "metric_context": _metric_context_payload(), | 1002 | "metric_context": _metric_context_payload(), |
| 1003 | + "request_id": str(search_payload.get("_eval_request_id") or ""), | ||
| 992 | "results": labeled, | 1004 | "results": labeled, |
| 993 | "missing_relevant": missing_relevant, | 1005 | "missing_relevant": missing_relevant, |
| 994 | "label_stats": { | 1006 | "label_stats": { |
| @@ -996,9 +1008,9 @@ class SearchEvaluationFramework: | @@ -996,9 +1008,9 @@ class SearchEvaluationFramework: | ||
| 996 | "unlabeled_hits_treated_irrelevant": unlabeled_hits, | 1008 | "unlabeled_hits_treated_irrelevant": unlabeled_hits, |
| 997 | "recalled_hits": len(labeled), | 1009 | "recalled_hits": len(labeled), |
| 998 | "missing_relevant_count": len(missing_relevant), | 1010 | "missing_relevant_count": len(missing_relevant), |
| 999 | - "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT), | ||
| 1000 | - "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_HIGH), | ||
| 1001 | - "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LOW), | 1011 | + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV3), |
| 1012 | + "missing_high_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV2), | ||
| 1013 | + "missing_low_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_LV1), | ||
| 1002 | }, | 1014 | }, |
| 1003 | "tips": tips, | 1015 | "tips": tips, |
| 1004 | "total": int(search_payload.get("total") or 0), | 1016 | "total": int(search_payload.get("total") or 0), |
| @@ -1014,6 +1026,7 @@ class SearchEvaluationFramework: | @@ -1014,6 +1026,7 @@ class SearchEvaluationFramework: | ||
| 1014 | force_refresh_labels: bool = False, | 1026 | force_refresh_labels: bool = False, |
| 1015 | ) -> Dict[str, Any]: | 1027 | ) -> Dict[str, Any]: |
| 1016 | per_query = [] | 1028 | per_query = [] |
| 1029 | + case_snapshot_top_n = min(max(int(top_k), 1), 20) | ||
| 1017 | total_q = len(queries) | 1030 | total_q = len(queries) |
| 1018 | _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate) | 1031 | _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate) |
| 1019 | for q_index, query in enumerate(queries, start=1): | 1032 | for q_index, query in enumerate(queries, start=1): |
| @@ -1025,7 +1038,7 @@ class SearchEvaluationFramework: | @@ -1025,7 +1038,7 @@ class SearchEvaluationFramework: | ||
| 1025 | force_refresh_labels=force_refresh_labels, | 1038 | force_refresh_labels=force_refresh_labels, |
| 1026 | ) | 1039 | ) |
| 1027 | labels = [ | 1040 | labels = [ |
| 1028 | - item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | 1041 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0 |
| 1029 | for item in live["results"] | 1042 | for item in live["results"] |
| 1030 | ] | 1043 | ] |
| 1031 | per_query.append( | 1044 | per_query.append( |
| @@ -1036,6 +1049,21 @@ class SearchEvaluationFramework: | @@ -1036,6 +1049,21 @@ class SearchEvaluationFramework: | ||
| 1036 | "metrics": live["metrics"], | 1049 | "metrics": live["metrics"], |
| 1037 | "distribution": label_distribution(labels), | 1050 | "distribution": label_distribution(labels), |
| 1038 | "total": live["total"], | 1051 | "total": live["total"], |
| 1052 | + "request_id": live.get("request_id") or "", | ||
| 1053 | + "case_snapshot_top_n": case_snapshot_top_n, | ||
| 1054 | + "top_label_sequence_top10": _encode_label_sequence(live["results"], 10), | ||
| 1055 | + "top_label_sequence_top20": _encode_label_sequence(live["results"], case_snapshot_top_n), | ||
| 1056 | + "top_results": [ | ||
| 1057 | + { | ||
| 1058 | + "rank": int(item.get("rank") or 0), | ||
| 1059 | + "spu_id": str(item.get("spu_id") or ""), | ||
| 1060 | + "label": item.get("label"), | ||
| 1061 | + "title": item.get("title"), | ||
| 1062 | + "title_zh": item.get("title_zh"), | ||
| 1063 | + "relevance_score": item.get("relevance_score"), | ||
| 1064 | + } | ||
| 1065 | + for item in live["results"][:case_snapshot_top_n] | ||
| 1066 | + ], | ||
| 1039 | } | 1067 | } |
| 1040 | ) | 1068 | ) |
| 1041 | m = live["metrics"] | 1069 | m = live["metrics"] |
| @@ -1055,10 +1083,10 @@ class SearchEvaluationFramework: | @@ -1055,10 +1083,10 @@ class SearchEvaluationFramework: | ||
| 1055 | ) | 1083 | ) |
| 1056 | aggregate = aggregate_metrics([item["metrics"] for item in per_query]) | 1084 | aggregate = aggregate_metrics([item["metrics"] for item in per_query]) |
| 1057 | aggregate_distribution = { | 1085 | aggregate_distribution = { |
| 1058 | - RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), | ||
| 1059 | - RELEVANCE_HIGH: sum(item["distribution"][RELEVANCE_HIGH] for item in per_query), | ||
| 1060 | - RELEVANCE_LOW: sum(item["distribution"][RELEVANCE_LOW] for item in per_query), | ||
| 1061 | - RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query), | 1086 | + RELEVANCE_LV3: sum(item["distribution"][RELEVANCE_LV3] for item in per_query), |
| 1087 | + RELEVANCE_LV2: sum(item["distribution"][RELEVANCE_LV2] for item in per_query), | ||
| 1088 | + RELEVANCE_LV1: sum(item["distribution"][RELEVANCE_LV1] for item in per_query), | ||
| 1089 | + RELEVANCE_LV0: sum(item["distribution"][RELEVANCE_LV0] for item in per_query), | ||
| 1062 | } | 1090 | } |
| 1063 | batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" | 1091 | batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" |
| 1064 | report_dir = ensure_dir(self.artifact_root / "batch_reports") | 1092 | report_dir = ensure_dir(self.artifact_root / "batch_reports") |
scripts/evaluation/eval_framework/metrics.py
| @@ -6,12 +6,12 @@ import math | @@ -6,12 +6,12 @@ import math | ||
| 6 | from typing import Dict, Iterable, Sequence | 6 | from typing import Dict, Iterable, Sequence |
| 7 | 7 | ||
| 8 | from .constants import ( | 8 | from .constants import ( |
| 9 | - RELEVANCE_EXACT, | ||
| 10 | RELEVANCE_GAIN_MAP, | 9 | RELEVANCE_GAIN_MAP, |
| 11 | RELEVANCE_GRADE_MAP, | 10 | RELEVANCE_GRADE_MAP, |
| 12 | - RELEVANCE_HIGH, | ||
| 13 | - RELEVANCE_IRRELEVANT, | ||
| 14 | - RELEVANCE_LOW, | 11 | + RELEVANCE_LV0, |
| 12 | + RELEVANCE_LV1, | ||
| 13 | + RELEVANCE_LV2, | ||
| 14 | + RELEVANCE_LV3, | ||
| 15 | RELEVANCE_NON_IRRELEVANT, | 15 | RELEVANCE_NON_IRRELEVANT, |
| 16 | RELEVANCE_STRONG, | 16 | RELEVANCE_STRONG, |
| 17 | STOP_PROB_MAP, | 17 | STOP_PROB_MAP, |
| @@ -33,7 +33,7 @@ PRIMARY_METRIC_GRADE_NORMALIZER = float(max(RELEVANCE_GRADE_MAP.values()) or 1.0 | @@ -33,7 +33,7 @@ PRIMARY_METRIC_GRADE_NORMALIZER = float(max(RELEVANCE_GRADE_MAP.values()) or 1.0 | ||
| 33 | def _normalize_label(label: str) -> str: | 33 | def _normalize_label(label: str) -> str: |
| 34 | if label in RELEVANCE_GRADE_MAP: | 34 | if label in RELEVANCE_GRADE_MAP: |
| 35 | return label | 35 | return label |
| 36 | - return RELEVANCE_IRRELEVANT | 36 | + return RELEVANCE_LV0 |
| 37 | 37 | ||
| 38 | 38 | ||
| 39 | def _gains_for_labels(labels: Sequence[str]) -> list[float]: | 39 | def _gains_for_labels(labels: Sequence[str]) -> list[float]: |
| @@ -135,7 +135,7 @@ def compute_query_metrics( | @@ -135,7 +135,7 @@ def compute_query_metrics( | ||
| 135 | ideal = list(ideal_labels) if ideal_labels is not None else list(labels) | 135 | ideal = list(ideal_labels) if ideal_labels is not None else list(labels) |
| 136 | metrics: Dict[str, float] = {} | 136 | metrics: Dict[str, float] = {} |
| 137 | 137 | ||
| 138 | - exact_hits = _binary_hits(labels, [RELEVANCE_EXACT]) | 138 | + exact_hits = _binary_hits(labels, [RELEVANCE_LV3]) |
| 139 | strong_hits = _binary_hits(labels, RELEVANCE_STRONG) | 139 | strong_hits = _binary_hits(labels, RELEVANCE_STRONG) |
| 140 | useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT) | 140 | useful_hits = _binary_hits(labels, RELEVANCE_NON_IRRELEVANT) |
| 141 | 141 | ||
| @@ -183,8 +183,8 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo | @@ -183,8 +183,8 @@ def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, flo | ||
| 183 | 183 | ||
| 184 | def label_distribution(labels: Sequence[str]) -> Dict[str, int]: | 184 | def label_distribution(labels: Sequence[str]) -> Dict[str, int]: |
| 185 | return { | 185 | return { |
| 186 | - RELEVANCE_EXACT: sum(1 for label in labels if label == RELEVANCE_EXACT), | ||
| 187 | - RELEVANCE_HIGH: sum(1 for label in labels if label == RELEVANCE_HIGH), | ||
| 188 | - RELEVANCE_LOW: sum(1 for label in labels if label == RELEVANCE_LOW), | ||
| 189 | - RELEVANCE_IRRELEVANT: sum(1 for label in labels if label == RELEVANCE_IRRELEVANT), | 186 | + RELEVANCE_LV3: sum(1 for label in labels if label == RELEVANCE_LV3), |
| 187 | + RELEVANCE_LV2: sum(1 for label in labels if label == RELEVANCE_LV2), | ||
| 188 | + RELEVANCE_LV1: sum(1 for label in labels if label == RELEVANCE_LV1), | ||
| 189 | + RELEVANCE_LV0: sum(1 for label in labels if label == RELEVANCE_LV0), | ||
| 190 | } | 190 | } |
scripts/evaluation/eval_framework/reports.py
| @@ -4,7 +4,7 @@ from __future__ import annotations | @@ -4,7 +4,7 @@ from __future__ import annotations | ||
| 4 | 4 | ||
| 5 | from typing import Any, Dict | 5 | from typing import Any, Dict |
| 6 | 6 | ||
| 7 | -from .constants import RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_IRRELEVANT, RELEVANCE_LOW | 7 | +from .constants import RELEVANCE_GAIN_MAP, RELEVANCE_LV0, RELEVANCE_LV1, RELEVANCE_LV2, RELEVANCE_LV3 |
| 8 | from .metrics import PRIMARY_METRIC_KEYS | 8 | from .metrics import PRIMARY_METRIC_KEYS |
| 9 | 9 | ||
| 10 | 10 | ||
| @@ -25,6 +25,38 @@ def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None: | @@ -25,6 +25,38 @@ def _append_metric_block(lines: list[str], metrics: Dict[str, Any]) -> None: | ||
| 25 | lines.append(f"- {key}: {value}") | 25 | lines.append(f"- {key}: {value}") |
| 26 | 26 | ||
| 27 | 27 | ||
| 28 | +def _label_level_code(label: str) -> str: | ||
| 29 | + grade = RELEVANCE_GAIN_MAP.get(label) | ||
| 30 | + return f"L{grade}" if grade is not None else "?" | ||
| 31 | + | ||
| 32 | + | ||
| 33 | +def _append_case_snapshot(lines: list[str], item: Dict[str, Any]) -> None: | ||
| 34 | + request_id = str(item.get("request_id") or "").strip() | ||
| 35 | + if request_id: | ||
| 36 | + lines.append(f"- Request ID: `{request_id}`") | ||
| 37 | + seq10 = str(item.get("top_label_sequence_top10") or "").strip() | ||
| 38 | + if seq10: | ||
| 39 | + lines.append(f"- Top-10 Labels: `{seq10}`") | ||
| 40 | + seq20 = str(item.get("top_label_sequence_top20") or "").strip() | ||
| 41 | + if seq20 and seq20 != seq10: | ||
| 42 | + lines.append(f"- Top-20 Labels: `{seq20}`") | ||
| 43 | + top_results = item.get("top_results") or [] | ||
| 44 | + if not top_results: | ||
| 45 | + return | ||
| 46 | + lines.append("- Case Snapshot:") | ||
| 47 | + for result in top_results[:5]: | ||
| 48 | + rank = int(result.get("rank") or 0) | ||
| 49 | + label = _label_level_code(str(result.get("label") or "")) | ||
| 50 | + spu_id = str(result.get("spu_id") or "") | ||
| 51 | + title = str(result.get("title") or "") | ||
| 52 | + title_zh = str(result.get("title_zh") or "") | ||
| 53 | + relevance_score = result.get("relevance_score") | ||
| 54 | + score_suffix = f" (rel={relevance_score})" if relevance_score not in (None, "") else "" | ||
| 55 | + lines.append(f" - #{rank} [{label}] spu={spu_id} {title}{score_suffix}") | ||
| 56 | + if title_zh: | ||
| 57 | + lines.append(f" zh: {title_zh}") | ||
| 58 | + | ||
| 59 | + | ||
| 28 | def render_batch_report_markdown(payload: Dict[str, Any]) -> str: | 60 | def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| 29 | lines = [ | 61 | lines = [ |
| 30 | "# Search Batch Evaluation", | 62 | "# Search Batch Evaluation", |
| @@ -56,10 +88,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: | @@ -56,10 +88,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: | ||
| 56 | "", | 88 | "", |
| 57 | "## Label Distribution", | 89 | "## Label Distribution", |
| 58 | "", | 90 | "", |
| 59 | - f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}", | ||
| 60 | - f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}", | ||
| 61 | - f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}", | ||
| 62 | - f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}", | 91 | + f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}", |
| 92 | + f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}", | ||
| 93 | + f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}", | ||
| 94 | + f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}", | ||
| 63 | ] | 95 | ] |
| 64 | ) | 96 | ) |
| 65 | lines.extend(["", "## Per Query", ""]) | 97 | lines.extend(["", "## Per Query", ""]) |
| @@ -68,9 +100,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: | @@ -68,9 +100,10 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: | ||
| 68 | lines.append("") | 100 | lines.append("") |
| 69 | _append_metric_block(lines, item.get("metrics") or {}) | 101 | _append_metric_block(lines, item.get("metrics") or {}) |
| 70 | distribution = item.get("distribution") or {} | 102 | distribution = item.get("distribution") or {} |
| 71 | - lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_EXACT, 0)}") | ||
| 72 | - lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_HIGH, 0)}") | ||
| 73 | - lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LOW, 0)}") | ||
| 74 | - lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_IRRELEVANT, 0)}") | 103 | + lines.append(f"- Fully Relevant: {distribution.get(RELEVANCE_LV3, 0)}") |
| 104 | + lines.append(f"- Mostly Relevant: {distribution.get(RELEVANCE_LV2, 0)}") | ||
| 105 | + lines.append(f"- Weakly Relevant: {distribution.get(RELEVANCE_LV1, 0)}") | ||
| 106 | + lines.append(f"- Irrelevant: {distribution.get(RELEVANCE_LV0, 0)}") | ||
| 107 | + _append_case_snapshot(lines, item) | ||
| 75 | lines.append("") | 108 | lines.append("") |
| 76 | return "\n".join(lines) | 109 | return "\n".join(lines) |
scripts/evaluation/eval_framework/static/eval_web.js
| @@ -190,7 +190,7 @@ async function loadQueries() { | @@ -190,7 +190,7 @@ async function loadQueries() { | ||
| 190 | 190 | ||
| 191 | function historySummaryHtml(meta) { | 191 | function historySummaryHtml(meta) { |
| 192 | const m = meta && meta.aggregate_metrics; | 192 | const m = meta && meta.aggregate_metrics; |
| 193 | - const nq = (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null; | 193 | + const nq = (meta && meta.query_count) || (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null; |
| 194 | const parts = []; | 194 | const parts = []; |
| 195 | if (nq != null) parts.push(`<span>Queries</span> ${nq}`); | 195 | if (nq != null) parts.push(`<span>Queries</span> ${nq}`); |
| 196 | if (m && m["Primary_Metric_Score"] != null) parts.push(`<span>Primary</span> ${fmtNumber(m["Primary_Metric_Score"])}`); | 196 | if (m && m["Primary_Metric_Score"] != null) parts.push(`<span>Primary</span> ${fmtNumber(m["Primary_Metric_Score"])}`); |
scripts/evaluation/eval_framework/store.py
| @@ -23,6 +23,18 @@ class QueryBuildResult: | @@ -23,6 +23,18 @@ class QueryBuildResult: | ||
| 23 | output_json_path: Path | 23 | output_json_path: Path |
| 24 | 24 | ||
| 25 | 25 | ||
| 26 | +def _compact_batch_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]: | ||
| 27 | + return { | ||
| 28 | + "batch_id": metadata.get("batch_id"), | ||
| 29 | + "created_at": metadata.get("created_at"), | ||
| 30 | + "tenant_id": metadata.get("tenant_id"), | ||
| 31 | + "top_k": metadata.get("top_k"), | ||
| 32 | + "query_count": len(metadata.get("queries") or []), | ||
| 33 | + "aggregate_metrics": dict(metadata.get("aggregate_metrics") or {}), | ||
| 34 | + "metric_context": dict(metadata.get("metric_context") or {}), | ||
| 35 | + } | ||
| 36 | + | ||
| 37 | + | ||
| 26 | class EvalStore: | 38 | class EvalStore: |
| 27 | def __init__(self, db_path: Path): | 39 | def __init__(self, db_path: Path): |
| 28 | self.db_path = db_path | 40 | self.db_path = db_path |
| @@ -339,6 +351,7 @@ class EvalStore: | @@ -339,6 +351,7 @@ class EvalStore: | ||
| 339 | ).fetchall() | 351 | ).fetchall() |
| 340 | items: List[Dict[str, Any]] = [] | 352 | items: List[Dict[str, Any]] = [] |
| 341 | for row in rows: | 353 | for row in rows: |
| 354 | + metadata = json.loads(row["metadata_json"]) | ||
| 342 | items.append( | 355 | items.append( |
| 343 | { | 356 | { |
| 344 | "batch_id": row["batch_id"], | 357 | "batch_id": row["batch_id"], |
| @@ -346,7 +359,7 @@ class EvalStore: | @@ -346,7 +359,7 @@ class EvalStore: | ||
| 346 | "output_json_path": row["output_json_path"], | 359 | "output_json_path": row["output_json_path"], |
| 347 | "report_markdown_path": row["report_markdown_path"], | 360 | "report_markdown_path": row["report_markdown_path"], |
| 348 | "config_snapshot_path": row["config_snapshot_path"], | 361 | "config_snapshot_path": row["config_snapshot_path"], |
| 349 | - "metadata": json.loads(row["metadata_json"]), | 362 | + "metadata": _compact_batch_metadata(metadata), |
| 350 | "created_at": row["created_at"], | 363 | "created_at": row["created_at"], |
| 351 | } | 364 | } |
| 352 | ) | 365 | ) |
scripts/evaluation/offline_ltr_fit.py
| @@ -23,11 +23,11 @@ if str(PROJECT_ROOT) not in sys.path: | @@ -23,11 +23,11 @@ if str(PROJECT_ROOT) not in sys.path: | ||
| 23 | 23 | ||
| 24 | from scripts.evaluation.eval_framework.constants import ( | 24 | from scripts.evaluation.eval_framework.constants import ( |
| 25 | DEFAULT_ARTIFACT_ROOT, | 25 | DEFAULT_ARTIFACT_ROOT, |
| 26 | - RELEVANCE_EXACT, | ||
| 27 | RELEVANCE_GRADE_MAP, | 26 | RELEVANCE_GRADE_MAP, |
| 28 | - RELEVANCE_HIGH, | ||
| 29 | - RELEVANCE_IRRELEVANT, | ||
| 30 | - RELEVANCE_LOW, | 27 | + RELEVANCE_LV0, |
| 28 | + RELEVANCE_LV1, | ||
| 29 | + RELEVANCE_LV2, | ||
| 30 | + RELEVANCE_LV3, | ||
| 31 | ) | 31 | ) |
| 32 | from scripts.evaluation.eval_framework.metrics import aggregate_metrics, compute_query_metrics | 32 | from scripts.evaluation.eval_framework.metrics import aggregate_metrics, compute_query_metrics |
| 33 | from scripts.evaluation.eval_framework.store import EvalStore | 33 | from scripts.evaluation.eval_framework.store import EvalStore |
| @@ -35,10 +35,10 @@ from scripts.evaluation.eval_framework.utils import ensure_dir, utc_timestamp | @@ -35,10 +35,10 @@ from scripts.evaluation.eval_framework.utils import ensure_dir, utc_timestamp | ||
| 35 | 35 | ||
| 36 | 36 | ||
| 37 | LABELS_BY_GRADE = { | 37 | LABELS_BY_GRADE = { |
| 38 | - 3: RELEVANCE_EXACT, | ||
| 39 | - 2: RELEVANCE_HIGH, | ||
| 40 | - 1: RELEVANCE_LOW, | ||
| 41 | - 0: RELEVANCE_IRRELEVANT, | 38 | + 3: RELEVANCE_LV3, |
| 39 | + 2: RELEVANCE_LV2, | ||
| 40 | + 1: RELEVANCE_LV1, | ||
| 41 | + 0: RELEVANCE_LV0, | ||
| 42 | } | 42 | } |
| 43 | 43 | ||
| 44 | 44 |