Blame view

scripts/evaluation/eval_framework/constants.py 2.8 KB
c81b0fc1   tangwang   scripts/evaluatio...
1
2
3
4
5
6
7
8
  """Paths and shared constants for search evaluation."""
  
  from pathlib import Path
  
  # Absolute directory containing this module (symlinks resolved by ``resolve()``).
  _PKG_DIR = Path(__file__).resolve().parent
  # Directory that holds the package directory.
  _SCRIPTS_EVAL_DIR = _PKG_DIR.parent
  # ``parents[1]`` is the grandparent, i.e. two levels above ``_SCRIPTS_EVAL_DIR``.
  # NOTE(review): this relies on the package living at <root>/<dir>/<dir>/<pkg>/ — confirm if the tree moves.
  PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
  
a345b01f   tangwang   eval framework
9
10
11
12
  # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN)
  # ``normalize_stored_label`` matches these strings verbatim via ``VALID_LABELS``,
  # so spelling, spacing and case are significant.
  RELEVANCE_EXACT = "Exact Match"
  RELEVANCE_HIGH = "High Relevant"
  RELEVANCE_LOW = "Low Relevant"
c81b0fc1   tangwang   scripts/evaluatio...
13
  # The only canonical label excluded from ``RELEVANCE_NON_IRRELEVANT`` (the precision / MAP positive set).
  RELEVANCE_IRRELEVANT = "Irrelevant"
a345b01f   tangwang   eval framework
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
  
  # Every label accepted by the current 4-way scheme.
  VALID_LABELS = frozenset(
      (
          RELEVANCE_EXACT,
          RELEVANCE_HIGH,
          RELEVANCE_LOW,
          RELEVANCE_IRRELEVANT,
      )
  )

  # Precision / MAP "positive" set (all non-irrelevant tiers); derived from
  # ``VALID_LABELS`` so the two sets cannot drift apart.
  RELEVANCE_NON_IRRELEVANT = VALID_LABELS - {RELEVANCE_IRRELEVANT}

  # Legacy stored label name -> canonical label (consumed by ``normalize_stored_label``).
  _LEGACY_LABEL_MAP = {"Exact": RELEVANCE_EXACT, "Partial": RELEVANCE_HIGH}
  
  
  def normalize_stored_label(label: str) -> str:
      """Return the canonical 4-way label for a stored label value.

      The input is coerced with ``str()`` and stripped of surrounding
      whitespace. Canonical labels come back unchanged, known legacy names are
      translated through ``_LEGACY_LABEL_MAP``, and any other value is returned
      as the stripped string.
      """
      cleaned = str(label).strip()
      # Canonical labels take precedence over the legacy table.
      if cleaned not in VALID_LABELS and cleaned in _LEGACY_LABEL_MAP:
          return _LEGACY_LABEL_MAP[cleaned]
      return cleaned
  
c81b0fc1   tangwang   scripts/evaluatio...
33
34
35
36
  
  # Default root path for evaluation artifacts: <PROJECT_ROOT>/artifacts/search_evaluation.
  DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
  # Default query file, resolved relative to the directory that holds this package.
  DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
  
bdb65283   tangwang   标注框架 批量标注
37
38
39
  # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
  # Model identifier used when no override is supplied.
  DEFAULT_JUDGE_MODEL = "qwen3.5-flash"
  # NOTE(review): presumably turns on the judge model's "thinking" mode; the consumer
  # is not visible in this file — confirm against the judge client.
  DEFAULT_JUDGE_ENABLE_THINKING = True
a3734f13   tangwang   eval任务 美国地区不支持bat...
40
  # Batch mode for judge calls is disabled by default.
  # NOTE(review): the commit subject hints that batch is unsupported in the US region — confirm before enabling.
  DEFAULT_JUDGE_DASHSCOPE_BATCH = False
bdb65283   tangwang   标注框架 批量标注
41
42
  # Batch-job settings. NOTE(review): presumably only consulted when batch mode is
  # enabled — confirm against the judge client.
  DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
  DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0  # seconds, per the ``_SEC`` suffix
d172c259   tangwang   eval框架
43
  
dedd31c5   tangwang   1. 搜索 recall 池「1 ...
44
45
46
47
  # --- Rebuild annotation pool (``build --force-refresh-labels``) ---
  # Flow: search recall pool (rerank_score=1, no rerank API) + rerank rest of corpus +
  # LLM labels in fixed-size batches along global order (see ``framework._annotate_rebuild_batches``).
  # Top-K size of the search recall pool used by the rebuild flow.
  DEFAULT_SEARCH_RECALL_TOP_K = 200
d172c259   tangwang   eval框架
48
49
50
  # NOTE(review): the two rerank knobs below are consumed outside this file; reading them
  # as a score cut-off and a skip count is inferred from the names only — confirm in ``framework``.
  DEFAULT_RERANK_HIGH_THRESHOLD = 0.5
  DEFAULT_RERANK_HIGH_SKIP_COUNT = 1000
  # Maximum number of docs labelled in a single LLM batch.
  DEFAULT_REBUILD_LLM_BATCH_SIZE = 50
dedd31c5   tangwang   1. 搜索 recall 池「1 ...
51
52
53
  # At least this many LLM batches run before early-stop is considered
  # (with the defaults: 10 batches x 50 docs = up to 500 docs).
  DEFAULT_REBUILD_MIN_LLM_BATCHES = 10
  # Hard cap on LLM batches per query (each batch labels up to ``DEFAULT_REBUILD_LLM_BATCH_SIZE`` docs).
d172c259   tangwang   eval框架
54
  DEFAULT_REBUILD_MAX_LLM_BATCHES = 40  # with the default batch size of 50: at most 2000 docs per query
dedd31c5   tangwang   1. 搜索 recall 池「1 ...
55
56
  
  # LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed):
35ae3b29   tangwang   批量评估框架,召回参数修改和llm...
57
58
59
  # A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``):
  #   - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO  (default 93.9%),
  #   - (Irrelevant + Low Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO  (default 95.9%).
dedd31c5   tangwang   1. 搜索 recall 池「1 ...
60
  # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant").
35ae3b29   tangwang   批量评估框架,召回参数修改和llm...
61
62
63
64
65
  # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak
  # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
  DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939  # bad-batch test: Irrelevant / n must be strictly greater
  DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959  # bad-batch test: (Irrelevant + Low Relevant) / n must be strictly greater
  DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3  # consecutive bad batches that trigger the stop