Blame view

scripts/evaluation/eval_framework/constants.py 3.71 KB
c81b0fc1   tangwang   scripts/evaluatio...
1
2
3
4
5
6
7
8
  """Paths and shared constants for search evaluation."""
  
  from pathlib import Path
  
  _PKG_DIR = Path(__file__).resolve().parent
  _SCRIPTS_EVAL_DIR = _PKG_DIR.parent
  PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
  
a345b01f   tangwang   eval framework
9
  # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN)
d73ca84a   tangwang   refine eval case ...
10
11
12
13
  # Listed from the lowest level (LV0) to the highest (LV3).
  RELEVANCE_LV0 = "Irrelevant"  # level 0
  RELEVANCE_LV1 = "Weakly Relevant"  # level 1
  RELEVANCE_LV2 = "Mostly Relevant"  # level 2
  RELEVANCE_LV3 = "Fully Relevant"  # level 3
a345b01f   tangwang   eval framework
14
  
d73ca84a   tangwang   refine eval case ...
15
  # Every label the framework accepts, as an immutable set.
  VALID_LABELS = frozenset((RELEVANCE_LV0, RELEVANCE_LV1, RELEVANCE_LV2, RELEVANCE_LV3))
a345b01f   tangwang   eval framework
16
  
7ddd4cb3   tangwang   评估体系从三等级->四等级 Exa...
17
  # Useful label sets for binary diagnostic slices layered on top of graded ranking metrics.
d73ca84a   tangwang   refine eval case ...
18
19
  # "Strong" is the top two levels; "non-irrelevant" additionally admits weak matches.
  RELEVANCE_STRONG = frozenset((RELEVANCE_LV2, RELEVANCE_LV3))
  RELEVANCE_NON_IRRELEVANT = RELEVANCE_STRONG | frozenset((RELEVANCE_LV1,))
7ddd4cb3   tangwang   评估体系从三等级->四等级 Exa...
20
21
22
23
  
  # Graded relevance for ranking evaluation.
  # Relevance grades are 3/2/1/0. NDCG-style metrics conventionally use gain = 2^rel - 1,
  # but RELEVANCE_GAIN_MAP currently uses a linear gain (gain = rel).
  RELEVANCE_GRADE_MAP = {
d73ca84a   tangwang   refine eval case ...
24
25
26
27
      RELEVANCE_LV3: 3,
      RELEVANCE_LV2: 2,
      RELEVANCE_LV1: 1,
      RELEVANCE_LV0: 0,
7ddd4cb3   tangwang   评估体系从三等级->四等级 Exa...
28
  }
a6d51aa7   tangwang   eval
29
30
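  # The standard gain formula is 2^rel - 1.
  # Because annotation quality is not very precise, we deliberately reduce the separation
  # between the top two grades (Fully / Mostly Relevant) by using a linear gain (gain = grade).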
7ddd4cb3   tangwang   评估体系从三等级->四等级 Exa...
31
  RELEVANCE_GAIN_MAP = {
a6d51aa7   tangwang   eval
32
33
      # label: (2 ** grade) - 1
      label: grade
7ddd4cb3   tangwang   评估体系从三等级->四等级 Exa...
34
35
      for label, grade in RELEVANCE_GRADE_MAP.items()
  }
a345b01f   tangwang   eval framework
36
  
30b490e1   tangwang   添加ERR评估指标
37
38
  # P(stop | relevance) for ERR (Expected Reciprocal Rank); cascade model (Chapelle et al., 2009).
  STOP_PROB_MAP = {
d73ca84a   tangwang   refine eval case ...
39
40
41
42
      RELEVANCE_LV3: 0.99,
      RELEVANCE_LV2: 0.8,
      RELEVANCE_LV1: 0.1,
      RELEVANCE_LV0: 0.0,
30b490e1   tangwang   添加ERR评估指标
43
44
  }
  
c81b0fc1   tangwang   scripts/evaluatio...
45
46
47
  # Default artifact directory (under the project root) and default query list
  # (under the scripts/evaluation directory).
  DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT.joinpath("artifacts", "search_evaluation")
  DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR.joinpath("queries", "queries.txt")
  
cdd8ee3a   tangwang   eval框架日志独立
48
49
50
51
52
53
  # Logging (``build_annotation_set.py`` / ``serve_eval_web.py`` → ``eval_framework.cli.main``)
  # Both log files live under ``<PROJECT_ROOT>/logs``; the verbose log gets its own subdirectory.
  EVAL_LOG_DIR = PROJECT_ROOT.joinpath("logs")
  EVAL_LOG_FILE = EVAL_LOG_DIR.joinpath("eval.log")
  EVAL_VERBOSE_LOG_DIR = EVAL_LOG_DIR.joinpath("verbose")
  EVAL_VERBOSE_LOG_FILE = EVAL_VERBOSE_LOG_DIR.joinpath("eval_verbose.log")
  
bdb65283   tangwang   标注框架 批量标注
54
  # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
cdd8ee3a   tangwang   eval框架日志独立
55
56
  # Default model name for the judge LLM.
  DEFAULT_JUDGE_MODEL: str = "qwen3.5-plus"
  # NOTE(review): presumably toggles the judge model's "thinking" mode — confirm against the judge client.
  DEFAULT_JUDGE_ENABLE_THINKING: bool = False
a3734f13   tangwang   eval任务 美国地区不支持bat...
57
  # Off by default. NOTE(review): presumably selects a DashScope batch code path for the judge — confirm.
  DEFAULT_JUDGE_DASHSCOPE_BATCH: bool = False
cdd8ee3a   tangwang   eval框架日志独立
58
59
  
  # Query-intent LLM (separate from judge; used once per query, injected into relevance prompts)
331861d5   tangwang   eval框架配置化
60
  # Default model name for the query-intent LLM (configured separately from the judge model).
  DEFAULT_INTENT_MODEL: str = "qwen3-max"
cdd8ee3a   tangwang   eval框架日志独立
61
  # NOTE(review): presumably toggles the intent model's "thinking" mode — confirm against the intent client.
  DEFAULT_INTENT_ENABLE_THINKING: bool = True
bdb65283   tangwang   标注框架 批量标注
62
63
  # Judge batch-mode defaults.
  # NOTE(review): presumably the batch job's completion window and the polling period in
  # seconds — confirm against the batch client.
  DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW: str = "24h"
  DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC: float = 10.0
d172c259   tangwang   eval框架
64
  
dedd31c5   tangwang   1. 搜索 recall 池「1 ...
65
66
67
68
  # --- Rebuild annotation pool (``build --force-refresh-labels``) ---
  # Flow: search recall pool (rerank_score=1, no rerank API) + rerank rest of corpus +
  # LLM labels in fixed-size batches along global order (see ``framework._annotate_rebuild_batches``).
  # NOTE(review): presumably the number of top search hits that form the recall pool — confirm in framework.
  DEFAULT_SEARCH_RECALL_TOP_K: int = 200
d172c259   tangwang   eval框架
69
70
71
  # NOTE(review): presumably a rerank score above this threshold counts as "high", and the
  # skip count bounds how many such docs are tolerated — confirm against the framework code.
  DEFAULT_RERANK_HIGH_THRESHOLD: float = 0.5
  DEFAULT_RERANK_HIGH_SKIP_COUNT: int = 1000
  # Maximum number of docs labelled by the LLM in one batch.
  DEFAULT_REBUILD_LLM_BATCH_SIZE: int = 50
dedd31c5   tangwang   1. 搜索 recall 池「1 ...
72
73
74
  # At least this many LLM batches run before early-stop is considered.
  DEFAULT_REBUILD_MIN_LLM_BATCHES: int = 10
  # Hard cap on LLM batches per query (each batch labels up to ``DEFAULT_REBUILD_LLM_BATCH_SIZE`` docs).
d172c259   tangwang   eval框架
75
  DEFAULT_REBUILD_MAX_LLM_BATCHES: int = 40
dedd31c5   tangwang   1. 搜索 recall 池「1 ...
76
77
  
  # LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed):
35ae3b29   tangwang   批量评估框架,召回参数修改和llm...
78
79
  # A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``):
  #   - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO  (default 79.9%),
441f049d   tangwang   评测体系优化,以及
80
  #   - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO  (default 95.9%).
d73ca84a   tangwang   refine eval case ...
81
  # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LV1`` ("Weakly Relevant").
35ae3b29   tangwang   批量评估框架,召回参数修改和llm...
82
83
  # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak
  # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
cdd8ee3a   tangwang   eval框架日志独立
84
  # Strict lower bound on a batch's Irrelevant ratio for the batch to count as "bad".
  DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO: float = 0.799
35ae3b29   tangwang   批量评估框架,召回参数修改和llm...
85
86
  # Strict lower bound on (Irrelevant + Weakly Relevant) / n for a batch to count as "bad".
  DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO: float = 0.959
  # Number of consecutive bad batches that triggers the early stop.
  DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK: int = 3