c81b0fc1
tangwang
scripts/evaluatio...
|
1
2
3
4
5
6
7
8
|
"""Paths and shared constants for search evaluation."""
from pathlib import Path
# Directory containing this module, with symlinks resolved to an absolute path.
_PKG_DIR = Path(__file__).resolve().parent
# Immediate parent of the package directory.
_SCRIPTS_EVAL_DIR = _PKG_DIR.parent
# ``parents[1]`` is the grandparent of ``_SCRIPTS_EVAL_DIR``, i.e. three levels above ``_PKG_DIR``.
# NOTE(review): this hard-codes the package's depth in the repository; moving the package breaks it.
PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
|
a345b01f
tangwang
eval framework
|
9
|
# Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN)
|
d73ca84a
tangwang
refine eval case ...
|
10
11
12
13
|
# The four relevance labels, listed from strongest (LV3) to weakest (LV0).
# These strings are compared verbatim elsewhere in this module, so they must not be reworded casually.
RELEVANCE_LV3 = "Fully Relevant"
RELEVANCE_LV2 = "Mostly Relevant"
RELEVANCE_LV1 = "Weakly Relevant"
RELEVANCE_LV0 = "Irrelevant"
|
a345b01f
tangwang
eval framework
|
14
|
|
d73ca84a
tangwang
refine eval case ...
|
15
|
# Immutable set holding all four relevance labels.
VALID_LABELS = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1, RELEVANCE_LV0})
|
a345b01f
tangwang
eval framework
|
16
|
|
7ddd4cb3
tangwang
评估体系从三等级->四等级 Exa...
|
17
|
# Useful label sets for binary diagnostic slices layered on top of graded ranking metrics.
|
d73ca84a
tangwang
refine eval case ...
|
18
19
|
# Every label except ``RELEVANCE_LV0`` ("Irrelevant").
RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_LV3, RELEVANCE_LV2, RELEVANCE_LV1})
# Only the two strongest labels (LV3 and LV2).
RELEVANCE_STRONG = frozenset({RELEVANCE_LV3, RELEVANCE_LV2})
|
7ddd4cb3
tangwang
评估体系从三等级->四等级 Exa...
|
20
21
22
23
|
# Graded relevance for ranking evaluation.
# Each label maps to an integer grade 3/2/1/0. The textbook NDCG gain would be 2^rel - 1,
# but this module's ``RELEVANCE_GAIN_MAP`` deliberately uses the grade itself as the gain.
RELEVANCE_GRADE_MAP = {
    RELEVANCE_LV3: 3,
    RELEVANCE_LV2: 2,
    RELEVANCE_LV1: 1,
    RELEVANCE_LV0: 0,
}
|
a6d51aa7
tangwang
eval
|
29
30
|
# The standard gain formula is 2^rel - 1, i.e. ``(2 ** grade) - 1``.
# Because annotation quality is not very precise, a linear gain (gain == grade) is used
# instead, which reduces the separation between the top two relevance grades.
# An independent copy is made so that mutating one map never affects the other.
RELEVANCE_GAIN_MAP = dict(RELEVANCE_GRADE_MAP)
|
a345b01f
tangwang
eval framework
|
36
|
|
30b490e1
tangwang
添加ERR评估指标
|
37
|
# Stop probabilities P(stop | relevance) for ERR (Expected Reciprocal Rank), following the
# cascade model (Chapelle et al., 2009): p(t) = (2^t - 1) / 2^{max_grade}.
# With grades 3/2/1/0 this evaluates to exactly 0.875 / 0.375 / 0.125 / 0.0.
STOP_PROB_MAP = {
    label: (2 ** grade - 1) / 2 ** max(RELEVANCE_GRADE_MAP.values())
    for label, grade in RELEVANCE_GRADE_MAP.items()
}
|
c81b0fc1
tangwang
scripts/evaluatio...
|
46
47
48
|
# Default artifact directory: ``<PROJECT_ROOT>/artifacts/search_evaluation``.
DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
# Default query list: ``queries/queries.txt`` under the parent of this package directory.
# NOTE(review): presumably one query per line — confirm against the loader.
DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
|
cdd8ee3a
tangwang
eval框架日志独立
|
49
50
51
52
53
54
|
# Logging (``build_annotation_set.py`` / ``serve_eval_web.py`` → ``eval_framework.cli.main``)
# All log paths live under ``<PROJECT_ROOT>/logs``; the verbose log gets its own ``verbose`` subdirectory.
# These are path constants only; nothing in this module creates the directories.
EVAL_LOG_DIR = PROJECT_ROOT / "logs"
EVAL_VERBOSE_LOG_DIR = EVAL_LOG_DIR / "verbose"
EVAL_LOG_FILE = EVAL_LOG_DIR / "eval.log"
EVAL_VERBOSE_LOG_FILE = EVAL_VERBOSE_LOG_DIR / "eval_verbose.log"
|
bdb65283
tangwang
标注框架 批量标注
|
55
|
# Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
|
cdd8ee3a
tangwang
eval框架日志独立
|
56
57
|
# Default model name for the judge LLM.
DEFAULT_JUDGE_MODEL = "qwen3.5-plus"
# NOTE(review): presumably toggles the model's "thinking" mode for judge calls; off by default — confirm in the client.
DEFAULT_JUDGE_ENABLE_THINKING = False
|
a3734f13
tangwang
eval任务 美国地区不支持bat...
|
58
|
# NOTE(review): presumably selects the DashScope batch API for judge calls; disabled by default — confirm in the caller.
DEFAULT_JUDGE_DASHSCOPE_BATCH = False
|
cdd8ee3a
tangwang
eval框架日志独立
|
59
60
|
# Query-intent LLM (separate from judge; used once per query, injected into relevance prompts)
|
331861d5
tangwang
eval框架配置化
|
61
|
# Default model name for the query-intent LLM (a different model from the judge default).
DEFAULT_INTENT_MODEL = "qwen3-max"
|
cdd8ee3a
tangwang
eval框架日志独立
|
62
|
# NOTE(review): presumably enables "thinking" mode for the intent model; on by default, unlike the judge — confirm in the client.
DEFAULT_INTENT_ENABLE_THINKING = True
|
bdb65283
tangwang
标注框架 批量标注
|
63
64
|
# NOTE(review): presumably the completion window passed to the batch API for judge jobs — confirm the accepted format.
DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
# Seconds between polls, per the ``_SEC`` suffix. NOTE(review): presumably polls batch-job status — confirm in the caller.
DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0
|
d172c259
tangwang
eval框架
|
65
|
|
dedd31c5
tangwang
1. 搜索 recall 池「1 ...
|
66
67
68
69
|
# --- Rebuild annotation pool (``build --force-refresh-labels``) ---
# Flow: search recall pool (rerank_score=1, no rerank API) + rerank rest of corpus +
# LLM labels in fixed-size batches along global order (see ``framework._annotate_rebuild_batches``).
# NOTE(review): presumably the number of top search hits taken into the recall pool — confirm in the framework.
DEFAULT_SEARCH_RECALL_TOP_K = 200
|
d172c259
tangwang
eval框架
|
70
71
72
|
# NOTE(review): presumably the rerank score above which a document counts as "high" — confirm in the framework.
DEFAULT_RERANK_HIGH_THRESHOLD = 0.5
# NOTE(review): presumably a count of "high" documents that triggers skipping — exact semantics not visible here.
DEFAULT_RERANK_HIGH_SKIP_COUNT = 1000
# Number of documents sent to the LLM per labelling batch (upper bound per batch).
DEFAULT_REBUILD_LLM_BATCH_SIZE = 50
|
dedd31c5
tangwang
1. 搜索 recall 池「1 ...
|
73
74
75
|
# At least this many LLM batches run before early-stop is considered.
# With the default batch size of 50 that is up to 500 labelled documents per query.
DEFAULT_REBUILD_MIN_LLM_BATCHES = 10
# Hard cap on LLM batches per query (each batch labels up to ``DEFAULT_REBUILD_LLM_BATCH_SIZE`` docs).
|
d172c259
tangwang
eval框架
|
76
|
DEFAULT_REBUILD_MAX_LLM_BATCHES = 40  # with the default batch size of 50: at most 2000 labelled docs per query
|
dedd31c5
tangwang
1. 搜索 recall 池「1 ...
|
77
78
|
# LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed):
|
35ae3b29
tangwang
批量评估框架,召回参数修改和llm...
|
79
80
|
# A batch is "bad" when **both** hold (strict inequalities; see ``framework._annotate_rebuild_batches``):
#   - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 79.9%),
|
441f049d
tangwang
评测体系优化,以及
|
81
|
# - (Irrelevant + Weakly Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO (default 95.9%).
|
d73ca84a
tangwang
refine eval case ...
|
82
|
# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LV1`` ("Weakly Relevant").
|
35ae3b29
tangwang
批量评估框架,召回参数修改和llm...
|
83
84
|
# Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak
# reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
|
cdd8ee3a
tangwang
eval框架日志独立
|
85
|
# Strict lower bound (79.9%) on the Irrelevant share of a batch for the "bad batch" test.
DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799
|
35ae3b29
tangwang
批量评估框架,召回参数修改和llm...
|
86
87
|
# Strict lower bound (95.9%) on the combined (Irrelevant + Weakly Relevant) share of a batch.
DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959
# Number of consecutive bad batches that triggers the early stop.
DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3
|