c81b0fc1
tangwang
scripts/evaluatio...
|
1
2
3
4
5
6
7
8
|
"""Paths and shared constants for search evaluation."""
from pathlib import Path
# Absolute directory that contains this file (symlinks resolved by Path.resolve()).
_PKG_DIR = Path(__file__).resolve().parent
# Directory one level above _PKG_DIR.
_SCRIPTS_EVAL_DIR = _PKG_DIR.parent
# Two levels above _SCRIPTS_EVAL_DIR (parents[1] == .parent.parent).
# NOTE(review): presumably the repository root -- confirm against the checkout layout.
PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1]
|
a345b01f
tangwang
eval framework
|
9
10
11
12
|
# Canonical English relevance labels. These strings must stay identical to the
# labels the LLM is asked to emit (see prompts._CLASSIFY_TEMPLATE_EN).
RELEVANCE_EXACT: str = "Exact Match"
RELEVANCE_HIGH: str = "High Relevant"
RELEVANCE_LOW: str = "Low Relevant"
|
c81b0fc1
tangwang
scripts/evaluatio...
|
13
|
# Canonical label for the "not relevant" tier.
RELEVANCE_IRRELEVANT: str = "Irrelevant"
|
a345b01f
tangwang
eval framework
|
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
# The complete set of canonical labels (all four tiers).
VALID_LABELS = frozenset(
    (
        RELEVANCE_EXACT,
        RELEVANCE_HIGH,
        RELEVANCE_LOW,
        RELEVANCE_IRRELEVANT,
    )
)

# Precision / MAP "positive" set: every tier except RELEVANCE_IRRELEVANT.
RELEVANCE_NON_IRRELEVANT = frozenset(
    (
        RELEVANCE_EXACT,
        RELEVANCE_HIGH,
        RELEVANCE_LOW,
    )
)

# Translation table from legacy stored label strings to canonical labels.
_LEGACY_LABEL_MAP = {
    "Exact": RELEVANCE_EXACT,
    "Partial": RELEVANCE_HIGH,
}
def normalize_stored_label(label: str) -> str:
    """Return the canonical 4-way label for a stored (possibly legacy) label.

    The input is coerced with ``str`` and stripped of surrounding whitespace.
    A value that is already one of ``VALID_LABELS`` is returned as-is; a value
    found in ``_LEGACY_LABEL_MAP`` is translated to its canonical label; any
    other value is returned stripped but otherwise unchanged.
    """
    cleaned = str(label).strip()
    # Canonical labels take precedence over the legacy table.
    if cleaned not in VALID_LABELS and cleaned in _LEGACY_LABEL_MAP:
        return _LEGACY_LABEL_MAP[cleaned]
    return cleaned
|
c81b0fc1
tangwang
scripts/evaluatio...
|
33
34
35
36
|
# Default artifact directory: <PROJECT_ROOT>/artifacts/search_evaluation
DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT.joinpath("artifacts", "search_evaluation")
# Default query file: <_SCRIPTS_EVAL_DIR>/queries/queries.txt
DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR.joinpath("queries", "queries.txt")
|
bdb65283
tangwang
标注框架 批量标注
|
37
38
39
40
41
42
|
# Defaults for the judge LLM (eval_framework only). Callers can override them
# via the CLI (--judge-model) or constructor kwargs.
DEFAULT_JUDGE_MODEL: str = "qwen3.5-flash"
DEFAULT_JUDGE_ENABLE_THINKING: bool = True
DEFAULT_JUDGE_DASHSCOPE_BATCH: bool = True
DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW: str = "24h"
DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC: float = 10.0
|
d172c259
tangwang
eval框架
|
43
44
45
46
47
48
|
# Defaults for rebuilding the annotation pool (build --force-refresh-labels):
# search recall, then full-corpus rerank, then LLM batches.
DEFAULT_SEARCH_RECALL_TOP_K: int = 500
DEFAULT_RERANK_HIGH_THRESHOLD: float = 0.5
DEFAULT_RERANK_HIGH_SKIP_COUNT: int = 1000
DEFAULT_REBUILD_LLM_BATCH_SIZE: int = 50
|
167f33b4
tangwang
eval框架前端
|
49
|
# Default minimum number of LLM batches for a rebuild.
DEFAULT_REBUILD_MIN_LLM_BATCHES: int = 20
|
d172c259
tangwang
eval框架
|
50
51
52
|
# Default maximum number of LLM batches for a rebuild.
DEFAULT_REBUILD_MAX_LLM_BATCHES: int = 40
# Defaults for the irrelevant-ratio stop setting and its streak length.
# NOTE(review): exact stop semantics live in the consumer of these values -- confirm there.
DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO: float = 0.92
DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK: int = 3
|