"""Paths and shared constants for search evaluation.""" from pathlib import Path _PKG_DIR = Path(__file__).resolve().parent _SCRIPTS_EVAL_DIR = _PKG_DIR.parent PROJECT_ROOT = _SCRIPTS_EVAL_DIR.parents[1] # Canonical English labels (must match LLM prompt output in prompts._CLASSIFY_TEMPLATE_EN) RELEVANCE_EXACT = "Exact Match" RELEVANCE_HIGH = "High Relevant" RELEVANCE_LOW = "Low Relevant" RELEVANCE_IRRELEVANT = "Irrelevant" VALID_LABELS = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW, RELEVANCE_IRRELEVANT}) # Precision / MAP "positive" set (all non-irrelevant tiers) RELEVANCE_NON_IRRELEVANT = frozenset({RELEVANCE_EXACT, RELEVANCE_HIGH, RELEVANCE_LOW}) _LEGACY_LABEL_MAP = { "Exact": RELEVANCE_EXACT, "Partial": RELEVANCE_HIGH, } def normalize_stored_label(label: str) -> str: """Map legacy 3-way SQLite labels to current 4-way strings; pass through canonical labels.""" s = str(label).strip() if s in VALID_LABELS: return s return _LEGACY_LABEL_MAP.get(s, s) DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs) DEFAULT_JUDGE_MODEL = "qwen3.5-flash" DEFAULT_JUDGE_ENABLE_THINKING = True DEFAULT_JUDGE_DASHSCOPE_BATCH = False DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0 # --- Rebuild annotation pool (``build --force-refresh-labels``) --- # Flow: search recall pool (rerank_score=1, no rerank API) + rerank rest of corpus + # LLM labels in fixed-size batches along global order (see ``framework._annotate_rebuild_batches``). DEFAULT_SEARCH_RECALL_TOP_K = 200 DEFAULT_RERANK_HIGH_THRESHOLD = 0.5 DEFAULT_RERANK_HIGH_SKIP_COUNT = 1000 DEFAULT_REBUILD_LLM_BATCH_SIZE = 50 # At least this many LLM batches run before early-stop is considered. 
DEFAULT_REBUILD_MIN_LLM_BATCHES = 10
# Hard cap on LLM batches per query (each batch labels up to
# ``DEFAULT_REBUILD_LLM_BATCH_SIZE`` docs).
DEFAULT_REBUILD_MAX_LLM_BATCHES = 40

# LLM early-stop (only after ``DEFAULT_REBUILD_MIN_LLM_BATCHES`` completed):
# A batch is "bad" when **both** hold (strict inequalities; see
# ``framework._annotate_rebuild_batches``):
#   - irrelevant_ratio > DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO (default 93.9%),
#   - (Irrelevant + Low Relevant) / n > DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO
#     (default 95.9%).
# ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW``
# ("Low Relevant"). Increment streak on consecutive bad batches; reset on any
# non-bad batch. Stop when streak reaches
# ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939
DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959
DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3