Commit 331861d5449c3ad25cbc6d89df780b0299735fb5

Authored by tangwang
1 parent 1c2ba48e

eval框架配置化

config/config.yaml
@@ -64,6 +64,41 @@ assets:
64 product_enrich: 64 product_enrich:
65 max_workers: 40 65 max_workers: 40
66 66
  67 +# 离线 / Web 相关性评估(scripts/evaluation、eval-web)
  68 +# CLI 未显式传参时使用此处默认值;search_base_url 未配置时自动为 http://127.0.0.1:{runtime.api_port}
  69 +search_evaluation:
  70 + artifact_root: artifacts/search_evaluation
  71 + queries_file: scripts/evaluation/queries/queries.txt
  72 + eval_log_dir: logs
  73 + default_tenant_id: '163'
  74 + search_base_url: ''
  75 + web_host: 0.0.0.0
  76 + web_port: 6010
  77 + judge_model: qwen3.5-plus
  78 + judge_enable_thinking: false
  79 + judge_dashscope_batch: false
  80 + intent_model: qwen3-max
  81 + intent_enable_thinking: true
  82 + judge_batch_completion_window: 24h
  83 + judge_batch_poll_interval_sec: 10.0
  84 + build_search_depth: 1000
  85 + build_rerank_depth: 10000
  86 + annotate_search_top_k: 120
  87 + annotate_rerank_top_k: 200
  88 + batch_top_k: 100
  89 + audit_top_k: 100
  90 + audit_limit_suspicious: 5
  91 + default_language: en
  92 + search_recall_top_k: 200
  93 + rerank_high_threshold: 0.5
  94 + rerank_high_skip_count: 1000
  95 + rebuild_llm_batch_size: 50
  96 + rebuild_min_llm_batches: 10
  97 + rebuild_max_llm_batches: 40
  98 + rebuild_irrelevant_stop_ratio: 0.799
  99 + rebuild_irrel_low_combined_stop_ratio: 0.959
  100 + rebuild_irrelevant_stop_streak: 3
  101 +
67 # ES Index Settings (基础设置) 102 # ES Index Settings (基础设置)
68 es_settings: 103 es_settings:
69 number_of_shards: 1 104 number_of_shards: 1
@@ -75,7 +110,9 @@ es_settings:
75 # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 110 # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
76 field_boosts: 111 field_boosts:
77 title: 3.0 112 title: 3.0
78 - qanchors: 2.5 113 + qanchors: 2.3
  114 + enriched_tags: 2.3
  115 + keywords: 2.0
79 tags: 2.0 116 tags: 2.0
80 category_name_text: 2.0 117 category_name_text: 2.0
81 category_path: 2.0 118 category_path: 2.0
@@ -152,7 +189,11 @@ query_config:
152 multilingual_fields: 189 multilingual_fields:
153 - title 190 - title
154 - qanchors 191 - qanchors
  192 + - keywords
155 - enriched_tags 193 - enriched_tags
  194 + - option1_values
  195 + - option2_values
  196 + - option3_values
156 - category_path 197 - category_path
157 - category_name_text 198 - category_name_text
158 - brief 199 - brief
@@ -46,6 +46,7 @@ from config.schema import ( @@ -46,6 +46,7 @@ from config.schema import (
46 RerankServiceInstanceConfig, 46 RerankServiceInstanceConfig,
47 RuntimeConfig, 47 RuntimeConfig,
48 SearchConfig, 48 SearchConfig,
  49 + SearchEvaluationConfig,
49 SecretsConfig, 50 SecretsConfig,
50 ServicesConfig, 51 ServicesConfig,
51 SPUConfig, 52 SPUConfig,
@@ -263,6 +264,7 @@ class AppConfigLoader: @@ -263,6 +264,7 @@ class AppConfigLoader:
263 product_enrich_config = ProductEnrichConfig( 264 product_enrich_config = ProductEnrichConfig(
264 max_workers=int(product_enrich_raw.get("max_workers", 40)), 265 max_workers=int(product_enrich_raw.get("max_workers", 40)),
265 ) 266 )
  267 + search_evaluation_config = self._build_search_evaluation_config(raw, runtime_config)
266 268
267 metadata = ConfigMetadata( 269 metadata = ConfigMetadata(
268 loaded_files=tuple(loaded_files), 270 loaded_files=tuple(loaded_files),
@@ -278,6 +280,7 @@ class AppConfigLoader: @@ -278,6 +280,7 @@ class AppConfigLoader:
278 services=services_config, 280 services=services_config,
279 tenants=tenants_config, 281 tenants=tenants_config,
280 assets=AssetsConfig(query_rewrite_dictionary_path=rewrite_path), 282 assets=AssetsConfig(query_rewrite_dictionary_path=rewrite_path),
  283 + search_evaluation=search_evaluation_config,
281 metadata=metadata, 284 metadata=metadata,
282 ) 285 )
283 286
@@ -290,6 +293,7 @@ class AppConfigLoader: @@ -290,6 +293,7 @@ class AppConfigLoader:
290 services=app_config.services, 293 services=app_config.services,
291 tenants=app_config.tenants, 294 tenants=app_config.tenants,
292 assets=app_config.assets, 295 assets=app_config.assets,
  296 + search_evaluation=app_config.search_evaluation,
293 metadata=ConfigMetadata( 297 metadata=ConfigMetadata(
294 loaded_files=app_config.metadata.loaded_files, 298 loaded_files=app_config.metadata.loaded_files,
295 config_hash=config_hash, 299 config_hash=config_hash,
@@ -297,6 +301,89 @@ class AppConfigLoader: @@ -297,6 +301,89 @@ class AppConfigLoader:
297 ), 301 ),
298 ) 302 )
299 303
  304 + def _build_search_evaluation_config(self, raw: Dict[str, Any], runtime: RuntimeConfig) -> SearchEvaluationConfig:
  305 + se = raw.get("search_evaluation") if isinstance(raw.get("search_evaluation"), dict) else {}
  306 + default_artifact = (self.project_root / "artifacts" / "search_evaluation").resolve()
  307 + default_queries = (self.project_root / "scripts" / "evaluation" / "queries" / "queries.txt").resolve()
  308 + default_log_dir = (self.project_root / "logs").resolve()
  309 + default_search_base = f"http://127.0.0.1:{int(runtime.api_port)}"
  310 +
  311 + def _project_path(value: Any, default: Path) -> Path:
  312 + if value in (None, ""):
  313 + return default
  314 + candidate = Path(str(value))
  315 + if candidate.is_absolute():
  316 + return candidate.resolve()
  317 + return (self.project_root / candidate).resolve()
  318 +
  319 + def _str(key: str, default: str) -> str:
  320 + v = se.get(key)
  321 + if v is None or (isinstance(v, str) and not v.strip()):
  322 + return default
  323 + return str(v).strip()
  324 +
  325 + def _int(key: str, default: int) -> int:
  326 + v = se.get(key)
  327 + if v is None:
  328 + return default
  329 + return int(v)
  330 +
  331 + def _float(key: str, default: float) -> float:
  332 + v = se.get(key)
  333 + if v is None:
  334 + return default
  335 + return float(v)
  336 +
  337 + def _bool(key: str, default: bool) -> bool:
  338 + v = se.get(key)
  339 + if v is None:
  340 + return default
  341 + if isinstance(v, bool):
  342 + return v
  343 + if isinstance(v, str):
  344 + return v.strip().lower() in {"1", "true", "yes", "on"}
  345 + return bool(v)
  346 +
  347 + raw_search_url = se.get("search_base_url")
  348 + if raw_search_url is None or (isinstance(raw_search_url, str) and not str(raw_search_url).strip()):
  349 + search_base_url = default_search_base
  350 + else:
  351 + search_base_url = str(raw_search_url).strip()
  352 +
  353 + return SearchEvaluationConfig(
  354 + artifact_root=_project_path(se.get("artifact_root"), default_artifact),
  355 + queries_file=_project_path(se.get("queries_file"), default_queries),
  356 + eval_log_dir=_project_path(se.get("eval_log_dir"), default_log_dir),
  357 + default_tenant_id=_str("default_tenant_id", "163"),
  358 + search_base_url=search_base_url,
  359 + web_host=_str("web_host", "0.0.0.0"),
  360 + web_port=_int("web_port", 6010),
  361 + judge_model=_str("judge_model", "qwen3.5-plus"),
  362 + judge_enable_thinking=_bool("judge_enable_thinking", False),
  363 + judge_dashscope_batch=_bool("judge_dashscope_batch", False),
  364 + intent_model=_str("intent_model", "qwen3-max"),
  365 + intent_enable_thinking=_bool("intent_enable_thinking", True),
  366 + judge_batch_completion_window=_str("judge_batch_completion_window", "24h"),
  367 + judge_batch_poll_interval_sec=_float("judge_batch_poll_interval_sec", 10.0),
  368 + build_search_depth=_int("build_search_depth", 1000),
  369 + build_rerank_depth=_int("build_rerank_depth", 10000),
  370 + annotate_search_top_k=_int("annotate_search_top_k", 120),
  371 + annotate_rerank_top_k=_int("annotate_rerank_top_k", 200),
  372 + batch_top_k=_int("batch_top_k", 100),
  373 + audit_top_k=_int("audit_top_k", 100),
  374 + audit_limit_suspicious=_int("audit_limit_suspicious", 5),
  375 + default_language=_str("default_language", "en"),
  376 + search_recall_top_k=_int("search_recall_top_k", 200),
  377 + rerank_high_threshold=_float("rerank_high_threshold", 0.5),
  378 + rerank_high_skip_count=_int("rerank_high_skip_count", 1000),
  379 + rebuild_llm_batch_size=_int("rebuild_llm_batch_size", 50),
  380 + rebuild_min_llm_batches=_int("rebuild_min_llm_batches", 10),
  381 + rebuild_max_llm_batches=_int("rebuild_max_llm_batches", 40),
  382 + rebuild_irrelevant_stop_ratio=_float("rebuild_irrelevant_stop_ratio", 0.799),
  383 + rebuild_irrel_low_combined_stop_ratio=_float("rebuild_irrel_low_combined_stop_ratio", 0.959),
  384 + rebuild_irrelevant_stop_streak=_int("rebuild_irrelevant_stop_streak", 3),
  385 + )
  386 +
300 def _build_search_config(self, raw: Dict[str, Any], rewrite_dictionary: Dict[str, str]) -> SearchConfig: 387 def _build_search_config(self, raw: Dict[str, Any], rewrite_dictionary: Dict[str, str]) -> SearchConfig:
301 field_boosts = raw.get("field_boosts") or {} 388 field_boosts = raw.get("field_boosts") or {}
302 if not isinstance(field_boosts, dict): 389 if not isinstance(field_boosts, dict):
@@ -376,6 +376,43 @@ class AssetsConfig: @@ -376,6 +376,43 @@ class AssetsConfig:
376 376
377 377
@dataclass(frozen=True)
class SearchEvaluationConfig:
    """Offline / web UI search evaluation (YAML: ``search_evaluation``)."""

    # Filesystem locations; the loader resolves these to absolute paths.
    artifact_root: Path
    queries_file: Path
    eval_log_dir: Path
    # Defaults used when the CLI does not pass explicit values.
    default_tenant_id: str
    # Falls back to http://127.0.0.1:{runtime.api_port} when left blank in YAML.
    search_base_url: str
    # eval-web server bind address / port.
    web_host: str
    web_port: int
    # Relevance-judge LLM settings.
    judge_model: str
    judge_enable_thinking: bool
    # True -> DashScope Batch File API; False -> synchronous chat.
    judge_dashscope_batch: bool
    # Query-intent LLM settings (separate model run before relevance judging).
    intent_model: str
    intent_enable_thinking: bool
    # DashScope batch job window (e.g. "24h") and polling cadence in seconds.
    judge_batch_completion_window: str
    judge_batch_poll_interval_sec: float
    # "build" command pooling depths and annotation top-k cutoffs.
    build_search_depth: int
    build_rerank_depth: int
    annotate_search_top_k: int
    annotate_rerank_top_k: int
    # "batch" / "audit" command defaults.
    batch_top_k: int
    audit_top_k: int
    audit_limit_suspicious: int
    default_language: str
    # Label-rebuild controls: recall depth, rerank skip thresholds,
    # LLM batch sizing, and early-stop ratios/streak.
    search_recall_top_k: int
    rerank_high_threshold: float
    rerank_high_skip_count: int
    rebuild_llm_batch_size: int
    rebuild_min_llm_batches: int
    rebuild_max_llm_batches: int
    rebuild_irrelevant_stop_ratio: float
    rebuild_irrel_low_combined_stop_ratio: float
    rebuild_irrelevant_stop_streak: int
  414 +
  415 +@dataclass(frozen=True)
379 class ConfigMetadata: 416 class ConfigMetadata:
380 loaded_files: Tuple[str, ...] 417 loaded_files: Tuple[str, ...]
381 config_hash: str 418 config_hash: str
@@ -393,6 +430,7 @@ class AppConfig: @@ -393,6 +430,7 @@ class AppConfig:
393 services: ServicesConfig 430 services: ServicesConfig
394 tenants: TenantCatalogConfig 431 tenants: TenantCatalogConfig
395 assets: AssetsConfig 432 assets: AssetsConfig
  433 + search_evaluation: SearchEvaluationConfig
396 metadata: ConfigMetadata 434 metadata: ConfigMetadata
397 435
398 def sanitized_dict(self) -> Dict[str, Any]: 436 def sanitized_dict(self) -> Dict[str, Any]:
scripts/evaluation/eval_framework/cli.py
@@ -8,21 +8,6 @@ import logging @@ -8,21 +8,6 @@ import logging
8 from pathlib import Path 8 from pathlib import Path
9 from typing import Any, Dict 9 from typing import Any, Dict
10 10
11 -from .constants import (  
12 - DEFAULT_INTENT_ENABLE_THINKING,  
13 - DEFAULT_INTENT_MODEL,  
14 - DEFAULT_QUERY_FILE,  
15 - DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,  
16 - DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,  
17 - DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,  
18 - DEFAULT_REBUILD_LLM_BATCH_SIZE,  
19 - DEFAULT_REBUILD_MAX_LLM_BATCHES,  
20 - DEFAULT_REBUILD_MIN_LLM_BATCHES,  
21 - DEFAULT_RERANK_HIGH_SKIP_COUNT,  
22 - DEFAULT_RERANK_HIGH_THRESHOLD,  
23 - DEFAULT_SEARCH_RECALL_TOP_K,  
24 -)  
25 -from .constants import EVAL_LOG_FILE  
26 from .framework import SearchEvaluationFramework 11 from .framework import SearchEvaluationFramework
27 from .logging_setup import setup_eval_logging 12 from .logging_setup import setup_eval_logging
28 from .utils import ensure_dir, utc_now_iso, utc_timestamp 13 from .utils import ensure_dir, utc_now_iso, utc_timestamp
@@ -36,19 +21,19 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None: @@ -36,19 +21,19 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
36 "--judge-model", 21 "--judge-model",
37 default=None, 22 default=None,
38 metavar="MODEL", 23 metavar="MODEL",
39 - help="Judge LLM model (default: eval_framework.constants.DEFAULT_JUDGE_MODEL).", 24 + help="Judge LLM model (default: config.yaml search_evaluation.judge_model).",
40 ) 25 )
41 p.add_argument( 26 p.add_argument(
42 "--enable-thinking", 27 "--enable-thinking",
43 action=argparse.BooleanOptionalAction, 28 action=argparse.BooleanOptionalAction,
44 default=None, 29 default=None,
45 - help="enable_thinking for DashScope (default: DEFAULT_JUDGE_ENABLE_THINKING).", 30 + help="enable_thinking for DashScope (default: search_evaluation.judge_enable_thinking).",
46 ) 31 )
47 p.add_argument( 32 p.add_argument(
48 "--dashscope-batch", 33 "--dashscope-batch",
49 action=argparse.BooleanOptionalAction, 34 action=argparse.BooleanOptionalAction,
50 default=None, 35 default=None,
51 - help="DashScope Batch File API vs sync chat (default: DEFAULT_JUDGE_DASHSCOPE_BATCH).", 36 + help="DashScope Batch File API vs sync chat (default: search_evaluation.judge_dashscope_batch).",
52 ) 37 )
53 38
54 39
@@ -57,13 +42,13 @@ def add_intent_llm_args(p: argparse.ArgumentParser) -> None: @@ -57,13 +42,13 @@ def add_intent_llm_args(p: argparse.ArgumentParser) -> None:
57 "--intent-model", 42 "--intent-model",
58 default=None, 43 default=None,
59 metavar="MODEL", 44 metavar="MODEL",
60 - help=f"Query-intent LLM model before relevance judging (default: {DEFAULT_INTENT_MODEL!r}).", 45 + help="Query-intent LLM model before relevance judging (default: search_evaluation.intent_model).",
61 ) 46 )
62 p.add_argument( 47 p.add_argument(
63 "--intent-enable-thinking", 48 "--intent-enable-thinking",
64 action=argparse.BooleanOptionalAction, 49 action=argparse.BooleanOptionalAction,
65 default=None, 50 default=None,
66 - help=f"enable_thinking for intent model (default: {DEFAULT_INTENT_ENABLE_THINKING}).", 51 + help="enable_thinking for intent model (default: search_evaluation.intent_enable_thinking).",
67 ) 52 )
68 53
69 54
@@ -82,17 +67,102 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]: @@ -82,17 +67,102 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]:
82 return kw 67 return kw
83 68
84 69
def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None:
    """Fill None CLI defaults from ``config.yaml`` ``search_evaluation`` (via ``get_app_config()``)."""
    from config.loader import get_app_config

    cfg = get_app_config().search_evaluation

    def fill_blank(attr: str, value: Any) -> None:
        # Missing attribute, None, or empty string all count as "not provided".
        if getattr(args, attr, None) in (None, ""):
            setattr(args, attr, value)

    def fill_none(attr: str, value: Any) -> None:
        if getattr(args, attr, None) is None:
            setattr(args, attr, value)

    # Shared across all sub-commands.
    fill_blank("tenant_id", cfg.default_tenant_id)
    fill_blank("queries_file", str(cfg.queries_file))
    fill_blank("language", cfg.default_language)

    # Sub-command specific defaults (commands are mutually exclusive).
    if args.command == "serve":
        fill_blank("host", cfg.web_host)
        fill_none("port", cfg.web_port)
    elif args.command == "batch":
        fill_none("top_k", cfg.batch_top_k)
    elif args.command == "audit":
        fill_none("top_k", cfg.audit_top_k)
        fill_none("limit_suspicious", cfg.audit_limit_suspicious)
    elif args.command == "build":
        fill_none("search_depth", cfg.build_search_depth)
        fill_none("rerank_depth", cfg.build_rerank_depth)
        fill_none("annotate_search_top_k", cfg.annotate_search_top_k)
        fill_none("annotate_rerank_top_k", cfg.annotate_rerank_top_k)
        fill_none("search_recall_top_k", cfg.search_recall_top_k)
        fill_none("rerank_high_threshold", cfg.rerank_high_threshold)
        fill_none("rerank_high_skip_count", cfg.rerank_high_skip_count)
        fill_none("rebuild_llm_batch_size", cfg.rebuild_llm_batch_size)
        fill_none("rebuild_min_batches", cfg.rebuild_min_llm_batches)
        fill_none("rebuild_max_batches", cfg.rebuild_max_llm_batches)
        fill_none("rebuild_irrelevant_stop_ratio", cfg.rebuild_irrelevant_stop_ratio)
        fill_none("rebuild_irrel_low_combined_stop_ratio", cfg.rebuild_irrel_low_combined_stop_ratio)
        fill_none("rebuild_irrelevant_stop_streak", cfg.rebuild_irrelevant_stop_streak)
  126 +
85 def build_cli_parser() -> argparse.ArgumentParser: 127 def build_cli_parser() -> argparse.ArgumentParser:
86 parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI") 128 parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
87 sub = parser.add_subparsers(dest="command", required=True) 129 sub = parser.add_subparsers(dest="command", required=True)
88 130
89 build = sub.add_parser("build", help="Build pooled annotation set for queries") 131 build = sub.add_parser("build", help="Build pooled annotation set for queries")
90 - build.add_argument("--tenant-id", default="163")  
91 - build.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))  
92 - build.add_argument("--search-depth", type=int, default=1000)  
93 - build.add_argument("--rerank-depth", type=int, default=10000)  
94 - build.add_argument("--annotate-search-top-k", type=int, default=120)  
95 - build.add_argument("--annotate-rerank-top-k", type=int, default=200) 132 + build.add_argument(
  133 + "--tenant-id",
  134 + default=None,
  135 + help="Tenant id (default: search_evaluation.default_tenant_id in config.yaml).",
  136 + )
  137 + build.add_argument(
  138 + "--queries-file",
  139 + default=None,
  140 + help="Query list file (default: search_evaluation.queries_file).",
  141 + )
  142 + build.add_argument(
  143 + "--search-depth",
  144 + type=int,
  145 + default=None,
  146 + help="Default: search_evaluation.build_search_depth.",
  147 + )
  148 + build.add_argument(
  149 + "--rerank-depth",
  150 + type=int,
  151 + default=None,
  152 + help="Default: search_evaluation.build_rerank_depth.",
  153 + )
  154 + build.add_argument(
  155 + "--annotate-search-top-k",
  156 + type=int,
  157 + default=None,
  158 + help="Default: search_evaluation.annotate_search_top_k.",
  159 + )
  160 + build.add_argument(
  161 + "--annotate-rerank-top-k",
  162 + type=int,
  163 + default=None,
  164 + help="Default: search_evaluation.annotate_rerank_top_k.",
  165 + )
96 build.add_argument( 166 build.add_argument(
97 "--search-recall-top-k", 167 "--search-recall-top-k",
98 type=int, 168 type=int,
@@ -118,7 +188,7 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -118,7 +188,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
118 "--rebuild-irrelevant-stop-ratio", 188 "--rebuild-irrelevant-stop-ratio",
119 type=float, 189 type=float,
120 default=None, 190 default=None,
121 - help="Rebuild only: bad batch requires irrelevant_ratio > this (default 0.939).", 191 + help="Rebuild only: bad batch requires irrelevant_ratio > this (default: search_evaluation.rebuild_irrelevant_stop_ratio).",
122 ) 192 )
123 build.add_argument( 193 build.add_argument(
124 "--rebuild-irrel-low-combined-stop-ratio", 194 "--rebuild-irrel-low-combined-stop-ratio",
@@ -132,36 +202,45 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -132,36 +202,45 @@ def build_cli_parser() -> argparse.ArgumentParser:
132 default=None, 202 default=None,
133 help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).", 203 help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).",
134 ) 204 )
135 - build.add_argument("--language", default="en") 205 + build.add_argument(
  206 + "--language",
  207 + default=None,
  208 + help="Default: search_evaluation.default_language.",
  209 + )
136 build.add_argument("--force-refresh-rerank", action="store_true") 210 build.add_argument("--force-refresh-rerank", action="store_true")
137 build.add_argument("--force-refresh-labels", action="store_true") 211 build.add_argument("--force-refresh-labels", action="store_true")
138 add_judge_llm_args(build) 212 add_judge_llm_args(build)
139 add_intent_llm_args(build) 213 add_intent_llm_args(build)
140 214
141 batch = sub.add_parser("batch", help="Run batch evaluation against live search") 215 batch = sub.add_parser("batch", help="Run batch evaluation against live search")
142 - batch.add_argument("--tenant-id", default="163")  
143 - batch.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))  
144 - batch.add_argument("--top-k", type=int, default=100)  
145 - batch.add_argument("--language", default="en") 216 + batch.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.")
  217 + batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
  218 + batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.")
  219 + batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
146 batch.add_argument("--force-refresh-labels", action="store_true") 220 batch.add_argument("--force-refresh-labels", action="store_true")
147 add_judge_llm_args(batch) 221 add_judge_llm_args(batch)
148 add_intent_llm_args(batch) 222 add_intent_llm_args(batch)
149 223
150 audit = sub.add_parser("audit", help="Audit annotation quality for queries") 224 audit = sub.add_parser("audit", help="Audit annotation quality for queries")
151 - audit.add_argument("--tenant-id", default="163")  
152 - audit.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))  
153 - audit.add_argument("--top-k", type=int, default=100)  
154 - audit.add_argument("--language", default="en")  
155 - audit.add_argument("--limit-suspicious", type=int, default=5) 225 + audit.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.")
  226 + audit.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
  227 + audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.")
  228 + audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
  229 + audit.add_argument(
  230 + "--limit-suspicious",
  231 + type=int,
  232 + default=None,
  233 + help="Default: search_evaluation.audit_limit_suspicious.",
  234 + )
156 audit.add_argument("--force-refresh-labels", action="store_true") 235 audit.add_argument("--force-refresh-labels", action="store_true")
157 add_judge_llm_args(audit) 236 add_judge_llm_args(audit)
158 add_intent_llm_args(audit) 237 add_intent_llm_args(audit)
159 238
160 serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") 239 serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
161 - serve.add_argument("--tenant-id", default="163")  
162 - serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))  
163 - serve.add_argument("--host", default="0.0.0.0")  
164 - serve.add_argument("--port", type=int, default=6010) 240 + serve.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.")
  241 + serve.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
  242 + serve.add_argument("--host", default=None, help="Default: search_evaluation.web_host.")
  243 + serve.add_argument("--port", type=int, default=None, help="Default: search_evaluation.web_port.")
165 add_judge_llm_args(serve) 244 add_judge_llm_args(serve)
166 add_intent_llm_args(serve) 245 add_intent_llm_args(serve)
167 246
@@ -175,23 +254,19 @@ def run_build(args: argparse.Namespace) -> None: @@ -175,23 +254,19 @@ def run_build(args: argparse.Namespace) -> None:
175 rebuild_kwargs = {} 254 rebuild_kwargs = {}
176 if args.force_refresh_labels: 255 if args.force_refresh_labels:
177 rebuild_kwargs = { 256 rebuild_kwargs = {
178 - "search_recall_top_k": args.search_recall_top_k if args.search_recall_top_k is not None else DEFAULT_SEARCH_RECALL_TOP_K,  
179 - "rerank_high_threshold": args.rerank_high_threshold if args.rerank_high_threshold is not None else DEFAULT_RERANK_HIGH_THRESHOLD,  
180 - "rerank_high_skip_count": args.rerank_high_skip_count if args.rerank_high_skip_count is not None else DEFAULT_RERANK_HIGH_SKIP_COUNT,  
181 - "rebuild_llm_batch_size": args.rebuild_llm_batch_size if args.rebuild_llm_batch_size is not None else DEFAULT_REBUILD_LLM_BATCH_SIZE,  
182 - "rebuild_min_batches": args.rebuild_min_batches if args.rebuild_min_batches is not None else DEFAULT_REBUILD_MIN_LLM_BATCHES,  
183 - "rebuild_max_batches": args.rebuild_max_batches if args.rebuild_max_batches is not None else DEFAULT_REBUILD_MAX_LLM_BATCHES,  
184 - "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio  
185 - if args.rebuild_irrelevant_stop_ratio is not None  
186 - else DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,  
187 - "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio  
188 - if args.rebuild_irrel_low_combined_stop_ratio is not None  
189 - else DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,  
190 - "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak  
191 - if args.rebuild_irrelevant_stop_streak is not None  
192 - else DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, 257 + "search_recall_top_k": args.search_recall_top_k,
  258 + "rerank_high_threshold": args.rerank_high_threshold,
  259 + "rerank_high_skip_count": args.rerank_high_skip_count,
  260 + "rebuild_llm_batch_size": args.rebuild_llm_batch_size,
  261 + "rebuild_min_batches": args.rebuild_min_batches,
  262 + "rebuild_max_batches": args.rebuild_max_batches,
  263 + "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio,
  264 + "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio,
  265 + "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak,
193 } 266 }
194 - for query in queries: 267 + total_q = len(queries)
  268 + for q_index, query in enumerate(queries, start=1):
  269 + _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query)
195 result = framework.build_query_annotation_set( 270 result = framework.build_query_annotation_set(
196 query=query, 271 query=query,
197 search_depth=args.search_depth, 272 search_depth=args.search_depth,
@@ -230,6 +305,7 @@ def run_build(args: argparse.Namespace) -> None: @@ -230,6 +305,7 @@ def run_build(args: argparse.Namespace) -> None:
230 def run_batch(args: argparse.Namespace) -> None: 305 def run_batch(args: argparse.Namespace) -> None:
231 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) 306 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
232 queries = framework.queries_from_file(Path(args.queries_file)) 307 queries = framework.queries_from_file(Path(args.queries_file))
  308 + _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries))
233 payload = framework.batch_evaluate( 309 payload = framework.batch_evaluate(
234 queries=queries, 310 queries=queries,
235 top_k=args.top_k, 311 top_k=args.top_k,
@@ -302,14 +378,18 @@ def run_serve(args: argparse.Namespace) -> None: @@ -302,14 +378,18 @@ def run_serve(args: argparse.Namespace) -> None:
302 378
303 379
304 def main() -> None: 380 def main() -> None:
305 - setup_eval_logging() 381 + from config.loader import get_app_config
  382 +
  383 + se = get_app_config().search_evaluation
  384 + log_file = setup_eval_logging(se.eval_log_dir)
306 parser = build_cli_parser() 385 parser = build_cli_parser()
307 args = parser.parse_args() 386 args = parser.parse_args()
  387 + _apply_search_evaluation_cli_defaults(args)
308 logging.getLogger("search_eval").info( 388 logging.getLogger("search_eval").info(
309 "CLI start command=%s tenant_id=%s log_file=%s", 389 "CLI start command=%s tenant_id=%s log_file=%s",
310 args.command, 390 args.command,
311 getattr(args, "tenant_id", ""), 391 getattr(args, "tenant_id", ""),
312 - EVAL_LOG_FILE.resolve(), 392 + log_file.resolve(),
313 ) 393 )
314 if args.command == "build": 394 if args.command == "build":
315 run_build(args) 395 run_build(args)
scripts/evaluation/eval_framework/clients.py
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple @@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple
12 12
13 import requests 13 import requests
14 14
15 -from .constants import EVAL_VERBOSE_LOG_FILE, VALID_LABELS 15 +from .constants import VALID_LABELS
16 from .logging_setup import setup_eval_logging 16 from .logging_setup import setup_eval_logging
17 from .prompts import classify_prompt, intent_analysis_prompt 17 from .prompts import classify_prompt, intent_analysis_prompt
18 from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps 18 from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
@@ -23,13 +23,16 @@ _eval_llm_verbose_path_logged = False @@ -23,13 +23,16 @@ _eval_llm_verbose_path_logged = False
23 23
24 24
25 def _get_eval_llm_verbose_logger() -> logging.Logger: 25 def _get_eval_llm_verbose_logger() -> logging.Logger:
26 - """File logger for full LLM prompts/responses → ``logs/verbose/eval_verbose.log``."""  
27 - setup_eval_logging() 26 + """File logger for full LLM prompts/responses under ``search_evaluation.eval_log_dir/verbose/``."""
  27 + from config.loader import get_app_config
  28 +
  29 + se = get_app_config().search_evaluation
  30 + setup_eval_logging(se.eval_log_dir)
28 global _eval_llm_verbose_logger_singleton, _eval_llm_verbose_path_logged 31 global _eval_llm_verbose_logger_singleton, _eval_llm_verbose_path_logged
29 with _VERBOSE_LOGGER_LOCK: 32 with _VERBOSE_LOGGER_LOCK:
30 if _eval_llm_verbose_logger_singleton is not None: 33 if _eval_llm_verbose_logger_singleton is not None:
31 return _eval_llm_verbose_logger_singleton 34 return _eval_llm_verbose_logger_singleton
32 - log_path = EVAL_VERBOSE_LOG_FILE 35 + log_path = se.eval_log_dir / "verbose" / "eval_verbose.log"
33 log_path.parent.mkdir(parents=True, exist_ok=True) 36 log_path.parent.mkdir(parents=True, exist_ok=True)
34 lg = logging.getLogger("search_eval.verbose_llm") 37 lg = logging.getLogger("search_eval.verbose_llm")
35 lg.setLevel(logging.INFO) 38 lg.setLevel(logging.INFO)
scripts/evaluation/eval_framework/constants.py
@@ -46,7 +46,7 @@ DEFAULT_JUDGE_ENABLE_THINKING = False @@ -46,7 +46,7 @@ DEFAULT_JUDGE_ENABLE_THINKING = False
46 DEFAULT_JUDGE_DASHSCOPE_BATCH = False 46 DEFAULT_JUDGE_DASHSCOPE_BATCH = False
47 47
48 # Query-intent LLM (separate from judge; used once per query, injected into relevance prompts) 48 # Query-intent LLM (separate from judge; used once per query, injected into relevance prompts)
49 -DEFAULT_INTENT_MODEL = "qwen-max" 49 +DEFAULT_INTENT_MODEL = "qwen3-max"
50 DEFAULT_INTENT_ENABLE_THINKING = True 50 DEFAULT_INTENT_ENABLE_THINKING = True
51 DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" 51 DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
52 DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0 52 DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0
scripts/evaluation/eval_framework/framework.py
@@ -16,14 +16,6 @@ from indexer.mapping_generator import get_tenant_index_name @@ -16,14 +16,6 @@ from indexer.mapping_generator import get_tenant_index_name
16 16
17 from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient 17 from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient
18 from .constants import ( 18 from .constants import (
19 - DEFAULT_ARTIFACT_ROOT,  
20 - DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW,  
21 - DEFAULT_INTENT_ENABLE_THINKING,  
22 - DEFAULT_INTENT_MODEL,  
23 - DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC,  
24 - DEFAULT_JUDGE_DASHSCOPE_BATCH,  
25 - DEFAULT_JUDGE_ENABLE_THINKING,  
26 - DEFAULT_JUDGE_MODEL,  
27 DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, 19 DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
28 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, 20 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
29 DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, 21 DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
@@ -79,8 +71,8 @@ class SearchEvaluationFramework: @@ -79,8 +71,8 @@ class SearchEvaluationFramework:
79 def __init__( 71 def __init__(
80 self, 72 self,
81 tenant_id: str, 73 tenant_id: str,
82 - artifact_root: Path = DEFAULT_ARTIFACT_ROOT,  
83 - search_base_url: str = "http://localhost:6002", 74 + artifact_root: Path | None = None,
  75 + search_base_url: str | None = None,
84 *, 76 *,
85 judge_model: str | None = None, 77 judge_model: str | None = None,
86 enable_thinking: bool | None = None, 78 enable_thinking: bool | None = None,
@@ -88,12 +80,14 @@ class SearchEvaluationFramework: @@ -88,12 +80,14 @@ class SearchEvaluationFramework:
88 intent_model: str | None = None, 80 intent_model: str | None = None,
89 intent_enable_thinking: bool | None = None, 81 intent_enable_thinking: bool | None = None,
90 ): 82 ):
91 - init_service(get_app_config().infrastructure.elasticsearch.host) 83 + app_cfg = get_app_config()
  84 + se = app_cfg.search_evaluation
  85 + init_service(app_cfg.infrastructure.elasticsearch.host)
92 self.tenant_id = str(tenant_id) 86 self.tenant_id = str(tenant_id)
93 - self.artifact_root = ensure_dir(artifact_root) 87 + self.artifact_root = ensure_dir(artifact_root if artifact_root is not None else se.artifact_root)
94 self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") 88 self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
95 - self.search_client = SearchServiceClient(search_base_url, self.tenant_id)  
96 - app_cfg = get_app_config() 89 + sb = search_base_url if search_base_url is not None else se.search_base_url
  90 + self.search_client = SearchServiceClient(sb, self.tenant_id)
97 rerank_service_url = str( 91 rerank_service_url = str(
98 app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"] 92 app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
99 ) 93 )
@@ -102,11 +96,11 @@ class SearchEvaluationFramework: @@ -102,11 +96,11 @@ class SearchEvaluationFramework:
102 api_key = app_cfg.infrastructure.secrets.dashscope_api_key 96 api_key = app_cfg.infrastructure.secrets.dashscope_api_key
103 if not api_key: 97 if not api_key:
104 raise RuntimeError("dashscope_api_key is required for search evaluation annotation") 98 raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
105 - model = str(judge_model or DEFAULT_JUDGE_MODEL)  
106 - et = DEFAULT_JUDGE_ENABLE_THINKING if enable_thinking is None else enable_thinking  
107 - use_batch = DEFAULT_JUDGE_DASHSCOPE_BATCH if use_dashscope_batch is None else use_dashscope_batch  
108 - batch_window = DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW  
109 - batch_poll = float(DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC) 99 + model = str(judge_model if judge_model is not None else se.judge_model)
  100 + et = se.judge_enable_thinking if enable_thinking is None else enable_thinking
  101 + use_batch = se.judge_dashscope_batch if use_dashscope_batch is None else use_dashscope_batch
  102 + batch_window = se.judge_batch_completion_window
  103 + batch_poll = float(se.judge_batch_poll_interval_sec)
110 self.label_client = DashScopeLabelClient( 104 self.label_client = DashScopeLabelClient(
111 model=model, 105 model=model,
112 base_url=str(llm_cfg["base_url"]), 106 base_url=str(llm_cfg["base_url"]),
@@ -116,8 +110,8 @@ class SearchEvaluationFramework: @@ -116,8 +110,8 @@ class SearchEvaluationFramework:
116 enable_thinking=et, 110 enable_thinking=et,
117 use_batch=use_batch, 111 use_batch=use_batch,
118 ) 112 )
119 - intent_m = str(intent_model or DEFAULT_INTENT_MODEL)  
120 - intent_et = DEFAULT_INTENT_ENABLE_THINKING if intent_enable_thinking is None else intent_enable_thinking 113 + intent_m = str(intent_model if intent_model is not None else se.intent_model)
  114 + intent_et = se.intent_enable_thinking if intent_enable_thinking is None else intent_enable_thinking
121 self.intent_client = DashScopeLabelClient( 115 self.intent_client = DashScopeLabelClient(
122 model=intent_m, 116 model=intent_m,
123 base_url=str(llm_cfg["base_url"]), 117 base_url=str(llm_cfg["base_url"]),
@@ -629,6 +623,21 @@ class SearchEvaluationFramework: @@ -629,6 +623,21 @@ class SearchEvaluationFramework:
629 corpus = self.corpus_docs(refresh=False) 623 corpus = self.corpus_docs(refresh=False)
630 corpus_by_id = {str(d.get("spu_id")): d for d in corpus if str(d.get("spu_id") or "").strip()} 624 corpus_by_id = {str(d.get("spu_id")): d for d in corpus if str(d.get("spu_id") or "").strip()}
631 625
  626 + rerank_pending_n = sum(
  627 + 1
  628 + for d in corpus
  629 + if str(d.get("spu_id") or "").strip()
  630 + and str(d.get("spu_id")) not in pool_spu_ids
  631 + )
  632 + _log.info(
  633 + "[eval-rebuild] query=%r phase=rerank_outside_pool docs≈%s (pool=%s, force_refresh_rerank=%s); "
  634 + "this can take a long time with no further logs until LLM batches start",
  635 + query,
  636 + rerank_pending_n,
  637 + len(pool_spu_ids),
  638 + force_refresh_rerank,
  639 + )
  640 +
632 ranked_outside = self.full_corpus_rerank_outside_exclude( 641 ranked_outside = self.full_corpus_rerank_outside_exclude(
633 query=query, 642 query=query,
634 docs=corpus, 643 docs=corpus,
@@ -905,7 +914,9 @@ class SearchEvaluationFramework: @@ -905,7 +914,9 @@ class SearchEvaluationFramework:
905 force_refresh_labels: bool = False, 914 force_refresh_labels: bool = False,
906 ) -> Dict[str, Any]: 915 ) -> Dict[str, Any]:
907 per_query = [] 916 per_query = []
908 - for query in queries: 917 + total_q = len(queries)
  918 + _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate)
  919 + for q_index, query in enumerate(queries, start=1):
909 live = self.evaluate_live_query( 920 live = self.evaluate_live_query(
910 query, 921 query,
911 top_k=top_k, 922 top_k=top_k,
@@ -927,6 +938,16 @@ class SearchEvaluationFramework: @@ -927,6 +938,16 @@ class SearchEvaluationFramework:
927 "total": live["total"], 938 "total": live["total"],
928 } 939 }
929 ) 940 )
  941 + m = live["metrics"]
  942 + _log.info(
  943 + "[batch-eval] (%s/%s) query=%r P@10=%s MAP_3=%s total_hits=%s",
  944 + q_index,
  945 + total_q,
  946 + query,
  947 + m.get("P@10"),
  948 + m.get("MAP_3"),
  949 + live.get("total"),
  950 + )
930 aggregate = aggregate_metrics([item["metrics"] for item in per_query]) 951 aggregate = aggregate_metrics([item["metrics"] for item in per_query])
931 aggregate_distribution = { 952 aggregate_distribution = {
932 RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), 953 RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
@@ -955,5 +976,11 @@ class SearchEvaluationFramework: @@ -955,5 +976,11 @@ class SearchEvaluationFramework:
955 output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") 976 output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
956 report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8") 977 report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
957 self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload) 978 self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
  979 + _log.info(
  980 + "[batch-eval] finished batch_id=%s per_query=%s json=%s",
  981 + batch_id,
  982 + len(per_query),
  983 + output_json_path,
  984 + )
958 return payload 985 return payload
959 986
scripts/evaluation/eval_framework/logging_setup.py
1 -"""Configure dedicated eval run logs under repo ``logs/`` (see ``constants.EVAL_*_LOG_*``).""" 1 +"""Configure dedicated eval run logs (defaults: repo ``logs/``; override via ``config.yaml`` ``search_evaluation.eval_log_dir``)."""
2 2
3 from __future__ import annotations 3 from __future__ import annotations
4 4
5 import logging 5 import logging
6 import sys 6 import sys
  7 +from pathlib import Path
7 8
8 -from .constants import EVAL_LOG_DIR, EVAL_LOG_FILE, EVAL_VERBOSE_LOG_DIR 9 +from .constants import EVAL_LOG_DIR
9 10
10 _setup_done = False 11 _setup_done = False
11 12
12 13
13 -def setup_eval_logging() -> None:  
14 - """Attach file + stderr handlers to ``search_eval`` once; ensure log directories exist.""" 14 +def setup_eval_logging(eval_log_dir: Path | None = None) -> Path:
  15 + """Attach file + stderr handlers to ``search_eval`` once; ensure log directories exist.
  16 +
  17 + Returns the path to the primary ``eval.log`` file.
  18 + """
15 global _setup_done 19 global _setup_done
16 - if _setup_done:  
17 - return 20 + log_dir = Path(eval_log_dir).resolve() if eval_log_dir is not None else EVAL_LOG_DIR.resolve()
  21 + verbose_dir = log_dir / "verbose"
  22 + log_file = log_dir / "eval.log"
18 23
19 - EVAL_LOG_DIR.mkdir(parents=True, exist_ok=True)  
20 - EVAL_VERBOSE_LOG_DIR.mkdir(parents=True, exist_ok=True) 24 + log_dir.mkdir(parents=True, exist_ok=True)
  25 + verbose_dir.mkdir(parents=True, exist_ok=True)
21 26
22 fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s") 27 fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
23 root = logging.getLogger("search_eval") 28 root = logging.getLogger("search_eval")
24 root.setLevel(logging.INFO) 29 root.setLevel(logging.INFO)
25 if root.handlers: 30 if root.handlers:
26 _setup_done = True 31 _setup_done = True
27 - return  
28 - fh = logging.FileHandler(EVAL_LOG_FILE, encoding="utf-8") 32 + return log_file
  33 +
  34 + fh = logging.FileHandler(log_file, encoding="utf-8")
29 fh.setFormatter(fmt) 35 fh.setFormatter(fmt)
30 sh = logging.StreamHandler(sys.stderr) 36 sh = logging.StreamHandler(sys.stderr)
31 sh.setFormatter(fmt) 37 sh.setFormatter(fmt)
@@ -33,3 +39,4 @@ def setup_eval_logging() -> None: @@ -33,3 +39,4 @@ def setup_eval_logging() -> None:
33 root.addHandler(sh) 39 root.addHandler(sh)
34 root.propagate = False 40 root.propagate = False
35 _setup_done = True 41 _setup_done = True
  42 + return log_file