Commit 331861d5449c3ad25cbc6d89df780b0299735fb5
1 parent: 1c2ba48e
eval框架配置化
Showing 8 changed files with 380 additions and 97 deletions
Show diff stats
config/config.yaml
| @@ -64,6 +64,41 @@ assets: | @@ -64,6 +64,41 @@ assets: | ||
| 64 | product_enrich: | 64 | product_enrich: |
| 65 | max_workers: 40 | 65 | max_workers: 40 |
| 66 | 66 | ||
| 67 | +# 离线 / Web 相关性评估(scripts/evaluation、eval-web) | ||
| 68 | +# CLI 未显式传参时使用此处默认值;search_base_url 未配置时自动为 http://127.0.0.1:{runtime.api_port} | ||
| 69 | +search_evaluation: | ||
| 70 | + artifact_root: artifacts/search_evaluation | ||
| 71 | + queries_file: scripts/evaluation/queries/queries.txt | ||
| 72 | + eval_log_dir: logs | ||
| 73 | + default_tenant_id: '163' | ||
| 74 | + search_base_url: '' | ||
| 75 | + web_host: 0.0.0.0 | ||
| 76 | + web_port: 6010 | ||
| 77 | + judge_model: qwen3.5-plus | ||
| 78 | + judge_enable_thinking: false | ||
| 79 | + judge_dashscope_batch: false | ||
| 80 | + intent_model: qwen3-max | ||
| 81 | + intent_enable_thinking: true | ||
| 82 | + judge_batch_completion_window: 24h | ||
| 83 | + judge_batch_poll_interval_sec: 10.0 | ||
| 84 | + build_search_depth: 1000 | ||
| 85 | + build_rerank_depth: 10000 | ||
| 86 | + annotate_search_top_k: 120 | ||
| 87 | + annotate_rerank_top_k: 200 | ||
| 88 | + batch_top_k: 100 | ||
| 89 | + audit_top_k: 100 | ||
| 90 | + audit_limit_suspicious: 5 | ||
| 91 | + default_language: en | ||
| 92 | + search_recall_top_k: 200 | ||
| 93 | + rerank_high_threshold: 0.5 | ||
| 94 | + rerank_high_skip_count: 1000 | ||
| 95 | + rebuild_llm_batch_size: 50 | ||
| 96 | + rebuild_min_llm_batches: 10 | ||
| 97 | + rebuild_max_llm_batches: 40 | ||
| 98 | + rebuild_irrelevant_stop_ratio: 0.799 | ||
| 99 | + rebuild_irrel_low_combined_stop_ratio: 0.959 | ||
| 100 | + rebuild_irrelevant_stop_streak: 3 | ||
| 101 | + | ||
| 67 | # ES Index Settings (基础设置) | 102 | # ES Index Settings (基础设置) |
| 68 | es_settings: | 103 | es_settings: |
| 69 | number_of_shards: 1 | 104 | number_of_shards: 1 |
| @@ -75,7 +110,9 @@ es_settings: | @@ -75,7 +110,9 @@ es_settings: | ||
| 75 | # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 | 110 | # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 |
| 76 | field_boosts: | 111 | field_boosts: |
| 77 | title: 3.0 | 112 | title: 3.0 |
| 78 | - qanchors: 2.5 | 113 | + qanchors: 2.3 |
| 114 | + enriched_tags: 2.3 | ||
| 115 | + keywords: 2.0 | ||
| 79 | tags: 2.0 | 116 | tags: 2.0 |
| 80 | category_name_text: 2.0 | 117 | category_name_text: 2.0 |
| 81 | category_path: 2.0 | 118 | category_path: 2.0 |
| @@ -152,7 +189,11 @@ query_config: | @@ -152,7 +189,11 @@ query_config: | ||
| 152 | multilingual_fields: | 189 | multilingual_fields: |
| 153 | - title | 190 | - title |
| 154 | - qanchors | 191 | - qanchors |
| 192 | + - keywords | ||
| 155 | - enriched_tags | 193 | - enriched_tags |
| 194 | + - option1_values | ||
| 195 | + - option2_values | ||
| 196 | + - option3_values | ||
| 156 | - category_path | 197 | - category_path |
| 157 | - category_name_text | 198 | - category_name_text |
| 158 | - brief | 199 | - brief |
config/loader.py
| @@ -46,6 +46,7 @@ from config.schema import ( | @@ -46,6 +46,7 @@ from config.schema import ( | ||
| 46 | RerankServiceInstanceConfig, | 46 | RerankServiceInstanceConfig, |
| 47 | RuntimeConfig, | 47 | RuntimeConfig, |
| 48 | SearchConfig, | 48 | SearchConfig, |
| 49 | + SearchEvaluationConfig, | ||
| 49 | SecretsConfig, | 50 | SecretsConfig, |
| 50 | ServicesConfig, | 51 | ServicesConfig, |
| 51 | SPUConfig, | 52 | SPUConfig, |
| @@ -263,6 +264,7 @@ class AppConfigLoader: | @@ -263,6 +264,7 @@ class AppConfigLoader: | ||
| 263 | product_enrich_config = ProductEnrichConfig( | 264 | product_enrich_config = ProductEnrichConfig( |
| 264 | max_workers=int(product_enrich_raw.get("max_workers", 40)), | 265 | max_workers=int(product_enrich_raw.get("max_workers", 40)), |
| 265 | ) | 266 | ) |
| 267 | + search_evaluation_config = self._build_search_evaluation_config(raw, runtime_config) | ||
| 266 | 268 | ||
| 267 | metadata = ConfigMetadata( | 269 | metadata = ConfigMetadata( |
| 268 | loaded_files=tuple(loaded_files), | 270 | loaded_files=tuple(loaded_files), |
| @@ -278,6 +280,7 @@ class AppConfigLoader: | @@ -278,6 +280,7 @@ class AppConfigLoader: | ||
| 278 | services=services_config, | 280 | services=services_config, |
| 279 | tenants=tenants_config, | 281 | tenants=tenants_config, |
| 280 | assets=AssetsConfig(query_rewrite_dictionary_path=rewrite_path), | 282 | assets=AssetsConfig(query_rewrite_dictionary_path=rewrite_path), |
| 283 | + search_evaluation=search_evaluation_config, | ||
| 281 | metadata=metadata, | 284 | metadata=metadata, |
| 282 | ) | 285 | ) |
| 283 | 286 | ||
| @@ -290,6 +293,7 @@ class AppConfigLoader: | @@ -290,6 +293,7 @@ class AppConfigLoader: | ||
| 290 | services=app_config.services, | 293 | services=app_config.services, |
| 291 | tenants=app_config.tenants, | 294 | tenants=app_config.tenants, |
| 292 | assets=app_config.assets, | 295 | assets=app_config.assets, |
| 296 | + search_evaluation=app_config.search_evaluation, | ||
| 293 | metadata=ConfigMetadata( | 297 | metadata=ConfigMetadata( |
| 294 | loaded_files=app_config.metadata.loaded_files, | 298 | loaded_files=app_config.metadata.loaded_files, |
| 295 | config_hash=config_hash, | 299 | config_hash=config_hash, |
| @@ -297,6 +301,89 @@ class AppConfigLoader: | @@ -297,6 +301,89 @@ class AppConfigLoader: | ||
| 297 | ), | 301 | ), |
| 298 | ) | 302 | ) |
| 299 | 303 | ||
| 304 | + def _build_search_evaluation_config(self, raw: Dict[str, Any], runtime: RuntimeConfig) -> SearchEvaluationConfig: | ||
| 305 | + se = raw.get("search_evaluation") if isinstance(raw.get("search_evaluation"), dict) else {} | ||
| 306 | + default_artifact = (self.project_root / "artifacts" / "search_evaluation").resolve() | ||
| 307 | + default_queries = (self.project_root / "scripts" / "evaluation" / "queries" / "queries.txt").resolve() | ||
| 308 | + default_log_dir = (self.project_root / "logs").resolve() | ||
| 309 | + default_search_base = f"http://127.0.0.1:{int(runtime.api_port)}" | ||
| 310 | + | ||
| 311 | + def _project_path(value: Any, default: Path) -> Path: | ||
| 312 | + if value in (None, ""): | ||
| 313 | + return default | ||
| 314 | + candidate = Path(str(value)) | ||
| 315 | + if candidate.is_absolute(): | ||
| 316 | + return candidate.resolve() | ||
| 317 | + return (self.project_root / candidate).resolve() | ||
| 318 | + | ||
| 319 | + def _str(key: str, default: str) -> str: | ||
| 320 | + v = se.get(key) | ||
| 321 | + if v is None or (isinstance(v, str) and not v.strip()): | ||
| 322 | + return default | ||
| 323 | + return str(v).strip() | ||
| 324 | + | ||
| 325 | + def _int(key: str, default: int) -> int: | ||
| 326 | + v = se.get(key) | ||
| 327 | + if v is None: | ||
| 328 | + return default | ||
| 329 | + return int(v) | ||
| 330 | + | ||
| 331 | + def _float(key: str, default: float) -> float: | ||
| 332 | + v = se.get(key) | ||
| 333 | + if v is None: | ||
| 334 | + return default | ||
| 335 | + return float(v) | ||
| 336 | + | ||
| 337 | + def _bool(key: str, default: bool) -> bool: | ||
| 338 | + v = se.get(key) | ||
| 339 | + if v is None: | ||
| 340 | + return default | ||
| 341 | + if isinstance(v, bool): | ||
| 342 | + return v | ||
| 343 | + if isinstance(v, str): | ||
| 344 | + return v.strip().lower() in {"1", "true", "yes", "on"} | ||
| 345 | + return bool(v) | ||
| 346 | + | ||
| 347 | + raw_search_url = se.get("search_base_url") | ||
| 348 | + if raw_search_url is None or (isinstance(raw_search_url, str) and not str(raw_search_url).strip()): | ||
| 349 | + search_base_url = default_search_base | ||
| 350 | + else: | ||
| 351 | + search_base_url = str(raw_search_url).strip() | ||
| 352 | + | ||
| 353 | + return SearchEvaluationConfig( | ||
| 354 | + artifact_root=_project_path(se.get("artifact_root"), default_artifact), | ||
| 355 | + queries_file=_project_path(se.get("queries_file"), default_queries), | ||
| 356 | + eval_log_dir=_project_path(se.get("eval_log_dir"), default_log_dir), | ||
| 357 | + default_tenant_id=_str("default_tenant_id", "163"), | ||
| 358 | + search_base_url=search_base_url, | ||
| 359 | + web_host=_str("web_host", "0.0.0.0"), | ||
| 360 | + web_port=_int("web_port", 6010), | ||
| 361 | + judge_model=_str("judge_model", "qwen3.5-plus"), | ||
| 362 | + judge_enable_thinking=_bool("judge_enable_thinking", False), | ||
| 363 | + judge_dashscope_batch=_bool("judge_dashscope_batch", False), | ||
| 364 | + intent_model=_str("intent_model", "qwen3-max"), | ||
| 365 | + intent_enable_thinking=_bool("intent_enable_thinking", True), | ||
| 366 | + judge_batch_completion_window=_str("judge_batch_completion_window", "24h"), | ||
| 367 | + judge_batch_poll_interval_sec=_float("judge_batch_poll_interval_sec", 10.0), | ||
| 368 | + build_search_depth=_int("build_search_depth", 1000), | ||
| 369 | + build_rerank_depth=_int("build_rerank_depth", 10000), | ||
| 370 | + annotate_search_top_k=_int("annotate_search_top_k", 120), | ||
| 371 | + annotate_rerank_top_k=_int("annotate_rerank_top_k", 200), | ||
| 372 | + batch_top_k=_int("batch_top_k", 100), | ||
| 373 | + audit_top_k=_int("audit_top_k", 100), | ||
| 374 | + audit_limit_suspicious=_int("audit_limit_suspicious", 5), | ||
| 375 | + default_language=_str("default_language", "en"), | ||
| 376 | + search_recall_top_k=_int("search_recall_top_k", 200), | ||
| 377 | + rerank_high_threshold=_float("rerank_high_threshold", 0.5), | ||
| 378 | + rerank_high_skip_count=_int("rerank_high_skip_count", 1000), | ||
| 379 | + rebuild_llm_batch_size=_int("rebuild_llm_batch_size", 50), | ||
| 380 | + rebuild_min_llm_batches=_int("rebuild_min_llm_batches", 10), | ||
| 381 | + rebuild_max_llm_batches=_int("rebuild_max_llm_batches", 40), | ||
| 382 | + rebuild_irrelevant_stop_ratio=_float("rebuild_irrelevant_stop_ratio", 0.799), | ||
| 383 | + rebuild_irrel_low_combined_stop_ratio=_float("rebuild_irrel_low_combined_stop_ratio", 0.959), | ||
| 384 | + rebuild_irrelevant_stop_streak=_int("rebuild_irrelevant_stop_streak", 3), | ||
| 385 | + ) | ||
| 386 | + | ||
| 300 | def _build_search_config(self, raw: Dict[str, Any], rewrite_dictionary: Dict[str, str]) -> SearchConfig: | 387 | def _build_search_config(self, raw: Dict[str, Any], rewrite_dictionary: Dict[str, str]) -> SearchConfig: |
| 301 | field_boosts = raw.get("field_boosts") or {} | 388 | field_boosts = raw.get("field_boosts") or {} |
| 302 | if not isinstance(field_boosts, dict): | 389 | if not isinstance(field_boosts, dict): |
config/schema.py
| @@ -376,6 +376,43 @@ class AssetsConfig: | @@ -376,6 +376,43 @@ class AssetsConfig: | ||
| 376 | 376 | ||
| 377 | 377 | ||
| 378 | @dataclass(frozen=True) | 378 | @dataclass(frozen=True) |
| 379 | +class SearchEvaluationConfig: | ||
| 380 | + """Offline / web UI search evaluation (YAML: ``search_evaluation``).""" | ||
| 381 | + | ||
| 382 | + artifact_root: Path | ||
| 383 | + queries_file: Path | ||
| 384 | + eval_log_dir: Path | ||
| 385 | + default_tenant_id: str | ||
| 386 | + search_base_url: str | ||
| 387 | + web_host: str | ||
| 388 | + web_port: int | ||
| 389 | + judge_model: str | ||
| 390 | + judge_enable_thinking: bool | ||
| 391 | + judge_dashscope_batch: bool | ||
| 392 | + intent_model: str | ||
| 393 | + intent_enable_thinking: bool | ||
| 394 | + judge_batch_completion_window: str | ||
| 395 | + judge_batch_poll_interval_sec: float | ||
| 396 | + build_search_depth: int | ||
| 397 | + build_rerank_depth: int | ||
| 398 | + annotate_search_top_k: int | ||
| 399 | + annotate_rerank_top_k: int | ||
| 400 | + batch_top_k: int | ||
| 401 | + audit_top_k: int | ||
| 402 | + audit_limit_suspicious: int | ||
| 403 | + default_language: str | ||
| 404 | + search_recall_top_k: int | ||
| 405 | + rerank_high_threshold: float | ||
| 406 | + rerank_high_skip_count: int | ||
| 407 | + rebuild_llm_batch_size: int | ||
| 408 | + rebuild_min_llm_batches: int | ||
| 409 | + rebuild_max_llm_batches: int | ||
| 410 | + rebuild_irrelevant_stop_ratio: float | ||
| 411 | + rebuild_irrel_low_combined_stop_ratio: float | ||
| 412 | + rebuild_irrelevant_stop_streak: int | ||
| 413 | + | ||
| 414 | + | ||
| 415 | +@dataclass(frozen=True) | ||
| 379 | class ConfigMetadata: | 416 | class ConfigMetadata: |
| 380 | loaded_files: Tuple[str, ...] | 417 | loaded_files: Tuple[str, ...] |
| 381 | config_hash: str | 418 | config_hash: str |
| @@ -393,6 +430,7 @@ class AppConfig: | @@ -393,6 +430,7 @@ class AppConfig: | ||
| 393 | services: ServicesConfig | 430 | services: ServicesConfig |
| 394 | tenants: TenantCatalogConfig | 431 | tenants: TenantCatalogConfig |
| 395 | assets: AssetsConfig | 432 | assets: AssetsConfig |
| 433 | + search_evaluation: SearchEvaluationConfig | ||
| 396 | metadata: ConfigMetadata | 434 | metadata: ConfigMetadata |
| 397 | 435 | ||
| 398 | def sanitized_dict(self) -> Dict[str, Any]: | 436 | def sanitized_dict(self) -> Dict[str, Any]: |
scripts/evaluation/eval_framework/cli.py
| @@ -8,21 +8,6 @@ import logging | @@ -8,21 +8,6 @@ import logging | ||
| 8 | from pathlib import Path | 8 | from pathlib import Path |
| 9 | from typing import Any, Dict | 9 | from typing import Any, Dict |
| 10 | 10 | ||
| 11 | -from .constants import ( | ||
| 12 | - DEFAULT_INTENT_ENABLE_THINKING, | ||
| 13 | - DEFAULT_INTENT_MODEL, | ||
| 14 | - DEFAULT_QUERY_FILE, | ||
| 15 | - DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, | ||
| 16 | - DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, | ||
| 17 | - DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, | ||
| 18 | - DEFAULT_REBUILD_LLM_BATCH_SIZE, | ||
| 19 | - DEFAULT_REBUILD_MAX_LLM_BATCHES, | ||
| 20 | - DEFAULT_REBUILD_MIN_LLM_BATCHES, | ||
| 21 | - DEFAULT_RERANK_HIGH_SKIP_COUNT, | ||
| 22 | - DEFAULT_RERANK_HIGH_THRESHOLD, | ||
| 23 | - DEFAULT_SEARCH_RECALL_TOP_K, | ||
| 24 | -) | ||
| 25 | -from .constants import EVAL_LOG_FILE | ||
| 26 | from .framework import SearchEvaluationFramework | 11 | from .framework import SearchEvaluationFramework |
| 27 | from .logging_setup import setup_eval_logging | 12 | from .logging_setup import setup_eval_logging |
| 28 | from .utils import ensure_dir, utc_now_iso, utc_timestamp | 13 | from .utils import ensure_dir, utc_now_iso, utc_timestamp |
| @@ -36,19 +21,19 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None: | @@ -36,19 +21,19 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None: | ||
| 36 | "--judge-model", | 21 | "--judge-model", |
| 37 | default=None, | 22 | default=None, |
| 38 | metavar="MODEL", | 23 | metavar="MODEL", |
| 39 | - help="Judge LLM model (default: eval_framework.constants.DEFAULT_JUDGE_MODEL).", | 24 | + help="Judge LLM model (default: config.yaml search_evaluation.judge_model).", |
| 40 | ) | 25 | ) |
| 41 | p.add_argument( | 26 | p.add_argument( |
| 42 | "--enable-thinking", | 27 | "--enable-thinking", |
| 43 | action=argparse.BooleanOptionalAction, | 28 | action=argparse.BooleanOptionalAction, |
| 44 | default=None, | 29 | default=None, |
| 45 | - help="enable_thinking for DashScope (default: DEFAULT_JUDGE_ENABLE_THINKING).", | 30 | + help="enable_thinking for DashScope (default: search_evaluation.judge_enable_thinking).", |
| 46 | ) | 31 | ) |
| 47 | p.add_argument( | 32 | p.add_argument( |
| 48 | "--dashscope-batch", | 33 | "--dashscope-batch", |
| 49 | action=argparse.BooleanOptionalAction, | 34 | action=argparse.BooleanOptionalAction, |
| 50 | default=None, | 35 | default=None, |
| 51 | - help="DashScope Batch File API vs sync chat (default: DEFAULT_JUDGE_DASHSCOPE_BATCH).", | 36 | + help="DashScope Batch File API vs sync chat (default: search_evaluation.judge_dashscope_batch).", |
| 52 | ) | 37 | ) |
| 53 | 38 | ||
| 54 | 39 | ||
| @@ -57,13 +42,13 @@ def add_intent_llm_args(p: argparse.ArgumentParser) -> None: | @@ -57,13 +42,13 @@ def add_intent_llm_args(p: argparse.ArgumentParser) -> None: | ||
| 57 | "--intent-model", | 42 | "--intent-model", |
| 58 | default=None, | 43 | default=None, |
| 59 | metavar="MODEL", | 44 | metavar="MODEL", |
| 60 | - help=f"Query-intent LLM model before relevance judging (default: {DEFAULT_INTENT_MODEL!r}).", | 45 | + help="Query-intent LLM model before relevance judging (default: search_evaluation.intent_model).", |
| 61 | ) | 46 | ) |
| 62 | p.add_argument( | 47 | p.add_argument( |
| 63 | "--intent-enable-thinking", | 48 | "--intent-enable-thinking", |
| 64 | action=argparse.BooleanOptionalAction, | 49 | action=argparse.BooleanOptionalAction, |
| 65 | default=None, | 50 | default=None, |
| 66 | - help=f"enable_thinking for intent model (default: {DEFAULT_INTENT_ENABLE_THINKING}).", | 51 | + help="enable_thinking for intent model (default: search_evaluation.intent_enable_thinking).", |
| 67 | ) | 52 | ) |
| 68 | 53 | ||
| 69 | 54 | ||
| @@ -82,17 +67,102 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]: | @@ -82,17 +67,102 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]: | ||
| 82 | return kw | 67 | return kw |
| 83 | 68 | ||
| 84 | 69 | ||
| 70 | +def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None: | ||
| 71 | + """Fill None CLI defaults from ``config.yaml`` ``search_evaluation`` (via ``get_app_config()``).""" | ||
| 72 | + from config.loader import get_app_config | ||
| 73 | + | ||
| 74 | + se = get_app_config().search_evaluation | ||
| 75 | + if getattr(args, "tenant_id", None) in (None, ""): | ||
| 76 | + args.tenant_id = se.default_tenant_id | ||
| 77 | + if getattr(args, "queries_file", None) in (None, ""): | ||
| 78 | + args.queries_file = str(se.queries_file) | ||
| 79 | + if getattr(args, "language", None) in (None, ""): | ||
| 80 | + args.language = se.default_language | ||
| 81 | + | ||
| 82 | + if args.command == "serve": | ||
| 83 | + if getattr(args, "host", None) in (None, ""): | ||
| 84 | + args.host = se.web_host | ||
| 85 | + if getattr(args, "port", None) is None: | ||
| 86 | + args.port = se.web_port | ||
| 87 | + | ||
| 88 | + if args.command == "batch": | ||
| 89 | + if getattr(args, "top_k", None) is None: | ||
| 90 | + args.top_k = se.batch_top_k | ||
| 91 | + | ||
| 92 | + if args.command == "audit": | ||
| 93 | + if getattr(args, "top_k", None) is None: | ||
| 94 | + args.top_k = se.audit_top_k | ||
| 95 | + if getattr(args, "limit_suspicious", None) is None: | ||
| 96 | + args.limit_suspicious = se.audit_limit_suspicious | ||
| 97 | + | ||
| 98 | + if args.command == "build": | ||
| 99 | + if getattr(args, "search_depth", None) is None: | ||
| 100 | + args.search_depth = se.build_search_depth | ||
| 101 | + if getattr(args, "rerank_depth", None) is None: | ||
| 102 | + args.rerank_depth = se.build_rerank_depth | ||
| 103 | + if getattr(args, "annotate_search_top_k", None) is None: | ||
| 104 | + args.annotate_search_top_k = se.annotate_search_top_k | ||
| 105 | + if getattr(args, "annotate_rerank_top_k", None) is None: | ||
| 106 | + args.annotate_rerank_top_k = se.annotate_rerank_top_k | ||
| 107 | + if getattr(args, "search_recall_top_k", None) is None: | ||
| 108 | + args.search_recall_top_k = se.search_recall_top_k | ||
| 109 | + if getattr(args, "rerank_high_threshold", None) is None: | ||
| 110 | + args.rerank_high_threshold = se.rerank_high_threshold | ||
| 111 | + if getattr(args, "rerank_high_skip_count", None) is None: | ||
| 112 | + args.rerank_high_skip_count = se.rerank_high_skip_count | ||
| 113 | + if getattr(args, "rebuild_llm_batch_size", None) is None: | ||
| 114 | + args.rebuild_llm_batch_size = se.rebuild_llm_batch_size | ||
| 115 | + if getattr(args, "rebuild_min_batches", None) is None: | ||
| 116 | + args.rebuild_min_batches = se.rebuild_min_llm_batches | ||
| 117 | + if getattr(args, "rebuild_max_batches", None) is None: | ||
| 118 | + args.rebuild_max_batches = se.rebuild_max_llm_batches | ||
| 119 | + if getattr(args, "rebuild_irrelevant_stop_ratio", None) is None: | ||
| 120 | + args.rebuild_irrelevant_stop_ratio = se.rebuild_irrelevant_stop_ratio | ||
| 121 | + if getattr(args, "rebuild_irrel_low_combined_stop_ratio", None) is None: | ||
| 122 | + args.rebuild_irrel_low_combined_stop_ratio = se.rebuild_irrel_low_combined_stop_ratio | ||
| 123 | + if getattr(args, "rebuild_irrelevant_stop_streak", None) is None: | ||
| 124 | + args.rebuild_irrelevant_stop_streak = se.rebuild_irrelevant_stop_streak | ||
| 125 | + | ||
| 126 | + | ||
| 85 | def build_cli_parser() -> argparse.ArgumentParser: | 127 | def build_cli_parser() -> argparse.ArgumentParser: |
| 86 | parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI") | 128 | parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI") |
| 87 | sub = parser.add_subparsers(dest="command", required=True) | 129 | sub = parser.add_subparsers(dest="command", required=True) |
| 88 | 130 | ||
| 89 | build = sub.add_parser("build", help="Build pooled annotation set for queries") | 131 | build = sub.add_parser("build", help="Build pooled annotation set for queries") |
| 90 | - build.add_argument("--tenant-id", default="163") | ||
| 91 | - build.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | ||
| 92 | - build.add_argument("--search-depth", type=int, default=1000) | ||
| 93 | - build.add_argument("--rerank-depth", type=int, default=10000) | ||
| 94 | - build.add_argument("--annotate-search-top-k", type=int, default=120) | ||
| 95 | - build.add_argument("--annotate-rerank-top-k", type=int, default=200) | 132 | + build.add_argument( |
| 133 | + "--tenant-id", | ||
| 134 | + default=None, | ||
| 135 | + help="Tenant id (default: search_evaluation.default_tenant_id in config.yaml).", | ||
| 136 | + ) | ||
| 137 | + build.add_argument( | ||
| 138 | + "--queries-file", | ||
| 139 | + default=None, | ||
| 140 | + help="Query list file (default: search_evaluation.queries_file).", | ||
| 141 | + ) | ||
| 142 | + build.add_argument( | ||
| 143 | + "--search-depth", | ||
| 144 | + type=int, | ||
| 145 | + default=None, | ||
| 146 | + help="Default: search_evaluation.build_search_depth.", | ||
| 147 | + ) | ||
| 148 | + build.add_argument( | ||
| 149 | + "--rerank-depth", | ||
| 150 | + type=int, | ||
| 151 | + default=None, | ||
| 152 | + help="Default: search_evaluation.build_rerank_depth.", | ||
| 153 | + ) | ||
| 154 | + build.add_argument( | ||
| 155 | + "--annotate-search-top-k", | ||
| 156 | + type=int, | ||
| 157 | + default=None, | ||
| 158 | + help="Default: search_evaluation.annotate_search_top_k.", | ||
| 159 | + ) | ||
| 160 | + build.add_argument( | ||
| 161 | + "--annotate-rerank-top-k", | ||
| 162 | + type=int, | ||
| 163 | + default=None, | ||
| 164 | + help="Default: search_evaluation.annotate_rerank_top_k.", | ||
| 165 | + ) | ||
| 96 | build.add_argument( | 166 | build.add_argument( |
| 97 | "--search-recall-top-k", | 167 | "--search-recall-top-k", |
| 98 | type=int, | 168 | type=int, |
| @@ -118,7 +188,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -118,7 +188,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 118 | "--rebuild-irrelevant-stop-ratio", | 188 | "--rebuild-irrelevant-stop-ratio", |
| 119 | type=float, | 189 | type=float, |
| 120 | default=None, | 190 | default=None, |
| 121 | - help="Rebuild only: bad batch requires irrelevant_ratio > this (default 0.939).", | 191 | + help="Rebuild only: bad batch requires irrelevant_ratio > this (default: search_evaluation.rebuild_irrelevant_stop_ratio).", |
| 122 | ) | 192 | ) |
| 123 | build.add_argument( | 193 | build.add_argument( |
| 124 | "--rebuild-irrel-low-combined-stop-ratio", | 194 | "--rebuild-irrel-low-combined-stop-ratio", |
| @@ -132,36 +202,45 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -132,36 +202,45 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 132 | default=None, | 202 | default=None, |
| 133 | help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).", | 203 | help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).", |
| 134 | ) | 204 | ) |
| 135 | - build.add_argument("--language", default="en") | 205 | + build.add_argument( |
| 206 | + "--language", | ||
| 207 | + default=None, | ||
| 208 | + help="Default: search_evaluation.default_language.", | ||
| 209 | + ) | ||
| 136 | build.add_argument("--force-refresh-rerank", action="store_true") | 210 | build.add_argument("--force-refresh-rerank", action="store_true") |
| 137 | build.add_argument("--force-refresh-labels", action="store_true") | 211 | build.add_argument("--force-refresh-labels", action="store_true") |
| 138 | add_judge_llm_args(build) | 212 | add_judge_llm_args(build) |
| 139 | add_intent_llm_args(build) | 213 | add_intent_llm_args(build) |
| 140 | 214 | ||
| 141 | batch = sub.add_parser("batch", help="Run batch evaluation against live search") | 215 | batch = sub.add_parser("batch", help="Run batch evaluation against live search") |
| 142 | - batch.add_argument("--tenant-id", default="163") | ||
| 143 | - batch.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | ||
| 144 | - batch.add_argument("--top-k", type=int, default=100) | ||
| 145 | - batch.add_argument("--language", default="en") | 216 | + batch.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.") |
| 217 | + batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") | ||
| 218 | + batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.") | ||
| 219 | + batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") | ||
| 146 | batch.add_argument("--force-refresh-labels", action="store_true") | 220 | batch.add_argument("--force-refresh-labels", action="store_true") |
| 147 | add_judge_llm_args(batch) | 221 | add_judge_llm_args(batch) |
| 148 | add_intent_llm_args(batch) | 222 | add_intent_llm_args(batch) |
| 149 | 223 | ||
| 150 | audit = sub.add_parser("audit", help="Audit annotation quality for queries") | 224 | audit = sub.add_parser("audit", help="Audit annotation quality for queries") |
| 151 | - audit.add_argument("--tenant-id", default="163") | ||
| 152 | - audit.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | ||
| 153 | - audit.add_argument("--top-k", type=int, default=100) | ||
| 154 | - audit.add_argument("--language", default="en") | ||
| 155 | - audit.add_argument("--limit-suspicious", type=int, default=5) | 225 | + audit.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.") |
| 226 | + audit.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") | ||
| 227 | + audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.") | ||
| 228 | + audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") | ||
| 229 | + audit.add_argument( | ||
| 230 | + "--limit-suspicious", | ||
| 231 | + type=int, | ||
| 232 | + default=None, | ||
| 233 | + help="Default: search_evaluation.audit_limit_suspicious.", | ||
| 234 | + ) | ||
| 156 | audit.add_argument("--force-refresh-labels", action="store_true") | 235 | audit.add_argument("--force-refresh-labels", action="store_true") |
| 157 | add_judge_llm_args(audit) | 236 | add_judge_llm_args(audit) |
| 158 | add_intent_llm_args(audit) | 237 | add_intent_llm_args(audit) |
| 159 | 238 | ||
| 160 | serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") | 239 | serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") |
| 161 | - serve.add_argument("--tenant-id", default="163") | ||
| 162 | - serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | ||
| 163 | - serve.add_argument("--host", default="0.0.0.0") | ||
| 164 | - serve.add_argument("--port", type=int, default=6010) | 240 | + serve.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.") |
| 241 | + serve.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") | ||
| 242 | + serve.add_argument("--host", default=None, help="Default: search_evaluation.web_host.") | ||
| 243 | + serve.add_argument("--port", type=int, default=None, help="Default: search_evaluation.web_port.") | ||
| 165 | add_judge_llm_args(serve) | 244 | add_judge_llm_args(serve) |
| 166 | add_intent_llm_args(serve) | 245 | add_intent_llm_args(serve) |
| 167 | 246 | ||
| @@ -175,23 +254,19 @@ def run_build(args: argparse.Namespace) -> None: | @@ -175,23 +254,19 @@ def run_build(args: argparse.Namespace) -> None: | ||
| 175 | rebuild_kwargs = {} | 254 | rebuild_kwargs = {} |
| 176 | if args.force_refresh_labels: | 255 | if args.force_refresh_labels: |
| 177 | rebuild_kwargs = { | 256 | rebuild_kwargs = { |
| 178 | - "search_recall_top_k": args.search_recall_top_k if args.search_recall_top_k is not None else DEFAULT_SEARCH_RECALL_TOP_K, | ||
| 179 | - "rerank_high_threshold": args.rerank_high_threshold if args.rerank_high_threshold is not None else DEFAULT_RERANK_HIGH_THRESHOLD, | ||
| 180 | - "rerank_high_skip_count": args.rerank_high_skip_count if args.rerank_high_skip_count is not None else DEFAULT_RERANK_HIGH_SKIP_COUNT, | ||
| 181 | - "rebuild_llm_batch_size": args.rebuild_llm_batch_size if args.rebuild_llm_batch_size is not None else DEFAULT_REBUILD_LLM_BATCH_SIZE, | ||
| 182 | - "rebuild_min_batches": args.rebuild_min_batches if args.rebuild_min_batches is not None else DEFAULT_REBUILD_MIN_LLM_BATCHES, | ||
| 183 | - "rebuild_max_batches": args.rebuild_max_batches if args.rebuild_max_batches is not None else DEFAULT_REBUILD_MAX_LLM_BATCHES, | ||
| 184 | - "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio | ||
| 185 | - if args.rebuild_irrelevant_stop_ratio is not None | ||
| 186 | - else DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, | ||
| 187 | - "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio | ||
| 188 | - if args.rebuild_irrel_low_combined_stop_ratio is not None | ||
| 189 | - else DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, | ||
| 190 | - "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak | ||
| 191 | - if args.rebuild_irrelevant_stop_streak is not None | ||
| 192 | - else DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, | 257 | + "search_recall_top_k": args.search_recall_top_k, |
| 258 | + "rerank_high_threshold": args.rerank_high_threshold, | ||
| 259 | + "rerank_high_skip_count": args.rerank_high_skip_count, | ||
| 260 | + "rebuild_llm_batch_size": args.rebuild_llm_batch_size, | ||
| 261 | + "rebuild_min_batches": args.rebuild_min_batches, | ||
| 262 | + "rebuild_max_batches": args.rebuild_max_batches, | ||
| 263 | + "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio, | ||
| 264 | + "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio, | ||
| 265 | + "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak, | ||
| 193 | } | 266 | } |
| 194 | - for query in queries: | 267 | + total_q = len(queries) |
| 268 | + for q_index, query in enumerate(queries, start=1): | ||
| 269 | + _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query) | ||
| 195 | result = framework.build_query_annotation_set( | 270 | result = framework.build_query_annotation_set( |
| 196 | query=query, | 271 | query=query, |
| 197 | search_depth=args.search_depth, | 272 | search_depth=args.search_depth, |
| @@ -230,6 +305,7 @@ def run_build(args: argparse.Namespace) -> None: | @@ -230,6 +305,7 @@ def run_build(args: argparse.Namespace) -> None: | ||
| 230 | def run_batch(args: argparse.Namespace) -> None: | 305 | def run_batch(args: argparse.Namespace) -> None: |
| 231 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) | 306 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) |
| 232 | queries = framework.queries_from_file(Path(args.queries_file)) | 307 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 308 | + _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries)) | ||
| 233 | payload = framework.batch_evaluate( | 309 | payload = framework.batch_evaluate( |
| 234 | queries=queries, | 310 | queries=queries, |
| 235 | top_k=args.top_k, | 311 | top_k=args.top_k, |
| @@ -302,14 +378,18 @@ def run_serve(args: argparse.Namespace) -> None: | @@ -302,14 +378,18 @@ def run_serve(args: argparse.Namespace) -> None: | ||
| 302 | 378 | ||
| 303 | 379 | ||
| 304 | def main() -> None: | 380 | def main() -> None: |
| 305 | - setup_eval_logging() | 381 | + from config.loader import get_app_config |
| 382 | + | ||
| 383 | + se = get_app_config().search_evaluation | ||
| 384 | + log_file = setup_eval_logging(se.eval_log_dir) | ||
| 306 | parser = build_cli_parser() | 385 | parser = build_cli_parser() |
| 307 | args = parser.parse_args() | 386 | args = parser.parse_args() |
| 387 | + _apply_search_evaluation_cli_defaults(args) | ||
| 308 | logging.getLogger("search_eval").info( | 388 | logging.getLogger("search_eval").info( |
| 309 | "CLI start command=%s tenant_id=%s log_file=%s", | 389 | "CLI start command=%s tenant_id=%s log_file=%s", |
| 310 | args.command, | 390 | args.command, |
| 311 | getattr(args, "tenant_id", ""), | 391 | getattr(args, "tenant_id", ""), |
| 312 | - EVAL_LOG_FILE.resolve(), | 392 | + log_file.resolve(), |
| 313 | ) | 393 | ) |
| 314 | if args.command == "build": | 394 | if args.command == "build": |
| 315 | run_build(args) | 395 | run_build(args) |
scripts/evaluation/eval_framework/clients.py
| @@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple | @@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple | ||
| 12 | 12 | ||
| 13 | import requests | 13 | import requests |
| 14 | 14 | ||
| 15 | -from .constants import EVAL_VERBOSE_LOG_FILE, VALID_LABELS | 15 | +from .constants import VALID_LABELS |
| 16 | from .logging_setup import setup_eval_logging | 16 | from .logging_setup import setup_eval_logging |
| 17 | from .prompts import classify_prompt, intent_analysis_prompt | 17 | from .prompts import classify_prompt, intent_analysis_prompt |
| 18 | from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps | 18 | from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps |
| @@ -23,13 +23,16 @@ _eval_llm_verbose_path_logged = False | @@ -23,13 +23,16 @@ _eval_llm_verbose_path_logged = False | ||
| 23 | 23 | ||
| 24 | 24 | ||
| 25 | def _get_eval_llm_verbose_logger() -> logging.Logger: | 25 | def _get_eval_llm_verbose_logger() -> logging.Logger: |
| 26 | - """File logger for full LLM prompts/responses → ``logs/verbose/eval_verbose.log``.""" | ||
| 27 | - setup_eval_logging() | 26 | + """File logger for full LLM prompts/responses under ``search_evaluation.eval_log_dir/verbose/``.""" |
| 27 | + from config.loader import get_app_config | ||
| 28 | + | ||
| 29 | + se = get_app_config().search_evaluation | ||
| 30 | + setup_eval_logging(se.eval_log_dir) | ||
| 28 | global _eval_llm_verbose_logger_singleton, _eval_llm_verbose_path_logged | 31 | global _eval_llm_verbose_logger_singleton, _eval_llm_verbose_path_logged |
| 29 | with _VERBOSE_LOGGER_LOCK: | 32 | with _VERBOSE_LOGGER_LOCK: |
| 30 | if _eval_llm_verbose_logger_singleton is not None: | 33 | if _eval_llm_verbose_logger_singleton is not None: |
| 31 | return _eval_llm_verbose_logger_singleton | 34 | return _eval_llm_verbose_logger_singleton |
| 32 | - log_path = EVAL_VERBOSE_LOG_FILE | 35 | + log_path = se.eval_log_dir / "verbose" / "eval_verbose.log" |
| 33 | log_path.parent.mkdir(parents=True, exist_ok=True) | 36 | log_path.parent.mkdir(parents=True, exist_ok=True) |
| 34 | lg = logging.getLogger("search_eval.verbose_llm") | 37 | lg = logging.getLogger("search_eval.verbose_llm") |
| 35 | lg.setLevel(logging.INFO) | 38 | lg.setLevel(logging.INFO) |
scripts/evaluation/eval_framework/constants.py
| @@ -46,7 +46,7 @@ DEFAULT_JUDGE_ENABLE_THINKING = False | @@ -46,7 +46,7 @@ DEFAULT_JUDGE_ENABLE_THINKING = False | ||
| 46 | DEFAULT_JUDGE_DASHSCOPE_BATCH = False | 46 | DEFAULT_JUDGE_DASHSCOPE_BATCH = False |
| 47 | 47 | ||
| 48 | # Query-intent LLM (separate from judge; used once per query, injected into relevance prompts) | 48 | # Query-intent LLM (separate from judge; used once per query, injected into relevance prompts) |
| 49 | -DEFAULT_INTENT_MODEL = "qwen-max" | 49 | +DEFAULT_INTENT_MODEL = "qwen3-max" |
| 50 | DEFAULT_INTENT_ENABLE_THINKING = True | 50 | DEFAULT_INTENT_ENABLE_THINKING = True |
| 51 | DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" | 51 | DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" |
| 52 | DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0 | 52 | DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0 |
scripts/evaluation/eval_framework/framework.py
| @@ -16,14 +16,6 @@ from indexer.mapping_generator import get_tenant_index_name | @@ -16,14 +16,6 @@ from indexer.mapping_generator import get_tenant_index_name | ||
| 16 | 16 | ||
| 17 | from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient | 17 | from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient |
| 18 | from .constants import ( | 18 | from .constants import ( |
| 19 | - DEFAULT_ARTIFACT_ROOT, | ||
| 20 | - DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW, | ||
| 21 | - DEFAULT_INTENT_ENABLE_THINKING, | ||
| 22 | - DEFAULT_INTENT_MODEL, | ||
| 23 | - DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC, | ||
| 24 | - DEFAULT_JUDGE_DASHSCOPE_BATCH, | ||
| 25 | - DEFAULT_JUDGE_ENABLE_THINKING, | ||
| 26 | - DEFAULT_JUDGE_MODEL, | ||
| 27 | DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, | 19 | DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, |
| 28 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, | 20 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, |
| 29 | DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, | 21 | DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, |
| @@ -79,8 +71,8 @@ class SearchEvaluationFramework: | @@ -79,8 +71,8 @@ class SearchEvaluationFramework: | ||
| 79 | def __init__( | 71 | def __init__( |
| 80 | self, | 72 | self, |
| 81 | tenant_id: str, | 73 | tenant_id: str, |
| 82 | - artifact_root: Path = DEFAULT_ARTIFACT_ROOT, | ||
| 83 | - search_base_url: str = "http://localhost:6002", | 74 | + artifact_root: Path | None = None, |
| 75 | + search_base_url: str | None = None, | ||
| 84 | *, | 76 | *, |
| 85 | judge_model: str | None = None, | 77 | judge_model: str | None = None, |
| 86 | enable_thinking: bool | None = None, | 78 | enable_thinking: bool | None = None, |
| @@ -88,12 +80,14 @@ class SearchEvaluationFramework: | @@ -88,12 +80,14 @@ class SearchEvaluationFramework: | ||
| 88 | intent_model: str | None = None, | 80 | intent_model: str | None = None, |
| 89 | intent_enable_thinking: bool | None = None, | 81 | intent_enable_thinking: bool | None = None, |
| 90 | ): | 82 | ): |
| 91 | - init_service(get_app_config().infrastructure.elasticsearch.host) | 83 | + app_cfg = get_app_config() |
| 84 | + se = app_cfg.search_evaluation | ||
| 85 | + init_service(app_cfg.infrastructure.elasticsearch.host) | ||
| 92 | self.tenant_id = str(tenant_id) | 86 | self.tenant_id = str(tenant_id) |
| 93 | - self.artifact_root = ensure_dir(artifact_root) | 87 | + self.artifact_root = ensure_dir(artifact_root if artifact_root is not None else se.artifact_root) |
| 94 | self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") | 88 | self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") |
| 95 | - self.search_client = SearchServiceClient(search_base_url, self.tenant_id) | ||
| 96 | - app_cfg = get_app_config() | 89 | + sb = search_base_url if search_base_url is not None else se.search_base_url |
| 90 | + self.search_client = SearchServiceClient(sb, self.tenant_id) | ||
| 97 | rerank_service_url = str( | 91 | rerank_service_url = str( |
| 98 | app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"] | 92 | app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"] |
| 99 | ) | 93 | ) |
| @@ -102,11 +96,11 @@ class SearchEvaluationFramework: | @@ -102,11 +96,11 @@ class SearchEvaluationFramework: | ||
| 102 | api_key = app_cfg.infrastructure.secrets.dashscope_api_key | 96 | api_key = app_cfg.infrastructure.secrets.dashscope_api_key |
| 103 | if not api_key: | 97 | if not api_key: |
| 104 | raise RuntimeError("dashscope_api_key is required for search evaluation annotation") | 98 | raise RuntimeError("dashscope_api_key is required for search evaluation annotation") |
| 105 | - model = str(judge_model or DEFAULT_JUDGE_MODEL) | ||
| 106 | - et = DEFAULT_JUDGE_ENABLE_THINKING if enable_thinking is None else enable_thinking | ||
| 107 | - use_batch = DEFAULT_JUDGE_DASHSCOPE_BATCH if use_dashscope_batch is None else use_dashscope_batch | ||
| 108 | - batch_window = DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW | ||
| 109 | - batch_poll = float(DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC) | 99 | + model = str(judge_model if judge_model is not None else se.judge_model) |
| 100 | + et = se.judge_enable_thinking if enable_thinking is None else enable_thinking | ||
| 101 | + use_batch = se.judge_dashscope_batch if use_dashscope_batch is None else use_dashscope_batch | ||
| 102 | + batch_window = se.judge_batch_completion_window | ||
| 103 | + batch_poll = float(se.judge_batch_poll_interval_sec) | ||
| 110 | self.label_client = DashScopeLabelClient( | 104 | self.label_client = DashScopeLabelClient( |
| 111 | model=model, | 105 | model=model, |
| 112 | base_url=str(llm_cfg["base_url"]), | 106 | base_url=str(llm_cfg["base_url"]), |
| @@ -116,8 +110,8 @@ class SearchEvaluationFramework: | @@ -116,8 +110,8 @@ class SearchEvaluationFramework: | ||
| 116 | enable_thinking=et, | 110 | enable_thinking=et, |
| 117 | use_batch=use_batch, | 111 | use_batch=use_batch, |
| 118 | ) | 112 | ) |
| 119 | - intent_m = str(intent_model or DEFAULT_INTENT_MODEL) | ||
| 120 | - intent_et = DEFAULT_INTENT_ENABLE_THINKING if intent_enable_thinking is None else intent_enable_thinking | 113 | + intent_m = str(intent_model if intent_model is not None else se.intent_model) |
| 114 | + intent_et = se.intent_enable_thinking if intent_enable_thinking is None else intent_enable_thinking | ||
| 121 | self.intent_client = DashScopeLabelClient( | 115 | self.intent_client = DashScopeLabelClient( |
| 122 | model=intent_m, | 116 | model=intent_m, |
| 123 | base_url=str(llm_cfg["base_url"]), | 117 | base_url=str(llm_cfg["base_url"]), |
| @@ -629,6 +623,21 @@ class SearchEvaluationFramework: | @@ -629,6 +623,21 @@ class SearchEvaluationFramework: | ||
| 629 | corpus = self.corpus_docs(refresh=False) | 623 | corpus = self.corpus_docs(refresh=False) |
| 630 | corpus_by_id = {str(d.get("spu_id")): d for d in corpus if str(d.get("spu_id") or "").strip()} | 624 | corpus_by_id = {str(d.get("spu_id")): d for d in corpus if str(d.get("spu_id") or "").strip()} |
| 631 | 625 | ||
| 626 | + rerank_pending_n = sum( | ||
| 627 | + 1 | ||
| 628 | + for d in corpus | ||
| 629 | + if str(d.get("spu_id") or "").strip() | ||
| 630 | + and str(d.get("spu_id")) not in pool_spu_ids | ||
| 631 | + ) | ||
| 632 | + _log.info( | ||
| 633 | + "[eval-rebuild] query=%r phase=rerank_outside_pool docs≈%s (pool=%s, force_refresh_rerank=%s); " | ||
| 634 | + "this can take a long time with no further logs until LLM batches start", | ||
| 635 | + query, | ||
| 636 | + rerank_pending_n, | ||
| 637 | + len(pool_spu_ids), | ||
| 638 | + force_refresh_rerank, | ||
| 639 | + ) | ||
| 640 | + | ||
| 632 | ranked_outside = self.full_corpus_rerank_outside_exclude( | 641 | ranked_outside = self.full_corpus_rerank_outside_exclude( |
| 633 | query=query, | 642 | query=query, |
| 634 | docs=corpus, | 643 | docs=corpus, |
| @@ -905,7 +914,9 @@ class SearchEvaluationFramework: | @@ -905,7 +914,9 @@ class SearchEvaluationFramework: | ||
| 905 | force_refresh_labels: bool = False, | 914 | force_refresh_labels: bool = False, |
| 906 | ) -> Dict[str, Any]: | 915 | ) -> Dict[str, Any]: |
| 907 | per_query = [] | 916 | per_query = [] |
| 908 | - for query in queries: | 917 | + total_q = len(queries) |
| 918 | + _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate) | ||
| 919 | + for q_index, query in enumerate(queries, start=1): | ||
| 909 | live = self.evaluate_live_query( | 920 | live = self.evaluate_live_query( |
| 910 | query, | 921 | query, |
| 911 | top_k=top_k, | 922 | top_k=top_k, |
| @@ -927,6 +938,16 @@ class SearchEvaluationFramework: | @@ -927,6 +938,16 @@ class SearchEvaluationFramework: | ||
| 927 | "total": live["total"], | 938 | "total": live["total"], |
| 928 | } | 939 | } |
| 929 | ) | 940 | ) |
| 941 | + m = live["metrics"] | ||
| 942 | + _log.info( | ||
| 943 | + "[batch-eval] (%s/%s) query=%r P@10=%s MAP_3=%s total_hits=%s", | ||
| 944 | + q_index, | ||
| 945 | + total_q, | ||
| 946 | + query, | ||
| 947 | + m.get("P@10"), | ||
| 948 | + m.get("MAP_3"), | ||
| 949 | + live.get("total"), | ||
| 950 | + ) | ||
| 930 | aggregate = aggregate_metrics([item["metrics"] for item in per_query]) | 951 | aggregate = aggregate_metrics([item["metrics"] for item in per_query]) |
| 931 | aggregate_distribution = { | 952 | aggregate_distribution = { |
| 932 | RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), | 953 | RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), |
| @@ -955,5 +976,11 @@ class SearchEvaluationFramework: | @@ -955,5 +976,11 @@ class SearchEvaluationFramework: | ||
| 955 | output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") | 976 | output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") |
| 956 | report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8") | 977 | report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8") |
| 957 | self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload) | 978 | self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload) |
| 979 | + _log.info( | ||
| 980 | + "[batch-eval] finished batch_id=%s per_query=%s json=%s", | ||
| 981 | + batch_id, | ||
| 982 | + len(per_query), | ||
| 983 | + output_json_path, | ||
| 984 | + ) | ||
| 958 | return payload | 985 | return payload |
| 959 | 986 |
scripts/evaluation/eval_framework/logging_setup.py
| 1 | -"""Configure dedicated eval run logs under repo ``logs/`` (see ``constants.EVAL_*_LOG_*``).""" | 1 | +"""Configure dedicated eval run logs (defaults: repo ``logs/``; override via ``config.yaml`` ``search_evaluation.eval_log_dir``).""" |
| 2 | 2 | ||
| 3 | from __future__ import annotations | 3 | from __future__ import annotations |
| 4 | 4 | ||
| 5 | import logging | 5 | import logging |
| 6 | import sys | 6 | import sys |
| 7 | +from pathlib import Path | ||
| 7 | 8 | ||
| 8 | -from .constants import EVAL_LOG_DIR, EVAL_LOG_FILE, EVAL_VERBOSE_LOG_DIR | 9 | +from .constants import EVAL_LOG_DIR |
| 9 | 10 | ||
| 10 | _setup_done = False | 11 | _setup_done = False |
| 11 | 12 | ||
| 12 | 13 | ||
| 13 | -def setup_eval_logging() -> None: | ||
| 14 | - """Attach file + stderr handlers to ``search_eval`` once; ensure log directories exist.""" | 14 | +def setup_eval_logging(eval_log_dir: Path | None = None) -> Path: |
| 15 | + """Attach file + stderr handlers to ``search_eval`` once; ensure log directories exist. | ||
| 16 | + | ||
| 17 | + Returns the path to the primary ``eval.log`` file. | ||
| 18 | + """ | ||
| 15 | global _setup_done | 19 | global _setup_done |
| 16 | - if _setup_done: | ||
| 17 | - return | 20 | + log_dir = Path(eval_log_dir).resolve() if eval_log_dir is not None else EVAL_LOG_DIR.resolve() |
| 21 | + verbose_dir = log_dir / "verbose" | ||
| 22 | + log_file = log_dir / "eval.log" | ||
| 18 | 23 | ||
| 19 | - EVAL_LOG_DIR.mkdir(parents=True, exist_ok=True) | ||
| 20 | - EVAL_VERBOSE_LOG_DIR.mkdir(parents=True, exist_ok=True) | 24 | + log_dir.mkdir(parents=True, exist_ok=True) |
| 25 | + verbose_dir.mkdir(parents=True, exist_ok=True) | ||
| 21 | 26 | ||
| 22 | fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s") | 27 | fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s") |
| 23 | root = logging.getLogger("search_eval") | 28 | root = logging.getLogger("search_eval") |
| 24 | root.setLevel(logging.INFO) | 29 | root.setLevel(logging.INFO) |
| 25 | if root.handlers: | 30 | if root.handlers: |
| 26 | _setup_done = True | 31 | _setup_done = True |
| 27 | - return | ||
| 28 | - fh = logging.FileHandler(EVAL_LOG_FILE, encoding="utf-8") | 32 | + return log_file |
| 33 | + | ||
| 34 | + fh = logging.FileHandler(log_file, encoding="utf-8") | ||
| 29 | fh.setFormatter(fmt) | 35 | fh.setFormatter(fmt) |
| 30 | sh = logging.StreamHandler(sys.stderr) | 36 | sh = logging.StreamHandler(sys.stderr) |
| 31 | sh.setFormatter(fmt) | 37 | sh.setFormatter(fmt) |
| @@ -33,3 +39,4 @@ def setup_eval_logging() -> None: | @@ -33,3 +39,4 @@ def setup_eval_logging() -> None: | ||
| 33 | root.addHandler(sh) | 39 | root.addHandler(sh) |
| 34 | root.propagate = False | 40 | root.propagate = False |
| 35 | _setup_done = True | 41 | _setup_done = True |
| 42 | + return log_file |