eval框架配置化

tangwang
1 parent 1c2ba48e
Showing 8 changed files with 380 additions and 97 deletions Show diff stats
config/config.yaml
config/loader.py
config/schema.py
scripts/evaluation/eval_framework/cli.py
scripts/evaluation/eval_framework/clients.py
scripts/evaluation/eval_framework/constants.py
scripts/evaluation/eval_framework/framework.py
scripts/evaluation/eval_framework/logging_setup.py
@@ -64,6 +64,41 @@ assets:
 product_enrich:
   max_workers: 40
+# 离线 / Web 相关性评估（scripts/evaluation、eval-web）
+# CLI 未显式传参时使用此处默认值；search_base_url 未配置时自动为 http://127.0.0.1:{runtime.api_port}
+search_evaluation:
+  artifact_root: artifacts/search_evaluation
+  queries_file: scripts/evaluation/queries/queries.txt
+  eval_log_dir: logs
+  default_tenant_id: '163'
+  search_base_url: ''
+  web_host: 0.0.0.0
+  web_port: 6010
+  judge_model: qwen3.5-plus
+  judge_enable_thinking: false
+  judge_dashscope_batch: false
+  intent_model: qwen3-max
+  intent_enable_thinking: true
+  judge_batch_completion_window: 24h
+  judge_batch_poll_interval_sec: 10.0
+  build_search_depth: 1000
+  build_rerank_depth: 10000
+  annotate_search_top_k: 120
+  annotate_rerank_top_k: 200
+  batch_top_k: 100
+  audit_top_k: 100
+  audit_limit_suspicious: 5
+  default_language: en
+  search_recall_top_k: 200
+  rerank_high_threshold: 0.5
+  rerank_high_skip_count: 1000
+  rebuild_llm_batch_size: 50
+  rebuild_min_llm_batches: 10
+  rebuild_max_llm_batches: 40
+  rebuild_irrelevant_stop_ratio: 0.799
+  rebuild_irrel_low_combined_stop_ratio: 0.959
+  rebuild_irrelevant_stop_streak: 3
+
 # ES Index Settings (基础设置)
 es_settings:
   number_of_shards: 1
@@ -75,7 +110,9 @@ es_settings:
 # 若需要按某个语言单独调权，也可以加显式 key（例如 title.de: 3.2）。
 field_boosts:
   title: 3.0
-  qanchors: 2.5
+  qanchors: 2.3
+  enriched_tags: 2.3
+  keywords: 2.0
   tags: 2.0
   category_name_text: 2.0
   category_path: 2.0
@@ -152,7 +189,11 @@ query_config:
     multilingual_fields:
     - title
     - qanchors
+    - keywords
     - enriched_tags
+    - option1_values
+    - option2_values
+    - option3_values
     - category_path
     - category_name_text
     - brief
@@ -46,6 +46,7 @@ from config.schema import (
     RerankServiceInstanceConfig,
     RuntimeConfig,
     SearchConfig,
+    SearchEvaluationConfig,
     SecretsConfig,
     ServicesConfig,
     SPUConfig,
@@ -263,6 +264,7 @@ class AppConfigLoader:
         product_enrich_config = ProductEnrichConfig(
             max_workers=int(product_enrich_raw.get("max_workers", 40)),
         )
+        search_evaluation_config = self._build_search_evaluation_config(raw, runtime_config)
         metadata = ConfigMetadata(
             loaded_files=tuple(loaded_files),
@@ -278,6 +280,7 @@ class AppConfigLoader:
             services=services_config,
             tenants=tenants_config,
             assets=AssetsConfig(query_rewrite_dictionary_path=rewrite_path),
+            search_evaluation=search_evaluation_config,
             metadata=metadata,
         )
@@ -290,6 +293,7 @@ class AppConfigLoader:
             services=app_config.services,
             tenants=app_config.tenants,
             assets=app_config.assets,
+            search_evaluation=app_config.search_evaluation,
             metadata=ConfigMetadata(
                 loaded_files=app_config.metadata.loaded_files,
                 config_hash=config_hash,
@@ -297,6 +301,89 @@ class AppConfigLoader:
             ),
         )
+    def _build_search_evaluation_config(self, raw: Dict[str, Any], runtime: RuntimeConfig) -> SearchEvaluationConfig:
+        se = raw.get("search_evaluation") if isinstance(raw.get("search_evaluation"), dict) else {}
+        default_artifact = (self.project_root / "artifacts" / "search_evaluation").resolve()
+        default_queries = (self.project_root / "scripts" / "evaluation" / "queries" / "queries.txt").resolve()
+        default_log_dir = (self.project_root / "logs").resolve()
+        default_search_base = f"http://127.0.0.1:{int(runtime.api_port)}"
+
+        def _project_path(value: Any, default: Path) -> Path:
+            if value in (None, ""):
+                return default
+            candidate = Path(str(value))
+            if candidate.is_absolute():
+                return candidate.resolve()
+            return (self.project_root / candidate).resolve()
+
+        def _str(key: str, default: str) -> str:
+            v = se.get(key)
+            if v is None or (isinstance(v, str) and not v.strip()):
+                return default
+            return str(v).strip()
+
+        def _int(key: str, default: int) -> int:
+            v = se.get(key)
+            if v is None:
+                return default
+            return int(v)
+
+        def _float(key: str, default: float) -> float:
+            v = se.get(key)
+            if v is None:
+                return default
+            return float(v)
+
+        def _bool(key: str, default: bool) -> bool:
+            v = se.get(key)
+            if v is None:
+                return default
+            if isinstance(v, bool):
+                return v
+            if isinstance(v, str):
+                return v.strip().lower() in {"1", "true", "yes", "on"}
+            return bool(v)
+
+        raw_search_url = se.get("search_base_url")
+        if raw_search_url is None or (isinstance(raw_search_url, str) and not str(raw_search_url).strip()):
+            search_base_url = default_search_base
+        else:
+            search_base_url = str(raw_search_url).strip()
+
+        return SearchEvaluationConfig(
+            artifact_root=_project_path(se.get("artifact_root"), default_artifact),
+            queries_file=_project_path(se.get("queries_file"), default_queries),
+            eval_log_dir=_project_path(se.get("eval_log_dir"), default_log_dir),
+            default_tenant_id=_str("default_tenant_id", "163"),
+            search_base_url=search_base_url,
+            web_host=_str("web_host", "0.0.0.0"),
+            web_port=_int("web_port", 6010),
+            judge_model=_str("judge_model", "qwen3.5-plus"),
+            judge_enable_thinking=_bool("judge_enable_thinking", False),
+            judge_dashscope_batch=_bool("judge_dashscope_batch", False),
+            intent_model=_str("intent_model", "qwen3-max"),
+            intent_enable_thinking=_bool("intent_enable_thinking", True),
+            judge_batch_completion_window=_str("judge_batch_completion_window", "24h"),
+            judge_batch_poll_interval_sec=_float("judge_batch_poll_interval_sec", 10.0),
+            build_search_depth=_int("build_search_depth", 1000),
+            build_rerank_depth=_int("build_rerank_depth", 10000),
+            annotate_search_top_k=_int("annotate_search_top_k", 120),
+            annotate_rerank_top_k=_int("annotate_rerank_top_k", 200),
+            batch_top_k=_int("batch_top_k", 100),
+            audit_top_k=_int("audit_top_k", 100),
+            audit_limit_suspicious=_int("audit_limit_suspicious", 5),
+            default_language=_str("default_language", "en"),
+            search_recall_top_k=_int("search_recall_top_k", 200),
+            rerank_high_threshold=_float("rerank_high_threshold", 0.5),
+            rerank_high_skip_count=_int("rerank_high_skip_count", 1000),
+            rebuild_llm_batch_size=_int("rebuild_llm_batch_size", 50),
+            rebuild_min_llm_batches=_int("rebuild_min_llm_batches", 10),
+            rebuild_max_llm_batches=_int("rebuild_max_llm_batches", 40),
+            rebuild_irrelevant_stop_ratio=_float("rebuild_irrelevant_stop_ratio", 0.799),
+            rebuild_irrel_low_combined_stop_ratio=_float("rebuild_irrel_low_combined_stop_ratio", 0.959),
+            rebuild_irrelevant_stop_streak=_int("rebuild_irrelevant_stop_streak", 3),
+        )
+
     def _build_search_config(self, raw: Dict[str, Any], rewrite_dictionary: Dict[str, str]) -> SearchConfig:
         field_boosts = raw.get("field_boosts") or {}
         if not isinstance(field_boosts, dict):
@@ -376,6 +376,43 @@ class AssetsConfig:
 @dataclass(frozen=True)
+class SearchEvaluationConfig:
+    """Offline / web UI search evaluation (YAML: ``search_evaluation``)."""
+
+    artifact_root: Path
+    queries_file: Path
+    eval_log_dir: Path
+    default_tenant_id: str
+    search_base_url: str
+    web_host: str
+    web_port: int
+    judge_model: str
+    judge_enable_thinking: bool
+    judge_dashscope_batch: bool
+    intent_model: str
+    intent_enable_thinking: bool
+    judge_batch_completion_window: str
+    judge_batch_poll_interval_sec: float
+    build_search_depth: int
+    build_rerank_depth: int
+    annotate_search_top_k: int
+    annotate_rerank_top_k: int
+    batch_top_k: int
+    audit_top_k: int
+    audit_limit_suspicious: int
+    default_language: str
+    search_recall_top_k: int
+    rerank_high_threshold: float
+    rerank_high_skip_count: int
+    rebuild_llm_batch_size: int
+    rebuild_min_llm_batches: int
+    rebuild_max_llm_batches: int
+    rebuild_irrelevant_stop_ratio: float
+    rebuild_irrel_low_combined_stop_ratio: float
+    rebuild_irrelevant_stop_streak: int
+
+
+@dataclass(frozen=True)
 class ConfigMetadata:
     loaded_files: Tuple[str, ...]
     config_hash: str
@@ -393,6 +430,7 @@ class AppConfig:
     services: ServicesConfig
     tenants: TenantCatalogConfig
     assets: AssetsConfig
+    search_evaluation: SearchEvaluationConfig
     metadata: ConfigMetadata
     def sanitized_dict(self) -> Dict[str, Any]:
@@ -8,21 +8,6 @@ import logging
 from pathlib import Path
 from typing import Any, Dict
-from .constants import (
-    DEFAULT_INTENT_ENABLE_THINKING,
-    DEFAULT_INTENT_MODEL,
-    DEFAULT_QUERY_FILE,
-    DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
-    DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
-    DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
-    DEFAULT_REBUILD_LLM_BATCH_SIZE,
-    DEFAULT_REBUILD_MAX_LLM_BATCHES,
-    DEFAULT_REBUILD_MIN_LLM_BATCHES,
-    DEFAULT_RERANK_HIGH_SKIP_COUNT,
-    DEFAULT_RERANK_HIGH_THRESHOLD,
-    DEFAULT_SEARCH_RECALL_TOP_K,
-)
-from .constants import EVAL_LOG_FILE
 from .framework import SearchEvaluationFramework
 from .logging_setup import setup_eval_logging
 from .utils import ensure_dir, utc_now_iso, utc_timestamp
@@ -36,19 +21,19 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
         "--judge-model",
         default=None,
         metavar="MODEL",
-        help="Judge LLM model (default: eval_framework.constants.DEFAULT_JUDGE_MODEL).",
+        help="Judge LLM model (default: config.yaml search_evaluation.judge_model).",
     )
     p.add_argument(
         "--enable-thinking",
         action=argparse.BooleanOptionalAction,
         default=None,
-        help="enable_thinking for DashScope (default: DEFAULT_JUDGE_ENABLE_THINKING).",
+        help="enable_thinking for DashScope (default: search_evaluation.judge_enable_thinking).",
     )
     p.add_argument(
         "--dashscope-batch",
         action=argparse.BooleanOptionalAction,
         default=None,
-        help="DashScope Batch File API vs sync chat (default: DEFAULT_JUDGE_DASHSCOPE_BATCH).",
+        help="DashScope Batch File API vs sync chat (default: search_evaluation.judge_dashscope_batch).",
     )
@@ -57,13 +42,13 @@ def add_intent_llm_args(p: argparse.ArgumentParser) -> None:
         "--intent-model",
         default=None,
         metavar="MODEL",
-        help=f"Query-intent LLM model before relevance judging (default: {DEFAULT_INTENT_MODEL!r}).",
+        help="Query-intent LLM model before relevance judging (default: search_evaluation.intent_model).",
     )
     p.add_argument(
         "--intent-enable-thinking",
         action=argparse.BooleanOptionalAction,
         default=None,
-        help=f"enable_thinking for intent model (default: {DEFAULT_INTENT_ENABLE_THINKING}).",
+        help="enable_thinking for intent model (default: search_evaluation.intent_enable_thinking).",
     )
@@ -82,17 +67,102 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]:
     return kw
+def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None:
+    """Fill None CLI defaults from ``config.yaml`` ``search_evaluation`` (via ``get_app_config()``)."""
+    from config.loader import get_app_config
+
+    se = get_app_config().search_evaluation
+    if getattr(args, "tenant_id", None) in (None, ""):
+        args.tenant_id = se.default_tenant_id
+    if getattr(args, "queries_file", None) in (None, ""):
+        args.queries_file = str(se.queries_file)
+    if getattr(args, "language", None) in (None, ""):
+        args.language = se.default_language
+
+    if args.command == "serve":
+        if getattr(args, "host", None) in (None, ""):
+            args.host = se.web_host
+        if getattr(args, "port", None) is None:
+            args.port = se.web_port
+
+    if args.command == "batch":
+        if getattr(args, "top_k", None) is None:
+            args.top_k = se.batch_top_k
+
+    if args.command == "audit":
+        if getattr(args, "top_k", None) is None:
+            args.top_k = se.audit_top_k
+        if getattr(args, "limit_suspicious", None) is None:
+            args.limit_suspicious = se.audit_limit_suspicious
+
+    if args.command == "build":
+        if getattr(args, "search_depth", None) is None:
+            args.search_depth = se.build_search_depth
+        if getattr(args, "rerank_depth", None) is None:
+            args.rerank_depth = se.build_rerank_depth
+        if getattr(args, "annotate_search_top_k", None) is None:
+            args.annotate_search_top_k = se.annotate_search_top_k
+        if getattr(args, "annotate_rerank_top_k", None) is None:
+            args.annotate_rerank_top_k = se.annotate_rerank_top_k
+        if getattr(args, "search_recall_top_k", None) is None:
+            args.search_recall_top_k = se.search_recall_top_k
+        if getattr(args, "rerank_high_threshold", None) is None:
+            args.rerank_high_threshold = se.rerank_high_threshold
+        if getattr(args, "rerank_high_skip_count", None) is None:
+            args.rerank_high_skip_count = se.rerank_high_skip_count
+        if getattr(args, "rebuild_llm_batch_size", None) is None:
+            args.rebuild_llm_batch_size = se.rebuild_llm_batch_size
+        if getattr(args, "rebuild_min_batches", None) is None:
+            args.rebuild_min_batches = se.rebuild_min_llm_batches
+        if getattr(args, "rebuild_max_batches", None) is None:
+            args.rebuild_max_batches = se.rebuild_max_llm_batches
+        if getattr(args, "rebuild_irrelevant_stop_ratio", None) is None:
+            args.rebuild_irrelevant_stop_ratio = se.rebuild_irrelevant_stop_ratio
+        if getattr(args, "rebuild_irrel_low_combined_stop_ratio", None) is None:
+            args.rebuild_irrel_low_combined_stop_ratio = se.rebuild_irrel_low_combined_stop_ratio
+        if getattr(args, "rebuild_irrelevant_stop_streak", None) is None:
+            args.rebuild_irrelevant_stop_streak = se.rebuild_irrelevant_stop_streak
+
+
 def build_cli_parser() -> argparse.ArgumentParser:
     parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
     sub = parser.add_subparsers(dest="command", required=True)
     build = sub.add_parser("build", help="Build pooled annotation set for queries")
-    build.add_argument("--tenant-id", default="163")
-    build.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
-    build.add_argument("--search-depth", type=int, default=1000)
-    build.add_argument("--rerank-depth", type=int, default=10000)
-    build.add_argument("--annotate-search-top-k", type=int, default=120)
-    build.add_argument("--annotate-rerank-top-k", type=int, default=200)
+    build.add_argument(
+        "--tenant-id",
+        default=None,
+        help="Tenant id (default: search_evaluation.default_tenant_id in config.yaml).",
+    )
+    build.add_argument(
+        "--queries-file",
+        default=None,
+        help="Query list file (default: search_evaluation.queries_file).",
+    )
+    build.add_argument(
+        "--search-depth",
+        type=int,
+        default=None,
+        help="Default: search_evaluation.build_search_depth.",
+    )
+    build.add_argument(
+        "--rerank-depth",
+        type=int,
+        default=None,
+        help="Default: search_evaluation.build_rerank_depth.",
+    )
+    build.add_argument(
+        "--annotate-search-top-k",
+        type=int,
+        default=None,
+        help="Default: search_evaluation.annotate_search_top_k.",
+    )
+    build.add_argument(
+        "--annotate-rerank-top-k",
+        type=int,
+        default=None,
+        help="Default: search_evaluation.annotate_rerank_top_k.",
+    )
     build.add_argument(
         "--search-recall-top-k",
         type=int,
@@ -118,7 +188,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
         "--rebuild-irrelevant-stop-ratio",
         type=float,
         default=None,
-        help="Rebuild only: bad batch requires irrelevant_ratio > this (default 0.939).",
+        help="Rebuild only: bad batch requires irrelevant_ratio > this (default: search_evaluation.rebuild_irrelevant_stop_ratio).",
     )
     build.add_argument(
         "--rebuild-irrel-low-combined-stop-ratio",
@@ -132,36 +202,45 @@ def build_cli_parser() -> argparse.ArgumentParser:
         default=None,
         help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).",
     )
-    build.add_argument("--language", default="en")
+    build.add_argument(
+        "--language",
+        default=None,
+        help="Default: search_evaluation.default_language.",
+    )
     build.add_argument("--force-refresh-rerank", action="store_true")
     build.add_argument("--force-refresh-labels", action="store_true")
     add_judge_llm_args(build)
     add_intent_llm_args(build)
     batch = sub.add_parser("batch", help="Run batch evaluation against live search")
-    batch.add_argument("--tenant-id", default="163")
-    batch.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
-    batch.add_argument("--top-k", type=int, default=100)
-    batch.add_argument("--language", default="en")
+    batch.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.")
+    batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
+    batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.")
+    batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
     batch.add_argument("--force-refresh-labels", action="store_true")
     add_judge_llm_args(batch)
     add_intent_llm_args(batch)
     audit = sub.add_parser("audit", help="Audit annotation quality for queries")
-    audit.add_argument("--tenant-id", default="163")
-    audit.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
-    audit.add_argument("--top-k", type=int, default=100)
-    audit.add_argument("--language", default="en")
-    audit.add_argument("--limit-suspicious", type=int, default=5)
+    audit.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.")
+    audit.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
+    audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.")
+    audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
+    audit.add_argument(
+        "--limit-suspicious",
+        type=int,
+        default=None,
+        help="Default: search_evaluation.audit_limit_suspicious.",
+    )
     audit.add_argument("--force-refresh-labels", action="store_true")
     add_judge_llm_args(audit)
     add_intent_llm_args(audit)
     serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
-    serve.add_argument("--tenant-id", default="163")
-    serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
-    serve.add_argument("--host", default="0.0.0.0")
-    serve.add_argument("--port", type=int, default=6010)
+    serve.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.")
+    serve.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
+    serve.add_argument("--host", default=None, help="Default: search_evaluation.web_host.")
+    serve.add_argument("--port", type=int, default=None, help="Default: search_evaluation.web_port.")
     add_judge_llm_args(serve)
     add_intent_llm_args(serve)
@@ -175,23 +254,19 @@ def run_build(args: argparse.Namespace) -> None:
     rebuild_kwargs = {}
     if args.force_refresh_labels:
         rebuild_kwargs = {
-            "search_recall_top_k": args.search_recall_top_k if args.search_recall_top_k is not None else DEFAULT_SEARCH_RECALL_TOP_K,
-            "rerank_high_threshold": args.rerank_high_threshold if args.rerank_high_threshold is not None else DEFAULT_RERANK_HIGH_THRESHOLD,
-            "rerank_high_skip_count": args.rerank_high_skip_count if args.rerank_high_skip_count is not None else DEFAULT_RERANK_HIGH_SKIP_COUNT,
-            "rebuild_llm_batch_size": args.rebuild_llm_batch_size if args.rebuild_llm_batch_size is not None else DEFAULT_REBUILD_LLM_BATCH_SIZE,
-            "rebuild_min_batches": args.rebuild_min_batches if args.rebuild_min_batches is not None else DEFAULT_REBUILD_MIN_LLM_BATCHES,
-            "rebuild_max_batches": args.rebuild_max_batches if args.rebuild_max_batches is not None else DEFAULT_REBUILD_MAX_LLM_BATCHES,
-            "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio
-            if args.rebuild_irrelevant_stop_ratio is not None
-            else DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
-            "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio
-            if args.rebuild_irrel_low_combined_stop_ratio is not None
-            else DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
-            "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak
-            if args.rebuild_irrelevant_stop_streak is not None
-            else DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
+            "search_recall_top_k": args.search_recall_top_k,
+            "rerank_high_threshold": args.rerank_high_threshold,
+            "rerank_high_skip_count": args.rerank_high_skip_count,
+            "rebuild_llm_batch_size": args.rebuild_llm_batch_size,
+            "rebuild_min_batches": args.rebuild_min_batches,
+            "rebuild_max_batches": args.rebuild_max_batches,
+            "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio,
+            "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio,
+            "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak,
         }
-    for query in queries:
+    total_q = len(queries)
+    for q_index, query in enumerate(queries, start=1):
+        _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query)
         result = framework.build_query_annotation_set(
             query=query,
             search_depth=args.search_depth,
@@ -230,6 +305,7 @@ def run_build(args: argparse.Namespace) -> None:
 def run_batch(args: argparse.Namespace) -> None:
     framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
     queries = framework.queries_from_file(Path(args.queries_file))
+    _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries))
     payload = framework.batch_evaluate(
         queries=queries,
         top_k=args.top_k,
@@ -302,14 +378,18 @@ def run_serve(args: argparse.Namespace) -> None:
 def main() -> None:
-    setup_eval_logging()
+    from config.loader import get_app_config
+
+    se = get_app_config().search_evaluation
+    log_file = setup_eval_logging(se.eval_log_dir)
     parser = build_cli_parser()
     args = parser.parse_args()
+    _apply_search_evaluation_cli_defaults(args)
     logging.getLogger("search_eval").info(
         "CLI start command=%s tenant_id=%s log_file=%s",
         args.command,
         getattr(args, "tenant_id", ""),
-        EVAL_LOG_FILE.resolve(),
+        log_file.resolve(),
     )
     if args.command == "build":
         run_build(args)
@@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple
 import requests
-from .constants import EVAL_VERBOSE_LOG_FILE, VALID_LABELS
+from .constants import VALID_LABELS
 from .logging_setup import setup_eval_logging
 from .prompts import classify_prompt, intent_analysis_prompt
 from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
@@ -23,13 +23,16 @@ _eval_llm_verbose_path_logged = False
 def _get_eval_llm_verbose_logger() -> logging.Logger:
-    """File logger for full LLM prompts/responses → ``logs/verbose/eval_verbose.log``."""
-    setup_eval_logging()
+    """File logger for full LLM prompts/responses under ``search_evaluation.eval_log_dir/verbose/``."""
+    from config.loader import get_app_config
+
+    se = get_app_config().search_evaluation
+    setup_eval_logging(se.eval_log_dir)
     global _eval_llm_verbose_logger_singleton, _eval_llm_verbose_path_logged
     with _VERBOSE_LOGGER_LOCK:
         if _eval_llm_verbose_logger_singleton is not None:
             return _eval_llm_verbose_logger_singleton
-        log_path = EVAL_VERBOSE_LOG_FILE
+        log_path = se.eval_log_dir / "verbose" / "eval_verbose.log"
         log_path.parent.mkdir(parents=True, exist_ok=True)
         lg = logging.getLogger("search_eval.verbose_llm")
         lg.setLevel(logging.INFO)
@@ -46,7 +46,7 @@ DEFAULT_JUDGE_ENABLE_THINKING = False
 DEFAULT_JUDGE_DASHSCOPE_BATCH = False
 # Query-intent LLM (separate from judge; used once per query, injected into relevance prompts)
-DEFAULT_INTENT_MODEL = "qwen-max"
+DEFAULT_INTENT_MODEL = "qwen3-max"
 DEFAULT_INTENT_ENABLE_THINKING = True
 DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
 DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0
@@ -16,14 +16,6 @@ from indexer.mapping_generator import get_tenant_index_name
 from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient
 from .constants import (
-    DEFAULT_ARTIFACT_ROOT,
-    DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW,
-    DEFAULT_INTENT_ENABLE_THINKING,
-    DEFAULT_INTENT_MODEL,
-    DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC,
-    DEFAULT_JUDGE_DASHSCOPE_BATCH,
-    DEFAULT_JUDGE_ENABLE_THINKING,
-    DEFAULT_JUDGE_MODEL,
     DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
     DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
     DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK,
@@ -79,8 +71,8 @@ class SearchEvaluationFramework:
     def __init__(
         self,
         tenant_id: str,
-        artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
-        search_base_url: str = "http://localhost:6002",
+        artifact_root: Path | None = None,
+        search_base_url: str | None = None,
         *,
         judge_model: str | None = None,
         enable_thinking: bool | None = None,
@@ -88,12 +80,14 @@ class SearchEvaluationFramework:
         intent_model: str | None = None,
         intent_enable_thinking: bool | None = None,
     ):
-        init_service(get_app_config().infrastructure.elasticsearch.host)
+        app_cfg = get_app_config()
+        se = app_cfg.search_evaluation
+        init_service(app_cfg.infrastructure.elasticsearch.host)
         self.tenant_id = str(tenant_id)
-        self.artifact_root = ensure_dir(artifact_root)
+        self.artifact_root = ensure_dir(artifact_root if artifact_root is not None else se.artifact_root)
         self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
-        self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
-        app_cfg = get_app_config()
+        sb = search_base_url if search_base_url is not None else se.search_base_url
+        self.search_client = SearchServiceClient(sb, self.tenant_id)
         rerank_service_url = str(
             app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
         )
@@ -102,11 +96,11 @@ class SearchEvaluationFramework:
         api_key = app_cfg.infrastructure.secrets.dashscope_api_key
         if not api_key:
             raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
-        model = str(judge_model or DEFAULT_JUDGE_MODEL)
-        et = DEFAULT_JUDGE_ENABLE_THINKING if enable_thinking is None else enable_thinking
-        use_batch = DEFAULT_JUDGE_DASHSCOPE_BATCH if use_dashscope_batch is None else use_dashscope_batch
-        batch_window = DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW
-        batch_poll = float(DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC)
+        model = str(judge_model if judge_model is not None else se.judge_model)
+        et = se.judge_enable_thinking if enable_thinking is None else enable_thinking
+        use_batch = se.judge_dashscope_batch if use_dashscope_batch is None else use_dashscope_batch
+        batch_window = se.judge_batch_completion_window
+        batch_poll = float(se.judge_batch_poll_interval_sec)
         self.label_client = DashScopeLabelClient(
             model=model,
             base_url=str(llm_cfg["base_url"]),
@@ -116,8 +110,8 @@ class SearchEvaluationFramework:
             enable_thinking=et,
             use_batch=use_batch,
         )
-        intent_m = str(intent_model or DEFAULT_INTENT_MODEL)
-        intent_et = DEFAULT_INTENT_ENABLE_THINKING if intent_enable_thinking is None else intent_enable_thinking
+        intent_m = str(intent_model if intent_model is not None else se.intent_model)
+        intent_et = se.intent_enable_thinking if intent_enable_thinking is None else intent_enable_thinking
         self.intent_client = DashScopeLabelClient(
             model=intent_m,
             base_url=str(llm_cfg["base_url"]),
@@ -629,6 +623,21 @@ class SearchEvaluationFramework:
         corpus = self.corpus_docs(refresh=False)
         corpus_by_id = {str(d.get("spu_id")): d for d in corpus if str(d.get("spu_id") or "").strip()}
+        rerank_pending_n = sum(
+            1
+            for d in corpus
+            if str(d.get("spu_id") or "").strip()
+            and str(d.get("spu_id")) not in pool_spu_ids
+        )
+        _log.info(
+            "[eval-rebuild] query=%r phase=rerank_outside_pool docs≈%s (pool=%s, force_refresh_rerank=%s); "
+            "this can take a long time with no further logs until LLM batches start",
+            query,
+            rerank_pending_n,
+            len(pool_spu_ids),
+            force_refresh_rerank,
+        )
+
         ranked_outside = self.full_corpus_rerank_outside_exclude(
             query=query,
             docs=corpus,
@@ -905,7 +914,9 @@ class SearchEvaluationFramework:
         force_refresh_labels: bool = False,
     ) -> Dict[str, Any]:
         per_query = []
-        for query in queries:
+        total_q = len(queries)
+        _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate)
+        for q_index, query in enumerate(queries, start=1):
             live = self.evaluate_live_query(
                 query,
                 top_k=top_k,
@@ -927,6 +938,16 @@ class SearchEvaluationFramework:
                     "total": live["total"],
                 }
             )
+            m = live["metrics"]
+            _log.info(
+                "[batch-eval] (%s/%s) query=%r P@10=%s MAP_3=%s total_hits=%s",
+                q_index,
+                total_q,
+                query,
+                m.get("P@10"),
+                m.get("MAP_3"),
+                live.get("total"),
+            )
         aggregate = aggregate_metrics([item["metrics"] for item in per_query])
         aggregate_distribution = {
             RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
@@ -955,5 +976,11 @@ class SearchEvaluationFramework:
         output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
         report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
         self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
+        _log.info(
+            "[batch-eval] finished batch_id=%s per_query=%s json=%s",
+            batch_id,
+            len(per_query),
+            output_json_path,
+        )
         return payload
-"""Configure dedicated eval run logs under repo ``logs/`` (see ``constants.EVAL_*_LOG_*``)."""
+"""Configure dedicated eval run logs (defaults: repo ``logs/``; override via ``config.yaml`` ``search_evaluation.eval_log_dir``)."""
 from __future__ import annotations
 import logging
 import sys
+from pathlib import Path
-from .constants import EVAL_LOG_DIR, EVAL_LOG_FILE, EVAL_VERBOSE_LOG_DIR
+from .constants import EVAL_LOG_DIR
 _setup_done = False
-def setup_eval_logging() -> None:
-    """Attach file + stderr handlers to ``search_eval`` once; ensure log directories exist."""
+def setup_eval_logging(eval_log_dir: Path | None = None) -> Path:
+    """Attach file + stderr handlers to ``search_eval`` once; ensure log directories exist.
+
+    Returns the path to the primary ``eval.log`` file.
+    """
     global _setup_done
-    if _setup_done:
-        return
+    log_dir = Path(eval_log_dir).resolve() if eval_log_dir is not None else EVAL_LOG_DIR.resolve()
+    verbose_dir = log_dir / "verbose"
+    log_file = log_dir / "eval.log"
-    EVAL_LOG_DIR.mkdir(parents=True, exist_ok=True)
-    EVAL_VERBOSE_LOG_DIR.mkdir(parents=True, exist_ok=True)
+    log_dir.mkdir(parents=True, exist_ok=True)
+    verbose_dir.mkdir(parents=True, exist_ok=True)
     fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
     root = logging.getLogger("search_eval")
     root.setLevel(logging.INFO)
     if root.handlers:
         _setup_done = True
-        return
-    fh = logging.FileHandler(EVAL_LOG_FILE, encoding="utf-8")
+        return log_file
+
+    fh = logging.FileHandler(log_file, encoding="utf-8")
     fh.setFormatter(fmt)
     sh = logging.StreamHandler(sys.stderr)
     sh.setFormatter(fmt)
@@ -33,3 +39,4 @@ def setup_eval_logging() -> None:
     root.addHandler(sh)
     root.propagate = False
     _setup_done = True
+    return log_file