diff --git a/config/config.yaml b/config/config.yaml index f217327..9f1c772 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -64,6 +64,41 @@ assets: product_enrich: max_workers: 40 +# 离线 / Web 相关性评估(scripts/evaluation、eval-web) +# CLI 未显式传参时使用此处默认值;search_base_url 未配置时自动为 http://127.0.0.1:{runtime.api_port} +search_evaluation: + artifact_root: artifacts/search_evaluation + queries_file: scripts/evaluation/queries/queries.txt + eval_log_dir: logs + default_tenant_id: '163' + search_base_url: '' + web_host: 0.0.0.0 + web_port: 6010 + judge_model: qwen3.5-plus + judge_enable_thinking: false + judge_dashscope_batch: false + intent_model: qwen3-max + intent_enable_thinking: true + judge_batch_completion_window: 24h + judge_batch_poll_interval_sec: 10.0 + build_search_depth: 1000 + build_rerank_depth: 10000 + annotate_search_top_k: 120 + annotate_rerank_top_k: 200 + batch_top_k: 100 + audit_top_k: 100 + audit_limit_suspicious: 5 + default_language: en + search_recall_top_k: 200 + rerank_high_threshold: 0.5 + rerank_high_skip_count: 1000 + rebuild_llm_batch_size: 50 + rebuild_min_llm_batches: 10 + rebuild_max_llm_batches: 40 + rebuild_irrelevant_stop_ratio: 0.799 + rebuild_irrel_low_combined_stop_ratio: 0.959 + rebuild_irrelevant_stop_streak: 3 + # ES Index Settings (基础设置) es_settings: number_of_shards: 1 @@ -75,7 +110,9 @@ es_settings: # 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 field_boosts: title: 3.0 - qanchors: 2.5 + qanchors: 2.3 + enriched_tags: 2.3 + keywords: 2.0 tags: 2.0 category_name_text: 2.0 category_path: 2.0 @@ -152,7 +189,11 @@ query_config: multilingual_fields: - title - qanchors + - keywords - enriched_tags + - option1_values + - option2_values + - option3_values - category_path - category_name_text - brief diff --git a/config/loader.py b/config/loader.py index c084b3d..c2332c6 100644 --- a/config/loader.py +++ b/config/loader.py @@ -46,6 +46,7 @@ from config.schema import ( RerankServiceInstanceConfig, RuntimeConfig, SearchConfig, + 
def _build_search_evaluation_config(self, raw: Dict[str, Any], runtime: RuntimeConfig) -> SearchEvaluationConfig:
    """Build the ``search_evaluation`` config section with per-key fallbacks.

    Missing or blank keys fall back to project-relative defaults; a blank
    ``search_base_url`` resolves to the local API at ``runtime.api_port``.

    :param raw: full parsed YAML config mapping.
    :param runtime: already-built runtime section (only ``api_port`` is read).
    :returns: a frozen ``SearchEvaluationConfig``.
    """
    raw_section = raw.get("search_evaluation")  # read once instead of twice
    se = raw_section if isinstance(raw_section, dict) else {}
    default_artifact = (self.project_root / "artifacts" / "search_evaluation").resolve()
    default_queries = (self.project_root / "scripts" / "evaluation" / "queries" / "queries.txt").resolve()
    default_log_dir = (self.project_root / "logs").resolve()
    default_search_base = f"http://127.0.0.1:{int(runtime.api_port)}"

    def _project_path(value: Any, default: Path) -> Path:
        # Relative paths are anchored at the project root; absolute paths win.
        if value in (None, ""):
            return default
        candidate = Path(str(value))
        if candidate.is_absolute():
            return candidate.resolve()
        return (self.project_root / candidate).resolve()

    def _str(key: str, default: str) -> str:
        # Blank / whitespace-only strings count as "not configured".
        v = se.get(key)
        if v is None or (isinstance(v, str) and not v.strip()):
            return default
        return str(v).strip()

    def _int(key: str, default: int) -> int:
        v = se.get(key)
        return default if v is None else int(v)

    def _float(key: str, default: float) -> float:
        v = se.get(key)
        return default if v is None else float(v)

    def _bool(key: str, default: bool) -> bool:
        v = se.get(key)
        if v is None:
            return default
        if isinstance(v, bool):
            return v
        if isinstance(v, str):
            return v.strip().lower() in {"1", "true", "yes", "on"}
        return bool(v)

    return SearchEvaluationConfig(
        artifact_root=_project_path(se.get("artifact_root"), default_artifact),
        queries_file=_project_path(se.get("queries_file"), default_queries),
        eval_log_dir=_project_path(se.get("eval_log_dir"), default_log_dir),
        default_tenant_id=_str("default_tenant_id", "163"),
        # Blank URL -> local API; same blank-means-default rule as _str,
        # so reuse it instead of the hand-rolled duplicate.
        search_base_url=_str("search_base_url", default_search_base),
        web_host=_str("web_host", "0.0.0.0"),
        web_port=_int("web_port", 6010),
        judge_model=_str("judge_model", "qwen3.5-plus"),
        judge_enable_thinking=_bool("judge_enable_thinking", False),
        judge_dashscope_batch=_bool("judge_dashscope_batch", False),
        intent_model=_str("intent_model", "qwen3-max"),
        intent_enable_thinking=_bool("intent_enable_thinking", True),
        judge_batch_completion_window=_str("judge_batch_completion_window", "24h"),
        judge_batch_poll_interval_sec=_float("judge_batch_poll_interval_sec", 10.0),
        build_search_depth=_int("build_search_depth", 1000),
        build_rerank_depth=_int("build_rerank_depth", 10000),
        annotate_search_top_k=_int("annotate_search_top_k", 120),
        annotate_rerank_top_k=_int("annotate_rerank_top_k", 200),
        batch_top_k=_int("batch_top_k", 100),
        audit_top_k=_int("audit_top_k", 100),
        audit_limit_suspicious=_int("audit_limit_suspicious", 5),
        default_language=_str("default_language", "en"),
        search_recall_top_k=_int("search_recall_top_k", 200),
        rerank_high_threshold=_float("rerank_high_threshold", 0.5),
        rerank_high_skip_count=_int("rerank_high_skip_count", 1000),
        rebuild_llm_batch_size=_int("rebuild_llm_batch_size", 50),
        rebuild_min_llm_batches=_int("rebuild_min_llm_batches", 10),
        rebuild_max_llm_batches=_int("rebuild_max_llm_batches", 40),
        rebuild_irrelevant_stop_ratio=_float("rebuild_irrelevant_stop_ratio", 0.799),
        rebuild_irrel_low_combined_stop_ratio=_float("rebuild_irrel_low_combined_stop_ratio", 0.959),
        rebuild_irrelevant_stop_streak=_int("rebuild_irrelevant_stop_streak", 3),
    )
config_hash: str @@ -393,6 +430,7 @@ class AppConfig: services: ServicesConfig tenants: TenantCatalogConfig assets: AssetsConfig + search_evaluation: SearchEvaluationConfig metadata: ConfigMetadata def sanitized_dict(self) -> Dict[str, Any]: diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py index 2de3101..0c76e86 100644 --- a/scripts/evaluation/eval_framework/cli.py +++ b/scripts/evaluation/eval_framework/cli.py @@ -8,21 +8,6 @@ import logging from pathlib import Path from typing import Any, Dict -from .constants import ( - DEFAULT_INTENT_ENABLE_THINKING, - DEFAULT_INTENT_MODEL, - DEFAULT_QUERY_FILE, - DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, - DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, - DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, - DEFAULT_REBUILD_LLM_BATCH_SIZE, - DEFAULT_REBUILD_MAX_LLM_BATCHES, - DEFAULT_REBUILD_MIN_LLM_BATCHES, - DEFAULT_RERANK_HIGH_SKIP_COUNT, - DEFAULT_RERANK_HIGH_THRESHOLD, - DEFAULT_SEARCH_RECALL_TOP_K, -) -from .constants import EVAL_LOG_FILE from .framework import SearchEvaluationFramework from .logging_setup import setup_eval_logging from .utils import ensure_dir, utc_now_iso, utc_timestamp @@ -36,19 +21,19 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None: "--judge-model", default=None, metavar="MODEL", - help="Judge LLM model (default: eval_framework.constants.DEFAULT_JUDGE_MODEL).", + help="Judge LLM model (default: config.yaml search_evaluation.judge_model).", ) p.add_argument( "--enable-thinking", action=argparse.BooleanOptionalAction, default=None, - help="enable_thinking for DashScope (default: DEFAULT_JUDGE_ENABLE_THINKING).", + help="enable_thinking for DashScope (default: search_evaluation.judge_enable_thinking).", ) p.add_argument( "--dashscope-batch", action=argparse.BooleanOptionalAction, default=None, - help="DashScope Batch File API vs sync chat (default: DEFAULT_JUDGE_DASHSCOPE_BATCH).", + help="DashScope Batch File API vs sync chat (default: 
def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None:
    """Fill None CLI defaults from ``config.yaml`` ``search_evaluation`` (via ``get_app_config()``)."""
    from config.loader import get_app_config

    se = get_app_config().search_evaluation

    def _fill_text(attr: str, value: Any) -> None:
        # Text flags: both "flag omitted" (None) and empty string mean unset.
        if getattr(args, attr, None) in (None, ""):
            setattr(args, attr, value)

    def _fill(attr: str, value: Any) -> None:
        # Numeric knobs: only None means unset (0 is a legitimate value).
        if getattr(args, attr, None) is None:
            setattr(args, attr, value)

    # Shared across all sub-commands.
    _fill_text("tenant_id", se.default_tenant_id)
    _fill_text("queries_file", str(se.queries_file))
    _fill_text("language", se.default_language)

    # Sub-commands are mutually exclusive, so an elif chain is equivalent
    # to the original sequence of independent ifs.
    if args.command == "serve":
        _fill_text("host", se.web_host)
        _fill("port", se.web_port)
    elif args.command == "batch":
        _fill("top_k", se.batch_top_k)
    elif args.command == "audit":
        _fill("top_k", se.audit_top_k)
        _fill("limit_suspicious", se.audit_limit_suspicious)
    elif args.command == "build":
        for attr, value in (
            ("search_depth", se.build_search_depth),
            ("rerank_depth", se.build_rerank_depth),
            ("annotate_search_top_k", se.annotate_search_top_k),
            ("annotate_rerank_top_k", se.annotate_rerank_top_k),
            ("search_recall_top_k", se.search_recall_top_k),
            ("rerank_high_threshold", se.rerank_high_threshold),
            ("rerank_high_skip_count", se.rerank_high_skip_count),
            ("rebuild_llm_batch_size", se.rebuild_llm_batch_size),
            ("rebuild_min_batches", se.rebuild_min_llm_batches),
            ("rebuild_max_batches", se.rebuild_max_llm_batches),
            ("rebuild_irrelevant_stop_ratio", se.rebuild_irrelevant_stop_ratio),
            ("rebuild_irrel_low_combined_stop_ratio", se.rebuild_irrel_low_combined_stop_ratio),
            ("rebuild_irrelevant_stop_streak", se.rebuild_irrelevant_stop_streak),
        ):
            _fill(attr, value)
default=120) - build.add_argument("--annotate-rerank-top-k", type=int, default=200) + build.add_argument( + "--tenant-id", + default=None, + help="Tenant id (default: search_evaluation.default_tenant_id in config.yaml).", + ) + build.add_argument( + "--queries-file", + default=None, + help="Query list file (default: search_evaluation.queries_file).", + ) + build.add_argument( + "--search-depth", + type=int, + default=None, + help="Default: search_evaluation.build_search_depth.", + ) + build.add_argument( + "--rerank-depth", + type=int, + default=None, + help="Default: search_evaluation.build_rerank_depth.", + ) + build.add_argument( + "--annotate-search-top-k", + type=int, + default=None, + help="Default: search_evaluation.annotate_search_top_k.", + ) + build.add_argument( + "--annotate-rerank-top-k", + type=int, + default=None, + help="Default: search_evaluation.annotate_rerank_top_k.", + ) build.add_argument( "--search-recall-top-k", type=int, @@ -118,7 +188,7 @@ def build_cli_parser() -> argparse.ArgumentParser: "--rebuild-irrelevant-stop-ratio", type=float, default=None, - help="Rebuild only: bad batch requires irrelevant_ratio > this (default 0.939).", + help="Rebuild only: bad batch requires irrelevant_ratio > this (default: search_evaluation.rebuild_irrelevant_stop_ratio).", ) build.add_argument( "--rebuild-irrel-low-combined-stop-ratio", @@ -132,36 +202,45 @@ def build_cli_parser() -> argparse.ArgumentParser: default=None, help="Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).", ) - build.add_argument("--language", default="en") + build.add_argument( + "--language", + default=None, + help="Default: search_evaluation.default_language.", + ) build.add_argument("--force-refresh-rerank", action="store_true") build.add_argument("--force-refresh-labels", action="store_true") add_judge_llm_args(build) add_intent_llm_args(build) batch = sub.add_parser("batch", help="Run batch evaluation against live search") - 
batch.add_argument("--tenant-id", default="163") - batch.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) - batch.add_argument("--top-k", type=int, default=100) - batch.add_argument("--language", default="en") + batch.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.") + batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") + batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.") + batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") batch.add_argument("--force-refresh-labels", action="store_true") add_judge_llm_args(batch) add_intent_llm_args(batch) audit = sub.add_parser("audit", help="Audit annotation quality for queries") - audit.add_argument("--tenant-id", default="163") - audit.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) - audit.add_argument("--top-k", type=int, default=100) - audit.add_argument("--language", default="en") - audit.add_argument("--limit-suspicious", type=int, default=5) + audit.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.") + audit.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") + audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.") + audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") + audit.add_argument( + "--limit-suspicious", + type=int, + default=None, + help="Default: search_evaluation.audit_limit_suspicious.", + ) audit.add_argument("--force-refresh-labels", action="store_true") add_judge_llm_args(audit) add_intent_llm_args(audit) serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") - serve.add_argument("--tenant-id", default="163") - serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) - 
serve.add_argument("--host", default="0.0.0.0") - serve.add_argument("--port", type=int, default=6010) + serve.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.") + serve.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") + serve.add_argument("--host", default=None, help="Default: search_evaluation.web_host.") + serve.add_argument("--port", type=int, default=None, help="Default: search_evaluation.web_port.") add_judge_llm_args(serve) add_intent_llm_args(serve) @@ -175,23 +254,19 @@ def run_build(args: argparse.Namespace) -> None: rebuild_kwargs = {} if args.force_refresh_labels: rebuild_kwargs = { - "search_recall_top_k": args.search_recall_top_k if args.search_recall_top_k is not None else DEFAULT_SEARCH_RECALL_TOP_K, - "rerank_high_threshold": args.rerank_high_threshold if args.rerank_high_threshold is not None else DEFAULT_RERANK_HIGH_THRESHOLD, - "rerank_high_skip_count": args.rerank_high_skip_count if args.rerank_high_skip_count is not None else DEFAULT_RERANK_HIGH_SKIP_COUNT, - "rebuild_llm_batch_size": args.rebuild_llm_batch_size if args.rebuild_llm_batch_size is not None else DEFAULT_REBUILD_LLM_BATCH_SIZE, - "rebuild_min_batches": args.rebuild_min_batches if args.rebuild_min_batches is not None else DEFAULT_REBUILD_MIN_LLM_BATCHES, - "rebuild_max_batches": args.rebuild_max_batches if args.rebuild_max_batches is not None else DEFAULT_REBUILD_MAX_LLM_BATCHES, - "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio - if args.rebuild_irrelevant_stop_ratio is not None - else DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, - "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio - if args.rebuild_irrel_low_combined_stop_ratio is not None - else DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, - "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak - if args.rebuild_irrelevant_stop_streak is not None - else 
DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, + "search_recall_top_k": args.search_recall_top_k, + "rerank_high_threshold": args.rerank_high_threshold, + "rerank_high_skip_count": args.rerank_high_skip_count, + "rebuild_llm_batch_size": args.rebuild_llm_batch_size, + "rebuild_min_batches": args.rebuild_min_batches, + "rebuild_max_batches": args.rebuild_max_batches, + "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio, + "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio, + "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak, } - for query in queries: + total_q = len(queries) + for q_index, query in enumerate(queries, start=1): + _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query) result = framework.build_query_annotation_set( query=query, search_depth=args.search_depth, @@ -230,6 +305,7 @@ def run_build(args: argparse.Namespace) -> None: def run_batch(args: argparse.Namespace) -> None: framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = framework.queries_from_file(Path(args.queries_file)) + _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries)) payload = framework.batch_evaluate( queries=queries, top_k=args.top_k, @@ -302,14 +378,18 @@ def run_serve(args: argparse.Namespace) -> None: def main() -> None: - setup_eval_logging() + from config.loader import get_app_config + + se = get_app_config().search_evaluation + log_file = setup_eval_logging(se.eval_log_dir) parser = build_cli_parser() args = parser.parse_args() + _apply_search_evaluation_cli_defaults(args) logging.getLogger("search_eval").info( "CLI start command=%s tenant_id=%s log_file=%s", args.command, getattr(args, "tenant_id", ""), - EVAL_LOG_FILE.resolve(), + log_file.resolve(), ) if args.command == "build": run_build(args) diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py index 
54d7a51..180a0f3 100644 --- a/scripts/evaluation/eval_framework/clients.py +++ b/scripts/evaluation/eval_framework/clients.py @@ -12,7 +12,7 @@ from typing import Any, Dict, List, Optional, Sequence, Tuple import requests -from .constants import EVAL_VERBOSE_LOG_FILE, VALID_LABELS +from .constants import VALID_LABELS from .logging_setup import setup_eval_logging from .prompts import classify_prompt, intent_analysis_prompt from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps @@ -23,13 +23,16 @@ _eval_llm_verbose_path_logged = False def _get_eval_llm_verbose_logger() -> logging.Logger: - """File logger for full LLM prompts/responses → ``logs/verbose/eval_verbose.log``.""" - setup_eval_logging() + """File logger for full LLM prompts/responses under ``search_evaluation.eval_log_dir/verbose/``.""" + from config.loader import get_app_config + + se = get_app_config().search_evaluation + setup_eval_logging(se.eval_log_dir) global _eval_llm_verbose_logger_singleton, _eval_llm_verbose_path_logged with _VERBOSE_LOGGER_LOCK: if _eval_llm_verbose_logger_singleton is not None: return _eval_llm_verbose_logger_singleton - log_path = EVAL_VERBOSE_LOG_FILE + log_path = se.eval_log_dir / "verbose" / "eval_verbose.log" log_path.parent.mkdir(parents=True, exist_ok=True) lg = logging.getLogger("search_eval.verbose_llm") lg.setLevel(logging.INFO) diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py index d14bb59..2fdd865 100644 --- a/scripts/evaluation/eval_framework/constants.py +++ b/scripts/evaluation/eval_framework/constants.py @@ -46,7 +46,7 @@ DEFAULT_JUDGE_ENABLE_THINKING = False DEFAULT_JUDGE_DASHSCOPE_BATCH = False # Query-intent LLM (separate from judge; used once per query, injected into relevance prompts) -DEFAULT_INTENT_MODEL = "qwen-max" +DEFAULT_INTENT_MODEL = "qwen3-max" DEFAULT_INTENT_ENABLE_THINKING = True DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 
10.0 diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index 64aa096..320bffb 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -16,14 +16,6 @@ from indexer.mapping_generator import get_tenant_index_name from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceClient from .constants import ( - DEFAULT_ARTIFACT_ROOT, - DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW, - DEFAULT_INTENT_ENABLE_THINKING, - DEFAULT_INTENT_MODEL, - DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC, - DEFAULT_JUDGE_DASHSCOPE_BATCH, - DEFAULT_JUDGE_ENABLE_THINKING, - DEFAULT_JUDGE_MODEL, DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, @@ -79,8 +71,8 @@ class SearchEvaluationFramework: def __init__( self, tenant_id: str, - artifact_root: Path = DEFAULT_ARTIFACT_ROOT, - search_base_url: str = "http://localhost:6002", + artifact_root: Path | None = None, + search_base_url: str | None = None, *, judge_model: str | None = None, enable_thinking: bool | None = None, @@ -88,12 +80,14 @@ class SearchEvaluationFramework: intent_model: str | None = None, intent_enable_thinking: bool | None = None, ): - init_service(get_app_config().infrastructure.elasticsearch.host) + app_cfg = get_app_config() + se = app_cfg.search_evaluation + init_service(app_cfg.infrastructure.elasticsearch.host) self.tenant_id = str(tenant_id) - self.artifact_root = ensure_dir(artifact_root) + self.artifact_root = ensure_dir(artifact_root if artifact_root is not None else se.artifact_root) self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") - self.search_client = SearchServiceClient(search_base_url, self.tenant_id) - app_cfg = get_app_config() + sb = search_base_url if search_base_url is not None else se.search_base_url + self.search_client = SearchServiceClient(sb, self.tenant_id) rerank_service_url = str( 
app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"] ) @@ -102,11 +96,11 @@ class SearchEvaluationFramework: api_key = app_cfg.infrastructure.secrets.dashscope_api_key if not api_key: raise RuntimeError("dashscope_api_key is required for search evaluation annotation") - model = str(judge_model or DEFAULT_JUDGE_MODEL) - et = DEFAULT_JUDGE_ENABLE_THINKING if enable_thinking is None else enable_thinking - use_batch = DEFAULT_JUDGE_DASHSCOPE_BATCH if use_dashscope_batch is None else use_dashscope_batch - batch_window = DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW - batch_poll = float(DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC) + model = str(judge_model if judge_model is not None else se.judge_model) + et = se.judge_enable_thinking if enable_thinking is None else enable_thinking + use_batch = se.judge_dashscope_batch if use_dashscope_batch is None else use_dashscope_batch + batch_window = se.judge_batch_completion_window + batch_poll = float(se.judge_batch_poll_interval_sec) self.label_client = DashScopeLabelClient( model=model, base_url=str(llm_cfg["base_url"]), @@ -116,8 +110,8 @@ class SearchEvaluationFramework: enable_thinking=et, use_batch=use_batch, ) - intent_m = str(intent_model or DEFAULT_INTENT_MODEL) - intent_et = DEFAULT_INTENT_ENABLE_THINKING if intent_enable_thinking is None else intent_enable_thinking + intent_m = str(intent_model if intent_model is not None else se.intent_model) + intent_et = se.intent_enable_thinking if intent_enable_thinking is None else intent_enable_thinking self.intent_client = DashScopeLabelClient( model=intent_m, base_url=str(llm_cfg["base_url"]), @@ -629,6 +623,21 @@ class SearchEvaluationFramework: corpus = self.corpus_docs(refresh=False) corpus_by_id = {str(d.get("spu_id")): d for d in corpus if str(d.get("spu_id") or "").strip()} + rerank_pending_n = sum( + 1 + for d in corpus + if str(d.get("spu_id") or "").strip() + and str(d.get("spu_id")) not in pool_spu_ids + ) + _log.info( + "[eval-rebuild] query=%r 
phase=rerank_outside_pool docs≈%s (pool=%s, force_refresh_rerank=%s); " + "this can take a long time with no further logs until LLM batches start", + query, + rerank_pending_n, + len(pool_spu_ids), + force_refresh_rerank, + ) + ranked_outside = self.full_corpus_rerank_outside_exclude( query=query, docs=corpus, @@ -905,7 +914,9 @@ class SearchEvaluationFramework: force_refresh_labels: bool = False, ) -> Dict[str, Any]: per_query = [] - for query in queries: + total_q = len(queries) + _log.info("[batch-eval] starting %s queries top_k=%s auto_annotate=%s", total_q, top_k, auto_annotate) + for q_index, query in enumerate(queries, start=1): live = self.evaluate_live_query( query, top_k=top_k, @@ -927,6 +938,16 @@ class SearchEvaluationFramework: "total": live["total"], } ) + m = live["metrics"] + _log.info( + "[batch-eval] (%s/%s) query=%r P@10=%s MAP_3=%s total_hits=%s", + q_index, + total_q, + query, + m.get("P@10"), + m.get("MAP_3"), + live.get("total"), + ) aggregate = aggregate_metrics([item["metrics"] for item in per_query]) aggregate_distribution = { RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query), @@ -955,5 +976,11 @@ class SearchEvaluationFramework: output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8") self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload) + _log.info( + "[batch-eval] finished batch_id=%s per_query=%s json=%s", + batch_id, + len(per_query), + output_json_path, + ) return payload diff --git a/scripts/evaluation/eval_framework/logging_setup.py b/scripts/evaluation/eval_framework/logging_setup.py index 8323a85..f689cc2 100644 --- a/scripts/evaluation/eval_framework/logging_setup.py +++ b/scripts/evaluation/eval_framework/logging_setup.py @@ -1,31 +1,37 @@ -"""Configure dedicated eval run logs under repo ``logs/`` (see 
def setup_eval_logging(eval_log_dir: Path | None = None) -> Path:
    """Attach file + stderr handlers to the ``search_eval`` logger once.

    The log directory (and its ``verbose/`` subdirectory) is created on every
    call; handlers are only attached the first time, detected via the logger's
    existing handler list.

    NOTE(review): ``_setup_done`` is written but no longer read after the
    refactor, and if a later call passes a *different* directory the returned
    path will not match where the already-attached handlers write — confirm
    callers always use a single directory per process.

    Returns the path to the primary ``eval.log`` file.
    """
    global _setup_done

    if eval_log_dir is None:
        log_dir = EVAL_LOG_DIR.resolve()
    else:
        log_dir = Path(eval_log_dir).resolve()
    log_file = log_dir / "eval.log"

    # parents=True creates log_dir itself along with the verbose subdir.
    (log_dir / "verbose").mkdir(parents=True, exist_ok=True)

    logger = logging.getLogger("search_eval")
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
        file_handler = logging.FileHandler(log_file, encoding="utf-8")
        file_handler.setFormatter(formatter)
        stderr_handler = logging.StreamHandler(sys.stderr)
        stderr_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
        logger.addHandler(stderr_handler)
        logger.propagate = False

    _setup_done = True
    return log_file