From 310bb3bc8f6351cf7c957189ce54522d2b322366 Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 1 Apr 2026 15:59:19 +0800 Subject: [PATCH] eval tools --- config/config.yaml | 4 ++-- scripts/evaluation/eval_framework/cli.py | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/evaluation/eval_framework/clients.py | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------ scripts/evaluation/eval_framework/framework.py | 6 +++--- scripts/evaluation/start_eval.sh | 38 ++++++++++++++++++++++++++------------ 5 files changed, 149 insertions(+), 23 deletions(-) diff --git a/config/config.yaml b/config/config.yaml index 9f1c772..5c659a8 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -153,8 +153,8 @@ query_config: # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 - translation_embedding_wait_budget_ms_source_in_index: 200 # 80 - translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200 + translation_embedding_wait_budget_ms_source_in_index: 300 # 80 + translation_embedding_wait_budget_ms_source_not_in_index: 400 # 200 style_intent: enabled: true selected_sku_boost: 1.2 diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py index 0c76e86..923129a 100644 --- a/scripts/evaluation/eval_framework/cli.py +++ b/scripts/evaluation/eval_framework/cli.py @@ -5,6 +5,7 @@ from __future__ import annotations import argparse import json import logging +import shutil from pathlib import Path from typing import Any, Dict @@ -16,6 +17,35 @@ from .web_app import create_web_app _cli_log = logging.getLogger("search_eval.cli") +def _filter_queries_from_start(queries: list[str], start_from_query: str | None) -> list[str]: + if not start_from_query: + return queries + try: + start_idx = queries.index(start_from_query) + except ValueError as exc: + raise SystemExit(f"start-from-query not found in queries file: {start_from_query!r}") from exc + return queries[start_idx:] + + +def _reset_build_artifacts() -> None: + from config.loader import get_app_config + + artifact_root = get_app_config().search_evaluation.artifact_root + removed = [] + db_path = artifact_root / "search_eval.sqlite3" + query_builds_dir = artifact_root / "query_builds" + if db_path.exists(): + db_path.unlink() + removed.append(str(db_path)) + if query_builds_dir.exists(): + shutil.rmtree(query_builds_dir) + removed.append(str(query_builds_dir)) + if removed: + _cli_log.info("[build] reset previous rebuild artifacts: %s", ", ".join(removed)) + else: + _cli_log.info("[build] no previous rebuild artifacts to reset under %s", artifact_root) + + def add_judge_llm_args(p: argparse.ArgumentParser) -> None: p.add_argument( "--judge-model", @@ -207,6 +237,16 @@ def build_cli_parser() -> argparse.ArgumentParser: default=None, help="Default: search_evaluation.default_language.", ) + build.add_argument( + "--start-from-query", + default=None, + help="Start processing from this exact query text in the queries file.", + ) + build.add_argument( + "--reset-artifacts", + action="store_true", + help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.", + ) build.add_argument("--force-refresh-rerank", action="store_true") build.add_argument("--force-refresh-labels", action="store_true") add_judge_llm_args(build) @@ -217,6 +257,11 @@ def build_cli_parser() -> argparse.ArgumentParser: batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.") batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") + batch.add_argument( + "--start-from-query", + default=None, + help="Start processing from this exact query text in the queries file.", + ) batch.add_argument("--force-refresh-labels", action="store_true") add_judge_llm_args(batch) add_intent_llm_args(batch) @@ -227,6 +272,11 @@ def build_cli_parser() -> argparse.ArgumentParser: audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.") audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") audit.add_argument( + "--start-from-query", + default=None, + help="Start processing from this exact query text in the queries file.", + ) + audit.add_argument( "--limit-suspicious", type=int, default=None, @@ -248,8 +298,11 @@ def build_cli_parser() -> argparse.ArgumentParser: def run_build(args: argparse.Namespace) -> None: + if args.reset_artifacts: + _reset_build_artifacts() framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = framework.queries_from_file(Path(args.queries_file)) + queries = _filter_queries_from_start(queries, args.start_from_query) summary = [] rebuild_kwargs = {} if args.force_refresh_labels: @@ -305,6 +358,7 @@ def run_build(args: argparse.Namespace) -> None: def run_batch(args: argparse.Namespace) -> None: framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = framework.queries_from_file(Path(args.queries_file)) + queries = _filter_queries_from_start(queries, args.start_from_query) _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries)) payload = framework.batch_evaluate( queries=queries, @@ -319,6 +373,7 @@ def run_batch(args: argparse.Namespace) -> None: def run_audit(args: argparse.Namespace) -> None: framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) queries = framework.queries_from_file(Path(args.queries_file)) + queries = _filter_queries_from_start(queries, args.start_from_query) audit_items = [] for query in queries: item = framework.audit_live_query( diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py index 180a0f3..df28e9a 100644 --- a/scripts/evaluation/eval_framework/clients.py +++ b/scripts/evaluation/eval_framework/clients.py @@ -20,6 +20,8 @@ from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps _VERBOSE_LOGGER_LOCK = threading.Lock() _eval_llm_verbose_logger_singleton: logging.Logger | None = None _eval_llm_verbose_path_logged = False +_TRANSIENT_HTTP_STATUS_CODES = frozenset({408, 425, 429, 500, 502, 503, 504}) +_client_log = logging.getLogger("search_eval.clients") def _get_eval_llm_verbose_logger() -> logging.Logger: @@ -85,6 +87,62 @@ class SearchServiceClient: self.base_url = base_url.rstrip("/") self.tenant_id = str(tenant_id) self.session = requests.Session() + # Batch eval depends on live backend responses; tolerate brief restarts. + self.retry_attempts = 45 + self.retry_delay_sec = 2.0 + + @staticmethod + def _is_transient_request_error(exc: requests.exceptions.RequestException) -> bool: + if isinstance(exc, (requests.exceptions.ConnectionError, requests.exceptions.Timeout)): + return True + if isinstance(exc, requests.exceptions.HTTPError): + response = getattr(exc, "response", None) + if response is None: + return True + return int(response.status_code) in _TRANSIENT_HTTP_STATUS_CODES + return False + + def _request_json( + self, + method: str, + path: str, + *, + timeout: float, + headers: Optional[Dict[str, str]] = None, + json_payload: Optional[Dict[str, Any]] = None, + ) -> Dict[str, Any]: + last_exc: requests.exceptions.RequestException | None = None + url = f"{self.base_url}{path}" + for attempt in range(1, self.retry_attempts + 1): + try: + response = self.session.request( + method=method, + url=url, + headers=headers, + json=json_payload, + timeout=timeout, + ) + response.raise_for_status() + return response.json() + except requests.exceptions.RequestException as exc: + last_exc = exc + if not self._is_transient_request_error(exc) or attempt >= self.retry_attempts: + raise + _client_log.warning( + "Transient search-eval request failure, retrying (%s/%s): %s %s error=%s", + attempt, + self.retry_attempts, + method.upper(), + url, + exc, + ) + time.sleep(self.retry_delay_sec) + if last_exc is not None: + raise last_exc + raise RuntimeError(f"unexpected request retry state for {method.upper()} {url}") + + def get_json(self, path: str, *, timeout: float = 20) -> Dict[str, Any]: + return self._request_json("GET", path, timeout=timeout) def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]: payload: Dict[str, Any] = { @@ -95,14 +153,13 @@ class SearchServiceClient: } if debug: payload["debug"] = True - response = self.session.post( - f"{self.base_url}/search/", - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, - json=payload, + return self._request_json( + "POST", + "/search/", timeout=120, + headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, + json_payload=payload, ) - response.raise_for_status() - return response.json() class RerankServiceClient: diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index 320bffb..02508b5 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -567,7 +567,7 @@ class SearchEvaluationFramework: "created_at": utc_now_iso(), "tenant_id": self.tenant_id, "query": query, - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(), + "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20), "search_total": int(search_payload.get("total") or 0), "search_depth_requested": search_depth, "search_depth_effective": len(search_results), @@ -762,7 +762,7 @@ class SearchEvaluationFramework: "created_at": utc_now_iso(), "tenant_id": self.tenant_id, "query": query, - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(), + "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20), "search_total": int(search_payload.get("total") or 0), "search_depth_requested": search_depth, "search_depth_effective": len(search_results), @@ -958,7 +958,7 @@ class SearchEvaluationFramework: batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" report_dir = ensure_dir(self.artifact_root / "batch_reports") config_snapshot_path = report_dir / f"{batch_id}_config.json" - config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json() + config_snapshot = self.search_client.get_json("/admin/config", timeout=20) config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8") output_json_path = report_dir / f"{batch_id}.json" report_md_path = report_dir / f"{batch_id}.md" diff --git a/scripts/evaluation/start_eval.sh b/scripts/evaluation/start_eval.sh index dc097c3..ce2442e 100755 --- a/scripts/evaluation/start_eval.sh +++ b/scripts/evaluation/start_eval.sh @@ -7,32 +7,46 @@ cd "$ROOT" PY="${ROOT}/.venv/bin/python" TENANT_ID="${TENANT_ID:-163}" QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" +START_FROM_QUERY="${REPO_EVAL_START_FROM_QUERY:-}" usage() { echo "Usage: $0 batch|batch-rebuild|serve" echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, REPO_EVAL_START_FROM_QUERY, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" } case "${1:-}" in batch) - exec "$PY" scripts/evaluation/build_annotation_set.py batch \ - --tenant-id "$TENANT_ID" \ - --queries-file "$QUERIES" \ - --top-k 50 \ + cmd=( + "$PY" scripts/evaluation/build_annotation_set.py batch + --tenant-id "$TENANT_ID" + --queries-file "$QUERIES" + --top-k 50 --language en + ) + if [ -n "$START_FROM_QUERY" ]; then + cmd+=(--start-from-query "$START_FROM_QUERY") + fi + exec "${cmd[@]}" ;; batch-rebuild) - exec "$PY" scripts/evaluation/build_annotation_set.py build \ - --tenant-id "$TENANT_ID" \ - --queries-file "$QUERIES" \ - --search-depth 500 \ - --rerank-depth 10000 \ - --force-refresh-rerank \ - --force-refresh-labels \ + cmd=( + "$PY" scripts/evaluation/build_annotation_set.py build + --tenant-id "$TENANT_ID" + --queries-file "$QUERIES" + --search-depth 500 + --rerank-depth 10000 + --reset-artifacts + --force-refresh-rerank + --force-refresh-labels --language en + ) + if [ -n "$START_FROM_QUERY" ]; then + cmd+=(--start-from-query "$START_FROM_QUERY") + fi + exec "${cmd[@]}" ;; serve) EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" -- libgit2 0.21.2