Commit 310bb3bc8f6351cf7c957189ce54522d2b322366

Authored by tangwang
1 parent 331861d5

eval tools

config/config.yaml
@@ -153,8 +153,8 @@ query_config: @@ -153,8 +153,8 @@ query_config:
153 153
154 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 154 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
155 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 155 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
156 - translation_embedding_wait_budget_ms_source_in_index: 200 # 80  
157 - translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200 156 + translation_embedding_wait_budget_ms_source_in_index: 300 # 80
  157 + translation_embedding_wait_budget_ms_source_not_in_index: 400 # 200
158 style_intent: 158 style_intent:
159 enabled: true 159 enabled: true
160 selected_sku_boost: 1.2 160 selected_sku_boost: 1.2
scripts/evaluation/eval_framework/cli.py
@@ -5,6 +5,7 @@ from __future__ import annotations @@ -5,6 +5,7 @@ from __future__ import annotations
5 import argparse 5 import argparse
6 import json 6 import json
7 import logging 7 import logging
  8 +import shutil
8 from pathlib import Path 9 from pathlib import Path
9 from typing import Any, Dict 10 from typing import Any, Dict
10 11
@@ -16,6 +17,35 @@ from .web_app import create_web_app @@ -16,6 +17,35 @@ from .web_app import create_web_app
16 _cli_log = logging.getLogger("search_eval.cli") 17 _cli_log = logging.getLogger("search_eval.cli")
17 18
18 19
  20 +def _filter_queries_from_start(queries: list[str], start_from_query: str | None) -> list[str]:
  21 + if not start_from_query:
  22 + return queries
  23 + try:
  24 + start_idx = queries.index(start_from_query)
  25 + except ValueError as exc:
  26 + raise SystemExit(f"start-from-query not found in queries file: {start_from_query!r}") from exc
  27 + return queries[start_idx:]
  28 +
  29 +
def _reset_build_artifacts() -> None:
    """Delete cached rebuild artifacts (SQLite store + query_builds dir) if present.

    Reads the artifact root from the app config; logs which paths were removed,
    or that nothing needed removal. Safe to call when no artifacts exist.
    """
    # Imported lazily so the CLI module stays importable without a loaded config.
    from config.loader import get_app_config

    root = get_app_config().search_evaluation.artifact_root
    sqlite_path = root / "search_eval.sqlite3"
    builds_dir = root / "query_builds"
    deleted: list[str] = []
    if sqlite_path.exists():
        sqlite_path.unlink()
        deleted.append(str(sqlite_path))
    if builds_dir.exists():
        shutil.rmtree(builds_dir)
        deleted.append(str(builds_dir))
    if deleted:
        _cli_log.info("[build] reset previous rebuild artifacts: %s", ", ".join(deleted))
    else:
        _cli_log.info("[build] no previous rebuild artifacts to reset under %s", root)
  48 +
19 def add_judge_llm_args(p: argparse.ArgumentParser) -> None: 49 def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
20 p.add_argument( 50 p.add_argument(
21 "--judge-model", 51 "--judge-model",
@@ -207,6 +237,16 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -207,6 +237,16 @@ def build_cli_parser() -> argparse.ArgumentParser:
207 default=None, 237 default=None,
208 help="Default: search_evaluation.default_language.", 238 help="Default: search_evaluation.default_language.",
209 ) 239 )
  240 + build.add_argument(
  241 + "--start-from-query",
  242 + default=None,
  243 + help="Start processing from this exact query text in the queries file.",
  244 + )
  245 + build.add_argument(
  246 + "--reset-artifacts",
  247 + action="store_true",
  248 + help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.",
  249 + )
210 build.add_argument("--force-refresh-rerank", action="store_true") 250 build.add_argument("--force-refresh-rerank", action="store_true")
211 build.add_argument("--force-refresh-labels", action="store_true") 251 build.add_argument("--force-refresh-labels", action="store_true")
212 add_judge_llm_args(build) 252 add_judge_llm_args(build)
@@ -217,6 +257,11 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -217,6 +257,11 @@ def build_cli_parser() -> argparse.ArgumentParser:
217 batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") 257 batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
218 batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.") 258 batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.")
219 batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") 259 batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
  260 + batch.add_argument(
  261 + "--start-from-query",
  262 + default=None,
  263 + help="Start processing from this exact query text in the queries file.",
  264 + )
220 batch.add_argument("--force-refresh-labels", action="store_true") 265 batch.add_argument("--force-refresh-labels", action="store_true")
221 add_judge_llm_args(batch) 266 add_judge_llm_args(batch)
222 add_intent_llm_args(batch) 267 add_intent_llm_args(batch)
@@ -227,6 +272,11 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -227,6 +272,11 @@ def build_cli_parser() -> argparse.ArgumentParser:
227 audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.") 272 audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.")
228 audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") 273 audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
229 audit.add_argument( 274 audit.add_argument(
  275 + "--start-from-query",
  276 + default=None,
  277 + help="Start processing from this exact query text in the queries file.",
  278 + )
  279 + audit.add_argument(
230 "--limit-suspicious", 280 "--limit-suspicious",
231 type=int, 281 type=int,
232 default=None, 282 default=None,
@@ -248,8 +298,11 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -248,8 +298,11 @@ def build_cli_parser() -> argparse.ArgumentParser:
248 298
249 299
250 def run_build(args: argparse.Namespace) -> None: 300 def run_build(args: argparse.Namespace) -> None:
  301 + if args.reset_artifacts:
  302 + _reset_build_artifacts()
251 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) 303 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
252 queries = framework.queries_from_file(Path(args.queries_file)) 304 queries = framework.queries_from_file(Path(args.queries_file))
  305 + queries = _filter_queries_from_start(queries, args.start_from_query)
253 summary = [] 306 summary = []
254 rebuild_kwargs = {} 307 rebuild_kwargs = {}
255 if args.force_refresh_labels: 308 if args.force_refresh_labels:
@@ -305,6 +358,7 @@ def run_build(args: argparse.Namespace) -> None: @@ -305,6 +358,7 @@ def run_build(args: argparse.Namespace) -> None:
305 def run_batch(args: argparse.Namespace) -> None: 358 def run_batch(args: argparse.Namespace) -> None:
306 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) 359 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
307 queries = framework.queries_from_file(Path(args.queries_file)) 360 queries = framework.queries_from_file(Path(args.queries_file))
  361 + queries = _filter_queries_from_start(queries, args.start_from_query)
308 _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries)) 362 _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries))
309 payload = framework.batch_evaluate( 363 payload = framework.batch_evaluate(
310 queries=queries, 364 queries=queries,
@@ -319,6 +373,7 @@ def run_batch(args: argparse.Namespace) -> None: @@ -319,6 +373,7 @@ def run_batch(args: argparse.Namespace) -> None:
319 def run_audit(args: argparse.Namespace) -> None: 373 def run_audit(args: argparse.Namespace) -> None:
320 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) 374 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
321 queries = framework.queries_from_file(Path(args.queries_file)) 375 queries = framework.queries_from_file(Path(args.queries_file))
  376 + queries = _filter_queries_from_start(queries, args.start_from_query)
322 audit_items = [] 377 audit_items = []
323 for query in queries: 378 for query in queries:
324 item = framework.audit_live_query( 379 item = framework.audit_live_query(
scripts/evaluation/eval_framework/clients.py
@@ -20,6 +20,8 @@ from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps @@ -20,6 +20,8 @@ from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
20 _VERBOSE_LOGGER_LOCK = threading.Lock() 20 _VERBOSE_LOGGER_LOCK = threading.Lock()
21 _eval_llm_verbose_logger_singleton: logging.Logger | None = None 21 _eval_llm_verbose_logger_singleton: logging.Logger | None = None
22 _eval_llm_verbose_path_logged = False 22 _eval_llm_verbose_path_logged = False
  23 +_TRANSIENT_HTTP_STATUS_CODES = frozenset({408, 425, 429, 500, 502, 503, 504})
  24 +_client_log = logging.getLogger("search_eval.clients")
23 25
24 26
25 def _get_eval_llm_verbose_logger() -> logging.Logger: 27 def _get_eval_llm_verbose_logger() -> logging.Logger:
@@ -85,6 +87,62 @@ class SearchServiceClient: @@ -85,6 +87,62 @@ class SearchServiceClient:
85 self.base_url = base_url.rstrip("/") 87 self.base_url = base_url.rstrip("/")
86 self.tenant_id = str(tenant_id) 88 self.tenant_id = str(tenant_id)
87 self.session = requests.Session() 89 self.session = requests.Session()
  90 + # Batch eval depends on live backend responses; tolerate brief restarts.
  91 + self.retry_attempts = 45
  92 + self.retry_delay_sec = 2.0
  93 +
  94 + @staticmethod
  95 + def _is_transient_request_error(exc: requests.exceptions.RequestException) -> bool:
  96 + if isinstance(exc, (requests.exceptions.ConnectionError, requests.exceptions.Timeout)):
  97 + return True
  98 + if isinstance(exc, requests.exceptions.HTTPError):
  99 + response = getattr(exc, "response", None)
  100 + if response is None:
  101 + return True
  102 + return int(response.status_code) in _TRANSIENT_HTTP_STATUS_CODES
  103 + return False
  104 +
  105 + def _request_json(
  106 + self,
  107 + method: str,
  108 + path: str,
  109 + *,
  110 + timeout: float,
  111 + headers: Optional[Dict[str, str]] = None,
  112 + json_payload: Optional[Dict[str, Any]] = None,
  113 + ) -> Dict[str, Any]:
  114 + last_exc: requests.exceptions.RequestException | None = None
  115 + url = f"{self.base_url}{path}"
  116 + for attempt in range(1, self.retry_attempts + 1):
  117 + try:
  118 + response = self.session.request(
  119 + method=method,
  120 + url=url,
  121 + headers=headers,
  122 + json=json_payload,
  123 + timeout=timeout,
  124 + )
  125 + response.raise_for_status()
  126 + return response.json()
  127 + except requests.exceptions.RequestException as exc:
  128 + last_exc = exc
  129 + if not self._is_transient_request_error(exc) or attempt >= self.retry_attempts:
  130 + raise
  131 + _client_log.warning(
  132 + "Transient search-eval request failure, retrying (%s/%s): %s %s error=%s",
  133 + attempt,
  134 + self.retry_attempts,
  135 + method.upper(),
  136 + url,
  137 + exc,
  138 + )
  139 + time.sleep(self.retry_delay_sec)
  140 + if last_exc is not None:
  141 + raise last_exc
  142 + raise RuntimeError(f"unexpected request retry state for {method.upper()} {url}")
  143 +
    def get_json(self, path: str, *, timeout: float = 20) -> Dict[str, Any]:
        """GET *path* (relative to ``base_url``) with transient-failure retries and return the parsed JSON."""
        return self._request_json("GET", path, timeout=timeout)
88 146
89 def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]: 147 def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]:
90 payload: Dict[str, Any] = { 148 payload: Dict[str, Any] = {
@@ -95,14 +153,13 @@ class SearchServiceClient: @@ -95,14 +153,13 @@ class SearchServiceClient:
95 } 153 }
96 if debug: 154 if debug:
97 payload["debug"] = True 155 payload["debug"] = True
98 - response = self.session.post(  
99 - f"{self.base_url}/search/",  
100 - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},  
101 - json=payload, 156 + return self._request_json(
  157 + "POST",
  158 + "/search/",
102 timeout=120, 159 timeout=120,
  160 + headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},
  161 + json_payload=payload,
103 ) 162 )
104 - response.raise_for_status()  
105 - return response.json()  
106 163
107 164
108 class RerankServiceClient: 165 class RerankServiceClient:
scripts/evaluation/eval_framework/framework.py
@@ -567,7 +567,7 @@ class SearchEvaluationFramework: @@ -567,7 +567,7 @@ class SearchEvaluationFramework:
567 "created_at": utc_now_iso(), 567 "created_at": utc_now_iso(),
568 "tenant_id": self.tenant_id, 568 "tenant_id": self.tenant_id,
569 "query": query, 569 "query": query,
570 - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(), 570 + "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20),
571 "search_total": int(search_payload.get("total") or 0), 571 "search_total": int(search_payload.get("total") or 0),
572 "search_depth_requested": search_depth, 572 "search_depth_requested": search_depth,
573 "search_depth_effective": len(search_results), 573 "search_depth_effective": len(search_results),
@@ -762,7 +762,7 @@ class SearchEvaluationFramework: @@ -762,7 +762,7 @@ class SearchEvaluationFramework:
762 "created_at": utc_now_iso(), 762 "created_at": utc_now_iso(),
763 "tenant_id": self.tenant_id, 763 "tenant_id": self.tenant_id,
764 "query": query, 764 "query": query,
765 - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(), 765 + "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20),
766 "search_total": int(search_payload.get("total") or 0), 766 "search_total": int(search_payload.get("total") or 0),
767 "search_depth_requested": search_depth, 767 "search_depth_requested": search_depth,
768 "search_depth_effective": len(search_results), 768 "search_depth_effective": len(search_results),
@@ -958,7 +958,7 @@ class SearchEvaluationFramework: @@ -958,7 +958,7 @@ class SearchEvaluationFramework:
958 batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" 958 batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
959 report_dir = ensure_dir(self.artifact_root / "batch_reports") 959 report_dir = ensure_dir(self.artifact_root / "batch_reports")
960 config_snapshot_path = report_dir / f"{batch_id}_config.json" 960 config_snapshot_path = report_dir / f"{batch_id}_config.json"
961 - config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json() 961 + config_snapshot = self.search_client.get_json("/admin/config", timeout=20)
962 config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8") 962 config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
963 output_json_path = report_dir / f"{batch_id}.json" 963 output_json_path = report_dir / f"{batch_id}.json"
964 report_md_path = report_dir / f"{batch_id}.md" 964 report_md_path = report_dir / f"{batch_id}.md"
scripts/evaluation/start_eval.sh
@@ -7,32 +7,46 @@ cd "$ROOT" @@ -7,32 +7,46 @@ cd "$ROOT"
7 PY="${ROOT}/.venv/bin/python" 7 PY="${ROOT}/.venv/bin/python"
8 TENANT_ID="${TENANT_ID:-163}" 8 TENANT_ID="${TENANT_ID:-163}"
9 QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" 9 QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
  10 +START_FROM_QUERY="${REPO_EVAL_START_FROM_QUERY:-}"
10 11
11 usage() { 12 usage() {
12 echo "Usage: $0 batch|batch-rebuild|serve" 13 echo "Usage: $0 batch|batch-rebuild|serve"
13 echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" 14 echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)"
14 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" 15 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)"
15 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" 16 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)"
16 - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" 17 + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, REPO_EVAL_START_FROM_QUERY, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
17 } 18 }
18 19
19 case "${1:-}" in 20 case "${1:-}" in
20 batch) 21 batch)
21 - exec "$PY" scripts/evaluation/build_annotation_set.py batch \  
22 - --tenant-id "$TENANT_ID" \  
23 - --queries-file "$QUERIES" \  
24 - --top-k 50 \ 22 + cmd=(
  23 + "$PY" scripts/evaluation/build_annotation_set.py batch
  24 + --tenant-id "$TENANT_ID"
  25 + --queries-file "$QUERIES"
  26 + --top-k 50
25 --language en 27 --language en
  28 + )
  29 + if [ -n "$START_FROM_QUERY" ]; then
  30 + cmd+=(--start-from-query "$START_FROM_QUERY")
  31 + fi
  32 + exec "${cmd[@]}"
26 ;; 33 ;;
27 batch-rebuild) 34 batch-rebuild)
28 - exec "$PY" scripts/evaluation/build_annotation_set.py build \  
29 - --tenant-id "$TENANT_ID" \  
30 - --queries-file "$QUERIES" \  
31 - --search-depth 500 \  
32 - --rerank-depth 10000 \  
33 - --force-refresh-rerank \  
34 - --force-refresh-labels \ 35 + cmd=(
  36 + "$PY" scripts/evaluation/build_annotation_set.py build
  37 + --tenant-id "$TENANT_ID"
  38 + --queries-file "$QUERIES"
  39 + --search-depth 500
  40 + --rerank-depth 10000
  41 + --reset-artifacts
  42 + --force-refresh-rerank
  43 + --force-refresh-labels
35 --language en 44 --language en
  45 + )
  46 + if [ -n "$START_FROM_QUERY" ]; then
  47 + cmd+=(--start-from-query "$START_FROM_QUERY")
  48 + fi
  49 + exec "${cmd[@]}"
36 ;; 50 ;;
37 serve) 51 serve)
38 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" 52 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}"