Commit 310bb3bc8f6351cf7c957189ce54522d2b322366
1 parent
331861d5
eval tools
Showing
5 changed files
with
149 additions
and
23 deletions
Show diff stats
config/config.yaml
| ... | ... | @@ -153,8 +153,8 @@ query_config: |
| 153 | 153 | |
| 154 | 154 | # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 |
| 155 | 155 | # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 |
| 156 | - translation_embedding_wait_budget_ms_source_in_index: 200 # 80 | |
| 157 | - translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200 | |
| 156 | + translation_embedding_wait_budget_ms_source_in_index: 300 # 80 | |
| 157 | + translation_embedding_wait_budget_ms_source_not_in_index: 400 # 200 | |
| 158 | 158 | style_intent: |
| 159 | 159 | enabled: true |
| 160 | 160 | selected_sku_boost: 1.2 | ... | ... |
scripts/evaluation/eval_framework/cli.py
| ... | ... | @@ -5,6 +5,7 @@ from __future__ import annotations |
| 5 | 5 | import argparse |
| 6 | 6 | import json |
| 7 | 7 | import logging |
| 8 | +import shutil | |
| 8 | 9 | from pathlib import Path |
| 9 | 10 | from typing import Any, Dict |
| 10 | 11 | |
| ... | ... | @@ -16,6 +17,35 @@ from .web_app import create_web_app |
| 16 | 17 | _cli_log = logging.getLogger("search_eval.cli") |
| 17 | 18 | |
| 18 | 19 | |
| 20 | +def _filter_queries_from_start(queries: list[str], start_from_query: str | None) -> list[str]: | |
| 21 | + if not start_from_query: | |
| 22 | + return queries | |
| 23 | + try: | |
| 24 | + start_idx = queries.index(start_from_query) | |
| 25 | + except ValueError as exc: | |
| 26 | + raise SystemExit(f"start-from-query not found in queries file: {start_from_query!r}") from exc | |
| 27 | + return queries[start_idx:] | |
| 28 | + | |
| 29 | + | |
def _reset_build_artifacts() -> None:
    """Delete the rebuild cache under the configured evaluation artifact root.

    Removes the ``search_eval.sqlite3`` database, any SQLite sidecar files
    sitting next to it, and the ``query_builds`` directory, then logs what was
    removed (or that nothing was found).
    """
    from config.loader import get_app_config

    artifact_root = get_app_config().search_evaluation.artifact_root
    db_path = artifact_root / "search_eval.sqlite3"
    query_builds_dir = artifact_root / "query_builds"
    removed = []

    # Remove the database together with any leftover WAL / shared-memory /
    # rollback-journal files: a stale sidecar next to a freshly created
    # database of the same name could otherwise be picked up by SQLite.
    sqlite_files = [db_path]
    sqlite_files.extend(
        db_path.with_name(db_path.name + suffix) for suffix in ("-wal", "-shm", "-journal")
    )
    for sqlite_file in sqlite_files:
        try:
            sqlite_file.unlink()
        except FileNotFoundError:
            continue
        removed.append(str(sqlite_file))

    if query_builds_dir.exists():
        shutil.rmtree(query_builds_dir)
        removed.append(str(query_builds_dir))

    if removed:
        _cli_log.info("[build] reset previous rebuild artifacts: %s", ", ".join(removed))
    else:
        _cli_log.info("[build] no previous rebuild artifacts to reset under %s", artifact_root)
| 47 | + | |
| 48 | + | |
| 19 | 49 | def add_judge_llm_args(p: argparse.ArgumentParser) -> None: |
| 20 | 50 | p.add_argument( |
| 21 | 51 | "--judge-model", |
| ... | ... | @@ -207,6 +237,16 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 207 | 237 | default=None, |
| 208 | 238 | help="Default: search_evaluation.default_language.", |
| 209 | 239 | ) |
| 240 | + build.add_argument( | |
| 241 | + "--start-from-query", | |
| 242 | + default=None, | |
| 243 | + help="Start processing from this exact query text in the queries file.", | |
| 244 | + ) | |
| 245 | + build.add_argument( | |
| 246 | + "--reset-artifacts", | |
| 247 | + action="store_true", | |
| 248 | + help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.", | |
| 249 | + ) | |
| 210 | 250 | build.add_argument("--force-refresh-rerank", action="store_true") |
| 211 | 251 | build.add_argument("--force-refresh-labels", action="store_true") |
| 212 | 252 | add_judge_llm_args(build) |
| ... | ... | @@ -217,6 +257,11 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 217 | 257 | batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") |
| 218 | 258 | batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.") |
| 219 | 259 | batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") |
| 260 | + batch.add_argument( | |
| 261 | + "--start-from-query", | |
| 262 | + default=None, | |
| 263 | + help="Start processing from this exact query text in the queries file.", | |
| 264 | + ) | |
| 220 | 265 | batch.add_argument("--force-refresh-labels", action="store_true") |
| 221 | 266 | add_judge_llm_args(batch) |
| 222 | 267 | add_intent_llm_args(batch) |
| ... | ... | @@ -227,6 +272,11 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 227 | 272 | audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.") |
| 228 | 273 | audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") |
| 229 | 274 | audit.add_argument( |
| 275 | + "--start-from-query", | |
| 276 | + default=None, | |
| 277 | + help="Start processing from this exact query text in the queries file.", | |
| 278 | + ) | |
| 279 | + audit.add_argument( | |
| 230 | 280 | "--limit-suspicious", |
| 231 | 281 | type=int, |
| 232 | 282 | default=None, |
| ... | ... | @@ -248,8 +298,11 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 248 | 298 | |
| 249 | 299 | |
| 250 | 300 | def run_build(args: argparse.Namespace) -> None: |
| 301 | + if args.reset_artifacts: | |
| 302 | + _reset_build_artifacts() | |
| 251 | 303 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) |
| 252 | 304 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 305 | + queries = _filter_queries_from_start(queries, args.start_from_query) | |
| 253 | 306 | summary = [] |
| 254 | 307 | rebuild_kwargs = {} |
| 255 | 308 | if args.force_refresh_labels: |
| ... | ... | @@ -305,6 +358,7 @@ def run_build(args: argparse.Namespace) -> None: |
| 305 | 358 | def run_batch(args: argparse.Namespace) -> None: |
| 306 | 359 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) |
| 307 | 360 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 361 | + queries = _filter_queries_from_start(queries, args.start_from_query) | |
| 308 | 362 | _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries)) |
| 309 | 363 | payload = framework.batch_evaluate( |
| 310 | 364 | queries=queries, |
| ... | ... | @@ -319,6 +373,7 @@ def run_batch(args: argparse.Namespace) -> None: |
| 319 | 373 | def run_audit(args: argparse.Namespace) -> None: |
| 320 | 374 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) |
| 321 | 375 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 376 | + queries = _filter_queries_from_start(queries, args.start_from_query) | |
| 322 | 377 | audit_items = [] |
| 323 | 378 | for query in queries: |
| 324 | 379 | item = framework.audit_live_query( | ... | ... |
scripts/evaluation/eval_framework/clients.py
| ... | ... | @@ -20,6 +20,8 @@ from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps |
| 20 | 20 | _VERBOSE_LOGGER_LOCK = threading.Lock() |
| 21 | 21 | _eval_llm_verbose_logger_singleton: logging.Logger | None = None |
| 22 | 22 | _eval_llm_verbose_path_logged = False |
| 23 | +_TRANSIENT_HTTP_STATUS_CODES = frozenset({408, 425, 429, 500, 502, 503, 504}) | |
| 24 | +_client_log = logging.getLogger("search_eval.clients") | |
| 23 | 25 | |
| 24 | 26 | |
| 25 | 27 | def _get_eval_llm_verbose_logger() -> logging.Logger: |
| ... | ... | @@ -85,6 +87,62 @@ class SearchServiceClient: |
| 85 | 87 | self.base_url = base_url.rstrip("/") |
| 86 | 88 | self.tenant_id = str(tenant_id) |
| 87 | 89 | self.session = requests.Session() |
| 90 | + # Batch eval depends on live backend responses; tolerate brief restarts. | |
| 91 | + self.retry_attempts = 45 | |
| 92 | + self.retry_delay_sec = 2.0 | |
| 93 | + | |
@staticmethod
def _is_transient_request_error(exc: requests.exceptions.RequestException) -> bool:
    """Tell whether ``exc`` is a failure that is worth retrying.

    Connection errors and timeouts always count as transient. An HTTP error
    counts as transient when it carries no response, or when its status code
    is listed in ``_TRANSIENT_HTTP_STATUS_CODES``. Anything else does not.
    """
    network_failures = (requests.exceptions.ConnectionError, requests.exceptions.Timeout)
    if isinstance(exc, network_failures):
        return True
    if not isinstance(exc, requests.exceptions.HTTPError):
        return False
    resp = getattr(exc, "response", None)
    # An HTTPError with no attached response is treated as retryable.
    return resp is None or int(resp.status_code) in _TRANSIENT_HTTP_STATUS_CODES
| 104 | + | |
| 105 | + def _request_json( | |
| 106 | + self, | |
| 107 | + method: str, | |
| 108 | + path: str, | |
| 109 | + *, | |
| 110 | + timeout: float, | |
| 111 | + headers: Optional[Dict[str, str]] = None, | |
| 112 | + json_payload: Optional[Dict[str, Any]] = None, | |
| 113 | + ) -> Dict[str, Any]: | |
| 114 | + last_exc: requests.exceptions.RequestException | None = None | |
| 115 | + url = f"{self.base_url}{path}" | |
| 116 | + for attempt in range(1, self.retry_attempts + 1): | |
| 117 | + try: | |
| 118 | + response = self.session.request( | |
| 119 | + method=method, | |
| 120 | + url=url, | |
| 121 | + headers=headers, | |
| 122 | + json=json_payload, | |
| 123 | + timeout=timeout, | |
| 124 | + ) | |
| 125 | + response.raise_for_status() | |
| 126 | + return response.json() | |
| 127 | + except requests.exceptions.RequestException as exc: | |
| 128 | + last_exc = exc | |
| 129 | + if not self._is_transient_request_error(exc) or attempt >= self.retry_attempts: | |
| 130 | + raise | |
| 131 | + _client_log.warning( | |
| 132 | + "Transient search-eval request failure, retrying (%s/%s): %s %s error=%s", | |
| 133 | + attempt, | |
| 134 | + self.retry_attempts, | |
| 135 | + method.upper(), | |
| 136 | + url, | |
| 137 | + exc, | |
| 138 | + ) | |
| 139 | + time.sleep(self.retry_delay_sec) | |
| 140 | + if last_exc is not None: | |
| 141 | + raise last_exc | |
| 142 | + raise RuntimeError(f"unexpected request retry state for {method.upper()} {url}") | |
| 143 | + | |
def get_json(self, path: str, *, timeout: float = 20) -> Dict[str, Any]:
    """Issue a GET for ``path`` through ``_request_json`` and return its result."""
    result = self._request_json("GET", path, timeout=timeout)
    return result
| 88 | 146 | |
| 89 | 147 | def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]: |
| 90 | 148 | payload: Dict[str, Any] = { |
| ... | ... | @@ -95,14 +153,13 @@ class SearchServiceClient: |
| 95 | 153 | } |
| 96 | 154 | if debug: |
| 97 | 155 | payload["debug"] = True |
| 98 | - response = self.session.post( | |
| 99 | - f"{self.base_url}/search/", | |
| 100 | - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, | |
| 101 | - json=payload, | |
| 156 | + return self._request_json( | |
| 157 | + "POST", | |
| 158 | + "/search/", | |
| 102 | 159 | timeout=120, |
| 160 | + headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, | |
| 161 | + json_payload=payload, | |
| 103 | 162 | ) |
| 104 | - response.raise_for_status() | |
| 105 | - return response.json() | |
| 106 | 163 | |
| 107 | 164 | |
| 108 | 165 | class RerankServiceClient: | ... | ... |
scripts/evaluation/eval_framework/framework.py
| ... | ... | @@ -567,7 +567,7 @@ class SearchEvaluationFramework: |
| 567 | 567 | "created_at": utc_now_iso(), |
| 568 | 568 | "tenant_id": self.tenant_id, |
| 569 | 569 | "query": query, |
| 570 | - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(), | |
| 570 | + "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20), | |
| 571 | 571 | "search_total": int(search_payload.get("total") or 0), |
| 572 | 572 | "search_depth_requested": search_depth, |
| 573 | 573 | "search_depth_effective": len(search_results), |
| ... | ... | @@ -762,7 +762,7 @@ class SearchEvaluationFramework: |
| 762 | 762 | "created_at": utc_now_iso(), |
| 763 | 763 | "tenant_id": self.tenant_id, |
| 764 | 764 | "query": query, |
| 765 | - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(), | |
| 765 | + "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20), | |
| 766 | 766 | "search_total": int(search_payload.get("total") or 0), |
| 767 | 767 | "search_depth_requested": search_depth, |
| 768 | 768 | "search_depth_effective": len(search_results), |
| ... | ... | @@ -958,7 +958,7 @@ class SearchEvaluationFramework: |
| 958 | 958 | batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" |
| 959 | 959 | report_dir = ensure_dir(self.artifact_root / "batch_reports") |
| 960 | 960 | config_snapshot_path = report_dir / f"{batch_id}_config.json" |
| 961 | - config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json() | |
| 961 | + config_snapshot = self.search_client.get_json("/admin/config", timeout=20) | |
| 962 | 962 | config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8") |
| 963 | 963 | output_json_path = report_dir / f"{batch_id}.json" |
| 964 | 964 | report_md_path = report_dir / f"{batch_id}.md" | ... | ... |
scripts/evaluation/start_eval.sh
| ... | ... | @@ -7,32 +7,46 @@ cd "$ROOT" |
| 7 | 7 | PY="${ROOT}/.venv/bin/python" |
| 8 | 8 | TENANT_ID="${TENANT_ID:-163}" |
| 9 | 9 | QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" |
| 10 | +START_FROM_QUERY="${REPO_EVAL_START_FROM_QUERY:-}" | |
| 10 | 11 | |
| 11 | 12 | usage() { |
| 12 | 13 | echo "Usage: $0 batch|batch-rebuild|serve" |
| 13 | 14 | echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" |
| 14 | 15 | echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" |
| 15 | 16 | echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" |
| 16 | - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" | |
| 17 | + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, REPO_EVAL_START_FROM_QUERY, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" | |
| 17 | 18 | } |
| 18 | 19 | |
| 19 | 20 | case "${1:-}" in |
| 20 | 21 | batch) |
| 21 | - exec "$PY" scripts/evaluation/build_annotation_set.py batch \ | |
| 22 | - --tenant-id "$TENANT_ID" \ | |
| 23 | - --queries-file "$QUERIES" \ | |
| 24 | - --top-k 50 \ | |
| 22 | + cmd=( | |
| 23 | + "$PY" scripts/evaluation/build_annotation_set.py batch | |
| 24 | + --tenant-id "$TENANT_ID" | |
| 25 | + --queries-file "$QUERIES" | |
| 26 | + --top-k 50 | |
| 25 | 27 | --language en |
| 28 | + ) | |
| 29 | + if [ -n "$START_FROM_QUERY" ]; then | |
| 30 | + cmd+=(--start-from-query "$START_FROM_QUERY") | |
| 31 | + fi | |
| 32 | + exec "${cmd[@]}" | |
| 26 | 33 | ;; |
| 27 | 34 | batch-rebuild) |
| 28 | - exec "$PY" scripts/evaluation/build_annotation_set.py build \ | |
| 29 | - --tenant-id "$TENANT_ID" \ | |
| 30 | - --queries-file "$QUERIES" \ | |
| 31 | - --search-depth 500 \ | |
| 32 | - --rerank-depth 10000 \ | |
| 33 | - --force-refresh-rerank \ | |
| 34 | - --force-refresh-labels \ | |
| 35 | + cmd=( | |
| 36 | + "$PY" scripts/evaluation/build_annotation_set.py build | |
| 37 | + --tenant-id "$TENANT_ID" | |
| 38 | + --queries-file "$QUERIES" | |
| 39 | + --search-depth 500 | |
| 40 | + --rerank-depth 10000 | |
| 41 | + --reset-artifacts | |
| 42 | + --force-refresh-rerank | |
| 43 | + --force-refresh-labels | |
| 35 | 44 | --language en |
| 45 | + ) | |
| 46 | + if [ -n "$START_FROM_QUERY" ]; then | |
| 47 | + cmd+=(--start-from-query "$START_FROM_QUERY") | |
| 48 | + fi | |
| 49 | + exec "${cmd[@]}" | |
| 36 | 50 | ;; |
| 37 | 51 | serve) |
| 38 | 52 | EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" | ... | ... |