Commit 310bb3bc8f6351cf7c957189ce54522d2b322366

Authored by tangwang
1 parent 331861d5

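eval tools: add --start-from-query and --reset-artifacts, retry transient search-service failures, route config snapshots through the search client, raise translation/embedding wait budgets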

config/config.yaml
... ... @@ -153,8 +153,8 @@ query_config:
153 153  
154 154 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
155 155 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
156   - translation_embedding_wait_budget_ms_source_in_index: 200 # 80
157   - translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200
  156 + translation_embedding_wait_budget_ms_source_in_index: 300 # 80
  157 + translation_embedding_wait_budget_ms_source_not_in_index: 400 # 200
158 158 style_intent:
159 159 enabled: true
160 160 selected_sku_boost: 1.2
... ...
scripts/evaluation/eval_framework/cli.py
... ... @@ -5,6 +5,7 @@ from __future__ import annotations
5 5 import argparse
6 6 import json
7 7 import logging
  8 +import shutil
8 9 from pathlib import Path
9 10 from typing import Any, Dict
10 11  
... ... @@ -16,6 +17,35 @@ from .web_app import create_web_app
16 17 _cli_log = logging.getLogger("search_eval.cli")
17 18  
18 19  
  20 +def _filter_queries_from_start(queries: list[str], start_from_query: str | None) -> list[str]:
  21 + if not start_from_query:
  22 + return queries
  23 + try:
  24 + start_idx = queries.index(start_from_query)
  25 + except ValueError as exc:
  26 + raise SystemExit(f"start-from-query not found in queries file: {start_from_query!r}") from exc
  27 + return queries[start_idx:]
  28 +
  29 +
def _reset_build_artifacts() -> None:
    """Delete cached rebuild state so the next ``build`` run starts clean.

    Removes, under ``search_evaluation.artifact_root`` from the application
    config:

    * the evaluation database ``search_eval.sqlite3`` and any SQLite sidecar
      files next to it (``-journal``, ``-wal``, ``-shm``);
    * the ``query_builds`` directory tree.

    Paths that do not exist are skipped. The outcome is reported on the CLI
    logger.
    """
    # Function-local import: only needed when a reset is actually requested.
    from config.loader import get_app_config

    artifact_root = Path(get_app_config().search_evaluation.artifact_root)
    db_path = artifact_root / "search_eval.sqlite3"
    query_builds_dir = artifact_root / "query_builds"
    removed: list[str] = []

    # SQLite may keep a rollback journal or WAL/shared-memory file beside the
    # main database. A stale sidecar must not outlive the database it belongs
    # to, otherwise it sits next to the fresh database created by the rebuild.
    sqlite_paths = [db_path]
    sqlite_paths.extend(
        db_path.with_name(db_path.name + suffix) for suffix in ("-journal", "-wal", "-shm")
    )
    for path in sqlite_paths:
        # EAFP: avoids the exists()/unlink() race of a check-then-delete.
        try:
            path.unlink()
        except FileNotFoundError:
            continue
        removed.append(str(path))

    try:
        shutil.rmtree(query_builds_dir)
    except FileNotFoundError:
        pass
    else:
        removed.append(str(query_builds_dir))

    if removed:
        _cli_log.info("[build] reset previous rebuild artifacts: %s", ", ".join(removed))
    else:
        _cli_log.info("[build] no previous rebuild artifacts to reset under %s", artifact_root)
  47 +
  48 +
19 49 def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
20 50 p.add_argument(
21 51 "--judge-model",
... ... @@ -207,6 +237,16 @@ def build_cli_parser() -> argparse.ArgumentParser:
207 237 default=None,
208 238 help="Default: search_evaluation.default_language.",
209 239 )
  240 + build.add_argument(
  241 + "--start-from-query",
  242 + default=None,
  243 + help="Start processing from this exact query text in the queries file.",
  244 + )
  245 + build.add_argument(
  246 + "--reset-artifacts",
  247 + action="store_true",
  248 + help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.",
  249 + )
210 250 build.add_argument("--force-refresh-rerank", action="store_true")
211 251 build.add_argument("--force-refresh-labels", action="store_true")
212 252 add_judge_llm_args(build)
... ... @@ -217,6 +257,11 @@ def build_cli_parser() -> argparse.ArgumentParser:
217 257 batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
218 258 batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.")
219 259 batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
  260 + batch.add_argument(
  261 + "--start-from-query",
  262 + default=None,
  263 + help="Start processing from this exact query text in the queries file.",
  264 + )
220 265 batch.add_argument("--force-refresh-labels", action="store_true")
221 266 add_judge_llm_args(batch)
222 267 add_intent_llm_args(batch)
... ... @@ -227,6 +272,11 @@ def build_cli_parser() -> argparse.ArgumentParser:
227 272 audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.")
228 273 audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
229 274 audit.add_argument(
  275 + "--start-from-query",
  276 + default=None,
  277 + help="Start processing from this exact query text in the queries file.",
  278 + )
  279 + audit.add_argument(
230 280 "--limit-suspicious",
231 281 type=int,
232 282 default=None,
... ... @@ -248,8 +298,11 @@ def build_cli_parser() -> argparse.ArgumentParser:
248 298  
249 299  
250 300 def run_build(args: argparse.Namespace) -> None:
  301 + if args.reset_artifacts:
  302 + _reset_build_artifacts()
251 303 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
252 304 queries = framework.queries_from_file(Path(args.queries_file))
  305 + queries = _filter_queries_from_start(queries, args.start_from_query)
253 306 summary = []
254 307 rebuild_kwargs = {}
255 308 if args.force_refresh_labels:
... ... @@ -305,6 +358,7 @@ def run_build(args: argparse.Namespace) -> None:
305 358 def run_batch(args: argparse.Namespace) -> None:
306 359 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
307 360 queries = framework.queries_from_file(Path(args.queries_file))
  361 + queries = _filter_queries_from_start(queries, args.start_from_query)
308 362 _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries))
309 363 payload = framework.batch_evaluate(
310 364 queries=queries,
... ... @@ -319,6 +373,7 @@ def run_batch(args: argparse.Namespace) -> None:
319 373 def run_audit(args: argparse.Namespace) -> None:
320 374 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
321 375 queries = framework.queries_from_file(Path(args.queries_file))
  376 + queries = _filter_queries_from_start(queries, args.start_from_query)
322 377 audit_items = []
323 378 for query in queries:
324 379 item = framework.audit_live_query(
... ...
scripts/evaluation/eval_framework/clients.py
... ... @@ -20,6 +20,8 @@ from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
20 20 _VERBOSE_LOGGER_LOCK = threading.Lock()
21 21 _eval_llm_verbose_logger_singleton: logging.Logger | None = None
22 22 _eval_llm_verbose_path_logged = False
  23 +_TRANSIENT_HTTP_STATUS_CODES = frozenset({408, 425, 429, 500, 502, 503, 504})
  24 +_client_log = logging.getLogger("search_eval.clients")
23 25  
24 26  
25 27 def _get_eval_llm_verbose_logger() -> logging.Logger:
... ... @@ -85,6 +87,62 @@ class SearchServiceClient:
85 87 self.base_url = base_url.rstrip("/")
86 88 self.tenant_id = str(tenant_id)
87 89 self.session = requests.Session()
  90 + # Batch eval depends on live backend responses; tolerate brief restarts.
  91 + self.retry_attempts = 45
  92 + self.retry_delay_sec = 2.0
  93 +
  94 + @staticmethod
  95 + def _is_transient_request_error(exc: requests.exceptions.RequestException) -> bool:
  96 + if isinstance(exc, (requests.exceptions.ConnectionError, requests.exceptions.Timeout)):
  97 + return True
  98 + if isinstance(exc, requests.exceptions.HTTPError):
  99 + response = getattr(exc, "response", None)
  100 + if response is None:
  101 + return True
  102 + return int(response.status_code) in _TRANSIENT_HTTP_STATUS_CODES
  103 + return False
  104 +
  105 + def _request_json(
  106 + self,
  107 + method: str,
  108 + path: str,
  109 + *,
  110 + timeout: float,
  111 + headers: Optional[Dict[str, str]] = None,
  112 + json_payload: Optional[Dict[str, Any]] = None,
  113 + ) -> Dict[str, Any]:
  114 + last_exc: requests.exceptions.RequestException | None = None
  115 + url = f"{self.base_url}{path}"
  116 + for attempt in range(1, self.retry_attempts + 1):
  117 + try:
  118 + response = self.session.request(
  119 + method=method,
  120 + url=url,
  121 + headers=headers,
  122 + json=json_payload,
  123 + timeout=timeout,
  124 + )
  125 + response.raise_for_status()
  126 + return response.json()
  127 + except requests.exceptions.RequestException as exc:
  128 + last_exc = exc
  129 + if not self._is_transient_request_error(exc) or attempt >= self.retry_attempts:
  130 + raise
  131 + _client_log.warning(
  132 + "Transient search-eval request failure, retrying (%s/%s): %s %s error=%s",
  133 + attempt,
  134 + self.retry_attempts,
  135 + method.upper(),
  136 + url,
  137 + exc,
  138 + )
  139 + time.sleep(self.retry_delay_sec)
  140 + if last_exc is not None:
  141 + raise last_exc
  142 + raise RuntimeError(f"unexpected request retry state for {method.upper()} {url}")
  143 +
  144 + def get_json(self, path: str, *, timeout: float = 20) -> Dict[str, Any]:
  145 + return self._request_json("GET", path, timeout=timeout)
88 146  
89 147 def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]:
90 148 payload: Dict[str, Any] = {
... ... @@ -95,14 +153,13 @@ class SearchServiceClient:
95 153 }
96 154 if debug:
97 155 payload["debug"] = True
98   - response = self.session.post(
99   - f"{self.base_url}/search/",
100   - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},
101   - json=payload,
  156 + return self._request_json(
  157 + "POST",
  158 + "/search/",
102 159 timeout=120,
  160 + headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},
  161 + json_payload=payload,
103 162 )
104   - response.raise_for_status()
105   - return response.json()
106 163  
107 164  
108 165 class RerankServiceClient:
... ...
scripts/evaluation/eval_framework/framework.py
... ... @@ -567,7 +567,7 @@ class SearchEvaluationFramework:
567 567 "created_at": utc_now_iso(),
568 568 "tenant_id": self.tenant_id,
569 569 "query": query,
570   - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),
  570 + "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20),
571 571 "search_total": int(search_payload.get("total") or 0),
572 572 "search_depth_requested": search_depth,
573 573 "search_depth_effective": len(search_results),
... ... @@ -762,7 +762,7 @@ class SearchEvaluationFramework:
762 762 "created_at": utc_now_iso(),
763 763 "tenant_id": self.tenant_id,
764 764 "query": query,
765   - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),
  765 + "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20),
766 766 "search_total": int(search_payload.get("total") or 0),
767 767 "search_depth_requested": search_depth,
768 768 "search_depth_effective": len(search_results),
... ... @@ -958,7 +958,7 @@ class SearchEvaluationFramework:
958 958 batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
959 959 report_dir = ensure_dir(self.artifact_root / "batch_reports")
960 960 config_snapshot_path = report_dir / f"{batch_id}_config.json"
961   - config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json()
  961 + config_snapshot = self.search_client.get_json("/admin/config", timeout=20)
962 962 config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
963 963 output_json_path = report_dir / f"{batch_id}.json"
964 964 report_md_path = report_dir / f"{batch_id}.md"
... ...
scripts/evaluation/start_eval.sh
... ... @@ -7,32 +7,46 @@ cd "$ROOT"
7 7 PY="${ROOT}/.venv/bin/python"
8 8 TENANT_ID="${TENANT_ID:-163}"
9 9 QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
  10 +START_FROM_QUERY="${REPO_EVAL_START_FROM_QUERY:-}"
10 11  
11 12 usage() {
12 13 echo "Usage: $0 batch|batch-rebuild|serve"
13 14 echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)"
14 15 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)"
15 16 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)"
16   - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
  17 + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, REPO_EVAL_START_FROM_QUERY, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
17 18 }
18 19  
19 20 case "${1:-}" in
20 21 batch)
21   - exec "$PY" scripts/evaluation/build_annotation_set.py batch \
22   - --tenant-id "$TENANT_ID" \
23   - --queries-file "$QUERIES" \
24   - --top-k 50 \
  22 + cmd=(
  23 + "$PY" scripts/evaluation/build_annotation_set.py batch
  24 + --tenant-id "$TENANT_ID"
  25 + --queries-file "$QUERIES"
  26 + --top-k 50
25 27 --language en
  28 + )
  29 + if [ -n "$START_FROM_QUERY" ]; then
  30 + cmd+=(--start-from-query "$START_FROM_QUERY")
  31 + fi
  32 + exec "${cmd[@]}"
26 33 ;;
27 34 batch-rebuild)
28   - exec "$PY" scripts/evaluation/build_annotation_set.py build \
29   - --tenant-id "$TENANT_ID" \
30   - --queries-file "$QUERIES" \
31   - --search-depth 500 \
32   - --rerank-depth 10000 \
33   - --force-refresh-rerank \
34   - --force-refresh-labels \
  35 + cmd=(
  36 + "$PY" scripts/evaluation/build_annotation_set.py build
  37 + --tenant-id "$TENANT_ID"
  38 + --queries-file "$QUERIES"
  39 + --search-depth 500
  40 + --rerank-depth 10000
  41 + --reset-artifacts
  42 + --force-refresh-rerank
  43 + --force-refresh-labels
35 44 --language en
  45 + )
  46 + if [ -n "$START_FROM_QUERY" ]; then
  47 + cmd+=(--start-from-query "$START_FROM_QUERY")
  48 + fi
  49 + exec "${cmd[@]}"
36 50 ;;
37 51 serve)
38 52 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}"
... ...