Commit 310bb3bc8f6351cf7c957189ce54522d2b322366

Authored by tangwang
1 parent 331861d5

eval tools

config/config.yaml
@@ -153,8 +153,8 @@ query_config: @@ -153,8 +153,8 @@ query_config:
153 153
154 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 154 # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
155 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 155 # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
156 - translation_embedding_wait_budget_ms_source_in_index: 200 # 80  
157 - translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200 156 + translation_embedding_wait_budget_ms_source_in_index: 300 # 80
  157 + translation_embedding_wait_budget_ms_source_not_in_index: 400 # 200
158 style_intent: 158 style_intent:
159 enabled: true 159 enabled: true
160 selected_sku_boost: 1.2 160 selected_sku_boost: 1.2
scripts/evaluation/eval_framework/cli.py
@@ -5,6 +5,7 @@ from __future__ import annotations @@ -5,6 +5,7 @@ from __future__ import annotations
5 import argparse 5 import argparse
6 import json 6 import json
7 import logging 7 import logging
  8 +import shutil
8 from pathlib import Path 9 from pathlib import Path
9 from typing import Any, Dict 10 from typing import Any, Dict
10 11
@@ -16,6 +17,35 @@ from .web_app import create_web_app @@ -16,6 +17,35 @@ from .web_app import create_web_app
16 _cli_log = logging.getLogger("search_eval.cli") 17 _cli_log = logging.getLogger("search_eval.cli")
17 18
18 19
  20 +def _filter_queries_from_start(queries: list[str], start_from_query: str | None) -> list[str]:
  21 + if not start_from_query:
  22 + return queries
  23 + try:
  24 + start_idx = queries.index(start_from_query)
  25 + except ValueError as exc:
  26 + raise SystemExit(f"start-from-query not found in queries file: {start_from_query!r}") from exc
  27 + return queries[start_idx:]
  28 +
  29 +
def _reset_build_artifacts() -> None:
    """Delete cached rebuild artifacts (SQLite store + query_builds dir) if present.

    Reads the artifact root from the app config; logs which paths were removed,
    or that nothing needed removal. Safe to call when no artifacts exist.
    """
    # Imported lazily so the CLI module stays importable without a loaded config.
    from config.loader import get_app_config

    root = get_app_config().search_evaluation.artifact_root
    sqlite_path = root / "search_eval.sqlite3"
    builds_dir = root / "query_builds"
    deleted: list[str] = []
    if sqlite_path.exists():
        sqlite_path.unlink()
        deleted.append(str(sqlite_path))
    if builds_dir.exists():
        shutil.rmtree(builds_dir)
        deleted.append(str(builds_dir))
    if deleted:
        _cli_log.info("[build] reset previous rebuild artifacts: %s", ", ".join(deleted))
    else:
        _cli_log.info("[build] no previous rebuild artifacts to reset under %s", root)
  48 +
19 def add_judge_llm_args(p: argparse.ArgumentParser) -> None: 49 def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
20 p.add_argument( 50 p.add_argument(
21 "--judge-model", 51 "--judge-model",
@@ -207,6 +237,16 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -207,6 +237,16 @@ def build_cli_parser() -> argparse.ArgumentParser:
207 default=None, 237 default=None,
208 help="Default: search_evaluation.default_language.", 238 help="Default: search_evaluation.default_language.",
209 ) 239 )
  240 + build.add_argument(
  241 + "--start-from-query",
  242 + default=None,
  243 + help="Start processing from this exact query text in the queries file.",
  244 + )
  245 + build.add_argument(
  246 + "--reset-artifacts",
  247 + action="store_true",
  248 + help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.",
  249 + )
210 build.add_argument("--force-refresh-rerank", action="store_true") 250 build.add_argument("--force-refresh-rerank", action="store_true")
211 build.add_argument("--force-refresh-labels", action="store_true") 251 build.add_argument("--force-refresh-labels", action="store_true")
212 add_judge_llm_args(build) 252 add_judge_llm_args(build)
@@ -217,6 +257,11 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -217,6 +257,11 @@ def build_cli_parser() -> argparse.ArgumentParser:
217 batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") 257 batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
218 batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.") 258 batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.")
219 batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") 259 batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
  260 + batch.add_argument(
  261 + "--start-from-query",
  262 + default=None,
  263 + help="Start processing from this exact query text in the queries file.",
  264 + )
220 batch.add_argument("--force-refresh-labels", action="store_true") 265 batch.add_argument("--force-refresh-labels", action="store_true")
221 add_judge_llm_args(batch) 266 add_judge_llm_args(batch)
222 add_intent_llm_args(batch) 267 add_intent_llm_args(batch)
@@ -227,6 +272,11 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -227,6 +272,11 @@ def build_cli_parser() -> argparse.ArgumentParser:
227 audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.") 272 audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.")
228 audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") 273 audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
229 audit.add_argument( 274 audit.add_argument(
  275 + "--start-from-query",
  276 + default=None,
  277 + help="Start processing from this exact query text in the queries file.",
  278 + )
  279 + audit.add_argument(
230 "--limit-suspicious", 280 "--limit-suspicious",
231 type=int, 281 type=int,
232 default=None, 282 default=None,
@@ -248,8 +298,11 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -248,8 +298,11 @@ def build_cli_parser() -> argparse.ArgumentParser:
248 298
249 299
250 def run_build(args: argparse.Namespace) -> None: 300 def run_build(args: argparse.Namespace) -> None:
  301 + if args.reset_artifacts:
  302 + _reset_build_artifacts()
251 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) 303 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
252 queries = framework.queries_from_file(Path(args.queries_file)) 304 queries = framework.queries_from_file(Path(args.queries_file))
  305 + queries = _filter_queries_from_start(queries, args.start_from_query)
253 summary = [] 306 summary = []
254 rebuild_kwargs = {} 307 rebuild_kwargs = {}
255 if args.force_refresh_labels: 308 if args.force_refresh_labels:
@@ -305,6 +358,7 @@ def run_build(args: argparse.Namespace) -> None: @@ -305,6 +358,7 @@ def run_build(args: argparse.Namespace) -> None:
305 def run_batch(args: argparse.Namespace) -> None: 358 def run_batch(args: argparse.Namespace) -> None:
306 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) 359 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
307 queries = framework.queries_from_file(Path(args.queries_file)) 360 queries = framework.queries_from_file(Path(args.queries_file))
  361 + queries = _filter_queries_from_start(queries, args.start_from_query)
308 _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries)) 362 _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries))
309 payload = framework.batch_evaluate( 363 payload = framework.batch_evaluate(
310 queries=queries, 364 queries=queries,
@@ -319,6 +373,7 @@ def run_batch(args: argparse.Namespace) -> None: @@ -319,6 +373,7 @@ def run_batch(args: argparse.Namespace) -> None:
319 def run_audit(args: argparse.Namespace) -> None: 373 def run_audit(args: argparse.Namespace) -> None:
320 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) 374 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
321 queries = framework.queries_from_file(Path(args.queries_file)) 375 queries = framework.queries_from_file(Path(args.queries_file))
  376 + queries = _filter_queries_from_start(queries, args.start_from_query)
322 audit_items = [] 377 audit_items = []
323 for query in queries: 378 for query in queries:
324 item = framework.audit_live_query( 379 item = framework.audit_live_query(
scripts/evaluation/eval_framework/clients.py
@@ -20,6 +20,8 @@ from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps @@ -20,6 +20,8 @@ from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
20 _VERBOSE_LOGGER_LOCK = threading.Lock() 20 _VERBOSE_LOGGER_LOCK = threading.Lock()
21 _eval_llm_verbose_logger_singleton: logging.Logger | None = None 21 _eval_llm_verbose_logger_singleton: logging.Logger | None = None
22 _eval_llm_verbose_path_logged = False 22 _eval_llm_verbose_path_logged = False
  23 +_TRANSIENT_HTTP_STATUS_CODES = frozenset({408, 425, 429, 500, 502, 503, 504})
  24 +_client_log = logging.getLogger("search_eval.clients")
23 25
24 26
25 def _get_eval_llm_verbose_logger() -> logging.Logger: 27 def _get_eval_llm_verbose_logger() -> logging.Logger:
@@ -85,6 +87,62 @@ class SearchServiceClient: @@ -85,6 +87,62 @@ class SearchServiceClient:
85 self.base_url = base_url.rstrip("/") 87 self.base_url = base_url.rstrip("/")
86 self.tenant_id = str(tenant_id) 88 self.tenant_id = str(tenant_id)
87 self.session = requests.Session() 89 self.session = requests.Session()
  90 + # Batch eval depends on live backend responses; tolerate brief restarts.
  91 + self.retry_attempts = 45
  92 + self.retry_delay_sec = 2.0
  93 +
  94 + @staticmethod
  95 + def _is_transient_request_error(exc: requests.exceptions.RequestException) -> bool:
  96 + if isinstance(exc, (requests.exceptions.ConnectionError, requests.exceptions.Timeout)):
  97 + return True
  98 + if isinstance(exc, requests.exceptions.HTTPError):
  99 + response = getattr(exc, "response", None)
  100 + if response is None:
  101 + return True
  102 + return int(response.status_code) in _TRANSIENT_HTTP_STATUS_CODES
  103 + return False
  104 +
  105 + def _request_json(
  106 + self,
  107 + method: str,
  108 + path: str,
  109 + *,
  110 + timeout: float,
  111 + headers: Optional[Dict[str, str]] = None,
  112 + json_payload: Optional[Dict[str, Any]] = None,
  113 + ) -> Dict[str, Any]:
  114 + last_exc: requests.exceptions.RequestException | None = None
  115 + url = f"{self.base_url}{path}"
  116 + for attempt in range(1, self.retry_attempts + 1):
  117 + try:
  118 + response = self.session.request(
  119 + method=method,
  120 + url=url,
  121 + headers=headers,
  122 + json=json_payload,
  123 + timeout=timeout,
  124 + )
  125 + response.raise_for_status()
  126 + return response.json()
  127 + except requests.exceptions.RequestException as exc:
  128 + last_exc = exc
  129 + if not self._is_transient_request_error(exc) or attempt >= self.retry_attempts:
  130 + raise
  131 + _client_log.warning(
  132 + "Transient search-eval request failure, retrying (%s/%s): %s %s error=%s",
  133 + attempt,
  134 + self.retry_attempts,
  135 + method.upper(),
  136 + url,
  137 + exc,
  138 + )
  139 + time.sleep(self.retry_delay_sec)
  140 + if last_exc is not None:
  141 + raise last_exc
  142 + raise RuntimeError(f"unexpected request retry state for {method.upper()} {url}")
  143 +
    def get_json(self, path: str, *, timeout: float = 20) -> Dict[str, Any]:
        """GET *path* (relative to ``base_url``) with transient-failure retries and return the parsed JSON."""
        return self._request_json("GET", path, timeout=timeout)
88 146
89 def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]: 147 def search(self, query: str, size: int, from_: int = 0, language: str = "en", *, debug: bool = False) -> Dict[str, Any]:
90 payload: Dict[str, Any] = { 148 payload: Dict[str, Any] = {
@@ -95,14 +153,13 @@ class SearchServiceClient: @@ -95,14 +153,13 @@ class SearchServiceClient:
95 } 153 }
96 if debug: 154 if debug:
97 payload["debug"] = True 155 payload["debug"] = True
98 - response = self.session.post(  
99 - f"{self.base_url}/search/",  
100 - headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},  
101 - json=payload, 156 + return self._request_json(
  157 + "POST",
  158 + "/search/",
102 timeout=120, 159 timeout=120,
  160 + headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id},
  161 + json_payload=payload,
103 ) 162 )
104 - response.raise_for_status()  
105 - return response.json()  
106 163
107 164
108 class RerankServiceClient: 165 class RerankServiceClient:
scripts/evaluation/eval_framework/framework.py
@@ -567,7 +567,7 @@ class SearchEvaluationFramework: @@ -567,7 +567,7 @@ class SearchEvaluationFramework:
567 "created_at": utc_now_iso(), 567 "created_at": utc_now_iso(),
568 "tenant_id": self.tenant_id, 568 "tenant_id": self.tenant_id,
569 "query": query, 569 "query": query,
570 - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(), 570 + "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20),
571 "search_total": int(search_payload.get("total") or 0), 571 "search_total": int(search_payload.get("total") or 0),
572 "search_depth_requested": search_depth, 572 "search_depth_requested": search_depth,
573 "search_depth_effective": len(search_results), 573 "search_depth_effective": len(search_results),
@@ -762,7 +762,7 @@ class SearchEvaluationFramework: @@ -762,7 +762,7 @@ class SearchEvaluationFramework:
762 "created_at": utc_now_iso(), 762 "created_at": utc_now_iso(),
763 "tenant_id": self.tenant_id, 763 "tenant_id": self.tenant_id,
764 "query": query, 764 "query": query,
765 - "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(), 765 + "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20),
766 "search_total": int(search_payload.get("total") or 0), 766 "search_total": int(search_payload.get("total") or 0),
767 "search_depth_requested": search_depth, 767 "search_depth_requested": search_depth,
768 "search_depth_effective": len(search_results), 768 "search_depth_effective": len(search_results),
@@ -958,7 +958,7 @@ class SearchEvaluationFramework: @@ -958,7 +958,7 @@ class SearchEvaluationFramework:
958 batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}" 958 batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
959 report_dir = ensure_dir(self.artifact_root / "batch_reports") 959 report_dir = ensure_dir(self.artifact_root / "batch_reports")
960 config_snapshot_path = report_dir / f"{batch_id}_config.json" 960 config_snapshot_path = report_dir / f"{batch_id}_config.json"
961 - config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json() 961 + config_snapshot = self.search_client.get_json("/admin/config", timeout=20)
962 config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8") 962 config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
963 output_json_path = report_dir / f"{batch_id}.json" 963 output_json_path = report_dir / f"{batch_id}.json"
964 report_md_path = report_dir / f"{batch_id}.md" 964 report_md_path = report_dir / f"{batch_id}.md"
scripts/evaluation/start_eval.sh
@@ -7,32 +7,46 @@ cd "$ROOT" @@ -7,32 +7,46 @@ cd "$ROOT"
7 PY="${ROOT}/.venv/bin/python" 7 PY="${ROOT}/.venv/bin/python"
8 TENANT_ID="${TENANT_ID:-163}" 8 TENANT_ID="${TENANT_ID:-163}"
9 QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" 9 QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
  10 +START_FROM_QUERY="${REPO_EVAL_START_FROM_QUERY:-}"
10 11
11 usage() { 12 usage() {
12 echo "Usage: $0 batch|batch-rebuild|serve" 13 echo "Usage: $0 batch|batch-rebuild|serve"
13 echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" 14 echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)"
14 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" 15 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)"
15 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" 16 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)"
16 - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" 17 + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, REPO_EVAL_START_FROM_QUERY, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
17 } 18 }
18 19
19 case "${1:-}" in 20 case "${1:-}" in
20 batch) 21 batch)
21 - exec "$PY" scripts/evaluation/build_annotation_set.py batch \  
22 - --tenant-id "$TENANT_ID" \  
23 - --queries-file "$QUERIES" \  
24 - --top-k 50 \ 22 + cmd=(
  23 + "$PY" scripts/evaluation/build_annotation_set.py batch
  24 + --tenant-id "$TENANT_ID"
  25 + --queries-file "$QUERIES"
  26 + --top-k 50
25 --language en 27 --language en
  28 + )
  29 + if [ -n "$START_FROM_QUERY" ]; then
  30 + cmd+=(--start-from-query "$START_FROM_QUERY")
  31 + fi
  32 + exec "${cmd[@]}"
26 ;; 33 ;;
27 batch-rebuild) 34 batch-rebuild)
28 - exec "$PY" scripts/evaluation/build_annotation_set.py build \  
29 - --tenant-id "$TENANT_ID" \  
30 - --queries-file "$QUERIES" \  
31 - --search-depth 500 \  
32 - --rerank-depth 10000 \  
33 - --force-refresh-rerank \  
34 - --force-refresh-labels \ 35 + cmd=(
  36 + "$PY" scripts/evaluation/build_annotation_set.py build
  37 + --tenant-id "$TENANT_ID"
  38 + --queries-file "$QUERIES"
  39 + --search-depth 500
  40 + --rerank-depth 10000
  41 + --reset-artifacts
  42 + --force-refresh-rerank
  43 + --force-refresh-labels
35 --language en 44 --language en
  45 + )
  46 + if [ -n "$START_FROM_QUERY" ]; then
  47 + cmd+=(--start-from-query "$START_FROM_QUERY")
  48 + fi
  49 + exec "${cmd[@]}"
36 ;; 50 ;;
37 serve) 51 serve)
38 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" 52 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}"