From 286e9b4f2263cb08873b9f3ef506a304070c0861 Mon Sep 17 00:00:00 2001
From: tangwang
Date: Wed, 1 Apr 2026 16:10:30 +0800
Subject: [PATCH] evaluation: retry transient LLM errors, log batch progress, drop --start-from-query

---
 scripts/evaluation/eval_framework/cli.py       | 72 ++++++++++++++++++++++++++----------------------------------------------
 scripts/evaluation/eval_framework/clients.py   | 41 +++++++++++++++++++++++++++++++++--------
 scripts/evaluation/eval_framework/framework.py | 15 +++++++++++++++
 scripts/evaluation/eval_framework/utils.py     | 19 ++++---------------
 scripts/evaluation/start_eval.sh               | 39 +++++++++++++--------------------------
 5 files changed, 91 insertions(+), 95 deletions(-)

diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py
index 923129a..068f056 100644
--- a/scripts/evaluation/eval_framework/cli.py
+++ b/scripts/evaluation/eval_framework/cli.py
@@ -17,16 +17,6 @@ from .web_app import create_web_app
 _cli_log = logging.getLogger("search_eval.cli")
 
 
-def _filter_queries_from_start(queries: list[str], start_from_query: str | None) -> list[str]:
-    if not start_from_query:
-        return queries
-    try:
-        start_idx = queries.index(start_from_query)
-    except ValueError as exc:
-        raise SystemExit(f"start-from-query not found in queries file: {start_from_query!r}") from exc
-    return queries[start_idx:]
-
-
 def _reset_build_artifacts() -> None:
     from config.loader import get_app_config
 
@@ -238,11 +228,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
         help="Default: search_evaluation.default_language.",
     )
     build.add_argument(
-        "--start-from-query",
-        default=None,
-        help="Start processing from this exact query text in the queries file.",
-    )
-    build.add_argument(
         "--reset-artifacts",
         action="store_true",
         help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.",
@@ -257,11 +242,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
     batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
     batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.")
     batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
-    batch.add_argument(
-        "--start-from-query",
-        default=None,
-        help="Start processing from this exact query text in the queries file.",
-    )
     batch.add_argument("--force-refresh-labels", action="store_true")
     add_judge_llm_args(batch)
     add_intent_llm_args(batch)
@@ -272,11 +252,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
     audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.")
     audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
     audit.add_argument(
-        "--start-from-query",
-        default=None,
-        help="Start processing from this exact query text in the queries file.",
-    )
-    audit.add_argument(
         "--limit-suspicious",
         type=int,
         default=None,
@@ -302,7 +277,6 @@ def run_build(args: argparse.Namespace) -> None:
         _reset_build_artifacts()
     framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
     queries = framework.queries_from_file(Path(args.queries_file))
-    queries = _filter_queries_from_start(queries, args.start_from_query)
     summary = []
     rebuild_kwargs = {}
     if args.force_refresh_labels:
@@ -320,17 +294,21 @@ def run_build(args: argparse.Namespace) -> None:
     total_q = len(queries)
     for q_index, query in enumerate(queries, start=1):
         _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query)
-        result = framework.build_query_annotation_set(
-            query=query,
-            search_depth=args.search_depth,
-            rerank_depth=args.rerank_depth,
-            annotate_search_top_k=args.annotate_search_top_k,
-            annotate_rerank_top_k=args.annotate_rerank_top_k,
-            language=args.language,
-            force_refresh_rerank=args.force_refresh_rerank,
-            force_refresh_labels=args.force_refresh_labels,
-            **rebuild_kwargs,
-        )
+        try:
+            result = framework.build_query_annotation_set(
+                query=query,
+                search_depth=args.search_depth,
+                rerank_depth=args.rerank_depth,
+                annotate_search_top_k=args.annotate_search_top_k,
+                annotate_rerank_top_k=args.annotate_rerank_top_k,
+                language=args.language,
+                force_refresh_rerank=args.force_refresh_rerank,
+                force_refresh_labels=args.force_refresh_labels,
+                **rebuild_kwargs,
+            )
+        except Exception:
+            _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q)
+            raise
         summary.append(
             {
                 "query": result.query,
@@ -358,22 +336,24 @@ def run_build(args: argparse.Namespace) -> None:
 def run_batch(args: argparse.Namespace) -> None:
     framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
     queries = framework.queries_from_file(Path(args.queries_file))
-    queries = _filter_queries_from_start(queries, args.start_from_query)
     _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries))
-    payload = framework.batch_evaluate(
-        queries=queries,
-        top_k=args.top_k,
-        auto_annotate=True,
-        language=args.language,
-        force_refresh_labels=args.force_refresh_labels,
-    )
+    try:
+        payload = framework.batch_evaluate(
+            queries=queries,
+            top_k=args.top_k,
+            auto_annotate=True,
+            language=args.language,
+            force_refresh_labels=args.force_refresh_labels,
+        )
+    except Exception:
+        _cli_log.exception("[batch] failed while evaluating query list from %s", args.queries_file)
+        raise
     _cli_log.info("[done] batch_id=%s aggregate_metrics=%s", payload["batch_id"], payload["aggregate_metrics"])
 
 
 def run_audit(args: argparse.Namespace) -> None:
     framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
     queries = framework.queries_from_file(Path(args.queries_file))
-    queries = _filter_queries_from_start(queries, args.start_from_query)
     audit_items = []
     for query in queries:
         item = framework.audit_live_query(
diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py
index df28e9a..17b8196 100644
--- a/scripts/evaluation/eval_framework/clients.py
+++ b/scripts/evaluation/eval_framework/clients.py
@@ -212,6 +212,8 @@ class DashScopeLabelClient:
         self.enable_thinking = bool(enable_thinking)
         self.use_batch = bool(use_batch)
         self.session = requests.Session()
+        self.retry_attempts = 4
+        self.retry_delay_sec = 3.0
 
     def _auth_headers(self) -> Dict[str, str]:
         return {"Authorization": f"Bearer {self.api_key}"}
@@ -320,18 +322,41 @@
         return content, safe_json_dumps(row)
 
     def _chat(self, prompt: str, *, phase: str = "chat") -> Tuple[str, str]:
-        if not self.use_batch:
-            content, raw = self._chat_sync(prompt)
-        else:
-            try:
-                content, raw = self._chat_batch(prompt)
-            except requests.exceptions.HTTPError as e:
-                resp = getattr(e, "response", None)
-                if resp is not None and resp.status_code == 404:
-                    self.use_batch = False
-                    content, raw = self._chat_sync(prompt)
-                else:
-                    raise
+        last_exc: Exception | None = None
+        for attempt in range(1, self.retry_attempts + 1):
+            try:
+                if not self.use_batch:
+                    content, raw = self._chat_sync(prompt)
+                else:
+                    try:
+                        content, raw = self._chat_batch(prompt)
+                    except requests.exceptions.HTTPError as e:
+                        resp = getattr(e, "response", None)
+                        if resp is not None and resp.status_code == 404:
+                            self.use_batch = False
+                            content, raw = self._chat_sync(prompt)
+                        else:
+                            raise
+                break
+            except Exception as exc:
+                last_exc = exc
+                is_request_error = isinstance(exc, requests.exceptions.RequestException)
+                if not is_request_error or attempt >= self.retry_attempts:
+                    raise
+                _client_log.warning(
+                    "Transient DashScope error, retrying (%s/%s): phase=%s model=%s use_batch=%s error=%s",
+                    attempt,
+                    self.retry_attempts,
+                    phase,
+                    self.model,
+                    self.use_batch,
+                    exc,
+                )
+                time.sleep(self.retry_delay_sec)
+        else:
+            if last_exc is not None:
+                raise last_exc
+            raise RuntimeError(f"unexpected DashScope retry state for phase={phase}")
         _log_eval_llm_verbose(
             phase=phase,
             model=self.model,
diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py
index 02508b5..ba7c90e 100644
--- a/scripts/evaluation/eval_framework/framework.py
+++ b/scripts/evaluation/eval_framework/framework.py
@@ -335,6 +335,12 @@ class SearchEvaluationFramework:
             )
             return [(labels, raw_response, docs)]
         except Exception:
+            _log.exception(
+                "[eval-rebuild] classify failed query=%r docs=%s; %s",
+                query,
+                len(docs),
+                "splitting batch" if len(docs) > 1 else "single-doc failure",
+            )
             if len(docs) == 1:
                 raise
             mid = len(docs) // 2
@@ -382,6 +388,15 @@
             if not batch_docs:
                 break
 
+            _log.info(
+                "[eval-rebuild] query=%r starting llm_batch=%s/%s size=%s offset=%s",
+                query,
+                batch_idx + 1,
+                max_batches,
+                len(batch_docs),
+                start,
+            )
+
             batch_pairs = self._classify_with_retry(query, batch_docs, force_refresh=force_refresh)
             for sub_labels, raw_response, sub_batch in batch_pairs:
                 to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
diff --git a/scripts/evaluation/eval_framework/utils.py b/scripts/evaluation/eval_framework/utils.py
index dbe613c..7ae33a5 100644
--- a/scripts/evaluation/eval_framework/utils.py
+++ b/scripts/evaluation/eval_framework/utils.py
@@ -84,22 +84,12 @@ def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
     option1, option2, option3 = compact_option_values(doc.get("skus") or [])
     vendor = pick_text(doc.get("vendor"), "en")
     category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
-    tags = doc.get("tags") or []
-    tags_text = ", ".join(str(tag) for tag in tags[:4] if tag)
     parts = [title]
     if option1:
-        parts.append(f"option1={option1}")
+        parts.append(f"{option1}")
     if option2:
-        parts.append(f"option2={option2}")
-    if option3:
-        parts.append(f"option3={option3}")
-    if vendor:
-        parts.append(f"vendor={vendor}")
-    if category:
-        parts.append(f"category={category}")
-    if tags_text:
-        parts.append(f"tags={tags_text}")
-    return f"{idx}. " + " | ".join(part for part in parts if part)
+        parts.append(f"{option2}")
+    return f"{idx}. " + " ".join(part for part in parts if part)
 
 
 def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
@@ -109,8 +99,7 @@ def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
         "image_url": doc.get("image_url"),
         "vendor": pick_text(doc.get("vendor"), "en"),
         "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"),
         "option_values": list(compact_option_values(doc.get("skus") or [])),
-        "tags": list((doc.get("tags") or [])[:6]),
     }
" + " ".join(part for part in parts if part) def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]: @@ -109,8 +99,7 @@ def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]: "image_url": doc.get("image_url"), "vendor": pick_text(doc.get("vendor"), "en"), "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"), - "option_values": list(compact_option_values(doc.get("skus") or [])), - "tags": list((doc.get("tags") or [])[:6]), + "option_values": list(compact_option_values(doc.get("skus") or [])) } diff --git a/scripts/evaluation/start_eval.sh b/scripts/evaluation/start_eval.sh index ce2442e..870de8f 100755 --- a/scripts/evaluation/start_eval.sh +++ b/scripts/evaluation/start_eval.sh @@ -7,46 +7,33 @@ cd "$ROOT" PY="${ROOT}/.venv/bin/python" TENANT_ID="${TENANT_ID:-163}" QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" -START_FROM_QUERY="${REPO_EVAL_START_FROM_QUERY:-}" usage() { echo "Usage: $0 batch|batch-rebuild|serve" echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, REPO_EVAL_START_FROM_QUERY, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" } case "${1:-}" in batch) - cmd=( - "$PY" scripts/evaluation/build_annotation_set.py batch - --tenant-id "$TENANT_ID" - --queries-file "$QUERIES" - --top-k 50 + exec "$PY" scripts/evaluation/build_annotation_set.py batch \ + --tenant-id "$TENANT_ID" \ + --queries-file "$QUERIES" \ + --top-k 50 \ --language en - ) - if [ -n "$START_FROM_QUERY" ]; then - cmd+=(--start-from-query "$START_FROM_QUERY") - fi - exec "${cmd[@]}" ;; batch-rebuild) - cmd=( - "$PY" scripts/evaluation/build_annotation_set.py build - --tenant-id "$TENANT_ID" - --queries-file "$QUERIES" - --search-depth 500 - --rerank-depth 10000 - --reset-artifacts - --force-refresh-rerank - --force-refresh-labels + exec "$PY" scripts/evaluation/build_annotation_set.py build \ + --tenant-id "$TENANT_ID" \ + --queries-file "$QUERIES" \ + --search-depth 500 \ + --rerank-depth 10000 \ + --reset-artifacts \ + --force-refresh-rerank \ + --force-refresh-labels \ --language en - ) - if [ -n "$START_FROM_QUERY" ]; then - cmd+=(--start-from-query "$START_FROM_QUERY") - fi - exec "${cmd[@]}" ;; serve) EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" -- libgit2 0.21.2