Commit 286e9b4f2263cb08873b9f3ef506a304070c0861
1 parent
310bb3bc
evaluation
Showing
5 changed files
with
91 additions
and
95 deletions
Show diff stats
scripts/evaluation/eval_framework/cli.py
| ... | ... | @@ -17,16 +17,6 @@ from .web_app import create_web_app |
| 17 | 17 | _cli_log = logging.getLogger("search_eval.cli") |
| 18 | 18 | |
| 19 | 19 | |
| 20 | -def _filter_queries_from_start(queries: list[str], start_from_query: str | None) -> list[str]: | |
| 21 | - if not start_from_query: | |
| 22 | - return queries | |
| 23 | - try: | |
| 24 | - start_idx = queries.index(start_from_query) | |
| 25 | - except ValueError as exc: | |
| 26 | - raise SystemExit(f"start-from-query not found in queries file: {start_from_query!r}") from exc | |
| 27 | - return queries[start_idx:] | |
| 28 | - | |
| 29 | - | |
| 30 | 20 | def _reset_build_artifacts() -> None: |
| 31 | 21 | from config.loader import get_app_config |
| 32 | 22 | |
| ... | ... | @@ -238,11 +228,6 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 238 | 228 | help="Default: search_evaluation.default_language.", |
| 239 | 229 | ) |
| 240 | 230 | build.add_argument( |
| 241 | - "--start-from-query", | |
| 242 | - default=None, | |
| 243 | - help="Start processing from this exact query text in the queries file.", | |
| 244 | - ) | |
| 245 | - build.add_argument( | |
| 246 | 231 | "--reset-artifacts", |
| 247 | 232 | action="store_true", |
| 248 | 233 | help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.", |
| ... | ... | @@ -257,11 +242,6 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 257 | 242 | batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") |
| 258 | 243 | batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.") |
| 259 | 244 | batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") |
| 260 | - batch.add_argument( | |
| 261 | - "--start-from-query", | |
| 262 | - default=None, | |
| 263 | - help="Start processing from this exact query text in the queries file.", | |
| 264 | - ) | |
| 265 | 245 | batch.add_argument("--force-refresh-labels", action="store_true") |
| 266 | 246 | add_judge_llm_args(batch) |
| 267 | 247 | add_intent_llm_args(batch) |
| ... | ... | @@ -272,11 +252,6 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 272 | 252 | audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.") |
| 273 | 253 | audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") |
| 274 | 254 | audit.add_argument( |
| 275 | - "--start-from-query", | |
| 276 | - default=None, | |
| 277 | - help="Start processing from this exact query text in the queries file.", | |
| 278 | - ) | |
| 279 | - audit.add_argument( | |
| 280 | 255 | "--limit-suspicious", |
| 281 | 256 | type=int, |
| 282 | 257 | default=None, |
| ... | ... | @@ -302,7 +277,6 @@ def run_build(args: argparse.Namespace) -> None: |
| 302 | 277 | _reset_build_artifacts() |
| 303 | 278 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) |
| 304 | 279 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 305 | - queries = _filter_queries_from_start(queries, args.start_from_query) | |
| 306 | 280 | summary = [] |
| 307 | 281 | rebuild_kwargs = {} |
| 308 | 282 | if args.force_refresh_labels: |
| ... | ... | @@ -320,17 +294,21 @@ def run_build(args: argparse.Namespace) -> None: |
| 320 | 294 | total_q = len(queries) |
| 321 | 295 | for q_index, query in enumerate(queries, start=1): |
| 322 | 296 | _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query) |
| 323 | - result = framework.build_query_annotation_set( | |
| 324 | - query=query, | |
| 325 | - search_depth=args.search_depth, | |
| 326 | - rerank_depth=args.rerank_depth, | |
| 327 | - annotate_search_top_k=args.annotate_search_top_k, | |
| 328 | - annotate_rerank_top_k=args.annotate_rerank_top_k, | |
| 329 | - language=args.language, | |
| 330 | - force_refresh_rerank=args.force_refresh_rerank, | |
| 331 | - force_refresh_labels=args.force_refresh_labels, | |
| 332 | - **rebuild_kwargs, | |
| 333 | - ) | |
| 297 | + try: | |
| 298 | + result = framework.build_query_annotation_set( | |
| 299 | + query=query, | |
| 300 | + search_depth=args.search_depth, | |
| 301 | + rerank_depth=args.rerank_depth, | |
| 302 | + annotate_search_top_k=args.annotate_search_top_k, | |
| 303 | + annotate_rerank_top_k=args.annotate_rerank_top_k, | |
| 304 | + language=args.language, | |
| 305 | + force_refresh_rerank=args.force_refresh_rerank, | |
| 306 | + force_refresh_labels=args.force_refresh_labels, | |
| 307 | + **rebuild_kwargs, | |
| 308 | + ) | |
| 309 | + except Exception: | |
| 310 | + _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q) | |
| 311 | + raise | |
| 334 | 312 | summary.append( |
| 335 | 313 | { |
| 336 | 314 | "query": result.query, |
| ... | ... | @@ -358,22 +336,24 @@ def run_build(args: argparse.Namespace) -> None: |
| 358 | 336 | def run_batch(args: argparse.Namespace) -> None: |
| 359 | 337 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) |
| 360 | 338 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 361 | - queries = _filter_queries_from_start(queries, args.start_from_query) | |
| 362 | 339 | _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries)) |
| 363 | - payload = framework.batch_evaluate( | |
| 364 | - queries=queries, | |
| 365 | - top_k=args.top_k, | |
| 366 | - auto_annotate=True, | |
| 367 | - language=args.language, | |
| 368 | - force_refresh_labels=args.force_refresh_labels, | |
| 369 | - ) | |
| 340 | + try: | |
| 341 | + payload = framework.batch_evaluate( | |
| 342 | + queries=queries, | |
| 343 | + top_k=args.top_k, | |
| 344 | + auto_annotate=True, | |
| 345 | + language=args.language, | |
| 346 | + force_refresh_labels=args.force_refresh_labels, | |
| 347 | + ) | |
| 348 | + except Exception: | |
| 349 | + _cli_log.exception("[batch] failed while evaluating query list from %s", args.queries_file) | |
| 350 | + raise | |
| 370 | 351 | _cli_log.info("[done] batch_id=%s aggregate_metrics=%s", payload["batch_id"], payload["aggregate_metrics"]) |
| 371 | 352 | |
| 372 | 353 | |
| 373 | 354 | def run_audit(args: argparse.Namespace) -> None: |
| 374 | 355 | framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) |
| 375 | 356 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 376 | - queries = _filter_queries_from_start(queries, args.start_from_query) | |
| 377 | 357 | audit_items = [] |
| 378 | 358 | for query in queries: |
| 379 | 359 | item = framework.audit_live_query( | ... | ... |
scripts/evaluation/eval_framework/clients.py
| ... | ... | @@ -212,6 +212,8 @@ class DashScopeLabelClient: |
| 212 | 212 | self.enable_thinking = bool(enable_thinking) |
| 213 | 213 | self.use_batch = bool(use_batch) |
| 214 | 214 | self.session = requests.Session() |
| 215 | + self.retry_attempts = 4 | |
| 216 | + self.retry_delay_sec = 3.0 | |
| 215 | 217 | |
| 216 | 218 | def _auth_headers(self) -> Dict[str, str]: |
| 217 | 219 | return {"Authorization": f"Bearer {self.api_key}"} |
| ... | ... | @@ -320,18 +322,41 @@ class DashScopeLabelClient: |
| 320 | 322 | return content, safe_json_dumps(row) |
| 321 | 323 | |
| 322 | 324 | def _chat(self, prompt: str, *, phase: str = "chat") -> Tuple[str, str]: |
| 323 | - if not self.use_batch: | |
| 324 | - content, raw = self._chat_sync(prompt) | |
| 325 | - else: | |
| 325 | + last_exc: Exception | None = None | |
| 326 | + for attempt in range(1, self.retry_attempts + 1): | |
| 326 | 327 | try: |
| 327 | - content, raw = self._chat_batch(prompt) | |
| 328 | - except requests.exceptions.HTTPError as e: | |
| 329 | - resp = getattr(e, "response", None) | |
| 330 | - if resp is not None and resp.status_code == 404: | |
| 331 | - self.use_batch = False | |
| 328 | + if not self.use_batch: | |
| 332 | 329 | content, raw = self._chat_sync(prompt) |
| 333 | 330 | else: |
| 331 | + try: | |
| 332 | + content, raw = self._chat_batch(prompt) | |
| 333 | + except requests.exceptions.HTTPError as e: | |
| 334 | + resp = getattr(e, "response", None) | |
| 335 | + if resp is not None and resp.status_code == 404: | |
| 336 | + self.use_batch = False | |
| 337 | + content, raw = self._chat_sync(prompt) | |
| 338 | + else: | |
| 339 | + raise | |
| 340 | + break | |
| 341 | + except Exception as exc: | |
| 342 | + last_exc = exc | |
| 343 | + is_request_error = isinstance(exc, requests.exceptions.RequestException) | |
| 344 | + if not is_request_error or attempt >= self.retry_attempts: | |
| 334 | 345 | raise |
| 346 | + _client_log.warning( | |
| 347 | + "Transient DashScope error, retrying (%s/%s): phase=%s model=%s use_batch=%s error=%s", | |
| 348 | + attempt, | |
| 349 | + self.retry_attempts, | |
| 350 | + phase, | |
| 351 | + self.model, | |
| 352 | + self.use_batch, | |
| 353 | + exc, | |
| 354 | + ) | |
| 355 | + time.sleep(self.retry_delay_sec) | |
| 356 | + else: | |
| 357 | + if last_exc is not None: | |
| 358 | + raise last_exc | |
| 359 | + raise RuntimeError(f"unexpected DashScope retry state for phase={phase}") | |
| 335 | 360 | _log_eval_llm_verbose( |
| 336 | 361 | phase=phase, |
| 337 | 362 | model=self.model, | ... | ... |
scripts/evaluation/eval_framework/framework.py
| ... | ... | @@ -335,6 +335,12 @@ class SearchEvaluationFramework: |
| 335 | 335 | ) |
| 336 | 336 | return [(labels, raw_response, docs)] |
| 337 | 337 | except Exception: |
| 338 | + _log.exception( | |
| 339 | + "[eval-rebuild] classify failed query=%r docs=%s; %s", | |
| 340 | + query, | |
| 341 | + len(docs), | |
| 342 | + "splitting batch" if len(docs) > 1 else "single-doc failure", | |
| 343 | + ) | |
| 338 | 344 | if len(docs) == 1: |
| 339 | 345 | raise |
| 340 | 346 | mid = len(docs) // 2 |
| ... | ... | @@ -382,6 +388,15 @@ class SearchEvaluationFramework: |
| 382 | 388 | if not batch_docs: |
| 383 | 389 | break |
| 384 | 390 | |
| 391 | + _log.info( | |
| 392 | + "[eval-rebuild] query=%r starting llm_batch=%s/%s size=%s offset=%s", | |
| 393 | + query, | |
| 394 | + batch_idx + 1, | |
| 395 | + max_batches, | |
| 396 | + len(batch_docs), | |
| 397 | + start, | |
| 398 | + ) | |
| 399 | + | |
| 385 | 400 | batch_pairs = self._classify_with_retry(query, batch_docs, force_refresh=force_refresh) |
| 386 | 401 | for sub_labels, raw_response, sub_batch in batch_pairs: |
| 387 | 402 | to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)} | ... | ... |
scripts/evaluation/eval_framework/utils.py
| ... | ... | @@ -84,22 +84,12 @@ def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str: |
| 84 | 84 | option1, option2, option3 = compact_option_values(doc.get("skus") or []) |
| 85 | 85 | vendor = pick_text(doc.get("vendor"), "en") |
| 86 | 86 | category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en") |
| 87 | - tags = doc.get("tags") or [] | |
| 88 | - tags_text = ", ".join(str(tag) for tag in tags[:4] if tag) | |
| 89 | 87 | parts = [title] |
| 90 | 88 | if option1: |
| 91 | - parts.append(f"option1={option1}") | |
| 89 | + parts.append(f"{option1}") | |
| 92 | 90 | if option2: |
| 93 | - parts.append(f"option2={option2}") | |
| 94 | - if option3: | |
| 95 | - parts.append(f"option3={option3}") | |
| 96 | - if vendor: | |
| 97 | - parts.append(f"vendor={vendor}") | |
| 98 | - if category: | |
| 99 | - parts.append(f"category={category}") | |
| 100 | - if tags_text: | |
| 101 | - parts.append(f"tags={tags_text}") | |
| 102 | - return f"{idx}. " + " | ".join(part for part in parts if part) | |
| 91 | + parts.append(f"{option2}") | |
| 92 | + return f"{idx}. " + " ".join(part for part in parts if part) | |
| 103 | 93 | |
| 104 | 94 | |
| 105 | 95 | def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]: |
| ... | ... | @@ -109,8 +99,7 @@ def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]: |
| 109 | 99 | "image_url": doc.get("image_url"), |
| 110 | 100 | "vendor": pick_text(doc.get("vendor"), "en"), |
| 111 | 101 | "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"), |
| 112 | - "option_values": list(compact_option_values(doc.get("skus") or [])), | |
| 113 | - "tags": list((doc.get("tags") or [])[:6]), | |
| 102 | + "option_values": list(compact_option_values(doc.get("skus") or [])) | |
| 114 | 103 | } |
| 115 | 104 | |
| 116 | 105 | ... | ... |
scripts/evaluation/start_eval.sh
| ... | ... | @@ -7,46 +7,33 @@ cd "$ROOT" |
| 7 | 7 | PY="${ROOT}/.venv/bin/python" |
| 8 | 8 | TENANT_ID="${TENANT_ID:-163}" |
| 9 | 9 | QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" |
| 10 | -START_FROM_QUERY="${REPO_EVAL_START_FROM_QUERY:-}" | |
| 11 | 10 | |
| 12 | 11 | usage() { |
| 13 | 12 | echo "Usage: $0 batch|batch-rebuild|serve" |
| 14 | 13 | echo " batch → batch eval: live search every query, LLM only for missing labels (top_k=50)" |
| 15 | 14 | echo " batch-rebuild → deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" |
| 16 | 15 | echo " serve → eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" |
| 17 | - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, REPO_EVAL_START_FROM_QUERY, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" | |
| 16 | + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" | |
| 18 | 17 | } |
| 19 | 18 | |
| 20 | 19 | case "${1:-}" in |
| 21 | 20 | batch) |
| 22 | - cmd=( | |
| 23 | - "$PY" scripts/evaluation/build_annotation_set.py batch | |
| 24 | - --tenant-id "$TENANT_ID" | |
| 25 | - --queries-file "$QUERIES" | |
| 26 | - --top-k 50 | |
| 21 | + exec "$PY" scripts/evaluation/build_annotation_set.py batch \ | |
| 22 | + --tenant-id "$TENANT_ID" \ | |
| 23 | + --queries-file "$QUERIES" \ | |
| 24 | + --top-k 50 \ | |
| 27 | 25 | --language en |
| 28 | - ) | |
| 29 | - if [ -n "$START_FROM_QUERY" ]; then | |
| 30 | - cmd+=(--start-from-query "$START_FROM_QUERY") | |
| 31 | - fi | |
| 32 | - exec "${cmd[@]}" | |
| 33 | 26 | ;; |
| 34 | 27 | batch-rebuild) |
| 35 | - cmd=( | |
| 36 | - "$PY" scripts/evaluation/build_annotation_set.py build | |
| 37 | - --tenant-id "$TENANT_ID" | |
| 38 | - --queries-file "$QUERIES" | |
| 39 | - --search-depth 500 | |
| 40 | - --rerank-depth 10000 | |
| 41 | - --reset-artifacts | |
| 42 | - --force-refresh-rerank | |
| 43 | - --force-refresh-labels | |
| 28 | + exec "$PY" scripts/evaluation/build_annotation_set.py build \ | |
| 29 | + --tenant-id "$TENANT_ID" \ | |
| 30 | + --queries-file "$QUERIES" \ | |
| 31 | + --search-depth 500 \ | |
| 32 | + --rerank-depth 10000 \ | |
| 33 | + --reset-artifacts \ | |
| 34 | + --force-refresh-rerank \ | |
| 35 | + --force-refresh-labels \ | |
| 44 | 36 | --language en |
| 45 | - ) | |
| 46 | - if [ -n "$START_FROM_QUERY" ]; then | |
| 47 | - cmd+=(--start-from-query "$START_FROM_QUERY") | |
| 48 | - fi | |
| 49 | - exec "${cmd[@]}" | |
| 50 | 37 | ;; |
| 51 | 38 | serve) |
| 52 | 39 | EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" | ... | ... |