Commit 286e9b4f2263cb08873b9f3ef506a304070c0861

Authored by tangwang
1 parent 310bb3bc

evaluation

scripts/evaluation/eval_framework/cli.py
@@ -17,16 +17,6 @@ from .web_app import create_web_app @@ -17,16 +17,6 @@ from .web_app import create_web_app
17 _cli_log = logging.getLogger("search_eval.cli") 17 _cli_log = logging.getLogger("search_eval.cli")
18 18
19 19
20 -def _filter_queries_from_start(queries: list[str], start_from_query: str | None) -> list[str]:  
21 - if not start_from_query:  
22 - return queries  
23 - try:  
24 - start_idx = queries.index(start_from_query)  
25 - except ValueError as exc:  
26 - raise SystemExit(f"start-from-query not found in queries file: {start_from_query!r}") from exc  
27 - return queries[start_idx:]  
28 -  
29 -  
30 def _reset_build_artifacts() -> None: 20 def _reset_build_artifacts() -> None:
31 from config.loader import get_app_config 21 from config.loader import get_app_config
32 22
@@ -238,11 +228,6 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -238,11 +228,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
238 help="Default: search_evaluation.default_language.", 228 help="Default: search_evaluation.default_language.",
239 ) 229 )
240 build.add_argument( 230 build.add_argument(
241 - "--start-from-query",  
242 - default=None,  
243 - help="Start processing from this exact query text in the queries file.",  
244 - )  
245 - build.add_argument(  
246 "--reset-artifacts", 231 "--reset-artifacts",
247 action="store_true", 232 action="store_true",
248 help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.", 233 help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.",
@@ -257,11 +242,6 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -257,11 +242,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
257 batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.") 242 batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
258 batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.") 243 batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.")
259 batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") 244 batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
260 - batch.add_argument(  
261 - "--start-from-query",  
262 - default=None,  
263 - help="Start processing from this exact query text in the queries file.",  
264 - )  
265 batch.add_argument("--force-refresh-labels", action="store_true") 245 batch.add_argument("--force-refresh-labels", action="store_true")
266 add_judge_llm_args(batch) 246 add_judge_llm_args(batch)
267 add_intent_llm_args(batch) 247 add_intent_llm_args(batch)
@@ -272,11 +252,6 @@ def build_cli_parser() -> argparse.ArgumentParser: @@ -272,11 +252,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
272 audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.") 252 audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.")
273 audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.") 253 audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
274 audit.add_argument( 254 audit.add_argument(
275 - "--start-from-query",  
276 - default=None,  
277 - help="Start processing from this exact query text in the queries file.",  
278 - )  
279 - audit.add_argument(  
280 "--limit-suspicious", 255 "--limit-suspicious",
281 type=int, 256 type=int,
282 default=None, 257 default=None,
@@ -302,7 +277,6 @@ def run_build(args: argparse.Namespace) -> None: @@ -302,7 +277,6 @@ def run_build(args: argparse.Namespace) -> None:
302 _reset_build_artifacts() 277 _reset_build_artifacts()
303 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) 278 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
304 queries = framework.queries_from_file(Path(args.queries_file)) 279 queries = framework.queries_from_file(Path(args.queries_file))
305 - queries = _filter_queries_from_start(queries, args.start_from_query)  
306 summary = [] 280 summary = []
307 rebuild_kwargs = {} 281 rebuild_kwargs = {}
308 if args.force_refresh_labels: 282 if args.force_refresh_labels:
@@ -320,17 +294,21 @@ def run_build(args: argparse.Namespace) -> None: @@ -320,17 +294,21 @@ def run_build(args: argparse.Namespace) -> None:
320 total_q = len(queries) 294 total_q = len(queries)
321 for q_index, query in enumerate(queries, start=1): 295 for q_index, query in enumerate(queries, start=1):
322 _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query) 296 _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query)
323 - result = framework.build_query_annotation_set(  
324 - query=query,  
325 - search_depth=args.search_depth,  
326 - rerank_depth=args.rerank_depth,  
327 - annotate_search_top_k=args.annotate_search_top_k,  
328 - annotate_rerank_top_k=args.annotate_rerank_top_k,  
329 - language=args.language,  
330 - force_refresh_rerank=args.force_refresh_rerank,  
331 - force_refresh_labels=args.force_refresh_labels,  
332 - **rebuild_kwargs,  
333 - ) 297 + try:
  298 + result = framework.build_query_annotation_set(
  299 + query=query,
  300 + search_depth=args.search_depth,
  301 + rerank_depth=args.rerank_depth,
  302 + annotate_search_top_k=args.annotate_search_top_k,
  303 + annotate_rerank_top_k=args.annotate_rerank_top_k,
  304 + language=args.language,
  305 + force_refresh_rerank=args.force_refresh_rerank,
  306 + force_refresh_labels=args.force_refresh_labels,
  307 + **rebuild_kwargs,
  308 + )
  309 + except Exception:
  310 + _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q)
  311 + raise
334 summary.append( 312 summary.append(
335 { 313 {
336 "query": result.query, 314 "query": result.query,
@@ -358,22 +336,24 @@ def run_build(args: argparse.Namespace) -> None: @@ -358,22 +336,24 @@ def run_build(args: argparse.Namespace) -> None:
358 def run_batch(args: argparse.Namespace) -> None: 336 def run_batch(args: argparse.Namespace) -> None:
359 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) 337 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
360 queries = framework.queries_from_file(Path(args.queries_file)) 338 queries = framework.queries_from_file(Path(args.queries_file))
361 - queries = _filter_queries_from_start(queries, args.start_from_query)  
362 _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries)) 339 _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries))
363 - payload = framework.batch_evaluate(  
364 - queries=queries,  
365 - top_k=args.top_k,  
366 - auto_annotate=True,  
367 - language=args.language,  
368 - force_refresh_labels=args.force_refresh_labels,  
369 - ) 340 + try:
  341 + payload = framework.batch_evaluate(
  342 + queries=queries,
  343 + top_k=args.top_k,
  344 + auto_annotate=True,
  345 + language=args.language,
  346 + force_refresh_labels=args.force_refresh_labels,
  347 + )
  348 + except Exception:
  349 + _cli_log.exception("[batch] failed while evaluating query list from %s", args.queries_file)
  350 + raise
370 _cli_log.info("[done] batch_id=%s aggregate_metrics=%s", payload["batch_id"], payload["aggregate_metrics"]) 351 _cli_log.info("[done] batch_id=%s aggregate_metrics=%s", payload["batch_id"], payload["aggregate_metrics"])
371 352
372 353
373 def run_audit(args: argparse.Namespace) -> None: 354 def run_audit(args: argparse.Namespace) -> None:
374 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args)) 355 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
375 queries = framework.queries_from_file(Path(args.queries_file)) 356 queries = framework.queries_from_file(Path(args.queries_file))
376 - queries = _filter_queries_from_start(queries, args.start_from_query)  
377 audit_items = [] 357 audit_items = []
378 for query in queries: 358 for query in queries:
379 item = framework.audit_live_query( 359 item = framework.audit_live_query(
scripts/evaluation/eval_framework/clients.py
@@ -212,6 +212,8 @@ class DashScopeLabelClient: @@ -212,6 +212,8 @@ class DashScopeLabelClient:
212 self.enable_thinking = bool(enable_thinking) 212 self.enable_thinking = bool(enable_thinking)
213 self.use_batch = bool(use_batch) 213 self.use_batch = bool(use_batch)
214 self.session = requests.Session() 214 self.session = requests.Session()
  215 + self.retry_attempts = 4
  216 + self.retry_delay_sec = 3.0
215 217
216 def _auth_headers(self) -> Dict[str, str]: 218 def _auth_headers(self) -> Dict[str, str]:
217 return {"Authorization": f"Bearer {self.api_key}"} 219 return {"Authorization": f"Bearer {self.api_key}"}
@@ -320,18 +322,41 @@ class DashScopeLabelClient: @@ -320,18 +322,41 @@ class DashScopeLabelClient:
320 return content, safe_json_dumps(row) 322 return content, safe_json_dumps(row)
321 323
322 def _chat(self, prompt: str, *, phase: str = "chat") -> Tuple[str, str]: 324 def _chat(self, prompt: str, *, phase: str = "chat") -> Tuple[str, str]:
323 - if not self.use_batch:  
324 - content, raw = self._chat_sync(prompt)  
325 - else: 325 + last_exc: Exception | None = None
  326 + for attempt in range(1, self.retry_attempts + 1):
326 try: 327 try:
327 - content, raw = self._chat_batch(prompt)  
328 - except requests.exceptions.HTTPError as e:  
329 - resp = getattr(e, "response", None)  
330 - if resp is not None and resp.status_code == 404:  
331 - self.use_batch = False 328 + if not self.use_batch:
332 content, raw = self._chat_sync(prompt) 329 content, raw = self._chat_sync(prompt)
333 else: 330 else:
  331 + try:
  332 + content, raw = self._chat_batch(prompt)
  333 + except requests.exceptions.HTTPError as e:
  334 + resp = getattr(e, "response", None)
  335 + if resp is not None and resp.status_code == 404:
  336 + self.use_batch = False
  337 + content, raw = self._chat_sync(prompt)
  338 + else:
  339 + raise
  340 + break
  341 + except Exception as exc:
  342 + last_exc = exc
  343 + is_request_error = isinstance(exc, requests.exceptions.RequestException)
  344 + if not is_request_error or attempt >= self.retry_attempts:
334 raise 345 raise
  346 + _client_log.warning(
  347 + "Transient DashScope error, retrying (%s/%s): phase=%s model=%s use_batch=%s error=%s",
  348 + attempt,
  349 + self.retry_attempts,
  350 + phase,
  351 + self.model,
  352 + self.use_batch,
  353 + exc,
  354 + )
  355 + time.sleep(self.retry_delay_sec)
  356 + else:
  357 + if last_exc is not None:
  358 + raise last_exc
  359 + raise RuntimeError(f"unexpected DashScope retry state for phase={phase}")
335 _log_eval_llm_verbose( 360 _log_eval_llm_verbose(
336 phase=phase, 361 phase=phase,
337 model=self.model, 362 model=self.model,
scripts/evaluation/eval_framework/framework.py
@@ -335,6 +335,12 @@ class SearchEvaluationFramework: @@ -335,6 +335,12 @@ class SearchEvaluationFramework:
335 ) 335 )
336 return [(labels, raw_response, docs)] 336 return [(labels, raw_response, docs)]
337 except Exception: 337 except Exception:
  338 + _log.exception(
  339 + "[eval-rebuild] classify failed query=%r docs=%s; %s",
  340 + query,
  341 + len(docs),
  342 + "splitting batch" if len(docs) > 1 else "single-doc failure",
  343 + )
338 if len(docs) == 1: 344 if len(docs) == 1:
339 raise 345 raise
340 mid = len(docs) // 2 346 mid = len(docs) // 2
@@ -382,6 +388,15 @@ class SearchEvaluationFramework: @@ -382,6 +388,15 @@ class SearchEvaluationFramework:
382 if not batch_docs: 388 if not batch_docs:
383 break 389 break
384 390
  391 + _log.info(
  392 + "[eval-rebuild] query=%r starting llm_batch=%s/%s size=%s offset=%s",
  393 + query,
  394 + batch_idx + 1,
  395 + max_batches,
  396 + len(batch_docs),
  397 + start,
  398 + )
  399 +
385 batch_pairs = self._classify_with_retry(query, batch_docs, force_refresh=force_refresh) 400 batch_pairs = self._classify_with_retry(query, batch_docs, force_refresh=force_refresh)
386 for sub_labels, raw_response, sub_batch in batch_pairs: 401 for sub_labels, raw_response, sub_batch in batch_pairs:
387 to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)} 402 to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
scripts/evaluation/eval_framework/utils.py
@@ -84,22 +84,12 @@ def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str: @@ -84,22 +84,12 @@ def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
84 option1, option2, option3 = compact_option_values(doc.get("skus") or []) 84 option1, option2, option3 = compact_option_values(doc.get("skus") or [])
85 vendor = pick_text(doc.get("vendor"), "en") 85 vendor = pick_text(doc.get("vendor"), "en")
86 category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en") 86 category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
87 - tags = doc.get("tags") or []  
88 - tags_text = ", ".join(str(tag) for tag in tags[:4] if tag)  
89 parts = [title] 87 parts = [title]
90 if option1: 88 if option1:
91 - parts.append(f"option1={option1}") 89 + parts.append(f"{option1}")
92 if option2: 90 if option2:
93 - parts.append(f"option2={option2}")  
94 - if option3:  
95 - parts.append(f"option3={option3}")  
96 - if vendor:  
97 - parts.append(f"vendor={vendor}")  
98 - if category:  
99 - parts.append(f"category={category}")  
100 - if tags_text:  
101 - parts.append(f"tags={tags_text}")  
102 - return f"{idx}. " + " | ".join(part for part in parts if part) 91 + parts.append(f"{option2}")
  92 + return f"{idx}. " + " ".join(part for part in parts if part)
103 93
104 94
105 def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]: 95 def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
@@ -109,8 +99,7 @@ def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]: @@ -109,8 +99,7 @@ def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
109 "image_url": doc.get("image_url"), 99 "image_url": doc.get("image_url"),
110 "vendor": pick_text(doc.get("vendor"), "en"), 100 "vendor": pick_text(doc.get("vendor"), "en"),
111 "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"), 101 "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"),
112 - "option_values": list(compact_option_values(doc.get("skus") or [])),  
113 - "tags": list((doc.get("tags") or [])[:6]), 102 + "option_values": list(compact_option_values(doc.get("skus") or []))
114 } 103 }
115 104
116 105
scripts/evaluation/start_eval.sh
@@ -7,46 +7,33 @@ cd "$ROOT" @@ -7,46 +7,33 @@ cd "$ROOT"
7 PY="${ROOT}/.venv/bin/python" 7 PY="${ROOT}/.venv/bin/python"
8 TENANT_ID="${TENANT_ID:-163}" 8 TENANT_ID="${TENANT_ID:-163}"
9 QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" 9 QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
10 -START_FROM_QUERY="${REPO_EVAL_START_FROM_QUERY:-}"  
11 10
12 usage() { 11 usage() {
13 echo "Usage: $0 batch|batch-rebuild|serve" 12 echo "Usage: $0 batch|batch-rebuild|serve"
14 echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)" 13 echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)"
15 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" 14 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)"
16 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)" 15 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)"
17 - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, REPO_EVAL_START_FROM_QUERY, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" 16 + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
18 } 17 }
19 18
20 case "${1:-}" in 19 case "${1:-}" in
21 batch) 20 batch)
22 - cmd=(  
23 - "$PY" scripts/evaluation/build_annotation_set.py batch  
24 - --tenant-id "$TENANT_ID"  
25 - --queries-file "$QUERIES"  
26 - --top-k 50 21 + exec "$PY" scripts/evaluation/build_annotation_set.py batch \
  22 + --tenant-id "$TENANT_ID" \
  23 + --queries-file "$QUERIES" \
  24 + --top-k 50 \
27 --language en 25 --language en
28 - )  
29 - if [ -n "$START_FROM_QUERY" ]; then  
30 - cmd+=(--start-from-query "$START_FROM_QUERY")  
31 - fi  
32 - exec "${cmd[@]}"  
33 ;; 26 ;;
34 batch-rebuild) 27 batch-rebuild)
35 - cmd=(  
36 - "$PY" scripts/evaluation/build_annotation_set.py build  
37 - --tenant-id "$TENANT_ID"  
38 - --queries-file "$QUERIES"  
39 - --search-depth 500  
40 - --rerank-depth 10000  
41 - --reset-artifacts  
42 - --force-refresh-rerank  
43 - --force-refresh-labels 28 + exec "$PY" scripts/evaluation/build_annotation_set.py build \
  29 + --tenant-id "$TENANT_ID" \
  30 + --queries-file "$QUERIES" \
  31 + --search-depth 500 \
  32 + --rerank-depth 10000 \
  33 + --reset-artifacts \
  34 + --force-refresh-rerank \
  35 + --force-refresh-labels \
44 --language en 36 --language en
45 - )  
46 - if [ -n "$START_FROM_QUERY" ]; then  
47 - cmd+=(--start-from-query "$START_FROM_QUERY")  
48 - fi  
49 - exec "${cmd[@]}"  
50 ;; 37 ;;
51 serve) 38 serve)
52 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" 39 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}"