Commit 286e9b4f2263cb08873b9f3ef506a304070c0861

Authored by tangwang
1 parent 310bb3bc

evaluation: drop --start-from-query option; add DashScope retry and per-query failure logging; compact label prompt/payload fields

scripts/evaluation/eval_framework/cli.py
... ... @@ -17,16 +17,6 @@ from .web_app import create_web_app
17 17 _cli_log = logging.getLogger("search_eval.cli")
18 18  
19 19  
20   -def _filter_queries_from_start(queries: list[str], start_from_query: str | None) -> list[str]:
21   - if not start_from_query:
22   - return queries
23   - try:
24   - start_idx = queries.index(start_from_query)
25   - except ValueError as exc:
26   - raise SystemExit(f"start-from-query not found in queries file: {start_from_query!r}") from exc
27   - return queries[start_idx:]
28   -
29   -
30 20 def _reset_build_artifacts() -> None:
31 21 from config.loader import get_app_config
32 22  
... ... @@ -238,11 +228,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
238 228 help="Default: search_evaluation.default_language.",
239 229 )
240 230 build.add_argument(
241   - "--start-from-query",
242   - default=None,
243   - help="Start processing from this exact query text in the queries file.",
244   - )
245   - build.add_argument(
246 231 "--reset-artifacts",
247 232 action="store_true",
248 233 help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.",
... ... @@ -257,11 +242,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
257 242 batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
258 243 batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.")
259 244 batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
260   - batch.add_argument(
261   - "--start-from-query",
262   - default=None,
263   - help="Start processing from this exact query text in the queries file.",
264   - )
265 245 batch.add_argument("--force-refresh-labels", action="store_true")
266 246 add_judge_llm_args(batch)
267 247 add_intent_llm_args(batch)
... ... @@ -272,11 +252,6 @@ def build_cli_parser() -> argparse.ArgumentParser:
272 252 audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.")
273 253 audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
274 254 audit.add_argument(
275   - "--start-from-query",
276   - default=None,
277   - help="Start processing from this exact query text in the queries file.",
278   - )
279   - audit.add_argument(
280 255 "--limit-suspicious",
281 256 type=int,
282 257 default=None,
... ... @@ -302,7 +277,6 @@ def run_build(args: argparse.Namespace) -> None:
302 277 _reset_build_artifacts()
303 278 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
304 279 queries = framework.queries_from_file(Path(args.queries_file))
305   - queries = _filter_queries_from_start(queries, args.start_from_query)
306 280 summary = []
307 281 rebuild_kwargs = {}
308 282 if args.force_refresh_labels:
... ... @@ -320,17 +294,21 @@ def run_build(args: argparse.Namespace) -> None:
320 294 total_q = len(queries)
321 295 for q_index, query in enumerate(queries, start=1):
322 296 _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query)
323   - result = framework.build_query_annotation_set(
324   - query=query,
325   - search_depth=args.search_depth,
326   - rerank_depth=args.rerank_depth,
327   - annotate_search_top_k=args.annotate_search_top_k,
328   - annotate_rerank_top_k=args.annotate_rerank_top_k,
329   - language=args.language,
330   - force_refresh_rerank=args.force_refresh_rerank,
331   - force_refresh_labels=args.force_refresh_labels,
332   - **rebuild_kwargs,
333   - )
  297 + try:
  298 + result = framework.build_query_annotation_set(
  299 + query=query,
  300 + search_depth=args.search_depth,
  301 + rerank_depth=args.rerank_depth,
  302 + annotate_search_top_k=args.annotate_search_top_k,
  303 + annotate_rerank_top_k=args.annotate_rerank_top_k,
  304 + language=args.language,
  305 + force_refresh_rerank=args.force_refresh_rerank,
  306 + force_refresh_labels=args.force_refresh_labels,
  307 + **rebuild_kwargs,
  308 + )
  309 + except Exception:
  310 + _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q)
  311 + raise
334 312 summary.append(
335 313 {
336 314 "query": result.query,
... ... @@ -358,22 +336,24 @@ def run_build(args: argparse.Namespace) -> None:
358 336 def run_batch(args: argparse.Namespace) -> None:
359 337 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
360 338 queries = framework.queries_from_file(Path(args.queries_file))
361   - queries = _filter_queries_from_start(queries, args.start_from_query)
362 339 _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries))
363   - payload = framework.batch_evaluate(
364   - queries=queries,
365   - top_k=args.top_k,
366   - auto_annotate=True,
367   - language=args.language,
368   - force_refresh_labels=args.force_refresh_labels,
369   - )
  340 + try:
  341 + payload = framework.batch_evaluate(
  342 + queries=queries,
  343 + top_k=args.top_k,
  344 + auto_annotate=True,
  345 + language=args.language,
  346 + force_refresh_labels=args.force_refresh_labels,
  347 + )
  348 + except Exception:
  349 + _cli_log.exception("[batch] failed while evaluating query list from %s", args.queries_file)
  350 + raise
370 351 _cli_log.info("[done] batch_id=%s aggregate_metrics=%s", payload["batch_id"], payload["aggregate_metrics"])
371 352  
372 353  
373 354 def run_audit(args: argparse.Namespace) -> None:
374 355 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
375 356 queries = framework.queries_from_file(Path(args.queries_file))
376   - queries = _filter_queries_from_start(queries, args.start_from_query)
377 357 audit_items = []
378 358 for query in queries:
379 359 item = framework.audit_live_query(
... ...
scripts/evaluation/eval_framework/clients.py
... ... @@ -212,6 +212,8 @@ class DashScopeLabelClient:
212 212 self.enable_thinking = bool(enable_thinking)
213 213 self.use_batch = bool(use_batch)
214 214 self.session = requests.Session()
  215 + self.retry_attempts = 4
  216 + self.retry_delay_sec = 3.0
215 217  
216 218 def _auth_headers(self) -> Dict[str, str]:
217 219 return {"Authorization": f"Bearer {self.api_key}"}
... ... @@ -320,18 +322,41 @@ class DashScopeLabelClient:
320 322 return content, safe_json_dumps(row)
321 323  
322 324 def _chat(self, prompt: str, *, phase: str = "chat") -> Tuple[str, str]:
323   - if not self.use_batch:
324   - content, raw = self._chat_sync(prompt)
325   - else:
  325 + last_exc: Exception | None = None
  326 + for attempt in range(1, self.retry_attempts + 1):
326 327 try:
327   - content, raw = self._chat_batch(prompt)
328   - except requests.exceptions.HTTPError as e:
329   - resp = getattr(e, "response", None)
330   - if resp is not None and resp.status_code == 404:
331   - self.use_batch = False
  328 + if not self.use_batch:
332 329 content, raw = self._chat_sync(prompt)
333 330 else:
  331 + try:
  332 + content, raw = self._chat_batch(prompt)
  333 + except requests.exceptions.HTTPError as e:
  334 + resp = getattr(e, "response", None)
  335 + if resp is not None and resp.status_code == 404:
  336 + self.use_batch = False
  337 + content, raw = self._chat_sync(prompt)
  338 + else:
  339 + raise
  340 + break
  341 + except Exception as exc:
  342 + last_exc = exc
  343 + is_request_error = isinstance(exc, requests.exceptions.RequestException)
  344 + if not is_request_error or attempt >= self.retry_attempts:
334 345 raise
  346 + _client_log.warning(
  347 + "Transient DashScope error, retrying (%s/%s): phase=%s model=%s use_batch=%s error=%s",
  348 + attempt,
  349 + self.retry_attempts,
  350 + phase,
  351 + self.model,
  352 + self.use_batch,
  353 + exc,
  354 + )
  355 + time.sleep(self.retry_delay_sec)
  356 + else:
  357 + if last_exc is not None:
  358 + raise last_exc
  359 + raise RuntimeError(f"unexpected DashScope retry state for phase={phase}")
335 360 _log_eval_llm_verbose(
336 361 phase=phase,
337 362 model=self.model,
... ...
scripts/evaluation/eval_framework/framework.py
... ... @@ -335,6 +335,12 @@ class SearchEvaluationFramework:
335 335 )
336 336 return [(labels, raw_response, docs)]
337 337 except Exception:
  338 + _log.exception(
  339 + "[eval-rebuild] classify failed query=%r docs=%s; %s",
  340 + query,
  341 + len(docs),
  342 + "splitting batch" if len(docs) > 1 else "single-doc failure",
  343 + )
338 344 if len(docs) == 1:
339 345 raise
340 346 mid = len(docs) // 2
... ... @@ -382,6 +388,15 @@ class SearchEvaluationFramework:
382 388 if not batch_docs:
383 389 break
384 390  
  391 + _log.info(
  392 + "[eval-rebuild] query=%r starting llm_batch=%s/%s size=%s offset=%s",
  393 + query,
  394 + batch_idx + 1,
  395 + max_batches,
  396 + len(batch_docs),
  397 + start,
  398 + )
  399 +
385 400 batch_pairs = self._classify_with_retry(query, batch_docs, force_refresh=force_refresh)
386 401 for sub_labels, raw_response, sub_batch in batch_pairs:
387 402 to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
... ...
scripts/evaluation/eval_framework/utils.py
... ... @@ -84,22 +84,12 @@ def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
84 84 option1, option2, option3 = compact_option_values(doc.get("skus") or [])
85 85 vendor = pick_text(doc.get("vendor"), "en")
86 86 category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
87   - tags = doc.get("tags") or []
88   - tags_text = ", ".join(str(tag) for tag in tags[:4] if tag)
89 87 parts = [title]
90 88 if option1:
91   - parts.append(f"option1={option1}")
  89 + parts.append(f"{option1}")
92 90 if option2:
93   - parts.append(f"option2={option2}")
94   - if option3:
95   - parts.append(f"option3={option3}")
96   - if vendor:
97   - parts.append(f"vendor={vendor}")
98   - if category:
99   - parts.append(f"category={category}")
100   - if tags_text:
101   - parts.append(f"tags={tags_text}")
102   - return f"{idx}. " + " | ".join(part for part in parts if part)
  91 + parts.append(f"{option2}")
  92 + return f"{idx}. " + " ".join(part for part in parts if part)
103 93  
104 94  
105 95 def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
... ... @@ -109,8 +99,7 @@ def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
109 99 "image_url": doc.get("image_url"),
110 100 "vendor": pick_text(doc.get("vendor"), "en"),
111 101 "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"),
112   - "option_values": list(compact_option_values(doc.get("skus") or [])),
113   - "tags": list((doc.get("tags") or [])[:6]),
  102 + "option_values": list(compact_option_values(doc.get("skus") or []))
114 103 }
115 104  
116 105  
... ...
scripts/evaluation/start_eval.sh
... ... @@ -7,46 +7,33 @@ cd "$ROOT"
7 7 PY="${ROOT}/.venv/bin/python"
8 8 TENANT_ID="${TENANT_ID:-163}"
9 9 QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
10   -START_FROM_QUERY="${REPO_EVAL_START_FROM_QUERY:-}"
11 10  
12 11 usage() {
13 12 echo "Usage: $0 batch|batch-rebuild|serve"
14 13 echo " batch โ€” batch eval: live search every query, LLM only for missing labels (top_k=50)"
15 14 echo " batch-rebuild โ€” deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)"
16 15 echo " serve โ€” eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)"
17   - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, REPO_EVAL_START_FROM_QUERY, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
  16 + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
18 17 }
19 18  
20 19 case "${1:-}" in
21 20 batch)
22   - cmd=(
23   - "$PY" scripts/evaluation/build_annotation_set.py batch
24   - --tenant-id "$TENANT_ID"
25   - --queries-file "$QUERIES"
26   - --top-k 50
  21 + exec "$PY" scripts/evaluation/build_annotation_set.py batch \
  22 + --tenant-id "$TENANT_ID" \
  23 + --queries-file "$QUERIES" \
  24 + --top-k 50 \
27 25 --language en
28   - )
29   - if [ -n "$START_FROM_QUERY" ]; then
30   - cmd+=(--start-from-query "$START_FROM_QUERY")
31   - fi
32   - exec "${cmd[@]}"
33 26 ;;
34 27 batch-rebuild)
35   - cmd=(
36   - "$PY" scripts/evaluation/build_annotation_set.py build
37   - --tenant-id "$TENANT_ID"
38   - --queries-file "$QUERIES"
39   - --search-depth 500
40   - --rerank-depth 10000
41   - --reset-artifacts
42   - --force-refresh-rerank
43   - --force-refresh-labels
  28 + exec "$PY" scripts/evaluation/build_annotation_set.py build \
  29 + --tenant-id "$TENANT_ID" \
  30 + --queries-file "$QUERIES" \
  31 + --search-depth 500 \
  32 + --rerank-depth 10000 \
  33 + --reset-artifacts \
  34 + --force-refresh-rerank \
  35 + --force-refresh-labels \
44 36 --language en
45   - )
46   - if [ -n "$START_FROM_QUERY" ]; then
47   - cmd+=(--start-from-query "$START_FROM_QUERY")
48   - fi
49   - exec "${cmd[@]}"
50 37 ;;
51 38 serve)
52 39 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}"
... ...