Commit cdd8ee3a3dfe584e2e32a46c7567ea95c2dafac2

Authored by tangwang
1 parent 35ae3b29

eval框架日志独立

现在的行为(按你的路径)
用途	路径(相对仓库根 PROJECT_ROOT)
评估主日志(CLI + framework 的 INFO)	logs/eval.log
LLM 全量 prompt / 原始响应	logs/verbose/eval_verbose.log
实现要点:

constants.py:EVAL_LOG_DIR、EVAL_VERBOSE_LOG_DIR、EVAL_LOG_FILE、EVAL_VERBOSE_LOG_FILE。
logging_setup.py:setup_eval_logging() 给名为 search_eval 的 logger 挂
文件 + stderr 两个 handler,只初始化一次;build_annotation_set.py / serve_eval_web.py
都会走到 eval_framework.cli.main(),其开头会先调用该函数。
cli.py:原来的 print 改为 search_eval.cli 的 logging.info;启动时写一条
CLI start command=... log_file=... 到 logs/eval.log。
framework.py:rebuild 相关 print 改为 search_eval.framework 的
logging.info。
clients.py:verbose 输出改为写入
logs/verbose/eval_verbose.log;首次需要时调用 setup_eval_logging(),并用
search_eval logger 的 info 提示 verbose 文件路径(不再用 print)。
api/result_formatter.py
... ... @@ -76,6 +76,13 @@ class ResultFormatter:
76 76 category_path = pick_lang_field(source, "category_path")
77 77 category_name = pick_lang_field(source, "category_name_text") or source.get("category_name")
78 78  
  79 + # tags: core-language object {"en": "a,b", "zh": "..."} from indexer
  80 + tags: Optional[List[str]] = None
  81 + if isinstance(source.get("tags"), dict):
  82 + tags_txt = pick_lang_field(source, "tags")
  83 + if tags_txt:
  84 + tags = [t.strip() for t in str(tags_txt).split(",") if t.strip()] or None
  85 +
79 86 # Extract SKUs
80 87 skus = []
81 88 skus_data = source.get('skus', [])
... ... @@ -129,7 +136,7 @@ class ResultFormatter:
129 136 category1_name=source.get('category1_name'),
130 137 category2_name=source.get('category2_name'),
131 138 category3_name=source.get('category3_name'),
132   - tags=source.get('tags'),
  139 + tags=tags,
133 140 price=source.get('min_price'),
134 141 compare_at_price=source.get('compare_at_price'),
135 142 currency="USD", # Default currency
... ...
scripts/evaluation/eval_framework/cli.py
... ... @@ -4,10 +4,13 @@ from __future__ import annotations
4 4  
5 5 import argparse
6 6 import json
  7 +import logging
7 8 from pathlib import Path
8 9 from typing import Any, Dict
9 10  
10 11 from .constants import (
  12 + DEFAULT_INTENT_ENABLE_THINKING,
  13 + DEFAULT_INTENT_MODEL,
11 14 DEFAULT_QUERY_FILE,
12 15 DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
13 16 DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
... ... @@ -19,10 +22,14 @@ from .constants import (
19 22 DEFAULT_RERANK_HIGH_THRESHOLD,
20 23 DEFAULT_SEARCH_RECALL_TOP_K,
21 24 )
  25 +from .constants import EVAL_LOG_FILE
22 26 from .framework import SearchEvaluationFramework
  27 +from .logging_setup import setup_eval_logging
23 28 from .utils import ensure_dir, utc_now_iso, utc_timestamp
24 29 from .web_app import create_web_app
25 30  
  31 +_cli_log = logging.getLogger("search_eval.cli")
  32 +
26 33  
27 34 def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
28 35 p.add_argument(
... ... @@ -45,6 +52,21 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
45 52 )
46 53  
47 54  
  55 +def add_intent_llm_args(p: argparse.ArgumentParser) -> None:
  56 + p.add_argument(
  57 + "--intent-model",
  58 + default=None,
  59 + metavar="MODEL",
  60 + help=f"Query-intent LLM model before relevance judging (default: {DEFAULT_INTENT_MODEL!r}).",
  61 + )
  62 + p.add_argument(
  63 + "--intent-enable-thinking",
  64 + action=argparse.BooleanOptionalAction,
  65 + default=None,
  66 + help=f"enable_thinking for intent model (default: {DEFAULT_INTENT_ENABLE_THINKING}).",
  67 + )
  68 +
  69 +
48 70 def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]:
49 71 kw: Dict[str, Any] = {}
50 72 if args.judge_model is not None:
... ... @@ -53,6 +75,10 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]:
53 75 kw["enable_thinking"] = args.enable_thinking
54 76 if args.dashscope_batch is not None:
55 77 kw["use_dashscope_batch"] = args.dashscope_batch
  78 + if getattr(args, "intent_model", None) is not None:
  79 + kw["intent_model"] = args.intent_model
  80 + if getattr(args, "intent_enable_thinking", None) is not None:
  81 + kw["intent_enable_thinking"] = args.intent_enable_thinking
56 82 return kw
57 83  
58 84  
... ... @@ -110,6 +136,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
110 136 build.add_argument("--force-refresh-rerank", action="store_true")
111 137 build.add_argument("--force-refresh-labels", action="store_true")
112 138 add_judge_llm_args(build)
  139 + add_intent_llm_args(build)
113 140  
114 141 batch = sub.add_parser("batch", help="Run batch evaluation against live search")
115 142 batch.add_argument("--tenant-id", default="163")
... ... @@ -118,6 +145,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
118 145 batch.add_argument("--language", default="en")
119 146 batch.add_argument("--force-refresh-labels", action="store_true")
120 147 add_judge_llm_args(batch)
  148 + add_intent_llm_args(batch)
121 149  
122 150 audit = sub.add_parser("audit", help="Audit annotation quality for queries")
123 151 audit.add_argument("--tenant-id", default="163")
... ... @@ -127,6 +155,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
127 155 audit.add_argument("--limit-suspicious", type=int, default=5)
128 156 audit.add_argument("--force-refresh-labels", action="store_true")
129 157 add_judge_llm_args(audit)
  158 + add_intent_llm_args(audit)
130 159  
131 160 serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
132 161 serve.add_argument("--tenant-id", default="163")
... ... @@ -134,6 +163,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
134 163 serve.add_argument("--host", default="0.0.0.0")
135 164 serve.add_argument("--port", type=int, default=6010)
136 165 add_judge_llm_args(serve)
  166 + add_intent_llm_args(serve)
137 167  
138 168 return parser
139 169  
... ... @@ -183,14 +213,18 @@ def run_build(args: argparse.Namespace) -> None:
183 213 "output_json_path": str(result.output_json_path),
184 214 }
185 215 )
186   - print(
187   - f"[build] query={result.query!r} search_total={result.search_total} "
188   - f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
189   - f"annotated={result.annotated_count} output={result.output_json_path}"
  216 + _cli_log.info(
  217 + "[build] query=%r search_total=%s search_depth=%s corpus=%s annotated=%s output=%s",
  218 + result.query,
  219 + result.search_total,
  220 + result.search_depth,
  221 + result.rerank_corpus_size,
  222 + result.annotated_count,
  223 + result.output_json_path,
190 224 )
191 225 out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
192 226 out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
193   - print(f"[done] summary={out_path}")
  227 + _cli_log.info("[done] summary=%s", out_path)
194 228  
195 229  
196 230 def run_batch(args: argparse.Namespace) -> None:
... ... @@ -203,7 +237,7 @@ def run_batch(args: argparse.Namespace) -> None:
203 237 language=args.language,
204 238 force_refresh_labels=args.force_refresh_labels,
205 239 )
206   - print(f"[done] batch_id={payload['batch_id']} aggregate_metrics={payload['aggregate_metrics']}")
  240 + _cli_log.info("[done] batch_id=%s aggregate_metrics=%s", payload["batch_id"], payload["aggregate_metrics"])
207 241  
208 242  
209 243 def run_audit(args: argparse.Namespace) -> None:
... ... @@ -239,8 +273,11 @@ def run_audit(args: argparse.Namespace) -> None:
239 273 "suspicious_examples": item["suspicious"][: args.limit_suspicious],
240 274 }
241 275 )
242   - print(
243   - f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}"
  276 + _cli_log.info(
  277 + "[audit] query=%r suspicious=%s metrics=%s",
  278 + query,
  279 + len(item["suspicious"]),
  280 + item["metrics"],
244 281 )
245 282  
246 283 summary = {
... ... @@ -253,7 +290,7 @@ def run_audit(args: argparse.Namespace) -> None:
253 290 }
254 291 out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"
255 292 out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
256   - print(f"[done] audit={out_path}")
  293 + _cli_log.info("[done] audit=%s", out_path)
257 294  
258 295  
259 296 def run_serve(args: argparse.Namespace) -> None:
... ... @@ -265,8 +302,15 @@ def run_serve(args: argparse.Namespace) -> None:
265 302  
266 303  
267 304 def main() -> None:
  305 + setup_eval_logging()
268 306 parser = build_cli_parser()
269 307 args = parser.parse_args()
  308 + logging.getLogger("search_eval").info(
  309 + "CLI start command=%s tenant_id=%s log_file=%s",
  310 + args.command,
  311 + getattr(args, "tenant_id", ""),
  312 + EVAL_LOG_FILE.resolve(),
  313 + )
270 314 if args.command == "build":
271 315 run_build(args)
272 316 return
... ...
scripts/evaluation/eval_framework/clients.py
... ... @@ -4,16 +4,67 @@ from __future__ import annotations
4 4  
5 5 import io
6 6 import json
  7 +import logging
  8 +import threading
7 9 import time
8 10 import uuid
9 11 from typing import Any, Dict, List, Optional, Sequence, Tuple
10 12  
11 13 import requests
12 14  
13   -from .constants import VALID_LABELS
14   -from .prompts import classify_prompt
  15 +from .constants import EVAL_VERBOSE_LOG_FILE, VALID_LABELS
  16 +from .logging_setup import setup_eval_logging
  17 +from .prompts import classify_prompt, intent_analysis_prompt
15 18 from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
16 19  
  20 +_VERBOSE_LOGGER_LOCK = threading.Lock()
  21 +_eval_llm_verbose_logger_singleton: logging.Logger | None = None
  22 +_eval_llm_verbose_path_logged = False
  23 +
  24 +
  25 +def _get_eval_llm_verbose_logger() -> logging.Logger:
  26 + """File logger for full LLM prompts/responses → ``logs/verbose/eval_verbose.log``."""
  27 + setup_eval_logging()
  28 + global _eval_llm_verbose_logger_singleton, _eval_llm_verbose_path_logged
  29 + with _VERBOSE_LOGGER_LOCK:
  30 + if _eval_llm_verbose_logger_singleton is not None:
  31 + return _eval_llm_verbose_logger_singleton
  32 + log_path = EVAL_VERBOSE_LOG_FILE
  33 + log_path.parent.mkdir(parents=True, exist_ok=True)
  34 + lg = logging.getLogger("search_eval.verbose_llm")
  35 + lg.setLevel(logging.INFO)
  36 + if not lg.handlers:
  37 + handler = logging.FileHandler(log_path, encoding="utf-8")
  38 + handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
  39 + lg.addHandler(handler)
  40 + lg.propagate = False
  41 + _eval_llm_verbose_logger_singleton = lg
  42 + if not _eval_llm_verbose_path_logged:
  43 + _eval_llm_verbose_path_logged = True
  44 + logging.getLogger("search_eval").info(
  45 + "LLM verbose I/O log (full prompt + response): %s",
  46 + log_path.resolve(),
  47 + )
  48 + return lg
  49 +
  50 +
  51 +def _log_eval_llm_verbose(
  52 + *,
  53 + phase: str,
  54 + model: str,
  55 + prompt: str,
  56 + assistant_text: str,
  57 + raw_response: str,
  58 +) -> None:
  59 + log = _get_eval_llm_verbose_logger()
  60 + sep = "=" * 80
  61 + log.info("\n%s", sep)
  62 + log.info("phase=%s model=%s", phase, model)
  63 + log.info("%s\nFULL PROMPT (user message)\n%s", sep, prompt)
  64 + log.info("%s\nASSISTANT CONTENT (parsed)\n%s", sep, assistant_text)
  65 + log.info("%s\nRAW RESPONSE (JSON string)\n%s", sep, raw_response)
  66 + log.info("%s\n", sep)
  67 +
17 68  
18 69 def _canonicalize_judge_label(raw: str) -> str | None:
19 70 s = str(raw or "").strip().strip('"').strip("'")
... ... @@ -208,17 +259,27 @@ class DashScopeLabelClient:
208 259 content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
209 260 return content, safe_json_dumps(row)
210 261  
211   - def _chat(self, prompt: str) -> Tuple[str, str]:
  262 + def _chat(self, prompt: str, *, phase: str = "chat") -> Tuple[str, str]:
212 263 if not self.use_batch:
213   - return self._chat_sync(prompt)
214   - try:
215   - return self._chat_batch(prompt)
216   - except requests.exceptions.HTTPError as e:
217   - resp = getattr(e, "response", None)
218   - if resp is not None and resp.status_code == 404:
219   - self.use_batch = False
220   - return self._chat_sync(prompt)
221   - raise
  264 + content, raw = self._chat_sync(prompt)
  265 + else:
  266 + try:
  267 + content, raw = self._chat_batch(prompt)
  268 + except requests.exceptions.HTTPError as e:
  269 + resp = getattr(e, "response", None)
  270 + if resp is not None and resp.status_code == 404:
  271 + self.use_batch = False
  272 + content, raw = self._chat_sync(prompt)
  273 + else:
  274 + raise
  275 + _log_eval_llm_verbose(
  276 + phase=phase,
  277 + model=self.model,
  278 + prompt=prompt,
  279 + assistant_text=content,
  280 + raw_response=raw,
  281 + )
  282 + return content, raw
222 283  
223 284 def _find_batch_line_for_custom_id(
224 285 self,
... ... @@ -242,14 +303,20 @@ class DashScopeLabelClient:
242 303 return obj
243 304 return None
244 305  
  306 + def query_intent(self, query: str) -> Tuple[str, str]:
  307 + prompt = intent_analysis_prompt(query)
  308 + return self._chat(prompt, phase="query_intent")
  309 +
245 310 def classify_batch(
246 311 self,
247 312 query: str,
248 313 docs: Sequence[Dict[str, Any]],
  314 + *,
  315 + query_intent_block: str = "",
249 316 ) -> Tuple[List[str], str]:
250 317 numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
251   - prompt = classify_prompt(query, numbered_docs)
252   - content, raw_response = self._chat(prompt)
  318 + prompt = classify_prompt(query, numbered_docs, query_intent_block=query_intent_block)
  319 + content, raw_response = self._chat(prompt, phase="relevance_classify")
253 320 labels: List[str] = []
254 321 for line in str(content or "").splitlines():
255 322 canon = _canonicalize_judge_label(line)
... ...
scripts/evaluation/eval_framework/constants.py
... ... @@ -34,10 +34,20 @@ def normalize_stored_label(label: str) -> str:
34 34 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
35 35 DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
36 36  
  37 +# Logging (``build_annotation_set.py`` / ``serve_eval_web.py`` → ``eval_framework.cli.main``)
  38 +EVAL_LOG_DIR = PROJECT_ROOT / "logs"
  39 +EVAL_VERBOSE_LOG_DIR = EVAL_LOG_DIR / "verbose"
  40 +EVAL_LOG_FILE = EVAL_LOG_DIR / "eval.log"
  41 +EVAL_VERBOSE_LOG_FILE = EVAL_VERBOSE_LOG_DIR / "eval_verbose.log"
  42 +
37 43 # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
38   -DEFAULT_JUDGE_MODEL = "qwen3.5-flash"
39   -DEFAULT_JUDGE_ENABLE_THINKING = True
  44 +DEFAULT_JUDGE_MODEL = "qwen3.5-plus"
  45 +DEFAULT_JUDGE_ENABLE_THINKING = False
40 46 DEFAULT_JUDGE_DASHSCOPE_BATCH = False
  47 +
  48 +# Query-intent LLM (separate from judge; used once per query, injected into relevance prompts)
  49 +DEFAULT_INTENT_MODEL = "qwen-max"
  50 +DEFAULT_INTENT_ENABLE_THINKING = True
41 51 DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
42 52 DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0
43 53  
... ... @@ -60,6 +70,6 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40
60 70 # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant").
61 71 # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak
62 72 # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
63   -DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939
  73 +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799
64 74 DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959
65 75 DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3
... ...
scripts/evaluation/eval_framework/framework.py
... ... @@ -3,6 +3,7 @@
3 3 from __future__ import annotations
4 4  
5 5 import json
  6 +import logging
6 7 import time
7 8 from pathlib import Path
8 9 from typing import Any, Dict, List, Sequence, Tuple
... ... @@ -17,6 +18,8 @@ from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceCli
17 18 from .constants import (
18 19 DEFAULT_ARTIFACT_ROOT,
19 20 DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW,
  21 + DEFAULT_INTENT_ENABLE_THINKING,
  22 + DEFAULT_INTENT_MODEL,
20 23 DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC,
21 24 DEFAULT_JUDGE_DASHSCOPE_BATCH,
22 25 DEFAULT_JUDGE_ENABLE_THINKING,
... ... @@ -52,6 +55,8 @@ from .utils import (
52 55 zh_title_from_multilingual,
53 56 )
54 57  
  58 +_log = logging.getLogger("search_eval.framework")
  59 +
55 60  
56 61 def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]:
57 62 """Map ``spu_id`` -> Chinese title from ``debug_info.per_result[].title_multilingual``."""
... ... @@ -80,6 +85,8 @@ class SearchEvaluationFramework:
80 85 judge_model: str | None = None,
81 86 enable_thinking: bool | None = None,
82 87 use_dashscope_batch: bool | None = None,
  88 + intent_model: str | None = None,
  89 + intent_enable_thinking: bool | None = None,
83 90 ):
84 91 init_service(get_app_config().infrastructure.elasticsearch.host)
85 92 self.tenant_id = str(tenant_id)
... ... @@ -109,6 +116,24 @@ class SearchEvaluationFramework:
109 116 enable_thinking=et,
110 117 use_batch=use_batch,
111 118 )
  119 + intent_m = str(intent_model or DEFAULT_INTENT_MODEL)
  120 + intent_et = DEFAULT_INTENT_ENABLE_THINKING if intent_enable_thinking is None else intent_enable_thinking
  121 + self.intent_client = DashScopeLabelClient(
  122 + model=intent_m,
  123 + base_url=str(llm_cfg["base_url"]),
  124 + api_key=str(api_key),
  125 + batch_completion_window=batch_window,
  126 + batch_poll_interval_sec=batch_poll,
  127 + enable_thinking=bool(intent_et),
  128 + use_batch=False,
  129 + )
  130 + self._query_intent_cache: Dict[str, str] = {}
  131 +
  132 + def _ensure_query_intent_block(self, query: str) -> str:
  133 + if query not in self._query_intent_cache:
  134 + text, _raw = self.intent_client.query_intent(query)
  135 + self._query_intent_cache[query] = str(text or "").strip()
  136 + return self._query_intent_cache[query]
112 137  
113 138 def audit_live_query(
114 139 self,
... ... @@ -310,7 +335,10 @@ class SearchEvaluationFramework:
310 335 if not docs:
311 336 return []
312 337 try:
313   - labels, raw_response = self.label_client.classify_batch(query, docs)
  338 + intent_block = self._ensure_query_intent_block(query)
  339 + labels, raw_response = self.label_client.classify_batch(
  340 + query, docs, query_intent_block=intent_block
  341 + )
314 342 return [(labels, raw_response, docs)]
315 343 except Exception:
316 344 if len(docs) == 1:
... ... @@ -392,11 +420,16 @@ class SearchEvaluationFramework:
392 420 "offset_end": min(start + n, total_ordered),
393 421 }
394 422 batch_logs.append(log_entry)
395   - print(
396   - f"[eval-rebuild] query={query!r} llm_batch={batch_idx + 1}/{max_batches} "
397   - f"size={n} exact_ratio={exact_ratio:.4f} irrelevant_ratio={irrelevant_ratio:.4f} "
398   - f"irrel_plus_low_ratio={irrel_low_ratio:.4f}",
399   - flush=True,
  423 + _log.info(
  424 + "[eval-rebuild] query=%r llm_batch=%s/%s size=%s exact_ratio=%.4f irrelevant_ratio=%.4f "
  425 + "irrel_plus_low_ratio=%.4f",
  426 + query,
  427 + batch_idx + 1,
  428 + max_batches,
  429 + n,
  430 + exact_ratio,
  431 + irrelevant_ratio,
  432 + irrel_low_ratio,
400 433 )
401 434  
402 435 # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality).
... ... @@ -409,11 +442,14 @@ class SearchEvaluationFramework:
409 442 else:
410 443 streak = 0
411 444 if streak >= stop_streak:
412   - print(
413   - f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches "
414   - f"({stop_streak} consecutive batches: irrelevant>{irrelevant_stop_ratio} "
415   - f"and irrel+low>{irrelevant_low_combined_stop_ratio})",
416   - flush=True,
  445 + _log.info(
  446 + "[eval-rebuild] query=%r early_stop after %s batches (%s consecutive batches: "
  447 + "irrelevant>%s and irrel+low>%s)",
  448 + query,
  449 + batch_idx + 1,
  450 + stop_streak,
  451 + irrelevant_stop_ratio,
  452 + irrelevant_low_combined_stop_ratio,
417 453 )
418 454 break
419 455  
... ... @@ -626,11 +662,13 @@ class SearchEvaluationFramework:
626 662 if rerank_high_n > int(rerank_high_skip_count):
627 663 skipped = True
628 664 skip_reason = "too_many_high_rerank_scores"
629   - print(
630   - f"[eval-rebuild] query={query!r} skip: rerank_score>{rerank_high_threshold} "
631   - f"outside recall pool count={rerank_high_n} > {rerank_high_skip_count} "
632   - f"(relevant tail too large / query too easy to satisfy)",
633   - flush=True,
  665 + _log.info(
  666 + "[eval-rebuild] query=%r skip: rerank_score>%s outside recall pool count=%s > %s "
  667 + "(relevant tail too large / query too easy to satisfy)",
  668 + query,
  669 + rerank_high_threshold,
  670 + rerank_high_n,
  671 + rerank_high_skip_count,
634 672 )
635 673 else:
636 674 ordered_docs: List[Dict[str, Any]] = []
... ...
scripts/evaluation/eval_framework/logging_setup.py 0 → 100644
... ... @@ -0,0 +1,35 @@
  1 +"""Configure dedicated eval run logs under repo ``logs/`` (see ``constants.EVAL_*_LOG_*``)."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +import logging
  6 +import sys
  7 +
  8 +from .constants import EVAL_LOG_DIR, EVAL_LOG_FILE, EVAL_VERBOSE_LOG_DIR
  9 +
  10 +_setup_done = False
  11 +
  12 +
  13 +def setup_eval_logging() -> None:
  14 + """Attach file + stderr handlers to ``search_eval`` once; ensure log directories exist."""
  15 + global _setup_done
  16 + if _setup_done:
  17 + return
  18 +
  19 + EVAL_LOG_DIR.mkdir(parents=True, exist_ok=True)
  20 + EVAL_VERBOSE_LOG_DIR.mkdir(parents=True, exist_ok=True)
  21 +
  22 + fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
  23 + root = logging.getLogger("search_eval")
  24 + root.setLevel(logging.INFO)
  25 + if root.handlers:
  26 + _setup_done = True
  27 + return
  28 + fh = logging.FileHandler(EVAL_LOG_FILE, encoding="utf-8")
  29 + fh.setFormatter(fmt)
  30 + sh = logging.StreamHandler(sys.stderr)
  31 + sh.setFormatter(fmt)
  32 + root.addHandler(fh)
  33 + root.addHandler(sh)
  34 + root.propagate = False
  35 + _setup_done = True
... ...
scripts/evaluation/eval_framework/prompts.py
... ... @@ -4,6 +4,54 @@ from __future__ import annotations
4 4  
5 5 from typing import Sequence
6 6  
  7 +
  8 +_QUERY_INTENT_ANALYSIS_TEMPLATE_EN = """You are an intent analysis expert for a fashion e-commerce search system.
  9 +
  10 +Given a user's search query, analyze the shopping intent behind the query in the context of fashion and apparel e-commerce, and summarize the user's core search need in one concise sentence.
  11 +Also provide the Chinese translation and English translation of the query.
  12 +
  13 +Requirements:
  14 +- Keep the intent analysis concise and easy to understand, using 1 to 3 short sentences.
  15 +- Stay grounded in the original query and summarize the user's likely shopping intent without adding unnecessary context.
  16 +- When the query is vague or ambiguous, take a conservative approach and keep the analysis close to the original wording.
  17 +- Chinese translation: if the original query is already in Chinese, keep it unchanged.
  18 +- English translation: if the original query is already in English, keep it unchanged.
  19 +- Do not output anything other than the required three-line format.
  20 +
  21 +Output format (strictly exactly three lines):
  22 +Intent: concise analysis of the user's search intent
  23 +Query中文翻译: Chinese translation of the query
  24 +Query English translation: English translation of the query
  25 +
  26 +Now analyze the following query:
  27 +
  28 +Query: {query}
  29 +"""
  30 +
  31 +_QUERY_INTENT_ANALYSIS_RESULT_TEMPLATE_ZH = """
  32 +你是一个æœè£…å“类电商æœç´¢æ„图分æžä¸“家。
  33 +
  34 +给定用户输入的æœç´¢è¯ï¼Œè¯·åœ¨æœè£…å“类电商场景下,分æžè¯¥æœç´¢è¯èƒŒåŽçš„购物æ„图,并用一å¥è¯ç®€è¦æè¿°ç”¨æˆ·çš„æ ¸å¿ƒæœç´¢éœ€æ±‚。
  35 +åŒæ—¶ï¼Œæä¾›è¯¥æœç´¢è¯çš„中文翻译和英文翻译。
  36 +
  37 +è¦æ±‚:
  38 +- æ„图分æžåº”ç®€æ´æ˜“懂,用 1 到 3 å¥çŸ­å¥æ¦‚括用户的æœç´¢æ„图。
  39 +- ç»“åˆ query 本身,尽é‡è´´è¿‘用户原始æœç´¢éœ€æ±‚è¿›è¡Œæ€»ç»“ï¼Œä¸æ·»åŠ ä¸å¿…è¦çš„背景ã€å»¶ä¼¸æˆ–臆测。
  40 +- 如果 query ä¸å¤Ÿæ˜Žç¡®æˆ–有歧义,应ä¿å®ˆå¤„ç†ï¼Œå°½é‡ä¿æŒä¸ŽåŽŸè¯è¡¨è¾¾ä¸€è‡´ã€‚
  41 +- 中文翻译:如果原始 query 本身就是中文,则按原样输出。
  42 +- 英文翻译:如果原始 query 本身就是英文,则按原样输出。
  43 +- 除指定格å¼å¤–,ä¸è¦è¾“出任何é¢å¤–说明。
  44 +
  45 +输出格å¼ï¼ˆä¸¥æ ¼æŒ‰ä¸‰è¡Œè¾“出):
  46 +Intent: 对用户æœç´¢æ„图的简æ´åˆ†æž
  47 +Query中文翻译: query 的中文翻译
  48 +Query English translation: query 的英文翻译
  49 +
  50 +现在请分æžä»¥ä¸‹æœç´¢è¯ï¼š
  51 +
  52 +Query: {query}
  53 +"""
  54 +
7 55 _CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system.
8 56 Given a user query and the information for each product, assign a relevance label to each product.
9 57  
... ... @@ -136,7 +184,7 @@ Typical examples:
136 184 - If the attribute is not mentioned or cannot be confirmed, prefer **High Relevant**;
137 185 - Only treat it as a conflict when the product information clearly shows the opposite of the query requirement.
138 186  
139   -Query: {query}
  187 +Query: {query}{intent_suffix}
140 188  
141 189 Products:
142 190 {lines}
... ... @@ -276,7 +324,7 @@ _CLASSIFY_TEMPLATE_ZH = """你是一个æœé¥°ç”µå•†æœç´¢ç³»ç»Ÿä¸­çš„相关性åˆ
276 324 - 未æåŠ / 无法确认,优先按“基本相关â€å¤„ç†ï¼›
277 325 - åªæœ‰å½“商å“ä¿¡æ¯æ˜Žç¡®æ˜¾ç¤ºä¸ŽæŸ¥è¯¢è¦æ±‚ç›¸åæ—¶ï¼Œæ‰è§†ä¸ºå±žæ€§å†²çªã€‚
278 326  
279   -查询:{query}
  327 +查询:{query}{intent_suffix}
280 328  
281 329 商å“:
282 330 {lines}
... ... @@ -293,7 +341,17 @@ _CLASSIFY_TEMPLATE_ZH = """你是一个æœé¥°ç”µå•†æœç´¢ç³»ç»Ÿä¸­çš„相关性åˆ
293 341 """
294 342  
295 343  
296   -def classify_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
  344 +def intent_analysis_prompt(query: str) -> str:
  345 + return _QUERY_INTENT_ANALYSIS_TEMPLATE_EN.format(query=query)
  346 +
  347 +
  348 +def classify_prompt(
  349 + query: str,
  350 + numbered_doc_lines: Sequence[str],
  351 + *,
  352 + query_intent_block: str = "",
  353 +) -> str:
297 354 lines = "\n".join(numbered_doc_lines)
298 355 n = len(numbered_doc_lines)
299   - return _CLASSIFY_TEMPLATE_EN.format(query=query, lines=lines, n=n)
  356 + intent_suffix = f"\n{query_intent_block.strip()}" if query_intent_block and query_intent_block.strip() else ""
  357 + return _CLASSIFY_TEMPLATE_EN.format(query=query, intent_suffix=intent_suffix, lines=lines, n=n)
... ...