Commit cdd8ee3a3dfe584e2e32a46c7567ea95c2dafac2
1 parent
35ae3b29
eval框架日志独立
现在的行为(按你的路径):

| 用途 | 路径(相对仓库根 PROJECT_ROOT) |
| --- | --- |
| 评估主日志(CLI + framework 的 INFO) | logs/eval.log |
| LLM 全量 prompt / 原始响应 | logs/verbose/eval_verbose.log |

实现要点:
- constants.py:EVAL_LOG_DIR、EVAL_VERBOSE_LOG_DIR、EVAL_LOG_FILE、EVAL_VERBOSE_LOG_FILE。
- logging_setup.py:setup_eval_logging() 给名为 search_eval 的 logger 挂文件 + stderr,只初始化一次;build_annotation_set.py / serve_eval_web.py 走的 eval_framework.cli.main() 开头会先调用。
- cli.py:原来的 print 改为 search_eval.cli 的 logging.info;启动时写一条 CLI start command=... log_file=... 到 logs/eval.log。
- framework.py:rebuild 相关 print 改为 search_eval.framework 的 logging.info。
- clients.py:verbose 改为写入 logs/verbose/eval_verbose.log;首次需要时调用 setup_eval_logging(),并用 search_eval.info 提示 verbose 文件路径(不再用 print)。
Showing 7 changed files with 306 additions and 47 deletions. (Show diff stats)
api/result_formatter.py
| @@ -76,6 +76,13 @@ class ResultFormatter: | @@ -76,6 +76,13 @@ class ResultFormatter: | ||
| 76 | category_path = pick_lang_field(source, "category_path") | 76 | category_path = pick_lang_field(source, "category_path") |
| 77 | category_name = pick_lang_field(source, "category_name_text") or source.get("category_name") | 77 | category_name = pick_lang_field(source, "category_name_text") or source.get("category_name") |
| 78 | 78 | ||
| 79 | + # tags: core-language object {"en": "a,b", "zh": "..."} from indexer | ||
| 80 | + tags: Optional[List[str]] = None | ||
| 81 | + if isinstance(source.get("tags"), dict): | ||
| 82 | + tags_txt = pick_lang_field(source, "tags") | ||
| 83 | + if tags_txt: | ||
| 84 | + tags = [t.strip() for t in str(tags_txt).split(",") if t.strip()] or None | ||
| 85 | + | ||
| 79 | # Extract SKUs | 86 | # Extract SKUs |
| 80 | skus = [] | 87 | skus = [] |
| 81 | skus_data = source.get('skus', []) | 88 | skus_data = source.get('skus', []) |
| @@ -129,7 +136,7 @@ class ResultFormatter: | @@ -129,7 +136,7 @@ class ResultFormatter: | ||
| 129 | category1_name=source.get('category1_name'), | 136 | category1_name=source.get('category1_name'), |
| 130 | category2_name=source.get('category2_name'), | 137 | category2_name=source.get('category2_name'), |
| 131 | category3_name=source.get('category3_name'), | 138 | category3_name=source.get('category3_name'), |
| 132 | - tags=source.get('tags'), | 139 | + tags=tags, |
| 133 | price=source.get('min_price'), | 140 | price=source.get('min_price'), |
| 134 | compare_at_price=source.get('compare_at_price'), | 141 | compare_at_price=source.get('compare_at_price'), |
| 135 | currency="USD", # Default currency | 142 | currency="USD", # Default currency |
scripts/evaluation/eval_framework/cli.py
| @@ -4,10 +4,13 @@ from __future__ import annotations | @@ -4,10 +4,13 @@ from __future__ import annotations | ||
| 4 | 4 | ||
| 5 | import argparse | 5 | import argparse |
| 6 | import json | 6 | import json |
| 7 | +import logging | ||
| 7 | from pathlib import Path | 8 | from pathlib import Path |
| 8 | from typing import Any, Dict | 9 | from typing import Any, Dict |
| 9 | 10 | ||
| 10 | from .constants import ( | 11 | from .constants import ( |
| 12 | + DEFAULT_INTENT_ENABLE_THINKING, | ||
| 13 | + DEFAULT_INTENT_MODEL, | ||
| 11 | DEFAULT_QUERY_FILE, | 14 | DEFAULT_QUERY_FILE, |
| 12 | DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, | 15 | DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, |
| 13 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, | 16 | DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, |
| @@ -19,10 +22,14 @@ from .constants import ( | @@ -19,10 +22,14 @@ from .constants import ( | ||
| 19 | DEFAULT_RERANK_HIGH_THRESHOLD, | 22 | DEFAULT_RERANK_HIGH_THRESHOLD, |
| 20 | DEFAULT_SEARCH_RECALL_TOP_K, | 23 | DEFAULT_SEARCH_RECALL_TOP_K, |
| 21 | ) | 24 | ) |
| 25 | +from .constants import EVAL_LOG_FILE | ||
| 22 | from .framework import SearchEvaluationFramework | 26 | from .framework import SearchEvaluationFramework |
| 27 | +from .logging_setup import setup_eval_logging | ||
| 23 | from .utils import ensure_dir, utc_now_iso, utc_timestamp | 28 | from .utils import ensure_dir, utc_now_iso, utc_timestamp |
| 24 | from .web_app import create_web_app | 29 | from .web_app import create_web_app |
| 25 | 30 | ||
| 31 | +_cli_log = logging.getLogger("search_eval.cli") | ||
| 32 | + | ||
| 26 | 33 | ||
| 27 | def add_judge_llm_args(p: argparse.ArgumentParser) -> None: | 34 | def add_judge_llm_args(p: argparse.ArgumentParser) -> None: |
| 28 | p.add_argument( | 35 | p.add_argument( |
| @@ -45,6 +52,21 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None: | @@ -45,6 +52,21 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None: | ||
| 45 | ) | 52 | ) |
| 46 | 53 | ||
| 47 | 54 | ||
| 55 | +def add_intent_llm_args(p: argparse.ArgumentParser) -> None: | ||
| 56 | + p.add_argument( | ||
| 57 | + "--intent-model", | ||
| 58 | + default=None, | ||
| 59 | + metavar="MODEL", | ||
| 60 | + help=f"Query-intent LLM model before relevance judging (default: {DEFAULT_INTENT_MODEL!r}).", | ||
| 61 | + ) | ||
| 62 | + p.add_argument( | ||
| 63 | + "--intent-enable-thinking", | ||
| 64 | + action=argparse.BooleanOptionalAction, | ||
| 65 | + default=None, | ||
| 66 | + help=f"enable_thinking for intent model (default: {DEFAULT_INTENT_ENABLE_THINKING}).", | ||
| 67 | + ) | ||
| 68 | + | ||
| 69 | + | ||
| 48 | def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]: | 70 | def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]: |
| 49 | kw: Dict[str, Any] = {} | 71 | kw: Dict[str, Any] = {} |
| 50 | if args.judge_model is not None: | 72 | if args.judge_model is not None: |
| @@ -53,6 +75,10 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]: | @@ -53,6 +75,10 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]: | ||
| 53 | kw["enable_thinking"] = args.enable_thinking | 75 | kw["enable_thinking"] = args.enable_thinking |
| 54 | if args.dashscope_batch is not None: | 76 | if args.dashscope_batch is not None: |
| 55 | kw["use_dashscope_batch"] = args.dashscope_batch | 77 | kw["use_dashscope_batch"] = args.dashscope_batch |
| 78 | + if getattr(args, "intent_model", None) is not None: | ||
| 79 | + kw["intent_model"] = args.intent_model | ||
| 80 | + if getattr(args, "intent_enable_thinking", None) is not None: | ||
| 81 | + kw["intent_enable_thinking"] = args.intent_enable_thinking | ||
| 56 | return kw | 82 | return kw |
| 57 | 83 | ||
| 58 | 84 | ||
| @@ -110,6 +136,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -110,6 +136,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 110 | build.add_argument("--force-refresh-rerank", action="store_true") | 136 | build.add_argument("--force-refresh-rerank", action="store_true") |
| 111 | build.add_argument("--force-refresh-labels", action="store_true") | 137 | build.add_argument("--force-refresh-labels", action="store_true") |
| 112 | add_judge_llm_args(build) | 138 | add_judge_llm_args(build) |
| 139 | + add_intent_llm_args(build) | ||
| 113 | 140 | ||
| 114 | batch = sub.add_parser("batch", help="Run batch evaluation against live search") | 141 | batch = sub.add_parser("batch", help="Run batch evaluation against live search") |
| 115 | batch.add_argument("--tenant-id", default="163") | 142 | batch.add_argument("--tenant-id", default="163") |
| @@ -118,6 +145,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -118,6 +145,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 118 | batch.add_argument("--language", default="en") | 145 | batch.add_argument("--language", default="en") |
| 119 | batch.add_argument("--force-refresh-labels", action="store_true") | 146 | batch.add_argument("--force-refresh-labels", action="store_true") |
| 120 | add_judge_llm_args(batch) | 147 | add_judge_llm_args(batch) |
| 148 | + add_intent_llm_args(batch) | ||
| 121 | 149 | ||
| 122 | audit = sub.add_parser("audit", help="Audit annotation quality for queries") | 150 | audit = sub.add_parser("audit", help="Audit annotation quality for queries") |
| 123 | audit.add_argument("--tenant-id", default="163") | 151 | audit.add_argument("--tenant-id", default="163") |
| @@ -127,6 +155,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -127,6 +155,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 127 | audit.add_argument("--limit-suspicious", type=int, default=5) | 155 | audit.add_argument("--limit-suspicious", type=int, default=5) |
| 128 | audit.add_argument("--force-refresh-labels", action="store_true") | 156 | audit.add_argument("--force-refresh-labels", action="store_true") |
| 129 | add_judge_llm_args(audit) | 157 | add_judge_llm_args(audit) |
| 158 | + add_intent_llm_args(audit) | ||
| 130 | 159 | ||
| 131 | serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") | 160 | serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") |
| 132 | serve.add_argument("--tenant-id", default="163") | 161 | serve.add_argument("--tenant-id", default="163") |
| @@ -134,6 +163,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -134,6 +163,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 134 | serve.add_argument("--host", default="0.0.0.0") | 163 | serve.add_argument("--host", default="0.0.0.0") |
| 135 | serve.add_argument("--port", type=int, default=6010) | 164 | serve.add_argument("--port", type=int, default=6010) |
| 136 | add_judge_llm_args(serve) | 165 | add_judge_llm_args(serve) |
| 166 | + add_intent_llm_args(serve) | ||
| 137 | 167 | ||
| 138 | return parser | 168 | return parser |
| 139 | 169 | ||
| @@ -183,14 +213,18 @@ def run_build(args: argparse.Namespace) -> None: | @@ -183,14 +213,18 @@ def run_build(args: argparse.Namespace) -> None: | ||
| 183 | "output_json_path": str(result.output_json_path), | 213 | "output_json_path": str(result.output_json_path), |
| 184 | } | 214 | } |
| 185 | ) | 215 | ) |
| 186 | - print( | ||
| 187 | - f"[build] query={result.query!r} search_total={result.search_total} " | ||
| 188 | - f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} " | ||
| 189 | - f"annotated={result.annotated_count} output={result.output_json_path}" | 216 | + _cli_log.info( |
| 217 | + "[build] query=%r search_total=%s search_depth=%s corpus=%s annotated=%s output=%s", | ||
| 218 | + result.query, | ||
| 219 | + result.search_total, | ||
| 220 | + result.search_depth, | ||
| 221 | + result.rerank_corpus_size, | ||
| 222 | + result.annotated_count, | ||
| 223 | + result.output_json_path, | ||
| 190 | ) | 224 | ) |
| 191 | out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json" | 225 | out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json" |
| 192 | out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") | 226 | out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") |
| 193 | - print(f"[done] summary={out_path}") | 227 | + _cli_log.info("[done] summary=%s", out_path) |
| 194 | 228 | ||
| 195 | 229 | ||
| 196 | def run_batch(args: argparse.Namespace) -> None: | 230 | def run_batch(args: argparse.Namespace) -> None: |
| @@ -203,7 +237,7 @@ def run_batch(args: argparse.Namespace) -> None: | @@ -203,7 +237,7 @@ def run_batch(args: argparse.Namespace) -> None: | ||
| 203 | language=args.language, | 237 | language=args.language, |
| 204 | force_refresh_labels=args.force_refresh_labels, | 238 | force_refresh_labels=args.force_refresh_labels, |
| 205 | ) | 239 | ) |
| 206 | - print(f"[done] batch_id={payload['batch_id']} aggregate_metrics={payload['aggregate_metrics']}") | 240 | + _cli_log.info("[done] batch_id=%s aggregate_metrics=%s", payload["batch_id"], payload["aggregate_metrics"]) |
| 207 | 241 | ||
| 208 | 242 | ||
| 209 | def run_audit(args: argparse.Namespace) -> None: | 243 | def run_audit(args: argparse.Namespace) -> None: |
| @@ -239,8 +273,11 @@ def run_audit(args: argparse.Namespace) -> None: | @@ -239,8 +273,11 @@ def run_audit(args: argparse.Namespace) -> None: | ||
| 239 | "suspicious_examples": item["suspicious"][: args.limit_suspicious], | 273 | "suspicious_examples": item["suspicious"][: args.limit_suspicious], |
| 240 | } | 274 | } |
| 241 | ) | 275 | ) |
| 242 | - print( | ||
| 243 | - f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}" | 276 | + _cli_log.info( |
| 277 | + "[audit] query=%r suspicious=%s metrics=%s", | ||
| 278 | + query, | ||
| 279 | + len(item["suspicious"]), | ||
| 280 | + item["metrics"], | ||
| 244 | ) | 281 | ) |
| 245 | 282 | ||
| 246 | summary = { | 283 | summary = { |
| @@ -253,7 +290,7 @@ def run_audit(args: argparse.Namespace) -> None: | @@ -253,7 +290,7 @@ def run_audit(args: argparse.Namespace) -> None: | ||
| 253 | } | 290 | } |
| 254 | out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json" | 291 | out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json" |
| 255 | out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") | 292 | out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") |
| 256 | - print(f"[done] audit={out_path}") | 293 | + _cli_log.info("[done] audit=%s", out_path) |
| 257 | 294 | ||
| 258 | 295 | ||
| 259 | def run_serve(args: argparse.Namespace) -> None: | 296 | def run_serve(args: argparse.Namespace) -> None: |
| @@ -265,8 +302,15 @@ def run_serve(args: argparse.Namespace) -> None: | @@ -265,8 +302,15 @@ def run_serve(args: argparse.Namespace) -> None: | ||
| 265 | 302 | ||
| 266 | 303 | ||
| 267 | def main() -> None: | 304 | def main() -> None: |
| 305 | + setup_eval_logging() | ||
| 268 | parser = build_cli_parser() | 306 | parser = build_cli_parser() |
| 269 | args = parser.parse_args() | 307 | args = parser.parse_args() |
| 308 | + logging.getLogger("search_eval").info( | ||
| 309 | + "CLI start command=%s tenant_id=%s log_file=%s", | ||
| 310 | + args.command, | ||
| 311 | + getattr(args, "tenant_id", ""), | ||
| 312 | + EVAL_LOG_FILE.resolve(), | ||
| 313 | + ) | ||
| 270 | if args.command == "build": | 314 | if args.command == "build": |
| 271 | run_build(args) | 315 | run_build(args) |
| 272 | return | 316 | return |
scripts/evaluation/eval_framework/clients.py
| @@ -4,16 +4,67 @@ from __future__ import annotations | @@ -4,16 +4,67 @@ from __future__ import annotations | ||
| 4 | 4 | ||
| 5 | import io | 5 | import io |
| 6 | import json | 6 | import json |
| 7 | +import logging | ||
| 8 | +import threading | ||
| 7 | import time | 9 | import time |
| 8 | import uuid | 10 | import uuid |
| 9 | from typing import Any, Dict, List, Optional, Sequence, Tuple | 11 | from typing import Any, Dict, List, Optional, Sequence, Tuple |
| 10 | 12 | ||
| 11 | import requests | 13 | import requests |
| 12 | 14 | ||
| 13 | -from .constants import VALID_LABELS | ||
| 14 | -from .prompts import classify_prompt | 15 | +from .constants import EVAL_VERBOSE_LOG_FILE, VALID_LABELS |
| 16 | +from .logging_setup import setup_eval_logging | ||
| 17 | +from .prompts import classify_prompt, intent_analysis_prompt | ||
| 15 | from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps | 18 | from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps |
| 16 | 19 | ||
| 20 | +_VERBOSE_LOGGER_LOCK = threading.Lock() | ||
| 21 | +_eval_llm_verbose_logger_singleton: logging.Logger | None = None | ||
| 22 | +_eval_llm_verbose_path_logged = False | ||
| 23 | + | ||
| 24 | + | ||
| 25 | +def _get_eval_llm_verbose_logger() -> logging.Logger: | ||
| 26 | + """File logger for full LLM prompts/responses → ``logs/verbose/eval_verbose.log``.""" | ||
| 27 | + setup_eval_logging() | ||
| 28 | + global _eval_llm_verbose_logger_singleton, _eval_llm_verbose_path_logged | ||
| 29 | + with _VERBOSE_LOGGER_LOCK: | ||
| 30 | + if _eval_llm_verbose_logger_singleton is not None: | ||
| 31 | + return _eval_llm_verbose_logger_singleton | ||
| 32 | + log_path = EVAL_VERBOSE_LOG_FILE | ||
| 33 | + log_path.parent.mkdir(parents=True, exist_ok=True) | ||
| 34 | + lg = logging.getLogger("search_eval.verbose_llm") | ||
| 35 | + lg.setLevel(logging.INFO) | ||
| 36 | + if not lg.handlers: | ||
| 37 | + handler = logging.FileHandler(log_path, encoding="utf-8") | ||
| 38 | + handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) | ||
| 39 | + lg.addHandler(handler) | ||
| 40 | + lg.propagate = False | ||
| 41 | + _eval_llm_verbose_logger_singleton = lg | ||
| 42 | + if not _eval_llm_verbose_path_logged: | ||
| 43 | + _eval_llm_verbose_path_logged = True | ||
| 44 | + logging.getLogger("search_eval").info( | ||
| 45 | + "LLM verbose I/O log (full prompt + response): %s", | ||
| 46 | + log_path.resolve(), | ||
| 47 | + ) | ||
| 48 | + return lg | ||
| 49 | + | ||
| 50 | + | ||
| 51 | +def _log_eval_llm_verbose( | ||
| 52 | + *, | ||
| 53 | + phase: str, | ||
| 54 | + model: str, | ||
| 55 | + prompt: str, | ||
| 56 | + assistant_text: str, | ||
| 57 | + raw_response: str, | ||
| 58 | +) -> None: | ||
| 59 | + log = _get_eval_llm_verbose_logger() | ||
| 60 | + sep = "=" * 80 | ||
| 61 | + log.info("\n%s", sep) | ||
| 62 | + log.info("phase=%s model=%s", phase, model) | ||
| 63 | + log.info("%s\nFULL PROMPT (user message)\n%s", sep, prompt) | ||
| 64 | + log.info("%s\nASSISTANT CONTENT (parsed)\n%s", sep, assistant_text) | ||
| 65 | + log.info("%s\nRAW RESPONSE (JSON string)\n%s", sep, raw_response) | ||
| 66 | + log.info("%s\n", sep) | ||
| 67 | + | ||
| 17 | 68 | ||
| 18 | def _canonicalize_judge_label(raw: str) -> str | None: | 69 | def _canonicalize_judge_label(raw: str) -> str | None: |
| 19 | s = str(raw or "").strip().strip('"').strip("'") | 70 | s = str(raw or "").strip().strip('"').strip("'") |
| @@ -208,17 +259,27 @@ class DashScopeLabelClient: | @@ -208,17 +259,27 @@ class DashScopeLabelClient: | ||
| 208 | content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() | 259 | content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() |
| 209 | return content, safe_json_dumps(row) | 260 | return content, safe_json_dumps(row) |
| 210 | 261 | ||
| 211 | - def _chat(self, prompt: str) -> Tuple[str, str]: | 262 | + def _chat(self, prompt: str, *, phase: str = "chat") -> Tuple[str, str]: |
| 212 | if not self.use_batch: | 263 | if not self.use_batch: |
| 213 | - return self._chat_sync(prompt) | ||
| 214 | - try: | ||
| 215 | - return self._chat_batch(prompt) | ||
| 216 | - except requests.exceptions.HTTPError as e: | ||
| 217 | - resp = getattr(e, "response", None) | ||
| 218 | - if resp is not None and resp.status_code == 404: | ||
| 219 | - self.use_batch = False | ||
| 220 | - return self._chat_sync(prompt) | ||
| 221 | - raise | 264 | + content, raw = self._chat_sync(prompt) |
| 265 | + else: | ||
| 266 | + try: | ||
| 267 | + content, raw = self._chat_batch(prompt) | ||
| 268 | + except requests.exceptions.HTTPError as e: | ||
| 269 | + resp = getattr(e, "response", None) | ||
| 270 | + if resp is not None and resp.status_code == 404: | ||
| 271 | + self.use_batch = False | ||
| 272 | + content, raw = self._chat_sync(prompt) | ||
| 273 | + else: | ||
| 274 | + raise | ||
| 275 | + _log_eval_llm_verbose( | ||
| 276 | + phase=phase, | ||
| 277 | + model=self.model, | ||
| 278 | + prompt=prompt, | ||
| 279 | + assistant_text=content, | ||
| 280 | + raw_response=raw, | ||
| 281 | + ) | ||
| 282 | + return content, raw | ||
| 222 | 283 | ||
| 223 | def _find_batch_line_for_custom_id( | 284 | def _find_batch_line_for_custom_id( |
| 224 | self, | 285 | self, |
| @@ -242,14 +303,20 @@ class DashScopeLabelClient: | @@ -242,14 +303,20 @@ class DashScopeLabelClient: | ||
| 242 | return obj | 303 | return obj |
| 243 | return None | 304 | return None |
| 244 | 305 | ||
| 306 | + def query_intent(self, query: str) -> Tuple[str, str]: | ||
| 307 | + prompt = intent_analysis_prompt(query) | ||
| 308 | + return self._chat(prompt, phase="query_intent") | ||
| 309 | + | ||
| 245 | def classify_batch( | 310 | def classify_batch( |
| 246 | self, | 311 | self, |
| 247 | query: str, | 312 | query: str, |
| 248 | docs: Sequence[Dict[str, Any]], | 313 | docs: Sequence[Dict[str, Any]], |
| 314 | + *, | ||
| 315 | + query_intent_block: str = "", | ||
| 249 | ) -> Tuple[List[str], str]: | 316 | ) -> Tuple[List[str], str]: |
| 250 | numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] | 317 | numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] |
| 251 | - prompt = classify_prompt(query, numbered_docs) | ||
| 252 | - content, raw_response = self._chat(prompt) | 318 | + prompt = classify_prompt(query, numbered_docs, query_intent_block=query_intent_block) |
| 319 | + content, raw_response = self._chat(prompt, phase="relevance_classify") | ||
| 253 | labels: List[str] = [] | 320 | labels: List[str] = [] |
| 254 | for line in str(content or "").splitlines(): | 321 | for line in str(content or "").splitlines(): |
| 255 | canon = _canonicalize_judge_label(line) | 322 | canon = _canonicalize_judge_label(line) |
scripts/evaluation/eval_framework/constants.py
| @@ -34,10 +34,20 @@ def normalize_stored_label(label: str) -> str: | @@ -34,10 +34,20 @@ def normalize_stored_label(label: str) -> str: | ||
| 34 | DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" | 34 | DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" |
| 35 | DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" | 35 | DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" |
| 36 | 36 | ||
| 37 | +# Logging (``build_annotation_set.py`` / ``serve_eval_web.py`` → ``eval_framework.cli.main``) | ||
| 38 | +EVAL_LOG_DIR = PROJECT_ROOT / "logs" | ||
| 39 | +EVAL_VERBOSE_LOG_DIR = EVAL_LOG_DIR / "verbose" | ||
| 40 | +EVAL_LOG_FILE = EVAL_LOG_DIR / "eval.log" | ||
| 41 | +EVAL_VERBOSE_LOG_FILE = EVAL_VERBOSE_LOG_DIR / "eval_verbose.log" | ||
| 42 | + | ||
| 37 | # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs) | 43 | # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs) |
| 38 | -DEFAULT_JUDGE_MODEL = "qwen3.5-flash" | ||
| 39 | -DEFAULT_JUDGE_ENABLE_THINKING = True | 44 | +DEFAULT_JUDGE_MODEL = "qwen3.5-plus" |
| 45 | +DEFAULT_JUDGE_ENABLE_THINKING = False | ||
| 40 | DEFAULT_JUDGE_DASHSCOPE_BATCH = False | 46 | DEFAULT_JUDGE_DASHSCOPE_BATCH = False |
| 47 | + | ||
| 48 | +# Query-intent LLM (separate from judge; used once per query, injected into relevance prompts) | ||
| 49 | +DEFAULT_INTENT_MODEL = "qwen-max" | ||
| 50 | +DEFAULT_INTENT_ENABLE_THINKING = True | ||
| 41 | DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" | 51 | DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" |
| 42 | DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0 | 52 | DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0 |
| 43 | 53 | ||
| @@ -60,6 +70,6 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 | @@ -60,6 +70,6 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 | ||
| 60 | # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant"). | 70 | # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant"). |
| 61 | # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak | 71 | # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak |
| 62 | # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). | 72 | # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). |
| 63 | -DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939 | 73 | +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799 |
| 64 | DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959 | 74 | DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959 |
| 65 | DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3 | 75 | DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3 |
scripts/evaluation/eval_framework/framework.py
| @@ -3,6 +3,7 @@ | @@ -3,6 +3,7 @@ | ||
| 3 | from __future__ import annotations | 3 | from __future__ import annotations |
| 4 | 4 | ||
| 5 | import json | 5 | import json |
| 6 | +import logging | ||
| 6 | import time | 7 | import time |
| 7 | from pathlib import Path | 8 | from pathlib import Path |
| 8 | from typing import Any, Dict, List, Sequence, Tuple | 9 | from typing import Any, Dict, List, Sequence, Tuple |
| @@ -17,6 +18,8 @@ from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceCli | @@ -17,6 +18,8 @@ from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceCli | ||
| 17 | from .constants import ( | 18 | from .constants import ( |
| 18 | DEFAULT_ARTIFACT_ROOT, | 19 | DEFAULT_ARTIFACT_ROOT, |
| 19 | DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW, | 20 | DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW, |
| 21 | + DEFAULT_INTENT_ENABLE_THINKING, | ||
| 22 | + DEFAULT_INTENT_MODEL, | ||
| 20 | DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC, | 23 | DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC, |
| 21 | DEFAULT_JUDGE_DASHSCOPE_BATCH, | 24 | DEFAULT_JUDGE_DASHSCOPE_BATCH, |
| 22 | DEFAULT_JUDGE_ENABLE_THINKING, | 25 | DEFAULT_JUDGE_ENABLE_THINKING, |
| @@ -52,6 +55,8 @@ from .utils import ( | @@ -52,6 +55,8 @@ from .utils import ( | ||
| 52 | zh_title_from_multilingual, | 55 | zh_title_from_multilingual, |
| 53 | ) | 56 | ) |
| 54 | 57 | ||
| 58 | +_log = logging.getLogger("search_eval.framework") | ||
| 59 | + | ||
| 55 | 60 | ||
| 56 | def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: | 61 | def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: |
| 57 | """Map ``spu_id`` -> Chinese title from ``debug_info.per_result[].title_multilingual``.""" | 62 | """Map ``spu_id`` -> Chinese title from ``debug_info.per_result[].title_multilingual``.""" |
| @@ -80,6 +85,8 @@ class SearchEvaluationFramework: | @@ -80,6 +85,8 @@ class SearchEvaluationFramework: | ||
| 80 | judge_model: str | None = None, | 85 | judge_model: str | None = None, |
| 81 | enable_thinking: bool | None = None, | 86 | enable_thinking: bool | None = None, |
| 82 | use_dashscope_batch: bool | None = None, | 87 | use_dashscope_batch: bool | None = None, |
| 88 | + intent_model: str | None = None, | ||
| 89 | + intent_enable_thinking: bool | None = None, | ||
| 83 | ): | 90 | ): |
| 84 | init_service(get_app_config().infrastructure.elasticsearch.host) | 91 | init_service(get_app_config().infrastructure.elasticsearch.host) |
| 85 | self.tenant_id = str(tenant_id) | 92 | self.tenant_id = str(tenant_id) |
| @@ -109,6 +116,24 @@ class SearchEvaluationFramework: | @@ -109,6 +116,24 @@ class SearchEvaluationFramework: | ||
| 109 | enable_thinking=et, | 116 | enable_thinking=et, |
| 110 | use_batch=use_batch, | 117 | use_batch=use_batch, |
| 111 | ) | 118 | ) |
| 119 | + intent_m = str(intent_model or DEFAULT_INTENT_MODEL) | ||
| 120 | + intent_et = DEFAULT_INTENT_ENABLE_THINKING if intent_enable_thinking is None else intent_enable_thinking | ||
| 121 | + self.intent_client = DashScopeLabelClient( | ||
| 122 | + model=intent_m, | ||
| 123 | + base_url=str(llm_cfg["base_url"]), | ||
| 124 | + api_key=str(api_key), | ||
| 125 | + batch_completion_window=batch_window, | ||
| 126 | + batch_poll_interval_sec=batch_poll, | ||
| 127 | + enable_thinking=bool(intent_et), | ||
| 128 | + use_batch=False, | ||
| 129 | + ) | ||
| 130 | + self._query_intent_cache: Dict[str, str] = {} | ||
| 131 | + | ||
| 132 | + def _ensure_query_intent_block(self, query: str) -> str: | ||
| 133 | + if query not in self._query_intent_cache: | ||
| 134 | + text, _raw = self.intent_client.query_intent(query) | ||
| 135 | + self._query_intent_cache[query] = str(text or "").strip() | ||
| 136 | + return self._query_intent_cache[query] | ||
| 112 | 137 | ||
| 113 | def audit_live_query( | 138 | def audit_live_query( |
| 114 | self, | 139 | self, |
| @@ -310,7 +335,10 @@ class SearchEvaluationFramework: | @@ -310,7 +335,10 @@ class SearchEvaluationFramework: | ||
| 310 | if not docs: | 335 | if not docs: |
| 311 | return [] | 336 | return [] |
| 312 | try: | 337 | try: |
| 313 | - labels, raw_response = self.label_client.classify_batch(query, docs) | 338 | + intent_block = self._ensure_query_intent_block(query) |
| 339 | + labels, raw_response = self.label_client.classify_batch( | ||
| 340 | + query, docs, query_intent_block=intent_block | ||
| 341 | + ) | ||
| 314 | return [(labels, raw_response, docs)] | 342 | return [(labels, raw_response, docs)] |
| 315 | except Exception: | 343 | except Exception: |
| 316 | if len(docs) == 1: | 344 | if len(docs) == 1: |
| @@ -392,11 +420,16 @@ class SearchEvaluationFramework: | @@ -392,11 +420,16 @@ class SearchEvaluationFramework: | ||
| 392 | "offset_end": min(start + n, total_ordered), | 420 | "offset_end": min(start + n, total_ordered), |
| 393 | } | 421 | } |
| 394 | batch_logs.append(log_entry) | 422 | batch_logs.append(log_entry) |
| 395 | - print( | ||
| 396 | - f"[eval-rebuild] query={query!r} llm_batch={batch_idx + 1}/{max_batches} " | ||
| 397 | - f"size={n} exact_ratio={exact_ratio:.4f} irrelevant_ratio={irrelevant_ratio:.4f} " | ||
| 398 | - f"irrel_plus_low_ratio={irrel_low_ratio:.4f}", | ||
| 399 | - flush=True, | 423 | + _log.info( |
| 424 | + "[eval-rebuild] query=%r llm_batch=%s/%s size=%s exact_ratio=%.4f irrelevant_ratio=%.4f " | ||
| 425 | + "irrel_plus_low_ratio=%.4f", | ||
| 426 | + query, | ||
| 427 | + batch_idx + 1, | ||
| 428 | + max_batches, | ||
| 429 | + n, | ||
| 430 | + exact_ratio, | ||
| 431 | + irrelevant_ratio, | ||
| 432 | + irrel_low_ratio, | ||
| 400 | ) | 433 | ) |
| 401 | 434 | ||
| 402 | # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality). | 435 | # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality). |
| @@ -409,11 +442,14 @@ class SearchEvaluationFramework: | @@ -409,11 +442,14 @@ class SearchEvaluationFramework: | ||
| 409 | else: | 442 | else: |
| 410 | streak = 0 | 443 | streak = 0 |
| 411 | if streak >= stop_streak: | 444 | if streak >= stop_streak: |
| 412 | - print( | ||
| 413 | - f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches " | ||
| 414 | - f"({stop_streak} consecutive batches: irrelevant>{irrelevant_stop_ratio} " | ||
| 415 | - f"and irrel+low>{irrelevant_low_combined_stop_ratio})", | ||
| 416 | - flush=True, | 445 | + _log.info( |
| 446 | + "[eval-rebuild] query=%r early_stop after %s batches (%s consecutive batches: " | ||
| 447 | + "irrelevant>%s and irrel+low>%s)", | ||
| 448 | + query, | ||
| 449 | + batch_idx + 1, | ||
| 450 | + stop_streak, | ||
| 451 | + irrelevant_stop_ratio, | ||
| 452 | + irrelevant_low_combined_stop_ratio, | ||
| 417 | ) | 453 | ) |
| 418 | break | 454 | break |
| 419 | 455 | ||
| @@ -626,11 +662,13 @@ class SearchEvaluationFramework: | @@ -626,11 +662,13 @@ class SearchEvaluationFramework: | ||
| 626 | if rerank_high_n > int(rerank_high_skip_count): | 662 | if rerank_high_n > int(rerank_high_skip_count): |
| 627 | skipped = True | 663 | skipped = True |
| 628 | skip_reason = "too_many_high_rerank_scores" | 664 | skip_reason = "too_many_high_rerank_scores" |
| 629 | - print( | ||
| 630 | - f"[eval-rebuild] query={query!r} skip: rerank_score>{rerank_high_threshold} " | ||
| 631 | - f"outside recall pool count={rerank_high_n} > {rerank_high_skip_count} " | ||
| 632 | - f"(relevant tail too large / query too easy to satisfy)", | ||
| 633 | - flush=True, | 665 | + _log.info( |
| 666 | + "[eval-rebuild] query=%r skip: rerank_score>%s outside recall pool count=%s > %s " | ||
| 667 | + "(relevant tail too large / query too easy to satisfy)", | ||
| 668 | + query, | ||
| 669 | + rerank_high_threshold, | ||
| 670 | + rerank_high_n, | ||
| 671 | + rerank_high_skip_count, | ||
| 634 | ) | 672 | ) |
| 635 | else: | 673 | else: |
| 636 | ordered_docs: List[Dict[str, Any]] = [] | 674 | ordered_docs: List[Dict[str, Any]] = [] |
| @@ -0,0 +1,35 @@ | @@ -0,0 +1,35 @@ | ||
| 1 | +"""Configure dedicated eval run logs under repo ``logs/`` (see ``constants.EVAL_*_LOG_*``).""" | ||
| 2 | + | ||
| 3 | +from __future__ import annotations | ||
| 4 | + | ||
| 5 | +import logging | ||
| 6 | +import sys | ||
| 7 | + | ||
| 8 | +from .constants import EVAL_LOG_DIR, EVAL_LOG_FILE, EVAL_VERBOSE_LOG_DIR | ||
| 9 | + | ||
| 10 | +_setup_done = False | ||
| 11 | + | ||
| 12 | + | ||
| 13 | +def setup_eval_logging() -> None: | ||
| 14 | + """Attach file + stderr handlers to ``search_eval`` once; ensure log directories exist.""" | ||
| 15 | + global _setup_done | ||
| 16 | + if _setup_done: | ||
| 17 | + return | ||
| 18 | + | ||
| 19 | + EVAL_LOG_DIR.mkdir(parents=True, exist_ok=True) | ||
| 20 | + EVAL_VERBOSE_LOG_DIR.mkdir(parents=True, exist_ok=True) | ||
| 21 | + | ||
| 22 | + fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s") | ||
| 23 | + root = logging.getLogger("search_eval") | ||
| 24 | + root.setLevel(logging.INFO) | ||
| 25 | + if root.handlers: | ||
| 26 | + _setup_done = True | ||
| 27 | + return | ||
| 28 | + fh = logging.FileHandler(EVAL_LOG_FILE, encoding="utf-8") | ||
| 29 | + fh.setFormatter(fmt) | ||
| 30 | + sh = logging.StreamHandler(sys.stderr) | ||
| 31 | + sh.setFormatter(fmt) | ||
| 32 | + root.addHandler(fh) | ||
| 33 | + root.addHandler(sh) | ||
| 34 | + root.propagate = False | ||
| 35 | + _setup_done = True |
scripts/evaluation/eval_framework/prompts.py
| @@ -4,6 +4,54 @@ from __future__ import annotations | @@ -4,6 +4,54 @@ from __future__ import annotations | ||
| 4 | 4 | ||
| 5 | from typing import Sequence | 5 | from typing import Sequence |
| 6 | 6 | ||
| 7 | + | ||
| 8 | +_QUERY_INTENT_ANALYSIS_TEMPLATE_EN = """You are an intent analysis expert for a fashion e-commerce search system. | ||
| 9 | + | ||
| 10 | +Given a user's search query, analyze the shopping intent behind the query in the context of fashion and apparel e-commerce, and summarize the user's core search need in one concise sentence. | ||
| 11 | +Also provide the Chinese translation and English translation of the query. | ||
| 12 | + | ||
| 13 | +Requirements: | ||
| 14 | +- Keep the intent analysis concise and easy to understand, using 1 to 3 short sentences. | ||
| 15 | +- Stay grounded in the original query and summarize the user's likely shopping intent without adding unnecessary context. | ||
| 16 | +- When the query is vague or ambiguous, take a conservative approach and keep the analysis close to the original wording. | ||
| 17 | +- Chinese translation: if the original query is already in Chinese, keep it unchanged. | ||
| 18 | +- English translation: if the original query is already in English, keep it unchanged. | ||
| 19 | +- Do not output anything other than the required three-line format. | ||
| 20 | + | ||
| 21 | +Output format (strictly exactly three lines): | ||
| 22 | +Intent: concise analysis of the user's search intent | ||
| 23 | +Query中文翻译: Chinese translation of the query | ||
| 24 | +Query English translation: English translation of the query | ||
| 25 | + | ||
| 26 | +Now analyze the following query: | ||
| 27 | + | ||
| 28 | +Query: {query} | ||
| 29 | +""" | ||
| 30 | + | ||
| 31 | +_QUERY_INTENT_ANALYSIS_RESULT_TEMPLATE_ZH = """ | ||
| 32 | +你是一个服装品类电商搜索意图分析专家。 | ||
| 33 | + | ||
| 34 | +给定用户输入的搜索词,请在服装品类电商场景下,分析该搜索词背后的购物意图,并用一句话简要描述用户的核心搜索需求。 | ||
| 35 | +同时,提供该搜索词的中文翻译和英文翻译。 | ||
| 36 | + | ||
| 37 | +要求: | ||
| 38 | +- 意图分析应简洁易懂,用 1 到 3 句短句概括用户的搜索意图。 | ||
| 39 | +- 结合 query 本身,尽量贴近用户原始搜索需求进行总结,不添加不必要的背景、延伸或臆测。 | ||
| 40 | +- 如果 query 不够明确或有歧义,应保守处理,尽量保持与原词表达一致。 | ||
| 41 | +- 中文翻译:如果原始 query 本身就是中文,则按原样输出。 | ||
| 42 | +- 英文翻译:如果原始 query 本身就是英文,则按原样输出。 | ||
| 43 | +- 除指定格式外,不要输出任何额外说明。 | ||
| 44 | + | ||
| 45 | +输出格式(严格按三行输出): | ||
| 46 | +Intent: 对用户搜索意图的简洁分析 | ||
| 47 | +Query中文翻译: query 的中文翻译 | ||
| 48 | +Query English translation: query 的英文翻译 | ||
| 49 | + | ||
| 50 | +现在请分析以下搜索词: | ||
| 51 | + | ||
| 52 | +Query: {query} | ||
| 53 | +""" | ||
| 54 | + | ||
| 7 | _CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system. | 55 | _CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system. |
| 8 | Given a user query and the information for each product, assign a relevance label to each product. | 56 | Given a user query and the information for each product, assign a relevance label to each product. |
| 9 | 57 | ||
| @@ -136,7 +184,7 @@ Typical examples: | @@ -136,7 +184,7 @@ Typical examples: | ||
| 136 | - If the attribute is not mentioned or cannot be confirmed, prefer **High Relevant**; | 184 | - If the attribute is not mentioned or cannot be confirmed, prefer **High Relevant**; |
| 137 | - Only treat it as a conflict when the product information clearly shows the opposite of the query requirement. | 185 | - Only treat it as a conflict when the product information clearly shows the opposite of the query requirement. |
| 138 | 186 | ||
| 139 | -Query: {query} | 187 | +Query: {query}{intent_suffix} |
| 140 | 188 | ||
| 141 | Products: | 189 | Products: |
| 142 | {lines} | 190 | {lines} |
| @@ -276,7 +324,7 @@ _CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判 | @@ -276,7 +324,7 @@ _CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性判 | ||
| 276 | - 未提及 / 无法确认,优先按“基本相关”处理; | 324 | - 未提及 / 无法确认,优先按“基本相关”处理;
| 277 | - 只有当商品信息明确显示与查询要求相反时,才视为属性冲突。 | 325 | - 只有当商品信息明确显示与查询要求相反时,才视为属性冲突。
| 278 | 326 | ||
| 279 | -查询:{query} | 327 | +查询:{query}{intent_suffix} |
| 280 | 328 | ||
| 281 | 商品: | 329 | 商品:
| 282 | {lines} | 330 | {lines} |
| @@ -293,7 +341,17 @@ _CLASSIFY_TEMPLATE_ZH = """ä½ æ˜¯ä¸€ä¸ªæœé¥°ç”µå•†æœç´¢ç³»ç»Ÿä¸çš„ç›¸å…³æ€§åˆ | @@ -293,7 +341,17 @@ _CLASSIFY_TEMPLATE_ZH = """ä½ æ˜¯ä¸€ä¸ªæœé¥°ç”µå•†æœç´¢ç³»ç»Ÿä¸çš„ç›¸å…³æ€§åˆ | ||
| 293 | """ | 341 | """ |
| 294 | 342 | ||
| 295 | 343 | ||
| 296 | -def classify_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: | 344 | +def intent_analysis_prompt(query: str) -> str: |
| 345 | + return _QUERY_INTENT_ANALYSIS_TEMPLATE_EN.format(query=query) | ||
| 346 | + | ||
| 347 | + | ||
| 348 | +def classify_prompt( | ||
| 349 | + query: str, | ||
| 350 | + numbered_doc_lines: Sequence[str], | ||
| 351 | + *, | ||
| 352 | + query_intent_block: str = "", | ||
| 353 | +) -> str: | ||
| 297 | lines = "\n".join(numbered_doc_lines) | 354 | lines = "\n".join(numbered_doc_lines) |
| 298 | n = len(numbered_doc_lines) | 355 | n = len(numbered_doc_lines) |
| 299 | - return _CLASSIFY_TEMPLATE_EN.format(query=query, lines=lines, n=n) | 356 | + intent_suffix = f"\n{query_intent_block.strip()}" if query_intent_block and query_intent_block.strip() else "" |
| 357 | + return _CLASSIFY_TEMPLATE_EN.format(query=query, intent_suffix=intent_suffix, lines=lines, n=n) |