From cdd8ee3a3dfe584e2e32a46c7567ea95c2dafac2 Mon Sep 17 00:00:00 2001 From: tangwang Date: Wed, 1 Apr 2026 14:19:27 +0800 Subject: [PATCH] eval框架日志独立 现在的行为(按你的路径) 用途 路径(相对仓库根 PROJECT_ROOT) 评估主日志(CLI + framework 的 INFO) logs/eval.log LLM 全量 prompt / 原始响应 logs/verbose/eval_verbose.log 实现要点: --- api/result_formatter.py | 9 ++++++++- scripts/evaluation/eval_framework/cli.py | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++--------- scripts/evaluation/eval_framework/clients.py | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------- scripts/evaluation/eval_framework/constants.py | 16 +++++++++++++--- scripts/evaluation/eval_framework/framework.py | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------- scripts/evaluation/eval_framework/logging_setup.py | 35 +++++++++++++++++++++++++++++++++++ scripts/evaluation/eval_framework/prompts.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 7 files changed, 306 insertions(+), 47 deletions(-) create mode 100644 scripts/evaluation/eval_framework/logging_setup.py diff --git a/api/result_formatter.py b/api/result_formatter.py index c1d5910..4ad608f 100644 --- a/api/result_formatter.py +++ b/api/result_formatter.py @@ -76,6 +76,13 @@ class ResultFormatter: category_path = pick_lang_field(source, "category_path") category_name = pick_lang_field(source, "category_name_text") or source.get("category_name") + # tags: core-language object {"en": "a,b", "zh": "..."} from indexer + tags: Optional[List[str]] = None + if isinstance(source.get("tags"), dict): + tags_txt = pick_lang_field(source, "tags") + if tags_txt: + tags = [t.strip() for t in str(tags_txt).split(",") if t.strip()] or None + # Extract SKUs skus = [] skus_data = source.get('skus', []) @@ -129,7 +136,7 @@ class ResultFormatter: category1_name=source.get('category1_name'), category2_name=source.get('category2_name'), category3_name=source.get('category3_name'), - tags=source.get('tags'), + tags=tags, price=source.get('min_price'), compare_at_price=source.get('compare_at_price'), currency="USD", # Default currency diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py index b5a3486..2de3101 100644 --- a/scripts/evaluation/eval_framework/cli.py +++ b/scripts/evaluation/eval_framework/cli.py @@ -4,10 +4,13 @@ from __future__ import annotations import argparse import json +import logging from pathlib import Path from typing import Any, Dict from .constants import ( + DEFAULT_INTENT_ENABLE_THINKING, + DEFAULT_INTENT_MODEL, DEFAULT_QUERY_FILE, DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO, DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, @@ -19,10 +22,14 @@ from .constants import ( DEFAULT_RERANK_HIGH_THRESHOLD, DEFAULT_SEARCH_RECALL_TOP_K, ) +from .constants import EVAL_LOG_FILE from .framework import SearchEvaluationFramework +from .logging_setup import setup_eval_logging from .utils import ensure_dir, utc_now_iso, utc_timestamp from .web_app import create_web_app +_cli_log = logging.getLogger("search_eval.cli") + def add_judge_llm_args(p: argparse.ArgumentParser) -> None: p.add_argument( @@ -45,6 +52,21 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None: ) +def add_intent_llm_args(p: argparse.ArgumentParser) -> None: + p.add_argument( + "--intent-model", + default=None, + metavar="MODEL", + help=f"Query-intent LLM model before relevance judging (default: {DEFAULT_INTENT_MODEL!r}).", + ) + p.add_argument( + "--intent-enable-thinking", + action=argparse.BooleanOptionalAction, + default=None, + help=f"enable_thinking for intent model (default: {DEFAULT_INTENT_ENABLE_THINKING}).", + ) + + def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]: kw: Dict[str, Any] = {} if args.judge_model is not None: @@ -53,6 +75,10 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]: kw["enable_thinking"] = args.enable_thinking if args.dashscope_batch is not None: kw["use_dashscope_batch"] = args.dashscope_batch + if getattr(args, "intent_model", None) is not None: + kw["intent_model"] = args.intent_model + if getattr(args, "intent_enable_thinking", None) is not None: + kw["intent_enable_thinking"] = args.intent_enable_thinking return kw @@ -110,6 +136,7 @@ def build_cli_parser() -> argparse.ArgumentParser: build.add_argument("--force-refresh-rerank", action="store_true") build.add_argument("--force-refresh-labels", action="store_true") add_judge_llm_args(build) + add_intent_llm_args(build) batch = sub.add_parser("batch", help="Run batch evaluation against live search") batch.add_argument("--tenant-id", default="163") @@ -118,6 +145,7 @@ def build_cli_parser() -> argparse.ArgumentParser: batch.add_argument("--language", default="en") batch.add_argument("--force-refresh-labels", action="store_true") add_judge_llm_args(batch) + add_intent_llm_args(batch) audit = sub.add_parser("audit", help="Audit annotation quality for queries") audit.add_argument("--tenant-id", default="163") @@ -127,6 +155,7 @@ def build_cli_parser() -> argparse.ArgumentParser: audit.add_argument("--limit-suspicious", type=int, default=5) audit.add_argument("--force-refresh-labels", action="store_true") add_judge_llm_args(audit) + add_intent_llm_args(audit) serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") serve.add_argument("--tenant-id", default="163") @@ -134,6 +163,7 @@ def build_cli_parser() -> argparse.ArgumentParser: serve.add_argument("--host", default="0.0.0.0") serve.add_argument("--port", type=int, default=6010) add_judge_llm_args(serve) + add_intent_llm_args(serve) return parser @@ -183,14 +213,18 @@ def run_build(args: argparse.Namespace) -> None: "output_json_path": str(result.output_json_path), } ) - print( - f"[build] query={result.query!r} search_total={result.search_total} " - f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} " - f"annotated={result.annotated_count} output={result.output_json_path}" + _cli_log.info( + "[build] query=%r search_total=%s search_depth=%s corpus=%s annotated=%s output=%s", + result.query, + result.search_total, + result.search_depth, + result.rerank_corpus_size, + result.annotated_count, + result.output_json_path, ) out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json" out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") - print(f"[done] summary={out_path}") + _cli_log.info("[done] summary=%s", out_path) def run_batch(args: argparse.Namespace) -> None: @@ -203,7 +237,7 @@ def run_batch(args: argparse.Namespace) -> None: language=args.language, force_refresh_labels=args.force_refresh_labels, ) - print(f"[done] batch_id={payload['batch_id']} aggregate_metrics={payload['aggregate_metrics']}") + _cli_log.info("[done] batch_id=%s aggregate_metrics=%s", payload["batch_id"], payload["aggregate_metrics"]) def run_audit(args: argparse.Namespace) -> None: @@ -239,8 +273,11 @@ def run_audit(args: argparse.Namespace) -> None: "suspicious_examples": item["suspicious"][: args.limit_suspicious], } ) - print( - f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}" + _cli_log.info( + "[audit] query=%r suspicious=%s metrics=%s", + query, + len(item["suspicious"]), + item["metrics"], ) summary = { @@ -253,7 +290,7 @@ def run_audit(args: argparse.Namespace) -> None: } out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json" out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8") - print(f"[done] audit={out_path}") + _cli_log.info("[done] audit=%s", out_path) def run_serve(args: argparse.Namespace) -> None: @@ -265,8 +302,15 @@ def run_serve(args: argparse.Namespace) -> None: def main() -> None: + setup_eval_logging() parser = build_cli_parser() args = parser.parse_args() + logging.getLogger("search_eval").info( + "CLI start command=%s tenant_id=%s log_file=%s", + args.command, + getattr(args, "tenant_id", ""), + EVAL_LOG_FILE.resolve(), + ) if args.command == "build": run_build(args) return diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py index 3775638..54d7a51 100644 --- a/scripts/evaluation/eval_framework/clients.py +++ b/scripts/evaluation/eval_framework/clients.py @@ -4,16 +4,67 @@ from __future__ import annotations import io import json +import logging +import threading import time import uuid from typing import Any, Dict, List, Optional, Sequence, Tuple import requests -from .constants import VALID_LABELS -from .prompts import classify_prompt +from .constants import EVAL_VERBOSE_LOG_FILE, VALID_LABELS +from .logging_setup import setup_eval_logging +from .prompts import classify_prompt, intent_analysis_prompt from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps +_VERBOSE_LOGGER_LOCK = threading.Lock() +_eval_llm_verbose_logger_singleton: logging.Logger | None = None +_eval_llm_verbose_path_logged = False + + +def _get_eval_llm_verbose_logger() -> logging.Logger: + """File logger for full LLM prompts/responses → ``logs/verbose/eval_verbose.log``.""" + setup_eval_logging() + global _eval_llm_verbose_logger_singleton, _eval_llm_verbose_path_logged + with _VERBOSE_LOGGER_LOCK: + if _eval_llm_verbose_logger_singleton is not None: + return _eval_llm_verbose_logger_singleton + log_path = EVAL_VERBOSE_LOG_FILE + log_path.parent.mkdir(parents=True, exist_ok=True) + lg = logging.getLogger("search_eval.verbose_llm") + lg.setLevel(logging.INFO) + if not lg.handlers: + handler = logging.FileHandler(log_path, encoding="utf-8") + handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")) + lg.addHandler(handler) + lg.propagate = False + _eval_llm_verbose_logger_singleton = lg + if not _eval_llm_verbose_path_logged: + _eval_llm_verbose_path_logged = True + logging.getLogger("search_eval").info( + "LLM verbose I/O log (full prompt + response): %s", + log_path.resolve(), + ) + return lg + + +def _log_eval_llm_verbose( + *, + phase: str, + model: str, + prompt: str, + assistant_text: str, + raw_response: str, +) -> None: + log = _get_eval_llm_verbose_logger() + sep = "=" * 80 + log.info("\n%s", sep) + log.info("phase=%s model=%s", phase, model) + log.info("%s\nFULL PROMPT (user message)\n%s", sep, prompt) + log.info("%s\nASSISTANT CONTENT (parsed)\n%s", sep, assistant_text) + log.info("%s\nRAW RESPONSE (JSON string)\n%s", sep, raw_response) + log.info("%s\n", sep) + def _canonicalize_judge_label(raw: str) -> str | None: s = str(raw or "").strip().strip('"').strip("'") @@ -208,17 +259,27 @@ class DashScopeLabelClient: content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() return content, safe_json_dumps(row) - def _chat(self, prompt: str) -> Tuple[str, str]: + def _chat(self, prompt: str, *, phase: str = "chat") -> Tuple[str, str]: if not self.use_batch: - return self._chat_sync(prompt) - try: - return self._chat_batch(prompt) - except requests.exceptions.HTTPError as e: - resp = getattr(e, "response", None) - if resp is not None and resp.status_code == 404: - self.use_batch = False - return self._chat_sync(prompt) - raise + content, raw = self._chat_sync(prompt) + else: + try: + content, raw = self._chat_batch(prompt) + except requests.exceptions.HTTPError as e: + resp = getattr(e, "response", None) + if resp is not None and resp.status_code == 404: + self.use_batch = False + content, raw = self._chat_sync(prompt) + else: + raise + _log_eval_llm_verbose( + phase=phase, + model=self.model, + prompt=prompt, + assistant_text=content, + raw_response=raw, + ) + return content, raw def _find_batch_line_for_custom_id( self, @@ -242,14 +303,20 @@ class DashScopeLabelClient: return obj return None + def query_intent(self, query: str) -> Tuple[str, str]: + prompt = intent_analysis_prompt(query) + return self._chat(prompt, phase="query_intent") + def classify_batch( self, query: str, docs: Sequence[Dict[str, Any]], + *, + query_intent_block: str = "", ) -> Tuple[List[str], str]: numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] - prompt = classify_prompt(query, numbered_docs) - content, raw_response = self._chat(prompt) + prompt = classify_prompt(query, numbered_docs, query_intent_block=query_intent_block) + content, raw_response = self._chat(prompt, phase="relevance_classify") labels: List[str] = [] for line in str(content or "").splitlines(): canon = _canonicalize_judge_label(line) diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py index 4c54b38..d14bb59 100644 --- a/scripts/evaluation/eval_framework/constants.py +++ b/scripts/evaluation/eval_framework/constants.py @@ -34,10 +34,20 @@ def normalize_stored_label(label: str) -> str: DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" +# Logging (``build_annotation_set.py`` / ``serve_eval_web.py`` → ``eval_framework.cli.main``) +EVAL_LOG_DIR = PROJECT_ROOT / "logs" +EVAL_VERBOSE_LOG_DIR = EVAL_LOG_DIR / "verbose" +EVAL_LOG_FILE = EVAL_LOG_DIR / "eval.log" +EVAL_VERBOSE_LOG_FILE = EVAL_VERBOSE_LOG_DIR / "eval_verbose.log" + # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs) -DEFAULT_JUDGE_MODEL = "qwen3.5-flash" -DEFAULT_JUDGE_ENABLE_THINKING = True +DEFAULT_JUDGE_MODEL = "qwen3.5-plus" +DEFAULT_JUDGE_ENABLE_THINKING = False DEFAULT_JUDGE_DASHSCOPE_BATCH = False + +# Query-intent LLM (separate from judge; used once per query, injected into relevance prompts) +DEFAULT_INTENT_MODEL = "qwen-max" +DEFAULT_INTENT_ENABLE_THINKING = True DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h" DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0 @@ -60,6 +70,6 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant"). # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3). -DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939 +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799 DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959 DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3 diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index 975f7dd..64aa096 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -3,6 +3,7 @@ from __future__ import annotations import json +import logging import time from pathlib import Path from typing import Any, Dict, List, Sequence, Tuple @@ -17,6 +18,8 @@ from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceCli from .constants import ( DEFAULT_ARTIFACT_ROOT, DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW, + DEFAULT_INTENT_ENABLE_THINKING, + DEFAULT_INTENT_MODEL, DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC, DEFAULT_JUDGE_DASHSCOPE_BATCH, DEFAULT_JUDGE_ENABLE_THINKING, @@ -52,6 +55,8 @@ from .utils import ( zh_title_from_multilingual, ) +_log = logging.getLogger("search_eval.framework") + def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]: """Map ``spu_id`` -> Chinese title from ``debug_info.per_result[].title_multilingual``.""" @@ -80,6 +85,8 @@ class SearchEvaluationFramework: judge_model: str | None = None, enable_thinking: bool | None = None, use_dashscope_batch: bool | None = None, + intent_model: str | None = None, + intent_enable_thinking: bool | None = None, ): init_service(get_app_config().infrastructure.elasticsearch.host) self.tenant_id = str(tenant_id) @@ -109,6 +116,24 @@ class SearchEvaluationFramework: enable_thinking=et, use_batch=use_batch, ) + intent_m = str(intent_model or DEFAULT_INTENT_MODEL) + intent_et = DEFAULT_INTENT_ENABLE_THINKING if intent_enable_thinking is None else intent_enable_thinking + self.intent_client = DashScopeLabelClient( + model=intent_m, + base_url=str(llm_cfg["base_url"]), + api_key=str(api_key), + batch_completion_window=batch_window, + batch_poll_interval_sec=batch_poll, + enable_thinking=bool(intent_et), + use_batch=False, + ) + self._query_intent_cache: Dict[str, str] = {} + + def _ensure_query_intent_block(self, query: str) -> str: + if query not in self._query_intent_cache: + text, _raw = self.intent_client.query_intent(query) + self._query_intent_cache[query] = str(text or "").strip() + return self._query_intent_cache[query] def audit_live_query( self, @@ -310,7 +335,10 @@ class SearchEvaluationFramework: if not docs: return [] try: - labels, raw_response = self.label_client.classify_batch(query, docs) + intent_block = self._ensure_query_intent_block(query) + labels, raw_response = self.label_client.classify_batch( + query, docs, query_intent_block=intent_block + ) return [(labels, raw_response, docs)] except Exception: if len(docs) == 1: @@ -392,11 +420,16 @@ class SearchEvaluationFramework: "offset_end": min(start + n, total_ordered), } batch_logs.append(log_entry) - print( - f"[eval-rebuild] query={query!r} llm_batch={batch_idx + 1}/{max_batches} " - f"size={n} exact_ratio={exact_ratio:.4f} irrelevant_ratio={irrelevant_ratio:.4f} " - f"irrel_plus_low_ratio={irrel_low_ratio:.4f}", - flush=True, + _log.info( + "[eval-rebuild] query=%r llm_batch=%s/%s size=%s exact_ratio=%.4f irrelevant_ratio=%.4f " + "irrel_plus_low_ratio=%.4f", + query, + batch_idx + 1, + max_batches, + n, + exact_ratio, + irrelevant_ratio, + irrel_low_ratio, ) # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality). @@ -409,11 +442,14 @@ class SearchEvaluationFramework: else: streak = 0 if streak >= stop_streak: - print( - f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches " - f"({stop_streak} consecutive batches: irrelevant>{irrelevant_stop_ratio} " - f"and irrel+low>{irrelevant_low_combined_stop_ratio})", - flush=True, + _log.info( + "[eval-rebuild] query=%r early_stop after %s batches (%s consecutive batches: " + "irrelevant>%s and irrel+low>%s)", + query, + batch_idx + 1, + stop_streak, + irrelevant_stop_ratio, + irrelevant_low_combined_stop_ratio, ) break @@ -626,11 +662,13 @@ class SearchEvaluationFramework: if rerank_high_n > int(rerank_high_skip_count): skipped = True skip_reason = "too_many_high_rerank_scores" - print( - f"[eval-rebuild] query={query!r} skip: rerank_score>{rerank_high_threshold} " - f"outside recall pool count={rerank_high_n} > {rerank_high_skip_count} " - f"(relevant tail too large / query too easy to satisfy)", - flush=True, + _log.info( + "[eval-rebuild] query=%r skip: rerank_score>%s outside recall pool count=%s > %s " + "(relevant tail too large / query too easy to satisfy)", + query, + rerank_high_threshold, + rerank_high_n, + rerank_high_skip_count, ) else: ordered_docs: List[Dict[str, Any]] = [] diff --git a/scripts/evaluation/eval_framework/logging_setup.py b/scripts/evaluation/eval_framework/logging_setup.py new file mode 100644 index 0000000..8323a85 --- /dev/null +++ b/scripts/evaluation/eval_framework/logging_setup.py @@ -0,0 +1,35 @@ +"""Configure dedicated eval run logs under repo ``logs/`` (see ``constants.EVAL_*_LOG_*``).""" + +from __future__ import annotations + +import logging +import sys + +from .constants import EVAL_LOG_DIR, EVAL_LOG_FILE, EVAL_VERBOSE_LOG_DIR + +_setup_done = False + + +def setup_eval_logging() -> None: + """Attach file + stderr handlers to ``search_eval`` once; ensure log directories exist.""" + global _setup_done + if _setup_done: + return + + EVAL_LOG_DIR.mkdir(parents=True, exist_ok=True) + EVAL_VERBOSE_LOG_DIR.mkdir(parents=True, exist_ok=True) + + fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s") + root = logging.getLogger("search_eval") + root.setLevel(logging.INFO) + if root.handlers: + _setup_done = True + return + fh = logging.FileHandler(EVAL_LOG_FILE, encoding="utf-8") + fh.setFormatter(fmt) + sh = logging.StreamHandler(sys.stderr) + sh.setFormatter(fmt) + root.addHandler(fh) + root.addHandler(sh) + root.propagate = False + _setup_done = True diff --git a/scripts/evaluation/eval_framework/prompts.py b/scripts/evaluation/eval_framework/prompts.py index 5fc9201..3c9b6d2 100644 --- a/scripts/evaluation/eval_framework/prompts.py +++ b/scripts/evaluation/eval_framework/prompts.py @@ -4,6 +4,54 @@ from __future__ import annotations from typing import Sequence + +_QUERY_INTENT_ANALYSIS_TEMPLATE_EN = """You are an intent analysis expert for a fashion e-commerce search system. + +Given a user's search query, analyze the shopping intent behind the query in the context of fashion and apparel e-commerce, and summarize the user's core search need in one concise sentence. +Also provide the Chinese translation and English translation of the query. + +Requirements: +- Keep the intent analysis concise and easy to understand, using 1 to 3 short sentences. +- Stay grounded in the original query and summarize the user's likely shopping intent without adding unnecessary context. +- When the query is vague or ambiguous, take a conservative approach and keep the analysis close to the original wording. +- Chinese translation: if the original query is already in Chinese, keep it unchanged. +- English translation: if the original query is already in English, keep it unchanged. +- Do not output anything other than the required three-line format. + +Output format (strictly exactly three lines): +Intent: concise analysis of the user's search intent +Query中文翻译: Chinese translation of the query +Query English translation: English translation of the query + +Now analyze the following query: + +Query: {query} +""" + +_QUERY_INTENT_ANALYSIS_RESULT_TEMPLATE_ZH = """ +你是一个服装品类电商搜索意图分析专家。 + +给定用户输入的搜索词,请在服装品类电商场景下,分析该搜索词背后的购物意图,并用一句话简要描述用户的核心搜索需求。 +同时,提供该搜索词的中文翻译和英文翻译。 + +要求: +- 意图分析应简洁易懂,用 1 到 3 句短句概括用户的搜索意图。 +- 结合 query 本身,尽量贴近用户原始搜索需求进行总结,不添加不必要的背景、延伸或臆测。 +- 如果 query 不够明确或有歧义,应保守处理,尽量保持与原词表达一致。 +- 中文翻译:如果原始 query 本身就是中文,则按原样输出。 +- 英文翻译:如果原始 query 本身就是英文,则按原样输出。 +- 除指定格式外,不要输出任何额外说明。 + +输出格式(严格按三行输出): +Intent: 对用户搜索意图的简洁分析 +Query中文翻译: query 的中文翻译 +Query English translation: query 的英文翻译 + +现在请分析以下搜索词: + +Query: {query} +""" + _CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system. Given a user query and the information for each product, assign a relevance label to each product. @@ -136,7 +184,7 @@ Typical examples: - If the attribute is not mentioned or cannot be confirmed, prefer **High Relevant**; - Only treat it as a conflict when the product information clearly shows the opposite of the query requirement. -Query: {query} +Query: {query}{intent_suffix} Products: {lines} @@ -276,7 +324,7 @@ _CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性 - 未提及 / 无法确认,优先按“基本相关”处理; - 只有当商品信息明确显示与查询要求相反时,才视为属性冲突。 -查询:{query} +查询:{query}{intent_suffix} 商品: {lines} @@ -293,7 +341,17 @@ _CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性 """ -def classify_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str: +def intent_analysis_prompt(query: str) -> str: + return _QUERY_INTENT_ANALYSIS_TEMPLATE_EN.format(query=query) + + +def classify_prompt( + query: str, + numbered_doc_lines: Sequence[str], + *, + query_intent_block: str = "", +) -> str: lines = "\n".join(numbered_doc_lines) n = len(numbered_doc_lines) - return _CLASSIFY_TEMPLATE_EN.format(query=query, lines=lines, n=n) + intent_suffix = f"\n{query_intent_block.strip()}" if query_intent_block and query_intent_block.strip() else "" + return _CLASSIFY_TEMPLATE_EN.format(query=query, intent_suffix=intent_suffix, lines=lines, n=n) -- libgit2 0.21.2