From cdd8ee3a3dfe584e2e32a46c7567ea95c2dafac2 Mon Sep 17 00:00:00 2001
From: tangwang <tangwang@hsyl>
Date: Wed, 1 Apr 2026 14:19:27 +0800
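Subject: [PATCH] eval框架日志独立

现在的行为（路径均相对仓库根 PROJECT_ROOT）：

  用途                                     路径
  评估主日志（CLI + framework 的 INFO）    logs/eval.log
  LLM 全量 prompt / 原始响应               logs/verbose/eval_verbose.log

实现要点：
  * 新增 eval_framework/logging_setup.py：setup_eval_logging() 只初始化一次，
    为 "search_eval" logger 挂载文件 handler 与 stderr handler，并创建日志目录。
  * cli.py / framework.py 中原有的 print 输出改为 search_eval.* logger。
  * clients.py 中每次 LLM 调用的完整 prompt、解析后的内容与原始响应写入独立的
    "search_eval.verbose_llm" 文件 logger（propagate=False，不进入主日志）。

同一提交还包含：
  * 新增 query intent LLM（--intent-model / --intent-enable-thinking），
    其输出按 query 缓存并拼入相关性判定 prompt。
  * 默认 judge 模型改为 qwen3.5-plus 且关闭 thinking；
    DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO 由 0.939 调整为 0.799。
  * api/result_formatter.py：tags 由按语言存储的逗号分隔字符串解析为列表。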

---
 api/result_formatter.py                            |  9 ++++++++-
 scripts/evaluation/eval_framework/cli.py           | 62 +++++++++++++++++++++++++++++++++++++++++++++++++++++---------
 scripts/evaluation/eval_framework/clients.py       | 95 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
 scripts/evaluation/eval_framework/constants.py     | 16 +++++++++++++---
 scripts/evaluation/eval_framework/framework.py     | 70 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
 scripts/evaluation/eval_framework/logging_setup.py | 35 +++++++++++++++++++++++++++++++++++
 scripts/evaluation/eval_framework/prompts.py       | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 7 files changed, 306 insertions(+), 47 deletions(-)
 create mode 100644 scripts/evaluation/eval_framework/logging_setup.py

diff --git a/api/result_formatter.py b/api/result_formatter.py
index c1d5910..4ad608f 100644
--- a/api/result_formatter.py
+++ b/api/result_formatter.py
@@ -76,6 +76,13 @@ class ResultFormatter:
             category_path = pick_lang_field(source, "category_path")
             category_name = pick_lang_field(source, "category_name_text") or source.get("category_name")
 
+            # tags: core-language object {"en": "a,b", "zh": "..."} from indexer
+            tags: Optional[List[str]] = None
+            if isinstance(source.get("tags"), dict):
+                tags_txt = pick_lang_field(source, "tags")
+                if tags_txt:
+                    tags = [t.strip() for t in str(tags_txt).split(",") if t.strip()] or None
+
             # Extract SKUs
             skus = []
             skus_data = source.get('skus', [])
@@ -129,7 +136,7 @@ class ResultFormatter:
                 category1_name=source.get('category1_name'),
                 category2_name=source.get('category2_name'),
                 category3_name=source.get('category3_name'),
-                tags=source.get('tags'),
+                tags=tags,
                 price=source.get('min_price'),
                 compare_at_price=source.get('compare_at_price'),
                 currency="USD",  # Default currency
diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py
index b5a3486..2de3101 100644
--- a/scripts/evaluation/eval_framework/cli.py
+++ b/scripts/evaluation/eval_framework/cli.py
@@ -4,10 +4,13 @@ from __future__ import annotations
 
 import argparse
 import json
+import logging
 from pathlib import Path
 from typing import Any, Dict
 
 from .constants import (
+    DEFAULT_INTENT_ENABLE_THINKING,
+    DEFAULT_INTENT_MODEL,
     DEFAULT_QUERY_FILE,
     DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO,
     DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO,
@@ -19,10 +22,14 @@ from .constants import (
     DEFAULT_RERANK_HIGH_THRESHOLD,
     DEFAULT_SEARCH_RECALL_TOP_K,
 )
+from .constants import EVAL_LOG_FILE
 from .framework import SearchEvaluationFramework
+from .logging_setup import setup_eval_logging
 from .utils import ensure_dir, utc_now_iso, utc_timestamp
 from .web_app import create_web_app
 
+_cli_log = logging.getLogger("search_eval.cli")
+
 
 def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
     p.add_argument(
@@ -45,6 +52,21 @@ def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
     )
 
 
+def add_intent_llm_args(p: argparse.ArgumentParser) -> None:
+    p.add_argument(
+        "--intent-model",
+        default=None,
+        metavar="MODEL",
+        help=f"Query-intent LLM model before relevance judging (default: {DEFAULT_INTENT_MODEL!r}).",
+    )
+    p.add_argument(
+        "--intent-enable-thinking",
+        action=argparse.BooleanOptionalAction,
+        default=None,
+        help=f"enable_thinking for intent model (default: {DEFAULT_INTENT_ENABLE_THINKING}).",
+    )
+
+
 def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]:
     kw: Dict[str, Any] = {}
     if args.judge_model is not None:
@@ -53,6 +75,10 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]:
         kw["enable_thinking"] = args.enable_thinking
     if args.dashscope_batch is not None:
         kw["use_dashscope_batch"] = args.dashscope_batch
+    if getattr(args, "intent_model", None) is not None:
+        kw["intent_model"] = args.intent_model
+    if getattr(args, "intent_enable_thinking", None) is not None:
+        kw["intent_enable_thinking"] = args.intent_enable_thinking
     return kw
 
 
@@ -110,6 +136,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
     build.add_argument("--force-refresh-rerank", action="store_true")
     build.add_argument("--force-refresh-labels", action="store_true")
     add_judge_llm_args(build)
+    add_intent_llm_args(build)
 
     batch = sub.add_parser("batch", help="Run batch evaluation against live search")
     batch.add_argument("--tenant-id", default="163")
@@ -118,6 +145,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
     batch.add_argument("--language", default="en")
     batch.add_argument("--force-refresh-labels", action="store_true")
     add_judge_llm_args(batch)
+    add_intent_llm_args(batch)
 
     audit = sub.add_parser("audit", help="Audit annotation quality for queries")
     audit.add_argument("--tenant-id", default="163")
@@ -127,6 +155,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
     audit.add_argument("--limit-suspicious", type=int, default=5)
     audit.add_argument("--force-refresh-labels", action="store_true")
     add_judge_llm_args(audit)
+    add_intent_llm_args(audit)
 
     serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
     serve.add_argument("--tenant-id", default="163")
@@ -134,6 +163,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
     serve.add_argument("--host", default="0.0.0.0")
     serve.add_argument("--port", type=int, default=6010)
     add_judge_llm_args(serve)
+    add_intent_llm_args(serve)
 
     return parser
 
@@ -183,14 +213,18 @@ def run_build(args: argparse.Namespace) -> None:
                 "output_json_path": str(result.output_json_path),
             }
         )
-        print(
-            f"[build] query={result.query!r} search_total={result.search_total} "
-            f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
-            f"annotated={result.annotated_count} output={result.output_json_path}"
+        _cli_log.info(
+            "[build] query=%r search_total=%s search_depth=%s corpus=%s annotated=%s output=%s",
+            result.query,
+            result.search_total,
+            result.search_depth,
+            result.rerank_corpus_size,
+            result.annotated_count,
+            result.output_json_path,
         )
     out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
     out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
-    print(f"[done] summary={out_path}")
+    _cli_log.info("[done] summary=%s", out_path)
 
 
 def run_batch(args: argparse.Namespace) -> None:
@@ -203,7 +237,7 @@ def run_batch(args: argparse.Namespace) -> None:
         language=args.language,
         force_refresh_labels=args.force_refresh_labels,
     )
-    print(f"[done] batch_id={payload['batch_id']} aggregate_metrics={payload['aggregate_metrics']}")
+    _cli_log.info("[done] batch_id=%s aggregate_metrics=%s", payload["batch_id"], payload["aggregate_metrics"])
 
 
 def run_audit(args: argparse.Namespace) -> None:
@@ -239,8 +273,11 @@ def run_audit(args: argparse.Namespace) -> None:
                 "suspicious_examples": item["suspicious"][: args.limit_suspicious],
             }
         )
-        print(
-            f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}"
+        _cli_log.info(
+            "[audit] query=%r suspicious=%s metrics=%s",
+            query,
+            len(item["suspicious"]),
+            item["metrics"],
         )
 
     summary = {
@@ -253,7 +290,7 @@ def run_audit(args: argparse.Namespace) -> None:
     }
     out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"
     out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
-    print(f"[done] audit={out_path}")
+    _cli_log.info("[done] audit=%s", out_path)
 
 
 def run_serve(args: argparse.Namespace) -> None:
@@ -265,8 +302,15 @@ def run_serve(args: argparse.Namespace) -> None:
 
 
 def main() -> None:
+    setup_eval_logging()
     parser = build_cli_parser()
     args = parser.parse_args()
+    logging.getLogger("search_eval").info(
+        "CLI start command=%s tenant_id=%s log_file=%s",
+        args.command,
+        getattr(args, "tenant_id", ""),
+        EVAL_LOG_FILE.resolve(),
+    )
     if args.command == "build":
         run_build(args)
         return
diff --git a/scripts/evaluation/eval_framework/clients.py b/scripts/evaluation/eval_framework/clients.py
index 3775638..54d7a51 100644
--- a/scripts/evaluation/eval_framework/clients.py
+++ b/scripts/evaluation/eval_framework/clients.py
@@ -4,16 +4,67 @@ from __future__ import annotations
 
 import io
 import json
+import logging
+import threading
 import time
 import uuid
 from typing import Any, Dict, List, Optional, Sequence, Tuple
 
 import requests
 
-from .constants import VALID_LABELS
-from .prompts import classify_prompt
+from .constants import EVAL_VERBOSE_LOG_FILE, VALID_LABELS
+from .logging_setup import setup_eval_logging
+from .prompts import classify_prompt, intent_analysis_prompt
 from .utils import build_label_doc_line, extract_json_blob, safe_json_dumps
 
+_VERBOSE_LOGGER_LOCK = threading.Lock()
+_eval_llm_verbose_logger_singleton: logging.Logger | None = None
+_eval_llm_verbose_path_logged = False
+
+
+def _get_eval_llm_verbose_logger() -> logging.Logger:
+    """File logger for full LLM prompts/responses → ``logs/verbose/eval_verbose.log``."""
+    setup_eval_logging()
+    global _eval_llm_verbose_logger_singleton, _eval_llm_verbose_path_logged
+    with _VERBOSE_LOGGER_LOCK:
+        if _eval_llm_verbose_logger_singleton is not None:
+            return _eval_llm_verbose_logger_singleton
+        log_path = EVAL_VERBOSE_LOG_FILE
+        log_path.parent.mkdir(parents=True, exist_ok=True)
+        lg = logging.getLogger("search_eval.verbose_llm")
+        lg.setLevel(logging.INFO)
+        if not lg.handlers:
+            handler = logging.FileHandler(log_path, encoding="utf-8")
+            handler.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
+            lg.addHandler(handler)
+            lg.propagate = False
+        _eval_llm_verbose_logger_singleton = lg
+        if not _eval_llm_verbose_path_logged:
+            _eval_llm_verbose_path_logged = True
+            logging.getLogger("search_eval").info(
+                "LLM verbose I/O log (full prompt + response): %s",
+                log_path.resolve(),
+            )
+        return lg
+
+
+def _log_eval_llm_verbose(
+    *,
+    phase: str,
+    model: str,
+    prompt: str,
+    assistant_text: str,
+    raw_response: str,
+) -> None:
+    log = _get_eval_llm_verbose_logger()
+    sep = "=" * 80
+    log.info("\n%s", sep)
+    log.info("phase=%s model=%s", phase, model)
+    log.info("%s\nFULL PROMPT (user message)\n%s", sep, prompt)
+    log.info("%s\nASSISTANT CONTENT (parsed)\n%s", sep, assistant_text)
+    log.info("%s\nRAW RESPONSE (JSON string)\n%s", sep, raw_response)
+    log.info("%s\n", sep)
+
 
 def _canonicalize_judge_label(raw: str) -> str | None:
     s = str(raw or "").strip().strip('"').strip("'")
@@ -208,17 +259,27 @@ class DashScopeLabelClient:
         content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
         return content, safe_json_dumps(row)
 
-    def _chat(self, prompt: str) -> Tuple[str, str]:
+    def _chat(self, prompt: str, *, phase: str = "chat") -> Tuple[str, str]:
         if not self.use_batch:
-            return self._chat_sync(prompt)
-        try:
-            return self._chat_batch(prompt)
-        except requests.exceptions.HTTPError as e:
-            resp = getattr(e, "response", None)
-            if resp is not None and resp.status_code == 404:
-                self.use_batch = False
-                return self._chat_sync(prompt)
-            raise
+            content, raw = self._chat_sync(prompt)
+        else:
+            try:
+                content, raw = self._chat_batch(prompt)
+            except requests.exceptions.HTTPError as e:
+                resp = getattr(e, "response", None)
+                if resp is not None and resp.status_code == 404:
+                    self.use_batch = False
+                    content, raw = self._chat_sync(prompt)
+                else:
+                    raise
+        _log_eval_llm_verbose(
+            phase=phase,
+            model=self.model,
+            prompt=prompt,
+            assistant_text=content,
+            raw_response=raw,
+        )
+        return content, raw
 
     def _find_batch_line_for_custom_id(
         self,
@@ -242,14 +303,20 @@ class DashScopeLabelClient:
                 return obj
         return None
 
+    def query_intent(self, query: str) -> Tuple[str, str]:
+        prompt = intent_analysis_prompt(query)
+        return self._chat(prompt, phase="query_intent")
+
     def classify_batch(
         self,
         query: str,
         docs: Sequence[Dict[str, Any]],
+        *,
+        query_intent_block: str = "",
     ) -> Tuple[List[str], str]:
         numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
-        prompt = classify_prompt(query, numbered_docs)
-        content, raw_response = self._chat(prompt)
+        prompt = classify_prompt(query, numbered_docs, query_intent_block=query_intent_block)
+        content, raw_response = self._chat(prompt, phase="relevance_classify")
         labels: List[str] = []
         for line in str(content or "").splitlines():
             canon = _canonicalize_judge_label(line)
diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py
index 4c54b38..d14bb59 100644
--- a/scripts/evaluation/eval_framework/constants.py
+++ b/scripts/evaluation/eval_framework/constants.py
@@ -34,10 +34,20 @@ def normalize_stored_label(label: str) -> str:
 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
 DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt"
 
+# Logging (``build_annotation_set.py`` / ``serve_eval_web.py`` → ``eval_framework.cli.main``)
+EVAL_LOG_DIR = PROJECT_ROOT / "logs"
+EVAL_VERBOSE_LOG_DIR = EVAL_LOG_DIR / "verbose"
+EVAL_LOG_FILE = EVAL_LOG_DIR / "eval.log"
+EVAL_VERBOSE_LOG_FILE = EVAL_VERBOSE_LOG_DIR / "eval_verbose.log"
+
 # Judge LLM (eval_framework only; override via CLI --judge-model / constructor kwargs)
-DEFAULT_JUDGE_MODEL = "qwen3.5-flash"
-DEFAULT_JUDGE_ENABLE_THINKING = True
+DEFAULT_JUDGE_MODEL = "qwen3.5-plus"
+DEFAULT_JUDGE_ENABLE_THINKING = False
 DEFAULT_JUDGE_DASHSCOPE_BATCH = False
+
+# Query-intent LLM (separate from judge; used once per query, injected into relevance prompts)
+DEFAULT_INTENT_MODEL = "qwen-max"
+DEFAULT_INTENT_ENABLE_THINKING = True
 DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW = "24h"
 DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC = 10.0
 
@@ -60,6 +70,6 @@ DEFAULT_REBUILD_MAX_LLM_BATCHES = 40
 # ``irrelevant_ratio`` = Irrelevant count / n; weak relevance is ``RELEVANCE_LOW`` ("Low Relevant").
 # Increment streak on consecutive bad batches; reset on any non-bad batch. Stop when streak
 # reaches ``DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK`` (default 3).
-DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.939
+DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.799
 DEFAULT_REBUILD_IRREL_LOW_COMBINED_STOP_RATIO = 0.959
 DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3
diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py
index 975f7dd..64aa096 100644
--- a/scripts/evaluation/eval_framework/framework.py
+++ b/scripts/evaluation/eval_framework/framework.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import json
+import logging
 import time
 from pathlib import Path
 from typing import Any, Dict, List, Sequence, Tuple
@@ -17,6 +18,8 @@ from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceCli
 from .constants import (
     DEFAULT_ARTIFACT_ROOT,
     DEFAULT_JUDGE_BATCH_COMPLETION_WINDOW,
+    DEFAULT_INTENT_ENABLE_THINKING,
+    DEFAULT_INTENT_MODEL,
     DEFAULT_JUDGE_BATCH_POLL_INTERVAL_SEC,
     DEFAULT_JUDGE_DASHSCOPE_BATCH,
     DEFAULT_JUDGE_ENABLE_THINKING,
@@ -52,6 +55,8 @@ from .utils import (
     zh_title_from_multilingual,
 )
 
+_log = logging.getLogger("search_eval.framework")
+
 
 def _zh_titles_from_debug_per_result(debug_info: Any) -> Dict[str, str]:
     """Map ``spu_id`` -> Chinese title from ``debug_info.per_result[].title_multilingual``."""
@@ -80,6 +85,8 @@ class SearchEvaluationFramework:
         judge_model: str | None = None,
         enable_thinking: bool | None = None,
         use_dashscope_batch: bool | None = None,
+        intent_model: str | None = None,
+        intent_enable_thinking: bool | None = None,
     ):
         init_service(get_app_config().infrastructure.elasticsearch.host)
         self.tenant_id = str(tenant_id)
@@ -109,6 +116,24 @@ class SearchEvaluationFramework:
             enable_thinking=et,
             use_batch=use_batch,
         )
+        intent_m = str(intent_model or DEFAULT_INTENT_MODEL)
+        intent_et = DEFAULT_INTENT_ENABLE_THINKING if intent_enable_thinking is None else intent_enable_thinking
+        self.intent_client = DashScopeLabelClient(
+            model=intent_m,
+            base_url=str(llm_cfg["base_url"]),
+            api_key=str(api_key),
+            batch_completion_window=batch_window,
+            batch_poll_interval_sec=batch_poll,
+            enable_thinking=bool(intent_et),
+            use_batch=False,
+        )
+        self._query_intent_cache: Dict[str, str] = {}
+
+    def _ensure_query_intent_block(self, query: str) -> str:
+        if query not in self._query_intent_cache:
+            text, _raw = self.intent_client.query_intent(query)
+            self._query_intent_cache[query] = str(text or "").strip()
+        return self._query_intent_cache[query]
 
     def audit_live_query(
         self,
@@ -310,7 +335,10 @@ class SearchEvaluationFramework:
         if not docs:
             return []
         try:
-            labels, raw_response = self.label_client.classify_batch(query, docs)
+            intent_block = self._ensure_query_intent_block(query)
+            labels, raw_response = self.label_client.classify_batch(
+                query, docs, query_intent_block=intent_block
+            )
             return [(labels, raw_response, docs)]
         except Exception:
             if len(docs) == 1:
@@ -392,11 +420,16 @@ class SearchEvaluationFramework:
                 "offset_end": min(start + n, total_ordered),
             }
             batch_logs.append(log_entry)
-            print(
-                f"[eval-rebuild] query={query!r} llm_batch={batch_idx + 1}/{max_batches} "
-                f"size={n} exact_ratio={exact_ratio:.4f} irrelevant_ratio={irrelevant_ratio:.4f} "
-                f"irrel_plus_low_ratio={irrel_low_ratio:.4f}",
-                flush=True,
+            _log.info(
+                "[eval-rebuild] query=%r llm_batch=%s/%s size=%s exact_ratio=%.4f irrelevant_ratio=%.4f "
+                "irrel_plus_low_ratio=%.4f",
+                query,
+                batch_idx + 1,
+                max_batches,
+                n,
+                exact_ratio,
+                irrelevant_ratio,
+                irrel_low_ratio,
             )
 
             # Early-stop streak: only evaluated after min_batches (warm-up before trusting tail quality).
@@ -409,11 +442,14 @@ class SearchEvaluationFramework:
                 else:
                     streak = 0
                 if streak >= stop_streak:
-                    print(
-                        f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches "
-                        f"({stop_streak} consecutive batches: irrelevant>{irrelevant_stop_ratio} "
-                        f"and irrel+low>{irrelevant_low_combined_stop_ratio})",
-                        flush=True,
+                    _log.info(
+                        "[eval-rebuild] query=%r early_stop after %s batches (%s consecutive batches: "
+                        "irrelevant>%s and irrel+low>%s)",
+                        query,
+                        batch_idx + 1,
+                        stop_streak,
+                        irrelevant_stop_ratio,
+                        irrelevant_low_combined_stop_ratio,
                     )
                     break
 
@@ -626,11 +662,13 @@ class SearchEvaluationFramework:
         if rerank_high_n > int(rerank_high_skip_count):
             skipped = True
             skip_reason = "too_many_high_rerank_scores"
-            print(
-                f"[eval-rebuild] query={query!r} skip: rerank_score>{rerank_high_threshold} "
-                f"outside recall pool count={rerank_high_n} > {rerank_high_skip_count} "
-                f"(relevant tail too large / query too easy to satisfy)",
-                flush=True,
+            _log.info(
+                "[eval-rebuild] query=%r skip: rerank_score>%s outside recall pool count=%s > %s "
+                "(relevant tail too large / query too easy to satisfy)",
+                query,
+                rerank_high_threshold,
+                rerank_high_n,
+                rerank_high_skip_count,
             )
         else:
             ordered_docs: List[Dict[str, Any]] = []
diff --git a/scripts/evaluation/eval_framework/logging_setup.py b/scripts/evaluation/eval_framework/logging_setup.py
new file mode 100644
index 0000000..8323a85
--- /dev/null
+++ b/scripts/evaluation/eval_framework/logging_setup.py
@@ -0,0 +1,35 @@
+"""Configure dedicated eval run logs under repo ``logs/`` (see ``constants.EVAL_*_LOG_*``)."""
+
+from __future__ import annotations
+
+import logging
+import sys
+
+from .constants import EVAL_LOG_DIR, EVAL_LOG_FILE, EVAL_VERBOSE_LOG_DIR
+
+_setup_done = False
+
+
+def setup_eval_logging() -> None:
+    """Attach file + stderr handlers to ``search_eval`` once; ensure log directories exist."""
+    global _setup_done
+    if _setup_done:
+        return
+
+    EVAL_LOG_DIR.mkdir(parents=True, exist_ok=True)
+    EVAL_VERBOSE_LOG_DIR.mkdir(parents=True, exist_ok=True)
+
+    fmt = logging.Formatter("%(asctime)s - %(levelname)s - %(name)s - %(message)s")
+    root = logging.getLogger("search_eval")
+    root.setLevel(logging.INFO)
+    if root.handlers:
+        _setup_done = True
+        return
+    fh = logging.FileHandler(EVAL_LOG_FILE, encoding="utf-8")
+    fh.setFormatter(fmt)
+    sh = logging.StreamHandler(sys.stderr)
+    sh.setFormatter(fmt)
+    root.addHandler(fh)
+    root.addHandler(sh)
+    root.propagate = False
+    _setup_done = True
diff --git a/scripts/evaluation/eval_framework/prompts.py b/scripts/evaluation/eval_framework/prompts.py
index 5fc9201..3c9b6d2 100644
--- a/scripts/evaluation/eval_framework/prompts.py
+++ b/scripts/evaluation/eval_framework/prompts.py
@@ -4,6 +4,54 @@ from __future__ import annotations
 
 from typing import Sequence
 
+
+_QUERY_INTENT_ANALYSIS_TEMPLATE_EN = """You are an intent analysis expert for a fashion e-commerce search system.
+
+Given a user's search query, analyze the shopping intent behind the query in the context of fashion and apparel e-commerce, and summarize the user's core search need in one concise sentence.
+Also provide the Chinese translation and English translation of the query.
+
+Requirements:
+- Keep the intent analysis concise and easy to understand, using 1 to 3 short sentences.
+- Stay grounded in the original query and summarize the user's likely shopping intent without adding unnecessary context.
+- When the query is vague or ambiguous, take a conservative approach and keep the analysis close to the original wording.
+- Chinese translation: if the original query is already in Chinese, keep it unchanged.
+- English translation: if the original query is already in English, keep it unchanged.
+- Do not output anything other than the required three-line format.
+
+Output format (strictly exactly three lines):
+Intent: concise analysis of the user's search intent
+Query中文翻译: Chinese translation of the query
+Query English translation: English translation of the query
+
+Now analyze the following query:
+
+Query: {query}
+"""
+
+_QUERY_INTENT_ANALYSIS_RESULT_TEMPLATE_ZH = """
+你是一个服装品类电商搜索意图分析专家。
+
+给定用户输入的搜索词，请在服装品类电商场景下，分析该搜索词背后的购物意图，并用一句话简要描述用户的核心搜索需求。
+同时，提供该搜索词的中文翻译和英文翻译。
+
+要求：
+- 意图分析应简洁易懂，用 1 到 3 句短句概括用户的搜索意图。
+- 结合 query 本身，尽量贴近用户原始搜索需求进行总结，不添加不必要的背景、延伸或臆测。
+- 如果 query 不够明确或有歧义，应保守处理，尽量保持与原词表达一致。
+- 中文翻译：如果原始 query 本身就是中文，则按原样输出。
+- 英文翻译：如果原始 query 本身就是英文，则按原样输出。
+- 除指定格式外，不要输出任何额外说明。
+
+输出格式（严格按三行输出）：
+Intent: 对用户搜索意图的简洁分析
+Query中文翻译: query 的中文翻译
+Query English translation: query 的英文翻译
+
+现在请分析以下搜索词：
+
+Query: {query}
+"""
+
 _CLASSIFY_TEMPLATE_EN = """You are a relevance judgment assistant for a fashion e-commerce search system.
 Given a user query and the information for each product, assign a relevance label to each product.
 
@@ -136,7 +184,7 @@ Typical examples:
    - If the attribute is not mentioned or cannot be confirmed, prefer **High Relevant**;
    - Only treat it as a conflict when the product information clearly shows the opposite of the query requirement.
 
-Query: {query}
+Query: {query}{intent_suffix}
 
 Products:
 {lines}
@@ -276,7 +324,7 @@ _CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性�
    - 未提及 / 无法确认，优先按“基本相关”处理；
    - 只有当商品信息明确显示与查询要求相反时，才视为属性冲突。
 
-查询：{query}
+查询：{query}{intent_suffix}
 
 商品：
 {lines}
@@ -293,7 +341,17 @@ _CLASSIFY_TEMPLATE_ZH = """你是一个服饰电商搜索系统中的相关性�
 """
 
 
-def classify_prompt(query: str, numbered_doc_lines: Sequence[str]) -> str:
+def intent_analysis_prompt(query: str) -> str:
+    return _QUERY_INTENT_ANALYSIS_TEMPLATE_EN.format(query=query)
+
+
+def classify_prompt(
+    query: str,
+    numbered_doc_lines: Sequence[str],
+    *,
+    query_intent_block: str = "",
+) -> str:
     lines = "\n".join(numbered_doc_lines)
     n = len(numbered_doc_lines)
-    return _CLASSIFY_TEMPLATE_EN.format(query=query, lines=lines, n=n)
+    intent_suffix = f"\n{query_intent_block.strip()}" if query_intent_block and query_intent_block.strip() else ""
+    return _CLASSIFY_TEMPLATE_EN.format(query=query, intent_suffix=intent_suffix, lines=lines, n=n)
--
libgit2 0.21.2