From d172c2591757cdbb0104c872cc9e04ee79bee724 Mon Sep 17 00:00:00 2001 From: tangwang Date: Tue, 31 Mar 2026 23:27:53 +0800 Subject: [PATCH] eval框架 --- docs/issue-2026-03-31-评估框架-done-0331.md | 151 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ scripts/evaluation/README.md | 22 +++++++++++++++------- scripts/evaluation/README_Requirement_zh.md | 14 +++++++++++--- scripts/evaluation/eval_framework/cli.py | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- scripts/evaluation/eval_framework/constants.py | 10 ++++++++++ scripts/evaluation/eval_framework/framework.py | 329 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- scripts/evaluation/quick_start_eval.sh | 12 +++++++----- 7 files changed, 584 insertions(+), 17 deletions(-) create mode 100644 docs/issue-2026-03-31-评估框架-done-0331.md diff --git a/docs/issue-2026-03-31-评估框架-done-0331.md b/docs/issue-2026-03-31-评估框架-done-0331.md new file mode 100644 index 0000000..3783695 --- /dev/null +++ b/docs/issue-2026-03-31-评估框架-done-0331.md @@ -0,0 +1,151 @@ + + +参考资料: + +1. 搜索接口: + +```bash +export BASE_URL="${BASE_URL:-http://localhost:6002}" +export TENANT_ID="${TENANT_ID:-163}" # 改成你的租户ID +``` +```bash +curl -sS "$BASE_URL/search/" \ + -H "Content-Type: application/json" \ + -H "X-Tenant-ID: $TENANT_ID" \ + -d '{ + "query": "芭比娃娃", + "size": 20, + "from": 0, + "language": "zh" + }' +``` + +response: +{ + "results": [ + { + "spu_id": "12345", + "title": "芭比时尚娃娃", + "image_url": "https://example.com/image.jpg", + "specifications":[], + "skus":[{"sku_id":" ... +... + +2. 
重排服务: +curl -X POST "http://localhost:6007/rerank" \ + -H "Content-Type: application/json" \ + -d '{ + "query": "玩具 芭比", + "docs": ["12PCS 6 Types of Dolls with Bottles", "纯棉T恤 短袖"], + "top_n":386, + "normalize": true + }' + + +3. 基于指定字段查询:es_debug_search.py + + +主要任务: +1. 评估工具的建立: +注意判断结果好坏,要用统一的评估工具,不要对每个query设定关键词匹配的规则来判断是否符合要求,这样不可扩展,这种方式且容易有误判还是复杂,并且不好扩展到其他搜索词。 +因此要做一个搜索结果评估工具、多个结果对比的工具,供后面的标注集合构建工具调用。工具内部实现可以是调用大模型来判断,说清楚什么叫高相关、基本相关、不相关: + +prompt: +```bash +你是一个电商搜索结果相关性评估助手。请根据用户查询(query)和每个商品的信息,输出该商品的相关性等级。 + +## 相关性等级标准 +Exact 完全相关 — 完全匹配用户搜索需求。 +Partial 部分相关 — 主意图满足(同品类或相近用途,基本上符合搜索意图),但次要属性(如颜色、风格、尺码等)跟用户需求有偏差或无法确认。 +Irrelevant 不相关 — 品类或用途不符,主诉求未满足。 + +1. {title1} {option1_value1} {option2_value1} {option3_value1} +2. {title2} {option1_value2} {option2_value2}, {option3_value2} +... +50. {title50} {option1_value50} {option2_value50} {option3_value50} + +## 输出格式 +严格输出 {input_nums} 行,每行仅Exact / Partial / Irrelevant三者之一。按顺序对应上述 50 个商品。不要输出任何其他任何信息 +``` + + +2. 测试集(结果标注)建立: +@queries/queries.txt + +对其中每一个query: +1. 召回: +1)参考搜索接口 召回结果。搜索结果的top500,纳入召回池,打分全部标记为1 +2)调用重排模型,扫描全库(tenant_id=163),如果已经在召回池(打分已经是1了),则跳过,其余的全部过reranker模型接口调用。每80个doc做一次请求。注意重排模型打分一定要做缓存(本地文件缓存即可。query+title->rerank_score)。 +3)对reranker打分超过0.5的结果数大于1000条的query,则打印一行日志,跳过这个query,表示相关结果太多、容易被满足 + + +2. 对如上召回的内容,进行全排序,然后逐批进行llm评判标注(50个一批),每一批都记录exact比例和不相关比例,打印日志。 +直到连续三批不相关比例都大于92%。 +最少要跑15批,最多跑40批 + +3. 请你思考如何存储结果、并利于以后的对比、使用、展示。 + + + + +3. 评估工具页面: +请你设计一个搜索评估交互页面。端口6010。 +页面主题:上方是搜索框,如果发起搜索,那么下方给出本次结果的总体指标以及top100结果(允许翻页) + +总体指标: +| 指标 | 含义 | +|------|------| +| **P@5, P@10, P@20, P@50** | 前 K 个结果中「仅 3 相关」的精确率 | +| **P@5_2_3 ~ P@50_2_3** | 前 K 个结果中「2 和 3 都算相关」的精确率 | +| **MAP_3** | 仅 3 相关时的 Average Precision(单 query) | +| **MAP_2_3** | 2 和 3 都相关时的 Average Precision | + +结果列表: +按行列下来,每行左侧给每个结果找到标注值(三个等级。对结果也可以颜色标记),展示图片,title.en+title.en+首个sku的option1/2/3_value(分三行展示,这三行和左侧的图片并列) + + +评测页面最左侧: +queries默认是queries/queries.txt,填入左侧列表框,点击其中任何一个发起搜索。 + +4. 
批量评估工具 + +给一个批量执行脚本, + +这里要新增一个批量评估的页面。点击批量评估的按钮,对所有搜索词依次发起搜索,最后汇总总体的评估指标,生成报告,报告名称带上时间标记和一些关键信息。并且记录当时的主搜索程序的config.yaml。 +你需要精心地设计如何切换两种模式,通过同一个端口承载这两种不同交互的内容。 +批量评估关注的是所有搜索词总体的评估指标。 +需要记录测试环境时间以及当时的配置文件,以及对应的结果。要保存历次的评估记录,并能查到每一次评估结果对应的配置文件和相关的指标 + +以上是我的总体设计,但有不周全的地方。你要站在更高的层次理解我的需求,你有足够的自由可以适当调整设计,基于你所了解的自动化搜索评估框架的最佳实践,做出更优秀的设计和更好的实现。 + + + + + + +1. 请仔细检验这个标注集的质量,如果质量不符合要求,那么你要优化工具,迭代直至标注集的结果质量足够高,可以以此为自动化工具来评估检索效果,对检索效果形成指导性意见。 +2. 在结果标注集的质量足够好,批量评估工具足够好用,并且经过你的试用,能判断出搜索质量好坏的情况下,开始真正的动手检索效果调优:基于这个50条query的结果标注集和批量评估工具,对融合公式进行调参。请你先精心地设计实验,设计几组参数,对几组参数分别修改config.yaml、重启(./restart.sh backend)、跑批量评估、收集结果。 +注意评估的过程中,如果发现工具不好用,发现日志不全,发现可以通过修改工具或者日志来提高效率,都可以先做这些,根据需要完善。 +注意你是代码的总负责人,你有任何权限来满足你进行检索效果调优的需要。你如果发现有其他可能带来更大提升的点,也可以进行实验,你甚至可以修改融合、重排漏斗的代码,来进行实验,以追求更好的结果指标。 +但是注意,因为受到性能和耗时的约束,不要调大reranker模型的输入条数、不要打开精排,耗时方面无法承受两轮reranker模型的调用。 + + + + + + + + + +@scripts/evaluation/README.md @scripts/evaluation/eval_framework/framework.py +@quick_start_eval.sh (29-35) +请以如下流程为准,进行改造: +如果重建的话,对每个query: +每个搜索结果应该会扫描全库, +1. 搜索结果的top500,纳入召回池,打分全部标记为1 +2. 调用重排模型,扫描全库(tenant_id=163),如果已经在召回池(打分已经是1了),则跳过,其余的全部过 +3. 
对reranker打分超过0.5的大于1000条,则打印一行日志,跳过这个query,表示相关结果太多、容易被满足 + +对如上召回的内容,进行全排序,然后逐批进行llm评判标注(50个一批),每一批都记录exact比例和不相关比例,打印日志。 +直到连续三批不相关比例都大于92%。 +最少要跑15批,最多跑40批 diff --git a/scripts/evaluation/README.md b/scripts/evaluation/README.md index e863339..0488a27 100644 --- a/scripts/evaluation/README.md +++ b/scripts/evaluation/README.md @@ -23,7 +23,7 @@ This directory holds the offline annotation builder, the evaluation web UI/API, | `fusion_experiments_round1.json` | Broader first-round experiments | | `queries/queries.txt` | Canonical evaluation queries | | `README_Requirement.md` | Product/requirements reference | -| `quick_start_eval.sh` | Wrapper: `batch`, `batch-rebuild`, or `serve` | +| `quick_start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` | | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. | ## Quick start (repo root) @@ -34,7 +34,7 @@ Set tenant if needed (`export TENANT_ID=163`). You need a live search API, DashS # Batch: live search for every query; only uncached (query, spu_id) pairs hit the LLM ./scripts/evaluation/quick_start_eval.sh batch -# Full re-label of current top_k recall (expensive) +# Deep rebuild: search recall top-500 (score 1) + full-corpus rerank outside pool + batched LLM (early stop; expensive) ./scripts/evaluation/quick_start_eval.sh batch-rebuild # UI: http://127.0.0.1:6010/ @@ -52,9 +52,15 @@ Explicit equivalents: --language en \ --labeler-mode simple -./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ - ... same args ... 
\ - --force-refresh-labels +./.venv/bin/python scripts/evaluation/build_annotation_set.py build \ + --tenant-id "${TENANT_ID:-163}" \ + --queries-file scripts/evaluation/queries/queries.txt \ + --search-depth 500 \ + --rerank-depth 10000 \ + --force-refresh-rerank \ + --force-refresh-labels \ + --language en \ + --labeler-mode simple ./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \ --tenant-id "${TENANT_ID:-163}" \ @@ -63,7 +69,9 @@ Explicit equivalents: --port 6010 ``` -Each batch run walks the full queries file. With `--force-refresh-labels`, every recalled `spu_id` in the window is re-sent to the LLM and upserted. Without it, only missing labels are filled. +Each `batch` run walks the full queries file. With `batch --force-refresh-labels`, every live top-`k` hit is re-judged by the LLM. + +**Rebuild (`build --force-refresh-labels`):** For each query: take search top **500** as the recall pool (treated as rerank score **1**; those SPUs are not sent to the reranker). Rerank the rest of the tenant corpus; if more than **1000** non-pool docs have rerank score **> 0.5**, the query is **skipped** (logged as too easy / tail too relevant). Otherwise merge pool (search order) + non-pool (rerank score descending), then LLM-judge in batches of **50**, logging **exact_ratio** and **irrelevant_ratio** per batch. Stop after **3** consecutive batches with irrelevant_ratio **> 92%**, but only after at least **15** batches and at most **40** batches. ## Artifacts @@ -87,7 +95,7 @@ Default root: `artifacts/search_evaluation/` **Standard:** Run `batch` without `--force-refresh-labels` to extend coverage, then use the UI or batch in cached mode. Single-query evaluation defaults to **no** auto-annotation: recall still hits the live API; scoring uses SQLite only, and unlabeled hits count as `Irrelevant`. 
-**Deeper pool:** `build_annotation_set.py build` merges deep search and full-corpus rerank windows before labeling (see CLI `--search-depth`, `--rerank-depth`, `--annotate-*-top-k`). +**Incremental pool (no full rebuild):** `build_annotation_set.py build` without `--force-refresh-labels` merges search and full-corpus rerank windows before labeling (CLI `--search-depth`, `--rerank-depth`, `--annotate-*-top-k`). **Full rebuild** uses the recall-pool + rerank-skip + batched early-stop flow above; tune thresholds via `--search-recall-top-k`, `--rerank-high-threshold`, `--rerank-high-skip-count`, `--rebuild-*` flags on `build`. **Fusion tuning:** `tune_fusion.py` writes experiment configs, restarts the backend, runs batch evaluation, and optionally applies the best variant (see `--experiments-file`, `--score-metric`, `--apply-best`). diff --git a/scripts/evaluation/README_Requirement_zh.md b/scripts/evaluation/README_Requirement_zh.md index cdaa439..0f749f1 100644 --- a/scripts/evaluation/README_Requirement_zh.md +++ b/scripts/evaluation/README_Requirement_zh.md @@ -72,12 +72,20 @@ Irrelevant 不相关 — 品类或用途不符,主诉求未满足。 对其中每一个query: 1. 召回: -1)参考搜索接口 召回1k结果。 -2)遍历全库,得到每个spu的title,请求重排模型,进行全排序,得到top1w结果。注意重排模型打分一定要做缓存(本地文件缓存即可。query+title->rerank_score)。 -2. 对以上结果,拆分batch请求llm,进行结果标注。 +1)参考搜索接口 召回结果。搜索结果的top500,纳入召回池,打分全部标记为1 +2)调用重排模型,扫描全库(tenant_id=163),如果已经在召回池(打分已经是1了),则跳过,其余的全部过reranker模型接口调用。每80个doc做一次请求。注意重排模型打分一定要做缓存(本地文件缓存即可。query+title->rerank_score)。 +3)对reranker打分超过0.5的结果数大于1000条的query,则打印一行日志,跳过这个query,表示相关结果太多、容易被满足 + + +2. 对如上召回的内容,进行全排序,然后逐批进行llm评判标注(50个一批),每一批都记录exact比例和不相关比例,打印日志。 +直到连续三批不相关比例都大于92%。 +最少要跑15批,最多跑40批 + 3. 请你思考如何存储结果、并利于以后的对比、使用、展示。 + + 3. 
评估工具页面: 请你设计一个搜索评估交互页面。端口6010。 页面主题:上方是搜索框,如果发起搜索,那么下方给出本次结果的总体指标以及top100结果(允许翻页) diff --git a/scripts/evaluation/eval_framework/cli.py b/scripts/evaluation/eval_framework/cli.py index c3a55bb..c561639 100644 --- a/scripts/evaluation/eval_framework/cli.py +++ b/scripts/evaluation/eval_framework/cli.py @@ -6,7 +6,18 @@ import argparse import json from pathlib import Path -from .constants import DEFAULT_LABELER_MODE, DEFAULT_QUERY_FILE +from .constants import ( + DEFAULT_LABELER_MODE, + DEFAULT_QUERY_FILE, + DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, + DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, + DEFAULT_REBUILD_LLM_BATCH_SIZE, + DEFAULT_REBUILD_MAX_LLM_BATCHES, + DEFAULT_REBUILD_MIN_LLM_BATCHES, + DEFAULT_RERANK_HIGH_SKIP_COUNT, + DEFAULT_RERANK_HIGH_THRESHOLD, + DEFAULT_SEARCH_RECALL_TOP_K, +) from .framework import SearchEvaluationFramework from .utils import ensure_dir, utc_now_iso, utc_timestamp from .web_app import create_web_app @@ -23,6 +34,39 @@ def build_cli_parser() -> argparse.ArgumentParser: build.add_argument("--rerank-depth", type=int, default=10000) build.add_argument("--annotate-search-top-k", type=int, default=120) build.add_argument("--annotate-rerank-top-k", type=int, default=200) + build.add_argument( + "--search-recall-top-k", + type=int, + default=None, + help="Rebuild mode only: top-K search hits enter recall pool with score 1 (default when --force-refresh-labels: 500).", + ) + build.add_argument( + "--rerank-high-threshold", + type=float, + default=None, + help="Rebuild only: count rerank scores above this on non-pool docs (default 0.5).", + ) + build.add_argument( + "--rerank-high-skip-count", + type=int, + default=None, + help="Rebuild only: skip query if more than this many non-pool docs have rerank score > threshold (default 1000).", + ) + build.add_argument("--rebuild-llm-batch-size", type=int, default=None, help="Rebuild only: LLM batch size (default 50).") + build.add_argument("--rebuild-min-batches", type=int, default=None, help="Rebuild 
only: min LLM batches before early stop (default 15).") + build.add_argument("--rebuild-max-batches", type=int, default=None, help="Rebuild only: max LLM batches (default 40).") + build.add_argument( + "--rebuild-irrelevant-stop-ratio", + type=float, + default=None, + help="Rebuild only: irrelevant ratio above this counts toward early-stop streak (default 0.92).", + ) + build.add_argument( + "--rebuild-irrelevant-stop-streak", + type=int, + default=None, + help="Rebuild only: stop after this many consecutive batches above irrelevant ratio (default 3).", + ) build.add_argument("--language", default="en") build.add_argument("--force-refresh-rerank", action="store_true") build.add_argument("--force-refresh-labels", action="store_true") @@ -59,6 +103,22 @@ def run_build(args: argparse.Namespace) -> None: framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) queries = framework.queries_from_file(Path(args.queries_file)) summary = [] + rebuild_kwargs = {} + if args.force_refresh_labels: + rebuild_kwargs = { + "search_recall_top_k": args.search_recall_top_k if args.search_recall_top_k is not None else DEFAULT_SEARCH_RECALL_TOP_K, + "rerank_high_threshold": args.rerank_high_threshold if args.rerank_high_threshold is not None else DEFAULT_RERANK_HIGH_THRESHOLD, + "rerank_high_skip_count": args.rerank_high_skip_count if args.rerank_high_skip_count is not None else DEFAULT_RERANK_HIGH_SKIP_COUNT, + "rebuild_llm_batch_size": args.rebuild_llm_batch_size if args.rebuild_llm_batch_size is not None else DEFAULT_REBUILD_LLM_BATCH_SIZE, + "rebuild_min_batches": args.rebuild_min_batches if args.rebuild_min_batches is not None else DEFAULT_REBUILD_MIN_LLM_BATCHES, + "rebuild_max_batches": args.rebuild_max_batches if args.rebuild_max_batches is not None else DEFAULT_REBUILD_MAX_LLM_BATCHES, + "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio + if args.rebuild_irrelevant_stop_ratio is not None + else 
DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, + "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak + if args.rebuild_irrelevant_stop_streak is not None + else DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, + } for query in queries: result = framework.build_query_annotation_set( query=query, @@ -69,6 +129,7 @@ def run_build(args: argparse.Namespace) -> None: language=args.language, force_refresh_rerank=args.force_refresh_rerank, force_refresh_labels=args.force_refresh_labels, + **rebuild_kwargs, ) summary.append( { diff --git a/scripts/evaluation/eval_framework/constants.py b/scripts/evaluation/eval_framework/constants.py index ad6496d..f0c64f6 100644 --- a/scripts/evaluation/eval_framework/constants.py +++ b/scripts/evaluation/eval_framework/constants.py @@ -17,3 +17,13 @@ DEFAULT_QUERY_FILE = _SCRIPTS_EVAL_DIR / "queries" / "queries.txt" JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" DEFAULT_LABELER_MODE = "simple" + +# Rebuild annotation pool (build --force-refresh-labels): search recall + full-corpus rerank + LLM batches +DEFAULT_SEARCH_RECALL_TOP_K = 500 +DEFAULT_RERANK_HIGH_THRESHOLD = 0.5 +DEFAULT_RERANK_HIGH_SKIP_COUNT = 1000 +DEFAULT_REBUILD_LLM_BATCH_SIZE = 50 +DEFAULT_REBUILD_MIN_LLM_BATCHES = 15 +DEFAULT_REBUILD_MAX_LLM_BATCHES = 40 +DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO = 0.92 +DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK = 3 diff --git a/scripts/evaluation/eval_framework/framework.py b/scripts/evaluation/eval_framework/framework.py index 0e0c5a0..4706894 100644 --- a/scripts/evaluation/eval_framework/framework.py +++ b/scripts/evaluation/eval_framework/framework.py @@ -17,6 +17,14 @@ from .clients import DashScopeLabelClient, RerankServiceClient, SearchServiceCli from .constants import ( DEFAULT_ARTIFACT_ROOT, DEFAULT_LABELER_MODE, + DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, + DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, + DEFAULT_REBUILD_LLM_BATCH_SIZE, + DEFAULT_REBUILD_MAX_LLM_BATCHES, + 
DEFAULT_REBUILD_MIN_LLM_BATCHES, + DEFAULT_RERANK_HIGH_SKIP_COUNT, + DEFAULT_RERANK_HIGH_THRESHOLD, + DEFAULT_SEARCH_RECALL_TOP_K, JUDGE_PROMPT_VERSION_COMPLEX, RELEVANCE_EXACT, RELEVANCE_IRRELEVANT, @@ -345,7 +353,7 @@ class SearchEvaluationFramework: self, query: str, docs: Sequence[Dict[str, Any]], - batch_size: int = 24, + batch_size: int = 80, force_refresh: bool = False, ) -> List[Dict[str, Any]]: cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query) @@ -374,6 +382,52 @@ class SearchEvaluationFramework: ranked.sort(key=lambda item: item["score"], reverse=True) return ranked + def full_corpus_rerank_outside_exclude( + self, + query: str, + docs: Sequence[Dict[str, Any]], + exclude_spu_ids: set[str], + batch_size: int = 80, + force_refresh: bool = False, + ) -> List[Dict[str, Any]]: + """Rerank all corpus docs whose spu_id is not in ``exclude_spu_ids``; excluded IDs are not scored via API.""" + exclude_spu_ids = {str(x) for x in exclude_spu_ids} + cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query) + pending: List[Dict[str, Any]] = [ + doc + for doc in docs + if str(doc.get("spu_id")) not in exclude_spu_ids + and str(doc.get("spu_id")) + and (force_refresh or str(doc.get("spu_id")) not in cached) + ] + if pending: + new_scores: Dict[str, float] = {} + for start in range(0, len(pending), batch_size): + batch = pending[start : start + batch_size] + scores = self._rerank_batch_with_retry(query=query, docs=batch) + if len(scores) != len(batch): + raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs") + for doc, score in zip(batch, scores): + new_scores[str(doc.get("spu_id"))] = float(score) + self.store.upsert_rerank_scores( + self.tenant_id, + query, + new_scores, + model_name="qwen3_vllm_score", + ) + cached.update(new_scores) + + ranked: List[Dict[str, Any]] = [] + for doc in docs: + spu_id = str(doc.get("spu_id") or "") + if not spu_id or spu_id in exclude_spu_ids: + 
continue + ranked.append( + {"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc} + ) + ranked.sort(key=lambda item: item["score"], reverse=True) + return ranked + def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]: if not docs: return [] @@ -447,6 +501,78 @@ class SearchEvaluationFramework: mid = len(docs) // 2 return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh) + def _annotate_rebuild_batches( + self, + query: str, + ordered_docs: Sequence[Dict[str, Any]], + *, + batch_size: int = DEFAULT_REBUILD_LLM_BATCH_SIZE, + min_batches: int = DEFAULT_REBUILD_MIN_LLM_BATCHES, + max_batches: int = DEFAULT_REBUILD_MAX_LLM_BATCHES, + irrelevant_stop_ratio: float = DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, + stop_streak: int = DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, + force_refresh: bool = True, + ) -> Tuple[Dict[str, str], List[Dict[str, Any]]]: + """LLM-label ``ordered_docs`` in fixed-size batches with early stop after enough irrelevant-heavy batches.""" + batch_logs: List[Dict[str, Any]] = [] + streak = 0 + labels: Dict[str, str] = dict(self.store.get_labels(self.tenant_id, query)) + total_ordered = len(ordered_docs) + + for batch_idx in range(max_batches): + start = batch_idx * batch_size + batch_docs = list(ordered_docs[start : start + batch_size]) + if not batch_docs: + break + + batch_pairs = self._classify_with_retry(query, batch_docs, force_refresh=force_refresh) + for sub_labels, raw_response, sub_batch in batch_pairs: + to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)} + self.store.upsert_labels( + self.tenant_id, + query, + to_store, + judge_model=self.label_client.model, + raw_response=raw_response, + ) + labels.update(to_store) + time.sleep(0.1) + + n = len(batch_docs) + exact_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == 
RELEVANCE_EXACT) + irrel_n = sum(1 for doc in batch_docs if labels.get(str(doc.get("spu_id"))) == RELEVANCE_IRRELEVANT) + exact_ratio = exact_n / n if n else 0.0 + irrelevant_ratio = irrel_n / n if n else 0.0 + log_entry = { + "batch_index": batch_idx + 1, + "size": n, + "exact_ratio": round(exact_ratio, 6), + "irrelevant_ratio": round(irrelevant_ratio, 6), + "offset_start": start, + "offset_end": min(start + n, total_ordered), + } + batch_logs.append(log_entry) + print( + f"[eval-rebuild] query={query!r} llm_batch={batch_idx + 1}/{max_batches} " + f"size={n} exact_ratio={exact_ratio:.4f} irrelevant_ratio={irrelevant_ratio:.4f}", + flush=True, + ) + + if batch_idx + 1 >= min_batches: + if irrelevant_ratio > irrelevant_stop_ratio: + streak += 1 + else: + streak = 0 + if streak >= stop_streak: + print( + f"[eval-rebuild] query={query!r} early_stop after {batch_idx + 1} batches " + f"({stop_streak} consecutive batches with irrelevant_ratio > {irrelevant_stop_ratio})", + flush=True, + ) + break + + return labels, batch_logs + def build_query_annotation_set( self, query: str, @@ -458,7 +584,32 @@ class SearchEvaluationFramework: language: str = "en", force_refresh_rerank: bool = False, force_refresh_labels: bool = False, + search_recall_top_k: int = DEFAULT_SEARCH_RECALL_TOP_K, + rerank_high_threshold: float = DEFAULT_RERANK_HIGH_THRESHOLD, + rerank_high_skip_count: int = DEFAULT_RERANK_HIGH_SKIP_COUNT, + rebuild_llm_batch_size: int = DEFAULT_REBUILD_LLM_BATCH_SIZE, + rebuild_min_batches: int = DEFAULT_REBUILD_MIN_LLM_BATCHES, + rebuild_max_batches: int = DEFAULT_REBUILD_MAX_LLM_BATCHES, + rebuild_irrelevant_stop_ratio: float = DEFAULT_REBUILD_IRRELEVANT_STOP_RATIO, + rebuild_irrelevant_stop_streak: int = DEFAULT_REBUILD_IRRELEVANT_STOP_STREAK, ) -> QueryBuildResult: + if force_refresh_labels: + return self._build_query_annotation_set_rebuild( + query=query, + search_depth=search_depth, + rerank_depth=rerank_depth, + language=language, + 
force_refresh_rerank=force_refresh_rerank, + search_recall_top_k=search_recall_top_k, + rerank_high_threshold=rerank_high_threshold, + rerank_high_skip_count=rerank_high_skip_count, + rebuild_llm_batch_size=rebuild_llm_batch_size, + rebuild_min_batches=rebuild_min_batches, + rebuild_max_batches=rebuild_max_batches, + rebuild_irrelevant_stop_ratio=rebuild_irrelevant_stop_ratio, + rebuild_irrelevant_stop_streak=rebuild_irrelevant_stop_streak, + ) + search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language) search_results = list(search_payload.get("results") or []) corpus = self.corpus_docs(refresh=False) @@ -558,6 +709,182 @@ class SearchEvaluationFramework: output_json_path=output_json_path, ) + def _build_query_annotation_set_rebuild( + self, + query: str, + *, + search_depth: int, + rerank_depth: int, + language: str, + force_refresh_rerank: bool, + search_recall_top_k: int, + rerank_high_threshold: float, + rerank_high_skip_count: int, + rebuild_llm_batch_size: int, + rebuild_min_batches: int, + rebuild_max_batches: int, + rebuild_irrelevant_stop_ratio: float, + rebuild_irrelevant_stop_streak: int, + ) -> QueryBuildResult: + search_size = max(int(search_depth), int(search_recall_top_k)) + search_payload = self.search_client.search(query=query, size=search_size, from_=0, language=language) + search_results = list(search_payload.get("results") or []) + recall_n = min(int(search_recall_top_k), len(search_results)) + pool_search_docs = search_results[:recall_n] + pool_spu_ids = {str(d.get("spu_id")) for d in pool_search_docs if str(d.get("spu_id") or "").strip()} + + corpus = self.corpus_docs(refresh=False) + corpus_by_id = {str(d.get("spu_id")): d for d in corpus if str(d.get("spu_id") or "").strip()} + + ranked_outside = self.full_corpus_rerank_outside_exclude( + query=query, + docs=corpus, + exclude_spu_ids=pool_spu_ids, + force_refresh=force_refresh_rerank, + ) + rerank_high_n = sum(1 for item in ranked_outside if 
float(item["score"]) > float(rerank_high_threshold)) + + rebuild_meta: Dict[str, Any] = { + "mode": "rebuild_v1", + "search_recall_top_k": search_recall_top_k, + "recall_pool_size": len(pool_spu_ids), + "pool_rerank_score_assigned": 1.0, + "rerank_high_threshold": rerank_high_threshold, + "rerank_high_count_outside_pool": rerank_high_n, + "rerank_high_skip_count": rerank_high_skip_count, + "rebuild_llm_batch_size": rebuild_llm_batch_size, + "rebuild_min_batches": rebuild_min_batches, + "rebuild_max_batches": rebuild_max_batches, + "rebuild_irrelevant_stop_ratio": rebuild_irrelevant_stop_ratio, + "rebuild_irrelevant_stop_streak": rebuild_irrelevant_stop_streak, + } + + batch_logs: List[Dict[str, Any]] = [] + skipped = False + skip_reason: str | None = None + labels: Dict[str, str] = dict(self.store.get_labels(self.tenant_id, query)) + llm_labeled_total = 0 + + if rerank_high_n > int(rerank_high_skip_count): + skipped = True + skip_reason = "too_many_high_rerank_scores" + print( + f"[eval-rebuild] query={query!r} skip: rerank_score>{rerank_high_threshold} " + f"outside recall pool count={rerank_high_n} > {rerank_high_skip_count} " + f"(relevant tail too large / query too easy to satisfy)", + flush=True, + ) + else: + ordered_docs: List[Dict[str, Any]] = [] + seen_ordered: set[str] = set() + for doc in pool_search_docs: + sid = str(doc.get("spu_id") or "") + if not sid or sid in seen_ordered: + continue + seen_ordered.add(sid) + ordered_docs.append(corpus_by_id.get(sid, doc)) + for item in ranked_outside: + sid = str(item["spu_id"]) + if sid in seen_ordered: + continue + seen_ordered.add(sid) + ordered_docs.append(item["doc"]) + + labels, batch_logs = self._annotate_rebuild_batches( + query, + ordered_docs, + batch_size=rebuild_llm_batch_size, + min_batches=rebuild_min_batches, + max_batches=rebuild_max_batches, + irrelevant_stop_ratio=rebuild_irrelevant_stop_ratio, + stop_streak=rebuild_irrelevant_stop_streak, + force_refresh=True, + ) + llm_labeled_total = 
sum(int(entry.get("size") or 0) for entry in batch_logs) + + rebuild_meta["skipped"] = skipped + rebuild_meta["skip_reason"] = skip_reason + rebuild_meta["llm_batch_logs"] = batch_logs + rebuild_meta["llm_labeled_total"] = llm_labeled_total + + rerank_depth_effective = min(int(rerank_depth), len(ranked_outside)) + search_labeled_results: List[Dict[str, Any]] = [] + for rank, doc in enumerate(search_results, start=1): + spu_id = str(doc.get("spu_id")) + in_pool = rank <= recall_n + search_labeled_results.append( + { + "rank": rank, + "spu_id": spu_id, + "title": build_display_title(doc), + "image_url": doc.get("image_url"), + "rerank_score": 1.0 if in_pool else None, + "label": labels.get(spu_id), + "option_values": list(compact_option_values(doc.get("skus") or [])), + "product": compact_product_payload(doc), + } + ) + + rerank_top_results: List[Dict[str, Any]] = [] + for rank, item in enumerate(ranked_outside[:rerank_depth_effective], start=1): + doc = item["doc"] + spu_id = str(item["spu_id"]) + rerank_top_results.append( + { + "rank": rank, + "spu_id": spu_id, + "title": build_display_title(doc), + "image_url": doc.get("image_url"), + "rerank_score": round(float(item["score"]), 8), + "label": labels.get(spu_id), + "option_values": list(compact_option_values(doc.get("skus") or [])), + "product": compact_product_payload(doc), + } + ) + + top100_labels = [ + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT + for item in search_labeled_results[:100] + ] + metrics = compute_query_metrics(top100_labels) + output_dir = ensure_dir(self.artifact_root / "query_builds") + run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}" + output_json_path = output_dir / f"{run_id}.json" + pool_docs_count = len(pool_spu_ids) + len(ranked_outside) + payload = { + "run_id": run_id, + "created_at": utc_now_iso(), + "tenant_id": self.tenant_id, + "query": query, + "config_meta": requests.get("http://localhost:6002/admin/config/meta", 
timeout=20).json(), + "search_total": int(search_payload.get("total") or 0), + "search_depth_requested": search_depth, + "search_depth_effective": len(search_results), + "rerank_depth_requested": rerank_depth, + "rerank_depth_effective": rerank_depth_effective, + "corpus_size": len(corpus), + "annotation_pool": { + "rebuild": rebuild_meta, + "ordered_union_size": pool_docs_count, + }, + "labeler_mode": self.labeler_mode, + "query_profile": self.get_query_profile(query, force_refresh=False) if self.labeler_mode == "complex" else None, + "metrics_top100": metrics, + "search_results": search_labeled_results, + "full_rerank_top": rerank_top_results, + } + output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8") + self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"]) + return QueryBuildResult( + query=query, + tenant_id=self.tenant_id, + search_total=int(search_payload.get("total") or 0), + search_depth=len(search_results), + rerank_corpus_size=len(corpus), + annotated_count=llm_labeled_total if not skipped else 0, + output_json_path=output_json_path, + ) + def evaluate_live_query( self, query: str, diff --git a/scripts/evaluation/quick_start_eval.sh b/scripts/evaluation/quick_start_eval.sh index ad162d7..0bcbf32 100755 --- a/scripts/evaluation/quick_start_eval.sh +++ b/scripts/evaluation/quick_start_eval.sh @@ -11,7 +11,7 @@ QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" usage() { echo "Usage: $0 batch|batch-rebuild|serve" echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50, simple)" - echo " batch-rebuild — same as batch but --force-refresh-labels (re-LLM all top_k hits; expensive, overwrites cache)" + echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)" echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: 
./scripts/start_eval_web.sh)" echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)" } @@ -26,13 +26,15 @@ case "${1:-}" in --labeler-mode simple ;; batch-rebuild) - exec "$PY" scripts/evaluation/build_annotation_set.py batch \ + exec "$PY" scripts/evaluation/build_annotation_set.py build \ --tenant-id "$TENANT_ID" \ --queries-file "$QUERIES" \ - --top-k 50 \ + --search-depth 500 \ + --rerank-depth 10000 \ + --force-refresh-rerank \ + --force-refresh-labels \ --language en \ - --labeler-mode simple \ - --force-refresh-labels + --labeler-mode simple ;; serve) EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}" -- libgit2 0.21.2