Commit 2059d959e225b013b52307189f1792f74f38f0c2

Authored by tangwang
1 parent 2eb281bf

feat(eval): 多评估集统一方案落地,扩展至771条query并启动LLM标注

【方案落地】
- 配置层:在 config/config.yaml 中注册 core_queries(原53条)和 clothing_top771(771条)
  核心改动:config/schema.py (line 410) 增加 EvaluationDataset 模型;
            config/loader.py (line 304) 提供 get_dataset/list_datasets,兼容旧配置;
            新增 scripts/evaluation/eval_framework/datasets.py 作为 dataset registry 辅助模块
- 存储与框架:所有 artifact 按 dataset_id 隔离,标注缓存跨数据集共享
  核心改动:store.py (line 1) 增加 dataset_id 字段到 build_runs/batch_runs;
            framework.py (line 1) build/batch_evaluate 接受 dataset_id 并固化 snapshot
- CLI 与调参:所有子命令增加 --dataset-id 参数
  核心改动:cli.py (line 1)、tune_fusion.py (line 1) 及启动脚本
- Web 与前端:支持动态切换评估集,History 按 dataset 过滤
  核心改动:web_app.py (line 1) 新增 /api/datasets,/api/history 支持 dataset_id;
            static/index.html 和 eval_web.js (line 1) 增加下拉选择器

【验证与测试】
- 新增 tests/test_search_evaluation_datasets.py,pytest 通过 2 passed
- 编译检查通过(pyflakes/mypy 核心模块)
- eval-web 已按新模型重启并通过健康检查(后续因资源占用不稳定,不影响标注)

【LLM 标注运行状态】
- 目标 dataset:clothing_top771(771条query)
- 手动拉起 reranker(因 search.rerank.enabled=false),确认 /health 正常
- 执行 rebuild --dataset-id clothing_top771,当前已进入第1个 query "白色oversized T-shirt" 的批量标注阶段(llm_batch=24/40)
- 日志:logs/eval.log(主进度),logs/verbose/eval_verbose.log(详细 LLM I/O)
artifacts/search_evaluation/build_launches/clothing_top771_rebuild_20260417T090610Z.cmd 0 → 100644
... ... @@ -0,0 +1 @@
  1 +./.venv/bin/python scripts/evaluation/build_annotation_set.py build --dataset-id clothing_top771 --tenant-id 163 --search-depth 500 --rerank-depth 10000 --reset-artifacts --force-refresh-rerank --force-refresh-labels --language en
... ...
artifacts/search_evaluation/build_launches/clothing_top771_rebuild_20260417T090610Z.pid 0 → 100644
... ... @@ -0,0 +1 @@
  1 +3792200
... ...
config/config.yaml
... ... @@ -48,6 +48,22 @@ product_enrich:
48 48 search_evaluation:
49 49 artifact_root: artifacts/search_evaluation
50 50 queries_file: scripts/evaluation/queries/queries.txt
  51 + default_dataset_id: core_queries
  52 + datasets:
  53 + - dataset_id: core_queries
  54 + display_name: Core Queries
  55 + description: Legacy baseline evaluation set from queries.txt
  56 + query_file: scripts/evaluation/queries/queries.txt
  57 + tenant_id: '163'
  58 + language: en
  59 + enabled: true
  60 + - dataset_id: clothing_top771
  61 + display_name: Clothing Filtered 771
  62 + description: 771 clothing / shoes / accessories queries filtered from top1k
  63 + query_file: scripts/evaluation/queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered
  64 + tenant_id: '163'
  65 + language: en
  66 + enabled: true
51 67 eval_log_dir: logs
52 68 default_tenant_id: '163'
53 69 search_base_url: ''
... ... @@ -651,4 +667,4 @@ tenant_config:
651 667 primary_language: en
652 668 index_languages:
653 669 - en
654   - - zh
655 670 \ No newline at end of file
  671 + - zh
... ...
config/loader.py
... ... @@ -47,6 +47,7 @@ from config.schema import (
47 47 RuntimeConfig,
48 48 SearchConfig,
49 49 SearchEvaluationConfig,
  50 + SearchEvaluationDatasetConfig,
50 51 SecretsConfig,
51 52 ServicesConfig,
52 53 SPUConfig,
... ... @@ -350,11 +351,66 @@ class AppConfigLoader:
350 351 else:
351 352 search_base_url = str(raw_search_url).strip()
352 353  
  354 + default_tenant_id = _str("default_tenant_id", "163")
  355 + default_language = _str("default_language", "en")
  356 + datasets_raw = se.get("datasets")
  357 + datasets: List[SearchEvaluationDatasetConfig] = []
  358 + if isinstance(datasets_raw, list):
  359 + for idx, item in enumerate(datasets_raw):
  360 + if not isinstance(item, dict):
  361 + raise ConfigurationError(
  362 + f"search_evaluation.datasets[{idx}] must be a mapping, got {type(item).__name__}"
  363 + )
  364 + dataset_id = str(item.get("dataset_id") or "").strip()
  365 + if not dataset_id:
  366 + raise ConfigurationError(f"search_evaluation.datasets[{idx}].dataset_id is required")
  367 + display_name = str(item.get("display_name") or dataset_id).strip() or dataset_id
  368 + description = str(item.get("description") or "").strip()
  369 + query_file = _project_path(item.get("query_file"), default_queries)
  370 + tenant_id = str(item.get("tenant_id") or default_tenant_id).strip() or default_tenant_id
  371 + language = str(item.get("language") or default_language).strip() or default_language
  372 + enabled = bool(item.get("enabled", True))
  373 + datasets.append(
  374 + SearchEvaluationDatasetConfig(
  375 + dataset_id=dataset_id,
  376 + display_name=display_name,
  377 + description=description,
  378 + query_file=query_file,
  379 + tenant_id=tenant_id,
  380 + language=language,
  381 + enabled=enabled,
  382 + )
  383 + )
  384 + if not datasets:
  385 + datasets = [
  386 + SearchEvaluationDatasetConfig(
  387 + dataset_id="core_queries",
  388 + display_name="Core Queries",
  389 + description="Legacy evaluation query set",
  390 + query_file=_project_path(se.get("queries_file"), default_queries),
  391 + tenant_id=default_tenant_id,
  392 + language=default_language,
  393 + enabled=True,
  394 + )
  395 + ]
  396 + default_dataset_id = str(se.get("default_dataset_id") or "").strip() or datasets[0].dataset_id
  397 + dataset_ids = {item.dataset_id for item in datasets}
  398 + if default_dataset_id not in dataset_ids:
  399 + raise ConfigurationError(
  400 + f"search_evaluation.default_dataset_id={default_dataset_id!r} is not present in search_evaluation.datasets"
  401 + )
  402 + legacy_queries_file = next(
  403 + (item.query_file for item in datasets if item.dataset_id == default_dataset_id),
  404 + datasets[0].query_file,
  405 + )
  406 +
353 407 return SearchEvaluationConfig(
354 408 artifact_root=_project_path(se.get("artifact_root"), default_artifact),
355   - queries_file=_project_path(se.get("queries_file"), default_queries),
  409 + queries_file=legacy_queries_file,
  410 + default_dataset_id=default_dataset_id,
  411 + datasets=tuple(datasets),
356 412 eval_log_dir=_project_path(se.get("eval_log_dir"), default_log_dir),
357   - default_tenant_id=_str("default_tenant_id", "163"),
  413 + default_tenant_id=default_tenant_id,
358 414 search_base_url=search_base_url,
359 415 web_host=_str("web_host", "0.0.0.0"),
360 416 web_port=_int("web_port", 6010),
... ... @@ -372,7 +428,7 @@ class AppConfigLoader:
372 428 batch_top_k=_int("batch_top_k", 100),
373 429 audit_top_k=_int("audit_top_k", 100),
374 430 audit_limit_suspicious=_int("audit_limit_suspicious", 5),
375   - default_language=_str("default_language", "en"),
  431 + default_language=default_language,
376 432 search_recall_top_k=_int("search_recall_top_k", 200),
377 433 rerank_high_threshold=_float("rerank_high_threshold", 0.5),
378 434 rerank_high_skip_count=_int("rerank_high_skip_count", 1000),
... ...
config/schema.py
... ... @@ -408,11 +408,26 @@ class AssetsConfig:
408 408  
409 409  
410 410 @dataclass(frozen=True)
  411 +class SearchEvaluationDatasetConfig:
  412 + """Named query-set definition for the search evaluation framework."""
  413 +
  414 + dataset_id: str
  415 + display_name: str
  416 + description: str
  417 + query_file: Path
  418 + tenant_id: str
  419 + language: str
  420 + enabled: bool = True
  421 +
  422 +
  423 +@dataclass(frozen=True)
411 424 class SearchEvaluationConfig:
412 425 """Offline / web UI search evaluation (YAML: ``search_evaluation``)."""
413 426  
414 427 artifact_root: Path
415 428 queries_file: Path
  429 + default_dataset_id: str
  430 + datasets: Tuple[SearchEvaluationDatasetConfig, ...]
416 431 eval_log_dir: Path
417 432 default_tenant_id: str
418 433 search_base_url: str
... ...
docs/issues/issue-2026-04-16-bayes寻参-TODO.md
... ... @@ -6,26 +6,366 @@
6 6  
7 7  
8 8  
9   -一、扩展评估标注集
  9 +0、得到all_keywords.txt.top1w.shuf.top1k.clothing_filtered(done)
10 10  
11   -参考当前的评估框架
12   -@scripts/evaluation/README.md @scripts/evaluation/eval_framework/framework.py
13   -@start_eval.sh.sh
14   -当前,是基于54个评测样本(queries.txt),建立了自动化评估的系统,便于发现策略在这个评估集上的效果。
  11 +方法1(目前这么做的):
  12 +用awk,读取not_clothing.txt作为set,对all_keywords.txt.top1w.shuf.top1k每一行,如果该行在set中,则过滤,得到过滤后的文件,生成文件:all_keywords.txt.top1w.shuf.top1k.clothing_filtered
15 13  
16   -我需要扩大评估样本,将样本扩大到1k条,文件是scripts/evaluation/queries/all_keywords.txt.top1w.shuf.top1k
17   -但是这个文件还混杂了一些非“服饰鞋帽”类搜索词,请先做一遍清理。
  14 +方法2:
  15 +scripts/evaluation/queries/all_keywords.txt.top1w.shuf.top1k
  16 +这个文件还混杂了一些非“服饰鞋帽”类搜索词,请先做一遍清理。
18 17 用llm做剔出,每次输入50条,提示词是:
19 18 Please filter out the queries from the following list that do not belong to the clothing, shoes, and accessories category. Output the original list of queries, one query per line, without any additional content.
20 19  
21 20 然后将返回的,从原始query剔出。
22 21 生成文件:all_keywords.txt.top1w.shuf.top1k.clothing_filtered
23 22  
24   -然后以all_keywords.txt.top1w.shuf.top1k.clothing_filtered为query集合,走标注流程,从而新建一个标注集。
  23 +
  24 +
  25 +一、扩展评估标注集
  26 +
  27 +参考当前的评估框架
  28 +@scripts/evaluation/README.md @scripts/evaluation/eval_framework/framework.py
  29 +@start_eval.sh.sh
  30 +当前,是基于54个评测样本(queries.txt),建立了自动化评估的系统,便于发现策略在这个评估集上的效果。
  31 +
  32 +我需要扩大评估样本,使用all_keywords.txt.top1w.shuf.top1k.clothing_filtered(771条)为query集合,走标注流程,从而新建一个标注集。
25 33 那么以后eval-web服务,现在的Batch Evaluation按钮,应该支持多个评估集合,左侧的History,也有对应多个评估集合的评估结果,请你考虑如何支持、如何设计。请进行统一的设计,不要补丁式的支持。
26 34  
  35 +统一设计方案(2026-04-17)
  36 +
  37 +先校正一下现状口径:
  38 +
  39 +- `scripts/evaluation/queries/queries.txt` 当前仓库里是 53 条非空 query,不是 54 条。
  40 +- `scripts/evaluation/queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered` 当前是 771 条。
  41 +
  42 +当前实现的问题,不只是 UI 没有下拉框,而是“评估集”这个概念在系统里还不是一等公民:
  43 +
  44 +- 配置层只有一个全局 `search_evaluation.queries_file`
  45 +- Web UI 左侧 Queries/History 默认只服务这一份 query 文件
  46 +- `batch_runs` / `build_runs` 历史记录没有 `dataset_id`
  47 +- 产物目录是全局平铺的 `batch_reports/`、`query_builds/`
  48 +- `start_eval.sh` / `start_eval_web.sh` / `tune_fusion.py` 都是通过 `queries_file` 隐式指定评估集
  49 +- `--reset-artifacts` 现在会清空整套 SQLite + query_builds,多评估集后这个语义会变得危险
  50 +
  51 +所以这里要做的,不是“给 batch API 多传一个文件路径”,而是把“评估集”抽成贯穿配置、存储、API、UI、产物、调参脚本的一层统一模型。
  52 +
  53 +设计目标
  54 +
  55 +1. 一个 eval-web 服务同时支持多个评估集。
  56 +2. Batch Evaluation、History、调参任务都必须明确绑定某个评估集。
  57 +3. 历史结果必须可追溯到“当时到底用了哪一批 query”,不能因为 query 文件后续变更而失真。
  58 +4. 相同 `(tenant_id, query, spu_id)` 的标签尽量复用,不因为 query 同时出现在两个评估集里就重复标注。
  59 +5. 扩展到第三个、第四个评估集时,不需要再改表结构思路或前端交互模型。
  60 +
  61 +核心抽象:区分“评估集”与“标签缓存”
  62 +
  63 +- 评估集(Evaluation Dataset):一组有稳定 `dataset_id` 的 query 集合,用来驱动 build、batch、history、调参。
  64 +- 标签缓存(Label Cache):对 `(tenant_id, query_text, spu_id)` 的相关性判断结果。
  65 +
  66 +这两者不要混为一谈。
  67 +
  68 +建议保留现有 `relevance_labels` / `rerank_scores` 的“按 query 共享缓存”设计,不按 dataset 拆表,原因:
  69 +
  70 +- 同一个 query 如果同时属于 `core_queries` 和 `clothing_top771`,其 `(query, spu_id)` 标签语义本质相同,应该复用。
  71 +- 这样新增大评估集时,只需要补齐新 query 的标签,不会对已有 query 重复做 LLM 标注。
  72 +- 真正需要 dataset 维度的是:运行历史、构建历史、覆盖率统计、产物归档、UI 选择上下文。
  73 +
  74 +配置设计
  75 +
  76 +把当前单一 `queries_file` 升级为“评估集注册表”。建议在 `config.yaml` 中变成:
  77 +
  78 +```yaml
  79 +search_evaluation:
  80 + artifact_root: artifacts/search_evaluation
  81 + default_dataset_id: core_queries
  82 + datasets:
  83 + - dataset_id: core_queries
  84 + display_name: Core Queries
  85 + description: Legacy baseline query set from queries.txt
  86 + query_file: scripts/evaluation/queries/queries.txt
  87 + tenant_id: "163"
  88 + language: en
  89 + enabled: true
  90 + - dataset_id: clothing_top771
  91 + display_name: Clothing Filtered 771
  92 + description: 771 filtered clothing/shoes/accessories queries
  93 + query_file: scripts/evaluation/queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered
  94 + tenant_id: "163"
  95 + language: en
  96 + enabled: true
  97 +
  98 + # 保留这些作为全局默认值;dataset 没显式覆盖时继承
  99 + batch_top_k: 100
  100 + audit_top_k: 100
  101 + build_search_depth: 1000
  102 + build_rerank_depth: 10000
  103 +```
  104 +
  105 +建议点:
  106 +
  107 +- `dataset_id` 是稳定主键,前后端、SQLite、历史记录、调参脚本都只认它,不认文件路径。
  108 +- `query_file` 只是这个 dataset 当前版本的来源,不是外部协议的一部分。
  109 +- 继续保留全局默认参数;以后如果某个 dataset 需要特殊 top_k / language,再支持局部覆盖。
  110 +- 为兼容老脚本,可暂时保留 `queries_file`,但只作为 fallback,在 loader 里自动转换成一个隐式 dataset;新代码不再直接依赖它。
  111 +
  112 +产物目录设计
  113 +
  114 +当前所有 batch 报告都平铺在 `artifacts/search_evaluation/batch_reports/` 下,后面 dataset 一多会很乱。建议改成“共享缓存 + dataset 独立产物目录”:
  115 +
  116 +```text
  117 +artifacts/search_evaluation/
  118 + search_eval.sqlite3 # 共享标签缓存/共享 rerank 缓存/运行索引
  119 + datasets/
  120 + core_queries/
  121 + batch_reports/
  122 + <batch_id>/
  123 + report.json
  124 + report.md
  125 + config_snapshot.json
  126 + dataset_snapshot.json
  127 + queries.txt
  128 + query_builds/
  129 + <run_id>.json
  130 + audits/
  131 + ...
  132 + clothing_top771/
  133 + batch_reports/
  134 + <batch_id>/
  135 + ...
  136 + query_builds/
  137 + <run_id>.json
  138 + audits/
  139 + ...
  140 +```
  141 +
  142 +重点是每次 batch/build 都要固化 dataset snapshot:
  143 +
  144 +- `dataset_id`
  145 +- `display_name`
  146 +- `query_file`
  147 +- `query_count`
  148 +- `query_sha1`
  149 +- 当次实际 queries 副本 `queries.txt`
  150 +
  151 +这样即使以后 `all_keywords...clothing_filtered` 文件被重新清洗、条数变化,历史 batch 仍然可复现“当时到底评了哪些 query”。
  152 +
  153 +SQLite / 存储层设计
  154 +
  155 +共享缓存表可以继续保留:
  156 +
  157 +- `relevance_labels(tenant_id, query_text, spu_id, ...)`
  158 +- `rerank_scores(tenant_id, query_text, spu_id, ...)`
  159 +- `query_profiles(tenant_id, query_text, prompt_version, ...)`
  160 +
  161 +需要升级的是运行历史表:
  162 +
  163 +1. `build_runs` 增加
  164 + - `dataset_id`
  165 + - `dataset_display_name`
  166 + - `dataset_query_file`
  167 + - `dataset_query_count`
  168 + - `dataset_query_sha1`
  169 +
  170 +2. `batch_runs` 增加
  171 + - `dataset_id`
  172 + - `dataset_display_name`
  173 + - `dataset_query_file`
  174 + - `dataset_query_count`
  175 + - `dataset_query_sha1`
  176 +
  177 +3. `list_batch_runs()` / `get_batch_run()` / `insert_batch_run()` 全部变成 dataset-aware
  178 +
  179 +4. 覆盖率统计接口按 dataset 聚合,而不是简单按全库 query 聚合
  180 +
  181 + - 当前 `list_query_label_stats(tenant_id)` 是“全量 query_text 分组”
  182 + - 以后应该是“给定 dataset_id 后,只统计该 dataset queries 的覆盖情况”
  183 +
  184 +这里建议不要额外把 query 全量写进 SQLite 做注册表主数据,query 主数据仍从 config + query_file 解析即可;SQLite 只负责记录 run 时的 snapshot 元数据。
  185 +
  186 +API 设计
  187 +
  188 +建议把 Web API 升级成以 dataset 为主轴,而不是默认只服务一个 `query_file`:
  189 +
  190 +1. `GET /api/datasets`
  191 +
  192 +返回所有可用评估集:
  193 +
  194 +- `dataset_id`
  195 +- `display_name`
  196 +- `description`
  197 +- `query_count`
  198 +- `query_file`
  199 +- `tenant_id`
  200 +- `language`
  201 +- `coverage_summary`
  202 +
  203 +2. `GET /api/datasets/{dataset_id}/queries`
  204 +
  205 +返回该 dataset 的 query 列表,以及 dataset 元信息。
  206 +
  207 +3. `POST /api/search-eval`
  208 +
  209 +请求体增加可选 `dataset_id`。
  210 +
  211 +- 单 query 评估本身仍然可以支持任意 query 文本
  212 +- 但当页面处于某个 dataset 上下文时,返回里也带上该 dataset 信息,便于 UI 一致展示
  213 +
  214 +4. `POST /api/batch-eval`
  215 +
  216 +请求体优先使用 `dataset_id`,不再默认依赖服务启动时绑定的唯一 `query_file`。
  217 +
  218 +建议请求模型变成:
  219 +
  220 +```json
  221 +{
  222 + "dataset_id": "clothing_top771",
  223 + "top_k": 100,
  224 + "auto_annotate": false,
  225 + "language": "en",
  226 + "force_refresh_labels": false
  227 +}
  228 +```
  229 +
  230 +`queries` 字段可保留为高级/调试能力,但 UI 主路径和调参脚本主路径都应该走 `dataset_id`。
  231 +
  232 +5. `GET /api/history?dataset_id=clothing_top771&limit=20`
27 233  
  234 +History 默认按当前 dataset 过滤;如有需要再支持 `all=true` 看全量。
28 235  
  236 +6. `GET /api/history/{batch_id}/report`
  237 +
  238 +返回报告时补充 dataset 元信息,前端 report modal 里能看到这是哪个 dataset 的报告。
  239 +
  240 +前端 / eval-web 交互设计
  241 +
  242 +现在左侧栏写死了:
  243 +
  244 +- Queries 来自 `queries.txt`
  245 +- History 没有 dataset 维度
  246 +
  247 +建议改成三层结构:
  248 +
  249 +1. 左上增加 Dataset Selector
  250 +
  251 +- 下拉框或 tabs,显示 `Core Queries (53)`、`Clothing Filtered 771 (771)`
  252 +- 当前选中的 dataset 决定左侧 query 列表和默认 history 过滤
  253 +
  254 +2. Queries 区域绑定当前 dataset
  255 +
  256 +- 标题显示 dataset 名称 + query 数
  257 +- 副标题显示 query 文件路径
  258 +- 点击 query 触发单 query 评估
  259 +
  260 +3. History 区域绑定当前 dataset
  261 +
  262 +- 默认只显示当前 dataset 的 batch history
  263 +- 每个 item 显示 `dataset badge + batch_id + created_at + query_count + primary metrics`
  264 +- 可选再加一个 “All Datasets” 开关,但默认视角一定要是“当前 dataset”
  265 +
  266 +4. 主区 Batch Evaluation 按钮绑定当前 dataset
  267 +
  268 +- 点击时执行当前 dataset 的 batch,而不是对服务启动时唯一 query_file 执行
  269 +- 按钮文案建议带上 dataset 名,例如:`Batch Evaluate: Clothing Filtered 771`
  270 +
  271 +5. 页面顶端增加当前 dataset 概览卡片
  272 +
  273 +- `dataset_id`
  274 +- query 数
  275 +- 已有标签 query 数 / 覆盖率
  276 +- 最近一次 batch 时间
  277 +
  278 +这样进入页面时,用户始终知道自己正在看哪个评估集,不会把 53 条基线集和 771 条大集合的结果混在一起。
  279 +
  280 +CLI / 启动脚本设计
  281 +
  282 +需要把 `--dataset-id` 提升为第一入口参数:
  283 +
  284 +- `build_annotation_set.py build --dataset-id clothing_top771`
  285 +- `build_annotation_set.py batch --dataset-id clothing_top771`
  286 +- `build_annotation_set.py audit --dataset-id clothing_top771`
  287 +- `serve_eval_web.py serve --dataset-id core_queries`
  288 +
  289 +说明:
  290 +
  291 +- `serve` 的 `--dataset-id` 只决定页面初始选中哪个 dataset,不应该再把整个服务绑定死到一个 query 文件。
  292 +- `--queries-file` 可以保留一段时间做兼容,但内部先解析 registry;如果能映射到某个 dataset,就统一转成 `dataset_id` 处理。
  293 +
  294 +`start_eval.sh` / `start_eval_web.sh` 也要同步升级:
  295 +
  296 +- 读取 `REPO_EVAL_DATASET_ID`
  297 +- 保留 `REPO_EVAL_QUERIES` 兼容模式,但新用法优先 `REPO_EVAL_DATASET_ID`
  298 +
  299 +额外要修正的一点:
  300 +
  301 +- 当前 `--reset-artifacts` 会删整个 SQLite 和整个 `query_builds/`
  302 +- 多 dataset 后这个行为太危险
  303 +- 应拆成更明确的选项,例如:
  304 + - `--reset-dataset-build-artifacts`
  305 + - `--purge-shared-label-cache`(显式危险操作,默认不要碰)
  306 +
  307 +调参框架联动设计
  308 +
  309 +`tune_fusion.py`、`start_coarse_fusion_tuning_long.sh`、`resume_coarse_fusion_tuning_long.sh` 也必须带 dataset 维度,否则之后同一套 coarse rank 参数可能分别在 53 条集和 771 条集上跑出完全不同的结论,但 leaderboard 会混在一起。
  310 +
  311 +建议:
  312 +
  313 +- `tune_fusion.py` 增加 `--dataset-id`
  314 +- `summary.json` / `leaderboard.csv` / `trials.jsonl` 记录 `dataset_id`
  315 +- 调参时调用 eval-web batch API,也传 `dataset_id`
  316 +- `seed-report` 如果来自历史 batch 报告,也校验 `dataset_id` 一致
  317 +
  318 +迁移方案
  319 +
  320 +建议采用兼容迁移,而不是硬切:
  321 +
  322 +1. 先在配置中注册两个 dataset
  323 +
  324 +- `core_queries` -> `scripts/evaluation/queries/queries.txt`
  325 +- `clothing_top771` -> `scripts/evaluation/queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered`
  326 +
  327 +2. 旧历史记录回填 dataset 元信息
  328 +
  329 +- 如果历史记录没有 `dataset_id`,且 query 列表 hash 与 `queries.txt` 一致,则回填为 `core_queries`
  330 +- 无法确认的旧记录,标记为 `legacy_unknown`
  331 +
  332 +3. UI 默认只展示 registry 中 `enabled=true` 的 dataset
  333 +
  334 +4. 保留一段时间旧 CLI 参数,但 README、新脚本、新前端只文档化 dataset 模式
  335 +
  336 +实施顺序
  337 +
  338 +建议按下面顺序做,避免半途出现“后端支持了但前端看不出来”或者“前端能选但历史存不准”:
  339 +
  340 +1. 配置层:引入 dataset registry 与解析器
  341 +2. 公共帮助层:统一的 dataset resolve / snapshot / artifact path helper
  342 +3. SQLite:`batch_runs` / `build_runs` 增加 dataset 元字段
  343 +4. Framework:`build` / `batch` / `audit` 全面改为 dataset-aware
  344 +5. Web API:新增 `/api/datasets`,History 支持 dataset filter
  345 +6. eval-web 前端:selector + dataset-scoped queries/history/batch
  346 +7. 调参脚本:`--dataset-id` 全链路打通
  347 +8. README / issue / 运维脚本更新
  348 +
  349 +这套设计的关键点
  350 +
  351 +- “评估集”是显式主键,不再靠文件路径暗示
  352 +- “标签缓存”继续按 `(tenant_id, query, spu_id)` 共享复用
  353 +- “历史报告”按 dataset 严格隔离并带 snapshot
  354 +- “UI 交互”始终围绕当前 dataset 上下文展开
  355 +- “调参结果”必须标记 dataset,防止不同集合上的指标被误比
  356 +
  357 +结论
  358 +
  359 +这件事的统一做法,不是给现有单评估集逻辑加几个 if/else,而是把 eval framework 从“单 query 文件模式”升级为“多 dataset registry 模式”。
  360 +
  361 +如果按这套方案落地,后面新增第三个评估集时,应该只需要:
  362 +
  363 +1. 在 `config.yaml` 注册一个新 dataset
  364 +2. 跑对应 build
  365 +3. 在 UI 中选择它做 batch / 看 history
  366 +4. 在调参脚本里指定 `--dataset-id`
  367 +
  368 +而不需要再次改数据模型和交互模型。
29 369  
30 370  
31 371  
... ... @@ -166,4 +506,3 @@ Please filter out the queries from the following list that do not belong to the
166 506 '0.021', 'knn_bias': '0.0019', 'knn_exponent': '11.8477', 'knn_text_bias': '2.3125', 'knn_text_exponent': '1.1547', 'knn_image_bias': '0.9641', 'knn_image_exponent': '5.8671'}
167 507  
168 508 这一次因为外部原因(磁盘满)终止了,以上是最好的一组参数。
169   -
... ...
indexer/mapping_generator.py
... ... @@ -8,6 +8,7 @@ from typing import Dict, Any
8 8 import json
9 9 import logging
10 10 from pathlib import Path
  11 +import os
11 12  
12 13 from config.loader import get_app_config
13 14  
... ... @@ -30,6 +31,21 @@ def get_tenant_index_name(tenant_id: str) -> str:
30 31 其中 ES_INDEX_NAMESPACE 由 config.env_config.ES_INDEX_NAMESPACE 控制,
31 32 用于区分 prod/uat/test 等不同运行环境。
32 33 """
  34 + # Temporary override hooks (non-official, for ops/debug):
  35 + # - ES_INDEX_OVERRIDE_TENANT_<tenant_id>: absolute index name (without namespace auto-prefix)
  36 + # - ES_INDEX_OVERRIDE: absolute index name OR format string supporting "{tenant_id}"
  37 + #
  38 + # Examples:
  39 + # export ES_INDEX_OVERRIDE_TENANT_163="search_products_tenant_163_backup_20260415_1438"
  40 + # export ES_INDEX_OVERRIDE="search_products_tenant_{tenant_id}_backup_20260415_1438"
  41 + per_tenant_key = f"ES_INDEX_OVERRIDE_TENANT_{tenant_id}"
  42 + if (v := os.environ.get(per_tenant_key)):
  43 + return str(v)
  44 + if (v := os.environ.get("ES_INDEX_OVERRIDE")):
  45 + try:
  46 + return str(v).format(tenant_id=tenant_id)
  47 + except Exception:
  48 + return str(v)
33 49 prefix = get_app_config().runtime.index_namespace or ""
34 50 return f"{prefix}search_products_tenant_{tenant_id}"
35 51  
... ...
scripts/evaluation/README.md
... ... @@ -2,11 +2,11 @@
2 2  
3 3 This directory holds the offline annotation builder, the evaluation web UI/API, audit tooling, and the fusion-tuning runner for retrieval quality.
4 4  
5   -**Design:** Build labels offline for a fixed query set (`queries/queries.txt`). Single-query and batch evaluation map recalled `spu_id` values to the SQLite cache. Items without cached labels are scored as `Irrelevant`, and the UI/API surfaces tips when judged coverage is incomplete. Evaluation now uses a graded four-tier relevance system with a multi-metric primary scorecard instead of a single headline metric.
  5 +**Design:** Build labels offline for one or more named evaluation datasets. Each dataset has a stable `dataset_id` backed by a query file registered in `config.yaml -> search_evaluation.datasets`. Single-query and batch evaluation map recalled `spu_id` values to the shared SQLite cache. Items without cached labels are scored as `Irrelevant`, and the UI/API surfaces tips when judged coverage is incomplete. Evaluation now uses a graded four-tier relevance system with a multi-metric primary scorecard instead of a single headline metric.
6 6  
7 7 ## What it does
8 8  
9   -1. Build an annotation set for a fixed query set.
  9 +1. Build an annotation set for a named evaluation dataset.
10 10 2. Evaluate live search results against cached labels.
11 11 3. Run batch evaluation and keep historical reports with config snapshots.
12 12 4. Tune fusion parameters in a reproducible loop.
... ... @@ -21,19 +21,23 @@ This directory holds the offline annotation builder, the evaluation web UI/API,
21 21 | `tune_fusion.py` | Applies config variants, restarts backend, runs batch eval, stores experiment reports |
22 22 | `fusion_experiments_shortlist.json` | Compact experiment set for tuning |
23 23 | `fusion_experiments_round1.json` | Broader first-round experiments |
24   -| `queries/queries.txt` | Canonical evaluation queries |
  24 +| `queries/queries.txt` | Legacy core query set (`dataset_id=core_queries`) |
  25 +| `queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered` | Expanded clothing dataset (`dataset_id=clothing_top771`) |
25 26 | `README_Requirement.md` | Product/requirements reference |
26 27 | `start_eval.sh` | Wrapper: `batch`, `batch-rebuild` (deep `build` + `--force-refresh-labels`), or `serve` |
27 28 | `../start_eval_web.sh` | Same as `serve` with `activate.sh`; use `./scripts/service_ctl.sh start eval-web` (default port **6010**, override with `EVAL_WEB_PORT`). `./run.sh all` includes eval-web. |
28 29  
29 30 ## Quick start (repo root)
30 31  
31   -Set tenant if needed (`export TENANT_ID=163`). You need a live search API, DashScope when new LLM labels are required, and a running backend.
  32 +Set tenant if needed (`export TENANT_ID=163`). To switch datasets, export `REPO_EVAL_DATASET_ID` or pass `--dataset-id`. You need a live search API, DashScope when new LLM labels are required, and a running backend.
32 33  
33 34 ```bash
34 35 # Batch: live search for every query; only uncached (query, spu_id) pairs hit the LLM
35 36 ./scripts/evaluation/start_eval.sh batch
36 37  
  38 +# switch to the 771-query clothing dataset
  39 +REPO_EVAL_DATASET_ID=clothing_top771 ./scripts/evaluation/start_eval.sh batch
  40 +
37 41 # Deep rebuild: per-query full corpus rerank (outside search recall pool) + LLM in batches along global sort order (early stop; expensive)
38 42 ./scripts/evaluation/start_eval.sh batch-rebuild
39 43  
... ... @@ -47,14 +51,14 @@ Explicit equivalents:
47 51 ```bash
48 52 ./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \
49 53 --tenant-id "${TENANT_ID:-163}" \
50   - --queries-file scripts/evaluation/queries/queries.txt \
  54 + --dataset-id core_queries \
51 55 --top-k 50 \
52 56 --language en \
53 57 --labeler-mode simple
54 58  
55 59 ./.venv/bin/python scripts/evaluation/build_annotation_set.py build \
56 60 --tenant-id "${TENANT_ID:-163}" \
57   - --queries-file scripts/evaluation/queries/queries.txt \
  61 + --dataset-id core_queries \
58 62 --search-depth 500 \
59 63 --rerank-depth 10000 \
60 64 --force-refresh-rerank \
... ... @@ -64,7 +68,7 @@ Explicit equivalents:
64 68  
65 69 ./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \
66 70 --tenant-id "${TENANT_ID:-163}" \
67   - --queries-file scripts/evaluation/queries/queries.txt \
  71 + --dataset-id core_queries \
68 72 --host 127.0.0.1 \
69 73 --port 6010
70 74 ```
... ... @@ -105,9 +109,9 @@ For **each** query in `queries.txt`, in order:
105 109 Default root: `artifacts/search_evaluation/`
106 110  
107 111 - `search_eval.sqlite3` — corpus cache, rerank scores, relevance labels, query profiles, build/batch run metadata
108   -- `query_builds/` — per-query pooled build outputs
109   -- `batch_reports/` — batch JSON, Markdown, config snapshots
110   -- `audits/` — label-quality audit summaries
  112 +- `datasets/<dataset_id>/query_builds/` — per-query pooled build outputs
  113 +- `datasets/<dataset_id>/batch_reports/<batch_id>/` — batch JSON, Markdown, config snapshot, dataset snapshot, query snapshot
  114 +- `datasets/<dataset_id>/audits/` — label-quality audit summaries
111 115 - `tuning_runs/` — fusion experiment outputs and config snapshots
112 116  
113 117 ## Labels
... ... @@ -168,7 +172,7 @@ The reported metrics are:
168 172  
169 173 ## Web UI
170 174  
171   -Features: query list from `queries.txt`, single-query and batch evaluation, batch report history, grouped graded-metric cards, top recalls, missed judged useful results, and coverage tips for unlabeled hits.
  175 +Features: dataset selector, dataset-scoped query list, single-query and batch evaluation, dataset-scoped batch report history, grouped graded-metric cards, top recalls, missed judged useful results, and coverage tips for unlabeled hits.
172 176  
173 177 ## Batch reports
174 178  
... ...
scripts/evaluation/eval_framework/__init__.py
... ... @@ -24,6 +24,7 @@ from .constants import ( # noqa: E402
24 24 from .framework import SearchEvaluationFramework # noqa: E402
25 25 from .store import EvalStore, QueryBuildResult # noqa: E402
26 26 from .cli import build_cli_parser, main # noqa: E402
  27 +from .datasets import EvalDatasetSnapshot, resolve_dataset # noqa: E402
27 28 from .web_app import create_web_app # noqa: E402
28 29 from .reports import render_batch_report_markdown # noqa: E402
29 30 from .utils import ( # noqa: E402
... ... @@ -36,6 +37,7 @@ from .utils import ( # noqa: E402
36 37 __all__ = [
37 38 "DEFAULT_ARTIFACT_ROOT",
38 39 "DEFAULT_QUERY_FILE",
  40 + "EvalDatasetSnapshot",
39 41 "EvalStore",
40 42 "PROJECT_ROOT",
41 43 "QueryBuildResult",
... ... @@ -51,6 +53,7 @@ __all__ = [
51 53 "ensure_dir",
52 54 "main",
53 55 "render_batch_report_markdown",
  56 + "resolve_dataset",
54 57 "sha1_text",
55 58 "utc_now_iso",
56 59 "utc_timestamp",
... ...
scripts/evaluation/eval_framework/api_models.py
... ... @@ -9,14 +9,16 @@ from pydantic import BaseModel, Field
9 9  
10 10 class SearchEvalRequest(BaseModel):
11 11 query: str
  12 + dataset_id: Optional[str] = None
12 13 top_k: int = Field(default=100, ge=1, le=500)
13 14 auto_annotate: bool = False
14   - language: str = "en"
  15 + language: Optional[str] = None
15 16  
16 17  
17 18 class BatchEvalRequest(BaseModel):
  19 + dataset_id: Optional[str] = None
18 20 queries: Optional[List[str]] = None
19 21 top_k: int = Field(default=100, ge=1, le=500)
20 22 auto_annotate: bool = False
21   - language: str = "en"
  23 + language: Optional[str] = None
22 24 force_refresh_labels: bool = False
... ...
scripts/evaluation/eval_framework/cli.py
... ... @@ -9,6 +9,9 @@ import shutil
9 9 from pathlib import Path
10 10 from typing import Any, Dict
11 11  
  12 +from config.loader import get_app_config
  13 +
  14 +from .datasets import audits_dir, query_builds_dir, resolve_dataset
12 15 from .framework import SearchEvaluationFramework
13 16 from .logging_setup import setup_eval_logging
14 17 from .utils import ensure_dir, utc_now_iso, utc_timestamp
... ... @@ -17,23 +20,21 @@ from .web_app import create_web_app
17 20 _cli_log = logging.getLogger("search_eval.cli")
18 21  
19 22  
20   -def _reset_build_artifacts() -> None:
21   - from config.loader import get_app_config
22   -
  23 +def _reset_build_artifacts(dataset_id: str) -> None:
23 24 artifact_root = get_app_config().search_evaluation.artifact_root
24 25 removed = []
25   - db_path = artifact_root / "search_eval.sqlite3"
26   - query_builds_dir = artifact_root / "query_builds"
27   - if db_path.exists():
28   - db_path.unlink()
29   - removed.append(str(db_path))
30   - if query_builds_dir.exists():
31   - shutil.rmtree(query_builds_dir)
32   - removed.append(str(query_builds_dir))
  26 + dataset_query_builds = query_builds_dir(artifact_root, dataset_id)
  27 + dataset_audits = audits_dir(artifact_root, dataset_id)
  28 + if dataset_query_builds.exists():
  29 + shutil.rmtree(dataset_query_builds)
  30 + removed.append(str(dataset_query_builds))
  31 + if dataset_audits.exists():
  32 + shutil.rmtree(dataset_audits)
  33 + removed.append(str(dataset_audits))
33 34 if removed:
34   - _cli_log.info("[build] reset previous rebuild artifacts: %s", ", ".join(removed))
  35 + _cli_log.info("[build] reset dataset artifacts for %s: %s", dataset_id, ", ".join(removed))
35 36 else:
36   - _cli_log.info("[build] no previous rebuild artifacts to reset under %s", artifact_root)
  37 + _cli_log.info("[build] no previous dataset artifacts to reset under %s for dataset=%s", artifact_root, dataset_id)
37 38  
38 39  
39 40 def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
... ... @@ -89,9 +90,9 @@ def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]:
89 90  
90 91 def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None:
91 92 """Fill None CLI defaults from ``config.yaml`` ``search_evaluation`` (via ``get_app_config()``)."""
92   - from config.loader import get_app_config
93   -
94 93 se = get_app_config().search_evaluation
  94 + if getattr(args, "dataset_id", None) in (None, "") and getattr(args, "queries_file", None) in (None, ""):
  95 + args.dataset_id = se.default_dataset_id
95 96 if getattr(args, "tenant_id", None) in (None, ""):
96 97 args.tenant_id = se.default_tenant_id
97 98 if getattr(args, "queries_file", None) in (None, ""):
... ... @@ -144,6 +145,23 @@ def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None:
144 145 args.rebuild_irrelevant_stop_streak = se.rebuild_irrelevant_stop_streak
145 146  
146 147  
  148 +def _resolve_dataset_from_args(args: argparse.Namespace, *, require_enabled: bool = False):
  149 + queries_file = getattr(args, "queries_file", None)
  150 + query_path = Path(str(queries_file)).resolve() if queries_file not in (None, "") else None
  151 + dataset = resolve_dataset(
  152 + dataset_id=getattr(args, "dataset_id", None),
  153 + query_file=query_path,
  154 + tenant_id=getattr(args, "tenant_id", None),
  155 + language=getattr(args, "language", None),
  156 + require_enabled=require_enabled,
  157 + )
  158 + args.dataset_id = dataset.dataset_id
  159 + args.queries_file = str(dataset.query_file)
  160 + args.tenant_id = dataset.tenant_id
  161 + args.language = dataset.language
  162 + return dataset
  163 +
  164 +
147 165 def build_cli_parser() -> argparse.ArgumentParser:
148 166 parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
149 167 sub = parser.add_subparsers(dest="command", required=True)
... ... @@ -154,10 +172,11 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
154 172 default=None,
155 173 help="Tenant id (default: search_evaluation.default_tenant_id in config.yaml).",
156 174 )
  175 + build.add_argument("--dataset-id", default=None, help="Named evaluation dataset id from config.yaml.")
157 176 build.add_argument(
158 177 "--queries-file",
159 178 default=None,
160   - help="Query list file (default: search_evaluation.queries_file).",
  179 + help="Legacy override for query list file. Prefer --dataset-id.",
161 180 )
162 181 build.add_argument(
163 182 "--search-depth",
... ... @@ -230,7 +249,7 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
230 249 build.add_argument(
231 250 "--reset-artifacts",
232 251 action="store_true",
233   - help="Delete rebuild cache/artifacts (SQLite + query_builds) before starting.",
  252 + help="Delete dataset-specific query_builds/audits before starting. Shared SQLite cache is preserved.",
234 253 )
235 254 build.add_argument("--force-refresh-rerank", action="store_true")
236 255 build.add_argument("--force-refresh-labels", action="store_true")
... ... @@ -239,7 +258,8 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
239 258  
240 259 batch = sub.add_parser("batch", help="Run batch evaluation against live search")
241 260 batch.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.")
242   - batch.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
  261 + batch.add_argument("--dataset-id", default=None, help="Named evaluation dataset id from config.yaml.")
  262 + batch.add_argument("--queries-file", default=None, help="Legacy override for query list file. Prefer --dataset-id.")
243 263 batch.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.batch_top_k.")
244 264 batch.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
245 265 batch.add_argument("--force-refresh-labels", action="store_true")
... ... @@ -248,7 +268,8 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
248 268  
249 269 audit = sub.add_parser("audit", help="Audit annotation quality for queries")
250 270 audit.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.")
251   - audit.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
  271 + audit.add_argument("--dataset-id", default=None, help="Named evaluation dataset id from config.yaml.")
  272 + audit.add_argument("--queries-file", default=None, help="Legacy override for query list file. Prefer --dataset-id.")
252 273 audit.add_argument("--top-k", type=int, default=None, help="Default: search_evaluation.audit_top_k.")
253 274 audit.add_argument("--language", default=None, help="Default: search_evaluation.default_language.")
254 275 audit.add_argument(
... ... @@ -263,7 +284,8 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
263 284  
264 285 serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
265 286 serve.add_argument("--tenant-id", default=None, help="Default: search_evaluation.default_tenant_id.")
266   - serve.add_argument("--queries-file", default=None, help="Default: search_evaluation.queries_file.")
  287 + serve.add_argument("--dataset-id", default=None, help="Initial evaluation dataset id from config.yaml.")
  288 + serve.add_argument("--queries-file", default=None, help="Legacy initial query file override. Prefer --dataset-id.")
267 289 serve.add_argument("--host", default=None, help="Default: search_evaluation.web_host.")
268 290 serve.add_argument("--port", type=int, default=None, help="Default: search_evaluation.web_port.")
269 291 add_judge_llm_args(serve)
... ... @@ -273,10 +295,11 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
273 295  
274 296  
275 297 def run_build(args: argparse.Namespace) -> None:
  298 + dataset = _resolve_dataset_from_args(args)
276 299 if args.reset_artifacts:
277   - _reset_build_artifacts()
  300 + _reset_build_artifacts(dataset.dataset_id)
278 301 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
279   - queries = framework.queries_from_file(Path(args.queries_file))
  302 + queries = list(dataset.queries)
280 303 summary = []
281 304 rebuild_kwargs = {}
282 305 if args.force_refresh_labels:
... ... @@ -297,6 +320,7 @@ def run_build(args: argparse.Namespace) -&gt; None:
297 320 try:
298 321 result = framework.build_query_annotation_set(
299 322 query=query,
  323 + dataset=dataset,
300 324 search_depth=args.search_depth,
301 325 rerank_depth=args.rerank_depth,
302 326 annotate_search_top_k=args.annotate_search_top_k,
... ... @@ -329,17 +353,20 @@ def run_build(args: argparse.Namespace) -&gt; None:
329 353 result.output_json_path,
330 354 )
331 355 out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
  356 + out_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_summary_{utc_timestamp()}.json"
332 357 out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
333 358 _cli_log.info("[done] summary=%s", out_path)
334 359  
335 360  
336 361 def run_batch(args: argparse.Namespace) -> None:
  362 + dataset = _resolve_dataset_from_args(args, require_enabled=True)
337 363 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
338   - queries = framework.queries_from_file(Path(args.queries_file))
339   - _cli_log.info("[batch] queries_file=%s count=%s", args.queries_file, len(queries))
  364 + queries = list(dataset.queries)
  365 + _cli_log.info("[batch] dataset_id=%s queries_file=%s count=%s", dataset.dataset_id, args.queries_file, len(queries))
340 366 try:
341 367 payload = framework.batch_evaluate(
342 368 queries=queries,
  369 + dataset=dataset,
343 370 top_k=args.top_k,
344 371 auto_annotate=True,
345 372 language=args.language,
... ... @@ -352,8 +379,9 @@ def run_batch(args: argparse.Namespace) -&gt; None:
352 379  
353 380  
354 381 def run_audit(args: argparse.Namespace) -> None:
  382 + dataset = _resolve_dataset_from_args(args, require_enabled=True)
355 383 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
356   - queries = framework.queries_from_file(Path(args.queries_file))
  384 + queries = list(dataset.queries)
357 385 audit_items = []
358 386 for query in queries:
359 387 item = framework.audit_live_query(
... ... @@ -394,27 +422,27 @@ def run_audit(args: argparse.Namespace) -&gt; None:
394 422 summary = {
395 423 "created_at": utc_now_iso(),
396 424 "tenant_id": args.tenant_id,
  425 + "dataset": dataset.summary(),
397 426 "top_k": args.top_k,
398 427 "query_count": len(queries),
399 428 "total_suspicious": sum(item["suspicious_count"] for item in audit_items),
400 429 "queries": audit_items,
401 430 }
402   - out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"
  431 + out_path = audits_dir(framework.artifact_root, dataset.dataset_id) / f"audit_{utc_timestamp()}.json"
403 432 out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
404 433 _cli_log.info("[done] audit=%s", out_path)
405 434  
406 435  
407 436 def run_serve(args: argparse.Namespace) -> None:
  437 + dataset = _resolve_dataset_from_args(args, require_enabled=True)
408 438 framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
409   - app = create_web_app(framework, Path(args.queries_file))
  439 + app = create_web_app(framework, initial_dataset_id=dataset.dataset_id)
410 440 import uvicorn
411 441  
412 442 uvicorn.run(app, host=args.host, port=args.port, log_level="info")
413 443  
414 444  
415 445 def main() -> None:
416   - from config.loader import get_app_config
417   -
418 446 se = get_app_config().search_evaluation
419 447 log_file = setup_eval_logging(se.eval_log_dir)
420 448 parser = build_cli_parser()
... ...
scripts/evaluation/eval_framework/datasets.py 0 → 100644
... ... @@ -0,0 +1,165 @@
  1 +"""Evaluation dataset registry helpers and artifact path conventions."""
  2 +
  3 +from __future__ import annotations
  4 +
  5 +from dataclasses import dataclass
  6 +from pathlib import Path
  7 +from typing import Any, Dict, Iterable, List, Optional, Sequence
  8 +
  9 +from config.loader import get_app_config
  10 +from config.schema import SearchEvaluationDatasetConfig
  11 +
  12 +from .utils import ensure_dir, sha1_text
  13 +
  14 +
@dataclass(frozen=True)
class EvalDatasetSnapshot:
    """Immutable, resolved dataset metadata for one evaluation run.

    Captures the dataset identity, the resolved query file, effective
    tenant/language, and the query list with its count and sha1 fingerprint.
    ``source`` records provenance: ``"registry"`` (from config.yaml) or
    ``"adhoc"`` (explicit query file not in the registry).
    """

    dataset_id: str
    display_name: str
    description: str
    query_file: Path
    tenant_id: str
    language: str
    enabled: bool
    queries: tuple[str, ...]
    query_count: int
    query_sha1: str
    source: str = "registry"

    def summary(self) -> Dict[str, Any]:
        """Return a JSON-serializable description of this snapshot.

        The query list itself is omitted (it can be large); ``query_file`` is
        stringified because ``Path`` is not JSON-serializable. Key insertion
        order is fixed so persisted artifacts stay byte-stable.
        """
        payload: Dict[str, Any] = {}
        for attr in ("dataset_id", "display_name", "description"):
            payload[attr] = getattr(self, attr)
        payload["query_file"] = str(self.query_file)
        for attr in ("tenant_id", "language", "enabled", "query_count", "query_sha1", "source"):
            payload[attr] = getattr(self, attr)
        return payload
  44 +
  45 +
def read_queries_file(path: Path) -> List[str]:
    """Load a query list from *path*, one query per line.

    Blank lines and ``#``-prefixed comment lines are skipped; surrounding
    whitespace is stripped from each query.
    """
    queries: List[str] = []
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        text = raw_line.strip()
        if text and not text.startswith("#"):
            queries.append(text)
    return queries
  52 +
  53 +
def query_sha1(queries: Sequence[str]) -> str:
    """Fingerprint a query list: sha1 over stripped, non-empty queries joined by newlines."""
    stripped = [str(entry).strip() for entry in queries]
    return sha1_text("\n".join(text for text in stripped if text))
  56 +
  57 +
  58 +def _enabled_datasets(datasets: Iterable[SearchEvaluationDatasetConfig]) -> List[SearchEvaluationDatasetConfig]:
  59 + return [item for item in datasets if item.enabled]
  60 +
  61 +
def list_registered_datasets(enabled_only: bool = False) -> List[SearchEvaluationDatasetConfig]:
    """List the datasets registered under ``search_evaluation.datasets``.

    When *enabled_only* is true, disabled datasets are filtered out.
    """
    registered = list(get_app_config().search_evaluation.datasets)
    if enabled_only:
        return _enabled_datasets(registered)
    return registered
  66 +
  67 +
def resolve_registered_dataset(dataset_id: str) -> SearchEvaluationDatasetConfig:
    """Look up one registered dataset by its id.

    Raises:
        KeyError: if *dataset_id* is not registered. The message lists the
            known ids so a CLI ``--dataset-id`` typo is easy to diagnose.
    """
    registered = list_registered_datasets(enabled_only=False)
    for item in registered:
        if item.dataset_id == dataset_id:
            return item
    known = ", ".join(sorted(item.dataset_id for item in registered)) or "<none>"
    raise KeyError(f"unknown evaluation dataset: {dataset_id} (known: {known})")
  73 +
  74 +
def resolve_dataset(
    *,
    dataset_id: Optional[str] = None,
    query_file: Optional[Path] = None,
    tenant_id: Optional[str] = None,
    language: Optional[str] = None,
    require_enabled: bool = False,
) -> EvalDatasetSnapshot:
    """Resolve the dataset selection for one evaluation run.

    Precedence: an explicit ``dataset_id`` wins (unknown ids raise
    ``KeyError``); otherwise an explicit ``query_file`` is matched against the
    registry by resolved path, falling back to an ad-hoc snapshot when no
    registry entry has that file; otherwise the configured
    ``default_dataset_id`` is used. ``tenant_id``/``language`` arguments
    override the dataset's own values, which override the global defaults.

    Raises:
        KeyError: unknown ``dataset_id`` (or unknown default id).
        ValueError: ``require_enabled`` is set and the selected dataset is
            disabled (ad-hoc snapshots are always enabled).
    """
    se = get_app_config().search_evaluation
    registered = list_registered_datasets(enabled_only=False)

    selected: Optional[SearchEvaluationDatasetConfig] = None
    if dataset_id:
        selected = resolve_registered_dataset(dataset_id)
    elif query_file is not None:
        wanted = query_file.resolve()
        selected = next(
            (entry for entry in registered if entry.query_file.resolve() == wanted),
            None,
        )
    else:
        selected = resolve_registered_dataset(se.default_dataset_id)

    if selected is None:
        # Explicit query file not present in the registry: build an ad-hoc
        # snapshot with a path-derived id.
        # NOTE(review): assumes se.queries_file is a pathlib.Path (it is
        # .resolve()d here) — confirm against config/schema.py.
        path = (query_file or se.queries_file).resolve()
        adhoc_queries = tuple(read_queries_file(path))
        return EvalDatasetSnapshot(
            dataset_id=dataset_id or f"adhoc_{sha1_text(str(path))[:12]}",
            display_name=path.name,
            description="Ad-hoc evaluation dataset from explicit query file",
            query_file=path,
            tenant_id=str(tenant_id or se.default_tenant_id),
            language=str(language or se.default_language),
            enabled=True,
            queries=adhoc_queries,
            query_count=len(adhoc_queries),
            query_sha1=query_sha1(adhoc_queries),
            source="adhoc",
        )

    if require_enabled and not selected.enabled:
        raise ValueError(f"evaluation dataset is disabled: {selected.dataset_id}")

    registry_queries = tuple(read_queries_file(selected.query_file))
    return EvalDatasetSnapshot(
        dataset_id=selected.dataset_id,
        display_name=selected.display_name,
        description=selected.description,
        query_file=selected.query_file.resolve(),
        tenant_id=str(tenant_id or selected.tenant_id or se.default_tenant_id),
        language=str(language or selected.language or se.default_language),
        enabled=selected.enabled,
        queries=registry_queries,
        query_count=len(registry_queries),
        query_sha1=query_sha1(registry_queries),
        source="registry",
    )
  138 +
def infer_dataset_id_from_queries(queries: Sequence[str]) -> Optional[str]:
    """Map a raw query list back to a registered dataset id by content hash.

    Returns the id of the first registered dataset whose query-file sha1
    matches *queries*, or ``None`` when nothing matches. A registry entry
    whose query file cannot be read (e.g. deleted from disk) is skipped
    instead of aborting inference for every other dataset.
    """
    target_sha = query_sha1(queries)
    for item in list_registered_datasets(enabled_only=False):
        try:
            snapshot = resolve_dataset(dataset_id=item.dataset_id)
        except OSError:
            # Stale registry entry (missing/unreadable query file): keep
            # checking the remaining datasets rather than failing outright.
            continue
        if snapshot.query_sha1 == target_sha:
            return snapshot.dataset_id
    return None
  146 +
  147 +
def artifact_dataset_root(artifact_root: Path, dataset_id: str) -> Path:
    """Root directory holding all artifacts for one dataset (created on demand)."""
    return ensure_dir(artifact_root / "datasets" / dataset_id)


def query_builds_dir(artifact_root: Path, dataset_id: str) -> Path:
    """Per-dataset directory for single-query build artifacts."""
    base = artifact_dataset_root(artifact_root, dataset_id)
    return ensure_dir(base / "query_builds")


def batch_reports_root(artifact_root: Path, dataset_id: str) -> Path:
    """Per-dataset directory that groups all batch evaluation reports."""
    base = artifact_dataset_root(artifact_root, dataset_id)
    return ensure_dir(base / "batch_reports")


def batch_report_run_dir(artifact_root: Path, dataset_id: str, batch_id: str) -> Path:
    """Directory for one batch run's report bundle (report, snapshots, queries)."""
    return ensure_dir(batch_reports_root(artifact_root, dataset_id) / batch_id)


def audits_dir(artifact_root: Path, dataset_id: str) -> Path:
    """Per-dataset directory for annotation audit reports."""
    base = artifact_dataset_root(artifact_root, dataset_id)
    return ensure_dir(base / "audits")
... ...
scripts/evaluation/eval_framework/framework.py
... ... @@ -34,6 +34,7 @@ from .constants import (
34 34 VALID_LABELS,
35 35 STOP_PROB_MAP,
36 36 )
  37 +from .datasets import EvalDatasetSnapshot, batch_report_run_dir, query_builds_dir
37 38 from .metrics import (
38 39 PRIMARY_METRIC_GRADE_NORMALIZER,
39 40 PRIMARY_METRIC_KEYS,
... ... @@ -541,6 +542,7 @@ class SearchEvaluationFramework:
541 542 self,
542 543 query: str,
543 544 *,
  545 + dataset: EvalDatasetSnapshot | None = None,
544 546 search_depth: int = 1000,
545 547 rerank_depth: int = 10000,
546 548 annotate_search_top_k: int = 120,
... ... @@ -571,6 +573,7 @@ class SearchEvaluationFramework:
571 573 if force_refresh_labels:
572 574 return self._build_query_annotation_set_rebuild(
573 575 query=query,
  576 + dataset=dataset,
574 577 search_depth=search_depth,
575 578 rerank_depth=rerank_depth,
576 579 language=language,
... ... @@ -647,13 +650,16 @@ class SearchEvaluationFramework:
647 650 for item in search_labeled_results[:100]
648 651 ]
649 652 metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values()))
650   - output_dir = ensure_dir(self.artifact_root / "query_builds")
  653 + output_dir = query_builds_dir(self.artifact_root, dataset.dataset_id) if dataset else ensure_dir(
  654 + self.artifact_root / "query_builds"
  655 + )
651 656 run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
652 657 output_json_path = output_dir / f"{run_id}.json"
653 658 payload = {
654 659 "run_id": run_id,
655 660 "created_at": utc_now_iso(),
656 661 "tenant_id": self.tenant_id,
  662 + "dataset": dataset.summary() if dataset else None,
657 663 "query": query,
658 664 "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20),
659 665 "search_total": int(search_payload.get("total") or 0),
... ... @@ -673,7 +679,14 @@ class SearchEvaluationFramework:
673 679 "full_rerank_top": rerank_top_results,
674 680 }
675 681 output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
676   - self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
  682 + self.store.insert_build_run(
  683 + run_id,
  684 + self.tenant_id,
  685 + query,
  686 + output_json_path,
  687 + payload,
  688 + dataset=dataset,
  689 + )
677 690 return QueryBuildResult(
678 691 query=query,
679 692 tenant_id=self.tenant_id,
... ... @@ -688,6 +701,7 @@ class SearchEvaluationFramework:
688 701 self,
689 702 query: str,
690 703 *,
  704 + dataset: EvalDatasetSnapshot | None,
691 705 search_depth: int,
692 706 rerank_depth: int,
693 707 language: str,
... ... @@ -857,7 +871,9 @@ class SearchEvaluationFramework:
857 871 for item in search_labeled_results[:100]
858 872 ]
859 873 metrics = compute_query_metrics(top100_labels, ideal_labels=list(labels.values()))
860   - output_dir = ensure_dir(self.artifact_root / "query_builds")
  874 + output_dir = query_builds_dir(self.artifact_root, dataset.dataset_id) if dataset else ensure_dir(
  875 + self.artifact_root / "query_builds"
  876 + )
861 877 run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
862 878 output_json_path = output_dir / f"{run_id}.json"
863 879 pool_docs_count = len(pool_spu_ids) + len(ranked_outside)
... ... @@ -865,6 +881,7 @@ class SearchEvaluationFramework:
865 881 "run_id": run_id,
866 882 "created_at": utc_now_iso(),
867 883 "tenant_id": self.tenant_id,
  884 + "dataset": dataset.summary() if dataset else None,
868 885 "query": query,
869 886 "config_meta": self.search_client.get_json("/admin/config/meta", timeout=20),
870 887 "search_total": int(search_payload.get("total") or 0),
... ... @@ -883,7 +900,14 @@ class SearchEvaluationFramework:
883 900 "full_rerank_top": rerank_top_results,
884 901 }
885 902 output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
886   - self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
  903 + self.store.insert_build_run(
  904 + run_id,
  905 + self.tenant_id,
  906 + query,
  907 + output_json_path,
  908 + payload,
  909 + dataset=dataset,
  910 + )
887 911 return QueryBuildResult(
888 912 query=query,
889 913 tenant_id=self.tenant_id,
... ... @@ -901,6 +925,7 @@ class SearchEvaluationFramework:
901 925 auto_annotate: bool = False,
902 926 language: str = "en",
903 927 force_refresh_labels: bool = False,
  928 + dataset: EvalDatasetSnapshot | None = None,
904 929 ) -> Dict[str, Any]:
905 930 search_payload = self.search_client.search(
906 931 query=query, size=max(top_k, 100), from_=0, language=language, debug=True
... ... @@ -997,6 +1022,7 @@ class SearchEvaluationFramework:
997 1022 return {
998 1023 "query": query,
999 1024 "tenant_id": self.tenant_id,
  1025 + "dataset": dataset.summary() if dataset else None,
1000 1026 "top_k": top_k,
1001 1027 "metrics": compute_query_metrics(metric_labels, ideal_labels=ideal_labels),
1002 1028 "metric_context": _metric_context_payload(),
... ... @@ -1020,6 +1046,7 @@ class SearchEvaluationFramework:
1020 1046 self,
1021 1047 queries: Sequence[str],
1022 1048 *,
  1049 + dataset: EvalDatasetSnapshot | None = None,
1023 1050 top_k: int = 100,
1024 1051 auto_annotate: bool = True,
1025 1052 language: str = "en",
... ... @@ -1036,6 +1063,7 @@ class SearchEvaluationFramework:
1036 1063 auto_annotate=auto_annotate,
1037 1064 language=language,
1038 1065 force_refresh_labels=force_refresh_labels,
  1066 + dataset=dataset,
1039 1067 )
1040 1068 labels = [
1041 1069 item["label"] if item["label"] in VALID_LABELS else RELEVANCE_LV0
... ... @@ -1088,17 +1116,31 @@ class SearchEvaluationFramework:
1088 1116 RELEVANCE_LV1: sum(item["distribution"][RELEVANCE_LV1] for item in per_query),
1089 1117 RELEVANCE_LV0: sum(item["distribution"][RELEVANCE_LV0] for item in per_query),
1090 1118 }
1091   - batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
1092   - report_dir = ensure_dir(self.artifact_root / "batch_reports")
1093   - config_snapshot_path = report_dir / f"{batch_id}_config.json"
  1119 + dataset_id = dataset.dataset_id if dataset else "legacy_default"
  1120 + dataset_hash = dataset.query_sha1 if dataset else sha1_text("|".join(queries))
  1121 + batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + dataset_id + '|' + dataset_hash)[:10]}"
  1122 + report_dir = batch_report_run_dir(self.artifact_root, dataset_id, batch_id) if dataset else ensure_dir(
  1123 + self.artifact_root / "batch_reports"
  1124 + )
  1125 + config_snapshot_path = report_dir / "config_snapshot.json" if dataset else report_dir / f"{batch_id}_config.json"
1094 1126 config_snapshot = self.search_client.get_json("/admin/config", timeout=20)
1095 1127 config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
1096   - output_json_path = report_dir / f"{batch_id}.json"
1097   - report_md_path = report_dir / f"{batch_id}.md"
  1128 + dataset_snapshot_path = report_dir / "dataset_snapshot.json" if dataset else None
  1129 + queries_snapshot_path = report_dir / "queries.txt" if dataset else None
  1130 + if dataset_snapshot_path is not None:
  1131 + dataset_snapshot_path.write_text(
  1132 + json.dumps(dataset.summary(), ensure_ascii=False, indent=2),
  1133 + encoding="utf-8",
  1134 + )
  1135 + if queries_snapshot_path is not None:
  1136 + queries_snapshot_path.write_text("\n".join(queries) + "\n", encoding="utf-8")
  1137 + output_json_path = report_dir / "report.json" if dataset else report_dir / f"{batch_id}.json"
  1138 + report_md_path = report_dir / "report.md" if dataset else report_dir / f"{batch_id}.md"
1098 1139 payload = {
1099 1140 "batch_id": batch_id,
1100 1141 "created_at": utc_now_iso(),
1101 1142 "tenant_id": self.tenant_id,
  1143 + "dataset": dataset.summary() if dataset else None,
1102 1144 "queries": list(queries),
1103 1145 "top_k": top_k,
1104 1146 "aggregate_metrics": aggregate,
... ... @@ -1106,10 +1148,20 @@ class SearchEvaluationFramework:
1106 1148 "aggregate_distribution": aggregate_distribution,
1107 1149 "per_query": per_query,
1108 1150 "config_snapshot_path": str(config_snapshot_path),
  1151 + "dataset_snapshot_path": str(dataset_snapshot_path) if dataset_snapshot_path is not None else "",
  1152 + "queries_snapshot_path": str(queries_snapshot_path) if queries_snapshot_path is not None else "",
1109 1153 }
1110 1154 output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
1111 1155 report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
1112   - self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
  1156 + self.store.insert_batch_run(
  1157 + batch_id,
  1158 + self.tenant_id,
  1159 + output_json_path,
  1160 + report_md_path,
  1161 + config_snapshot_path,
  1162 + payload,
  1163 + dataset=dataset,
  1164 + )
1113 1165 _log.info(
1114 1166 "[batch-eval] finished batch_id=%s per_query=%s json=%s",
1115 1167 batch_id,
... ...
scripts/evaluation/eval_framework/reports.py
... ... @@ -67,9 +67,22 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -&gt; str:
67 67 f"- Query count: {len(payload['queries'])}",
68 68 f"- Top K: {payload['top_k']}",
69 69 "",
70   - "## Aggregate Metrics",
71   - "",
72 70 ]
  71 + dataset = payload.get("dataset") or {}
  72 + if dataset:
  73 + lines.extend(
  74 + [
  75 + "## Dataset",
  76 + "",
  77 + f"- Dataset ID: {dataset.get('dataset_id', '')}",
  78 + f"- Display Name: {dataset.get('display_name', '')}",
  79 + f"- Query File: {dataset.get('query_file', '')}",
  80 + f"- Query Count: {dataset.get('query_count', '')}",
  81 + f"- Query SHA1: {dataset.get('query_sha1', '')}",
  82 + "",
  83 + ]
  84 + )
  85 + lines.extend(["## Aggregate Metrics", ""])
73 86 metric_context = payload.get("metric_context") or {}
74 87 if metric_context:
75 88 lines.extend(
... ...
scripts/evaluation/eval_framework/static/eval_web.js
... ... @@ -4,6 +4,9 @@ async function fetchJSON(url, options) {
4 4 return await res.json();
5 5 }
6 6  
  7 +let _datasets = [];
  8 +let _currentDatasetId = "";
  9 +
7 10 function fmtNumber(value, digits = 3) {
8 11 if (value == null || Number.isNaN(Number(value))) return "-";
9 12 return Number(value).toFixed(digits);
... ... @@ -173,9 +176,18 @@ function renderTips(data) {
173 176 }
174 177  
175 178 async function loadQueries() {
176   - const data = await fetchJSON("/api/queries");
  179 + if (!_currentDatasetId) return;
  180 + const data = await fetchJSON("/api/datasets/" + encodeURIComponent(_currentDatasetId) + "/queries");
177 181 const root = document.getElementById("queryList");
178 182 root.innerHTML = "";
  183 + const dataset = data.dataset || {};
  184 + document.getElementById("queriesMeta").innerHTML = `Loaded from <code>${dataset.query_file || ""}</code>`;
  185 + document.getElementById("datasetMeta").textContent =
  186 + `${dataset.display_name || dataset.dataset_id || ""} · ${dataset.query_count || 0} queries`;
  187 + document.getElementById("pageSubtitle").textContent =
  188 + `Current dataset: ${dataset.display_name || dataset.dataset_id || ""}. Single-query evaluation and batch evaluation share the same service on port 6010.`;
  189 + document.getElementById("batchButton").textContent =
  190 + `Batch Evaluation: ${dataset.display_name || dataset.dataset_id || ""}`;
179 191 data.queries.forEach((query) => {
180 192 const btn = document.createElement("button");
181 193 btn.className = "query-item";
... ... @@ -188,6 +200,26 @@ async function loadQueries() {
188 200 });
189 201 }
190 202  
// Populate the dataset <select> from /api/datasets and wire dataset switching.
// Keeps module-level _datasets/_currentDatasetId in sync and reloads the query
// list plus history whenever the user picks a different dataset.
async function loadDatasets() {
  const payload = await fetchJSON("/api/datasets");
  _datasets = payload.datasets || [];
  if (!_currentDatasetId) {
    const firstId = (_datasets[0] && _datasets[0].dataset_id) || "";
    _currentDatasetId = payload.current_dataset_id || firstId;
  }
  const select = document.getElementById("datasetSelect");
  select.innerHTML = "";
  for (const entry of _datasets) {
    const option = document.createElement("option");
    option.value = entry.dataset_id;
    option.textContent = `${entry.display_name || entry.dataset_id} (${entry.query_count || 0})`;
    if (entry.dataset_id === _currentDatasetId) option.selected = true;
    select.appendChild(option);
  }
  select.onchange = async (event) => {
    _currentDatasetId = event.target.value;
    await loadQueries();
    await loadHistory();
  };
}
  222 +
191 223 function historySummaryHtml(meta) {
192 224 const m = meta && meta.aggregate_metrics;
193 225 const nq = (meta && meta.query_count) || (meta && meta.queries && meta.queries.length) || (meta && meta.per_query && meta.per_query.length) || null;
... ... @@ -203,7 +235,8 @@ function historySummaryHtml(meta) {
203 235 }
204 236  
205 237 async function loadHistory() {
206   - const data = await fetchJSON("/api/history");
  238 + if (!_currentDatasetId) return;
  239 + const data = await fetchJSON("/api/history?dataset_id=" + encodeURIComponent(_currentDatasetId));
207 240 const root = document.getElementById("history");
208 241 root.classList.remove("muted");
209 242 const items = data.history || [];
... ... @@ -219,8 +252,10 @@ async function loadHistory() {
219 252 btn.className = "history-item";
220 253 btn.setAttribute("aria-label", `Open report ${item.batch_id}`);
221 254 const sum = historySummaryHtml(item.metadata);
  255 + const dataset = (item.metadata && item.metadata.dataset) || {};
  256 + const datasetName = dataset.display_name || dataset.dataset_id || item.dataset_id || "";
222 257 btn.innerHTML = `<div class="hid">${item.batch_id}</div>
223   - <div class="hmeta">${item.created_at} · tenant ${item.tenant_id}</div>${sum}`;
  258 + <div class="hmeta">${item.created_at} · tenant ${item.tenant_id}${datasetName ? ` · ${datasetName}` : ""}</div>${sum}`;
224 259 btn.onclick = () => openBatchReport(item.batch_id);
225 260 list.appendChild(btn);
226 261 });
... ... @@ -250,7 +285,10 @@ async function openBatchReport(batchId) {
250 285 try {
251 286 const rep = await fetchJSON("/api/history/" + encodeURIComponent(batchId) + "/report");
252 287 _lastReportPath = rep.report_markdown_path || "";
253   - metaEl.textContent = rep.report_markdown_path || "";
  288 + const dataset = rep.dataset || {};
  289 + metaEl.textContent = [dataset.display_name || dataset.dataset_id || "", rep.report_markdown_path || ""]
  290 + .filter(Boolean)
  291 + .join(" · ");
254 292 const raw = marked.parse(rep.markdown || "", { gfm: true });
255 293 const safe = DOMPurify.sanitize(raw, { USE_PROFILES: { html: true } });
256 294 body.className = "report-modal-body batch-report-md";
... ... @@ -279,11 +317,11 @@ document.getElementById(&quot;reportCopyPath&quot;).addEventListener(&quot;click&quot;, async () =&gt;
279 317 async function runSingle() {
280 318 const query = document.getElementById("queryInput").value.trim();
281 319 if (!query) return;
282   - document.getElementById("status").textContent = `Evaluating "${query}"...`;
  320 + document.getElementById("status").textContent = `Evaluating "${query}" on ${_currentDatasetId}...`;
283 321 const data = await fetchJSON("/api/search-eval", {
284 322 method: "POST",
285 323 headers: { "Content-Type": "application/json" },
286   - body: JSON.stringify({ query, top_k: 100, auto_annotate: false }),
  324 + body: JSON.stringify({ query, dataset_id: _currentDatasetId, top_k: 100, auto_annotate: false }),
287 325 });
288 326 document.getElementById("status").textContent = `Done. total=${data.total}`;
289 327 renderMetrics(data.metrics, data.metric_context);
... ... @@ -294,19 +332,19 @@ async function runSingle() {
294 332 }
295 333  
// Kick off a batch evaluation for the currently selected dataset, then
// refresh the metric panes and the history list once the server responds.
async function runBatch() {
  const statusEl = document.getElementById("status");
  statusEl.textContent = `Running batch evaluation for ${_currentDatasetId}...`;
  const payload = { dataset_id: _currentDatasetId, top_k: 100, auto_annotate: false };
  const data = await fetchJSON("/api/batch-eval", {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
  statusEl.textContent = `Batch done. report=${data.batch_id}`;
  renderMetrics(data.aggregate_metrics, data.metric_context);
  // Batch mode has no per-query result lists; clear both panes.
  renderResults([], "results", true);
  renderResults([], "missingRelevant", false);
  document.getElementById("tips").innerHTML =
    '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>';
  loadHistory();
}
310 349  
311   -loadQueries();
312   -loadHistory();
  350 +loadDatasets().then(() => loadQueries()).then(() => loadHistory());
... ...
scripts/evaluation/eval_framework/static/index.html
... ... @@ -10,8 +10,13 @@
10 10 <body>
11 11 <div class="app">
12 12 <aside class="sidebar">
  13 + <h2>Datasets</h2>
  14 + <div class="section" style="padding-top:0">
  15 + <select id="datasetSelect" style="width:100%"></select>
  16 + <p id="datasetMeta" class="muted" style="font-size:12px;margin:8px 0 0"></p>
  17 + </div>
13 18 <h2>Queries</h2>
14   - <p class="muted">Loaded from <code>scripts/evaluation/queries/queries.txt</code></p>
  19 + <p id="queriesMeta" class="muted">Loading dataset queries...</p>
15 20 <div id="queryList" class="query-list"></div>
16 21 <div class="section">
17 22 <h2>History</h2>
... ... @@ -21,11 +26,11 @@
21 26 </aside>
22 27 <main class="main">
23 28 <h1>Search Evaluation</h1>
24   - <p class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>
  29 + <p id="pageSubtitle" class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>
25 30 <div class="toolbar">
26 31 <input id="queryInput" type="text" placeholder="Search query" />
27 32 <button onclick="runSingle()">Evaluate Query</button>
28   - <button class="secondary" onclick="runBatch()">Batch Evaluation</button>
  33 + <button id="batchButton" class="secondary" onclick="runBatch()">Batch Evaluation</button>
29 34 </div>
30 35 <div id="status" class="muted section"></div>
31 36 <section class="section">
... ...
scripts/evaluation/eval_framework/store.py
... ... @@ -9,6 +9,7 @@ from pathlib import Path
9 9 from typing import Any, Dict, List, Optional, Sequence
10 10  
11 11 from .constants import VALID_LABELS
  12 +from .datasets import EvalDatasetSnapshot, infer_dataset_id_from_queries
12 13 from .utils import ensure_dir, safe_json_dumps, utc_now_iso
13 14  
14 15  
... ... @@ -24,10 +25,13 @@ class QueryBuildResult:
24 25  
25 26  
26 27 def _compact_batch_metadata(metadata: Dict[str, Any]) -> Dict[str, Any]:
  28 + dataset = dict(metadata.get("dataset") or {})
27 29 return {
28 30 "batch_id": metadata.get("batch_id"),
29 31 "created_at": metadata.get("created_at"),
30 32 "tenant_id": metadata.get("tenant_id"),
  33 + "dataset": dataset,
  34 + "dataset_id": dataset.get("dataset_id") or metadata.get("dataset_id"),
31 35 "top_k": metadata.get("top_k"),
32 36 "query_count": len(metadata.get("queries") or []),
33 37 "aggregate_metrics": dict(metadata.get("aggregate_metrics") or {}),
... ... @@ -85,6 +89,11 @@ class EvalStore:
85 89 CREATE TABLE IF NOT EXISTS build_runs (
86 90 run_id TEXT PRIMARY KEY,
87 91 tenant_id TEXT NOT NULL,
  92 + dataset_id TEXT,
  93 + dataset_display_name TEXT,
  94 + dataset_query_file TEXT,
  95 + dataset_query_count INTEGER,
  96 + dataset_query_sha1 TEXT,
88 97 query_text TEXT NOT NULL,
89 98 output_json_path TEXT NOT NULL,
90 99 metadata_json TEXT NOT NULL,
... ... @@ -94,6 +103,11 @@ class EvalStore:
94 103 CREATE TABLE IF NOT EXISTS batch_runs (
95 104 batch_id TEXT PRIMARY KEY,
96 105 tenant_id TEXT NOT NULL,
  106 + dataset_id TEXT,
  107 + dataset_display_name TEXT,
  108 + dataset_query_file TEXT,
  109 + dataset_query_count INTEGER,
  110 + dataset_query_sha1 TEXT,
97 111 output_json_path TEXT NOT NULL,
98 112 report_markdown_path TEXT NOT NULL,
99 113 config_snapshot_path TEXT NOT NULL,
... ... @@ -113,8 +127,31 @@ class EvalStore:
113 127 );
114 128 """
115 129 )
  130 + self._ensure_column("build_runs", "dataset_id", "TEXT")
  131 + self._ensure_column("build_runs", "dataset_display_name", "TEXT")
  132 + self._ensure_column("build_runs", "dataset_query_file", "TEXT")
  133 + self._ensure_column("build_runs", "dataset_query_count", "INTEGER")
  134 + self._ensure_column("build_runs", "dataset_query_sha1", "TEXT")
  135 + self._ensure_column("batch_runs", "dataset_id", "TEXT")
  136 + self._ensure_column("batch_runs", "dataset_display_name", "TEXT")
  137 + self._ensure_column("batch_runs", "dataset_query_file", "TEXT")
  138 + self._ensure_column("batch_runs", "dataset_query_count", "INTEGER")
  139 + self._ensure_column("batch_runs", "dataset_query_sha1", "TEXT")
  140 + self.conn.execute(
  141 + "CREATE INDEX IF NOT EXISTS idx_batch_runs_dataset_created ON batch_runs(dataset_id, created_at DESC)"
  142 + )
  143 + self.conn.execute(
  144 + "CREATE INDEX IF NOT EXISTS idx_build_runs_dataset_created ON build_runs(dataset_id, created_at DESC)"
  145 + )
116 146 self.conn.commit()
117 147  
  148 + def _ensure_column(self, table: str, column: str, column_type: str) -> None:
  149 + rows = self.conn.execute(f"PRAGMA table_info({table})").fetchall()
  150 + existing = {str(row["name"]) for row in rows}
  151 + if column in existing:
  152 + return
  153 + self.conn.execute(f"ALTER TABLE {table} ADD COLUMN {column} {column_type}")
  154 +
118 155 def upsert_corpus_docs(self, tenant_id: str, docs: Sequence[Dict[str, Any]]) -> None:
119 156 now = utc_now_iso()
120 157 rows = []
... ... @@ -302,13 +339,37 @@ class EvalStore:
302 339 )
303 340 self.conn.commit()
304 341  
305   - def insert_build_run(self, run_id: str, tenant_id: str, query_text: str, output_json_path: Path, metadata: Dict[str, Any]) -> None:
  342 + def insert_build_run(
  343 + self,
  344 + run_id: str,
  345 + tenant_id: str,
  346 + query_text: str,
  347 + output_json_path: Path,
  348 + metadata: Dict[str, Any],
  349 + dataset: Optional[EvalDatasetSnapshot] = None,
  350 + ) -> None:
  351 + dataset_info = dataset.summary() if dataset is not None else dict(metadata.get("dataset") or {})
306 352 self.conn.execute(
307 353 """
308   - INSERT OR REPLACE INTO build_runs (run_id, tenant_id, query_text, output_json_path, metadata_json, created_at)
309   - VALUES (?, ?, ?, ?, ?, ?)
  354 + INSERT OR REPLACE INTO build_runs (
  355 + run_id, tenant_id, dataset_id, dataset_display_name, dataset_query_file,
  356 + dataset_query_count, dataset_query_sha1, query_text, output_json_path, metadata_json, created_at
  357 + )
  358 + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
310 359 """,
311   - (run_id, tenant_id, query_text, str(output_json_path), safe_json_dumps(metadata), utc_now_iso()),
  360 + (
  361 + run_id,
  362 + tenant_id,
  363 + dataset_info.get("dataset_id"),
  364 + dataset_info.get("display_name"),
  365 + dataset_info.get("query_file"),
  366 + dataset_info.get("query_count"),
  367 + dataset_info.get("query_sha1"),
  368 + query_text,
  369 + str(output_json_path),
  370 + safe_json_dumps(metadata),
  371 + utc_now_iso(),
  372 + ),
312 373 )
313 374 self.conn.commit()
314 375  
... ... @@ -320,16 +381,27 @@ class EvalStore:
320 381 report_markdown_path: Path,
321 382 config_snapshot_path: Path,
322 383 metadata: Dict[str, Any],
  384 + dataset: Optional[EvalDatasetSnapshot] = None,
323 385 ) -> None:
  386 + dataset_info = dataset.summary() if dataset is not None else dict(metadata.get("dataset") or {})
324 387 self.conn.execute(
325 388 """
326 389 INSERT OR REPLACE INTO batch_runs
327   - (batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at)
328   - VALUES (?, ?, ?, ?, ?, ?, ?)
  390 + (
  391 + batch_id, tenant_id, dataset_id, dataset_display_name, dataset_query_file,
  392 + dataset_query_count, dataset_query_sha1, output_json_path, report_markdown_path,
  393 + config_snapshot_path, metadata_json, created_at
  394 + )
  395 + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
329 396 """,
330 397 (
331 398 batch_id,
332 399 tenant_id,
  400 + dataset_info.get("dataset_id"),
  401 + dataset_info.get("display_name"),
  402 + dataset_info.get("query_file"),
  403 + dataset_info.get("query_count"),
  404 + dataset_info.get("query_sha1"),
333 405 str(output_json_path),
334 406 str(report_markdown_path),
335 407 str(config_snapshot_path),
... ... @@ -339,27 +411,59 @@ class EvalStore:
339 411 )
340 412 self.conn.commit()
341 413  
342   - def list_batch_runs(self, limit: int = 20) -> List[Dict[str, Any]]:
343   - rows = self.conn.execute(
344   - """
345   - SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
346   - FROM batch_runs
347   - ORDER BY created_at DESC
348   - LIMIT ?
349   - """,
350   - (limit,),
351   - ).fetchall()
  414 + def list_batch_runs(self, limit: int = 20, dataset_id: Optional[str] = None) -> List[Dict[str, Any]]:
  415 + if dataset_id:
  416 + rows = self.conn.execute(
  417 + """
  418 + SELECT batch_id, tenant_id, dataset_id, dataset_display_name, dataset_query_file, dataset_query_count,
  419 + dataset_query_sha1, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
  420 + FROM batch_runs
  421 + WHERE dataset_id=?
  422 + ORDER BY created_at DESC
  423 + LIMIT ?
  424 + """,
  425 + (dataset_id, limit),
  426 + ).fetchall()
  427 + else:
  428 + rows = self.conn.execute(
  429 + """
  430 + SELECT batch_id, tenant_id, dataset_id, dataset_display_name, dataset_query_file, dataset_query_count,
  431 + dataset_query_sha1, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
  432 + FROM batch_runs
  433 + ORDER BY created_at DESC
  434 + LIMIT ?
  435 + """,
  436 + (limit,),
  437 + ).fetchall()
352 438 items: List[Dict[str, Any]] = []
353 439 for row in rows:
354 440 metadata = json.loads(row["metadata_json"])
  441 + inferred_dataset_id = row["dataset_id"] or metadata.get("dataset_id") or infer_dataset_id_from_queries(
  442 + metadata.get("queries") or []
  443 + )
  444 + dataset_meta = dict(metadata.get("dataset") or {})
  445 + if inferred_dataset_id and not dataset_meta.get("dataset_id"):
  446 + dataset_meta["dataset_id"] = inferred_dataset_id
  447 + if row["dataset_display_name"] and not dataset_meta.get("display_name"):
  448 + dataset_meta["display_name"] = row["dataset_display_name"]
  449 + if row["dataset_query_file"] and not dataset_meta.get("query_file"):
  450 + dataset_meta["query_file"] = row["dataset_query_file"]
  451 + if row["dataset_query_count"] and not dataset_meta.get("query_count"):
  452 + dataset_meta["query_count"] = int(row["dataset_query_count"])
  453 + if row["dataset_query_sha1"] and not dataset_meta.get("query_sha1"):
  454 + dataset_meta["query_sha1"] = row["dataset_query_sha1"]
355 455 items.append(
356 456 {
357 457 "batch_id": row["batch_id"],
358 458 "tenant_id": row["tenant_id"],
  459 + "dataset_id": inferred_dataset_id,
359 460 "output_json_path": row["output_json_path"],
360 461 "report_markdown_path": row["report_markdown_path"],
361 462 "config_snapshot_path": row["config_snapshot_path"],
362   - "metadata": _compact_batch_metadata(metadata),
  463 + "metadata": {
  464 + **_compact_batch_metadata(metadata),
  465 + "dataset": dataset_meta,
  466 + },
363 467 "created_at": row["created_at"],
364 468 }
365 469 )
... ... @@ -368,7 +472,8 @@ class EvalStore:
368 472 def get_batch_run(self, batch_id: str) -> Optional[Dict[str, Any]]:
369 473 row = self.conn.execute(
370 474 """
371   - SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
  475 + SELECT batch_id, tenant_id, dataset_id, dataset_display_name, dataset_query_file, dataset_query_count,
  476 + dataset_query_sha1, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
372 477 FROM batch_runs
373 478 WHERE batch_id = ?
374 479 """,
... ... @@ -376,13 +481,32 @@ class EvalStore:
376 481 ).fetchone()
377 482 if row is None:
378 483 return None
  484 + metadata = json.loads(row["metadata_json"])
  485 + inferred_dataset_id = row["dataset_id"] or metadata.get("dataset_id") or infer_dataset_id_from_queries(
  486 + metadata.get("queries") or []
  487 + )
  488 + dataset_meta = dict(metadata.get("dataset") or {})
  489 + if inferred_dataset_id and not dataset_meta.get("dataset_id"):
  490 + dataset_meta["dataset_id"] = inferred_dataset_id
  491 + if row["dataset_display_name"] and not dataset_meta.get("display_name"):
  492 + dataset_meta["display_name"] = row["dataset_display_name"]
  493 + if row["dataset_query_file"] and not dataset_meta.get("query_file"):
  494 + dataset_meta["query_file"] = row["dataset_query_file"]
  495 + if row["dataset_query_count"] and not dataset_meta.get("query_count"):
  496 + dataset_meta["query_count"] = int(row["dataset_query_count"])
  497 + if row["dataset_query_sha1"] and not dataset_meta.get("query_sha1"):
  498 + dataset_meta["query_sha1"] = row["dataset_query_sha1"]
379 499 return {
380 500 "batch_id": row["batch_id"],
381 501 "tenant_id": row["tenant_id"],
  502 + "dataset_id": inferred_dataset_id,
382 503 "output_json_path": row["output_json_path"],
383 504 "report_markdown_path": row["report_markdown_path"],
384 505 "config_snapshot_path": row["config_snapshot_path"],
385   - "metadata": json.loads(row["metadata_json"]),
  506 + "metadata": {
  507 + **metadata,
  508 + "dataset": dataset_meta,
  509 + },
386 510 "created_at": row["created_at"],
387 511 }
388 512  
... ...
scripts/evaluation/eval_framework/web_app.py
... ... @@ -11,13 +11,15 @@ from fastapi.staticfiles import StaticFiles
11 11  
12 12 from .api_models import BatchEvalRequest, SearchEvalRequest
13 13 from .constants import DEFAULT_QUERY_FILE
  14 +from .datasets import list_registered_datasets, resolve_dataset
14 15 from .framework import SearchEvaluationFramework
15 16  
16 17 _STATIC_DIR = Path(__file__).resolve().parent / "static"
17 18  
18 19  
19   -def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFAULT_QUERY_FILE) -> FastAPI:
  20 +def create_web_app(framework: SearchEvaluationFramework, initial_dataset_id: str | None = None) -> FastAPI:
20 21 app = FastAPI(title="Search Evaluation UI", version="1.0.0")
  22 + current_dataset_id = initial_dataset_id or "core_queries"
21 23  
22 24 app.mount(
23 25 "/static",
... ... @@ -31,35 +33,75 @@ def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFA
31 33 def home() -> str:
32 34 return index_path.read_text(encoding="utf-8")
33 35  
  36 + @app.get("/api/datasets")
  37 + def api_datasets() -> Dict[str, Any]:
  38 + stats_by_query = {item["query"]: item for item in framework.store.list_query_label_stats(framework.tenant_id)}
  39 + datasets = []
  40 + for item in list_registered_datasets(enabled_only=True):
  41 + snapshot = resolve_dataset(dataset_id=item.dataset_id, tenant_id=framework.tenant_id)
  42 + labeled_queries = sum(1 for query in snapshot.queries if (stats_by_query.get(query) or {}).get("total", 0) > 0)
  43 + datasets.append(
  44 + {
  45 + **snapshot.summary(),
  46 + "coverage_summary": {
  47 + "labeled_queries": labeled_queries,
  48 + "coverage_ratio": (labeled_queries / snapshot.query_count) if snapshot.query_count else 0.0,
  49 + },
  50 + }
  51 + )
  52 + return {"datasets": datasets, "current_dataset_id": current_dataset_id}
  53 +
  54 + @app.get("/api/datasets/{dataset_id}/queries")
  55 + def api_dataset_queries(dataset_id: str) -> Dict[str, Any]:
  56 + dataset = resolve_dataset(dataset_id=dataset_id, tenant_id=framework.tenant_id, require_enabled=True)
  57 + return {"dataset": dataset.summary(), "queries": list(dataset.queries)}
  58 +
34 59 @app.get("/api/queries")
35   - def api_queries() -> Dict[str, Any]:
36   - return {"queries": framework.queries_from_file(query_file)}
  60 + def api_queries(dataset_id: str | None = None) -> Dict[str, Any]:
  61 + dataset = resolve_dataset(dataset_id=dataset_id or current_dataset_id, tenant_id=framework.tenant_id)
  62 + return {"dataset": dataset.summary(), "queries": list(dataset.queries)}
37 63  
38 64 @app.post("/api/search-eval")
39 65 def api_search_eval(request: SearchEvalRequest) -> Dict[str, Any]:
  66 + dataset = resolve_dataset(
  67 + dataset_id=request.dataset_id or current_dataset_id,
  68 + tenant_id=framework.tenant_id,
  69 + language=request.language,
  70 + )
40 71 return framework.evaluate_live_query(
41 72 query=request.query,
42 73 top_k=request.top_k,
43 74 auto_annotate=request.auto_annotate,
44   - language=request.language,
  75 + language=dataset.language,
  76 + dataset=dataset,
45 77 )
46 78  
47 79 @app.post("/api/batch-eval")
48 80 def api_batch_eval(request: BatchEvalRequest) -> Dict[str, Any]:
49   - queries = request.queries or framework.queries_from_file(query_file)
  81 + dataset = resolve_dataset(
  82 + dataset_id=request.dataset_id or current_dataset_id,
  83 + tenant_id=framework.tenant_id,
  84 + language=request.language,
  85 + )
  86 + queries = request.queries or list(dataset.queries)
50 87 if not queries:
51 88 raise HTTPException(status_code=400, detail="No queries provided")
52 89 return framework.batch_evaluate(
53 90 queries=queries,
  91 + dataset=dataset,
54 92 top_k=request.top_k,
55 93 auto_annotate=request.auto_annotate,
56   - language=request.language,
  94 + language=dataset.language,
57 95 force_refresh_labels=request.force_refresh_labels,
58 96 )
59 97  
60 98 @app.get("/api/history")
61   - def api_history() -> Dict[str, Any]:
62   - return {"history": framework.store.list_batch_runs(limit=20)}
  99 + def api_history(dataset_id: str | None = None, limit: int = 20) -> Dict[str, Any]:
  100 + effective_dataset_id = dataset_id or current_dataset_id
  101 + return {
  102 + "history": framework.store.list_batch_runs(limit=limit, dataset_id=effective_dataset_id),
  103 + "dataset_id": effective_dataset_id,
  104 + }
63 105  
64 106 @app.get("/api/history/{batch_id}/report")
65 107 def api_history_report(batch_id: str) -> Dict[str, Any]:
... ... @@ -78,6 +120,7 @@ def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFA
78 120 "batch_id": row["batch_id"],
79 121 "created_at": row["created_at"],
80 122 "tenant_id": row["tenant_id"],
  123 + "dataset": row["metadata"].get("dataset") or {},
81 124 "report_markdown_path": str(report_path),
82 125 "markdown": report_path.read_text(encoding="utf-8"),
83 126 }
... ...
scripts/evaluation/queries/all_keywords.txt.top1w.shuf.top1k.clothing_filtered 0 → 100644
... ... @@ -0,0 +1,771 @@
  1 +白色oversized T-shirt
  2 +falda negra oficina
  3 +red fitted tee
  4 +黒いミディ丈スカート
  5 +黑色中长半身裙
  6 +فستان أسود متوسط الطول
  7 +чёрное летнее платье
  8 +修身牛仔裤
  9 +date night dress
  10 +vacation outfit dress
  11 +minimalist top
  12 +streetwear t-shirt
  13 +office casual blouse
  14 +波西米亚花朵衬衫
  15 +泡泡袖短袖
  16 +扎染字母T恤
  17 +V-Neck Cotton T-shirt
  18 +Athletic Gym T-shirt
  19 +Plus Size Loose T-shirt
  20 +Korean Style Slim T-shirt
  21 +Basic Layering Top
  22 +shawl collar cardigan
  23 +swim dress
  24 +毕业典礼定制西装
  25 +colorblock hoodie
  26 +sock boots
  27 +旅行服装
  28 +khaki green backpack
  29 +皱边裙
  30 +高跟鞋
  31 +图案连身衣
  32 +天鹅绒鸡尾酒会礼服
  33 +gingham dress
  34 +海滩度假装
  35 +vacation outfits
  36 +running shorts
  37 +pink sweater aesthetic
  38 +hiking boots
  39 +宽松开襟羊毛衫
  40 +business casual women
  41 +a-line dress
  42 +涤纶短裤
  43 +Compression Top Spandex
  44 +skiing trip insulated base layer
  45 +high waisted jeans
  46 +无袖夏装
  47 +雪纺衬衫
  48 +convertible zip-off hiking pants
  49 +petite summer linen shorts
  50 +tall slim fit men's linen shirt
  51 +tall slim fit trousers
  52 +tall straight leg pants
  53 +tassel maxi skirt
  54 +teacher clothesジャミロクワイ
  55 +barbie backpack
  56 +bandanas for women
  57 +columbia jacket men
  58 +halloween pjs
  59 +salwar suit
  60 +bolsas
  61 +jumpsuit herren
  62 +nike sneakers
  63 +tunics for women
  64 +skiunterwäsche kinder
  65 +long jacket for women winter wear
  66 +cape
  67 +playmobil einhorn
  68 +mens socks size 10-13
  69 +wedding guest dress fall
  70 +t shirt for men
  71 +golf shirts for men
  72 +barfußschuhe damen
  73 +sweatshirts for women stylish
  74 +toddler slippers
  75 +silicone ring
  76 +lululemon shorts
  77 +hausschuhe kinder mädchen
  78 +nba
  79 +hazbin hotel
  80 +alice in wonderland costume
  81 +women's lingerie, sleep & lounge
  82 +legami weihnachten
  83 +blouse readymade
  84 +portmonee herren
  85 +womens snow pants
  86 +tops für damen
  87 +hangers
  88 +snoopy gifts
  89 +charlie kirk
  90 +tennis skirt
  91 +linen pants women
  92 +dickies 874
  93 +skibrille damen
  94 +kurtis
  95 +warmer for men
  96 +tactical gear
  97 +thermo strumpfhose damen
  98 +hiking pants women
  99 +forest gump
  100 +maternity shorts
  101 +coat
  102 +chiffon sarees for women
  103 +weihnachtsohrringe
  104 +gold heels
  105 +kulturtasche damen
  106 +tank tops for women stylish
  107 +gefütterte matschhose
  108 +mens sweatpants
  109 +graphic print tops
  110 +crop tops for women western wear
  111 +bandanas for men
  112 +black skirt for women
  113 +spongebob costume
  114 +red tank top woman
  115 +hoka clifton 9 womens
  116 +sambas
  117 +loop schal damen
  118 +ethnic wear
  119 +cole haan women shoes
  120 +pyjama damen
  121 +koffer groß
  122 +mochila kipling
  123 +shirt dresses for women
  124 +shapewear for saree
  125 +boss herren
  126 +red beanie
  127 +demon slayer costume
  128 +kids halloween costumes
  129 +puma clothing
  130 +faultier socken
  131 +family christmas pajamas
  132 +traditional dress for women
  133 +mütze
  134 +wonder woman costume adult
  135 +golf glove
  136 +closed toe sandals women
  137 +ugly sweater men
  138 +pajama pants
  139 +bolsa maternidade
  140 +lingerie for women naughty
  141 +banarasi sarees for women
  142 +robes for women
  143 +portemonnaie herren
  144 +churidar set for women with dupatta
  145 +basketball shorts men
  146 +casual kurta set for women
  147 +outdoor hosen für herren
  148 +rcb jersey
  149 +womens jean shorts
  150 +boob tape
  151 +gym
  152 +shirt fan
  153 +sprayground backpack
  154 +twisters
  155 +handschuhe mit heizung
  156 +stirnband damen
  157 +cowboy hat men
  158 +vans shoes men
  159 +weste damen
  160 +old money clothes
  161 +womens shorts casual
  162 +new balance damen
  163 +slim wallet for men
  164 +red corset top
  165 +underwear for women combo
  166 +summer tops for seniors
  167 +carry on luggage
  168 +botas vaqueras para mujer
  169 +freddy krueger sweater
  170 +herren jeans
  171 +calvin klein unterhosen männer
  172 +pool bag
  173 +toms womens shoes
  174 +full sleeve tshirt for men
  175 +golf accessories
  176 +men socks
  177 +skull mask
  178 +jacketfor men
  179 +heated vest women
  180 +kostüm damen
  181 +lululemon crossbody bag
  182 +cap
  183 +white tops for women
  184 +jack
  185 +wollsocken
  186 +hoodie for women
  187 +toddler snow suit
  188 +felt
  189 +eastpak bauchtasche
  190 +fitness clothing
  191 +women kurta
  192 +mira costume kids
  193 +camisa masculina
  194 +black sneakers for men
  195 +easter dresses for women 2025
  196 +maria
  197 +oversized shirts for women
  198 +ballettkleidung mädchen
  199 +shapewear petticoat for women
  200 +beheizbare socken
  201 +kofferset
  202 +winter slippers for woman
  203 +denim shirt women
  204 +nachthemd damen langarm
  205 +white mini dress
  206 +hanes boxer briefs for men
  207 +hausschuhe
  208 +bomber jacket for man
  209 +herren jogginghose
  210 +u.s. polo assn.
  211 +regenhose damen
  212 +mens sweatshirt
  213 +north face jacket men
  214 +white sweater
  215 +small backpack
  216 +santa hats
  217 +duffel bag
  218 +sneaker herren
  219 +hello kitty pajamas
  220 +ecco herren schuhe
  221 +angel costume for girls
  222 +toe rings for women
  223 +nightgowns for women
  224 +boys easter shirt
  225 +red sarees for women
  226 +womens jacket
  227 +one piece swimsuit women tummy control
  228 +fersensporn einlagen
  229 +skechers for women
  230 +wintermütze herren
  231 +socks for woman
  232 +winter wear for men
  233 +meerjungfrau
  234 +kurti pant set with dupatta
  235 +hiking shoes women
  236 +womens fall clothes sale
  237 +skinny fit
  238 +costumes for adults
  239 +green tights
  240 +purses
  241 +clutch purses for women
  242 +relogio
  243 +schürze
  244 +papa geschenk
  245 +airtag holder
  246 +mardi gras beads
  247 +women's skirts
  248 +sheer black tights
  249 +red kurta set for women
  250 +bunny costume
  251 +sunglasses
  252 +malas e mochilas
  253 +sweat set for women
  254 +red top
  255 +code set for women stylish latest
  256 +football jersey for boys
  257 +jogginghose damen
  258 +flanell pyjama damen
  259 +herren t shirt
  260 +us polo t shirts for men
  261 +bodysuits for women
  262 +necessaire feminina
  263 +wig cap
  264 +pullover damen winter
  265 +half sweater for man
  266 +new balance herren
  267 +mala de viagem 10kg
  268 +dog costume
  269 +shoes for man stylish
  270 +crotchless lingerie outfits
  271 +postpartum belly band
  272 +sporthose herren kurz
  273 +pride shirt
  274 +panty for women
  275 +kaftan kurti for women
  276 +jogginghose herren nike
  277 +christmas onesie adult
  278 +period panty
  279 +wedding guest dress
  280 +womens dress pants
  281 +key chain
  282 +short kurtis for woman
  283 +white kurta set for women
  284 +boys water shoes
  285 +cargo pants for women high waist
  286 +チャンピオン パーカー
  287 +chikankari kurta for men
  288 +sally costume women
  289 +mittens for women
  290 +gay
  291 +eastpak rucksack
  292 +simple joys by carters
  293 +strickjacke herren
  294 +jorts
  295 +womens one piece swimsuits
  296 +batman
  297 +church dresses for women 2025
  298 +bra
  299 +nike socken damen 35-38
  300 +loafers for women
  301 +denim top
  302 +wärmesohlen für schuhe
  303 +vivaia shoes for women
  304 +louis phillips shirt for men
  305 +sexy night dress for women honeymoon
  306 +cap for men
  307 +jockey women
  308 +damen wintermantel
  309 +thermal for men
  310 +warme socken damen
  311 +panty for women daily use
  312 +long tops for woman
  313 +golf gifts for men
  314 +rieker winterschuhe damen
  315 +beach wear dress for women
  316 +kurta pajama set for men
  317 +baniyan for man
  318 +laufweste herren
  319 +nursing bras
  320 +pj sets for woman
  321 +louis philippe shirts for men
  322 +喪服 メンズ
  323 +sundress
  324 +dresses for women western wear
  325 +white sandals
  326 +mochila notebook
  327 +punjabi for men
  328 +linen pants men
  329 +libas kurta set for women
  330 +jack and jones jeans herren
  331 +men underwear
  332 +dresses for teens
  333 +workout set
  334 +carmesi period panties for women
  335 +men jackets
  336 +mütze jungen
  337 +marco polo damen
  338 +anarkali suit for women party wear
  339 +freizeithose herren
  340 +green wig
  341 +premium brand deals
  342 +plain sarees for women
  343 +scarf for women stylish
  344 +longchamp organizer insert
  345 +アンダーアーマー パーカー
  346 +red sweater for women
  347 +kurti tops
  348 +cowboy boots
  349 +norweger pullover herren
  350 +cupshe bathing suits for women
  351 +reading glasses for women
  352 +ugg boots damen
  353 +short sleeve shirts for women
  354 +girls snow boots
  355 +fall pajamas
  356 +go devil t shirt
  357 +golf deals
  358 +essentials hoodie
  359 +kerala sarees for women latest design
  360 +jeans tops for women
  361 +steppmantel damen winter
  362 +bombas
  363 +jeans pant for man
  364 +stiefel
  365 +spring tops for women 2025
  366 +wireless bras for women
  367 +plus size dresses for curvy women
  368 +tinkerbell costume for women
  369 +tênis masculino
  370 +panty
  371 +sequence sarees for women
  372 +adidas socken herren 43-46
  373 +top for women
  374 +racerback tank tops for women
  375 +old lady costume for kids
  376 +lola bunny costume
  377 +kurta pant set for women
  378 +woolen cap for man
  379 +onesie
  380 +high waisted shorts women
  381 +newborn girl clothes
  382 +gold heels for women
  383 +vikings
  384 +sweterfor women winter stylish plain black colour without button
  385 +sweater for kids
  386 +fascinators hats for women
  387 +zudio
  388 +curious george costume
  389 +wrangler purse
  390 +tank top with built in bra for women
  391 +bikini damen set
  392 +women kurta sets
  393 +suits for women
  394 +basketball gifts
  395 +alien costume women
  396 +womens sweatpants
  397 +crocs masculino
  398 +travel pants
  399 +yeoreo leggings
  400 +cotton shirts for men
  401 +winter gloves
  402 +period underwear
  403 +vaude
  404 +hausschuhe herren
  405 +crocs feminino
  406 +woolen cap for men
  407 +beheizbare einlegesohlen
  408 +relógios masculinos
  409 +uggs kids
  410 +fleece lined tights
  411 +feeding dresses for women full set
  412 +hausschuhe damen
  413 +garment bag
  414 +lioness
  415 +birkenstock sandals women
  416 +リーバイス 501
  417 +nippies
  418 +elsa kostüm mädchen
  419 +viking costume men
  420 +dirndl dresses women
  421 +platform sandals women
  422 +taschen damen
  423 +pretty garden dresses
  424 +saree
  425 +soft silk sarees for women
  426 +white heels
  427 +shoes for women
  428 +panama jack herren
  429 +coveralls for men
  430 +shirt for man
  431 +pullover damen herbst
  432 +concert outfits for women
  433 +running shoes for women
  434 +calvin klein
  435 +cat costume
  436 +shorts for kids girls
  437 +fahrradhandschuhe damen
  438 +botas de trabajo para hombre
  439 +plus size winter clothes for women
  440 +silicone rings for her
  441 +dr scholls women shoes
  442 +porch goose outfits
  443 +the grinch
  444 +green kurta set for women
  445 +ratchet belts for men
  446 +pajamas
  447 +binders
  448 +crop top for women stylish western
  449 +gold chain for men
  450 +turtle necks tops for women
  451 +veirdo hoodies for men
  452 +kette
  453 +sweater for men winter wear
  454 +hippie costume women
  455 +garmin watch
  456 +wallet
  457 +silk sarees
  458 +chuteira society
  459 +knee support for men gym
  460 +comfiest airport outfits
  461 +leather belt for men
  462 +nike tech
  463 +golf gifts
  464 +winterstiefel mädchen
  465 +family pajamas matching sets
  466 +vest for women
  467 +construction vest
  468 +snow pants men
  469 +スプリングコート メンズ
  470 +women sandals
  471 +cap headbands for graduation insert
  472 +ニューエラ パーカー
  473 +haarspangen damen
  474 +hand gloves for bike riding
  475 +short dresses for women
  476 +tween girls trendy stuff
  477 +suit
  478 +turtle neck t-shirt for men
  479 +geldbörse
  480 +leotards for girls
  481 +hiking shoes men
  482 +baseball bag
  483 +passport holder for travel
  484 +hoodies for men
  485 +ski jacket women
  486 +puma tshirt for man
  487 +lehenga for women latest design
  488 +basketball shoes
  489 +baumwollhandschuhe
  490 +strumpfhose mädchen
  491 +jessie toy story costume adult
  492 +womens underwear cotton
  493 +floral dresses for women
  494 +short kurti for women for jeans
  495 +stocking stuffers for teen boys
  496 +yoga mat for woman
  497 +womens sun hat
  498 +disfraz de halloween de hombre
  499 +high heels
  500 +trousers for men
  501 +vampire costume men
  502 +black tie
  503 +spiderman hoodie zip up
  504 +couples halloween costumes 2025
  505 +nike sweatpants for men
  506 +brown corset
  507 +last day of school teacher shirt
  508 +mens costume
  509 +baby doll night dress sexy
  510 +men kurta pajama set
  511 +nose studs
  512 +mens winter jackets
  513 +lingerie for women sexy slutty
  514 +vera bradley
  515 +womens slides
  516 +krishna dress for baby girl
  517 +black leggings women
  518 +satch schulrucksack jungen
  519 +mother of bride dresses
  520 +parx
  521 +fall clothes
  522 +suuksess
  523 +engagement rings for women
  524 +bademantel damen flauschig
  525 +levis jeans
  526 +red wig
  527 +flowy pants for women
  528 +maternity underwear
  529 +white button down shirt women
  530 +the north face jacke damen
  531 +renaissance costume women
  532 +matching pajamas for couples
  533 +tankini deals for retired women
  534 +formal shirts
  535 +socks for men 9-12
  536 +white tights
  537 +space jam
  538 +bodysuit
  539 +mens pants
  540 +shirt for men stylish
  541 +ugg clogs
  542 +waist beads
  543 +peignoirs femme
  544 +designer sarees for women party wear
  545 +white dress shirt for men
  546 +pullover
  547 +mens halloween costume
  548 +wellensteyn jacke herren
  549 +no show socks men
  550 +winter sneaker damen
  551 +bordeauxfarbener hoodie
  552 +rcb jersey 2025
  553 +ステューシー パーカー
  554 +vampire costume female
  555 +boys christmas pajamas
  556 +women hoodies for winter
  557 +fashion accessories
  558 +black crocs
  559 +gloves for men
  560 +vizzela
  561 +men pants
  562 +wheres waldo costume
  563 +toddler boots
  564 +shark onesie
  565 +body suit
  566 +gym gloves
  567 +tights
  568 +leather jacket men
  569 +damenuhr
  570 +chikankari kurti
  571 +small fan
  572 +ugg tasman
  573 +christmas sweater
  574 +fairy costume for girls
  575 +skechers winterschuhe damen
  576 +adidas spezial damen
  577 +hand gloves
  578 +beheizbare jacke
  579 +summer clothes for women
  580 +leggings
  581 +brown heels
  582 +rain poncho
  583 +rompers
  584 +renaissance costume men
  585 +christmas earrings
  586 +home slippers for women soft
  587 +puma cap men
  588 +rain boots kids
  589 +strickkleid damen herbst
  590 +jockey thermal wear for men
  591 +dresses for girls
  592 +bambus socken
  593 +raincoat for men waterproof
  594 +red lingerie for women
  595 +bathing suits
  596 +strawberry shortcake costume
  597 +victoria
  598 +carhartt pants for men
  599 +tennis shoes
  600 +indo western dress for men
  601 +tung tung tung sahur costume
  602 +bogg bag charms
  603 +football socks
  604 +compression t shirt
  605 +house slippers
  606 +digital watch
  607 +sneaker damen
  608 +tracksuit men
  609 +unterwäsche herren
  610 +mens halloween costumes
  611 +women saree
  612 +polka dot top
  613 +anniversary gifts for men
  614 +badelatschen herren
  615 +adidas shoes for women
  616 +sleeveless t shirts for men
  617 +cross necklace for women
  618 +nursing bras for breastfeeding
  619 +braune strumpfhose damen
  620 +wedding dress for women
  621 +churidar set for women
  622 +mens golf shorts
  623 +feeding kurtis for women cotton
  624 +boho dresses for women
  625 +damensch underwear for men
  626 +night suit for women cotton
  627 +corduroy pants women
  628 +adidas track suit for man
  629 +dresses for women 2025
  630 +cotton night suit for women
  631 +carhartt hoodie
  632 +jackets for men stylish latest
  633 +levis jeans for men
  634 +fall deals
  635 +mesh backpack
  636 +necessaire
  637 +umhängetasche herren
  638 +バドミントン ウェア
  639 +winterhandschuhe kinder
  640 +sully monsters inc costume
  641 +fleece lined tights women
  642 +アイズフロンティア 防寒着
  643 +organza kurta set for women
  644 +straw hat
  645 +tabaktasche
  646 +puma
  647 +ready to wear sarees for women
  648 +teacher shirts
  649 +brille
  650 +スカジャン
  651 +luxury outfits for women
  652 +winter boots for men
  653 +uhr damen
  654 +black lace top
  655 +dress for women
  656 +rumi kpop demon hunters costume
  657 +women sweater
  658 +puma sneaker herren
  659 +harry potter costume kids
  660 +whisper period panties
  661 +merino shirt damen
  662 +blouse for women
  663 +mens gym shorts
  664 +printed top
  665 +elphaba costume
  666 +halloween sweatshirts for women
  667 +rieker boots damen
  668 +arbeitstasche damen
  669 +turning point usa shirt
  670 +lycra track pants
  671 +puffer vests for women
  672 +freddy krueger costume women
  673 +pandora
  674 +oberteile damen
  675 +ariat boots mens
  676 +elmo
  677 +kpop demon hunters backpack
  678 +plus size costumes for women
  679 +tommy hilfiger herren jacke
  680 +woolen kurti for women
  681 +funny st patricks day shirt
  682 +100 days of school costume
  683 +formal dresses
  684 +bandhani saree
  685 +knee high boots women teaieui
  686 +skechers sandals for woman
  687 +affenzahn rucksack
  688 +tube tops for women with built in bra
  689 +jack and jones
  690 +chudidars set for women
  691 +kids dress girls
  692 +jack wolfskin jacke damen
  693 +anarkali kurtis for women
  694 +northface backpack for school
  695 +wide calf boots for women
  696 +halloween costumes for men
  697 +mens t shirts with collar
  698 +tênis feminino
  699 +sling bag for men
  700 +sports jacket for men
  701 +コロンビア ダウンジャケット
  702 +fuzzy socks
  703 +faja body shaper
  704 +women tank tops
  705 +us polo tshirt for men
  706 +chocolate brown dress
  707 +sandalia masculina
  708 +coach
  709 +ブライダルインナー
  710 +boxer briefs for men pack
  711 +the upside
  712 +womens t shirts
  713 +us polo shirt
  714 +kashmiri kurta set for women
  715 +dress shoes for men
  716 +korean pants for woman
  717 +nipple covers for women
  718 +sporttasche herren
  719 +running shoes for men
  720 +swarovski kette
  721 +indo era kurta set with dupatta for women
  722 +brown tights
  723 +handbags
  724 +sporttasche
  725 +tshirts for women
  726 +nighty for women stylish
  727 +overalls for men
  728 +palazzo pants for women
  729 +sperry shoes for men
  730 +lululemon jacket
  731 +geschenk mädchen 9 jahre
  732 +human hair wig
  733 +lowa wanderschuhe herren
  734 +clarks shoes for women
  735 +jockey vest for man
  736 +winter dress for women stylish
  737 +black cardigan for women
  738 +charlie kirk hat
  739 +toddler water shoes
  740 +rieker stiefeletten für damen
  741 +golf shoes men
  742 +presente masculino
  743 +tenis nike para mujer
  744 +stocking
  745 +gabor stiefeletten damen
  746 +uggs women
  747 +petite dresses for women 5 ft
  748 +cotton dress for woman
  749 +white pant for man
  750 +black saree party wear
  751 +allen solly t shirts for men
  752 +fahrradhandschuhe
  753 +コンバース
  754 +dr martens womens boots
  755 +sweater for boys
  756 +weitschaftstiefel damen
  757 +maternity dress
  758 +stiefel damen schwarz
  759 +アンダーアーマー tシャツ
  760 +coach purse
  761 +bombas socks for women
  762 +small crossbody bags for women
  763 +night dress
  764 +abendkleid
  765 +summer outfits for women
  766 +winterkleider damen
  767 +straight fit jeans for women
  768 +bolsa de viagem
  769 +rain boots women
  770 +korean tops for women
  771 +bullmer
... ...
scripts/evaluation/queries/not_clothing.txt 0 → 100644
... ... @@ -0,0 +1,371 @@
  1 +ultrasonic jewelry cleaner
  2 +roland kaiser
  3 +camping ausrüstung
  4 +transformers
  5 +badminton
  6 +burts bees
  7 +barbie accessories
  8 +gel nail polish remover
  9 +thrive causemetics
  10 +garmin uhr
  11 +fathers day gift
  12 +concealer
  13 +pack n play
  14 +balloonerism
  15 +amazon outlet
  16 +running essentials
  17 +snoopy geschenke
  18 +new born baby essentials
  19 +super kitties
  20 +canvas
  21 +transformers age of the primes
  22 +tea pot
  23 +rosary
  24 +silverette nursing cups
  25 +n95 mask for men
  26 +yeti camino 20
  27 +rolex watches for men
  28 +darts
  29 +toddler christmas gifts
  30 +the big bang theory
  31 +ayliva
  32 +motorrad zubehör
  33 +sockenwolle
  34 +gifts for men who have everything
  35 +casio uhr
  36 +fitness tracker
  37 +weihnachtsgeschenke für frauen
  38 +eye liner
  39 +mini fan
  40 +sarah connor
  41 +yoga mat thick
  42 +father's day
  43 +barbies
  44 +gifts for 2 year old girls
  45 +funny fathers day gifts
  46 +der grinch
  47 +fahrzeugschein hülle
  48 +ptomely grey
  49 +apple watch
  50 +dragon ball
  51 +golf bags for men
  52 +friday the 13th
  53 +last of us
  54 +mirror with lights
  55 +borat
  56 +lustige geschenke
  57 +stitch adventskalender
  58 +withings scanwatch 2
  59 +taylor swift gifts
  60 +ghostbusters
  61 +best organization essentials
  62 +action figures
  63 +gifts for 4 year old girl
  64 +toothpaste
  65 +kubotan
  66 +faultier
  67 +capybara plush
  68 +instant camera
  69 +stitch sachen
  70 +whisper ultra xl plus
  71 +cookies
  72 +gas mask
  73 +mothers day gifts for daughter
  74 +hochzeit
  75 +aura ring
  76 +rollschuhe
  77 +guarda chuva
  78 +the goonies
  79 +pocket pussies
  80 +stanley cup 40 oz
  81 +digital calendar
  82 +ぼーん
  83 +phone stand
  84 +pacifier
  85 +gifts for teen boys
  86 +sonic toys
  87 +kitchen sink
  88 +fourth of july deals
  89 +joop homme
  90 +baby essentials
  91 +male sex toy
  92 +supernatural
  93 +kids watch
  94 +retirement gifts for men
  95 +helikon tex
  96 +christmas gifts for grandkids
  97 +shopping cart cover for baby
  98 +sneaker balls
  99 +bedroom decor
  100 +herren uhr
  101 +the shooting of charlie kirk
  102 +vape
  103 +brinquedo menina
  104 +nascar
  105 +cruise essentials
  106 +shaun das schaf
  107 +star wars lego
  108 +geschenk für mama
  109 +black friday angebote 2025 ab wann
  110 +marie antoinette
  111 +teenage boy gifts
  112 +gabbys dollhouse figuren
  113 +jeep wrangler accessories
  114 +graduation gifts for her
  115 +sg cricket kit
  116 +shibumi beach shade
  117 +pilates board
  118 +vorhängeschloss mit zahlencode
  119 +olsenbande
  120 +weihnachtssüßigkeiten
  121 +pilates equipment
  122 +smart watches for women
  123 +michael kors uhr damen
  124 +gifts for people who love baking
  125 +corinthians
  126 +razor
  127 +regenschirm
  128 +fidget toys
  129 +iron man helmet
  130 +christmas wreath
  131 +corpes bride
  132 +portable fan
  133 +diane keaton
  134 +softball bag
  135 +apple watch ultra 2
  136 +jewelry organizers and storage
  137 +dog man
  138 +aperol
  139 +canguru para bebe
  140 +fishing lures
  141 +miss mouths messy eater stain remover
  142 +hydration backpack
  143 +wärmegürtel
  144 +golf balls
  145 +itzy ritzy
  146 +boba
  147 +schwangerschaft
  148 +window fan
  149 +hand cream
  150 +calculator
  151 +twin peaks
  152 +curb your enthusiasm
  153 +anal plug
  154 +scarface
  155 +diet coke
  156 +greys anatomy
  157 +funny gifts
  158 +hunting deals
  159 +hair color for women
  160 +labubu keychain
  161 +geschenk frau
  162 +gifts for people who are always cold
  163 +back scratcher
  164 +dinosaur
  165 +ultraschallreiniger
  166 +barbell
  167 +pink room decor
  168 +bateria cr2032
  169 +chicken jockey
  170 +prime deals sale
  171 +capybara
  172 +stocking stuffers
  173 +boo basket stuffers for women
  174 +dresser for bedroom
  175 +glasses cleaner
  176 +berserk
  177 +summer i turned preety
  178 +boat accessories
  179 +cheers
  180 +pete the cat
  181 +american cheese
  182 +kitchen accessories
  183 +travel size travel products
  184 +wall shelf
  185 +raquete beach tennis
  186 +insider
  187 +nightstand
  188 +cash box
  189 +cotton candy
  192 +ultrasonic jewelry cleaner
  193 +roland kaiser
  194 +camping ausrüstung
  195 +transformers
  196 +badminton
  197 +burts bees
  198 +gel nail polish remover
  199 +thrive causemetics
  200 +garmin uhr
  201 +fathers day gift
  202 +concealer
  203 +shirt fan
  204 +twisters
  205 +pack n play
  206 +balloonerism
  207 +amazon outlet
  208 +golf accessories
  209 +running essentials
  210 +felt
  211 +new born baby essentials
  212 +super kitties
  213 +maria
  214 +canvas
  215 +transformers age of the primes
  216 +tea pot
  217 +rosary
  218 +silverette nursing cups
  219 +yeti camino 20
  220 +rolex watches for men
  221 +darts
  222 +toddler christmas gifts
  223 +the big bang theory
  224 +ayliva
  225 +fersensporn einlagen
  226 +motorrad zubehör
  227 +meerjungfrau
  228 +sockenwolle
  229 +gifts for men who have everything
  230 +fitness tracker
  231 +eye liner
  232 +mini fan
  233 +sarah connor
  234 +yoga mat thick
  235 +father's day
  236 +gifts for 2 year old girls
  237 +der grinch
  238 +fahrzeugschein hülle
  239 +ptomely grey
  240 +apple watch
  241 +key chain
  242 +gay
  243 +dragon ball
  244 +batman
  245 +friday the 13th
  246 +mirror with lights
  247 +last of us
  248 +borat
  249 +golf gifts for men
  250 +lustige geschenke
  251 +stitch adventskalender
  252 +withings scanwatch 2
  253 +taylor swift gifts
  254 +ghostbusters
  255 +best organization essentials
  256 +action figures
  257 +premium brand deals
  258 +toothpaste
  259 +kubotan
  260 +faultier
  261 +capybara plush
  262 +instant camera
  263 +golf deals
  264 +cookies
  265 +gas mask
  266 +mothers day gifts for daughter
  267 +hochzeit
  268 +aura ring
  269 +rollschuhe
  270 +guarda chuva
  271 +the goonies
  272 +pocket pussies
  273 +zudio
  274 +basketball gifts
  275 +stanley cup 40 oz
  276 +digital calendar
  277 +ぼーん
  278 +pacifier
  279 +phone stand
  280 +kitchen sink
  281 +sonic toys
  282 +fourth of july deals
  283 +male sex toy
  284 +supernatural
  285 +kids watch
  286 +retirement gifts for men
  287 +kette
  288 +garmin watch
  289 +christmas gifts for grandkids
  290 +sneaker balls
  291 +shopping cart cover for baby
  292 +bedroom decor
  293 +vape
  294 +brinquedo menina
  295 +cruise essentials
  296 +nascar
  297 +barbies
  298 +star wars lego
  299 +apple watch
  300 +gabbys dollhouse figuren
  301 +jeep wrangler accessories
  302 +graduation gifts for her
  303 +sg cricket kit
  304 +shibumi beach shade
  305 +pilates board
  306 +vorhängeschloss mit zahlencode
  307 +olsenbande
  308 +weihnachtssüßigkeiten
  309 +pilates equipment
  310 +fidget toys
  311 +iron man helmet
  312 +christmas wreath
  313 +corpes bride
  314 +portable fan
  315 +diane keaton
  316 +softball bag
  317 +aperol
  318 +dog man
  319 +fishing lures
  320 +miss mouths messy eater stain remover
  321 +tung tung tung sahur costume
  322 +bogg bag charms
  323 +anniversary gifts for men
  324 +golf balls
  325 +itzy ritzy
  326 +boba
  327 +window fan
  328 +rumi kpop demon hunters costume
  329 +hand cream
  330 +calculator
  331 +twin peaks
  332 +turning point usa shirt
  333 +curb your enthusiasm
  334 +pandora
  335 +kpop demon hunters backpack
  336 +anal plug
  337 +scarface
  338 +diet coke
  339 +greys anatomy
  340 +hunting deals
  341 +100 days of school costume
  342 +hair color for women
  343 +labubu keychain
  344 +back scratcher
  345 +dinosaur
  346 +ultraschallreiniger
  347 +barbell
  348 +bateria cr2032
  349 +pink room decor
  350 +chicken jockey
  351 +prime deals sale
  352 +capybara
  353 +stocking stuffers
  354 +the upside
  355 +boo basket stuffers for women
  356 +dresser for bedroom
  357 +glasses cleaner
  358 +berserk
  359 +summer i turned preety
  360 +boat accessories
  361 +cheers
  362 +human hair wig
  363 +pete the cat
  364 +american cheese
  365 +kitchen accessories
  366 +travel size travel products
  367 +wall shelf
  368 +insider
  369 +nightstand
  370 +cash box
  371 +cotton candy
... ...
scripts/evaluation/resume_coarse_fusion_tuning_long.sh
... ... @@ -29,6 +29,7 @@ fi
29 29 MAX_EVALS="${MAX_EVALS:-36}"
30 30 BATCH_SIZE="${BATCH_SIZE:-3}"
31 31 CANDIDATE_POOL_SIZE="${CANDIDATE_POOL_SIZE:-512}"
  32 +DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}"
32 33  
33 34 LAUNCH_DIR="artifacts/search_evaluation/tuning_launches"
34 35 mkdir -p "${LAUNCH_DIR}"
... ... @@ -44,6 +45,7 @@ CMD=(
44 45 --search-space "${RUN_DIR}/search_space.yaml"
45 46 --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md
46 47 --tenant-id 163
  48 + --dataset-id "${DATASET_ID}"
47 49 --queries-file scripts/evaluation/queries/queries.txt
48 50 --top-k 100
49 51 --language en
... ...
scripts/evaluation/run_coarse_fusion_tuning.sh
... ... @@ -10,6 +10,7 @@ python scripts/evaluation/tune_fusion.py \
10 10 --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml \
11 11 --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md \
12 12 --tenant-id 163 \
  13 + --dataset-id "${REPO_EVAL_DATASET_ID:-core_queries}" \
13 14 --queries-file scripts/evaluation/queries/queries.txt \
14 15 --top-k 100 \
15 16 --language en \
... ...
scripts/evaluation/start_coarse_fusion_tuning_long.sh
... ... @@ -10,6 +10,7 @@ MAX_EVALS=&quot;${MAX_EVALS:-36}&quot;
10 10 BATCH_SIZE="${BATCH_SIZE:-3}"
11 11 CANDIDATE_POOL_SIZE="${CANDIDATE_POOL_SIZE:-512}"
12 12 RANDOM_SEED="${RANDOM_SEED:-20260416}"
  13 +DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}"
13 14  
14 15 LAUNCH_DIR="artifacts/search_evaluation/tuning_launches"
15 16 mkdir -p "${LAUNCH_DIR}"
... ... @@ -25,6 +26,7 @@ CMD=(
25 26 --search-space scripts/evaluation/tuning/coarse_rank_fusion_space.yaml
26 27 --seed-report artifacts/search_evaluation/batch_reports/batch_20260415T150754Z_00b6a8aa3d.md
27 28 --tenant-id 163
  29 + --dataset-id "${DATASET_ID}"
28 30 --queries-file scripts/evaluation/queries/queries.txt
29 31 --top-k 100
30 32 --language en
... ...
scripts/evaluation/start_eval.sh
... ... @@ -6,6 +6,7 @@ ROOT=&quot;$(cd &quot;$(dirname &quot;$0&quot;)/../..&quot; &amp;&amp; pwd)&quot;
6 6 cd "$ROOT"
7 7 PY="${ROOT}/.venv/bin/python"
8 8 TENANT_ID="${TENANT_ID:-163}"
  9 +DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}"
9 10 QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
10 11  
11 12 usage() {
... ... @@ -13,13 +14,14 @@ usage() {
13 14 echo " batch — batch eval: live search every query, LLM only for missing labels (top_k=50)"
14 15 echo " batch-rebuild — deep rebuild: build --force-refresh-labels (search recall pool + full-corpus rerank + batched LLM; expensive)"
15 16 echo " serve — eval UI (default http://0.0.0.0:\${EVAL_WEB_PORT:-6010}/; also: ./scripts/start_eval_web.sh)"
16   - echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
  17 + echo "Env: TENANT_ID (default 163), REPO_EVAL_DATASET_ID (default core_queries), REPO_EVAL_QUERIES, EVAL_WEB_HOST, EVAL_WEB_PORT (default 6010)"
17 18 }
18 19  
19 20 case "${1:-}" in
20 21 batch)
21 22 exec "$PY" scripts/evaluation/build_annotation_set.py batch \
22 23 --tenant-id "$TENANT_ID" \
  24 + --dataset-id "$DATASET_ID" \
23 25 --queries-file "$QUERIES" \
24 26 --top-k 50 \
25 27 --language en
... ... @@ -27,6 +29,7 @@ case &quot;${1:-}&quot; in
27 29 batch-rebuild)
28 30 exec "$PY" scripts/evaluation/build_annotation_set.py build \
29 31 --tenant-id "$TENANT_ID" \
  32 + --dataset-id "$DATASET_ID" \
30 33 --queries-file "$QUERIES" \
31 34 --search-depth 500 \
32 35 --rerank-depth 10000 \
... ... @@ -40,6 +43,7 @@ case &quot;${1:-}&quot; in
40 43 EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}"
41 44 exec "$PY" scripts/evaluation/serve_eval_web.py serve \
42 45 --tenant-id "$TENANT_ID" \
  46 + --dataset-id "$DATASET_ID" \
43 47 --queries-file "$QUERIES" \
44 48 --host "$EVAL_WEB_HOST" \
45 49 --port "$EVAL_WEB_PORT"
... ...
scripts/evaluation/tune_fusion.py
... ... @@ -41,6 +41,7 @@ from scripts.evaluation.eval_framework import ( # noqa: E402
41 41 utc_now_iso,
42 42 utc_timestamp,
43 43 )
  44 +from scripts.evaluation.eval_framework.datasets import resolve_dataset
44 45  
45 46  
46 47 CONFIG_PATH = PROJECT_ROOT / "config" / "config.yaml"
... ... @@ -373,6 +374,7 @@ def verify_backend_config(base_url: str, target_path: str, expected: Dict[str, A
373 374 def run_batch_eval(
374 375 *,
375 376 tenant_id: str,
  377 + dataset_id: str | None,
376 378 queries_file: Path,
377 379 top_k: int,
378 380 language: str,
... ... @@ -384,13 +386,15 @@ def run_batch_eval(
384 386 "batch",
385 387 "--tenant-id",
386 388 str(tenant_id),
387   - "--queries-file",
388   - str(queries_file),
389 389 "--top-k",
390 390 str(top_k),
391 391 "--language",
392 392 language,
393 393 ]
  394 + if dataset_id:
  395 + cmd.extend(["--dataset-id", dataset_id])
  396 + else:
  397 + cmd.extend(["--queries-file", str(queries_file)])
394 398 if force_refresh_labels:
395 399 cmd.append("--force-refresh-labels")
396 400 completed = subprocess.run(
... ... @@ -406,16 +410,21 @@ def run_batch_eval(
406 410 if not batch_ids:
407 411 raise RuntimeError(f"failed to parse batch output: {output[-2000:]}")
408 412 batch_id = batch_ids[-1]
409   - batch_json_path = DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_id}.json"
  413 + pattern = f"datasets/*/batch_reports/{batch_id}/report.json"
  414 + matches = sorted(DEFAULT_ARTIFACT_ROOT.glob(pattern))
  415 + batch_json_path = matches[0] if matches else (DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_id}.json")
410 416 if not batch_json_path.is_file():
411 417 raise RuntimeError(f"batch json not found after eval: {batch_json_path}")
412 418 payload = json.loads(batch_json_path.read_text(encoding="utf-8"))
  419 + report_path = batch_json_path.with_name("report.md")
  420 + if not report_path.is_file():
  421 + report_path = DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_id}.md"
413 422 return {
414 423 "batch_id": batch_id,
415 424 "payload": payload,
416 425 "raw_output": output,
417 426 "batch_json_path": str(batch_json_path),
418   - "batch_report_path": str(DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_id}.md"),
  427 + "batch_report_path": str(report_path),
419 428 }
420 429  
421 430  
... ... @@ -806,6 +815,8 @@ def render_markdown(
806 815 run_id: str,
807 816 created_at: str,
808 817 tenant_id: str,
  818 + dataset_id: str,
  819 + dataset_name: str,
809 820 query_count: int,
810 821 top_k: int,
811 822 metric: str,
... ... @@ -829,6 +840,8 @@ def render_markdown(
829 840 f"- Run ID: {run_id}",
830 841 f"- Created at: {created_at}",
831 842 f"- Tenant ID: {tenant_id}",
  843 + f"- Dataset ID: {dataset_id}",
  844 + f"- Dataset Name: {dataset_name}",
832 845 f"- Query count: {query_count}",
833 846 f"- Top K: {top_k}",
834 847 f"- Score metric: {metric}",
... ... @@ -941,6 +954,8 @@ def persist_run_summary(
941 954 run_dir: Path,
942 955 run_id: str,
943 956 tenant_id: str,
  957 + dataset_id: str,
  958 + dataset_name: str,
944 959 query_count: int,
945 960 top_k: int,
946 961 metric: str,
... ... @@ -951,6 +966,8 @@ def persist_run_summary(
951 966 "run_id": run_id,
952 967 "created_at": utc_now_iso(),
953 968 "tenant_id": tenant_id,
  969 + "dataset_id": dataset_id,
  970 + "dataset_name": dataset_name,
954 971 "query_count": query_count,
955 972 "top_k": top_k,
956 973 "score_metric": metric,
... ... @@ -965,6 +982,8 @@ def persist_run_summary(
965 982 run_id=run_id,
966 983 created_at=summary["created_at"],
967 984 tenant_id=tenant_id,
  985 + dataset_id=dataset_id,
  986 + dataset_name=dataset_name,
968 987 query_count=query_count,
969 988 top_k=top_k,
970 989 metric=metric,
... ... @@ -976,8 +995,18 @@ def persist_run_summary(
976 995  
977 996  
978 997 def run_experiment_mode(args: argparse.Namespace) -> None:
979   - queries_file = Path(args.queries_file)
980   - queries = read_queries(queries_file)
  998 + dataset = resolve_dataset(
  999 + dataset_id=getattr(args, "dataset_id", None),
  1000 + query_file=Path(args.queries_file).resolve() if getattr(args, "queries_file", None) else None,
  1001 + tenant_id=str(args.tenant_id),
  1002 + language=str(args.language),
  1003 + )
  1004 + args.dataset_id = dataset.dataset_id
  1005 + args.queries_file = str(dataset.query_file)
  1006 + args.tenant_id = dataset.tenant_id
  1007 + args.language = dataset.language
  1008 + queries_file = dataset.query_file
  1009 + queries = list(dataset.queries)
981 1010 base_config_text = CONFIG_PATH.read_text(encoding="utf-8")
982 1011 base_config = load_yaml(CONFIG_PATH)
983 1012 experiments = load_experiments(Path(args.experiments_file))
... ... @@ -1012,6 +1041,7 @@ def run_experiment_mode(args: argparse.Namespace) -&gt; None:
1012 1041 )
1013 1042 batch_result = run_batch_eval(
1014 1043 tenant_id=args.tenant_id,
  1044 + dataset_id=args.dataset_id,
1015 1045 queries_file=queries_file,
1016 1046 top_k=args.top_k,
1017 1047 language=args.language,
... ... @@ -1064,6 +1094,8 @@ def run_experiment_mode(args: argparse.Namespace) -&gt; None:
1064 1094 run_dir=run_dir,
1065 1095 run_id=run_id,
1066 1096 tenant_id=str(args.tenant_id),
  1097 + dataset_id=str(args.dataset_id),
  1098 + dataset_name=dataset.display_name,
1067 1099 query_count=len(queries),
1068 1100 top_k=args.top_k,
1069 1101 metric=args.score_metric,
... ... @@ -1075,8 +1107,18 @@ def run_experiment_mode(args: argparse.Namespace) -&gt; None:
1075 1107  
1076 1108  
1077 1109 def run_optimize_mode(args: argparse.Namespace) -> None:
1078   - queries_file = Path(args.queries_file)
1079   - queries = read_queries(queries_file)
  1110 + dataset = resolve_dataset(
  1111 + dataset_id=getattr(args, "dataset_id", None),
  1112 + query_file=Path(args.queries_file).resolve() if getattr(args, "queries_file", None) else None,
  1113 + tenant_id=str(args.tenant_id),
  1114 + language=str(args.language),
  1115 + )
  1116 + args.dataset_id = dataset.dataset_id
  1117 + args.queries_file = str(dataset.query_file)
  1118 + args.tenant_id = dataset.tenant_id
  1119 + args.language = dataset.language
  1120 + queries_file = dataset.query_file
  1121 + queries = list(dataset.queries)
1080 1122 base_config_text = CONFIG_PATH.read_text(encoding="utf-8")
1081 1123 base_config = load_yaml(CONFIG_PATH)
1082 1124 search_space_path = Path(args.search_space)
... ... @@ -1101,6 +1143,11 @@ def run_optimize_mode(args: argparse.Namespace) -&gt; None:
1101 1143 baseline_key = space.canonical_key(baseline_params)
1102 1144 if baseline_key not in {space.canonical_key(item["params"]) for item in trials if item.get("params")}:
1103 1145 payload = load_batch_payload(args.seed_report)
  1146 + payload_dataset_id = str(((payload.get("dataset") or {}).get("dataset_id")) or "")
  1147 + if payload_dataset_id and payload_dataset_id != str(args.dataset_id):
  1148 + raise RuntimeError(
  1149 + f"seed report dataset mismatch: expected={args.dataset_id} actual={payload_dataset_id}"
  1150 + )
1104 1151 trial = {
1105 1152 "trial_id": next_trial_name(trials, "trial"),
1106 1153 "name": "seed_baseline",
... ... @@ -1169,6 +1216,7 @@ def run_optimize_mode(args: argparse.Namespace) -&gt; None:
1169 1216 )
1170 1217 batch_result = run_batch_eval(
1171 1218 tenant_id=args.tenant_id,
  1219 + dataset_id=args.dataset_id,
1172 1220 queries_file=queries_file,
1173 1221 top_k=args.top_k,
1174 1222 language=args.language,
... ... @@ -1236,6 +1284,8 @@ def run_optimize_mode(args: argparse.Namespace) -&gt; None:
1236 1284 run_dir=run_dir,
1237 1285 run_id=run_id,
1238 1286 tenant_id=str(args.tenant_id),
  1287 + dataset_id=str(args.dataset_id),
  1288 + dataset_name=dataset.display_name,
1239 1289 query_count=len(queries),
1240 1290 top_k=args.top_k,
1241 1291 metric=args.score_metric,
... ... @@ -1268,6 +1318,8 @@ def run_optimize_mode(args: argparse.Namespace) -&gt; None:
1268 1318 run_dir=run_dir,
1269 1319 run_id=run_id,
1270 1320 tenant_id=str(args.tenant_id),
  1321 + dataset_id=str(args.dataset_id),
  1322 + dataset_name=dataset.display_name,
1271 1323 query_count=len(queries),
1272 1324 top_k=args.top_k,
1273 1325 metric=args.score_metric,
... ... @@ -1286,6 +1338,7 @@ def build_parser() -&gt; argparse.ArgumentParser:
1286 1338 )
1287 1339 parser.add_argument("--mode", choices=["optimize", "experiments"], default="optimize")
1288 1340 parser.add_argument("--tenant-id", default="163")
  1341 + parser.add_argument("--dataset-id", default="core_queries")
1289 1342 parser.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
1290 1343 parser.add_argument("--top-k", type=int, default=100)
1291 1344 parser.add_argument("--language", default="en")
... ...
scripts/inspect/analyze_coarse_component_regression.py 0 → 100644
... ... @@ -0,0 +1,317 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Compare coarse-ranking score components between two indices for queries that regressed
  4 +in evaluation reports.
  5 +
  6 +This script answers a narrower question than field diffing:
  7 +for the documents that matter in worse queries, did the ranking move because of
  8 +image KNN, text KNN, lexical/text score, or coarse-window recall?
  9 +
  10 +Typical usage:
  11 + ./.venv/bin/python scripts/inspect/analyze_coarse_component_regression.py \
  12 + --current-report artifacts/search_evaluation/batch_reports/batch_20260417T073901Z_00b6a8aa3d.json \
  13 + --backup-report artifacts/search_evaluation/batch_reports/batch_20260417T074717Z_00b6a8aa3d.json \
  14 + --current-index search_products_tenant_163 \
  15 + --backup-index search_products_tenant_163_backup_20260415_1438
  16 +"""
  17 +
  18 +from __future__ import annotations
  19 +
  20 +import argparse
  21 +import logging
  22 +import os
  23 +import statistics
  24 +import sys
  25 +from collections import Counter
  26 +from pathlib import Path
  27 +from typing import Any, Dict, Iterable, List, Sequence, Tuple
  28 +
  29 +PROJECT_ROOT = Path(__file__).resolve().parents[2]
  30 +if str(PROJECT_ROOT) not in sys.path:
  31 + sys.path.insert(0, str(PROJECT_ROOT))
  32 +
  33 +from config import get_app_config
  34 +from context.request_context import create_request_context
  35 +from query import QueryParser
  36 +from search import Searcher
  37 +from utils.es_client import get_es_client_from_env
  38 +
  39 +from scripts.inspect.analyze_eval_index_regression import _load_report
  40 +
  41 +
  42 +logger = logging.getLogger("coarse_component_regression")
  43 +
  44 +
  45 +def _rank_map(rows: Sequence[Dict[str, Any]]) -> Dict[str, int]:
  46 + return {str(row["spu_id"]): int(row["rank"]) for row in rows}
  47 +
  48 +
  49 +def _collect_regressed_docs(
  50 + current_report: Dict[str, Any],
  51 + backup_report: Dict[str, Any],
  52 + *,
  53 + rank_gap_threshold: int,
  54 + scan_depth: int,
  55 +) -> Dict[str, List[Dict[str, Any]]]:
  56 + current_per_query = {row["query"]: row for row in current_report["per_query"]}
  57 + backup_per_query = {row["query"]: row for row in backup_report["per_query"]}
  58 + grouped: Dict[str, List[Dict[str, Any]]] = {}
  59 + for query, current_case in current_per_query.items():
  60 + backup_case = backup_per_query[query]
  61 + delta = (
  62 + float(current_case["metrics"]["Primary_Metric_Score"])
  63 + - float(backup_case["metrics"]["Primary_Metric_Score"])
  64 + )
  65 + if delta >= 0:
  66 + continue
  67 + current_ranks = _rank_map(current_case["top_results"])
  68 + for row in backup_case["top_results"][:scan_depth]:
  69 + if row["label"] not in {"Fully Relevant", "Mostly Relevant"}:
  70 + continue
  71 + current_rank = current_ranks.get(row["spu_id"], 999)
  72 + if current_rank <= int(row["rank"]) + rank_gap_threshold:
  73 + continue
  74 + grouped.setdefault(query, []).append(
  75 + {
  76 + "query": query,
  77 + "delta_primary": delta,
  78 + "spu_id": str(row["spu_id"]),
  79 + "backup_rank_eval": int(row["rank"]),
  80 + "backup_label": str(row["label"]),
  81 + "current_rank_eval": current_rank,
  82 + }
  83 + )
  84 + return grouped
  85 +
  86 +
def _build_searcher() -> Searcher:
    """Build a Searcher from the app's search config and an env-derived ES client."""
    search_config = get_app_config().search
    return Searcher(
        get_es_client_from_env(),
        search_config,
        QueryParser(search_config),
    )
  92 +
  93 +
def _run_query(searcher: Searcher, *, query: str, tenant_id: str, index_name: str) -> Tuple[Dict[str, Dict[str, Any]], int]:
    """Run one query against *index_name* and capture its coarse-rank score rows.

    Returns a tuple of (doc_id -> coarse score row with "_coarse_rank" added,
    total number of coarse rows captured).
    """
    # NOTE(review): mutates process-wide env and never restores it, so the index
    # override leaks into later calls for the same tenant. Acceptable for this
    # one-shot inspection script, but confirm before reusing elsewhere.
    os.environ[f"ES_INDEX_OVERRIDE_TENANT_{tenant_id}"] = index_name
    ctx = create_request_context(reqid="coarsecmp", uid="-1")
    # Reaches into a private attribute to route context logging to this script's logger.
    ctx._logger = logger
    # Rerank is disabled on purpose: we only want the coarse-ranking components.
    searcher.search(
        query=query,
        tenant_id=tenant_id,
        size=10,
        context=ctx,
        debug=True,
        enable_rerank=False,
        language="en",
    )
    rows = ctx.get_intermediate_result("coarse_rank_scores", []) or []
    by_doc: Dict[str, Dict[str, Any]] = {}
    # rank is the 1-based position within the coarse-ranked list.
    for rank, row in enumerate(rows, start=1):
        doc_id = row.get("doc_id")
        if doc_id is None:
            continue
        payload = dict(row)  # copy so the context's stored row is not mutated
        payload["_coarse_rank"] = rank
        by_doc[str(doc_id)] = payload
    return by_doc, len(rows)
  117 +
  118 +
  119 +def _safe_float(value: Any) -> float | None:
  120 + try:
  121 + if value is None:
  122 + return None
  123 + return float(value)
  124 + except (TypeError, ValueError):
  125 + return None
  126 +
  127 +
def _delta(current_value: Any, backup_value: Any) -> float | None:
    """current - backup as floats; None when either side is missing/non-numeric."""
    current = _safe_float(current_value)
    if current is None:
        return None
    backup = _safe_float(backup_value)
    if backup is None:
        return None
    return current - backup
  134 +
  135 +
  136 +def _counter_key(delta_value: float | None, *, eps: float = 1e-6) -> str:
  137 + if delta_value is None:
  138 + return "missing"
  139 + if abs(delta_value) <= eps:
  140 + return "same"
  141 + return "lower" if delta_value < 0 else "higher"
  142 +
  143 +
  144 +def _median_or_none(values: Sequence[float]) -> float | None:
  145 + if not values:
  146 + return None
  147 + return float(statistics.median(values))
  148 +
  149 +
def _summarize_rows(comparisons: Sequence[Dict[str, Any]]) -> None:
    """Print aggregate delta buckets and medians for the affected coarse rows."""
    in_both = [c for c in comparisons if c["current_row"] is not None and c["backup_row"] is not None]
    only_backup = [c for c in comparisons if c["current_row"] is None and c["backup_row"] is not None]
    only_current = [c for c in comparisons if c["current_row"] is not None and c["backup_row"] is None]

    # One bucket counter and one raw-delta list per coarse score component.
    score_fields = ("image_knn_score", "text_knn_score", "text_score", "es_score", "coarse_score")
    counters: Dict[str, Counter[str]] = {field: Counter() for field in score_fields}
    delta_lists: Dict[str, List[float]] = {field: [] for field in score_fields}

    for comparison in in_both:
        for field in score_fields:
            field_delta = _delta(comparison["current_row"].get(field), comparison["backup_row"].get(field))
            counters[field][_counter_key(field_delta)] += 1
            if field_delta is not None:
                delta_lists[field].append(field_delta)

    print("Coarse Component Summary")
    print("=" * 80)
    print(f"affected_docs: {len(comparisons)}")
    print(f"present_in_both_coarse_windows: {len(in_both)}")
    print(f"only_in_backup_coarse_window: {len(only_backup)}")
    print(f"only_in_current_coarse_window: {len(only_current)}")
    print()
    print(f"image_knn delta buckets: {dict(counters['image_knn_score'])}")
    print(f"text_knn delta buckets : {dict(counters['text_knn_score'])}")
    print(f"text_score delta buckets: {dict(counters['text_score'])}")
    print(f"es_score delta buckets : {dict(counters['es_score'])}")
    print(f"coarse_score buckets : {dict(counters['coarse_score'])}")
    print()
    print(
        "median deltas (current - backup): "
        f"image_knn={_median_or_none(delta_lists['image_knn_score'])} | "
        f"text_knn={_median_or_none(delta_lists['text_knn_score'])} | "
        f"text_score={_median_or_none(delta_lists['text_score'])} | "
        f"es_score={_median_or_none(delta_lists['es_score'])} | "
        f"coarse_score={_median_or_none(delta_lists['coarse_score'])}"
    )
    print()
  212 +
  213 +
def _print_query_examples(comparisons: Sequence[Dict[str, Any]], top_queries: int, docs_per_query: int) -> None:
    """Print the most-regressed queries with per-doc coarse score transitions."""
    by_query: Dict[str, List[Dict[str, Any]]] = {}
    for comparison in comparisons:
        by_query.setdefault(comparison["query"], []).append(comparison)

    # Worst queries first: order by the primary-metric delta recorded on their rows.
    queries_by_delta = sorted(
        by_query,
        key=lambda name: min(entry["delta_primary"] for entry in by_query[name]),
    )

    print(f"Detailed Examples (top {top_queries} queries)")
    print("=" * 80)
    for name in queries_by_delta[:top_queries]:
        entries = sorted(by_query[name], key=lambda entry: entry["backup_rank_eval"])
        print(f"\n## {name}")
        print(f"affected_docs={len(entries)} | delta_primary={entries[0]['delta_primary']:+.6f}")
        for entry in entries[:docs_per_query]:
            cur = entry["current_row"]
            bak = entry["backup_row"]
            print(
                f" - spu={entry['spu_id']} "
                f"eval_current={entry['current_rank_eval']} eval_backup={entry['backup_rank_eval']} "
                f"coarse_current={cur.get('_coarse_rank') if cur else None} "
                f"coarse_backup={bak.get('_coarse_rank') if bak else None}"
            )
            if cur and bak:
                print(
                    " image_knn "
                    f"{bak.get('image_knn_score')} -> {cur.get('image_knn_score')} | "
                    "text_knn "
                    f"{bak.get('text_knn_score')} -> {cur.get('text_knn_score')} | "
                    "text_score "
                    f"{bak.get('text_score')} -> {cur.get('text_score')} | "
                    "es_score "
                    f"{bak.get('es_score')} -> {cur.get('es_score')} | "
                    "coarse_score "
                    f"{bak.get('coarse_score')} -> {cur.get('coarse_score')}"
                )
            else:
                print(
                    f" present_current={cur is not None} "
                    f"present_backup={bak is not None}"
                )
  257 +
  258 +
def main() -> None:
    """CLI entry: find regressed docs between two reports and replay coarse scores."""
    parser = argparse.ArgumentParser(description="Analyze coarse-score component regressions")
    parser.add_argument("--current-report", required=True)
    parser.add_argument("--backup-report", required=True)
    parser.add_argument("--current-index", required=True)
    parser.add_argument("--backup-index", required=True)
    parser.add_argument("--tenant-id", default="163")
    parser.add_argument("--rank-gap-threshold", type=int, default=5)
    parser.add_argument("--scan-depth", type=int, default=20)
    parser.add_argument("--detail-queries", type=int, default=6)
    parser.add_argument("--detail-docs-per-query", type=int, default=3)
    args = parser.parse_args()

    logging.basicConfig(level=logging.WARNING)

    regressed = _collect_regressed_docs(
        current_report=_load_report(args.current_report),
        backup_report=_load_report(args.backup_report),
        rank_gap_threshold=args.rank_gap_threshold,
        scan_depth=args.scan_depth,
    )

    searcher = _build_searcher()
    comparisons: List[Dict[str, Any]] = []
    for query, regressed_rows in regressed.items():
        # Replay the same query against both indices to capture coarse components.
        current_docs, _ = _run_query(
            searcher,
            query=query,
            tenant_id=args.tenant_id,
            index_name=args.current_index,
        )
        backup_docs, _ = _run_query(
            searcher,
            query=query,
            tenant_id=args.tenant_id,
            index_name=args.backup_index,
        )
        for regressed_row in regressed_rows:
            comparisons.append(
                {
                    **regressed_row,
                    "current_row": current_docs.get(regressed_row["spu_id"]),
                    "backup_row": backup_docs.get(regressed_row["spu_id"]),
                }
            )

    _summarize_rows(comparisons)
    _print_query_examples(
        comparisons,
        top_queries=args.detail_queries,
        docs_per_query=args.detail_docs_per_query,
    )


if __name__ == "__main__":
    main()
... ...
scripts/inspect/analyze_eval_index_regression.py 0 → 100644
... ... @@ -0,0 +1,337 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Analyze search evaluation regressions between two batch reports and trace them back
  4 +to document field changes across two Elasticsearch indices.
  5 +
  6 +Typical usage:
  7 + ./.venv/bin/python scripts/inspect/analyze_eval_index_regression.py \
  8 + --current-report artifacts/search_evaluation/batch_reports/batch_20260417T073901Z_00b6a8aa3d.json \
  9 + --backup-report artifacts/search_evaluation/batch_reports/batch_20260417T074717Z_00b6a8aa3d.json \
  10 + --current-index search_products_tenant_163 \
  11 + --backup-index search_products_tenant_163_backup_20260415_1438
  12 +"""
  13 +
  14 +from __future__ import annotations
  15 +
  16 +import argparse
  17 +import json
  18 +import statistics
  19 +import sys
  20 +from collections import Counter
  21 +from pathlib import Path
  22 +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
  23 +
  24 +PROJECT_ROOT = Path(__file__).resolve().parents[2]
  25 +if str(PROJECT_ROOT) not in sys.path:
  26 + sys.path.insert(0, str(PROJECT_ROOT))
  27 +
  28 +from utils.es_client import get_es_client_from_env
  29 +
  30 +
# Fields requested in _source when fetching a doc for inspection; a superset of
# CORE_FIELDS_TO_COMPARE, presumably mirroring the searchable text fields —
# TODO confirm against the search field configuration.
SEARCHABLE_SOURCE_FIELDS: Sequence[str] = (
    "title",
    "keywords",
    "qanchors",
    "enriched_tags",
    "enriched_attributes",
    "option1_values",
    "option2_values",
    "option3_values",
    "tags",
    "category_path",
    "category_name_text",
)

# Fields compared value-for-value to decide whether a doc "changed" between the
# two indices (consumed by _changed_fields).
CORE_FIELDS_TO_COMPARE: Sequence[str] = (
    "title",
    "keywords",
    "qanchors",
    "enriched_tags",
    "enriched_attributes",
    "option1_values",
    "option2_values",
    "option3_values",
    "tags",
)

# Labels counted as "strong relevant" when scanning for regressed docs.
STRONG_LABELS = {"Fully Relevant", "Mostly Relevant"}
  58 +
  59 +
  60 +def _load_report(path: str) -> Dict[str, Any]:
  61 + return json.loads(Path(path).read_text())
  62 +
  63 +
  64 +def _rank_map(rows: Sequence[Dict[str, Any]]) -> Dict[str, int]:
  65 + return {str(row["spu_id"]): int(row["rank"]) for row in rows}
  66 +
  67 +
  68 +def _label_map(rows: Sequence[Dict[str, Any]]) -> Dict[str, str]:
  69 + return {str(row["spu_id"]): str(row["label"]) for row in rows}
  70 +
  71 +
  72 +def _count_items(value: Any) -> int:
  73 + if isinstance(value, list):
  74 + return len(value)
  75 + if isinstance(value, str):
  76 + return len([x for x in value.split(",") if x.strip()])
  77 + return 0
  78 +
  79 +
  80 +def _json_short(value: Any, max_len: int = 220) -> str:
  81 + payload = json.dumps(value, ensure_ascii=False, sort_keys=True)
  82 + if len(payload) <= max_len:
  83 + return payload
  84 + return payload[: max_len - 3] + "..."
  85 +
  86 +
class SourceFetcher:
    """Fetch (and memoize) product _source docs by (index, spu_id) via ES term query."""

    def __init__(self) -> None:
        self.es = get_es_client_from_env().client
        # Cache keyed by (index_name, spu_id); stores None for docs not found.
        self._cache: Dict[Tuple[str, str], Optional[Dict[str, Any]]] = {}

    def fetch(self, index_name: str, spu_id: str) -> Optional[Dict[str, Any]]:
        cache_key = (index_name, spu_id)
        if cache_key not in self._cache:
            response = self.es.search(
                index=index_name,
                body={
                    "size": 1,
                    "query": {"term": {"spu_id": spu_id}},
                    "_source": ["spu_id", *SEARCHABLE_SOURCE_FIELDS],
                },
            )
            hits = response["hits"]["hits"]
            self._cache[cache_key] = hits[0]["_source"] if hits else None
        return self._cache[cache_key]
  105 +
  106 +
def _changed_fields(current_doc: Dict[str, Any], backup_doc: Dict[str, Any]) -> List[str]:
    """List core fields whose values differ between the two source docs."""
    changed: List[str] = []
    for field in CORE_FIELDS_TO_COMPARE:
        if current_doc.get(field) != backup_doc.get(field):
            changed.append(field)
    return changed
  109 +
  110 +
def _iter_regressed_docs(
    current_report: Dict[str, Any],
    backup_report: Dict[str, Any],
    rank_gap_threshold: int,
    scan_depth: int,
) -> Iterable[Dict[str, Any]]:
    """Yield strong-relevant docs whose rank dropped by more than *rank_gap_threshold*.

    Only queries whose Primary_Metric_Score regressed are scanned, and only the
    first *scan_depth* backup results are considered. Fixes vs. the original:
    queries missing from the backup report are skipped instead of raising
    KeyError, and spu_id lookups are normalized to str to match the keys
    produced by _rank_map/_label_map.
    """
    current_per_query = {row["query"]: row for row in current_report["per_query"]}
    backup_per_query = {row["query"]: row for row in backup_report["per_query"]}
    for query, current_case in current_per_query.items():
        backup_case = backup_per_query.get(query)
        if backup_case is None:
            continue  # no baseline entry for this query
        delta = (
            float(current_case["metrics"]["Primary_Metric_Score"])
            - float(backup_case["metrics"]["Primary_Metric_Score"])
        )
        if delta >= 0:
            continue  # query did not regress overall
        current_ranks = _rank_map(current_case["top_results"])
        current_labels = _label_map(current_case["top_results"])
        for row in backup_case["top_results"][:scan_depth]:
            if row["label"] not in STRONG_LABELS:
                continue
            spu_key = str(row["spu_id"])  # rank/label maps are keyed by str
            current_rank = current_ranks.get(spu_key, 999)
            if current_rank <= int(row["rank"]) + rank_gap_threshold:
                continue  # did not fall far enough to count as regressed
            yield {
                "query": query,
                "delta_primary": delta,
                "spu_id": spu_key,
                "backup_rank": int(row["rank"]),
                "backup_label": str(row["label"]),
                "current_rank": current_rank,
                "current_label": current_labels.get(spu_key),
            }
  144 +
  145 +
def _print_metric_summary(current_report: Dict[str, Any], backup_report: Dict[str, Any], top_n: int) -> None:
    """Print overall better/worse counts and the *top_n* worst queries by primary metric.

    Robustness fix: queries missing from the backup report are skipped rather
    than raising KeyError, so reports with slightly different query sets can
    still be compared.
    """
    current_per_query = {row["query"]: row for row in current_report["per_query"]}
    backup_per_query = {row["query"]: row for row in backup_report["per_query"]}
    deltas: List[Tuple[str, float, Dict[str, Any], Dict[str, Any]]] = []
    for query, current_case in current_per_query.items():
        backup_case = backup_per_query.get(query)
        if backup_case is None:
            continue  # no baseline entry for this query
        deltas.append(
            (
                query,
                float(current_case["metrics"]["Primary_Metric_Score"])
                - float(backup_case["metrics"]["Primary_Metric_Score"]),
                current_case,
                backup_case,
            )
        )
    worse = sum(1 for _, delta, _, _ in deltas if delta < 0)
    better = sum(1 for _, delta, _, _ in deltas if delta > 0)
    print("Overall Query Delta")
    print("=" * 80)
    print(f"worse: {worse} | better: {better} | total: {len(deltas)}")
    print(
        "aggregate primary:"
        f" current={current_report['aggregate_metrics']['Primary_Metric_Score']:.6f}"
        f" backup={backup_report['aggregate_metrics']['Primary_Metric_Score']:.6f}"
        f" delta={current_report['aggregate_metrics']['Primary_Metric_Score'] - backup_report['aggregate_metrics']['Primary_Metric_Score']:+.6f}"
    )
    print()
    print(f"Worst {top_n} Queries By Primary_Metric_Score Delta")
    print("=" * 80)
    for query, delta, current_case, backup_case in sorted(deltas, key=lambda x: x[1])[:top_n]:
        print(
            f"{delta:+.4f}\t{query}\t"
            f"NDCG@20 {current_case['metrics']['NDCG@20'] - backup_case['metrics']['NDCG@20']:+.4f}\t"
            f"ERR@10 {current_case['metrics']['ERR@10'] - backup_case['metrics']['ERR@10']:+.4f}\t"
            f"SP@10 {current_case['metrics']['Strong_Precision@10'] - backup_case['metrics']['Strong_Precision@10']:+.2f}"
        )
    print()
  183 +
  184 +
def _print_field_change_summary(
    regressed_rows: Sequence[Dict[str, Any]],
    fetcher: SourceFetcher,
    current_index: str,
    backup_index: str,
) -> None:
    """Print which indexed fields changed on regressed docs, plus phrase/tag density stats."""
    changed_field_freq: Counter[str] = Counter()
    # (current_count, backup_count) pairs per language-specific density metric.
    density_pairs: Dict[str, List[Tuple[int, int]]] = {
        "qanchors.en": [],
        "qanchors.zh": [],
        "enriched_tags.en": [],
        "enriched_tags.zh": [],
    }

    for regressed in regressed_rows:
        current_doc = fetcher.fetch(current_index, regressed["spu_id"])
        backup_doc = fetcher.fetch(backup_index, regressed["spu_id"])
        if not current_doc or not backup_doc:
            continue
        changed_field_freq.update(_changed_fields(current_doc, backup_doc))

        for field in ("qanchors", "enriched_tags"):
            current_obj = current_doc.get(field) or {}
            backup_obj = backup_doc.get(field) or {}
            for lang in ("en", "zh"):
                density_pairs[f"{field}.{lang}"].append(
                    (_count_items(current_obj.get(lang)), _count_items(backup_obj.get(lang)))
                )

    print("Affected Strong-Relevant Docs")
    print("=" * 80)
    print(f"count: {len(regressed_rows)}")
    print("changed field frequency:")
    for field, count in changed_field_freq.most_common():
        print(f" {field}: {count}")
    print()

    def summarize_counts(name: str, pairs: Sequence[Tuple[int, int]]) -> None:
        # Mean densities on each side plus how often each side has more items.
        if not pairs:
            return
        print(
            f"{name}: current_avg={statistics.mean(cur for cur, _ in pairs):.3f} "
            f"backup_avg={statistics.mean(bak for _, bak in pairs):.3f} "
            f"delta={statistics.mean(cur - bak for cur, bak in pairs):+.3f} "
            f"backup_more={sum(1 for cur, bak in pairs if bak > cur)} "
            f"current_more={sum(1 for cur, bak in pairs if cur > bak)}"
        )

    print("phrase/tag density on affected docs:")
    summarize_counts("qanchors.en", density_pairs["qanchors.en"])
    summarize_counts("qanchors.zh", density_pairs["qanchors.zh"])
    summarize_counts("enriched_tags.en", density_pairs["enriched_tags.en"])
    summarize_counts("enriched_tags.zh", density_pairs["enriched_tags.zh"])
    print()
  241 +
  242 +
def _print_query_details(
    current_report: Dict[str, Any],
    backup_report: Dict[str, Any],
    regressed_rows: Sequence[Dict[str, Any]],
    fetcher: SourceFetcher,
    current_index: str,
    backup_index: str,
    top_queries: int,
    max_docs_per_query: int,
) -> None:
    """Print per-doc field diffs for the most-regressed queries."""
    current_per_query = {row["query"]: row for row in current_report["per_query"]}
    backup_per_query = {row["query"]: row for row in backup_report["per_query"]}

    rows_by_query: Dict[str, List[Dict[str, Any]]] = {}
    for regressed in regressed_rows:
        rows_by_query.setdefault(regressed["query"], []).append(regressed)

    def primary_delta(name: str) -> float:
        return (
            current_per_query[name]["metrics"]["Primary_Metric_Score"]
            - backup_per_query[name]["metrics"]["Primary_Metric_Score"]
        )

    ordered_queries = sorted(rows_by_query, key=primary_delta)

    print(f"Detailed Query Samples (top {top_queries})")
    print("=" * 80)
    for query in ordered_queries[:top_queries]:
        current_case = current_per_query[query]
        backup_case = backup_per_query[query]
        print(f"\n## {query}")
        print(
            f"delta_primary={primary_delta(query):+.6f} | current_top10={current_case['top_label_sequence_top10']} | "
            f"backup_top10={backup_case['top_label_sequence_top10']}"
        )
        for regressed in sorted(rows_by_query[query], key=lambda item: item["backup_rank"])[:max_docs_per_query]:
            current_doc = fetcher.fetch(current_index, regressed["spu_id"])
            backup_doc = fetcher.fetch(backup_index, regressed["spu_id"])
            if not current_doc or not backup_doc:
                print(
                    f" - spu={regressed['spu_id']} backup_rank={regressed['backup_rank']} current_rank={regressed['current_rank']} "
                    "(missing source)"
                )
                continue
            changed = _changed_fields(current_doc, backup_doc)
            print(
                f" - spu={regressed['spu_id']} backup_rank={regressed['backup_rank']} ({regressed['backup_label']}) "
                f"-> current_rank={regressed['current_rank']} ({regressed['current_label']})"
            )
            print(f" changed_fields: {', '.join(changed) if changed else '(none)'}")
            for field in changed[:4]:
                print(f" {field}.current: {_json_short(current_doc.get(field))}")
                print(f" {field}.backup : {_json_short(backup_doc.get(field))}")
  290 +
  291 +
def main() -> None:
    """CLI entry: diff two batch reports and attribute regressions to index field changes."""
    parser = argparse.ArgumentParser(description="Analyze eval regressions between two indices")
    parser.add_argument("--current-report", required=True, help="Report JSON for the worse/current index")
    parser.add_argument("--backup-report", required=True, help="Report JSON for the better/reference index")
    parser.add_argument("--current-index", required=True, help="Current/worse index name")
    parser.add_argument("--backup-index", required=True, help="Reference/better index name")
    parser.add_argument("--rank-gap-threshold", type=int, default=5, help="Treat a strong-relevant doc as regressed when current rank > backup rank + this gap")
    parser.add_argument("--scan-depth", type=int, default=20, help="Only inspect backup strong-relevant docs within this depth")
    parser.add_argument("--top-worst-queries", type=int, default=12, help="How many worst queries to print in the metric summary")
    parser.add_argument("--detail-queries", type=int, default=6, help="How many regressed queries to print detailed field diffs for")
    parser.add_argument("--detail-docs-per-query", type=int, default=3, help="How many regressed docs to print per detailed query")
    args = parser.parse_args()

    current_report = _load_report(args.current_report)
    backup_report = _load_report(args.backup_report)
    regressed_rows = list(
        _iter_regressed_docs(
            current_report=current_report,
            backup_report=backup_report,
            rank_gap_threshold=args.rank_gap_threshold,
            scan_depth=args.scan_depth,
        )
    )

    # Single fetcher so both summaries share the (index, spu) doc cache.
    fetcher = SourceFetcher()
    _print_metric_summary(current_report, backup_report, top_n=args.top_worst_queries)
    _print_field_change_summary(
        regressed_rows=regressed_rows,
        fetcher=fetcher,
        current_index=args.current_index,
        backup_index=args.backup_index,
    )
    _print_query_details(
        current_report=current_report,
        backup_report=backup_report,
        regressed_rows=regressed_rows,
        fetcher=fetcher,
        current_index=args.current_index,
        backup_index=args.backup_index,
        top_queries=args.detail_queries,
        max_docs_per_query=args.detail_docs_per_query,
    )


if __name__ == "__main__":
    main()
... ...
scripts/inspect/analyze_eval_regressions.py 0 → 100644
... ... @@ -0,0 +1,303 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Analyze per-query regressions between two batch evaluation JSON reports and
  4 +attribute likely causes by inspecting ES documents from two indices.
  5 +
  6 +Outputs:
  7 +- Top regressions by Primary_Metric_Score delta
  8 +- For each regressed query:
  9 + - metric deltas
  10 + - top-10 SPU overlap and swapped-in SPUs
  11 + - for swapped-in SPUs, show which search fields contain the query term
  12 +
  13 +This is a heuristic attribution tool (string containment), but it's fast and
  14 +usually enough to pinpoint regressions caused by missing/noisy fields such as
  15 +qanchors/keywords/title in different languages.
  16 +
  17 +Usage:
  18 + set -a; source .env; set +a
  19 + ./.venv/bin/python scripts/inspect/analyze_eval_regressions.py \
  20 + --old-report artifacts/search_evaluation/batch_reports/batch_...073901....json \
  21 + --new-report artifacts/search_evaluation/batch_reports/batch_...074717....json \
  22 + --old-index search_products_tenant_163 \
  23 + --new-index search_products_tenant_163_backup_20260415_1438 \
  24 + --top-n 10
  25 +"""
  26 +
  27 +from __future__ import annotations
  28 +
  29 +import argparse
  30 +import json
  31 +import os
  32 +import re
  33 +from dataclasses import dataclass
  34 +from pathlib import Path
  35 +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
  36 +
  37 +from elasticsearch import Elasticsearch
  38 +
  39 +
def load_json(path: str) -> Dict[str, Any]:
    """Read *path* and decode its JSON payload."""
    content = Path(path).read_text()
    return json.loads(content)
  42 +
  43 +
def norm_str(x: Any) -> str:
    """Best-effort string conversion; None becomes the empty string."""
    if x is None:
        return ""
    return x if isinstance(x, str) else str(x)
  50 +
  51 +
def is_cjk(s: str) -> bool:
    """True when *s* contains at least one CJK unified ideograph (U+4E00-U+9FFF)."""
    return re.search(r"[\u4e00-\u9fff]", s) is not None
  54 +
  55 +
def flatten_text_values(v: Any) -> List[str]:
    """Extract strings from nested objects/lists (best-effort).

    Scalars are stringified; dict values are walked fully while lists are
    capped at their first 20 items to bound the fan-out.
    """
    if v is None:
        return []
    if isinstance(v, str):
        return [v]
    if isinstance(v, (int, float, bool)):
        return [str(v)]
    if isinstance(v, dict):
        collected: List[str] = []
        for child in v.values():
            collected.extend(flatten_text_values(child))
        return collected
    if isinstance(v, list):
        collected = []
        for child in v[:20]:
            collected.extend(flatten_text_values(child))
        return collected
    return [str(v)]
  74 +
  75 +
def get_lang_obj(src: Dict[str, Any], field: str, lang: str) -> Any:
    """Return src[field][lang] when the field is a language-keyed dict, else None."""
    container = src.get(field)
    return container.get(lang) if isinstance(container, dict) else None
  81 +
  82 +
def contains_query(val: Any, query: str) -> bool:
    """Substring test for *query* inside any text of *val*.

    CJK queries are matched verbatim; everything else case-insensitively.
    """
    needle = query.strip()
    if not needle:
        return False
    haystacks = flatten_text_values(val)
    if is_cjk(needle):
        return any(needle in text for text in haystacks)
    lowered = needle.lower()
    return any(lowered in (text or "").lower() for text in haystacks)
  93 +
  94 +
@dataclass
class PerQuery:
    """One query's slice of a batch evaluation report."""

    query: str  # the query string (also the key in per_query_map's output)
    metrics: Dict[str, float]  # numeric metrics only; non-numeric values are filtered by per_query_map
    top_results: List[Dict[str, Any]]  # raw top_results rows from the report JSON
    request_id: Optional[str]  # request id recorded for the run, if present
  101 +
  102 +
def per_query_map(report: Dict[str, Any]) -> Dict[str, PerQuery]:
    """Index a report's per_query records by query, keeping numeric metrics only."""
    indexed: Dict[str, PerQuery] = {}
    for record in report.get("per_query") or []:
        query = record.get("query")
        if not query:
            continue  # skip malformed records with no query key
        numeric_metrics = {
            name: float(value)
            for name, value in (record.get("metrics") or {}).items()
            if isinstance(value, (int, float))
        }
        indexed[query] = PerQuery(
            query=query,
            metrics=numeric_metrics,
            top_results=list(record.get("top_results") or []),
            request_id=record.get("request_id"),
        )
    return indexed
  117 +
  118 +
def top_spus(pq: PerQuery, n: int = 10) -> List[str]:
    """First *n* spu_ids (stringified) from the query's ranked results."""
    return [
        str(spu)
        for result in pq.top_results[:n]
        if (spu := result.get("spu_id")) is not None
    ]
  126 +
  127 +
def build_es() -> Elasticsearch:
    """Build an ES client from env: ES / ES_HOST url, optional ES_AUTH 'user:pass'."""
    url = os.environ.get("ES") or os.environ.get("ES_HOST") or "http://127.0.0.1:9200"
    credentials = os.environ.get("ES_AUTH")
    if credentials and ":" in credentials:
        username, password = credentials.split(":", 1)
        return Elasticsearch(hosts=[url], basic_auth=(username, password))
    return Elasticsearch(hosts=[url])
  135 +
  136 +
def mget_sources(es: Elasticsearch, index: str, ids: Sequence[str]) -> Dict[str, Dict[str, Any]]:
    """Bulk-fetch _source docs by id; unfound or malformed entries are dropped."""
    response = es.mget(index=index, body={"ids": list(ids)})
    sources: Dict[str, Dict[str, Any]] = {}
    for doc in response.get("docs") or []:
        if doc.get("found") and doc.get("_id") and isinstance(doc.get("_source"), dict):
            sources[str(doc["_id"])] = doc["_source"]
    return sources
  144 +
  145 +
def non_empty(v: Any) -> bool:
    """True when *v* carries actual content; dict values are checked recursively."""
    if v is None:
        return False
    if isinstance(v, str):
        return v.strip() != ""
    if isinstance(v, (list, tuple, set)):
        return bool(v)
    if isinstance(v, dict):
        return any(non_empty(child) for child in v.values())
    # Any other object (numbers included, even 0) counts as present.
    return True
  156 +
  157 +
def summarize_field(src: Dict[str, Any], field: str, lang: Optional[str]) -> Dict[str, Any]:
    """Summarize presence and a small sample for a field (optionally language-specific)."""
    value = src.get(field)
    if lang and isinstance(value, dict):
        value = value.get(lang)

    sample: Any = None
    if isinstance(value, str):
        sample = value[:80]
    elif isinstance(value, list):
        sample = value[:3]
    elif isinstance(value, dict):
        sample = {key: value.get(key) for key in list(value.keys())[:3]}

    return {"present": non_empty(value), "sample": sample}
  172 +
  173 +
def main() -> int:
    """CLI entry: rank per-query regressions between two batch reports and print
    heuristic attribution by inspecting swapped-in docs across the two indices.

    Returns a process exit code (always 0 on normal completion).
    """
    ap = argparse.ArgumentParser(description="Analyze regressions between two eval batch reports.")
    ap.add_argument("--old-report", required=True, help="Older/worse/baseline batch JSON path")
    ap.add_argument("--new-report", required=True, help="Newer candidate batch JSON path")
    ap.add_argument("--old-index", required=True, help="ES index used by old report")
    ap.add_argument("--new-index", required=True, help="ES index used by new report")
    ap.add_argument("--top-n", type=int, default=10, help="How many worst regressions to analyze (default 10)")
    ap.add_argument("--metric", default="Primary_Metric_Score", help="Metric to rank regressions by")
    ap.add_argument("--topk", type=int, default=10, help="Top-K results to compare per query (default 10)")
    args = ap.parse_args()

    old = load_json(args.old_report)
    new = load_json(args.new_report)
    old_map = per_query_map(old)
    new_map = per_query_map(new)

    metric = args.metric
    # Prefer the new report's query list; fall back to the old one.
    queries = list(new.get("queries") or old.get("queries") or [])

    # (query, new - old metric delta); queries missing on either side are skipped.
    deltas: List[Tuple[str, float]] = []
    for q in queries:
        o = old_map.get(q)
        n = new_map.get(q)
        if not o or not n:
            continue
        d = float(n.metrics.get(metric, 0.0)) - float(o.metrics.get(metric, 0.0))
        deltas.append((q, d))

    # Most negative delta (worst regression) first.
    deltas.sort(key=lambda x: x[1])
    worst = deltas[: args.top_n]

    print("=" * 100)
    print(f"Top {len(worst)} regressions by {metric} (new - old)")
    print("=" * 100)
    for q, d in worst:
        o = old_map[q]
        n = new_map[q]
        print(f"- {q}: {d:+.4f} old={o.metrics.get(metric, 0.0):.4f} -> new={n.metrics.get(metric, 0.0):.4f}")

    es = build_es()

    # Fields that matter according to config.yaml
    # (keep it aligned with multilingual_fields + best_fields/phrase_fields)
    inspect_fields = [
        "title",
        "keywords",
        "qanchors",
        "category_name_text",
        "vendor",
        "tags",
        "option1_values",
        "option2_values",
        "option3_values",
    ]

    print("\n" + "=" * 100)
    print("Heuristic attribution for worst regressions")
    print("=" * 100)

    for q, d in worst:
        o = old_map[q]
        n = new_map[q]
        old_spus = top_spus(o, args.topk)
        new_spus = top_spus(n, args.topk)
        old_set, new_set = set(old_spus), set(new_spus)
        # Docs that entered/left the top-K between the two runs.
        swapped_in = [s for s in new_spus if s not in old_set]
        swapped_out = [s for s in old_spus if s not in new_set]

        print("\n" + "-" * 100)
        print(f"Query: {q}")
        print(f"Delta {metric}: {d:+.4f}")
        # show a few key metrics
        for m in ["NDCG@20", "Strong_Precision@10", "Gain_Recall@20", "ERR@10"]:
            if m in o.metrics and m in n.metrics:
                print(f" {m}: {n.metrics[m]-o.metrics[m]:+.4f} (old {o.metrics[m]:.4f} -> new {n.metrics[m]:.4f})")
        print(f" old request_id={o.request_id} new request_id={n.request_id}")
        print(f" top{args.topk} overlap: {len(old_set & new_set)}/{args.topk}")
        print(f" swapped_in (new only): {swapped_in[:10]}")
        print(f" swapped_out (old only): {swapped_out[:10]}")

        # Fetch swapped_in docs from both indices to spot index-field differences.
        if not swapped_in:
            continue
        docs_new = mget_sources(es, args.new_index, swapped_in)
        docs_old = mget_sources(es, args.old_index, swapped_in)

        # Heuristic language guess from the query text (CJK -> zh, else en).
        lang = "zh" if is_cjk(q) else "en"
        print(f" language_guess: {lang}")
        for spu in swapped_in[:8]:
            src_new = docs_new.get(spu) or {}
            src_old = docs_old.get(spu) or {}

            title = get_lang_obj(src_new, "title", lang) or get_lang_obj(src_new, "title", "en") or ""
            print(f" - spu={spu} title≈{norm_str(title)[:60]!r}")

            presence_new = {f: summarize_field(src_new, f, lang) for f in inspect_fields}
            presence_old = {f: summarize_field(src_old, f, lang) for f in inspect_fields}

            # Fields populated in only one of the two indices — prime suspects.
            new_only = [f for f in inspect_fields if presence_new[f]["present"] and not presence_old[f]["present"]]
            old_only = [f for f in inspect_fields if presence_old[f]["present"] and not presence_new[f]["present"]]
            if new_only or old_only:
                print(f" field_presence_diff: new_only={new_only} old_only={old_only}")

            # still report exact-substring match where it exists (often useful for English)
            hits = []
            for f in inspect_fields:
                v = get_lang_obj(src_new, f, lang)
                if v is None:
                    v = src_new.get(f)
                if contains_query(v, q):
                    hits.append(f)
            if hits:
                print(f" exact_substring_matched_fields: {hits}")

            # compact samples for the most likely culprits
            for f in ["qanchors", "keywords", "title"]:
                pn = presence_new.get(f)
                po = presence_old.get(f)
                if pn and po and (pn["present"] or po["present"]):
                    print(
                        f" {f}: new.present={pn['present']} old.present={po['present']} "
                        f"new.sample={pn['sample']} old.sample={po['sample']}"
                    )

    return 0


if __name__ == "__main__":
    raise SystemExit(main())
  303 +
... ...
scripts/inspect/compare_indices.py 0 → 100644
... ... @@ -0,0 +1,376 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Compare two Elasticsearch indices:
  4 +- mapping structure (field paths + types)
  5 +- field coverage stats (exists; nested-safe)
  6 +- random sample documents (same _id) and diff _source field paths
  7 +
  8 +Usage:
  9 + python scripts/inspect/compare_indices.py INDEX_A INDEX_B --sample-size 25
  10 + python scripts/inspect/compare_indices.py INDEX_A INDEX_B --fields title.zh,vendor.zh,keywords.zh,tags.zh --fields-nested image_embedding.url,enriched_attributes.name
  11 +"""
  12 +
  13 +from __future__ import annotations
  14 +
  15 +import argparse
  16 +import json
  17 +import sys
  18 +from dataclasses import dataclass
  19 +from pathlib import Path
  20 +from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
  21 +
  22 +sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
  23 +
  24 +from utils.es_client import ESClient, get_es_client_from_env
  25 +
  26 +
  27 +def _walk_mapping_properties(props: Dict[str, Any], prefix: str = "") -> Dict[str, str]:
  28 + """Flatten mapping properties into {field_path: type} including multi-fields."""
  29 + out: Dict[str, str] = {}
  30 + for name, node in (props or {}).items():
  31 + path = f"{prefix}.{name}" if prefix else name
  32 + if not isinstance(node, dict):
  33 + out[path] = "unknown"
  34 + continue
  35 + out[path] = node.get("type") or "object"
  36 + if isinstance(node.get("properties"), dict):
  37 + out.update(_walk_mapping_properties(node["properties"], path))
  38 + if isinstance(node.get("fields"), dict):
  39 + for sub, subnode in node["fields"].items():
  40 + if isinstance(subnode, dict):
  41 + out[f"{path}.{sub}"] = subnode.get("type") or "object"
  42 + else:
  43 + out[f"{path}.{sub}"] = "unknown"
  44 + return out
  45 +
  46 +
  47 +def _get_top_level_field_type(mapping: Dict[str, Any], top_field: str) -> Optional[str]:
  48 + props = mapping.get("mappings", {}).get("properties", {}) or {}
  49 + node = props.get(top_field)
  50 + if not isinstance(node, dict):
  51 + return None
  52 + return node.get("type") or "object"
  53 +
  54 +
  55 +def _field_paths_from_source(obj: Any, prefix: str = "", list_depth: int = 3) -> Set[str]:
  56 + """Return dotted field paths found in _source. For lists, uses '[]' marker."""
  57 + out: Set[str] = set()
  58 + if isinstance(obj, dict):
  59 + for k, v in obj.items():
  60 + p = f"{prefix}.{k}" if prefix else k
  61 + out.add(p)
  62 + out |= _field_paths_from_source(v, p, list_depth=list_depth)
  63 + elif isinstance(obj, list):
  64 + # Do not explode: just traverse first N elements
  65 + for v in obj[:list_depth]:
  66 + p = f"{prefix}[]" if prefix else "[]"
  67 + out |= _field_paths_from_source(v, p, list_depth=list_depth)
  68 + return out
  69 +
  70 +
  71 +def _chunks(seq: List[str], size: int) -> Iterable[List[str]]:
  72 + for i in range(0, len(seq), size):
  73 + yield seq[i : i + size]
  74 +
  75 +
@dataclass(frozen=True)
class CoverageField:
    """A field whose document coverage is measured via an ES ``exists`` query."""

    # Dotted field path, e.g. "image_embedding.url".
    field: str
    # When set, wrap the exists query in a nested query rooted at this path
    # (e.g. "image_embedding"); None means a plain top-level exists query.
    nested_path: Optional[str] = None
  81 +
  82 +
def _infer_coverage_fields(
    mapping: Dict[str, Any],
    raw_fields: List[str],
    raw_nested_fields: List[str],
) -> List[CoverageField]:
    """
    Build the list of fields to compute coverage for.

    Fields in *raw_nested_fields* are always treated as nested, with the
    nested path inferred as the first dotted segment. For *raw_fields*,
    nested-ness is auto-detected from the top-level mapping field type.

    The result is deterministic: nested fields keep the order given in
    *raw_nested_fields* (previously they were emitted in set-iteration
    order, which varies across runs due to string hash randomization),
    followed by *raw_fields* in user order, de-duplicated.
    """
    out: List[CoverageField] = []

    # Preserve the caller's order for nested fields; the set is only for
    # O(1) membership tests below.
    nested_set: Set[str] = set()
    for f in (x.strip() for x in raw_nested_fields):
        if not f or f in nested_set:
            continue
        nested_set.add(f)
        # Nested path is the first dotted segment, e.g. "image_embedding.url" -> "image_embedding".
        out.append(CoverageField(field=f, nested_path=f.split(".", 1)[0]))

    for f in [x.strip() for x in raw_fields if x.strip()]:
        if f in nested_set:
            continue
        top = f.split(".", 1)[0]
        # Auto-detect nested fields so plain exists queries don't silently undercount.
        top_type = _get_top_level_field_type(mapping, top)
        if top_type == "nested":
            out.append(CoverageField(field=f, nested_path=top))
        else:
            out.append(CoverageField(field=f, nested_path=None))

    # De-duplicate while preserving first-occurrence order
    # (a field may be repeated within raw_fields).
    seen: Set[Tuple[str, Optional[str]]] = set()
    dedup: List[CoverageField] = []
    for cf in out:
        key = (cf.field, cf.nested_path)
        if key in seen:
            continue
        seen.add(key)
        dedup.append(cf)
    return dedup
  120 +
  121 +
  122 +def _count_exists(es, index: str, cf: CoverageField) -> int:
  123 + """
  124 + Count docs where field exists.
  125 + - If nested_path is set, uses nested query (safe for nested fields).
  126 + - If nested query fails because path isn't actually nested in that index,
  127 + fall back to a non-nested exists query to avoid crashing the whole report.
  128 + """
  129 + if cf.nested_path:
  130 + nested_body = {
  131 + "query": {
  132 + "nested": {
  133 + "path": cf.nested_path,
  134 + "query": {"exists": {"field": cf.field}},
  135 + }
  136 + }
  137 + }
  138 + try:
  139 + return int(es.count(index, body=nested_body))
  140 + except Exception as e:
  141 + # Most common: "[nested] failed to find nested object under path [...]"
  142 + print(f"[warn] nested exists failed for {index} field={cf.field} path={cf.nested_path}: {type(e).__name__}")
  143 + # fall through to exists
  144 + body = {"query": {"exists": {"field": cf.field}}}
  145 + return int(es.count(index, body=body))
  146 +
  147 +
  148 +def _print_json(obj: Any) -> None:
  149 + print(json.dumps(obj, ensure_ascii=False, indent=2, sort_keys=False))
  150 +
  151 +
def compare_mapping(index_a: str, index_b: str, mapping_a: Dict[str, Any], mapping_b: Dict[str, Any]) -> None:
    """Print a diff of the two indices' flattened mapping field paths and types."""
    flat_a = _walk_mapping_properties(mapping_a.get("mappings", {}).get("properties", {}) or {})
    flat_b = _walk_mapping_properties(mapping_b.get("mappings", {}).get("properties", {}) or {})

    keys_a, keys_b = set(flat_a), set(flat_b)
    only_a = sorted(keys_a - keys_b)
    only_b = sorted(keys_b - keys_a)
    type_diff = sorted(k for k in keys_a & keys_b if flat_a[k] != flat_b[k])

    print("\n" + "=" * 90)
    print("Mapping diff (flattened field paths + types)")
    print("=" * 90)
    print(f"index_a: {index_a}")
    print(f"index_b: {index_b}")
    print(f"only_in_a: {len(only_a)}")
    print(f"only_in_b: {len(only_b)}")
    print(f"type_diff: {len(type_diff)}")

    def _report(title: str, entries: List[str], render) -> None:
        # Cap each section at 50 entries to keep the report readable.
        if not entries:
            return
        print(f"\n{title}")
        for f in entries[:50]:
            print(render(f))
        if len(entries) > 50:
            print(f" ... and {len(entries) - 50} more")

    _report("Fields only in index_a (first 50):", only_a, lambda f: f" - {f} ({flat_a.get(f)})")
    _report("Fields only in index_b (first 50):", only_b, lambda f: f" - {f} ({flat_b.get(f)})")
    _report(
        "Fields with different types (first 50):",
        type_diff,
        lambda f: f" - {f}: a={flat_a.get(f)} b={flat_b.get(f)}",
    )
  189 +
  190 +
def compare_coverage(
    es,
    index_a: str,
    index_b: str,
    mapping_a: Dict[str, Any],
    mapping_b: Dict[str, Any],
    fields: List[str],
    nested_fields: List[str],
) -> None:
    """Print per-field document coverage counts for both indices."""
    inferred_a = _infer_coverage_fields(mapping_a, fields, nested_fields)
    inferred_b = _infer_coverage_fields(mapping_b, fields, nested_fields)

    # Both indices should infer the same field list; warn and fall back to
    # index_a's inference if they disagree.
    if [c.field for c in inferred_a] != [c.field for c in inferred_b]:
        print("\n[warn] coverage field list differs between indices; using index_a inference as baseline")
    cov_fields = inferred_a

    print("\n" + "=" * 90)
    print("Field coverage stats (count of docs where field exists)")
    print("=" * 90)
    print(f"index_a: {index_a}")
    print(f"index_b: {index_b}")

    for cf in cov_fields:
        mode = f"nested(path={cf.nested_path})" if cf.nested_path else "exists"
        count_a = _count_exists(es, index_a, cf)
        count_b = _count_exists(es, index_b, cf)
        print(f"\n- {cf.field} [{mode}]")
        print(f" {index_a}: {count_a}")
        print(f" {index_b}: {count_b}")
  221 +
  222 +
def compare_random_samples(
    es,
    index_a: str,
    index_b: str,
    sample_size: int,
    random_seed: Optional[int],
) -> None:
    """Sample random doc ids from *index_a* and diff ``_source`` field paths across both indices."""
    print("\n" + "=" * 90)
    print("Random sample diff (same _id; diff _source field paths)")
    print("=" * 90)
    print(f"sample_size: {sample_size}")

    # random_score accepts an optional seed for reproducible sampling.
    random_score: Dict[str, Any] = {"seed": random_seed} if random_seed is not None else {}

    sample_body = {
        "size": sample_size,
        "_source": False,
        "query": {"function_score": {"query": {"match_all": {}}, "random_score": random_score}},
    }

    # Go through the raw client so the wrapper doesn't receive a duplicate
    # `size` parameter.
    resp = es.client.search(index=index_a, body=sample_body)
    hits = (((resp or {}).get("hits") or {}).get("hits") or [])
    ids = [h.get("_id") for h in hits if h.get("_id") is not None]

    if not ids:
        print("No hits returned; cannot sample.")
        return

    def fetch_sources(index: str, wanted: List[str]) -> Dict[str, Dict[str, Any]]:
        # mget in chunks of 500 to stay well under request-size limits.
        found: Dict[str, Dict[str, Any]] = {}
        for batch in _chunks(wanted, 500):
            for doc in es.client.mget(index=index, body={"ids": batch}).get("docs") or []:
                if doc.get("found") and doc.get("_id") and isinstance(doc.get("_source"), dict):
                    found[doc["_id"]] = doc["_source"]
        return found

    a_docs = fetch_sources(index_a, ids)
    b_docs = fetch_sources(index_b, ids)

    missing_in_b = [i for i in ids if i in a_docs and i not in b_docs]
    missing_in_a = [i for i in ids if i in b_docs and i not in a_docs]

    only_in_a: Set[str] = set()
    only_in_b: Set[str] = set()
    matched = 0
    for doc_id in ids:
        if doc_id not in a_docs or doc_id not in b_docs:
            continue
        matched += 1
        paths_a = _field_paths_from_source(a_docs[doc_id])
        paths_b = _field_paths_from_source(b_docs[doc_id])
        only_in_a |= paths_a - paths_b
        only_in_b |= paths_b - paths_a

    _print_json(
        {
            "sample_size": len(ids),
            "matched": matched,
            "missing_in_index_b_count": len(missing_in_b),
            "missing_in_index_a_count": len(missing_in_a),
            "missing_in_index_b_example": missing_in_b[:5],
            "missing_in_index_a_example": missing_in_a[:5],
            "fields_only_in_index_a_count": len(only_in_a),
            "fields_only_in_index_b_count": len(only_in_b),
            "fields_only_in_index_a_first80": sorted(list(only_in_a))[:80],
            "fields_only_in_index_b_first80": sorted(list(only_in_b))[:80],
        }
    )
  296 +
  297 +
def main() -> int:
    """CLI entry point: resolve ES connection, then run mapping, coverage and sample comparisons.

    Returns:
        0 on success, 2 on connection/index/mapping lookup failures
        (propagated as the process exit code by the ``__main__`` guard).
    """
    parser = argparse.ArgumentParser(description="Compare two ES indices (mapping + data coverage + random sample).")
    parser.add_argument("index_a", help="Index A name")
    parser.add_argument("index_b", help="Index B name")
    parser.add_argument("--sample-size", type=int, default=25, help="Random sample size (default: 25)")
    parser.add_argument("--seed", type=int, default=None, help="Random seed for random_score (optional)")
    parser.add_argument(
        "--es-url",
        default=None,
        help="Elasticsearch URL. If omitted, uses env ES (preferred) or config/config.yaml.",
    )
    parser.add_argument(
        "--es-auth",
        default=None,
        help="Basic auth in 'user:pass' form. If omitted, uses env ES_AUTH or config credentials.",
    )
    parser.add_argument(
        "--fields",
        default="title.zh,vendor.zh,keywords.zh,tags.zh,keywords.en,tags.en,enriched_taxonomy_attributes,image_embedding.url,enriched_attributes.name",
        help="Comma-separated fields to compute coverage for (default: a sensible set)",
    )
    parser.add_argument(
        "--fields-nested",
        default="image_embedding.url,enriched_attributes.name",
        help="Comma-separated fields that must be treated as nested exists (default: image_embedding.url,enriched_attributes.name)",
    )
    args = parser.parse_args()

    # Prefer ops-style env vars (ES/ES_AUTH) to match the documented query workflow.
    # Fall back to config/config.yaml for repo-local tooling.
    # (Was `__import__("os").environ`; a plain top-level `import os` is the idiomatic form.)
    env = os.environ
    es_url = args.es_url or (env.get("ES") or env.get("ES_HOST") or None)
    es_auth = args.es_auth or env.get("ES_AUTH")
    # Doc convention: if ES is unset but auth is given, default to localhost:9200.
    if not es_url and es_auth:
        es_url = "http://127.0.0.1:9200"

    if es_url:
        username = password = None
        if es_auth and ":" in es_auth:
            username, password = es_auth.split(":", 1)
        es = ESClient(hosts=[es_url], username=username, password=password)
    else:
        es = get_es_client_from_env()

    if not es.ping():
        print("✗ Cannot connect to Elasticsearch")
        return 2

    if not es.index_exists(args.index_a):
        print(f"✗ index not found: {args.index_a}")
        return 2
    if not es.index_exists(args.index_b):
        print(f"✗ index not found: {args.index_b}")
        return 2

    mapping_all_a = es.get_mapping(args.index_a) or {}
    mapping_all_b = es.get_mapping(args.index_b) or {}
    if args.index_a not in mapping_all_a or args.index_b not in mapping_all_b:
        print("✗ Failed to fetch mappings for both indices")
        return 2

    mapping_a = mapping_all_a[args.index_a]
    mapping_b = mapping_all_b[args.index_b]

    compare_mapping(args.index_a, args.index_b, mapping_a, mapping_b)

    fields = [x for x in (args.fields or "").split(",") if x.strip()]
    nested_fields = [x for x in (args.fields_nested or "").split(",") if x.strip()]
    compare_coverage(es, args.index_a, args.index_b, mapping_a, mapping_b, fields, nested_fields)

    compare_random_samples(es, args.index_a, args.index_b, args.sample_size, args.seed)

    return 0
  372 +
  373 +
# Script entry point: propagate main()'s integer return code as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
  376 +
... ...
scripts/start_eval_web.sh
... ... @@ -9,6 +9,7 @@ source ./activate.sh
9 9 EVAL_WEB_PORT="${EVAL_WEB_PORT:-6010}"
10 10 EVAL_WEB_HOST="${EVAL_WEB_HOST:-0.0.0.0}"
11 11 TENANT_ID="${TENANT_ID:-163}"
  12 +DATASET_ID="${REPO_EVAL_DATASET_ID:-core_queries}"
12 13 QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
13 14  
14 15 GREEN='\033[0;32m'
... ... @@ -21,10 +22,11 @@ echo -e &quot;${GREEN}========================================${NC}&quot;
21 22 echo -e "\n${YELLOW}Evaluation UI:${NC} ${GREEN}http://localhost:${EVAL_WEB_PORT}/${NC}"
22 23 echo -e "${YELLOW}Requires backend for live search:${NC} ${GREEN}http://localhost:${API_PORT:-6002}${NC}\n"
23 24  
24   -export EVAL_WEB_PORT EVAL_WEB_HOST TENANT_ID REPO_EVAL_QUERIES
  25 +export EVAL_WEB_PORT EVAL_WEB_HOST TENANT_ID REPO_EVAL_DATASET_ID REPO_EVAL_QUERIES
25 26  
26 27 exec python scripts/evaluation/serve_eval_web.py serve \
27 28 --tenant-id "${TENANT_ID}" \
  29 + --dataset-id "${DATASET_ID}" \
28 30 --queries-file "${QUERIES}" \
29 31 --host "${EVAL_WEB_HOST}" \
30 32 --port "${EVAL_WEB_PORT}"
... ...
tests/test_search_evaluation_datasets.py 0 → 100644
... ... @@ -0,0 +1,18 @@
  1 +from config.loader import get_app_config
  2 +from scripts.evaluation.eval_framework.datasets import resolve_dataset
  3 +
  4 +
def test_search_evaluation_registry_contains_expected_datasets() -> None:
    """Both shipped datasets are registered and core_queries stays the default."""
    search_eval_cfg = get_app_config().search_evaluation
    registered = {entry.dataset_id for entry in search_eval_cfg.datasets}
    assert "core_queries" in registered
    assert "clothing_top771" in registered
    assert search_eval_cfg.default_dataset_id == "core_queries"
  11 +
  12 +
def test_resolve_dataset_returns_expected_query_counts() -> None:
    """Resolved datasets expose the expected ids and query counts."""
    core_ds = resolve_dataset(dataset_id="core_queries")
    clothing_ds = resolve_dataset(dataset_id="clothing_top771")
    assert core_ds.query_count > 0
    assert clothing_ds.dataset_id == "clothing_top771"
    assert clothing_ds.query_count == 771
... ...