Commit 881d338b3acc0b3de1bb3cfb77f4fc69755bb0f7

Authored by tangwang
1 parent 432d1c88

评估框架

config/config.yaml
  1 +# Unified Configuration for Multi-Tenant Search Engine
  2 +# 统一配置文件,所有租户共用一套配置
  3 +# 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为
  4 +#
  5 +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项
  6 +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。
  7 +
  8 +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义)
1 9 runtime:
2 10 environment: prod
3 11 index_namespace: ''
... ... @@ -13,6 +21,8 @@ runtime:
13 21 translator_port: 6006
14 22 reranker_host: 0.0.0.0
15 23 reranker_port: 6007
  24 +
  25 +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
16 26 infrastructure:
17 27 elasticsearch:
18 28 host: http://localhost:9200
... ... @@ -39,16 +49,30 @@ infrastructure:
39 49 secrets:
40 50 dashscope_api_key: null
41 51 deepl_auth_key: null
  52 +
  53 +# Elasticsearch Index
42 54 es_index_name: search_products
  55 +
  56 +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出)
43 57 indexes: []
  58 +
  59 +# Config assets
44 60 assets:
45 61 query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict
  62 +
  63 +# Product content understanding (LLM enrich-content) configuration
46 64 product_enrich:
47 65 max_workers: 40
  66 +
  67 +# ES Index Settings (基础设置)
48 68 es_settings:
49 69 number_of_shards: 1
50 70 number_of_replicas: 0
51 71 refresh_interval: 30s
  72 +
  73 +# 字段权重配置(用于搜索时的字段boost)
  74 +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。
  75 +# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
52 76 field_boosts:
53 77 title: 3.0
54 78 qanchors: 2.5
... ... @@ -61,21 +85,39 @@ field_boosts:
61 85 option1_values: 1.5
62 86 option2_values: 1.5
63 87 option3_values: 1.5
  88 +
  89 +# Query Configuration(查询配置)
64 90 query_config:
  91 + # 支持的语言
65 92 supported_languages:
66 93 - zh
67 94 - en
68 95 default_language: en
  96 +
  97 + # 功能开关(翻译开关由tenant_config控制)
69 98 enable_text_embedding: true
70 99 enable_query_rewrite: true
71   - zh_to_en_model: nllb-200-distilled-600m
72   - en_to_zh_model: nllb-200-distilled-600m
  100 +
  101 + # 查询翻译模型(须与 services.translation.capabilities 中某项一致)
  102 + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。
  103 + zh_to_en_model: nllb-200-distilled-600m # "opus-mt-zh-en"
  104 + en_to_zh_model: nllb-200-distilled-600m # "opus-mt-en-zh"
73 105 default_translation_model: nllb-200-distilled-600m
  106 + # zh_to_en_model: deepl
  107 + # en_to_zh_model: deepl
  108 + # default_translation_model: deepl
  109 + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同)
74 110 zh_to_en_model__source_not_in_index: nllb-200-distilled-600m
75 111 en_to_zh_model__source_not_in_index: nllb-200-distilled-600m
76 112 default_translation_model__source_not_in_index: nllb-200-distilled-600m
77   - translation_embedding_wait_budget_ms_source_in_index: 200
78   - translation_embedding_wait_budget_ms_source_not_in_index: 300
  113 + # zh_to_en_model__source_not_in_index: deepl
  114 + # en_to_zh_model__source_not_in_index: deepl
  115 + # default_translation_model__source_not_in_index: deepl
  116 +
  117 + # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
  118 + # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
  119 + translation_embedding_wait_budget_ms_source_in_index: 200 # 80
  120 + translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200
79 121 style_intent:
80 122 enabled: true
81 123 selected_sku_boost: 1.2
... ... @@ -102,6 +144,10 @@ query_config:
102 144 product_title_exclusion:
103 145 enabled: true
104 146 dictionary_path: config/dictionaries/product_title_exclusion.tsv
  147 +
  148 + # 动态多语言检索字段配置
  149 + # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
  150 + # shared_fields 为无语言后缀字段。
105 151 search_fields:
106 152 multilingual_fields:
107 153 - title
... ... @@ -111,11 +157,14 @@ query_config:
111 157 - brief
112 158 - description
113 159 - vendor
  160 + # shared_fields: 无语言后缀字段;示例: tags, option1_values, option2_values, option3_values
114 161 shared_fields: null
115 162 core_multilingual_fields:
116 163 - title
117 164 - qanchors
118 165 - category_name_text
  166 +
  167 + # 统一文本召回策略(主查询 + 翻译查询)
119 168 text_query_strategy:
120 169 base_minimum_should_match: 60%
121 170 translation_minimum_should_match: 60%
... ... @@ -130,8 +179,14 @@ query_config:
130 179 title: 5.0
131 180 qanchors: 4.0
132 181 phrase_match_boost: 3.0
  182 +
  183 + # Embedding字段名称
133 184 text_embedding_field: title_embedding
134 185 image_embedding_field: image_embedding.vector
  186 +
  187 + # 返回字段配置(_source includes)
  188 + # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段
  189 + # 下列字段与 api/result_formatter.py(SpuResult 填充)及 search/searcher.py(SKU 排序/主图替换)一致
135 190 source_fields:
136 191 - spu_id
137 192 - handle
... ... @@ -163,18 +218,26 @@ query_config:
163 218 - option3_values
164 219 - specifications
165 220 - skus
  221 +
  222 + # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates)
166 223 knn_text_boost: 4
167 224 knn_image_boost: 4
  225 +
  226 + # knn_text_num_candidates ≈ k * 3.5 (160 → 560); the long-query variant uses ≈ k * 3 (400 → 1200)
168 227 knn_text_k: 160
169 228 knn_text_num_candidates: 560
170 229 knn_text_k_long: 400
171 230 knn_text_num_candidates_long: 1200
172 231 knn_image_k: 400
173 232 knn_image_num_candidates: 1200
  233 +
  234 +# Function Score配置(ES层打分规则)
174 235 function_score:
175 236 score_mode: sum
176 237 boost_mode: multiply
177 238 functions: []
  239 +
  240 +# 粗排配置(仅融合 ES 文本/向量信号,不调用模型)
178 241 coarse_rank:
179 242 enabled: true
180 243 input_window: 700
... ... @@ -182,12 +245,16 @@ coarse_rank:
182 245 fusion:
183 246 text_bias: 0.1
184 247 text_exponent: 0.35
  248 + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合)
  249 + # 因为es的打分已经给了trans进行了折扣,所以这里不再继续折扣
185 250 text_translation_weight: 1.0
186 251 knn_text_weight: 1.0
187 252 knn_image_weight: 1.0
188 253 knn_tie_breaker: 0.1
189 254 knn_bias: 0.6
190 255 knn_exponent: 0.0
  256 +
  257 +# 精排配置(轻量 reranker)
191 258 fine_rank:
192 259 enabled: false
193 260 input_window: 160
... ... @@ -196,6 +263,8 @@ fine_rank:
196 263 rerank_query_template: '{query}'
197 264 rerank_doc_template: '{title}'
198 265 service_profile: fine
  266 +
  267 +# 重排配置(provider/URL 在 services.rerank)
199 268 rerank:
200 269 enabled: true
201 270 rerank_window: 160
... ... @@ -205,6 +274,11 @@ rerank:
205 274 rerank_query_template: '{query}'
206 275 rerank_doc_template: '{title}'
207 276 service_profile: default
  277 +
  278 + # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项)
  279 + # 其中 knn_score 先做一层 dis_max:
  280 + # max(knn_text_weight * text_knn, knn_image_weight * image_knn)
  281 + # + knn_tie_breaker * 另一侧较弱信号
208 282 fusion:
209 283 rerank_bias: 1.0e-05
210 284 rerank_exponent: 1.15
... ... @@ -212,22 +286,29 @@ rerank:
212 286 fine_exponent: 1.0
213 287 text_bias: 0.1
214 288 text_exponent: 0.25
  289 + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合)
215 290 text_translation_weight: 0.8
216 291 knn_text_weight: 1.0
217 292 knn_image_weight: 1.0
218 293 knn_tie_breaker: 0.1
219 294 knn_bias: 0.6
220 295 knn_exponent: 0.0
  296 +
  297 +# 可扩展服务/provider 注册表(单一配置源)
221 298 services:
222 299 translation:
223 300 service_url: http://127.0.0.1:6006
  301 + # default_model: nllb-200-distilled-600m
224 302 default_model: nllb-200-distilled-600m
225 303 default_scene: general
226 304 timeout_sec: 10.0
227 305 cache:
228 306 ttl_seconds: 62208000
229 307 sliding_expiration: true
  308 + # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups).
230 309 enable_model_quality_tier_cache: true
  310 + # Higher tier = better quality. Multiple models may share one tier (同级).
  311 + # A request may reuse Redis cache keys written by models whose tier is greater than or equal to its own model's tier (never from lower tiers).
231 312 model_quality_tiers:
232 313 deepl: 30
233 314 qwen-mt: 30
... ... @@ -321,12 +402,13 @@ services:
321 402 num_beams: 1
322 403 use_cache: true
323 404 embedding:
324   - provider: http
  405 + provider: http # http
325 406 providers:
326 407 http:
327 408 text_base_url: http://127.0.0.1:6005
328 409 image_base_url: http://127.0.0.1:6008
329   - backend: tei
  410 + # 服务内文本后端(embedding 进程启动时读取)
  411 + backend: tei # tei | local_st
330 412 backends:
331 413 tei:
332 414 base_url: http://127.0.0.1:8080
... ... @@ -337,7 +419,10 @@ services:
337 419 device: cuda
338 420 batch_size: 32
339 421 normalize_embeddings: true
340   - image_backend: clip_as_service
  422 + # 服务内图片后端(embedding 进程启动时读取;cnclip gRPC 与 6008 须同一 model_name)
  423 + # Chinese-CLIP:ViT-H-14 → 1024 维,ViT-L-14 → 768 维。须与 mappings/search_products.json 中
  424 + # image_embedding.vector.dims 一致(当前索引为 1024 → 默认 ViT-H-14)。
  425 + image_backend: clip_as_service # clip_as_service | local_cnclip
341 426 image_backends:
342 427 clip_as_service:
343 428 server: grpc://127.0.0.1:51000
... ... @@ -364,6 +449,7 @@ services:
364 449 max_docs: 1000
365 450 normalize: true
366 451 default_instance: default
  452 + # 命名实例:同一套 reranker 代码按实例名读取不同端口 / 后端 / runtime 目录。
367 453 instances:
368 454 default:
369 455 host: 0.0.0.0
... ... @@ -405,11 +491,29 @@ services:
405 491 enforce_eager: false
406 492 infer_batch_size: 100
407 493 sort_by_doc_length: true
408   - instruction_format: standard
  494 + # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct)
  495 + instruction_format: standard # compact standard
  496 + # instruction: "Given a query, score the product for relevance"
  497 + # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点
  498 + # instruction: "rank products by given query, category match first"
  499 + # instruction: "Rank products by query relevance, prioritizing category match"
  500 + # instruction: "Rank products by query relevance, prioritizing category and style match"
  501 + # instruction: "Rank by query relevance, prioritize category & style"
  502 + # instruction: "Relevance ranking: category & style match first"
  503 + # instruction: "Score product relevance by query with category & style match prioritized"
  504 + # instruction: "Rank products by query with category & style match prioritized"
  505 + # instruction: "Given a fashion shopping query, retrieve relevant products that answer the query"
409 506 instruction: rank products by given query
  507 + # vLLM LLM.score()(跨编码打分)。独立高性能环境 .venv-reranker-score(vllm 0.18 固定版):./scripts/setup_reranker_venv.sh qwen3_vllm_score
  508 + # 与 qwen3_vllm 可共用同一 model_name / HF 缓存;venv 分离以便升级 vLLM 而不影响 generate 后端。
410 509 qwen3_vllm_score:
411 510 model_name: Qwen/Qwen3-Reranker-0.6B
  511 + # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false
412 512 use_original_qwen3_hf_overrides: true
  513 + # vllm_runner: "auto"
  514 + # vllm_convert: "auto"
  515 + # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并
  516 + # hf_overrides: {}
413 517 engine: vllm
414 518 max_model_len: 172
415 519 tensor_parallel_size: 1
... ... @@ -419,15 +523,23 @@ services:
419 523 enforce_eager: false
420 524 infer_batch_size: 80
421 525 sort_by_doc_length: true
422   - instruction_format: standard
  526 + # 默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致
  527 + instruction_format: standard # compact standard
  528 + # instruction: "Rank products by query with category & style match prioritized"
  529 + # instruction: "Given a shopping query, rank products by relevance"
423 530 instruction: Rank products by query with category & style match prioritized
424 531 qwen3_transformers:
425 532 model_name: Qwen/Qwen3-Reranker-0.6B
426 533 instruction: rank products by given query
  534 + # instruction: "Score the product’s relevance to the given query"
427 535 max_length: 8192
428 536 batch_size: 64
429 537 use_fp16: true
  538 + # sdpa:默认无需 flash-attn;若已安装 flash_attn 可改为 flash_attention_2
430 539 attn_implementation: sdpa
  540 + # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask.
  541 + # For 1 query + many short docs (for example 400 product titles), this usually reduces
  542 + # repeated prefix work and padding waste compared with pairwise batching.
431 543 qwen3_transformers_packed:
432 544 model_name: Qwen/Qwen3-Reranker-0.6B
433 545 instruction: Rank products by query with category & style match prioritized
... ... @@ -436,6 +548,8 @@ services:
436 548 max_docs_per_pack: 0
437 549 use_fp16: true
438 550 sort_by_doc_length: true
  551 + # Packed mode relies on a custom 4D attention mask. "eager" is the safest default.
  552 + # If your torch/transformers stack validates it, you can benchmark "sdpa".
439 553 attn_implementation: eager
440 554 qwen3_gguf:
441 555 repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF
... ... @@ -443,6 +557,7 @@ services:
443 557 cache_dir: ./model_cache
444 558 local_dir: ./models/reranker/qwen3-reranker-4b-gguf
445 559 instruction: Rank products by query with category & style match prioritized
  560 + # T4 16GB / 性能优先配置:全量层 offload,实测比保守配置明显更快
446 561 n_ctx: 512
447 562 n_batch: 512
448 563 n_ubatch: 512
... ... @@ -465,6 +580,8 @@ services:
465 580 cache_dir: ./model_cache
466 581 local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf
467 582 instruction: Rank products by query with category & style match prioritized
  583 + # 0.6B GGUF / online rerank baseline:
  584 + # 实测 400 titles 单请求约 265s,因此它更适合作为低显存功能后备,不适合在线低延迟主路由。
468 585 n_ctx: 256
469 586 n_batch: 256
470 587 n_ubatch: 256
... ... @@ -484,22 +601,34 @@ services:
484 601 verbose: false
485 602 dashscope_rerank:
486 603 model_name: qwen3-rerank
  604 + # 按地域选择 endpoint:
  605 + # 中国: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
  606 + # 新加坡: https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks
  607 + # 美国: https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks
487 608 endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
488 609 api_key_env: RERANK_DASHSCOPE_API_KEY_CN
489 610 timeout_sec: 10.0
490   - top_n_cap: 0
491   - batchsize: 64
  611 + top_n_cap: 0 # 0 表示 top_n=当前请求文档数;>0 则限制 top_n 上限
  612 + batchsize: 64 # 0 关闭;>0 启用并发小包调度(top_n/top_n_cap 仍生效,分包后全局截断)
492 613 instruct: Given a shopping query, rank product titles by relevance
493 614 max_retries: 2
494 615 retry_backoff_sec: 0.2
  616 +
  617 +# SPU配置(已启用,使用嵌套skus)
495 618 spu_config:
496 619 enabled: true
497 620 spu_field: spu_id
498 621 inner_hits_size: 10
  622 + # 配置哪些option维度参与检索(进索引、以及在线搜索)
  623 + # 格式为list,选择option1/option2/option3中的一个或多个
499 624 searchable_option_dimensions:
500 625 - option1
501 626 - option2
502 627 - option3
  628 +
  629 +# 租户配置(Tenant Configuration)
  630 +# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选)
  631 +# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集
503 632 tenant_config:
504 633 default:
505 634 primary_language: en
... ...
scripts/evaluation/README.md 0 → 100644
... ... @@ -0,0 +1,330 @@
  1 +# Search Evaluation Framework
  2 +
  3 +This directory contains the offline annotation set builder, the online evaluation UI/API, the audit tooling, and the fusion-tuning runner for retrieval quality evaluation.
  4 +
  5 +It is designed around one core rule:
  6 +
  7 +- Annotation should be built offline first.
  8 +- Single-query evaluation should then map recalled `spu_id` values to the cached annotation set.
  9 +- Recalled items without cached labels are treated as `Irrelevant` during evaluation, and the UI/API returns a tip so the operator knows coverage is incomplete.
  10 +
  11 +## Goals
  12 +
  13 +The framework supports four related tasks:
  14 +
  15 +1. Build an annotation set for a fixed query set.
  16 +2. Evaluate a live search result list against that annotation set.
  17 +3. Run batch evaluation and store historical reports with config snapshots.
  18 +4. Tune fusion parameters reproducibly.
  19 +
  20 +## Files
  21 +
  22 +- `eval_framework.py`
  23 + Search evaluation core implementation, CLI, FastAPI app, SQLite store, audit logic, and report generation.
  24 +- `build_annotation_set.py`
  25 + Thin CLI entrypoint into `eval_framework.py`.
  26 +- `serve_eval_web.py`
  27 + Thin web entrypoint into `eval_framework.py`.
  28 +- `tune_fusion.py`
  29 + Fusion experiment runner. It applies config variants, restarts backend, runs batch evaluation, and stores experiment reports.
  30 +- `fusion_experiments_shortlist.json`
  31 + A compact experiment set for practical tuning.
  32 +- `fusion_experiments_round1.json`
  33 + A broader first-round experiment set.
  34 +- `queries/queries.txt`
  35 + The canonical evaluation query set.
  36 +- `README_Requirement.md`
  37 + Requirement reference document.
  38 +- `quick_start_eval.sh`
  39 + Optional wrapper to run the batch refresh or the web UI from repo root (uses `./.venv/bin/python`).
  40 +
  41 +## Quick start (from repo root)
  42 +
  43 +Set the tenant if needed (`export TENANT_ID=163`). Requires a live search API, a DashScope API key for LLM labeling, and a working backend for batch refresh.
  44 +
  45 +```bash
  46 +# 1) Refresh offline labels for every line in the queries file, then write batch metrics under artifacts/
  47 +./scripts/evaluation/quick_start_eval.sh batch
  48 +
  49 +# 2) Evaluation UI on http://127.0.0.1:6010/
  50 +./scripts/evaluation/quick_start_eval.sh serve
  51 +```
  52 +
  53 +Equivalent explicit commands:
  54 +
  55 +```bash
  56 +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \
  57 + --tenant-id "${TENANT_ID:-163}" \
  58 + --queries-file scripts/evaluation/queries/queries.txt \
  59 + --top-k 50 \
  60 + --language en \
  61 + --labeler-mode simple \
  62 + --force-refresh-labels
  63 +
  64 +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \
  65 + --tenant-id "${TENANT_ID:-163}" \
  66 + --queries-file scripts/evaluation/queries/queries.txt \
  67 + --host 127.0.0.1 \
  68 + --port 6010
  69 +```
  70 +
  71 +**Batch behavior:** There is no “skip queries already processed”. Each run walks the full queries file. With `--force-refresh-labels`, for **every** query the runner issues a live search and sends **all** `top_k` returned `spu_id`s through the LLM again (SQLite rows are upserted). Omit `--force-refresh-labels` if you only want to fill in labels that are missing for the current recall window.
  72 +
  73 +## Storage Layout
  74 +
  75 +All generated artifacts are under:
  76 +
  77 +- `/data/saas-search/artifacts/search_evaluation`
  78 +
  79 +Important subpaths:
  80 +
  81 +- `/data/saas-search/artifacts/search_evaluation/search_eval.sqlite3`
  82 + Main cache and annotation store.
  83 +- `/data/saas-search/artifacts/search_evaluation/query_builds`
  84 + Per-query pooled annotation-set build artifacts.
  85 +- `/data/saas-search/artifacts/search_evaluation/batch_reports`
  86 + Batch evaluation JSON, Markdown reports, and config snapshots.
  87 +- `/data/saas-search/artifacts/search_evaluation/audits`
  88 + Audit summaries for label quality checks.
  89 +- `/data/saas-search/artifacts/search_evaluation/tuning_runs`
  90 + Fusion experiment summaries and per-experiment config snapshots.
  91 +
  92 +## SQLite Schema Summary
  93 +
  94 +The main tables in `search_eval.sqlite3` are:
  95 +
  96 +- `corpus_docs`
  97 + Cached product corpus for the tenant.
  98 +- `rerank_scores`
  99 + Cached full-corpus reranker scores keyed by `(tenant_id, query_text, spu_id)`.
  100 +- `relevance_labels`
  101 + Cached LLM relevance labels keyed by `(tenant_id, query_text, spu_id)`.
  102 +- `query_profiles`
  103 + Structured query-intent profiles extracted before labeling.
  104 +- `build_runs`
  105 + Per-query pooled-build records.
  106 +- `batch_runs`
  107 + Batch evaluation history.
  108 +
  109 +## Label Semantics
  110 +
  111 +Three labels are used throughout:
  112 +
  113 +- `Exact`
  114 + Fully matches the intended product type and all explicit required attributes.
  115 +- `Partial`
  116 + Main intent matches, but explicit attributes are missing, approximate, or weaker than requested.
  117 +- `Irrelevant`
  118 + Product type mismatches, or explicit required attributes conflict.
  119 +
  120 +The framework always uses:
  121 +
  122 +- LLM-based batched relevance classification
  123 +- caching and retry logic for robust offline labeling
  124 +
  125 +There are now two labeler modes:
  126 +
  127 +- `simple`
  128 + Default. A single low-coupling LLM judging pass per batch, using the standard relevance prompt.
  129 +- `complex`
  130 + Legacy structured mode. It extracts query profiles and applies extra guardrails. Kept for comparison, but no longer the default.
  131 +
  132 +## Offline-First Workflow
  133 +
  134 +### 1. Refresh labels for the evaluation query set
  135 +
  136 +For practical evaluation, the most important offline step is to pre-label the result window you plan to score. For the current metrics (`P@5`, `P@10`, `P@20`, `P@50`, `MAP_3`, `MAP_2_3`), a `top_k=50` cached label set is sufficient.
  137 +
  138 +Example:
  139 +
  140 +```bash
  141 +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \
  142 + --tenant-id 163 \
  143 + --queries-file scripts/evaluation/queries/queries.txt \
  144 + --top-k 50 \
  145 + --language en \
  146 + --labeler-mode simple \
  147 + --force-refresh-labels
  148 +```
  149 +
  150 +This command does two things:
  151 +
  152 +- runs **every** query in the file against the live backend (no skip list)
  153 +- with `--force-refresh-labels`, re-labels **all** `top_k` hits per query via the LLM and upserts SQLite; without the flag, only `spu_id`s lacking a cached label are sent to the LLM
  154 +
  155 +After this step, single-query evaluation can run in cached mode without calling the LLM again.
  156 +
  157 +### 2. Optional pooled build
  158 +
  159 +The framework also supports a heavier pooled build that combines:
  160 +
  161 +- top search results
  162 +- top full-corpus reranker results
  163 +
  164 +Example:
  165 +
  166 +```bash
  167 +./.venv/bin/python scripts/evaluation/build_annotation_set.py build \
  168 + --tenant-id 163 \
  169 + --queries-file scripts/evaluation/queries/queries.txt \
  170 + --search-depth 1000 \
  171 + --rerank-depth 10000 \
  172 + --annotate-search-top-k 100 \
  173 + --annotate-rerank-top-k 120 \
  174 + --language en
  175 +```
  176 +
  177 +This is slower, but useful when you want a richer pooled annotation set beyond the current live recall window.
  178 +
  179 +## Why Single-Query Evaluation Was Slow
  180 +
  181 +If single-query evaluation is slow, the usual reason is that it is still running with `auto_annotate=true`, which means:
  182 +
  183 +- perform live search
  184 +- detect recalled but unlabeled products
  185 +- call the LLM to label them
  186 +
  187 +That is not the intended steady-state evaluation path.
  188 +
  189 +The UI/API is now configured to prefer cached evaluation:
  190 +
  191 +- default single-query evaluation uses `auto_annotate=false`
  192 +- unlabeled recalled results are treated as `Irrelevant`
  193 +- the response includes tips explaining that coverage gap
  194 +
  195 +If you want stable, fast evaluation:
  196 +
  197 +1. prebuild labels offline
  198 +2. use cached single-query evaluation
  199 +
  200 +## Web UI
  201 +
  202 +Start the evaluation UI:
  203 +
  204 +```bash
  205 +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \
  206 + --tenant-id 163 \
  207 + --queries-file scripts/evaluation/queries/queries.txt \
  208 + --host 127.0.0.1 \
  209 + --port 6010
  210 +```
  211 +
  212 +The UI provides:
  213 +
  214 +- query list loaded from `queries.txt`
  215 +- single-query evaluation
  216 +- batch evaluation
  217 +- history of batch reports
  218 +- top recalled results
  219 +- missed `Exact` and `Partial` products that were not recalled
  220 +- tips about unlabeled hits treated as `Irrelevant`
  221 +
  222 +### Single-query response behavior
  223 +
  224 +For a single query:
  225 +
  226 +1. live search returns recalled `spu_id` values
  227 +2. the framework looks up cached labels by `(query, spu_id)`
  228 +3. unlabeled recalled items are counted as `Irrelevant`
  229 +4. cached `Exact` and `Partial` products that were not recalled are listed under `Missed Exact / Partial`
  230 +
  231 +This makes the page useful as a real retrieval-evaluation view rather than only a search-result viewer.
  232 +
  233 +## CLI Commands
  234 +
  235 +### Build pooled annotation artifacts
  236 +
  237 +```bash
  238 +./.venv/bin/python scripts/evaluation/build_annotation_set.py build ...
  239 +```
  240 +
  241 +### Run batch evaluation
  242 +
  243 +```bash
  244 +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \
  245 + --tenant-id 163 \
  246 + --queries-file scripts/evaluation/queries/queries.txt \
  247 + --top-k 50 \
  248 + --language en \
  249 + --labeler-mode simple
  250 +```
  251 +
  252 +Use `--force-refresh-labels` if you want to rebuild the offline label cache for the recalled window first.
  253 +
  254 +### Audit annotation quality
  255 +
  256 +```bash
  257 +./.venv/bin/python scripts/evaluation/build_annotation_set.py audit \
  258 + --tenant-id 163 \
  259 + --queries-file scripts/evaluation/queries/queries.txt \
  260 + --top-k 50 \
  261 + --language en \
  262 + --labeler-mode simple
  263 +```
  264 +
  265 +This checks cached labels against current guardrails and reports suspicious cases.
  266 +
  267 +## Batch Reports
  268 +
  269 +Each batch run stores:
  270 +
  271 +- aggregate metrics
  272 +- per-query metrics
  273 +- label distribution
  274 +- timestamp
  275 +- config snapshot from `/admin/config`
  276 +
  277 +Reports are written as:
  278 +
  279 +- Markdown for easy reading
  280 +- JSON for downstream processing
  281 +
  282 +## Fusion Tuning
  283 +
  284 +The tuning runner applies experiment configs sequentially and records the outcome.
  285 +
  286 +Example:
  287 +
  288 +```bash
  289 +./.venv/bin/python scripts/evaluation/tune_fusion.py \
  290 + --tenant-id 163 \
  291 + --queries-file scripts/evaluation/queries/queries.txt \
  292 + --top-k 50 \
  293 + --language en \
  294 + --experiments-file scripts/evaluation/fusion_experiments_shortlist.json \
  295 + --score-metric MAP_3 \
  296 + --apply-best
  297 +```
  298 +
  299 +What it does:
  300 +
  301 +1. writes an experiment config into `config/config.yaml`
  302 +2. restarts backend
  303 +3. runs batch evaluation
  304 +4. stores the per-experiment result
  305 +5. optionally applies the best experiment at the end
  306 +
  307 +## Current Practical Recommendation
  308 +
  309 +For day-to-day evaluation:
  310 +
  311 +1. refresh the offline labels for the fixed query set with `batch --force-refresh-labels`
  312 +2. run the web UI or normal batch evaluation in cached mode
  313 +3. only force-refresh labels again when:
  314 + - the query set changes
  315 + - the product corpus changes materially
  316 + - the labeling logic changes
  317 +
  318 +## Caveats
  319 +
  320 +- The current label cache is query-specific, not a full all-products all-queries matrix.
  321 +- Single-query evaluation still depends on the live search API for recall, but not on the LLM if labels are already cached.
  322 +- The backend restart path in this environment can be briefly unstable immediately after startup; a short wait after restart is sometimes necessary for scripting.
  323 +- Some multilingual translation hints are noisy on long-tail fashion queries, which is one reason fusion tuning around translation weight matters.
  324 +
  325 +## Related Requirement Docs
  326 +
  327 +- `README_Requirement.md`
  328 +- `README_Requirement_zh.md`
  329 +
  330 +These documents describe the original problem statement. This `README.md` describes the implemented framework and the current recommended workflow.
... ...
scripts/evaluation/eval_framework.py
... ... @@ -39,7 +39,9 @@ RELEVANCE_IRRELEVANT = "Irrelevant"
39 39 VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}
40 40 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
41 41 DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt"
42   -JUDGE_PROMPT_VERSION = "v2_structured_20260331"
  42 +JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331"
  43 +JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331"
  44 +DEFAULT_LABELER_MODE = "simple"
43 45  
44 46  
45 47 def utc_now_iso() -> str:
... ... @@ -625,6 +627,57 @@ class DashScopeLabelClient:
625 627 content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
626 628 return content, safe_json_dumps(data)
627 629  
  630 + def classify_batch_simple(
  631 + self,
  632 + query: str,
  633 + docs: Sequence[Dict[str, Any]],
  634 + ) -> Tuple[List[str], str]:
  635 + numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
  636 + prompt = (
  637 + "You are an e-commerce search result relevance evaluation assistant. "
  638 + "Based on the user query and each product's information, output the relevance level for each product.\n\n"
  639 + "## Relevance Level Criteria\n"
  640 + "Exact — Fully matches the user's search intent.\n"
  641 + "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), "
  642 + "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n"
  643 + "Irrelevant — Category or use case mismatched, primary intent not satisfied.\n\n"
  644 + "Additional judging guidance:\n"
  645 + "- If the query clearly names a product type, product type matching has the highest priority. "
  646 + "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, "
  647 + "bra vs top, backpack vs bag are not interchangeable.\n"
  648 + "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n"
  649 + "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n"
  650 + "- Do not guess missing attributes.\n"
  651 + "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n"
  652 + "- Be conservative with Exact.\n\n"
  653 + f"Query: {query}\n\n"
  654 + "Products:\n"
  655 + + "\n".join(numbered_docs)
  656 + + "\n\n## Output Format\n"
  657 + f"Strictly output {len(docs)} lines, each line containing exactly one of Exact / Partial / Irrelevant. "
  658 + "They must correspond sequentially to the products above. Do not output any other information.\n"
  659 + )
  660 + content, raw_response = self._chat(prompt)
  661 + labels = []
  662 + for line in str(content or "").splitlines():
  663 + label = line.strip()
  664 + if label in VALID_LABELS:
  665 + labels.append(label)
  666 + if len(labels) != len(docs):
  667 + payload = _extract_json_blob(content)
  668 + if isinstance(payload, dict) and isinstance(payload.get("labels"), list):
  669 + labels = []
  670 + for item in payload["labels"][: len(docs)]:
  671 + if isinstance(item, dict):
  672 + label = str(item.get("label") or "").strip()
  673 + else:
  674 + label = str(item).strip()
  675 + if label in VALID_LABELS:
  676 + labels.append(label)
  677 + if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
  678 + raise ValueError(f"unexpected simple label output: {content!r}")
  679 + return labels, raw_response
  680 +
628 681 def extract_query_profile(
629 682 self,
630 683 query: str,
... ... @@ -665,7 +718,7 @@ class DashScopeLabelClient:
665 718 payload.setdefault("notes", [])
666 719 return payload, raw_response
667 720  
668   - def classify_batch(
  721 + def classify_batch_complex(
669 722 self,
670 723 query: str,
671 724 query_profile: Dict[str, Any],
... ... @@ -763,10 +816,12 @@ class SearchEvaluationFramework:
763 816 tenant_id: str,
764 817 artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
765 818 search_base_url: str = "http://localhost:6002",
  819 + labeler_mode: str = DEFAULT_LABELER_MODE,
766 820 ):
767 821 init_service(get_app_config().infrastructure.elasticsearch.host)
768 822 self.tenant_id = str(tenant_id)
769 823 self.artifact_root = ensure_dir(artifact_root)
  824 + self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
770 825 self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
771 826 self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
772 827 app_cfg = get_app_config()
... ... @@ -783,17 +838,24 @@ class SearchEvaluationFramework:
783 838 base_url=str(llm_cfg["base_url"]),
784 839 api_key=str(api_key),
785 840 )
786   - self.query_parser = get_query_parser()
  841 + self.query_parser = None
  842 +
  843 + def _get_query_parser(self):
  844 + if self.query_parser is None:
  845 + self.query_parser = get_query_parser()
  846 + return self.query_parser
787 847  
788 848 def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
789   - parsed = self.query_parser.parse(query, generate_vector=False, target_languages=["en", "zh"])
  849 + parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"])
790 850 payload = parsed.to_dict()
791 851 payload["text_for_rerank"] = parsed.text_for_rerank()
792 852 return payload
793 853  
794 854 def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
  855 + if self.labeler_mode != "complex":
  856 + raise RuntimeError("query profiles are only used in complex labeler mode")
795 857 if not force_refresh:
796   - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION)
  858 + cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)
797 859 if cached is not None:
798 860 return cached
799 861 parser_hints = self.build_query_parser_hints(query)
... ... @@ -802,7 +864,7 @@ class SearchEvaluationFramework:
802 864 self.store.upsert_query_profile(
803 865 self.tenant_id,
804 866 query,
805   - JUDGE_PROMPT_VERSION,
  867 + JUDGE_PROMPT_VERSION_COMPLEX,
806 868 self.label_client.model,
807 869 profile,
808 870 raw_response,
... ... @@ -955,9 +1017,24 @@ class SearchEvaluationFramework:
955 1017 *,
956 1018 top_k: int = 100,
957 1019 language: str = "en",
958   - auto_annotate: bool = True,
  1020 + auto_annotate: bool = False,
959 1021 ) -> Dict[str, Any]:
960 1022 live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
  1023 + if self.labeler_mode != "complex":
  1024 + labels = [
  1025 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  1026 + for item in live["results"]
  1027 + ]
  1028 + return {
  1029 + "query": query,
  1030 + "tenant_id": self.tenant_id,
  1031 + "top_k": top_k,
  1032 + "metrics": live["metrics"],
  1033 + "distribution": label_distribution(labels),
  1034 + "query_profile": None,
  1035 + "suspicious": [],
  1036 + "results": live["results"],
  1037 + }
961 1038 query_profile = self.get_query_profile(query, force_refresh=False)
962 1039 suspicious: List[Dict[str, Any]] = []
963 1040  
... ... @@ -1093,7 +1170,6 @@ class SearchEvaluationFramework:
1093 1170 docs: Sequence[Dict[str, Any]],
1094 1171 force_refresh: bool = False,
1095 1172 ) -> Dict[str, str]:
1096   - query_profile = self.get_query_profile(query, force_refresh=force_refresh)
1097 1173 labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
1098 1174 missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
1099 1175 if not missing_docs:
... ... @@ -1101,12 +1177,9 @@ class SearchEvaluationFramework:
1101 1177  
1102 1178 for start in range(0, len(missing_docs), self.label_client.batch_size):
1103 1179 batch = missing_docs[start : start + self.label_client.batch_size]
1104   - batch_pairs = self._classify_with_retry(query, query_profile, batch)
  1180 + batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh)
1105 1181 for sub_labels, raw_response, sub_batch in batch_pairs:
1106   - to_store = {
1107   - str(doc.get("spu_id")): self._apply_rule_based_label_guardrails(label, query_profile, doc)
1108   - for doc, label in zip(sub_batch, sub_labels)
1109   - }
  1182 + to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
1110 1183 self.store.upsert_labels(
1111 1184 self.tenant_id,
1112 1185 query,
... ... @@ -1121,19 +1194,28 @@ class SearchEvaluationFramework:
1121 1194 def _classify_with_retry(
1122 1195 self,
1123 1196 query: str,
1124   - query_profile: Dict[str, Any],
1125 1197 docs: Sequence[Dict[str, Any]],
  1198 + *,
  1199 + force_refresh: bool = False,
1126 1200 ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
1127 1201 if not docs:
1128 1202 return []
1129 1203 try:
1130   - labels, raw_response = self.label_client.classify_batch(query, query_profile, docs)
  1204 + if self.labeler_mode == "complex":
  1205 + query_profile = self.get_query_profile(query, force_refresh=force_refresh)
  1206 + labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)
  1207 + labels = [
  1208 + self._apply_rule_based_label_guardrails(label, query_profile, doc)
  1209 + for doc, label in zip(docs, labels)
  1210 + ]
  1211 + else:
  1212 + labels, raw_response = self.label_client.classify_batch_simple(query, docs)
1131 1213 return [(labels, raw_response, docs)]
1132 1214 except Exception:
1133 1215 if len(docs) == 1:
1134 1216 raise
1135 1217 mid = len(docs) // 2
1136   - return self._classify_with_retry(query, query_profile, docs[:mid]) + self._classify_with_retry(query, query_profile, docs[mid:])
  1218 + return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh)
1137 1219  
1138 1220 def build_query_annotation_set(
1139 1221 self,
... ... @@ -1163,7 +1245,6 @@ class SearchEvaluationFramework:
1163 1245 for item in full_rerank[:annotate_rerank_top_k]:
1164 1246 pool_docs[str(item["spu_id"])] = item["doc"]
1165 1247  
1166   - query_profile = self.get_query_profile(query, force_refresh=force_refresh_labels)
1167 1248 labels = self.annotate_missing_labels(
1168 1249 query=query,
1169 1250 docs=list(pool_docs.values()),
... ... @@ -1229,7 +1310,8 @@ class SearchEvaluationFramework:
1229 1310 "annotate_rerank_top_k": annotate_rerank_top_k,
1230 1311 "pool_size": len(pool_docs),
1231 1312 },
1232   - "query_profile": query_profile,
  1313 + "labeler_mode": self.labeler_mode,
  1314 + "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,
1233 1315 "metrics_top100": metrics,
1234 1316 "search_results": search_labeled_results,
1235 1317 "full_rerank_top": rerank_top_results,
... ... @@ -1250,7 +1332,7 @@ class SearchEvaluationFramework:
1250 1332 self,
1251 1333 query: str,
1252 1334 top_k: int = 100,
1253   - auto_annotate: bool = True,
  1335 + auto_annotate: bool = False,
1254 1336 language: str = "en",
1255 1337 force_refresh_labels: bool = False,
1256 1338 ) -> Dict[str, Any]:
... ... @@ -1259,16 +1341,21 @@ class SearchEvaluationFramework:
1259 1341 if auto_annotate:
1260 1342 self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
1261 1343 labels = self.store.get_labels(self.tenant_id, query)
  1344 + recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]}
1262 1345 labeled = []
  1346 + unlabeled_hits = 0
1263 1347 for rank, doc in enumerate(results[:top_k], start=1):
1264 1348 spu_id = str(doc.get("spu_id"))
  1349 + label = labels.get(spu_id)
  1350 + if label not in VALID_LABELS:
  1351 + unlabeled_hits += 1
1265 1352 labeled.append(
1266 1353 {
1267 1354 "rank": rank,
1268 1355 "spu_id": spu_id,
1269 1356 "title": build_display_title(doc),
1270 1357 "image_url": doc.get("image_url"),
1271   - "label": labels.get(spu_id),
  1358 + "label": label,
1272 1359 "option_values": list(compact_option_values(doc.get("skus") or [])),
1273 1360 "product": compact_product_payload(doc),
1274 1361 }
... ... @@ -1277,12 +1364,65 @@ class SearchEvaluationFramework:
1277 1364 item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
1278 1365 for item in labeled
1279 1366 ]
  1367 + label_stats = self.store.get_query_label_stats(self.tenant_id, query)
  1368 + rerank_scores = self.store.get_rerank_scores(self.tenant_id, query)
  1369 + relevant_missing_ids = [
  1370 + spu_id
  1371 + for spu_id, label in labels.items()
  1372 + if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids
  1373 + ]
  1374 + missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
  1375 + missing_relevant = []
  1376 + for spu_id in relevant_missing_ids:
  1377 + doc = missing_docs_map.get(spu_id)
  1378 + if not doc:
  1379 + continue
  1380 + missing_relevant.append(
  1381 + {
  1382 + "spu_id": spu_id,
  1383 + "label": labels[spu_id],
  1384 + "rerank_score": rerank_scores.get(spu_id),
  1385 + "title": build_display_title(doc),
  1386 + "image_url": doc.get("image_url"),
  1387 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  1388 + "product": compact_product_payload(doc),
  1389 + }
  1390 + )
  1391 + label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}
  1392 + missing_relevant.sort(
  1393 + key=lambda item: (
  1394 + label_order.get(str(item.get("label")), 9),
  1395 + -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")),
  1396 + str(item.get("title") or ""),
  1397 + )
  1398 + )
  1399 + tips: List[str] = []
  1400 + if auto_annotate:
  1401 + tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.")
  1402 + else:
  1403 + tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.")
  1404 + if label_stats["total"] == 0:
  1405 + tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.")
  1406 + if unlabeled_hits:
  1407 + tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
  1408 + if not missing_relevant:
  1409 + tips.append("No cached Exact/Partial products were missed by this recall set.")
1280 1410 return {
1281 1411 "query": query,
1282 1412 "tenant_id": self.tenant_id,
1283 1413 "top_k": top_k,
1284 1414 "metrics": compute_query_metrics(metric_labels),
1285 1415 "results": labeled,
  1416 + "missing_relevant": missing_relevant,
  1417 + "label_stats": {
  1418 + **label_stats,
  1419 + "unlabeled_hits_treated_irrelevant": unlabeled_hits,
  1420 + "recalled_hits": len(labeled),
  1421 + "missing_relevant_count": len(missing_relevant),
  1422 + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
  1423 + "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),
  1424 + },
  1425 + "tips": tips,
1286 1426 "total": int(search_payload.get("total") or 0),
1287 1427 }
1288 1428  
... ... @@ -1392,14 +1532,14 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
1392 1532 class SearchEvalRequest(BaseModel):
1393 1533 query: str
1394 1534 top_k: int = Field(default=100, ge=1, le=500)
1395   - auto_annotate: bool = True
  1535 + auto_annotate: bool = False
1396 1536 language: str = "en"
1397 1537  
1398 1538  
1399 1539 class BatchEvalRequest(BaseModel):
1400 1540 queries: Optional[List[str]] = None
1401 1541 top_k: int = Field(default=100, ge=1, le=500)
1402   - auto_annotate: bool = True
  1542 + auto_annotate: bool = False
1403 1543 language: str = "en"
1404 1544 force_refresh_labels: bool = False
1405 1545  
... ... @@ -1494,6 +1634,8 @@ WEB_APP_HTML = """
1494 1634 .options { color: var(--muted); line-height: 1.5; font-size: 14px; }
1495 1635 .section { margin-bottom: 28px; }
1496 1636 .history { font-size: 13px; line-height: 1.5; }
  1637 + .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; }
  1638 + .tip { margin-bottom: 6px; color: var(--muted); }
1497 1639 </style>
1498 1640 </head>
1499 1641 <body>
... ... @@ -1524,6 +1666,14 @@ WEB_APP_HTML = """
1524 1666 <h2>Top Results</h2>
1525 1667 <div id="results" class="results"></div>
1526 1668 </section>
  1669 + <section class="section">
  1670 + <h2>Missed Exact / Partial</h2>
  1671 + <div id="missingRelevant" class="results"></div>
  1672 + </section>
  1673 + <section class="section">
  1674 + <h2>Notes</h2>
  1675 + <div id="tips" class="tips muted"></div>
  1676 + </section>
1527 1677 </main>
1528 1678 </div>
1529 1679 <script>
... ... @@ -1542,15 +1692,15 @@ WEB_APP_HTML = """
1542 1692 root.appendChild(card);
1543 1693 });
1544 1694 }
1545   - function renderResults(results) {
1546   - const root = document.getElementById('results');
1547   - root.innerHTML = '';
  1695 + function renderResults(results, rootId='results', showRank=true) {
  1696 + const mount = document.getElementById(rootId);
  1697 + mount.innerHTML = '';
1548 1698 (results || []).forEach(item => {
1549 1699 const label = item.label || 'Unknown';
1550 1700 const box = document.createElement('div');
1551 1701 box.className = 'result';
1552 1702 box.innerHTML = `
1553   - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">#${item.rank}</div></div>
  1703 + <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
1554 1704 <img class="thumb" src="${item.image_url || ''}" alt="" />
1555 1705 <div>
1556 1706 <div class="title">${item.title || ''}</div>
... ... @@ -1560,8 +1710,18 @@ WEB_APP_HTML = """
1560 1710 <div>${(item.option_values || [])[2] || ''}</div>
1561 1711 </div>
1562 1712 </div>`;
1563   - root.appendChild(box);
  1713 + mount.appendChild(box);
1564 1714 });
  1715 + if (!(results || []).length) {
  1716 + mount.innerHTML = '<div class="muted">None.</div>';
  1717 + }
  1718 + }
  1719 + function renderTips(data) {
  1720 + const root = document.getElementById('tips');
  1721 + const tips = [...(data.tips || [])];
  1722 + const stats = data.label_stats || {};
  1723 + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`);
  1724 + root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');
1565 1725 }
1566 1726 async function loadQueries() {
1567 1727 const data = await fetchJSON('/api/queries');
... ... @@ -1592,11 +1752,13 @@ WEB_APP_HTML = """
1592 1752 const data = await fetchJSON('/api/search-eval', {
1593 1753 method: 'POST',
1594 1754 headers: {'Content-Type': 'application/json'},
1595   - body: JSON.stringify({query, top_k: 100, auto_annotate: true})
  1755 + body: JSON.stringify({query, top_k: 100, auto_annotate: false})
1596 1756 });
1597 1757 document.getElementById('status').textContent = `Done. total=${data.total}`;
1598 1758 renderMetrics(data.metrics);
1599   - renderResults(data.results);
  1759 + renderResults(data.results, 'results', true);
  1760 + renderResults(data.missing_relevant, 'missingRelevant', false);
  1761 + renderTips(data);
1600 1762 loadHistory();
1601 1763 }
1602 1764 async function runBatch() {
... ... @@ -1604,11 +1766,13 @@ WEB_APP_HTML = """
1604 1766 const data = await fetchJSON('/api/batch-eval', {
1605 1767 method: 'POST',
1606 1768 headers: {'Content-Type': 'application/json'},
1607   - body: JSON.stringify({top_k: 100, auto_annotate: true})
  1769 + body: JSON.stringify({top_k: 100, auto_annotate: false})
1608 1770 });
1609 1771 document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`;
1610 1772 renderMetrics(data.aggregate_metrics);
1611   - renderResults([]);
  1773 + renderResults([], 'results', true);
  1774 + renderResults([], 'missingRelevant', false);
  1775 + document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>';
1612 1776 loadHistory();
1613 1777 }
1614 1778 loadQueries();
... ... @@ -1633,6 +1797,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
1633 1797 build.add_argument("--language", default="en")
1634 1798 build.add_argument("--force-refresh-rerank", action="store_true")
1635 1799 build.add_argument("--force-refresh-labels", action="store_true")
  1800 + build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1636 1801  
1637 1802 batch = sub.add_parser("batch", help="Run batch evaluation against live search")
1638 1803 batch.add_argument("--tenant-id", default="163")
... ... @@ -1640,6 +1805,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
1640 1805 batch.add_argument("--top-k", type=int, default=100)
1641 1806 batch.add_argument("--language", default="en")
1642 1807 batch.add_argument("--force-refresh-labels", action="store_true")
  1808 + batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1643 1809  
1644 1810 audit = sub.add_parser("audit", help="Audit annotation quality for queries")
1645 1811 audit.add_argument("--tenant-id", default="163")
... ... @@ -1648,18 +1814,20 @@ def build_cli_parser() -> argparse.ArgumentParser:
1648 1814 audit.add_argument("--language", default="en")
1649 1815 audit.add_argument("--limit-suspicious", type=int, default=5)
1650 1816 audit.add_argument("--force-refresh-labels", action="store_true")
  1817 + audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1651 1818  
1652 1819 serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
1653 1820 serve.add_argument("--tenant-id", default="163")
1654 1821 serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
1655 1822 serve.add_argument("--host", default="0.0.0.0")
1656 1823 serve.add_argument("--port", type=int, default=6010)
  1824 + serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1657 1825  
1658 1826 return parser
1659 1827  
1660 1828  
1661 1829 def run_build(args: argparse.Namespace) -> None:
1662   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
  1830 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1663 1831 queries = framework.queries_from_file(Path(args.queries_file))
1664 1832 summary = []
1665 1833 for query in queries:
... ... @@ -1694,7 +1862,7 @@ def run_build(args: argparse.Namespace) -> None:
1694 1862  
1695 1863  
1696 1864 def run_batch(args: argparse.Namespace) -> None:
1697   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
  1865 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1698 1866 queries = framework.queries_from_file(Path(args.queries_file))
1699 1867 payload = framework.batch_evaluate(
1700 1868 queries=queries,
... ... @@ -1707,7 +1875,7 @@ def run_batch(args: argparse.Namespace) -> None:
1707 1875  
1708 1876  
1709 1877 def run_audit(args: argparse.Namespace) -> None:
1710   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
  1878 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1711 1879 queries = framework.queries_from_file(Path(args.queries_file))
1712 1880 audit_items = []
1713 1881 for query in queries:
... ... @@ -1757,7 +1925,7 @@ def run_audit(args: argparse.Namespace) -> None:
1757 1925  
1758 1926  
1759 1927 def run_serve(args: argparse.Namespace) -> None:
1760   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
  1928 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1761 1929 app = create_web_app(framework, Path(args.queries_file))
1762 1930 import uvicorn
1763 1931  
... ...
scripts/evaluation/queries/queries.txt
... ... @@ -30,7 +30,6 @@ khaki green backpack
30 30 高跟鞋
31 31 图案连身衣
32 32 天鹅绒鸡尾酒会礼服
33   -Wearing small clothes
34 33 gingham dress
35 34 海滩度假装
36 35 vacation outfits
... ... @@ -41,10 +40,15 @@ hiking boots
41 40 business casual women
42 41 a-line dress
43 42 涤纶短裤
44   -哺乳文胸
45 43 Compression Top Spandex
46 44 skiing trip insulated base layer
47 45 high waisted jeans
48 46 无袖夏装
49 47 雪纺衬衫
50   -convertible zip-off hiking pants
51 48 \ No newline at end of file
  49 +convertible zip-off hiking pants
  50 +petite summer linen shorts
  51 +tall slim fit men's linen shirt
  52 +tall slim fit trousers
  53 +tall straight leg pants
  54 +tassel maxi skirt
  55 +teacher clothes
52 56 \ No newline at end of file
... ...
scripts/evaluation/quick_start_eval.sh 0 → 100755
... ... @@ -0,0 +1,39 @@
  1 +#!/usr/bin/env bash
  2 +# Search evaluation quick entrypoints. Run from any cwd; resolves repo root.
  3 +set -euo pipefail
  4 +
  5 +ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
  6 +cd "$ROOT"
  7 +PY="${ROOT}/.venv/bin/python"
  8 +TENANT_ID="${TENANT_ID:-163}"
  9 +QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
  10 +
  11 +usage() {
  12 + echo "Usage: $0 batch|serve"
  13 + echo " batch — refresh labels + batch metrics (default: top_k=50, simple labeler, force-refresh)"
  14 + echo " serve — eval UI on http://127.0.0.1:6010/"
  15 + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES (default $QUERIES)"
  16 +}
  17 +
  18 +case "${1:-}" in
  19 + batch)
  20 + exec "$PY" scripts/evaluation/build_annotation_set.py batch \
  21 + --tenant-id "$TENANT_ID" \
  22 + --queries-file "$QUERIES" \
  23 + --top-k 50 \
  24 + --language en \
  25 + --labeler-mode simple \
  26 + --force-refresh-labels
  27 + ;;
  28 + serve)
  29 + exec "$PY" scripts/evaluation/serve_eval_web.py serve \
  30 + --tenant-id "$TENANT_ID" \
  31 + --queries-file "$QUERIES" \
  32 + --host 127.0.0.1 \
  33 + --port 6010
  34 + ;;
  35 + *)
  36 + usage
  37 + exit 1
  38 + ;;
  39 +esac
... ...