Commit 432d1c88efc7d32fd4ed509b3e6ad6d80acf4342

Authored by tangwang
1 parent 267920e5

评估框架

config/config.yaml
1   -# Unified Configuration for Multi-Tenant Search Engine
2   -# 统一配置文件,所有租户共用一套配置
3   -# 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为
4   -#
5   -# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项
6   -#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。
7   -
8   -# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义)
9 1 runtime:
10   - environment: "prod"
11   - index_namespace: ""
12   - api_host: "0.0.0.0"
  2 + environment: prod
  3 + index_namespace: ''
  4 + api_host: 0.0.0.0
13 5 api_port: 6002
14   - indexer_host: "0.0.0.0"
  6 + indexer_host: 0.0.0.0
15 7 indexer_port: 6004
16   - embedding_host: "0.0.0.0"
  8 + embedding_host: 0.0.0.0
17 9 embedding_port: 6005
18 10 embedding_text_port: 6005
19 11 embedding_image_port: 6008
20   - translator_host: "0.0.0.0"
  12 + translator_host: 0.0.0.0
21 13 translator_port: 6006
22   - reranker_host: "0.0.0.0"
  14 + reranker_host: 0.0.0.0
23 15 reranker_port: 6007
24   -
25   -# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
26 16 infrastructure:
27 17 elasticsearch:
28   - host: "http://localhost:9200"
  18 + host: http://localhost:9200
29 19 username: null
30 20 password: null
31 21 redis:
32   - host: "localhost"
  22 + host: localhost
33 23 port: 6479
34 24 snapshot_db: 0
35 25 password: null
... ... @@ -37,8 +27,8 @@ infrastructure:
37 27 socket_connect_timeout: 1
38 28 retry_on_timeout: false
39 29 cache_expire_days: 720
40   - embedding_cache_prefix: "embedding"
41   - anchor_cache_prefix: "product_anchors"
  30 + embedding_cache_prefix: embedding
  31 + anchor_cache_prefix: product_anchors
42 32 anchor_cache_expire_days: 30
43 33 database:
44 34 host: null
... ... @@ -49,30 +39,16 @@ infrastructure:
49 39 secrets:
50 40 dashscope_api_key: null
51 41 deepl_auth_key: null
52   -
53   -# Elasticsearch Index
54   -es_index_name: "search_products"
55   -
56   -# 检索域 / 索引列表(可为空列表;每项字段均需显式给出)
  42 +es_index_name: search_products
57 43 indexes: []
58   -
59   -# Config assets
60 44 assets:
61   - query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict"
62   -
63   -# Product content understanding (LLM enrich-content) configuration
  45 + query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict
64 46 product_enrich:
65 47 max_workers: 40
66   -
67   -# ES Index Settings (基础设置)
68 48 es_settings:
69 49 number_of_shards: 1
70 50 number_of_replicas: 0
71   - refresh_interval: "30s"
72   -
73   -# 字段权重配置(用于搜索时的字段boost)
74   -# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。
75   -# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
  51 + refresh_interval: 30s
76 52 field_boosts:
77 53 title: 3.0
78 54 qanchors: 2.5
... ... @@ -85,79 +61,64 @@ field_boosts:
85 61 option1_values: 1.5
86 62 option2_values: 1.5
87 63 option3_values: 1.5
88   -
89   -# Query Configuration(查询配置)
90 64 query_config:
91   - # 支持的语言
92 65 supported_languages:
93   - - "zh"
94   - - "en"
95   - default_language: "en"
96   -
97   - # 功能开关(翻译开关由tenant_config控制)
  66 + - zh
  67 + - en
  68 + default_language: en
98 69 enable_text_embedding: true
99 70 enable_query_rewrite: true
100   -
101   - # 查询翻译模型(须与 services.translation.capabilities 中某项一致)
102   - # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。
103   - zh_to_en_model: "nllb-200-distilled-600m" # "opus-mt-zh-en"
104   - en_to_zh_model: "nllb-200-distilled-600m" # "opus-mt-en-zh"
105   - default_translation_model: "nllb-200-distilled-600m"
106   - # zh_to_en_model: "deepl"
107   - # en_to_zh_model: "deepl"
108   - # default_translation_model: "deepl"
109   - # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同)
110   - zh_to_en_model__source_not_in_index: "nllb-200-distilled-600m"
111   - en_to_zh_model__source_not_in_index: "nllb-200-distilled-600m"
112   - default_translation_model__source_not_in_index: "nllb-200-distilled-600m"
113   - # zh_to_en_model__source_not_in_index: "deepl"
114   - # en_to_zh_model__source_not_in_index: "deepl"
115   - # default_translation_model__source_not_in_index: "deepl"
116   -
117   - # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
118   - # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
119   - translation_embedding_wait_budget_ms_source_in_index: 200 # 80
120   - translation_embedding_wait_budget_ms_source_not_in_index: 300 #200
121   -
  71 + zh_to_en_model: nllb-200-distilled-600m
  72 + en_to_zh_model: nllb-200-distilled-600m
  73 + default_translation_model: nllb-200-distilled-600m
  74 + zh_to_en_model__source_not_in_index: nllb-200-distilled-600m
  75 + en_to_zh_model__source_not_in_index: nllb-200-distilled-600m
  76 + default_translation_model__source_not_in_index: nllb-200-distilled-600m
  77 + translation_embedding_wait_budget_ms_source_in_index: 200
  78 + translation_embedding_wait_budget_ms_source_not_in_index: 300
122 79 style_intent:
123 80 enabled: true
124 81 selected_sku_boost: 1.2
125   - color_dictionary_path: "config/dictionaries/style_intent_color.csv"
126   - size_dictionary_path: "config/dictionaries/style_intent_size.csv"
  82 + color_dictionary_path: config/dictionaries/style_intent_color.csv
  83 + size_dictionary_path: config/dictionaries/style_intent_size.csv
127 84 dimension_aliases:
128   - color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"]
129   - size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"]
130   -
  85 + color:
  86 + - color
  87 + - colors
  88 + - colour
  89 + - colours
  90 + - 颜色
  91 + - 色
  92 + - 色系
  93 + size:
  94 + - size
  95 + - sizes
  96 + - sizing
  97 + - 尺码
  98 + - 尺寸
  99 + - 码数
  100 + - 号码
  101 + - 码
131 102 product_title_exclusion:
132 103 enabled: true
133   - dictionary_path: "config/dictionaries/product_title_exclusion.tsv"
134   -
135   - # 动态多语言检索字段配置
136   - # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
137   - # shared_fields 为无语言后缀字段。
  104 + dictionary_path: config/dictionaries/product_title_exclusion.tsv
138 105 search_fields:
139 106 multilingual_fields:
140   - - "title"
141   - - "qanchors"
142   - - "category_path"
143   - - "category_name_text"
144   - - "brief"
145   - - "description"
146   - - "vendor"
147   - shared_fields:
148   - # - "tags"
149   - # - "option1_values"
150   - # - "option2_values"
151   - # - "option3_values"
  107 + - title
  108 + - qanchors
  109 + - category_path
  110 + - category_name_text
  111 + - brief
  112 + - description
  113 + - vendor
  114 + shared_fields: null
152 115 core_multilingual_fields:
153   - - "title"
154   - - "qanchors"
155   - - "category_name_text"
156   -
157   - # 统一文本召回策略(主查询 + 翻译查询)
  116 + - title
  117 + - qanchors
  118 + - category_name_text
158 119 text_query_strategy:
159   - base_minimum_should_match: "60%"
160   - translation_minimum_should_match: "60%"
  120 + base_minimum_should_match: 60%
  121 + translation_minimum_should_match: 60%
161 122 translation_boost: 0.75
162 123 tie_breaker_base_query: 0.5
163 124 best_fields_boost: 2.0
... ... @@ -169,67 +130,51 @@ query_config:
169 130 title: 5.0
170 131 qanchors: 4.0
171 132 phrase_match_boost: 3.0
172   -
173   - # Embedding字段名称
174   - text_embedding_field: "title_embedding"
175   - image_embedding_field: "image_embedding.vector"
176   -
177   - # 返回字段配置(_source includes)
178   - # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段
179   - # 下列字段与 api/result_formatter.py(SpuResult 填充)及 search/searcher.py(SKU 排序/主图替换)一致
  133 + text_embedding_field: title_embedding
  134 + image_embedding_field: image_embedding.vector
180 135 source_fields:
181   - - spu_id
182   - - handle
183   - - title
184   - - brief
185   - - description
186   - - vendor
187   - - category_name
188   - - category_name_text
189   - - category_path
190   - - category_id
191   - - category_level
192   - - category1_name
193   - - category2_name
194   - - category3_name
195   - - tags
196   - - min_price
197   - - compare_at_price
198   - - image_url
199   - - sku_prices
200   - - sku_weights
201   - - sku_weight_units
202   - - total_inventory
203   - - option1_name
204   - - option1_values
205   - - option2_name
206   - - option2_values
207   - - option3_name
208   - - option3_values
209   - - specifications
210   - - skus
211   -
212   - # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates)
  136 + - spu_id
  137 + - handle
  138 + - title
  139 + - brief
  140 + - description
  141 + - vendor
  142 + - category_name
  143 + - category_name_text
  144 + - category_path
  145 + - category_id
  146 + - category_level
  147 + - category1_name
  148 + - category2_name
  149 + - category3_name
  150 + - tags
  151 + - min_price
  152 + - compare_at_price
  153 + - image_url
  154 + - sku_prices
  155 + - sku_weights
  156 + - sku_weight_units
  157 + - total_inventory
  158 + - option1_name
  159 + - option1_values
  160 + - option2_name
  161 + - option2_values
  162 + - option3_name
  163 + - option3_values
  164 + - specifications
  165 + - skus
213 166 knn_text_boost: 4
214 167 knn_image_boost: 4
215   -
216   - # knn_text_num_candidates = k * 3.4
217 168 knn_text_k: 160
218 169 knn_text_num_candidates: 560
219   -
220 170 knn_text_k_long: 400
221 171 knn_text_num_candidates_long: 1200
222   -
223 172 knn_image_k: 400
224 173 knn_image_num_candidates: 1200
225   -
226   -# Function Score配置(ES层打分规则)
227 174 function_score:
228   - score_mode: "sum"
229   - boost_mode: "multiply"
  175 + score_mode: sum
  176 + boost_mode: multiply
230 177 functions: []
231   -
232   -# 粗排配置(仅融合 ES 文本/向量信号,不调用模型)
233 178 coarse_rank:
234 179 enabled: true
235 180 input_window: 700
... ... @@ -237,69 +182,52 @@ coarse_rank:
237 182 fusion:
238 183 text_bias: 0.1
239 184 text_exponent: 0.35
240   - # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合)
241   - # 因为es的打分已经给了trans进行了折扣,所以这里不再继续折扣
242 185 text_translation_weight: 1.0
243 186 knn_text_weight: 1.0
244 187 knn_image_weight: 1.0
245 188 knn_tie_breaker: 0.1
246 189 knn_bias: 0.6
247 190 knn_exponent: 0.0
248   -
249   -# 精排配置(轻量 reranker)
250 191 fine_rank:
251 192 enabled: false
252 193 input_window: 160
253 194 output_window: 80
254 195 timeout_sec: 10.0
255   - rerank_query_template: "{query}"
256   - rerank_doc_template: "{title}"
257   - service_profile: "fine"
258   -
259   -# 重排配置(provider/URL 在 services.rerank)
  196 + rerank_query_template: '{query}'
  197 + rerank_doc_template: '{title}'
  198 + service_profile: fine
260 199 rerank:
261 200 enabled: true
262 201 rerank_window: 160
263 202 timeout_sec: 15.0
264 203 weight_es: 0.4
265 204 weight_ai: 0.6
266   - rerank_query_template: "{query}"
267   - rerank_doc_template: "{title}"
268   - service_profile: "default"
269   - # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项)
270   - # 其中 knn_score 先做一层 dis_max:
271   - # max(knn_text_weight * text_knn, knn_image_weight * image_knn)
272   - # + knn_tie_breaker * 另一侧较弱信号
  205 + rerank_query_template: '{query}'
  206 + rerank_doc_template: '{title}'
  207 + service_profile: default
273 208 fusion:
274   - rerank_bias: 0.00001
275   - rerank_exponent: 1.0
276   - fine_bias: 0.00001
  209 + rerank_bias: 1.0e-05
  210 + rerank_exponent: 1.15
  211 + fine_bias: 1.0e-05
277 212 fine_exponent: 1.0
278 213 text_bias: 0.1
279   - text_exponent: 0.35
280   - # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合)
281   - text_translation_weight: 1.0
  214 + text_exponent: 0.25
  215 + text_translation_weight: 0.8
282 216 knn_text_weight: 1.0
283 217 knn_image_weight: 1.0
284 218 knn_tie_breaker: 0.1
285 219 knn_bias: 0.6
286 220 knn_exponent: 0.0
287   -
288   -# 可扩展服务/provider 注册表(单一配置源)
289 221 services:
290 222 translation:
291   - service_url: "http://127.0.0.1:6006"
292   - # default_model: "nllb-200-distilled-600m"
293   - default_model: "nllb-200-distilled-600m"
294   - default_scene: "general"
  223 + service_url: http://127.0.0.1:6006
  224 + default_model: nllb-200-distilled-600m
  225 + default_scene: general
295 226 timeout_sec: 10.0
296 227 cache:
297 228 ttl_seconds: 62208000
298 229 sliding_expiration: true
299   - # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups).
300 230 enable_model_quality_tier_cache: true
301   - # Higher tier = better quality. Multiple models may share one tier (同级).
302   - # A request may reuse Redis keys from models with tier > A or tier == A (not from lower tiers).
303 231 model_quality_tiers:
304 232 deepl: 30
305 233 qwen-mt: 30
... ... @@ -310,43 +238,43 @@ services:
310 238 capabilities:
311 239 qwen-mt:
312 240 enabled: true
313   - backend: "qwen_mt"
314   - model: "qwen-mt-flash"
315   - base_url: "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
  241 + backend: qwen_mt
  242 + model: qwen-mt-flash
  243 + base_url: https://dashscope-us.aliyuncs.com/compatible-mode/v1
316 244 timeout_sec: 10.0
317 245 use_cache: true
318 246 llm:
319 247 enabled: true
320   - backend: "llm"
321   - model: "qwen-flash"
322   - base_url: "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
  248 + backend: llm
  249 + model: qwen-flash
  250 + base_url: https://dashscope-us.aliyuncs.com/compatible-mode/v1
323 251 timeout_sec: 30.0
324 252 use_cache: true
325 253 deepl:
326 254 enabled: true
327   - backend: "deepl"
328   - api_url: "https://api.deepl.com/v2/translate"
  255 + backend: deepl
  256 + api_url: https://api.deepl.com/v2/translate
329 257 timeout_sec: 10.0
330   - glossary_id: ""
  258 + glossary_id: ''
331 259 use_cache: true
332 260 nllb-200-distilled-600m:
333 261 enabled: true
334   - backend: "local_nllb"
335   - model_id: "facebook/nllb-200-distilled-600M"
336   - model_dir: "./models/translation/facebook/nllb-200-distilled-600M"
337   - ct2_model_dir: "./models/translation/facebook/nllb-200-distilled-600M/ctranslate2-float16"
338   - ct2_compute_type: "float16"
339   - ct2_conversion_quantization: "float16"
  262 + backend: local_nllb
  263 + model_id: facebook/nllb-200-distilled-600M
  264 + model_dir: ./models/translation/facebook/nllb-200-distilled-600M
  265 + ct2_model_dir: ./models/translation/facebook/nllb-200-distilled-600M/ctranslate2-float16
  266 + ct2_compute_type: float16
  267 + ct2_conversion_quantization: float16
340 268 ct2_auto_convert: true
341 269 ct2_inter_threads: 4
342 270 ct2_intra_threads: 0
343 271 ct2_max_queued_batches: 32
344   - ct2_batch_type: "examples"
345   - ct2_decoding_length_mode: "source"
  272 + ct2_batch_type: examples
  273 + ct2_decoding_length_mode: source
346 274 ct2_decoding_length_extra: 8
347 275 ct2_decoding_length_min: 32
348   - device: "cuda"
349   - torch_dtype: "float16"
  276 + device: cuda
  277 + torch_dtype: float16
350 278 batch_size: 64
351 279 max_input_length: 256
352 280 max_new_tokens: 64
... ... @@ -354,19 +282,19 @@ services:
354 282 use_cache: true
355 283 opus-mt-zh-en:
356 284 enabled: false
357   - backend: "local_marian"
358   - model_id: "Helsinki-NLP/opus-mt-zh-en"
359   - model_dir: "./models/translation/Helsinki-NLP/opus-mt-zh-en"
360   - ct2_model_dir: "./models/translation/Helsinki-NLP/opus-mt-zh-en/ctranslate2-float16"
361   - ct2_compute_type: "float16"
362   - ct2_conversion_quantization: "float16"
  285 + backend: local_marian
  286 + model_id: Helsinki-NLP/opus-mt-zh-en
  287 + model_dir: ./models/translation/Helsinki-NLP/opus-mt-zh-en
  288 + ct2_model_dir: ./models/translation/Helsinki-NLP/opus-mt-zh-en/ctranslate2-float16
  289 + ct2_compute_type: float16
  290 + ct2_conversion_quantization: float16
363 291 ct2_auto_convert: true
364 292 ct2_inter_threads: 1
365 293 ct2_intra_threads: 0
366 294 ct2_max_queued_batches: 0
367   - ct2_batch_type: "examples"
368   - device: "cuda"
369   - torch_dtype: "float16"
  295 + ct2_batch_type: examples
  296 + device: cuda
  297 + torch_dtype: float16
370 298 batch_size: 16
371 299 max_input_length: 256
372 300 max_new_tokens: 256
... ... @@ -374,181 +302,147 @@ services:
374 302 use_cache: true
375 303 opus-mt-en-zh:
376 304 enabled: false
377   - backend: "local_marian"
378   - model_id: "Helsinki-NLP/opus-mt-en-zh"
379   - model_dir: "./models/translation/Helsinki-NLP/opus-mt-en-zh"
380   - ct2_model_dir: "./models/translation/Helsinki-NLP/opus-mt-en-zh/ctranslate2-float16"
381   - ct2_compute_type: "float16"
382   - ct2_conversion_quantization: "float16"
  305 + backend: local_marian
  306 + model_id: Helsinki-NLP/opus-mt-en-zh
  307 + model_dir: ./models/translation/Helsinki-NLP/opus-mt-en-zh
  308 + ct2_model_dir: ./models/translation/Helsinki-NLP/opus-mt-en-zh/ctranslate2-float16
  309 + ct2_compute_type: float16
  310 + ct2_conversion_quantization: float16
383 311 ct2_auto_convert: true
384 312 ct2_inter_threads: 1
385 313 ct2_intra_threads: 0
386 314 ct2_max_queued_batches: 0
387   - ct2_batch_type: "examples"
388   - device: "cuda"
389   - torch_dtype: "float16"
  315 + ct2_batch_type: examples
  316 + device: cuda
  317 + torch_dtype: float16
390 318 batch_size: 16
391 319 max_input_length: 256
392 320 max_new_tokens: 256
393 321 num_beams: 1
394 322 use_cache: true
395 323 embedding:
396   - provider: "http" # http
  324 + provider: http
397 325 providers:
398 326 http:
399   - text_base_url: "http://127.0.0.1:6005"
400   - image_base_url: "http://127.0.0.1:6008"
401   - # 服务内文本后端(embedding 进程启动时读取)
402   - backend: "tei" # tei | local_st
  327 + text_base_url: http://127.0.0.1:6005
  328 + image_base_url: http://127.0.0.1:6008
  329 + backend: tei
403 330 backends:
404 331 tei:
405   - base_url: "http://127.0.0.1:8080"
  332 + base_url: http://127.0.0.1:8080
406 333 timeout_sec: 20
407   - model_id: "Qwen/Qwen3-Embedding-0.6B"
  334 + model_id: Qwen/Qwen3-Embedding-0.6B
408 335 local_st:
409   - model_id: "Qwen/Qwen3-Embedding-0.6B"
410   - device: "cuda"
  336 + model_id: Qwen/Qwen3-Embedding-0.6B
  337 + device: cuda
411 338 batch_size: 32
412 339 normalize_embeddings: true
413   - # 服务内图片后端(embedding 进程启动时读取;cnclip gRPC 与 6008 须同一 model_name)
414   - # Chinese-CLIP:ViT-H-14 → 1024 维,ViT-L-14 → 768 维。须与 mappings/search_products.json 中
415   - # image_embedding.vector.dims 一致(当前索引为 1024 → 默认 ViT-H-14)。
416   - image_backend: "clip_as_service" # clip_as_service | local_cnclip
  340 + image_backend: clip_as_service
417 341 image_backends:
418 342 clip_as_service:
419   - server: "grpc://127.0.0.1:51000"
420   - model_name: "CN-CLIP/ViT-L-14"
  343 + server: grpc://127.0.0.1:51000
  344 + model_name: CN-CLIP/ViT-L-14
421 345 batch_size: 8
422 346 normalize_embeddings: true
423 347 local_cnclip:
424   - model_name: "ViT-L-14"
  348 + model_name: ViT-L-14
425 349 device: null
426 350 batch_size: 8
427 351 normalize_embeddings: true
428 352 rerank:
429   - provider: "http"
  353 + provider: http
430 354 providers:
431 355 http:
432 356 instances:
433 357 default:
434   - base_url: "http://127.0.0.1:6007"
435   - service_url: "http://127.0.0.1:6007/rerank"
  358 + base_url: http://127.0.0.1:6007
  359 + service_url: http://127.0.0.1:6007/rerank
436 360 fine:
437   - base_url: "http://127.0.0.1:6009"
438   - service_url: "http://127.0.0.1:6009/rerank"
  361 + base_url: http://127.0.0.1:6009
  362 + service_url: http://127.0.0.1:6009/rerank
439 363 request:
440 364 max_docs: 1000
441 365 normalize: true
442   - default_instance: "default"
443   - # 命名实例:同一套 reranker 代码按实例名读取不同端口 / 后端 / runtime 目录。
  366 + default_instance: default
444 367 instances:
445 368 default:
446   - host: "0.0.0.0"
  369 + host: 0.0.0.0
447 370 port: 6007
448   - backend: "qwen3_vllm_score"
449   - runtime_dir: "./.runtime/reranker/default"
  371 + backend: qwen3_vllm_score
  372 + runtime_dir: ./.runtime/reranker/default
450 373 fine:
451   - host: "0.0.0.0"
  374 + host: 0.0.0.0
452 375 port: 6009
453   - backend: "bge"
454   - runtime_dir: "./.runtime/reranker/fine"
  376 + backend: bge
  377 + runtime_dir: ./.runtime/reranker/fine
455 378 backends:
456 379 bge:
457   - model_name: "BAAI/bge-reranker-v2-m3"
  380 + model_name: BAAI/bge-reranker-v2-m3
458 381 device: null
459 382 use_fp16: true
460 383 batch_size: 80
461 384 max_length: 160
462   - cache_dir: "./model_cache"
  385 + cache_dir: ./model_cache
463 386 enable_warmup: true
464 387 jina_reranker_v3:
465   - model_name: "jinaai/jina-reranker-v3"
  388 + model_name: jinaai/jina-reranker-v3
466 389 device: null
467   - dtype: "float16"
  390 + dtype: float16
468 391 batch_size: 64
469 392 max_doc_length: 160
470 393 max_query_length: 64
471 394 sort_by_doc_length: true
472   - cache_dir: "./model_cache"
  395 + cache_dir: ./model_cache
473 396 trust_remote_code: true
474 397 qwen3_vllm:
475   - model_name: "Qwen/Qwen3-Reranker-0.6B"
476   - engine: "vllm"
  398 + model_name: Qwen/Qwen3-Reranker-0.6B
  399 + engine: vllm
477 400 max_model_len: 256
478 401 tensor_parallel_size: 1
479   - gpu_memory_utilization: 0.20
480   - dtype: "float16"
  402 + gpu_memory_utilization: 0.2
  403 + dtype: float16
481 404 enable_prefix_caching: true
482 405 enforce_eager: false
483 406 infer_batch_size: 100
484 407 sort_by_doc_length: true
485   - # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct)
486   - instruction_format: standard # compact standard
487   - # instruction: "Given a query, score the product for relevance"
488   - # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点
489   - # instruction: "rank products by given query, category match first"
490   - # instruction: "Rank products by query relevance, prioritizing category match"
491   - # instruction: "Rank products by query relevance, prioritizing category and style match"
492   - # instruction: "Rank by query relevance, prioritize category & style"
493   - # instruction: "Relevance ranking: category & style match first"
494   - # instruction: "Score product relevance by query with category & style match prioritized"
495   - # instruction: "Rank products by query with category & style match prioritized"
496   - # instruction: "Given a fashion shopping query, retrieve relevant products that answer the query"
497   - instruction: "rank products by given query"
498   - # vLLM LLM.score()(跨编码打分)。独立高性能环境 .venv-reranker-score(vllm 0.18 固定版):./scripts/setup_reranker_venv.sh qwen3_vllm_score
499   - # 与 qwen3_vllm 可共用同一 model_name / HF 缓存;venv 分离以便升级 vLLM 而不影响 generate 后端。
  408 + instruction_format: standard
  409 + instruction: rank products by given query
500 410 qwen3_vllm_score:
501   - model_name: "Qwen/Qwen3-Reranker-0.6B"
502   - # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false
  411 + model_name: Qwen/Qwen3-Reranker-0.6B
503 412 use_original_qwen3_hf_overrides: true
504   - # vllm_runner: "auto"
505   - # vllm_convert: "auto"
506   - # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并
507   - # hf_overrides: {}
508   - engine: "vllm"
  413 + engine: vllm
509 414 max_model_len: 172
510 415 tensor_parallel_size: 1
511 416 gpu_memory_utilization: 0.15
512   - dtype: "float16"
  417 + dtype: float16
513 418 enable_prefix_caching: true
514 419 enforce_eager: false
515 420 infer_batch_size: 80
516 421 sort_by_doc_length: true
517   - # 默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致
518   - instruction_format: standard # compact standard
519   - # instruction: "Rank products by query with category & style match prioritized"
520   - instruction: "Rank products by query with category & style match prioritized"
521   - # instruction: "Given a shopping query, rank products by relevance"
  422 + instruction_format: standard
  423 + instruction: Rank products by query with category & style match prioritized
522 424 qwen3_transformers:
523   - model_name: "Qwen/Qwen3-Reranker-0.6B"
524   - instruction: "rank products by given query"
525   - # instruction: "Score the product’s relevance to the given query"
  425 + model_name: Qwen/Qwen3-Reranker-0.6B
  426 + instruction: rank products by given query
526 427 max_length: 8192
527 428 batch_size: 64
528 429 use_fp16: true
529   - # sdpa:默认无需 flash-attn;若已安装 flash_attn 可改为 flash_attention_2
530   - attn_implementation: "sdpa"
531   - # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask.
532   - # For 1 query + many short docs (for example 400 product titles), this usually reduces
533   - # repeated prefix work and padding waste compared with pairwise batching.
  430 + attn_implementation: sdpa
534 431 qwen3_transformers_packed:
535   - model_name: "Qwen/Qwen3-Reranker-0.6B"
536   - instruction: "Rank products by query with category & style match prioritized"
  432 + model_name: Qwen/Qwen3-Reranker-0.6B
  433 + instruction: Rank products by query with category & style match prioritized
537 434 max_model_len: 256
538 435 max_doc_len: 160
539 436 max_docs_per_pack: 0
540 437 use_fp16: true
541 438 sort_by_doc_length: true
542   - # Packed mode relies on a custom 4D attention mask. "eager" is the safest default.
543   - # If your torch/transformers stack validates it, you can benchmark "sdpa".
544   - attn_implementation: "eager"
  439 + attn_implementation: eager
545 440 qwen3_gguf:
546   - repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
547   - filename: "*Q8_0.gguf"
548   - cache_dir: "./model_cache"
549   - local_dir: "./models/reranker/qwen3-reranker-4b-gguf"
550   - instruction: "Rank products by query with category & style match prioritized"
551   - # T4 16GB / 性能优先配置:全量层 offload,实测比保守配置明显更快
  441 + repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF
  442 + filename: '*Q8_0.gguf'
  443 + cache_dir: ./model_cache
  444 + local_dir: ./models/reranker/qwen3-reranker-4b-gguf
  445 + instruction: Rank products by query with category & style match prioritized
552 446 n_ctx: 512
553 447 n_batch: 512
554 448 n_ubatch: 512
... ... @@ -562,17 +456,15 @@ services:
562 456 use_mlock: false
563 457 infer_batch_size: 8
564 458 sort_by_doc_length: true
565   - length_sort_mode: "char"
  459 + length_sort_mode: char
566 460 enable_warmup: true
567 461 verbose: false
568 462 qwen3_gguf_06b:
569   - repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF"
570   - filename: "qwen3-reranker-0.6b-q8_0.gguf"
571   - cache_dir: "./model_cache"
572   - local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf"
573   - instruction: "Rank products by query with category & style match prioritized"
574   - # 0.6B GGUF / online rerank baseline:
575   - # 实测 400 titles 单请求约 265s,因此它更适合作为低显存功能后备,不适合在线低延迟主路由。
  463 + repo_id: ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF
  464 + filename: qwen3-reranker-0.6b-q8_0.gguf
  465 + cache_dir: ./model_cache
  466 + local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf
  467 + instruction: Rank products by query with category & style match prioritized
576 468 n_ctx: 256
577 469 n_batch: 256
578 470 n_ubatch: 256
... ... @@ -586,54 +478,57 @@ services:
586 478 use_mlock: false
587 479 infer_batch_size: 32
588 480 sort_by_doc_length: true
589   - length_sort_mode: "char"
  481 + length_sort_mode: char
590 482 reuse_query_state: false
591 483 enable_warmup: true
592 484 verbose: false
593 485 dashscope_rerank:
594   - model_name: "qwen3-rerank"
595   - # 按地域选择 endpoint:
596   - # 中国: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
597   - # 新加坡: https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks
598   - # 美国: https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks
599   - endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks"
600   - api_key_env: "RERANK_DASHSCOPE_API_KEY_CN"
601   - timeout_sec: 10.0 #
602   - top_n_cap: 0 # 0 表示 top_n=当前请求文档数;>0 则限制 top_n 上限
603   - batchsize: 64 # 0 关闭;>0 启用并发小包调度(top_n/top_n_cap 仍生效,分包后全局截断)
604   - instruct: "Given a shopping query, rank product titles by relevance"
  486 + model_name: qwen3-rerank
  487 + endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
  488 + api_key_env: RERANK_DASHSCOPE_API_KEY_CN
  489 + timeout_sec: 10.0
  490 + top_n_cap: 0
  491 + batchsize: 64
  492 + instruct: Given a shopping query, rank product titles by relevance
605 493 max_retries: 2
606 494 retry_backoff_sec: 0.2
607   -
608   -# SPU配置(已启用,使用嵌套skus)
609 495 spu_config:
610 496 enabled: true
611   - spu_field: "spu_id"
  497 + spu_field: spu_id
612 498 inner_hits_size: 10
613   - # 配置哪些option维度参与检索(进索引、以及在线搜索)
614   - # 格式为list,选择option1/option2/option3中的一个或多个
615   - searchable_option_dimensions: ['option1', 'option2', 'option3']
616   -
617   -# 租户配置(Tenant Configuration)
618   -# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选)
619   -# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集
  499 + searchable_option_dimensions:
  500 + - option1
  501 + - option2
  502 + - option3
620 503 tenant_config:
621 504 default:
622   - primary_language: "en"
623   - index_languages: ["en", "zh"]
  505 + primary_language: en
  506 + index_languages:
  507 + - en
  508 + - zh
624 509 tenants:
625   - "1":
626   - primary_language: "zh"
627   - index_languages: ["zh", "en"]
628   - "2":
629   - primary_language: "en"
630   - index_languages: ["en", "zh"]
631   - "3":
632   - primary_language: "zh"
633   - index_languages: ["zh", "en"]
634   - "162":
635   - primary_language: "zh"
636   - index_languages: ["zh", "en"]
637   - "170":
638   - primary_language: "en"
639   - index_languages: ["en", "zh"]
  510 + '1':
  511 + primary_language: zh
  512 + index_languages:
  513 + - zh
  514 + - en
  515 + '2':
  516 + primary_language: en
  517 + index_languages:
  518 + - en
  519 + - zh
  520 + '3':
  521 + primary_language: zh
  522 + index_languages:
  523 + - zh
  524 + - en
  525 + '162':
  526 + primary_language: zh
  527 + index_languages:
  528 + - zh
  529 + - en
  530 + '170':
  531 + primary_language: en
  532 + index_languages:
  533 + - en
  534 + - zh
... ...
scripts/evaluation/README.md renamed to scripts/evaluation/README_Requirement.md
scripts/evaluation/README_zh.md renamed to scripts/evaluation/README_Requirement_zh.md
... ... @@ -106,4 +106,18 @@ queries默认是queries/queries.txt,填入左侧列表框,点击其中任何
106 106 批量评估关注的是所有搜索词总体的评估指标。
107 107 需要记录测试环境时间以及当时的配置文件,以及对应的结果。要保存历次的评估记录,并能查到每一次评估结果对应的配置文件及相关的指标
108 108  
109   -以上是我的总体设计,但有不周全的地方。你要站在更高的层次理解我的需求,你有足够的自由可以适当调整设计,基于你所了解的自动化搜索评估框架的最佳实践,做出更优秀的设计和更好的实现。
110 109 \ No newline at end of file
  110 +以上是我的总体设计,但有不周全的地方。你要站在更高的层次理解我的需求,你有足够的自由可以适当调整设计,基于你所了解的自动化搜索评估框架的最佳实践,做出更优秀的设计和更好的实现。
  111 +
  112 +
  113 +
  114 +
  115 +
  116 +
  117 +
  118 +
  119 +
  120 +1. 请仔细检验这个标注集的质量,如果质量不符合要求,那么你要优化工具,迭代直至标注集的结果质量足够高,可以以此为自动化工具来评估检索效果,对检索效果形成指导性意见。
  121 +2. 在结果标注集的质量足够好,批量评估工具足够好用,并且经过你的试用,能判断出搜索质量好坏的情况下,开始真正的动手检索效果调优:基于这个50条query的结果标注集和批量评估工具,对融合公式进行调参。请你先精心地设计实验,设计几组参数,对几组参数分别修改config.yaml、重启(./restart.sh backend)、跑批量评估、收集结果。
  122 +注意评估的过程中,如果发现工具不好用,发现日志不全,发现可以通过修改工具或者日志来提高效率,都可以先做这些改进,并逐步完善。
  123 +注意你是代码的总负责人,你有任何权限来满足你进行检索效果调优的需要。你如果发现有其他可能带来更大提升的点,也可以进行实验,你甚至可以修改融合、重排漏斗的代码,来进行实验,以追求更好的结果指标。
  124 +但是注意,因为受到性能和耗时的约束,不要调大reranker模型的输入条数、不要打开精排,耗时方面无法承受两轮reranker模型的调用。
... ...
scripts/evaluation/build_annotation_set.py 0 → 100644
... ... @@ -0,0 +1,14 @@
  1 +#!/usr/bin/env python3
  2 +
  3 +from pathlib import Path
  4 +import sys
  5 +
  6 +PROJECT_ROOT = Path(__file__).resolve().parents[2]
  7 +if str(PROJECT_ROOT) not in sys.path:
  8 + sys.path.insert(0, str(PROJECT_ROOT))
  9 +
  10 +from scripts.evaluation.eval_framework import main
  11 +
  12 +
  13 +if __name__ == "__main__":
  14 + main()
... ...
scripts/evaluation/eval_framework.py 0 → 100644
... ... @@ -0,0 +1,1786 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Search evaluation framework for pooled relevance annotation, live metrics, and reports.
  4 +"""
  5 +
  6 +from __future__ import annotations
  7 +
  8 +import argparse
  9 +import hashlib
  10 +import json
  11 +import math
  12 +import os
  13 +import re
  14 +import sqlite3
  15 +import sys
  16 +import time
  17 +from dataclasses import dataclass
  18 +from datetime import datetime, timezone
  19 +from pathlib import Path
  20 +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
  21 +
  22 +import requests
  23 +from elasticsearch.helpers import scan
  24 +from fastapi import FastAPI, HTTPException
  25 +from fastapi.responses import HTMLResponse
  26 +from pydantic import BaseModel, Field
  27 +
  28 +PROJECT_ROOT = Path(__file__).resolve().parents[2]
  29 +if str(PROJECT_ROOT) not in sys.path:
  30 + sys.path.insert(0, str(PROJECT_ROOT))
  31 +
  32 +from api.app import get_app_config, get_es_client, get_query_parser, init_service
  33 +from indexer.mapping_generator import get_tenant_index_name
  34 +
  35 +
# Three-level relevance scale used by the LLM judge and all metrics.
RELEVANCE_EXACT = "Exact"
RELEVANCE_PARTIAL = "Partial"
RELEVANCE_IRRELEVANT = "Irrelevant"
# Closed set of labels accepted from the judge and stored in SQLite.
VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}
# Default locations for evaluation artifacts and the query list.
DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt"
# Bump this when judge prompts change so cached query profiles are refreshed.
JUDGE_PROMPT_VERSION = "v2_structured_20260331"
  43 +
  44 +
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string (offset +00:00)."""
    now = datetime.now(timezone.utc)
    return now.isoformat()
  47 +
  48 +
def utc_timestamp() -> str:
    """Return a compact filename-safe UTC timestamp, e.g. 20260331T120000Z."""
    return format(datetime.now(timezone.utc), "%Y%m%dT%H%M%SZ")
  51 +
  52 +
def ensure_dir(path: Path) -> Path:
    """Create `path` (with any missing parents) if needed and return it."""
    if not path.is_dir():
        path.mkdir(parents=True, exist_ok=True)
    return path
  56 +
  57 +
def sha1_text(text: str) -> str:
    """Hex SHA-1 digest of `text` encoded as UTF-8."""
    digest = hashlib.sha1(text.encode("utf-8"))
    return digest.hexdigest()
  60 +
  61 +
def pick_text(value: Any, preferred_lang: str = "en") -> str:
    """Extract a display string from a possibly multilingual value.

    For dicts, tries `preferred_lang`, then "en", then "zh", then the first
    truthy value; otherwise stringifies the input. Always returns a stripped
    string ("" for None).
    """
    if value is None:
        return ""
    if not isinstance(value, dict):
        return str(value).strip()
    chosen = value.get(preferred_lang) or value.get("en") or value.get("zh")
    if not chosen:
        chosen = next((candidate for candidate in value.values() if candidate), "")
    return str(chosen).strip()
  73 +
  74 +
def safe_json_dumps(data: Any) -> str:
    """Serialize `data` as compact JSON without escaping non-ASCII text."""
    compact_separators = (",", ":")
    return json.dumps(data, ensure_ascii=False, separators=compact_separators)
  77 +
  78 +
def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
    """Return (option1, option2, option3) values from the first SKU.

    Missing SKUs or missing values yield empty strings; values are stripped.
    """
    if not skus:
        return "", "", ""
    first_sku = skus[0] or {}
    values = []
    for key in ("option1_value", "option2_value", "option3_value"):
        values.append(str(first_sku.get(key) or "").strip())
    return values[0], values[1], values[2]
  88 +
  89 +
def build_display_title(doc: Dict[str, Any]) -> str:
    """Human-readable title: "en / zh" when both exist and differ, else whichever exists."""
    raw_title = doc.get("title")
    english = pick_text(raw_title, "en")
    chinese = pick_text(raw_title, "zh")
    if english and chinese and english != chinese:
        return f"{english} / {chinese}"
    return english or chinese
  97 +
  98 +
def build_rerank_doc(doc: Dict[str, Any]) -> str:
    """Text sent to the reranker for one product: display title capped at 400 chars."""
    return build_display_title(doc)[:400]
  102 +
  103 +
def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """Render one numbered, pipe-separated product line for the judge prompt.

    Field order: title, option1-3, vendor, category, first 4 tags; empty
    fields are omitted.
    """
    option1, option2, option3 = compact_option_values(doc.get("skus") or [])
    fields = [
        build_display_title(doc),
        f"option1={option1}" if option1 else "",
        f"option2={option2}" if option2 else "",
        f"option3={option3}" if option3 else "",
    ]
    vendor = pick_text(doc.get("vendor"), "en")
    if vendor:
        fields.append(f"vendor={vendor}")
    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    if category:
        fields.append(f"category={category}")
    tags = doc.get("tags") or []
    tags_text = ", ".join(str(tag) for tag in tags[:4] if tag)
    if tags_text:
        fields.append(f"tags={tags_text}")
    return f"{idx}. " + " | ".join(field for field in fields if field)
  125 +
  126 +
def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Small JSON-friendly product summary stored in annotation artifacts."""
    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    payload = {
        "spu_id": str(doc.get("spu_id") or ""),
        "title": build_display_title(doc),
        "image_url": doc.get("image_url"),
        "vendor": pick_text(doc.get("vendor"), "en"),
        "category": category,
        "option_values": list(compact_option_values(doc.get("skus") or [])),
        "tags": list((doc.get("tags") or [])[:6]),
    }
    return payload
  137 +
  138 +
def normalize_text(text: Any) -> str:
    """Lowercase, trim, and collapse internal whitespace to single spaces."""
    collapsed = re.sub(r"\s+", " ", str(text or "").strip())
    return collapsed.lower()
  143 +
  144 +
  145 +def _extract_json_blob(text: str) -> Any:
  146 + cleaned = str(text or "").strip()
  147 + candidates: List[str] = [cleaned]
  148 + fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I)
  149 + candidates.extend(match.strip() for match in fence_matches if match.strip())
  150 +
  151 + for candidate in candidates:
  152 + try:
  153 + return json.loads(candidate)
  154 + except Exception:
  155 + pass
  156 +
  157 + starts = [idx for idx, ch in enumerate(cleaned) if ch in "[{"]
  158 + ends = [idx for idx, ch in enumerate(cleaned) if ch in "]}"]
  159 + for start in starts:
  160 + for end in reversed(ends):
  161 + if end <= start:
  162 + continue
  163 + fragment = cleaned[start : end + 1]
  164 + try:
  165 + return json.loads(fragment)
  166 + except Exception:
  167 + continue
  168 + raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")
  169 +
  170 +
@dataclass
class QueryBuildResult:
    """Summary of building the pooled annotation set for one query."""
    query: str
    tenant_id: str
    # NOTE(review): presumably the engine-reported total hit count — confirm at call site.
    search_total: int
    # Number of ranked results pulled for pooling (set by the caller).
    search_depth: int
    rerank_corpus_size: int
    annotated_count: int
    output_json_path: Path
  180 +
  181 +
class EvalStore:
    """SQLite-backed persistence for the evaluation pipeline.

    Stores the cached corpus snapshot, reranker scores, LLM relevance labels,
    per-query intent profiles, and the history of build/batch runs, so that
    repeated evaluations can reuse expensive annotation work. All upserts use
    `ON CONFLICT ... DO UPDATE` (last write wins per primary key).
    """

    def __init__(self, db_path: Path):
        """Open (creating if needed) the DB at `db_path` and ensure the schema."""
        self.db_path = db_path
        ensure_dir(db_path.parent)
        # check_same_thread=False shares one connection across threads.
        # NOTE(review): there is no explicit locking here — confirm writers
        # are serialized by the callers.
        self.conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self.conn.row_factory = sqlite3.Row
        self._init_schema()

    def _init_schema(self) -> None:
        """Create all tables if missing (idempotent via IF NOT EXISTS)."""
        self.conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS corpus_docs (
                tenant_id TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                title_json TEXT,
                vendor_json TEXT,
                category_path_json TEXT,
                category_name_json TEXT,
                image_url TEXT,
                skus_json TEXT,
                tags_json TEXT,
                raw_json TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, spu_id)
            );

            CREATE TABLE IF NOT EXISTS rerank_scores (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                score REAL NOT NULL,
                model_name TEXT,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, spu_id)
            );

            CREATE TABLE IF NOT EXISTS relevance_labels (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                label TEXT NOT NULL,
                judge_model TEXT,
                raw_response TEXT,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, spu_id)
            );

            CREATE TABLE IF NOT EXISTS build_runs (
                run_id TEXT PRIMARY KEY,
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                output_json_path TEXT NOT NULL,
                metadata_json TEXT NOT NULL,
                created_at TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS batch_runs (
                batch_id TEXT PRIMARY KEY,
                tenant_id TEXT NOT NULL,
                output_json_path TEXT NOT NULL,
                report_markdown_path TEXT NOT NULL,
                config_snapshot_path TEXT NOT NULL,
                metadata_json TEXT NOT NULL,
                created_at TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS query_profiles (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                prompt_version TEXT NOT NULL,
                judge_model TEXT,
                profile_json TEXT NOT NULL,
                raw_response TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, prompt_version)
            );
            """
        )
        self.conn.commit()

    def upsert_corpus_docs(self, tenant_id: str, docs: Sequence[Dict[str, Any]]) -> None:
        """Insert or refresh corpus docs for `tenant_id` (keyed by spu_id)."""
        now = utc_now_iso()
        rows = []
        for doc in docs:
            rows.append(
                (
                    tenant_id,
                    str(doc.get("spu_id") or ""),
                    safe_json_dumps(doc.get("title")),
                    safe_json_dumps(doc.get("vendor")),
                    safe_json_dumps(doc.get("category_path")),
                    safe_json_dumps(doc.get("category_name")),
                    str(doc.get("image_url") or ""),
                    safe_json_dumps(doc.get("skus") or []),
                    safe_json_dumps(doc.get("tags") or []),
                    safe_json_dumps(doc),
                    now,
                )
            )
        self.conn.executemany(
            """
            INSERT INTO corpus_docs (
                tenant_id, spu_id, title_json, vendor_json, category_path_json, category_name_json,
                image_url, skus_json, tags_json, raw_json, updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, spu_id) DO UPDATE SET
                title_json=excluded.title_json,
                vendor_json=excluded.vendor_json,
                category_path_json=excluded.category_path_json,
                category_name_json=excluded.category_name_json,
                image_url=excluded.image_url,
                skus_json=excluded.skus_json,
                tags_json=excluded.tags_json,
                raw_json=excluded.raw_json,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_corpus_docs(self, tenant_id: str) -> List[Dict[str, Any]]:
        """Return all cached corpus docs for a tenant, ordered by spu_id."""
        rows = self.conn.execute(
            "SELECT raw_json FROM corpus_docs WHERE tenant_id=? ORDER BY spu_id",
            (tenant_id,),
        ).fetchall()
        return [json.loads(row["raw_json"]) for row in rows]

    def get_corpus_docs_by_spu_ids(self, tenant_id: str, spu_ids: Sequence[str]) -> Dict[str, Dict[str, Any]]:
        """Fetch docs for the given spu_ids; blank ids are skipped.

        Returns a mapping spu_id -> raw doc; ids with no row are absent.
        """
        keys = [str(spu_id) for spu_id in spu_ids if str(spu_id).strip()]
        if not keys:
            return {}
        # Placeholders are generated per key; values are still bound safely.
        placeholders = ",".join("?" for _ in keys)
        rows = self.conn.execute(
            f"""
            SELECT spu_id, raw_json
            FROM corpus_docs
            WHERE tenant_id=? AND spu_id IN ({placeholders})
            """,
            [tenant_id, *keys],
        ).fetchall()
        return {
            str(row["spu_id"]): json.loads(row["raw_json"])
            for row in rows
        }

    def has_corpus(self, tenant_id: str) -> bool:
        """True when at least one corpus doc is cached for the tenant."""
        row = self.conn.execute(
            "SELECT COUNT(1) AS n FROM corpus_docs WHERE tenant_id=?",
            (tenant_id,),
        ).fetchone()
        return bool(row and row["n"] > 0)

    def get_rerank_scores(self, tenant_id: str, query_text: str) -> Dict[str, float]:
        """Return cached reranker scores for one query as spu_id -> score."""
        rows = self.conn.execute(
            """
            SELECT spu_id, score
            FROM rerank_scores
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchall()
        return {str(row["spu_id"]): float(row["score"]) for row in rows}

    def upsert_rerank_scores(
        self,
        tenant_id: str,
        query_text: str,
        scores: Dict[str, float],
        model_name: str,
    ) -> None:
        """Insert or refresh reranker scores for one (tenant, query)."""
        now = utc_now_iso()
        rows = [
            (tenant_id, query_text, spu_id, float(score), model_name, now)
            for spu_id, score in scores.items()
        ]
        self.conn.executemany(
            """
            INSERT INTO rerank_scores (tenant_id, query_text, spu_id, score, model_name, updated_at)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET
                score=excluded.score,
                model_name=excluded.model_name,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_labels(self, tenant_id: str, query_text: str) -> Dict[str, str]:
        """Return stored relevance labels for one query as spu_id -> label."""
        rows = self.conn.execute(
            """
            SELECT spu_id, label
            FROM relevance_labels
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchall()
        return {str(row["spu_id"]): str(row["label"]) for row in rows}

    def upsert_labels(
        self,
        tenant_id: str,
        query_text: str,
        labels: Dict[str, str],
        judge_model: str,
        raw_response: str,
    ) -> None:
        """Insert or refresh labels for one query.

        Raises ValueError when any label is outside VALID_LABELS; nothing is
        written in that case because validation happens before executemany.
        """
        now = utc_now_iso()
        rows = []
        for spu_id, label in labels.items():
            if label not in VALID_LABELS:
                raise ValueError(f"invalid label: {label}")
            rows.append((tenant_id, query_text, spu_id, label, judge_model, raw_response, now))
        self.conn.executemany(
            """
            INSERT INTO relevance_labels (tenant_id, query_text, spu_id, label, judge_model, raw_response, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET
                label=excluded.label,
                judge_model=excluded.judge_model,
                raw_response=excluded.raw_response,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_query_profile(self, tenant_id: str, query_text: str, prompt_version: str) -> Optional[Dict[str, Any]]:
        """Return the cached intent profile for (query, prompt version), or None."""
        row = self.conn.execute(
            """
            SELECT profile_json
            FROM query_profiles
            WHERE tenant_id=? AND query_text=? AND prompt_version=?
            """,
            (tenant_id, query_text, prompt_version),
        ).fetchone()
        if not row:
            return None
        return json.loads(row["profile_json"])

    def upsert_query_profile(
        self,
        tenant_id: str,
        query_text: str,
        prompt_version: str,
        judge_model: str,
        profile: Dict[str, Any],
        raw_response: str,
    ) -> None:
        """Store (replacing any previous) the intent profile for one query."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO query_profiles
            (tenant_id, query_text, prompt_version, judge_model, profile_json, raw_response, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                tenant_id,
                query_text,
                prompt_version,
                judge_model,
                safe_json_dumps(profile),
                raw_response,
                utc_now_iso(),
            ),
        )
        self.conn.commit()

    def insert_build_run(self, run_id: str, tenant_id: str, query_text: str, output_json_path: Path, metadata: Dict[str, Any]) -> None:
        """Record one per-query annotation build run (replaces on same run_id)."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO build_runs (run_id, tenant_id, query_text, output_json_path, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (run_id, tenant_id, query_text, str(output_json_path), safe_json_dumps(metadata), utc_now_iso()),
        )
        self.conn.commit()

    def insert_batch_run(
        self,
        batch_id: str,
        tenant_id: str,
        output_json_path: Path,
        report_markdown_path: Path,
        config_snapshot_path: Path,
        metadata: Dict[str, Any],
    ) -> None:
        """Record one batch evaluation run with its artifact paths."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO batch_runs
            (batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                batch_id,
                tenant_id,
                str(output_json_path),
                str(report_markdown_path),
                str(config_snapshot_path),
                safe_json_dumps(metadata),
                utc_now_iso(),
            ),
        )
        self.conn.commit()

    def list_batch_runs(self, limit: int = 20) -> List[Dict[str, Any]]:
        """Return the most recent batch runs (newest first), metadata decoded."""
        rows = self.conn.execute(
            """
            SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
            FROM batch_runs
            ORDER BY created_at DESC
            LIMIT ?
            """,
            (limit,),
        ).fetchall()
        items: List[Dict[str, Any]] = []
        for row in rows:
            items.append(
                {
                    "batch_id": row["batch_id"],
                    "tenant_id": row["tenant_id"],
                    "output_json_path": row["output_json_path"],
                    "report_markdown_path": row["report_markdown_path"],
                    "config_snapshot_path": row["config_snapshot_path"],
                    "metadata": json.loads(row["metadata_json"]),
                    "created_at": row["created_at"],
                }
            )
        return items

    def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]:
        """Per-query label counts (Exact/Partial/Irrelevant) for a tenant."""
        rows = self.conn.execute(
            """
            SELECT
                query_text,
                COUNT(*) AS total,
                SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
                SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
                SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
                MAX(updated_at) AS updated_at
            FROM relevance_labels
            WHERE tenant_id=?
            GROUP BY query_text
            ORDER BY query_text
            """,
            (tenant_id,),
        ).fetchall()
        return [
            {
                "query": str(row["query_text"]),
                "total": int(row["total"]),
                "exact_count": int(row["exact_count"] or 0),
                "partial_count": int(row["partial_count"] or 0),
                "irrelevant_count": int(row["irrelevant_count"] or 0),
                "updated_at": row["updated_at"],
            }
            for row in rows
        ]

    def get_query_label_stats(self, tenant_id: str, query_text: str) -> Dict[str, Any]:
        """Label counts for a single query; zeros when nothing is labeled yet."""
        row = self.conn.execute(
            """
            SELECT
                COUNT(*) AS total,
                SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
                SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
                SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
                MAX(updated_at) AS updated_at
            FROM relevance_labels
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchone()
        return {
            "query": query_text,
            "total": int((row["total"] or 0) if row else 0),
            "exact_count": int((row["exact_count"] or 0) if row else 0),
            "partial_count": int((row["partial_count"] or 0) if row else 0),
            "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0),
            "updated_at": row["updated_at"] if row else None,
        }
  562 +
  563 +
class SearchServiceClient:
    """Thin HTTP client for the search API, pinned to one tenant."""

    def __init__(self, base_url: str, tenant_id: str):
        self.base_url = base_url.rstrip("/")
        self.tenant_id = str(tenant_id)
        self.session = requests.Session()

    def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]:
        """POST one search request and return the decoded JSON body.

        Raises for non-2xx responses via raise_for_status().
        """
        request_headers = {"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}
        request_body = {"query": query, "size": size, "from": from_, "language": language}
        response = self.session.post(
            f"{self.base_url}/search/",
            headers=request_headers,
            json=request_body,
            timeout=120,
        )
        response.raise_for_status()
        return response.json()
  579 +
  580 +
class RerankServiceClient:
    """HTTP client for the standalone rerank service."""

    def __init__(self, service_url: str):
        self.service_url = service_url.rstrip("/")
        self.session = requests.Session()

    def rerank(self, query: str, docs: Sequence[str], normalize: bool = False, top_n: Optional[int] = None) -> Tuple[List[float], Dict[str, Any]]:
        """Score `docs` against `query`; returns (scores, meta) from the service.

        `top_n` is forwarded only when provided. Raises for non-2xx responses.
        """
        request_body: Dict[str, Any] = {
            "query": query,
            "docs": list(docs),
            "normalize": normalize,
        }
        if top_n is not None:
            request_body["top_n"] = int(top_n)
        response = self.session.post(self.service_url, json=request_body, timeout=180)
        response.raise_for_status()
        data = response.json()
        scores = list(data.get("scores") or [])
        meta = dict(data.get("meta") or {})
        return scores, meta
  598 +
  599 +
class DashScopeLabelClient:
    """LLM judge client (OpenAI-compatible chat API) for query profiling and labeling."""

    def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40):
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        # NOTE(review): batch_size is stored but not read inside this class —
        # presumably the caller chunks docs with it; confirm.
        self.batch_size = int(batch_size)
        self.session = requests.Session()

    def _chat(self, prompt: str) -> Tuple[str, str]:
        """Send one single-message chat completion; return (content, raw response JSON).

        Uses temperature 0 / top_p 0.1 to keep judge output stable across runs.
        """
        response = self.session.post(
            f"{self.base_url}/chat/completions",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0,
                "top_p": 0.1,
            },
            timeout=180,
        )
        response.raise_for_status()
        data = response.json()
        # Defensive extraction: tolerate missing choices/message/content.
        content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
        return content, safe_json_dumps(data)

    def extract_query_profile(
        self,
        query: str,
        parser_hints: Dict[str, Any],
    ) -> Tuple[Dict[str, Any], str]:
        """Ask the judge for a structured intent profile of `query`.

        Returns (profile dict with all schema keys defaulted, raw response).
        Raises ValueError when the model's output is not a JSON object.
        """
        prompt = (
            "You are building a structured intent profile for e-commerce relevance judging.\n"
            "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n"
            "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n"
            "Return JSON with this schema:\n"
            "{\n"
            '  "normalized_query_en": string,\n'
            '  "primary_category": string,\n'
            '  "allowed_categories": [string],\n'
            '  "required_attributes": [\n'
            '    {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n'
            "  ],\n"
            '  "notes": [string]\n'
            "}\n\n"
            "Guidelines:\n"
            "- Exact later will require explicit evidence for all required attributes.\n"
            "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n"
            "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n"
            "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n"
            "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n"
            "- For color, include conflicting colors only when clear from the query.\n\n"
            f"Original query: {query}\n"
            f"Parser hints JSON: {json.dumps(parser_hints, ensure_ascii=False)}\n"
        )
        content, raw_response = self._chat(prompt)
        payload = _extract_json_blob(content)
        if not isinstance(payload, dict):
            raise ValueError(f"unexpected query profile payload: {content!r}")
        # Guarantee every schema key exists even if the model omitted it.
        payload.setdefault("normalized_query_en", query)
        payload.setdefault("primary_category", "")
        payload.setdefault("allowed_categories", [])
        payload.setdefault("required_attributes", [])
        payload.setdefault("notes", [])
        return payload, raw_response

    def classify_batch(
        self,
        query: str,
        query_profile: Dict[str, Any],
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Label each doc Exact/Partial/Irrelevant against the query profile.

        Returns (labels aligned with `docs`, raw response). Raises ValueError
        unless the model returns exactly one valid label per doc.
        Labels are taken in response order; the model's "index" field is
        not used for re-alignment.
        """
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = (
            "You are an e-commerce search relevance judge.\n"
            "Judge each product against the structured query profile below.\n\n"
            "Relevance rules:\n"
            "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n"
            "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n"
            "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n"
            "- Be conservative with Exact.\n"
            "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n"
            "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n"
            f"Original query: {query}\n"
            f"Structured query profile JSON: {json.dumps(query_profile, ensure_ascii=False)}\n\n"
            "Products:\n"
            + "\n".join(numbered_docs)
            + "\n\nReturn JSON only, with schema:\n"
            '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n'
        )
        content, raw_response = self._chat(prompt)
        payload = _extract_json_blob(content)
        if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):
            raise ValueError(f"unexpected label payload: {content!r}")
        labels_payload = payload["labels"]
        labels: List[str] = []
        for item in labels_payload[: len(docs)]:
            if not isinstance(item, dict):
                continue
            label = str(item.get("label") or "").strip()
            if label in VALID_LABELS:
                labels.append(label)
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected label output: {content!r}")
        return labels, raw_response
  707 +
  708 +
def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
    """Fraction of the top-k labels that fall in `relevant`.

    Divides by the number of labels actually present in the top-k window,
    so short result lists are not penalized for missing positions.
    Returns 0.0 for k <= 0 or an empty window.
    """
    if k <= 0:
        return 0.0
    window = list(labels[:k])
    if not window:
        return 0.0
    relevant_set = set(relevant)
    hits = sum(1 for label in window if label in relevant_set)
    return hits / float(len(window))
  717 +
  718 +
def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
    """Mean of precision values at each relevant rank; 0.0 when no hits."""
    relevant_set = set(relevant)
    hits = 0
    per_hit_precisions: List[float] = []
    for rank, label in enumerate(labels, start=1):
        if label not in relevant_set:
            continue
        hits += 1
        per_hit_precisions.append(hits / rank)
    if not per_hit_precisions:
        return 0.0
    return sum(per_hit_precisions) / len(per_hit_precisions)
  730 +
  731 +
def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:
    """Per-query precision/MAP metrics.

    Plain metrics count only Exact as relevant; the `_2_3` variants also
    count Partial. Values are rounded to 6 decimal places.
    """
    exact_only = [RELEVANCE_EXACT]
    exact_or_partial = [RELEVANCE_EXACT, RELEVANCE_PARTIAL]
    metrics: Dict[str, float] = {}
    for k in (5, 10, 20, 50):
        metrics[f"P@{k}"] = round(precision_at_k(labels, k, exact_only), 6)
        metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, exact_or_partial), 6)
    metrics["MAP_3"] = round(average_precision(labels, exact_only), 6)
    metrics["MAP_2_3"] = round(average_precision(labels, exact_or_partial), 6)
    return metrics
  740 +
  741 +
def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]:
    """Macro-average each metric across queries.

    Keys are taken (sorted) from the first item; items missing a key
    contribute 0.0. Returns {} for an empty input.
    """
    if not metric_items:
        return {}
    count = len(metric_items)
    averaged: Dict[str, float] = {}
    for key in sorted(metric_items[0].keys()):
        total = sum(float(item.get(key, 0.0)) for item in metric_items)
        averaged[key] = round(total / count, 6)
    return averaged
  750 +
  751 +
def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
    """Count how many labels are Exact / Partial / Irrelevant (others ignored)."""
    counts = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 0, RELEVANCE_IRRELEVANT: 0}
    for label in labels:
        if label in counts:
            counts[label] += 1
    return counts
  758 +
  759 +
  760 +class SearchEvaluationFramework:
    def __init__(
        self,
        tenant_id: str,
        artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
        search_base_url: str = "http://localhost:6002",
    ):
        """Wire up the store and search/rerank/LLM clients for one tenant.

        Raises RuntimeError when no DashScope API key is configured, since
        annotation requires the LLM judge.
        """
        # Initialize the shared service layer against the configured ES host.
        init_service(get_app_config().infrastructure.elasticsearch.host)
        self.tenant_id = str(tenant_id)
        self.artifact_root = ensure_dir(artifact_root)
        self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
        self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
        app_cfg = get_app_config()
        # Rerank endpoint comes from the default "http" provider instance.
        rerank_service_url = str(
            app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
        )
        self.rerank_client = RerankServiceClient(rerank_service_url)
        # The judge reuses the translation service's LLM model/base_url config.
        llm_cfg = app_cfg.services.translation.capabilities["llm"]
        api_key = app_cfg.infrastructure.secrets.dashscope_api_key
        if not api_key:
            raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
        self.label_client = DashScopeLabelClient(
            model=str(llm_cfg["model"]),
            base_url=str(llm_cfg["base_url"]),
            api_key=str(api_key),
        )
        self.query_parser = get_query_parser()
  787 +
  788 + def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
  789 + parsed = self.query_parser.parse(query, generate_vector=False, target_languages=["en", "zh"])
  790 + payload = parsed.to_dict()
  791 + payload["text_for_rerank"] = parsed.text_for_rerank()
  792 + return payload
  793 +
  794 + def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
  795 + if not force_refresh:
  796 + cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION)
  797 + if cached is not None:
  798 + return cached
  799 + parser_hints = self.build_query_parser_hints(query)
  800 + profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)
  801 + profile["parser_hints"] = parser_hints
  802 + self.store.upsert_query_profile(
  803 + self.tenant_id,
  804 + query,
  805 + JUDGE_PROMPT_VERSION,
  806 + self.label_client.model,
  807 + profile,
  808 + raw_response,
  809 + )
  810 + return profile
  811 +
  812 + @staticmethod
  813 + def _doc_evidence_text(doc: Dict[str, Any]) -> str:
  814 + pieces: List[str] = [
  815 + build_display_title(doc),
  816 + pick_text(doc.get("vendor"), "en"),
  817 + pick_text(doc.get("category_path"), "en"),
  818 + pick_text(doc.get("category_name"), "en"),
  819 + ]
  820 + for sku in doc.get("skus") or []:
  821 + pieces.extend(
  822 + [
  823 + str(sku.get("option1_value") or ""),
  824 + str(sku.get("option2_value") or ""),
  825 + str(sku.get("option3_value") or ""),
  826 + ]
  827 + )
  828 + for tag in doc.get("tags") or []:
  829 + pieces.append(str(tag))
  830 + return normalize_text(" | ".join(piece for piece in pieces if piece))
  831 +
    def _apply_rule_based_label_guardrails(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> str:
        """Post-process an LLM label with deterministic evidence checks.

        Demotes Exact -> Partial when required evidence is missing, and to
        Irrelevant on explicit term conflicts or total category mismatch.
        Labels outside VALID_LABELS are returned unchanged. All matching is
        plain substring search over the normalized evidence text.
        """
        if label not in VALID_LABELS:
            return label
        evidence = self._doc_evidence_text(doc)
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]

        # Empty profile fields are treated as "matches" (no constraint).
        primary_category_match = True
        if category:
            primary_category_match = category in evidence
        allowed_category_match = True
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        # Exact without the primary category: soften to Partial if a
        # near-synonym category matched, otherwise reject outright.
        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                label = RELEVANCE_PARTIAL
            else:
                return RELEVANCE_IRRELEVANT

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            # Only a whitelist of attribute names is enforced by rules.
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            # Hard-coded fit antonyms augment whatever the profile provided.
            if attr_name == "fit":
                if any(term in {"oversized", "oversize"} for term in required_terms):
                    conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
                if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):
                    conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])
            # No required terms means the attribute cannot fail the check.
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict:
                return RELEVANCE_IRRELEVANT
            if label == RELEVANCE_EXACT and not has_required:
                label = RELEVANCE_PARTIAL

        # A Partial with no category evidence at all is not relevant.
        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            return RELEVANCE_IRRELEVANT

        return label
  882 +
  883 + @staticmethod
  884 + def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]:
  885 + option_values = list(item.get("option_values") or [])
  886 + while len(option_values) < 3:
  887 + option_values.append("")
  888 + product = dict(item.get("product") or {})
  889 + return {
  890 + "spu_id": item.get("spu_id"),
  891 + "title": product.get("title") or item.get("title"),
  892 + "vendor": product.get("vendor"),
  893 + "category_path": product.get("category"),
  894 + "category_name": product.get("category"),
  895 + "image_url": item.get("image_url") or product.get("image_url"),
  896 + "tags": product.get("tags") or [],
  897 + "skus": [
  898 + {
  899 + "option1_value": option_values[0],
  900 + "option2_value": option_values[1],
  901 + "option3_value": option_values[2],
  902 + }
  903 + ],
  904 + }
  905 +
    def _collect_label_issues(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> List[str]:
        """Return human-readable audit findings for a labeled doc (empty = clean).

        Unlike _apply_rule_based_label_guardrails this never changes the label;
        it only reports category/attribute mismatches for review.
        """
        evidence = self._doc_evidence_text(doc)
        issues: List[str] = []
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [
            normalize_text(item)
            for item in query_profile.get("allowed_categories") or []
            if str(item).strip()
        ]

        primary_category_match = True if not category else category in evidence
        # With no allowed-categories list, the allowed match mirrors the primary
        # match (the False default below is always overwritten when the list is
        # non-empty).
        allowed_category_match = False if allowed_categories else primary_category_match
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                issues.append("Exact missing primary category evidence")
            else:
                issues.append("Exact has category mismatch")

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            issues.append("Partial has category mismatch")

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            # NOTE(review): whitelist lacks waist_style/rise which the guardrail
            # method enforces — confirm whether this divergence is intended.
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            # Without required terms, the attribute cannot fault an Exact label.
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict and label != RELEVANCE_IRRELEVANT:
                issues.append(f"{label} conflicts on {attr_name}")
            if label == RELEVANCE_EXACT and not has_required:
                issues.append(f"Exact missing {attr_name}")
        return issues
  951 +
  952 + def audit_live_query(
  953 + self,
  954 + query: str,
  955 + *,
  956 + top_k: int = 100,
  957 + language: str = "en",
  958 + auto_annotate: bool = True,
  959 + ) -> Dict[str, Any]:
  960 + live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
  961 + query_profile = self.get_query_profile(query, force_refresh=False)
  962 + suspicious: List[Dict[str, Any]] = []
  963 +
  964 + for item in live["results"]:
  965 + doc = self._result_item_to_doc(item)
  966 + issues = self._collect_label_issues(item["label"] or "", query_profile, doc)
  967 + suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc)
  968 + if suggested_label != (item["label"] or ""):
  969 + issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"]
  970 + if issues:
  971 + suspicious.append(
  972 + {
  973 + "rank": item["rank"],
  974 + "spu_id": item["spu_id"],
  975 + "title": item["title"],
  976 + "label": item["label"],
  977 + "suggested_label": suggested_label,
  978 + "issues": issues,
  979 + }
  980 + )
  981 +
  982 + labels = [
  983 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  984 + for item in live["results"]
  985 + ]
  986 + return {
  987 + "query": query,
  988 + "tenant_id": self.tenant_id,
  989 + "top_k": top_k,
  990 + "metrics": live["metrics"],
  991 + "distribution": label_distribution(labels),
  992 + "query_profile": query_profile,
  993 + "suspicious": suspicious,
  994 + "results": live["results"],
  995 + }
  996 +
  997 + def queries_from_file(self, path: Path) -> List[str]:
  998 + return [
  999 + line.strip()
  1000 + for line in path.read_text(encoding="utf-8").splitlines()
  1001 + if line.strip() and not line.strip().startswith("#")
  1002 + ]
  1003 +
    def corpus_docs(self, refresh: bool = False) -> List[Dict[str, Any]]:
        """Return the tenant's full product corpus, caching it in the eval store.

        On a cache miss (or when refresh=True) the corpus is scrolled out of the
        tenant's Elasticsearch index with only the fields needed for display,
        reranking, and evidence checks, then persisted to the store.

        Args:
            refresh: When True, bypass the stored corpus and re-scan ES.
        """
        if not refresh and self.store.has_corpus(self.tenant_id):
            return self.store.get_corpus_docs(self.tenant_id)

        es_client = get_es_client().client
        index_name = get_tenant_index_name(self.tenant_id)
        docs: List[Dict[str, Any]] = []
        # helpers.scan streams all hits via the scroll API; ordering is not needed.
        for hit in scan(
            client=es_client,
            index=index_name,
            query={
                "_source": [
                    "spu_id",
                    "title",
                    "vendor",
                    "category_path",
                    "category_name",
                    "image_url",
                    "skus",
                    "tags",
                ],
                "query": {"match_all": {}},
            },
            size=500,
            preserve_order=False,
            clear_scroll=True,
        ):
            source = dict(hit.get("_source") or {})
            # Fall back to the ES _id when spu_id is missing; downstream keys are strings.
            source["spu_id"] = str(source.get("spu_id") or hit.get("_id") or "")
            docs.append(source)
        self.store.upsert_corpus_docs(self.tenant_id, docs)
        return docs
  1036 +
  1037 + def full_corpus_rerank(
  1038 + self,
  1039 + query: str,
  1040 + docs: Sequence[Dict[str, Any]],
  1041 + batch_size: int = 24,
  1042 + force_refresh: bool = False,
  1043 + ) -> List[Dict[str, Any]]:
  1044 + cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query)
  1045 + pending: List[Dict[str, Any]] = [doc for doc in docs if str(doc.get("spu_id")) not in cached]
  1046 + if pending:
  1047 + new_scores: Dict[str, float] = {}
  1048 + for start in range(0, len(pending), batch_size):
  1049 + batch = pending[start : start + batch_size]
  1050 + scores = self._rerank_batch_with_retry(query=query, docs=batch)
  1051 + if len(scores) != len(batch):
  1052 + raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs")
  1053 + for doc, score in zip(batch, scores):
  1054 + new_scores[str(doc.get("spu_id"))] = float(score)
  1055 + self.store.upsert_rerank_scores(
  1056 + self.tenant_id,
  1057 + query,
  1058 + new_scores,
  1059 + model_name="qwen3_vllm_score",
  1060 + )
  1061 + cached.update(new_scores)
  1062 +
  1063 + ranked = []
  1064 + for doc in docs:
  1065 + spu_id = str(doc.get("spu_id"))
  1066 + ranked.append({"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc})
  1067 + ranked.sort(key=lambda item: item["score"], reverse=True)
  1068 + return ranked
  1069 +
  1070 + def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]:
  1071 + if not docs:
  1072 + return []
  1073 + doc_texts = [build_rerank_doc(doc) for doc in docs]
  1074 + try:
  1075 + scores, _meta = self.rerank_client.rerank(query=query, docs=doc_texts, normalize=False)
  1076 + return scores
  1077 + except Exception:
  1078 + if len(docs) == 1:
  1079 + return [-1.0]
  1080 + if len(docs) <= 6:
  1081 + scores: List[float] = []
  1082 + for doc in docs:
  1083 + scores.extend(self._rerank_batch_with_retry(query, [doc]))
  1084 + return scores
  1085 + mid = len(docs) // 2
  1086 + left = self._rerank_batch_with_retry(query, docs[:mid])
  1087 + right = self._rerank_batch_with_retry(query, docs[mid:])
  1088 + return left + right
  1089 +
    def annotate_missing_labels(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        force_refresh: bool = False,
    ) -> Dict[str, str]:
        """Label any docs that do not yet have a stored relevance label for this query.

        Labels come from the LLM judge (label_client) and are post-processed with
        the rule-based guardrails before being persisted.

        Args:
            query: Search query the labels are judged against.
            docs: Candidate documents; only those without a stored label are sent.
            force_refresh: When True, ignore stored labels (and refresh the query
                profile) and re-annotate everything.

        Returns:
            The full spu_id -> label mapping (pre-existing plus newly stored).
        """
        query_profile = self.get_query_profile(query, force_refresh=force_refresh)
        labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
        missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
        if not missing_docs:
            return labels

        for start in range(0, len(missing_docs), self.label_client.batch_size):
            batch = missing_docs[start : start + self.label_client.batch_size]
            # _classify_with_retry may bisect the batch, so results arrive as
            # (labels, raw_response, sub_batch) triples.
            batch_pairs = self._classify_with_retry(query, query_profile, batch)
            for sub_labels, raw_response, sub_batch in batch_pairs:
                # Guardrails are applied before persisting so stored labels are final.
                to_store = {
                    str(doc.get("spu_id")): self._apply_rule_based_label_guardrails(label, query_profile, doc)
                    for doc, label in zip(sub_batch, sub_labels)
                }
                self.store.upsert_labels(
                    self.tenant_id,
                    query,
                    to_store,
                    judge_model=self.label_client.model,
                    raw_response=raw_response,
                )
                labels.update(to_store)
            # Light throttle between judge batches.
            time.sleep(0.1)
        return labels
  1120 +
  1121 + def _classify_with_retry(
  1122 + self,
  1123 + query: str,
  1124 + query_profile: Dict[str, Any],
  1125 + docs: Sequence[Dict[str, Any]],
  1126 + ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
  1127 + if not docs:
  1128 + return []
  1129 + try:
  1130 + labels, raw_response = self.label_client.classify_batch(query, query_profile, docs)
  1131 + return [(labels, raw_response, docs)]
  1132 + except Exception:
  1133 + if len(docs) == 1:
  1134 + raise
  1135 + mid = len(docs) // 2
  1136 + return self._classify_with_retry(query, query_profile, docs[:mid]) + self._classify_with_retry(query, query_profile, docs[mid:])
  1137 +
  1138 + def build_query_annotation_set(
  1139 + self,
  1140 + query: str,
  1141 + *,
  1142 + search_depth: int = 1000,
  1143 + rerank_depth: int = 10000,
  1144 + annotate_search_top_k: int = 120,
  1145 + annotate_rerank_top_k: int = 200,
  1146 + language: str = "en",
  1147 + force_refresh_rerank: bool = False,
  1148 + force_refresh_labels: bool = False,
  1149 + ) -> QueryBuildResult:
  1150 + search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language)
  1151 + search_results = list(search_payload.get("results") or [])
  1152 + corpus = self.corpus_docs(refresh=False)
  1153 + full_rerank = self.full_corpus_rerank(
  1154 + query=query,
  1155 + docs=corpus,
  1156 + force_refresh=force_refresh_rerank,
  1157 + )
  1158 + rerank_depth_effective = min(rerank_depth, len(full_rerank))
  1159 +
  1160 + pool_docs: Dict[str, Dict[str, Any]] = {}
  1161 + for doc in search_results[:annotate_search_top_k]:
  1162 + pool_docs[str(doc.get("spu_id"))] = doc
  1163 + for item in full_rerank[:annotate_rerank_top_k]:
  1164 + pool_docs[str(item["spu_id"])] = item["doc"]
  1165 +
  1166 + query_profile = self.get_query_profile(query, force_refresh=force_refresh_labels)
  1167 + labels = self.annotate_missing_labels(
  1168 + query=query,
  1169 + docs=list(pool_docs.values()),
  1170 + force_refresh=force_refresh_labels,
  1171 + )
  1172 +
  1173 + search_labeled_results: List[Dict[str, Any]] = []
  1174 + for rank, doc in enumerate(search_results, start=1):
  1175 + spu_id = str(doc.get("spu_id"))
  1176 + label = labels.get(spu_id)
  1177 + search_labeled_results.append(
  1178 + {
  1179 + "rank": rank,
  1180 + "spu_id": spu_id,
  1181 + "title": build_display_title(doc),
  1182 + "image_url": doc.get("image_url"),
  1183 + "rerank_score": None,
  1184 + "label": label,
  1185 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  1186 + "product": compact_product_payload(doc),
  1187 + }
  1188 + )
  1189 +
  1190 + rerank_top_results: List[Dict[str, Any]] = []
  1191 + for rank, item in enumerate(full_rerank[:rerank_depth_effective], start=1):
  1192 + doc = item["doc"]
  1193 + spu_id = str(item["spu_id"])
  1194 + rerank_top_results.append(
  1195 + {
  1196 + "rank": rank,
  1197 + "spu_id": spu_id,
  1198 + "title": build_display_title(doc),
  1199 + "image_url": doc.get("image_url"),
  1200 + "rerank_score": round(float(item["score"]), 8),
  1201 + "label": labels.get(spu_id),
  1202 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  1203 + "product": compact_product_payload(doc),
  1204 + }
  1205 + )
  1206 +
  1207 + top100_labels = [
  1208 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  1209 + for item in search_labeled_results[:100]
  1210 + ]
  1211 + metrics = compute_query_metrics(top100_labels)
  1212 + output_dir = ensure_dir(self.artifact_root / "query_builds")
  1213 + run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
  1214 + output_json_path = output_dir / f"{run_id}.json"
  1215 + payload = {
  1216 + "run_id": run_id,
  1217 + "created_at": utc_now_iso(),
  1218 + "tenant_id": self.tenant_id,
  1219 + "query": query,
  1220 + "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),
  1221 + "search_total": int(search_payload.get("total") or 0),
  1222 + "search_depth_requested": search_depth,
  1223 + "search_depth_effective": len(search_results),
  1224 + "rerank_depth_requested": rerank_depth,
  1225 + "rerank_depth_effective": rerank_depth_effective,
  1226 + "corpus_size": len(corpus),
  1227 + "annotation_pool": {
  1228 + "annotate_search_top_k": annotate_search_top_k,
  1229 + "annotate_rerank_top_k": annotate_rerank_top_k,
  1230 + "pool_size": len(pool_docs),
  1231 + },
  1232 + "query_profile": query_profile,
  1233 + "metrics_top100": metrics,
  1234 + "search_results": search_labeled_results,
  1235 + "full_rerank_top": rerank_top_results,
  1236 + }
  1237 + output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
  1238 + self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
  1239 + return QueryBuildResult(
  1240 + query=query,
  1241 + tenant_id=self.tenant_id,
  1242 + search_total=int(search_payload.get("total") or 0),
  1243 + search_depth=len(search_results),
  1244 + rerank_corpus_size=len(corpus),
  1245 + annotated_count=len(pool_docs),
  1246 + output_json_path=output_json_path,
  1247 + )
  1248 +
  1249 + def evaluate_live_query(
  1250 + self,
  1251 + query: str,
  1252 + top_k: int = 100,
  1253 + auto_annotate: bool = True,
  1254 + language: str = "en",
  1255 + force_refresh_labels: bool = False,
  1256 + ) -> Dict[str, Any]:
  1257 + search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)
  1258 + results = list(search_payload.get("results") or [])
  1259 + if auto_annotate:
  1260 + self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
  1261 + labels = self.store.get_labels(self.tenant_id, query)
  1262 + labeled = []
  1263 + for rank, doc in enumerate(results[:top_k], start=1):
  1264 + spu_id = str(doc.get("spu_id"))
  1265 + labeled.append(
  1266 + {
  1267 + "rank": rank,
  1268 + "spu_id": spu_id,
  1269 + "title": build_display_title(doc),
  1270 + "image_url": doc.get("image_url"),
  1271 + "label": labels.get(spu_id),
  1272 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  1273 + "product": compact_product_payload(doc),
  1274 + }
  1275 + )
  1276 + metric_labels = [
  1277 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  1278 + for item in labeled
  1279 + ]
  1280 + return {
  1281 + "query": query,
  1282 + "tenant_id": self.tenant_id,
  1283 + "top_k": top_k,
  1284 + "metrics": compute_query_metrics(metric_labels),
  1285 + "results": labeled,
  1286 + "total": int(search_payload.get("total") or 0),
  1287 + }
  1288 +
  1289 + def batch_evaluate(
  1290 + self,
  1291 + queries: Sequence[str],
  1292 + *,
  1293 + top_k: int = 100,
  1294 + auto_annotate: bool = True,
  1295 + language: str = "en",
  1296 + force_refresh_labels: bool = False,
  1297 + ) -> Dict[str, Any]:
  1298 + per_query = []
  1299 + for query in queries:
  1300 + live = self.evaluate_live_query(
  1301 + query,
  1302 + top_k=top_k,
  1303 + auto_annotate=auto_annotate,
  1304 + language=language,
  1305 + force_refresh_labels=force_refresh_labels,
  1306 + )
  1307 + labels = [
  1308 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  1309 + for item in live["results"]
  1310 + ]
  1311 + per_query.append(
  1312 + {
  1313 + "query": live["query"],
  1314 + "tenant_id": live["tenant_id"],
  1315 + "top_k": live["top_k"],
  1316 + "metrics": live["metrics"],
  1317 + "distribution": label_distribution(labels),
  1318 + "total": live["total"],
  1319 + }
  1320 + )
  1321 + aggregate = aggregate_metrics([item["metrics"] for item in per_query])
  1322 + aggregate_distribution = {
  1323 + RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
  1324 + RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query),
  1325 + RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
  1326 + }
  1327 + batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
  1328 + report_dir = ensure_dir(self.artifact_root / "batch_reports")
  1329 + config_snapshot_path = report_dir / f"{batch_id}_config.json"
  1330 + config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json()
  1331 + config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
  1332 + output_json_path = report_dir / f"{batch_id}.json"
  1333 + report_md_path = report_dir / f"{batch_id}.md"
  1334 + payload = {
  1335 + "batch_id": batch_id,
  1336 + "created_at": utc_now_iso(),
  1337 + "tenant_id": self.tenant_id,
  1338 + "queries": list(queries),
  1339 + "top_k": top_k,
  1340 + "aggregate_metrics": aggregate,
  1341 + "aggregate_distribution": aggregate_distribution,
  1342 + "per_query": per_query,
  1343 + "config_snapshot_path": str(config_snapshot_path),
  1344 + }
  1345 + output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
  1346 + report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
  1347 + self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
  1348 + return payload
  1349 +
  1350 +
def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
    """Render a batch-evaluation payload as a Markdown report."""
    out: List[str] = []
    out.append("# Search Batch Evaluation")
    out.append("")
    out.append(f"- Batch ID: {payload['batch_id']}")
    out.append(f"- Created at: {payload['created_at']}")
    out.append(f"- Tenant ID: {payload['tenant_id']}")
    out.append(f"- Query count: {len(payload['queries'])}")
    out.append(f"- Top K: {payload['top_k']}")
    out.append("")
    out.append("## Aggregate Metrics")
    out.append("")
    # Metric names sorted for stable, diffable reports.
    aggregate = payload.get("aggregate_metrics") or {}
    for name in sorted(aggregate):
        out.append(f"- {name}: {aggregate[name]}")
    overall = payload.get("aggregate_distribution") or {}
    if overall:
        out.append("")
        out.append("## Label Distribution")
        out.append("")
        out.append(f"- Exact: {overall.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {overall.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {overall.get(RELEVANCE_IRRELEVANT, 0)}")
    out.append("")
    out.append("## Per Query")
    out.append("")
    for entry in payload.get("per_query") or []:
        out.append(f"### {entry['query']}")
        out.append("")
        metrics = entry.get("metrics") or {}
        for name in sorted(metrics):
            out.append(f"- {name}: {metrics[name]}")
        dist = entry.get("distribution") or {}
        out.append(f"- Exact: {dist.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {dist.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {dist.get(RELEVANCE_IRRELEVANT, 0)}")
        out.append("")
    return "\n".join(out)
  1390 +
  1391 +
class SearchEvalRequest(BaseModel):
    """Request body for POST /api/search-eval: one live-query evaluation."""

    query: str
    # Number of top results to label and score; bounded server-side.
    top_k: int = Field(default=100, ge=1, le=500)
    # When True, missing labels are generated by the judge before scoring.
    auto_annotate: bool = True
    language: str = "en"
  1397 +
  1398 +
class BatchEvalRequest(BaseModel):
    """Request body for POST /api/batch-eval: evaluate many queries.

    When ``queries`` is None the server falls back to the shared query file.
    """

    queries: Optional[List[str]] = None
    # Number of top results to label and score per query; bounded server-side.
    top_k: int = Field(default=100, ge=1, le=500)
    # When True, missing labels are generated by the judge before scoring.
    auto_annotate: bool = True
    language: str = "en"
    # When True, stored labels are discarded and everything is re-annotated.
    force_refresh_labels: bool = False
  1405 +
  1406 +
def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFAULT_QUERY_FILE) -> FastAPI:
    """Build the FastAPI app serving the evaluation web UI.

    Routes:
        GET  /                -> inline single-page UI (WEB_APP_HTML)
        GET  /api/queries     -> queries parsed from query_file
        POST /api/search-eval -> single-query live evaluation
        POST /api/batch-eval  -> batch evaluation (falls back to query_file)
        GET  /api/history     -> last 20 batch runs from the store
    """
    app = FastAPI(title="Search Evaluation UI", version="1.0.0")

    @app.get("/", response_class=HTMLResponse)
    def home() -> str:
        # The whole UI is a single inline HTML document.
        return WEB_APP_HTML

    @app.get("/api/queries")
    def api_queries() -> Dict[str, Any]:
        return {"queries": framework.queries_from_file(query_file)}

    @app.post("/api/search-eval")
    def api_search_eval(request: SearchEvalRequest) -> Dict[str, Any]:
        return framework.evaluate_live_query(
            query=request.query,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
        )

    @app.post("/api/batch-eval")
    def api_batch_eval(request: BatchEvalRequest) -> Dict[str, Any]:
        # Fall back to the shared query file when the request carries no queries.
        queries = request.queries or framework.queries_from_file(query_file)
        if not queries:
            raise HTTPException(status_code=400, detail="No queries provided")
        return framework.batch_evaluate(
            queries=queries,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
            force_refresh_labels=request.force_refresh_labels,
        )

    @app.get("/api/history")
    def api_history() -> Dict[str, Any]:
        return {"history": framework.store.list_batch_runs(limit=20)}

    return app
  1445 +
  1446 +
# Inline single-page UI served at "/": loads queries from the query file, runs
# single/batch evaluations against the /api endpoints above, and renders
# metrics, labeled results, and batch history. Kept inline so the evaluation
# service ships as one file.
WEB_APP_HTML = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Search Evaluation</title>
  <style>
    :root {
      --bg: #f5f3ed;
      --panel: #fffdf8;
      --ink: #1f2a24;
      --muted: #6b756e;
      --line: #ddd4c6;
      --accent: #0f766e;
      --exact: #0f766e;
      --partial: #b7791f;
      --irrelevant: #b42318;
    }
    body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background:
      radial-gradient(circle at top left, #f0e6d6 0, transparent 28%),
      linear-gradient(180deg, #f9f6f0 0%, #f0ece3 100%); }
    .app { display: grid; grid-template-columns: 280px 1fr; min-height: 100vh; }
    .sidebar { border-right: 1px solid var(--line); padding: 20px; background: rgba(255,255,255,0.55); backdrop-filter: blur(10px); }
    .main { padding: 24px; }
    h1, h2 { margin: 0 0 12px; }
    .muted { color: var(--muted); }
    .query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; }
    .query-item { display: block; width: 100%; border: 0; background: transparent; text-align: left; padding: 10px 12px; border-radius: 10px; cursor: pointer; }
    .query-item:hover { background: #eef6f4; }
    .toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; }
    input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; }
    button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; }
    button.secondary { background: #d9e6e3; color: #12433d; }
    .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; }
    .metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; }
    .metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; }
    .metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; }
    .results { display: grid; gap: 10px; }
    .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }
    .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }
    .Exact { background: var(--exact); }
    .Partial { background: var(--partial); }
    .Irrelevant { background: var(--irrelevant); }
    .Unknown { background: #637381; }
    .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }
    .title { font-size: 16px; font-weight: 700; margin-bottom: 8px; }
    .options { color: var(--muted); line-height: 1.5; font-size: 14px; }
    .section { margin-bottom: 28px; }
    .history { font-size: 13px; line-height: 1.5; }
  </style>
</head>
<body>
  <div class="app">
    <aside class="sidebar">
      <h2>Queries</h2>
      <p class="muted">Loaded from <code>scripts/evaluation/queries/queries.txt</code></p>
      <div id="queryList" class="query-list"></div>
      <div class="section">
        <h2>History</h2>
        <div id="history" class="history muted">Loading...</div>
      </div>
    </aside>
    <main class="main">
      <h1>Search Evaluation</h1>
      <p class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>
      <div class="toolbar">
        <input id="queryInput" type="text" placeholder="Search query" />
        <button onclick="runSingle()">Evaluate Query</button>
        <button class="secondary" onclick="runBatch()">Batch Evaluation</button>
      </div>
      <div id="status" class="muted section"></div>
      <section class="section">
        <h2>Metrics</h2>
        <div id="metrics" class="grid"></div>
      </section>
      <section class="section">
        <h2>Top Results</h2>
        <div id="results" class="results"></div>
      </section>
    </main>
  </div>
  <script>
    async function fetchJSON(url, options) {
      const res = await fetch(url, options);
      if (!res.ok) throw new Error(await res.text());
      return await res.json();
    }
    function renderMetrics(metrics) {
      const root = document.getElementById('metrics');
      root.innerHTML = '';
      Object.entries(metrics || {}).forEach(([key, value]) => {
        const card = document.createElement('div');
        card.className = 'metric';
        card.innerHTML = `<div class="label">${key}</div><div class="value">${value}</div>`;
        root.appendChild(card);
      });
    }
    function renderResults(results) {
      const root = document.getElementById('results');
      root.innerHTML = '';
      (results || []).forEach(item => {
        const label = item.label || 'Unknown';
        const box = document.createElement('div');
        box.className = 'result';
        box.innerHTML = `
          <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">#${item.rank}</div></div>
          <img class="thumb" src="${item.image_url || ''}" alt="" />
          <div>
            <div class="title">${item.title || ''}</div>
            <div class="options">
              <div>${(item.option_values || [])[0] || ''}</div>
              <div>${(item.option_values || [])[1] || ''}</div>
              <div>${(item.option_values || [])[2] || ''}</div>
            </div>
          </div>`;
        root.appendChild(box);
      });
    }
    async function loadQueries() {
      const data = await fetchJSON('/api/queries');
      const root = document.getElementById('queryList');
      root.innerHTML = '';
      data.queries.forEach(query => {
        const btn = document.createElement('button');
        btn.className = 'query-item';
        btn.textContent = query;
        btn.onclick = () => {
          document.getElementById('queryInput').value = query;
          runSingle();
        };
        root.appendChild(btn);
      });
    }
    async function loadHistory() {
      const data = await fetchJSON('/api/history');
      const root = document.getElementById('history');
      root.innerHTML = (data.history || []).map(item =>
        `<div><strong>${item.batch_id}</strong><br/>${item.created_at}<br/>${item.report_markdown_path}</div><br/>`
      ).join('') || 'No history yet.';
    }
    async function runSingle() {
      const query = document.getElementById('queryInput').value.trim();
      if (!query) return;
      document.getElementById('status').textContent = `Evaluating "${query}"...`;
      const data = await fetchJSON('/api/search-eval', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({query, top_k: 100, auto_annotate: true})
      });
      document.getElementById('status').textContent = `Done. total=${data.total}`;
      renderMetrics(data.metrics);
      renderResults(data.results);
      loadHistory();
    }
    async function runBatch() {
      document.getElementById('status').textContent = 'Running batch evaluation...';
      const data = await fetchJSON('/api/batch-eval', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({top_k: 100, auto_annotate: true})
      });
      document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`;
      renderMetrics(data.aggregate_metrics);
      renderResults([]);
      loadHistory();
    }
    loadQueries();
    loadHistory();
  </script>
</body>
</html>
"""
  1620 +
  1621 +
def build_cli_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with the ``build``/``batch``/``audit``/``serve`` subcommands."""
    parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
    subcommands = parser.add_subparsers(dest="command", required=True)

    def new_subcommand(name: str, help_text: str) -> argparse.ArgumentParser:
        # Every subcommand shares the tenant and query-file options.
        sub = subcommands.add_parser(name, help=help_text)
        sub.add_argument("--tenant-id", default="163")
        sub.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
        return sub

    build = new_subcommand("build", "Build pooled annotation set for queries")
    build.add_argument("--search-depth", type=int, default=1000)
    build.add_argument("--rerank-depth", type=int, default=10000)
    build.add_argument("--annotate-search-top-k", type=int, default=120)
    build.add_argument("--annotate-rerank-top-k", type=int, default=200)
    build.add_argument("--language", default="en")
    build.add_argument("--force-refresh-rerank", action="store_true")
    build.add_argument("--force-refresh-labels", action="store_true")

    batch = new_subcommand("batch", "Run batch evaluation against live search")
    batch.add_argument("--top-k", type=int, default=100)
    batch.add_argument("--language", default="en")
    batch.add_argument("--force-refresh-labels", action="store_true")

    audit = new_subcommand("audit", "Audit annotation quality for queries")
    audit.add_argument("--top-k", type=int, default=100)
    audit.add_argument("--language", default="en")
    audit.add_argument("--limit-suspicious", type=int, default=5)
    audit.add_argument("--force-refresh-labels", action="store_true")

    serve = new_subcommand("serve", "Serve evaluation web UI on port 6010")
    serve.add_argument("--host", default="0.0.0.0")
    serve.add_argument("--port", type=int, default=6010)

    return parser
  1659 +
  1660 +
def run_build(args: argparse.Namespace) -> None:
    """Build a pooled annotation set per query and write a JSON run summary."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    rows = []
    for query in framework.queries_from_file(Path(args.queries_file)):
        build = framework.build_query_annotation_set(
            query=query,
            search_depth=args.search_depth,
            rerank_depth=args.rerank_depth,
            annotate_search_top_k=args.annotate_search_top_k,
            annotate_rerank_top_k=args.annotate_rerank_top_k,
            language=args.language,
            force_refresh_rerank=args.force_refresh_rerank,
            force_refresh_labels=args.force_refresh_labels,
        )
        rows.append(
            {
                "query": build.query,
                "search_total": build.search_total,
                "search_depth": build.search_depth,
                "rerank_corpus_size": build.rerank_corpus_size,
                "annotated_count": build.annotated_count,
                "output_json_path": str(build.output_json_path),
            }
        )
        print(
            f"[build] query={build.query!r} search_total={build.search_total} "
            f"search_depth={build.search_depth} corpus={build.rerank_corpus_size} "
            f"annotated={build.annotated_count} output={build.output_json_path}"
        )
    # Timestamped summary file under the framework's artifact root.
    out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
    out_path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] summary={out_path}")
  1694 +
  1695 +
def run_batch(args: argparse.Namespace) -> None:
    """Run a live batch evaluation over the query file and print the batch id/metrics."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    query_list = framework.queries_from_file(Path(args.queries_file))
    payload = framework.batch_evaluate(
        queries=query_list,
        top_k=args.top_k,
        auto_annotate=True,
        language=args.language,
        force_refresh_labels=args.force_refresh_labels,
    )
    print(f"[done] batch_id={payload['batch_id']} aggregate_metrics={payload['aggregate_metrics']}")
  1707 +
  1708 +
def run_audit(args: argparse.Namespace) -> None:
    """Audit annotation quality for each query and write a JSON audit report.

    With ``--force-refresh-labels``, the labels for the live top-k results are
    re-annotated first and the audit then runs with ``auto_annotate=False``;
    otherwise a single auto-annotating audit pass is performed.

    Fix: the previous implementation always ran ``audit_live_query`` once
    before the refresh and then discarded that result, doubling the live
    search load per query when labels were being refreshed.  Only the final
    audit pass is kept (assumes ``audit_live_query`` with
    ``auto_annotate=False`` is read-only — confirmed against its usage here).
    """
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    queries = framework.queries_from_file(Path(args.queries_file))
    audit_items = []
    for query in queries:
        if args.force_refresh_labels:
            # Re-annotate the live top-k before auditing.
            live_payload = framework.search_client.search(
                query=query, size=max(args.top_k, 100), from_=0, language=args.language
            )
            framework.annotate_missing_labels(
                query=query,
                docs=list(live_payload.get("results") or [])[: args.top_k],
                force_refresh=True,
            )
        # auto_annotate only when labels were not just force-refreshed,
        # matching the original behavior in both branches.
        item = framework.audit_live_query(
            query=query,
            top_k=args.top_k,
            language=args.language,
            auto_annotate=not args.force_refresh_labels,
        )
        audit_items.append(
            {
                "query": query,
                "metrics": item["metrics"],
                "distribution": item["distribution"],
                "suspicious_count": len(item["suspicious"]),
                "suspicious_examples": item["suspicious"][: args.limit_suspicious],
            }
        )
        print(
            f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}"
        )

    summary = {
        "created_at": utc_now_iso(),
        "tenant_id": args.tenant_id,
        "top_k": args.top_k,
        "query_count": len(queries),
        "total_suspicious": sum(item["suspicious_count"] for item in audit_items),
        "queries": audit_items,
    }
    out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"
    out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] audit={out_path}")
  1757 +
  1758 +
def run_serve(args: argparse.Namespace) -> None:
    """Start the evaluation web UI under uvicorn (blocks until shutdown)."""
    # Deferred import: only the `serve` subcommand needs the web server.
    import uvicorn

    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    web_app = create_web_app(framework, Path(args.queries_file))
    uvicorn.run(web_app, host=args.host, port=args.port, log_level="info")
  1765 +
  1766 +
def main() -> None:
    """CLI entry point: parse arguments and dispatch to the subcommand handler."""
    args = build_cli_parser().parse_args()
    handlers = {
        "build": run_build,
        "batch": run_batch,
        "audit": run_audit,
        "serve": run_serve,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # Unreachable with required=True subparsers; kept as a guard.
        raise SystemExit(f"unknown command: {args.command}")
    handler(args)
  1783 +
  1784 +
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
... ...
scripts/evaluation/serve_eval_web.py 0 → 100644
... ... @@ -0,0 +1,14 @@
#!/usr/bin/env python3
"""Thin launcher for the evaluation CLI/web UI.

Makes the project root importable, then delegates to the
``scripts.evaluation.eval_framework`` entry point.
"""

from pathlib import Path
import sys

# Ensure `scripts.evaluation` resolves when this file is executed directly
# (e.g. `python scripts/evaluation/serve_eval_web.py`).
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from scripts.evaluation.eval_framework import main


if __name__ == "__main__":
    main()
... ...
scripts/evaluation/tune_fusion.py 0 → 100644
... ... @@ -0,0 +1,296 @@
  1 +#!/usr/bin/env python3
  2 +
  3 +from __future__ import annotations
  4 +
import argparse
import ast
import copy
import json
import re
import subprocess
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List

import requests
import yaml
  18 +
# Ensure `scripts.evaluation` resolves when this file is executed directly.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from scripts.evaluation.eval_framework import (
    DEFAULT_ARTIFACT_ROOT,
    DEFAULT_QUERY_FILE,
    ensure_dir,
    utc_now_iso,
    utc_timestamp,
)


# Live config file that experiments patch in place (restored or replaced by
# the best candidate when a tuning run finishes — see main()).
CONFIG_PATH = PROJECT_ROOT / "config" / "config.yaml"
  33 +
  34 +
@dataclass
class ExperimentSpec:
    """A single tuning experiment: a named set of config overrides."""

    # Unique experiment name; used for config snapshots and report rows.
    name: str
    # Free-text description surfaced in the markdown report.
    description: str
    # Dotted config paths mapped to override values (applied via set_nested_value).
    params: Dict[str, Any]
  40 +
  41 +
def load_yaml(path: Path) -> Dict[str, Any]:
    """Read and parse a UTF-8 YAML file into a mapping."""
    raw = path.read_text(encoding="utf-8")
    return yaml.safe_load(raw)
  44 +
  45 +
def write_yaml(path: Path, payload: Dict[str, Any]) -> None:
    """Serialize ``payload`` as YAML (insertion order kept, unicode unescaped) to ``path``."""
    rendered = yaml.safe_dump(payload, sort_keys=False, allow_unicode=True)
    path.write_text(rendered, encoding="utf-8")
  51 +
  52 +
def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> None:
    """Set ``value`` at ``dotted_path`` (e.g. ``"a.b.c"``) inside nested dicts, in place.

    Intermediate keys must already exist; a missing key raises KeyError.
    """
    *parents, leaf = dotted_path.split(".")
    node = payload
    for key in parents:
        node = node[key]
    node[leaf] = value
  59 +
  60 +
def apply_params(base_config: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
    """Return a deep copy of ``base_config`` with dotted-path overrides applied."""
    patched = copy.deepcopy(base_config)
    for dotted_path, override in params.items():
        set_nested_value(patched, dotted_path, override)
    return patched
  66 +
  67 +
def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any]:
    """Poll ``{base_url}/health`` until it reports ``status == "healthy"``.

    Returns the health payload on success; raises RuntimeError carrying the
    last observed payload/error once ``timeout_sec`` elapses.
    """
    health_url = f"{base_url.rstrip('/')}/health"
    deadline = time.time() + timeout_sec
    last_error = None
    while True:
        if time.time() >= deadline:
            raise RuntimeError(f"backend did not become healthy: {last_error}")
        try:
            response = requests.get(health_url, timeout=10)
            response.raise_for_status()
            payload = response.json()
            if str(payload.get("status")) == "healthy":
                return payload
            # Reachable but not healthy yet; remember the payload for the error.
            last_error = payload
        except Exception as exc:  # noqa: BLE001
            last_error = str(exc)
        time.sleep(2.0)
  83 +
  84 +
def run_restart() -> None:
    """Restart the backend via the project restart script (blocks, 10 min cap)."""
    command = ["./restart.sh", "backend"]
    subprocess.run(command, cwd=PROJECT_ROOT, check=True, timeout=600)
  87 +
  88 +
def read_queries(path: Path) -> List[str]:
    """Read one query per line, skipping blanks and ``#`` comment lines."""
    queries: List[str] = []
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        stripped = raw_line.strip()
        if stripped and not stripped.startswith("#"):
            queries.append(stripped)
    return queries
  95 +
  96 +
def run_batch_eval(
    *,
    tenant_id: str,
    queries_file: Path,
    top_k: int,
    language: str,
    force_refresh_labels: bool,
) -> Dict[str, Any]:
    """Run the batch-evaluation CLI in a subprocess and parse its summary line.

    Returns a dict with ``batch_id``, ``aggregate_metrics`` and the combined
    ``raw_output``.  Raises RuntimeError when the expected
    ``batch_id=... aggregate_metrics={...}`` line is absent from the output,
    and subprocess.CalledProcessError when the child exits non-zero.
    """
    cmd = [
        str(PROJECT_ROOT / ".venv" / "bin" / "python"),
        "scripts/evaluation/build_annotation_set.py",
        "batch",
        "--tenant-id",
        str(tenant_id),
        "--queries-file",
        str(queries_file),
        "--top-k",
        str(top_k),
        "--language",
        language,
    ]
    if force_refresh_labels:
        cmd.append("--force-refresh-labels")
    completed = subprocess.run(
        cmd,
        cwd=PROJECT_ROOT,
        check=True,
        capture_output=True,
        text=True,
        timeout=7200,
    )
    output = (completed.stdout or "") + "\n" + (completed.stderr or "")
    match = re.search(r"batch_id=([A-Za-z0-9_]+)\s+aggregate_metrics=(\{.*\})", output)
    if not match:
        raise RuntimeError(f"failed to parse batch output: {output[-2000:]}")
    batch_id = match.group(1)
    # The metrics are printed as a Python dict repr.  Parse it with
    # ast.literal_eval instead of the previous `.replace("'", '"')` +
    # json.loads hack, which broke whenever a key or value contained an
    # apostrophe (and mangled any embedded double quotes).
    aggregate_metrics = ast.literal_eval(match.group(2))
    return {
        "batch_id": batch_id,
        "aggregate_metrics": aggregate_metrics,
        "raw_output": output,
    }
  139 +
  140 +
def render_markdown(summary: Dict[str, Any]) -> str:
    """Render the tuning-run summary as a markdown report (ranking table + details)."""
    lines: List[str] = [
        "# Fusion Tuning Report",
        "",
        f"- Created at: {summary['created_at']}",
        f"- Tenant ID: {summary['tenant_id']}",
        f"- Query count: {summary['query_count']}",
        f"- Top K: {summary['top_k']}",
        f"- Score metric: {summary['score_metric']}",
        "",
        "## Experiments",
        "",
        "| Rank | Name | Score | MAP_3 | MAP_2_3 | P@5 | P@10 | Config |",
        "|---|---|---:|---:|---:|---:|---:|---|",
    ]
    # One table row per experiment, in the summary's (pre-sorted) order.
    for rank, experiment in enumerate(summary["experiments"], start=1):
        metrics = experiment["aggregate_metrics"]
        cells = [
            str(rank),
            experiment["name"],
            str(experiment["score"]),
            str(metrics.get("MAP_3", "")),
            str(metrics.get("MAP_2_3", "")),
            str(metrics.get("P@5", "")),
            str(metrics.get("P@10", "")),
            experiment["config_snapshot_path"],
        ]
        lines.append(f"| {' | '.join(cells)} |")
    lines += ["", "## Details", ""]
    for experiment in summary["experiments"]:
        params_json = json.dumps(experiment["params"], ensure_ascii=False, sort_keys=True)
        lines += [
            f"### {experiment['name']}",
            "",
            f"- Description: {experiment['description']}",
            f"- Score: {experiment['score']}",
            f"- Params: `{params_json}`",
            f"- Batch report: {experiment['batch_report_path']}",
            "",
        ]
    return "\n".join(lines)
  184 +
  185 +
def load_experiments(path: Path) -> List[ExperimentSpec]:
    """Load experiment specs from JSON: either a bare list or ``{"experiments": [...]}``."""
    payload = json.loads(path.read_text(encoding="utf-8"))
    raw_items = payload["experiments"] if isinstance(payload, dict) else payload
    return [
        ExperimentSpec(
            name=str(raw["name"]),
            description=str(raw.get("description") or ""),
            params=dict(raw.get("params") or {}),
        )
        for raw in raw_items
    ]
  199 +
  200 +
def build_parser() -> argparse.ArgumentParser:
    """Define the CLI options for the fusion tuning driver."""
    parser = argparse.ArgumentParser(
        description="Run fusion tuning experiments against the live backend"
    )
    # (flag, add_argument kwargs) pairs, in help-display order.
    option_specs = [
        ("--tenant-id", {"default": "163"}),
        ("--queries-file", {"default": str(DEFAULT_QUERY_FILE)}),
        ("--top-k", {"type": int, "default": 100}),
        ("--language", {"default": "en"}),
        ("--experiments-file", {"required": True}),
        ("--search-base-url", {"default": "http://127.0.0.1:6002"}),
        ("--score-metric", {"default": "MAP_3"}),
        ("--apply-best", {"action": "store_true"}),
        ("--force-refresh-labels-first-pass", {"action": "store_true"}),
    ]
    for flag, kwargs in option_specs:
        parser.add_argument(flag, **kwargs)
    return parser
  213 +
  214 +
def main() -> None:
    """Drive the tuning loop.

    For each experiment: patch config.yaml in place, restart the backend,
    wait for health, run a batch evaluation, and record the score.  On exit
    (even after a failure) either the best-scoring config is applied or the
    original config text is restored byte-for-byte, and the backend is
    restarted again.  JSON and markdown summaries are written at the end.
    """
    args = build_parser().parse_args()
    queries_file = Path(args.queries_file)
    queries = read_queries(queries_file)
    # Keep the raw text so the original file can be restored exactly.
    base_config_text = CONFIG_PATH.read_text(encoding="utf-8")
    base_config = load_yaml(CONFIG_PATH)
    experiments = load_experiments(Path(args.experiments_file))

    tuning_dir = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs")
    run_id = f"tuning_{utc_timestamp()}"
    run_dir = ensure_dir(tuning_dir / run_id)
    results: List[Dict[str, Any]] = []

    try:
        for experiment in experiments:
            # Overrides are always applied to the pristine base config, not
            # cumulatively to the previous experiment's config.
            candidate = apply_params(base_config, experiment.params)
            write_yaml(CONFIG_PATH, candidate)
            # Snapshot the candidate config for the report.
            candidate_config_path = run_dir / f"{experiment.name}_config.yaml"
            write_yaml(candidate_config_path, candidate)

            run_restart()
            health = wait_for_backend(args.search_base_url)
            # Labels are force-refreshed only on the very first experiment
            # (while `results` is still empty); later runs reuse them.
            batch_result = run_batch_eval(
                tenant_id=args.tenant_id,
                queries_file=queries_file,
                top_k=args.top_k,
                language=args.language,
                force_refresh_labels=bool(args.force_refresh_labels_first_pass and not results),
            )
            aggregate_metrics = dict(batch_result["aggregate_metrics"])
            results.append(
                {
                    "name": experiment.name,
                    "description": experiment.description,
                    "params": experiment.params,
                    "aggregate_metrics": aggregate_metrics,
                    # Missing score metric counts as 0.0 so ranking still works.
                    "score": float(aggregate_metrics.get(args.score_metric, 0.0)),
                    "batch_id": batch_result["batch_id"],
                    "batch_report_path": str(
                        DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_result['batch_id']}.md"
                    ),
                    "config_snapshot_path": str(candidate_config_path),
                    "backend_health": health,
                    "batch_stdout": batch_result["raw_output"],
                }
            )
            print(
                f"[tune] {experiment.name} score={aggregate_metrics.get(args.score_metric)} "
                f"metrics={aggregate_metrics}"
            )
    finally:
        # Runs even if an experiment failed part-way: either promote the best
        # config seen so far, or restore the original text exactly; in both
        # cases the backend is restarted onto the final config.
        if args.apply_best and results:
            best = max(results, key=lambda item: item["score"])
            best_config = apply_params(base_config, best["params"])
            write_yaml(CONFIG_PATH, best_config)
            run_restart()
            wait_for_backend(args.search_base_url)
        else:
            CONFIG_PATH.write_text(base_config_text, encoding="utf-8")
            run_restart()
            wait_for_backend(args.search_base_url)

    # Best score first in the report.
    results.sort(key=lambda item: item["score"], reverse=True)
    summary = {
        "run_id": run_id,
        "created_at": utc_now_iso(),
        "tenant_id": args.tenant_id,
        "query_count": len(queries),
        "top_k": args.top_k,
        "score_metric": args.score_metric,
        "experiments": results,
    }
    summary_json_path = run_dir / "summary.json"
    summary_md_path = run_dir / "summary.md"
    summary_json_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    summary_md_path.write_text(render_markdown(summary), encoding="utf-8")
    print(f"[done] summary_json={summary_json_path}")
    print(f"[done] summary_md={summary_md_path}")
  293 +
  294 +
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
... ...