Commit 881d338b3acc0b3de1bb3cfb77f4fc69755bb0f7

Authored by tangwang
1 parent 432d1c88

评估框架

config/config.yaml
  1 +# Unified Configuration for Multi-Tenant Search Engine
  2 +# 统一配置文件,所有租户共用一套配置
  3 +# 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为
  4 +#
  5 +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项
  6 +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。
  7 +
  8 +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义)
1 9 runtime:
2 10 environment: prod
3 11 index_namespace: ''
... ... @@ -13,6 +21,8 @@ runtime:
13 21 translator_port: 6006
14 22 reranker_host: 0.0.0.0
15 23 reranker_port: 6007
  24 +
  25 +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
16 26 infrastructure:
17 27 elasticsearch:
18 28 host: http://localhost:9200
... ... @@ -39,16 +49,30 @@ infrastructure:
39 49 secrets:
40 50 dashscope_api_key: null
41 51 deepl_auth_key: null
  52 +
  53 +# Elasticsearch Index
42 54 es_index_name: search_products
  55 +
  56 +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出)
43 57 indexes: []
  58 +
  59 +# Config assets
44 60 assets:
45 61 query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict
  62 +
  63 +# Product content understanding (LLM enrich-content) configuration
46 64 product_enrich:
47 65 max_workers: 40
  66 +
  67 +# ES Index Settings (基础设置)
48 68 es_settings:
49 69 number_of_shards: 1
50 70 number_of_replicas: 0
51 71 refresh_interval: 30s
  72 +
  73 +# 字段权重配置(用于搜索时的字段boost)
  74 +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。
  75 +# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
52 76 field_boosts:
53 77 title: 3.0
54 78 qanchors: 2.5
... ... @@ -61,21 +85,39 @@ field_boosts:
61 85 option1_values: 1.5
62 86 option2_values: 1.5
63 87 option3_values: 1.5
  88 +
  89 +# Query Configuration(查询配置)
64 90 query_config:
  91 + # 支持的语言
65 92 supported_languages:
66 93 - zh
67 94 - en
68 95 default_language: en
  96 +
  97 + # 功能开关(翻译开关由tenant_config控制)
69 98 enable_text_embedding: true
70 99 enable_query_rewrite: true
71   - zh_to_en_model: nllb-200-distilled-600m
72   - en_to_zh_model: nllb-200-distilled-600m
  100 +
  101 + # 查询翻译模型(须与 services.translation.capabilities 中某项一致)
  102 + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。
  103 + zh_to_en_model: nllb-200-distilled-600m # "opus-mt-zh-en"
  104 + en_to_zh_model: nllb-200-distilled-600m # "opus-mt-en-zh"
73 105 default_translation_model: nllb-200-distilled-600m
  106 + # zh_to_en_model: deepl
  107 + # en_to_zh_model: deepl
  108 + # default_translation_model: deepl
  109 + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同)
74 110 zh_to_en_model__source_not_in_index: nllb-200-distilled-600m
75 111 en_to_zh_model__source_not_in_index: nllb-200-distilled-600m
76 112 default_translation_model__source_not_in_index: nllb-200-distilled-600m
77   - translation_embedding_wait_budget_ms_source_in_index: 200
78   - translation_embedding_wait_budget_ms_source_not_in_index: 300
  113 + # zh_to_en_model__source_not_in_index: deepl
  114 + # en_to_zh_model__source_not_in_index: deepl
  115 + # default_translation_model__source_not_in_index: deepl
  116 +
  117 + # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
  118 + # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
  119 + translation_embedding_wait_budget_ms_source_in_index: 200 # 80
  120 + translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200
79 121 style_intent:
80 122 enabled: true
81 123 selected_sku_boost: 1.2
... ... @@ -102,6 +144,10 @@ query_config:
102 144 product_title_exclusion:
103 145 enabled: true
104 146 dictionary_path: config/dictionaries/product_title_exclusion.tsv
  147 +
  148 + # 动态多语言检索字段配置
  149 + # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
  150 + # shared_fields 为无语言后缀字段。
105 151 search_fields:
106 152 multilingual_fields:
107 153 - title
... ... @@ -111,11 +157,14 @@ query_config:
111 157 - brief
112 158 - description
113 159 - vendor
  160 + # shared_fields: 无语言后缀字段;示例: tags, option1_values, option2_values, option3_values
114 161 shared_fields: null
115 162 core_multilingual_fields:
116 163 - title
117 164 - qanchors
118 165 - category_name_text
  166 +
  167 + # 统一文本召回策略(主查询 + 翻译查询)
119 168 text_query_strategy:
120 169 base_minimum_should_match: 60%
121 170 translation_minimum_should_match: 60%
... ... @@ -130,8 +179,14 @@ query_config:
130 179 title: 5.0
131 180 qanchors: 4.0
132 181 phrase_match_boost: 3.0
  182 +
  183 + # Embedding字段名称
133 184 text_embedding_field: title_embedding
134 185 image_embedding_field: image_embedding.vector
  186 +
  187 + # 返回字段配置(_source includes)
  188 + # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段
  189 + # 下列字段与 api/result_formatter.py(SpuResult 填充)及 search/searcher.py(SKU 排序/主图替换)一致
135 190 source_fields:
136 191 - spu_id
137 192 - handle
... ... @@ -163,18 +218,26 @@ query_config:
163 218 - option3_values
164 219 - specifications
165 220 - skus
  221 +
  222 + # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates)
166 223 knn_text_boost: 4
167 224 knn_image_boost: 4
  225 +
  226 + # knn_text_num_candidates ≈ k * 3.5 (160 → 560); the long-query variant uses ≈ k * 3 (400 → 1200)
168 227 knn_text_k: 160
169 228 knn_text_num_candidates: 560
170 229 knn_text_k_long: 400
171 230 knn_text_num_candidates_long: 1200
172 231 knn_image_k: 400
173 232 knn_image_num_candidates: 1200
  233 +
  234 +# Function Score配置(ES层打分规则)
174 235 function_score:
175 236 score_mode: sum
176 237 boost_mode: multiply
177 238 functions: []
  239 +
  240 +# 粗排配置(仅融合 ES 文本/向量信号,不调用模型)
178 241 coarse_rank:
179 242 enabled: true
180 243 input_window: 700
... ... @@ -182,12 +245,16 @@ coarse_rank:
182 245 fusion:
183 246 text_bias: 0.1
184 247 text_exponent: 0.35
  248 + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合)
  249 + # 因为es的打分已经给了trans进行了折扣,所以这里不再继续折扣
185 250 text_translation_weight: 1.0
186 251 knn_text_weight: 1.0
187 252 knn_image_weight: 1.0
188 253 knn_tie_breaker: 0.1
189 254 knn_bias: 0.6
190 255 knn_exponent: 0.0
  256 +
  257 +# 精排配置(轻量 reranker)
191 258 fine_rank:
192 259 enabled: false
193 260 input_window: 160
... ... @@ -196,6 +263,8 @@ fine_rank:
196 263 rerank_query_template: '{query}'
197 264 rerank_doc_template: '{title}'
198 265 service_profile: fine
  266 +
  267 +# 重排配置(provider/URL 在 services.rerank)
199 268 rerank:
200 269 enabled: true
201 270 rerank_window: 160
... ... @@ -205,6 +274,11 @@ rerank:
205 274 rerank_query_template: '{query}'
206 275 rerank_doc_template: '{title}'
207 276 service_profile: default
  277 +
  278 + # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项)
  279 + # 其中 knn_score 先做一层 dis_max:
  280 + # max(knn_text_weight * text_knn, knn_image_weight * image_knn)
  281 + # + knn_tie_breaker * 另一侧较弱信号
208 282 fusion:
209 283 rerank_bias: 1.0e-05
210 284 rerank_exponent: 1.15
... ... @@ -212,22 +286,29 @@ rerank:
212 286 fine_exponent: 1.0
213 287 text_bias: 0.1
214 288 text_exponent: 0.25
  289 + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合)
215 290 text_translation_weight: 0.8
216 291 knn_text_weight: 1.0
217 292 knn_image_weight: 1.0
218 293 knn_tie_breaker: 0.1
219 294 knn_bias: 0.6
220 295 knn_exponent: 0.0
  296 +
  297 +# 可扩展服务/provider 注册表(单一配置源)
221 298 services:
222 299 translation:
223 300 service_url: http://127.0.0.1:6006
  301 + # default_model: nllb-200-distilled-600m
224 302 default_model: nllb-200-distilled-600m
225 303 default_scene: general
226 304 timeout_sec: 10.0
227 305 cache:
228 306 ttl_seconds: 62208000
229 307 sliding_expiration: true
  308 + # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups).
230 309 enable_model_quality_tier_cache: true
  310 + # Higher tier = better quality. Multiple models may share one tier (同级).
  311 + # A request may reuse Redis cache keys written by models whose tier is greater than or equal to its own model's tier (never from lower tiers).
231 312 model_quality_tiers:
232 313 deepl: 30
233 314 qwen-mt: 30
... ... @@ -321,12 +402,13 @@ services:
321 402 num_beams: 1
322 403 use_cache: true
323 404 embedding:
324   - provider: http
  405 + provider: http # http
325 406 providers:
326 407 http:
327 408 text_base_url: http://127.0.0.1:6005
328 409 image_base_url: http://127.0.0.1:6008
329   - backend: tei
  410 + # 服务内文本后端(embedding 进程启动时读取)
  411 + backend: tei # tei | local_st
330 412 backends:
331 413 tei:
332 414 base_url: http://127.0.0.1:8080
... ... @@ -337,7 +419,10 @@ services:
337 419 device: cuda
338 420 batch_size: 32
339 421 normalize_embeddings: true
340   - image_backend: clip_as_service
  422 + # 服务内图片后端(embedding 进程启动时读取;cnclip gRPC 与 6008 须同一 model_name)
  423 + # Chinese-CLIP:ViT-H-14 → 1024 维,ViT-L-14 → 768 维。须与 mappings/search_products.json 中
  424 + # image_embedding.vector.dims 一致(当前索引为 1024 → 默认 ViT-H-14)。
  425 + image_backend: clip_as_service # clip_as_service | local_cnclip
341 426 image_backends:
342 427 clip_as_service:
343 428 server: grpc://127.0.0.1:51000
... ... @@ -364,6 +449,7 @@ services:
364 449 max_docs: 1000
365 450 normalize: true
366 451 default_instance: default
  452 + # 命名实例:同一套 reranker 代码按实例名读取不同端口 / 后端 / runtime 目录。
367 453 instances:
368 454 default:
369 455 host: 0.0.0.0
... ... @@ -405,11 +491,29 @@ services:
405 491 enforce_eager: false
406 492 infer_batch_size: 100
407 493 sort_by_doc_length: true
408   - instruction_format: standard
  494 + # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct)
  495 + instruction_format: standard # compact standard
  496 + # instruction: "Given a query, score the product for relevance"
  497 + # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点
  498 + # instruction: "rank products by given query, category match first"
  499 + # instruction: "Rank products by query relevance, prioritizing category match"
  500 + # instruction: "Rank products by query relevance, prioritizing category and style match"
  501 + # instruction: "Rank by query relevance, prioritize category & style"
  502 + # instruction: "Relevance ranking: category & style match first"
  503 + # instruction: "Score product relevance by query with category & style match prioritized"
  504 + # instruction: "Rank products by query with category & style match prioritized"
  505 + # instruction: "Given a fashion shopping query, retrieve relevant products that answer the query"
409 506 instruction: rank products by given query
  507 + # vLLM LLM.score()(跨编码打分)。独立高性能环境 .venv-reranker-score(vllm 0.18 固定版):./scripts/setup_reranker_venv.sh qwen3_vllm_score
  508 + # 与 qwen3_vllm 可共用同一 model_name / HF 缓存;venv 分离以便升级 vLLM 而不影响 generate 后端。
410 509 qwen3_vllm_score:
411 510 model_name: Qwen/Qwen3-Reranker-0.6B
  511 + # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false
412 512 use_original_qwen3_hf_overrides: true
  513 + # vllm_runner: "auto"
  514 + # vllm_convert: "auto"
  515 + # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并
  516 + # hf_overrides: {}
413 517 engine: vllm
414 518 max_model_len: 172
415 519 tensor_parallel_size: 1
... ... @@ -419,15 +523,23 @@ services:
419 523 enforce_eager: false
420 524 infer_batch_size: 80
421 525 sort_by_doc_length: true
422   - instruction_format: standard
  526 + # 默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致
  527 + instruction_format: standard # compact standard
  528 + # instruction: "Rank products by query with category & style match prioritized"
  529 + # instruction: "Given a shopping query, rank products by relevance"
423 530 instruction: Rank products by query with category & style match prioritized
424 531 qwen3_transformers:
425 532 model_name: Qwen/Qwen3-Reranker-0.6B
426 533 instruction: rank products by given query
  534 + # instruction: "Score the product’s relevance to the given query"
427 535 max_length: 8192
428 536 batch_size: 64
429 537 use_fp16: true
  538 + # sdpa:默认无需 flash-attn;若已安装 flash_attn 可改为 flash_attention_2
430 539 attn_implementation: sdpa
  540 + # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask.
  541 + # For 1 query + many short docs (for example 400 product titles), this usually reduces
  542 + # repeated prefix work and padding waste compared with pairwise batching.
431 543 qwen3_transformers_packed:
432 544 model_name: Qwen/Qwen3-Reranker-0.6B
433 545 instruction: Rank products by query with category & style match prioritized
... ... @@ -436,6 +548,8 @@ services:
436 548 max_docs_per_pack: 0
437 549 use_fp16: true
438 550 sort_by_doc_length: true
  551 + # Packed mode relies on a custom 4D attention mask. "eager" is the safest default.
  552 + # If your torch/transformers stack validates it, you can benchmark "sdpa".
439 553 attn_implementation: eager
440 554 qwen3_gguf:
441 555 repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF
... ... @@ -443,6 +557,7 @@ services:
443 557 cache_dir: ./model_cache
444 558 local_dir: ./models/reranker/qwen3-reranker-4b-gguf
445 559 instruction: Rank products by query with category & style match prioritized
  560 + # T4 16GB / 性能优先配置:全量层 offload,实测比保守配置明显更快
446 561 n_ctx: 512
447 562 n_batch: 512
448 563 n_ubatch: 512
... ... @@ -465,6 +580,8 @@ services:
465 580 cache_dir: ./model_cache
466 581 local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf
467 582 instruction: Rank products by query with category & style match prioritized
  583 + # 0.6B GGUF / online rerank baseline:
  584 + # 实测 400 titles 单请求约 265s,因此它更适合作为低显存功能后备,不适合在线低延迟主路由。
468 585 n_ctx: 256
469 586 n_batch: 256
470 587 n_ubatch: 256
... ... @@ -484,22 +601,34 @@ services:
484 601 verbose: false
485 602 dashscope_rerank:
486 603 model_name: qwen3-rerank
  604 + # 按地域选择 endpoint:
  605 + # 中国: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
  606 + # 新加坡: https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks
  607 + # 美国: https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks
487 608 endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
488 609 api_key_env: RERANK_DASHSCOPE_API_KEY_CN
489 610 timeout_sec: 10.0
490   - top_n_cap: 0
491   - batchsize: 64
  611 + top_n_cap: 0 # 0 表示 top_n=当前请求文档数;>0 则限制 top_n 上限
  612 + batchsize: 64 # 0 关闭;>0 启用并发小包调度(top_n/top_n_cap 仍生效,分包后全局截断)
492 613 instruct: Given a shopping query, rank product titles by relevance
493 614 max_retries: 2
494 615 retry_backoff_sec: 0.2
  616 +
  617 +# SPU配置(已启用,使用嵌套skus)
495 618 spu_config:
496 619 enabled: true
497 620 spu_field: spu_id
498 621 inner_hits_size: 10
  622 + # 配置哪些option维度参与检索(进索引、以及在线搜索)
  623 + # 格式为list,选择option1/option2/option3中的一个或多个
499 624 searchable_option_dimensions:
500 625 - option1
501 626 - option2
502 627 - option3
  628 +
  629 +# 租户配置(Tenant Configuration)
  630 +# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选)
  631 +# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集
503 632 tenant_config:
504 633 default:
505 634 primary_language: en
... ...
scripts/evaluation/README.md 0 → 100644
... ... @@ -0,0 +1,330 @@
  1 +# Search Evaluation Framework
  2 +
  3 +This directory contains the offline annotation set builder, the online evaluation UI/API, the audit tooling, and the fusion-tuning runner for retrieval quality evaluation.
  4 +
  5 +It is designed around one core rule:
  6 +
  7 +- Annotation should be built offline first.
  8 +- Single-query evaluation should then map recalled `spu_id` values to the cached annotation set.
  9 +- Recalled items without cached labels are treated as `Irrelevant` during evaluation, and the UI/API returns a tip so the operator knows coverage is incomplete.
  10 +
  11 +## Goals
  12 +
  13 +The framework supports four related tasks:
  14 +
  15 +1. Build an annotation set for a fixed query set.
  16 +2. Evaluate a live search result list against that annotation set.
  17 +3. Run batch evaluation and store historical reports with config snapshots.
  18 +4. Tune fusion parameters reproducibly.
  19 +
  20 +## Files
  21 +
  22 +- `eval_framework.py`
  23 + Search evaluation core implementation, CLI, FastAPI app, SQLite store, audit logic, and report generation.
  24 +- `build_annotation_set.py`
  25 + Thin CLI entrypoint into `eval_framework.py`.
  26 +- `serve_eval_web.py`
  27 + Thin web entrypoint into `eval_framework.py`.
  28 +- `tune_fusion.py`
  29 + Fusion experiment runner. It applies config variants, restarts backend, runs batch evaluation, and stores experiment reports.
  30 +- `fusion_experiments_shortlist.json`
  31 + A compact experiment set for practical tuning.
  32 +- `fusion_experiments_round1.json`
  33 + A broader first-round experiment set.
  34 +- `queries/queries.txt`
  35 + The canonical evaluation query set.
  36 +- `README_Requirement.md`
  37 + Requirement reference document.
  38 +- `quick_start_eval.sh`
  39 + Optional wrapper to run the batch refresh or the web UI from repo root (uses `./.venv/bin/python`).
  40 +
  41 +## Quick start (from repo root)
  42 +
  43 +Set the tenant if needed (`export TENANT_ID=163`). Requires a live search API, a DashScope API key for LLM labeling, and a working backend for batch refresh.
  44 +
  45 +```bash
  46 +# 1) Refresh offline labels for every line in the queries file, then write batch metrics under artifacts/
  47 +./scripts/evaluation/quick_start_eval.sh batch
  48 +
  49 +# 2) Evaluation UI on http://127.0.0.1:6010/
  50 +./scripts/evaluation/quick_start_eval.sh serve
  51 +```
  52 +
  53 +Equivalent explicit commands:
  54 +
  55 +```bash
  56 +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \
  57 + --tenant-id "${TENANT_ID:-163}" \
  58 + --queries-file scripts/evaluation/queries/queries.txt \
  59 + --top-k 50 \
  60 + --language en \
  61 + --labeler-mode simple \
  62 + --force-refresh-labels
  63 +
  64 +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \
  65 + --tenant-id "${TENANT_ID:-163}" \
  66 + --queries-file scripts/evaluation/queries/queries.txt \
  67 + --host 127.0.0.1 \
  68 + --port 6010
  69 +```
  70 +
  71 +**Batch behavior:** There is no “skip queries already processed”. Each run walks the full queries file. With `--force-refresh-labels`, for **every** query the runner issues a live search and sends **all** `top_k` returned `spu_id`s through the LLM again (SQLite rows are upserted). Omit `--force-refresh-labels` if you only want to fill in labels that are missing for the current recall window.
  72 +
  73 +## Storage Layout
  74 +
  75 +All generated artifacts are under:
  76 +
  77 +- `/data/saas-search/artifacts/search_evaluation`
  78 +
  79 +Important subpaths:
  80 +
  81 +- `/data/saas-search/artifacts/search_evaluation/search_eval.sqlite3`
  82 + Main cache and annotation store.
  83 +- `/data/saas-search/artifacts/search_evaluation/query_builds`
  84 + Per-query pooled annotation-set build artifacts.
  85 +- `/data/saas-search/artifacts/search_evaluation/batch_reports`
  86 + Batch evaluation JSON, Markdown reports, and config snapshots.
  87 +- `/data/saas-search/artifacts/search_evaluation/audits`
  88 + Audit summaries for label quality checks.
  89 +- `/data/saas-search/artifacts/search_evaluation/tuning_runs`
  90 + Fusion experiment summaries and per-experiment config snapshots.
  91 +
  92 +## SQLite Schema Summary
  93 +
  94 +The main tables in `search_eval.sqlite3` are:
  95 +
  96 +- `corpus_docs`
  97 + Cached product corpus for the tenant.
  98 +- `rerank_scores`
  99 + Cached full-corpus reranker scores keyed by `(tenant_id, query_text, spu_id)`.
  100 +- `relevance_labels`
  101 + Cached LLM relevance labels keyed by `(tenant_id, query_text, spu_id)`.
  102 +- `query_profiles`
  103 + Structured query-intent profiles extracted before labeling.
  104 +- `build_runs`
  105 + Per-query pooled-build records.
  106 +- `batch_runs`
  107 + Batch evaluation history.
  108 +
  109 +## Label Semantics
  110 +
  111 +Three labels are used throughout:
  112 +
  113 +- `Exact`
  114 + Fully matches the intended product type and all explicit required attributes.
  115 +- `Partial`
  116 + Main intent matches, but explicit attributes are missing, approximate, or weaker than requested.
  117 +- `Irrelevant`
  118 + Product type mismatches, or explicit required attributes conflict.
  119 +
  120 +The framework always uses:
  121 +
  122 +- LLM-based batched relevance classification
  123 +- caching and retry logic for robust offline labeling
  124 +
  125 +There are now two labeler modes:
  126 +
  127 +- `simple`
  128 + Default. A single low-coupling LLM judging pass per batch, using the standard relevance prompt.
  129 +- `complex`
  130 + Legacy structured mode. It extracts query profiles and applies extra guardrails. Kept for comparison, but no longer the default.
  131 +
  132 +## Offline-First Workflow
  133 +
  134 +### 1. Refresh labels for the evaluation query set
  135 +
  136 +For practical evaluation, the most important offline step is to pre-label the result window you plan to score. For the current metrics (`P@5`, `P@10`, `P@20`, `P@50`, `MAP_3`, `MAP_2_3`), a `top_k=50` cached label set is sufficient.
  137 +
  138 +Example:
  139 +
  140 +```bash
  141 +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \
  142 + --tenant-id 163 \
  143 + --queries-file scripts/evaluation/queries/queries.txt \
  144 + --top-k 50 \
  145 + --language en \
  146 + --labeler-mode simple \
  147 + --force-refresh-labels
  148 +```
  149 +
  150 +This command does two things:
  151 +
  152 +- runs **every** query in the file against the live backend (no skip list)
  153 +- with `--force-refresh-labels`, re-labels **all** `top_k` hits per query via the LLM and upserts SQLite; without the flag, only `spu_id`s lacking a cached label are sent to the LLM
  154 +
  155 +After this step, single-query evaluation can run in cached mode without calling the LLM again.
  156 +
  157 +### 2. Optional pooled build
  158 +
  159 +The framework also supports a heavier pooled build that combines:
  160 +
  161 +- top search results
  162 +- top full-corpus reranker results
  163 +
  164 +Example:
  165 +
  166 +```bash
  167 +./.venv/bin/python scripts/evaluation/build_annotation_set.py build \
  168 + --tenant-id 163 \
  169 + --queries-file scripts/evaluation/queries/queries.txt \
  170 + --search-depth 1000 \
  171 + --rerank-depth 10000 \
  172 + --annotate-search-top-k 100 \
  173 + --annotate-rerank-top-k 120 \
  174 + --language en
  175 +```
  176 +
  177 +This is slower, but useful when you want a richer pooled annotation set beyond the current live recall window.
  178 +
  179 +## Why Single-Query Evaluation Was Slow
  180 +
  181 +If single-query evaluation is slow, the usual reason is that it is still running with `auto_annotate=true`, which means:
  182 +
  183 +- perform live search
  184 +- detect recalled but unlabeled products
  185 +- call the LLM to label them
  186 +
  187 +That is not the intended steady-state evaluation path.
  188 +
  189 +The UI/API is now configured to prefer cached evaluation:
  190 +
  191 +- default single-query evaluation uses `auto_annotate=false`
  192 +- unlabeled recalled results are treated as `Irrelevant`
  193 +- the response includes tips explaining that coverage gap
  194 +
  195 +If you want stable, fast evaluation:
  196 +
  197 +1. prebuild labels offline
  198 +2. use cached single-query evaluation
  199 +
  200 +## Web UI
  201 +
  202 +Start the evaluation UI:
  203 +
  204 +```bash
  205 +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \
  206 + --tenant-id 163 \
  207 + --queries-file scripts/evaluation/queries/queries.txt \
  208 + --host 127.0.0.1 \
  209 + --port 6010
  210 +```
  211 +
  212 +The UI provides:
  213 +
  214 +- query list loaded from `queries.txt`
  215 +- single-query evaluation
  216 +- batch evaluation
  217 +- history of batch reports
  218 +- top recalled results
  219 +- missed `Exact` and `Partial` products that were not recalled
  220 +- tips about unlabeled hits treated as `Irrelevant`
  221 +
  222 +### Single-query response behavior
  223 +
  224 +For a single query:
  225 +
  226 +1. live search returns recalled `spu_id` values
  227 +2. the framework looks up cached labels by `(query, spu_id)`
  228 +3. unlabeled recalled items are counted as `Irrelevant`
  229 +4. cached `Exact` and `Partial` products that were not recalled are listed under `Missed Exact / Partial`
  230 +
  231 +This makes the page useful as a real retrieval-evaluation view rather than only a search-result viewer.
  232 +
  233 +## CLI Commands
  234 +
  235 +### Build pooled annotation artifacts
  236 +
  237 +```bash
  238 +./.venv/bin/python scripts/evaluation/build_annotation_set.py build ...
  239 +```
  240 +
  241 +### Run batch evaluation
  242 +
  243 +```bash
  244 +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \
  245 + --tenant-id 163 \
  246 + --queries-file scripts/evaluation/queries/queries.txt \
  247 + --top-k 50 \
  248 + --language en \
  249 + --labeler-mode simple
  250 +```
  251 +
  252 +Use `--force-refresh-labels` if you want to rebuild the offline label cache for the recalled window first.
  253 +
  254 +### Audit annotation quality
  255 +
  256 +```bash
  257 +./.venv/bin/python scripts/evaluation/build_annotation_set.py audit \
  258 + --tenant-id 163 \
  259 + --queries-file scripts/evaluation/queries/queries.txt \
  260 + --top-k 50 \
  261 + --language en \
  262 + --labeler-mode simple
  263 +```
  264 +
  265 +This checks cached labels against current guardrails and reports suspicious cases.
  266 +
  267 +## Batch Reports
  268 +
  269 +Each batch run stores:
  270 +
  271 +- aggregate metrics
  272 +- per-query metrics
  273 +- label distribution
  274 +- timestamp
  275 +- config snapshot from `/admin/config`
  276 +
  277 +Reports are written as:
  278 +
  279 +- Markdown for easy reading
  280 +- JSON for downstream processing
  281 +
  282 +## Fusion Tuning
  283 +
  284 +The tuning runner applies experiment configs sequentially and records the outcome.
  285 +
  286 +Example:
  287 +
  288 +```bash
  289 +./.venv/bin/python scripts/evaluation/tune_fusion.py \
  290 + --tenant-id 163 \
  291 + --queries-file scripts/evaluation/queries/queries.txt \
  292 + --top-k 50 \
  293 + --language en \
  294 + --experiments-file scripts/evaluation/fusion_experiments_shortlist.json \
  295 + --score-metric MAP_3 \
  296 + --apply-best
  297 +```
  298 +
  299 +What it does:
  300 +
  301 +1. writes an experiment config into `config/config.yaml`
  302 +2. restarts backend
  303 +3. runs batch evaluation
  304 +4. stores the per-experiment result
  305 +5. optionally applies the best experiment at the end
  306 +
  307 +## Current Practical Recommendation
  308 +
  309 +For day-to-day evaluation:
  310 +
  311 +1. refresh the offline labels for the fixed query set with `batch --force-refresh-labels`
  312 +2. run the web UI or normal batch evaluation in cached mode
  313 +3. only force-refresh labels again when:
  314 + - the query set changes
  315 + - the product corpus changes materially
  316 + - the labeling logic changes
  317 +
  318 +## Caveats
  319 +
  320 +- The current label cache is query-specific, not a full all-products all-queries matrix.
  321 +- Single-query evaluation still depends on the live search API for recall, but not on the LLM if labels are already cached.
  322 +- The backend restart path in this environment can be briefly unstable immediately after startup; a short wait after restart is sometimes necessary for scripting.
  323 +- Some multilingual translation hints are noisy on long-tail fashion queries, which is one reason fusion tuning around translation weight matters.
  324 +
  325 +## Related Requirement Docs
  326 +
  327 +- `README_Requirement.md`
  328 +- `README_Requirement_zh.md`
  329 +
  330 +These documents describe the original problem statement. This `README.md` describes the implemented framework and the current recommended workflow.
... ...
scripts/evaluation/eval_framework.py
... ... @@ -39,7 +39,9 @@ RELEVANCE_IRRELEVANT = "Irrelevant"
39 39 VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}
40 40 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
41 41 DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt"
42   -JUDGE_PROMPT_VERSION = "v2_structured_20260331"
  42 +JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331"
  43 +JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331"
  44 +DEFAULT_LABELER_MODE = "simple"
43 45  
44 46  
45 47 def utc_now_iso() -> str:
... ... @@ -625,6 +627,57 @@ class DashScopeLabelClient:
625 627 content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
626 628 return content, safe_json_dumps(data)
627 629  
  630 + def classify_batch_simple(
  631 + self,
  632 + query: str,
  633 + docs: Sequence[Dict[str, Any]],
  634 + ) -> Tuple[List[str], str]:
  635 + numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
  636 + prompt = (
  637 + "You are an e-commerce search result relevance evaluation assistant. "
  638 + "Based on the user query and each product's information, output the relevance level for each product.\n\n"
  639 + "## Relevance Level Criteria\n"
  640 + "Exact — Fully matches the user's search intent.\n"
  641 + "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), "
  642 + "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n"
  643 + "Irrelevant — Category or use case mismatched, primary intent not satisfied.\n\n"
  644 + "Additional judging guidance:\n"
  645 + "- If the query clearly names a product type, product type matching has the highest priority. "
  646 + "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, "
  647 + "bra vs top, backpack vs bag are not interchangeable.\n"
  648 + "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n"
  649 + "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n"
  650 + "- Do not guess missing attributes.\n"
  651 + "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n"
  652 + "- Be conservative with Exact.\n\n"
  653 + f"Query: {query}\n\n"
  654 + "Products:\n"
  655 + + "\n".join(numbered_docs)
  656 + + "\n\n## Output Format\n"
  657 + f"Strictly output {len(docs)} lines, each line containing exactly one of Exact / Partial / Irrelevant. "
  658 + "They must correspond sequentially to the products above. Do not output any other information.\n"
  659 + )
  660 + content, raw_response = self._chat(prompt)
  661 + labels = []
  662 + for line in str(content or "").splitlines():
  663 + label = line.strip()
  664 + if label in VALID_LABELS:
  665 + labels.append(label)
  666 + if len(labels) != len(docs):
  667 + payload = _extract_json_blob(content)
  668 + if isinstance(payload, dict) and isinstance(payload.get("labels"), list):
  669 + labels = []
  670 + for item in payload["labels"][: len(docs)]:
  671 + if isinstance(item, dict):
  672 + label = str(item.get("label") or "").strip()
  673 + else:
  674 + label = str(item).strip()
  675 + if label in VALID_LABELS:
  676 + labels.append(label)
  677 + if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
  678 + raise ValueError(f"unexpected simple label output: {content!r}")
  679 + return labels, raw_response
  680 +
628 681 def extract_query_profile(
629 682 self,
630 683 query: str,
... ... @@ -665,7 +718,7 @@ class DashScopeLabelClient:
665 718 payload.setdefault("notes", [])
666 719 return payload, raw_response
667 720  
668   - def classify_batch(
  721 + def classify_batch_complex(
669 722 self,
670 723 query: str,
671 724 query_profile: Dict[str, Any],
... ... @@ -763,10 +816,12 @@ class SearchEvaluationFramework:
763 816 tenant_id: str,
764 817 artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
765 818 search_base_url: str = "http://localhost:6002",
  819 + labeler_mode: str = DEFAULT_LABELER_MODE,
766 820 ):
767 821 init_service(get_app_config().infrastructure.elasticsearch.host)
768 822 self.tenant_id = str(tenant_id)
769 823 self.artifact_root = ensure_dir(artifact_root)
  824 + self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
770 825 self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
771 826 self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
772 827 app_cfg = get_app_config()
... ... @@ -783,17 +838,24 @@ class SearchEvaluationFramework:
783 838 base_url=str(llm_cfg["base_url"]),
784 839 api_key=str(api_key),
785 840 )
786   - self.query_parser = get_query_parser()
  841 + self.query_parser = None
  842 +
  843 + def _get_query_parser(self):
  844 + if self.query_parser is None:
  845 + self.query_parser = get_query_parser()
  846 + return self.query_parser
787 847  
788 848 def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
789   - parsed = self.query_parser.parse(query, generate_vector=False, target_languages=["en", "zh"])
  849 + parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"])
790 850 payload = parsed.to_dict()
791 851 payload["text_for_rerank"] = parsed.text_for_rerank()
792 852 return payload
793 853  
794 854 def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
  855 + if self.labeler_mode != "complex":
  856 + raise RuntimeError("query profiles are only used in complex labeler mode")
795 857 if not force_refresh:
796   - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION)
  858 + cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)
797 859 if cached is not None:
798 860 return cached
799 861 parser_hints = self.build_query_parser_hints(query)
... ... @@ -802,7 +864,7 @@ class SearchEvaluationFramework:
802 864 self.store.upsert_query_profile(
803 865 self.tenant_id,
804 866 query,
805   - JUDGE_PROMPT_VERSION,
  867 + JUDGE_PROMPT_VERSION_COMPLEX,
806 868 self.label_client.model,
807 869 profile,
808 870 raw_response,
... ... @@ -955,9 +1017,24 @@ class SearchEvaluationFramework:
955 1017 *,
956 1018 top_k: int = 100,
957 1019 language: str = "en",
958   - auto_annotate: bool = True,
  1020 + auto_annotate: bool = False,
959 1021 ) -> Dict[str, Any]:
960 1022 live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
  1023 + if self.labeler_mode != "complex":
  1024 + labels = [
  1025 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  1026 + for item in live["results"]
  1027 + ]
  1028 + return {
  1029 + "query": query,
  1030 + "tenant_id": self.tenant_id,
  1031 + "top_k": top_k,
  1032 + "metrics": live["metrics"],
  1033 + "distribution": label_distribution(labels),
  1034 + "query_profile": None,
  1035 + "suspicious": [],
  1036 + "results": live["results"],
  1037 + }
961 1038 query_profile = self.get_query_profile(query, force_refresh=False)
962 1039 suspicious: List[Dict[str, Any]] = []
963 1040  
... ... @@ -1093,7 +1170,6 @@ class SearchEvaluationFramework:
1093 1170 docs: Sequence[Dict[str, Any]],
1094 1171 force_refresh: bool = False,
1095 1172 ) -> Dict[str, str]:
1096   - query_profile = self.get_query_profile(query, force_refresh=force_refresh)
1097 1173 labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
1098 1174 missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
1099 1175 if not missing_docs:
... ... @@ -1101,12 +1177,9 @@ class SearchEvaluationFramework:
1101 1177  
1102 1178 for start in range(0, len(missing_docs), self.label_client.batch_size):
1103 1179 batch = missing_docs[start : start + self.label_client.batch_size]
1104   - batch_pairs = self._classify_with_retry(query, query_profile, batch)
  1180 + batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh)
1105 1181 for sub_labels, raw_response, sub_batch in batch_pairs:
1106   - to_store = {
1107   - str(doc.get("spu_id")): self._apply_rule_based_label_guardrails(label, query_profile, doc)
1108   - for doc, label in zip(sub_batch, sub_labels)
1109   - }
  1182 + to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
1110 1183 self.store.upsert_labels(
1111 1184 self.tenant_id,
1112 1185 query,
... ... @@ -1121,19 +1194,28 @@ class SearchEvaluationFramework:
1121 1194 def _classify_with_retry(
1122 1195 self,
1123 1196 query: str,
1124   - query_profile: Dict[str, Any],
1125 1197 docs: Sequence[Dict[str, Any]],
  1198 + *,
  1199 + force_refresh: bool = False,
1126 1200 ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
1127 1201 if not docs:
1128 1202 return []
1129 1203 try:
1130   - labels, raw_response = self.label_client.classify_batch(query, query_profile, docs)
  1204 + if self.labeler_mode == "complex":
  1205 + query_profile = self.get_query_profile(query, force_refresh=force_refresh)
  1206 + labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)
  1207 + labels = [
  1208 + self._apply_rule_based_label_guardrails(label, query_profile, doc)
  1209 + for doc, label in zip(docs, labels)
  1210 + ]
  1211 + else:
  1212 + labels, raw_response = self.label_client.classify_batch_simple(query, docs)
1131 1213 return [(labels, raw_response, docs)]
1132 1214 except Exception:
1133 1215 if len(docs) == 1:
1134 1216 raise
1135 1217 mid = len(docs) // 2
1136   - return self._classify_with_retry(query, query_profile, docs[:mid]) + self._classify_with_retry(query, query_profile, docs[mid:])
  1218 + return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh)
1137 1219  
1138 1220 def build_query_annotation_set(
1139 1221 self,
... ... @@ -1163,7 +1245,6 @@ class SearchEvaluationFramework:
1163 1245 for item in full_rerank[:annotate_rerank_top_k]:
1164 1246 pool_docs[str(item["spu_id"])] = item["doc"]
1165 1247  
1166   - query_profile = self.get_query_profile(query, force_refresh=force_refresh_labels)
1167 1248 labels = self.annotate_missing_labels(
1168 1249 query=query,
1169 1250 docs=list(pool_docs.values()),
... ... @@ -1229,7 +1310,8 @@ class SearchEvaluationFramework:
1229 1310 "annotate_rerank_top_k": annotate_rerank_top_k,
1230 1311 "pool_size": len(pool_docs),
1231 1312 },
1232   - "query_profile": query_profile,
  1313 + "labeler_mode": self.labeler_mode,
  1314 + "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,
1233 1315 "metrics_top100": metrics,
1234 1316 "search_results": search_labeled_results,
1235 1317 "full_rerank_top": rerank_top_results,
... ... @@ -1250,7 +1332,7 @@ class SearchEvaluationFramework:
1250 1332 self,
1251 1333 query: str,
1252 1334 top_k: int = 100,
1253   - auto_annotate: bool = True,
  1335 + auto_annotate: bool = False,
1254 1336 language: str = "en",
1255 1337 force_refresh_labels: bool = False,
1256 1338 ) -> Dict[str, Any]:
... ... @@ -1259,16 +1341,21 @@ class SearchEvaluationFramework:
1259 1341 if auto_annotate:
1260 1342 self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
1261 1343 labels = self.store.get_labels(self.tenant_id, query)
  1344 + recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]}
1262 1345 labeled = []
  1346 + unlabeled_hits = 0
1263 1347 for rank, doc in enumerate(results[:top_k], start=1):
1264 1348 spu_id = str(doc.get("spu_id"))
  1349 + label = labels.get(spu_id)
  1350 + if label not in VALID_LABELS:
  1351 + unlabeled_hits += 1
1265 1352 labeled.append(
1266 1353 {
1267 1354 "rank": rank,
1268 1355 "spu_id": spu_id,
1269 1356 "title": build_display_title(doc),
1270 1357 "image_url": doc.get("image_url"),
1271   - "label": labels.get(spu_id),
  1358 + "label": label,
1272 1359 "option_values": list(compact_option_values(doc.get("skus") or [])),
1273 1360 "product": compact_product_payload(doc),
1274 1361 }
... ... @@ -1277,12 +1364,65 @@ class SearchEvaluationFramework:
1277 1364 item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
1278 1365 for item in labeled
1279 1366 ]
  1367 + label_stats = self.store.get_query_label_stats(self.tenant_id, query)
  1368 + rerank_scores = self.store.get_rerank_scores(self.tenant_id, query)
  1369 + relevant_missing_ids = [
  1370 + spu_id
  1371 + for spu_id, label in labels.items()
  1372 + if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids
  1373 + ]
  1374 + missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
  1375 + missing_relevant = []
  1376 + for spu_id in relevant_missing_ids:
  1377 + doc = missing_docs_map.get(spu_id)
  1378 + if not doc:
  1379 + continue
  1380 + missing_relevant.append(
  1381 + {
  1382 + "spu_id": spu_id,
  1383 + "label": labels[spu_id],
  1384 + "rerank_score": rerank_scores.get(spu_id),
  1385 + "title": build_display_title(doc),
  1386 + "image_url": doc.get("image_url"),
  1387 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  1388 + "product": compact_product_payload(doc),
  1389 + }
  1390 + )
  1391 + label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}
  1392 + missing_relevant.sort(
  1393 + key=lambda item: (
  1394 + label_order.get(str(item.get("label")), 9),
  1395 + -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")),
  1396 + str(item.get("title") or ""),
  1397 + )
  1398 + )
  1399 + tips: List[str] = []
  1400 + if auto_annotate:
  1401 + tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.")
  1402 + else:
  1403 + tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.")
  1404 + if label_stats["total"] == 0:
  1405 + tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.")
  1406 + if unlabeled_hits:
  1407 + tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
  1408 + if not missing_relevant:
  1409 + tips.append("No cached Exact/Partial products were missed by this recall set.")
1280 1410 return {
1281 1411 "query": query,
1282 1412 "tenant_id": self.tenant_id,
1283 1413 "top_k": top_k,
1284 1414 "metrics": compute_query_metrics(metric_labels),
1285 1415 "results": labeled,
  1416 + "missing_relevant": missing_relevant,
  1417 + "label_stats": {
  1418 + **label_stats,
  1419 + "unlabeled_hits_treated_irrelevant": unlabeled_hits,
  1420 + "recalled_hits": len(labeled),
  1421 + "missing_relevant_count": len(missing_relevant),
  1422 + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
  1423 + "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),
  1424 + },
  1425 + "tips": tips,
1286 1426 "total": int(search_payload.get("total") or 0),
1287 1427 }
1288 1428  
... ... @@ -1392,14 +1532,14 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
1392 1532 class SearchEvalRequest(BaseModel):
1393 1533 query: str
1394 1534 top_k: int = Field(default=100, ge=1, le=500)
1395   - auto_annotate: bool = True
  1535 + auto_annotate: bool = False
1396 1536 language: str = "en"
1397 1537  
1398 1538  
1399 1539 class BatchEvalRequest(BaseModel):
1400 1540 queries: Optional[List[str]] = None
1401 1541 top_k: int = Field(default=100, ge=1, le=500)
1402   - auto_annotate: bool = True
  1542 + auto_annotate: bool = False
1403 1543 language: str = "en"
1404 1544 force_refresh_labels: bool = False
1405 1545  
... ... @@ -1494,6 +1634,8 @@ WEB_APP_HTML = """
1494 1634 .options { color: var(--muted); line-height: 1.5; font-size: 14px; }
1495 1635 .section { margin-bottom: 28px; }
1496 1636 .history { font-size: 13px; line-height: 1.5; }
  1637 + .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; }
  1638 + .tip { margin-bottom: 6px; color: var(--muted); }
1497 1639 </style>
1498 1640 </head>
1499 1641 <body>
... ... @@ -1524,6 +1666,14 @@ WEB_APP_HTML = """
1524 1666 <h2>Top Results</h2>
1525 1667 <div id="results" class="results"></div>
1526 1668 </section>
  1669 + <section class="section">
  1670 + <h2>Missed Exact / Partial</h2>
  1671 + <div id="missingRelevant" class="results"></div>
  1672 + </section>
  1673 + <section class="section">
  1674 + <h2>Notes</h2>
  1675 + <div id="tips" class="tips muted"></div>
  1676 + </section>
1527 1677 </main>
1528 1678 </div>
1529 1679 <script>
... ... @@ -1542,15 +1692,15 @@ WEB_APP_HTML = """
1542 1692 root.appendChild(card);
1543 1693 });
1544 1694 }
1545   - function renderResults(results) {
1546   - const root = document.getElementById('results');
1547   - root.innerHTML = '';
  1695 + function renderResults(results, rootId='results', showRank=true) {
  1696 + const mount = document.getElementById(rootId);
  1697 + mount.innerHTML = '';
1548 1698 (results || []).forEach(item => {
1549 1699 const label = item.label || 'Unknown';
1550 1700 const box = document.createElement('div');
1551 1701 box.className = 'result';
1552 1702 box.innerHTML = `
1553   - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">#${item.rank}</div></div>
  1703 + <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
1554 1704 <img class="thumb" src="${item.image_url || ''}" alt="" />
1555 1705 <div>
1556 1706 <div class="title">${item.title || ''}</div>
... ... @@ -1560,8 +1710,18 @@ WEB_APP_HTML = """
1560 1710 <div>${(item.option_values || [])[2] || ''}</div>
1561 1711 </div>
1562 1712 </div>`;
1563   - root.appendChild(box);
  1713 + mount.appendChild(box);
1564 1714 });
  1715 + if (!(results || []).length) {
  1716 + mount.innerHTML = '<div class="muted">None.</div>';
  1717 + }
  1718 + }
  1719 + function renderTips(data) {
  1720 + const root = document.getElementById('tips');
  1721 + const tips = [...(data.tips || [])];
  1722 + const stats = data.label_stats || {};
  1723 + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`);
  1724 + root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');
1565 1725 }
1566 1726 async function loadQueries() {
1567 1727 const data = await fetchJSON('/api/queries');
... ... @@ -1592,11 +1752,13 @@ WEB_APP_HTML = """
1592 1752 const data = await fetchJSON('/api/search-eval', {
1593 1753 method: 'POST',
1594 1754 headers: {'Content-Type': 'application/json'},
1595   - body: JSON.stringify({query, top_k: 100, auto_annotate: true})
  1755 + body: JSON.stringify({query, top_k: 100, auto_annotate: false})
1596 1756 });
1597 1757 document.getElementById('status').textContent = `Done. total=${data.total}`;
1598 1758 renderMetrics(data.metrics);
1599   - renderResults(data.results);
  1759 + renderResults(data.results, 'results', true);
  1760 + renderResults(data.missing_relevant, 'missingRelevant', false);
  1761 + renderTips(data);
1600 1762 loadHistory();
1601 1763 }
1602 1764 async function runBatch() {
... ... @@ -1604,11 +1766,13 @@ WEB_APP_HTML = """
1604 1766 const data = await fetchJSON('/api/batch-eval', {
1605 1767 method: 'POST',
1606 1768 headers: {'Content-Type': 'application/json'},
1607   - body: JSON.stringify({top_k: 100, auto_annotate: true})
  1769 + body: JSON.stringify({top_k: 100, auto_annotate: false})
1608 1770 });
1609 1771 document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`;
1610 1772 renderMetrics(data.aggregate_metrics);
1611   - renderResults([]);
  1773 + renderResults([], 'results', true);
  1774 + renderResults([], 'missingRelevant', false);
  1775 + document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>';
1612 1776 loadHistory();
1613 1777 }
1614 1778 loadQueries();
... ... @@ -1633,6 +1797,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
1633 1797 build.add_argument("--language", default="en")
1634 1798 build.add_argument("--force-refresh-rerank", action="store_true")
1635 1799 build.add_argument("--force-refresh-labels", action="store_true")
  1800 + build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1636 1801  
1637 1802 batch = sub.add_parser("batch", help="Run batch evaluation against live search")
1638 1803 batch.add_argument("--tenant-id", default="163")
... ... @@ -1640,6 +1805,7 @@ def build_cli_parser() -> argparse.ArgumentParser:
1640 1805 batch.add_argument("--top-k", type=int, default=100)
1641 1806 batch.add_argument("--language", default="en")
1642 1807 batch.add_argument("--force-refresh-labels", action="store_true")
  1808 + batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1643 1809  
1644 1810 audit = sub.add_parser("audit", help="Audit annotation quality for queries")
1645 1811 audit.add_argument("--tenant-id", default="163")
... ... @@ -1648,18 +1814,20 @@ def build_cli_parser() -> argparse.ArgumentParser:
1648 1814 audit.add_argument("--language", default="en")
1649 1815 audit.add_argument("--limit-suspicious", type=int, default=5)
1650 1816 audit.add_argument("--force-refresh-labels", action="store_true")
  1817 + audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1651 1818  
1652 1819 serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
1653 1820 serve.add_argument("--tenant-id", default="163")
1654 1821 serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
1655 1822 serve.add_argument("--host", default="0.0.0.0")
1656 1823 serve.add_argument("--port", type=int, default=6010)
  1824 + serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1657 1825  
1658 1826 return parser
1659 1827  
1660 1828  
1661 1829 def run_build(args: argparse.Namespace) -> None:
1662   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
  1830 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1663 1831 queries = framework.queries_from_file(Path(args.queries_file))
1664 1832 summary = []
1665 1833 for query in queries:
... ... @@ -1694,7 +1862,7 @@ def run_build(args: argparse.Namespace) -> None:
1694 1862  
1695 1863  
1696 1864 def run_batch(args: argparse.Namespace) -> None:
1697   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
  1865 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1698 1866 queries = framework.queries_from_file(Path(args.queries_file))
1699 1867 payload = framework.batch_evaluate(
1700 1868 queries=queries,
... ... @@ -1707,7 +1875,7 @@ def run_batch(args: argparse.Namespace) -> None:
1707 1875  
1708 1876  
1709 1877 def run_audit(args: argparse.Namespace) -> None:
1710   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
  1878 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1711 1879 queries = framework.queries_from_file(Path(args.queries_file))
1712 1880 audit_items = []
1713 1881 for query in queries:
... ... @@ -1757,7 +1925,7 @@ def run_audit(args: argparse.Namespace) -> None:
1757 1925  
1758 1926  
1759 1927 def run_serve(args: argparse.Namespace) -> None:
1760   - framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
  1928 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1761 1929 app = create_web_app(framework, Path(args.queries_file))
1762 1930 import uvicorn
1763 1931  
... ...
scripts/evaluation/queries/queries.txt
... ... @@ -30,7 +30,6 @@ khaki green backpack
30 30 高跟鞋
31 31 图案连身衣
32 32 天鹅绒鸡尾酒会礼服
33   -Wearing small clothes
34 33 gingham dress
35 34 海滩度假装
36 35 vacation outfits
... ... @@ -41,10 +40,15 @@ hiking boots
41 40 business casual women
42 41 a-line dress
43 42 涤纶短裤
44   -哺乳文胸
45 43 Compression Top Spandex
46 44 skiing trip insulated base layer
47 45 high waisted jeans
48 46 无袖夏装
49 47 雪纺衬衫
50   -convertible zip-off hiking pants
51 48 \ No newline at end of file
  49 +convertible zip-off hiking pants
  50 +petite summer linen shorts
  51 +tall slim fit men's linen shirt
  52 +tall slim fit trousers
  53 +tall straight leg pants
  54 +tassel maxi skirt
  55 +teacher clothes
52 56 \ No newline at end of file
... ...
scripts/evaluation/quick_start_eval.sh 0 → 100755
... ... @@ -0,0 +1,39 @@
  1 +#!/usr/bin/env bash
  2 +# Search evaluation quick entrypoints. Run from any cwd; resolves repo root.
  3 +set -euo pipefail
  4 +
  5 +ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
  6 +cd "$ROOT"
  7 +PY="${ROOT}/.venv/bin/python"
  8 +TENANT_ID="${TENANT_ID:-163}"
  9 +QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
  10 +
  11 +usage() {
  12 + echo "Usage: $0 batch|serve"
  13 + echo " batch — refresh labels + batch metrics (default: top_k=50, simple labeler, force-refresh)"
  14 + echo " serve — eval UI on http://127.0.0.1:6010/"
  15 + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES (default $QUERIES)"
  16 +}
  17 +
  18 +case "${1:-}" in
  19 + batch)
  20 + exec "$PY" scripts/evaluation/build_annotation_set.py batch \
  21 + --tenant-id "$TENANT_ID" \
  22 + --queries-file "$QUERIES" \
  23 + --top-k 50 \
  24 + --language en \
  25 + --labeler-mode simple \
  26 + --force-refresh-labels
  27 + ;;
  28 + serve)
  29 + exec "$PY" scripts/evaluation/serve_eval_web.py serve \
  30 + --tenant-id "$TENANT_ID" \
  31 + --queries-file "$QUERIES" \
  32 + --host 127.0.0.1 \
  33 + --port 6010
  34 + ;;
  35 + *)
  36 + usage
  37 + exit 1
  38 + ;;
  39 +esac
... ...