Commit 881d338b3acc0b3de1bb3cfb77f4fc69755bb0f7

Authored by tangwang
1 parent 432d1c88

评估框架

config/config.yaml
  1 +# Unified Configuration for Multi-Tenant Search Engine
  2 +# 统一配置文件,所有租户共用一套配置
  3 +# 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为
  4 +#
  5 +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项
  6 +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。
  7 +
  8 +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义)
1 runtime: 9 runtime:
2 environment: prod 10 environment: prod
3 index_namespace: '' 11 index_namespace: ''
@@ -13,6 +21,8 @@ runtime: @@ -13,6 +21,8 @@ runtime:
13 translator_port: 6006 21 translator_port: 6006
14 reranker_host: 0.0.0.0 22 reranker_host: 0.0.0.0
15 reranker_port: 6007 23 reranker_port: 6007
  24 +
  25 +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
16 infrastructure: 26 infrastructure:
17 elasticsearch: 27 elasticsearch:
18 host: http://localhost:9200 28 host: http://localhost:9200
@@ -39,16 +49,30 @@ infrastructure: @@ -39,16 +49,30 @@ infrastructure:
39 secrets: 49 secrets:
40 dashscope_api_key: null 50 dashscope_api_key: null
41 deepl_auth_key: null 51 deepl_auth_key: null
  52 +
  53 +# Elasticsearch Index
42 es_index_name: search_products 54 es_index_name: search_products
  55 +
  56 +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出)
43 indexes: [] 57 indexes: []
  58 +
  59 +# Config assets
44 assets: 60 assets:
45 query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict 61 query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict
  62 +
  63 +# Product content understanding (LLM enrich-content) configuration
46 product_enrich: 64 product_enrich:
47 max_workers: 40 65 max_workers: 40
  66 +
  67 +# ES Index Settings (基础设置)
48 es_settings: 68 es_settings:
49 number_of_shards: 1 69 number_of_shards: 1
50 number_of_replicas: 0 70 number_of_replicas: 0
51 refresh_interval: 30s 71 refresh_interval: 30s
  72 +
  73 +# 字段权重配置(用于搜索时的字段boost)
  74 +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。
  75 +# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
52 field_boosts: 76 field_boosts:
53 title: 3.0 77 title: 3.0
54 qanchors: 2.5 78 qanchors: 2.5
@@ -61,21 +85,39 @@ field_boosts: @@ -61,21 +85,39 @@ field_boosts:
61 option1_values: 1.5 85 option1_values: 1.5
62 option2_values: 1.5 86 option2_values: 1.5
63 option3_values: 1.5 87 option3_values: 1.5
  88 +
  89 +# Query Configuration(查询配置)
64 query_config: 90 query_config:
  91 + # 支持的语言
65 supported_languages: 92 supported_languages:
66 - zh 93 - zh
67 - en 94 - en
68 default_language: en 95 default_language: en
  96 +
  97 + # 功能开关(翻译开关由tenant_config控制)
69 enable_text_embedding: true 98 enable_text_embedding: true
70 enable_query_rewrite: true 99 enable_query_rewrite: true
71 - zh_to_en_model: nllb-200-distilled-600m  
72 - en_to_zh_model: nllb-200-distilled-600m 100 +
  101 + # 查询翻译模型(须与 services.translation.capabilities 中某项一致)
  102 + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。
  103 + zh_to_en_model: nllb-200-distilled-600m # "opus-mt-zh-en"
  104 + en_to_zh_model: nllb-200-distilled-600m # "opus-mt-en-zh"
73 default_translation_model: nllb-200-distilled-600m 105 default_translation_model: nllb-200-distilled-600m
  106 + # zh_to_en_model: deepl
  107 + # en_to_zh_model: deepl
  108 + # default_translation_model: deepl
  109 + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同)
74 zh_to_en_model__source_not_in_index: nllb-200-distilled-600m 110 zh_to_en_model__source_not_in_index: nllb-200-distilled-600m
75 en_to_zh_model__source_not_in_index: nllb-200-distilled-600m 111 en_to_zh_model__source_not_in_index: nllb-200-distilled-600m
76 default_translation_model__source_not_in_index: nllb-200-distilled-600m 112 default_translation_model__source_not_in_index: nllb-200-distilled-600m
77 - translation_embedding_wait_budget_ms_source_in_index: 200  
78 - translation_embedding_wait_budget_ms_source_not_in_index: 300 113 + # zh_to_en_model__source_not_in_index: deepl
  114 + # en_to_zh_model__source_not_in_index: deepl
  115 + # default_translation_model__source_not_in_index: deepl
  116 +
  117 + # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
  118 + # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
  119 + translation_embedding_wait_budget_ms_source_in_index: 200 # 80
  120 + translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200
79 style_intent: 121 style_intent:
80 enabled: true 122 enabled: true
81 selected_sku_boost: 1.2 123 selected_sku_boost: 1.2
@@ -102,6 +144,10 @@ query_config: @@ -102,6 +144,10 @@ query_config:
102 product_title_exclusion: 144 product_title_exclusion:
103 enabled: true 145 enabled: true
104 dictionary_path: config/dictionaries/product_title_exclusion.tsv 146 dictionary_path: config/dictionaries/product_title_exclusion.tsv
  147 +
  148 + # 动态多语言检索字段配置
  149 + # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
  150 + # shared_fields 为无语言后缀字段。
105 search_fields: 151 search_fields:
106 multilingual_fields: 152 multilingual_fields:
107 - title 153 - title
@@ -111,11 +157,14 @@ query_config: @@ -111,11 +157,14 @@ query_config:
111 - brief 157 - brief
112 - description 158 - description
113 - vendor 159 - vendor
  160 + # shared_fields: 无语言后缀字段;示例: tags, option1_values, option2_values, option3_values
114 shared_fields: null 161 shared_fields: null
115 core_multilingual_fields: 162 core_multilingual_fields:
116 - title 163 - title
117 - qanchors 164 - qanchors
118 - category_name_text 165 - category_name_text
  166 +
  167 + # 统一文本召回策略(主查询 + 翻译查询)
119 text_query_strategy: 168 text_query_strategy:
120 base_minimum_should_match: 60% 169 base_minimum_should_match: 60%
121 translation_minimum_should_match: 60% 170 translation_minimum_should_match: 60%
@@ -130,8 +179,14 @@ query_config: @@ -130,8 +179,14 @@ query_config:
130 title: 5.0 179 title: 5.0
131 qanchors: 4.0 180 qanchors: 4.0
132 phrase_match_boost: 3.0 181 phrase_match_boost: 3.0
  182 +
  183 + # Embedding字段名称
133 text_embedding_field: title_embedding 184 text_embedding_field: title_embedding
134 image_embedding_field: image_embedding.vector 185 image_embedding_field: image_embedding.vector
  186 +
  187 + # 返回字段配置(_source includes)
  188 + # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段
  189 + # 下列字段与 api/result_formatter.py(SpuResult 填充)及 search/searcher.py(SKU 排序/主图替换)一致
135 source_fields: 190 source_fields:
136 - spu_id 191 - spu_id
137 - handle 192 - handle
@@ -163,18 +218,26 @@ query_config: @@ -163,18 +218,26 @@ query_config:
163 - option3_values 218 - option3_values
164 - specifications 219 - specifications
165 - skus 220 - skus
  221 +
  222 + # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates)
166 knn_text_boost: 4 223 knn_text_boost: 4
167 knn_image_boost: 4 224 knn_image_boost: 4
  225 +
  226 + # knn_text_num_candidates = k * 3.5(long 档约为 k * 3)
168 knn_text_k: 160 227 knn_text_k: 160
169 knn_text_num_candidates: 560 228 knn_text_num_candidates: 560
170 knn_text_k_long: 400 229 knn_text_k_long: 400
171 knn_text_num_candidates_long: 1200 230 knn_text_num_candidates_long: 1200
172 knn_image_k: 400 231 knn_image_k: 400
173 knn_image_num_candidates: 1200 232 knn_image_num_candidates: 1200
  233 +
  234 +# Function Score配置(ES层打分规则)
174 function_score: 235 function_score:
175 score_mode: sum 236 score_mode: sum
176 boost_mode: multiply 237 boost_mode: multiply
177 functions: [] 238 functions: []
  239 +
  240 +# 粗排配置(仅融合 ES 文本/向量信号,不调用模型)
178 coarse_rank: 241 coarse_rank:
179 enabled: true 242 enabled: true
180 input_window: 700 243 input_window: 700
@@ -182,12 +245,16 @@ coarse_rank: @@ -182,12 +245,16 @@ coarse_rank:
182 fusion: 245 fusion:
183 text_bias: 0.1 246 text_bias: 0.1
184 text_exponent: 0.35 247 text_exponent: 0.35
  248 + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合)
  249 + # 因为 ES 的打分已经对 trans 查询做过折扣,所以这里不再继续折扣
185 text_translation_weight: 1.0 250 text_translation_weight: 1.0
186 knn_text_weight: 1.0 251 knn_text_weight: 1.0
187 knn_image_weight: 1.0 252 knn_image_weight: 1.0
188 knn_tie_breaker: 0.1 253 knn_tie_breaker: 0.1
189 knn_bias: 0.6 254 knn_bias: 0.6
190 knn_exponent: 0.0 255 knn_exponent: 0.0
  256 +
  257 +# 精排配置(轻量 reranker)
191 fine_rank: 258 fine_rank:
192 enabled: false 259 enabled: false
193 input_window: 160 260 input_window: 160
@@ -196,6 +263,8 @@ fine_rank: @@ -196,6 +263,8 @@ fine_rank:
196 rerank_query_template: '{query}' 263 rerank_query_template: '{query}'
197 rerank_doc_template: '{title}' 264 rerank_doc_template: '{title}'
198 service_profile: fine 265 service_profile: fine
  266 +
  267 +# 重排配置(provider/URL 在 services.rerank)
199 rerank: 268 rerank:
200 enabled: true 269 enabled: true
201 rerank_window: 160 270 rerank_window: 160
@@ -205,6 +274,11 @@ rerank: @@ -205,6 +274,11 @@ rerank:
205 rerank_query_template: '{query}' 274 rerank_query_template: '{query}'
206 rerank_doc_template: '{title}' 275 rerank_doc_template: '{title}'
207 service_profile: default 276 service_profile: default
  277 +
  278 + # 乘法融合:fused = Π (max(score, 0) + bias) ** exponent(对 rerank / text / knn 三项连乘)
  279 + # 其中 knn_score 先做一层 dis_max:
  280 + # max(knn_text_weight * text_knn, knn_image_weight * image_knn)
  281 + # + knn_tie_breaker * 另一侧较弱信号
208 fusion: 282 fusion:
209 rerank_bias: 1.0e-05 283 rerank_bias: 1.0e-05
210 rerank_exponent: 1.15 284 rerank_exponent: 1.15
@@ -212,22 +286,29 @@ rerank: @@ -212,22 +286,29 @@ rerank:
212 fine_exponent: 1.0 286 fine_exponent: 1.0
213 text_bias: 0.1 287 text_bias: 0.1
214 text_exponent: 0.25 288 text_exponent: 0.25
  289 + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合)
215 text_translation_weight: 0.8 290 text_translation_weight: 0.8
216 knn_text_weight: 1.0 291 knn_text_weight: 1.0
217 knn_image_weight: 1.0 292 knn_image_weight: 1.0
218 knn_tie_breaker: 0.1 293 knn_tie_breaker: 0.1
219 knn_bias: 0.6 294 knn_bias: 0.6
220 knn_exponent: 0.0 295 knn_exponent: 0.0
  296 +
  297 +# 可扩展服务/provider 注册表(单一配置源)
221 services: 298 services:
222 translation: 299 translation:
223 service_url: http://127.0.0.1:6006 300 service_url: http://127.0.0.1:6006
  301 + # default_model: nllb-200-distilled-600m
224 default_model: nllb-200-distilled-600m 302 default_model: nllb-200-distilled-600m
225 default_scene: general 303 default_scene: general
226 timeout_sec: 10.0 304 timeout_sec: 10.0
227 cache: 305 cache:
228 ttl_seconds: 62208000 306 ttl_seconds: 62208000
229 sliding_expiration: true 307 sliding_expiration: true
  308 + # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups).
230 enable_model_quality_tier_cache: true 309 enable_model_quality_tier_cache: true
  310 + # Higher tier = better quality. Multiple models may share one tier (同级).
  311 + # A request whose model has tier A may reuse Redis keys from models whose tier is >= A (never from lower tiers).
231 model_quality_tiers: 312 model_quality_tiers:
232 deepl: 30 313 deepl: 30
233 qwen-mt: 30 314 qwen-mt: 30
@@ -321,12 +402,13 @@ services: @@ -321,12 +402,13 @@ services:
321 num_beams: 1 402 num_beams: 1
322 use_cache: true 403 use_cache: true
323 embedding: 404 embedding:
324 - provider: http 405 + provider: http # http
325 providers: 406 providers:
326 http: 407 http:
327 text_base_url: http://127.0.0.1:6005 408 text_base_url: http://127.0.0.1:6005
328 image_base_url: http://127.0.0.1:6008 409 image_base_url: http://127.0.0.1:6008
329 - backend: tei 410 + # 服务内文本后端(embedding 进程启动时读取)
  411 + backend: tei # tei | local_st
330 backends: 412 backends:
331 tei: 413 tei:
332 base_url: http://127.0.0.1:8080 414 base_url: http://127.0.0.1:8080
@@ -337,7 +419,10 @@ services: @@ -337,7 +419,10 @@ services:
337 device: cuda 419 device: cuda
338 batch_size: 32 420 batch_size: 32
339 normalize_embeddings: true 421 normalize_embeddings: true
340 - image_backend: clip_as_service 422 + # 服务内图片后端(embedding 进程启动时读取;cnclip gRPC 与 6008 须同一 model_name)
  423 + # Chinese-CLIP:ViT-H-14 → 1024 维,ViT-L-14 → 768 维。须与 mappings/search_products.json 中
  424 + # image_embedding.vector.dims 一致(当前索引为 1024 → 默认 ViT-H-14)。
  425 + image_backend: clip_as_service # clip_as_service | local_cnclip
341 image_backends: 426 image_backends:
342 clip_as_service: 427 clip_as_service:
343 server: grpc://127.0.0.1:51000 428 server: grpc://127.0.0.1:51000
@@ -364,6 +449,7 @@ services: @@ -364,6 +449,7 @@ services:
364 max_docs: 1000 449 max_docs: 1000
365 normalize: true 450 normalize: true
366 default_instance: default 451 default_instance: default
  452 + # 命名实例:同一套 reranker 代码按实例名读取不同端口 / 后端 / runtime 目录。
367 instances: 453 instances:
368 default: 454 default:
369 host: 0.0.0.0 455 host: 0.0.0.0
@@ -405,11 +491,29 @@ services: @@ -405,11 +491,29 @@ services:
405 enforce_eager: false 491 enforce_eager: false
406 infer_batch_size: 100 492 infer_batch_size: 100
407 sort_by_doc_length: true 493 sort_by_doc_length: true
408 - instruction_format: standard 494 + # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct)
 408 - instruction_format: standard 494 + # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct)
  495 + instruction_format: standard # compact | standard
  496 + # instruction: "Given a query, score the product for relevance"
  497 + # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点
  498 + # instruction: "rank products by given query, category match first"
  499 + # instruction: "Rank products by query relevance, prioritizing category match"
  500 + # instruction: "Rank products by query relevance, prioritizing category and style match"
  501 + # instruction: "Rank by query relevance, prioritize category & style"
  502 + # instruction: "Relevance ranking: category & style match first"
  503 + # instruction: "Score product relevance by query with category & style match prioritized"
  504 + # instruction: "Rank products by query with category & style match prioritized"
  505 + # instruction: "Given a fashion shopping query, retrieve relevant products that answer the query"
409 instruction: rank products by given query 506 instruction: rank products by given query
  507 + # vLLM LLM.score()(跨编码打分)。独立高性能环境 .venv-reranker-score(vllm 0.18 固定版):./scripts/setup_reranker_venv.sh qwen3_vllm_score
  508 + # 与 qwen3_vllm 可共用同一 model_name / HF 缓存;venv 分离以便升级 vLLM 而不影响 generate 后端。
410 qwen3_vllm_score: 509 qwen3_vllm_score:
411 model_name: Qwen/Qwen3-Reranker-0.6B 510 model_name: Qwen/Qwen3-Reranker-0.6B
  511 + # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false
412 use_original_qwen3_hf_overrides: true 512 use_original_qwen3_hf_overrides: true
  513 + # vllm_runner: "auto"
  514 + # vllm_convert: "auto"
  515 + # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并
  516 + # hf_overrides: {}
413 engine: vllm 517 engine: vllm
414 max_model_len: 172 518 max_model_len: 172
415 tensor_parallel_size: 1 519 tensor_parallel_size: 1
@@ -419,15 +523,23 @@ services: @@ -419,15 +523,23 @@ services:
419 enforce_eager: false 523 enforce_eager: false
420 infer_batch_size: 80 524 infer_batch_size: 80
421 sort_by_doc_length: true 525 sort_by_doc_length: true
422 - instruction_format: standard 526 + # 默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致
  527 + instruction_format: standard # compact | standard
  528 + # instruction: "Rank products by query with category & style match prioritized"
  529 + # instruction: "Given a shopping query, rank products by relevance"
423 instruction: Rank products by query with category & style match prioritized 530 instruction: Rank products by query with category & style match prioritized
424 qwen3_transformers: 531 qwen3_transformers:
425 model_name: Qwen/Qwen3-Reranker-0.6B 532 model_name: Qwen/Qwen3-Reranker-0.6B
426 instruction: rank products by given query 533 instruction: rank products by given query
  534 + # instruction: "Score the product’s relevance to the given query"
427 max_length: 8192 535 max_length: 8192
428 batch_size: 64 536 batch_size: 64
429 use_fp16: true 537 use_fp16: true
  538 + # sdpa:默认无需 flash-attn;若已安装 flash_attn 可改为 flash_attention_2
430 attn_implementation: sdpa 539 attn_implementation: sdpa
  540 + # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask.
  541 + # For 1 query + many short docs (for example 400 product titles), this usually reduces
  542 + # repeated prefix work and padding waste compared with pairwise batching.
431 qwen3_transformers_packed: 543 qwen3_transformers_packed:
432 model_name: Qwen/Qwen3-Reranker-0.6B 544 model_name: Qwen/Qwen3-Reranker-0.6B
433 instruction: Rank products by query with category & style match prioritized 545 instruction: Rank products by query with category & style match prioritized
@@ -436,6 +548,8 @@ services: @@ -436,6 +548,8 @@ services:
436 max_docs_per_pack: 0 548 max_docs_per_pack: 0
437 use_fp16: true 549 use_fp16: true
438 sort_by_doc_length: true 550 sort_by_doc_length: true
  551 + # Packed mode relies on a custom 4D attention mask. "eager" is the safest default.
  552 + # If your torch/transformers stack validates it, you can benchmark "sdpa".
439 attn_implementation: eager 553 attn_implementation: eager
440 qwen3_gguf: 554 qwen3_gguf:
441 repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF 555 repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF
@@ -443,6 +557,7 @@ services: @@ -443,6 +557,7 @@ services:
443 cache_dir: ./model_cache 557 cache_dir: ./model_cache
444 local_dir: ./models/reranker/qwen3-reranker-4b-gguf 558 local_dir: ./models/reranker/qwen3-reranker-4b-gguf
445 instruction: Rank products by query with category & style match prioritized 559 instruction: Rank products by query with category & style match prioritized
  560 + # T4 16GB / 性能优先配置:全量层 offload,实测比保守配置明显更快
446 n_ctx: 512 561 n_ctx: 512
447 n_batch: 512 562 n_batch: 512
448 n_ubatch: 512 563 n_ubatch: 512
@@ -465,6 +580,8 @@ services: @@ -465,6 +580,8 @@ services:
465 cache_dir: ./model_cache 580 cache_dir: ./model_cache
466 local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf 581 local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf
467 instruction: Rank products by query with category & style match prioritized 582 instruction: Rank products by query with category & style match prioritized
  583 + # 0.6B GGUF / online rerank baseline:
  584 + # 实测 400 titles 单请求约 265s,因此它更适合作为低显存功能后备,不适合在线低延迟主路由。
468 n_ctx: 256 585 n_ctx: 256
469 n_batch: 256 586 n_batch: 256
470 n_ubatch: 256 587 n_ubatch: 256
@@ -484,22 +601,34 @@ services: @@ -484,22 +601,34 @@ services:
484 verbose: false 601 verbose: false
485 dashscope_rerank: 602 dashscope_rerank:
486 model_name: qwen3-rerank 603 model_name: qwen3-rerank
  604 + # 按地域选择 endpoint:
  605 + # 中国: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
  606 + # 新加坡: https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks
  607 + # 美国: https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks
487 endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks 608 endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
488 api_key_env: RERANK_DASHSCOPE_API_KEY_CN 609 api_key_env: RERANK_DASHSCOPE_API_KEY_CN
489 timeout_sec: 10.0 610 timeout_sec: 10.0
490 - top_n_cap: 0  
491 - batchsize: 64 611 + top_n_cap: 0 # 0 表示 top_n=当前请求文档数;>0 则限制 top_n 上限
  612 + batchsize: 64 # 0 关闭;>0 启用并发小包调度(top_n/top_n_cap 仍生效,分包后全局截断)
492 instruct: Given a shopping query, rank product titles by relevance 613 instruct: Given a shopping query, rank product titles by relevance
493 max_retries: 2 614 max_retries: 2
494 retry_backoff_sec: 0.2 615 retry_backoff_sec: 0.2
  616 +
  617 +# SPU配置(已启用,使用嵌套skus)
495 spu_config: 618 spu_config:
496 enabled: true 619 enabled: true
497 spu_field: spu_id 620 spu_field: spu_id
498 inner_hits_size: 10 621 inner_hits_size: 10
  622 + # 配置哪些option维度参与检索(进索引、以及在线搜索)
  623 + # 格式为list,选择option1/option2/option3中的一个或多个
499 searchable_option_dimensions: 624 searchable_option_dimensions:
500 - option1 625 - option1
501 - option2 626 - option2
502 - option3 627 - option3
  628 +
  629 +# 租户配置(Tenant Configuration)
  630 +# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选)
  631 +# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集
503 tenant_config: 632 tenant_config:
504 default: 633 default:
505 primary_language: en 634 primary_language: en
scripts/evaluation/README.md 0 → 100644
@@ -0,0 +1,330 @@ @@ -0,0 +1,330 @@
  1 +# Search Evaluation Framework
  2 +
  3 +This directory contains the offline annotation set builder, the online evaluation UI/API, the audit tooling, and the fusion-tuning runner for retrieval quality evaluation.
  4 +
  5 +It is designed around one core rule:
  6 +
  7 +- Annotation should be built offline first.
  8 +- Single-query evaluation should then map recalled `spu_id` values to the cached annotation set.
  9 +- Recalled items without cached labels are treated as `Irrelevant` during evaluation, and the UI/API returns a tip so the operator knows coverage is incomplete.
  10 +
  11 +## Goals
  12 +
  13 +The framework supports four related tasks:
  14 +
  15 +1. Build an annotation set for a fixed query set.
  16 +2. Evaluate a live search result list against that annotation set.
  17 +3. Run batch evaluation and store historical reports with config snapshots.
  18 +4. Tune fusion parameters reproducibly.
  19 +
  20 +## Files
  21 +
  22 +- `eval_framework.py`
  23 + Search evaluation core implementation, CLI, FastAPI app, SQLite store, audit logic, and report generation.
  24 +- `build_annotation_set.py`
  25 + Thin CLI entrypoint into `eval_framework.py`.
  26 +- `serve_eval_web.py`
  27 + Thin web entrypoint into `eval_framework.py`.
  28 +- `tune_fusion.py`
  29 + Fusion experiment runner. It applies config variants, restarts backend, runs batch evaluation, and stores experiment reports.
  30 +- `fusion_experiments_shortlist.json`
  31 + A compact experiment set for practical tuning.
  32 +- `fusion_experiments_round1.json`
  33 + A broader first-round experiment set.
  34 +- `queries/queries.txt`
  35 + The canonical evaluation query set.
  36 +- `README_Requirement.md`
  37 + Requirement reference document.
  38 +- `quick_start_eval.sh`
  39 + Optional wrapper to run the batch refresh or the web UI from repo root (uses `./.venv/bin/python`).
  40 +
  41 +## Quick start (from repo root)
  42 +
  43 +Set tenant if needed (`export TENANT_ID=163`). Requires live search API, DashScope key for LLM when labeling, and for batch refresh a working backend.
  44 +
  45 +```bash
  46 +# 1) Refresh offline labels for every line in the queries file, then write batch metrics under artifacts/
  47 +./scripts/evaluation/quick_start_eval.sh batch
  48 +
  49 +# 2) Evaluation UI on http://127.0.0.1:6010/
  50 +./scripts/evaluation/quick_start_eval.sh serve
  51 +```
  52 +
  53 +Equivalent explicit commands:
  54 +
  55 +```bash
  56 +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \
  57 + --tenant-id "${TENANT_ID:-163}" \
  58 + --queries-file scripts/evaluation/queries/queries.txt \
  59 + --top-k 50 \
  60 + --language en \
  61 + --labeler-mode simple \
  62 + --force-refresh-labels
  63 +
  64 +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \
  65 + --tenant-id "${TENANT_ID:-163}" \
  66 + --queries-file scripts/evaluation/queries/queries.txt \
  67 + --host 127.0.0.1 \
  68 + --port 6010
  69 +```
  70 +
  71 +**Batch behavior:** There is no “skip queries already processed”. Each run walks the full queries file. With `--force-refresh-labels`, for **every** query the runner issues a live search and sends **all** `top_k` returned `spu_id`s through the LLM again (SQLite rows are upserted). Omit `--force-refresh-labels` if you only want to fill in labels that are missing for the current recall window.
  72 +
  73 +## Storage Layout
  74 +
  75 +All generated artifacts are under:
  76 +
  77 +- `/data/saas-search/artifacts/search_evaluation`
  78 +
  79 +Important subpaths:
  80 +
  81 +- `/data/saas-search/artifacts/search_evaluation/search_eval.sqlite3`
  82 + Main cache and annotation store.
  83 +- `/data/saas-search/artifacts/search_evaluation/query_builds`
  84 + Per-query pooled annotation-set build artifacts.
  85 +- `/data/saas-search/artifacts/search_evaluation/batch_reports`
  86 + Batch evaluation JSON, Markdown reports, and config snapshots.
  87 +- `/data/saas-search/artifacts/search_evaluation/audits`
  88 + Audit summaries for label quality checks.
  89 +- `/data/saas-search/artifacts/search_evaluation/tuning_runs`
  90 + Fusion experiment summaries and per-experiment config snapshots.
  91 +
  92 +## SQLite Schema Summary
  93 +
  94 +The main tables in `search_eval.sqlite3` are:
  95 +
  96 +- `corpus_docs`
  97 + Cached product corpus for the tenant.
  98 +- `rerank_scores`
  99 + Cached full-corpus reranker scores keyed by `(tenant_id, query_text, spu_id)`.
  100 +- `relevance_labels`
  101 + Cached LLM relevance labels keyed by `(tenant_id, query_text, spu_id)`.
  102 +- `query_profiles`
  103 + Structured query-intent profiles extracted before labeling.
  104 +- `build_runs`
  105 + Per-query pooled-build records.
  106 +- `batch_runs`
  107 + Batch evaluation history.
  108 +
  109 +## Label Semantics
  110 +
  111 +Three labels are used throughout:
  112 +
  113 +- `Exact`
  114 + Fully matches the intended product type and all explicit required attributes.
  115 +- `Partial`
  116 + Main intent matches, but explicit attributes are missing, approximate, or weaker than requested.
  117 +- `Irrelevant`
  118 + Product type mismatches, or explicit required attributes conflict.
  119 +
  120 +The framework always uses:
  121 +
  122 +- LLM-based batched relevance classification
  123 +- caching and retry logic for robust offline labeling
  124 +
  125 +There are now two labeler modes:
  126 +
  127 +- `simple`
  128 + Default. A single low-coupling LLM judging pass per batch, using the standard relevance prompt.
  129 +- `complex`
  130 + Legacy structured mode. It extracts query profiles and applies extra guardrails. Kept for comparison, but no longer the default.
  131 +
  132 +## Offline-First Workflow
  133 +
  134 +### 1. Refresh labels for the evaluation query set
  135 +
  136 +For practical evaluation, the most important offline step is to pre-label the result window you plan to score. For the current metrics (`P@5`, `P@10`, `P@20`, `P@50`, `MAP_3`, `MAP_2_3`), a `top_k=50` cached label set is sufficient.
  137 +
  138 +Example:
  139 +
  140 +```bash
  141 +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \
  142 + --tenant-id 163 \
  143 + --queries-file scripts/evaluation/queries/queries.txt \
  144 + --top-k 50 \
  145 + --language en \
  146 + --labeler-mode simple \
  147 + --force-refresh-labels
  148 +```
  149 +
  150 +This command does two things:
  151 +
  152 +- runs **every** query in the file against the live backend (no skip list)
  153 +- with `--force-refresh-labels`, re-labels **all** `top_k` hits per query via the LLM and upserts SQLite; without the flag, only `spu_id`s lacking a cached label are sent to the LLM
  154 +
  155 +After this step, single-query evaluation can run in cached mode without calling the LLM again.
  156 +
  157 +### 2. Optional pooled build
  158 +
  159 +The framework also supports a heavier pooled build that combines:
  160 +
  161 +- top search results
  162 +- top full-corpus reranker results
  163 +
  164 +Example:
  165 +
  166 +```bash
  167 +./.venv/bin/python scripts/evaluation/build_annotation_set.py build \
  168 + --tenant-id 163 \
  169 + --queries-file scripts/evaluation/queries/queries.txt \
  170 + --search-depth 1000 \
  171 + --rerank-depth 10000 \
  172 + --annotate-search-top-k 100 \
  173 + --annotate-rerank-top-k 120 \
  174 + --language en
  175 +```
  176 +
  177 +This is slower, but useful when you want a richer pooled annotation set beyond the current live recall window.
  178 +
  179 +## Why Single-Query Evaluation Was Slow
  180 +
  181 +If single-query evaluation is slow, the usual reason is that it is still running with `auto_annotate=true`, which means:
  182 +
  183 +- perform live search
  184 +- detect recalled but unlabeled products
  185 +- call the LLM to label them
  186 +
  187 +That is not the intended steady-state evaluation path.
  188 +
  189 +The UI/API is now configured to prefer cached evaluation:
  190 +
  191 +- default single-query evaluation uses `auto_annotate=false`
  192 +- unlabeled recalled results are treated as `Irrelevant`
  193 +- the response includes tips explaining that coverage gap
  194 +
  195 +If you want stable, fast evaluation:
  196 +
  197 +1. prebuild labels offline
  198 +2. use cached single-query evaluation
  199 +
  200 +## Web UI
  201 +
  202 +Start the evaluation UI:
  203 +
  204 +```bash
  205 +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \
  206 + --tenant-id 163 \
  207 + --queries-file scripts/evaluation/queries/queries.txt \
  208 + --host 127.0.0.1 \
  209 + --port 6010
  210 +```
  211 +
  212 +The UI provides:
  213 +
  214 +- query list loaded from `queries.txt`
  215 +- single-query evaluation
  216 +- batch evaluation
  217 +- history of batch reports
  218 +- top recalled results
  219 +- missed `Exact` and `Partial` products that were not recalled
  220 +- tips about unlabeled hits treated as `Irrelevant`
  221 +
  222 +### Single-query response behavior
  223 +
  224 +For a single query:
  225 +
  226 +1. live search returns recalled `spu_id` values
  227 +2. the framework looks up cached labels by `(query, spu_id)`
  228 +3. unlabeled recalled items are counted as `Irrelevant`
  229 +4. cached `Exact` and `Partial` products that were not recalled are listed under `Missed Exact / Partial`
  230 +
  231 +This makes the page useful as a real retrieval-evaluation view rather than only a search-result viewer.
  232 +
  233 +## CLI Commands
  234 +
  235 +### Build pooled annotation artifacts
  236 +
  237 +```bash
  238 +./.venv/bin/python scripts/evaluation/build_annotation_set.py build ...
  239 +```
  240 +
  241 +### Run batch evaluation
  242 +
  243 +```bash
  244 +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \
  245 + --tenant-id 163 \
  246 + --queries-file scripts/evaluation/queries/queries.txt \
  247 + --top-k 50 \
  248 + --language en \
  249 + --labeler-mode simple
  250 +```
  251 +
  252 +Use `--force-refresh-labels` if you want to rebuild the offline label cache for the recalled window first.
  253 +
  254 +### Audit annotation quality
  255 +
  256 +```bash
  257 +./.venv/bin/python scripts/evaluation/build_annotation_set.py audit \
  258 + --tenant-id 163 \
  259 + --queries-file scripts/evaluation/queries/queries.txt \
  260 + --top-k 50 \
  261 + --language en \
  262 + --labeler-mode simple
  263 +```
  264 +
  265 +This checks cached labels against current guardrails and reports suspicious cases.
  266 +
  267 +## Batch Reports
  268 +
  269 +Each batch run stores:
  270 +
  271 +- aggregate metrics
  272 +- per-query metrics
  273 +- label distribution
  274 +- timestamp
  275 +- config snapshot from `/admin/config`
  276 +
  277 +Reports are written as:
  278 +
  279 +- Markdown for easy reading
  280 +- JSON for downstream processing
  281 +
  282 +## Fusion Tuning
  283 +
  284 +The tuning runner applies experiment configs sequentially and records the outcome.
  285 +
  286 +Example:
  287 +
  288 +```bash
  289 +./.venv/bin/python scripts/evaluation/tune_fusion.py \
  290 + --tenant-id 163 \
  291 + --queries-file scripts/evaluation/queries/queries.txt \
  292 + --top-k 50 \
  293 + --language en \
  294 + --experiments-file scripts/evaluation/fusion_experiments_shortlist.json \
  295 + --score-metric MAP_3 \
  296 + --apply-best
  297 +```
  298 +
  299 +What it does:
  300 +
  301 +1. writes an experiment config into `config/config.yaml`
  302 +2. restarts backend
  303 +3. runs batch evaluation
  304 +4. stores the per-experiment result
  305 +5. optionally applies the best experiment at the end
  306 +
  307 +## Current Practical Recommendation
  308 +
  309 +For day-to-day evaluation:
  310 +
  311 +1. refresh the offline labels for the fixed query set with `batch --force-refresh-labels`
  312 +2. run the web UI or normal batch evaluation in cached mode
  313 +3. only force-refresh labels again when:
  314 + - the query set changes
  315 + - the product corpus changes materially
  316 + - the labeling logic changes
  317 +
  318 +## Caveats
  319 +
  320 +- The current label cache is query-specific, not a full all-products all-queries matrix.
  321 +- Single-query evaluation still depends on the live search API for recall, but not on the LLM if labels are already cached.
  322 +- The backend restart path in this environment can be briefly unstable immediately after startup; a short wait after restart is sometimes necessary for scripting.
  323 +- Some multilingual translation hints are noisy on long-tail fashion queries, which is one reason fusion tuning around translation weight matters.
  324 +
  325 +## Related Requirement Docs
  326 +
  327 +- `README_Requirement.md`
  328 +- `README_Requirement_zh.md`
  329 +
  330 +These documents describe the original problem statement. This `README.md` describes the implemented framework and the current recommended workflow.
scripts/evaluation/eval_framework.py
@@ -39,7 +39,9 @@ RELEVANCE_IRRELEVANT = "Irrelevant" @@ -39,7 +39,9 @@ RELEVANCE_IRRELEVANT = "Irrelevant"
39 VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} 39 VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}
40 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" 40 DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
41 DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt" 41 DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt"
42 -JUDGE_PROMPT_VERSION = "v2_structured_20260331" 42 +JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331"
  43 +JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331"
  44 +DEFAULT_LABELER_MODE = "simple"
43 45
44 46
45 def utc_now_iso() -> str: 47 def utc_now_iso() -> str:
@@ -625,6 +627,57 @@ class DashScopeLabelClient: @@ -625,6 +627,57 @@ class DashScopeLabelClient:
625 content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() 627 content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
626 return content, safe_json_dumps(data) 628 return content, safe_json_dumps(data)
627 629
    def classify_batch_simple(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Label every doc against ``query`` with one flat relevance prompt.

        Unlike the complex path this uses no query profile: the whole rubric
        lives in a single prompt and the model answers one label per line.

        Returns:
            (labels, raw_response) where ``labels`` holds exactly one entry
            from ``VALID_LABELS`` per input doc, in input order, and
            ``raw_response`` is the serialized provider payload.

        Raises:
            ValueError: if the model output cannot be parsed into exactly
                ``len(docs)`` valid labels.
        """
        # One numbered "i. <product line>" per doc; 1-based so the numbering
        # matches the "correspond sequentially" instruction in the prompt.
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = (
            "You are an e-commerce search result relevance evaluation assistant. "
            "Based on the user query and each product's information, output the relevance level for each product.\n\n"
            "## Relevance Level Criteria\n"
            "Exact — Fully matches the user's search intent.\n"
            "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), "
            "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n"
            "Irrelevant — Category or use case mismatched, primary intent not satisfied.\n\n"
            "Additional judging guidance:\n"
            "- If the query clearly names a product type, product type matching has the highest priority. "
            "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, "
            "bra vs top, backpack vs bag are not interchangeable.\n"
            "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n"
            "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n"
            "- Do not guess missing attributes.\n"
            "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n"
            "- Be conservative with Exact.\n\n"
            f"Query: {query}\n\n"
            "Products:\n"
            + "\n".join(numbered_docs)
            + "\n\n## Output Format\n"
            f"Strictly output {len(docs)} lines, each line containing exactly one of Exact / Partial / Irrelevant. "
            "They must correspond sequentially to the products above. Do not output any other information.\n"
        )
        content, raw_response = self._chat(prompt)
        # Primary parse: the prompt asks for one bare label per line, so keep
        # only lines that are exactly a valid label.
        labels: List[str] = []
        for line in str(content or "").splitlines():
            label = line.strip()
            if label in VALID_LABELS:
                labels.append(label)
        if len(labels) != len(docs):
            # Fallback parse: some models wrap the answer in a JSON object
            # such as {"labels": [...]} despite the plain-text instruction.
            # Entries may be bare strings or {"label": ...} dicts.
            payload = _extract_json_blob(content)
            if isinstance(payload, dict) and isinstance(payload.get("labels"), list):
                labels = []
                for item in payload["labels"][: len(docs)]:
                    if isinstance(item, dict):
                        label = str(item.get("label") or "").strip()
                    else:
                        label = str(item).strip()
                    if label in VALID_LABELS:
                        labels.append(label)
        # Either parse must yield exactly one valid label per doc.
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected simple label output: {content!r}")
        return labels, raw_response
  680 +
628 def extract_query_profile( 681 def extract_query_profile(
629 self, 682 self,
630 query: str, 683 query: str,
@@ -665,7 +718,7 @@ class DashScopeLabelClient: @@ -665,7 +718,7 @@ class DashScopeLabelClient:
665 payload.setdefault("notes", []) 718 payload.setdefault("notes", [])
666 return payload, raw_response 719 return payload, raw_response
667 720
668 - def classify_batch( 721 + def classify_batch_complex(
669 self, 722 self,
670 query: str, 723 query: str,
671 query_profile: Dict[str, Any], 724 query_profile: Dict[str, Any],
@@ -763,10 +816,12 @@ class SearchEvaluationFramework: @@ -763,10 +816,12 @@ class SearchEvaluationFramework:
763 tenant_id: str, 816 tenant_id: str,
764 artifact_root: Path = DEFAULT_ARTIFACT_ROOT, 817 artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
765 search_base_url: str = "http://localhost:6002", 818 search_base_url: str = "http://localhost:6002",
  819 + labeler_mode: str = DEFAULT_LABELER_MODE,
766 ): 820 ):
767 init_service(get_app_config().infrastructure.elasticsearch.host) 821 init_service(get_app_config().infrastructure.elasticsearch.host)
768 self.tenant_id = str(tenant_id) 822 self.tenant_id = str(tenant_id)
769 self.artifact_root = ensure_dir(artifact_root) 823 self.artifact_root = ensure_dir(artifact_root)
  824 + self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE
770 self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") 825 self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
771 self.search_client = SearchServiceClient(search_base_url, self.tenant_id) 826 self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
772 app_cfg = get_app_config() 827 app_cfg = get_app_config()
@@ -783,17 +838,24 @@ class SearchEvaluationFramework: @@ -783,17 +838,24 @@ class SearchEvaluationFramework:
783 base_url=str(llm_cfg["base_url"]), 838 base_url=str(llm_cfg["base_url"]),
784 api_key=str(api_key), 839 api_key=str(api_key),
785 ) 840 )
786 - self.query_parser = get_query_parser() 841 + self.query_parser = None
  842 +
  843 + def _get_query_parser(self):
  844 + if self.query_parser is None:
  845 + self.query_parser = get_query_parser()
  846 + return self.query_parser
787 847
788 def build_query_parser_hints(self, query: str) -> Dict[str, Any]: 848 def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
789 - parsed = self.query_parser.parse(query, generate_vector=False, target_languages=["en", "zh"]) 849 + parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"])
790 payload = parsed.to_dict() 850 payload = parsed.to_dict()
791 payload["text_for_rerank"] = parsed.text_for_rerank() 851 payload["text_for_rerank"] = parsed.text_for_rerank()
792 return payload 852 return payload
793 853
794 def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]: 854 def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
  855 + if self.labeler_mode != "complex":
  856 + raise RuntimeError("query profiles are only used in complex labeler mode")
795 if not force_refresh: 857 if not force_refresh:
796 - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION) 858 + cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX)
797 if cached is not None: 859 if cached is not None:
798 return cached 860 return cached
799 parser_hints = self.build_query_parser_hints(query) 861 parser_hints = self.build_query_parser_hints(query)
@@ -802,7 +864,7 @@ class SearchEvaluationFramework: @@ -802,7 +864,7 @@ class SearchEvaluationFramework:
802 self.store.upsert_query_profile( 864 self.store.upsert_query_profile(
803 self.tenant_id, 865 self.tenant_id,
804 query, 866 query,
805 - JUDGE_PROMPT_VERSION, 867 + JUDGE_PROMPT_VERSION_COMPLEX,
806 self.label_client.model, 868 self.label_client.model,
807 profile, 869 profile,
808 raw_response, 870 raw_response,
@@ -955,9 +1017,24 @@ class SearchEvaluationFramework: @@ -955,9 +1017,24 @@ class SearchEvaluationFramework:
955 *, 1017 *,
956 top_k: int = 100, 1018 top_k: int = 100,
957 language: str = "en", 1019 language: str = "en",
958 - auto_annotate: bool = True, 1020 + auto_annotate: bool = False,
959 ) -> Dict[str, Any]: 1021 ) -> Dict[str, Any]:
960 live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) 1022 live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
  1023 + if self.labeler_mode != "complex":
  1024 + labels = [
  1025 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  1026 + for item in live["results"]
  1027 + ]
  1028 + return {
  1029 + "query": query,
  1030 + "tenant_id": self.tenant_id,
  1031 + "top_k": top_k,
  1032 + "metrics": live["metrics"],
  1033 + "distribution": label_distribution(labels),
  1034 + "query_profile": None,
  1035 + "suspicious": [],
  1036 + "results": live["results"],
  1037 + }
961 query_profile = self.get_query_profile(query, force_refresh=False) 1038 query_profile = self.get_query_profile(query, force_refresh=False)
962 suspicious: List[Dict[str, Any]] = [] 1039 suspicious: List[Dict[str, Any]] = []
963 1040
@@ -1093,7 +1170,6 @@ class SearchEvaluationFramework: @@ -1093,7 +1170,6 @@ class SearchEvaluationFramework:
1093 docs: Sequence[Dict[str, Any]], 1170 docs: Sequence[Dict[str, Any]],
1094 force_refresh: bool = False, 1171 force_refresh: bool = False,
1095 ) -> Dict[str, str]: 1172 ) -> Dict[str, str]:
1096 - query_profile = self.get_query_profile(query, force_refresh=force_refresh)  
1097 labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query) 1173 labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
1098 missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels] 1174 missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
1099 if not missing_docs: 1175 if not missing_docs:
@@ -1101,12 +1177,9 @@ class SearchEvaluationFramework: @@ -1101,12 +1177,9 @@ class SearchEvaluationFramework:
1101 1177
1102 for start in range(0, len(missing_docs), self.label_client.batch_size): 1178 for start in range(0, len(missing_docs), self.label_client.batch_size):
1103 batch = missing_docs[start : start + self.label_client.batch_size] 1179 batch = missing_docs[start : start + self.label_client.batch_size]
1104 - batch_pairs = self._classify_with_retry(query, query_profile, batch) 1180 + batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh)
1105 for sub_labels, raw_response, sub_batch in batch_pairs: 1181 for sub_labels, raw_response, sub_batch in batch_pairs:
1106 - to_store = {  
1107 - str(doc.get("spu_id")): self._apply_rule_based_label_guardrails(label, query_profile, doc)  
1108 - for doc, label in zip(sub_batch, sub_labels)  
1109 - } 1182 + to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)}
1110 self.store.upsert_labels( 1183 self.store.upsert_labels(
1111 self.tenant_id, 1184 self.tenant_id,
1112 query, 1185 query,
@@ -1121,19 +1194,28 @@ class SearchEvaluationFramework: @@ -1121,19 +1194,28 @@ class SearchEvaluationFramework:
1121 def _classify_with_retry( 1194 def _classify_with_retry(
1122 self, 1195 self,
1123 query: str, 1196 query: str,
1124 - query_profile: Dict[str, Any],  
1125 docs: Sequence[Dict[str, Any]], 1197 docs: Sequence[Dict[str, Any]],
  1198 + *,
  1199 + force_refresh: bool = False,
1126 ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]: 1200 ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
1127 if not docs: 1201 if not docs:
1128 return [] 1202 return []
1129 try: 1203 try:
1130 - labels, raw_response = self.label_client.classify_batch(query, query_profile, docs) 1204 + if self.labeler_mode == "complex":
  1205 + query_profile = self.get_query_profile(query, force_refresh=force_refresh)
  1206 + labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs)
  1207 + labels = [
  1208 + self._apply_rule_based_label_guardrails(label, query_profile, doc)
  1209 + for doc, label in zip(docs, labels)
  1210 + ]
  1211 + else:
  1212 + labels, raw_response = self.label_client.classify_batch_simple(query, docs)
1131 return [(labels, raw_response, docs)] 1213 return [(labels, raw_response, docs)]
1132 except Exception: 1214 except Exception:
1133 if len(docs) == 1: 1215 if len(docs) == 1:
1134 raise 1216 raise
1135 mid = len(docs) // 2 1217 mid = len(docs) // 2
1136 - return self._classify_with_retry(query, query_profile, docs[:mid]) + self._classify_with_retry(query, query_profile, docs[mid:]) 1218 + return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh)
1137 1219
1138 def build_query_annotation_set( 1220 def build_query_annotation_set(
1139 self, 1221 self,
@@ -1163,7 +1245,6 @@ class SearchEvaluationFramework: @@ -1163,7 +1245,6 @@ class SearchEvaluationFramework:
1163 for item in full_rerank[:annotate_rerank_top_k]: 1245 for item in full_rerank[:annotate_rerank_top_k]:
1164 pool_docs[str(item["spu_id"])] = item["doc"] 1246 pool_docs[str(item["spu_id"])] = item["doc"]
1165 1247
1166 - query_profile = self.get_query_profile(query, force_refresh=force_refresh_labels)  
1167 labels = self.annotate_missing_labels( 1248 labels = self.annotate_missing_labels(
1168 query=query, 1249 query=query,
1169 docs=list(pool_docs.values()), 1250 docs=list(pool_docs.values()),
@@ -1229,7 +1310,8 @@ class SearchEvaluationFramework: @@ -1229,7 +1310,8 @@ class SearchEvaluationFramework:
1229 "annotate_rerank_top_k": annotate_rerank_top_k, 1310 "annotate_rerank_top_k": annotate_rerank_top_k,
1230 "pool_size": len(pool_docs), 1311 "pool_size": len(pool_docs),
1231 }, 1312 },
1232 - "query_profile": query_profile, 1313 + "labeler_mode": self.labeler_mode,
  1314 + "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None,
1233 "metrics_top100": metrics, 1315 "metrics_top100": metrics,
1234 "search_results": search_labeled_results, 1316 "search_results": search_labeled_results,
1235 "full_rerank_top": rerank_top_results, 1317 "full_rerank_top": rerank_top_results,
@@ -1250,7 +1332,7 @@ class SearchEvaluationFramework: @@ -1250,7 +1332,7 @@ class SearchEvaluationFramework:
1250 self, 1332 self,
1251 query: str, 1333 query: str,
1252 top_k: int = 100, 1334 top_k: int = 100,
1253 - auto_annotate: bool = True, 1335 + auto_annotate: bool = False,
1254 language: str = "en", 1336 language: str = "en",
1255 force_refresh_labels: bool = False, 1337 force_refresh_labels: bool = False,
1256 ) -> Dict[str, Any]: 1338 ) -> Dict[str, Any]:
@@ -1259,16 +1341,21 @@ class SearchEvaluationFramework: @@ -1259,16 +1341,21 @@ class SearchEvaluationFramework:
1259 if auto_annotate: 1341 if auto_annotate:
1260 self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels) 1342 self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
1261 labels = self.store.get_labels(self.tenant_id, query) 1343 labels = self.store.get_labels(self.tenant_id, query)
  1344 + recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]}
1262 labeled = [] 1345 labeled = []
  1346 + unlabeled_hits = 0
1263 for rank, doc in enumerate(results[:top_k], start=1): 1347 for rank, doc in enumerate(results[:top_k], start=1):
1264 spu_id = str(doc.get("spu_id")) 1348 spu_id = str(doc.get("spu_id"))
  1349 + label = labels.get(spu_id)
  1350 + if label not in VALID_LABELS:
  1351 + unlabeled_hits += 1
1265 labeled.append( 1352 labeled.append(
1266 { 1353 {
1267 "rank": rank, 1354 "rank": rank,
1268 "spu_id": spu_id, 1355 "spu_id": spu_id,
1269 "title": build_display_title(doc), 1356 "title": build_display_title(doc),
1270 "image_url": doc.get("image_url"), 1357 "image_url": doc.get("image_url"),
1271 - "label": labels.get(spu_id), 1358 + "label": label,
1272 "option_values": list(compact_option_values(doc.get("skus") or [])), 1359 "option_values": list(compact_option_values(doc.get("skus") or [])),
1273 "product": compact_product_payload(doc), 1360 "product": compact_product_payload(doc),
1274 } 1361 }
@@ -1277,12 +1364,65 @@ class SearchEvaluationFramework: @@ -1277,12 +1364,65 @@ class SearchEvaluationFramework:
1277 item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT 1364 item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
1278 for item in labeled 1365 for item in labeled
1279 ] 1366 ]
  1367 + label_stats = self.store.get_query_label_stats(self.tenant_id, query)
  1368 + rerank_scores = self.store.get_rerank_scores(self.tenant_id, query)
  1369 + relevant_missing_ids = [
  1370 + spu_id
  1371 + for spu_id, label in labels.items()
  1372 + if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids
  1373 + ]
  1374 + missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids)
  1375 + missing_relevant = []
  1376 + for spu_id in relevant_missing_ids:
  1377 + doc = missing_docs_map.get(spu_id)
  1378 + if not doc:
  1379 + continue
  1380 + missing_relevant.append(
  1381 + {
  1382 + "spu_id": spu_id,
  1383 + "label": labels[spu_id],
  1384 + "rerank_score": rerank_scores.get(spu_id),
  1385 + "title": build_display_title(doc),
  1386 + "image_url": doc.get("image_url"),
  1387 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  1388 + "product": compact_product_payload(doc),
  1389 + }
  1390 + )
  1391 + label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2}
  1392 + missing_relevant.sort(
  1393 + key=lambda item: (
  1394 + label_order.get(str(item.get("label")), 9),
  1395 + -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")),
  1396 + str(item.get("title") or ""),
  1397 + )
  1398 + )
  1399 + tips: List[str] = []
  1400 + if auto_annotate:
  1401 + tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.")
  1402 + else:
  1403 + tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.")
  1404 + if label_stats["total"] == 0:
  1405 + tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.")
  1406 + if unlabeled_hits:
  1407 + tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.")
  1408 + if not missing_relevant:
  1409 + tips.append("No cached Exact/Partial products were missed by this recall set.")
1280 return { 1410 return {
1281 "query": query, 1411 "query": query,
1282 "tenant_id": self.tenant_id, 1412 "tenant_id": self.tenant_id,
1283 "top_k": top_k, 1413 "top_k": top_k,
1284 "metrics": compute_query_metrics(metric_labels), 1414 "metrics": compute_query_metrics(metric_labels),
1285 "results": labeled, 1415 "results": labeled,
  1416 + "missing_relevant": missing_relevant,
  1417 + "label_stats": {
  1418 + **label_stats,
  1419 + "unlabeled_hits_treated_irrelevant": unlabeled_hits,
  1420 + "recalled_hits": len(labeled),
  1421 + "missing_relevant_count": len(missing_relevant),
  1422 + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT),
  1423 + "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL),
  1424 + },
  1425 + "tips": tips,
1286 "total": int(search_payload.get("total") or 0), 1426 "total": int(search_payload.get("total") or 0),
1287 } 1427 }
1288 1428
@@ -1392,14 +1532,14 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: @@ -1392,14 +1532,14 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
1392 class SearchEvalRequest(BaseModel): 1532 class SearchEvalRequest(BaseModel):
1393 query: str 1533 query: str
1394 top_k: int = Field(default=100, ge=1, le=500) 1534 top_k: int = Field(default=100, ge=1, le=500)
1395 - auto_annotate: bool = True 1535 + auto_annotate: bool = False
1396 language: str = "en" 1536 language: str = "en"
1397 1537
1398 1538
1399 class BatchEvalRequest(BaseModel): 1539 class BatchEvalRequest(BaseModel):
1400 queries: Optional[List[str]] = None 1540 queries: Optional[List[str]] = None
1401 top_k: int = Field(default=100, ge=1, le=500) 1541 top_k: int = Field(default=100, ge=1, le=500)
1402 - auto_annotate: bool = True 1542 + auto_annotate: bool = False
1403 language: str = "en" 1543 language: str = "en"
1404 force_refresh_labels: bool = False 1544 force_refresh_labels: bool = False
1405 1545
@@ -1494,6 +1634,8 @@ WEB_APP_HTML = """ @@ -1494,6 +1634,8 @@ WEB_APP_HTML = """
1494 .options { color: var(--muted); line-height: 1.5; font-size: 14px; } 1634 .options { color: var(--muted); line-height: 1.5; font-size: 14px; }
1495 .section { margin-bottom: 28px; } 1635 .section { margin-bottom: 28px; }
1496 .history { font-size: 13px; line-height: 1.5; } 1636 .history { font-size: 13px; line-height: 1.5; }
  1637 + .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; }
  1638 + .tip { margin-bottom: 6px; color: var(--muted); }
1497 </style> 1639 </style>
1498 </head> 1640 </head>
1499 <body> 1641 <body>
@@ -1524,6 +1666,14 @@ WEB_APP_HTML = &quot;&quot;&quot; @@ -1524,6 +1666,14 @@ WEB_APP_HTML = &quot;&quot;&quot;
1524 <h2>Top Results</h2> 1666 <h2>Top Results</h2>
1525 <div id="results" class="results"></div> 1667 <div id="results" class="results"></div>
1526 </section> 1668 </section>
  1669 + <section class="section">
  1670 + <h2>Missed Exact / Partial</h2>
  1671 + <div id="missingRelevant" class="results"></div>
  1672 + </section>
  1673 + <section class="section">
  1674 + <h2>Notes</h2>
  1675 + <div id="tips" class="tips muted"></div>
  1676 + </section>
1527 </main> 1677 </main>
1528 </div> 1678 </div>
1529 <script> 1679 <script>
@@ -1542,15 +1692,15 @@ WEB_APP_HTML = &quot;&quot;&quot; @@ -1542,15 +1692,15 @@ WEB_APP_HTML = &quot;&quot;&quot;
1542 root.appendChild(card); 1692 root.appendChild(card);
1543 }); 1693 });
1544 } 1694 }
1545 - function renderResults(results) {  
1546 - const root = document.getElementById('results');  
1547 - root.innerHTML = ''; 1695 + function renderResults(results, rootId='results', showRank=true) {
  1696 + const mount = document.getElementById(rootId);
  1697 + mount.innerHTML = '';
1548 (results || []).forEach(item => { 1698 (results || []).forEach(item => {
1549 const label = item.label || 'Unknown'; 1699 const label = item.label || 'Unknown';
1550 const box = document.createElement('div'); 1700 const box = document.createElement('div');
1551 box.className = 'result'; 1701 box.className = 'result';
1552 box.innerHTML = ` 1702 box.innerHTML = `
1553 - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">#${item.rank}</div></div> 1703 + <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div>
1554 <img class="thumb" src="${item.image_url || ''}" alt="" /> 1704 <img class="thumb" src="${item.image_url || ''}" alt="" />
1555 <div> 1705 <div>
1556 <div class="title">${item.title || ''}</div> 1706 <div class="title">${item.title || ''}</div>
@@ -1560,8 +1710,18 @@ WEB_APP_HTML = &quot;&quot;&quot; @@ -1560,8 +1710,18 @@ WEB_APP_HTML = &quot;&quot;&quot;
1560 <div>${(item.option_values || [])[2] || ''}</div> 1710 <div>${(item.option_values || [])[2] || ''}</div>
1561 </div> 1711 </div>
1562 </div>`; 1712 </div>`;
1563 - root.appendChild(box); 1713 + mount.appendChild(box);
1564 }); 1714 });
  1715 + if (!(results || []).length) {
  1716 + mount.innerHTML = '<div class="muted">None.</div>';
  1717 + }
  1718 + }
  1719 + function renderTips(data) {
  1720 + const root = document.getElementById('tips');
  1721 + const tips = [...(data.tips || [])];
  1722 + const stats = data.label_stats || {};
  1723 + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`);
  1724 + root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join('');
1565 } 1725 }
1566 async function loadQueries() { 1726 async function loadQueries() {
1567 const data = await fetchJSON('/api/queries'); 1727 const data = await fetchJSON('/api/queries');
@@ -1592,11 +1752,13 @@ WEB_APP_HTML = &quot;&quot;&quot; @@ -1592,11 +1752,13 @@ WEB_APP_HTML = &quot;&quot;&quot;
1592 const data = await fetchJSON('/api/search-eval', { 1752 const data = await fetchJSON('/api/search-eval', {
1593 method: 'POST', 1753 method: 'POST',
1594 headers: {'Content-Type': 'application/json'}, 1754 headers: {'Content-Type': 'application/json'},
1595 - body: JSON.stringify({query, top_k: 100, auto_annotate: true}) 1755 + body: JSON.stringify({query, top_k: 100, auto_annotate: false})
1596 }); 1756 });
1597 document.getElementById('status').textContent = `Done. total=${data.total}`; 1757 document.getElementById('status').textContent = `Done. total=${data.total}`;
1598 renderMetrics(data.metrics); 1758 renderMetrics(data.metrics);
1599 - renderResults(data.results); 1759 + renderResults(data.results, 'results', true);
  1760 + renderResults(data.missing_relevant, 'missingRelevant', false);
  1761 + renderTips(data);
1600 loadHistory(); 1762 loadHistory();
1601 } 1763 }
1602 async function runBatch() { 1764 async function runBatch() {
@@ -1604,11 +1766,13 @@ WEB_APP_HTML = &quot;&quot;&quot; @@ -1604,11 +1766,13 @@ WEB_APP_HTML = &quot;&quot;&quot;
1604 const data = await fetchJSON('/api/batch-eval', { 1766 const data = await fetchJSON('/api/batch-eval', {
1605 method: 'POST', 1767 method: 'POST',
1606 headers: {'Content-Type': 'application/json'}, 1768 headers: {'Content-Type': 'application/json'},
1607 - body: JSON.stringify({top_k: 100, auto_annotate: true}) 1769 + body: JSON.stringify({top_k: 100, auto_annotate: false})
1608 }); 1770 });
1609 document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`; 1771 document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`;
1610 renderMetrics(data.aggregate_metrics); 1772 renderMetrics(data.aggregate_metrics);
1611 - renderResults([]); 1773 + renderResults([], 'results', true);
  1774 + renderResults([], 'missingRelevant', false);
  1775 + document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>';
1612 loadHistory(); 1776 loadHistory();
1613 } 1777 }
1614 loadQueries(); 1778 loadQueries();
@@ -1633,6 +1797,7 @@ def build_cli_parser() -&gt; argparse.ArgumentParser: @@ -1633,6 +1797,7 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
1633 build.add_argument("--language", default="en") 1797 build.add_argument("--language", default="en")
1634 build.add_argument("--force-refresh-rerank", action="store_true") 1798 build.add_argument("--force-refresh-rerank", action="store_true")
1635 build.add_argument("--force-refresh-labels", action="store_true") 1799 build.add_argument("--force-refresh-labels", action="store_true")
  1800 + build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1636 1801
1637 batch = sub.add_parser("batch", help="Run batch evaluation against live search") 1802 batch = sub.add_parser("batch", help="Run batch evaluation against live search")
1638 batch.add_argument("--tenant-id", default="163") 1803 batch.add_argument("--tenant-id", default="163")
@@ -1640,6 +1805,7 @@ def build_cli_parser() -&gt; argparse.ArgumentParser: @@ -1640,6 +1805,7 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
1640 batch.add_argument("--top-k", type=int, default=100) 1805 batch.add_argument("--top-k", type=int, default=100)
1641 batch.add_argument("--language", default="en") 1806 batch.add_argument("--language", default="en")
1642 batch.add_argument("--force-refresh-labels", action="store_true") 1807 batch.add_argument("--force-refresh-labels", action="store_true")
  1808 + batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1643 1809
1644 audit = sub.add_parser("audit", help="Audit annotation quality for queries") 1810 audit = sub.add_parser("audit", help="Audit annotation quality for queries")
1645 audit.add_argument("--tenant-id", default="163") 1811 audit.add_argument("--tenant-id", default="163")
@@ -1648,18 +1814,20 @@ def build_cli_parser() -&gt; argparse.ArgumentParser: @@ -1648,18 +1814,20 @@ def build_cli_parser() -&gt; argparse.ArgumentParser:
1648 audit.add_argument("--language", default="en") 1814 audit.add_argument("--language", default="en")
1649 audit.add_argument("--limit-suspicious", type=int, default=5) 1815 audit.add_argument("--limit-suspicious", type=int, default=5)
1650 audit.add_argument("--force-refresh-labels", action="store_true") 1816 audit.add_argument("--force-refresh-labels", action="store_true")
  1817 + audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1651 1818
1652 serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") 1819 serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010")
1653 serve.add_argument("--tenant-id", default="163") 1820 serve.add_argument("--tenant-id", default="163")
1654 serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) 1821 serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
1655 serve.add_argument("--host", default="0.0.0.0") 1822 serve.add_argument("--host", default="0.0.0.0")
1656 serve.add_argument("--port", type=int, default=6010) 1823 serve.add_argument("--port", type=int, default=6010)
  1824 + serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"])
1657 1825
1658 return parser 1826 return parser
1659 1827
1660 1828
1661 def run_build(args: argparse.Namespace) -> None: 1829 def run_build(args: argparse.Namespace) -> None:
1662 - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) 1830 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1663 queries = framework.queries_from_file(Path(args.queries_file)) 1831 queries = framework.queries_from_file(Path(args.queries_file))
1664 summary = [] 1832 summary = []
1665 for query in queries: 1833 for query in queries:
@@ -1694,7 +1862,7 @@ def run_build(args: argparse.Namespace) -&gt; None: @@ -1694,7 +1862,7 @@ def run_build(args: argparse.Namespace) -&gt; None:
1694 1862
1695 1863
1696 def run_batch(args: argparse.Namespace) -> None: 1864 def run_batch(args: argparse.Namespace) -> None:
1697 - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) 1865 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1698 queries = framework.queries_from_file(Path(args.queries_file)) 1866 queries = framework.queries_from_file(Path(args.queries_file))
1699 payload = framework.batch_evaluate( 1867 payload = framework.batch_evaluate(
1700 queries=queries, 1868 queries=queries,
@@ -1707,7 +1875,7 @@ def run_batch(args: argparse.Namespace) -&gt; None: @@ -1707,7 +1875,7 @@ def run_batch(args: argparse.Namespace) -&gt; None:
1707 1875
1708 1876
1709 def run_audit(args: argparse.Namespace) -> None: 1877 def run_audit(args: argparse.Namespace) -> None:
1710 - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) 1878 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1711 queries = framework.queries_from_file(Path(args.queries_file)) 1879 queries = framework.queries_from_file(Path(args.queries_file))
1712 audit_items = [] 1880 audit_items = []
1713 for query in queries: 1881 for query in queries:
@@ -1757,7 +1925,7 @@ def run_audit(args: argparse.Namespace) -&gt; None: @@ -1757,7 +1925,7 @@ def run_audit(args: argparse.Namespace) -&gt; None:
1757 1925
1758 1926
1759 def run_serve(args: argparse.Namespace) -> None: 1927 def run_serve(args: argparse.Namespace) -> None:
1760 - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) 1928 + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode)
1761 app = create_web_app(framework, Path(args.queries_file)) 1929 app = create_web_app(framework, Path(args.queries_file))
1762 import uvicorn 1930 import uvicorn
1763 1931
scripts/evaluation/queries/queries.txt
@@ -30,7 +30,6 @@ khaki green backpack @@ -30,7 +30,6 @@ khaki green backpack
30 高跟鞋 30 高跟鞋
31 图案连身衣 31 图案连身衣
32 天鹅绒鸡尾酒会礼服 32 天鹅绒鸡尾酒会礼服
33 -Wearing small clothes  
34 gingham dress 33 gingham dress
35 海滩度假装 34 海滩度假装
36 vacation outfits 35 vacation outfits
@@ -41,10 +40,15 @@ hiking boots @@ -41,10 +40,15 @@ hiking boots
41 business casual women 40 business casual women
42 a-line dress 41 a-line dress
43 涤纶短裤 42 涤纶短裤
44 -哺乳文胸  
45 Compression Top Spandex 43 Compression Top Spandex
46 skiing trip insulated base layer 44 skiing trip insulated base layer
47 high waisted jeans 45 high waisted jeans
48 无袖夏装 46 无袖夏装
49 雪纺衬衫 47 雪纺衬衫
50 -convertible zip-off hiking pants  
51 \ No newline at end of file 48 \ No newline at end of file
  49 +convertible zip-off hiking pants
  50 +petite summer linen shorts
  51 +tall slim fit men's linen shirt
  52 +tall slim fit trousers
  53 +tall straight leg pants
  54 +tassel maxi skirt
  55 +teacher clothes
52 \ No newline at end of file 56 \ No newline at end of file
scripts/evaluation/quick_start_eval.sh 0 → 100755
@@ -0,0 +1,39 @@ @@ -0,0 +1,39 @@
  1 +#!/usr/bin/env bash
  2 +# Search evaluation quick entrypoints. Run from any cwd; resolves repo root.
  3 +set -euo pipefail
  4 +
  5 +ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
  6 +cd "$ROOT"
  7 +PY="${ROOT}/.venv/bin/python"
  8 +TENANT_ID="${TENANT_ID:-163}"
  9 +QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}"
  10 +
  11 +usage() {
  12 + echo "Usage: $0 batch|serve"
  13 + echo " batch — refresh labels + batch metrics (default: top_k=50, simple labeler, force-refresh)"
  14 + echo " serve — eval UI on http://127.0.0.1:6010/"
  15 + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES (default $QUERIES)"
  16 +}
  17 +
  18 +case "${1:-}" in
  19 + batch)
  20 + exec "$PY" scripts/evaluation/build_annotation_set.py batch \
  21 + --tenant-id "$TENANT_ID" \
  22 + --queries-file "$QUERIES" \
  23 + --top-k 50 \
  24 + --language en \
  25 + --labeler-mode simple \
  26 + --force-refresh-labels
  27 + ;;
  28 + serve)
  29 + exec "$PY" scripts/evaluation/serve_eval_web.py serve \
  30 + --tenant-id "$TENANT_ID" \
  31 + --queries-file "$QUERIES" \
  32 + --host 127.0.0.1 \
  33 + --port 6010
  34 + ;;
  35 + *)
  36 + usage
  37 + exit 1
  38 + ;;
  39 +esac