Commit 881d338b3acc0b3de1bb3cfb77f4fc69755bb0f7
1 parent
432d1c88
评估框架
Showing
5 changed files
with
719 additions
and
49 deletions
Show diff stats
config/config.yaml
| 1 | +# Unified Configuration for Multi-Tenant Search Engine | |
| 2 | +# 统一配置文件,所有租户共用一套配置 | |
| 3 | +# 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为 | |
| 4 | +# | |
| 5 | +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项 | |
| 6 | +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。 | |
| 7 | + | |
| 8 | +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义) | |
| 1 | 9 | runtime: |
| 2 | 10 | environment: prod |
| 3 | 11 | index_namespace: '' |
| ... | ... | @@ -13,6 +21,8 @@ runtime: |
| 13 | 21 | translator_port: 6006 |
| 14 | 22 | reranker_host: 0.0.0.0 |
| 15 | 23 | reranker_port: 6007 |
| 24 | + | |
| 25 | +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY) | |
| 16 | 26 | infrastructure: |
| 17 | 27 | elasticsearch: |
| 18 | 28 | host: http://localhost:9200 |
| ... | ... | @@ -39,16 +49,30 @@ infrastructure: |
| 39 | 49 | secrets: |
| 40 | 50 | dashscope_api_key: null |
| 41 | 51 | deepl_auth_key: null |
| 52 | + | |
| 53 | +# Elasticsearch Index | |
| 42 | 54 | es_index_name: search_products |
| 55 | + | |
| 56 | +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出) | |
| 43 | 57 | indexes: [] |
| 58 | + | |
| 59 | +# Config assets | |
| 44 | 60 | assets: |
| 45 | 61 | query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict |
| 62 | + | |
| 63 | +# Product content understanding (LLM enrich-content) configuration | |
| 46 | 64 | product_enrich: |
| 47 | 65 | max_workers: 40 |
| 66 | + | |
| 67 | +# ES Index Settings (基础设置) | |
| 48 | 68 | es_settings: |
| 49 | 69 | number_of_shards: 1 |
| 50 | 70 | number_of_replicas: 0 |
| 51 | 71 | refresh_interval: 30s |
| 72 | + | |
| 73 | +# 字段权重配置(用于搜索时的字段boost) | |
| 74 | +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。 | |
| 75 | +# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 | |
| 52 | 76 | field_boosts: |
| 53 | 77 | title: 3.0 |
| 54 | 78 | qanchors: 2.5 |
| ... | ... | @@ -61,21 +85,39 @@ field_boosts: |
| 61 | 85 | option1_values: 1.5 |
| 62 | 86 | option2_values: 1.5 |
| 63 | 87 | option3_values: 1.5 |
| 88 | + | |
| 89 | +# Query Configuration(查询配置) | |
| 64 | 90 | query_config: |
| 91 | + # 支持的语言 | |
| 65 | 92 | supported_languages: |
| 66 | 93 | - zh |
| 67 | 94 | - en |
| 68 | 95 | default_language: en |
| 96 | + | |
| 97 | + # 功能开关(翻译开关由tenant_config控制) | |
| 69 | 98 | enable_text_embedding: true |
| 70 | 99 | enable_query_rewrite: true |
| 71 | - zh_to_en_model: nllb-200-distilled-600m | |
| 72 | - en_to_zh_model: nllb-200-distilled-600m | |
| 100 | + | |
| 101 | + # 查询翻译模型(须与 services.translation.capabilities 中某项一致) | |
| 102 | + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。 | |
| 103 | + zh_to_en_model: nllb-200-distilled-600m # "opus-mt-zh-en" | |
| 104 | + en_to_zh_model: nllb-200-distilled-600m # "opus-mt-en-zh" | |
| 73 | 105 | default_translation_model: nllb-200-distilled-600m |
| 106 | + # zh_to_en_model: deepl | |
| 107 | + # en_to_zh_model: deepl | |
| 108 | + # default_translation_model: deepl | |
| 109 | + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同) | |
| 74 | 110 | zh_to_en_model__source_not_in_index: nllb-200-distilled-600m |
| 75 | 111 | en_to_zh_model__source_not_in_index: nllb-200-distilled-600m |
| 76 | 112 | default_translation_model__source_not_in_index: nllb-200-distilled-600m |
| 77 | - translation_embedding_wait_budget_ms_source_in_index: 200 | |
| 78 | - translation_embedding_wait_budget_ms_source_not_in_index: 300 | |
| 113 | + # zh_to_en_model__source_not_in_index: deepl | |
| 114 | + # en_to_zh_model__source_not_in_index: deepl | |
| 115 | + # default_translation_model__source_not_in_index: deepl | |
| 116 | + | |
| 117 | + # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 | |
| 118 | + # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 | |
| 119 | + translation_embedding_wait_budget_ms_source_in_index: 200 # 80 | |
| 120 | + translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200 | |
| 79 | 121 | style_intent: |
| 80 | 122 | enabled: true |
| 81 | 123 | selected_sku_boost: 1.2 |
| ... | ... | @@ -102,6 +144,10 @@ query_config: |
| 102 | 144 | product_title_exclusion: |
| 103 | 145 | enabled: true |
| 104 | 146 | dictionary_path: config/dictionaries/product_title_exclusion.tsv |
| 147 | + | |
| 148 | + # 动态多语言检索字段配置 | |
| 149 | + # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; | |
| 150 | + # shared_fields 为无语言后缀字段。 | |
| 105 | 151 | search_fields: |
| 106 | 152 | multilingual_fields: |
| 107 | 153 | - title |
| ... | ... | @@ -111,11 +157,14 @@ query_config: |
| 111 | 157 | - brief |
| 112 | 158 | - description |
| 113 | 159 | - vendor |
| 160 | + # shared_fields: 无语言后缀字段;示例: tags, option1_values, option2_values, option3_values | |
| 114 | 161 | shared_fields: null |
| 115 | 162 | core_multilingual_fields: |
| 116 | 163 | - title |
| 117 | 164 | - qanchors |
| 118 | 165 | - category_name_text |
| 166 | + | |
| 167 | + # 统一文本召回策略(主查询 + 翻译查询) | |
| 119 | 168 | text_query_strategy: |
| 120 | 169 | base_minimum_should_match: 60% |
| 121 | 170 | translation_minimum_should_match: 60% |
| ... | ... | @@ -130,8 +179,14 @@ query_config: |
| 130 | 179 | title: 5.0 |
| 131 | 180 | qanchors: 4.0 |
| 132 | 181 | phrase_match_boost: 3.0 |
| 182 | + | |
| 183 | + # Embedding字段名称 | |
| 133 | 184 | text_embedding_field: title_embedding |
| 134 | 185 | image_embedding_field: image_embedding.vector |
| 186 | + | |
| 187 | + # 返回字段配置(_source includes) | |
| 188 | + # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 | |
| 189 | + # 下列字段与 api/result_formatter.py(SpuResult 填充)及 search/searcher.py(SKU 排序/主图替换)一致 | |
| 135 | 190 | source_fields: |
| 136 | 191 | - spu_id |
| 137 | 192 | - handle |
| ... | ... | @@ -163,18 +218,26 @@ query_config: |
| 163 | 218 | - option3_values |
| 164 | 219 | - specifications |
| 165 | 220 | - skus |
| 221 | + | |
| 222 | + # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates) | |
| 166 | 223 | knn_text_boost: 4 |
| 167 | 224 | knn_image_boost: 4 |
| 225 | + | |
| 226 | + # num_candidates ratio: knn_text_num_candidates = k * 3.5 (160 -> 560); the *_long and image variants use k * 3 (400 -> 1200) | |
| 168 | 227 | knn_text_k: 160 |
| 169 | 228 | knn_text_num_candidates: 560 |
| 170 | 229 | knn_text_k_long: 400 |
| 171 | 230 | knn_text_num_candidates_long: 1200 |
| 172 | 231 | knn_image_k: 400 |
| 173 | 232 | knn_image_num_candidates: 1200 |
| 233 | + | |
| 234 | +# Function Score配置(ES层打分规则) | |
| 174 | 235 | function_score: |
| 175 | 236 | score_mode: sum |
| 176 | 237 | boost_mode: multiply |
| 177 | 238 | functions: [] |
| 239 | + | |
| 240 | +# 粗排配置(仅融合 ES 文本/向量信号,不调用模型) | |
| 178 | 241 | coarse_rank: |
| 179 | 242 | enabled: true |
| 180 | 243 | input_window: 700 |
| ... | ... | @@ -182,12 +245,16 @@ coarse_rank: |
| 182 | 245 | fusion: |
| 183 | 246 | text_bias: 0.1 |
| 184 | 247 | text_exponent: 0.35 |
| 248 | + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合) | |
| 249 | + # 因为es的打分已经给了trans进行了折扣,所以这里不再继续折扣 | |
| 185 | 250 | text_translation_weight: 1.0 |
| 186 | 251 | knn_text_weight: 1.0 |
| 187 | 252 | knn_image_weight: 1.0 |
| 188 | 253 | knn_tie_breaker: 0.1 |
| 189 | 254 | knn_bias: 0.6 |
| 190 | 255 | knn_exponent: 0.0 |
| 256 | + | |
| 257 | +# 精排配置(轻量 reranker) | |
| 191 | 258 | fine_rank: |
| 192 | 259 | enabled: false |
| 193 | 260 | input_window: 160 |
| ... | ... | @@ -196,6 +263,8 @@ fine_rank: |
| 196 | 263 | rerank_query_template: '{query}' |
| 197 | 264 | rerank_doc_template: '{title}' |
| 198 | 265 | service_profile: fine |
| 266 | + | |
| 267 | +# 重排配置(provider/URL 在 services.rerank) | |
| 199 | 268 | rerank: |
| 200 | 269 | enabled: true |
| 201 | 270 | rerank_window: 160 |
| ... | ... | @@ -205,6 +274,11 @@ rerank: |
| 205 | 274 | rerank_query_template: '{query}' |
| 206 | 275 | rerank_doc_template: '{title}' |
| 207 | 276 | service_profile: default |
| 277 | + | |
| 278 | + # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项) | |
| 279 | + # 其中 knn_score 先做一层 dis_max: | |
| 280 | + # max(knn_text_weight * text_knn, knn_image_weight * image_knn) | |
| 281 | + # + knn_tie_breaker * 另一侧较弱信号 | |
| 208 | 282 | fusion: |
| 209 | 283 | rerank_bias: 1.0e-05 |
| 210 | 284 | rerank_exponent: 1.15 |
| ... | ... | @@ -212,22 +286,29 @@ rerank: |
| 212 | 286 | fine_exponent: 1.0 |
| 213 | 287 | text_bias: 0.1 |
| 214 | 288 | text_exponent: 0.25 |
| 289 | + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合) | |
| 215 | 290 | text_translation_weight: 0.8 |
| 216 | 291 | knn_text_weight: 1.0 |
| 217 | 292 | knn_image_weight: 1.0 |
| 218 | 293 | knn_tie_breaker: 0.1 |
| 219 | 294 | knn_bias: 0.6 |
| 220 | 295 | knn_exponent: 0.0 |
| 296 | + | |
| 297 | +# 可扩展服务/provider 注册表(单一配置源) | |
| 221 | 298 | services: |
| 222 | 299 | translation: |
| 223 | 300 | service_url: http://127.0.0.1:6006 |
| 301 | + # default_model: nllb-200-distilled-600m | |
| 224 | 302 | default_model: nllb-200-distilled-600m |
| 225 | 303 | default_scene: general |
| 226 | 304 | timeout_sec: 10.0 |
| 227 | 305 | cache: |
| 228 | 306 | ttl_seconds: 62208000 |
| 229 | 307 | sliding_expiration: true |
| 308 | + # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups). | |
| 230 | 309 | enable_model_quality_tier_cache: true |
| 310 | + # Higher tier = better quality. Multiple models may share one tier (同级). | |
| 311 | + # A request for a model in tier A may reuse Redis keys written by models with tier >= A (never from lower tiers). | |
| 231 | 312 | model_quality_tiers: |
| 232 | 313 | deepl: 30 |
| 233 | 314 | qwen-mt: 30 |
| ... | ... | @@ -321,12 +402,13 @@ services: |
| 321 | 402 | num_beams: 1 |
| 322 | 403 | use_cache: true |
| 323 | 404 | embedding: |
| 324 | - provider: http | |
| 405 | + provider: http # http | |
| 325 | 406 | providers: |
| 326 | 407 | http: |
| 327 | 408 | text_base_url: http://127.0.0.1:6005 |
| 328 | 409 | image_base_url: http://127.0.0.1:6008 |
| 329 | - backend: tei | |
| 410 | + # 服务内文本后端(embedding 进程启动时读取) | |
| 411 | + backend: tei # tei | local_st | |
| 330 | 412 | backends: |
| 331 | 413 | tei: |
| 332 | 414 | base_url: http://127.0.0.1:8080 |
| ... | ... | @@ -337,7 +419,10 @@ services: |
| 337 | 419 | device: cuda |
| 338 | 420 | batch_size: 32 |
| 339 | 421 | normalize_embeddings: true |
| 340 | - image_backend: clip_as_service | |
| 422 | + # 服务内图片后端(embedding 进程启动时读取;cnclip gRPC 与 6008 须同一 model_name) | |
| 423 | + # Chinese-CLIP:ViT-H-14 → 1024 维,ViT-L-14 → 768 维。须与 mappings/search_products.json 中 | |
| 424 | + # image_embedding.vector.dims 一致(当前索引为 1024 → 默认 ViT-H-14)。 | |
| 425 | + image_backend: clip_as_service # clip_as_service | local_cnclip | |
| 341 | 426 | image_backends: |
| 342 | 427 | clip_as_service: |
| 343 | 428 | server: grpc://127.0.0.1:51000 |
| ... | ... | @@ -364,6 +449,7 @@ services: |
| 364 | 449 | max_docs: 1000 |
| 365 | 450 | normalize: true |
| 366 | 451 | default_instance: default |
| 452 | + # 命名实例:同一套 reranker 代码按实例名读取不同端口 / 后端 / runtime 目录。 | |
| 367 | 453 | instances: |
| 368 | 454 | default: |
| 369 | 455 | host: 0.0.0.0 |
| ... | ... | @@ -405,11 +491,29 @@ services: |
| 405 | 491 | enforce_eager: false |
| 406 | 492 | infer_batch_size: 100 |
| 407 | 493 | sort_by_doc_length: true |
| 408 | - instruction_format: standard | |
| 494 | + # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct) | |
| 495 | + instruction_format: standard # compact standard | |
| 496 | + # instruction: "Given a query, score the product for relevance" | |
| 497 | + # Observed: "rank products by given query" works slightly better than "Given a query, score the product for relevance". | |
| 498 | + # instruction: "rank products by given query, category match first" | |
| 499 | + # instruction: "Rank products by query relevance, prioritizing category match" | |
| 500 | + # instruction: "Rank products by query relevance, prioritizing category and style match" | |
| 501 | + # instruction: "Rank by query relevance, prioritize category & style" | |
| 502 | + # instruction: "Relevance ranking: category & style match first" | |
| 503 | + # instruction: "Score product relevance by query with category & style match prioritized" | |
| 504 | + # instruction: "Rank products by query with category & style match prioritized" | |
| 505 | + # instruction: "Given a fashion shopping query, retrieve relevant products that answer the query" | |
| 409 | 506 | instruction: rank products by given query |
| 507 | + # vLLM LLM.score()(跨编码打分)。独立高性能环境 .venv-reranker-score(vllm 0.18 固定版):./scripts/setup_reranker_venv.sh qwen3_vllm_score | |
| 508 | + # 与 qwen3_vllm 可共用同一 model_name / HF 缓存;venv 分离以便升级 vLLM 而不影响 generate 后端。 | |
| 410 | 509 | qwen3_vllm_score: |
| 411 | 510 | model_name: Qwen/Qwen3-Reranker-0.6B |
| 511 | + # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false | |
| 412 | 512 | use_original_qwen3_hf_overrides: true |
| 513 | + # vllm_runner: "auto" | |
| 514 | + # vllm_convert: "auto" | |
| 515 | + # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并 | |
| 516 | + # hf_overrides: {} | |
| 413 | 517 | engine: vllm |
| 414 | 518 | max_model_len: 172 |
| 415 | 519 | tensor_parallel_size: 1 |
| ... | ... | @@ -419,15 +523,23 @@ services: |
| 419 | 523 | enforce_eager: false |
| 420 | 524 | infer_batch_size: 80 |
| 421 | 525 | sort_by_doc_length: true |
| 422 | - instruction_format: standard | |
| 526 | + # 默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致 | |
| 527 | + instruction_format: standard # compact standard | |
| 528 | + # instruction: "Rank products by query with category & style match prioritized" | |
| 529 | + # instruction: "Given a shopping query, rank products by relevance" | |
| 423 | 530 | instruction: Rank products by query with category & style match prioritized |
| 424 | 531 | qwen3_transformers: |
| 425 | 532 | model_name: Qwen/Qwen3-Reranker-0.6B |
| 426 | 533 | instruction: rank products by given query |
| 534 | + # instruction: "Score the product’s relevance to the given query" | |
| 427 | 535 | max_length: 8192 |
| 428 | 536 | batch_size: 64 |
| 429 | 537 | use_fp16: true |
| 538 | + # sdpa:默认无需 flash-attn;若已安装 flash_attn 可改为 flash_attention_2 | |
| 430 | 539 | attn_implementation: sdpa |
| 540 | + # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask. | |
| 541 | + # For 1 query + many short docs (for example 400 product titles), this usually reduces | |
| 542 | + # repeated prefix work and padding waste compared with pairwise batching. | |
| 431 | 543 | qwen3_transformers_packed: |
| 432 | 544 | model_name: Qwen/Qwen3-Reranker-0.6B |
| 433 | 545 | instruction: Rank products by query with category & style match prioritized |
| ... | ... | @@ -436,6 +548,8 @@ services: |
| 436 | 548 | max_docs_per_pack: 0 |
| 437 | 549 | use_fp16: true |
| 438 | 550 | sort_by_doc_length: true |
| 551 | + # Packed mode relies on a custom 4D attention mask. "eager" is the safest default. | |
| 552 | + # If your torch/transformers stack validates it, you can benchmark "sdpa". | |
| 439 | 553 | attn_implementation: eager |
| 440 | 554 | qwen3_gguf: |
| 441 | 555 | repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF |
| ... | ... | @@ -443,6 +557,7 @@ services: |
| 443 | 557 | cache_dir: ./model_cache |
| 444 | 558 | local_dir: ./models/reranker/qwen3-reranker-4b-gguf |
| 445 | 559 | instruction: Rank products by query with category & style match prioritized |
| 560 | + # T4 16GB / 性能优先配置:全量层 offload,实测比保守配置明显更快 | |
| 446 | 561 | n_ctx: 512 |
| 447 | 562 | n_batch: 512 |
| 448 | 563 | n_ubatch: 512 |
| ... | ... | @@ -465,6 +580,8 @@ services: |
| 465 | 580 | cache_dir: ./model_cache |
| 466 | 581 | local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf |
| 467 | 582 | instruction: Rank products by query with category & style match prioritized |
| 583 | + # 0.6B GGUF / online rerank baseline: | |
| 584 | + # 实测 400 titles 单请求约 265s,因此它更适合作为低显存功能后备,不适合在线低延迟主路由。 | |
| 468 | 585 | n_ctx: 256 |
| 469 | 586 | n_batch: 256 |
| 470 | 587 | n_ubatch: 256 |
| ... | ... | @@ -484,22 +601,34 @@ services: |
| 484 | 601 | verbose: false |
| 485 | 602 | dashscope_rerank: |
| 486 | 603 | model_name: qwen3-rerank |
| 604 | + # 按地域选择 endpoint: | |
| 605 | + # 中国: https://dashscope.aliyuncs.com/compatible-api/v1/reranks | |
| 606 | + # 新加坡: https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks | |
| 607 | + # 美国: https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks | |
| 487 | 608 | endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks |
| 488 | 609 | api_key_env: RERANK_DASHSCOPE_API_KEY_CN |
| 489 | 610 | timeout_sec: 10.0 |
| 490 | - top_n_cap: 0 | |
| 491 | - batchsize: 64 | |
| 611 | + top_n_cap: 0 # 0 表示 top_n=当前请求文档数;>0 则限制 top_n 上限 | |
| 612 | + batchsize: 64 # 0 关闭;>0 启用并发小包调度(top_n/top_n_cap 仍生效,分包后全局截断) | |
| 492 | 613 | instruct: Given a shopping query, rank product titles by relevance |
| 493 | 614 | max_retries: 2 |
| 494 | 615 | retry_backoff_sec: 0.2 |
| 616 | + | |
| 617 | +# SPU配置(已启用,使用嵌套skus) | |
| 495 | 618 | spu_config: |
| 496 | 619 | enabled: true |
| 497 | 620 | spu_field: spu_id |
| 498 | 621 | inner_hits_size: 10 |
| 622 | + # 配置哪些option维度参与检索(进索引、以及在线搜索) | |
| 623 | + # 格式为list,选择option1/option2/option3中的一个或多个 | |
| 499 | 624 | searchable_option_dimensions: |
| 500 | 625 | - option1 |
| 501 | 626 | - option2 |
| 502 | 627 | - option3 |
| 628 | + | |
| 629 | +# 租户配置(Tenant Configuration) | |
| 630 | +# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选) | |
| 631 | +# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集 | |
| 503 | 632 | tenant_config: |
| 504 | 633 | default: |
| 505 | 634 | primary_language: en | ... | ... |
| ... | ... | @@ -0,0 +1,330 @@ |
| 1 | +# Search Evaluation Framework | |
| 2 | + | |
| 3 | +This directory contains the offline annotation set builder, the online evaluation UI/API, the audit tooling, and the fusion-tuning runner for retrieval quality evaluation. | |
| 4 | + | |
| 5 | +It is designed around one core rule: | |
| 6 | + | |
| 7 | +- Annotation should be built offline first. | |
| 8 | +- Single-query evaluation should then map recalled `spu_id` values to the cached annotation set. | |
| 9 | +- Recalled items without cached labels are treated as `Irrelevant` during evaluation, and the UI/API returns a tip so the operator knows coverage is incomplete. | |
| 10 | + | |
| 11 | +## Goals | |
| 12 | + | |
| 13 | +The framework supports four related tasks: | |
| 14 | + | |
| 15 | +1. Build an annotation set for a fixed query set. | |
| 16 | +2. Evaluate a live search result list against that annotation set. | |
| 17 | +3. Run batch evaluation and store historical reports with config snapshots. | |
| 18 | +4. Tune fusion parameters reproducibly. | |
| 19 | + | |
| 20 | +## Files | |
| 21 | + | |
| 22 | +- `eval_framework.py` | |
| 23 | + Search evaluation core implementation, CLI, FastAPI app, SQLite store, audit logic, and report generation. | |
| 24 | +- `build_annotation_set.py` | |
| 25 | + Thin CLI entrypoint into `eval_framework.py`. | |
| 26 | +- `serve_eval_web.py` | |
| 27 | + Thin web entrypoint into `eval_framework.py`. | |
| 28 | +- `tune_fusion.py` | |
| 29 | + Fusion experiment runner. It applies config variants, restarts backend, runs batch evaluation, and stores experiment reports. | |
| 30 | +- `fusion_experiments_shortlist.json` | |
| 31 | + A compact experiment set for practical tuning. | |
| 32 | +- `fusion_experiments_round1.json` | |
| 33 | + A broader first-round experiment set. | |
| 34 | +- `queries/queries.txt` | |
| 35 | + The canonical evaluation query set. | |
| 36 | +- `README_Requirement.md` | |
| 37 | + Requirement reference document. | |
| 38 | +- `quick_start_eval.sh` | |
| 39 | + Optional wrapper to run the batch refresh or the web UI from repo root (uses `./.venv/bin/python`). | |
| 40 | + | |
| 41 | +## Quick start (from repo root) | |
| 42 | + | |
| 43 | +Set the tenant if needed (`export TENANT_ID=163`). This requires a live search API, a DashScope API key for LLM labeling, and, for batch refresh, a working backend. | |
| 44 | + | |
| 45 | +```bash | |
| 46 | +# 1) Refresh offline labels for every line in the queries file, then write batch metrics under artifacts/ | |
| 47 | +./scripts/evaluation/quick_start_eval.sh batch | |
| 48 | + | |
| 49 | +# 2) Evaluation UI on http://127.0.0.1:6010/ | |
| 50 | +./scripts/evaluation/quick_start_eval.sh serve | |
| 51 | +``` | |
| 52 | + | |
| 53 | +Equivalent explicit commands: | |
| 54 | + | |
| 55 | +```bash | |
| 56 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ | |
| 57 | + --tenant-id "${TENANT_ID:-163}" \ | |
| 58 | + --queries-file scripts/evaluation/queries/queries.txt \ | |
| 59 | + --top-k 50 \ | |
| 60 | + --language en \ | |
| 61 | + --labeler-mode simple \ | |
| 62 | + --force-refresh-labels | |
| 63 | + | |
| 64 | +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \ | |
| 65 | + --tenant-id "${TENANT_ID:-163}" \ | |
| 66 | + --queries-file scripts/evaluation/queries/queries.txt \ | |
| 67 | + --host 127.0.0.1 \ | |
| 68 | + --port 6010 | |
| 69 | +``` | |
| 70 | + | |
| 71 | +**Batch behavior:** There is no option to skip queries that were already processed; each run walks the full queries file. With `--force-refresh-labels`, the runner issues a live search for **every** query and sends **all** `top_k` returned `spu_id`s through the LLM again (SQLite rows are upserted). Omit `--force-refresh-labels` if you only want to fill in labels that are missing for the current recall window. | |
| 72 | + | |
| 73 | +## Storage Layout | |
| 74 | + | |
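| 75 | +All generated artifacts are written under `artifacts/search_evaluation` relative to the project root (`DEFAULT_ARTIFACT_ROOT` in `eval_framework.py`). The absolute paths below assume the project root is `/data/saas-search`: | |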
| 76 | + | |
| 77 | +- `/data/saas-search/artifacts/search_evaluation` | |
| 78 | + | |
| 79 | +Important subpaths: | |
| 80 | + | |
| 81 | +- `/data/saas-search/artifacts/search_evaluation/search_eval.sqlite3` | |
| 82 | + Main cache and annotation store. | |
| 83 | +- `/data/saas-search/artifacts/search_evaluation/query_builds` | |
| 84 | + Per-query pooled annotation-set build artifacts. | |
| 85 | +- `/data/saas-search/artifacts/search_evaluation/batch_reports` | |
| 86 | + Batch evaluation JSON, Markdown reports, and config snapshots. | |
| 87 | +- `/data/saas-search/artifacts/search_evaluation/audits` | |
| 88 | + Audit summaries for label quality checks. | |
| 89 | +- `/data/saas-search/artifacts/search_evaluation/tuning_runs` | |
| 90 | + Fusion experiment summaries and per-experiment config snapshots. | |
| 91 | + | |
| 92 | +## SQLite Schema Summary | |
| 93 | + | |
| 94 | +The main tables in `search_eval.sqlite3` are: | |
| 95 | + | |
| 96 | +- `corpus_docs` | |
| 97 | + Cached product corpus for the tenant. | |
| 98 | +- `rerank_scores` | |
| 99 | + Cached full-corpus reranker scores keyed by `(tenant_id, query_text, spu_id)`. | |
| 100 | +- `relevance_labels` | |
| 101 | + Cached LLM relevance labels keyed by `(tenant_id, query_text, spu_id)`. | |
| 102 | +- `query_profiles` | |
| 103 | + Structured query-intent profiles extracted before labeling. | |
| 104 | +- `build_runs` | |
| 105 | + Per-query pooled-build records. | |
| 106 | +- `batch_runs` | |
| 107 | + Batch evaluation history. | |
| 108 | + | |
| 109 | +## Label Semantics | |
| 110 | + | |
| 111 | +Three labels are used throughout: | |
| 112 | + | |
| 113 | +- `Exact` | |
| 114 | + Fully matches the intended product type and all explicit required attributes. | |
| 115 | +- `Partial` | |
| 116 | + Main intent matches, but explicit attributes are missing, approximate, or weaker than requested. | |
| 117 | +- `Irrelevant` | |
| 118 | + Product type mismatches, or explicit required attributes conflict. | |
| 119 | + | |
| 120 | +The framework always uses: | |
| 121 | + | |
| 122 | +- LLM-based batched relevance classification | |
| 123 | +- caching and retry logic for robust offline labeling | |
| 124 | + | |
| 125 | +There are now two labeler modes: | |
| 126 | + | |
| 127 | +- `simple` | |
| 128 | + Default. A single low-coupling LLM judging pass per batch, using the standard relevance prompt. | |
| 129 | +- `complex` | |
| 130 | + Legacy structured mode. It extracts query profiles and applies extra guardrails. Kept for comparison, but no longer the default. | |
| 131 | + | |
| 132 | +## Offline-First Workflow | |
| 133 | + | |
| 134 | +### 1. Refresh labels for the evaluation query set | |
| 135 | + | |
| 136 | +For practical evaluation, the most important offline step is to pre-label the result window you plan to score. For the current metrics (`P@5`, `P@10`, `P@20`, `P@50`, `MAP_3`, `MAP_2_3`), a `top_k=50` cached label set is sufficient. | |
| 137 | + | |
| 138 | +Example: | |
| 139 | + | |
| 140 | +```bash | |
| 141 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ | |
| 142 | + --tenant-id 163 \ | |
| 143 | + --queries-file scripts/evaluation/queries/queries.txt \ | |
| 144 | + --top-k 50 \ | |
| 145 | + --language en \ | |
| 146 | + --labeler-mode simple \ | |
| 147 | + --force-refresh-labels | |
| 148 | +``` | |
| 149 | + | |
| 150 | +This command does two things: | |
| 151 | + | |
| 152 | +- runs **every** query in the file against the live backend (no skip list) | |
| 153 | +- with `--force-refresh-labels`, re-labels **all** `top_k` hits per query via the LLM and upserts SQLite; without the flag, only `spu_id`s lacking a cached label are sent to the LLM | |
| 154 | + | |
| 155 | +After this step, single-query evaluation can run in cached mode without calling the LLM again. | |
| 156 | + | |
| 157 | +### 2. Optional pooled build | |
| 158 | + | |
| 159 | +The framework also supports a heavier pooled build that combines: | |
| 160 | + | |
| 161 | +- top search results | |
| 162 | +- top full-corpus reranker results | |
| 163 | + | |
| 164 | +Example: | |
| 165 | + | |
| 166 | +```bash | |
| 167 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py build \ | |
| 168 | + --tenant-id 163 \ | |
| 169 | + --queries-file scripts/evaluation/queries/queries.txt \ | |
| 170 | + --search-depth 1000 \ | |
| 171 | + --rerank-depth 10000 \ | |
| 172 | + --annotate-search-top-k 100 \ | |
| 173 | + --annotate-rerank-top-k 120 \ | |
| 174 | + --language en | |
| 175 | +``` | |
| 176 | + | |
| 177 | +This is slower, but useful when you want a richer pooled annotation set beyond the current live recall window. | |
| 178 | + | |
| 179 | +## Why Single-Query Evaluation Can Be Slow | |
| 180 | + | |
| 181 | +If single-query evaluation is slow, the usual reason is that it is still running with `auto_annotate=true`, which means: | |
| 182 | + | |
| 183 | +- perform live search | |
| 184 | +- detect recalled but unlabeled products | |
| 185 | +- call the LLM to label them | |
| 186 | + | |
| 187 | +That is not the intended steady-state evaluation path. | |
| 188 | + | |
| 189 | +The UI/API is now configured to prefer cached evaluation: | |
| 190 | + | |
| 191 | +- default single-query evaluation uses `auto_annotate=false` | |
| 192 | +- unlabeled recalled results are treated as `Irrelevant` | |
| 193 | +- the response includes tips explaining that coverage gap | |
| 194 | + | |
| 195 | +If you want stable, fast evaluation: | |
| 196 | + | |
| 197 | +1. prebuild labels offline | |
| 198 | +2. use cached single-query evaluation | |
| 199 | + | |
| 200 | +## Web UI | |
| 201 | + | |
| 202 | +Start the evaluation UI: | |
| 203 | + | |
| 204 | +```bash | |
| 205 | +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \ | |
| 206 | + --tenant-id 163 \ | |
| 207 | + --queries-file scripts/evaluation/queries/queries.txt \ | |
| 208 | + --host 127.0.0.1 \ | |
| 209 | + --port 6010 | |
| 210 | +``` | |
| 211 | + | |
| 212 | +The UI provides: | |
| 213 | + | |
| 214 | +- query list loaded from `queries.txt` | |
| 215 | +- single-query evaluation | |
| 216 | +- batch evaluation | |
| 217 | +- history of batch reports | |
| 218 | +- top recalled results | |
| 219 | +- missed `Exact` and `Partial` products that were not recalled | |
| 220 | +- tips about unlabeled hits treated as `Irrelevant` | |
| 221 | + | |
| 222 | +### Single-query response behavior | |
| 223 | + | |
| 224 | +For a single query: | |
| 225 | + | |
| 226 | +1. live search returns recalled `spu_id` values | |
| 227 | +2. the framework looks up cached labels by `(tenant_id, query_text, spu_id)` | |
| 228 | +3. unlabeled recalled items are counted as `Irrelevant` | |
| 229 | +4. cached `Exact` and `Partial` products that were not recalled are listed under `Missed Exact / Partial` | |
| 230 | + | |
| 231 | +This makes the page useful as a real retrieval-evaluation view rather than only a search-result viewer. | |
| 232 | + | |
| 233 | +## CLI Commands | |
| 234 | + | |
| 235 | +### Build pooled annotation artifacts | |
| 236 | + | |
| 237 | +```bash | |
| 238 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py build ... | |
| 239 | +``` | |
| 240 | + | |
| 241 | +### Run batch evaluation | |
| 242 | + | |
| 243 | +```bash | |
| 244 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ | |
| 245 | + --tenant-id 163 \ | |
| 246 | + --queries-file scripts/evaluation/queries/queries.txt \ | |
| 247 | + --top-k 50 \ | |
| 248 | + --language en \ | |
| 249 | + --labeler-mode simple | |
| 250 | +``` | |
| 251 | + | |
| 252 | +Use `--force-refresh-labels` if you want to rebuild the offline label cache for the recalled window first. | |
| 253 | + | |
| 254 | +### Audit annotation quality | |
| 255 | + | |
| 256 | +```bash | |
| 257 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py audit \ | |
| 258 | + --tenant-id 163 \ | |
| 259 | + --queries-file scripts/evaluation/queries/queries.txt \ | |
| 260 | + --top-k 50 \ | |
| 261 | + --language en \ | |
| 262 | + --labeler-mode simple | |
| 263 | +``` | |
| 264 | + | |
| 265 | +This checks cached labels against current guardrails and reports suspicious cases. | |
| 266 | + | |
| 267 | +## Batch Reports | |
| 268 | + | |
| 269 | +Each batch run stores: | |
| 270 | + | |
| 271 | +- aggregate metrics | |
| 272 | +- per-query metrics | |
| 273 | +- label distribution | |
| 274 | +- timestamp | |
| 275 | +- config snapshot from `/admin/config` | |
| 276 | + | |
| 277 | +Reports are written as: | |
| 278 | + | |
| 279 | +- Markdown for easy reading | |
| 280 | +- JSON for downstream processing | |
| 281 | + | |
| 282 | +## Fusion Tuning | |
| 283 | + | |
| 284 | +The tuning runner applies experiment configs sequentially and records the outcome. | |
| 285 | + | |
| 286 | +Example: | |
| 287 | + | |
| 288 | +```bash | |
| 289 | +./.venv/bin/python scripts/evaluation/tune_fusion.py \ | |
| 290 | + --tenant-id 163 \ | |
| 291 | + --queries-file scripts/evaluation/queries/queries.txt \ | |
| 292 | + --top-k 50 \ | |
| 293 | + --language en \ | |
| 294 | + --experiments-file scripts/evaluation/fusion_experiments_shortlist.json \ | |
| 295 | + --score-metric MAP_3 \ | |
| 296 | + --apply-best | |
| 297 | +``` | |
| 298 | + | |
| 299 | +What it does: | |
| 300 | + | |
| 301 | +1. writes an experiment config into `config/config.yaml` | |
| 302 | +2. restarts backend | |
| 303 | +3. runs batch evaluation | |
| 304 | +4. stores the per-experiment result | |
| 305 | +5. optionally applies the best experiment at the end | |
| 306 | + | |
| 307 | +## Current Practical Recommendation | |
| 308 | + | |
| 309 | +For day-to-day evaluation: | |
| 310 | + | |
| 311 | +1. refresh the offline labels for the fixed query set with `batch --force-refresh-labels` | |
| 312 | +2. run the web UI or normal batch evaluation in cached mode | |
| 313 | +3. only force-refresh labels again when: | |
| 314 | + - the query set changes | |
| 315 | + - the product corpus changes materially | |
| 316 | + - the labeling logic changes | |
| 317 | + | |
| 318 | +## Caveats | |
| 319 | + | |
| 320 | +- The current label cache is query-specific, not a full all-products all-queries matrix. | |
| 321 | +- Single-query evaluation still depends on the live search API for recall, but not on the LLM if labels are already cached. | |
| 322 | +- The backend can be briefly unstable immediately after a restart in this environment; scripts may need to wait a short time after restarting before sending requests. | |
| 323 | +- Some multilingual translation hints are noisy on long-tail fashion queries, which is one reason fusion tuning around translation weight matters. | |
| 324 | + | |
| 325 | +## Related Requirement Docs | |
| 326 | + | |
| 327 | +- `README_Requirement.md` | |
| 328 | +- `README_Requirement_zh.md` | |
| 329 | + | |
| 330 | +These documents describe the original problem statement. This `README.md` describes the implemented framework and the current recommended workflow. | ... | ... |
scripts/evaluation/eval_framework.py
| ... | ... | @@ -39,7 +39,9 @@ RELEVANCE_IRRELEVANT = "Irrelevant" |
| 39 | 39 | VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} |
| 40 | 40 | DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" |
| 41 | 41 | DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt" |
| 42 | -JUDGE_PROMPT_VERSION = "v2_structured_20260331" | |
| 42 | +JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" | |
| 43 | +JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" | |
| 44 | +DEFAULT_LABELER_MODE = "simple" | |
| 43 | 45 | |
| 44 | 46 | |
| 45 | 47 | def utc_now_iso() -> str: |
| ... | ... | @@ -625,6 +627,57 @@ class DashScopeLabelClient: |
| 625 | 627 | content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() |
| 626 | 628 | return content, safe_json_dumps(data) |
| 627 | 629 | |
| 630 | + def classify_batch_simple( | |
| 631 | + self, | |
| 632 | + query: str, | |
| 633 | + docs: Sequence[Dict[str, Any]], | |
| 634 | + ) -> Tuple[List[str], str]: | |
| 635 | + numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] | |
| 636 | + prompt = ( | |
| 637 | + "You are an e-commerce search result relevance evaluation assistant. " | |
| 638 | + "Based on the user query and each product's information, output the relevance level for each product.\n\n" | |
| 639 | + "## Relevance Level Criteria\n" | |
| 640 | + "Exact — Fully matches the user's search intent.\n" | |
| 641 | + "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), " | |
| 642 | + "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n" | |
| 643 | + "Irrelevant — Category or use case mismatched, primary intent not satisfied.\n\n" | |
| 644 | + "Additional judging guidance:\n" | |
| 645 | + "- If the query clearly names a product type, product type matching has the highest priority. " | |
| 646 | + "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, " | |
| 647 | + "bra vs top, backpack vs bag are not interchangeable.\n" | |
| 648 | + "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n" | |
| 649 | + "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n" | |
| 650 | + "- Do not guess missing attributes.\n" | |
| 651 | + "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n" | |
| 652 | + "- Be conservative with Exact.\n\n" | |
| 653 | + f"Query: {query}\n\n" | |
| 654 | + "Products:\n" | |
| 655 | + + "\n".join(numbered_docs) | |
| 656 | + + "\n\n## Output Format\n" | |
| 657 | + f"Strictly output {len(docs)} lines, each line containing exactly one of Exact / Partial / Irrelevant. " | |
| 658 | + "They must correspond sequentially to the products above. Do not output any other information.\n" | |
| 659 | + ) | |
| 660 | + content, raw_response = self._chat(prompt) | |
| 661 | + labels = [] | |
| 662 | + for line in str(content or "").splitlines(): | |
| 663 | + label = line.strip() | |
| 664 | + if label in VALID_LABELS: | |
| 665 | + labels.append(label) | |
| 666 | + if len(labels) != len(docs): | |
| 667 | + payload = _extract_json_blob(content) | |
| 668 | + if isinstance(payload, dict) and isinstance(payload.get("labels"), list): | |
| 669 | + labels = [] | |
| 670 | + for item in payload["labels"][: len(docs)]: | |
| 671 | + if isinstance(item, dict): | |
| 672 | + label = str(item.get("label") or "").strip() | |
| 673 | + else: | |
| 674 | + label = str(item).strip() | |
| 675 | + if label in VALID_LABELS: | |
| 676 | + labels.append(label) | |
| 677 | + if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): | |
| 678 | + raise ValueError(f"unexpected simple label output: {content!r}") | |
| 679 | + return labels, raw_response | |
| 680 | + | |
| 628 | 681 | def extract_query_profile( |
| 629 | 682 | self, |
| 630 | 683 | query: str, |
| ... | ... | @@ -665,7 +718,7 @@ class DashScopeLabelClient: |
| 665 | 718 | payload.setdefault("notes", []) |
| 666 | 719 | return payload, raw_response |
| 667 | 720 | |
| 668 | - def classify_batch( | |
| 721 | + def classify_batch_complex( | |
| 669 | 722 | self, |
| 670 | 723 | query: str, |
| 671 | 724 | query_profile: Dict[str, Any], |
| ... | ... | @@ -763,10 +816,12 @@ class SearchEvaluationFramework: |
| 763 | 816 | tenant_id: str, |
| 764 | 817 | artifact_root: Path = DEFAULT_ARTIFACT_ROOT, |
| 765 | 818 | search_base_url: str = "http://localhost:6002", |
| 819 | + labeler_mode: str = DEFAULT_LABELER_MODE, | |
| 766 | 820 | ): |
| 767 | 821 | init_service(get_app_config().infrastructure.elasticsearch.host) |
| 768 | 822 | self.tenant_id = str(tenant_id) |
| 769 | 823 | self.artifact_root = ensure_dir(artifact_root) |
| 824 | + self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE | |
| 770 | 825 | self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") |
| 771 | 826 | self.search_client = SearchServiceClient(search_base_url, self.tenant_id) |
| 772 | 827 | app_cfg = get_app_config() |
| ... | ... | @@ -783,17 +838,24 @@ class SearchEvaluationFramework: |
| 783 | 838 | base_url=str(llm_cfg["base_url"]), |
| 784 | 839 | api_key=str(api_key), |
| 785 | 840 | ) |
| 786 | - self.query_parser = get_query_parser() | |
| 841 | + self.query_parser = None | |
| 842 | + | |
| 843 | + def _get_query_parser(self): | |
| 844 | + if self.query_parser is None: | |
| 845 | + self.query_parser = get_query_parser() | |
| 846 | + return self.query_parser | |
| 787 | 847 | |
| 788 | 848 | def build_query_parser_hints(self, query: str) -> Dict[str, Any]: |
| 789 | - parsed = self.query_parser.parse(query, generate_vector=False, target_languages=["en", "zh"]) | |
| 849 | + parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"]) | |
| 790 | 850 | payload = parsed.to_dict() |
| 791 | 851 | payload["text_for_rerank"] = parsed.text_for_rerank() |
| 792 | 852 | return payload |
| 793 | 853 | |
| 794 | 854 | def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]: |
| 855 | + if self.labeler_mode != "complex": | |
| 856 | + raise RuntimeError("query profiles are only used in complex labeler mode") | |
| 795 | 857 | if not force_refresh: |
| 796 | - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION) | |
| 858 | + cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX) | |
| 797 | 859 | if cached is not None: |
| 798 | 860 | return cached |
| 799 | 861 | parser_hints = self.build_query_parser_hints(query) |
| ... | ... | @@ -802,7 +864,7 @@ class SearchEvaluationFramework: |
| 802 | 864 | self.store.upsert_query_profile( |
| 803 | 865 | self.tenant_id, |
| 804 | 866 | query, |
| 805 | - JUDGE_PROMPT_VERSION, | |
| 867 | + JUDGE_PROMPT_VERSION_COMPLEX, | |
| 806 | 868 | self.label_client.model, |
| 807 | 869 | profile, |
| 808 | 870 | raw_response, |
| ... | ... | @@ -955,9 +1017,24 @@ class SearchEvaluationFramework: |
| 955 | 1017 | *, |
| 956 | 1018 | top_k: int = 100, |
| 957 | 1019 | language: str = "en", |
| 958 | - auto_annotate: bool = True, | |
| 1020 | + auto_annotate: bool = False, | |
| 959 | 1021 | ) -> Dict[str, Any]: |
| 960 | 1022 | live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) |
| 1023 | + if self.labeler_mode != "complex": | |
| 1024 | + labels = [ | |
| 1025 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | |
| 1026 | + for item in live["results"] | |
| 1027 | + ] | |
| 1028 | + return { | |
| 1029 | + "query": query, | |
| 1030 | + "tenant_id": self.tenant_id, | |
| 1031 | + "top_k": top_k, | |
| 1032 | + "metrics": live["metrics"], | |
| 1033 | + "distribution": label_distribution(labels), | |
| 1034 | + "query_profile": None, | |
| 1035 | + "suspicious": [], | |
| 1036 | + "results": live["results"], | |
| 1037 | + } | |
| 961 | 1038 | query_profile = self.get_query_profile(query, force_refresh=False) |
| 962 | 1039 | suspicious: List[Dict[str, Any]] = [] |
| 963 | 1040 | |
| ... | ... | @@ -1093,7 +1170,6 @@ class SearchEvaluationFramework: |
| 1093 | 1170 | docs: Sequence[Dict[str, Any]], |
| 1094 | 1171 | force_refresh: bool = False, |
| 1095 | 1172 | ) -> Dict[str, str]: |
| 1096 | - query_profile = self.get_query_profile(query, force_refresh=force_refresh) | |
| 1097 | 1173 | labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query) |
| 1098 | 1174 | missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels] |
| 1099 | 1175 | if not missing_docs: |
| ... | ... | @@ -1101,12 +1177,9 @@ class SearchEvaluationFramework: |
| 1101 | 1177 | |
| 1102 | 1178 | for start in range(0, len(missing_docs), self.label_client.batch_size): |
| 1103 | 1179 | batch = missing_docs[start : start + self.label_client.batch_size] |
| 1104 | - batch_pairs = self._classify_with_retry(query, query_profile, batch) | |
| 1180 | + batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh) | |
| 1105 | 1181 | for sub_labels, raw_response, sub_batch in batch_pairs: |
| 1106 | - to_store = { | |
| 1107 | - str(doc.get("spu_id")): self._apply_rule_based_label_guardrails(label, query_profile, doc) | |
| 1108 | - for doc, label in zip(sub_batch, sub_labels) | |
| 1109 | - } | |
| 1182 | + to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)} | |
| 1110 | 1183 | self.store.upsert_labels( |
| 1111 | 1184 | self.tenant_id, |
| 1112 | 1185 | query, |
| ... | ... | @@ -1121,19 +1194,28 @@ class SearchEvaluationFramework: |
| 1121 | 1194 | def _classify_with_retry( |
| 1122 | 1195 | self, |
| 1123 | 1196 | query: str, |
| 1124 | - query_profile: Dict[str, Any], | |
| 1125 | 1197 | docs: Sequence[Dict[str, Any]], |
| 1198 | + *, | |
| 1199 | + force_refresh: bool = False, | |
| 1126 | 1200 | ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]: |
| 1127 | 1201 | if not docs: |
| 1128 | 1202 | return [] |
| 1129 | 1203 | try: |
| 1130 | - labels, raw_response = self.label_client.classify_batch(query, query_profile, docs) | |
| 1204 | + if self.labeler_mode == "complex": | |
| 1205 | + query_profile = self.get_query_profile(query, force_refresh=force_refresh) | |
| 1206 | + labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs) | |
| 1207 | + labels = [ | |
| 1208 | + self._apply_rule_based_label_guardrails(label, query_profile, doc) | |
| 1209 | + for doc, label in zip(docs, labels) | |
| 1210 | + ] | |
| 1211 | + else: | |
| 1212 | + labels, raw_response = self.label_client.classify_batch_simple(query, docs) | |
| 1131 | 1213 | return [(labels, raw_response, docs)] |
| 1132 | 1214 | except Exception: |
| 1133 | 1215 | if len(docs) == 1: |
| 1134 | 1216 | raise |
| 1135 | 1217 | mid = len(docs) // 2 |
| 1136 | - return self._classify_with_retry(query, query_profile, docs[:mid]) + self._classify_with_retry(query, query_profile, docs[mid:]) | |
| 1218 | + return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh) | |
| 1137 | 1219 | |
| 1138 | 1220 | def build_query_annotation_set( |
| 1139 | 1221 | self, |
| ... | ... | @@ -1163,7 +1245,6 @@ class SearchEvaluationFramework: |
| 1163 | 1245 | for item in full_rerank[:annotate_rerank_top_k]: |
| 1164 | 1246 | pool_docs[str(item["spu_id"])] = item["doc"] |
| 1165 | 1247 | |
| 1166 | - query_profile = self.get_query_profile(query, force_refresh=force_refresh_labels) | |
| 1167 | 1248 | labels = self.annotate_missing_labels( |
| 1168 | 1249 | query=query, |
| 1169 | 1250 | docs=list(pool_docs.values()), |
| ... | ... | @@ -1229,7 +1310,8 @@ class SearchEvaluationFramework: |
| 1229 | 1310 | "annotate_rerank_top_k": annotate_rerank_top_k, |
| 1230 | 1311 | "pool_size": len(pool_docs), |
| 1231 | 1312 | }, |
| 1232 | - "query_profile": query_profile, | |
| 1313 | + "labeler_mode": self.labeler_mode, | |
| 1314 | + "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None, | |
| 1233 | 1315 | "metrics_top100": metrics, |
| 1234 | 1316 | "search_results": search_labeled_results, |
| 1235 | 1317 | "full_rerank_top": rerank_top_results, |
| ... | ... | @@ -1250,7 +1332,7 @@ class SearchEvaluationFramework: |
| 1250 | 1332 | self, |
| 1251 | 1333 | query: str, |
| 1252 | 1334 | top_k: int = 100, |
| 1253 | - auto_annotate: bool = True, | |
| 1335 | + auto_annotate: bool = False, | |
| 1254 | 1336 | language: str = "en", |
| 1255 | 1337 | force_refresh_labels: bool = False, |
| 1256 | 1338 | ) -> Dict[str, Any]: |
| ... | ... | @@ -1259,16 +1341,21 @@ class SearchEvaluationFramework: |
| 1259 | 1341 | if auto_annotate: |
| 1260 | 1342 | self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels) |
| 1261 | 1343 | labels = self.store.get_labels(self.tenant_id, query) |
| 1344 | + recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]} | |
| 1262 | 1345 | labeled = [] |
| 1346 | + unlabeled_hits = 0 | |
| 1263 | 1347 | for rank, doc in enumerate(results[:top_k], start=1): |
| 1264 | 1348 | spu_id = str(doc.get("spu_id")) |
| 1349 | + label = labels.get(spu_id) | |
| 1350 | + if label not in VALID_LABELS: | |
| 1351 | + unlabeled_hits += 1 | |
| 1265 | 1352 | labeled.append( |
| 1266 | 1353 | { |
| 1267 | 1354 | "rank": rank, |
| 1268 | 1355 | "spu_id": spu_id, |
| 1269 | 1356 | "title": build_display_title(doc), |
| 1270 | 1357 | "image_url": doc.get("image_url"), |
| 1271 | - "label": labels.get(spu_id), | |
| 1358 | + "label": label, | |
| 1272 | 1359 | "option_values": list(compact_option_values(doc.get("skus") or [])), |
| 1273 | 1360 | "product": compact_product_payload(doc), |
| 1274 | 1361 | } |
| ... | ... | @@ -1277,12 +1364,65 @@ class SearchEvaluationFramework: |
| 1277 | 1364 | item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT |
| 1278 | 1365 | for item in labeled |
| 1279 | 1366 | ] |
| 1367 | + label_stats = self.store.get_query_label_stats(self.tenant_id, query) | |
| 1368 | + rerank_scores = self.store.get_rerank_scores(self.tenant_id, query) | |
| 1369 | + relevant_missing_ids = [ | |
| 1370 | + spu_id | |
| 1371 | + for spu_id, label in labels.items() | |
| 1372 | + if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids | |
| 1373 | + ] | |
| 1374 | + missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids) | |
| 1375 | + missing_relevant = [] | |
| 1376 | + for spu_id in relevant_missing_ids: | |
| 1377 | + doc = missing_docs_map.get(spu_id) | |
| 1378 | + if not doc: | |
| 1379 | + continue | |
| 1380 | + missing_relevant.append( | |
| 1381 | + { | |
| 1382 | + "spu_id": spu_id, | |
| 1383 | + "label": labels[spu_id], | |
| 1384 | + "rerank_score": rerank_scores.get(spu_id), | |
| 1385 | + "title": build_display_title(doc), | |
| 1386 | + "image_url": doc.get("image_url"), | |
| 1387 | + "option_values": list(compact_option_values(doc.get("skus") or [])), | |
| 1388 | + "product": compact_product_payload(doc), | |
| 1389 | + } | |
| 1390 | + ) | |
| 1391 | + label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2} | |
| 1392 | + missing_relevant.sort( | |
| 1393 | + key=lambda item: ( | |
| 1394 | + label_order.get(str(item.get("label")), 9), | |
| 1395 | + -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")), | |
| 1396 | + str(item.get("title") or ""), | |
| 1397 | + ) | |
| 1398 | + ) | |
| 1399 | + tips: List[str] = [] | |
| 1400 | + if auto_annotate: | |
| 1401 | + tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.") | |
| 1402 | + else: | |
| 1403 | + tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.") | |
| 1404 | + if label_stats["total"] == 0: | |
| 1405 | + tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.") | |
| 1406 | + if unlabeled_hits: | |
| 1407 | + tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.") | |
| 1408 | + if not missing_relevant: | |
| 1409 | + tips.append("No cached Exact/Partial products were missed by this recall set.") | |
| 1280 | 1410 | return { |
| 1281 | 1411 | "query": query, |
| 1282 | 1412 | "tenant_id": self.tenant_id, |
| 1283 | 1413 | "top_k": top_k, |
| 1284 | 1414 | "metrics": compute_query_metrics(metric_labels), |
| 1285 | 1415 | "results": labeled, |
| 1416 | + "missing_relevant": missing_relevant, | |
| 1417 | + "label_stats": { | |
| 1418 | + **label_stats, | |
| 1419 | + "unlabeled_hits_treated_irrelevant": unlabeled_hits, | |
| 1420 | + "recalled_hits": len(labeled), | |
| 1421 | + "missing_relevant_count": len(missing_relevant), | |
| 1422 | + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT), | |
| 1423 | + "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL), | |
| 1424 | + }, | |
| 1425 | + "tips": tips, | |
| 1286 | 1426 | "total": int(search_payload.get("total") or 0), |
| 1287 | 1427 | } |
| 1288 | 1428 | |
| ... | ... | @@ -1392,14 +1532,14 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: |
| 1392 | 1532 | class SearchEvalRequest(BaseModel): |
| 1393 | 1533 | query: str |
| 1394 | 1534 | top_k: int = Field(default=100, ge=1, le=500) |
| 1395 | - auto_annotate: bool = True | |
| 1535 | + auto_annotate: bool = False | |
| 1396 | 1536 | language: str = "en" |
| 1397 | 1537 | |
| 1398 | 1538 | |
| 1399 | 1539 | class BatchEvalRequest(BaseModel): |
| 1400 | 1540 | queries: Optional[List[str]] = None |
| 1401 | 1541 | top_k: int = Field(default=100, ge=1, le=500) |
| 1402 | - auto_annotate: bool = True | |
| 1542 | + auto_annotate: bool = False | |
| 1403 | 1543 | language: str = "en" |
| 1404 | 1544 | force_refresh_labels: bool = False |
| 1405 | 1545 | |
| ... | ... | @@ -1494,6 +1634,8 @@ WEB_APP_HTML = """ |
| 1494 | 1634 | .options { color: var(--muted); line-height: 1.5; font-size: 14px; } |
| 1495 | 1635 | .section { margin-bottom: 28px; } |
| 1496 | 1636 | .history { font-size: 13px; line-height: 1.5; } |
| 1637 | + .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; } | |
| 1638 | + .tip { margin-bottom: 6px; color: var(--muted); } | |
| 1497 | 1639 | </style> |
| 1498 | 1640 | </head> |
| 1499 | 1641 | <body> |
| ... | ... | @@ -1524,6 +1666,14 @@ WEB_APP_HTML = """ |
| 1524 | 1666 | <h2>Top Results</h2> |
| 1525 | 1667 | <div id="results" class="results"></div> |
| 1526 | 1668 | </section> |
| 1669 | + <section class="section"> | |
| 1670 | + <h2>Missed Exact / Partial</h2> | |
| 1671 | + <div id="missingRelevant" class="results"></div> | |
| 1672 | + </section> | |
| 1673 | + <section class="section"> | |
| 1674 | + <h2>Notes</h2> | |
| 1675 | + <div id="tips" class="tips muted"></div> | |
| 1676 | + </section> | |
| 1527 | 1677 | </main> |
| 1528 | 1678 | </div> |
| 1529 | 1679 | <script> |
| ... | ... | @@ -1542,15 +1692,15 @@ WEB_APP_HTML = """ |
| 1542 | 1692 | root.appendChild(card); |
| 1543 | 1693 | }); |
| 1544 | 1694 | } |
| 1545 | - function renderResults(results) { | |
| 1546 | - const root = document.getElementById('results'); | |
| 1547 | - root.innerHTML = ''; | |
| 1695 | + function renderResults(results, rootId='results', showRank=true) { | |
| 1696 | + const mount = document.getElementById(rootId); | |
| 1697 | + mount.innerHTML = ''; | |
| 1548 | 1698 | (results || []).forEach(item => { |
| 1549 | 1699 | const label = item.label || 'Unknown'; |
| 1550 | 1700 | const box = document.createElement('div'); |
| 1551 | 1701 | box.className = 'result'; |
| 1552 | 1702 | box.innerHTML = ` |
| 1553 | - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">#${item.rank}</div></div> | |
| 1703 | + <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div> | |
| 1554 | 1704 | <img class="thumb" src="${item.image_url || ''}" alt="" /> |
| 1555 | 1705 | <div> |
| 1556 | 1706 | <div class="title">${item.title || ''}</div> |
| ... | ... | @@ -1560,8 +1710,18 @@ WEB_APP_HTML = """ |
| 1560 | 1710 | <div>${(item.option_values || [])[2] || ''}</div> |
| 1561 | 1711 | </div> |
| 1562 | 1712 | </div>`; |
| 1563 | - root.appendChild(box); | |
| 1713 | + mount.appendChild(box); | |
| 1564 | 1714 | }); |
| 1715 | + if (!(results || []).length) { | |
| 1716 | + mount.innerHTML = '<div class="muted">None.</div>'; | |
| 1717 | + } | |
| 1718 | + } | |
| 1719 | + function renderTips(data) { | |
| 1720 | + const root = document.getElementById('tips'); | |
| 1721 | + const tips = [...(data.tips || [])]; | |
| 1722 | + const stats = data.label_stats || {}; | |
| 1723 | + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`); | |
| 1724 | + root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join(''); | |
| 1565 | 1725 | } |
| 1566 | 1726 | async function loadQueries() { |
| 1567 | 1727 | const data = await fetchJSON('/api/queries'); |
| ... | ... | @@ -1592,11 +1752,13 @@ WEB_APP_HTML = """ |
| 1592 | 1752 | const data = await fetchJSON('/api/search-eval', { |
| 1593 | 1753 | method: 'POST', |
| 1594 | 1754 | headers: {'Content-Type': 'application/json'}, |
| 1595 | - body: JSON.stringify({query, top_k: 100, auto_annotate: true}) | |
| 1755 | + body: JSON.stringify({query, top_k: 100, auto_annotate: false}) | |
| 1596 | 1756 | }); |
| 1597 | 1757 | document.getElementById('status').textContent = `Done. total=${data.total}`; |
| 1598 | 1758 | renderMetrics(data.metrics); |
| 1599 | - renderResults(data.results); | |
| 1759 | + renderResults(data.results, 'results', true); | |
| 1760 | + renderResults(data.missing_relevant, 'missingRelevant', false); | |
| 1761 | + renderTips(data); | |
| 1600 | 1762 | loadHistory(); |
| 1601 | 1763 | } |
| 1602 | 1764 | async function runBatch() { |
| ... | ... | @@ -1604,11 +1766,13 @@ WEB_APP_HTML = """ |
| 1604 | 1766 | const data = await fetchJSON('/api/batch-eval', { |
| 1605 | 1767 | method: 'POST', |
| 1606 | 1768 | headers: {'Content-Type': 'application/json'}, |
| 1607 | - body: JSON.stringify({top_k: 100, auto_annotate: true}) | |
| 1769 | + body: JSON.stringify({top_k: 100, auto_annotate: false}) | |
| 1608 | 1770 | }); |
| 1609 | 1771 | document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`; |
| 1610 | 1772 | renderMetrics(data.aggregate_metrics); |
| 1611 | - renderResults([]); | |
| 1773 | + renderResults([], 'results', true); | |
| 1774 | + renderResults([], 'missingRelevant', false); | |
| 1775 | + document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>'; | |
| 1612 | 1776 | loadHistory(); |
| 1613 | 1777 | } |
| 1614 | 1778 | loadQueries(); |
| ... | ... | @@ -1633,6 +1797,7 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 1633 | 1797 | build.add_argument("--language", default="en") |
| 1634 | 1798 | build.add_argument("--force-refresh-rerank", action="store_true") |
| 1635 | 1799 | build.add_argument("--force-refresh-labels", action="store_true") |
| 1800 | + build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 1636 | 1801 | |
| 1637 | 1802 | batch = sub.add_parser("batch", help="Run batch evaluation against live search") |
| 1638 | 1803 | batch.add_argument("--tenant-id", default="163") |
| ... | ... | @@ -1640,6 +1805,7 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 1640 | 1805 | batch.add_argument("--top-k", type=int, default=100) |
| 1641 | 1806 | batch.add_argument("--language", default="en") |
| 1642 | 1807 | batch.add_argument("--force-refresh-labels", action="store_true") |
| 1808 | + batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 1643 | 1809 | |
| 1644 | 1810 | audit = sub.add_parser("audit", help="Audit annotation quality for queries") |
| 1645 | 1811 | audit.add_argument("--tenant-id", default="163") |
| ... | ... | @@ -1648,18 +1814,20 @@ def build_cli_parser() -> argparse.ArgumentParser: |
| 1648 | 1814 | audit.add_argument("--language", default="en") |
| 1649 | 1815 | audit.add_argument("--limit-suspicious", type=int, default=5) |
| 1650 | 1816 | audit.add_argument("--force-refresh-labels", action="store_true") |
| 1817 | + audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 1651 | 1818 | |
| 1652 | 1819 | serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") |
| 1653 | 1820 | serve.add_argument("--tenant-id", default="163") |
| 1654 | 1821 | serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) |
| 1655 | 1822 | serve.add_argument("--host", default="0.0.0.0") |
| 1656 | 1823 | serve.add_argument("--port", type=int, default=6010) |
| 1824 | + serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | |
| 1657 | 1825 | |
| 1658 | 1826 | return parser |
| 1659 | 1827 | |
| 1660 | 1828 | |
| 1661 | 1829 | def run_build(args: argparse.Namespace) -> None: |
| 1662 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) | |
| 1830 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 1663 | 1831 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 1664 | 1832 | summary = [] |
| 1665 | 1833 | for query in queries: |
| ... | ... | @@ -1694,7 +1862,7 @@ def run_build(args: argparse.Namespace) -> None: |
| 1694 | 1862 | |
| 1695 | 1863 | |
| 1696 | 1864 | def run_batch(args: argparse.Namespace) -> None: |
| 1697 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) | |
| 1865 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 1698 | 1866 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 1699 | 1867 | payload = framework.batch_evaluate( |
| 1700 | 1868 | queries=queries, |
| ... | ... | @@ -1707,7 +1875,7 @@ def run_batch(args: argparse.Namespace) -> None: |
| 1707 | 1875 | |
| 1708 | 1876 | |
| 1709 | 1877 | def run_audit(args: argparse.Namespace) -> None: |
| 1710 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) | |
| 1878 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 1711 | 1879 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 1712 | 1880 | audit_items = [] |
| 1713 | 1881 | for query in queries: |
| ... | ... | @@ -1757,7 +1925,7 @@ def run_audit(args: argparse.Namespace) -> None: |
| 1757 | 1925 | |
| 1758 | 1926 | |
| 1759 | 1927 | def run_serve(args: argparse.Namespace) -> None: |
| 1760 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) | |
| 1928 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) | |
| 1761 | 1929 | app = create_web_app(framework, Path(args.queries_file)) |
| 1762 | 1930 | import uvicorn |
| 1763 | 1931 | ... | ... |
scripts/evaluation/queries/queries.txt
| ... | ... | @@ -30,7 +30,6 @@ khaki green backpack |
| 30 | 30 | 高跟鞋 |
| 31 | 31 | 图案连身衣 |
| 32 | 32 | 天鹅绒鸡尾酒会礼服 |
| 33 | -Wearing small clothes | |
| 34 | 33 | gingham dress |
| 35 | 34 | 海滩度假装 |
| 36 | 35 | vacation outfits |
| ... | ... | @@ -41,10 +40,15 @@ hiking boots |
| 41 | 40 | business casual women |
| 42 | 41 | a-line dress |
| 43 | 42 | 涤纶短裤 |
| 44 | -哺乳文胸 | |
| 45 | 43 | Compression Top Spandex |
| 46 | 44 | skiing trip insulated base layer |
| 47 | 45 | high waisted jeans |
| 48 | 46 | 无袖夏装 |
| 49 | 47 | 雪纺衬衫 |
| 50 | -convertible zip-off hiking pants | |
| 51 | 48 | \ No newline at end of file |
| 49 | +convertible zip-off hiking pants | |
| 50 | +petite summer linen shorts | |
| 51 | +tall slim fit men's linen shirt | |
| 52 | +tall slim fit trousers | |
| 53 | +tall straight leg pants | |
| 54 | +tassel maxi skirt | |
| 55 | +teacher clothes | |
| 52 | 56 | \ No newline at end of file | ... | ... |
| ... | ... | @@ -0,0 +1,39 @@ |
| 1 | +#!/usr/bin/env bash | |
| 2 | +# Search evaluation quick entrypoints. Run from any cwd; resolves repo root. | |
| 3 | +set -euo pipefail | |
| 4 | + | |
| 5 | +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" | |
| 6 | +cd "$ROOT" | |
| 7 | +PY="${ROOT}/.venv/bin/python" | |
| 8 | +TENANT_ID="${TENANT_ID:-163}" | |
| 9 | +QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" | |
| 10 | + | |
| 11 | +usage() { | |
| 12 | + echo "Usage: $0 batch|serve" | |
| 13 | + echo " batch — refresh labels + batch metrics (default: top_k=50, simple labeler, force-refresh)" | |
| 14 | + echo " serve — eval UI on http://127.0.0.1:6010/" | |
| 15 | + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES (default $QUERIES)" | |
| 16 | +} | |
| 17 | + | |
| 18 | +case "${1:-}" in | |
| 19 | + batch) | |
| 20 | + exec "$PY" scripts/evaluation/build_annotation_set.py batch \ | |
| 21 | + --tenant-id "$TENANT_ID" \ | |
| 22 | + --queries-file "$QUERIES" \ | |
| 23 | + --top-k 50 \ | |
| 24 | + --language en \ | |
| 25 | + --labeler-mode simple \ | |
| 26 | + --force-refresh-labels | |
| 27 | + ;; | |
| 28 | + serve) | |
| 29 | + exec "$PY" scripts/evaluation/serve_eval_web.py serve \ | |
| 30 | + --tenant-id "$TENANT_ID" \ | |
| 31 | + --queries-file "$QUERIES" \ | |
| 32 | + --host 127.0.0.1 \ | |
| 33 | + --port 6010 | |
| 34 | + ;; | |
| 35 | + *) | |
| 36 | + usage | |
| 37 | + exit 1 | |
| 38 | + ;; | |
| 39 | +esac | ... | ... |