Commit 881d338b3acc0b3de1bb3cfb77f4fc69755bb0f7
1 parent
432d1c88
评估框架
Showing
5 changed files
with
719 additions
and
49 deletions
Show diff stats
config/config.yaml
| 1 | +# Unified Configuration for Multi-Tenant Search Engine | ||
| 2 | +# 统一配置文件,所有租户共用一套配置 | ||
| 3 | +# 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为 | ||
| 4 | +# | ||
| 5 | +# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项 | ||
| 6 | +#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。 | ||
| 7 | + | ||
| 8 | +# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义) | ||
| 1 | runtime: | 9 | runtime: |
| 2 | environment: prod | 10 | environment: prod |
| 3 | index_namespace: '' | 11 | index_namespace: '' |
| @@ -13,6 +21,8 @@ runtime: | @@ -13,6 +21,8 @@ runtime: | ||
| 13 | translator_port: 6006 | 21 | translator_port: 6006 |
| 14 | reranker_host: 0.0.0.0 | 22 | reranker_host: 0.0.0.0 |
| 15 | reranker_port: 6007 | 23 | reranker_port: 6007 |
| 24 | + | ||
| 25 | +# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY) | ||
| 16 | infrastructure: | 26 | infrastructure: |
| 17 | elasticsearch: | 27 | elasticsearch: |
| 18 | host: http://localhost:9200 | 28 | host: http://localhost:9200 |
| @@ -39,16 +49,30 @@ infrastructure: | @@ -39,16 +49,30 @@ infrastructure: | ||
| 39 | secrets: | 49 | secrets: |
| 40 | dashscope_api_key: null | 50 | dashscope_api_key: null |
| 41 | deepl_auth_key: null | 51 | deepl_auth_key: null |
| 52 | + | ||
| 53 | +# Elasticsearch Index | ||
| 42 | es_index_name: search_products | 54 | es_index_name: search_products |
| 55 | + | ||
| 56 | +# 检索域 / 索引列表(可为空列表;每项字段均需显式给出) | ||
| 43 | indexes: [] | 57 | indexes: [] |
| 58 | + | ||
| 59 | +# Config assets | ||
| 44 | assets: | 60 | assets: |
| 45 | query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict | 61 | query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict |
| 62 | + | ||
| 63 | +# Product content understanding (LLM enrich-content) configuration | ||
| 46 | product_enrich: | 64 | product_enrich: |
| 47 | max_workers: 40 | 65 | max_workers: 40 |
| 66 | + | ||
| 67 | +# ES Index Settings (基础设置) | ||
| 48 | es_settings: | 68 | es_settings: |
| 49 | number_of_shards: 1 | 69 | number_of_shards: 1 |
| 50 | number_of_replicas: 0 | 70 | number_of_replicas: 0 |
| 51 | refresh_interval: 30s | 71 | refresh_interval: 30s |
| 72 | + | ||
| 73 | +# 字段权重配置(用于搜索时的字段boost) | ||
| 74 | +# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。 | ||
| 75 | +# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 | ||
| 52 | field_boosts: | 76 | field_boosts: |
| 53 | title: 3.0 | 77 | title: 3.0 |
| 54 | qanchors: 2.5 | 78 | qanchors: 2.5 |
| @@ -61,21 +85,39 @@ field_boosts: | @@ -61,21 +85,39 @@ field_boosts: | ||
| 61 | option1_values: 1.5 | 85 | option1_values: 1.5 |
| 62 | option2_values: 1.5 | 86 | option2_values: 1.5 |
| 63 | option3_values: 1.5 | 87 | option3_values: 1.5 |
| 88 | + | ||
| 89 | +# Query Configuration(查询配置) | ||
| 64 | query_config: | 90 | query_config: |
| 91 | + # 支持的语言 | ||
| 65 | supported_languages: | 92 | supported_languages: |
| 66 | - zh | 93 | - zh |
| 67 | - en | 94 | - en |
| 68 | default_language: en | 95 | default_language: en |
| 96 | + | ||
| 97 | + # 功能开关(翻译开关由tenant_config控制) | ||
| 69 | enable_text_embedding: true | 98 | enable_text_embedding: true |
| 70 | enable_query_rewrite: true | 99 | enable_query_rewrite: true |
| 71 | - zh_to_en_model: nllb-200-distilled-600m | ||
| 72 | - en_to_zh_model: nllb-200-distilled-600m | 100 | + |
| 101 | + # 查询翻译模型(须与 services.translation.capabilities 中某项一致) | ||
| 102 | + # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。 | ||
| 103 | + zh_to_en_model: nllb-200-distilled-600m # "opus-mt-zh-en" | ||
| 104 | + en_to_zh_model: nllb-200-distilled-600m # "opus-mt-en-zh" | ||
| 73 | default_translation_model: nllb-200-distilled-600m | 105 | default_translation_model: nllb-200-distilled-600m |
| 106 | + # zh_to_en_model: deepl | ||
| 107 | + # en_to_zh_model: deepl | ||
| 108 | + # default_translation_model: deepl | ||
| 109 | + # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同) | ||
| 74 | zh_to_en_model__source_not_in_index: nllb-200-distilled-600m | 110 | zh_to_en_model__source_not_in_index: nllb-200-distilled-600m |
| 75 | en_to_zh_model__source_not_in_index: nllb-200-distilled-600m | 111 | en_to_zh_model__source_not_in_index: nllb-200-distilled-600m |
| 76 | default_translation_model__source_not_in_index: nllb-200-distilled-600m | 112 | default_translation_model__source_not_in_index: nllb-200-distilled-600m |
| 77 | - translation_embedding_wait_budget_ms_source_in_index: 200 | ||
| 78 | - translation_embedding_wait_budget_ms_source_not_in_index: 300 | 113 | + # zh_to_en_model__source_not_in_index: deepl |
| 114 | + # en_to_zh_model__source_not_in_index: deepl | ||
| 115 | + # default_translation_model__source_not_in_index: deepl | ||
| 116 | + | ||
| 117 | + # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 | ||
| 118 | + # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 | ||
| 119 | + translation_embedding_wait_budget_ms_source_in_index: 200 # 80 | ||
| 120 | + translation_embedding_wait_budget_ms_source_not_in_index: 300 # 200 | ||
| 79 | style_intent: | 121 | style_intent: |
| 80 | enabled: true | 122 | enabled: true |
| 81 | selected_sku_boost: 1.2 | 123 | selected_sku_boost: 1.2 |
| @@ -102,6 +144,10 @@ query_config: | @@ -102,6 +144,10 @@ query_config: | ||
| 102 | product_title_exclusion: | 144 | product_title_exclusion: |
| 103 | enabled: true | 145 | enabled: true |
| 104 | dictionary_path: config/dictionaries/product_title_exclusion.tsv | 146 | dictionary_path: config/dictionaries/product_title_exclusion.tsv |
| 147 | + | ||
| 148 | + # 动态多语言检索字段配置 | ||
| 149 | + # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; | ||
| 150 | + # shared_fields 为无语言后缀字段。 | ||
| 105 | search_fields: | 151 | search_fields: |
| 106 | multilingual_fields: | 152 | multilingual_fields: |
| 107 | - title | 153 | - title |
| @@ -111,11 +157,14 @@ query_config: | @@ -111,11 +157,14 @@ query_config: | ||
| 111 | - brief | 157 | - brief |
| 112 | - description | 158 | - description |
| 113 | - vendor | 159 | - vendor |
| 160 | + # shared_fields: 无语言后缀字段;示例: tags, option1_values, option2_values, option3_values | ||
| 114 | shared_fields: null | 161 | shared_fields: null |
| 115 | core_multilingual_fields: | 162 | core_multilingual_fields: |
| 116 | - title | 163 | - title |
| 117 | - qanchors | 164 | - qanchors |
| 118 | - category_name_text | 165 | - category_name_text |
| 166 | + | ||
| 167 | + # 统一文本召回策略(主查询 + 翻译查询) | ||
| 119 | text_query_strategy: | 168 | text_query_strategy: |
| 120 | base_minimum_should_match: 60% | 169 | base_minimum_should_match: 60% |
| 121 | translation_minimum_should_match: 60% | 170 | translation_minimum_should_match: 60% |
| @@ -130,8 +179,14 @@ query_config: | @@ -130,8 +179,14 @@ query_config: | ||
| 130 | title: 5.0 | 179 | title: 5.0 |
| 131 | qanchors: 4.0 | 180 | qanchors: 4.0 |
| 132 | phrase_match_boost: 3.0 | 181 | phrase_match_boost: 3.0 |
| 182 | + | ||
| 183 | + # Embedding字段名称 | ||
| 133 | text_embedding_field: title_embedding | 184 | text_embedding_field: title_embedding |
| 134 | image_embedding_field: image_embedding.vector | 185 | image_embedding_field: image_embedding.vector |
| 186 | + | ||
| 187 | + # 返回字段配置(_source includes) | ||
| 188 | + # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 | ||
| 189 | + # 下列字段与 api/result_formatter.py(SpuResult 填充)及 search/searcher.py(SKU 排序/主图替换)一致 | ||
| 135 | source_fields: | 190 | source_fields: |
| 136 | - spu_id | 191 | - spu_id |
| 137 | - handle | 192 | - handle |
| @@ -163,18 +218,26 @@ query_config: | @@ -163,18 +218,26 @@ query_config: | ||
| 163 | - option3_values | 218 | - option3_values |
| 164 | - specifications | 219 | - specifications |
| 165 | - skus | 220 | - skus |
| 221 | + | ||
| 222 | + # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates) | ||
| 166 | knn_text_boost: 4 | 223 | knn_text_boost: 4 |
| 167 | knn_image_boost: 4 | 224 | knn_image_boost: 4 |
| 225 | + | ||
| 226 | + # knn_text_num_candidates = k * 3.4 | ||
| 168 | knn_text_k: 160 | 227 | knn_text_k: 160 |
| 169 | knn_text_num_candidates: 560 | 228 | knn_text_num_candidates: 560 |
| 170 | knn_text_k_long: 400 | 229 | knn_text_k_long: 400 |
| 171 | knn_text_num_candidates_long: 1200 | 230 | knn_text_num_candidates_long: 1200 |
| 172 | knn_image_k: 400 | 231 | knn_image_k: 400 |
| 173 | knn_image_num_candidates: 1200 | 232 | knn_image_num_candidates: 1200 |
| 233 | + | ||
| 234 | +# Function Score配置(ES层打分规则) | ||
| 174 | function_score: | 235 | function_score: |
| 175 | score_mode: sum | 236 | score_mode: sum |
| 176 | boost_mode: multiply | 237 | boost_mode: multiply |
| 177 | functions: [] | 238 | functions: [] |
| 239 | + | ||
| 240 | +# 粗排配置(仅融合 ES 文本/向量信号,不调用模型) | ||
| 178 | coarse_rank: | 241 | coarse_rank: |
| 179 | enabled: true | 242 | enabled: true |
| 180 | input_window: 700 | 243 | input_window: 700 |
| @@ -182,12 +245,16 @@ coarse_rank: | @@ -182,12 +245,16 @@ coarse_rank: | ||
| 182 | fusion: | 245 | fusion: |
| 183 | text_bias: 0.1 | 246 | text_bias: 0.1 |
| 184 | text_exponent: 0.35 | 247 | text_exponent: 0.35 |
| 248 | + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合) | ||
| 249 | + # 因为 ES 的打分已经对 trans 做了折扣,所以这里不再继续折扣 | ||
| 185 | text_translation_weight: 1.0 | 250 | text_translation_weight: 1.0 |
| 186 | knn_text_weight: 1.0 | 251 | knn_text_weight: 1.0 |
| 187 | knn_image_weight: 1.0 | 252 | knn_image_weight: 1.0 |
| 188 | knn_tie_breaker: 0.1 | 253 | knn_tie_breaker: 0.1 |
| 189 | knn_bias: 0.6 | 254 | knn_bias: 0.6 |
| 190 | knn_exponent: 0.0 | 255 | knn_exponent: 0.0 |
| 256 | + | ||
| 257 | +# 精排配置(轻量 reranker) | ||
| 191 | fine_rank: | 258 | fine_rank: |
| 192 | enabled: false | 259 | enabled: false |
| 193 | input_window: 160 | 260 | input_window: 160 |
| @@ -196,6 +263,8 @@ fine_rank: | @@ -196,6 +263,8 @@ fine_rank: | ||
| 196 | rerank_query_template: '{query}' | 263 | rerank_query_template: '{query}' |
| 197 | rerank_doc_template: '{title}' | 264 | rerank_doc_template: '{title}' |
| 198 | service_profile: fine | 265 | service_profile: fine |
| 266 | + | ||
| 267 | +# 重排配置(provider/URL 在 services.rerank) | ||
| 199 | rerank: | 268 | rerank: |
| 200 | enabled: true | 269 | enabled: true |
| 201 | rerank_window: 160 | 270 | rerank_window: 160 |
| @@ -205,6 +274,11 @@ rerank: | @@ -205,6 +274,11 @@ rerank: | ||
| 205 | rerank_query_template: '{query}' | 274 | rerank_query_template: '{query}' |
| 206 | rerank_doc_template: '{title}' | 275 | rerank_doc_template: '{title}' |
| 207 | service_profile: default | 276 | service_profile: default |
| 277 | + | ||
| 278 | + # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项) | ||
| 279 | + # 其中 knn_score 先做一层 dis_max: | ||
| 280 | + # max(knn_text_weight * text_knn, knn_image_weight * image_knn) | ||
| 281 | + # + knn_tie_breaker * 另一侧较弱信号 | ||
| 208 | fusion: | 282 | fusion: |
| 209 | rerank_bias: 1.0e-05 | 283 | rerank_bias: 1.0e-05 |
| 210 | rerank_exponent: 1.15 | 284 | rerank_exponent: 1.15 |
| @@ -212,22 +286,29 @@ rerank: | @@ -212,22 +286,29 @@ rerank: | ||
| 212 | fine_exponent: 1.0 | 286 | fine_exponent: 1.0 |
| 213 | text_bias: 0.1 | 287 | text_bias: 0.1 |
| 214 | text_exponent: 0.25 | 288 | text_exponent: 0.25 |
| 289 | + # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合) | ||
| 215 | text_translation_weight: 0.8 | 290 | text_translation_weight: 0.8 |
| 216 | knn_text_weight: 1.0 | 291 | knn_text_weight: 1.0 |
| 217 | knn_image_weight: 1.0 | 292 | knn_image_weight: 1.0 |
| 218 | knn_tie_breaker: 0.1 | 293 | knn_tie_breaker: 0.1 |
| 219 | knn_bias: 0.6 | 294 | knn_bias: 0.6 |
| 220 | knn_exponent: 0.0 | 295 | knn_exponent: 0.0 |
| 296 | + | ||
| 297 | +# 可扩展服务/provider 注册表(单一配置源) | ||
| 221 | services: | 298 | services: |
| 222 | translation: | 299 | translation: |
| 223 | service_url: http://127.0.0.1:6006 | 300 | service_url: http://127.0.0.1:6006 |
| 301 | + # default_model: nllb-200-distilled-600m | ||
| 224 | default_model: nllb-200-distilled-600m | 302 | default_model: nllb-200-distilled-600m |
| 225 | default_scene: general | 303 | default_scene: general |
| 226 | timeout_sec: 10.0 | 304 | timeout_sec: 10.0 |
| 227 | cache: | 305 | cache: |
| 228 | ttl_seconds: 62208000 | 306 | ttl_seconds: 62208000 |
| 229 | sliding_expiration: true | 307 | sliding_expiration: true |
| 308 | + # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups). | ||
| 230 | enable_model_quality_tier_cache: true | 309 | enable_model_quality_tier_cache: true |
| 310 | + # Higher tier = better quality. Multiple models may share one tier (同级). | ||
| 311 | + # A request may reuse Redis keys from models with tier > A or tier == A (not from lower tiers). | ||
| 231 | model_quality_tiers: | 312 | model_quality_tiers: |
| 232 | deepl: 30 | 313 | deepl: 30 |
| 233 | qwen-mt: 30 | 314 | qwen-mt: 30 |
| @@ -321,12 +402,13 @@ services: | @@ -321,12 +402,13 @@ services: | ||
| 321 | num_beams: 1 | 402 | num_beams: 1 |
| 322 | use_cache: true | 403 | use_cache: true |
| 323 | embedding: | 404 | embedding: |
| 324 | - provider: http | 405 | + provider: http # http |
| 325 | providers: | 406 | providers: |
| 326 | http: | 407 | http: |
| 327 | text_base_url: http://127.0.0.1:6005 | 408 | text_base_url: http://127.0.0.1:6005 |
| 328 | image_base_url: http://127.0.0.1:6008 | 409 | image_base_url: http://127.0.0.1:6008 |
| 329 | - backend: tei | 410 | + # 服务内文本后端(embedding 进程启动时读取) |
| 411 | + backend: tei # tei | local_st | ||
| 330 | backends: | 412 | backends: |
| 331 | tei: | 413 | tei: |
| 332 | base_url: http://127.0.0.1:8080 | 414 | base_url: http://127.0.0.1:8080 |
| @@ -337,7 +419,10 @@ services: | @@ -337,7 +419,10 @@ services: | ||
| 337 | device: cuda | 419 | device: cuda |
| 338 | batch_size: 32 | 420 | batch_size: 32 |
| 339 | normalize_embeddings: true | 421 | normalize_embeddings: true |
| 340 | - image_backend: clip_as_service | 422 | + # 服务内图片后端(embedding 进程启动时读取;cnclip gRPC 与 6008 须同一 model_name) |
| 423 | + # Chinese-CLIP:ViT-H-14 → 1024 维,ViT-L-14 → 768 维。须与 mappings/search_products.json 中 | ||
| 424 | + # image_embedding.vector.dims 一致(当前索引为 1024 → 默认 ViT-H-14)。 | ||
| 425 | + image_backend: clip_as_service # clip_as_service | local_cnclip | ||
| 341 | image_backends: | 426 | image_backends: |
| 342 | clip_as_service: | 427 | clip_as_service: |
| 343 | server: grpc://127.0.0.1:51000 | 428 | server: grpc://127.0.0.1:51000 |
| @@ -364,6 +449,7 @@ services: | @@ -364,6 +449,7 @@ services: | ||
| 364 | max_docs: 1000 | 449 | max_docs: 1000 |
| 365 | normalize: true | 450 | normalize: true |
| 366 | default_instance: default | 451 | default_instance: default |
| 452 | + # 命名实例:同一套 reranker 代码按实例名读取不同端口 / 后端 / runtime 目录。 | ||
| 367 | instances: | 453 | instances: |
| 368 | default: | 454 | default: |
| 369 | host: 0.0.0.0 | 455 | host: 0.0.0.0 |
| @@ -405,11 +491,29 @@ services: | @@ -405,11 +491,29 @@ services: | ||
| 405 | enforce_eager: false | 491 | enforce_eager: false |
| 406 | infer_batch_size: 100 | 492 | infer_batch_size: 100 |
| 407 | sort_by_doc_length: true | 493 | sort_by_doc_length: true |
| 408 | - instruction_format: standard | 494 | + # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct) |
| 495 | + instruction_format: standard # compact | standard | ||
| 496 | + # instruction: "Given a query, score the product for relevance" | ||
| 497 | + # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点 | ||
| 498 | + # instruction: "rank products by given query, category match first" | ||
| 499 | + # instruction: "Rank products by query relevance, prioritizing category match" | ||
| 500 | + # instruction: "Rank products by query relevance, prioritizing category and style match" | ||
| 501 | + # instruction: "Rank by query relevance, prioritize category & style" | ||
| 502 | + # instruction: "Relevance ranking: category & style match first" | ||
| 503 | + # instruction: "Score product relevance by query with category & style match prioritized" | ||
| 504 | + # instruction: "Rank products by query with category & style match prioritized" | ||
| 505 | + # instruction: "Given a fashion shopping query, retrieve relevant products that answer the query" | ||
| 409 | instruction: rank products by given query | 506 | instruction: rank products by given query |
| 507 | + # vLLM LLM.score()(跨编码打分)。独立高性能环境 .venv-reranker-score(vllm 0.18 固定版):./scripts/setup_reranker_venv.sh qwen3_vllm_score | ||
| 508 | + # 与 qwen3_vllm 可共用同一 model_name / HF 缓存;venv 分离以便升级 vLLM 而不影响 generate 后端。 | ||
| 410 | qwen3_vllm_score: | 509 | qwen3_vllm_score: |
| 411 | model_name: Qwen/Qwen3-Reranker-0.6B | 510 | model_name: Qwen/Qwen3-Reranker-0.6B |
| 511 | + # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false | ||
| 412 | use_original_qwen3_hf_overrides: true | 512 | use_original_qwen3_hf_overrides: true |
| 513 | + # vllm_runner: "auto" | ||
| 514 | + # vllm_convert: "auto" | ||
| 515 | + # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并 | ||
| 516 | + # hf_overrides: {} | ||
| 413 | engine: vllm | 517 | engine: vllm |
| 414 | max_model_len: 172 | 518 | max_model_len: 172 |
| 415 | tensor_parallel_size: 1 | 519 | tensor_parallel_size: 1 |
| @@ -419,15 +523,23 @@ services: | @@ -419,15 +523,23 @@ services: | ||
| 419 | enforce_eager: false | 523 | enforce_eager: false |
| 420 | infer_batch_size: 80 | 524 | infer_batch_size: 80 |
| 421 | sort_by_doc_length: true | 525 | sort_by_doc_length: true |
| 422 | - instruction_format: standard | 526 | + # 默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致 |
| 527 | + instruction_format: standard # compact | standard | ||
| 528 | + # instruction: "Rank products by query with category & style match prioritized" | ||
| 529 | + # instruction: "Given a shopping query, rank products by relevance" | ||
| 423 | instruction: Rank products by query with category & style match prioritized | 530 | instruction: Rank products by query with category & style match prioritized |
| 424 | qwen3_transformers: | 531 | qwen3_transformers: |
| 425 | model_name: Qwen/Qwen3-Reranker-0.6B | 532 | model_name: Qwen/Qwen3-Reranker-0.6B |
| 426 | instruction: rank products by given query | 533 | instruction: rank products by given query |
| 534 | + # instruction: "Score the product’s relevance to the given query" | ||
| 427 | max_length: 8192 | 535 | max_length: 8192 |
| 428 | batch_size: 64 | 536 | batch_size: 64 |
| 429 | use_fp16: true | 537 | use_fp16: true |
| 538 | + # sdpa:默认无需 flash-attn;若已安装 flash_attn 可改为 flash_attention_2 | ||
| 430 | attn_implementation: sdpa | 539 | attn_implementation: sdpa |
| 540 | + # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask. | ||
| 541 | + # For 1 query + many short docs (for example 400 product titles), this usually reduces | ||
| 542 | + # repeated prefix work and padding waste compared with pairwise batching. | ||
| 431 | qwen3_transformers_packed: | 543 | qwen3_transformers_packed: |
| 432 | model_name: Qwen/Qwen3-Reranker-0.6B | 544 | model_name: Qwen/Qwen3-Reranker-0.6B |
| 433 | instruction: Rank products by query with category & style match prioritized | 545 | instruction: Rank products by query with category & style match prioritized |
| @@ -436,6 +548,8 @@ services: | @@ -436,6 +548,8 @@ services: | ||
| 436 | max_docs_per_pack: 0 | 548 | max_docs_per_pack: 0 |
| 437 | use_fp16: true | 549 | use_fp16: true |
| 438 | sort_by_doc_length: true | 550 | sort_by_doc_length: true |
| 551 | + # Packed mode relies on a custom 4D attention mask. "eager" is the safest default. | ||
| 552 | + # If your torch/transformers stack validates it, you can benchmark "sdpa". | ||
| 439 | attn_implementation: eager | 553 | attn_implementation: eager |
| 440 | qwen3_gguf: | 554 | qwen3_gguf: |
| 441 | repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF | 555 | repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF |
| @@ -443,6 +557,7 @@ services: | @@ -443,6 +557,7 @@ services: | ||
| 443 | cache_dir: ./model_cache | 557 | cache_dir: ./model_cache |
| 444 | local_dir: ./models/reranker/qwen3-reranker-4b-gguf | 558 | local_dir: ./models/reranker/qwen3-reranker-4b-gguf |
| 445 | instruction: Rank products by query with category & style match prioritized | 559 | instruction: Rank products by query with category & style match prioritized |
| 560 | + # T4 16GB / 性能优先配置:全量层 offload,实测比保守配置明显更快 | ||
| 446 | n_ctx: 512 | 561 | n_ctx: 512 |
| 447 | n_batch: 512 | 562 | n_batch: 512 |
| 448 | n_ubatch: 512 | 563 | n_ubatch: 512 |
| @@ -465,6 +580,8 @@ services: | @@ -465,6 +580,8 @@ services: | ||
| 465 | cache_dir: ./model_cache | 580 | cache_dir: ./model_cache |
| 466 | local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf | 581 | local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf |
| 467 | instruction: Rank products by query with category & style match prioritized | 582 | instruction: Rank products by query with category & style match prioritized |
| 583 | + # 0.6B GGUF / online rerank baseline: | ||
| 584 | + # 实测 400 titles 单请求约 265s,因此它更适合作为低显存功能后备,不适合在线低延迟主路由。 | ||
| 468 | n_ctx: 256 | 585 | n_ctx: 256 |
| 469 | n_batch: 256 | 586 | n_batch: 256 |
| 470 | n_ubatch: 256 | 587 | n_ubatch: 256 |
| @@ -484,22 +601,34 @@ services: | @@ -484,22 +601,34 @@ services: | ||
| 484 | verbose: false | 601 | verbose: false |
| 485 | dashscope_rerank: | 602 | dashscope_rerank: |
| 486 | model_name: qwen3-rerank | 603 | model_name: qwen3-rerank |
| 604 | + # 按地域选择 endpoint: | ||
| 605 | + # 中国: https://dashscope.aliyuncs.com/compatible-api/v1/reranks | ||
| 606 | + # 新加坡: https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks | ||
| 607 | + # 美国: https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks | ||
| 487 | endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks | 608 | endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks |
| 488 | api_key_env: RERANK_DASHSCOPE_API_KEY_CN | 609 | api_key_env: RERANK_DASHSCOPE_API_KEY_CN |
| 489 | timeout_sec: 10.0 | 610 | timeout_sec: 10.0 |
| 490 | - top_n_cap: 0 | ||
| 491 | - batchsize: 64 | 611 | + top_n_cap: 0 # 0 表示 top_n=当前请求文档数;>0 则限制 top_n 上限 |
| 612 | + batchsize: 64 # 0 关闭;>0 启用并发小包调度(top_n/top_n_cap 仍生效,分包后全局截断) | ||
| 492 | instruct: Given a shopping query, rank product titles by relevance | 613 | instruct: Given a shopping query, rank product titles by relevance |
| 493 | max_retries: 2 | 614 | max_retries: 2 |
| 494 | retry_backoff_sec: 0.2 | 615 | retry_backoff_sec: 0.2 |
| 616 | + | ||
| 617 | +# SPU配置(已启用,使用嵌套skus) | ||
| 495 | spu_config: | 618 | spu_config: |
| 496 | enabled: true | 619 | enabled: true |
| 497 | spu_field: spu_id | 620 | spu_field: spu_id |
| 498 | inner_hits_size: 10 | 621 | inner_hits_size: 10 |
| 622 | + # 配置哪些option维度参与检索(进索引、以及在线搜索) | ||
| 623 | + # 格式为list,选择option1/option2/option3中的一个或多个 | ||
| 499 | searchable_option_dimensions: | 624 | searchable_option_dimensions: |
| 500 | - option1 | 625 | - option1 |
| 501 | - option2 | 626 | - option2 |
| 502 | - option3 | 627 | - option3 |
| 628 | + | ||
| 629 | +# 租户配置(Tenant Configuration) | ||
| 630 | +# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选) | ||
| 631 | +# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集 | ||
| 503 | tenant_config: | 632 | tenant_config: |
| 504 | default: | 633 | default: |
| 505 | primary_language: en | 634 | primary_language: en |
| @@ -0,0 +1,330 @@ | @@ -0,0 +1,330 @@ | ||
| 1 | +# Search Evaluation Framework | ||
| 2 | + | ||
| 3 | +This directory contains the offline annotation set builder, the online evaluation UI/API, the audit tooling, and the fusion-tuning runner for retrieval quality evaluation. | ||
| 4 | + | ||
| 5 | +It is designed around one core rule: | ||
| 6 | + | ||
| 7 | +- Annotation should be built offline first. | ||
| 8 | +- Single-query evaluation should then map recalled `spu_id` values to the cached annotation set. | ||
| 9 | +- Recalled items without cached labels are treated as `Irrelevant` during evaluation, and the UI/API returns a tip so the operator knows coverage is incomplete. | ||
| 10 | + | ||
| 11 | +## Goals | ||
| 12 | + | ||
| 13 | +The framework supports four related tasks: | ||
| 14 | + | ||
| 15 | +1. Build an annotation set for a fixed query set. | ||
| 16 | +2. Evaluate a live search result list against that annotation set. | ||
| 17 | +3. Run batch evaluation and store historical reports with config snapshots. | ||
| 18 | +4. Tune fusion parameters reproducibly. | ||
| 19 | + | ||
| 20 | +## Files | ||
| 21 | + | ||
| 22 | +- `eval_framework.py` | ||
| 23 | + Search evaluation core implementation, CLI, FastAPI app, SQLite store, audit logic, and report generation. | ||
| 24 | +- `build_annotation_set.py` | ||
| 25 | + Thin CLI entrypoint into `eval_framework.py`. | ||
| 26 | +- `serve_eval_web.py` | ||
| 27 | + Thin web entrypoint into `eval_framework.py`. | ||
| 28 | +- `tune_fusion.py` | ||
| 29 | + Fusion experiment runner. It applies config variants, restarts the backend, runs batch evaluation, and stores experiment reports. | ||
| 30 | +- `fusion_experiments_shortlist.json` | ||
| 31 | + A compact experiment set for practical tuning. | ||
| 32 | +- `fusion_experiments_round1.json` | ||
| 33 | + A broader first-round experiment set. | ||
| 34 | +- `queries/queries.txt` | ||
| 35 | + The canonical evaluation query set. | ||
| 36 | +- `README_Requirement.md` | ||
| 37 | + Requirement reference document. | ||
| 38 | +- `quick_start_eval.sh` | ||
| 39 | + Optional wrapper to run the batch refresh or the web UI from repo root (uses `./.venv/bin/python`). | ||
| 40 | + | ||
| 41 | +## Quick start (from repo root) | ||
| 42 | + | ||
| 43 | +Set the tenant if needed (`export TENANT_ID=163`). Requires a live search API, a DashScope key for the LLM when labeling, and a working backend for batch refresh. | ||
| 44 | + | ||
| 45 | +```bash | ||
| 46 | +# 1) Refresh offline labels for every line in the queries file, then write batch metrics under artifacts/ | ||
| 47 | +./scripts/evaluation/quick_start_eval.sh batch | ||
| 48 | + | ||
| 49 | +# 2) Evaluation UI on http://127.0.0.1:6010/ | ||
| 50 | +./scripts/evaluation/quick_start_eval.sh serve | ||
| 51 | +``` | ||
| 52 | + | ||
| 53 | +Equivalent explicit commands: | ||
| 54 | + | ||
| 55 | +```bash | ||
| 56 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ | ||
| 57 | + --tenant-id "${TENANT_ID:-163}" \ | ||
| 58 | + --queries-file scripts/evaluation/queries/queries.txt \ | ||
| 59 | + --top-k 50 \ | ||
| 60 | + --language en \ | ||
| 61 | + --labeler-mode simple \ | ||
| 62 | + --force-refresh-labels | ||
| 63 | + | ||
| 64 | +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \ | ||
| 65 | + --tenant-id "${TENANT_ID:-163}" \ | ||
| 66 | + --queries-file scripts/evaluation/queries/queries.txt \ | ||
| 67 | + --host 127.0.0.1 \ | ||
| 68 | + --port 6010 | ||
| 69 | +``` | ||
| 70 | + | ||
| 71 | +**Batch behavior:** There is no “skip queries already processed”. Each run walks the full queries file. With `--force-refresh-labels`, for **every** query the runner issues a live search and sends **all** `top_k` returned `spu_id`s through the LLM again (SQLite rows are upserted). Omit `--force-refresh-labels` if you only want to fill in labels that are missing for the current recall window. | ||
| 72 | + | ||
| 73 | +## Storage Layout | ||
| 74 | + | ||
| 75 | +All generated artifacts are under: | ||
| 76 | + | ||
| 77 | +- `/data/saas-search/artifacts/search_evaluation` | ||
| 78 | + | ||
| 79 | +Important subpaths: | ||
| 80 | + | ||
| 81 | +- `/data/saas-search/artifacts/search_evaluation/search_eval.sqlite3` | ||
| 82 | + Main cache and annotation store. | ||
| 83 | +- `/data/saas-search/artifacts/search_evaluation/query_builds` | ||
| 84 | + Per-query pooled annotation-set build artifacts. | ||
| 85 | +- `/data/saas-search/artifacts/search_evaluation/batch_reports` | ||
| 86 | + Batch evaluation JSON, Markdown reports, and config snapshots. | ||
| 87 | +- `/data/saas-search/artifacts/search_evaluation/audits` | ||
| 88 | + Audit summaries for label quality checks. | ||
| 89 | +- `/data/saas-search/artifacts/search_evaluation/tuning_runs` | ||
| 90 | + Fusion experiment summaries and per-experiment config snapshots. | ||
| 91 | + | ||
| 92 | +## SQLite Schema Summary | ||
| 93 | + | ||
| 94 | +The main tables in `search_eval.sqlite3` are: | ||
| 95 | + | ||
| 96 | +- `corpus_docs` | ||
| 97 | + Cached product corpus for the tenant. | ||
| 98 | +- `rerank_scores` | ||
| 99 | + Cached full-corpus reranker scores keyed by `(tenant_id, query_text, spu_id)`. | ||
| 100 | +- `relevance_labels` | ||
| 101 | + Cached LLM relevance labels keyed by `(tenant_id, query_text, spu_id)`. | ||
| 102 | +- `query_profiles` | ||
| 103 | + Structured query-intent profiles extracted before labeling. | ||
| 104 | +- `build_runs` | ||
| 105 | + Per-query pooled-build records. | ||
| 106 | +- `batch_runs` | ||
| 107 | + Batch evaluation history. | ||
| 108 | + | ||
| 109 | +## Label Semantics | ||
| 110 | + | ||
| 111 | +Three labels are used throughout: | ||
| 112 | + | ||
| 113 | +- `Exact` | ||
| 114 | + Fully matches the intended product type and all explicit required attributes. | ||
| 115 | +- `Partial` | ||
| 116 | + Main intent matches, but explicit attributes are missing, approximate, or weaker than requested. | ||
| 117 | +- `Irrelevant` | ||
| 118 | + Product type mismatches, or explicit required attributes conflict. | ||
| 119 | + | ||
| 120 | +The framework always uses: | ||
| 121 | + | ||
| 122 | +- LLM-based batched relevance classification | ||
| 123 | +- caching and retry logic for robust offline labeling | ||
| 124 | + | ||
| 125 | +There are now two labeler modes: | ||
| 126 | + | ||
| 127 | +- `simple` | ||
| 128 | + Default. A single low-coupling LLM judging pass per batch, using the standard relevance prompt. | ||
| 129 | +- `complex` | ||
| 130 | + Legacy structured mode. It extracts query profiles and applies extra guardrails. Kept for comparison, but no longer the default. | ||
| 131 | + | ||
| 132 | +## Offline-First Workflow | ||
| 133 | + | ||
| 134 | +### 1. Refresh labels for the evaluation query set | ||
| 135 | + | ||
| 136 | +For practical evaluation, the most important offline step is to pre-label the result window you plan to score. For the current metrics (`P@5`, `P@10`, `P@20`, `P@50`, `MAP_3`, `MAP_2_3`), a `top_k=50` cached label set is sufficient. | ||
| 137 | + | ||
| 138 | +Example: | ||
| 139 | + | ||
| 140 | +```bash | ||
| 141 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ | ||
| 142 | + --tenant-id 163 \ | ||
| 143 | + --queries-file scripts/evaluation/queries/queries.txt \ | ||
| 144 | + --top-k 50 \ | ||
| 145 | + --language en \ | ||
| 146 | + --labeler-mode simple \ | ||
| 147 | + --force-refresh-labels | ||
| 148 | +``` | ||
| 149 | + | ||
| 150 | +This command does two things: | ||
| 151 | + | ||
| 152 | +- runs **every** query in the file against the live backend (no skip list) | ||
| 153 | +- with `--force-refresh-labels`, re-labels **all** `top_k` hits per query via the LLM and upserts SQLite; without the flag, only `spu_id`s lacking a cached label are sent to the LLM | ||
| 154 | + | ||
| 155 | +After this step, single-query evaluation can run in cached mode without calling the LLM again. | ||
| 156 | + | ||
| 157 | +### 2. Optional pooled build | ||
| 158 | + | ||
| 159 | +The framework also supports a heavier pooled build that combines: | ||
| 160 | + | ||
| 161 | +- top search results | ||
| 162 | +- top full-corpus reranker results | ||
| 163 | + | ||
| 164 | +Example: | ||
| 165 | + | ||
| 166 | +```bash | ||
| 167 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py build \ | ||
| 168 | + --tenant-id 163 \ | ||
| 169 | + --queries-file scripts/evaluation/queries/queries.txt \ | ||
| 170 | + --search-depth 1000 \ | ||
| 171 | + --rerank-depth 10000 \ | ||
| 172 | + --annotate-search-top-k 100 \ | ||
| 173 | + --annotate-rerank-top-k 120 \ | ||
| 174 | + --language en | ||
| 175 | +``` | ||
| 176 | + | ||
| 177 | +This is slower, but useful when you want a richer pooled annotation set beyond the current live recall window. | ||
| 178 | + | ||
| 179 | +## Why Single-Query Evaluation Was Slow | ||
| 180 | + | ||
| 181 | +If single-query evaluation is slow, the usual reason is that it is still running with `auto_annotate=true`, which means: | ||
| 182 | + | ||
| 183 | +- perform live search | ||
| 184 | +- detect recalled but unlabeled products | ||
| 185 | +- call the LLM to label them | ||
| 186 | + | ||
| 187 | +That is not the intended steady-state evaluation path. | ||
| 188 | + | ||
| 189 | +The UI/API is now configured to prefer cached evaluation: | ||
| 190 | + | ||
| 191 | +- default single-query evaluation uses `auto_annotate=false` | ||
| 192 | +- unlabeled recalled results are treated as `Irrelevant` | ||
| 193 | +- the response includes tips explaining that coverage gap | ||
| 194 | + | ||
| 195 | +If you want stable, fast evaluation: | ||
| 196 | + | ||
| 197 | +1. prebuild labels offline | ||
| 198 | +2. use cached single-query evaluation | ||
| 199 | + | ||
| 200 | +## Web UI | ||
| 201 | + | ||
| 202 | +Start the evaluation UI: | ||
| 203 | + | ||
| 204 | +```bash | ||
| 205 | +./.venv/bin/python scripts/evaluation/serve_eval_web.py serve \ | ||
| 206 | + --tenant-id 163 \ | ||
| 207 | + --queries-file scripts/evaluation/queries/queries.txt \ | ||
| 208 | + --host 127.0.0.1 \ | ||
| 209 | + --port 6010 | ||
| 210 | +``` | ||
| 211 | + | ||
| 212 | +The UI provides: | ||
| 213 | + | ||
| 214 | +- query list loaded from `queries.txt` | ||
| 215 | +- single-query evaluation | ||
| 216 | +- batch evaluation | ||
| 217 | +- history of batch reports | ||
| 218 | +- top recalled results | ||
| 219 | +- missed `Exact` and `Partial` products that were not recalled | ||
| 220 | +- tips about unlabeled hits treated as `Irrelevant` | ||
| 221 | + | ||
| 222 | +### Single-query response behavior | ||
| 223 | + | ||
| 224 | +For a single query: | ||
| 225 | + | ||
| 226 | +1. live search returns recalled `spu_id` values | ||
| 227 | +2. the framework looks up cached labels by `(query, spu_id)` | ||
| 228 | +3. unlabeled recalled items are counted as `Irrelevant` | ||
| 229 | +4. cached `Exact` and `Partial` products that were not recalled are listed under `Missed Exact / Partial` | ||
| 230 | + | ||
| 231 | +This makes the page useful as a real retrieval-evaluation view rather than only a search-result viewer. | ||
| 232 | + | ||
| 233 | +## CLI Commands | ||
| 234 | + | ||
| 235 | +### Build pooled annotation artifacts | ||
| 236 | + | ||
| 237 | +```bash | ||
| 238 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py build ... | ||
| 239 | +``` | ||
| 240 | + | ||
| 241 | +### Run batch evaluation | ||
| 242 | + | ||
| 243 | +```bash | ||
| 244 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py batch \ | ||
| 245 | + --tenant-id 163 \ | ||
| 246 | + --queries-file scripts/evaluation/queries/queries.txt \ | ||
| 247 | + --top-k 50 \ | ||
| 248 | + --language en \ | ||
| 249 | + --labeler-mode simple | ||
| 250 | +``` | ||
| 251 | + | ||
| 252 | +Use `--force-refresh-labels` if you want to rebuild the offline label cache for the recalled window first. | ||
| 253 | + | ||
| 254 | +### Audit annotation quality | ||
| 255 | + | ||
| 256 | +```bash | ||
| 257 | +./.venv/bin/python scripts/evaluation/build_annotation_set.py audit \ | ||
| 258 | + --tenant-id 163 \ | ||
| 259 | + --queries-file scripts/evaluation/queries/queries.txt \ | ||
| 260 | + --top-k 50 \ | ||
| 261 | + --language en \ | ||
| 262 | + --labeler-mode simple | ||
| 263 | +``` | ||
| 264 | + | ||
| 265 | +This checks cached labels against current guardrails and reports suspicious cases. | ||
| 266 | + | ||
| 267 | +## Batch Reports | ||
| 268 | + | ||
| 269 | +Each batch run stores: | ||
| 270 | + | ||
| 271 | +- aggregate metrics | ||
| 272 | +- per-query metrics | ||
| 273 | +- label distribution | ||
| 274 | +- timestamp | ||
| 275 | +- config snapshot from `/admin/config` | ||
| 276 | + | ||
| 277 | +Reports are written as: | ||
| 278 | + | ||
| 279 | +- Markdown for easy reading | ||
| 280 | +- JSON for downstream processing | ||
| 281 | + | ||
| 282 | +## Fusion Tuning | ||
| 283 | + | ||
| 284 | +The tuning runner applies experiment configs sequentially and records the outcome. | ||
| 285 | + | ||
| 286 | +Example: | ||
| 287 | + | ||
| 288 | +```bash | ||
| 289 | +./.venv/bin/python scripts/evaluation/tune_fusion.py \ | ||
| 290 | + --tenant-id 163 \ | ||
| 291 | + --queries-file scripts/evaluation/queries/queries.txt \ | ||
| 292 | + --top-k 50 \ | ||
| 293 | + --language en \ | ||
| 294 | + --experiments-file scripts/evaluation/fusion_experiments_shortlist.json \ | ||
| 295 | + --score-metric MAP_3 \ | ||
| 296 | + --apply-best | ||
| 297 | +``` | ||
| 298 | + | ||
| 299 | +What it does: | ||
| 300 | + | ||
| 301 | +1. writes an experiment config into `config/config.yaml` | ||
| 302 | +2. restarts backend | ||
| 303 | +3. runs batch evaluation | ||
| 304 | +4. stores the per-experiment result | ||
| 305 | +5. optionally applies the best experiment at the end | ||
| 306 | + | ||
| 307 | +## Current Practical Recommendation | ||
| 308 | + | ||
| 309 | +For day-to-day evaluation: | ||
| 310 | + | ||
| 311 | +1. refresh the offline labels for the fixed query set with `batch --force-refresh-labels` | ||
| 312 | +2. run the web UI or normal batch evaluation in cached mode | ||
| 313 | +3. only force-refresh labels again when: | ||
| 314 | + - the query set changes | ||
| 315 | + - the product corpus changes materially | ||
| 316 | + - the labeling logic changes | ||
| 317 | + | ||
| 318 | +## Caveats | ||
| 319 | + | ||
| 320 | +- The current label cache is query-specific, not a full all-products × all-queries matrix. | ||
| 321 | +- Single-query evaluation still depends on the live search API for recall, but not on the LLM if labels are already cached. | ||
| 322 | +- The backend restart path in this environment can be briefly unstable immediately after startup; a short wait after restart is sometimes necessary for scripting. | ||
| 323 | +- Some multilingual translation hints are noisy on long-tail fashion queries, which is one reason fusion tuning around translation weight matters. | ||
| 324 | + | ||
| 325 | +## Related Requirement Docs | ||
| 326 | + | ||
| 327 | +- `README_Requirement.md` | ||
| 328 | +- `README_Requirement_zh.md` | ||
| 329 | + | ||
| 330 | +These documents describe the original problem statement. This `README.md` describes the implemented framework and the current recommended workflow. |
scripts/evaluation/eval_framework.py
| @@ -39,7 +39,9 @@ RELEVANCE_IRRELEVANT = "Irrelevant" | @@ -39,7 +39,9 @@ RELEVANCE_IRRELEVANT = "Irrelevant" | ||
| 39 | VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} | 39 | VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} |
| 40 | DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" | 40 | DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" |
| 41 | DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt" | 41 | DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt" |
| 42 | -JUDGE_PROMPT_VERSION = "v2_structured_20260331" | 42 | +JUDGE_PROMPT_VERSION_SIMPLE = "v3_simple_20260331" |
| 43 | +JUDGE_PROMPT_VERSION_COMPLEX = "v2_structured_20260331" | ||
| 44 | +DEFAULT_LABELER_MODE = "simple" | ||
| 43 | 45 | ||
| 44 | 46 | ||
| 45 | def utc_now_iso() -> str: | 47 | def utc_now_iso() -> str: |
| @@ -625,6 +627,57 @@ class DashScopeLabelClient: | @@ -625,6 +627,57 @@ class DashScopeLabelClient: | ||
| 625 | content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() | 627 | content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() |
| 626 | return content, safe_json_dumps(data) | 628 | return content, safe_json_dumps(data) |
| 627 | 629 | ||
| 630 | + def classify_batch_simple( | ||
| 631 | + self, | ||
| 632 | + query: str, | ||
| 633 | + docs: Sequence[Dict[str, Any]], | ||
| 634 | + ) -> Tuple[List[str], str]: | ||
| 635 | + numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)] | ||
| 636 | + prompt = ( | ||
| 637 | + "You are an e-commerce search result relevance evaluation assistant. " | ||
| 638 | + "Based on the user query and each product's information, output the relevance level for each product.\n\n" | ||
| 639 | + "## Relevance Level Criteria\n" | ||
| 640 | + "Exact — Fully matches the user's search intent.\n" | ||
| 641 | + "Partial — Primary intent satisfied (same category or similar use, basically aligns with search intent), " | ||
| 642 | + "but secondary attributes such as color, style, size, fit, length, or material deviate from or cannot be confirmed.\n" | ||
| 643 | + "Irrelevant — Category or use case mismatched, primary intent not satisfied.\n\n" | ||
| 644 | + "Additional judging guidance:\n" | ||
| 645 | + "- If the query clearly names a product type, product type matching has the highest priority. " | ||
| 646 | + "Dress vs skirt vs jumpsuit, jeans vs pants, T-shirt vs blouse, cardigan vs sweater, boots vs shoes, " | ||
| 647 | + "bra vs top, backpack vs bag are not interchangeable.\n" | ||
| 648 | + "- When the query clearly specifies a concrete product type, a different product type should usually be Irrelevant, not Partial.\n" | ||
| 649 | + "- If an attribute looks missing or uncertain, prefer Partial instead of Exact.\n" | ||
| 650 | + "- Do not guess missing attributes.\n" | ||
| 651 | + "- Graphic, slogan, holiday, memorial, or message tees are not Exact for a plain tee query unless that graphic/theme is requested.\n" | ||
| 652 | + "- Be conservative with Exact.\n\n" | ||
| 653 | + f"Query: {query}\n\n" | ||
| 654 | + "Products:\n" | ||
| 655 | + + "\n".join(numbered_docs) | ||
| 656 | + + "\n\n## Output Format\n" | ||
| 657 | + f"Strictly output {len(docs)} lines, each line containing exactly one of Exact / Partial / Irrelevant. " | ||
| 658 | + "They must correspond sequentially to the products above. Do not output any other information.\n" | ||
| 659 | + ) | ||
| 660 | + content, raw_response = self._chat(prompt) | ||
| 661 | + labels = [] | ||
| 662 | + for line in str(content or "").splitlines(): | ||
| 663 | + label = line.strip() | ||
| 664 | + if label in VALID_LABELS: | ||
| 665 | + labels.append(label) | ||
| 666 | + if len(labels) != len(docs): | ||
| 667 | + payload = _extract_json_blob(content) | ||
| 668 | + if isinstance(payload, dict) and isinstance(payload.get("labels"), list): | ||
| 669 | + labels = [] | ||
| 670 | + for item in payload["labels"][: len(docs)]: | ||
| 671 | + if isinstance(item, dict): | ||
| 672 | + label = str(item.get("label") or "").strip() | ||
| 673 | + else: | ||
| 674 | + label = str(item).strip() | ||
| 675 | + if label in VALID_LABELS: | ||
| 676 | + labels.append(label) | ||
| 677 | + if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels): | ||
| 678 | + raise ValueError(f"unexpected simple label output: {content!r}") | ||
| 679 | + return labels, raw_response | ||
| 680 | + | ||
| 628 | def extract_query_profile( | 681 | def extract_query_profile( |
| 629 | self, | 682 | self, |
| 630 | query: str, | 683 | query: str, |
| @@ -665,7 +718,7 @@ class DashScopeLabelClient: | @@ -665,7 +718,7 @@ class DashScopeLabelClient: | ||
| 665 | payload.setdefault("notes", []) | 718 | payload.setdefault("notes", []) |
| 666 | return payload, raw_response | 719 | return payload, raw_response |
| 667 | 720 | ||
| 668 | - def classify_batch( | 721 | + def classify_batch_complex( |
| 669 | self, | 722 | self, |
| 670 | query: str, | 723 | query: str, |
| 671 | query_profile: Dict[str, Any], | 724 | query_profile: Dict[str, Any], |
| @@ -763,10 +816,12 @@ class SearchEvaluationFramework: | @@ -763,10 +816,12 @@ class SearchEvaluationFramework: | ||
| 763 | tenant_id: str, | 816 | tenant_id: str, |
| 764 | artifact_root: Path = DEFAULT_ARTIFACT_ROOT, | 817 | artifact_root: Path = DEFAULT_ARTIFACT_ROOT, |
| 765 | search_base_url: str = "http://localhost:6002", | 818 | search_base_url: str = "http://localhost:6002", |
| 819 | + labeler_mode: str = DEFAULT_LABELER_MODE, | ||
| 766 | ): | 820 | ): |
| 767 | init_service(get_app_config().infrastructure.elasticsearch.host) | 821 | init_service(get_app_config().infrastructure.elasticsearch.host) |
| 768 | self.tenant_id = str(tenant_id) | 822 | self.tenant_id = str(tenant_id) |
| 769 | self.artifact_root = ensure_dir(artifact_root) | 823 | self.artifact_root = ensure_dir(artifact_root) |
| 824 | + self.labeler_mode = str(labeler_mode).strip().lower() or DEFAULT_LABELER_MODE | ||
| 770 | self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") | 825 | self.store = EvalStore(self.artifact_root / "search_eval.sqlite3") |
| 771 | self.search_client = SearchServiceClient(search_base_url, self.tenant_id) | 826 | self.search_client = SearchServiceClient(search_base_url, self.tenant_id) |
| 772 | app_cfg = get_app_config() | 827 | app_cfg = get_app_config() |
| @@ -783,17 +838,24 @@ class SearchEvaluationFramework: | @@ -783,17 +838,24 @@ class SearchEvaluationFramework: | ||
| 783 | base_url=str(llm_cfg["base_url"]), | 838 | base_url=str(llm_cfg["base_url"]), |
| 784 | api_key=str(api_key), | 839 | api_key=str(api_key), |
| 785 | ) | 840 | ) |
| 786 | - self.query_parser = get_query_parser() | 841 | + self.query_parser = None |
| 842 | + | ||
| 843 | + def _get_query_parser(self): | ||
| 844 | + if self.query_parser is None: | ||
| 845 | + self.query_parser = get_query_parser() | ||
| 846 | + return self.query_parser | ||
| 787 | 847 | ||
| 788 | def build_query_parser_hints(self, query: str) -> Dict[str, Any]: | 848 | def build_query_parser_hints(self, query: str) -> Dict[str, Any]: |
| 789 | - parsed = self.query_parser.parse(query, generate_vector=False, target_languages=["en", "zh"]) | 849 | + parsed = self._get_query_parser().parse(query, generate_vector=False, target_languages=["en", "zh"]) |
| 790 | payload = parsed.to_dict() | 850 | payload = parsed.to_dict() |
| 791 | payload["text_for_rerank"] = parsed.text_for_rerank() | 851 | payload["text_for_rerank"] = parsed.text_for_rerank() |
| 792 | return payload | 852 | return payload |
| 793 | 853 | ||
| 794 | def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]: | 854 | def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]: |
| 855 | + if self.labeler_mode != "complex": | ||
| 856 | + raise RuntimeError("query profiles are only used in complex labeler mode") | ||
| 795 | if not force_refresh: | 857 | if not force_refresh: |
| 796 | - cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION) | 858 | + cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION_COMPLEX) |
| 797 | if cached is not None: | 859 | if cached is not None: |
| 798 | return cached | 860 | return cached |
| 799 | parser_hints = self.build_query_parser_hints(query) | 861 | parser_hints = self.build_query_parser_hints(query) |
| @@ -802,7 +864,7 @@ class SearchEvaluationFramework: | @@ -802,7 +864,7 @@ class SearchEvaluationFramework: | ||
| 802 | self.store.upsert_query_profile( | 864 | self.store.upsert_query_profile( |
| 803 | self.tenant_id, | 865 | self.tenant_id, |
| 804 | query, | 866 | query, |
| 805 | - JUDGE_PROMPT_VERSION, | 867 | + JUDGE_PROMPT_VERSION_COMPLEX, |
| 806 | self.label_client.model, | 868 | self.label_client.model, |
| 807 | profile, | 869 | profile, |
| 808 | raw_response, | 870 | raw_response, |
| @@ -955,9 +1017,24 @@ class SearchEvaluationFramework: | @@ -955,9 +1017,24 @@ class SearchEvaluationFramework: | ||
| 955 | *, | 1017 | *, |
| 956 | top_k: int = 100, | 1018 | top_k: int = 100, |
| 957 | language: str = "en", | 1019 | language: str = "en", |
| 958 | - auto_annotate: bool = True, | 1020 | + auto_annotate: bool = False, |
| 959 | ) -> Dict[str, Any]: | 1021 | ) -> Dict[str, Any]: |
| 960 | live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) | 1022 | live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language) |
| 1023 | + if self.labeler_mode != "complex": | ||
| 1024 | + labels = [ | ||
| 1025 | + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | ||
| 1026 | + for item in live["results"] | ||
| 1027 | + ] | ||
| 1028 | + return { | ||
| 1029 | + "query": query, | ||
| 1030 | + "tenant_id": self.tenant_id, | ||
| 1031 | + "top_k": top_k, | ||
| 1032 | + "metrics": live["metrics"], | ||
| 1033 | + "distribution": label_distribution(labels), | ||
| 1034 | + "query_profile": None, | ||
| 1035 | + "suspicious": [], | ||
| 1036 | + "results": live["results"], | ||
| 1037 | + } | ||
| 961 | query_profile = self.get_query_profile(query, force_refresh=False) | 1038 | query_profile = self.get_query_profile(query, force_refresh=False) |
| 962 | suspicious: List[Dict[str, Any]] = [] | 1039 | suspicious: List[Dict[str, Any]] = [] |
| 963 | 1040 | ||
| @@ -1093,7 +1170,6 @@ class SearchEvaluationFramework: | @@ -1093,7 +1170,6 @@ class SearchEvaluationFramework: | ||
| 1093 | docs: Sequence[Dict[str, Any]], | 1170 | docs: Sequence[Dict[str, Any]], |
| 1094 | force_refresh: bool = False, | 1171 | force_refresh: bool = False, |
| 1095 | ) -> Dict[str, str]: | 1172 | ) -> Dict[str, str]: |
| 1096 | - query_profile = self.get_query_profile(query, force_refresh=force_refresh) | ||
| 1097 | labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query) | 1173 | labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query) |
| 1098 | missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels] | 1174 | missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels] |
| 1099 | if not missing_docs: | 1175 | if not missing_docs: |
| @@ -1101,12 +1177,9 @@ class SearchEvaluationFramework: | @@ -1101,12 +1177,9 @@ class SearchEvaluationFramework: | ||
| 1101 | 1177 | ||
| 1102 | for start in range(0, len(missing_docs), self.label_client.batch_size): | 1178 | for start in range(0, len(missing_docs), self.label_client.batch_size): |
| 1103 | batch = missing_docs[start : start + self.label_client.batch_size] | 1179 | batch = missing_docs[start : start + self.label_client.batch_size] |
| 1104 | - batch_pairs = self._classify_with_retry(query, query_profile, batch) | 1180 | + batch_pairs = self._classify_with_retry(query, batch, force_refresh=force_refresh) |
| 1105 | for sub_labels, raw_response, sub_batch in batch_pairs: | 1181 | for sub_labels, raw_response, sub_batch in batch_pairs: |
| 1106 | - to_store = { | ||
| 1107 | - str(doc.get("spu_id")): self._apply_rule_based_label_guardrails(label, query_profile, doc) | ||
| 1108 | - for doc, label in zip(sub_batch, sub_labels) | ||
| 1109 | - } | 1182 | + to_store = {str(doc.get("spu_id")): label for doc, label in zip(sub_batch, sub_labels)} |
| 1110 | self.store.upsert_labels( | 1183 | self.store.upsert_labels( |
| 1111 | self.tenant_id, | 1184 | self.tenant_id, |
| 1112 | query, | 1185 | query, |
| @@ -1121,19 +1194,28 @@ class SearchEvaluationFramework: | @@ -1121,19 +1194,28 @@ class SearchEvaluationFramework: | ||
| 1121 | def _classify_with_retry( | 1194 | def _classify_with_retry( |
| 1122 | self, | 1195 | self, |
| 1123 | query: str, | 1196 | query: str, |
| 1124 | - query_profile: Dict[str, Any], | ||
| 1125 | docs: Sequence[Dict[str, Any]], | 1197 | docs: Sequence[Dict[str, Any]], |
| 1198 | + *, | ||
| 1199 | + force_refresh: bool = False, | ||
| 1126 | ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]: | 1200 | ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]: |
| 1127 | if not docs: | 1201 | if not docs: |
| 1128 | return [] | 1202 | return [] |
| 1129 | try: | 1203 | try: |
| 1130 | - labels, raw_response = self.label_client.classify_batch(query, query_profile, docs) | 1204 | + if self.labeler_mode == "complex": |
| 1205 | + query_profile = self.get_query_profile(query, force_refresh=force_refresh) | ||
| 1206 | + labels, raw_response = self.label_client.classify_batch_complex(query, query_profile, docs) | ||
| 1207 | + labels = [ | ||
| 1208 | + self._apply_rule_based_label_guardrails(label, query_profile, doc) | ||
| 1209 | + for doc, label in zip(docs, labels) | ||
| 1210 | + ] | ||
| 1211 | + else: | ||
| 1212 | + labels, raw_response = self.label_client.classify_batch_simple(query, docs) | ||
| 1131 | return [(labels, raw_response, docs)] | 1213 | return [(labels, raw_response, docs)] |
| 1132 | except Exception: | 1214 | except Exception: |
| 1133 | if len(docs) == 1: | 1215 | if len(docs) == 1: |
| 1134 | raise | 1216 | raise |
| 1135 | mid = len(docs) // 2 | 1217 | mid = len(docs) // 2 |
| 1136 | - return self._classify_with_retry(query, query_profile, docs[:mid]) + self._classify_with_retry(query, query_profile, docs[mid:]) | 1218 | + return self._classify_with_retry(query, docs[:mid], force_refresh=force_refresh) + self._classify_with_retry(query, docs[mid:], force_refresh=force_refresh) |
| 1137 | 1219 | ||
| 1138 | def build_query_annotation_set( | 1220 | def build_query_annotation_set( |
| 1139 | self, | 1221 | self, |
| @@ -1163,7 +1245,6 @@ class SearchEvaluationFramework: | @@ -1163,7 +1245,6 @@ class SearchEvaluationFramework: | ||
| 1163 | for item in full_rerank[:annotate_rerank_top_k]: | 1245 | for item in full_rerank[:annotate_rerank_top_k]: |
| 1164 | pool_docs[str(item["spu_id"])] = item["doc"] | 1246 | pool_docs[str(item["spu_id"])] = item["doc"] |
| 1165 | 1247 | ||
| 1166 | - query_profile = self.get_query_profile(query, force_refresh=force_refresh_labels) | ||
| 1167 | labels = self.annotate_missing_labels( | 1248 | labels = self.annotate_missing_labels( |
| 1168 | query=query, | 1249 | query=query, |
| 1169 | docs=list(pool_docs.values()), | 1250 | docs=list(pool_docs.values()), |
| @@ -1229,7 +1310,8 @@ class SearchEvaluationFramework: | @@ -1229,7 +1310,8 @@ class SearchEvaluationFramework: | ||
| 1229 | "annotate_rerank_top_k": annotate_rerank_top_k, | 1310 | "annotate_rerank_top_k": annotate_rerank_top_k, |
| 1230 | "pool_size": len(pool_docs), | 1311 | "pool_size": len(pool_docs), |
| 1231 | }, | 1312 | }, |
| 1232 | - "query_profile": query_profile, | 1313 | + "labeler_mode": self.labeler_mode, |
| 1314 | + "query_profile": self.get_query_profile(query, force_refresh=force_refresh_labels) if self.labeler_mode == "complex" else None, | ||
| 1233 | "metrics_top100": metrics, | 1315 | "metrics_top100": metrics, |
| 1234 | "search_results": search_labeled_results, | 1316 | "search_results": search_labeled_results, |
| 1235 | "full_rerank_top": rerank_top_results, | 1317 | "full_rerank_top": rerank_top_results, |
| @@ -1250,7 +1332,7 @@ class SearchEvaluationFramework: | @@ -1250,7 +1332,7 @@ class SearchEvaluationFramework: | ||
| 1250 | self, | 1332 | self, |
| 1251 | query: str, | 1333 | query: str, |
| 1252 | top_k: int = 100, | 1334 | top_k: int = 100, |
| 1253 | - auto_annotate: bool = True, | 1335 | + auto_annotate: bool = False, |
| 1254 | language: str = "en", | 1336 | language: str = "en", |
| 1255 | force_refresh_labels: bool = False, | 1337 | force_refresh_labels: bool = False, |
| 1256 | ) -> Dict[str, Any]: | 1338 | ) -> Dict[str, Any]: |
| @@ -1259,16 +1341,21 @@ class SearchEvaluationFramework: | @@ -1259,16 +1341,21 @@ class SearchEvaluationFramework: | ||
| 1259 | if auto_annotate: | 1341 | if auto_annotate: |
| 1260 | self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels) | 1342 | self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels) |
| 1261 | labels = self.store.get_labels(self.tenant_id, query) | 1343 | labels = self.store.get_labels(self.tenant_id, query) |
| 1344 | + recalled_spu_ids = {str(doc.get("spu_id")) for doc in results[:top_k]} | ||
| 1262 | labeled = [] | 1345 | labeled = [] |
| 1346 | + unlabeled_hits = 0 | ||
| 1263 | for rank, doc in enumerate(results[:top_k], start=1): | 1347 | for rank, doc in enumerate(results[:top_k], start=1): |
| 1264 | spu_id = str(doc.get("spu_id")) | 1348 | spu_id = str(doc.get("spu_id")) |
| 1349 | + label = labels.get(spu_id) | ||
| 1350 | + if label not in VALID_LABELS: | ||
| 1351 | + unlabeled_hits += 1 | ||
| 1265 | labeled.append( | 1352 | labeled.append( |
| 1266 | { | 1353 | { |
| 1267 | "rank": rank, | 1354 | "rank": rank, |
| 1268 | "spu_id": spu_id, | 1355 | "spu_id": spu_id, |
| 1269 | "title": build_display_title(doc), | 1356 | "title": build_display_title(doc), |
| 1270 | "image_url": doc.get("image_url"), | 1357 | "image_url": doc.get("image_url"), |
| 1271 | - "label": labels.get(spu_id), | 1358 | + "label": label, |
| 1272 | "option_values": list(compact_option_values(doc.get("skus") or [])), | 1359 | "option_values": list(compact_option_values(doc.get("skus") or [])), |
| 1273 | "product": compact_product_payload(doc), | 1360 | "product": compact_product_payload(doc), |
| 1274 | } | 1361 | } |
| @@ -1277,12 +1364,65 @@ class SearchEvaluationFramework: | @@ -1277,12 +1364,65 @@ class SearchEvaluationFramework: | ||
| 1277 | item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT | 1364 | item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT |
| 1278 | for item in labeled | 1365 | for item in labeled |
| 1279 | ] | 1366 | ] |
| 1367 | + label_stats = self.store.get_query_label_stats(self.tenant_id, query) | ||
| 1368 | + rerank_scores = self.store.get_rerank_scores(self.tenant_id, query) | ||
| 1369 | + relevant_missing_ids = [ | ||
| 1370 | + spu_id | ||
| 1371 | + for spu_id, label in labels.items() | ||
| 1372 | + if label in {RELEVANCE_EXACT, RELEVANCE_PARTIAL} and spu_id not in recalled_spu_ids | ||
| 1373 | + ] | ||
| 1374 | + missing_docs_map = self.store.get_corpus_docs_by_spu_ids(self.tenant_id, relevant_missing_ids) | ||
| 1375 | + missing_relevant = [] | ||
| 1376 | + for spu_id in relevant_missing_ids: | ||
| 1377 | + doc = missing_docs_map.get(spu_id) | ||
| 1378 | + if not doc: | ||
| 1379 | + continue | ||
| 1380 | + missing_relevant.append( | ||
| 1381 | + { | ||
| 1382 | + "spu_id": spu_id, | ||
| 1383 | + "label": labels[spu_id], | ||
| 1384 | + "rerank_score": rerank_scores.get(spu_id), | ||
| 1385 | + "title": build_display_title(doc), | ||
| 1386 | + "image_url": doc.get("image_url"), | ||
| 1387 | + "option_values": list(compact_option_values(doc.get("skus") or [])), | ||
| 1388 | + "product": compact_product_payload(doc), | ||
| 1389 | + } | ||
| 1390 | + ) | ||
| 1391 | + label_order = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 1, RELEVANCE_IRRELEVANT: 2} | ||
| 1392 | + missing_relevant.sort( | ||
| 1393 | + key=lambda item: ( | ||
| 1394 | + label_order.get(str(item.get("label")), 9), | ||
| 1395 | + -(float(item.get("rerank_score")) if item.get("rerank_score") is not None else float("-inf")), | ||
| 1396 | + str(item.get("title") or ""), | ||
| 1397 | + ) | ||
| 1398 | + ) | ||
| 1399 | + tips: List[str] = [] | ||
| 1400 | + if auto_annotate: | ||
| 1401 | + tips.append("Single-query evaluation used cached labels and refreshed missing labels for recalled results.") | ||
| 1402 | + else: | ||
| 1403 | + tips.append("Single-query evaluation used the offline annotation cache only; recalled SPUs without cached labels were treated as Irrelevant.") | ||
| 1404 | + if label_stats["total"] == 0: | ||
| 1405 | + tips.append("This query has no offline annotation set yet. Build or refresh labels first if you want stable evaluation.") | ||
| 1406 | + if unlabeled_hits: | ||
| 1407 | + tips.append(f"{unlabeled_hits} recalled results were not in the annotation set and were counted as Irrelevant.") | ||
| 1408 | + if not missing_relevant: | ||
| 1409 | + tips.append("No cached Exact/Partial products were missed by this recall set.") | ||
| 1280 | return { | 1410 | return { |
| 1281 | "query": query, | 1411 | "query": query, |
| 1282 | "tenant_id": self.tenant_id, | 1412 | "tenant_id": self.tenant_id, |
| 1283 | "top_k": top_k, | 1413 | "top_k": top_k, |
| 1284 | "metrics": compute_query_metrics(metric_labels), | 1414 | "metrics": compute_query_metrics(metric_labels), |
| 1285 | "results": labeled, | 1415 | "results": labeled, |
| 1416 | + "missing_relevant": missing_relevant, | ||
| 1417 | + "label_stats": { | ||
| 1418 | + **label_stats, | ||
| 1419 | + "unlabeled_hits_treated_irrelevant": unlabeled_hits, | ||
| 1420 | + "recalled_hits": len(labeled), | ||
| 1421 | + "missing_relevant_count": len(missing_relevant), | ||
| 1422 | + "missing_exact_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_EXACT), | ||
| 1423 | + "missing_partial_count": sum(1 for item in missing_relevant if item["label"] == RELEVANCE_PARTIAL), | ||
| 1424 | + }, | ||
| 1425 | + "tips": tips, | ||
| 1286 | "total": int(search_payload.get("total") or 0), | 1426 | "total": int(search_payload.get("total") or 0), |
| 1287 | } | 1427 | } |
| 1288 | 1428 | ||
| @@ -1392,14 +1532,14 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: | @@ -1392,14 +1532,14 @@ def render_batch_report_markdown(payload: Dict[str, Any]) -> str: | ||
| 1392 | class SearchEvalRequest(BaseModel): | 1532 | class SearchEvalRequest(BaseModel): |
| 1393 | query: str | 1533 | query: str |
| 1394 | top_k: int = Field(default=100, ge=1, le=500) | 1534 | top_k: int = Field(default=100, ge=1, le=500) |
| 1395 | - auto_annotate: bool = True | 1535 | + auto_annotate: bool = False |
| 1396 | language: str = "en" | 1536 | language: str = "en" |
| 1397 | 1537 | ||
| 1398 | 1538 | ||
| 1399 | class BatchEvalRequest(BaseModel): | 1539 | class BatchEvalRequest(BaseModel): |
| 1400 | queries: Optional[List[str]] = None | 1540 | queries: Optional[List[str]] = None |
| 1401 | top_k: int = Field(default=100, ge=1, le=500) | 1541 | top_k: int = Field(default=100, ge=1, le=500) |
| 1402 | - auto_annotate: bool = True | 1542 | + auto_annotate: bool = False |
| 1403 | language: str = "en" | 1543 | language: str = "en" |
| 1404 | force_refresh_labels: bool = False | 1544 | force_refresh_labels: bool = False |
| 1405 | 1545 | ||
| @@ -1494,6 +1634,8 @@ WEB_APP_HTML = """ | @@ -1494,6 +1634,8 @@ WEB_APP_HTML = """ | ||
| 1494 | .options { color: var(--muted); line-height: 1.5; font-size: 14px; } | 1634 | .options { color: var(--muted); line-height: 1.5; font-size: 14px; } |
| 1495 | .section { margin-bottom: 28px; } | 1635 | .section { margin-bottom: 28px; } |
| 1496 | .history { font-size: 13px; line-height: 1.5; } | 1636 | .history { font-size: 13px; line-height: 1.5; } |
| 1637 | + .tips { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; line-height: 1.6; } | ||
| 1638 | + .tip { margin-bottom: 6px; color: var(--muted); } | ||
| 1497 | </style> | 1639 | </style> |
| 1498 | </head> | 1640 | </head> |
| 1499 | <body> | 1641 | <body> |
| @@ -1524,6 +1666,14 @@ WEB_APP_HTML = """ | @@ -1524,6 +1666,14 @@ WEB_APP_HTML = """ | ||
| 1524 | <h2>Top Results</h2> | 1666 | <h2>Top Results</h2> |
| 1525 | <div id="results" class="results"></div> | 1667 | <div id="results" class="results"></div> |
| 1526 | </section> | 1668 | </section> |
| 1669 | + <section class="section"> | ||
| 1670 | + <h2>Missed Exact / Partial</h2> | ||
| 1671 | + <div id="missingRelevant" class="results"></div> | ||
| 1672 | + </section> | ||
| 1673 | + <section class="section"> | ||
| 1674 | + <h2>Notes</h2> | ||
| 1675 | + <div id="tips" class="tips muted"></div> | ||
| 1676 | + </section> | ||
| 1527 | </main> | 1677 | </main> |
| 1528 | </div> | 1678 | </div> |
| 1529 | <script> | 1679 | <script> |
| @@ -1542,15 +1692,15 @@ WEB_APP_HTML = """ | @@ -1542,15 +1692,15 @@ WEB_APP_HTML = """ | ||
| 1542 | root.appendChild(card); | 1692 | root.appendChild(card); |
| 1543 | }); | 1693 | }); |
| 1544 | } | 1694 | } |
| 1545 | - function renderResults(results) { | ||
| 1546 | - const root = document.getElementById('results'); | ||
| 1547 | - root.innerHTML = ''; | 1695 | + function renderResults(results, rootId='results', showRank=true) { |
| 1696 | + const mount = document.getElementById(rootId); | ||
| 1697 | + mount.innerHTML = ''; | ||
| 1548 | (results || []).forEach(item => { | 1698 | (results || []).forEach(item => { |
| 1549 | const label = item.label || 'Unknown'; | 1699 | const label = item.label || 'Unknown'; |
| 1550 | const box = document.createElement('div'); | 1700 | const box = document.createElement('div'); |
| 1551 | box.className = 'result'; | 1701 | box.className = 'result'; |
| 1552 | box.innerHTML = ` | 1702 | box.innerHTML = ` |
| 1553 | - <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">#${item.rank}</div></div> | 1703 | + <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">${showRank ? `#${item.rank || '-'}` : (item.rerank_score != null ? `rerank=${item.rerank_score.toFixed ? item.rerank_score.toFixed(4) : item.rerank_score}` : 'not recalled')}</div></div> |
| 1554 | <img class="thumb" src="${item.image_url || ''}" alt="" /> | 1704 | <img class="thumb" src="${item.image_url || ''}" alt="" /> |
| 1555 | <div> | 1705 | <div> |
| 1556 | <div class="title">${item.title || ''}</div> | 1706 | <div class="title">${item.title || ''}</div> |
| @@ -1560,8 +1710,18 @@ WEB_APP_HTML = """ | @@ -1560,8 +1710,18 @@ WEB_APP_HTML = """ | ||
| 1560 | <div>${(item.option_values || [])[2] || ''}</div> | 1710 | <div>${(item.option_values || [])[2] || ''}</div> |
| 1561 | </div> | 1711 | </div> |
| 1562 | </div>`; | 1712 | </div>`; |
| 1563 | - root.appendChild(box); | 1713 | + mount.appendChild(box); |
| 1564 | }); | 1714 | }); |
| 1715 | + if (!(results || []).length) { | ||
| 1716 | + mount.innerHTML = '<div class="muted">None.</div>'; | ||
| 1717 | + } | ||
| 1718 | + } | ||
| 1719 | + function renderTips(data) { | ||
| 1720 | + const root = document.getElementById('tips'); | ||
| 1721 | + const tips = [...(data.tips || [])]; | ||
| 1722 | + const stats = data.label_stats || {}; | ||
| 1723 | + tips.unshift(`Cached labels for query: ${stats.total || 0}. Recalled hits: ${stats.recalled_hits || 0}. Missed Exact: ${stats.missing_exact_count || 0}. Missed Partial: ${stats.missing_partial_count || 0}.`); | ||
| 1724 | + root.innerHTML = tips.map(text => `<div class="tip">${text}</div>`).join(''); | ||
| 1565 | } | 1725 | } |
| 1566 | async function loadQueries() { | 1726 | async function loadQueries() { |
| 1567 | const data = await fetchJSON('/api/queries'); | 1727 | const data = await fetchJSON('/api/queries'); |
| @@ -1592,11 +1752,13 @@ WEB_APP_HTML = """ | @@ -1592,11 +1752,13 @@ WEB_APP_HTML = """ | ||
| 1592 | const data = await fetchJSON('/api/search-eval', { | 1752 | const data = await fetchJSON('/api/search-eval', { |
| 1593 | method: 'POST', | 1753 | method: 'POST', |
| 1594 | headers: {'Content-Type': 'application/json'}, | 1754 | headers: {'Content-Type': 'application/json'}, |
| 1595 | - body: JSON.stringify({query, top_k: 100, auto_annotate: true}) | 1755 | + body: JSON.stringify({query, top_k: 100, auto_annotate: false}) |
| 1596 | }); | 1756 | }); |
| 1597 | document.getElementById('status').textContent = `Done. total=${data.total}`; | 1757 | document.getElementById('status').textContent = `Done. total=${data.total}`; |
| 1598 | renderMetrics(data.metrics); | 1758 | renderMetrics(data.metrics); |
| 1599 | - renderResults(data.results); | 1759 | + renderResults(data.results, 'results', true); |
| 1760 | + renderResults(data.missing_relevant, 'missingRelevant', false); | ||
| 1761 | + renderTips(data); | ||
| 1600 | loadHistory(); | 1762 | loadHistory(); |
| 1601 | } | 1763 | } |
| 1602 | async function runBatch() { | 1764 | async function runBatch() { |
| @@ -1604,11 +1766,13 @@ WEB_APP_HTML = """ | @@ -1604,11 +1766,13 @@ WEB_APP_HTML = """ | ||
| 1604 | const data = await fetchJSON('/api/batch-eval', { | 1766 | const data = await fetchJSON('/api/batch-eval', { |
| 1605 | method: 'POST', | 1767 | method: 'POST', |
| 1606 | headers: {'Content-Type': 'application/json'}, | 1768 | headers: {'Content-Type': 'application/json'}, |
| 1607 | - body: JSON.stringify({top_k: 100, auto_annotate: true}) | 1769 | + body: JSON.stringify({top_k: 100, auto_annotate: false}) |
| 1608 | }); | 1770 | }); |
| 1609 | document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`; | 1771 | document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`; |
| 1610 | renderMetrics(data.aggregate_metrics); | 1772 | renderMetrics(data.aggregate_metrics); |
| 1611 | - renderResults([]); | 1773 | + renderResults([], 'results', true); |
| 1774 | + renderResults([], 'missingRelevant', false); | ||
| 1775 | + document.getElementById('tips').innerHTML = '<div class="tip">Batch evaluation uses cached labels only unless force refresh is requested via CLI/API.</div>'; | ||
| 1612 | loadHistory(); | 1776 | loadHistory(); |
| 1613 | } | 1777 | } |
| 1614 | loadQueries(); | 1778 | loadQueries(); |
| @@ -1633,6 +1797,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -1633,6 +1797,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 1633 | build.add_argument("--language", default="en") | 1797 | build.add_argument("--language", default="en") |
| 1634 | build.add_argument("--force-refresh-rerank", action="store_true") | 1798 | build.add_argument("--force-refresh-rerank", action="store_true") |
| 1635 | build.add_argument("--force-refresh-labels", action="store_true") | 1799 | build.add_argument("--force-refresh-labels", action="store_true") |
| 1800 | + build.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | ||
| 1636 | 1801 | ||
| 1637 | batch = sub.add_parser("batch", help="Run batch evaluation against live search") | 1802 | batch = sub.add_parser("batch", help="Run batch evaluation against live search") |
| 1638 | batch.add_argument("--tenant-id", default="163") | 1803 | batch.add_argument("--tenant-id", default="163") |
| @@ -1640,6 +1805,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -1640,6 +1805,7 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 1640 | batch.add_argument("--top-k", type=int, default=100) | 1805 | batch.add_argument("--top-k", type=int, default=100) |
| 1641 | batch.add_argument("--language", default="en") | 1806 | batch.add_argument("--language", default="en") |
| 1642 | batch.add_argument("--force-refresh-labels", action="store_true") | 1807 | batch.add_argument("--force-refresh-labels", action="store_true") |
| 1808 | + batch.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | ||
| 1643 | 1809 | ||
| 1644 | audit = sub.add_parser("audit", help="Audit annotation quality for queries") | 1810 | audit = sub.add_parser("audit", help="Audit annotation quality for queries") |
| 1645 | audit.add_argument("--tenant-id", default="163") | 1811 | audit.add_argument("--tenant-id", default="163") |
| @@ -1648,18 +1814,20 @@ def build_cli_parser() -> argparse.ArgumentParser: | @@ -1648,18 +1814,20 @@ def build_cli_parser() -> argparse.ArgumentParser: | ||
| 1648 | audit.add_argument("--language", default="en") | 1814 | audit.add_argument("--language", default="en") |
| 1649 | audit.add_argument("--limit-suspicious", type=int, default=5) | 1815 | audit.add_argument("--limit-suspicious", type=int, default=5) |
| 1650 | audit.add_argument("--force-refresh-labels", action="store_true") | 1816 | audit.add_argument("--force-refresh-labels", action="store_true") |
| 1817 | + audit.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | ||
| 1651 | 1818 | ||
| 1652 | serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") | 1819 | serve = sub.add_parser("serve", help="Serve evaluation web UI on port 6010") |
| 1653 | serve.add_argument("--tenant-id", default="163") | 1820 | serve.add_argument("--tenant-id", default="163") |
| 1654 | serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) | 1821 | serve.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE)) |
| 1655 | serve.add_argument("--host", default="0.0.0.0") | 1822 | serve.add_argument("--host", default="0.0.0.0") |
| 1656 | serve.add_argument("--port", type=int, default=6010) | 1823 | serve.add_argument("--port", type=int, default=6010) |
| 1824 | + serve.add_argument("--labeler-mode", default=DEFAULT_LABELER_MODE, choices=["simple", "complex"]) | ||
| 1657 | 1825 | ||
| 1658 | return parser | 1826 | return parser |
| 1659 | 1827 | ||
| 1660 | 1828 | ||
| 1661 | def run_build(args: argparse.Namespace) -> None: | 1829 | def run_build(args: argparse.Namespace) -> None: |
| 1662 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) | 1830 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) |
| 1663 | queries = framework.queries_from_file(Path(args.queries_file)) | 1831 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 1664 | summary = [] | 1832 | summary = [] |
| 1665 | for query in queries: | 1833 | for query in queries: |
| @@ -1694,7 +1862,7 @@ def run_build(args: argparse.Namespace) -> None: | @@ -1694,7 +1862,7 @@ def run_build(args: argparse.Namespace) -> None: | ||
| 1694 | 1862 | ||
| 1695 | 1863 | ||
| 1696 | def run_batch(args: argparse.Namespace) -> None: | 1864 | def run_batch(args: argparse.Namespace) -> None: |
| 1697 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) | 1865 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) |
| 1698 | queries = framework.queries_from_file(Path(args.queries_file)) | 1866 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 1699 | payload = framework.batch_evaluate( | 1867 | payload = framework.batch_evaluate( |
| 1700 | queries=queries, | 1868 | queries=queries, |
| @@ -1707,7 +1875,7 @@ def run_batch(args: argparse.Namespace) -> None: | @@ -1707,7 +1875,7 @@ def run_batch(args: argparse.Namespace) -> None: | ||
| 1707 | 1875 | ||
| 1708 | 1876 | ||
| 1709 | def run_audit(args: argparse.Namespace) -> None: | 1877 | def run_audit(args: argparse.Namespace) -> None: |
| 1710 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) | 1878 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) |
| 1711 | queries = framework.queries_from_file(Path(args.queries_file)) | 1879 | queries = framework.queries_from_file(Path(args.queries_file)) |
| 1712 | audit_items = [] | 1880 | audit_items = [] |
| 1713 | for query in queries: | 1881 | for query in queries: |
| @@ -1757,7 +1925,7 @@ def run_audit(args: argparse.Namespace) -> None: | @@ -1757,7 +1925,7 @@ def run_audit(args: argparse.Namespace) -> None: | ||
| 1757 | 1925 | ||
| 1758 | 1926 | ||
| 1759 | def run_serve(args: argparse.Namespace) -> None: | 1927 | def run_serve(args: argparse.Namespace) -> None: |
| 1760 | - framework = SearchEvaluationFramework(tenant_id=args.tenant_id) | 1928 | + framework = SearchEvaluationFramework(tenant_id=args.tenant_id, labeler_mode=args.labeler_mode) |
| 1761 | app = create_web_app(framework, Path(args.queries_file)) | 1929 | app = create_web_app(framework, Path(args.queries_file)) |
| 1762 | import uvicorn | 1930 | import uvicorn |
| 1763 | 1931 |
scripts/evaluation/queries/queries.txt
| @@ -30,7 +30,6 @@ khaki green backpack | @@ -30,7 +30,6 @@ khaki green backpack | ||
| 30 | 高跟鞋 | 30 | 高跟鞋 |
| 31 | 图案连身衣 | 31 | 图案连身衣 |
| 32 | 天鹅绒鸡尾酒会礼服 | 32 | 天鹅绒鸡尾酒会礼服 |
| 33 | -Wearing small clothes | ||
| 34 | gingham dress | 33 | gingham dress |
| 35 | 海滩度假装 | 34 | 海滩度假装 |
| 36 | vacation outfits | 35 | vacation outfits |
| @@ -41,10 +40,15 @@ hiking boots | @@ -41,10 +40,15 @@ hiking boots | ||
| 41 | business casual women | 40 | business casual women |
| 42 | a-line dress | 41 | a-line dress |
| 43 | 涤纶短裤 | 42 | 涤纶短裤 |
| 44 | -哺乳文胸 | ||
| 45 | Compression Top Spandex | 43 | Compression Top Spandex |
| 46 | skiing trip insulated base layer | 44 | skiing trip insulated base layer |
| 47 | high waisted jeans | 45 | high waisted jeans |
| 48 | 无袖夏装 | 46 | 无袖夏装 |
| 49 | 雪纺衬衫 | 47 | 雪纺衬衫 |
| 50 | -convertible zip-off hiking pants | ||
| 51 | \ No newline at end of file | 48 | \ No newline at end of file |
| 49 | +convertible zip-off hiking pants | ||
| 50 | +petite summer linen shorts | ||
| 51 | +tall slim fit men's linen shirt | ||
| 52 | +tall slim fit trousers | ||
| 53 | +tall straight leg pants | ||
| 54 | +tassel maxi skirt | ||
| 55 | +teacher clothes | ||
| 52 | \ No newline at end of file | 56 | \ No newline at end of file |
| @@ -0,0 +1,39 @@ | @@ -0,0 +1,39 @@ | ||
| 1 | +#!/usr/bin/env bash | ||
| 2 | +# Search evaluation quick entrypoints. Run from any cwd; resolves repo root. | ||
| 3 | +set -euo pipefail | ||
| 4 | + | ||
| 5 | +ROOT="$(cd "$(dirname "$0")/../.." && pwd)" | ||
| 6 | +cd "$ROOT" | ||
| 7 | +PY="${ROOT}/.venv/bin/python" | ||
| 8 | +TENANT_ID="${TENANT_ID:-163}" | ||
| 9 | +QUERIES="${REPO_EVAL_QUERIES:-scripts/evaluation/queries/queries.txt}" | ||
| 10 | + | ||
| 11 | +usage() { | ||
| 12 | + echo "Usage: $0 batch|serve" | ||
| 13 | + echo " batch — refresh labels + batch metrics (default: top_k=50, simple labeler, force-refresh)" | ||
| 14 | + echo " serve — eval UI on http://127.0.0.1:6010/" | ||
| 15 | + echo "Env: TENANT_ID (default 163), REPO_EVAL_QUERIES (default $QUERIES)" | ||
| 16 | +} | ||
| 17 | + | ||
| 18 | +case "${1:-}" in | ||
| 19 | + batch) | ||
| 20 | + exec "$PY" scripts/evaluation/build_annotation_set.py batch \ | ||
| 21 | + --tenant-id "$TENANT_ID" \ | ||
| 22 | + --queries-file "$QUERIES" \ | ||
| 23 | + --top-k 50 \ | ||
| 24 | + --language en \ | ||
| 25 | + --labeler-mode simple \ | ||
| 26 | + --force-refresh-labels | ||
| 27 | + ;; | ||
| 28 | + serve) | ||
| 29 | + exec "$PY" scripts/evaluation/serve_eval_web.py serve \ | ||
| 30 | + --tenant-id "$TENANT_ID" \ | ||
| 31 | + --queries-file "$QUERIES" \ | ||
| 32 | + --host 127.0.0.1 \ | ||
| 33 | + --port 6010 | ||
| 34 | + ;; | ||
| 35 | + *) | ||
| 36 | + usage | ||
| 37 | + exit 1 | ||
| 38 | + ;; | ||
| 39 | +esac |