Commit 432d1c88efc7d32fd4ed509b3e6ad6d80acf4342
1 parent
267920e5
评估框架
Showing
7 changed files
with
2367 additions
and
348 deletions
Show diff stats
config/config.yaml
| 1 | -# Unified Configuration for Multi-Tenant Search Engine | |
| 2 | -# 统一配置文件,所有租户共用一套配置 | |
| 3 | -# 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为 | |
| 4 | -# | |
| 5 | -# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项 | |
| 6 | -#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。 | |
| 7 | - | |
| 8 | -# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义) | |
| 9 | 1 | runtime: |
| 10 | - environment: "prod" | |
| 11 | - index_namespace: "" | |
| 12 | - api_host: "0.0.0.0" | |
| 2 | + environment: prod | |
| 3 | + index_namespace: '' | |
| 4 | + api_host: 0.0.0.0 | |
| 13 | 5 | api_port: 6002 |
| 14 | - indexer_host: "0.0.0.0" | |
| 6 | + indexer_host: 0.0.0.0 | |
| 15 | 7 | indexer_port: 6004 |
| 16 | - embedding_host: "0.0.0.0" | |
| 8 | + embedding_host: 0.0.0.0 | |
| 17 | 9 | embedding_port: 6005 |
| 18 | 10 | embedding_text_port: 6005 |
| 19 | 11 | embedding_image_port: 6008 |
| 20 | - translator_host: "0.0.0.0" | |
| 12 | + translator_host: 0.0.0.0 | |
| 21 | 13 | translator_port: 6006 |
| 22 | - reranker_host: "0.0.0.0" | |
| 14 | + reranker_host: 0.0.0.0 | |
| 23 | 15 | reranker_port: 6007 |
| 24 | - | |
| 25 | -# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY) | |
| 26 | 16 | infrastructure: |
| 27 | 17 | elasticsearch: |
| 28 | - host: "http://localhost:9200" | |
| 18 | + host: http://localhost:9200 | |
| 29 | 19 | username: null |
| 30 | 20 | password: null |
| 31 | 21 | redis: |
| 32 | - host: "localhost" | |
| 22 | + host: localhost | |
| 33 | 23 | port: 6479 |
| 34 | 24 | snapshot_db: 0 |
| 35 | 25 | password: null |
| ... | ... | @@ -37,8 +27,8 @@ infrastructure: |
| 37 | 27 | socket_connect_timeout: 1 |
| 38 | 28 | retry_on_timeout: false |
| 39 | 29 | cache_expire_days: 720 |
| 40 | - embedding_cache_prefix: "embedding" | |
| 41 | - anchor_cache_prefix: "product_anchors" | |
| 30 | + embedding_cache_prefix: embedding | |
| 31 | + anchor_cache_prefix: product_anchors | |
| 42 | 32 | anchor_cache_expire_days: 30 |
| 43 | 33 | database: |
| 44 | 34 | host: null |
| ... | ... | @@ -49,30 +39,16 @@ infrastructure: |
| 49 | 39 | secrets: |
| 50 | 40 | dashscope_api_key: null |
| 51 | 41 | deepl_auth_key: null |
| 52 | - | |
| 53 | -# Elasticsearch Index | |
| 54 | -es_index_name: "search_products" | |
| 55 | - | |
| 56 | -# 检索域 / 索引列表(可为空列表;每项字段均需显式给出) | |
| 42 | +es_index_name: search_products | |
| 57 | 43 | indexes: [] |
| 58 | - | |
| 59 | -# Config assets | |
| 60 | 44 | assets: |
| 61 | - query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict" | |
| 62 | - | |
| 63 | -# Product content understanding (LLM enrich-content) configuration | |
| 45 | + query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict | |
| 64 | 46 | product_enrich: |
| 65 | 47 | max_workers: 40 |
| 66 | - | |
| 67 | -# ES Index Settings (基础设置) | |
| 68 | 48 | es_settings: |
| 69 | 49 | number_of_shards: 1 |
| 70 | 50 | number_of_replicas: 0 |
| 71 | - refresh_interval: "30s" | |
| 72 | - | |
| 73 | -# 字段权重配置(用于搜索时的字段boost) | |
| 74 | -# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。 | |
| 75 | -# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。 | |
| 51 | + refresh_interval: 30s | |
| 76 | 52 | field_boosts: |
| 77 | 53 | title: 3.0 |
| 78 | 54 | qanchors: 2.5 |
| ... | ... | @@ -85,79 +61,64 @@ field_boosts: |
| 85 | 61 | option1_values: 1.5 |
| 86 | 62 | option2_values: 1.5 |
| 87 | 63 | option3_values: 1.5 |
| 88 | - | |
| 89 | -# Query Configuration(查询配置) | |
| 90 | 64 | query_config: |
| 91 | - # 支持的语言 | |
| 92 | 65 | supported_languages: |
| 93 | - - "zh" | |
| 94 | - - "en" | |
| 95 | - default_language: "en" | |
| 96 | - | |
| 97 | - # 功能开关(翻译开关由tenant_config控制) | |
| 66 | + - zh | |
| 67 | + - en | |
| 68 | + default_language: en | |
| 98 | 69 | enable_text_embedding: true |
| 99 | 70 | enable_query_rewrite: true |
| 100 | - | |
| 101 | - # 查询翻译模型(须与 services.translation.capabilities 中某项一致) | |
| 102 | - # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。 | |
| 103 | - zh_to_en_model: "nllb-200-distilled-600m" # "opus-mt-zh-en" | |
| 104 | - en_to_zh_model: "nllb-200-distilled-600m" # "opus-mt-en-zh" | |
| 105 | - default_translation_model: "nllb-200-distilled-600m" | |
| 106 | - # zh_to_en_model: "deepl" | |
| 107 | - # en_to_zh_model: "deepl" | |
| 108 | - # default_translation_model: "deepl" | |
| 109 | - # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同) | |
| 110 | - zh_to_en_model__source_not_in_index: "nllb-200-distilled-600m" | |
| 111 | - en_to_zh_model__source_not_in_index: "nllb-200-distilled-600m" | |
| 112 | - default_translation_model__source_not_in_index: "nllb-200-distilled-600m" | |
| 113 | - # zh_to_en_model__source_not_in_index: "deepl" | |
| 114 | - # en_to_zh_model__source_not_in_index: "deepl" | |
| 115 | - # default_translation_model__source_not_in_index: "deepl" | |
| 116 | - | |
| 117 | - # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。 | |
| 118 | - # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。 | |
| 119 | - translation_embedding_wait_budget_ms_source_in_index: 200 # 80 | |
| 120 | - translation_embedding_wait_budget_ms_source_not_in_index: 300 #200 | |
| 121 | - | |
| 71 | + zh_to_en_model: nllb-200-distilled-600m | |
| 72 | + en_to_zh_model: nllb-200-distilled-600m | |
| 73 | + default_translation_model: nllb-200-distilled-600m | |
| 74 | + zh_to_en_model__source_not_in_index: nllb-200-distilled-600m | |
| 75 | + en_to_zh_model__source_not_in_index: nllb-200-distilled-600m | |
| 76 | + default_translation_model__source_not_in_index: nllb-200-distilled-600m | |
| 77 | + translation_embedding_wait_budget_ms_source_in_index: 200 | |
| 78 | + translation_embedding_wait_budget_ms_source_not_in_index: 300 | |
| 122 | 79 | style_intent: |
| 123 | 80 | enabled: true |
| 124 | 81 | selected_sku_boost: 1.2 |
| 125 | - color_dictionary_path: "config/dictionaries/style_intent_color.csv" | |
| 126 | - size_dictionary_path: "config/dictionaries/style_intent_size.csv" | |
| 82 | + color_dictionary_path: config/dictionaries/style_intent_color.csv | |
| 83 | + size_dictionary_path: config/dictionaries/style_intent_size.csv | |
| 127 | 84 | dimension_aliases: |
| 128 | - color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"] | |
| 129 | - size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"] | |
| 130 | - | |
| 85 | + color: | |
| 86 | + - color | |
| 87 | + - colors | |
| 88 | + - colour | |
| 89 | + - colours | |
| 90 | + - 颜色 | |
| 91 | + - 色 | |
| 92 | + - 色系 | |
| 93 | + size: | |
| 94 | + - size | |
| 95 | + - sizes | |
| 96 | + - sizing | |
| 97 | + - 尺码 | |
| 98 | + - 尺寸 | |
| 99 | + - 码数 | |
| 100 | + - 号码 | |
| 101 | + - 码 | |
| 131 | 102 | product_title_exclusion: |
| 132 | 103 | enabled: true |
| 133 | - dictionary_path: "config/dictionaries/product_title_exclusion.tsv" | |
| 134 | - | |
| 135 | - # 动态多语言检索字段配置 | |
| 136 | - # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式; | |
| 137 | - # shared_fields 为无语言后缀字段。 | |
| 104 | + dictionary_path: config/dictionaries/product_title_exclusion.tsv | |
| 138 | 105 | search_fields: |
| 139 | 106 | multilingual_fields: |
| 140 | - - "title" | |
| 141 | - - "qanchors" | |
| 142 | - - "category_path" | |
| 143 | - - "category_name_text" | |
| 144 | - - "brief" | |
| 145 | - - "description" | |
| 146 | - - "vendor" | |
| 147 | - shared_fields: | |
| 148 | - # - "tags" | |
| 149 | - # - "option1_values" | |
| 150 | - # - "option2_values" | |
| 151 | - # - "option3_values" | |
| 107 | + - title | |
| 108 | + - qanchors | |
| 109 | + - category_path | |
| 110 | + - category_name_text | |
| 111 | + - brief | |
| 112 | + - description | |
| 113 | + - vendor | |
| 114 | + shared_fields: null | |
| 152 | 115 | core_multilingual_fields: |
| 153 | - - "title" | |
| 154 | - - "qanchors" | |
| 155 | - - "category_name_text" | |
| 156 | - | |
| 157 | - # 统一文本召回策略(主查询 + 翻译查询) | |
| 116 | + - title | |
| 117 | + - qanchors | |
| 118 | + - category_name_text | |
| 158 | 119 | text_query_strategy: |
| 159 | - base_minimum_should_match: "60%" | |
| 160 | - translation_minimum_should_match: "60%" | |
| 120 | + base_minimum_should_match: 60% | |
| 121 | + translation_minimum_should_match: 60% | |
| 161 | 122 | translation_boost: 0.75 |
| 162 | 123 | tie_breaker_base_query: 0.5 |
| 163 | 124 | best_fields_boost: 2.0 |
| ... | ... | @@ -169,67 +130,51 @@ query_config: |
| 169 | 130 | title: 5.0 |
| 170 | 131 | qanchors: 4.0 |
| 171 | 132 | phrase_match_boost: 3.0 |
| 172 | - | |
| 173 | - # Embedding字段名称 | |
| 174 | - text_embedding_field: "title_embedding" | |
| 175 | - image_embedding_field: "image_embedding.vector" | |
| 176 | - | |
| 177 | - # 返回字段配置(_source includes) | |
| 178 | - # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段 | |
| 179 | - # 下列字段与 api/result_formatter.py(SpuResult 填充)及 search/searcher.py(SKU 排序/主图替换)一致 | |
| 133 | + text_embedding_field: title_embedding | |
| 134 | + image_embedding_field: image_embedding.vector | |
| 180 | 135 | source_fields: |
| 181 | - - spu_id | |
| 182 | - - handle | |
| 183 | - - title | |
| 184 | - - brief | |
| 185 | - - description | |
| 186 | - - vendor | |
| 187 | - - category_name | |
| 188 | - - category_name_text | |
| 189 | - - category_path | |
| 190 | - - category_id | |
| 191 | - - category_level | |
| 192 | - - category1_name | |
| 193 | - - category2_name | |
| 194 | - - category3_name | |
| 195 | - - tags | |
| 196 | - - min_price | |
| 197 | - - compare_at_price | |
| 198 | - - image_url | |
| 199 | - - sku_prices | |
| 200 | - - sku_weights | |
| 201 | - - sku_weight_units | |
| 202 | - - total_inventory | |
| 203 | - - option1_name | |
| 204 | - - option1_values | |
| 205 | - - option2_name | |
| 206 | - - option2_values | |
| 207 | - - option3_name | |
| 208 | - - option3_values | |
| 209 | - - specifications | |
| 210 | - - skus | |
| 211 | - | |
| 212 | - # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates) | |
| 136 | + - spu_id | |
| 137 | + - handle | |
| 138 | + - title | |
| 139 | + - brief | |
| 140 | + - description | |
| 141 | + - vendor | |
| 142 | + - category_name | |
| 143 | + - category_name_text | |
| 144 | + - category_path | |
| 145 | + - category_id | |
| 146 | + - category_level | |
| 147 | + - category1_name | |
| 148 | + - category2_name | |
| 149 | + - category3_name | |
| 150 | + - tags | |
| 151 | + - min_price | |
| 152 | + - compare_at_price | |
| 153 | + - image_url | |
| 154 | + - sku_prices | |
| 155 | + - sku_weights | |
| 156 | + - sku_weight_units | |
| 157 | + - total_inventory | |
| 158 | + - option1_name | |
| 159 | + - option1_values | |
| 160 | + - option2_name | |
| 161 | + - option2_values | |
| 162 | + - option3_name | |
| 163 | + - option3_values | |
| 164 | + - specifications | |
| 165 | + - skus | |
| 213 | 166 | knn_text_boost: 4 |
| 214 | 167 | knn_image_boost: 4 |
| 215 | - | |
| 216 | - # knn_text_num_candidates = k * 3.4 | |
| 217 | 168 | knn_text_k: 160 |
| 218 | 169 | knn_text_num_candidates: 560 |
| 219 | - | |
| 220 | 170 | knn_text_k_long: 400 |
| 221 | 171 | knn_text_num_candidates_long: 1200 |
| 222 | - | |
| 223 | 172 | knn_image_k: 400 |
| 224 | 173 | knn_image_num_candidates: 1200 |
| 225 | - | |
| 226 | -# Function Score配置(ES层打分规则) | |
| 227 | 174 | function_score: |
| 228 | - score_mode: "sum" | |
| 229 | - boost_mode: "multiply" | |
| 175 | + score_mode: sum | |
| 176 | + boost_mode: multiply | |
| 230 | 177 | functions: [] |
| 231 | - | |
| 232 | -# 粗排配置(仅融合 ES 文本/向量信号,不调用模型) | |
| 233 | 178 | coarse_rank: |
| 234 | 179 | enabled: true |
| 235 | 180 | input_window: 700 |
| ... | ... | @@ -237,69 +182,52 @@ coarse_rank: |
| 237 | 182 | fusion: |
| 238 | 183 | text_bias: 0.1 |
| 239 | 184 | text_exponent: 0.35 |
| 240 | - # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合) | |
| 241 | - # 因为es的打分已经给了trans进行了折扣,所以这里不再继续折扣 | |
| 242 | 185 | text_translation_weight: 1.0 |
| 243 | 186 | knn_text_weight: 1.0 |
| 244 | 187 | knn_image_weight: 1.0 |
| 245 | 188 | knn_tie_breaker: 0.1 |
| 246 | 189 | knn_bias: 0.6 |
| 247 | 190 | knn_exponent: 0.0 |
| 248 | - | |
| 249 | -# 精排配置(轻量 reranker) | |
| 250 | 191 | fine_rank: |
| 251 | 192 | enabled: false |
| 252 | 193 | input_window: 160 |
| 253 | 194 | output_window: 80 |
| 254 | 195 | timeout_sec: 10.0 |
| 255 | - rerank_query_template: "{query}" | |
| 256 | - rerank_doc_template: "{title}" | |
| 257 | - service_profile: "fine" | |
| 258 | - | |
| 259 | -# 重排配置(provider/URL 在 services.rerank) | |
| 196 | + rerank_query_template: '{query}' | |
| 197 | + rerank_doc_template: '{title}' | |
| 198 | + service_profile: fine | |
| 260 | 199 | rerank: |
| 261 | 200 | enabled: true |
| 262 | 201 | rerank_window: 160 |
| 263 | 202 | timeout_sec: 15.0 |
| 264 | 203 | weight_es: 0.4 |
| 265 | 204 | weight_ai: 0.6 |
| 266 | - rerank_query_template: "{query}" | |
| 267 | - rerank_doc_template: "{title}" | |
| 268 | - service_profile: "default" | |
| 269 | - # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项) | |
| 270 | - # 其中 knn_score 先做一层 dis_max: | |
| 271 | - # max(knn_text_weight * text_knn, knn_image_weight * image_knn) | |
| 272 | - # + knn_tie_breaker * 另一侧较弱信号 | |
| 205 | + rerank_query_template: '{query}' | |
| 206 | + rerank_doc_template: '{title}' | |
| 207 | + service_profile: default | |
| 273 | 208 | fusion: |
| 274 | - rerank_bias: 0.00001 | |
| 275 | - rerank_exponent: 1.0 | |
| 276 | - fine_bias: 0.00001 | |
| 209 | + rerank_bias: 1.0e-05 | |
| 210 | + rerank_exponent: 1.15 | |
| 211 | + fine_bias: 1.0e-05 | |
| 277 | 212 | fine_exponent: 1.0 |
| 278 | 213 | text_bias: 0.1 |
| 279 | - text_exponent: 0.35 | |
| 280 | - # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合) | |
| 281 | - text_translation_weight: 1.0 | |
| 214 | + text_exponent: 0.25 | |
| 215 | + text_translation_weight: 0.8 | |
| 282 | 216 | knn_text_weight: 1.0 |
| 283 | 217 | knn_image_weight: 1.0 |
| 284 | 218 | knn_tie_breaker: 0.1 |
| 285 | 219 | knn_bias: 0.6 |
| 286 | 220 | knn_exponent: 0.0 |
| 287 | - | |
| 288 | -# 可扩展服务/provider 注册表(单一配置源) | |
| 289 | 221 | services: |
| 290 | 222 | translation: |
| 291 | - service_url: "http://127.0.0.1:6006" | |
| 292 | - # default_model: "nllb-200-distilled-600m" | |
| 293 | - default_model: "nllb-200-distilled-600m" | |
| 294 | - default_scene: "general" | |
| 223 | + service_url: http://127.0.0.1:6006 | |
| 224 | + default_model: nllb-200-distilled-600m | |
| 225 | + default_scene: general | |
| 295 | 226 | timeout_sec: 10.0 |
| 296 | 227 | cache: |
| 297 | 228 | ttl_seconds: 62208000 |
| 298 | 229 | sliding_expiration: true |
| 299 | - # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups). | |
| 300 | 230 | enable_model_quality_tier_cache: true |
| 301 | - # Higher tier = better quality. Multiple models may share one tier (同级). | |
| 302 | - # A request may reuse Redis keys from models with tier > A or tier == A (not from lower tiers). | |
| 303 | 231 | model_quality_tiers: |
| 304 | 232 | deepl: 30 |
| 305 | 233 | qwen-mt: 30 |
| ... | ... | @@ -310,43 +238,43 @@ services: |
| 310 | 238 | capabilities: |
| 311 | 239 | qwen-mt: |
| 312 | 240 | enabled: true |
| 313 | - backend: "qwen_mt" | |
| 314 | - model: "qwen-mt-flash" | |
| 315 | - base_url: "https://dashscope-us.aliyuncs.com/compatible-mode/v1" | |
| 241 | + backend: qwen_mt | |
| 242 | + model: qwen-mt-flash | |
| 243 | + base_url: https://dashscope-us.aliyuncs.com/compatible-mode/v1 | |
| 316 | 244 | timeout_sec: 10.0 |
| 317 | 245 | use_cache: true |
| 318 | 246 | llm: |
| 319 | 247 | enabled: true |
| 320 | - backend: "llm" | |
| 321 | - model: "qwen-flash" | |
| 322 | - base_url: "https://dashscope-us.aliyuncs.com/compatible-mode/v1" | |
| 248 | + backend: llm | |
| 249 | + model: qwen-flash | |
| 250 | + base_url: https://dashscope-us.aliyuncs.com/compatible-mode/v1 | |
| 323 | 251 | timeout_sec: 30.0 |
| 324 | 252 | use_cache: true |
| 325 | 253 | deepl: |
| 326 | 254 | enabled: true |
| 327 | - backend: "deepl" | |
| 328 | - api_url: "https://api.deepl.com/v2/translate" | |
| 255 | + backend: deepl | |
| 256 | + api_url: https://api.deepl.com/v2/translate | |
| 329 | 257 | timeout_sec: 10.0 |
| 330 | - glossary_id: "" | |
| 258 | + glossary_id: '' | |
| 331 | 259 | use_cache: true |
| 332 | 260 | nllb-200-distilled-600m: |
| 333 | 261 | enabled: true |
| 334 | - backend: "local_nllb" | |
| 335 | - model_id: "facebook/nllb-200-distilled-600M" | |
| 336 | - model_dir: "./models/translation/facebook/nllb-200-distilled-600M" | |
| 337 | - ct2_model_dir: "./models/translation/facebook/nllb-200-distilled-600M/ctranslate2-float16" | |
| 338 | - ct2_compute_type: "float16" | |
| 339 | - ct2_conversion_quantization: "float16" | |
| 262 | + backend: local_nllb | |
| 263 | + model_id: facebook/nllb-200-distilled-600M | |
| 264 | + model_dir: ./models/translation/facebook/nllb-200-distilled-600M | |
| 265 | + ct2_model_dir: ./models/translation/facebook/nllb-200-distilled-600M/ctranslate2-float16 | |
| 266 | + ct2_compute_type: float16 | |
| 267 | + ct2_conversion_quantization: float16 | |
| 340 | 268 | ct2_auto_convert: true |
| 341 | 269 | ct2_inter_threads: 4 |
| 342 | 270 | ct2_intra_threads: 0 |
| 343 | 271 | ct2_max_queued_batches: 32 |
| 344 | - ct2_batch_type: "examples" | |
| 345 | - ct2_decoding_length_mode: "source" | |
| 272 | + ct2_batch_type: examples | |
| 273 | + ct2_decoding_length_mode: source | |
| 346 | 274 | ct2_decoding_length_extra: 8 |
| 347 | 275 | ct2_decoding_length_min: 32 |
| 348 | - device: "cuda" | |
| 349 | - torch_dtype: "float16" | |
| 276 | + device: cuda | |
| 277 | + torch_dtype: float16 | |
| 350 | 278 | batch_size: 64 |
| 351 | 279 | max_input_length: 256 |
| 352 | 280 | max_new_tokens: 64 |
| ... | ... | @@ -354,19 +282,19 @@ services: |
| 354 | 282 | use_cache: true |
| 355 | 283 | opus-mt-zh-en: |
| 356 | 284 | enabled: false |
| 357 | - backend: "local_marian" | |
| 358 | - model_id: "Helsinki-NLP/opus-mt-zh-en" | |
| 359 | - model_dir: "./models/translation/Helsinki-NLP/opus-mt-zh-en" | |
| 360 | - ct2_model_dir: "./models/translation/Helsinki-NLP/opus-mt-zh-en/ctranslate2-float16" | |
| 361 | - ct2_compute_type: "float16" | |
| 362 | - ct2_conversion_quantization: "float16" | |
| 285 | + backend: local_marian | |
| 286 | + model_id: Helsinki-NLP/opus-mt-zh-en | |
| 287 | + model_dir: ./models/translation/Helsinki-NLP/opus-mt-zh-en | |
| 288 | + ct2_model_dir: ./models/translation/Helsinki-NLP/opus-mt-zh-en/ctranslate2-float16 | |
| 289 | + ct2_compute_type: float16 | |
| 290 | + ct2_conversion_quantization: float16 | |
| 363 | 291 | ct2_auto_convert: true |
| 364 | 292 | ct2_inter_threads: 1 |
| 365 | 293 | ct2_intra_threads: 0 |
| 366 | 294 | ct2_max_queued_batches: 0 |
| 367 | - ct2_batch_type: "examples" | |
| 368 | - device: "cuda" | |
| 369 | - torch_dtype: "float16" | |
| 295 | + ct2_batch_type: examples | |
| 296 | + device: cuda | |
| 297 | + torch_dtype: float16 | |
| 370 | 298 | batch_size: 16 |
| 371 | 299 | max_input_length: 256 |
| 372 | 300 | max_new_tokens: 256 |
| ... | ... | @@ -374,181 +302,147 @@ services: |
| 374 | 302 | use_cache: true |
| 375 | 303 | opus-mt-en-zh: |
| 376 | 304 | enabled: false |
| 377 | - backend: "local_marian" | |
| 378 | - model_id: "Helsinki-NLP/opus-mt-en-zh" | |
| 379 | - model_dir: "./models/translation/Helsinki-NLP/opus-mt-en-zh" | |
| 380 | - ct2_model_dir: "./models/translation/Helsinki-NLP/opus-mt-en-zh/ctranslate2-float16" | |
| 381 | - ct2_compute_type: "float16" | |
| 382 | - ct2_conversion_quantization: "float16" | |
| 305 | + backend: local_marian | |
| 306 | + model_id: Helsinki-NLP/opus-mt-en-zh | |
| 307 | + model_dir: ./models/translation/Helsinki-NLP/opus-mt-en-zh | |
| 308 | + ct2_model_dir: ./models/translation/Helsinki-NLP/opus-mt-en-zh/ctranslate2-float16 | |
| 309 | + ct2_compute_type: float16 | |
| 310 | + ct2_conversion_quantization: float16 | |
| 383 | 311 | ct2_auto_convert: true |
| 384 | 312 | ct2_inter_threads: 1 |
| 385 | 313 | ct2_intra_threads: 0 |
| 386 | 314 | ct2_max_queued_batches: 0 |
| 387 | - ct2_batch_type: "examples" | |
| 388 | - device: "cuda" | |
| 389 | - torch_dtype: "float16" | |
| 315 | + ct2_batch_type: examples | |
| 316 | + device: cuda | |
| 317 | + torch_dtype: float16 | |
| 390 | 318 | batch_size: 16 |
| 391 | 319 | max_input_length: 256 |
| 392 | 320 | max_new_tokens: 256 |
| 393 | 321 | num_beams: 1 |
| 394 | 322 | use_cache: true |
| 395 | 323 | embedding: |
| 396 | - provider: "http" # http | |
| 324 | + provider: http | |
| 397 | 325 | providers: |
| 398 | 326 | http: |
| 399 | - text_base_url: "http://127.0.0.1:6005" | |
| 400 | - image_base_url: "http://127.0.0.1:6008" | |
| 401 | - # 服务内文本后端(embedding 进程启动时读取) | |
| 402 | - backend: "tei" # tei | local_st | |
| 327 | + text_base_url: http://127.0.0.1:6005 | |
| 328 | + image_base_url: http://127.0.0.1:6008 | |
| 329 | + backend: tei | |
| 403 | 330 | backends: |
| 404 | 331 | tei: |
| 405 | - base_url: "http://127.0.0.1:8080" | |
| 332 | + base_url: http://127.0.0.1:8080 | |
| 406 | 333 | timeout_sec: 20 |
| 407 | - model_id: "Qwen/Qwen3-Embedding-0.6B" | |
| 334 | + model_id: Qwen/Qwen3-Embedding-0.6B | |
| 408 | 335 | local_st: |
| 409 | - model_id: "Qwen/Qwen3-Embedding-0.6B" | |
| 410 | - device: "cuda" | |
| 336 | + model_id: Qwen/Qwen3-Embedding-0.6B | |
| 337 | + device: cuda | |
| 411 | 338 | batch_size: 32 |
| 412 | 339 | normalize_embeddings: true |
| 413 | - # 服务内图片后端(embedding 进程启动时读取;cnclip gRPC 与 6008 须同一 model_name) | |
| 414 | - # Chinese-CLIP:ViT-H-14 → 1024 维,ViT-L-14 → 768 维。须与 mappings/search_products.json 中 | |
| 415 | - # image_embedding.vector.dims 一致(当前索引为 1024 → 默认 ViT-H-14)。 | |
| 416 | - image_backend: "clip_as_service" # clip_as_service | local_cnclip | |
| 340 | + image_backend: clip_as_service | |
| 417 | 341 | image_backends: |
| 418 | 342 | clip_as_service: |
| 419 | - server: "grpc://127.0.0.1:51000" | |
| 420 | - model_name: "CN-CLIP/ViT-L-14" | |
| 343 | + server: grpc://127.0.0.1:51000 | |
| 344 | + model_name: CN-CLIP/ViT-L-14 | |
| 421 | 345 | batch_size: 8 |
| 422 | 346 | normalize_embeddings: true |
| 423 | 347 | local_cnclip: |
| 424 | - model_name: "ViT-L-14" | |
| 348 | + model_name: ViT-L-14 | |
| 425 | 349 | device: null |
| 426 | 350 | batch_size: 8 |
| 427 | 351 | normalize_embeddings: true |
| 428 | 352 | rerank: |
| 429 | - provider: "http" | |
| 353 | + provider: http | |
| 430 | 354 | providers: |
| 431 | 355 | http: |
| 432 | 356 | instances: |
| 433 | 357 | default: |
| 434 | - base_url: "http://127.0.0.1:6007" | |
| 435 | - service_url: "http://127.0.0.1:6007/rerank" | |
| 358 | + base_url: http://127.0.0.1:6007 | |
| 359 | + service_url: http://127.0.0.1:6007/rerank | |
| 436 | 360 | fine: |
| 437 | - base_url: "http://127.0.0.1:6009" | |
| 438 | - service_url: "http://127.0.0.1:6009/rerank" | |
| 361 | + base_url: http://127.0.0.1:6009 | |
| 362 | + service_url: http://127.0.0.1:6009/rerank | |
| 439 | 363 | request: |
| 440 | 364 | max_docs: 1000 |
| 441 | 365 | normalize: true |
| 442 | - default_instance: "default" | |
| 443 | - # 命名实例:同一套 reranker 代码按实例名读取不同端口 / 后端 / runtime 目录。 | |
| 366 | + default_instance: default | |
| 444 | 367 | instances: |
| 445 | 368 | default: |
| 446 | - host: "0.0.0.0" | |
| 369 | + host: 0.0.0.0 | |
| 447 | 370 | port: 6007 |
| 448 | - backend: "qwen3_vllm_score" | |
| 449 | - runtime_dir: "./.runtime/reranker/default" | |
| 371 | + backend: qwen3_vllm_score | |
| 372 | + runtime_dir: ./.runtime/reranker/default | |
| 450 | 373 | fine: |
| 451 | - host: "0.0.0.0" | |
| 374 | + host: 0.0.0.0 | |
| 452 | 375 | port: 6009 |
| 453 | - backend: "bge" | |
| 454 | - runtime_dir: "./.runtime/reranker/fine" | |
| 376 | + backend: bge | |
| 377 | + runtime_dir: ./.runtime/reranker/fine | |
| 455 | 378 | backends: |
| 456 | 379 | bge: |
| 457 | - model_name: "BAAI/bge-reranker-v2-m3" | |
| 380 | + model_name: BAAI/bge-reranker-v2-m3 | |
| 458 | 381 | device: null |
| 459 | 382 | use_fp16: true |
| 460 | 383 | batch_size: 80 |
| 461 | 384 | max_length: 160 |
| 462 | - cache_dir: "./model_cache" | |
| 385 | + cache_dir: ./model_cache | |
| 463 | 386 | enable_warmup: true |
| 464 | 387 | jina_reranker_v3: |
| 465 | - model_name: "jinaai/jina-reranker-v3" | |
| 388 | + model_name: jinaai/jina-reranker-v3 | |
| 466 | 389 | device: null |
| 467 | - dtype: "float16" | |
| 390 | + dtype: float16 | |
| 468 | 391 | batch_size: 64 |
| 469 | 392 | max_doc_length: 160 |
| 470 | 393 | max_query_length: 64 |
| 471 | 394 | sort_by_doc_length: true |
| 472 | - cache_dir: "./model_cache" | |
| 395 | + cache_dir: ./model_cache | |
| 473 | 396 | trust_remote_code: true |
| 474 | 397 | qwen3_vllm: |
| 475 | - model_name: "Qwen/Qwen3-Reranker-0.6B" | |
| 476 | - engine: "vllm" | |
| 398 | + model_name: Qwen/Qwen3-Reranker-0.6B | |
| 399 | + engine: vllm | |
| 477 | 400 | max_model_len: 256 |
| 478 | 401 | tensor_parallel_size: 1 |
| 479 | - gpu_memory_utilization: 0.20 | |
| 480 | - dtype: "float16" | |
| 402 | + gpu_memory_utilization: 0.2 | |
| 403 | + dtype: float16 | |
| 481 | 404 | enable_prefix_caching: true |
| 482 | 405 | enforce_eager: false |
| 483 | 406 | infer_batch_size: 100 |
| 484 | 407 | sort_by_doc_length: true |
| 485 | - # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct) | |
| 486 | - instruction_format: standard # compact standard | |
| 487 | - # instruction: "Given a query, score the product for relevance" | |
| 488 | - # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点 | |
| 489 | - # instruction: "rank products by given query, category match first" | |
| 490 | - # instruction: "Rank products by query relevance, prioritizing category match" | |
| 491 | - # instruction: "Rank products by query relevance, prioritizing category and style match" | |
| 492 | - # instruction: "Rank by query relevance, prioritize category & style" | |
| 493 | - # instruction: "Relevance ranking: category & style match first" | |
| 494 | - # instruction: "Score product relevance by query with category & style match prioritized" | |
| 495 | - # instruction: "Rank products by query with category & style match prioritized" | |
| 496 | - # instruction: "Given a fashion shopping query, retrieve relevant products that answer the query" | |
| 497 | - instruction: "rank products by given query" | |
| 498 | - # vLLM LLM.score()(跨编码打分)。独立高性能环境 .venv-reranker-score(vllm 0.18 固定版):./scripts/setup_reranker_venv.sh qwen3_vllm_score | |
| 499 | - # 与 qwen3_vllm 可共用同一 model_name / HF 缓存;venv 分离以便升级 vLLM 而不影响 generate 后端。 | |
| 408 | + instruction_format: standard | |
| 409 | + instruction: rank products by given query | |
| 500 | 410 | qwen3_vllm_score: |
| 501 | - model_name: "Qwen/Qwen3-Reranker-0.6B" | |
| 502 | - # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false | |
| 411 | + model_name: Qwen/Qwen3-Reranker-0.6B | |
| 503 | 412 | use_original_qwen3_hf_overrides: true |
| 504 | - # vllm_runner: "auto" | |
| 505 | - # vllm_convert: "auto" | |
| 506 | - # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并 | |
| 507 | - # hf_overrides: {} | |
| 508 | - engine: "vllm" | |
| 413 | + engine: vllm | |
| 509 | 414 | max_model_len: 172 |
| 510 | 415 | tensor_parallel_size: 1 |
| 511 | 416 | gpu_memory_utilization: 0.15 |
| 512 | - dtype: "float16" | |
| 417 | + dtype: float16 | |
| 513 | 418 | enable_prefix_caching: true |
| 514 | 419 | enforce_eager: false |
| 515 | 420 | infer_batch_size: 80 |
| 516 | 421 | sort_by_doc_length: true |
| 517 | - # 默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致 | |
| 518 | - instruction_format: standard # compact standard | |
| 519 | - # instruction: "Rank products by query with category & style match prioritized" | |
| 520 | - instruction: "Rank products by query with category & style match prioritized" | |
| 521 | - # instruction: "Given a shopping query, rank products by relevance" | |
| 422 | + instruction_format: standard | |
| 423 | + instruction: Rank products by query with category & style match prioritized | |
| 522 | 424 | qwen3_transformers: |
| 523 | - model_name: "Qwen/Qwen3-Reranker-0.6B" | |
| 524 | - instruction: "rank products by given query" | |
| 525 | - # instruction: "Score the product’s relevance to the given query" | |
| 425 | + model_name: Qwen/Qwen3-Reranker-0.6B | |
| 426 | + instruction: rank products by given query | |
| 526 | 427 | max_length: 8192 |
| 527 | 428 | batch_size: 64 |
| 528 | 429 | use_fp16: true |
| 529 | - # sdpa:默认无需 flash-attn;若已安装 flash_attn 可改为 flash_attention_2 | |
| 530 | - attn_implementation: "sdpa" | |
| 531 | - # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask. | |
| 532 | - # For 1 query + many short docs (for example 400 product titles), this usually reduces | |
| 533 | - # repeated prefix work and padding waste compared with pairwise batching. | |
| 430 | + attn_implementation: sdpa | |
| 534 | 431 | qwen3_transformers_packed: |
| 535 | - model_name: "Qwen/Qwen3-Reranker-0.6B" | |
| 536 | - instruction: "Rank products by query with category & style match prioritized" | |
| 432 | + model_name: Qwen/Qwen3-Reranker-0.6B | |
| 433 | + instruction: Rank products by query with category & style match prioritized | |
| 537 | 434 | max_model_len: 256 |
| 538 | 435 | max_doc_len: 160 |
| 539 | 436 | max_docs_per_pack: 0 |
| 540 | 437 | use_fp16: true |
| 541 | 438 | sort_by_doc_length: true |
| 542 | - # Packed mode relies on a custom 4D attention mask. "eager" is the safest default. | |
| 543 | - # If your torch/transformers stack validates it, you can benchmark "sdpa". | |
| 544 | - attn_implementation: "eager" | |
| 439 | + attn_implementation: eager | |
| 545 | 440 | qwen3_gguf: |
| 546 | - repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF" | |
| 547 | - filename: "*Q8_0.gguf" | |
| 548 | - cache_dir: "./model_cache" | |
| 549 | - local_dir: "./models/reranker/qwen3-reranker-4b-gguf" | |
| 550 | - instruction: "Rank products by query with category & style match prioritized" | |
| 551 | - # T4 16GB / 性能优先配置:全量层 offload,实测比保守配置明显更快 | |
| 441 | + repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF | |
| 442 | + filename: '*Q8_0.gguf' | |
| 443 | + cache_dir: ./model_cache | |
| 444 | + local_dir: ./models/reranker/qwen3-reranker-4b-gguf | |
| 445 | + instruction: Rank products by query with category & style match prioritized | |
| 552 | 446 | n_ctx: 512 |
| 553 | 447 | n_batch: 512 |
| 554 | 448 | n_ubatch: 512 |
| ... | ... | @@ -562,17 +456,15 @@ services: |
| 562 | 456 | use_mlock: false |
| 563 | 457 | infer_batch_size: 8 |
| 564 | 458 | sort_by_doc_length: true |
| 565 | - length_sort_mode: "char" | |
| 459 | + length_sort_mode: char | |
| 566 | 460 | enable_warmup: true |
| 567 | 461 | verbose: false |
| 568 | 462 | qwen3_gguf_06b: |
| 569 | - repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF" | |
| 570 | - filename: "qwen3-reranker-0.6b-q8_0.gguf" | |
| 571 | - cache_dir: "./model_cache" | |
| 572 | - local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf" | |
| 573 | - instruction: "Rank products by query with category & style match prioritized" | |
| 574 | - # 0.6B GGUF / online rerank baseline: | |
| 575 | - # 实测 400 titles 单请求约 265s,因此它更适合作为低显存功能后备,不适合在线低延迟主路由。 | |
| 463 | + repo_id: ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF | |
| 464 | + filename: qwen3-reranker-0.6b-q8_0.gguf | |
| 465 | + cache_dir: ./model_cache | |
| 466 | + local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf | |
| 467 | + instruction: Rank products by query with category & style match prioritized | |
| 576 | 468 | n_ctx: 256 |
| 577 | 469 | n_batch: 256 |
| 578 | 470 | n_ubatch: 256 |
| ... | ... | @@ -586,54 +478,57 @@ services: |
| 586 | 478 | use_mlock: false |
| 587 | 479 | infer_batch_size: 32 |
| 588 | 480 | sort_by_doc_length: true |
| 589 | - length_sort_mode: "char" | |
| 481 | + length_sort_mode: char | |
| 590 | 482 | reuse_query_state: false |
| 591 | 483 | enable_warmup: true |
| 592 | 484 | verbose: false |
| 593 | 485 | dashscope_rerank: |
| 594 | - model_name: "qwen3-rerank" | |
| 595 | - # 按地域选择 endpoint: | |
| 596 | - # 中国: https://dashscope.aliyuncs.com/compatible-api/v1/reranks | |
| 597 | - # 新加坡: https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks | |
| 598 | - # 美国: https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks | |
| 599 | - endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks" | |
| 600 | - api_key_env: "RERANK_DASHSCOPE_API_KEY_CN" | |
| 601 | - timeout_sec: 10.0 # | |
| 602 | - top_n_cap: 0 # 0 表示 top_n=当前请求文档数;>0 则限制 top_n 上限 | |
| 603 | - batchsize: 64 # 0 关闭;>0 启用并发小包调度(top_n/top_n_cap 仍生效,分包后全局截断) | |
| 604 | - instruct: "Given a shopping query, rank product titles by relevance" | |
| 486 | + model_name: qwen3-rerank | |
| 487 | + endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks | |
| 488 | + api_key_env: RERANK_DASHSCOPE_API_KEY_CN | |
| 489 | + timeout_sec: 10.0 | |
| 490 | + top_n_cap: 0 | |
| 491 | + batchsize: 64 | |
| 492 | + instruct: Given a shopping query, rank product titles by relevance | |
| 605 | 493 | max_retries: 2 |
| 606 | 494 | retry_backoff_sec: 0.2 |
| 607 | - | |
| 608 | -# SPU配置(已启用,使用嵌套skus) | |
| 609 | 495 | spu_config: |
| 610 | 496 | enabled: true |
| 611 | - spu_field: "spu_id" | |
| 497 | + spu_field: spu_id | |
| 612 | 498 | inner_hits_size: 10 |
| 613 | - # 配置哪些option维度参与检索(进索引、以及在线搜索) | |
| 614 | - # 格式为list,选择option1/option2/option3中的一个或多个 | |
| 615 | - searchable_option_dimensions: ['option1', 'option2', 'option3'] | |
| 616 | - | |
| 617 | -# 租户配置(Tenant Configuration) | |
| 618 | -# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选) | |
| 619 | -# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集 | |
| 499 | + searchable_option_dimensions: | |
| 500 | + - option1 | |
| 501 | + - option2 | |
| 502 | + - option3 | |
| 620 | 503 | tenant_config: |
| 621 | 504 | default: |
| 622 | - primary_language: "en" | |
| 623 | - index_languages: ["en", "zh"] | |
| 505 | + primary_language: en | |
| 506 | + index_languages: | |
| 507 | + - en | |
| 508 | + - zh | |
| 624 | 509 | tenants: |
| 625 | - "1": | |
| 626 | - primary_language: "zh" | |
| 627 | - index_languages: ["zh", "en"] | |
| 628 | - "2": | |
| 629 | - primary_language: "en" | |
| 630 | - index_languages: ["en", "zh"] | |
| 631 | - "3": | |
| 632 | - primary_language: "zh" | |
| 633 | - index_languages: ["zh", "en"] | |
| 634 | - "162": | |
| 635 | - primary_language: "zh" | |
| 636 | - index_languages: ["zh", "en"] | |
| 637 | - "170": | |
| 638 | - primary_language: "en" | |
| 639 | - index_languages: ["en", "zh"] | |
| 510 | + '1': | |
| 511 | + primary_language: zh | |
| 512 | + index_languages: | |
| 513 | + - zh | |
| 514 | + - en | |
| 515 | + '2': | |
| 516 | + primary_language: en | |
| 517 | + index_languages: | |
| 518 | + - en | |
| 519 | + - zh | |
| 520 | + '3': | |
| 521 | + primary_language: zh | |
| 522 | + index_languages: | |
| 523 | + - zh | |
| 524 | + - en | |
| 525 | + '162': | |
| 526 | + primary_language: zh | |
| 527 | + index_languages: | |
| 528 | + - zh | |
| 529 | + - en | |
| 530 | + '170': | |
| 531 | + primary_language: en | |
| 532 | + index_languages: | |
| 533 | + - en | |
| 534 | + - zh | ... | ... |
scripts/evaluation/README.md renamed to scripts/evaluation/README_Requirement.md
scripts/evaluation/README_zh.md renamed to scripts/evaluation/README_Requirement_zh.md
| ... | ... | @@ -106,4 +106,18 @@ queries默认是queries/queries.txt,填入左侧列表框,点击其中任何 |
| 106 | 106 | 批量评估关注的是所有搜索词总体的评估指标。 |
| 107 | 107 | 需要记录测试环境时间以及当时的配置文件,以及对应的结果。要保存历次的评估记录,并能查到每一次评估结果对应的配置文件有相关的指标 |
| 108 | 108 | |
| 109 | -以上是我的总体设计,但有不周全的地方。你要站在更高的层次理解我的需求,你有足够的自由可以适当调整设计,基于你所了解的自动化搜索评估框架的最佳实践,做出更优秀的设计和更好的实现。 | |
| 110 | 109 | \ No newline at end of file |
| 110 | +以上是我的总体设计,但有不周全的地方。你要站在更高的层次理解我的需求,你有足够的自由可以适当调整设计,基于你所了解的自动化搜索评估框架的最佳实践,做出更优秀的设计和更好的实现。 | |
| 111 | + | |
| 112 | + | |
| 113 | + | |
| 114 | + | |
| 115 | + | |
| 116 | + | |
| 117 | + | |
| 118 | + | |
| 119 | + | |
| 120 | +1. 请仔细检验这个标注集的质量,如果质量不符合要求,那么你要优化工具,迭代直至标注集的结果质量足够高,可以以此为自动化工具来评估检索效果,对检索效果形成指导性意见。 | |
| 121 | +2. 在结果标注集的质量足够好,批量评估工具足够好用,并且经过你的试用,能判断出搜索质量好坏的情况下,开始真正的动手检索效果调优:基于这个50条query的结果标注集和批量评估工具,对融合公式进行调参。请你先精心地设计实验,设计几组参数,对几组参数分别修改config.yaml、重启(./restart.sh backend)、跑批量评估、收集结果。 | |
| 122 | +注意评估的过程中,如果发现工具不好用,发现日志不全,发现可以通过修改工具或者日志来提高效率,都可以先做这些,根据完善。 | |
| 123 | +注意你是代码的总负责人,你有任何权限来满足你进行检索效果调优的需要。你如果发现有其他可能带来更大提升的点,也可以进行实验,你甚至可以修改融合、重排漏斗的代码,来进行实验,以追求更好的结果指标。 | |
| 124 | +但是注意,因为收到性能和耗时的约束,不要调大reranker模型的输入条数、不要打开精排,耗时方面无法承受两轮reranker模型的调用。 | ... | ... |
| ... | ... | @@ -0,0 +1,14 @@ |
| 1 | +#!/usr/bin/env python3 | |
| 2 | + | |
| 3 | +from pathlib import Path | |
| 4 | +import sys | |
| 5 | + | |
| 6 | +PROJECT_ROOT = Path(__file__).resolve().parents[2] | |
| 7 | +if str(PROJECT_ROOT) not in sys.path: | |
| 8 | + sys.path.insert(0, str(PROJECT_ROOT)) | |
| 9 | + | |
| 10 | +from scripts.evaluation.eval_framework import main | |
| 11 | + | |
| 12 | + | |
| 13 | +if __name__ == "__main__": | |
| 14 | + main() | ... | ... |
| ... | ... | @@ -0,0 +1,1786 @@ |
| 1 | +#!/usr/bin/env python3 | |
| 2 | +""" | |
| 3 | +Search evaluation framework for pooled relevance annotation, live metrics, and reports. | |
| 4 | +""" | |
| 5 | + | |
| 6 | +from __future__ import annotations | |
| 7 | + | |
| 8 | +import argparse | |
| 9 | +import hashlib | |
| 10 | +import json | |
| 11 | +import math | |
| 12 | +import os | |
| 13 | +import re | |
| 14 | +import sqlite3 | |
| 15 | +import sys | |
| 16 | +import time | |
| 17 | +from dataclasses import dataclass | |
| 18 | +from datetime import datetime, timezone | |
| 19 | +from pathlib import Path | |
| 20 | +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple | |
| 21 | + | |
| 22 | +import requests | |
| 23 | +from elasticsearch.helpers import scan | |
| 24 | +from fastapi import FastAPI, HTTPException | |
| 25 | +from fastapi.responses import HTMLResponse | |
| 26 | +from pydantic import BaseModel, Field | |
| 27 | + | |
| 28 | +PROJECT_ROOT = Path(__file__).resolve().parents[2] | |
| 29 | +if str(PROJECT_ROOT) not in sys.path: | |
| 30 | + sys.path.insert(0, str(PROJECT_ROOT)) | |
| 31 | + | |
| 32 | +from api.app import get_app_config, get_es_client, get_query_parser, init_service | |
| 33 | +from indexer.mapping_generator import get_tenant_index_name | |
| 34 | + | |
| 35 | + | |
| 36 | +RELEVANCE_EXACT = "Exact" | |
| 37 | +RELEVANCE_PARTIAL = "Partial" | |
| 38 | +RELEVANCE_IRRELEVANT = "Irrelevant" | |
| 39 | +VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT} | |
| 40 | +DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation" | |
| 41 | +DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt" | |
| 42 | +JUDGE_PROMPT_VERSION = "v2_structured_20260331" | |
| 43 | + | |
| 44 | + | |
| 45 | +def utc_now_iso() -> str: | |
| 46 | + return datetime.now(timezone.utc).isoformat() | |
| 47 | + | |
| 48 | + | |
| 49 | +def utc_timestamp() -> str: | |
| 50 | + return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") | |
| 51 | + | |
| 52 | + | |
| 53 | +def ensure_dir(path: Path) -> Path: | |
| 54 | + path.mkdir(parents=True, exist_ok=True) | |
| 55 | + return path | |
| 56 | + | |
| 57 | + | |
| 58 | +def sha1_text(text: str) -> str: | |
| 59 | + return hashlib.sha1(text.encode("utf-8")).hexdigest() | |
| 60 | + | |
| 61 | + | |
| 62 | +def pick_text(value: Any, preferred_lang: str = "en") -> str: | |
| 63 | + if value is None: | |
| 64 | + return "" | |
| 65 | + if isinstance(value, dict): | |
| 66 | + return str( | |
| 67 | + value.get(preferred_lang) | |
| 68 | + or value.get("en") | |
| 69 | + or value.get("zh") | |
| 70 | + or next((v for v in value.values() if v), "") | |
| 71 | + ).strip() | |
| 72 | + return str(value).strip() | |
| 73 | + | |
| 74 | + | |
| 75 | +def safe_json_dumps(data: Any) -> str: | |
| 76 | + return json.dumps(data, ensure_ascii=False, separators=(",", ":")) | |
| 77 | + | |
| 78 | + | |
| 79 | +def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]: | |
| 80 | + if not skus: | |
| 81 | + return "", "", "" | |
| 82 | + first = skus[0] or {} | |
| 83 | + return ( | |
| 84 | + str(first.get("option1_value") or "").strip(), | |
| 85 | + str(first.get("option2_value") or "").strip(), | |
| 86 | + str(first.get("option3_value") or "").strip(), | |
| 87 | + ) | |
| 88 | + | |
| 89 | + | |
| 90 | +def build_display_title(doc: Dict[str, Any]) -> str: | |
| 91 | + title = doc.get("title") | |
| 92 | + en = pick_text(title, "en") | |
| 93 | + zh = pick_text(title, "zh") | |
| 94 | + if en and zh and en != zh: | |
| 95 | + return f"{en} / {zh}" | |
| 96 | + return en or zh | |
| 97 | + | |
| 98 | + | |
| 99 | +def build_rerank_doc(doc: Dict[str, Any]) -> str: | |
| 100 | + title = build_display_title(doc) | |
| 101 | + return title[:400] | |
| 102 | + | |
| 103 | + | |
| 104 | +def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str: | |
| 105 | + title = build_display_title(doc) | |
| 106 | + option1, option2, option3 = compact_option_values(doc.get("skus") or []) | |
| 107 | + vendor = pick_text(doc.get("vendor"), "en") | |
| 108 | + category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en") | |
| 109 | + tags = doc.get("tags") or [] | |
| 110 | + tags_text = ", ".join(str(tag) for tag in tags[:4] if tag) | |
| 111 | + parts = [title] | |
| 112 | + if option1: | |
| 113 | + parts.append(f"option1={option1}") | |
| 114 | + if option2: | |
| 115 | + parts.append(f"option2={option2}") | |
| 116 | + if option3: | |
| 117 | + parts.append(f"option3={option3}") | |
| 118 | + if vendor: | |
| 119 | + parts.append(f"vendor={vendor}") | |
| 120 | + if category: | |
| 121 | + parts.append(f"category={category}") | |
| 122 | + if tags_text: | |
| 123 | + parts.append(f"tags={tags_text}") | |
| 124 | + return f"{idx}. " + " | ".join(part for part in parts if part) | |
| 125 | + | |
| 126 | + | |
| 127 | +def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]: | |
| 128 | + return { | |
| 129 | + "spu_id": str(doc.get("spu_id") or ""), | |
| 130 | + "title": build_display_title(doc), | |
| 131 | + "image_url": doc.get("image_url"), | |
| 132 | + "vendor": pick_text(doc.get("vendor"), "en"), | |
| 133 | + "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"), | |
| 134 | + "option_values": list(compact_option_values(doc.get("skus") or [])), | |
| 135 | + "tags": list((doc.get("tags") or [])[:6]), | |
| 136 | + } | |
| 137 | + | |
| 138 | + | |
| 139 | +def normalize_text(text: Any) -> str: | |
| 140 | + value = str(text or "").strip().lower() | |
| 141 | + value = re.sub(r"\s+", " ", value) | |
| 142 | + return value | |
| 143 | + | |
| 144 | + | |
| 145 | +def _extract_json_blob(text: str) -> Any: | |
| 146 | + cleaned = str(text or "").strip() | |
| 147 | + candidates: List[str] = [cleaned] | |
| 148 | + fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I) | |
| 149 | + candidates.extend(match.strip() for match in fence_matches if match.strip()) | |
| 150 | + | |
| 151 | + for candidate in candidates: | |
| 152 | + try: | |
| 153 | + return json.loads(candidate) | |
| 154 | + except Exception: | |
| 155 | + pass | |
| 156 | + | |
| 157 | + starts = [idx for idx, ch in enumerate(cleaned) if ch in "[{"] | |
| 158 | + ends = [idx for idx, ch in enumerate(cleaned) if ch in "]}"] | |
| 159 | + for start in starts: | |
| 160 | + for end in reversed(ends): | |
| 161 | + if end <= start: | |
| 162 | + continue | |
| 163 | + fragment = cleaned[start : end + 1] | |
| 164 | + try: | |
| 165 | + return json.loads(fragment) | |
| 166 | + except Exception: | |
| 167 | + continue | |
| 168 | + raise ValueError(f"failed to parse json from: {cleaned[:500]!r}") | |
| 169 | + | |
| 170 | + | |
| 171 | +@dataclass | |
| 172 | +class QueryBuildResult: | |
| 173 | + query: str | |
| 174 | + tenant_id: str | |
| 175 | + search_total: int | |
| 176 | + search_depth: int | |
| 177 | + rerank_corpus_size: int | |
| 178 | + annotated_count: int | |
| 179 | + output_json_path: Path | |
| 180 | + | |
| 181 | + | |
| 182 | +class EvalStore: | |
| 183 | + def __init__(self, db_path: Path): | |
| 184 | + self.db_path = db_path | |
| 185 | + ensure_dir(db_path.parent) | |
| 186 | + self.conn = sqlite3.connect(str(db_path), check_same_thread=False) | |
| 187 | + self.conn.row_factory = sqlite3.Row | |
| 188 | + self._init_schema() | |
| 189 | + | |
| 190 | + def _init_schema(self) -> None: | |
| 191 | + self.conn.executescript( | |
| 192 | + """ | |
| 193 | + CREATE TABLE IF NOT EXISTS corpus_docs ( | |
| 194 | + tenant_id TEXT NOT NULL, | |
| 195 | + spu_id TEXT NOT NULL, | |
| 196 | + title_json TEXT, | |
| 197 | + vendor_json TEXT, | |
| 198 | + category_path_json TEXT, | |
| 199 | + category_name_json TEXT, | |
| 200 | + image_url TEXT, | |
| 201 | + skus_json TEXT, | |
| 202 | + tags_json TEXT, | |
| 203 | + raw_json TEXT NOT NULL, | |
| 204 | + updated_at TEXT NOT NULL, | |
| 205 | + PRIMARY KEY (tenant_id, spu_id) | |
| 206 | + ); | |
| 207 | + | |
| 208 | + CREATE TABLE IF NOT EXISTS rerank_scores ( | |
| 209 | + tenant_id TEXT NOT NULL, | |
| 210 | + query_text TEXT NOT NULL, | |
| 211 | + spu_id TEXT NOT NULL, | |
| 212 | + score REAL NOT NULL, | |
| 213 | + model_name TEXT, | |
| 214 | + updated_at TEXT NOT NULL, | |
| 215 | + PRIMARY KEY (tenant_id, query_text, spu_id) | |
| 216 | + ); | |
| 217 | + | |
| 218 | + CREATE TABLE IF NOT EXISTS relevance_labels ( | |
| 219 | + tenant_id TEXT NOT NULL, | |
| 220 | + query_text TEXT NOT NULL, | |
| 221 | + spu_id TEXT NOT NULL, | |
| 222 | + label TEXT NOT NULL, | |
| 223 | + judge_model TEXT, | |
| 224 | + raw_response TEXT, | |
| 225 | + updated_at TEXT NOT NULL, | |
| 226 | + PRIMARY KEY (tenant_id, query_text, spu_id) | |
| 227 | + ); | |
| 228 | + | |
| 229 | + CREATE TABLE IF NOT EXISTS build_runs ( | |
| 230 | + run_id TEXT PRIMARY KEY, | |
| 231 | + tenant_id TEXT NOT NULL, | |
| 232 | + query_text TEXT NOT NULL, | |
| 233 | + output_json_path TEXT NOT NULL, | |
| 234 | + metadata_json TEXT NOT NULL, | |
| 235 | + created_at TEXT NOT NULL | |
| 236 | + ); | |
| 237 | + | |
| 238 | + CREATE TABLE IF NOT EXISTS batch_runs ( | |
| 239 | + batch_id TEXT PRIMARY KEY, | |
| 240 | + tenant_id TEXT NOT NULL, | |
| 241 | + output_json_path TEXT NOT NULL, | |
| 242 | + report_markdown_path TEXT NOT NULL, | |
| 243 | + config_snapshot_path TEXT NOT NULL, | |
| 244 | + metadata_json TEXT NOT NULL, | |
| 245 | + created_at TEXT NOT NULL | |
| 246 | + ); | |
| 247 | + | |
| 248 | + CREATE TABLE IF NOT EXISTS query_profiles ( | |
| 249 | + tenant_id TEXT NOT NULL, | |
| 250 | + query_text TEXT NOT NULL, | |
| 251 | + prompt_version TEXT NOT NULL, | |
| 252 | + judge_model TEXT, | |
| 253 | + profile_json TEXT NOT NULL, | |
| 254 | + raw_response TEXT NOT NULL, | |
| 255 | + updated_at TEXT NOT NULL, | |
| 256 | + PRIMARY KEY (tenant_id, query_text, prompt_version) | |
| 257 | + ); | |
| 258 | + """ | |
| 259 | + ) | |
| 260 | + self.conn.commit() | |
| 261 | + | |
| 262 | + def upsert_corpus_docs(self, tenant_id: str, docs: Sequence[Dict[str, Any]]) -> None: | |
| 263 | + now = utc_now_iso() | |
| 264 | + rows = [] | |
| 265 | + for doc in docs: | |
| 266 | + rows.append( | |
| 267 | + ( | |
| 268 | + tenant_id, | |
| 269 | + str(doc.get("spu_id") or ""), | |
| 270 | + safe_json_dumps(doc.get("title")), | |
| 271 | + safe_json_dumps(doc.get("vendor")), | |
| 272 | + safe_json_dumps(doc.get("category_path")), | |
| 273 | + safe_json_dumps(doc.get("category_name")), | |
| 274 | + str(doc.get("image_url") or ""), | |
| 275 | + safe_json_dumps(doc.get("skus") or []), | |
| 276 | + safe_json_dumps(doc.get("tags") or []), | |
| 277 | + safe_json_dumps(doc), | |
| 278 | + now, | |
| 279 | + ) | |
| 280 | + ) | |
| 281 | + self.conn.executemany( | |
| 282 | + """ | |
| 283 | + INSERT INTO corpus_docs ( | |
| 284 | + tenant_id, spu_id, title_json, vendor_json, category_path_json, category_name_json, | |
| 285 | + image_url, skus_json, tags_json, raw_json, updated_at | |
| 286 | + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) | |
| 287 | + ON CONFLICT(tenant_id, spu_id) DO UPDATE SET | |
| 288 | + title_json=excluded.title_json, | |
| 289 | + vendor_json=excluded.vendor_json, | |
| 290 | + category_path_json=excluded.category_path_json, | |
| 291 | + category_name_json=excluded.category_name_json, | |
| 292 | + image_url=excluded.image_url, | |
| 293 | + skus_json=excluded.skus_json, | |
| 294 | + tags_json=excluded.tags_json, | |
| 295 | + raw_json=excluded.raw_json, | |
| 296 | + updated_at=excluded.updated_at | |
| 297 | + """, | |
| 298 | + rows, | |
| 299 | + ) | |
| 300 | + self.conn.commit() | |
| 301 | + | |
| 302 | + def get_corpus_docs(self, tenant_id: str) -> List[Dict[str, Any]]: | |
| 303 | + rows = self.conn.execute( | |
| 304 | + "SELECT raw_json FROM corpus_docs WHERE tenant_id=? ORDER BY spu_id", | |
| 305 | + (tenant_id,), | |
| 306 | + ).fetchall() | |
| 307 | + return [json.loads(row["raw_json"]) for row in rows] | |
| 308 | + | |
| 309 | + def get_corpus_docs_by_spu_ids(self, tenant_id: str, spu_ids: Sequence[str]) -> Dict[str, Dict[str, Any]]: | |
| 310 | + keys = [str(spu_id) for spu_id in spu_ids if str(spu_id).strip()] | |
| 311 | + if not keys: | |
| 312 | + return {} | |
| 313 | + placeholders = ",".join("?" for _ in keys) | |
| 314 | + rows = self.conn.execute( | |
| 315 | + f""" | |
| 316 | + SELECT spu_id, raw_json | |
| 317 | + FROM corpus_docs | |
| 318 | + WHERE tenant_id=? AND spu_id IN ({placeholders}) | |
| 319 | + """, | |
| 320 | + [tenant_id, *keys], | |
| 321 | + ).fetchall() | |
| 322 | + return { | |
| 323 | + str(row["spu_id"]): json.loads(row["raw_json"]) | |
| 324 | + for row in rows | |
| 325 | + } | |
| 326 | + | |
| 327 | + def has_corpus(self, tenant_id: str) -> bool: | |
| 328 | + row = self.conn.execute( | |
| 329 | + "SELECT COUNT(1) AS n FROM corpus_docs WHERE tenant_id=?", | |
| 330 | + (tenant_id,), | |
| 331 | + ).fetchone() | |
| 332 | + return bool(row and row["n"] > 0) | |
| 333 | + | |
| 334 | + def get_rerank_scores(self, tenant_id: str, query_text: str) -> Dict[str, float]: | |
| 335 | + rows = self.conn.execute( | |
| 336 | + """ | |
| 337 | + SELECT spu_id, score | |
| 338 | + FROM rerank_scores | |
| 339 | + WHERE tenant_id=? AND query_text=? | |
| 340 | + """, | |
| 341 | + (tenant_id, query_text), | |
| 342 | + ).fetchall() | |
| 343 | + return {str(row["spu_id"]): float(row["score"]) for row in rows} | |
| 344 | + | |
| 345 | + def upsert_rerank_scores( | |
| 346 | + self, | |
| 347 | + tenant_id: str, | |
| 348 | + query_text: str, | |
| 349 | + scores: Dict[str, float], | |
| 350 | + model_name: str, | |
| 351 | + ) -> None: | |
| 352 | + now = utc_now_iso() | |
| 353 | + rows = [ | |
| 354 | + (tenant_id, query_text, spu_id, float(score), model_name, now) | |
| 355 | + for spu_id, score in scores.items() | |
| 356 | + ] | |
| 357 | + self.conn.executemany( | |
| 358 | + """ | |
| 359 | + INSERT INTO rerank_scores (tenant_id, query_text, spu_id, score, model_name, updated_at) | |
| 360 | + VALUES (?, ?, ?, ?, ?, ?) | |
| 361 | + ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET | |
| 362 | + score=excluded.score, | |
| 363 | + model_name=excluded.model_name, | |
| 364 | + updated_at=excluded.updated_at | |
| 365 | + """, | |
| 366 | + rows, | |
| 367 | + ) | |
| 368 | + self.conn.commit() | |
| 369 | + | |
| 370 | + def get_labels(self, tenant_id: str, query_text: str) -> Dict[str, str]: | |
| 371 | + rows = self.conn.execute( | |
| 372 | + """ | |
| 373 | + SELECT spu_id, label | |
| 374 | + FROM relevance_labels | |
| 375 | + WHERE tenant_id=? AND query_text=? | |
| 376 | + """, | |
| 377 | + (tenant_id, query_text), | |
| 378 | + ).fetchall() | |
| 379 | + return {str(row["spu_id"]): str(row["label"]) for row in rows} | |
| 380 | + | |
| 381 | + def upsert_labels( | |
| 382 | + self, | |
| 383 | + tenant_id: str, | |
| 384 | + query_text: str, | |
| 385 | + labels: Dict[str, str], | |
| 386 | + judge_model: str, | |
| 387 | + raw_response: str, | |
| 388 | + ) -> None: | |
| 389 | + now = utc_now_iso() | |
| 390 | + rows = [] | |
| 391 | + for spu_id, label in labels.items(): | |
| 392 | + if label not in VALID_LABELS: | |
| 393 | + raise ValueError(f"invalid label: {label}") | |
| 394 | + rows.append((tenant_id, query_text, spu_id, label, judge_model, raw_response, now)) | |
| 395 | + self.conn.executemany( | |
| 396 | + """ | |
| 397 | + INSERT INTO relevance_labels (tenant_id, query_text, spu_id, label, judge_model, raw_response, updated_at) | |
| 398 | + VALUES (?, ?, ?, ?, ?, ?, ?) | |
| 399 | + ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET | |
| 400 | + label=excluded.label, | |
| 401 | + judge_model=excluded.judge_model, | |
| 402 | + raw_response=excluded.raw_response, | |
| 403 | + updated_at=excluded.updated_at | |
| 404 | + """, | |
| 405 | + rows, | |
| 406 | + ) | |
| 407 | + self.conn.commit() | |
| 408 | + | |
| 409 | + def get_query_profile(self, tenant_id: str, query_text: str, prompt_version: str) -> Optional[Dict[str, Any]]: | |
| 410 | + row = self.conn.execute( | |
| 411 | + """ | |
| 412 | + SELECT profile_json | |
| 413 | + FROM query_profiles | |
| 414 | + WHERE tenant_id=? AND query_text=? AND prompt_version=? | |
| 415 | + """, | |
| 416 | + (tenant_id, query_text, prompt_version), | |
| 417 | + ).fetchone() | |
| 418 | + if not row: | |
| 419 | + return None | |
| 420 | + return json.loads(row["profile_json"]) | |
| 421 | + | |
| 422 | + def upsert_query_profile( | |
| 423 | + self, | |
| 424 | + tenant_id: str, | |
| 425 | + query_text: str, | |
| 426 | + prompt_version: str, | |
| 427 | + judge_model: str, | |
| 428 | + profile: Dict[str, Any], | |
| 429 | + raw_response: str, | |
| 430 | + ) -> None: | |
| 431 | + self.conn.execute( | |
| 432 | + """ | |
| 433 | + INSERT OR REPLACE INTO query_profiles | |
| 434 | + (tenant_id, query_text, prompt_version, judge_model, profile_json, raw_response, updated_at) | |
| 435 | + VALUES (?, ?, ?, ?, ?, ?, ?) | |
| 436 | + """, | |
| 437 | + ( | |
| 438 | + tenant_id, | |
| 439 | + query_text, | |
| 440 | + prompt_version, | |
| 441 | + judge_model, | |
| 442 | + safe_json_dumps(profile), | |
| 443 | + raw_response, | |
| 444 | + utc_now_iso(), | |
| 445 | + ), | |
| 446 | + ) | |
| 447 | + self.conn.commit() | |
| 448 | + | |
| 449 | + def insert_build_run(self, run_id: str, tenant_id: str, query_text: str, output_json_path: Path, metadata: Dict[str, Any]) -> None: | |
| 450 | + self.conn.execute( | |
| 451 | + """ | |
| 452 | + INSERT OR REPLACE INTO build_runs (run_id, tenant_id, query_text, output_json_path, metadata_json, created_at) | |
| 453 | + VALUES (?, ?, ?, ?, ?, ?) | |
| 454 | + """, | |
| 455 | + (run_id, tenant_id, query_text, str(output_json_path), safe_json_dumps(metadata), utc_now_iso()), | |
| 456 | + ) | |
| 457 | + self.conn.commit() | |
| 458 | + | |
| 459 | + def insert_batch_run( | |
| 460 | + self, | |
| 461 | + batch_id: str, | |
| 462 | + tenant_id: str, | |
| 463 | + output_json_path: Path, | |
| 464 | + report_markdown_path: Path, | |
| 465 | + config_snapshot_path: Path, | |
| 466 | + metadata: Dict[str, Any], | |
| 467 | + ) -> None: | |
| 468 | + self.conn.execute( | |
| 469 | + """ | |
| 470 | + INSERT OR REPLACE INTO batch_runs | |
| 471 | + (batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at) | |
| 472 | + VALUES (?, ?, ?, ?, ?, ?, ?) | |
| 473 | + """, | |
| 474 | + ( | |
| 475 | + batch_id, | |
| 476 | + tenant_id, | |
| 477 | + str(output_json_path), | |
| 478 | + str(report_markdown_path), | |
| 479 | + str(config_snapshot_path), | |
| 480 | + safe_json_dumps(metadata), | |
| 481 | + utc_now_iso(), | |
| 482 | + ), | |
| 483 | + ) | |
| 484 | + self.conn.commit() | |
| 485 | + | |
| 486 | + def list_batch_runs(self, limit: int = 20) -> List[Dict[str, Any]]: | |
| 487 | + rows = self.conn.execute( | |
| 488 | + """ | |
| 489 | + SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at | |
| 490 | + FROM batch_runs | |
| 491 | + ORDER BY created_at DESC | |
| 492 | + LIMIT ? | |
| 493 | + """, | |
| 494 | + (limit,), | |
| 495 | + ).fetchall() | |
| 496 | + items: List[Dict[str, Any]] = [] | |
| 497 | + for row in rows: | |
| 498 | + items.append( | |
| 499 | + { | |
| 500 | + "batch_id": row["batch_id"], | |
| 501 | + "tenant_id": row["tenant_id"], | |
| 502 | + "output_json_path": row["output_json_path"], | |
| 503 | + "report_markdown_path": row["report_markdown_path"], | |
| 504 | + "config_snapshot_path": row["config_snapshot_path"], | |
| 505 | + "metadata": json.loads(row["metadata_json"]), | |
| 506 | + "created_at": row["created_at"], | |
| 507 | + } | |
| 508 | + ) | |
| 509 | + return items | |
| 510 | + | |
| 511 | + def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]: | |
| 512 | + rows = self.conn.execute( | |
| 513 | + """ | |
| 514 | + SELECT | |
| 515 | + query_text, | |
| 516 | + COUNT(*) AS total, | |
| 517 | + SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, | |
| 518 | + SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, | |
| 519 | + SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, | |
| 520 | + MAX(updated_at) AS updated_at | |
| 521 | + FROM relevance_labels | |
| 522 | + WHERE tenant_id=? | |
| 523 | + GROUP BY query_text | |
| 524 | + ORDER BY query_text | |
| 525 | + """, | |
| 526 | + (tenant_id,), | |
| 527 | + ).fetchall() | |
| 528 | + return [ | |
| 529 | + { | |
| 530 | + "query": str(row["query_text"]), | |
| 531 | + "total": int(row["total"]), | |
| 532 | + "exact_count": int(row["exact_count"] or 0), | |
| 533 | + "partial_count": int(row["partial_count"] or 0), | |
| 534 | + "irrelevant_count": int(row["irrelevant_count"] or 0), | |
| 535 | + "updated_at": row["updated_at"], | |
| 536 | + } | |
| 537 | + for row in rows | |
| 538 | + ] | |
| 539 | + | |
| 540 | + def get_query_label_stats(self, tenant_id: str, query_text: str) -> Dict[str, Any]: | |
| 541 | + row = self.conn.execute( | |
| 542 | + """ | |
| 543 | + SELECT | |
| 544 | + COUNT(*) AS total, | |
| 545 | + SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count, | |
| 546 | + SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count, | |
| 547 | + SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count, | |
| 548 | + MAX(updated_at) AS updated_at | |
| 549 | + FROM relevance_labels | |
| 550 | + WHERE tenant_id=? AND query_text=? | |
| 551 | + """, | |
| 552 | + (tenant_id, query_text), | |
| 553 | + ).fetchone() | |
| 554 | + return { | |
| 555 | + "query": query_text, | |
| 556 | + "total": int((row["total"] or 0) if row else 0), | |
| 557 | + "exact_count": int((row["exact_count"] or 0) if row else 0), | |
| 558 | + "partial_count": int((row["partial_count"] or 0) if row else 0), | |
| 559 | + "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0), | |
| 560 | + "updated_at": row["updated_at"] if row else None, | |
| 561 | + } | |
| 562 | + | |
| 563 | + | |
| 564 | +class SearchServiceClient: | |
| 565 | + def __init__(self, base_url: str, tenant_id: str): | |
| 566 | + self.base_url = base_url.rstrip("/") | |
| 567 | + self.tenant_id = str(tenant_id) | |
| 568 | + self.session = requests.Session() | |
| 569 | + | |
| 570 | + def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]: | |
| 571 | + response = self.session.post( | |
| 572 | + f"{self.base_url}/search/", | |
| 573 | + headers={"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}, | |
| 574 | + json={"query": query, "size": size, "from": from_, "language": language}, | |
| 575 | + timeout=120, | |
| 576 | + ) | |
| 577 | + response.raise_for_status() | |
| 578 | + return response.json() | |
| 579 | + | |
| 580 | + | |
| 581 | +class RerankServiceClient: | |
| 582 | + def __init__(self, service_url: str): | |
| 583 | + self.service_url = service_url.rstrip("/") | |
| 584 | + self.session = requests.Session() | |
| 585 | + | |
| 586 | + def rerank(self, query: str, docs: Sequence[str], normalize: bool = False, top_n: Optional[int] = None) -> Tuple[List[float], Dict[str, Any]]: | |
| 587 | + payload: Dict[str, Any] = { | |
| 588 | + "query": query, | |
| 589 | + "docs": list(docs), | |
| 590 | + "normalize": normalize, | |
| 591 | + } | |
| 592 | + if top_n is not None: | |
| 593 | + payload["top_n"] = int(top_n) | |
| 594 | + response = self.session.post(self.service_url, json=payload, timeout=180) | |
| 595 | + response.raise_for_status() | |
| 596 | + data = response.json() | |
| 597 | + return list(data.get("scores") or []), dict(data.get("meta") or {}) | |
| 598 | + | |
| 599 | + | |
| 600 | +class DashScopeLabelClient: | |
| 601 | + def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40): | |
| 602 | + self.model = model | |
| 603 | + self.base_url = base_url.rstrip("/") | |
| 604 | + self.api_key = api_key | |
| 605 | + self.batch_size = int(batch_size) | |
| 606 | + self.session = requests.Session() | |
| 607 | + | |
| 608 | + def _chat(self, prompt: str) -> Tuple[str, str]: | |
| 609 | + response = self.session.post( | |
| 610 | + f"{self.base_url}/chat/completions", | |
| 611 | + headers={ | |
| 612 | + "Authorization": f"Bearer {self.api_key}", | |
| 613 | + "Content-Type": "application/json", | |
| 614 | + }, | |
| 615 | + json={ | |
| 616 | + "model": self.model, | |
| 617 | + "messages": [{"role": "user", "content": prompt}], | |
| 618 | + "temperature": 0, | |
| 619 | + "top_p": 0.1, | |
| 620 | + }, | |
| 621 | + timeout=180, | |
| 622 | + ) | |
| 623 | + response.raise_for_status() | |
| 624 | + data = response.json() | |
| 625 | + content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip() | |
| 626 | + return content, safe_json_dumps(data) | |
| 627 | + | |
    def extract_query_profile(
        self,
        query: str,
        parser_hints: Dict[str, Any],
    ) -> Tuple[Dict[str, Any], str]:
        """Ask the LLM to build a structured intent profile for ``query``.

        The prompt instructs the model to treat the raw query as the source of
        truth and use ``parser_hints`` only as supporting evidence, returning a
        JSON object with normalized query text, category expectations, and
        explicitly-required attributes.

        Args:
            query: Original user search query.
            parser_hints: Output of the query parser, serialized into the prompt.

        Returns:
            Tuple of (profile dict with all schema keys guaranteed present,
            raw JSON-serialized chat response for audit storage).

        Raises:
            ValueError: If the model response does not decode to a JSON object.
        """
        prompt = (
            "You are building a structured intent profile for e-commerce relevance judging.\n"
            "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n"
            "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n"
            "Return JSON with this schema:\n"
            "{\n"
            ' "normalized_query_en": string,\n'
            ' "primary_category": string,\n'
            ' "allowed_categories": [string],\n'
            ' "required_attributes": [\n'
            ' {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n'
            " ],\n"
            ' "notes": [string]\n'
            "}\n\n"
            "Guidelines:\n"
            "- Exact later will require explicit evidence for all required attributes.\n"
            "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n"
            "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n"
            "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n"
            "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n"
            "- For color, include conflicting colors only when clear from the query.\n\n"
            f"Original query: {query}\n"
            f"Parser hints JSON: {json.dumps(parser_hints, ensure_ascii=False)}\n"
        )
        content, raw_response = self._chat(prompt)
        # _extract_json_blob pulls the first JSON value out of the model text.
        payload = _extract_json_blob(content)
        if not isinstance(payload, dict):
            raise ValueError(f"unexpected query profile payload: {content!r}")
        # Backfill every schema key so downstream consumers can index safely
        # even when the model omits optional fields.
        payload.setdefault("normalized_query_en", query)
        payload.setdefault("primary_category", "")
        payload.setdefault("allowed_categories", [])
        payload.setdefault("required_attributes", [])
        payload.setdefault("notes", [])
        return payload, raw_response
| 667 | + | |
    def classify_batch(
        self,
        query: str,
        query_profile: Dict[str, Any],
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Label each doc in ``docs`` as Exact / Partial / Irrelevant via the LLM.

        Builds one prompt containing the query, the structured profile, and a
        numbered line per product, then parses the model's JSON ``labels`` array.

        Args:
            query: Original user search query.
            query_profile: Structured profile from :meth:`extract_query_profile`.
            docs: Product documents, in the order labels must be returned.

        Returns:
            Tuple of (one valid label per doc, in doc order; raw JSON response).

        Raises:
            ValueError: If the response is not the expected JSON shape, or if
                fewer valid labels than docs come back.
        """
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = (
            "You are an e-commerce search relevance judge.\n"
            "Judge each product against the structured query profile below.\n\n"
            "Relevance rules:\n"
            "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n"
            "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n"
            "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n"
            "- Be conservative with Exact.\n"
            "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n"
            "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n"
            f"Original query: {query}\n"
            f"Structured query profile JSON: {json.dumps(query_profile, ensure_ascii=False)}\n\n"
            "Products:\n"
            + "\n".join(numbered_docs)
            + "\n\nReturn JSON only, with schema:\n"
            '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n'
        )
        content, raw_response = self._chat(prompt)
        payload = _extract_json_blob(content)
        if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):
            raise ValueError(f"unexpected label payload: {content!r}")
        labels_payload = payload["labels"]
        labels: List[str] = []
        # NOTE(review): the returned "index" field is ignored; labels are matched
        # to docs purely by position. If the model skips or reorders an entry,
        # labels can silently misalign before the length check below — consider
        # keying on "index" instead. TODO confirm intended behavior.
        for item in labels_payload[: len(docs)]:
            if not isinstance(item, dict):
                continue
            label = str(item.get("label") or "").strip()
            if label in VALID_LABELS:
                labels.append(label)
        # Any invalid/missing entry shows up here as a length mismatch (only
        # valid labels were appended, so the any(...) clause is redundant).
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected label output: {content!r}")
        return labels, raw_response
| 707 | + | |
| 708 | + | |
def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
    """Fraction of the first ``k`` labels that appear in ``relevant``.

    Returns 0.0 for non-positive ``k`` or an empty label list; when fewer than
    ``k`` labels exist, the denominator is the number actually present.
    """
    if k <= 0:
        return 0.0
    head = list(labels[:k])
    if not head:
        return 0.0
    relevant_set = set(relevant)
    hit_total = len([label for label in head if label in relevant_set])
    # len(head) is min(k, len(labels)) by construction of the slice.
    return hit_total / float(len(head))
| 717 | + | |
| 718 | + | |
def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
    """Mean of precision values taken at each relevant position.

    Standard AP over the full ranked list: for every rank holding a relevant
    label, accumulate precision-at-that-rank, then divide by the number of
    relevant hits. Returns 0.0 when no label is relevant.
    """
    relevant_set = set(relevant)
    hits = 0
    running = 0.0
    for rank, label in enumerate(labels, start=1):
        if label in relevant_set:
            hits += 1
            running += hits / rank
    return running / hits if hits else 0.0
| 730 | + | |
| 731 | + | |
def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:
    """Compute per-query ranking metrics from an ordered label list.

    Produces P@{5,10,20,50} and MAP, each in two flavors: Exact-only
    (``P@k`` / ``MAP_3``) and Exact-or-Partial (``P@k_2_3`` / ``MAP_2_3``).
    All values are rounded to 6 decimal places.
    """
    exact_only = [RELEVANCE_EXACT]
    exact_or_partial = [RELEVANCE_EXACT, RELEVANCE_PARTIAL]
    metrics: Dict[str, float] = {}
    for cutoff in (5, 10, 20, 50):
        metrics[f"P@{cutoff}"] = round(precision_at_k(labels, cutoff, exact_only), 6)
        metrics[f"P@{cutoff}_2_3"] = round(precision_at_k(labels, cutoff, exact_or_partial), 6)
    metrics["MAP_3"] = round(average_precision(labels, exact_only), 6)
    metrics["MAP_2_3"] = round(average_precision(labels, exact_or_partial), 6)
    return metrics
| 740 | + | |
| 741 | + | |
def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]:
    """Average each metric key across queries.

    The key set is taken from the first item (sorted); items missing a key
    contribute 0.0. Returns an empty dict for empty input. Values are rounded
    to 6 decimal places.
    """
    if not metric_items:
        return {}
    count = len(metric_items)
    averaged: Dict[str, float] = {}
    for key in sorted(metric_items[0].keys()):
        total = sum(float(item.get(key, 0.0)) for item in metric_items)
        averaged[key] = round(total / count, 6)
    return averaged
| 750 | + | |
| 751 | + | |
def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
    """Count occurrences of each of the three relevance labels.

    Labels outside the known set are ignored; all three keys are always
    present in the result (zero when absent).
    """
    counts = {
        RELEVANCE_EXACT: 0,
        RELEVANCE_PARTIAL: 0,
        RELEVANCE_IRRELEVANT: 0,
    }
    for label in labels:
        if label in counts:
            counts[label] += 1
    return counts
| 758 | + | |
| 759 | + | |
class SearchEvaluationFramework:
    """End-to-end search relevance evaluation for one tenant.

    Orchestrates: live search via the search HTTP service, full-corpus
    reranking via the rerank service, LLM-based relevance labeling with
    rule-based guardrails, sqlite-backed caching of labels/scores/profiles
    (``EvalStore``), and JSON/Markdown report artifacts on disk.
    """

    def __init__(
        self,
        tenant_id: str,
        artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
        search_base_url: str = "http://localhost:6002",
    ):
        """Wire up all service clients and the local artifact store.

        Args:
            tenant_id: Tenant whose index and labels are evaluated.
            artifact_root: Directory for the sqlite store and report files.
            search_base_url: Base URL of the search API service.

        Raises:
            RuntimeError: If no DashScope API key is configured — annotation
                cannot run without it.
        """
        # Initializes the ES-backed service layer with the configured host.
        init_service(get_app_config().infrastructure.elasticsearch.host)
        self.tenant_id = str(tenant_id)
        self.artifact_root = ensure_dir(artifact_root)
        self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
        self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
        app_cfg = get_app_config()
        rerank_service_url = str(
            app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
        )
        self.rerank_client = RerankServiceClient(rerank_service_url)
        # The judge LLM reuses the translation capability's model/base_url config.
        llm_cfg = app_cfg.services.translation.capabilities["llm"]
        api_key = app_cfg.infrastructure.secrets.dashscope_api_key
        if not api_key:
            raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
        self.label_client = DashScopeLabelClient(
            model=str(llm_cfg["model"]),
            base_url=str(llm_cfg["base_url"]),
            api_key=str(api_key),
        )
        self.query_parser = get_query_parser()

    def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
        """Parse the query (no embedding) and return its dict form plus rerank text."""
        parsed = self.query_parser.parse(query, generate_vector=False, target_languages=["en", "zh"])
        payload = parsed.to_dict()
        payload["text_for_rerank"] = parsed.text_for_rerank()
        return payload

    def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
        """Return the structured query profile, cached per (tenant, query, prompt version).

        On cache miss (or ``force_refresh``), builds parser hints, calls the
        LLM, attaches the hints to the profile, and persists it with the raw
        model response for auditability.
        """
        if not force_refresh:
            cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION)
            if cached is not None:
                return cached
        parser_hints = self.build_query_parser_hints(query)
        profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)
        profile["parser_hints"] = parser_hints
        self.store.upsert_query_profile(
            self.tenant_id,
            query,
            JUDGE_PROMPT_VERSION,
            self.label_client.model,
            profile,
            raw_response,
        )
        return profile

    @staticmethod
    def _doc_evidence_text(doc: Dict[str, Any]) -> str:
        """Flatten title/vendor/category/SKU options/tags into one normalized string.

        This is the haystack used for substring evidence checks in the
        rule-based guardrails below.
        """
        pieces: List[str] = [
            build_display_title(doc),
            pick_text(doc.get("vendor"), "en"),
            pick_text(doc.get("category_path"), "en"),
            pick_text(doc.get("category_name"), "en"),
        ]
        for sku in doc.get("skus") or []:
            pieces.extend(
                [
                    str(sku.get("option1_value") or ""),
                    str(sku.get("option2_value") or ""),
                    str(sku.get("option3_value") or ""),
                ]
            )
        for tag in doc.get("tags") or []:
            pieces.append(str(tag))
        return normalize_text(" | ".join(piece for piece in pieces if piece))

    def _apply_rule_based_label_guardrails(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> str:
        """Downgrade an LLM label when doc text lacks required evidence.

        Rules (substring matching against :meth:`_doc_evidence_text`):
        - Exact without primary-category evidence becomes Partial (if an
          allowed category matches) or Irrelevant (if none does).
        - Any conflicting attribute term present -> Irrelevant immediately.
        - Exact missing a required attribute term -> Partial.
        - Partial with no category evidence at all -> Irrelevant.
        Unknown labels are returned unchanged.
        """
        if label not in VALID_LABELS:
            return label
        evidence = self._doc_evidence_text(doc)
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]

        # Empty category / empty allowed list each count as a vacuous match.
        primary_category_match = True
        if category:
            primary_category_match = category in evidence
        allowed_category_match = True
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                label = RELEVANCE_PARTIAL
            else:
                return RELEVANCE_IRRELEVANT

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            # Only a known closed set of attribute names is enforced.
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            if attr_name == "fit":
                # Expand obvious fit antonyms even if the profile omitted them.
                if any(term in {"oversized", "oversize"} for term in required_terms):
                    conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
                if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):
                    conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict:
                return RELEVANCE_IRRELEVANT
            if label == RELEVANCE_EXACT and not has_required:
                label = RELEVANCE_PARTIAL

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            return RELEVANCE_IRRELEVANT

        return label

    @staticmethod
    def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]:
        """Convert a compact search-result item back into a corpus-doc shape.

        The flat ``option_values`` list is padded to three entries and folded
        into a single synthetic SKU so the evidence/guardrail helpers can use
        one doc schema for both corpus docs and live results.
        """
        option_values = list(item.get("option_values") or [])
        while len(option_values) < 3:
            option_values.append("")
        product = dict(item.get("product") or {})
        return {
            "spu_id": item.get("spu_id"),
            "title": product.get("title") or item.get("title"),
            "vendor": product.get("vendor"),
            # Both category fields are sourced from the single compact
            # "category" value of the result payload.
            "category_path": product.get("category"),
            "category_name": product.get("category"),
            "image_url": item.get("image_url") or product.get("image_url"),
            "tags": product.get("tags") or [],
            "skus": [
                {
                    "option1_value": option_values[0],
                    "option2_value": option_values[1],
                    "option3_value": option_values[2],
                }
            ],
        }

    def _collect_label_issues(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> List[str]:
        """Report (but do not fix) evidence problems for an assigned label.

        Mirrors the checks of :meth:`_apply_rule_based_label_guardrails`,
        returning human-readable issue strings for audit output.

        NOTE(review): the enforced attribute-name set here omits
        "waist_style" and "rise", which the guardrail method does enforce —
        confirm whether the two sets should match.
        """
        evidence = self._doc_evidence_text(doc)
        issues: List[str] = []
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [
            normalize_text(item)
            for item in query_profile.get("allowed_categories") or []
            if str(item).strip()
        ]

        primary_category_match = True if not category else category in evidence
        # With no allowed list, fall back to the primary match; otherwise the
        # False seed is immediately overwritten by the any(...) below.
        allowed_category_match = False if allowed_categories else primary_category_match
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                issues.append("Exact missing primary category evidence")
            else:
                issues.append("Exact has category mismatch")

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            issues.append("Partial has category mismatch")

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict and label != RELEVANCE_IRRELEVANT:
                issues.append(f"{label} conflicts on {attr_name}")
            if label == RELEVANCE_EXACT and not has_required:
                issues.append(f"Exact missing {attr_name}")
        return issues

    def audit_live_query(
        self,
        query: str,
        *,
        top_k: int = 100,
        language: str = "en",
        auto_annotate: bool = True,
    ) -> Dict[str, Any]:
        """Evaluate a live query and flag results whose labels look suspicious.

        Runs :meth:`evaluate_live_query`, then re-checks every labeled result
        against the rule-based evidence checks; items with issues or a
        suggested relabel are collected under ``suspicious``.
        """
        live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
        query_profile = self.get_query_profile(query, force_refresh=False)
        suspicious: List[Dict[str, Any]] = []

        for item in live["results"]:
            doc = self._result_item_to_doc(item)
            issues = self._collect_label_issues(item["label"] or "", query_profile, doc)
            suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc)
            if suggested_label != (item["label"] or ""):
                issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"]
            if issues:
                suspicious.append(
                    {
                        "rank": item["rank"],
                        "spu_id": item["spu_id"],
                        "title": item["title"],
                        "label": item["label"],
                        "suggested_label": suggested_label,
                        "issues": issues,
                    }
                )

        # Unlabeled / invalid labels count as Irrelevant for the distribution.
        labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in live["results"]
        ]
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": live["metrics"],
            "distribution": label_distribution(labels),
            "query_profile": query_profile,
            "suspicious": suspicious,
            "results": live["results"],
        }

    def queries_from_file(self, path: Path) -> List[str]:
        """Read one query per line from a UTF-8 file, skipping blanks and '#' comments."""
        return [
            line.strip()
            for line in path.read_text(encoding="utf-8").splitlines()
            if line.strip() and not line.strip().startswith("#")
        ]

    def corpus_docs(self, refresh: bool = False) -> List[Dict[str, Any]]:
        """Return all tenant docs, from the sqlite cache or a full ES scan.

        The scan pulls only the fields needed for labeling/reranking; results
        are persisted to the store so subsequent calls skip Elasticsearch.
        """
        if not refresh and self.store.has_corpus(self.tenant_id):
            return self.store.get_corpus_docs(self.tenant_id)

        es_client = get_es_client().client
        index_name = get_tenant_index_name(self.tenant_id)
        docs: List[Dict[str, Any]] = []
        for hit in scan(
            client=es_client,
            index=index_name,
            query={
                "_source": [
                    "spu_id",
                    "title",
                    "vendor",
                    "category_path",
                    "category_name",
                    "image_url",
                    "skus",
                    "tags",
                ],
                "query": {"match_all": {}},
            },
            size=500,
            preserve_order=False,
            clear_scroll=True,
        ):
            source = dict(hit.get("_source") or {})
            # Fall back to the ES document id when spu_id is absent.
            source["spu_id"] = str(source.get("spu_id") or hit.get("_id") or "")
            docs.append(source)
        self.store.upsert_corpus_docs(self.tenant_id, docs)
        return docs

    def full_corpus_rerank(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        batch_size: int = 24,
        force_refresh: bool = False,
    ) -> List[Dict[str, Any]]:
        """Score every doc against ``query`` with the reranker and sort descending.

        Scores are cached per (tenant, query); only uncached docs are sent to
        the rerank service, in batches of ``batch_size``. Docs that still have
        no score sort last via a -inf sentinel.

        Returns:
            List of {"spu_id", "score", "doc"} sorted by score, highest first.

        Raises:
            RuntimeError: If the service returns a score count that does not
                match the batch size.
        """
        cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query)
        pending: List[Dict[str, Any]] = [doc for doc in docs if str(doc.get("spu_id")) not in cached]
        if pending:
            new_scores: Dict[str, float] = {}
            for start in range(0, len(pending), batch_size):
                batch = pending[start : start + batch_size]
                scores = self._rerank_batch_with_retry(query=query, docs=batch)
                if len(scores) != len(batch):
                    raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs")
                for doc, score in zip(batch, scores):
                    new_scores[str(doc.get("spu_id"))] = float(score)
            self.store.upsert_rerank_scores(
                self.tenant_id,
                query,
                new_scores,
                model_name="qwen3_vllm_score",
            )
            cached.update(new_scores)

        ranked = []
        for doc in docs:
            spu_id = str(doc.get("spu_id"))
            ranked.append({"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc})
        ranked.sort(key=lambda item: item["score"], reverse=True)
        return ranked

    def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]:
        """Rerank ``docs``; on failure, bisect the batch and retry recursively.

        A single doc that still fails gets the sentinel score -1.0 instead of
        raising, so one bad document cannot abort a full-corpus pass. Batches
        of <= 6 degrade straight to per-doc calls.
        """
        if not docs:
            return []
        doc_texts = [build_rerank_doc(doc) for doc in docs]
        try:
            scores, _meta = self.rerank_client.rerank(query=query, docs=doc_texts, normalize=False)
            return scores
        except Exception:
            if len(docs) == 1:
                return [-1.0]
            if len(docs) <= 6:
                scores: List[float] = []
                for doc in docs:
                    scores.extend(self._rerank_batch_with_retry(query, [doc]))
                return scores
            mid = len(docs) // 2
            left = self._rerank_batch_with_retry(query, docs[:mid])
            right = self._rerank_batch_with_retry(query, docs[mid:])
            return left + right

    def annotate_missing_labels(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        force_refresh: bool = False,
    ) -> Dict[str, str]:
        """Label any docs not yet labeled for ``query``; return spu_id -> label.

        LLM labels are passed through the rule-based guardrails before being
        persisted. Each stored batch keeps the raw model response for audit.
        """
        query_profile = self.get_query_profile(query, force_refresh=force_refresh)
        labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
        missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
        if not missing_docs:
            return labels

        for start in range(0, len(missing_docs), self.label_client.batch_size):
            batch = missing_docs[start : start + self.label_client.batch_size]
            batch_pairs = self._classify_with_retry(query, query_profile, batch)
            for sub_labels, raw_response, sub_batch in batch_pairs:
                to_store = {
                    str(doc.get("spu_id")): self._apply_rule_based_label_guardrails(label, query_profile, doc)
                    for doc, label in zip(sub_batch, sub_labels)
                }
                self.store.upsert_labels(
                    self.tenant_id,
                    query,
                    to_store,
                    judge_model=self.label_client.model,
                    raw_response=raw_response,
                )
                labels.update(to_store)
            # Small pause between batches — presumably rate limiting; confirm.
            time.sleep(0.1)
        return labels

    def _classify_with_retry(
        self,
        query: str,
        query_profile: Dict[str, Any],
        docs: Sequence[Dict[str, Any]],
    ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
        """Classify ``docs``; on failure, bisect and retry each half.

        Unlike the rerank retry, a single failing doc re-raises — there is no
        sentinel label, so a hard failure surfaces to the caller.
        """
        if not docs:
            return []
        try:
            labels, raw_response = self.label_client.classify_batch(query, query_profile, docs)
            return [(labels, raw_response, docs)]
        except Exception:
            if len(docs) == 1:
                raise
            mid = len(docs) // 2
            return self._classify_with_retry(query, query_profile, docs[:mid]) + self._classify_with_retry(query, query_profile, docs[mid:])

    def build_query_annotation_set(
        self,
        query: str,
        *,
        search_depth: int = 1000,
        rerank_depth: int = 10000,
        annotate_search_top_k: int = 120,
        annotate_rerank_top_k: int = 200,
        language: str = "en",
        force_refresh_rerank: bool = False,
        force_refresh_labels: bool = False,
    ) -> QueryBuildResult:
        """Build and persist a full annotation artifact for one query.

        Pipeline: deep live search + full-corpus rerank -> union the top-K of
        both as the annotation pool -> label the pool -> emit a JSON artifact
        containing both ranked lists with labels plus top-100 search metrics,
        and record the run in the store.
        """
        search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language)
        search_results = list(search_payload.get("results") or [])
        corpus = self.corpus_docs(refresh=False)
        full_rerank = self.full_corpus_rerank(
            query=query,
            docs=corpus,
            force_refresh=force_refresh_rerank,
        )
        rerank_depth_effective = min(rerank_depth, len(full_rerank))

        # Annotation pool = union of search top-K and rerank top-K, deduped by spu_id.
        pool_docs: Dict[str, Dict[str, Any]] = {}
        for doc in search_results[:annotate_search_top_k]:
            pool_docs[str(doc.get("spu_id"))] = doc
        for item in full_rerank[:annotate_rerank_top_k]:
            pool_docs[str(item["spu_id"])] = item["doc"]

        query_profile = self.get_query_profile(query, force_refresh=force_refresh_labels)
        labels = self.annotate_missing_labels(
            query=query,
            docs=list(pool_docs.values()),
            force_refresh=force_refresh_labels,
        )

        search_labeled_results: List[Dict[str, Any]] = []
        for rank, doc in enumerate(search_results, start=1):
            spu_id = str(doc.get("spu_id"))
            label = labels.get(spu_id)
            search_labeled_results.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "rerank_score": None,
                    "label": label,
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )

        rerank_top_results: List[Dict[str, Any]] = []
        for rank, item in enumerate(full_rerank[:rerank_depth_effective], start=1):
            doc = item["doc"]
            spu_id = str(item["spu_id"])
            rerank_top_results.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "rerank_score": round(float(item["score"]), 8),
                    "label": labels.get(spu_id),
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )

        # Metrics use the search ranking's top 100; missing labels count as Irrelevant.
        top100_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in search_labeled_results[:100]
        ]
        metrics = compute_query_metrics(top100_labels)
        output_dir = ensure_dir(self.artifact_root / "query_builds")
        run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
        output_json_path = output_dir / f"{run_id}.json"
        payload = {
            "run_id": run_id,
            "created_at": utc_now_iso(),
            "tenant_id": self.tenant_id,
            "query": query,
            # NOTE(review): hard-coded localhost admin URL ignores the
            # search_base_url passed to __init__ — confirm this is intentional.
            "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),
            "search_total": int(search_payload.get("total") or 0),
            "search_depth_requested": search_depth,
            "search_depth_effective": len(search_results),
            "rerank_depth_requested": rerank_depth,
            "rerank_depth_effective": rerank_depth_effective,
            "corpus_size": len(corpus),
            "annotation_pool": {
                "annotate_search_top_k": annotate_search_top_k,
                "annotate_rerank_top_k": annotate_rerank_top_k,
                "pool_size": len(pool_docs),
            },
            "query_profile": query_profile,
            "metrics_top100": metrics,
            "search_results": search_labeled_results,
            "full_rerank_top": rerank_top_results,
        }
        output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
        return QueryBuildResult(
            query=query,
            tenant_id=self.tenant_id,
            search_total=int(search_payload.get("total") or 0),
            search_depth=len(search_results),
            rerank_corpus_size=len(corpus),
            annotated_count=len(pool_docs),
            output_json_path=output_json_path,
        )

    def evaluate_live_query(
        self,
        query: str,
        top_k: int = 100,
        auto_annotate: bool = True,
        language: str = "en",
        force_refresh_labels: bool = False,
    ) -> Dict[str, Any]:
        """Search live, optionally label the top-K, and compute metrics.

        Requests at least 100 results so P@50/MAP have depth; labels are read
        back from the store after annotation so cached labels are included.
        """
        search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)
        results = list(search_payload.get("results") or [])
        if auto_annotate:
            self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
        labels = self.store.get_labels(self.tenant_id, query)
        labeled = []
        for rank, doc in enumerate(results[:top_k], start=1):
            spu_id = str(doc.get("spu_id"))
            labeled.append(
                {
                    "rank": rank,
                    "spu_id": spu_id,
                    "title": build_display_title(doc),
                    "image_url": doc.get("image_url"),
                    "label": labels.get(spu_id),
                    "option_values": list(compact_option_values(doc.get("skus") or [])),
                    "product": compact_product_payload(doc),
                }
            )
        # Missing/invalid labels count as Irrelevant for metric purposes.
        metric_labels = [
            item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
            for item in labeled
        ]
        return {
            "query": query,
            "tenant_id": self.tenant_id,
            "top_k": top_k,
            "metrics": compute_query_metrics(metric_labels),
            "results": labeled,
            "total": int(search_payload.get("total") or 0),
        }

    def batch_evaluate(
        self,
        queries: Sequence[str],
        *,
        top_k: int = 100,
        auto_annotate: bool = True,
        language: str = "en",
        force_refresh_labels: bool = False,
    ) -> Dict[str, Any]:
        """Evaluate many queries, aggregate metrics, and write batch artifacts.

        Produces a JSON payload, a Markdown report, and a config snapshot
        (fetched from the admin endpoint) under ``batch_reports``, and records
        the run in the store.
        """
        per_query = []
        for query in queries:
            live = self.evaluate_live_query(
                query,
                top_k=top_k,
                auto_annotate=auto_annotate,
                language=language,
                force_refresh_labels=force_refresh_labels,
            )
            labels = [
                item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
                for item in live["results"]
            ]
            per_query.append(
                {
                    "query": live["query"],
                    "tenant_id": live["tenant_id"],
                    "top_k": live["top_k"],
                    "metrics": live["metrics"],
                    "distribution": label_distribution(labels),
                    "total": live["total"],
                }
            )
        aggregate = aggregate_metrics([item["metrics"] for item in per_query])
        aggregate_distribution = {
            RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
            RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query),
            RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
        }
        batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
        report_dir = ensure_dir(self.artifact_root / "batch_reports")
        config_snapshot_path = report_dir / f"{batch_id}_config.json"
        # NOTE(review): hard-coded localhost admin URL, same as
        # build_query_annotation_set — confirm against search_base_url.
        config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json()
        config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
        output_json_path = report_dir / f"{batch_id}.json"
        report_md_path = report_dir / f"{batch_id}.md"
        payload = {
            "batch_id": batch_id,
            "created_at": utc_now_iso(),
            "tenant_id": self.tenant_id,
            "queries": list(queries),
            "top_k": top_k,
            "aggregate_metrics": aggregate,
            "aggregate_distribution": aggregate_distribution,
            "per_query": per_query,
            "config_snapshot_path": str(config_snapshot_path),
        }
        output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
        report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
        self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
        return payload
| 1349 | + | |
| 1350 | + | |
def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
    """Render a batch-evaluation payload as a Markdown report string."""
    out: List[str] = [
        "# Search Batch Evaluation",
        "",
        f"- Batch ID: {payload['batch_id']}",
        f"- Created at: {payload['created_at']}",
        f"- Tenant ID: {payload['tenant_id']}",
        f"- Query count: {len(payload['queries'])}",
        f"- Top K: {payload['top_k']}",
        "",
        "## Aggregate Metrics",
        "",
    ]
    # Metrics are listed alphabetically for stable diffs between reports.
    aggregate = payload.get("aggregate_metrics") or {}
    out.extend(f"- {name}: {score}" for name, score in sorted(aggregate.items()))
    overall_dist = payload.get("aggregate_distribution") or {}
    if overall_dist:
        out.append("")
        out.append("## Label Distribution")
        out.append("")
        out.append(f"- Exact: {overall_dist.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {overall_dist.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {overall_dist.get(RELEVANCE_IRRELEVANT, 0)}")
    out.extend(["", "## Per Query", ""])
    for entry in payload.get("per_query") or []:
        out.append(f"### {entry['query']}")
        out.append("")
        entry_metrics = entry.get("metrics") or {}
        out.extend(f"- {name}: {score}" for name, score in sorted(entry_metrics.items()))
        entry_dist = entry.get("distribution") or {}
        out.append(f"- Exact: {entry_dist.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {entry_dist.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {entry_dist.get(RELEVANCE_IRRELEVANT, 0)}")
        out.append("")
    return "\n".join(out)
| 1390 | + | |
| 1391 | + | |
class SearchEvalRequest(BaseModel):
    """Request body for POST /api/search-eval (single-query live evaluation)."""

    query: str  # raw search query text
    top_k: int = Field(default=100, ge=1, le=500)  # number of top results to evaluate
    auto_annotate: bool = True  # auto-label results that have no stored annotation
    language: str = "en"  # query language forwarded to the search backend
| 1397 | + | |
| 1398 | + | |
class BatchEvalRequest(BaseModel):
    """Request body for POST /api/batch-eval (multi-query batch evaluation)."""

    queries: Optional[List[str]] = None  # explicit query list; None falls back to the query file
    top_k: int = Field(default=100, ge=1, le=500)  # number of top results evaluated per query
    auto_annotate: bool = True  # auto-label results that have no stored annotation
    language: str = "en"  # query language forwarded to the search backend
    force_refresh_labels: bool = False  # re-annotate even when labels already exist
| 1405 | + | |
| 1406 | + | |
def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFAULT_QUERY_FILE) -> FastAPI:
    """Build the FastAPI app serving the evaluation UI and its JSON API.

    The app exposes the static HTML page plus four endpoints backed by the
    given framework instance; `query_file` supplies the default query list.
    """
    app = FastAPI(title="Search Evaluation UI", version="1.0.0")

    @app.get("/", response_class=HTMLResponse)
    def home() -> str:
        """Serve the single-page evaluation UI."""
        return WEB_APP_HTML

    @app.get("/api/queries")
    def api_queries() -> Dict[str, Any]:
        """Return the query list loaded from the configured query file."""
        return {"queries": framework.queries_from_file(query_file)}

    @app.post("/api/search-eval")
    def api_search_eval(request: SearchEvalRequest) -> Dict[str, Any]:
        """Evaluate a single live query and return metrics plus labeled results."""
        return framework.evaluate_live_query(
            query=request.query,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
        )

    @app.post("/api/batch-eval")
    def api_batch_eval(request: BatchEvalRequest) -> Dict[str, Any]:
        """Run a batch evaluation; falls back to the query file when no queries are given."""
        queries = request.queries or framework.queries_from_file(query_file)
        if not queries:
            raise HTTPException(status_code=400, detail="No queries provided")
        return framework.batch_evaluate(
            queries=queries,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
            force_refresh_labels=request.force_refresh_labels,
        )

    @app.get("/api/history")
    def api_history() -> Dict[str, Any]:
        """Return the 20 most recent batch runs recorded in the store."""
        return {"history": framework.store.list_batch_runs(limit=20)}

    return app
| 1445 | + | |
| 1446 | + | |
# Static single-page UI served at "/": sidebar (query list + batch history),
# metric cards, and result rows; calls the /api/* endpoints via fetch().
WEB_APP_HTML = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Search Evaluation</title>
  <style>
    :root {
      --bg: #f5f3ed;
      --panel: #fffdf8;
      --ink: #1f2a24;
      --muted: #6b756e;
      --line: #ddd4c6;
      --accent: #0f766e;
      --exact: #0f766e;
      --partial: #b7791f;
      --irrelevant: #b42318;
    }
    body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background:
      radial-gradient(circle at top left, #f0e6d6 0, transparent 28%),
      linear-gradient(180deg, #f9f6f0 0%, #f0ece3 100%); }
    .app { display: grid; grid-template-columns: 280px 1fr; min-height: 100vh; }
    .sidebar { border-right: 1px solid var(--line); padding: 20px; background: rgba(255,255,255,0.55); backdrop-filter: blur(10px); }
    .main { padding: 24px; }
    h1, h2 { margin: 0 0 12px; }
    .muted { color: var(--muted); }
    .query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; }
    .query-item { display: block; width: 100%; border: 0; background: transparent; text-align: left; padding: 10px 12px; border-radius: 10px; cursor: pointer; }
    .query-item:hover { background: #eef6f4; }
    .toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; }
    input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; }
    button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; }
    button.secondary { background: #d9e6e3; color: #12433d; }
    .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; }
    .metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; }
    .metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; }
    .metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; }
    .results { display: grid; gap: 10px; }
    .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }
    .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }
    .Exact { background: var(--exact); }
    .Partial { background: var(--partial); }
    .Irrelevant { background: var(--irrelevant); }
    .Unknown { background: #637381; }
    .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }
    .title { font-size: 16px; font-weight: 700; margin-bottom: 8px; }
    .options { color: var(--muted); line-height: 1.5; font-size: 14px; }
    .section { margin-bottom: 28px; }
    .history { font-size: 13px; line-height: 1.5; }
  </style>
</head>
<body>
  <div class="app">
    <aside class="sidebar">
      <h2>Queries</h2>
      <p class="muted">Loaded from <code>scripts/evaluation/queries/queries.txt</code></p>
      <div id="queryList" class="query-list"></div>
      <div class="section">
        <h2>History</h2>
        <div id="history" class="history muted">Loading...</div>
      </div>
    </aside>
    <main class="main">
      <h1>Search Evaluation</h1>
      <p class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>
      <div class="toolbar">
        <input id="queryInput" type="text" placeholder="Search query" />
        <button onclick="runSingle()">Evaluate Query</button>
        <button class="secondary" onclick="runBatch()">Batch Evaluation</button>
      </div>
      <div id="status" class="muted section"></div>
      <section class="section">
        <h2>Metrics</h2>
        <div id="metrics" class="grid"></div>
      </section>
      <section class="section">
        <h2>Top Results</h2>
        <div id="results" class="results"></div>
      </section>
    </main>
  </div>
  <script>
    async function fetchJSON(url, options) {
      const res = await fetch(url, options);
      if (!res.ok) throw new Error(await res.text());
      return await res.json();
    }
    function renderMetrics(metrics) {
      const root = document.getElementById('metrics');
      root.innerHTML = '';
      Object.entries(metrics || {}).forEach(([key, value]) => {
        const card = document.createElement('div');
        card.className = 'metric';
        card.innerHTML = `<div class="label">${key}</div><div class="value">${value}</div>`;
        root.appendChild(card);
      });
    }
    function renderResults(results) {
      const root = document.getElementById('results');
      root.innerHTML = '';
      (results || []).forEach(item => {
        const label = item.label || 'Unknown';
        const box = document.createElement('div');
        box.className = 'result';
        box.innerHTML = `
          <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">#${item.rank}</div></div>
          <img class="thumb" src="${item.image_url || ''}" alt="" />
          <div>
            <div class="title">${item.title || ''}</div>
            <div class="options">
              <div>${(item.option_values || [])[0] || ''}</div>
              <div>${(item.option_values || [])[1] || ''}</div>
              <div>${(item.option_values || [])[2] || ''}</div>
            </div>
          </div>`;
        root.appendChild(box);
      });
    }
    async function loadQueries() {
      const data = await fetchJSON('/api/queries');
      const root = document.getElementById('queryList');
      root.innerHTML = '';
      data.queries.forEach(query => {
        const btn = document.createElement('button');
        btn.className = 'query-item';
        btn.textContent = query;
        btn.onclick = () => {
          document.getElementById('queryInput').value = query;
          runSingle();
        };
        root.appendChild(btn);
      });
    }
    async function loadHistory() {
      const data = await fetchJSON('/api/history');
      const root = document.getElementById('history');
      root.innerHTML = (data.history || []).map(item =>
        `<div><strong>${item.batch_id}</strong><br/>${item.created_at}<br/>${item.report_markdown_path}</div><br/>`
      ).join('') || 'No history yet.';
    }
    async function runSingle() {
      const query = document.getElementById('queryInput').value.trim();
      if (!query) return;
      document.getElementById('status').textContent = `Evaluating "${query}"...`;
      const data = await fetchJSON('/api/search-eval', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({query, top_k: 100, auto_annotate: true})
      });
      document.getElementById('status').textContent = `Done. total=${data.total}`;
      renderMetrics(data.metrics);
      renderResults(data.results);
      loadHistory();
    }
    async function runBatch() {
      document.getElementById('status').textContent = 'Running batch evaluation...';
      const data = await fetchJSON('/api/batch-eval', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({top_k: 100, auto_annotate: true})
      });
      document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`;
      renderMetrics(data.aggregate_metrics);
      renderResults([]);
      loadHistory();
    }
    loadQueries();
    loadHistory();
  </script>
</body>
</html>
"""
| 1620 | + | |
| 1621 | + | |
def build_cli_parser() -> argparse.ArgumentParser:
    """Create the CLI parser with the build/batch/audit/serve subcommands."""
    root = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
    commands = root.add_subparsers(dest="command", required=True)

    def add_common(sub: argparse.ArgumentParser) -> None:
        # Every subcommand targets a single tenant and a query file.
        sub.add_argument("--tenant-id", default="163")
        sub.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))

    build_cmd = commands.add_parser("build", help="Build pooled annotation set for queries")
    add_common(build_cmd)
    build_cmd.add_argument("--search-depth", type=int, default=1000)
    build_cmd.add_argument("--rerank-depth", type=int, default=10000)
    build_cmd.add_argument("--annotate-search-top-k", type=int, default=120)
    build_cmd.add_argument("--annotate-rerank-top-k", type=int, default=200)
    build_cmd.add_argument("--language", default="en")
    build_cmd.add_argument("--force-refresh-rerank", action="store_true")
    build_cmd.add_argument("--force-refresh-labels", action="store_true")

    batch_cmd = commands.add_parser("batch", help="Run batch evaluation against live search")
    add_common(batch_cmd)
    batch_cmd.add_argument("--top-k", type=int, default=100)
    batch_cmd.add_argument("--language", default="en")
    batch_cmd.add_argument("--force-refresh-labels", action="store_true")

    audit_cmd = commands.add_parser("audit", help="Audit annotation quality for queries")
    add_common(audit_cmd)
    audit_cmd.add_argument("--top-k", type=int, default=100)
    audit_cmd.add_argument("--language", default="en")
    audit_cmd.add_argument("--limit-suspicious", type=int, default=5)
    audit_cmd.add_argument("--force-refresh-labels", action="store_true")

    serve_cmd = commands.add_parser("serve", help="Serve evaluation web UI on port 6010")
    add_common(serve_cmd)
    serve_cmd.add_argument("--host", default="0.0.0.0")
    serve_cmd.add_argument("--port", type=int, default=6010)

    return root
| 1659 | + | |
| 1660 | + | |
def run_build(args: argparse.Namespace) -> None:
    """Build pooled annotation sets for every query in the queries file and
    write a JSON summary of the builds under <artifact_root>/query_builds."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    rows = []
    for query in framework.queries_from_file(Path(args.queries_file)):
        result = framework.build_query_annotation_set(
            query=query,
            search_depth=args.search_depth,
            rerank_depth=args.rerank_depth,
            annotate_search_top_k=args.annotate_search_top_k,
            annotate_rerank_top_k=args.annotate_rerank_top_k,
            language=args.language,
            force_refresh_rerank=args.force_refresh_rerank,
            force_refresh_labels=args.force_refresh_labels,
        )
        rows.append(
            {
                "query": result.query,
                "search_total": result.search_total,
                "search_depth": result.search_depth,
                "rerank_corpus_size": result.rerank_corpus_size,
                "annotated_count": result.annotated_count,
                "output_json_path": str(result.output_json_path),
            }
        )
        print(
            f"[build] query={result.query!r} search_total={result.search_total} "
            f"search_depth={result.search_depth} corpus={result.rerank_corpus_size} "
            f"annotated={result.annotated_count} output={result.output_json_path}"
        )
    out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
    out_path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] summary={out_path}")
| 1694 | + | |
| 1695 | + | |
def run_batch(args: argparse.Namespace) -> None:
    """Run a live batch evaluation over the configured query file and print
    the resulting batch id and aggregate metrics."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    result = framework.batch_evaluate(
        queries=framework.queries_from_file(Path(args.queries_file)),
        top_k=args.top_k,
        auto_annotate=True,
        language=args.language,
        force_refresh_labels=args.force_refresh_labels,
    )
    print(f"[done] batch_id={result['batch_id']} aggregate_metrics={result['aggregate_metrics']}")
| 1707 | + | |
| 1708 | + | |
def run_audit(args: argparse.Namespace) -> None:
    """Audit annotation quality for every query and write a JSON summary.

    Each query is audited once; when --force-refresh-labels is set, the
    top results are re-annotated and the query is audited a second time
    (with auto-annotation off) so the report reflects the fresh labels.
    The ordering of these calls is deliberate: refresh must happen between
    the two audits.
    """
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    queries = framework.queries_from_file(Path(args.queries_file))
    audit_items = []
    for query in queries:
        # First pass: auto-annotate only when we are NOT forcing a refresh,
        # otherwise labels would be written twice for the same docs.
        item = framework.audit_live_query(
            query=query,
            top_k=args.top_k,
            language=args.language,
            auto_annotate=not args.force_refresh_labels,
        )
        if args.force_refresh_labels:
            # Re-fetch live results (at least 100 so the pool is stable),
            # force-relabel the top_k docs, then audit again without
            # auto-annotation so only the refreshed labels are measured.
            live_payload = framework.search_client.search(query=query, size=max(args.top_k, 100), from_=0, language=args.language)
            framework.annotate_missing_labels(
                query=query,
                docs=list(live_payload.get("results") or [])[: args.top_k],
                force_refresh=True,
            )
            item = framework.audit_live_query(
                query=query,
                top_k=args.top_k,
                language=args.language,
                auto_annotate=False,
            )
        audit_items.append(
            {
                "query": query,
                "metrics": item["metrics"],
                "distribution": item["distribution"],
                "suspicious_count": len(item["suspicious"]),
                # Cap the stored examples to keep the report readable.
                "suspicious_examples": item["suspicious"][: args.limit_suspicious],
            }
        )
        print(
            f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}"
        )

    summary = {
        "created_at": utc_now_iso(),
        "tenant_id": args.tenant_id,
        "top_k": args.top_k,
        "query_count": len(queries),
        "total_suspicious": sum(item["suspicious_count"] for item in audit_items),
        "queries": audit_items,
    }
    out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"
    out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] audit={out_path}")
| 1757 | + | |
| 1758 | + | |
def run_serve(args: argparse.Namespace) -> None:
    """Serve the evaluation web UI with uvicorn on the configured host/port."""
    import uvicorn

    web_app = create_web_app(
        SearchEvaluationFramework(tenant_id=args.tenant_id),
        Path(args.queries_file),
    )
    uvicorn.run(web_app, host=args.host, port=args.port, log_level="info")
| 1765 | + | |
| 1766 | + | |
def main() -> None:
    """CLI entry point: parse arguments and dispatch to the subcommand handler."""
    args = build_cli_parser().parse_args()
    handlers = {
        "build": run_build,
        "batch": run_batch,
        "audit": run_audit,
        "serve": run_serve,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # Unreachable with required=True subparsers, kept as a safety net.
        raise SystemExit(f"unknown command: {args.command}")
    handler(args)
| 1783 | + | |
| 1784 | + | |
if __name__ == "__main__":
    main()  # allow running this module directly as a script
| ... | ... | @@ -0,0 +1,14 @@ |
#!/usr/bin/env python3
"""Thin launcher: make the project root importable, then delegate to the
evaluation framework's CLI entry point."""

from pathlib import Path
import sys

# Two directory levels up from scripts/evaluation/<this file> is the repo root.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from scripts.evaluation.eval_framework import main


if __name__ == "__main__":
    main()
| ... | ... | @@ -0,0 +1,296 @@ |
| 1 | +#!/usr/bin/env python3 | |
| 2 | + | |
| 3 | +from __future__ import annotations | |
| 4 | + | |
| 5 | +import argparse | |
| 6 | +import copy | |
| 7 | +import json | |
| 8 | +import re | |
| 9 | +import subprocess | |
| 10 | +import sys | |
| 11 | +import time | |
| 12 | +from dataclasses import dataclass | |
| 13 | +from pathlib import Path | |
| 14 | +from typing import Any, Dict, List | |
| 15 | + | |
| 16 | +import requests | |
| 17 | +import yaml | |
| 18 | + | |
| 19 | +PROJECT_ROOT = Path(__file__).resolve().parents[2] | |
| 20 | +if str(PROJECT_ROOT) not in sys.path: | |
| 21 | + sys.path.insert(0, str(PROJECT_ROOT)) | |
| 22 | + | |
| 23 | +from scripts.evaluation.eval_framework import ( | |
| 24 | + DEFAULT_ARTIFACT_ROOT, | |
| 25 | + DEFAULT_QUERY_FILE, | |
| 26 | + ensure_dir, | |
| 27 | + utc_now_iso, | |
| 28 | + utc_timestamp, | |
| 29 | +) | |
| 30 | + | |
| 31 | + | |
| 32 | +CONFIG_PATH = PROJECT_ROOT / "config" / "config.yaml" | |
| 33 | + | |
| 34 | + | |
@dataclass
class ExperimentSpec:
    """One tuning experiment: a named set of config overrides to evaluate."""

    name: str  # unique experiment name; used in file names and the report
    description: str  # human-readable summary of what this experiment changes
    params: Dict[str, Any]  # dotted-config-path -> override value (applied to config.yaml)
| 40 | + | |
| 41 | + | |
def load_yaml(path: Path) -> Dict[str, Any]:
    """Parse a YAML file and return its top-level mapping."""
    raw_text = path.read_text(encoding="utf-8")
    return yaml.safe_load(raw_text)
| 44 | + | |
| 45 | + | |
def write_yaml(path: Path, payload: Dict[str, Any]) -> None:
    """Serialize payload to YAML (insertion order kept, unicode unescaped)."""
    serialized = yaml.safe_dump(payload, sort_keys=False, allow_unicode=True)
    path.write_text(serialized, encoding="utf-8")
| 51 | + | |
| 52 | + | |
def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> None:
    """Set payload["a"]["b"]["c"] = value for dotted_path "a.b.c", in place.

    Intermediate mappings are created on demand, so an experiment may
    override a key that does not yet exist in the base config (the previous
    implementation raised KeyError for any missing intermediate key).
    """
    keys = dotted_path.split(".")
    node = payload
    for key in keys[:-1]:
        # setdefault keeps an existing sub-mapping and creates one otherwise.
        node = node.setdefault(key, {})
    node[keys[-1]] = value
| 59 | + | |
| 60 | + | |
def apply_params(base_config: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
    """Return a deep copy of base_config with every dotted-path override applied."""
    updated = copy.deepcopy(base_config)
    for dotted_path, override in params.items():
        set_nested_value(updated, dotted_path, override)
    return updated
| 66 | + | |
| 67 | + | |
def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any]:
    """Poll {base_url}/health every 2s until it reports "healthy" or the
    deadline passes; returns the health payload or raises RuntimeError."""
    health_url = f"{base_url.rstrip('/')}/health"
    give_up_at = time.time() + timeout_sec
    last_error = None
    while time.time() < give_up_at:
        try:
            response = requests.get(health_url, timeout=10)
            response.raise_for_status()
            payload = response.json()
            if str(payload.get("status")) == "healthy":
                return payload
            # Reachable but not healthy yet: remember the payload for the error.
            last_error = payload
        except Exception as exc:  # noqa: BLE001
            last_error = str(exc)
        time.sleep(2.0)
    raise RuntimeError(f"backend did not become healthy: {last_error}")
| 83 | + | |
| 84 | + | |
def run_restart() -> None:
    """Restart the backend via ./restart.sh from the project root; raises on
    non-zero exit (check=True) or if the restart exceeds 10 minutes."""
    subprocess.run(["./restart.sh", "backend"], cwd=PROJECT_ROOT, check=True, timeout=600)
| 87 | + | |
| 88 | + | |
def read_queries(path: Path) -> List[str]:
    """Read one query per line from a text file, skipping blank lines and
    lines whose first non-whitespace character is '#'."""
    queries: List[str] = []
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        stripped = raw_line.strip()
        if stripped and not stripped.startswith("#"):
            queries.append(stripped)
    return queries
| 95 | + | |
| 96 | + | |
def run_batch_eval(
    *,
    tenant_id: str,
    queries_file: Path,
    top_k: int,
    language: str,
    force_refresh_labels: bool,
) -> Dict[str, Any]:
    """Run the batch-eval CLI in a subprocess and parse its summary line.

    Returns {"batch_id", "aggregate_metrics", "raw_output"}. Raises
    subprocess.CalledProcessError when the CLI exits non-zero and
    RuntimeError when the summary line cannot be found in the output.
    """
    import ast  # local import: only needed to parse the printed metrics dict

    cmd = [
        str(PROJECT_ROOT / ".venv" / "bin" / "python"),
        "scripts/evaluation/build_annotation_set.py",
        "batch",
        "--tenant-id",
        str(tenant_id),
        "--queries-file",
        str(queries_file),
        "--top-k",
        str(top_k),
        "--language",
        language,
    ]
    if force_refresh_labels:
        cmd.append("--force-refresh-labels")
    completed = subprocess.run(
        cmd,
        cwd=PROJECT_ROOT,
        check=True,
        capture_output=True,
        text=True,
        timeout=7200,
    )
    output = (completed.stdout or "") + "\n" + (completed.stderr or "")
    match = re.search(r"batch_id=([A-Za-z0-9_]+)\s+aggregate_metrics=(\{.*\})", output)
    if not match:
        # Keep only the tail so the exception stays readable.
        raise RuntimeError(f"failed to parse batch output: {output[-2000:]}")
    batch_id = match.group(1)
    # The CLI prints the metrics dict via repr(); ast.literal_eval parses
    # that directly and safely. The previous quote-swap + json.loads broke
    # whenever a value contained an apostrophe or a non-JSON literal.
    aggregate_metrics = ast.literal_eval(match.group(2))
    return {
        "batch_id": batch_id,
        "aggregate_metrics": aggregate_metrics,
        "raw_output": output,
    }
| 139 | + | |
| 140 | + | |
def render_markdown(summary: Dict[str, Any]) -> str:
    """Render the tuning-run summary as a Markdown report string."""
    report: List[str] = [
        "# Fusion Tuning Report",
        "",
        f"- Created at: {summary['created_at']}",
        f"- Tenant ID: {summary['tenant_id']}",
        f"- Query count: {summary['query_count']}",
        f"- Top K: {summary['top_k']}",
        f"- Score metric: {summary['score_metric']}",
        "",
        "## Experiments",
        "",
        "| Rank | Name | Score | MAP_3 | MAP_2_3 | P@5 | P@10 | Config |",
        "|---|---|---:|---:|---:|---:|---:|---|",
    ]
    # Experiments are assumed pre-ranked; rank is just the 1-based position.
    for rank, entry in enumerate(summary["experiments"], start=1):
        m = entry["aggregate_metrics"]
        report.append(
            f"| {rank} | {entry['name']} | {entry['score']} | {m.get('MAP_3', '')} | "
            f"{m.get('MAP_2_3', '')} | {m.get('P@5', '')} | {m.get('P@10', '')} | "
            f"{entry['config_snapshot_path']} |"
        )
    report.extend(["", "## Details", ""])
    for entry in summary["experiments"]:
        report.extend(
            [
                f"### {entry['name']}",
                "",
                f"- Description: {entry['description']}",
                f"- Score: {entry['score']}",
                f"- Params: `{json.dumps(entry['params'], ensure_ascii=False, sort_keys=True)}`",
                f"- Batch report: {entry['batch_report_path']}",
                "",
            ]
        )
    return "\n".join(report)
| 184 | + | |
| 185 | + | |
def load_experiments(path: Path) -> List[ExperimentSpec]:
    """Load experiment specs from JSON; accepts either a bare list or an
    object of the form {"experiments": [...]}. Missing description/params
    default to empty values."""
    raw = json.loads(path.read_text(encoding="utf-8"))
    entries = raw["experiments"] if isinstance(raw, dict) else raw
    return [
        ExperimentSpec(
            name=str(entry["name"]),
            description=str(entry.get("description") or ""),
            params=dict(entry.get("params") or {}),
        )
        for entry in entries
    ]
| 199 | + | |
| 200 | + | |
def build_parser() -> argparse.ArgumentParser:
    """CLI options for the fusion tuning driver."""
    cli = argparse.ArgumentParser(description="Run fusion tuning experiments against the live backend")
    # Evaluation target and query set.
    cli.add_argument("--tenant-id", default="163")
    cli.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
    cli.add_argument("--top-k", type=int, default=100)
    cli.add_argument("--language", default="en")
    # Experiment plan and scoring.
    cli.add_argument("--experiments-file", required=True)
    cli.add_argument("--search-base-url", default="http://127.0.0.1:6002")
    cli.add_argument("--score-metric", default="MAP_3")
    # Behavior toggles.
    cli.add_argument("--apply-best", action="store_true")
    cli.add_argument("--force-refresh-labels-first-pass", action="store_true")
    return cli
| 213 | + | |
| 214 | + | |
def main() -> None:
    """Run every experiment against the live backend and write a summary.

    For each experiment the active config file (CONFIG_PATH) is overwritten
    with the candidate parameters, the backend is restarted, and a batch
    evaluation is scored. The finally-block guarantees the config ends up
    either restored to its original text or, with --apply-best, pinned to
    the best-scoring candidate — followed by a restart in both cases.
    Summary JSON/Markdown are only written when all experiments succeed.
    """
    cli = build_parser().parse_args()
    query_path = Path(cli.queries_file)
    queries = read_queries(query_path)
    # Keep the raw text so the exact original file can be restored verbatim.
    original_config_text = CONFIG_PATH.read_text(encoding="utf-8")
    baseline_config = load_yaml(CONFIG_PATH)
    experiments = load_experiments(Path(cli.experiments_file))

    tuning_root = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs")
    run_id = f"tuning_{utc_timestamp()}"
    run_dir = ensure_dir(tuning_root / run_id)
    outcomes: List[Dict[str, Any]] = []

    try:
        for spec in experiments:
            # NOTE(review): assumes apply_params returns a fresh config and
            # does not mutate baseline_config in place — confirm in helper.
            candidate = apply_params(baseline_config, spec.params)
            write_yaml(CONFIG_PATH, candidate)
            snapshot_path = run_dir / f"{spec.name}_config.yaml"
            write_yaml(snapshot_path, candidate)

            run_restart()
            health = wait_for_backend(cli.search_base_url)
            # Labels may only be force-refreshed on the very first pass.
            first_pass = not outcomes
            batch = run_batch_eval(
                tenant_id=cli.tenant_id,
                queries_file=query_path,
                top_k=cli.top_k,
                language=cli.language,
                force_refresh_labels=bool(
                    cli.force_refresh_labels_first_pass and first_pass
                ),
            )
            metrics = dict(batch["aggregate_metrics"])
            report_path = (
                DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch['batch_id']}.md"
            )
            outcomes.append(
                {
                    "name": spec.name,
                    "description": spec.description,
                    "params": spec.params,
                    "aggregate_metrics": metrics,
                    # Missing score metric counts as 0.0 rather than erroring.
                    "score": float(metrics.get(cli.score_metric, 0.0)),
                    "batch_id": batch["batch_id"],
                    "batch_report_path": str(report_path),
                    "config_snapshot_path": str(snapshot_path),
                    "backend_health": health,
                    "batch_stdout": batch["raw_output"],
                }
            )
            print(
                f"[tune] {spec.name} score={metrics.get(cli.score_metric)} "
                f"metrics={metrics}"
            )
    finally:
        # Leave the live config in a known state even if a run blew up.
        if cli.apply_best and outcomes:
            winner = max(outcomes, key=lambda entry: entry["score"])
            write_yaml(CONFIG_PATH, apply_params(baseline_config, winner["params"]))
        else:
            CONFIG_PATH.write_text(original_config_text, encoding="utf-8")
        run_restart()
        wait_for_backend(cli.search_base_url)

    outcomes.sort(key=lambda entry: entry["score"], reverse=True)
    summary = {
        "run_id": run_id,
        "created_at": utc_now_iso(),
        "tenant_id": cli.tenant_id,
        "query_count": len(queries),
        "top_k": cli.top_k,
        "score_metric": cli.score_metric,
        "experiments": outcomes,
    }
    summary_json_path = run_dir / "summary.json"
    summary_md_path = run_dir / "summary.md"
    summary_json_path.write_text(
        json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8"
    )
    summary_md_path.write_text(render_markdown(summary), encoding="utf-8")
    print(f"[done] summary_json={summary_json_path}")
    print(f"[done] summary_md={summary_md_path}")
| 293 | + | |
| 294 | + | |
if __name__ == "__main__":
    # Script entry point: delegates all work (arg parsing included) to main().
    main()