Commit 432d1c88efc7d32fd4ed509b3e6ad6d80acf4342

Authored by tangwang
1 parent 267920e5

评估框架

config/config.yaml
1   -# Unified Configuration for Multi-Tenant Search Engine
2   -# 统一配置文件,所有租户共用一套配置
3   -# 注意:索引结构由 mappings/search_products.json 定义,此文件只配置搜索行为
4   -#
5   -# 约定:下列键为必填;进程环境变量可覆盖 infrastructure / runtime 中同名语义项
6   -#(如 ES_HOST、API_PORT 等),未设置环境变量时使用本文件中的值。
7   -
8   -# Process / bind addresses (环境变量 APP_ENV、RUNTIME_ENV、ES_INDEX_NAMESPACE 可覆盖前两者的语义)
9 1 runtime:
10   - environment: "prod"
11   - index_namespace: ""
12   - api_host: "0.0.0.0"
  2 + environment: prod
  3 + index_namespace: ''
  4 + api_host: 0.0.0.0
13 5 api_port: 6002
14   - indexer_host: "0.0.0.0"
  6 + indexer_host: 0.0.0.0
15 7 indexer_port: 6004
16   - embedding_host: "0.0.0.0"
  8 + embedding_host: 0.0.0.0
17 9 embedding_port: 6005
18 10 embedding_text_port: 6005
19 11 embedding_image_port: 6008
20   - translator_host: "0.0.0.0"
  12 + translator_host: 0.0.0.0
21 13 translator_port: 6006
22   - reranker_host: "0.0.0.0"
  14 + reranker_host: 0.0.0.0
23 15 reranker_port: 6007
24   -
25   -# 基础设施连接(敏感项优先读环境变量:ES_*、REDIS_*、DB_*、DASHSCOPE_API_KEY、DEEPL_AUTH_KEY)
26 16 infrastructure:
27 17 elasticsearch:
28   - host: "http://localhost:9200"
  18 + host: http://localhost:9200
29 19 username: null
30 20 password: null
31 21 redis:
32   - host: "localhost"
  22 + host: localhost
33 23 port: 6479
34 24 snapshot_db: 0
35 25 password: null
... ... @@ -37,8 +27,8 @@ infrastructure:
37 27 socket_connect_timeout: 1
38 28 retry_on_timeout: false
39 29 cache_expire_days: 720
40   - embedding_cache_prefix: "embedding"
41   - anchor_cache_prefix: "product_anchors"
  30 + embedding_cache_prefix: embedding
  31 + anchor_cache_prefix: product_anchors
42 32 anchor_cache_expire_days: 30
43 33 database:
44 34 host: null
... ... @@ -49,30 +39,16 @@ infrastructure:
49 39 secrets:
50 40 dashscope_api_key: null
51 41 deepl_auth_key: null
52   -
53   -# Elasticsearch Index
54   -es_index_name: "search_products"
55   -
56   -# 检索域 / 索引列表(可为空列表;每项字段均需显式给出)
  42 +es_index_name: search_products
57 43 indexes: []
58   -
59   -# Config assets
60 44 assets:
61   - query_rewrite_dictionary_path: "config/dictionaries/query_rewrite.dict"
62   -
63   -# Product content understanding (LLM enrich-content) configuration
  45 + query_rewrite_dictionary_path: config/dictionaries/query_rewrite.dict
64 46 product_enrich:
65 47 max_workers: 40
66   -
67   -# ES Index Settings (基础设置)
68 48 es_settings:
69 49 number_of_shards: 1
70 50 number_of_replicas: 0
71   - refresh_interval: "30s"
72   -
73   -# 字段权重配置(用于搜索时的字段boost)
74   -# 统一按“字段基名”配置;查询时按实际检索语言动态拼接 .{lang}。
75   -# 若需要按某个语言单独调权,也可以加显式 key(例如 title.de: 3.2)。
  51 + refresh_interval: 30s
76 52 field_boosts:
77 53 title: 3.0
78 54 qanchors: 2.5
... ... @@ -85,79 +61,64 @@ field_boosts:
85 61 option1_values: 1.5
86 62 option2_values: 1.5
87 63 option3_values: 1.5
88   -
89   -# Query Configuration(查询配置)
90 64 query_config:
91   - # 支持的语言
92 65 supported_languages:
93   - - "zh"
94   - - "en"
95   - default_language: "en"
96   -
97   - # 功能开关(翻译开关由tenant_config控制)
  66 + - zh
  67 + - en
  68 + default_language: en
98 69 enable_text_embedding: true
99 70 enable_query_rewrite: true
100   -
101   - # 查询翻译模型(须与 services.translation.capabilities 中某项一致)
102   - # 源语种在租户 index_languages 内:主召回可打在源语种字段,用下面三项。
103   - zh_to_en_model: "nllb-200-distilled-600m" # "opus-mt-zh-en"
104   - en_to_zh_model: "nllb-200-distilled-600m" # "opus-mt-en-zh"
105   - default_translation_model: "nllb-200-distilled-600m"
106   - # zh_to_en_model: "deepl"
107   - # en_to_zh_model: "deepl"
108   - # default_translation_model: "deepl"
109   - # 源语种不在 index_languages:翻译对可检索文本更关键,可单独指定(缺省则与上一组相同)
110   - zh_to_en_model__source_not_in_index: "nllb-200-distilled-600m"
111   - en_to_zh_model__source_not_in_index: "nllb-200-distilled-600m"
112   - default_translation_model__source_not_in_index: "nllb-200-distilled-600m"
113   - # zh_to_en_model__source_not_in_index: "deepl"
114   - # en_to_zh_model__source_not_in_index: "deepl"
115   - # default_translation_model__source_not_in_index: "deepl"
116   -
117   - # 查询解析阶段:翻译与 query 向量并发执行,共用同一等待预算(毫秒)。
118   - # 检测语言已在租户 index_languages 内:较短;不在索引语言内:较长(翻译对召回更关键)。
119   - translation_embedding_wait_budget_ms_source_in_index: 200 # 80
120   - translation_embedding_wait_budget_ms_source_not_in_index: 300 #200
121   -
  71 + zh_to_en_model: nllb-200-distilled-600m
  72 + en_to_zh_model: nllb-200-distilled-600m
  73 + default_translation_model: nllb-200-distilled-600m
  74 + zh_to_en_model__source_not_in_index: nllb-200-distilled-600m
  75 + en_to_zh_model__source_not_in_index: nllb-200-distilled-600m
  76 + default_translation_model__source_not_in_index: nllb-200-distilled-600m
  77 + translation_embedding_wait_budget_ms_source_in_index: 200
  78 + translation_embedding_wait_budget_ms_source_not_in_index: 300
122 79 style_intent:
123 80 enabled: true
124 81 selected_sku_boost: 1.2
125   - color_dictionary_path: "config/dictionaries/style_intent_color.csv"
126   - size_dictionary_path: "config/dictionaries/style_intent_size.csv"
  82 + color_dictionary_path: config/dictionaries/style_intent_color.csv
  83 + size_dictionary_path: config/dictionaries/style_intent_size.csv
127 84 dimension_aliases:
128   - color: ["color", "colors", "colour", "colours", "颜色", "色", "色系"]
129   - size: ["size", "sizes", "sizing", "尺码", "尺寸", "码数", "号码", "码"]
130   -
  85 + color:
  86 + - color
  87 + - colors
  88 + - colour
  89 + - colours
  90 + - 颜色
  91 + - 色
  92 + - 色系
  93 + size:
  94 + - size
  95 + - sizes
  96 + - sizing
  97 + - 尺码
  98 + - 尺寸
  99 + - 码数
  100 + - 号码
  101 + - 码
131 102 product_title_exclusion:
132 103 enabled: true
133   - dictionary_path: "config/dictionaries/product_title_exclusion.tsv"
134   -
135   - # 动态多语言检索字段配置
136   - # multilingual_fields 会被拼成 title.{lang}/brief.{lang}/... 形式;
137   - # shared_fields 为无语言后缀字段。
  104 + dictionary_path: config/dictionaries/product_title_exclusion.tsv
138 105 search_fields:
139 106 multilingual_fields:
140   - - "title"
141   - - "qanchors"
142   - - "category_path"
143   - - "category_name_text"
144   - - "brief"
145   - - "description"
146   - - "vendor"
147   - shared_fields:
148   - # - "tags"
149   - # - "option1_values"
150   - # - "option2_values"
151   - # - "option3_values"
  107 + - title
  108 + - qanchors
  109 + - category_path
  110 + - category_name_text
  111 + - brief
  112 + - description
  113 + - vendor
  114 + shared_fields: null
152 115 core_multilingual_fields:
153   - - "title"
154   - - "qanchors"
155   - - "category_name_text"
156   -
157   - # 统一文本召回策略(主查询 + 翻译查询)
  116 + - title
  117 + - qanchors
  118 + - category_name_text
158 119 text_query_strategy:
159   - base_minimum_should_match: "60%"
160   - translation_minimum_should_match: "60%"
  120 + base_minimum_should_match: 60%
  121 + translation_minimum_should_match: 60%
161 122 translation_boost: 0.75
162 123 tie_breaker_base_query: 0.5
163 124 best_fields_boost: 2.0
... ... @@ -169,67 +130,51 @@ query_config:
169 130 title: 5.0
170 131 qanchors: 4.0
171 132 phrase_match_boost: 3.0
172   -
173   - # Embedding字段名称
174   - text_embedding_field: "title_embedding"
175   - image_embedding_field: "image_embedding.vector"
176   -
177   - # 返回字段配置(_source includes)
178   - # null表示返回所有字段,[]表示不返回任何字段,列表表示只返回指定字段
179   - # 下列字段与 api/result_formatter.py(SpuResult 填充)及 search/searcher.py(SKU 排序/主图替换)一致
  133 + text_embedding_field: title_embedding
  134 + image_embedding_field: image_embedding.vector
180 135 source_fields:
181   - - spu_id
182   - - handle
183   - - title
184   - - brief
185   - - description
186   - - vendor
187   - - category_name
188   - - category_name_text
189   - - category_path
190   - - category_id
191   - - category_level
192   - - category1_name
193   - - category2_name
194   - - category3_name
195   - - tags
196   - - min_price
197   - - compare_at_price
198   - - image_url
199   - - sku_prices
200   - - sku_weights
201   - - sku_weight_units
202   - - total_inventory
203   - - option1_name
204   - - option1_values
205   - - option2_name
206   - - option2_values
207   - - option3_name
208   - - option3_values
209   - - specifications
210   - - skus
211   -
212   - # KNN:文本向量与多模态(图片)向量各自 boost 与召回(k / num_candidates)
  136 + - spu_id
  137 + - handle
  138 + - title
  139 + - brief
  140 + - description
  141 + - vendor
  142 + - category_name
  143 + - category_name_text
  144 + - category_path
  145 + - category_id
  146 + - category_level
  147 + - category1_name
  148 + - category2_name
  149 + - category3_name
  150 + - tags
  151 + - min_price
  152 + - compare_at_price
  153 + - image_url
  154 + - sku_prices
  155 + - sku_weights
  156 + - sku_weight_units
  157 + - total_inventory
  158 + - option1_name
  159 + - option1_values
  160 + - option2_name
  161 + - option2_values
  162 + - option3_name
  163 + - option3_values
  164 + - specifications
  165 + - skus
213 166 knn_text_boost: 4
214 167 knn_image_boost: 4
215   -
216   - # knn_text_num_candidates = k * 3.4
217 168 knn_text_k: 160
218 169 knn_text_num_candidates: 560
219   -
220 170 knn_text_k_long: 400
221 171 knn_text_num_candidates_long: 1200
222   -
223 172 knn_image_k: 400
224 173 knn_image_num_candidates: 1200
225   -
226   -# Function Score配置(ES层打分规则)
227 174 function_score:
228   - score_mode: "sum"
229   - boost_mode: "multiply"
  175 + score_mode: sum
  176 + boost_mode: multiply
230 177 functions: []
231   -
232   -# 粗排配置(仅融合 ES 文本/向量信号,不调用模型)
233 178 coarse_rank:
234 179 enabled: true
235 180 input_window: 700
... ... @@ -237,69 +182,52 @@ coarse_rank:
237 182 fusion:
238 183 text_bias: 0.1
239 184 text_exponent: 0.35
240   - # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合)
241   - # 因为es的打分已经给了trans进行了折扣,所以这里不再继续折扣
242 185 text_translation_weight: 1.0
243 186 knn_text_weight: 1.0
244 187 knn_image_weight: 1.0
245 188 knn_tie_breaker: 0.1
246 189 knn_bias: 0.6
247 190 knn_exponent: 0.0
248   -
249   -# 精排配置(轻量 reranker)
250 191 fine_rank:
251 192 enabled: false
252 193 input_window: 160
253 194 output_window: 80
254 195 timeout_sec: 10.0
255   - rerank_query_template: "{query}"
256   - rerank_doc_template: "{title}"
257   - service_profile: "fine"
258   -
259   -# 重排配置(provider/URL 在 services.rerank)
  196 + rerank_query_template: '{query}'
  197 + rerank_doc_template: '{title}'
  198 + service_profile: fine
260 199 rerank:
261 200 enabled: true
262 201 rerank_window: 160
263 202 timeout_sec: 15.0
264 203 weight_es: 0.4
265 204 weight_ai: 0.6
266   - rerank_query_template: "{query}"
267   - rerank_doc_template: "{title}"
268   - service_profile: "default"
269   - # 乘法融合:fused = Π (max(score,0) + bias) ** exponent(rerank / text / knn 三项)
270   - # 其中 knn_score 先做一层 dis_max:
271   - # max(knn_text_weight * text_knn, knn_image_weight * image_knn)
272   - # + knn_tie_breaker * 另一侧较弱信号
  205 + rerank_query_template: '{query}'
  206 + rerank_doc_template: '{title}'
  207 + service_profile: default
273 208 fusion:
274   - rerank_bias: 0.00001
275   - rerank_exponent: 1.0
276   - fine_bias: 0.00001
  209 + rerank_bias: 1.0e-05
  210 + rerank_exponent: 1.15
  211 + fine_bias: 1.0e-05
277 212 fine_exponent: 1.0
278 213 text_bias: 0.1
279   - text_exponent: 0.35
280   - # base_query_trans_* 相对 base_query 的权重(见 search/rerank_client 中文本 dismax 融合)
281   - text_translation_weight: 1.0
  214 + text_exponent: 0.25
  215 + text_translation_weight: 0.8
282 216 knn_text_weight: 1.0
283 217 knn_image_weight: 1.0
284 218 knn_tie_breaker: 0.1
285 219 knn_bias: 0.6
286 220 knn_exponent: 0.0
287   -
288   -# 可扩展服务/provider 注册表(单一配置源)
289 221 services:
290 222 translation:
291   - service_url: "http://127.0.0.1:6006"
292   - # default_model: "nllb-200-distilled-600m"
293   - default_model: "nllb-200-distilled-600m"
294   - default_scene: "general"
  223 + service_url: http://127.0.0.1:6006
  224 + default_model: nllb-200-distilled-600m
  225 + default_scene: general
295 226 timeout_sec: 10.0
296 227 cache:
297 228 ttl_seconds: 62208000
298 229 sliding_expiration: true
299   - # When false, cache keys are exact-match per request model only (ignores model_quality_tiers for lookups).
300 230 enable_model_quality_tier_cache: true
301   - # Higher tier = better quality. Multiple models may share one tier (同级).
302   - # A request may reuse Redis keys from models with tier > A or tier == A (not from lower tiers).
303 231 model_quality_tiers:
304 232 deepl: 30
305 233 qwen-mt: 30
... ... @@ -310,43 +238,43 @@ services:
310 238 capabilities:
311 239 qwen-mt:
312 240 enabled: true
313   - backend: "qwen_mt"
314   - model: "qwen-mt-flash"
315   - base_url: "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
  241 + backend: qwen_mt
  242 + model: qwen-mt-flash
  243 + base_url: https://dashscope-us.aliyuncs.com/compatible-mode/v1
316 244 timeout_sec: 10.0
317 245 use_cache: true
318 246 llm:
319 247 enabled: true
320   - backend: "llm"
321   - model: "qwen-flash"
322   - base_url: "https://dashscope-us.aliyuncs.com/compatible-mode/v1"
  248 + backend: llm
  249 + model: qwen-flash
  250 + base_url: https://dashscope-us.aliyuncs.com/compatible-mode/v1
323 251 timeout_sec: 30.0
324 252 use_cache: true
325 253 deepl:
326 254 enabled: true
327   - backend: "deepl"
328   - api_url: "https://api.deepl.com/v2/translate"
  255 + backend: deepl
  256 + api_url: https://api.deepl.com/v2/translate
329 257 timeout_sec: 10.0
330   - glossary_id: ""
  258 + glossary_id: ''
331 259 use_cache: true
332 260 nllb-200-distilled-600m:
333 261 enabled: true
334   - backend: "local_nllb"
335   - model_id: "facebook/nllb-200-distilled-600M"
336   - model_dir: "./models/translation/facebook/nllb-200-distilled-600M"
337   - ct2_model_dir: "./models/translation/facebook/nllb-200-distilled-600M/ctranslate2-float16"
338   - ct2_compute_type: "float16"
339   - ct2_conversion_quantization: "float16"
  262 + backend: local_nllb
  263 + model_id: facebook/nllb-200-distilled-600M
  264 + model_dir: ./models/translation/facebook/nllb-200-distilled-600M
  265 + ct2_model_dir: ./models/translation/facebook/nllb-200-distilled-600M/ctranslate2-float16
  266 + ct2_compute_type: float16
  267 + ct2_conversion_quantization: float16
340 268 ct2_auto_convert: true
341 269 ct2_inter_threads: 4
342 270 ct2_intra_threads: 0
343 271 ct2_max_queued_batches: 32
344   - ct2_batch_type: "examples"
345   - ct2_decoding_length_mode: "source"
  272 + ct2_batch_type: examples
  273 + ct2_decoding_length_mode: source
346 274 ct2_decoding_length_extra: 8
347 275 ct2_decoding_length_min: 32
348   - device: "cuda"
349   - torch_dtype: "float16"
  276 + device: cuda
  277 + torch_dtype: float16
350 278 batch_size: 64
351 279 max_input_length: 256
352 280 max_new_tokens: 64
... ... @@ -354,19 +282,19 @@ services:
354 282 use_cache: true
355 283 opus-mt-zh-en:
356 284 enabled: false
357   - backend: "local_marian"
358   - model_id: "Helsinki-NLP/opus-mt-zh-en"
359   - model_dir: "./models/translation/Helsinki-NLP/opus-mt-zh-en"
360   - ct2_model_dir: "./models/translation/Helsinki-NLP/opus-mt-zh-en/ctranslate2-float16"
361   - ct2_compute_type: "float16"
362   - ct2_conversion_quantization: "float16"
  285 + backend: local_marian
  286 + model_id: Helsinki-NLP/opus-mt-zh-en
  287 + model_dir: ./models/translation/Helsinki-NLP/opus-mt-zh-en
  288 + ct2_model_dir: ./models/translation/Helsinki-NLP/opus-mt-zh-en/ctranslate2-float16
  289 + ct2_compute_type: float16
  290 + ct2_conversion_quantization: float16
363 291 ct2_auto_convert: true
364 292 ct2_inter_threads: 1
365 293 ct2_intra_threads: 0
366 294 ct2_max_queued_batches: 0
367   - ct2_batch_type: "examples"
368   - device: "cuda"
369   - torch_dtype: "float16"
  295 + ct2_batch_type: examples
  296 + device: cuda
  297 + torch_dtype: float16
370 298 batch_size: 16
371 299 max_input_length: 256
372 300 max_new_tokens: 256
... ... @@ -374,181 +302,147 @@ services:
374 302 use_cache: true
375 303 opus-mt-en-zh:
376 304 enabled: false
377   - backend: "local_marian"
378   - model_id: "Helsinki-NLP/opus-mt-en-zh"
379   - model_dir: "./models/translation/Helsinki-NLP/opus-mt-en-zh"
380   - ct2_model_dir: "./models/translation/Helsinki-NLP/opus-mt-en-zh/ctranslate2-float16"
381   - ct2_compute_type: "float16"
382   - ct2_conversion_quantization: "float16"
  305 + backend: local_marian
  306 + model_id: Helsinki-NLP/opus-mt-en-zh
  307 + model_dir: ./models/translation/Helsinki-NLP/opus-mt-en-zh
  308 + ct2_model_dir: ./models/translation/Helsinki-NLP/opus-mt-en-zh/ctranslate2-float16
  309 + ct2_compute_type: float16
  310 + ct2_conversion_quantization: float16
383 311 ct2_auto_convert: true
384 312 ct2_inter_threads: 1
385 313 ct2_intra_threads: 0
386 314 ct2_max_queued_batches: 0
387   - ct2_batch_type: "examples"
388   - device: "cuda"
389   - torch_dtype: "float16"
  315 + ct2_batch_type: examples
  316 + device: cuda
  317 + torch_dtype: float16
390 318 batch_size: 16
391 319 max_input_length: 256
392 320 max_new_tokens: 256
393 321 num_beams: 1
394 322 use_cache: true
395 323 embedding:
396   - provider: "http" # http
  324 + provider: http
397 325 providers:
398 326 http:
399   - text_base_url: "http://127.0.0.1:6005"
400   - image_base_url: "http://127.0.0.1:6008"
401   - # 服务内文本后端(embedding 进程启动时读取)
402   - backend: "tei" # tei | local_st
  327 + text_base_url: http://127.0.0.1:6005
  328 + image_base_url: http://127.0.0.1:6008
  329 + backend: tei
403 330 backends:
404 331 tei:
405   - base_url: "http://127.0.0.1:8080"
  332 + base_url: http://127.0.0.1:8080
406 333 timeout_sec: 20
407   - model_id: "Qwen/Qwen3-Embedding-0.6B"
  334 + model_id: Qwen/Qwen3-Embedding-0.6B
408 335 local_st:
409   - model_id: "Qwen/Qwen3-Embedding-0.6B"
410   - device: "cuda"
  336 + model_id: Qwen/Qwen3-Embedding-0.6B
  337 + device: cuda
411 338 batch_size: 32
412 339 normalize_embeddings: true
413   - # 服务内图片后端(embedding 进程启动时读取;cnclip gRPC 与 6008 须同一 model_name)
414   - # Chinese-CLIP:ViT-H-14 → 1024 维,ViT-L-14 → 768 维。须与 mappings/search_products.json 中
415   - # image_embedding.vector.dims 一致(当前索引为 1024 → 默认 ViT-H-14)。
416   - image_backend: "clip_as_service" # clip_as_service | local_cnclip
  340 + image_backend: clip_as_service
417 341 image_backends:
418 342 clip_as_service:
419   - server: "grpc://127.0.0.1:51000"
420   - model_name: "CN-CLIP/ViT-L-14"
  343 + server: grpc://127.0.0.1:51000
  344 + model_name: CN-CLIP/ViT-L-14
421 345 batch_size: 8
422 346 normalize_embeddings: true
423 347 local_cnclip:
424   - model_name: "ViT-L-14"
  348 + model_name: ViT-L-14
425 349 device: null
426 350 batch_size: 8
427 351 normalize_embeddings: true
428 352 rerank:
429   - provider: "http"
  353 + provider: http
430 354 providers:
431 355 http:
432 356 instances:
433 357 default:
434   - base_url: "http://127.0.0.1:6007"
435   - service_url: "http://127.0.0.1:6007/rerank"
  358 + base_url: http://127.0.0.1:6007
  359 + service_url: http://127.0.0.1:6007/rerank
436 360 fine:
437   - base_url: "http://127.0.0.1:6009"
438   - service_url: "http://127.0.0.1:6009/rerank"
  361 + base_url: http://127.0.0.1:6009
  362 + service_url: http://127.0.0.1:6009/rerank
439 363 request:
440 364 max_docs: 1000
441 365 normalize: true
442   - default_instance: "default"
443   - # 命名实例:同一套 reranker 代码按实例名读取不同端口 / 后端 / runtime 目录。
  366 + default_instance: default
444 367 instances:
445 368 default:
446   - host: "0.0.0.0"
  369 + host: 0.0.0.0
447 370 port: 6007
448   - backend: "qwen3_vllm_score"
449   - runtime_dir: "./.runtime/reranker/default"
  371 + backend: qwen3_vllm_score
  372 + runtime_dir: ./.runtime/reranker/default
450 373 fine:
451   - host: "0.0.0.0"
  374 + host: 0.0.0.0
452 375 port: 6009
453   - backend: "bge"
454   - runtime_dir: "./.runtime/reranker/fine"
  376 + backend: bge
  377 + runtime_dir: ./.runtime/reranker/fine
455 378 backends:
456 379 bge:
457   - model_name: "BAAI/bge-reranker-v2-m3"
  380 + model_name: BAAI/bge-reranker-v2-m3
458 381 device: null
459 382 use_fp16: true
460 383 batch_size: 80
461 384 max_length: 160
462   - cache_dir: "./model_cache"
  385 + cache_dir: ./model_cache
463 386 enable_warmup: true
464 387 jina_reranker_v3:
465   - model_name: "jinaai/jina-reranker-v3"
  388 + model_name: jinaai/jina-reranker-v3
466 389 device: null
467   - dtype: "float16"
  390 + dtype: float16
468 391 batch_size: 64
469 392 max_doc_length: 160
470 393 max_query_length: 64
471 394 sort_by_doc_length: true
472   - cache_dir: "./model_cache"
  395 + cache_dir: ./model_cache
473 396 trust_remote_code: true
474 397 qwen3_vllm:
475   - model_name: "Qwen/Qwen3-Reranker-0.6B"
476   - engine: "vllm"
  398 + model_name: Qwen/Qwen3-Reranker-0.6B
  399 + engine: vllm
477 400 max_model_len: 256
478 401 tensor_parallel_size: 1
479   - gpu_memory_utilization: 0.20
480   - dtype: "float16"
  402 + gpu_memory_utilization: 0.2
  403 + dtype: float16
481 404 enable_prefix_caching: true
482 405 enforce_eager: false
483 406 infer_batch_size: 100
484 407 sort_by_doc_length: true
485   - # standard=_format_instruction__standard(固定 yes/no system);compact=_format_instruction(instruction 作 system 且 user 内重复 Instruct)
486   - instruction_format: standard # compact standard
487   - # instruction: "Given a query, score the product for relevance"
488   - # "rank products by given query" 比 “Given a query, score the product for relevance” 更好点
489   - # instruction: "rank products by given query, category match first"
490   - # instruction: "Rank products by query relevance, prioritizing category match"
491   - # instruction: "Rank products by query relevance, prioritizing category and style match"
492   - # instruction: "Rank by query relevance, prioritize category & style"
493   - # instruction: "Relevance ranking: category & style match first"
494   - # instruction: "Score product relevance by query with category & style match prioritized"
495   - # instruction: "Rank products by query with category & style match prioritized"
496   - # instruction: "Given a fashion shopping query, retrieve relevant products that answer the query"
497   - instruction: "rank products by given query"
498   - # vLLM LLM.score()(跨编码打分)。独立高性能环境 .venv-reranker-score(vllm 0.18 固定版):./scripts/setup_reranker_venv.sh qwen3_vllm_score
499   - # 与 qwen3_vllm 可共用同一 model_name / HF 缓存;venv 分离以便升级 vLLM 而不影响 generate 后端。
  408 + instruction_format: standard
  409 + instruction: rank products by given query
500 410 qwen3_vllm_score:
501   - model_name: "Qwen/Qwen3-Reranker-0.6B"
502   - # 官方 Hub 原版需 true;若改用已转换的 seq-cls 权重(如 tomaarsen/...-seq-cls)则设为 false
  411 + model_name: Qwen/Qwen3-Reranker-0.6B
503 412 use_original_qwen3_hf_overrides: true
504   - # vllm_runner: "auto"
505   - # vllm_convert: "auto"
506   - # 可选:在 use_original_qwen3_hf_overrides 为 true 时与内置 overrides 合并
507   - # hf_overrides: {}
508   - engine: "vllm"
  413 + engine: vllm
509 414 max_model_len: 172
510 415 tensor_parallel_size: 1
511 416 gpu_memory_utilization: 0.15
512   - dtype: "float16"
  417 + dtype: float16
513 418 enable_prefix_caching: true
514 419 enforce_eager: false
515 420 infer_batch_size: 80
516 421 sort_by_doc_length: true
517   - # 默认 standard 与 vLLM 官方 Qwen3 reranker 前缀一致
518   - instruction_format: standard # compact standard
519   - # instruction: "Rank products by query with category & style match prioritized"
520   - instruction: "Rank products by query with category & style match prioritized"
521   - # instruction: "Given a shopping query, rank products by relevance"
  422 + instruction_format: standard
  423 + instruction: Rank products by query with category & style match prioritized
522 424 qwen3_transformers:
523   - model_name: "Qwen/Qwen3-Reranker-0.6B"
524   - instruction: "rank products by given query"
525   - # instruction: "Score the product’s relevance to the given query"
  425 + model_name: Qwen/Qwen3-Reranker-0.6B
  426 + instruction: rank products by given query
526 427 max_length: 8192
527 428 batch_size: 64
528 429 use_fp16: true
529   - # sdpa:默认无需 flash-attn;若已安装 flash_attn 可改为 flash_attention_2
530   - attn_implementation: "sdpa"
531   - # Packed Transformers backend: shared query prefix + custom position_ids/attention_mask.
532   - # For 1 query + many short docs (for example 400 product titles), this usually reduces
533   - # repeated prefix work and padding waste compared with pairwise batching.
  430 + attn_implementation: sdpa
534 431 qwen3_transformers_packed:
535   - model_name: "Qwen/Qwen3-Reranker-0.6B"
536   - instruction: "Rank products by query with category & style match prioritized"
  432 + model_name: Qwen/Qwen3-Reranker-0.6B
  433 + instruction: Rank products by query with category & style match prioritized
537 434 max_model_len: 256
538 435 max_doc_len: 160
539 436 max_docs_per_pack: 0
540 437 use_fp16: true
541 438 sort_by_doc_length: true
542   - # Packed mode relies on a custom 4D attention mask. "eager" is the safest default.
543   - # If your torch/transformers stack validates it, you can benchmark "sdpa".
544   - attn_implementation: "eager"
  439 + attn_implementation: eager
545 440 qwen3_gguf:
546   - repo_id: "DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF"
547   - filename: "*Q8_0.gguf"
548   - cache_dir: "./model_cache"
549   - local_dir: "./models/reranker/qwen3-reranker-4b-gguf"
550   - instruction: "Rank products by query with category & style match prioritized"
551   - # T4 16GB / 性能优先配置:全量层 offload,实测比保守配置明显更快
  441 + repo_id: DevQuasar/Qwen.Qwen3-Reranker-4B-GGUF
  442 + filename: '*Q8_0.gguf'
  443 + cache_dir: ./model_cache
  444 + local_dir: ./models/reranker/qwen3-reranker-4b-gguf
  445 + instruction: Rank products by query with category & style match prioritized
552 446 n_ctx: 512
553 447 n_batch: 512
554 448 n_ubatch: 512
... ... @@ -562,17 +456,15 @@ services:
562 456 use_mlock: false
563 457 infer_batch_size: 8
564 458 sort_by_doc_length: true
565   - length_sort_mode: "char"
  459 + length_sort_mode: char
566 460 enable_warmup: true
567 461 verbose: false
568 462 qwen3_gguf_06b:
569   - repo_id: "ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF"
570   - filename: "qwen3-reranker-0.6b-q8_0.gguf"
571   - cache_dir: "./model_cache"
572   - local_dir: "./models/reranker/qwen3-reranker-0.6b-q8_0-gguf"
573   - instruction: "Rank products by query with category & style match prioritized"
574   - # 0.6B GGUF / online rerank baseline:
575   - # 实测 400 titles 单请求约 265s,因此它更适合作为低显存功能后备,不适合在线低延迟主路由。
  463 + repo_id: ggml-org/Qwen3-Reranker-0.6B-Q8_0-GGUF
  464 + filename: qwen3-reranker-0.6b-q8_0.gguf
  465 + cache_dir: ./model_cache
  466 + local_dir: ./models/reranker/qwen3-reranker-0.6b-q8_0-gguf
  467 + instruction: Rank products by query with category & style match prioritized
576 468 n_ctx: 256
577 469 n_batch: 256
578 470 n_ubatch: 256
... ... @@ -586,54 +478,57 @@ services:
586 478 use_mlock: false
587 479 infer_batch_size: 32
588 480 sort_by_doc_length: true
589   - length_sort_mode: "char"
  481 + length_sort_mode: char
590 482 reuse_query_state: false
591 483 enable_warmup: true
592 484 verbose: false
593 485 dashscope_rerank:
594   - model_name: "qwen3-rerank"
595   - # 按地域选择 endpoint:
596   - # 中国: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
597   - # 新加坡: https://dashscope-intl.aliyuncs.com/compatible-api/v1/reranks
598   - # 美国: https://dashscope-us.aliyuncs.com/compatible-api/v1/reranks
599   - endpoint: "https://dashscope.aliyuncs.com/compatible-api/v1/reranks"
600   - api_key_env: "RERANK_DASHSCOPE_API_KEY_CN"
601   - timeout_sec: 10.0 #
602   - top_n_cap: 0 # 0 表示 top_n=当前请求文档数;>0 则限制 top_n 上限
603   - batchsize: 64 # 0 关闭;>0 启用并发小包调度(top_n/top_n_cap 仍生效,分包后全局截断)
604   - instruct: "Given a shopping query, rank product titles by relevance"
  486 + model_name: qwen3-rerank
  487 + endpoint: https://dashscope.aliyuncs.com/compatible-api/v1/reranks
  488 + api_key_env: RERANK_DASHSCOPE_API_KEY_CN
  489 + timeout_sec: 10.0
  490 + top_n_cap: 0
  491 + batchsize: 64
  492 + instruct: Given a shopping query, rank product titles by relevance
605 493 max_retries: 2
606 494 retry_backoff_sec: 0.2
607   -
608   -# SPU配置(已启用,使用嵌套skus)
609 495 spu_config:
610 496 enabled: true
611   - spu_field: "spu_id"
  497 + spu_field: spu_id
612 498 inner_hits_size: 10
613   - # 配置哪些option维度参与检索(进索引、以及在线搜索)
614   - # 格式为list,选择option1/option2/option3中的一个或多个
615   - searchable_option_dimensions: ['option1', 'option2', 'option3']
616   -
617   -# 租户配置(Tenant Configuration)
618   -# 每个租户可配置主语言 primary_language 与索引语言 index_languages(主市场语言,商家可勾选)
619   -# 默认 index_languages: [en, zh],可配置为任意 SOURCE_LANG_CODE_MAP.keys() 的子集
  499 + searchable_option_dimensions:
  500 + - option1
  501 + - option2
  502 + - option3
620 503 tenant_config:
621 504 default:
622   - primary_language: "en"
623   - index_languages: ["en", "zh"]
  505 + primary_language: en
  506 + index_languages:
  507 + - en
  508 + - zh
624 509 tenants:
625   - "1":
626   - primary_language: "zh"
627   - index_languages: ["zh", "en"]
628   - "2":
629   - primary_language: "en"
630   - index_languages: ["en", "zh"]
631   - "3":
632   - primary_language: "zh"
633   - index_languages: ["zh", "en"]
634   - "162":
635   - primary_language: "zh"
636   - index_languages: ["zh", "en"]
637   - "170":
638   - primary_language: "en"
639   - index_languages: ["en", "zh"]
  510 + '1':
  511 + primary_language: zh
  512 + index_languages:
  513 + - zh
  514 + - en
  515 + '2':
  516 + primary_language: en
  517 + index_languages:
  518 + - en
  519 + - zh
  520 + '3':
  521 + primary_language: zh
  522 + index_languages:
  523 + - zh
  524 + - en
  525 + '162':
  526 + primary_language: zh
  527 + index_languages:
  528 + - zh
  529 + - en
  530 + '170':
  531 + primary_language: en
  532 + index_languages:
  533 + - en
  534 + - zh
... ...
scripts/evaluation/README.md renamed to scripts/evaluation/README_Requirement.md
scripts/evaluation/README_zh.md renamed to scripts/evaluation/README_Requirement_zh.md
... ... @@ -106,4 +106,18 @@ queries默认是queries/queries.txt,填入左侧列表框,点击其中任何
106 106 批量评估关注的是所有搜索词总体的评估指标。
107 107 需要记录测试环境时间以及当时的配置文件,以及对应的结果。要保存历次的评估记录,并能查到每一次评估结果对应的配置文件及相关的指标
108 108  
109   -以上是我的总体设计,但有不周全的地方。你要站在更高的层次理解我的需求,你有足够的自由可以适当调整设计,基于你所了解的自动化搜索评估框架的最佳实践,做出更优秀的设计和更好的实现。
110 109 \ No newline at end of file
  110 +以上是我的总体设计,但有不周全的地方。你要站在更高的层次理解我的需求,你有足够的自由可以适当调整设计,基于你所了解的自动化搜索评估框架的最佳实践,做出更优秀的设计和更好的实现。
  111 +
  112 +
  113 +
  114 +
  115 +
  116 +
  117 +
  118 +
  119 +
  120 +1. 请仔细检验这个标注集的质量,如果质量不符合要求,那么你要优化工具,迭代直至标注集的结果质量足够高,可以以此为自动化工具来评估检索效果,对检索效果形成指导性意见。
  121 +2. 在结果标注集的质量足够好,批量评估工具足够好用,并且经过你的试用,能判断出搜索质量好坏的情况下,开始真正的动手检索效果调优:基于这个50条query的结果标注集和批量评估工具,对融合公式进行调参。请你先精心地设计实验,设计几组参数,对几组参数分别修改config.yaml、重启(./restart.sh backend)、跑批量评估、收集结果。
  122 +注意评估的过程中,如果发现工具不好用,发现日志不全,发现可以通过修改工具或者日志来提高效率,都可以先做这些改进,并逐步完善。
  123 +注意你是代码的总负责人,你有任何权限来满足你进行检索效果调优的需要。你如果发现有其他可能带来更大提升的点,也可以进行实验,你甚至可以修改融合、重排漏斗的代码,来进行实验,以追求更好的结果指标。
  124 +但是注意,因为受到性能和耗时的约束,不要调大reranker模型的输入条数、不要打开精排,耗时方面无法承受两轮reranker模型的调用。
... ...
scripts/evaluation/build_annotation_set.py 0 → 100644
... ... @@ -0,0 +1,14 @@
  1 +#!/usr/bin/env python3
  2 +
  3 +from pathlib import Path
  4 +import sys
  5 +
  6 +PROJECT_ROOT = Path(__file__).resolve().parents[2]
  7 +if str(PROJECT_ROOT) not in sys.path:
  8 + sys.path.insert(0, str(PROJECT_ROOT))
  9 +
  10 +from scripts.evaluation.eval_framework import main
  11 +
  12 +
  13 +if __name__ == "__main__":
  14 + main()
... ...
scripts/evaluation/eval_framework.py 0 → 100644
... ... @@ -0,0 +1,1786 @@
  1 +#!/usr/bin/env python3
  2 +"""
  3 +Search evaluation framework for pooled relevance annotation, live metrics, and reports.
  4 +"""
  5 +
  6 +from __future__ import annotations
  7 +
  8 +import argparse
  9 +import hashlib
  10 +import json
  11 +import math
  12 +import os
  13 +import re
  14 +import sqlite3
  15 +import sys
  16 +import time
  17 +from dataclasses import dataclass
  18 +from datetime import datetime, timezone
  19 +from pathlib import Path
  20 +from typing import Any, Dict, Iterable, List, Optional, Sequence, Tuple
  21 +
  22 +import requests
  23 +from elasticsearch.helpers import scan
  24 +from fastapi import FastAPI, HTTPException
  25 +from fastapi.responses import HTMLResponse
  26 +from pydantic import BaseModel, Field
  27 +
  28 +PROJECT_ROOT = Path(__file__).resolve().parents[2]
  29 +if str(PROJECT_ROOT) not in sys.path:
  30 + sys.path.insert(0, str(PROJECT_ROOT))
  31 +
  32 +from api.app import get_app_config, get_es_client, get_query_parser, init_service
  33 +from indexer.mapping_generator import get_tenant_index_name
  34 +
  35 +
# Three-level relevance scale used by the LLM judge and all metrics.
RELEVANCE_EXACT = "Exact"
RELEVANCE_PARTIAL = "Partial"
RELEVANCE_IRRELEVANT = "Irrelevant"
# Closed set of labels accepted from the judge and stored in SQLite.
VALID_LABELS = {RELEVANCE_EXACT, RELEVANCE_PARTIAL, RELEVANCE_IRRELEVANT}
# Default locations for evaluation artifacts and the query list.
DEFAULT_ARTIFACT_ROOT = PROJECT_ROOT / "artifacts" / "search_evaluation"
DEFAULT_QUERY_FILE = PROJECT_ROOT / "scripts" / "evaluation" / "queries" / "queries.txt"
# Bump this when judge prompts change so cached query profiles are refreshed.
JUDGE_PROMPT_VERSION = "v2_structured_20260331"
  43 +
  44 +
def utc_now_iso() -> str:
    """Return the current UTC time as an ISO-8601 string (offset +00:00)."""
    now = datetime.now(timezone.utc)
    return now.isoformat()
  47 +
  48 +
def utc_timestamp() -> str:
    """Return a compact filename-safe UTC timestamp, e.g. 20260331T120000Z."""
    return format(datetime.now(timezone.utc), "%Y%m%dT%H%M%SZ")
  51 +
  52 +
def ensure_dir(path: Path) -> Path:
    """Create `path` (with any missing parents) if needed and return it."""
    if not path.is_dir():
        path.mkdir(parents=True, exist_ok=True)
    return path
  56 +
  57 +
def sha1_text(text: str) -> str:
    """Hex SHA-1 digest of `text` encoded as UTF-8."""
    digest = hashlib.sha1(text.encode("utf-8"))
    return digest.hexdigest()
  60 +
  61 +
def pick_text(value: Any, preferred_lang: str = "en") -> str:
    """Extract a display string from a possibly multilingual value.

    For dicts, tries `preferred_lang`, then "en", then "zh", then the first
    truthy value; otherwise stringifies the input. Always returns a stripped
    string ("" for None).
    """
    if value is None:
        return ""
    if not isinstance(value, dict):
        return str(value).strip()
    chosen = value.get(preferred_lang) or value.get("en") or value.get("zh")
    if not chosen:
        chosen = next((candidate for candidate in value.values() if candidate), "")
    return str(chosen).strip()
  73 +
  74 +
def safe_json_dumps(data: Any) -> str:
    """Serialize `data` as compact JSON without escaping non-ASCII text."""
    compact_separators = (",", ":")
    return json.dumps(data, ensure_ascii=False, separators=compact_separators)
  77 +
  78 +
def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
    """Return (option1, option2, option3) values from the first SKU.

    Missing SKUs or missing values yield empty strings; values are stripped.
    """
    if not skus:
        return "", "", ""
    first_sku = skus[0] or {}
    values = []
    for key in ("option1_value", "option2_value", "option3_value"):
        values.append(str(first_sku.get(key) or "").strip())
    return values[0], values[1], values[2]
  88 +
  89 +
def build_display_title(doc: Dict[str, Any]) -> str:
    """Human-readable title: "en / zh" when both exist and differ, else whichever exists."""
    raw_title = doc.get("title")
    english = pick_text(raw_title, "en")
    chinese = pick_text(raw_title, "zh")
    if english and chinese and english != chinese:
        return f"{english} / {chinese}"
    return english or chinese
  97 +
  98 +
def build_rerank_doc(doc: Dict[str, Any]) -> str:
    """Text sent to the reranker for one product: display title capped at 400 chars."""
    return build_display_title(doc)[:400]
  102 +
  103 +
def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """Render one numbered, pipe-separated product line for the judge prompt.

    Field order: title, option1-3, vendor, category, first 4 tags; empty
    fields are omitted.
    """
    option1, option2, option3 = compact_option_values(doc.get("skus") or [])
    fields = [
        build_display_title(doc),
        f"option1={option1}" if option1 else "",
        f"option2={option2}" if option2 else "",
        f"option3={option3}" if option3 else "",
    ]
    vendor = pick_text(doc.get("vendor"), "en")
    if vendor:
        fields.append(f"vendor={vendor}")
    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    if category:
        fields.append(f"category={category}")
    tags = doc.get("tags") or []
    tags_text = ", ".join(str(tag) for tag in tags[:4] if tag)
    if tags_text:
        fields.append(f"tags={tags_text}")
    return f"{idx}. " + " | ".join(field for field in fields if field)
  125 +
  126 +
def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Small JSON-friendly product summary stored in annotation artifacts."""
    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    payload = {
        "spu_id": str(doc.get("spu_id") or ""),
        "title": build_display_title(doc),
        "image_url": doc.get("image_url"),
        "vendor": pick_text(doc.get("vendor"), "en"),
        "category": category,
        "option_values": list(compact_option_values(doc.get("skus") or [])),
        "tags": list((doc.get("tags") or [])[:6]),
    }
    return payload
  137 +
  138 +
def normalize_text(text: Any) -> str:
    """Lowercase, trim, and collapse internal whitespace to single spaces."""
    collapsed = re.sub(r"\s+", " ", str(text or "").strip())
    return collapsed.lower()
  143 +
  144 +
  145 +def _extract_json_blob(text: str) -> Any:
  146 + cleaned = str(text or "").strip()
  147 + candidates: List[str] = [cleaned]
  148 + fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I)
  149 + candidates.extend(match.strip() for match in fence_matches if match.strip())
  150 +
  151 + for candidate in candidates:
  152 + try:
  153 + return json.loads(candidate)
  154 + except Exception:
  155 + pass
  156 +
  157 + starts = [idx for idx, ch in enumerate(cleaned) if ch in "[{"]
  158 + ends = [idx for idx, ch in enumerate(cleaned) if ch in "]}"]
  159 + for start in starts:
  160 + for end in reversed(ends):
  161 + if end <= start:
  162 + continue
  163 + fragment = cleaned[start : end + 1]
  164 + try:
  165 + return json.loads(fragment)
  166 + except Exception:
  167 + continue
  168 + raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")
  169 +
  170 +
@dataclass
class QueryBuildResult:
    """Summary of building the pooled annotation set for one query."""
    query: str
    tenant_id: str
    # NOTE(review): presumably the engine-reported total hit count — confirm at call site.
    search_total: int
    # Number of ranked results pulled for pooling (set by the caller).
    search_depth: int
    rerank_corpus_size: int
    annotated_count: int
    output_json_path: Path
  180 +
  181 +
class EvalStore:
    """SQLite-backed persistence for the evaluation pipeline.

    Stores the cached corpus snapshot, reranker scores, LLM relevance labels,
    per-query intent profiles, and the history of build/batch runs, so that
    repeated evaluations can reuse expensive annotation work. All upserts use
    `ON CONFLICT ... DO UPDATE` (last write wins per primary key).
    """

    def __init__(self, db_path: Path):
        """Open (creating if needed) the DB at `db_path` and ensure the schema."""
        self.db_path = db_path
        ensure_dir(db_path.parent)
        # check_same_thread=False shares one connection across threads.
        # NOTE(review): there is no explicit locking here — confirm writers
        # are serialized by the callers.
        self.conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self.conn.row_factory = sqlite3.Row
        self._init_schema()

    def _init_schema(self) -> None:
        """Create all tables if missing (idempotent via IF NOT EXISTS)."""
        self.conn.executescript(
            """
            CREATE TABLE IF NOT EXISTS corpus_docs (
                tenant_id TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                title_json TEXT,
                vendor_json TEXT,
                category_path_json TEXT,
                category_name_json TEXT,
                image_url TEXT,
                skus_json TEXT,
                tags_json TEXT,
                raw_json TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, spu_id)
            );

            CREATE TABLE IF NOT EXISTS rerank_scores (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                score REAL NOT NULL,
                model_name TEXT,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, spu_id)
            );

            CREATE TABLE IF NOT EXISTS relevance_labels (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                spu_id TEXT NOT NULL,
                label TEXT NOT NULL,
                judge_model TEXT,
                raw_response TEXT,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, spu_id)
            );

            CREATE TABLE IF NOT EXISTS build_runs (
                run_id TEXT PRIMARY KEY,
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                output_json_path TEXT NOT NULL,
                metadata_json TEXT NOT NULL,
                created_at TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS batch_runs (
                batch_id TEXT PRIMARY KEY,
                tenant_id TEXT NOT NULL,
                output_json_path TEXT NOT NULL,
                report_markdown_path TEXT NOT NULL,
                config_snapshot_path TEXT NOT NULL,
                metadata_json TEXT NOT NULL,
                created_at TEXT NOT NULL
            );

            CREATE TABLE IF NOT EXISTS query_profiles (
                tenant_id TEXT NOT NULL,
                query_text TEXT NOT NULL,
                prompt_version TEXT NOT NULL,
                judge_model TEXT,
                profile_json TEXT NOT NULL,
                raw_response TEXT NOT NULL,
                updated_at TEXT NOT NULL,
                PRIMARY KEY (tenant_id, query_text, prompt_version)
            );
            """
        )
        self.conn.commit()

    def upsert_corpus_docs(self, tenant_id: str, docs: Sequence[Dict[str, Any]]) -> None:
        """Insert or refresh corpus docs for `tenant_id` (keyed by spu_id)."""
        now = utc_now_iso()
        rows = []
        for doc in docs:
            rows.append(
                (
                    tenant_id,
                    str(doc.get("spu_id") or ""),
                    safe_json_dumps(doc.get("title")),
                    safe_json_dumps(doc.get("vendor")),
                    safe_json_dumps(doc.get("category_path")),
                    safe_json_dumps(doc.get("category_name")),
                    str(doc.get("image_url") or ""),
                    safe_json_dumps(doc.get("skus") or []),
                    safe_json_dumps(doc.get("tags") or []),
                    safe_json_dumps(doc),
                    now,
                )
            )
        self.conn.executemany(
            """
            INSERT INTO corpus_docs (
                tenant_id, spu_id, title_json, vendor_json, category_path_json, category_name_json,
                image_url, skus_json, tags_json, raw_json, updated_at
            ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, spu_id) DO UPDATE SET
                title_json=excluded.title_json,
                vendor_json=excluded.vendor_json,
                category_path_json=excluded.category_path_json,
                category_name_json=excluded.category_name_json,
                image_url=excluded.image_url,
                skus_json=excluded.skus_json,
                tags_json=excluded.tags_json,
                raw_json=excluded.raw_json,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_corpus_docs(self, tenant_id: str) -> List[Dict[str, Any]]:
        """Return all cached corpus docs for a tenant, ordered by spu_id."""
        rows = self.conn.execute(
            "SELECT raw_json FROM corpus_docs WHERE tenant_id=? ORDER BY spu_id",
            (tenant_id,),
        ).fetchall()
        return [json.loads(row["raw_json"]) for row in rows]

    def get_corpus_docs_by_spu_ids(self, tenant_id: str, spu_ids: Sequence[str]) -> Dict[str, Dict[str, Any]]:
        """Fetch docs for the given spu_ids; blank ids are skipped.

        Returns a mapping spu_id -> raw doc; ids with no row are absent.
        """
        keys = [str(spu_id) for spu_id in spu_ids if str(spu_id).strip()]
        if not keys:
            return {}
        # Placeholders are generated per key; values are still bound safely.
        placeholders = ",".join("?" for _ in keys)
        rows = self.conn.execute(
            f"""
            SELECT spu_id, raw_json
            FROM corpus_docs
            WHERE tenant_id=? AND spu_id IN ({placeholders})
            """,
            [tenant_id, *keys],
        ).fetchall()
        return {
            str(row["spu_id"]): json.loads(row["raw_json"])
            for row in rows
        }

    def has_corpus(self, tenant_id: str) -> bool:
        """True when at least one corpus doc is cached for the tenant."""
        row = self.conn.execute(
            "SELECT COUNT(1) AS n FROM corpus_docs WHERE tenant_id=?",
            (tenant_id,),
        ).fetchone()
        return bool(row and row["n"] > 0)

    def get_rerank_scores(self, tenant_id: str, query_text: str) -> Dict[str, float]:
        """Return cached reranker scores for one query as spu_id -> score."""
        rows = self.conn.execute(
            """
            SELECT spu_id, score
            FROM rerank_scores
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchall()
        return {str(row["spu_id"]): float(row["score"]) for row in rows}

    def upsert_rerank_scores(
        self,
        tenant_id: str,
        query_text: str,
        scores: Dict[str, float],
        model_name: str,
    ) -> None:
        """Insert or refresh reranker scores for one (tenant, query)."""
        now = utc_now_iso()
        rows = [
            (tenant_id, query_text, spu_id, float(score), model_name, now)
            for spu_id, score in scores.items()
        ]
        self.conn.executemany(
            """
            INSERT INTO rerank_scores (tenant_id, query_text, spu_id, score, model_name, updated_at)
            VALUES (?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET
                score=excluded.score,
                model_name=excluded.model_name,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_labels(self, tenant_id: str, query_text: str) -> Dict[str, str]:
        """Return stored relevance labels for one query as spu_id -> label."""
        rows = self.conn.execute(
            """
            SELECT spu_id, label
            FROM relevance_labels
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchall()
        return {str(row["spu_id"]): str(row["label"]) for row in rows}

    def upsert_labels(
        self,
        tenant_id: str,
        query_text: str,
        labels: Dict[str, str],
        judge_model: str,
        raw_response: str,
    ) -> None:
        """Insert or refresh labels for one query.

        Raises ValueError when any label is outside VALID_LABELS; nothing is
        written in that case because validation happens before executemany.
        """
        now = utc_now_iso()
        rows = []
        for spu_id, label in labels.items():
            if label not in VALID_LABELS:
                raise ValueError(f"invalid label: {label}")
            rows.append((tenant_id, query_text, spu_id, label, judge_model, raw_response, now))
        self.conn.executemany(
            """
            INSERT INTO relevance_labels (tenant_id, query_text, spu_id, label, judge_model, raw_response, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(tenant_id, query_text, spu_id) DO UPDATE SET
                label=excluded.label,
                judge_model=excluded.judge_model,
                raw_response=excluded.raw_response,
                updated_at=excluded.updated_at
            """,
            rows,
        )
        self.conn.commit()

    def get_query_profile(self, tenant_id: str, query_text: str, prompt_version: str) -> Optional[Dict[str, Any]]:
        """Return the cached intent profile for (query, prompt version), or None."""
        row = self.conn.execute(
            """
            SELECT profile_json
            FROM query_profiles
            WHERE tenant_id=? AND query_text=? AND prompt_version=?
            """,
            (tenant_id, query_text, prompt_version),
        ).fetchone()
        if not row:
            return None
        return json.loads(row["profile_json"])

    def upsert_query_profile(
        self,
        tenant_id: str,
        query_text: str,
        prompt_version: str,
        judge_model: str,
        profile: Dict[str, Any],
        raw_response: str,
    ) -> None:
        """Store (replacing any previous) the intent profile for one query."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO query_profiles
            (tenant_id, query_text, prompt_version, judge_model, profile_json, raw_response, updated_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                tenant_id,
                query_text,
                prompt_version,
                judge_model,
                safe_json_dumps(profile),
                raw_response,
                utc_now_iso(),
            ),
        )
        self.conn.commit()

    def insert_build_run(self, run_id: str, tenant_id: str, query_text: str, output_json_path: Path, metadata: Dict[str, Any]) -> None:
        """Record one per-query annotation build run (replaces on same run_id)."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO build_runs (run_id, tenant_id, query_text, output_json_path, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (run_id, tenant_id, query_text, str(output_json_path), safe_json_dumps(metadata), utc_now_iso()),
        )
        self.conn.commit()

    def insert_batch_run(
        self,
        batch_id: str,
        tenant_id: str,
        output_json_path: Path,
        report_markdown_path: Path,
        config_snapshot_path: Path,
        metadata: Dict[str, Any],
    ) -> None:
        """Record one batch evaluation run with its artifact paths."""
        self.conn.execute(
            """
            INSERT OR REPLACE INTO batch_runs
            (batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            (
                batch_id,
                tenant_id,
                str(output_json_path),
                str(report_markdown_path),
                str(config_snapshot_path),
                safe_json_dumps(metadata),
                utc_now_iso(),
            ),
        )
        self.conn.commit()

    def list_batch_runs(self, limit: int = 20) -> List[Dict[str, Any]]:
        """Return the most recent batch runs (newest first), metadata decoded."""
        rows = self.conn.execute(
            """
            SELECT batch_id, tenant_id, output_json_path, report_markdown_path, config_snapshot_path, metadata_json, created_at
            FROM batch_runs
            ORDER BY created_at DESC
            LIMIT ?
            """,
            (limit,),
        ).fetchall()
        items: List[Dict[str, Any]] = []
        for row in rows:
            items.append(
                {
                    "batch_id": row["batch_id"],
                    "tenant_id": row["tenant_id"],
                    "output_json_path": row["output_json_path"],
                    "report_markdown_path": row["report_markdown_path"],
                    "config_snapshot_path": row["config_snapshot_path"],
                    "metadata": json.loads(row["metadata_json"]),
                    "created_at": row["created_at"],
                }
            )
        return items

    def list_query_label_stats(self, tenant_id: str) -> List[Dict[str, Any]]:
        """Per-query label counts (Exact/Partial/Irrelevant) for a tenant."""
        rows = self.conn.execute(
            """
            SELECT
                query_text,
                COUNT(*) AS total,
                SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
                SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
                SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
                MAX(updated_at) AS updated_at
            FROM relevance_labels
            WHERE tenant_id=?
            GROUP BY query_text
            ORDER BY query_text
            """,
            (tenant_id,),
        ).fetchall()
        return [
            {
                "query": str(row["query_text"]),
                "total": int(row["total"]),
                "exact_count": int(row["exact_count"] or 0),
                "partial_count": int(row["partial_count"] or 0),
                "irrelevant_count": int(row["irrelevant_count"] or 0),
                "updated_at": row["updated_at"],
            }
            for row in rows
        ]

    def get_query_label_stats(self, tenant_id: str, query_text: str) -> Dict[str, Any]:
        """Label counts for a single query; zeros when nothing is labeled yet."""
        row = self.conn.execute(
            """
            SELECT
                COUNT(*) AS total,
                SUM(CASE WHEN label='Exact' THEN 1 ELSE 0 END) AS exact_count,
                SUM(CASE WHEN label='Partial' THEN 1 ELSE 0 END) AS partial_count,
                SUM(CASE WHEN label='Irrelevant' THEN 1 ELSE 0 END) AS irrelevant_count,
                MAX(updated_at) AS updated_at
            FROM relevance_labels
            WHERE tenant_id=? AND query_text=?
            """,
            (tenant_id, query_text),
        ).fetchone()
        return {
            "query": query_text,
            "total": int((row["total"] or 0) if row else 0),
            "exact_count": int((row["exact_count"] or 0) if row else 0),
            "partial_count": int((row["partial_count"] or 0) if row else 0),
            "irrelevant_count": int((row["irrelevant_count"] or 0) if row else 0),
            "updated_at": row["updated_at"] if row else None,
        }
  562 +
  563 +
class SearchServiceClient:
    """Thin HTTP client for the search API, pinned to one tenant."""

    def __init__(self, base_url: str, tenant_id: str):
        self.base_url = base_url.rstrip("/")
        self.tenant_id = str(tenant_id)
        self.session = requests.Session()

    def search(self, query: str, size: int, from_: int = 0, language: str = "en") -> Dict[str, Any]:
        """POST one search request and return the decoded JSON body.

        Raises for non-2xx responses via raise_for_status().
        """
        request_headers = {"Content-Type": "application/json", "X-Tenant-ID": self.tenant_id}
        request_body = {"query": query, "size": size, "from": from_, "language": language}
        response = self.session.post(
            f"{self.base_url}/search/",
            headers=request_headers,
            json=request_body,
            timeout=120,
        )
        response.raise_for_status()
        return response.json()
  579 +
  580 +
class RerankServiceClient:
    """HTTP client for the standalone rerank service."""

    def __init__(self, service_url: str):
        self.service_url = service_url.rstrip("/")
        self.session = requests.Session()

    def rerank(self, query: str, docs: Sequence[str], normalize: bool = False, top_n: Optional[int] = None) -> Tuple[List[float], Dict[str, Any]]:
        """Score `docs` against `query`; returns (scores, meta) from the service.

        `top_n` is forwarded only when provided. Raises for non-2xx responses.
        """
        request_body: Dict[str, Any] = {
            "query": query,
            "docs": list(docs),
            "normalize": normalize,
        }
        if top_n is not None:
            request_body["top_n"] = int(top_n)
        response = self.session.post(self.service_url, json=request_body, timeout=180)
        response.raise_for_status()
        data = response.json()
        scores = list(data.get("scores") or [])
        meta = dict(data.get("meta") or {})
        return scores, meta
  598 +
  599 +
class DashScopeLabelClient:
    """LLM judge client (OpenAI-compatible chat API) for query profiling and labeling."""

    def __init__(self, model: str, base_url: str, api_key: str, batch_size: int = 40):
        self.model = model
        self.base_url = base_url.rstrip("/")
        self.api_key = api_key
        # NOTE(review): batch_size is stored but not read inside this class —
        # presumably the caller chunks docs with it; confirm.
        self.batch_size = int(batch_size)
        self.session = requests.Session()

    def _chat(self, prompt: str) -> Tuple[str, str]:
        """Send one single-message chat completion; return (content, raw response JSON).

        Uses temperature 0 / top_p 0.1 to keep judge output stable across runs.
        """
        response = self.session.post(
            f"{self.base_url}/chat/completions",
            headers={
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            json={
                "model": self.model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": 0,
                "top_p": 0.1,
            },
            timeout=180,
        )
        response.raise_for_status()
        data = response.json()
        # Defensive extraction: tolerate missing choices/message/content.
        content = str(((data.get("choices") or [{}])[0].get("message") or {}).get("content") or "").strip()
        return content, safe_json_dumps(data)

    def extract_query_profile(
        self,
        query: str,
        parser_hints: Dict[str, Any],
    ) -> Tuple[Dict[str, Any], str]:
        """Ask the judge for a structured intent profile of `query`.

        Returns (profile dict with all schema keys defaulted, raw response).
        Raises ValueError when the model's output is not a JSON object.
        """
        prompt = (
            "You are building a structured intent profile for e-commerce relevance judging.\n"
            "Use the original user query as the source of truth. Parser hints may help, but if a hint conflicts with the original query, trust the original query.\n"
            "Be conservative: only mark an attribute as required if the user explicitly asked for it.\n\n"
            "Return JSON with this schema:\n"
            "{\n"
            '  "normalized_query_en": string,\n'
            '  "primary_category": string,\n'
            '  "allowed_categories": [string],\n'
            '  "required_attributes": [\n'
            '    {"name": string, "required_terms": [string], "conflicting_terms": [string], "match_mode": "explicit"}\n'
            "  ],\n"
            '  "notes": [string]\n'
            "}\n\n"
            "Guidelines:\n"
            "- Exact later will require explicit evidence for all required attributes.\n"
            "- allowed_categories should contain only near-synonyms of the same product type, not substitutes. For example dress can allow midi dress/cocktail dress, but not skirt, top, jumpsuit, or outfit unless the query explicitly asks for them.\n"
            "- If the query asks for dress/skirt/jeans/t-shirt, near but different product types are not Exact.\n"
            "- If the query includes color, fit, silhouette, or length, include them as required_attributes.\n"
            "- For fit words, include conflicting terms when obvious, e.g. fitted conflicts with oversized/loose; oversized conflicts with fitted/tight.\n"
            "- For color, include conflicting colors only when clear from the query.\n\n"
            f"Original query: {query}\n"
            f"Parser hints JSON: {json.dumps(parser_hints, ensure_ascii=False)}\n"
        )
        content, raw_response = self._chat(prompt)
        payload = _extract_json_blob(content)
        if not isinstance(payload, dict):
            raise ValueError(f"unexpected query profile payload: {content!r}")
        # Guarantee every schema key exists even if the model omitted it.
        payload.setdefault("normalized_query_en", query)
        payload.setdefault("primary_category", "")
        payload.setdefault("allowed_categories", [])
        payload.setdefault("required_attributes", [])
        payload.setdefault("notes", [])
        return payload, raw_response

    def classify_batch(
        self,
        query: str,
        query_profile: Dict[str, Any],
        docs: Sequence[Dict[str, Any]],
    ) -> Tuple[List[str], str]:
        """Label each doc Exact/Partial/Irrelevant against the query profile.

        Returns (labels aligned with `docs`, raw response). Raises ValueError
        unless the model returns exactly one valid label per doc.
        Labels are taken in response order; the model's "index" field is
        not used for re-alignment.
        """
        numbered_docs = [build_label_doc_line(idx + 1, doc) for idx, doc in enumerate(docs)]
        prompt = (
            "You are an e-commerce search relevance judge.\n"
            "Judge each product against the structured query profile below.\n\n"
            "Relevance rules:\n"
            "- Exact: product type matches the target intent, and every explicit required attribute is positively supported by the title/options/tags/category. If an attribute is missing or only guessed, it is NOT Exact.\n"
            "- Partial: main product type/use case matches, but some required attribute is missing, weaker, uncertain, or only approximately matched.\n"
            "- Irrelevant: product type/use case mismatched, or an explicit required attribute clearly conflicts.\n"
            "- Be conservative with Exact.\n"
            "- Graphic/holiday/message tees are not Exact for a plain color/style tee query unless that graphic/theme was requested.\n"
            "- Jumpsuit/romper/set is not Exact for dress/skirt/jeans queries.\n\n"
            f"Original query: {query}\n"
            f"Structured query profile JSON: {json.dumps(query_profile, ensure_ascii=False)}\n\n"
            "Products:\n"
            + "\n".join(numbered_docs)
            + "\n\nReturn JSON only, with schema:\n"
            '{"labels":[{"index":1,"label":"Exact","reason":"short phrase"}]}\n'
        )
        content, raw_response = self._chat(prompt)
        payload = _extract_json_blob(content)
        if not isinstance(payload, dict) or not isinstance(payload.get("labels"), list):
            raise ValueError(f"unexpected label payload: {content!r}")
        labels_payload = payload["labels"]
        labels: List[str] = []
        for item in labels_payload[: len(docs)]:
            if not isinstance(item, dict):
                continue
            label = str(item.get("label") or "").strip()
            if label in VALID_LABELS:
                labels.append(label)
        if len(labels) != len(docs) or any(label not in VALID_LABELS for label in labels):
            raise ValueError(f"unexpected label output: {content!r}")
        return labels, raw_response
  707 +
  708 +
def precision_at_k(labels: Sequence[str], k: int, relevant: Sequence[str]) -> float:
    """Fraction of the top-k labels that fall in `relevant`.

    Divides by the number of labels actually present in the top-k window,
    so short result lists are not penalized for missing positions.
    Returns 0.0 for k <= 0 or an empty window.
    """
    if k <= 0:
        return 0.0
    window = list(labels[:k])
    if not window:
        return 0.0
    relevant_set = set(relevant)
    hits = sum(1 for label in window if label in relevant_set)
    return hits / float(len(window))
  717 +
  718 +
def average_precision(labels: Sequence[str], relevant: Sequence[str]) -> float:
    """Mean of precision values at each relevant rank; 0.0 when no hits."""
    relevant_set = set(relevant)
    hits = 0
    per_hit_precisions: List[float] = []
    for rank, label in enumerate(labels, start=1):
        if label not in relevant_set:
            continue
        hits += 1
        per_hit_precisions.append(hits / rank)
    if not per_hit_precisions:
        return 0.0
    return sum(per_hit_precisions) / len(per_hit_precisions)
  730 +
  731 +
def compute_query_metrics(labels: Sequence[str]) -> Dict[str, float]:
    """Per-query precision/MAP metrics.

    Plain metrics count only Exact as relevant; the `_2_3` variants also
    count Partial. Values are rounded to 6 decimal places.
    """
    exact_only = [RELEVANCE_EXACT]
    exact_or_partial = [RELEVANCE_EXACT, RELEVANCE_PARTIAL]
    metrics: Dict[str, float] = {}
    for k in (5, 10, 20, 50):
        metrics[f"P@{k}"] = round(precision_at_k(labels, k, exact_only), 6)
        metrics[f"P@{k}_2_3"] = round(precision_at_k(labels, k, exact_or_partial), 6)
    metrics["MAP_3"] = round(average_precision(labels, exact_only), 6)
    metrics["MAP_2_3"] = round(average_precision(labels, exact_or_partial), 6)
    return metrics
  740 +
  741 +
def aggregate_metrics(metric_items: Sequence[Dict[str, float]]) -> Dict[str, float]:
    """Macro-average each metric across queries.

    Keys are taken (sorted) from the first item; items missing a key
    contribute 0.0. Returns {} for an empty input.
    """
    if not metric_items:
        return {}
    count = len(metric_items)
    averaged: Dict[str, float] = {}
    for key in sorted(metric_items[0].keys()):
        total = sum(float(item.get(key, 0.0)) for item in metric_items)
        averaged[key] = round(total / count, 6)
    return averaged
  750 +
  751 +
def label_distribution(labels: Sequence[str]) -> Dict[str, int]:
    """Count how many labels are Exact / Partial / Irrelevant (others ignored)."""
    counts = {RELEVANCE_EXACT: 0, RELEVANCE_PARTIAL: 0, RELEVANCE_IRRELEVANT: 0}
    for label in labels:
        if label in counts:
            counts[label] += 1
    return counts
  758 +
  759 +
  760 +class SearchEvaluationFramework:
    def __init__(
        self,
        tenant_id: str,
        artifact_root: Path = DEFAULT_ARTIFACT_ROOT,
        search_base_url: str = "http://localhost:6002",
    ):
        """Wire up the store and search/rerank/LLM clients for one tenant.

        Raises RuntimeError when no DashScope API key is configured, since
        annotation requires the LLM judge.
        """
        # Initialize the shared service layer against the configured ES host.
        init_service(get_app_config().infrastructure.elasticsearch.host)
        self.tenant_id = str(tenant_id)
        self.artifact_root = ensure_dir(artifact_root)
        self.store = EvalStore(self.artifact_root / "search_eval.sqlite3")
        self.search_client = SearchServiceClient(search_base_url, self.tenant_id)
        app_cfg = get_app_config()
        # Rerank endpoint comes from the default "http" provider instance.
        rerank_service_url = str(
            app_cfg.services.rerank.providers["http"]["instances"]["default"]["service_url"]
        )
        self.rerank_client = RerankServiceClient(rerank_service_url)
        # The judge reuses the translation service's LLM model/base_url config.
        llm_cfg = app_cfg.services.translation.capabilities["llm"]
        api_key = app_cfg.infrastructure.secrets.dashscope_api_key
        if not api_key:
            raise RuntimeError("dashscope_api_key is required for search evaluation annotation")
        self.label_client = DashScopeLabelClient(
            model=str(llm_cfg["model"]),
            base_url=str(llm_cfg["base_url"]),
            api_key=str(api_key),
        )
        self.query_parser = get_query_parser()
  787 +
  788 + def build_query_parser_hints(self, query: str) -> Dict[str, Any]:
  789 + parsed = self.query_parser.parse(query, generate_vector=False, target_languages=["en", "zh"])
  790 + payload = parsed.to_dict()
  791 + payload["text_for_rerank"] = parsed.text_for_rerank()
  792 + return payload
  793 +
  794 + def get_query_profile(self, query: str, force_refresh: bool = False) -> Dict[str, Any]:
  795 + if not force_refresh:
  796 + cached = self.store.get_query_profile(self.tenant_id, query, JUDGE_PROMPT_VERSION)
  797 + if cached is not None:
  798 + return cached
  799 + parser_hints = self.build_query_parser_hints(query)
  800 + profile, raw_response = self.label_client.extract_query_profile(query, parser_hints)
  801 + profile["parser_hints"] = parser_hints
  802 + self.store.upsert_query_profile(
  803 + self.tenant_id,
  804 + query,
  805 + JUDGE_PROMPT_VERSION,
  806 + self.label_client.model,
  807 + profile,
  808 + raw_response,
  809 + )
  810 + return profile
  811 +
  812 + @staticmethod
  813 + def _doc_evidence_text(doc: Dict[str, Any]) -> str:
  814 + pieces: List[str] = [
  815 + build_display_title(doc),
  816 + pick_text(doc.get("vendor"), "en"),
  817 + pick_text(doc.get("category_path"), "en"),
  818 + pick_text(doc.get("category_name"), "en"),
  819 + ]
  820 + for sku in doc.get("skus") or []:
  821 + pieces.extend(
  822 + [
  823 + str(sku.get("option1_value") or ""),
  824 + str(sku.get("option2_value") or ""),
  825 + str(sku.get("option3_value") or ""),
  826 + ]
  827 + )
  828 + for tag in doc.get("tags") or []:
  829 + pieces.append(str(tag))
  830 + return normalize_text(" | ".join(piece for piece in pieces if piece))
  831 +
    def _apply_rule_based_label_guardrails(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> str:
        """Post-process an LLM label with deterministic evidence checks.

        Demotes Exact -> Partial when required evidence is missing, and to
        Irrelevant on explicit term conflicts or total category mismatch.
        Labels outside VALID_LABELS are returned unchanged. All matching is
        plain substring search over the normalized evidence text.
        """
        if label not in VALID_LABELS:
            return label
        evidence = self._doc_evidence_text(doc)
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [normalize_text(item) for item in query_profile.get("allowed_categories") or [] if str(item).strip()]

        # Empty profile fields are treated as "matches" (no constraint).
        primary_category_match = True
        if category:
            primary_category_match = category in evidence
        allowed_category_match = True
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        # Exact without the primary category: soften to Partial if a
        # near-synonym category matched, otherwise reject outright.
        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                label = RELEVANCE_PARTIAL
            else:
                return RELEVANCE_IRRELEVANT

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            # Only a whitelist of attribute names is enforced by rules.
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style", "waist_style", "rise"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            # Hard-coded fit antonyms augment whatever the profile provided.
            if attr_name == "fit":
                if any(term in {"oversized", "oversize"} for term in required_terms):
                    conflicting_terms.extend(["slim", "slimming", "fitted", "tight", "close-fitting"])
                if any(term in {"fitted", "slim fit", "tight"} for term in required_terms):
                    conflicting_terms.extend(["oversized", "oversize", "loose", "relaxed"])
            # No required terms means the attribute cannot fail the check.
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict:
                return RELEVANCE_IRRELEVANT
            if label == RELEVANCE_EXACT and not has_required:
                label = RELEVANCE_PARTIAL

        # A Partial with no category evidence at all is not relevant.
        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            return RELEVANCE_IRRELEVANT

        return label
  882 +
  883 + @staticmethod
  884 + def _result_item_to_doc(item: Dict[str, Any]) -> Dict[str, Any]:
  885 + option_values = list(item.get("option_values") or [])
  886 + while len(option_values) < 3:
  887 + option_values.append("")
  888 + product = dict(item.get("product") or {})
  889 + return {
  890 + "spu_id": item.get("spu_id"),
  891 + "title": product.get("title") or item.get("title"),
  892 + "vendor": product.get("vendor"),
  893 + "category_path": product.get("category"),
  894 + "category_name": product.get("category"),
  895 + "image_url": item.get("image_url") or product.get("image_url"),
  896 + "tags": product.get("tags") or [],
  897 + "skus": [
  898 + {
  899 + "option1_value": option_values[0],
  900 + "option2_value": option_values[1],
  901 + "option3_value": option_values[2],
  902 + }
  903 + ],
  904 + }
  905 +
    def _collect_label_issues(
        self,
        label: str,
        query_profile: Dict[str, Any],
        doc: Dict[str, Any],
    ) -> List[str]:
        """Return human-readable audit findings for a labeled doc (empty = clean).

        Unlike _apply_rule_based_label_guardrails this never changes the label;
        it only reports category/attribute mismatches for review.
        """
        evidence = self._doc_evidence_text(doc)
        issues: List[str] = []
        category = normalize_text(query_profile.get("primary_category"))
        allowed_categories = [
            normalize_text(item)
            for item in query_profile.get("allowed_categories") or []
            if str(item).strip()
        ]

        primary_category_match = True if not category else category in evidence
        # With no allowed-categories list, the allowed match mirrors the primary
        # match (the False default below is always overwritten when the list is
        # non-empty).
        allowed_category_match = False if allowed_categories else primary_category_match
        if allowed_categories:
            allowed_category_match = any(signal in evidence for signal in allowed_categories)

        if label == RELEVANCE_EXACT and not primary_category_match:
            if allowed_category_match:
                issues.append("Exact missing primary category evidence")
            else:
                issues.append("Exact has category mismatch")

        if label == RELEVANCE_PARTIAL and not primary_category_match and not allowed_category_match:
            issues.append("Partial has category mismatch")

        for attr in query_profile.get("required_attributes") or []:
            if not isinstance(attr, dict):
                continue
            attr_name = normalize_text(attr.get("name"))
            # NOTE(review): whitelist lacks waist_style/rise which the guardrail
            # method enforces — confirm whether this divergence is intended.
            if attr_name not in {"color", "fit", "length", "type", "product_type", "material", "size", "gender", "style"}:
                continue
            required_terms = [normalize_text(item) for item in attr.get("required_terms") or [] if normalize_text(item)]
            conflicting_terms = [normalize_text(item) for item in attr.get("conflicting_terms") or [] if normalize_text(item)]
            # Without required terms, the attribute cannot fault an Exact label.
            has_required = any(term in evidence for term in required_terms) if required_terms else True
            has_conflict = any(term in evidence for term in conflicting_terms)

            if has_conflict and label != RELEVANCE_IRRELEVANT:
                issues.append(f"{label} conflicts on {attr_name}")
            if label == RELEVANCE_EXACT and not has_required:
                issues.append(f"Exact missing {attr_name}")
        return issues
  951 +
  952 + def audit_live_query(
  953 + self,
  954 + query: str,
  955 + *,
  956 + top_k: int = 100,
  957 + language: str = "en",
  958 + auto_annotate: bool = True,
  959 + ) -> Dict[str, Any]:
  960 + live = self.evaluate_live_query(query=query, top_k=top_k, auto_annotate=auto_annotate, language=language)
  961 + query_profile = self.get_query_profile(query, force_refresh=False)
  962 + suspicious: List[Dict[str, Any]] = []
  963 +
  964 + for item in live["results"]:
  965 + doc = self._result_item_to_doc(item)
  966 + issues = self._collect_label_issues(item["label"] or "", query_profile, doc)
  967 + suggested_label = self._apply_rule_based_label_guardrails(item["label"] or "", query_profile, doc)
  968 + if suggested_label != (item["label"] or ""):
  969 + issues = list(issues) + [f"Suggested relabel: {item['label']} -> {suggested_label}"]
  970 + if issues:
  971 + suspicious.append(
  972 + {
  973 + "rank": item["rank"],
  974 + "spu_id": item["spu_id"],
  975 + "title": item["title"],
  976 + "label": item["label"],
  977 + "suggested_label": suggested_label,
  978 + "issues": issues,
  979 + }
  980 + )
  981 +
  982 + labels = [
  983 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  984 + for item in live["results"]
  985 + ]
  986 + return {
  987 + "query": query,
  988 + "tenant_id": self.tenant_id,
  989 + "top_k": top_k,
  990 + "metrics": live["metrics"],
  991 + "distribution": label_distribution(labels),
  992 + "query_profile": query_profile,
  993 + "suspicious": suspicious,
  994 + "results": live["results"],
  995 + }
  996 +
  997 + def queries_from_file(self, path: Path) -> List[str]:
  998 + return [
  999 + line.strip()
  1000 + for line in path.read_text(encoding="utf-8").splitlines()
  1001 + if line.strip() and not line.strip().startswith("#")
  1002 + ]
  1003 +
    def corpus_docs(self, refresh: bool = False) -> List[Dict[str, Any]]:
        """Return the tenant's full product corpus, caching it in the eval store.

        On a cache miss (or when refresh=True) the corpus is scrolled out of the
        tenant's Elasticsearch index with only the fields needed for display,
        reranking, and evidence checks, then persisted to the store.

        Args:
            refresh: When True, bypass the stored corpus and re-scan ES.
        """
        if not refresh and self.store.has_corpus(self.tenant_id):
            return self.store.get_corpus_docs(self.tenant_id)

        es_client = get_es_client().client
        index_name = get_tenant_index_name(self.tenant_id)
        docs: List[Dict[str, Any]] = []
        # helpers.scan streams all hits via the scroll API; ordering is not needed.
        for hit in scan(
            client=es_client,
            index=index_name,
            query={
                "_source": [
                    "spu_id",
                    "title",
                    "vendor",
                    "category_path",
                    "category_name",
                    "image_url",
                    "skus",
                    "tags",
                ],
                "query": {"match_all": {}},
            },
            size=500,
            preserve_order=False,
            clear_scroll=True,
        ):
            source = dict(hit.get("_source") or {})
            # Fall back to the ES _id when spu_id is missing; downstream keys are strings.
            source["spu_id"] = str(source.get("spu_id") or hit.get("_id") or "")
            docs.append(source)
        self.store.upsert_corpus_docs(self.tenant_id, docs)
        return docs
  1036 +
  1037 + def full_corpus_rerank(
  1038 + self,
  1039 + query: str,
  1040 + docs: Sequence[Dict[str, Any]],
  1041 + batch_size: int = 24,
  1042 + force_refresh: bool = False,
  1043 + ) -> List[Dict[str, Any]]:
  1044 + cached = {} if force_refresh else self.store.get_rerank_scores(self.tenant_id, query)
  1045 + pending: List[Dict[str, Any]] = [doc for doc in docs if str(doc.get("spu_id")) not in cached]
  1046 + if pending:
  1047 + new_scores: Dict[str, float] = {}
  1048 + for start in range(0, len(pending), batch_size):
  1049 + batch = pending[start : start + batch_size]
  1050 + scores = self._rerank_batch_with_retry(query=query, docs=batch)
  1051 + if len(scores) != len(batch):
  1052 + raise RuntimeError(f"rerank returned {len(scores)} scores for {len(batch)} docs")
  1053 + for doc, score in zip(batch, scores):
  1054 + new_scores[str(doc.get("spu_id"))] = float(score)
  1055 + self.store.upsert_rerank_scores(
  1056 + self.tenant_id,
  1057 + query,
  1058 + new_scores,
  1059 + model_name="qwen3_vllm_score",
  1060 + )
  1061 + cached.update(new_scores)
  1062 +
  1063 + ranked = []
  1064 + for doc in docs:
  1065 + spu_id = str(doc.get("spu_id"))
  1066 + ranked.append({"spu_id": spu_id, "score": float(cached.get(spu_id, float("-inf"))), "doc": doc})
  1067 + ranked.sort(key=lambda item: item["score"], reverse=True)
  1068 + return ranked
  1069 +
  1070 + def _rerank_batch_with_retry(self, query: str, docs: Sequence[Dict[str, Any]]) -> List[float]:
  1071 + if not docs:
  1072 + return []
  1073 + doc_texts = [build_rerank_doc(doc) for doc in docs]
  1074 + try:
  1075 + scores, _meta = self.rerank_client.rerank(query=query, docs=doc_texts, normalize=False)
  1076 + return scores
  1077 + except Exception:
  1078 + if len(docs) == 1:
  1079 + return [-1.0]
  1080 + if len(docs) <= 6:
  1081 + scores: List[float] = []
  1082 + for doc in docs:
  1083 + scores.extend(self._rerank_batch_with_retry(query, [doc]))
  1084 + return scores
  1085 + mid = len(docs) // 2
  1086 + left = self._rerank_batch_with_retry(query, docs[:mid])
  1087 + right = self._rerank_batch_with_retry(query, docs[mid:])
  1088 + return left + right
  1089 +
    def annotate_missing_labels(
        self,
        query: str,
        docs: Sequence[Dict[str, Any]],
        force_refresh: bool = False,
    ) -> Dict[str, str]:
        """Label any docs that do not yet have a stored relevance label for this query.

        Labels come from the LLM judge (label_client) and are post-processed with
        the rule-based guardrails before being persisted.

        Args:
            query: Search query the labels are judged against.
            docs: Candidate documents; only those without a stored label are sent.
            force_refresh: When True, ignore stored labels (and refresh the query
                profile) and re-annotate everything.

        Returns:
            The full spu_id -> label mapping (pre-existing plus newly stored).
        """
        query_profile = self.get_query_profile(query, force_refresh=force_refresh)
        labels = {} if force_refresh else self.store.get_labels(self.tenant_id, query)
        missing_docs = [doc for doc in docs if str(doc.get("spu_id")) not in labels]
        if not missing_docs:
            return labels

        for start in range(0, len(missing_docs), self.label_client.batch_size):
            batch = missing_docs[start : start + self.label_client.batch_size]
            # _classify_with_retry may bisect the batch, so results arrive as
            # (labels, raw_response, sub_batch) triples.
            batch_pairs = self._classify_with_retry(query, query_profile, batch)
            for sub_labels, raw_response, sub_batch in batch_pairs:
                # Guardrails are applied before persisting so stored labels are final.
                to_store = {
                    str(doc.get("spu_id")): self._apply_rule_based_label_guardrails(label, query_profile, doc)
                    for doc, label in zip(sub_batch, sub_labels)
                }
                self.store.upsert_labels(
                    self.tenant_id,
                    query,
                    to_store,
                    judge_model=self.label_client.model,
                    raw_response=raw_response,
                )
                labels.update(to_store)
            # Light throttle between judge batches.
            time.sleep(0.1)
        return labels
  1120 +
  1121 + def _classify_with_retry(
  1122 + self,
  1123 + query: str,
  1124 + query_profile: Dict[str, Any],
  1125 + docs: Sequence[Dict[str, Any]],
  1126 + ) -> List[Tuple[List[str], str, Sequence[Dict[str, Any]]]]:
  1127 + if not docs:
  1128 + return []
  1129 + try:
  1130 + labels, raw_response = self.label_client.classify_batch(query, query_profile, docs)
  1131 + return [(labels, raw_response, docs)]
  1132 + except Exception:
  1133 + if len(docs) == 1:
  1134 + raise
  1135 + mid = len(docs) // 2
  1136 + return self._classify_with_retry(query, query_profile, docs[:mid]) + self._classify_with_retry(query, query_profile, docs[mid:])
  1137 +
  1138 + def build_query_annotation_set(
  1139 + self,
  1140 + query: str,
  1141 + *,
  1142 + search_depth: int = 1000,
  1143 + rerank_depth: int = 10000,
  1144 + annotate_search_top_k: int = 120,
  1145 + annotate_rerank_top_k: int = 200,
  1146 + language: str = "en",
  1147 + force_refresh_rerank: bool = False,
  1148 + force_refresh_labels: bool = False,
  1149 + ) -> QueryBuildResult:
  1150 + search_payload = self.search_client.search(query=query, size=search_depth, from_=0, language=language)
  1151 + search_results = list(search_payload.get("results") or [])
  1152 + corpus = self.corpus_docs(refresh=False)
  1153 + full_rerank = self.full_corpus_rerank(
  1154 + query=query,
  1155 + docs=corpus,
  1156 + force_refresh=force_refresh_rerank,
  1157 + )
  1158 + rerank_depth_effective = min(rerank_depth, len(full_rerank))
  1159 +
  1160 + pool_docs: Dict[str, Dict[str, Any]] = {}
  1161 + for doc in search_results[:annotate_search_top_k]:
  1162 + pool_docs[str(doc.get("spu_id"))] = doc
  1163 + for item in full_rerank[:annotate_rerank_top_k]:
  1164 + pool_docs[str(item["spu_id"])] = item["doc"]
  1165 +
  1166 + query_profile = self.get_query_profile(query, force_refresh=force_refresh_labels)
  1167 + labels = self.annotate_missing_labels(
  1168 + query=query,
  1169 + docs=list(pool_docs.values()),
  1170 + force_refresh=force_refresh_labels,
  1171 + )
  1172 +
  1173 + search_labeled_results: List[Dict[str, Any]] = []
  1174 + for rank, doc in enumerate(search_results, start=1):
  1175 + spu_id = str(doc.get("spu_id"))
  1176 + label = labels.get(spu_id)
  1177 + search_labeled_results.append(
  1178 + {
  1179 + "rank": rank,
  1180 + "spu_id": spu_id,
  1181 + "title": build_display_title(doc),
  1182 + "image_url": doc.get("image_url"),
  1183 + "rerank_score": None,
  1184 + "label": label,
  1185 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  1186 + "product": compact_product_payload(doc),
  1187 + }
  1188 + )
  1189 +
  1190 + rerank_top_results: List[Dict[str, Any]] = []
  1191 + for rank, item in enumerate(full_rerank[:rerank_depth_effective], start=1):
  1192 + doc = item["doc"]
  1193 + spu_id = str(item["spu_id"])
  1194 + rerank_top_results.append(
  1195 + {
  1196 + "rank": rank,
  1197 + "spu_id": spu_id,
  1198 + "title": build_display_title(doc),
  1199 + "image_url": doc.get("image_url"),
  1200 + "rerank_score": round(float(item["score"]), 8),
  1201 + "label": labels.get(spu_id),
  1202 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  1203 + "product": compact_product_payload(doc),
  1204 + }
  1205 + )
  1206 +
  1207 + top100_labels = [
  1208 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  1209 + for item in search_labeled_results[:100]
  1210 + ]
  1211 + metrics = compute_query_metrics(top100_labels)
  1212 + output_dir = ensure_dir(self.artifact_root / "query_builds")
  1213 + run_id = f"{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + query)[:10]}"
  1214 + output_json_path = output_dir / f"{run_id}.json"
  1215 + payload = {
  1216 + "run_id": run_id,
  1217 + "created_at": utc_now_iso(),
  1218 + "tenant_id": self.tenant_id,
  1219 + "query": query,
  1220 + "config_meta": requests.get("http://localhost:6002/admin/config/meta", timeout=20).json(),
  1221 + "search_total": int(search_payload.get("total") or 0),
  1222 + "search_depth_requested": search_depth,
  1223 + "search_depth_effective": len(search_results),
  1224 + "rerank_depth_requested": rerank_depth,
  1225 + "rerank_depth_effective": rerank_depth_effective,
  1226 + "corpus_size": len(corpus),
  1227 + "annotation_pool": {
  1228 + "annotate_search_top_k": annotate_search_top_k,
  1229 + "annotate_rerank_top_k": annotate_rerank_top_k,
  1230 + "pool_size": len(pool_docs),
  1231 + },
  1232 + "query_profile": query_profile,
  1233 + "metrics_top100": metrics,
  1234 + "search_results": search_labeled_results,
  1235 + "full_rerank_top": rerank_top_results,
  1236 + }
  1237 + output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
  1238 + self.store.insert_build_run(run_id, self.tenant_id, query, output_json_path, payload["metrics_top100"])
  1239 + return QueryBuildResult(
  1240 + query=query,
  1241 + tenant_id=self.tenant_id,
  1242 + search_total=int(search_payload.get("total") or 0),
  1243 + search_depth=len(search_results),
  1244 + rerank_corpus_size=len(corpus),
  1245 + annotated_count=len(pool_docs),
  1246 + output_json_path=output_json_path,
  1247 + )
  1248 +
  1249 + def evaluate_live_query(
  1250 + self,
  1251 + query: str,
  1252 + top_k: int = 100,
  1253 + auto_annotate: bool = True,
  1254 + language: str = "en",
  1255 + force_refresh_labels: bool = False,
  1256 + ) -> Dict[str, Any]:
  1257 + search_payload = self.search_client.search(query=query, size=max(top_k, 100), from_=0, language=language)
  1258 + results = list(search_payload.get("results") or [])
  1259 + if auto_annotate:
  1260 + self.annotate_missing_labels(query=query, docs=results[:top_k], force_refresh=force_refresh_labels)
  1261 + labels = self.store.get_labels(self.tenant_id, query)
  1262 + labeled = []
  1263 + for rank, doc in enumerate(results[:top_k], start=1):
  1264 + spu_id = str(doc.get("spu_id"))
  1265 + labeled.append(
  1266 + {
  1267 + "rank": rank,
  1268 + "spu_id": spu_id,
  1269 + "title": build_display_title(doc),
  1270 + "image_url": doc.get("image_url"),
  1271 + "label": labels.get(spu_id),
  1272 + "option_values": list(compact_option_values(doc.get("skus") or [])),
  1273 + "product": compact_product_payload(doc),
  1274 + }
  1275 + )
  1276 + metric_labels = [
  1277 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  1278 + for item in labeled
  1279 + ]
  1280 + return {
  1281 + "query": query,
  1282 + "tenant_id": self.tenant_id,
  1283 + "top_k": top_k,
  1284 + "metrics": compute_query_metrics(metric_labels),
  1285 + "results": labeled,
  1286 + "total": int(search_payload.get("total") or 0),
  1287 + }
  1288 +
  1289 + def batch_evaluate(
  1290 + self,
  1291 + queries: Sequence[str],
  1292 + *,
  1293 + top_k: int = 100,
  1294 + auto_annotate: bool = True,
  1295 + language: str = "en",
  1296 + force_refresh_labels: bool = False,
  1297 + ) -> Dict[str, Any]:
  1298 + per_query = []
  1299 + for query in queries:
  1300 + live = self.evaluate_live_query(
  1301 + query,
  1302 + top_k=top_k,
  1303 + auto_annotate=auto_annotate,
  1304 + language=language,
  1305 + force_refresh_labels=force_refresh_labels,
  1306 + )
  1307 + labels = [
  1308 + item["label"] if item["label"] in VALID_LABELS else RELEVANCE_IRRELEVANT
  1309 + for item in live["results"]
  1310 + ]
  1311 + per_query.append(
  1312 + {
  1313 + "query": live["query"],
  1314 + "tenant_id": live["tenant_id"],
  1315 + "top_k": live["top_k"],
  1316 + "metrics": live["metrics"],
  1317 + "distribution": label_distribution(labels),
  1318 + "total": live["total"],
  1319 + }
  1320 + )
  1321 + aggregate = aggregate_metrics([item["metrics"] for item in per_query])
  1322 + aggregate_distribution = {
  1323 + RELEVANCE_EXACT: sum(item["distribution"][RELEVANCE_EXACT] for item in per_query),
  1324 + RELEVANCE_PARTIAL: sum(item["distribution"][RELEVANCE_PARTIAL] for item in per_query),
  1325 + RELEVANCE_IRRELEVANT: sum(item["distribution"][RELEVANCE_IRRELEVANT] for item in per_query),
  1326 + }
  1327 + batch_id = f"batch_{utc_timestamp()}_{sha1_text(self.tenant_id + '|' + '|'.join(queries))[:10]}"
  1328 + report_dir = ensure_dir(self.artifact_root / "batch_reports")
  1329 + config_snapshot_path = report_dir / f"{batch_id}_config.json"
  1330 + config_snapshot = requests.get("http://localhost:6002/admin/config", timeout=20).json()
  1331 + config_snapshot_path.write_text(json.dumps(config_snapshot, ensure_ascii=False, indent=2), encoding="utf-8")
  1332 + output_json_path = report_dir / f"{batch_id}.json"
  1333 + report_md_path = report_dir / f"{batch_id}.md"
  1334 + payload = {
  1335 + "batch_id": batch_id,
  1336 + "created_at": utc_now_iso(),
  1337 + "tenant_id": self.tenant_id,
  1338 + "queries": list(queries),
  1339 + "top_k": top_k,
  1340 + "aggregate_metrics": aggregate,
  1341 + "aggregate_distribution": aggregate_distribution,
  1342 + "per_query": per_query,
  1343 + "config_snapshot_path": str(config_snapshot_path),
  1344 + }
  1345 + output_json_path.write_text(json.dumps(payload, ensure_ascii=False, indent=2), encoding="utf-8")
  1346 + report_md_path.write_text(render_batch_report_markdown(payload), encoding="utf-8")
  1347 + self.store.insert_batch_run(batch_id, self.tenant_id, output_json_path, report_md_path, config_snapshot_path, payload)
  1348 + return payload
  1349 +
  1350 +
def render_batch_report_markdown(payload: Dict[str, Any]) -> str:
    """Render a batch-evaluation payload as a Markdown report."""
    out: List[str] = []
    out.append("# Search Batch Evaluation")
    out.append("")
    out.append(f"- Batch ID: {payload['batch_id']}")
    out.append(f"- Created at: {payload['created_at']}")
    out.append(f"- Tenant ID: {payload['tenant_id']}")
    out.append(f"- Query count: {len(payload['queries'])}")
    out.append(f"- Top K: {payload['top_k']}")
    out.append("")
    out.append("## Aggregate Metrics")
    out.append("")
    # Metric names sorted for stable, diffable reports.
    aggregate = payload.get("aggregate_metrics") or {}
    for name in sorted(aggregate):
        out.append(f"- {name}: {aggregate[name]}")
    overall = payload.get("aggregate_distribution") or {}
    if overall:
        out.append("")
        out.append("## Label Distribution")
        out.append("")
        out.append(f"- Exact: {overall.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {overall.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {overall.get(RELEVANCE_IRRELEVANT, 0)}")
    out.append("")
    out.append("## Per Query")
    out.append("")
    for entry in payload.get("per_query") or []:
        out.append(f"### {entry['query']}")
        out.append("")
        metrics = entry.get("metrics") or {}
        for name in sorted(metrics):
            out.append(f"- {name}: {metrics[name]}")
        dist = entry.get("distribution") or {}
        out.append(f"- Exact: {dist.get(RELEVANCE_EXACT, 0)}")
        out.append(f"- Partial: {dist.get(RELEVANCE_PARTIAL, 0)}")
        out.append(f"- Irrelevant: {dist.get(RELEVANCE_IRRELEVANT, 0)}")
        out.append("")
    return "\n".join(out)
  1390 +
  1391 +
class SearchEvalRequest(BaseModel):
    """Request body for POST /api/search-eval: one live-query evaluation."""

    query: str
    # Number of top results to label and score; bounded server-side.
    top_k: int = Field(default=100, ge=1, le=500)
    # When True, missing labels are generated by the judge before scoring.
    auto_annotate: bool = True
    language: str = "en"
  1397 +
  1398 +
class BatchEvalRequest(BaseModel):
    """Request body for POST /api/batch-eval: evaluate many queries.

    When ``queries`` is None the server falls back to the shared query file.
    """

    queries: Optional[List[str]] = None
    # Number of top results to label and score per query; bounded server-side.
    top_k: int = Field(default=100, ge=1, le=500)
    # When True, missing labels are generated by the judge before scoring.
    auto_annotate: bool = True
    language: str = "en"
    # When True, stored labels are discarded and everything is re-annotated.
    force_refresh_labels: bool = False
  1405 +
  1406 +
def create_web_app(framework: SearchEvaluationFramework, query_file: Path = DEFAULT_QUERY_FILE) -> FastAPI:
    """Build the FastAPI app serving the evaluation web UI.

    Routes:
        GET  /                -> inline single-page UI (WEB_APP_HTML)
        GET  /api/queries     -> queries parsed from query_file
        POST /api/search-eval -> single-query live evaluation
        POST /api/batch-eval  -> batch evaluation (falls back to query_file)
        GET  /api/history     -> last 20 batch runs from the store
    """
    app = FastAPI(title="Search Evaluation UI", version="1.0.0")

    @app.get("/", response_class=HTMLResponse)
    def home() -> str:
        # The whole UI is a single inline HTML document.
        return WEB_APP_HTML

    @app.get("/api/queries")
    def api_queries() -> Dict[str, Any]:
        return {"queries": framework.queries_from_file(query_file)}

    @app.post("/api/search-eval")
    def api_search_eval(request: SearchEvalRequest) -> Dict[str, Any]:
        return framework.evaluate_live_query(
            query=request.query,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
        )

    @app.post("/api/batch-eval")
    def api_batch_eval(request: BatchEvalRequest) -> Dict[str, Any]:
        # Fall back to the shared query file when the request carries no queries.
        queries = request.queries or framework.queries_from_file(query_file)
        if not queries:
            raise HTTPException(status_code=400, detail="No queries provided")
        return framework.batch_evaluate(
            queries=queries,
            top_k=request.top_k,
            auto_annotate=request.auto_annotate,
            language=request.language,
            force_refresh_labels=request.force_refresh_labels,
        )

    @app.get("/api/history")
    def api_history() -> Dict[str, Any]:
        return {"history": framework.store.list_batch_runs(limit=20)}

    return app
  1445 +
  1446 +
# Inline single-page UI served at "/": loads queries from the query file, runs
# single/batch evaluations against the /api endpoints above, and renders
# metrics, labeled results, and batch history. Kept inline so the evaluation
# service ships as one file.
WEB_APP_HTML = """
<!doctype html>
<html lang="en">
<head>
  <meta charset="utf-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1" />
  <title>Search Evaluation</title>
  <style>
    :root {
      --bg: #f5f3ed;
      --panel: #fffdf8;
      --ink: #1f2a24;
      --muted: #6b756e;
      --line: #ddd4c6;
      --accent: #0f766e;
      --exact: #0f766e;
      --partial: #b7791f;
      --irrelevant: #b42318;
    }
    body { margin: 0; font-family: "IBM Plex Sans", "Segoe UI", sans-serif; color: var(--ink); background:
      radial-gradient(circle at top left, #f0e6d6 0, transparent 28%),
      linear-gradient(180deg, #f9f6f0 0%, #f0ece3 100%); }
    .app { display: grid; grid-template-columns: 280px 1fr; min-height: 100vh; }
    .sidebar { border-right: 1px solid var(--line); padding: 20px; background: rgba(255,255,255,0.55); backdrop-filter: blur(10px); }
    .main { padding: 24px; }
    h1, h2 { margin: 0 0 12px; }
    .muted { color: var(--muted); }
    .query-list { max-height: 60vh; overflow: auto; border: 1px solid var(--line); background: var(--panel); border-radius: 14px; padding: 8px; }
    .query-item { display: block; width: 100%; border: 0; background: transparent; text-align: left; padding: 10px 12px; border-radius: 10px; cursor: pointer; }
    .query-item:hover { background: #eef6f4; }
    .toolbar { display: flex; gap: 12px; flex-wrap: wrap; align-items: center; margin-bottom: 16px; }
    input[type=text] { flex: 1 1 420px; padding: 12px 14px; border-radius: 14px; border: 1px solid var(--line); font-size: 15px; }
    button { border: 0; background: var(--accent); color: white; padding: 12px 16px; border-radius: 14px; cursor: pointer; font-weight: 600; }
    button.secondary { background: #d9e6e3; color: #12433d; }
    .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(170px, 1fr)); gap: 12px; margin-bottom: 16px; }
    .metric { background: var(--panel); border: 1px solid var(--line); border-radius: 16px; padding: 14px; }
    .metric .label { font-size: 12px; color: var(--muted); text-transform: uppercase; letter-spacing: 0.04em; }
    .metric .value { font-size: 24px; font-weight: 700; margin-top: 4px; }
    .results { display: grid; gap: 10px; }
    .result { display: grid; grid-template-columns: 110px 100px 1fr; gap: 14px; align-items: center; background: var(--panel); border: 1px solid var(--line); border-radius: 18px; padding: 12px; }
    .badge { display: inline-block; padding: 8px 10px; border-radius: 999px; color: white; font-weight: 700; text-align: center; }
    .Exact { background: var(--exact); }
    .Partial { background: var(--partial); }
    .Irrelevant { background: var(--irrelevant); }
    .Unknown { background: #637381; }
    .thumb { width: 100px; height: 100px; object-fit: cover; border-radius: 14px; background: #e7e1d4; }
    .title { font-size: 16px; font-weight: 700; margin-bottom: 8px; }
    .options { color: var(--muted); line-height: 1.5; font-size: 14px; }
    .section { margin-bottom: 28px; }
    .history { font-size: 13px; line-height: 1.5; }
  </style>
</head>
<body>
  <div class="app">
    <aside class="sidebar">
      <h2>Queries</h2>
      <p class="muted">Loaded from <code>scripts/evaluation/queries/queries.txt</code></p>
      <div id="queryList" class="query-list"></div>
      <div class="section">
        <h2>History</h2>
        <div id="history" class="history muted">Loading...</div>
      </div>
    </aside>
    <main class="main">
      <h1>Search Evaluation</h1>
      <p class="muted">Single-query evaluation and batch evaluation share the same service on port 6010.</p>
      <div class="toolbar">
        <input id="queryInput" type="text" placeholder="Search query" />
        <button onclick="runSingle()">Evaluate Query</button>
        <button class="secondary" onclick="runBatch()">Batch Evaluation</button>
      </div>
      <div id="status" class="muted section"></div>
      <section class="section">
        <h2>Metrics</h2>
        <div id="metrics" class="grid"></div>
      </section>
      <section class="section">
        <h2>Top Results</h2>
        <div id="results" class="results"></div>
      </section>
    </main>
  </div>
  <script>
    async function fetchJSON(url, options) {
      const res = await fetch(url, options);
      if (!res.ok) throw new Error(await res.text());
      return await res.json();
    }
    function renderMetrics(metrics) {
      const root = document.getElementById('metrics');
      root.innerHTML = '';
      Object.entries(metrics || {}).forEach(([key, value]) => {
        const card = document.createElement('div');
        card.className = 'metric';
        card.innerHTML = `<div class="label">${key}</div><div class="value">${value}</div>`;
        root.appendChild(card);
      });
    }
    function renderResults(results) {
      const root = document.getElementById('results');
      root.innerHTML = '';
      (results || []).forEach(item => {
        const label = item.label || 'Unknown';
        const box = document.createElement('div');
        box.className = 'result';
        box.innerHTML = `
          <div><span class="badge ${label}">${label}</span><div class="muted" style="margin-top:8px">#${item.rank}</div></div>
          <img class="thumb" src="${item.image_url || ''}" alt="" />
          <div>
            <div class="title">${item.title || ''}</div>
            <div class="options">
              <div>${(item.option_values || [])[0] || ''}</div>
              <div>${(item.option_values || [])[1] || ''}</div>
              <div>${(item.option_values || [])[2] || ''}</div>
            </div>
          </div>`;
        root.appendChild(box);
      });
    }
    async function loadQueries() {
      const data = await fetchJSON('/api/queries');
      const root = document.getElementById('queryList');
      root.innerHTML = '';
      data.queries.forEach(query => {
        const btn = document.createElement('button');
        btn.className = 'query-item';
        btn.textContent = query;
        btn.onclick = () => {
          document.getElementById('queryInput').value = query;
          runSingle();
        };
        root.appendChild(btn);
      });
    }
    async function loadHistory() {
      const data = await fetchJSON('/api/history');
      const root = document.getElementById('history');
      root.innerHTML = (data.history || []).map(item =>
        `<div><strong>${item.batch_id}</strong><br/>${item.created_at}<br/>${item.report_markdown_path}</div><br/>`
      ).join('') || 'No history yet.';
    }
    async function runSingle() {
      const query = document.getElementById('queryInput').value.trim();
      if (!query) return;
      document.getElementById('status').textContent = `Evaluating "${query}"...`;
      const data = await fetchJSON('/api/search-eval', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({query, top_k: 100, auto_annotate: true})
      });
      document.getElementById('status').textContent = `Done. total=${data.total}`;
      renderMetrics(data.metrics);
      renderResults(data.results);
      loadHistory();
    }
    async function runBatch() {
      document.getElementById('status').textContent = 'Running batch evaluation...';
      const data = await fetchJSON('/api/batch-eval', {
        method: 'POST',
        headers: {'Content-Type': 'application/json'},
        body: JSON.stringify({top_k: 100, auto_annotate: true})
      });
      document.getElementById('status').textContent = `Batch done. report=${data.batch_id}`;
      renderMetrics(data.aggregate_metrics);
      renderResults([]);
      loadHistory();
    }
    loadQueries();
    loadHistory();
  </script>
</body>
</html>
"""
  1620 +
  1621 +
def build_cli_parser() -> argparse.ArgumentParser:
    """Build the CLI parser with the ``build``/``batch``/``audit``/``serve`` subcommands."""
    parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
    subcommands = parser.add_subparsers(dest="command", required=True)

    def new_subcommand(name: str, help_text: str) -> argparse.ArgumentParser:
        # Every subcommand shares the tenant and query-file options.
        sub = subcommands.add_parser(name, help=help_text)
        sub.add_argument("--tenant-id", default="163")
        sub.add_argument("--queries-file", default=str(DEFAULT_QUERY_FILE))
        return sub

    build = new_subcommand("build", "Build pooled annotation set for queries")
    build.add_argument("--search-depth", type=int, default=1000)
    build.add_argument("--rerank-depth", type=int, default=10000)
    build.add_argument("--annotate-search-top-k", type=int, default=120)
    build.add_argument("--annotate-rerank-top-k", type=int, default=200)
    build.add_argument("--language", default="en")
    build.add_argument("--force-refresh-rerank", action="store_true")
    build.add_argument("--force-refresh-labels", action="store_true")

    batch = new_subcommand("batch", "Run batch evaluation against live search")
    batch.add_argument("--top-k", type=int, default=100)
    batch.add_argument("--language", default="en")
    batch.add_argument("--force-refresh-labels", action="store_true")

    audit = new_subcommand("audit", "Audit annotation quality for queries")
    audit.add_argument("--top-k", type=int, default=100)
    audit.add_argument("--language", default="en")
    audit.add_argument("--limit-suspicious", type=int, default=5)
    audit.add_argument("--force-refresh-labels", action="store_true")

    serve = new_subcommand("serve", "Serve evaluation web UI on port 6010")
    serve.add_argument("--host", default="0.0.0.0")
    serve.add_argument("--port", type=int, default=6010)

    return parser
  1659 +
  1660 +
def run_build(args: argparse.Namespace) -> None:
    """Build a pooled annotation set per query and write a JSON run summary."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    rows = []
    for query in framework.queries_from_file(Path(args.queries_file)):
        build = framework.build_query_annotation_set(
            query=query,
            search_depth=args.search_depth,
            rerank_depth=args.rerank_depth,
            annotate_search_top_k=args.annotate_search_top_k,
            annotate_rerank_top_k=args.annotate_rerank_top_k,
            language=args.language,
            force_refresh_rerank=args.force_refresh_rerank,
            force_refresh_labels=args.force_refresh_labels,
        )
        rows.append(
            {
                "query": build.query,
                "search_total": build.search_total,
                "search_depth": build.search_depth,
                "rerank_corpus_size": build.rerank_corpus_size,
                "annotated_count": build.annotated_count,
                "output_json_path": str(build.output_json_path),
            }
        )
        print(
            f"[build] query={build.query!r} search_total={build.search_total} "
            f"search_depth={build.search_depth} corpus={build.rerank_corpus_size} "
            f"annotated={build.annotated_count} output={build.output_json_path}"
        )
    # Timestamped summary file under the framework's artifact root.
    out_path = ensure_dir(framework.artifact_root / "query_builds") / f"build_summary_{utc_timestamp()}.json"
    out_path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] summary={out_path}")
  1694 +
  1695 +
def run_batch(args: argparse.Namespace) -> None:
    """Run a live batch evaluation over the query file and print the batch id/metrics."""
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    query_list = framework.queries_from_file(Path(args.queries_file))
    payload = framework.batch_evaluate(
        queries=query_list,
        top_k=args.top_k,
        auto_annotate=True,
        language=args.language,
        force_refresh_labels=args.force_refresh_labels,
    )
    print(f"[done] batch_id={payload['batch_id']} aggregate_metrics={payload['aggregate_metrics']}")
  1707 +
  1708 +
def run_audit(args: argparse.Namespace) -> None:
    """Audit annotation quality for each query and write a JSON audit report.

    With ``--force-refresh-labels``, the labels for the live top-k results are
    re-annotated first and the audit then runs with ``auto_annotate=False``;
    otherwise a single auto-annotating audit pass is performed.

    Fix: the previous implementation always ran ``audit_live_query`` once
    before the refresh and then discarded that result, doubling the live
    search load per query when labels were being refreshed.  Only the final
    audit pass is kept (assumes ``audit_live_query`` with
    ``auto_annotate=False`` is read-only — confirmed against its usage here).
    """
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    queries = framework.queries_from_file(Path(args.queries_file))
    audit_items = []
    for query in queries:
        if args.force_refresh_labels:
            # Re-annotate the live top-k before auditing.
            live_payload = framework.search_client.search(
                query=query, size=max(args.top_k, 100), from_=0, language=args.language
            )
            framework.annotate_missing_labels(
                query=query,
                docs=list(live_payload.get("results") or [])[: args.top_k],
                force_refresh=True,
            )
        # auto_annotate only when labels were not just force-refreshed,
        # matching the original behavior in both branches.
        item = framework.audit_live_query(
            query=query,
            top_k=args.top_k,
            language=args.language,
            auto_annotate=not args.force_refresh_labels,
        )
        audit_items.append(
            {
                "query": query,
                "metrics": item["metrics"],
                "distribution": item["distribution"],
                "suspicious_count": len(item["suspicious"]),
                "suspicious_examples": item["suspicious"][: args.limit_suspicious],
            }
        )
        print(
            f"[audit] query={query!r} suspicious={len(item['suspicious'])} metrics={item['metrics']}"
        )

    summary = {
        "created_at": utc_now_iso(),
        "tenant_id": args.tenant_id,
        "top_k": args.top_k,
        "query_count": len(queries),
        "total_suspicious": sum(item["suspicious_count"] for item in audit_items),
        "queries": audit_items,
    }
    out_path = ensure_dir(framework.artifact_root / "audits") / f"audit_{utc_timestamp()}.json"
    out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[done] audit={out_path}")
  1757 +
  1758 +
def run_serve(args: argparse.Namespace) -> None:
    """Start the evaluation web UI under uvicorn (blocks until shutdown)."""
    # Deferred import: only the `serve` subcommand needs the web server.
    import uvicorn

    framework = SearchEvaluationFramework(tenant_id=args.tenant_id)
    web_app = create_web_app(framework, Path(args.queries_file))
    uvicorn.run(web_app, host=args.host, port=args.port, log_level="info")
  1765 +
  1766 +
def main() -> None:
    """CLI entry point: parse arguments and dispatch to the subcommand handler."""
    args = build_cli_parser().parse_args()
    handlers = {
        "build": run_build,
        "batch": run_batch,
        "audit": run_audit,
        "serve": run_serve,
    }
    handler = handlers.get(args.command)
    if handler is None:
        # Unreachable with required=True subparsers; kept as a guard.
        raise SystemExit(f"unknown command: {args.command}")
    handler(args)
  1783 +
  1784 +
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
... ...
scripts/evaluation/serve_eval_web.py 0 → 100644
... ... @@ -0,0 +1,14 @@
#!/usr/bin/env python3
"""Thin launcher for the evaluation CLI/web UI.

Makes the project root importable, then delegates to the
``scripts.evaluation.eval_framework`` entry point.
"""

from pathlib import Path
import sys

# Ensure `scripts.evaluation` resolves when this file is executed directly
# (e.g. `python scripts/evaluation/serve_eval_web.py`).
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from scripts.evaluation.eval_framework import main


if __name__ == "__main__":
    main()
... ...
scripts/evaluation/tune_fusion.py 0 → 100644
... ... @@ -0,0 +1,296 @@
  1 +#!/usr/bin/env python3
  2 +
  3 +from __future__ import annotations
  4 +
import argparse
import ast
import copy
import json
import re
import subprocess
import sys
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, List

import requests
import yaml
  18 +
# Ensure `scripts.evaluation` resolves when this file is executed directly.
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

from scripts.evaluation.eval_framework import (
    DEFAULT_ARTIFACT_ROOT,
    DEFAULT_QUERY_FILE,
    ensure_dir,
    utc_now_iso,
    utc_timestamp,
)


# Live config file that experiments patch in place (restored or replaced by
# the best candidate when a tuning run finishes — see main()).
CONFIG_PATH = PROJECT_ROOT / "config" / "config.yaml"
  33 +
  34 +
@dataclass
class ExperimentSpec:
    """A single tuning experiment: a named set of config overrides."""

    # Unique experiment name; used for config snapshots and report rows.
    name: str
    # Free-text description surfaced in the markdown report.
    description: str
    # Dotted config paths mapped to override values (applied via set_nested_value).
    params: Dict[str, Any]
  40 +
  41 +
def load_yaml(path: Path) -> Dict[str, Any]:
    """Read and parse a UTF-8 YAML file into a mapping."""
    raw = path.read_text(encoding="utf-8")
    return yaml.safe_load(raw)
  44 +
  45 +
def write_yaml(path: Path, payload: Dict[str, Any]) -> None:
    """Serialize ``payload`` as YAML (insertion order kept, unicode unescaped) to ``path``."""
    rendered = yaml.safe_dump(payload, sort_keys=False, allow_unicode=True)
    path.write_text(rendered, encoding="utf-8")
  51 +
  52 +
def set_nested_value(payload: Dict[str, Any], dotted_path: str, value: Any) -> None:
    """Set ``value`` at ``dotted_path`` (e.g. ``"a.b.c"``) inside nested dicts, in place.

    Intermediate keys must already exist; a missing key raises KeyError.
    """
    *parents, leaf = dotted_path.split(".")
    node = payload
    for key in parents:
        node = node[key]
    node[leaf] = value
  59 +
  60 +
def apply_params(base_config: Dict[str, Any], params: Dict[str, Any]) -> Dict[str, Any]:
    """Return a deep copy of ``base_config`` with dotted-path overrides applied."""
    patched = copy.deepcopy(base_config)
    for dotted_path, override in params.items():
        set_nested_value(patched, dotted_path, override)
    return patched
  66 +
  67 +
def wait_for_backend(base_url: str, timeout_sec: float = 300.0) -> Dict[str, Any]:
    """Poll ``{base_url}/health`` until it reports ``status == "healthy"``.

    Returns the health payload on success; raises RuntimeError carrying the
    last observed payload/error once ``timeout_sec`` elapses.
    """
    health_url = f"{base_url.rstrip('/')}/health"
    deadline = time.time() + timeout_sec
    last_error = None
    while True:
        if time.time() >= deadline:
            raise RuntimeError(f"backend did not become healthy: {last_error}")
        try:
            response = requests.get(health_url, timeout=10)
            response.raise_for_status()
            payload = response.json()
            if str(payload.get("status")) == "healthy":
                return payload
            # Reachable but not healthy yet; remember the payload for the error.
            last_error = payload
        except Exception as exc:  # noqa: BLE001
            last_error = str(exc)
        time.sleep(2.0)
  83 +
  84 +
def run_restart() -> None:
    """Restart the backend via the project restart script (blocks, 10 min cap)."""
    command = ["./restart.sh", "backend"]
    subprocess.run(command, cwd=PROJECT_ROOT, check=True, timeout=600)
  87 +
  88 +
def read_queries(path: Path) -> List[str]:
    """Read one query per line, skipping blanks and ``#`` comment lines."""
    queries: List[str] = []
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        stripped = raw_line.strip()
        if stripped and not stripped.startswith("#"):
            queries.append(stripped)
    return queries
  95 +
  96 +
def run_batch_eval(
    *,
    tenant_id: str,
    queries_file: Path,
    top_k: int,
    language: str,
    force_refresh_labels: bool,
) -> Dict[str, Any]:
    """Run the batch-evaluation CLI in a subprocess and parse its summary line.

    Returns a dict with ``batch_id``, ``aggregate_metrics`` and the combined
    ``raw_output``.  Raises RuntimeError when the expected
    ``batch_id=... aggregate_metrics={...}`` line is absent from the output,
    and subprocess.CalledProcessError when the child exits non-zero.
    """
    cmd = [
        str(PROJECT_ROOT / ".venv" / "bin" / "python"),
        "scripts/evaluation/build_annotation_set.py",
        "batch",
        "--tenant-id",
        str(tenant_id),
        "--queries-file",
        str(queries_file),
        "--top-k",
        str(top_k),
        "--language",
        language,
    ]
    if force_refresh_labels:
        cmd.append("--force-refresh-labels")
    completed = subprocess.run(
        cmd,
        cwd=PROJECT_ROOT,
        check=True,
        capture_output=True,
        text=True,
        timeout=7200,
    )
    output = (completed.stdout or "") + "\n" + (completed.stderr or "")
    match = re.search(r"batch_id=([A-Za-z0-9_]+)\s+aggregate_metrics=(\{.*\})", output)
    if not match:
        raise RuntimeError(f"failed to parse batch output: {output[-2000:]}")
    batch_id = match.group(1)
    # The metrics are printed as a Python dict repr.  Parse it with
    # ast.literal_eval instead of the previous `.replace("'", '"')` +
    # json.loads hack, which broke whenever a key or value contained an
    # apostrophe (and mangled any embedded double quotes).
    aggregate_metrics = ast.literal_eval(match.group(2))
    return {
        "batch_id": batch_id,
        "aggregate_metrics": aggregate_metrics,
        "raw_output": output,
    }
  139 +
  140 +
def render_markdown(summary: Dict[str, Any]) -> str:
    """Render the tuning-run summary as a markdown report (ranking table + details)."""
    lines: List[str] = [
        "# Fusion Tuning Report",
        "",
        f"- Created at: {summary['created_at']}",
        f"- Tenant ID: {summary['tenant_id']}",
        f"- Query count: {summary['query_count']}",
        f"- Top K: {summary['top_k']}",
        f"- Score metric: {summary['score_metric']}",
        "",
        "## Experiments",
        "",
        "| Rank | Name | Score | MAP_3 | MAP_2_3 | P@5 | P@10 | Config |",
        "|---|---|---:|---:|---:|---:|---:|---|",
    ]
    # One table row per experiment, in the summary's (pre-sorted) order.
    for rank, experiment in enumerate(summary["experiments"], start=1):
        metrics = experiment["aggregate_metrics"]
        cells = [
            str(rank),
            experiment["name"],
            str(experiment["score"]),
            str(metrics.get("MAP_3", "")),
            str(metrics.get("MAP_2_3", "")),
            str(metrics.get("P@5", "")),
            str(metrics.get("P@10", "")),
            experiment["config_snapshot_path"],
        ]
        lines.append(f"| {' | '.join(cells)} |")
    lines += ["", "## Details", ""]
    for experiment in summary["experiments"]:
        params_json = json.dumps(experiment["params"], ensure_ascii=False, sort_keys=True)
        lines += [
            f"### {experiment['name']}",
            "",
            f"- Description: {experiment['description']}",
            f"- Score: {experiment['score']}",
            f"- Params: `{params_json}`",
            f"- Batch report: {experiment['batch_report_path']}",
            "",
        ]
    return "\n".join(lines)
  184 +
  185 +
def load_experiments(path: Path) -> List[ExperimentSpec]:
    """Load experiment specs from JSON: either a bare list or ``{"experiments": [...]}``."""
    payload = json.loads(path.read_text(encoding="utf-8"))
    raw_items = payload["experiments"] if isinstance(payload, dict) else payload
    return [
        ExperimentSpec(
            name=str(raw["name"]),
            description=str(raw.get("description") or ""),
            params=dict(raw.get("params") or {}),
        )
        for raw in raw_items
    ]
  199 +
  200 +
def build_parser() -> argparse.ArgumentParser:
    """Define the CLI options for the fusion tuning driver."""
    parser = argparse.ArgumentParser(
        description="Run fusion tuning experiments against the live backend"
    )
    # (flag, add_argument kwargs) pairs, in help-display order.
    option_specs = [
        ("--tenant-id", {"default": "163"}),
        ("--queries-file", {"default": str(DEFAULT_QUERY_FILE)}),
        ("--top-k", {"type": int, "default": 100}),
        ("--language", {"default": "en"}),
        ("--experiments-file", {"required": True}),
        ("--search-base-url", {"default": "http://127.0.0.1:6002"}),
        ("--score-metric", {"default": "MAP_3"}),
        ("--apply-best", {"action": "store_true"}),
        ("--force-refresh-labels-first-pass", {"action": "store_true"}),
    ]
    for flag, kwargs in option_specs:
        parser.add_argument(flag, **kwargs)
    return parser
  213 +
  214 +
def main() -> None:
    """Drive the tuning loop.

    For each experiment: patch config.yaml in place, restart the backend,
    wait for health, run a batch evaluation, and record the score.  On exit
    (even after a failure) either the best-scoring config is applied or the
    original config text is restored byte-for-byte, and the backend is
    restarted again.  JSON and markdown summaries are written at the end.
    """
    args = build_parser().parse_args()
    queries_file = Path(args.queries_file)
    queries = read_queries(queries_file)
    # Keep the raw text so the original file can be restored exactly.
    base_config_text = CONFIG_PATH.read_text(encoding="utf-8")
    base_config = load_yaml(CONFIG_PATH)
    experiments = load_experiments(Path(args.experiments_file))

    tuning_dir = ensure_dir(DEFAULT_ARTIFACT_ROOT / "tuning_runs")
    run_id = f"tuning_{utc_timestamp()}"
    run_dir = ensure_dir(tuning_dir / run_id)
    results: List[Dict[str, Any]] = []

    try:
        for experiment in experiments:
            # Overrides are always applied to the pristine base config, not
            # cumulatively to the previous experiment's config.
            candidate = apply_params(base_config, experiment.params)
            write_yaml(CONFIG_PATH, candidate)
            # Snapshot the candidate config for the report.
            candidate_config_path = run_dir / f"{experiment.name}_config.yaml"
            write_yaml(candidate_config_path, candidate)

            run_restart()
            health = wait_for_backend(args.search_base_url)
            # Labels are force-refreshed only on the very first experiment
            # (while `results` is still empty); later runs reuse them.
            batch_result = run_batch_eval(
                tenant_id=args.tenant_id,
                queries_file=queries_file,
                top_k=args.top_k,
                language=args.language,
                force_refresh_labels=bool(args.force_refresh_labels_first_pass and not results),
            )
            aggregate_metrics = dict(batch_result["aggregate_metrics"])
            results.append(
                {
                    "name": experiment.name,
                    "description": experiment.description,
                    "params": experiment.params,
                    "aggregate_metrics": aggregate_metrics,
                    # Missing score metric counts as 0.0 so ranking still works.
                    "score": float(aggregate_metrics.get(args.score_metric, 0.0)),
                    "batch_id": batch_result["batch_id"],
                    "batch_report_path": str(
                        DEFAULT_ARTIFACT_ROOT / "batch_reports" / f"{batch_result['batch_id']}.md"
                    ),
                    "config_snapshot_path": str(candidate_config_path),
                    "backend_health": health,
                    "batch_stdout": batch_result["raw_output"],
                }
            )
            print(
                f"[tune] {experiment.name} score={aggregate_metrics.get(args.score_metric)} "
                f"metrics={aggregate_metrics}"
            )
    finally:
        # Runs even if an experiment failed part-way: either promote the best
        # config seen so far, or restore the original text exactly; in both
        # cases the backend is restarted onto the final config.
        if args.apply_best and results:
            best = max(results, key=lambda item: item["score"])
            best_config = apply_params(base_config, best["params"])
            write_yaml(CONFIG_PATH, best_config)
            run_restart()
            wait_for_backend(args.search_base_url)
        else:
            CONFIG_PATH.write_text(base_config_text, encoding="utf-8")
            run_restart()
            wait_for_backend(args.search_base_url)

    # Best score first in the report.
    results.sort(key=lambda item: item["score"], reverse=True)
    summary = {
        "run_id": run_id,
        "created_at": utc_now_iso(),
        "tenant_id": args.tenant_id,
        "query_count": len(queries),
        "top_k": args.top_k,
        "score_metric": args.score_metric,
        "experiments": results,
    }
    summary_json_path = run_dir / "summary.json"
    summary_md_path = run_dir / "summary.md"
    summary_json_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    summary_md_path.write_text(render_markdown(summary), encoding="utf-8")
    print(f"[done] summary_json={summary_json_path}")
    print(f"[done] summary_md={summary_md_path}")
  293 +
  294 +
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
... ...