c81b0fc1
tangwang
scripts/evaluatio...
|
1
2
3
4
5
6
|
"""CLI: build annotations, batch eval, audit, serve web UI."""
from __future__ import annotations
import argparse
import json
|
cdd8ee3a
tangwang
eval框架日志独立
|
7
|
import logging
|
310bb3bc
tangwang
eval tools
|
8
|
import shutil
|
c81b0fc1
tangwang
scripts/evaluatio...
|
9
|
from pathlib import Path
|
bdb65283
tangwang
标注框架 批量标注
|
10
|
from typing import Any, Dict
|
c81b0fc1
tangwang
scripts/evaluatio...
|
11
|
|
2059d959
tangwang
feat(eval): 多评估集统...
|
12
13
14
|
from config.loader import get_app_config
from .datasets import audits_dir, query_builds_dir, resolve_dataset
|
c81b0fc1
tangwang
scripts/evaluatio...
|
15
|
from .framework import SearchEvaluationFramework
|
cdd8ee3a
tangwang
eval框架日志独立
|
16
|
from .logging_setup import setup_eval_logging
|
c81b0fc1
tangwang
scripts/evaluatio...
|
17
18
19
|
from .utils import ensure_dir, utc_now_iso, utc_timestamp
from .web_app import create_web_app
|
cdd8ee3a
tangwang
eval框架日志独立
|
20
21
|
# Logger for CLI progress/error messages; a child of the "search_eval" logger namespace.
_cli_log = logging.getLogger("search_eval.cli")
|
c81b0fc1
tangwang
scripts/evaluatio...
|
22
|
|
2059d959
tangwang
feat(eval): 多评估集统...
|
23
|
def _reset_build_artifacts(dataset_id: str) -> None:
    """Delete the ``query_builds`` and ``audits`` directories of one dataset.

    Both directories are resolved under the configured
    ``search_evaluation.artifact_root``. Directories that do not exist are
    skipped, and the outcome (what was removed, or that nothing was) is logged.

    Args:
        dataset_id: Identifier of the dataset whose artifacts are reset.
    """
    artifact_root = get_app_config().search_evaluation.artifact_root
    # Resolve both locations up front, before anything is deleted.
    candidates = (
        query_builds_dir(artifact_root, dataset_id),
        audits_dir(artifact_root, dataset_id),
    )

    removed = []
    for directory in candidates:
        if not directory.exists():
            continue
        shutil.rmtree(directory)
        removed.append(str(directory))

    if not removed:
        _cli_log.info("[build] no previous dataset artifacts to reset under %s for dataset=%s", artifact_root, dataset_id)
        return
    _cli_log.info("[build] reset dataset artifacts for %s: %s", dataset_id, ", ".join(removed))
|
310bb3bc
tangwang
eval tools
|
38
39
|
|
bdb65283
tangwang
标注框架 批量标注
|
40
41
42
43
44
|
def add_judge_llm_args(p: argparse.ArgumentParser) -> None:
    """Register the judge-LLM override options on ``p``.

    Adds ``--judge-model`` plus the ``--enable-thinking`` and
    ``--dashscope-batch`` on/off pairs. All three default to ``None`` so that
    "not given on the command line" can be told apart from an explicit value.

    Args:
        p: Parser (or sub-parser) that receives the options.
    """
    p.add_argument(
        "--judge-model",
        metavar="MODEL",
        default=None,
        help="Judge LLM model (default: config.yaml search_evaluation.judge_model).",
    )

    # BooleanOptionalAction also generates the matching ``--no-...`` form.
    toggles = (
        (
            "--enable-thinking",
            "enable_thinking for DashScope (default: search_evaluation.judge_enable_thinking).",
        ),
        (
            "--dashscope-batch",
            "DashScope Batch File API vs sync chat (default: search_evaluation.judge_dashscope_batch).",
        ),
    )
    for flag, help_text in toggles:
        p.add_argument(
            flag,
            action=argparse.BooleanOptionalAction,
            default=None,
            help=help_text,
        )
|
cdd8ee3a
tangwang
eval框架日志独立
|
61
62
63
64
65
|
def add_intent_llm_args(p: argparse.ArgumentParser) -> None:
    """Register the query-intent LLM override options on ``p``.

    Adds ``--intent-model`` and the ``--intent-enable-thinking`` on/off pair.
    Both default to ``None`` so an omitted option stays distinguishable from an
    explicit one.

    Args:
        p: Parser (or sub-parser) that receives the options.
    """
    model_help = "Query-intent LLM model before relevance judging (default: search_evaluation.intent_model)."
    thinking_help = "enable_thinking for intent model (default: search_evaluation.intent_enable_thinking)."

    p.add_argument("--intent-model", metavar="MODEL", default=None, help=model_help)
    # BooleanOptionalAction also generates ``--no-intent-enable-thinking``.
    p.add_argument(
        "--intent-enable-thinking",
        default=None,
        action=argparse.BooleanOptionalAction,
        help=thinking_help,
    )
|
bdb65283
tangwang
标注框架 批量标注
|
76
77
78
79
80
81
82
83
|
def framework_kwargs_from_args(args: argparse.Namespace) -> Dict[str, Any]:
    """Collect the LLM overrides that were explicitly set on ``args``.

    The result is meant to be splatted into ``SearchEvaluationFramework(...)``.
    Only attributes whose value is not ``None`` are included, so anything the
    user did not specify is left out of the returned mapping.

    Every attribute is read with ``getattr(..., None)``: a namespace that was
    built without the judge or intent options yields an empty/partial mapping
    instead of raising ``AttributeError``.

    Args:
        args: Parsed CLI namespace (may lack any of the LLM attributes).

    Returns:
        Mapping of framework keyword name to the explicitly provided value.
    """
    # (namespace attribute, framework keyword) - note the one renamed key.
    overrides = (
        ("judge_model", "judge_model"),
        ("enable_thinking", "enable_thinking"),
        ("dashscope_batch", "use_dashscope_batch"),
        ("intent_model", "intent_model"),
        ("intent_enable_thinking", "intent_enable_thinking"),
    )
    kw: Dict[str, Any] = {}
    for attr_name, keyword in overrides:
        value = getattr(args, attr_name, None)
        # ``False`` is a legitimate explicit value, so test against None only.
        if value is not None:
            kw[keyword] = value
    return kw
|
331861d5
tangwang
eval框架配置化
|
91
92
|
def _apply_search_evaluation_cli_defaults(args: argparse.Namespace) -> None:
    """Backfill unset CLI options from ``config.yaml`` ``search_evaluation`` (via ``get_app_config()``).

    ``args`` is modified in place. Identity-like options (dataset, tenant,
    queries file, language, host) treat both ``None`` and ``""`` as "not
    provided"; the remaining options only fall back when they are ``None``.
    Command-specific options are filled only for the active ``args.command``.

    Args:
        args: Parsed CLI namespace; must carry a ``command`` attribute.
    """
    se = get_app_config().search_evaluation

    def _blank(name: str) -> bool:
        # A missing attribute, None and "" all count as "not provided".
        return getattr(args, name, None) in (None, "")

    def _fill_unset(pairs) -> None:
        # The config attribute is only read when the fallback is actually needed.
        for arg_name, config_name in pairs:
            if getattr(args, arg_name, None) is None:
                setattr(args, arg_name, getattr(se, config_name))

    # The default dataset applies only when neither selector was given.
    if _blank("dataset_id") and _blank("queries_file"):
        args.dataset_id = se.default_dataset_id
    if _blank("tenant_id"):
        args.tenant_id = se.default_tenant_id
    if _blank("queries_file"):
        args.queries_file = str(se.queries_file)
    if _blank("language"):
        args.language = se.default_language

    command = args.command
    if command == "serve" and _blank("host"):
        args.host = se.web_host

    # (CLI attribute, config attribute) pairs per sub-command; names differ for
    # the min/max batch options.
    per_command = {
        "serve": (("port", "web_port"),),
        "batch": (("top_k", "batch_top_k"),),
        "audit": (
            ("top_k", "audit_top_k"),
            ("limit_suspicious", "audit_limit_suspicious"),
        ),
        "build": (
            ("search_depth", "build_search_depth"),
            ("rerank_depth", "build_rerank_depth"),
            ("annotate_search_top_k", "annotate_search_top_k"),
            ("annotate_rerank_top_k", "annotate_rerank_top_k"),
            ("search_recall_top_k", "search_recall_top_k"),
            ("rerank_high_threshold", "rerank_high_threshold"),
            ("rerank_high_skip_count", "rerank_high_skip_count"),
            ("rebuild_llm_batch_size", "rebuild_llm_batch_size"),
            ("rebuild_min_batches", "rebuild_min_llm_batches"),
            ("rebuild_max_batches", "rebuild_max_llm_batches"),
            ("rebuild_irrelevant_stop_ratio", "rebuild_irrelevant_stop_ratio"),
            ("rebuild_irrel_low_combined_stop_ratio", "rebuild_irrel_low_combined_stop_ratio"),
            ("rebuild_irrelevant_stop_streak", "rebuild_irrelevant_stop_streak"),
        ),
    }
    _fill_unset(per_command.get(command, ()))
|
2059d959
tangwang
feat(eval): 多评估集统...
|
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
|
def _resolve_dataset_from_args(args: argparse.Namespace, *, require_enabled: bool = False):
    """Resolve the evaluation dataset selected by ``args`` and sync ``args`` to it.

    The dataset id, queries file, tenant id and language found on ``args`` are
    handed to ``resolve_dataset``. The resolved values are then written back
    onto ``args`` so later code sees one consistent set.

    Args:
        args: Parsed CLI namespace; updated in place.
        require_enabled: Forwarded unchanged to ``resolve_dataset``.

    Returns:
        The dataset object returned by ``resolve_dataset``.
    """
    raw_queries_file = getattr(args, "queries_file", None)
    if raw_queries_file in (None, ""):
        query_path = None
    else:
        query_path = Path(str(raw_queries_file)).resolve()

    dataset = resolve_dataset(
        dataset_id=getattr(args, "dataset_id", None),
        query_file=query_path,
        tenant_id=getattr(args, "tenant_id", None),
        language=getattr(args, "language", None),
        require_enabled=require_enabled,
    )

    # Mirror the resolved dataset back onto the namespace.
    args.dataset_id = dataset.dataset_id
    args.queries_file = str(dataset.query_file)
    args.tenant_id = dataset.tenant_id
    args.language = dataset.language
    return dataset
|
c81b0fc1
tangwang
scripts/evaluatio...
|
165
166
167
168
169
|
def build_cli_parser() -> argparse.ArgumentParser:
    """Create the top-level parser with the ``build``, ``batch``, ``audit`` and ``serve`` sub-commands.

    Every value option defaults to ``None``; the chosen sub-command is stored
    in ``command`` and is required. Each sub-command also receives the judge
    and intent LLM options.

    Returns:
        The fully configured ``argparse.ArgumentParser``.
    """

    def _option(target, flag, help_text, value_type=None):
        # ``type=None`` is argparse's own default, so untyped options behave as before.
        target.add_argument(flag, type=value_type, default=None, help=help_text)

    def _switch(target, flag, help_text=None):
        target.add_argument(flag, action="store_true", help=help_text)

    def _llm_options(target):
        add_judge_llm_args(target)
        add_intent_llm_args(target)

    tenant_help = "Default: search_evaluation.default_tenant_id."
    dataset_help = "Named evaluation dataset id from config.yaml."
    queries_help = "Legacy override for query list file. Prefer --dataset-id."
    language_help = "Default: search_evaluation.default_language."

    parser = argparse.ArgumentParser(description="Search evaluation annotation builder and web UI")
    commands = parser.add_subparsers(dest="command", required=True)

    # ---- build -----------------------------------------------------------
    build_cmd = commands.add_parser("build", help="Build pooled annotation set for queries")
    build_options = (
        ("--tenant-id", None, "Tenant id (default: search_evaluation.default_tenant_id in config.yaml)."),
        ("--dataset-id", None, dataset_help),
        ("--queries-file", None, queries_help),
        ("--search-depth", int, "Default: search_evaluation.build_search_depth."),
        ("--rerank-depth", int, "Default: search_evaluation.build_rerank_depth."),
        ("--annotate-search-top-k", int, "Default: search_evaluation.annotate_search_top_k."),
        ("--annotate-rerank-top-k", int, "Default: search_evaluation.annotate_rerank_top_k."),
        (
            "--search-recall-top-k",
            int,
            "Rebuild mode only: top-K search hits enter recall pool with score 1 (default when --force-refresh-labels: 200).",
        ),
        (
            "--rerank-high-threshold",
            float,
            "Rebuild only: count rerank scores above this on non-pool docs (default 0.5).",
        ),
        (
            "--rerank-high-skip-count",
            int,
            "Rebuild only: skip query if more than this many non-pool docs have rerank score > threshold (default 1000).",
        ),
        ("--rebuild-llm-batch-size", int, "Rebuild only: LLM batch size (default 50)."),
        ("--rebuild-min-batches", int, "Rebuild only: min LLM batches before early stop (default 10)."),
        ("--rebuild-max-batches", int, "Rebuild only: max LLM batches (default 40)."),
        (
            "--rebuild-irrelevant-stop-ratio",
            float,
            "Rebuild only: bad batch requires irrelevant_ratio > this (default: search_evaluation.rebuild_irrelevant_stop_ratio).",
        ),
        (
            "--rebuild-irrel-low-combined-stop-ratio",
            float,
            "Rebuild only: bad batch requires (irrelevant+low)/n > this (default 0.959).",
        ),
        (
            "--rebuild-irrelevant-stop-streak",
            int,
            "Rebuild only: consecutive bad batches (both thresholds strict >) before early stop (default 3).",
        ),
        ("--language", None, language_help),
    )
    for flag, value_type, help_text in build_options:
        _option(build_cmd, flag, help_text, value_type)
    _switch(
        build_cmd,
        "--reset-artifacts",
        "Delete dataset-specific query_builds/audits before starting. Shared SQLite cache is preserved.",
    )
    _switch(build_cmd, "--force-refresh-rerank")
    _switch(build_cmd, "--force-refresh-labels")
    _llm_options(build_cmd)

    # ---- batch -----------------------------------------------------------
    batch_cmd = commands.add_parser("batch", help="Run batch evaluation against live search")
    _option(batch_cmd, "--tenant-id", tenant_help)
    _option(batch_cmd, "--dataset-id", dataset_help)
    _option(batch_cmd, "--queries-file", queries_help)
    _option(batch_cmd, "--top-k", "Default: search_evaluation.batch_top_k.", int)
    _option(batch_cmd, "--language", language_help)
    _switch(batch_cmd, "--force-refresh-labels")
    _llm_options(batch_cmd)

    # ---- audit -----------------------------------------------------------
    audit_cmd = commands.add_parser("audit", help="Audit annotation quality for queries")
    _option(audit_cmd, "--tenant-id", tenant_help)
    _option(audit_cmd, "--dataset-id", dataset_help)
    _option(audit_cmd, "--queries-file", queries_help)
    _option(audit_cmd, "--top-k", "Default: search_evaluation.audit_top_k.", int)
    _option(audit_cmd, "--language", language_help)
    _option(audit_cmd, "--limit-suspicious", "Default: search_evaluation.audit_limit_suspicious.", int)
    _switch(audit_cmd, "--force-refresh-labels")
    _llm_options(audit_cmd)

    # ---- serve -----------------------------------------------------------
    serve_cmd = commands.add_parser("serve", help="Serve evaluation web UI on port 6010")
    _option(serve_cmd, "--tenant-id", tenant_help)
    _option(serve_cmd, "--dataset-id", "Initial evaluation dataset id from config.yaml.")
    _option(serve_cmd, "--queries-file", "Legacy initial query file override. Prefer --dataset-id.")
    _option(serve_cmd, "--host", "Default: search_evaluation.web_host.")
    _option(serve_cmd, "--port", "Default: search_evaluation.web_port.", int)
    _llm_options(serve_cmd)

    return parser
def run_build(args: argparse.Namespace) -> None:
    """Build the annotation set for every query of the selected dataset.

    Steps: resolve the dataset, optionally reset its previous artifacts, run
    ``build_query_annotation_set`` per query, then write a JSON summary named
    ``build_summary_<timestamp>.json`` into the dataset's query-builds
    directory.

    The rebuild tuning options are forwarded only when
    ``--force-refresh-labels`` is set. Any failure is logged with the query
    that triggered it and re-raised, so no summary is written for a partial run.

    Args:
        args: Parsed ``build`` namespace with config defaults already applied.
    """
    dataset = _resolve_dataset_from_args(args)
    if args.reset_artifacts:
        _reset_build_artifacts(dataset.dataset_id)
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
    queries = list(dataset.queries)
    summary = []

    rebuild_kwargs = {}
    if args.force_refresh_labels:
        rebuild_kwargs = {
            "search_recall_top_k": args.search_recall_top_k,
            "rerank_high_threshold": args.rerank_high_threshold,
            "rerank_high_skip_count": args.rerank_high_skip_count,
            "rebuild_llm_batch_size": args.rebuild_llm_batch_size,
            "rebuild_min_batches": args.rebuild_min_batches,
            "rebuild_max_batches": args.rebuild_max_batches,
            "rebuild_irrelevant_stop_ratio": args.rebuild_irrelevant_stop_ratio,
            "rebuild_irrel_low_combined_stop_ratio": args.rebuild_irrel_low_combined_stop_ratio,
            "rebuild_irrelevant_stop_streak": args.rebuild_irrelevant_stop_streak,
        }

    total_q = len(queries)
    for q_index, query in enumerate(queries, start=1):
        _cli_log.info("[build] (%s/%s) starting query=%r", q_index, total_q, query)
        try:
            result = framework.build_query_annotation_set(
                query=query,
                dataset=dataset,
                search_depth=args.search_depth,
                rerank_depth=args.rerank_depth,
                annotate_search_top_k=args.annotate_search_top_k,
                annotate_rerank_top_k=args.annotate_rerank_top_k,
                language=args.language,
                force_refresh_rerank=args.force_refresh_rerank,
                force_refresh_labels=args.force_refresh_labels,
                **rebuild_kwargs,
            )
        except Exception:
            # Record which query broke the run, then abort.
            _cli_log.exception("[build] failed query=%r index=%s/%s", query, q_index, total_q)
            raise
        summary.append(
            {
                "query": result.query,
                "search_total": result.search_total,
                "search_depth": result.search_depth,
                "rerank_corpus_size": result.rerank_corpus_size,
                "annotated_count": result.annotated_count,
                "output_json_path": str(result.output_json_path),
            }
        )
        _cli_log.info(
            "[build] query=%r search_total=%s search_depth=%s corpus=%s annotated=%s output=%s",
            result.query,
            result.search_total,
            result.search_depth,
            result.rerank_corpus_size,
            result.annotated_count,
            result.output_json_path,
        )

    # The summary belongs in the dataset-specific directory. The former legacy
    # assignment to <artifact_root>/query_builds was immediately overwritten
    # and only left an unused directory behind, so it is gone.
    out_path = query_builds_dir(framework.artifact_root, dataset.dataset_id) / f"build_summary_{utc_timestamp()}.json"
    # Guarantee the target directory exists (it may have just been reset).
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    _cli_log.info("[done] summary=%s", out_path)
|
c81b0fc1
tangwang
scripts/evaluatio...
|
359
360
361
|
def run_batch(args: argparse.Namespace) -> None:
    """Run one batch evaluation over all queries of the selected dataset.

    The dataset is resolved with ``require_enabled=True``. A failure inside
    ``batch_evaluate`` is logged together with the queries file and re-raised;
    on success the batch id and aggregate metrics are logged.

    Args:
        args: Parsed ``batch`` namespace with config defaults already applied.
    """
    dataset = _resolve_dataset_from_args(args, require_enabled=True)
    framework = SearchEvaluationFramework(
        tenant_id=args.tenant_id,
        **framework_kwargs_from_args(args),
    )
    queries = list(dataset.queries)
    _cli_log.info(
        "[batch] dataset_id=%s queries_file=%s count=%s",
        dataset.dataset_id,
        args.queries_file,
        len(queries),
    )

    try:
        payload = framework.batch_evaluate(
            queries=queries,
            dataset=dataset,
            top_k=args.top_k,
            auto_annotate=True,
            language=args.language,
            force_refresh_labels=args.force_refresh_labels,
        )
    except Exception:
        _cli_log.exception("[batch] failed while evaluating query list from %s", args.queries_file)
        raise
    else:
        _cli_log.info(
            "[done] batch_id=%s aggregate_metrics=%s",
            payload["batch_id"],
            payload["aggregate_metrics"],
        )
|
c81b0fc1
tangwang
scripts/evaluatio...
|
379
380
381
|
def run_audit(args: argparse.Namespace) -> None:
    """Audit the annotations of every query in the selected dataset.

    Each query is audited against live search. With ``--force-refresh-labels``
    the first audit runs without auto-annotation, the top results are then
    fetched and re-annotated with ``force_refresh=True``, and the query is
    audited a second time; that second result is the one reported.

    A JSON report named ``audit_<timestamp>.json`` is written into the
    dataset's audits directory.

    Args:
        args: Parsed ``audit`` namespace with config defaults already applied.
    """
    dataset = _resolve_dataset_from_args(args, require_enabled=True)
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **framework_kwargs_from_args(args))
    queries = list(dataset.queries)

    top_k = args.top_k
    language = args.language
    refresh_labels = args.force_refresh_labels

    audit_items = []
    for query in queries:
        item = framework.audit_live_query(
            query=query,
            top_k=top_k,
            language=language,
            auto_annotate=not refresh_labels,
        )
        if refresh_labels:
            # Fetch at least 100 hits, but only the first top_k are re-labelled.
            live_payload = framework.search_client.search(
                query=query,
                size=max(top_k, 100),
                from_=0,
                language=language,
            )
            top_docs = list(live_payload.get("results") or [])[:top_k]
            framework.annotate_missing_labels(query=query, docs=top_docs, force_refresh=True)
            item = framework.audit_live_query(
                query=query,
                top_k=top_k,
                language=language,
                auto_annotate=False,
            )

        suspicious = item["suspicious"]
        entry = {
            "query": query,
            "metrics": item["metrics"],
            "distribution": item["distribution"],
            "suspicious_count": len(suspicious),
            "suspicious_examples": suspicious[: args.limit_suspicious],
        }
        audit_items.append(entry)
        _cli_log.info(
            "[audit] query=%r suspicious=%s metrics=%s",
            query,
            len(suspicious),
            item["metrics"],
        )

    summary = {
        "created_at": utc_now_iso(),
        "tenant_id": args.tenant_id,
        "dataset": dataset.summary(),
        "top_k": top_k,
        "query_count": len(queries),
        "total_suspicious": sum(entry["suspicious_count"] for entry in audit_items),
        "queries": audit_items,
    }
    out_path = audits_dir(framework.artifact_root, dataset.dataset_id) / f"audit_{utc_timestamp()}.json"
    out_path.write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
    _cli_log.info("[done] audit=%s", out_path)
|
c81b0fc1
tangwang
scripts/evaluatio...
|
434
435
436
|
def run_serve(args: argparse.Namespace) -> None:
    """Start the evaluation web UI with uvicorn.

    The dataset is resolved with ``require_enabled=True`` and its id is passed
    to ``create_web_app`` as the initial dataset. The server binds to
    ``args.host`` / ``args.port``.

    Args:
        args: Parsed ``serve`` namespace with config defaults already applied.
    """
    dataset = _resolve_dataset_from_args(args, require_enabled=True)
    llm_overrides = framework_kwargs_from_args(args)
    framework = SearchEvaluationFramework(tenant_id=args.tenant_id, **llm_overrides)
    web_app = create_web_app(framework, initial_dataset_id=dataset.dataset_id)

    # Imported lazily so the other sub-commands do not require uvicorn.
    import uvicorn

    uvicorn.run(web_app, host=args.host, port=args.port, log_level="info")
def main() -> None:
    """CLI entry point: set up logging, parse arguments and dispatch the sub-command.

    Logging is initialised from ``search_evaluation.eval_log_dir`` before the
    arguments are parsed. Config-backed defaults are applied to the namespace,
    a start line is logged, and the matching ``run_*`` handler is invoked.

    Raises:
        SystemExit: If ``args.command`` is not one of the known sub-commands.
    """
    se = get_app_config().search_evaluation
    log_file = setup_eval_logging(se.eval_log_dir)

    args = build_cli_parser().parse_args()
    _apply_search_evaluation_cli_defaults(args)

    logging.getLogger("search_eval").info(
        "CLI start command=%s tenant_id=%s log_file=%s",
        args.command,
        getattr(args, "tenant_id", ""),
        log_file.resolve(),
    )

    handlers = {
        "build": run_build,
        "batch": run_batch,
        "audit": run_audit,
        "serve": run_serve,
    }
    handler = handlers.get(args.command)
    if handler is None:
        raise SystemExit(f"unknown command: {args.command}")
    handler(args)
|