c81b0fc1
tangwang
scripts/evaluatio...
|
1
2
3
4
5
6
7
8
9
10
11
12
13
|
"""Small helpers: time, JSON, document text, LLM output parsing."""
from __future__ import annotations
import hashlib
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Sequence, Tuple
from .constants import PROJECT_ROOT
|
42024409
tangwang
评估框架-批量打标
|
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
# Maximum characters kept for each SKU option value appended to a label doc line.
_LABEL_OPTION_MAX_CHARS = 40
# Maximum characters for a whole label doc line, including its "<idx>. " prefix.
_LABEL_DOC_LINE_MAX_CHARS = 260
def _truncate_text(value: Any, max_chars: int) -> str:
text = str(value or "").strip()
if max_chars <= 0:
return ""
if len(text) <= max_chars:
return text
if max_chars <= 3:
return text[:max_chars]
return text[: max_chars - 3].rstrip() + "..."
|
c81b0fc1
tangwang
scripts/evaluatio...
|
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
|
def utc_now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO-8601 string."""
    now = datetime.now(tz=timezone.utc)
    return now.isoformat()
def utc_timestamp() -> str:
    """Return the current UTC time as a compact ``YYYYMMDDTHHMMSSZ`` string."""
    now = datetime.now(tz=timezone.utc)
    return now.strftime("%Y%m%dT%H%M%SZ")
def ensure_dir(path: Path) -> Path:
    """Create ``path`` (and any missing parents) if needed and return it.

    An already existing directory is not an error.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
def sha1_text(text: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``text`` encoded as UTF-8."""
    encoded = text.encode("utf-8")
    digest = hashlib.sha1(encoded)
    return digest.hexdigest()
def pick_text(value: Any, preferred_lang: str = "en") -> str:
    """Pick a display string from a plain value or a per-language dict.

    Args:
        value: ``None``, a language-keyed dict (e.g. ``{"en": ..., "zh": ...}``),
            or any other value, which is simply stringified.
        preferred_lang: Language key tried first when ``value`` is a dict.

    Returns:
        The stripped text. For dicts, the first non-blank entry in the order
        ``preferred_lang``, ``"en"``, ``"zh"``, then any remaining value.
        Returns ``""`` when nothing usable is found.
    """
    if value is None:
        return ""
    if not isinstance(value, dict):
        return str(value).strip()
    candidates = [value.get(lang) for lang in (preferred_lang, "en", "zh")]
    candidates.extend(value.values())
    for candidate in candidates:
        if not candidate:
            continue
        text = str(candidate).strip()
        # A whitespace-only translation is truthy but useless; keep looking
        # so a later language can still provide the text.
        if text:
            return text
    return ""
|
167f33b4
tangwang
eval框架前端
|
59
60
61
62
63
64
65
66
|
def zh_title_from_multilingual(title_multilingual: Any) -> str:
    """Return the stripped ``"zh"`` entry of a ``title_multilingual`` dict.

    Anything that is not a dict, or a dict without a truthy ``"zh"`` value,
    yields an empty string.
    """
    if isinstance(title_multilingual, dict):
        raw = title_multilingual.get("zh") or ""
        return str(raw).strip()
    return ""
|
c81b0fc1
tangwang
scripts/evaluatio...
|
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
|
def safe_json_dumps(data: Any) -> str:
    """Serialize ``data`` to compact JSON, keeping non-ASCII characters as-is."""
    compact_separators = (",", ":")
    return json.dumps(data, separators=compact_separators, ensure_ascii=False)
def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
    """Return the three option values of the first SKU as stripped strings.

    Only ``skus[0]`` is inspected. Missing or falsy values, a falsy first
    SKU, and an empty ``skus`` sequence all produce empty strings.
    """
    if not skus:
        return "", "", ""
    lead = skus[0] or {}
    keys = ("option1_value", "option2_value", "option3_value")
    value1, value2, value3 = (str(lead.get(key) or "").strip() for key in keys)
    return value1, value2, value3
def build_display_title(doc: Dict[str, Any]) -> str:
    """Return the document title for display: English first, Chinese as fallback."""
    raw_title = doc.get("title")
    english = pick_text(raw_title, "en")
    if english:
        return english
    return pick_text(raw_title, "zh")
|
c81b0fc1
tangwang
scripts/evaluatio...
|
85
86
87
88
89
90
91
92
93
94
|
def build_rerank_doc(doc: Dict[str, Any]) -> str:
    """Return the text handed to the reranker: the display title, capped at 400 characters."""
    max_chars = 400
    return build_display_title(doc)[:max_chars]
def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """Build the one-line ``"<idx>. <title> <options...>"`` text for labeling.

    The title is followed by the non-empty option values of the first SKU,
    each truncated to ``_LABEL_OPTION_MAX_CHARS``. The assembled line,
    index prefix included, is truncated to ``_LABEL_DOC_LINE_MAX_CHARS``.
    """
    segments = [build_display_title(doc)]
    for option in compact_option_values(doc.get("skus") or []):
        if option:
            segments.append(_truncate_text(option, _LABEL_OPTION_MAX_CHARS))
    # Drop empty segments (e.g. a missing title) so no double spaces appear.
    body = " ".join(segment for segment in segments if segment)
    return _truncate_text(f"{idx}. {body}", _LABEL_DOC_LINE_MAX_CHARS)
|
c81b0fc1
tangwang
scripts/evaluatio...
|
104
105
106
107
108
109
110
111
112
|
def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a product document to a small payload dict.

    The result holds ``spu_id`` (as a string), the display ``title``,
    ``image_url`` (passed through unchanged), English-preferred ``vendor``
    and ``category`` text, and the first SKU's ``option_values`` as a list.
    """
    spu_id = str(doc.get("spu_id") or "")
    # Prefer the category path; fall back to the category name when it is empty.
    category = pick_text(doc.get("category_path"), "en")
    if not category:
        category = pick_text(doc.get("category_name"), "en")
    option_values = list(compact_option_values(doc.get("skus") or []))
    return {
        "spu_id": spu_id,
        "title": build_display_title(doc),
        "image_url": doc.get("image_url"),
        "vendor": pick_text(doc.get("vendor"), "en"),
        "category": category,
        "option_values": option_values,
    }
def normalize_text(text: Any) -> str:
    """Return ``text`` stripped, lower-cased, with whitespace runs collapsed to one space.

    Falsy input is treated as an empty string.
    """
    lowered = str(text or "").strip().lower()
    return re.sub(r"\s+", " ", lowered)
def extract_json_blob(text: str) -> Any:
    """Parse the first JSON value that can be recovered from LLM output.

    Attempts, in order:

    1. the whole stripped text;
    2. the body of each ``` / ```json fenced block;
    3. the earliest ``[`` or ``{`` from which a complete JSON value decodes.

    Args:
        text: Raw model output; falsy input is treated as an empty string.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If no JSON value can be recovered. The message carries
            the first 500 characters of the cleaned input.
    """
    cleaned = str(text or "").strip()
    candidates: List[str] = [cleaned]
    fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I)
    candidates.extend(match.strip() for match in fence_matches if match.strip())
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            # Not valid JSON on its own (JSONDecodeError is a ValueError);
            # fall through to the next, more lenient strategy.
            continue
    # A JSON value starting at a given bracket has exactly one possible end,
    # so decoding forward from each opening bracket finds the same value as
    # trying every (start, end) slice, with one attempt per start instead of
    # one per pair.
    decoder = json.JSONDecoder()
    for start, ch in enumerate(cleaned):
        if ch not in "[{":
            continue
        try:
            value, _end = decoder.raw_decode(cleaned, start)
        except (ValueError, RecursionError):
            continue
        return value
    raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")
def ensure_project_on_path() -> None:
    """Put ``PROJECT_ROOT`` at the front of ``sys.path`` unless it is already listed."""
    import sys

    root = str(PROJECT_ROOT)
    if root in sys.path:
        return
    sys.path.insert(0, root)
|