# utils.py (4.16 KB)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
"""Small helpers: time, JSON, document text, LLM output parsing."""
from __future__ import annotations
import hashlib
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Sequence, Tuple
from .constants import PROJECT_ROOT
def utc_now_iso() -> str:
    """Return the current time in UTC as an ISO 8601 string.

    The value is built from a timezone-aware ``datetime``, so the string
    carries an explicit UTC offset.
    """
    now = datetime.now(tz=timezone.utc)
    return now.isoformat()
def utc_timestamp() -> str:
    """Return the current UTC time as a compact ``YYYYMMDDTHHMMSSZ`` string."""
    now = datetime.now(tz=timezone.utc)
    layout = "%Y%m%dT%H%M%SZ"
    return now.strftime(layout)
def ensure_dir(path: Path) -> Path:
    """Create ``path`` as a directory if needed and return it.

    Missing parent directories are created as well, and an already existing
    directory is not treated as an error. The same ``path`` object is
    returned so the call can be used inline.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
def sha1_text(text: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``text`` encoded as UTF-8."""
    encoded = text.encode("utf-8")
    digest = hashlib.sha1(encoded)
    return digest.hexdigest()
def pick_text(value: Any, preferred_lang: str = "en") -> str:
    """Pick a display string out of a plain or per-language value.

    Args:
        value: ``None``, a dict keyed by language code, or any other object.
        preferred_lang: Language key tried first when ``value`` is a dict.

    Returns:
        ``""`` for ``None``. For a dict, the first usable entry found when
        trying ``preferred_lang``, then ``"en"``, then ``"zh"``, then the
        remaining values in dict order; ``""`` if none is usable. For any
        other object, ``str(value)`` stripped of surrounding whitespace.
    """
    if value is None:
        return ""
    if not isinstance(value, dict):
        return str(value).strip()

    prioritized = (value.get(preferred_lang), value.get("en"), value.get("zh"))
    for candidate in (*prioritized, *value.values()):
        if not candidate:
            continue
        text = str(candidate).strip()
        # A whitespace-only entry is truthy but carries no text; keep looking
        # so it does not hide a usable fallback in another language.
        if text:
            return text
    return ""
def zh_title_from_multilingual(title_multilingual: Any) -> str:
    """Return the stripped ``"zh"`` entry of a ``title_multilingual`` dict.

    Anything that is not a dict, or a dict whose ``"zh"`` entry is missing
    or falsy, yields an empty string.
    """
    if isinstance(title_multilingual, dict):
        raw = title_multilingual.get("zh") or ""
        return str(raw).strip()
    return ""
def safe_json_dumps(data: Any) -> str:
    """Serialize ``data`` to compact JSON text.

    Non-ASCII characters are written as-is rather than escaped, and no
    whitespace is emitted after the item and key separators.
    """
    compact_separators = (",", ":")
    return json.dumps(data, separators=compact_separators, ensure_ascii=False)
def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
    """Return the three option values of the first SKU as stripped strings.

    Only ``skus[0]`` is inspected. An empty sequence, a falsy first entry,
    or a missing/falsy option yields ``""`` in the corresponding slot.
    """
    if not skus:
        return "", "", ""
    head = skus[0] or {}
    cleaned = [
        str(head.get(key) or "").strip()
        for key in ("option1_value", "option2_value", "option3_value")
    ]
    return cleaned[0], cleaned[1], cleaned[2]
def build_display_title(doc: Dict[str, Any]) -> str:
    """Build a display title from ``doc["title"]``.

    The title is resolved twice through ``pick_text``, once preferring
    ``"en"`` and once preferring ``"zh"``. When both results are non-empty
    and differ they are joined as ``"<en> / <zh>"``; otherwise whichever one
    is non-empty is returned (the ``"en"`` result first).
    """
    raw_title = doc.get("title")
    english = pick_text(raw_title, "en")
    chinese = pick_text(raw_title, "zh")
    if not english:
        return chinese
    if not chinese or english == chinese:
        return english
    return f"{english} / {chinese}"
def build_rerank_doc(doc: Dict[str, Any]) -> str:
    """Return the display title of ``doc`` cut to at most 400 characters."""
    char_limit = 400
    display_title = build_display_title(doc)
    return display_title[:char_limit]
def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """Format one numbered line describing ``doc``.

    Args:
        idx: Number placed at the start of the line.
        doc: Product document; its ``"title"`` and ``"skus"`` entries are read.

    Returns:
        ``"<idx>. "`` followed by the display title and the first SKU's first
        two option values, separated by single spaces. Empty parts are
        skipped.
    """
    title = build_display_title(doc)
    # Only the first two option values appear in the line; the third option
    # is not part of the output.
    option1, option2, _ = compact_option_values(doc.get("skus") or [])
    parts = [title, option1, option2]
    return f"{idx}. " + " ".join(part for part in parts if part)
def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a product document to a small dict of display fields.

    The result holds ``spu_id`` (as a string, ``""`` when missing or falsy),
    the display ``title``, the raw ``image_url``, the ``vendor`` text, a
    ``category`` taken from ``category_path`` or else ``category_name``, and
    ``option_values`` as a three-item list from the first SKU.
    """
    payload: Dict[str, Any] = {}
    payload["spu_id"] = str(doc.get("spu_id") or "")
    payload["title"] = build_display_title(doc)
    payload["image_url"] = doc.get("image_url")
    payload["vendor"] = pick_text(doc.get("vendor"), "en")

    category = pick_text(doc.get("category_path"), "en")
    if not category:
        category = pick_text(doc.get("category_name"), "en")
    payload["category"] = category

    payload["option_values"] = list(compact_option_values(doc.get("skus") or []))
    return payload
def normalize_text(text: Any) -> str:
    """Return ``text`` lower-cased, trimmed, with whitespace runs collapsed.

    Falsy inputs (``None``, ``""``, ``0``...) normalize to an empty string;
    other values are converted with ``str`` first.
    """
    raw = str(text or "")
    lowered = raw.strip().lower()
    return re.sub(r"\s+", " ", lowered)
def extract_json_blob(text: str) -> Any:
    """Parse the first JSON value that can be recovered from ``text``.

    Attempts, in order:

    1. the whole stripped text;
    2. the contents of each triple-backtick fenced block (optionally tagged
       ``json``);
    3. a JSON array or object starting at each ``[`` or ``{`` in the text,
       scanning left to right.

    Args:
        text: Raw text that may contain JSON; falsy input is treated as ``""``.

    Returns:
        The decoded Python value of the first attempt that parses.

    Raises:
        ValueError: If none of the attempts yields valid JSON. The message
            includes the first 500 characters of the stripped text.
    """
    cleaned = str(text or "").strip()
    # json raises JSONDecodeError (a ValueError); pathologically deep nesting
    # can raise RecursionError. Both just mean "this candidate is not usable".
    parse_errors = (ValueError, RecursionError)

    candidates: List[str] = [cleaned]
    fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I)
    candidates.extend(match.strip() for match in fence_matches if match.strip())
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except parse_errors:
            continue

    # A JSON array/object that starts at a given bracket has exactly one
    # possible end, so decoding once from each opening bracket finds the same
    # value as trying every (start, end) pair, without the repeated re-parsing.
    decoder = json.JSONDecoder()
    for start, ch in enumerate(cleaned):
        if ch not in "[{":
            continue
        try:
            value, _ = decoder.raw_decode(cleaned, start)
        except parse_errors:
            continue
        return value
    raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")
def ensure_project_on_path() -> None:
    """Put ``PROJECT_ROOT`` at the front of ``sys.path`` if it is not listed.

    Does nothing when the path string is already present anywhere in
    ``sys.path``.
    """
    import sys

    root = str(PROJECT_ROOT)
    if root in sys.path:
        return
    sys.path.insert(0, root)