"""Small helpers: time, JSON, document text, LLM output parsing.""" from __future__ import annotations import hashlib import json import re from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, List, Sequence, Tuple from .constants import PROJECT_ROOT def utc_now_iso() -> str: return datetime.now(timezone.utc).isoformat() def utc_timestamp() -> str: return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ") def ensure_dir(path: Path) -> Path: path.mkdir(parents=True, exist_ok=True) return path def sha1_text(text: str) -> str: return hashlib.sha1(text.encode("utf-8")).hexdigest() def pick_text(value: Any, preferred_lang: str = "en") -> str: if value is None: return "" if isinstance(value, dict): return str( value.get(preferred_lang) or value.get("en") or value.get("zh") or next((v for v in value.values() if v), "") ).strip() return str(value).strip() def safe_json_dumps(data: Any) -> str: return json.dumps(data, ensure_ascii=False, separators=(",", ":")) def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]: if not skus: return "", "", "" first = skus[0] or {} return ( str(first.get("option1_value") or "").strip(), str(first.get("option2_value") or "").strip(), str(first.get("option3_value") or "").strip(), ) def build_display_title(doc: Dict[str, Any]) -> str: title = doc.get("title") en = pick_text(title, "en") zh = pick_text(title, "zh") if en and zh and en != zh: return f"{en} / {zh}" return en or zh def build_rerank_doc(doc: Dict[str, Any]) -> str: title = build_display_title(doc) return title[:400] def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str: title = build_display_title(doc) option1, option2, option3 = compact_option_values(doc.get("skus") or []) vendor = pick_text(doc.get("vendor"), "en") category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en") tags = doc.get("tags") or [] tags_text = ", ".join(str(tag) for tag in tags[:4] if tag) parts = [title] if option1: parts.append(f"option1={option1}") if option2: parts.append(f"option2={option2}") if option3: parts.append(f"option3={option3}") if vendor: parts.append(f"vendor={vendor}") if category: parts.append(f"category={category}") if tags_text: parts.append(f"tags={tags_text}") return f"{idx}. " + " | ".join(part for part in parts if part) def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]: return { "spu_id": str(doc.get("spu_id") or ""), "title": build_display_title(doc), "image_url": doc.get("image_url"), "vendor": pick_text(doc.get("vendor"), "en"), "category": pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en"), "option_values": list(compact_option_values(doc.get("skus") or [])), "tags": list((doc.get("tags") or [])[:6]), } def normalize_text(text: Any) -> str: value = str(text or "").strip().lower() value = re.sub(r"\s+", " ", value) return value def extract_json_blob(text: str) -> Any: cleaned = str(text or "").strip() candidates: List[str] = [cleaned] fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I) candidates.extend(match.strip() for match in fence_matches if match.strip()) for candidate in candidates: try: return json.loads(candidate) except Exception: pass starts = [idx for idx, ch in enumerate(cleaned) if ch in "[{"] ends = [idx for idx, ch in enumerate(cleaned) if ch in "]}"] for start in starts: for end in reversed(ends): if end <= start: continue fragment = cleaned[start : end + 1] try: return json.loads(fragment) except Exception: continue raise ValueError(f"failed to parse json from: {cleaned[:500]!r}") def ensure_project_on_path() -> None: import sys if str(PROJECT_ROOT) not in sys.path: sys.path.insert(0, str(PROJECT_ROOT))