"""Small helpers: time, JSON, document text, LLM output parsing."""

from __future__ import annotations

import hashlib
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Sequence, Tuple

from .constants import PROJECT_ROOT

_LABEL_OPTION_MAX_CHARS = 40
_LABEL_DOC_LINE_MAX_CHARS = 260

# Compiled once at import time; the patterns themselves are unchanged.
_WHITESPACE_RUN_RE = re.compile(r"\s+")
_JSON_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)```", flags=re.S | re.I)

# Same configuration as the decoder ``json.loads`` uses by default.
_JSON_DECODER = json.JSONDecoder()

# ``json.JSONDecodeError`` is a ``ValueError``; ``RecursionError`` covers
# pathologically nested input. Anything else is a real bug and should surface.
_JSON_PARSE_ERRORS = (ValueError, RecursionError)


def _truncate_text(value: Any, max_chars: int) -> str:
    """Return ``value`` as a stripped string of at most ``max_chars`` characters.

    Falsy values become ``""``. When the text is too long it is cut and
    suffixed with ``"..."``; for budgets of three characters or fewer there is
    no room for the suffix, so the text is simply sliced.
    """
    text = str(value or "").strip()
    if max_chars <= 0:
        return ""
    if len(text) <= max_chars:
        return text
    if max_chars <= 3:
        return text[:max_chars]
    # Reserve three characters for the ellipsis and avoid "word ..." spacing.
    return text[: max_chars - 3].rstrip() + "..."


def utc_now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    return datetime.now(timezone.utc).isoformat()


def utc_timestamp() -> str:
    """Return the current UTC time formatted as ``YYYYmmddTHHMMSSZ``."""
    return datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")


def ensure_dir(path: Path) -> Path:
    """Create ``path`` (and missing parents) if needed and return it."""
    path.mkdir(parents=True, exist_ok=True)
    return path


def sha1_text(text: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``text`` encoded as UTF-8."""
    return hashlib.sha1(text.encode("utf-8")).hexdigest()


def pick_text(value: Any, preferred_lang: str = "en") -> str:
    """Pick a stripped display string out of a plain or per-language value.

    ``None`` yields ``""``. For a dict, the first truthy entry among
    ``preferred_lang``, ``"en"``, ``"zh"`` and then any remaining value is
    used (``""`` when none is truthy). Any other value is converted with
    ``str``.
    """
    if value is None:
        return ""
    if not isinstance(value, dict):
        return str(value).strip()
    chosen = (
        value.get(preferred_lang)
        or value.get("en")
        or value.get("zh")
        or next((v for v in value.values() if v), "")
    )
    return str(chosen).strip()


def zh_title_from_multilingual(title_multilingual: Any) -> str:
    """Chinese title string from API debug ``title_multilingual`` (ES-style dict).

    Returns ``""`` when the argument is not a dict or has no truthy ``"zh"``
    entry.
    """
    if not isinstance(title_multilingual, dict):
        return ""
    return str(title_multilingual.get("zh") or "").strip()


def safe_json_dumps(data: Any) -> str:
    """Serialize ``data`` to compact JSON without escaping non-ASCII text."""
    return json.dumps(data, ensure_ascii=False, separators=(",", ":"))


def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
    """Return the three stripped option values of the first SKU.

    Only ``skus[0]`` is inspected. Missing or falsy values become ``""``, and
    an empty ``skus`` yields three empty strings.
    """
    if not skus:
        return "", "", ""
    first = skus[0] or {}
    return (
        str(first.get("option1_value") or "").strip(),
        str(first.get("option2_value") or "").strip(),
        str(first.get("option3_value") or "").strip(),
    )


def build_display_title(doc: Dict[str, Any]) -> str:
    """Return the document title, trying English first and then Chinese."""
    title = doc.get("title")
    # The second lookup matters when the English pick strips down to "".
    return pick_text(title, "en") or pick_text(title, "zh")


def build_rerank_doc(doc: Dict[str, Any]) -> str:
    """Return the display title cut to its first 400 characters."""
    return build_display_title(doc)[:400]


def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """Build the ``"<idx>. <title> <options...>"`` line for one document.

    Each non-empty option value of the first SKU is truncated to
    ``_LABEL_OPTION_MAX_CHARS``; the finished line is truncated to
    ``_LABEL_DOC_LINE_MAX_CHARS``.
    """
    parts = [build_display_title(doc)]
    parts.extend(
        _truncate_text(option, _LABEL_OPTION_MAX_CHARS)
        for option in compact_option_values(doc.get("skus") or [])
        if option
    )
    line = " ".join(part for part in parts if part)
    return _truncate_text(f"{idx}. {line}", _LABEL_DOC_LINE_MAX_CHARS)


def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Return a reduced product dict built from selected fields of ``doc``.

    ``category`` prefers ``category_path`` and falls back to
    ``category_name``; ``image_url`` is passed through unchanged.
    """
    return {
        "spu_id": str(doc.get("spu_id") or ""),
        "title": build_display_title(doc),
        "image_url": doc.get("image_url"),
        "vendor": pick_text(doc.get("vendor"), "en"),
        "category": pick_text(doc.get("category_path"), "en")
        or pick_text(doc.get("category_name"), "en"),
        "option_values": list(compact_option_values(doc.get("skus") or [])),
    }


def normalize_text(text: Any) -> str:
    """Lower-case ``text``, strip it and collapse whitespace runs to one space."""
    value = str(text or "").strip().lower()
    return _WHITESPACE_RUN_RE.sub(" ", value)


def extract_json_blob(text: str) -> Any:
    """Parse the first JSON value that can be recovered from ``text``.

    Attempts, in order:

    1. the whole stripped text;
    2. the body of each triple-backtick fenced block (optionally tagged
       ``json``);
    3. the JSON array/object starting at each ``[`` or ``{``, scanning left
       to right.

    Returns:
        The decoded Python value of the first attempt that succeeds.

    Raises:
        ValueError: if no attempt yields valid JSON. The message includes the
            first 500 characters of the stripped text.
    """
    cleaned = str(text or "").strip()
    candidates: List[str] = [cleaned]
    candidates.extend(
        body.strip() for body in _JSON_FENCE_RE.findall(cleaned) if body.strip()
    )
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except _JSON_PARSE_ERRORS:
            continue

    # A fragment that starts at a bracket and ends at a closing bracket is
    # valid exactly when the single JSON value starting there ends at that
    # closing bracket, so one raw_decode per opening bracket finds the same
    # value as trying every (start, end) pair - without the quadratic number
    # of parse attempts.
    for start, char in enumerate(cleaned):
        if char not in "[{":
            continue
        try:
            value, _end = _JSON_DECODER.raw_decode(cleaned, start)
        except _JSON_PARSE_ERRORS:
            continue
        return value

    raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")


def ensure_project_on_path() -> None:
    """Put ``PROJECT_ROOT`` at the front of ``sys.path`` unless already listed."""
    import sys

    root = str(PROJECT_ROOT)
    if root not in sys.path:
        sys.path.insert(0, root)