# utils.py (4.27 KB)
"""Small helpers: time, JSON, document text, LLM output parsing."""

from __future__ import annotations

import hashlib
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Sequence, Tuple

from .constants import PROJECT_ROOT


def utc_now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    now = datetime.now(tz=timezone.utc)
    return now.isoformat()


def utc_timestamp() -> str:
    """Return the current UTC time as a compact ``YYYYMMDDTHHMMSSZ`` stamp."""
    now = datetime.now(tz=timezone.utc)
    return now.strftime("%Y%m%dT%H%M%SZ")


def ensure_dir(path: Path) -> Path:
    """Create ``path`` as a directory, including missing parents, and return it.

    An already-existing directory is not an error; the same ``path`` object
    is handed back so the call can be used inline.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path


def sha1_text(text: str) -> str:
    """Return the hexadecimal SHA-1 digest of ``text`` encoded as UTF-8."""
    hasher = hashlib.sha1()
    hasher.update(text.encode("utf-8"))
    return hasher.hexdigest()


def pick_text(value: Any, preferred_lang: str = "en") -> str:
    """Return a stripped display string from a plain value or a per-language dict.

    Args:
        value: ``None``, a dict keyed by language code, or any other value
            (which is converted with ``str``).
        preferred_lang: Dict key tried first when ``value`` is a dict.

    Returns:
        ``""`` for ``None``. For a dict, the first usable entry in this
        order: ``preferred_lang``, ``"en"``, ``"zh"``, then the remaining
        values in dict order; ``""`` if none is usable. Otherwise
        ``str(value)`` with surrounding whitespace removed.
    """
    if value is None:
        return ""
    if not isinstance(value, dict):
        return str(value).strip()

    candidates = [
        value.get(preferred_lang),
        value.get("en"),
        value.get("zh"),
        *value.values(),
    ]
    for candidate in candidates:
        if not candidate:
            continue
        # Strip before accepting: a whitespace-only entry is truthy but
        # carries no text, so it must not shadow a usable fallback.
        text = str(candidate).strip()
        if text:
            return text
    return ""


def safe_json_dumps(data: Any) -> str:
    """Serialize ``data`` to compact JSON without escaping non-ASCII text.

    Separators carry no padding, so the output has no whitespace between
    items or after the key colon.
    """
    compact_separators = (",", ":")
    return json.dumps(data, separators=compact_separators, ensure_ascii=False)


def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
    """Return the three option values of the first SKU as stripped strings.

    Only ``skus[0]`` is consulted. An empty ``skus``, a falsy first entry,
    or a missing/falsy option value each yield ``""`` in that position.
    """
    head = (skus[0] or {}) if skus else {}
    option1, option2, option3 = (
        str(head.get(key) or "").strip()
        for key in ("option1_value", "option2_value", "option3_value")
    )
    return option1, option2, option3


def build_display_title(doc: Dict[str, Any]) -> str:
    """Build a display title from ``doc["title"]``.

    The title is resolved twice through ``pick_text``, once preferring
    ``"en"`` and once preferring ``"zh"``. When both results are non-empty
    and differ, they are joined as ``"<en> / <zh>"``; otherwise whichever
    one is non-empty is returned (``""`` if neither is).
    """
    raw_title = doc.get("title")
    en_text = pick_text(raw_title, "en")
    zh_text = pick_text(raw_title, "zh")
    if not en_text:
        return zh_text
    if not zh_text or en_text == zh_text:
        return en_text
    return f"{en_text} / {zh_text}"


def build_rerank_doc(doc: Dict[str, Any]) -> str:
    """Return the document's display title, cut to at most 400 characters."""
    max_chars = 400
    return build_display_title(doc)[:max_chars]


def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """Format one document as a numbered, pipe-separated summary line.

    The line starts with ``"<idx>. "`` and continues with the display title
    followed by ``key=value`` segments for option1..3, vendor, category and
    tags, joined by ``" | "``. Empty segments are left out. The category
    comes from ``category_path``, falling back to ``category_name``; only
    the first four tags are considered.
    """
    title = build_display_title(doc)
    option1, option2, option3 = compact_option_values(doc.get("skus") or [])
    vendor = pick_text(doc.get("vendor"), "en")
    category = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    leading_tags = (doc.get("tags") or [])[:4]
    tags_text = ", ".join(str(tag) for tag in leading_tags if tag)

    labelled_fields = (
        ("option1", option1),
        ("option2", option2),
        ("option3", option3),
        ("vendor", vendor),
        ("category", category),
        ("tags", tags_text),
    )
    segments = [title] if title else []
    segments.extend(f"{label}={text}" for label, text in labelled_fields if text)
    return f"{idx}. " + " | ".join(segments)


def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a product document to a small dict of display fields.

    Keys, in order: ``spu_id`` (as a string), ``title``, ``image_url``
    (passed through unchanged), ``vendor``, ``category`` (``category_path``
    with ``category_name`` as fallback), ``option_values`` (list of the
    first SKU's three option values) and ``tags`` (at most the first six).
    """
    payload: Dict[str, Any] = {"spu_id": str(doc.get("spu_id") or "")}
    payload["title"] = build_display_title(doc)
    payload["image_url"] = doc.get("image_url")
    payload["vendor"] = pick_text(doc.get("vendor"), "en")
    payload["category"] = pick_text(doc.get("category_path"), "en") or pick_text(doc.get("category_name"), "en")
    payload["option_values"] = list(compact_option_values(doc.get("skus") or []))
    leading_tags = (doc.get("tags") or [])[:6]
    payload["tags"] = list(leading_tags)
    return payload


def normalize_text(text: Any) -> str:
    """Lower-case ``text`` and collapse every whitespace run to one space.

    Falsy input (``None``, ``""``, ``0``, ...) becomes ``""``; other values
    are converted with ``str`` and stripped before normalization.
    """
    lowered = str(text or "").strip().lower()
    return re.sub(r"\s+", " ", lowered)


def extract_json_blob(text: str) -> Any:
    """Recover the first parseable JSON value from free-form text.

    Attempts, in order:

    1. the whole stripped text;
    2. the contents of each triple-backtick fenced block (an optional
       ``json`` tag after the opening fence is skipped);
    3. every ``[`` / ``{`` position from left to right, decoding the JSON
       value that starts there and ignoring whatever follows it.

    Args:
        text: Raw text to search; falsy input is treated as ``""``.

    Returns:
        The decoded value of the first attempt that succeeds.

    Raises:
        ValueError: If no attempt yields valid JSON. The message includes
            the first 500 characters of the stripped text.
    """
    cleaned = str(text or "").strip()
    # The JSON parser reports malformed input as JSONDecodeError (a
    # ValueError) and overly deep nesting as RecursionError. Either one
    # only means "this candidate is not it", so move on to the next.
    parse_errors = (ValueError, RecursionError)

    candidates: List[str] = [cleaned]
    fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I)
    candidates.extend(match.strip() for match in fence_matches if match.strip())
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except parse_errors:
            continue

    # A JSON array/object is self-delimiting, so one raw_decode per opening
    # bracket finds the same value as trying every closing bracket with a
    # full json.loads, without the quadratic number of parse attempts.
    decoder = json.JSONDecoder()
    for start, ch in enumerate(cleaned):
        if ch not in "[{":
            continue
        try:
            value, _end = decoder.raw_decode(cleaned[start:])
        except parse_errors:
            continue
        return value
    raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")


def ensure_project_on_path() -> None:
    """Put ``PROJECT_ROOT`` at the front of ``sys.path`` if it is not listed yet."""
    import sys

    root = str(PROJECT_ROOT)
    if root in sys.path:
        return
    sys.path.insert(0, root)