# scripts/evaluation/eval_framework/utils.py

"""Small helpers: time, JSON, document text, LLM output parsing."""
from __future__ import annotations
import hashlib
import json
import re
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Sequence, Tuple
from .constants import PROJECT_ROOT
# Character cap applied to each SKU option value embedded in a label doc line.
_LABEL_OPTION_MAX_CHARS = 40
# Character cap applied to a complete label doc line (index prefix included).
_LABEL_DOC_LINE_MAX_CHARS = 260
def _truncate_text(value: Any, max_chars: int) -> str:
    """Stringify *value*, strip it, and shorten it to at most *max_chars*.

    Falsy values become the empty string. When the text has to be cut and
    the budget is larger than three characters, the result ends in ``...``
    (trailing whitespace before the ellipsis is dropped); with a budget of
    three or fewer characters the text is simply cut.
    """
    cleaned = str(value or "").strip()
    if max_chars <= 0:
        return ""
    needs_cut = len(cleaned) > max_chars
    if not needs_cut:
        return cleaned
    # No room for an ellipsis: a hard cut is all we can do.
    if max_chars <= 3:
        return cleaned[:max_chars]
    head = cleaned[: max_chars - 3].rstrip()
    return f"{head}..."
def utc_now_iso() -> str:
    """Return the current UTC time as a timezone-aware ISO 8601 string."""
    now = datetime.now(timezone.utc)
    return now.isoformat()
def utc_timestamp() -> str:
    """Return the current UTC time as a compact ``YYYYMMDDTHHMMSSZ`` stamp."""
    now = datetime.now(timezone.utc)
    return now.strftime("%Y%m%dT%H%M%SZ")
def ensure_dir(path: Path) -> Path:
    """Create *path* as a directory (with missing parents) and return it.

    An already existing directory is left untouched.
    """
    path.mkdir(exist_ok=True, parents=True)
    return path
def sha1_text(text: str) -> str:
    """Return the hexadecimal SHA-1 digest of *text* encoded as UTF-8."""
    encoded = text.encode("utf-8")
    digest = hashlib.sha1(encoded)
    return digest.hexdigest()
def pick_text(value: Any, preferred_lang: str = "en") -> str:
    """Return a stripped string from a plain value or a language-keyed dict.

    For a dict, the first truthy entry is used, looked up in this order:
    ``preferred_lang``, ``"en"``, ``"zh"``, then any remaining truthy value.
    ``None`` and dicts without a truthy value yield the empty string.
    """
    if value is None:
        return ""
    if not isinstance(value, dict):
        return str(value).strip()
    for lang in (preferred_lang, "en", "zh"):
        candidate = value.get(lang)
        if candidate:
            return str(candidate).strip()
    # None of the known language keys had content: take whatever is there.
    fallback = next((item for item in value.values() if item), "")
    return str(fallback).strip()
def zh_title_from_multilingual(title_multilingual: Any) -> str:
    """Chinese title string from API debug ``title_multilingual`` (ES-style dict).

    Returns the stripped ``"zh"`` entry, or an empty string when the input
    is not a dict or the entry is missing or falsy.
    """
    if isinstance(title_multilingual, dict):
        return str(title_multilingual.get("zh") or "").strip()
    return ""
def safe_json_dumps(data: Any) -> str:
    """Serialize *data* to compact JSON, keeping non-ASCII characters as-is."""
    compact_separators = (",", ":")
    return json.dumps(data, separators=compact_separators, ensure_ascii=False)
def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
    """Return the three stripped option values of the first SKU.

    Missing or falsy option values, an empty *skus* sequence, or a falsy
    first SKU all produce empty strings in the corresponding positions.
    """
    if not skus:
        return "", "", ""
    first_sku = skus[0] or {}
    option_keys = ("option1_value", "option2_value", "option3_value")
    opt1, opt2, opt3 = (str(first_sku.get(key) or "").strip() for key in option_keys)
    return opt1, opt2, opt3
def build_display_title(doc: Dict[str, Any]) -> str:
    """Return the document's display title, trying English before Chinese."""
    raw_title = doc.get("title")
    english = pick_text(raw_title, "en")
    if english:
        return english
    return pick_text(raw_title, "zh")
def build_rerank_doc(doc: Dict[str, Any]) -> str:
    """Return the display title of *doc*, capped at 400 characters."""
    return build_display_title(doc)[:400]
def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
    """Build one numbered line describing *doc*: ``"<idx>. <title> <options>"``.

    The line consists of the display title followed by the non-empty option
    values of the first SKU, each option capped at
    ``_LABEL_OPTION_MAX_CHARS``; the complete line is capped at
    ``_LABEL_DOC_LINE_MAX_CHARS``.
    """
    segments = [build_display_title(doc)]
    segments.extend(
        _truncate_text(option, _LABEL_OPTION_MAX_CHARS)
        for option in compact_option_values(doc.get("skus") or [])
        if option
    )
    # Drop empty segments (e.g. a missing title) so no stray spaces appear.
    body = " ".join(segment for segment in segments if segment)
    return _truncate_text(f"{idx}. {body}", _LABEL_DOC_LINE_MAX_CHARS)
def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Reduce a product document to a small dict of display fields.

    The result carries ``spu_id`` (as a string), ``title``, ``image_url``
    (passed through unchanged), ``vendor``, ``category`` (``category_path``
    with ``category_name`` as fallback) and ``option_values`` (the first
    SKU's three option values as a list).
    """
    spu_id = str(doc.get("spu_id") or "")
    title = build_display_title(doc)
    vendor = pick_text(doc.get("vendor"), "en")
    category = pick_text(doc.get("category_path"), "en")
    if not category:
        category = pick_text(doc.get("category_name"), "en")
    option_values = list(compact_option_values(doc.get("skus") or []))
    return {
        "spu_id": spu_id,
        "title": title,
        "image_url": doc.get("image_url"),
        "vendor": vendor,
        "category": category,
        "option_values": option_values,
    }
def normalize_text(text: Any) -> str:
    """Lower-case *text*, trim it, and collapse whitespace runs to one space.

    Falsy input yields the empty string.
    """
    lowered = str(text or "").strip().lower()
    return re.sub(r"\s+", " ", lowered)
def extract_json_blob(text: str) -> Any:
    """Recover a JSON value from free-form text such as LLM output.

    Parsing is attempted in this order, returning the first success:

    1. the whole stripped text;
    2. the content of each triple-backtick fenced block (optionally tagged
       ``json``), in order of appearance;
    3. the JSON array/object that begins at each ``[`` or ``{``, scanning
       the text from left to right.

    Args:
        text: Raw text that is expected to contain JSON somewhere.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If no JSON value can be recovered; the message includes
            the first 500 characters of the stripped text.
    """
    cleaned = str(text or "").strip()
    candidates: List[str] = [cleaned]
    fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I)
    candidates.extend(match.strip() for match in fence_matches if match.strip())
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            # Not valid JSON on its own (or nested too deeply); try the next one.
            continue
    # Fallback: decode the value starting at each opening bracket. A JSON
    # array/object that starts at a given position has exactly one possible
    # end, so a single raw_decode per start replaces trying every
    # (start, end) pair and keeps the scan from blowing up on noisy text.
    decoder = json.JSONDecoder()
    for start, ch in enumerate(cleaned):
        if ch not in "[{":
            continue
        try:
            value, _ = decoder.raw_decode(cleaned[start:])
        except (ValueError, RecursionError):
            continue
        return value
    raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")
def ensure_project_on_path() -> None:
    """Put ``PROJECT_ROOT`` at the front of ``sys.path`` unless already listed."""
    import sys

    if str(PROJECT_ROOT) in sys.path:
        return
    sys.path.insert(0, str(PROJECT_ROOT))