Blame view

scripts/evaluation/eval_framework/utils.py 4.16 KB
c81b0fc1   tangwang   scripts/evaluatio...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
  """Small helpers: time, JSON, document text, LLM output parsing."""
  
  from __future__ import annotations
  
  import hashlib
  import json
  import re
  from datetime import datetime, timezone
  from pathlib import Path
  from typing import Any, Dict, List, Sequence, Tuple
  
  from .constants import PROJECT_ROOT
  
  
  def utc_now_iso() -> str:
      """Return the current UTC time as a timezone-aware ISO 8601 string."""
      now = datetime.now(tz=timezone.utc)
      return now.isoformat()
  
  
  def utc_timestamp() -> str:
      """Return the current UTC time as a compact ``YYYYMMDDTHHMMSSZ`` stamp."""
      stamp_format = "%Y%m%dT%H%M%SZ"
      return datetime.now(tz=timezone.utc).strftime(stamp_format)
  
  
  def ensure_dir(path: Path) -> Path:
      """Create ``path`` (and any missing parents) if needed and return it.

      An already-existing directory is not an error.
      """
      path.mkdir(exist_ok=True, parents=True)
      return path
  
  
  def sha1_text(text: str) -> str:
      """Return the hexadecimal SHA-1 digest of ``text`` encoded as UTF-8."""
      digest = hashlib.sha1()
      digest.update(text.encode("utf-8"))
      return digest.hexdigest()
  
  
  def pick_text(value: Any, preferred_lang: str = "en") -> str:
      """Reduce a plain or per-language value to a single stripped string.

      ``None`` yields ``""``. For a dict, the first truthy entry is chosen in
      this order: ``preferred_lang``, ``"en"``, ``"zh"``, then any other truthy
      value in the dict's iteration order (``""`` if there is none). Any other
      value is converted with ``str``.
      """
      if value is None:
          return ""
      if not isinstance(value, dict):
          return str(value).strip()

      for lang in (preferred_lang, "en", "zh"):
          text = value.get(lang)
          if text:
              return str(text).strip()

      # No preferred key had content: fall back to the first truthy value.
      fallback = next((item for item in value.values() if item), "")
      return str(fallback).strip()
  
  
167f33b4   tangwang   eval框架前端
45
46
47
48
49
50
51
52
  def zh_title_from_multilingual(title_multilingual: Any) -> str:
      """Return the stripped ``"zh"`` entry of a ``title_multilingual`` dict.

      Anything that is not a dict, or a dict whose ``"zh"`` entry is missing
      or falsy, yields ``""``.
      """
      if isinstance(title_multilingual, dict):
          return str(title_multilingual.get("zh") or "").strip()
      return ""
  
  
c81b0fc1   tangwang   scripts/evaluatio...
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
  def safe_json_dumps(data: Any) -> str:
      """Serialize ``data`` to compact JSON, keeping non-ASCII characters as-is."""
      compact_separators = (",", ":")
      return json.dumps(data, separators=compact_separators, ensure_ascii=False)
  
  
  def compact_option_values(skus: Sequence[Dict[str, Any]]) -> Tuple[str, str, str]:
      """Return the three option values of the first SKU as stripped strings.

      An empty ``skus`` sequence, a falsy first entry, or a missing/falsy
      option value each produce ``""`` in the corresponding slot.
      """
      if not skus:
          return "", "", ""

      sku = skus[0] or {}

      def _clean(key: str) -> str:
          return str(sku.get(key) or "").strip()

      return _clean("option1_value"), _clean("option2_value"), _clean("option3_value")
  
  
  def build_display_title(doc: Dict[str, Any]) -> str:
      """Build a display title from ``doc["title"]``.

      The title is resolved once with ``"en"`` preferred and once with ``"zh"``
      preferred (via ``pick_text``). When both are non-empty and differ, the
      result is ``"<en> / <zh>"``; otherwise whichever is non-empty is returned.
      """
      raw_title = doc.get("title")
      english = pick_text(raw_title, "en")
      chinese = pick_text(raw_title, "zh")
      if not english:
          return chinese
      if not chinese or english == chinese:
          return english
      return f"{english} / {chinese}"
  
  
  def build_rerank_doc(doc: Dict[str, Any]) -> str:
      """Return the document's display title truncated to 400 characters."""
      return build_display_title(doc)[:400]
  
  
  def build_label_doc_line(idx: int, doc: Dict[str, Any]) -> str:
      """Format one numbered line describing ``doc`` for a labeling prompt.

      The line is ``"<idx>. "`` followed by the display title and the first
      SKU's option1/option2 values, space-separated, with empty parts omitted.
      The third option value is not included in the line.
      """
      title = build_display_title(doc)
      option1, option2, _ = compact_option_values(doc.get("skus") or [])
      parts = [title, option1, option2]
      return f"{idx}. " + " ".join(part for part in parts if part)
c81b0fc1   tangwang   scripts/evaluatio...
93
94
95
96
97
98
99
100
101
  
  
  def compact_product_payload(doc: Dict[str, Any]) -> Dict[str, Any]:
      """Project ``doc`` onto a small dict of product fields.

      Keys produced: ``spu_id`` (string, ``""`` when missing/falsy), ``title``
      (display title), ``image_url`` (passed through unchanged), ``vendor``,
      ``category`` (``category_path`` text, falling back to ``category_name``)
      and ``option_values`` (list of the first SKU's three option values).
      """
      spu_id = str(doc.get("spu_id") or "")
      title = build_display_title(doc)
      image_url = doc.get("image_url")
      vendor = pick_text(doc.get("vendor"), "en")
      category = pick_text(doc.get("category_path"), "en")
      if not category:
          category = pick_text(doc.get("category_name"), "en")
      option_values = list(compact_option_values(doc.get("skus") or []))

      payload: Dict[str, Any] = {}
      payload["spu_id"] = spu_id
      payload["title"] = title
      payload["image_url"] = image_url
      payload["vendor"] = vendor
      payload["category"] = category
      payload["option_values"] = option_values
      return payload
  
  
  def normalize_text(text: Any) -> str:
      """Lower-case ``text``, trim it, and collapse whitespace runs to one space.

      Falsy input (``None``, ``""``, ``0``, ...) normalizes to ``""``.
      """
      lowered = str(text or "").strip().lower()
      return re.sub(r"\s+", " ", lowered)
  
  
  def extract_json_blob(text: str) -> Any:
      """Recover a JSON value from free-form text such as LLM output.

      Attempts, in order, and returns the first value that decodes:

      1. the whole stripped text;
      2. the stripped body of each triple-backtick fence (optionally tagged
         ``json``), in order of appearance;
      3. the JSON array/object that starts at the earliest ``[`` or ``{`` from
         which a complete value can be decoded.

      Raises:
          ValueError: if nothing decodes; the message carries the first 500
              characters of the cleaned text.
      """
      cleaned = str(text or "").strip()
      candidates: List[str] = [cleaned]
      fence_matches = re.findall(r"```(?:json)?\s*(.*?)```", cleaned, flags=re.S | re.I)
      candidates.extend(match.strip() for match in fence_matches if match.strip())

      for candidate in candidates:
          try:
              return json.loads(candidate)
          except (ValueError, RecursionError):
              # Not valid JSON (or nested too deeply to decode): try the next one.
              continue

      # A JSON array/object that begins at a given bracket has exactly one
      # possible end, so decoding once per start position finds the same value
      # as trying every (start, end) slice, without the repeated re-parsing.
      decoder = json.JSONDecoder()
      for start, ch in enumerate(cleaned):
          if ch not in "[{":
              continue
          try:
              value, _ = decoder.raw_decode(cleaned, start)
          except (ValueError, RecursionError):
              continue
          return value
      raise ValueError(f"failed to parse json from: {cleaned[:500]!r}")
  
  
  def ensure_project_on_path() -> None:
      """Put ``PROJECT_ROOT`` at the front of ``sys.path`` unless already listed."""
      import sys

      root = str(PROJECT_ROOT)
      if root in sys.path:
          return
      sys.path.insert(0, root)