Blame view

utils/cache.py 4.79 KB
be52af70   tangwang   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
  """
  Cache utility for storing embedding results.
  """
  
  import json
  import hashlib
  import pickle
  from pathlib import Path
  from typing import Any, Optional
  import numpy as np
  
  
  class EmbeddingCache:
      """
      Simple file-based cache for embeddings.

      Each embedding is stored as ``<md5-of-input>.npy`` inside ``cache_dir``.
      The MD5 digest of the input text/URL is used only to derive a
      filesystem-safe file name, not for any security purpose.
      """

      def __init__(self, cache_dir: str = ".cache/embeddings"):
          """
          Create the cache, making ``cache_dir`` (and its parents) if needed.

          Args:
              cache_dir: Directory that holds the ``.npy`` cache files
          """
          self.cache_dir = Path(cache_dir)
          self.cache_dir.mkdir(parents=True, exist_ok=True)

      def _get_cache_key(self, input_str: str) -> str:
          """Generate cache key (hex MD5 of the UTF-8 encoded input string)."""
          return hashlib.md5(input_str.encode('utf-8')).hexdigest()

      def _cache_path(self, input_str: str) -> Path:
          """Return the ``.npy`` file path that backs ``input_str``."""
          return self.cache_dir / f"{self._get_cache_key(input_str)}.npy"

      def get(self, input_str: str) -> Optional[np.ndarray]:
          """
          Get cached embedding.

          Args:
              input_str: Input text or URL

          Returns:
              Cached embedding, or None if it is not cached or the cache file
              cannot be read (the read error is printed)
          """
          cache_file = self._cache_path(input_str)

          try:
              return np.load(cache_file)
          except FileNotFoundError:
              # Plain cache miss: nothing to report.
              return None
          except Exception as e:
              print(f"Failed to load cache for {input_str}: {e}")
              return None

      def set(self, input_str: str, embedding: np.ndarray) -> bool:
          """
          Store embedding in cache.

          The array is written to a temporary sibling file and then renamed
          over the final path, so a failed or interrupted write never leaves a
          truncated ``.npy`` file that ``exists()`` would report as cached.

          Args:
              input_str: Input text or URL
              embedding: Embedding vector

          Returns:
              True if successful, False if the embedding could not be written
              (the error is printed)
          """
          cache_file = self._cache_path(input_str)
          # The ".tmp" suffix keeps the partial file out of the "*.npy" globs
          # used by clear() and size().
          tmp_file = cache_file.with_name(cache_file.name + ".tmp")

          try:
              with open(tmp_file, "wb") as f:
                  # allow_pickle=False mirrors np.load's default in get():
                  # object arrays are rejected here instead of being written
                  # as pickles that could never be loaded back.
                  np.save(f, embedding, allow_pickle=False)
              tmp_file.replace(cache_file)
              return True
          except Exception as e:
              print(f"Failed to cache embedding for {input_str}: {e}")
              try:
                  tmp_file.unlink()
              except OSError:
                  # Temp file was never created or is already gone.
                  pass
              return False

      def exists(self, input_str: str) -> bool:
          """Check if embedding is cached."""
          return self._cache_path(input_str).exists()

      def clear(self) -> int:
          """
          Clear all cached embeddings.

          Returns:
              Number of files deleted
          """
          count = 0
          for cache_file in self.cache_dir.glob("*.npy"):
              cache_file.unlink()
              count += 1
          return count

      def size(self) -> int:
          """Get number of cached embeddings."""
          return sum(1 for _ in self.cache_dir.glob("*.npy"))
  
  
  class DictCache:
      """
      Simple dictionary-based cache for query rewrite rules, translations, etc.

      Entries live in a two-level mapping ``{category: {key: value}}`` that is
      held in memory (``self.cache``) and rewritten to a single JSON file on
      every mutation.
      """

      def __init__(self, cache_file: str = ".cache/dict_cache.json"):
          """
          Open the cache, creating the parent directory and loading any
          existing file.

          Args:
              cache_file: Path of the JSON file that persists the cache
          """
          self.cache_file = Path(cache_file)
          self.cache_file.parent.mkdir(parents=True, exist_ok=True)
          self.cache = self._load()

      def _load(self) -> dict:
          """
          Load cache from file.

          Returns:
              The persisted mapping, or an empty dict when the file is missing,
              unreadable, or does not contain a JSON object at the top level
          """
          if not self.cache_file.exists():
              return {}

          try:
              with open(self.cache_file, 'r', encoding='utf-8') as f:
                  data = json.load(f)
          except Exception as e:
              print(f"Failed to load cache: {e}")
              return {}

          # get()/set() index the cache as a dict; any other JSON document
          # (list, string, number) would make them raise later on.
          if not isinstance(data, dict):
              print(
                  "Failed to load cache: expected a JSON object, "
                  f"got {type(data).__name__}"
              )
              return {}
          return data

      def _save(self) -> bool:
          """
          Save cache to file.

          The cache is serialized to a string first and written to a temporary
          sibling file that is then renamed over the real one, so a
          serialization or write error leaves the previously saved file intact.

          Returns:
              True if successful, False otherwise (the error is printed)
          """
          tmp_file = self.cache_file.with_name(self.cache_file.name + ".tmp")
          try:
              # Serialize before touching the disk: opening the target with
              # 'w' would truncate it even if encoding fails afterwards.
              payload = json.dumps(self.cache, ensure_ascii=False, indent=2)
              with open(tmp_file, 'w', encoding='utf-8') as f:
                  f.write(payload)
              tmp_file.replace(self.cache_file)
              return True
          except Exception as e:
              print(f"Failed to save cache: {e}")
              try:
                  tmp_file.unlink()
              except OSError:
                  # Temp file was never created or is already gone.
                  pass
              return False

      def get(self, key: str, category: str = "default") -> Optional[Any]:
          """
          Get cached value.

          Args:
              key: Cache key
              category: Cache category (for organizing different types of data)

          Returns:
              Cached value or None
          """
          return self.cache.get(category, {}).get(key)

      def set(self, key: str, value: Any, category: str = "default") -> bool:
          """
          Store value in cache.

          An entry that cannot be encoded as JSON is rejected before the
          in-memory cache is modified; otherwise it would make every later
          save fail.

          Args:
              key: Cache key
              value: Value to cache
              category: Cache category

          Returns:
              True if successful
          """
          try:
              json.dumps({key: value}, ensure_ascii=False)
          except Exception as e:
              print(f"Failed to save cache: {e}")
              return False

          self.cache.setdefault(category, {})[key] = value
          return self._save()

      def exists(self, key: str, category: str = "default") -> bool:
          """Check if key exists in cache."""
          return category in self.cache and key in self.cache[category]

      def clear(self, category: Optional[str] = None) -> bool:
          """
          Clear cache.

          Args:
              category: If specified, clear only this category. Otherwise clear all.

          Returns:
              True if successful
          """
          # Compare against None rather than truthiness so that the empty
          # string, a usable category name, does not wipe the whole cache.
          if category is None:
              self.cache = {}
          else:
              self.cache.pop(category, None)
          return self._save()