test_cache_keys.py
1.73 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
"""Unit tests for embeddings/cache_keys.py (hashing long bodies)."""
import hashlib
from embeddings import cache_keys as ck
def test_stable_body_short_unchanged():
    """A body whose length equals the raw-body limit is returned verbatim."""
    at_limit = "a" * ck.CACHE_KEY_RAW_BODY_MAX_CHARS
    result = ck._stable_body_for_cache_key(at_limit)
    assert result == at_limit
def test_stable_body_long_hashes():
    """A body one character over the limit becomes a prefixed SHA-256 digest."""
    prefix = "h:sha256:"
    over_limit = "a" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1)
    expected_digest = hashlib.sha256(over_limit.encode("utf-8")).hexdigest()

    result = ck._stable_body_for_cache_key(over_limit)

    assert result == prefix + expected_digest
    assert result.startswith(prefix)
    # A SHA-256 hex digest is always 64 characters long.
    assert len(result) == len(prefix) + 64
def test_stable_body_utf8_counts_unicode_codepoints():
    """A two-character CJK string is expected to pass through unchanged."""
    # Two code points that encode to six UTF-8 bytes: the test expects the
    # helper to treat this as a short body and return it as-is.
    two_codepoints = "你好"
    result = ck._stable_body_for_cache_key(two_codepoints)
    assert result == two_codepoints
def test_build_text_cache_key_uses_digest_when_long():
    """Text cache keys for over-limit input embed the SHA-256 digest of the text."""
    # One character past the raw-body limit, so the digest form is expected.
    # NOTE(review): the earlier comment stated the default limit is 256 —
    # confirm against CACHE_KEY_RAW_BODY_MAX_CHARS in cache_keys.
    oversized = "x" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1)
    expected_digest = hashlib.sha256(oversized.encode("utf-8")).hexdigest()

    cache_key = ck.build_text_cache_key(oversized, normalize=True)

    assert cache_key.startswith("embed:norm1:h:sha256:")
    assert cache_key == "embed:norm1:h:sha256:" + expected_digest
def test_build_image_cache_key_uses_digest_when_long():
    """Image cache keys for an over-limit URL embed the SHA-256 digest of the URL."""
    # The scheme/host prefix is added on top of a limit-sized run of "y",
    # so the URL is strictly longer than the raw-body limit.
    long_url = "https://x.example/" + "y" * ck.CACHE_KEY_RAW_BODY_MAX_CHARS
    url_digest = hashlib.sha256(long_url.encode("utf-8")).hexdigest()

    cache_key = ck.build_image_cache_key(
        long_url,
        normalize=True,
        model_name="CN-CLIP/ViT-H-14",
    )

    # NOTE(review): the expected image key carries a "txt" segment — confirm
    # this matches build_image_cache_key and is intentional.
    assert cache_key == "embed:CN-CLIP/ViT-H-14:txt:norm1:h:sha256:" + url_digest
def test_build_clip_text_cache_key_uses_digest_when_long():
    """CLIP text cache keys for over-limit input embed the SHA-256 digest of the text."""
    # Non-ASCII input, one character past the raw-body limit; the digest is
    # taken over the UTF-8 encoding of the text.
    oversized = "词" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1)
    text_digest = hashlib.sha256(oversized.encode("utf-8")).hexdigest()

    cache_key = ck.build_clip_text_cache_key(
        oversized,
        normalize=False,
        model_name="m",
    )

    # NOTE(review): the expected CLIP text key carries an "img" segment —
    # confirm this matches build_clip_text_cache_key and is intentional.
    assert cache_key == "embed:m:img:norm0:h:sha256:" + text_digest