"""Unit tests for embeddings/cache_keys.py (hashing long bodies).""" import hashlib from embeddings import cache_keys as ck def test_stable_body_short_unchanged(): s = "a" * ck.CACHE_KEY_RAW_BODY_MAX_CHARS assert ck._stable_body_for_cache_key(s) == s def test_stable_body_long_hashes(): s = "a" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1) out = ck._stable_body_for_cache_key(s) assert out == "h:sha256:" + hashlib.sha256(s.encode("utf-8")).hexdigest() assert out.startswith("h:sha256:") assert len(out) == len("h:sha256:") + 64 def test_stable_body_utf8_counts_unicode_codepoints(): # 2 codepoints, not 6 bytes — still short s = "你好" assert ck._stable_body_for_cache_key(s) == s def test_build_text_cache_key_uses_digest_when_long(): # Default max 256: 257 'x' -> digest long_text = "x" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1) key = ck.build_text_cache_key(long_text, normalize=True) assert key.startswith("embed:norm1:h:sha256:") digest = hashlib.sha256(long_text.encode("utf-8")).hexdigest() assert key == f"embed:norm1:h:sha256:{digest}" def test_build_image_cache_key_uses_digest_when_long(): url = "https://x.example/" + "y" * ck.CACHE_KEY_RAW_BODY_MAX_CHARS key = ck.build_image_cache_key(url, normalize=True, model_name="CN-CLIP/ViT-H-14") digest = hashlib.sha256(url.encode("utf-8")).hexdigest() assert key == f"embed:CN-CLIP/ViT-H-14:txt:norm1:h:sha256:{digest}" def test_build_clip_text_cache_key_uses_digest_when_long(): t = "词" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1) key = ck.build_clip_text_cache_key(t, normalize=False, model_name="m") digest = hashlib.sha256(t.encode("utf-8")).hexdigest() assert key == f"embed:m:img:norm0:h:sha256:{digest}"