# test_cache_keys.py 1.73 KB
"""Unit tests for embeddings/cache_keys.py (hashing long bodies)."""

import hashlib

from embeddings import cache_keys as ck


def test_stable_body_short_unchanged():
    """A body of exactly CACHE_KEY_RAW_BODY_MAX_CHARS characters is returned as-is."""
    body_at_limit = ck.CACHE_KEY_RAW_BODY_MAX_CHARS * "a"
    result = ck._stable_body_for_cache_key(body_at_limit)
    assert result == body_at_limit


def test_stable_body_long_hashes():
    """A body one character over the limit becomes ``h:sha256:<hexdigest>``."""
    prefix = "h:sha256:"
    oversized = "a" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1)
    expected_digest = hashlib.sha256(oversized.encode("utf-8")).hexdigest()

    result = ck._stable_body_for_cache_key(oversized)

    assert result == prefix + expected_digest
    assert result.startswith(prefix)
    # A SHA-256 hex digest is always 64 characters long.
    assert len(result) == len(prefix) + 64


def test_stable_body_utf8_counts_unicode_codepoints():
    """Check that the raw-body limit is measured in code points, not UTF-8 bytes.

    A two-character CJK string alone cannot tell the two measures apart,
    because it is far below the limit either way. The second case uses a
    body that sits exactly at the limit in code points while being three
    times over it in encoded bytes, so it only stays unhashed if the
    implementation compares code-point length.
    """
    limit = ck.CACHE_KEY_RAW_BODY_MAX_CHARS

    # 2 code points / 6 UTF-8 bytes: short under either measure.
    short = "你好"
    assert ck._stable_body_for_cache_key(short) == short

    # Exactly `limit` code points, but 3 * limit UTF-8 bytes.
    at_limit = "你" * limit
    # Guard against the case becoming vacuous (e.g. a non-positive limit).
    assert len(at_limit.encode("utf-8")) > limit
    assert ck._stable_body_for_cache_key(at_limit) == at_limit


def test_build_text_cache_key_uses_digest_when_long():
    """Over-limit text yields ``embed:norm1:h:sha256:<hexdigest>``."""
    # One character past the raw-body limit (257 chars if the limit is 256).
    oversized = "x" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1)
    digest = hashlib.sha256(oversized.encode("utf-8")).hexdigest()

    cache_key = ck.build_text_cache_key(oversized, normalize=True)

    assert cache_key.startswith("embed:norm1:h:sha256:")
    assert cache_key == f"embed:norm1:h:sha256:{digest}"


def test_build_image_cache_key_uses_digest_when_long():
    """An over-limit image URL is replaced by its SHA-256 digest in the key."""
    # The scheme/host prefix plus MAX filler characters pushes the URL past the limit.
    filler = "y" * ck.CACHE_KEY_RAW_BODY_MAX_CHARS
    long_url = "https://x.example/" + filler
    digest = hashlib.sha256(long_url.encode("utf-8")).hexdigest()

    cache_key = ck.build_image_cache_key(
        long_url,
        normalize=True,
        model_name="CN-CLIP/ViT-H-14",
    )

    # NOTE(review): the image key is expected to carry the ``txt`` tag here,
    # which looks surprising for an image URL — confirm against cache_keys.py.
    assert cache_key == f"embed:CN-CLIP/ViT-H-14:txt:norm1:h:sha256:{digest}"


def test_build_clip_text_cache_key_uses_digest_when_long():
    """Over-limit CLIP text is replaced by its SHA-256 digest in the key."""
    # Non-ASCII body, one character past the raw-body limit.
    long_text = "词" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1)
    digest = hashlib.sha256(long_text.encode("utf-8")).hexdigest()

    cache_key = ck.build_clip_text_cache_key(
        long_text,
        normalize=False,
        model_name="m",
    )

    # NOTE(review): the CLIP-text key is expected to carry the ``img`` tag
    # here, which looks surprising for text input — confirm against cache_keys.py.
    assert cache_key == f"embed:m:img:norm0:h:sha256:{digest}"