# cache_keys.py (2.34 KB)
"""Shared cache key helpers for embedding inputs.

Multimodal (CN-CLIP) raw keys include ``model_name`` so switching ViT-L / ViT-H does not reuse stale vectors.

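- Image: ``embed:{model_name}:txt:norm{0|1}:<url_or_digest>``
- Multimodal text (same vector space as /embed/image): ``embed:{model_name}:img:norm{0|1}:<text_or_digest>``

Note: the segment names are fixed by business convention — image keys use ``txt`` and
CN-CLIP text keys use ``img``.

TEI/BGE text (title_embedding etc.): ``embed:norm{0|1}:<text_or_digest>``

For an over-long URL/text (counted in Unicode code points, more than ``CACHE_KEY_RAW_BODY_MAX_CHARS``),
the trailing payload is replaced by ``h:sha256:<64 hex>`` so the Redis key does not get too long.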

`RedisEmbeddingCache` adds the configured key prefix and optional namespace on top.
"""

from __future__ import annotations

import hashlib

# Maximum length, in Unicode code points (Python ``len``), of the raw URL/text segment that is
# kept verbatim in a cache key; longer segments are replaced by their SHA-256 digest form.
# Used as the default threshold by ``_stable_body_for_cache_key``.
CACHE_KEY_RAW_BODY_MAX_CHARS = 256


def _stable_body_for_cache_key(body: str, *, max_chars: int | None = None) -> str:
    """
    Return a bounded-length, collision-safe form of ``body`` for use inside a cache key.

    ``body`` is returned unchanged when ``len(body) <= max_chars`` and it does not itself
    begin with the digest marker ``h:sha256:``. Otherwise the result is the fixed-length
    ``h:sha256:<64 hex>``, where the hex string is SHA-256 over the UTF-8 bytes of ``body``.

    Short literals that already start with the marker are hashed as well; this way a raw
    body can never be equal to the digest form produced for a different, longer body.

    Args:
        body: Raw URL or text segment.
        max_chars: Largest length (in code points) that is kept verbatim. ``None`` selects
            ``CACHE_KEY_RAW_BODY_MAX_CHARS``.

    Returns:
        ``body`` itself, or its ``h:sha256:`` digest form.
    """
    digest_prefix = "h:sha256:"
    if max_chars is None:
        max_chars = CACHE_KEY_RAW_BODY_MAX_CHARS
    # Pass-through is only safe for literals that cannot be mistaken for a digest key.
    if len(body) <= max_chars and not body.startswith(digest_prefix):
        return body
    digest = hashlib.sha256(body.encode("utf-8")).hexdigest()
    return f"{digest_prefix}{digest}"


def build_text_cache_key(text: str, *, normalize: bool) -> str:
    """Build the logical cache key for a TEI/BGE text embedding.

    A falsy ``text`` is treated as the empty string and surrounding whitespace is
    stripped. The cleaned text is then passed through ``_stable_body_for_cache_key``.

    Args:
        text: Input text to embed.
        normalize: Whether the embedding is normalized; encoded as ``norm1`` / ``norm0``.

    Returns:
        A key of the form ``embed:norm{0|1}:<text_or_digest>``.
    """
    norm_flag = 1 if normalize else 0
    cleaned = str(text or "").strip()
    body = _stable_body_for_cache_key(cleaned)
    return f"embed:norm{norm_flag}:{body}"


def build_image_cache_key(url: str, *, normalize: bool, model_name: str) -> str:
    """Build the logical cache key for a CN-CLIP image embedding.

    By business convention the segment name used for image keys is ``txt``.

    Args:
        url: Image URL; a falsy value is treated as the empty string and surrounding
            whitespace is stripped before ``_stable_body_for_cache_key`` is applied.
        normalize: Whether the embedding is normalized; encoded as ``norm1`` / ``norm0``.
        model_name: Model identifier placed in the key; an empty or whitespace-only
            value falls back to ``unknown``.

    Returns:
        A key of the form ``embed:{model_name}:txt:norm{0|1}:<url_or_digest>``.
    """
    body = _stable_body_for_cache_key(str(url or "").strip())
    model_segment = str(model_name or "").strip()
    if not model_segment:
        model_segment = "unknown"
    norm_flag = 1 if normalize else 0
    return f"embed:{model_segment}:txt:norm{norm_flag}:{body}"


def build_clip_text_cache_key(text: str, *, normalize: bool, model_name: str) -> str:
    """Build the logical cache key for a CN-CLIP text-tower embedding.

    These vectors share the embedding space with images; by business convention the
    segment name used for these keys is ``img``.

    Args:
        text: Input text; a falsy value is treated as the empty string and surrounding
            whitespace is stripped before ``_stable_body_for_cache_key`` is applied.
        normalize: Whether the embedding is normalized; encoded as ``norm1`` / ``norm0``.
        model_name: Model identifier placed in the key; an empty or whitespace-only
            value falls back to ``unknown``.

    Returns:
        A key of the form ``embed:{model_name}:img:norm{0|1}:<text_or_digest>``.
    """
    body = _stable_body_for_cache_key(str(text or "").strip())
    model_segment = str(model_name or "").strip()
    if not model_segment:
        model_segment = "unknown"
    norm_flag = 1 if normalize else 0
    return f"embed:{model_segment}:img:norm{norm_flag}:{body}"