cache_keys.py
2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""Shared cache key helpers for embedding inputs.
Multimodal (CN-CLIP) raw keys include ``model_name`` so switching ViT-L / ViT-H does not reuse stale vectors.
- 图片:``embed:{model_name}:txt:norm{0|1}:<url_or_digest>``
- 多模态文本(与 /embed/image 同空间):``embed:{model_name}:img:norm{0|1}:<text_or_digest>``
TEI/BGE 文本(title_embedding 等):``embed:norm{0|1}:<text_or_digest>``
超长 URL/文本(按 Unicode 码点计数,超过 ``CACHE_KEY_RAW_BODY_MAX_CHARS``)时,尾部负载改为
``h:sha256:<64 hex>``,避免 Redis key 过长。
`RedisEmbeddingCache` adds the configured key prefix and optional namespace on top.
"""
from __future__ import annotations
import hashlib
# Maximum length, in Unicode code points (i.e. ``len(str)``), of the raw URL/text
# segment that may be embedded verbatim in a cache key. Longer segments are replaced
# by a fixed-length ``h:sha256:<64 hex>`` digest form. Used as the default
# ``max_chars`` of ``_stable_body_for_cache_key``.
CACHE_KEY_RAW_BODY_MAX_CHARS = 256
def _stable_body_for_cache_key(body: str, *, max_chars: int | None = None) -> str:
    """Return a bounded-length, collision-safe payload segment for a cache key.

    ``body`` is returned unchanged when it is at most ``max_chars`` code points
    long and does not start with the reserved ``h:sha256:`` marker. Otherwise
    the result is ``h:sha256:<64 hex>``, where the hex string is the SHA-256
    digest of the UTF-8 bytes of ``body``.

    Args:
        body: The already-normalized URL or text to embed in the key.
        max_chars: Maximum ``len(body)`` allowed in raw form; ``None`` means
            ``CACHE_KEY_RAW_BODY_MAX_CHARS``.

    Returns:
        Either ``body`` itself or its fixed-length digest form.
    """
    digest_prefix = "h:sha256:"
    if max_chars is None:
        max_chars = CACHE_KEY_RAW_BODY_MAX_CHARS
    # A short literal that already looks like ``h:sha256:<hex>`` must not pass
    # through verbatim: it would share a key with the long body whose digest it
    # spells out. Hashing every body that starts with the reserved marker keeps
    # the raw and digest key spaces disjoint.
    if len(body) <= max_chars and not body.startswith(digest_prefix):
        return body
    digest = hashlib.sha256(body.encode("utf-8")).hexdigest()
    return f"{digest_prefix}{digest}"
def build_text_cache_key(text: str, *, normalize: bool) -> str:
    """Build the logical cache key for a plain text embedding.

    The key has the form ``embed:norm{0|1}:<text_or_digest>``. ``text`` is
    coerced to ``str`` (falsy values become ``""``) and stripped of surrounding
    whitespace before being passed through ``_stable_body_for_cache_key``.

    Args:
        text: The input text to be embedded.
        normalize: Whether the embedding is normalized; encoded as ``norm1``
            or ``norm0`` in the key.

    Returns:
        The cache key string.
    """
    cleaned = str(text or "").strip()
    norm_flag = 1 if normalize else 0
    body = _stable_body_for_cache_key(cleaned)
    return ":".join(("embed", f"norm{norm_flag}", body))
def build_image_cache_key(url: str, *, normalize: bool, model_name: str) -> str:
    """Build the logical cache key for a CN-CLIP image embedding.

    The key has the form ``embed:{model_name}:txt:norm{0|1}:<url_or_digest>``.
    The ``txt`` segment name is the agreed business convention for image keys.

    Args:
        url: The image URL; coerced to ``str`` (falsy values become ``""``)
            and stripped of surrounding whitespace.
        normalize: Whether the embedding is normalized; encoded as ``norm1``
            or ``norm0`` in the key.
        model_name: Model identifier placed in the key; blank or falsy values
            fall back to ``"unknown"``.

    Returns:
        The cache key string.
    """
    model = str(model_name or "").strip()
    if not model:
        model = "unknown"
    norm_flag = 1 if normalize else 0
    body = _stable_body_for_cache_key(str(url or "").strip())
    return ":".join(("embed", model, "txt", f"norm{norm_flag}", body))
def build_clip_text_cache_key(text: str, *, normalize: bool, model_name: str) -> str:
    """Build the logical cache key for a CN-CLIP text-tower embedding.

    The key has the form ``embed:{model_name}:img:norm{0|1}:<text_or_digest>``.
    These vectors share the image embedding space; the ``img`` segment name is
    the agreed business convention for these text keys.

    Args:
        text: The input text; coerced to ``str`` (falsy values become ``""``)
            and stripped of surrounding whitespace.
        normalize: Whether the embedding is normalized; encoded as ``norm1``
            or ``norm0`` in the key.
        model_name: Model identifier placed in the key; blank or falsy values
            fall back to ``"unknown"``.

    Returns:
        The cache key string.
    """
    model = str(model_name or "").strip()
    if not model:
        model = "unknown"
    norm_flag = 1 if normalize else 0
    body = _stable_body_for_cache_key(str(text or "").strip())
    return ":".join(("embed", model, "img", f"norm{norm_flag}", body))