"""Shared cache key helpers for embedding inputs. Multimodal (CN-CLIP) raw keys include ``model_name`` so switching ViT-L / ViT-H does not reuse stale vectors. - 图片:``embed:{model_name}:txt:norm{0|1}:`` - 多模态文本(与 /embed/image 同空间):``embed:{model_name}:img:norm{0|1}:`` TEI/BGE 文本(title_embedding 等):``embed:norm{0|1}:`` 超长 URL/文本(按 Unicode 码点计数,超过 ``CACHE_KEY_RAW_BODY_MAX_CHARS``)时,尾部负载改为 ``h:sha256:<64 hex>``,避免 Redis key 过长。 `RedisEmbeddingCache` adds the configured key prefix and optional namespace on top. """ from __future__ import annotations import hashlib # Max length (Unicode codepoints) of the raw URL/text segment before switching to SHA256 digest form. CACHE_KEY_RAW_BODY_MAX_CHARS = 256 def _stable_body_for_cache_key(body: str, *, max_chars: int | None = None) -> str: """ Return ``body`` unchanged when ``len(body) <= max_chars``; otherwise a fixed-length digest key. Hash is SHA-256 over UTF-8 bytes of ``body``; prefix ``h:sha256:`` avoids collision with literals. """ if max_chars is None: max_chars = CACHE_KEY_RAW_BODY_MAX_CHARS if len(body) <= max_chars: return body digest = hashlib.sha256(body.encode("utf-8")).hexdigest() return f"h:sha256:{digest}" def build_text_cache_key(text: str, *, normalize: bool) -> str: normalized_text = str(text or "").strip() payload = _stable_body_for_cache_key(normalized_text) return f"embed:norm{1 if normalize else 0}:{payload}" def build_image_cache_key(url: str, *, normalize: bool, model_name: str) -> str: """CN-CLIP 图片向量缓存逻辑键(业务约定段名为 txt)。""" normalized_url = str(url or "").strip() payload = _stable_body_for_cache_key(normalized_url) m = str(model_name or "").strip() or "unknown" return f"embed:{m}:txt:norm{1 if normalize else 0}:{payload}" def build_clip_text_cache_key(text: str, *, normalize: bool, model_name: str) -> str: """CN-CLIP 文本塔缓存逻辑键(与图同空间;业务约定段名为 img)。""" normalized_text = str(text or "").strip() payload = _stable_body_for_cache_key(normalized_text) m = str(model_name or "").strip() or "unknown" return f"embed:{m}:img:norm{1 if normalize else 0}:{payload}"