Blame view

embeddings/cache_keys.py 2.34 KB
5bac9649   tangwang   文本 embedding 与图片 ...
1
2
  """Shared cache key helpers for embedding inputs.
  
5a01af3c   tangwang   多模态hashkey调整:1. 加...
3
4
5
6
7
8
9
10
11
  Multimodal (CN-CLIP) raw keys include ``model_name`` so switching ViT-L / ViT-H does not reuse stale vectors.
  
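  - Image: ``embed:{model_name}:txt:norm{0|1}:<url_or_digest>`` (the segment is named ``txt`` by business convention; this is intentional, not a typo)
  - Multimodal text (same vector space as /embed/image): ``embed:{model_name}:img:norm{0|1}:<text_or_digest>`` (the segment is named ``img`` by business convention; this is intentional, not a typo)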
  
  TEI/BGE text (``title_embedding`` etc.): ``embed:norm{0|1}:<text_or_digest>``
  
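  When a URL/text is too long (counted in Unicode code points, i.e. more than ``CACHE_KEY_RAW_BODY_MAX_CHARS``),
  the trailing payload is replaced by ``h:sha256:<64 hex>`` so that the Redis key does not grow too long.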
5bac9649   tangwang   文本 embedding 与图片 ...
12
13
14
  
  `RedisEmbeddingCache` adds the configured key prefix and optional namespace on top.
  """
7214c2e7   tangwang   mplemented**
15
16
17
  
  from __future__ import annotations
  
5a01af3c   tangwang   多模态hashkey调整:1. 加...
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
  import hashlib
  
  # Max length (Unicode codepoints) of the raw URL/text segment before switching to SHA256 digest form.
  CACHE_KEY_RAW_BODY_MAX_CHARS = 256
  
  
  def _stable_body_for_cache_key(body: str, *, max_chars: int | None = None) -> str:
      """
      Return ``body`` unchanged when ``len(body) <= max_chars``; otherwise a fixed-length digest key.
  
      Hash is SHA-256 over UTF-8 bytes of ``body``; prefix ``h:sha256:`` avoids collision with literals.
      """
      if max_chars is None:
          max_chars = CACHE_KEY_RAW_BODY_MAX_CHARS
      if len(body) <= max_chars:
          return body
      digest = hashlib.sha256(body.encode("utf-8")).hexdigest()
      return f"h:sha256:{digest}"
  
7214c2e7   tangwang   mplemented**
37
38
39
  
  def build_text_cache_key(text: str, *, normalize: bool) -> str:
      """
      Build the logical cache key for a plain text embedding.

      The text is coerced to ``str`` (falsy values become ``""``) and stripped of
      surrounding whitespace; the result is passed through
      ``_stable_body_for_cache_key`` to obtain the payload segment.

      Returns:
          ``embed:norm{0|1}:<payload>`` where the flag is ``1`` when ``normalize``
          is truthy and ``0`` otherwise.
      """
      cleaned = str(text or "").strip()
      body = _stable_body_for_cache_key(cleaned)
      norm_flag = 1 if normalize else 0
      return f"embed:norm{norm_flag}:{body}"
7214c2e7   tangwang   mplemented**
42
43
  
  
5a01af3c   tangwang   多模态hashkey调整:1. 加...
44
45
  def build_image_cache_key(url: str, *, normalize: bool, model_name: str) -> str:
      """
      Build the logical cache key for a CN-CLIP image vector.

      By business convention the segment name used for images is ``txt``.

      The URL is coerced to ``str`` (falsy values become ``""``), stripped, and
      passed through ``_stable_body_for_cache_key``. A model name that is empty
      after stripping is replaced by ``"unknown"``.

      Returns:
          ``embed:{model}:txt:norm{0|1}:<payload>``.
      """
      cleaned_url = str(url or "").strip()
      body = _stable_body_for_cache_key(cleaned_url)
      model = str(model_name or "").strip()
      if not model:
          model = "unknown"
      norm_flag = 1 if normalize else 0
      return f"embed:{model}:txt:norm{norm_flag}:{body}"
7a013ca7   tangwang   多模态文本向量服务ok
50
51
  
  
5a01af3c   tangwang   多模态hashkey调整:1. 加...
52
53
  def build_clip_text_cache_key(text: str, *, normalize: bool, model_name: str) -> str:
      """
      Build the logical cache key for a CN-CLIP text-tower vector.

      These vectors share the image embedding space; by business convention the
      segment name used here is ``img``.

      The text is coerced to ``str`` (falsy values become ``""``), stripped, and
      passed through ``_stable_body_for_cache_key``. A model name that is empty
      after stripping is replaced by ``"unknown"``.

      Returns:
          ``embed:{model}:img:norm{0|1}:<payload>``.
      """
      cleaned_text = str(text or "").strip()
      body = _stable_body_for_cache_key(cleaned_text)
      model = str(model_name or "").strip()
      if not model:
          model = "unknown"
      norm_flag = 1 if normalize else 0
      return f"embed:{model}:img:norm{norm_flag}:{body}"