diff --git a/docs/常用查询 - sql.sql b/docs/常用查询 - sql.sql index 12aa787..4b74e69 100644 --- a/docs/常用查询 - sql.sql +++ b/docs/常用查询 - sql.sql @@ -552,3 +552,29 @@ WHERE tenant_id = 162 ORDER BY id LIMIT 50 OFFSET 0; -- 修改OFFSET查看不同页 + +-- ====================================== +-- 12. 查询店铺增量、全量相关数据 +-- ====================================== +1. 查看店铺配置 +```sql +cd /data/saas-search && MYSQL_PWD='<MYSQL_PASSWORD>' mysql -h 10.200.16.14 -P 3316 -u root saas -e "SELECT * FROM shoplazza_shop_config\G" +``` + +2. 查看增量、全量条数 +```sql +cd /data/saas-search && MYSQL_PWD='<MYSQL_PASSWORD>' mysql -h 10.200.16.14 -P 3316 -u root saas -e " +SELECT 'shoplazza_sync_log' AS table_name, COUNT(*) AS row_count FROM shoplazza_sync_log where tenant_id = 163 +UNION ALL +SELECT 'shoplazza_product_index_increment', COUNT(*) FROM shoplazza_product_index_increment where tenant_id = 163; +" +``` +-- ====================================== +-- 13. 重建索引 +-- ====================================== +删除下面两个表中 tenant_id=163 的所有行 +shoplazza_sync_log +shoplazza_product_index_increment + +然后触发重新安装: +https://47167113.myshoplaza.com/admin/oauth/redirect_from_partner_center?client_id=kqN5QTBARwPAEO_ThHi8mikyFC_4DLkwOOrzQsUL3L0 diff --git a/docs/缓存与Redis使用说明.md b/docs/缓存与Redis使用说明.md index 928cc16..b88868e 100644 --- a/docs/缓存与Redis使用说明.md +++ b/docs/缓存与Redis使用说明.md @@ -52,24 +52,27 @@ ### 2.1 Key 设计 - 统一 helper:`embeddings/cache_keys.py` -- 文本主 key:`build_text_cache_key(text, normalize=...)` -- 图片主 key:`build_image_cache_key(url, normalize=...)` +- 文本主 key(TEI/BGE):`build_text_cache_key(text, normalize=...)` +- 多模态图片(CN-CLIP `/embed/image`):`build_image_cache_key(url, normalize=..., model_name=...)`,其中 `model_name` 来自 `services.embedding.image_backends.*.model_name`(与 `embeddings.config.CONFIG.MULTIMODAL_MODEL_NAME` 一致) +- 多模态文本(CN-CLIP `/embed/clip_text`):`build_clip_text_cache_key(text, normalize=..., model_name=...)` - 模板: ```text -文本: {EMBEDDING_CACHE_PREFIX}:embed:norm{0|1}:{text} -图片: 
{EMBEDDING_CACHE_PREFIX}:image:embed:norm{0|1}:{url_or_path} +TEI 文本: {EMBEDDING_CACHE_PREFIX}:embed:norm{0|1}:{text} +CN-CLIP 图片: {EMBEDDING_CACHE_PREFIX}:image:embed:{model_name}:txt:norm{0|1}:{url_or_path} +CN-CLIP 文本塔: {EMBEDDING_CACHE_PREFIX}:clip_text:embed:{model_name}:img:norm{0|1}:{text} ``` - 字段说明: - `EMBEDDING_CACHE_PREFIX`:来自 `REDIS_CONFIG["embedding_cache_prefix"]`,默认值为 `"embedding"`,可通过环境变量 `REDIS_EMBEDDING_CACHE_PREFIX` 覆盖; + - `model_name`:如 `CN-CLIP/ViT-H-14`;切换模型时自动使用新 key 空间,避免混用旧维度向量; - `norm1` / `norm0`:分别表示 `normalize=true` / `normalize=false`; - - `text` / `url_or_path`:当前仍直接使用规范化后的原始输入,不做哈希。 + - `text` / `url_or_path`:经 `strip` 后,若 Unicode 长度 **≤** ``CACHE_KEY_RAW_BODY_MAX_CHARS``(默认 256,见 ``embeddings/cache_keys.py``)则原样写入键尾;**更长**则改为 ``h:sha256:<64 hex>``(对 UTF-8 字节做 SHA-256)。TEI 文本与多模态共用 ``_stable_body_for_cache_key``。 补充说明: -- 本次把 raw key 格式统一成 `embed:norm{0|1}:...`,比以 `norm:` 开头更清晰,也更接近历史命名习惯。 -- 当前实现**不再兼容历史 key 协议**,只保留这一套主 key 规则,以降低运行时复杂度和歧义。 +- TEI 文本 raw key 仍为 `embed:norm{0|1}:...`(尾部负载规则同上)。 +- 多模态键名中 `txt` / `img` 段为项目内约定(与 `embeddings/cache_keys.py` 一致),用于区分图片 lane 与 clip 文本 lane。 ### 2.2 Value 与类型 diff --git a/embeddings/README.md b/embeddings/README.md index 937038a..e7acb6c 100644 --- a/embeddings/README.md +++ b/embeddings/README.md @@ -51,11 +51,12 @@ - 现在是**双层缓存**: - client 侧:`text_encoder.py` / `image_encoder.py` - service 侧:`server.py` -- 当前主 key 格式: - - 文本(TEI):`embedding:embed:norm{0|1}:{text}` - - 图片:`embedding:image:embed:norm{0|1}:{url_or_path}` - - CN-CLIP 文本:`embedding:clip_text:clip_mm:norm{0|1}:{text}` -- 当前实现不再兼容历史 key 规则,只保留这一套格式,减少代码路径和缓存歧义。 +- 当前主 key 格式(`model_name` 见 `CONFIG.MULTIMODAL_MODEL_NAME`,与 `services.embedding.image_backends` 一致): + - 文本(TEI):`embedding:embed:norm{0|1}:{text_or_sha256_digest}` + - CN-CLIP 图片:`embedding:image:embed:{model_name}:txt:norm{0|1}:{url_or_sha256_digest}` + - CN-CLIP 文本塔:`embedding:clip_text:embed:{model_name}:img:norm{0|1}:{text_or_sha256_digest}` +- 尾部负载:长度 ≤ 
# Shared cache-key helpers for embedding inputs (embeddings/cache_keys.py).
#
# Raw key layouts produced here (the Redis cache layer adds the configured
# prefix and optional namespace on top, per the original module docstring):
#
#   TEI/BGE text      embed:norm{0|1}:{body}
#   CN-CLIP image     embed:{model_name}:txt:norm{0|1}:{body}
#   CN-CLIP text      embed:{model_name}:img:norm{0|1}:{body}
#
# ``{body}`` is the stripped input when it is short, otherwise
# ``h:sha256:<64 hex>`` so that very long URLs/texts do not yield huge keys.
# Multimodal keys embed ``model_name`` so that switching the model moves to a
# fresh key space instead of reusing vectors cached for the previous model.

import hashlib

# Longest raw URL/text segment (counted in Unicode code points, i.e. ``len(str)``)
# that is written verbatim into a key; longer bodies use the digest form.
CACHE_KEY_RAW_BODY_MAX_CHARS = 256

# Marker that introduces the digest form of a key body.
_DIGEST_PREFIX = "h:sha256:"


def _stable_body_for_cache_key(body: str, *, max_chars: int | None = None) -> str:
    """Return the key-safe form of ``body``.

    ``body`` is returned unchanged when ``len(body) <= max_chars`` and it does
    not itself start with ``h:sha256:``. Otherwise the result is
    ``h:sha256:<64 hex>``, the SHA-256 of the UTF-8 bytes of ``body``.

    Args:
        body: Already-normalised URL or text.
        max_chars: Verbatim length limit in code points; ``None`` selects
            ``CACHE_KEY_RAW_BODY_MAX_CHARS``.

    Returns:
        Either ``body`` itself or its fixed-length digest form.
    """
    if max_chars is None:
        max_chars = CACHE_KEY_RAW_BODY_MAX_CHARS
    # A short literal that already looks like a digest would otherwise be
    # emitted verbatim and could collide with the digest of a different, long
    # body. Hashing it too keeps the "h:sha256:" space reserved for digests.
    if len(body) <= max_chars and not body.startswith(_DIGEST_PREFIX):
        return body
    # "surrogatepass" keeps the digest identical for well-formed text while
    # avoiding UnicodeEncodeError on lone surrogates (e.g. truncated emoji).
    raw = body.encode("utf-8", "surrogatepass")
    return f"{_DIGEST_PREFIX}{hashlib.sha256(raw).hexdigest()}"


def _norm_segment(normalize: bool) -> str:
    """Return ``norm1`` for normalised vectors and ``norm0`` otherwise."""
    return f"norm{1 if normalize else 0}"


def _model_segment(model_name: str) -> str:
    """Return the stripped model name, or ``unknown`` when it is empty/None."""
    return str(model_name or "").strip() or "unknown"


def build_text_cache_key(text: str, *, normalize: bool) -> str:
    """Build the raw cache key for a TEI/BGE text embedding.

    Args:
        text: Input text; falsy values are treated as ``""`` and surrounding
            whitespace is stripped.
        normalize: Whether the cached vector is normalised.

    Returns:
        ``embed:norm{0|1}:{body}``.
    """
    payload = _stable_body_for_cache_key(str(text or "").strip())
    return f"embed:{_norm_segment(normalize)}:{payload}"


def build_image_cache_key(url: str, *, normalize: bool, model_name: str) -> str:
    """Build the raw cache key for a CN-CLIP image embedding.

    The literal segment is ``txt`` for this image lane; the original docstring
    describes this as the project's agreed segment name.
    NOTE(review): it reads as swapped with the clip-text lane's ``img``; do not
    rename it without migrating or expiring the keys already cached.

    Args:
        url: Image URL or path; falsy values become ``""`` and the value is
            stripped.
        normalize: Whether the cached vector is normalised.
        model_name: Multimodal model identifier; empty values become
            ``unknown``.

    Returns:
        ``embed:{model_name}:txt:norm{0|1}:{body}``.
    """
    payload = _stable_body_for_cache_key(str(url or "").strip())
    return f"embed:{_model_segment(model_name)}:txt:{_norm_segment(normalize)}:{payload}"


def build_clip_text_cache_key(text: str, *, normalize: bool, model_name: str) -> str:
    """Build the raw cache key for a CN-CLIP text-tower embedding.

    The literal segment is ``img`` for this text lane, by the same project
    convention noted in the original docstring.

    Args:
        text: Input text; falsy values become ``""`` and the value is stripped.
        normalize: Whether the cached vector is normalised.
        model_name: Multimodal model identifier; empty values become
            ``unknown``.

    Returns:
        ``embed:{model_name}:img:norm{0|1}:{body}``.
    """
    payload = _stable_body_for_cache_key(str(text or "").strip())
    return f"embed:{_model_segment(model_name)}:img:{_norm_segment(normalize)}:{payload}"
+ self.MULTIMODAL_MODEL_NAME = str( + image_backend.get("model_name") + or ("CN-CLIP/ViT-H-14" if self.USE_CLIP_AS_SERVICE else "ViT-H-14") + ) self.IMAGE_DEVICE = image_backend.get("device") # type: Optional[str] self.IMAGE_BATCH_SIZE = int(image_backend.get("batch_size", 8)) self.IMAGE_NORMALIZE_EMBEDDINGS = bool(image_backend.get("normalize_embeddings", True)) diff --git a/embeddings/image_encoder.py b/embeddings/image_encoder.py index f7f058f..2333f66 100644 --- a/embeddings/image_encoder.py +++ b/embeddings/image_encoder.py @@ -12,6 +12,7 @@ logger = logging.getLogger(__name__) from config.loader import get_app_config from config.services_config import get_embedding_image_backend_config, get_embedding_image_base_url from embeddings.cache_keys import build_clip_text_cache_key, build_image_cache_key +from embeddings.config import CONFIG from embeddings.redis_embedding_cache import RedisEmbeddingCache from request_log_context import build_downstream_request_headers, build_request_log_extra @@ -31,6 +32,7 @@ class CLIPImageEncoder: self.clip_text_endpoint = f"{self.service_url}/embed/clip_text" # Reuse embedding cache prefix, but separate namespace for images to avoid collisions. 
self.cache_prefix = str(redis_config.embedding_cache_prefix).strip() or "embedding" + self._mm_model_name = CONFIG.MULTIMODAL_MODEL_NAME logger.info("Creating CLIPImageEncoder instance with service URL: %s", self.service_url) self.cache = RedisEmbeddingCache( key_prefix=self.cache_prefix, @@ -171,7 +173,9 @@ class CLIPImageEncoder: """ CN-CLIP 文本塔(与 ``/embed/image`` 同向量空间),对应服务端 ``POST /embed/clip_text``。 """ - cache_key = build_clip_text_cache_key(text, normalize=normalize_embeddings) + cache_key = build_clip_text_cache_key( + text, normalize=normalize_embeddings, model_name=self._mm_model_name + ) cached = self._clip_text_cache.get(cache_key) if cached is not None: return cached @@ -216,7 +220,9 @@ class CLIPImageEncoder: Returns: Embedding vector """ - cache_key = build_image_cache_key(url, normalize=normalize_embeddings) + cache_key = build_image_cache_key( + url, normalize=normalize_embeddings, model_name=self._mm_model_name + ) cached = self.cache.get(cache_key) if cached is not None: return cached @@ -267,7 +273,9 @@ class CLIPImageEncoder: normalized_urls = [str(u).strip() for u in images] # type: ignore[list-item] for pos, url in enumerate(normalized_urls): - cache_key = build_image_cache_key(url, normalize=normalize_embeddings) + cache_key = build_image_cache_key( + url, normalize=normalize_embeddings, model_name=self._mm_model_name + ) cached = self.cache.get(cache_key) if cached is not None: results.append(cached) @@ -297,7 +305,12 @@ class CLIPImageEncoder: vec = np.array(embedding, dtype=np.float32) if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all(): raise RuntimeError(f"Invalid image embedding returned for URL: {url}") - self.cache.set(build_image_cache_key(url, normalize=normalize_embeddings), vec) + self.cache.set( + build_image_cache_key( + url, normalize=normalize_embeddings, model_name=self._mm_model_name + ), + vec, + ) pos = pending_positions[i + j] results[pos] = vec diff --git a/embeddings/server.py b/embeddings/server.py index 
1feff47..ef92bc7 100644 --- a/embeddings/server.py +++ b/embeddings/server.py @@ -23,7 +23,11 @@ from fastapi.concurrency import run_in_threadpool from config.env_config import REDIS_CONFIG from config.services_config import get_embedding_backend_config -from embeddings.cache_keys import build_clip_text_cache_key, build_image_cache_key, build_text_cache_key +from embeddings.cache_keys import ( + build_clip_text_cache_key as _mm_clip_text_cache_key, + build_image_cache_key as _mm_image_cache_key, + build_text_cache_key, +) from embeddings.config import CONFIG from embeddings.protocols import ImageEncoderProtocol from embeddings.redis_embedding_cache import RedisEmbeddingCache @@ -763,10 +767,14 @@ def _try_full_image_lane_cache_hit( out: List[Optional[List[float]]] = [] for item in items: if lane == "image": - ck = build_image_cache_key(item, normalize=effective_normalize) + ck = _mm_image_cache_key( + item, normalize=effective_normalize, model_name=CONFIG.MULTIMODAL_MODEL_NAME + ) cached = _image_cache.get(ck) else: - ck = build_clip_text_cache_key(item, normalize=effective_normalize) + ck = _mm_clip_text_cache_key( + item, normalize=effective_normalize, model_name=CONFIG.MULTIMODAL_MODEL_NAME + ) cached = _clip_text_cache.get(ck) if cached is None: return None @@ -801,10 +809,14 @@ def _embed_image_lane_impl( cache_hits = 0 for idx, item in enumerate(items): if lane == "image": - ck = build_image_cache_key(item, normalize=effective_normalize) + ck = _mm_image_cache_key( + item, normalize=effective_normalize, model_name=CONFIG.MULTIMODAL_MODEL_NAME + ) cached = _image_cache.get(ck) else: - ck = build_clip_text_cache_key(item, normalize=effective_normalize) + ck = _mm_clip_text_cache_key( + item, normalize=effective_normalize, model_name=CONFIG.MULTIMODAL_MODEL_NAME + ) cached = _clip_text_cache.get(ck) if cached is not None: vec = _as_list(cached, normalize=False) @@ -1497,3 +1509,17 @@ async def embed_clip_text( priority=priority, 
preview_chars=_LOG_TEXT_PREVIEW_CHARS, ) + + +def build_image_cache_key(url: str, *, normalize: bool, model_name: Optional[str] = None) -> str: + """Tests/tools: same key as ``/embed/image`` lane; defaults to ``CONFIG.MULTIMODAL_MODEL_NAME``.""" + return _mm_image_cache_key( + url, normalize=normalize, model_name=model_name or CONFIG.MULTIMODAL_MODEL_NAME + ) + + +def build_clip_text_cache_key(text: str, *, normalize: bool, model_name: Optional[str] = None) -> str: + """Tests/tools: same key as ``/embed/clip_text`` lane; defaults to ``CONFIG.MULTIMODAL_MODEL_NAME``.""" + return _mm_clip_text_cache_key( + text, normalize=normalize, model_name=model_name or CONFIG.MULTIMODAL_MODEL_NAME + ) diff --git a/scripts/create_tenant_index.sh b/scripts/create_tenant_index.sh index f10b0e4..cf168ec 100755 --- a/scripts/create_tenant_index.sh +++ b/scripts/create_tenant_index.sh @@ -61,6 +61,7 @@ echo "删除索引: $ES_INDEX" echo curl -X DELETE "${ES_HOST}/${ES_INDEX}" $AUTH_PARAM -s -o /dev/null -w "HTTP状态码: %{http_code}\n" + echo echo "创建索引: $ES_INDEX" echo diff --git a/tests/test_cache_keys.py b/tests/test_cache_keys.py new file mode 100644 index 0000000..6e1e4f6 --- /dev/null +++ b/tests/test_cache_keys.py @@ -0,0 +1,47 @@ +"""Unit tests for embeddings/cache_keys.py (hashing long bodies).""" + +import hashlib + +from embeddings import cache_keys as ck + + +def test_stable_body_short_unchanged(): + s = "a" * ck.CACHE_KEY_RAW_BODY_MAX_CHARS + assert ck._stable_body_for_cache_key(s) == s + + +def test_stable_body_long_hashes(): + s = "a" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1) + out = ck._stable_body_for_cache_key(s) + assert out == "h:sha256:" + hashlib.sha256(s.encode("utf-8")).hexdigest() + assert out.startswith("h:sha256:") + assert len(out) == len("h:sha256:") + 64 + + +def test_stable_body_utf8_counts_unicode_codepoints(): + # 2 codepoints, not 6 bytes — still short + s = "你好" + assert ck._stable_body_for_cache_key(s) == s + + +def 
test_build_text_cache_key_uses_digest_when_long(): + # Default max 256: 257 'x' -> digest + long_text = "x" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1) + key = ck.build_text_cache_key(long_text, normalize=True) + assert key.startswith("embed:norm1:h:sha256:") + digest = hashlib.sha256(long_text.encode("utf-8")).hexdigest() + assert key == f"embed:norm1:h:sha256:{digest}" + + +def test_build_image_cache_key_uses_digest_when_long(): + url = "https://x.example/" + "y" * ck.CACHE_KEY_RAW_BODY_MAX_CHARS + key = ck.build_image_cache_key(url, normalize=True, model_name="CN-CLIP/ViT-H-14") + digest = hashlib.sha256(url.encode("utf-8")).hexdigest() + assert key == f"embed:CN-CLIP/ViT-H-14:txt:norm1:h:sha256:{digest}" + + +def test_build_clip_text_cache_key_uses_digest_when_long(): + t = "词" * (ck.CACHE_KEY_RAW_BODY_MAX_CHARS + 1) + key = ck.build_clip_text_cache_key(t, normalize=False, model_name="m") + digest = hashlib.sha256(t.encode("utf-8")).hexdigest() + assert key == f"embed:m:img:norm0:h:sha256:{digest}" diff --git a/tests/test_embedding_pipeline.py b/tests/test_embedding_pipeline.py index 48dc0ff..01f587c 100644 --- a/tests/test_embedding_pipeline.py +++ b/tests/test_embedding_pipeline.py @@ -16,6 +16,7 @@ from embeddings.image_encoder import CLIPImageEncoder from embeddings.text_embedding_tei import TEITextModel from embeddings.bf16 import encode_embedding_for_redis from embeddings.cache_keys import build_image_cache_key, build_text_cache_key +from embeddings.config import CONFIG from query import QueryParser from context.request_context import create_request_context, set_current_request_context, clear_current_request_context @@ -207,7 +208,9 @@ def test_image_embedding_encoder_cache_hit(monkeypatch): fake_cache = _FakeEmbeddingCache() cached = np.array([0.5, 0.6], dtype=np.float32) url = "https://example.com/a.jpg" - fake_cache.store[build_image_cache_key(url, normalize=True)] = cached + fake_cache.store[ + build_image_cache_key(url, normalize=True, 
model_name=CONFIG.MULTIMODAL_MODEL_NAME) + ] = cached monkeypatch.setattr("embeddings.image_encoder.RedisEmbeddingCache", lambda **kwargs: fake_cache) calls = {"count": 0} -- libgit2 0.21.2