Commit ed94866621f769cb80425846a68db4a01a9f5caa

Authored by tangwang
1 parent 950a640e

tidy

activate.sh
1 1 #!/bin/bash
2 2 #
3   -# Unified environment activator (venv preferred, conda fallback).
4   -#
5 3 # Usage:
6 4 # source activate.sh
7 5 #
8   -# Priority:
  6 +# Required:
9 7 # 1) ./.venv (Python venv)
10   -# 2) conda env "searchengine" (legacy)
11 8 #
12 9  
13 10 # Must be sourced
... ... @@ -18,28 +15,16 @@ fi
18 15  
19 16 PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
20 17  
21   -# 1) venv (preferred)
  18 +# 1) venv (required)
22 19 VENV_ACTIVATE="${PROJECT_ROOT}/.venv/bin/activate"
23 20 if [[ -f "${VENV_ACTIVATE}" ]]; then
24 21 # shellcheck disable=SC1090
25 22 source "${VENV_ACTIVATE}"
26 23 ENV_KIND="venv"
27 24 else
28   - # 2) conda fallback (legacy)
29   - # 新机器部署:可设置 CONDA_ROOT 指向本机 Conda 路径
30   - # 例如你的 conda 是 ~/anaconda3/bin/conda,则 export CONDA_ROOT=$HOME/anaconda3
31   - CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}"
32   - if [[ -f "${CONDA_ROOT}/etc/profile.d/conda.sh" ]]; then
33   - # shellcheck disable=SC1091
34   - source "${CONDA_ROOT}/etc/profile.d/conda.sh"
35   - conda activate searchengine
36   - ENV_KIND="conda"
37   - else
38   - echo "ERROR: No .venv found and conda.sh not found at ${CONDA_ROOT}/etc/profile.d/conda.sh" >&2
39   - echo " - Create venv: ./scripts/create_venv.sh" >&2
40   - echo " - Or set CONDA_ROOT to your conda install path" >&2
41   - return 1
42   - fi
  25 + echo "ERROR: No .venv found at ${VENV_ACTIVATE}" >&2
  26 + echo " - Create venv: ./scripts/create_venv.sh" >&2
  27 + return 1
43 28 fi
44 29  
45 30 # 如果需要加载 .env 中的环境变量
... ...
api/indexer_app.py
... ... @@ -124,12 +124,10 @@ def init_indexer_service(es_host: str = "http://localhost:9200"):
124 124 ]
125 125 if not value
126 126 ]
127   - logger.warning(
128   - "Database config incomplete for indexer, services will not be available. "
  127 + raise RuntimeError(
  128 + "Database config incomplete for indexer. "
129 129 f"Missing: {', '.join(missing)}"
130 130 )
131   - _incremental_service = None
132   - _bulk_indexing_service = None
133 131  
134 132 elapsed = time.time() - start_time
135 133 logger.info(f"Indexer service ready! (took {elapsed:.2f}s)")
... ... @@ -180,14 +178,17 @@ async def startup_event():
180 178 # If no explicit tenants configured, skip warmup.
181 179 if tenants:
182 180 warm = _incremental_service.warmup_transformers(tenants)
  181 + if warm.get("failed"):
  182 + raise RuntimeError(f"Indexer warmup failed: {warm['failed']}")
183 183 logger.info("Indexer warmup completed: %s", warm)
184 184 else:
185 185 logger.info("Indexer warmup skipped (no tenant ids in config.tenant_config.tenants)")
186 186 except Exception as e:
187   - logger.warning("Indexer warmup failed (service still starts): %s", e, exc_info=True)
  187 + logger.error("Indexer warmup failed: %s", e, exc_info=True)
  188 + raise
188 189 except Exception as e:
189 190 logger.error(f"Failed to initialize indexer service: {e}", exc_info=True)
190   - logger.warning("Indexer service will start but may not function correctly")
  191 + raise
191 192  
192 193  
193 194 @app.on_event("shutdown")
... ... @@ -215,7 +216,7 @@ async def global_exception_handler(request: Request, exc: Exception):
215 216 async def health_check():
216 217 """Simple health check for indexer service."""
217 218 try:
218   - # ensure ES is reachable (best-effort)
  219 + # ensure ES is reachable
219 220 if _es_client is None:
220 221 raise RuntimeError("ES client is not initialized")
221 222 return {
... ... @@ -262,4 +263,3 @@ if __name__ == "__main__":
262 263 port=args.port,
263 264 reload=args.reload,
264 265 )
265   -
... ...
api/routes/indexer.py
... ... @@ -244,31 +244,19 @@ async def build_docs(request: BuildDocsRequest):
244 244 title_text = str(v)
245 245 break
246 246 if title_text and str(title_text).strip():
247   - try:
248   - import numpy as np
249   -
250   - embeddings = encoder.encode(title_text)
251   - if embeddings is not None and len(embeddings) > 0:
252   - emb0 = embeddings[0]
253   - if isinstance(emb0, np.ndarray) and emb0.size > 0:
254   - doc["title_embedding"] = emb0.tolist()
255   - else:
256   - logger.warning(
257   - "build-docs: title_embedding skipped (encoder returned None/invalid for title: %s...)",
258   - title_text[:50],
259   - )
260   - else:
261   - logger.warning(
262   - "build-docs: title_embedding skipped (encoder returned empty for title: %s...)",
263   - title_text[:50],
264   - )
265   - except Exception as e:
266   - logger.warning(
267   - "build-docs: title_embedding failed for spu_id=%s: %s",
268   - doc.get("spu_id"),
269   - e,
  247 + import numpy as np
  248 +
  249 + embeddings = encoder.encode(title_text)
  250 + if embeddings is None or len(embeddings) == 0:
  251 + raise RuntimeError(
  252 + f"title_embedding empty for spu_id={doc.get('spu_id')}"
  253 + )
  254 + emb0 = np.asarray(embeddings[0], dtype=np.float32)
  255 + if emb0.ndim != 1 or emb0.size == 0 or not np.isfinite(emb0).all():
  256 + raise RuntimeError(
  257 + f"title_embedding invalid for spu_id={doc.get('spu_id')}"
270 258 )
271   - # 构建 doc 接口不因为 embedding 失败而整体失败
  259 + doc["title_embedding"] = emb0.tolist()
272 260  
273 261 docs.append(doc)
274 262 except Exception as e:
... ... @@ -486,4 +474,3 @@ async def indexer_health_check():
486 474 except Exception as e:
487 475 logger.error(f"Error checking indexer health: {e}", exc_info=True)
488 476 return {"status": "error", "message": str(e)}
489   -
... ...
config/__init__.py
... ... @@ -13,19 +13,13 @@ from .config_loader import (
13 13 FunctionScoreConfig,
14 14 RerankConfig,
15 15 ConfigLoader,
16   - ConfigurationError,
17   - load_tenant_config
  16 + ConfigurationError
18 17 )
19 18  
20 19 from .utils import (
21 20 get_match_fields_for_index,
22 21 get_domain_fields
23 22 )
24   -from .service_endpoints import (
25   - resolve_translation_service_url,
26   - resolve_embedding_service_url,
27   - resolve_reranker_service_url,
28   -)
29 23 from .services_config import (
30 24 get_translation_config,
31 25 get_embedding_config,
... ... @@ -50,12 +44,8 @@ __all__ = [
50 44 # Loader and utilities
51 45 'ConfigLoader',
52 46 'ConfigurationError',
53   - 'load_tenant_config',
54 47 'get_match_fields_for_index',
55 48 'get_domain_fields',
56   - 'resolve_translation_service_url',
57   - 'resolve_embedding_service_url',
58   - 'resolve_reranker_service_url',
59 49 'get_translation_config',
60 50 'get_embedding_config',
61 51 'get_rerank_config',
... ...
config/config_loader.py
... ... @@ -433,17 +433,3 @@ class ConfigLoader:
433 433 result["example"] = index.example
434 434  
435 435 return result
436   -
437   -
438   -def load_tenant_config(tenant_id: Optional[str] = None) -> SearchConfig:
439   - """
440   - Load tenant configuration (backward compatibility wrapper).
441   -
442   - Args:
443   - tenant_id: Ignored (kept for backward compatibility)
444   -
445   - Returns:
446   - SearchConfig loaded from config/config.yaml
447   - """
448   - loader = ConfigLoader()
449   - return loader.load_config()
... ...
config/service_endpoints.py deleted
... ... @@ -1,12 +0,0 @@
1   -"""
2   -Endpoint resolvers - delegate to services_config.
3   -
4   -Deprecated: use config.services_config directly.
5   -Kept for backward compatibility.
6   -"""
7   -
8   -from .services_config import (
9   - get_translation_base_url as resolve_translation_service_url,
10   - get_embedding_base_url as resolve_embedding_service_url,
11   - get_rerank_service_url as resolve_reranker_service_url,
12   -)
docs/DEVELOPER_GUIDE.md
... ... @@ -216,9 +216,9 @@ docs/ # 文档(含本指南)
216 216 - 索引结构以 `mappings/search_products.json` 为唯一来源;indexer 产出的 doc 必须与该 mapping 一致。
217 217 - 查询侧使用的字段名、多语言后缀(.zh/.en)、嵌套路径等与 mapping 保持一致;新增字段时同步更新 mapping 与查询/分面/过滤逻辑。
218 218  
219   -### 5.7 错误与降级
  219 +### 5.7 错误处理
220 220  
221   -- 外部能力(翻译、向量、重排)调用失败时,应有明确降级策略(如跳过向量、仅用 BM25、重排失败时保留 ES 顺序),并打日志便于排查;不因单一能力不可用导致整请求失败
  221 +- 外部能力(翻译、向量、重排)调用失败时应立即报错并中止请求,禁止静默降级;通过日志与监控尽早暴露问题并修复
222 222  
223 223 ---
224 224  
... ... @@ -315,11 +315,11 @@ services:
315 315 5. **调用方**:无需修改;仅部署时启动使用新后端的 reranker 服务即可。
316 316 6. **文档与依赖**:在 `reranker/README.md` 或 docs 中说明依赖(如 vllm)、显存建议;可选依赖放入 `requirements_ml.txt` 或 extra。
317 317  
318   -### 7.7 与现有配置的兼容说明
  318 +### 7.7 配置一致性说明
319 319  
320   -- **reranker**:当前 `reranker/config.py` 的 BGE 相关默认值可保留为兜底,或将默认值迁移到 `config.yaml` 的 `services.rerank.backends.bge`,由 config 只读环境变量与 YAML。
321   -- **embeddings**:`embeddings/config.py` 的文本/图片及 clip-as-service 开关与 `services.embedding` 的 URL 分离;后续多种后端可在 `services.embedding.backends` 中增加条目。
322   -- **环境变量**:所有能力均支持环境变量覆盖(如 `RERANKER_SERVICE_URL`、`RERANK_BACKEND`、`EMBEDDING_SERVICE_URL`),便于多环境部署。
  320 +- **单一路径**:Provider 和 backend 必须由 `config/config.yaml` 的 `services` 块显式指定;未知配置应直接报错。
  321 +- **无兼容回退**:不保留“旧配置自动推导/兜底默认值”机制,避免静默行为偏差。
  322 +- **环境变量覆盖**:允许环境变量覆盖(如 `RERANKER_SERVICE_URL`、`RERANK_BACKEND`、`EMBEDDING_SERVICE_URL`),但覆盖后仍需满足合法性校验。
323 323  
324 324 ---
325 325  
... ... @@ -346,7 +346,7 @@ services:
346 346  
347 347 ### 8.4 日志与可观测性
348 348  
349   -- 关键路径(请求入口、外部调用、失败降级)打日志;日志级别合理(如 debug 用于详细参数,info 用于流程,warning 用于降级)。
  349 +- 关键路径(请求入口、外部调用、失败报错)打日志;日志级别合理(如 debug 用于详细参数,info 用于流程,error 用于失败)。
350 350 - 对外接口的耗时、错误码、租户等可考虑结构化日志或后续接入监控,便于运维与排查。
351 351  
352 352 ---
... ...
docs/Usage-Guide.md
... ... @@ -29,7 +29,7 @@
29 29  
30 30 #### 1. 安装 Python 依赖与激活环境
31 31  
32   -**推荐**:使用项目根目录的 `activate.sh` 激活环境(会加载 `.env`)。目前推荐 venv(`.venv`);Conda 仅作为兼容回退(需要 `CONDA_ROOT`)。系统要求、Python 环境、生产凭证与 `.env` 模板见 [QUICKSTART.md](./QUICKSTART.md) §1.4–1.8。
  32 +**推荐**:使用项目根目录的 `activate.sh` 激活环境(会加载 `.env`)。当前仅支持 venv(`.venv`),不存在 Conda 回退。系统要求、Python 环境、生产凭证与 `.env` 模板见 [QUICKSTART.md](./QUICKSTART.md) §1.4–1.8。
33 33  
34 34 ```bash
35 35 cd /data/saas-search
... ... @@ -663,4 +663,3 @@ curl -X POST http://localhost:6002/search/ \
663 663  
664 664 **文档版本**: v2.0
665 665 **最后更新**: 2024-12
666   -
... ...
embeddings/__init__.py
1   -"""
2   -Embeddings module.
  1 +"""Embeddings module exports."""
3 2  
4   -Important: keep package import lightweight.
  3 +from .text_encoder import TextEmbeddingEncoder
  4 +from .image_encoder import CLIPImageEncoder
5 5  
6   -Some callers do:
7   - - `from embeddings import TextEmbeddingEncoder`
8   - - `from embeddings import BgeEncoder` (deprecated alias)
9   - - `from embeddings import CLIPImageEncoder`
10   -
11   -But the underlying implementations may import heavy optional deps (Pillow, torch, etc).
12   -To avoid importing those at package import time (and to allow the embedding service to boot
13   -without importing client code), we provide small lazy factories here.
14   -"""
15   -
16   -
17   -class TextEmbeddingEncoder(object):
18   - """Lazy factory for `embeddings.text_encoder.TextEmbeddingEncoder`."""
19   -
20   - def __new__(cls, *args, **kwargs):
21   - from .text_encoder import TextEmbeddingEncoder as _Real
22   -
23   - return _Real(*args, **kwargs)
24   -
25   -
26   -class BgeEncoder(TextEmbeddingEncoder):
27   - """Deprecated backward-compatible alias for old class name."""
28   -
29   -
30   -class CLIPImageEncoder(object):
31   - """Lazy factory for `embeddings.image_encoder.CLIPImageEncoder`."""
32   -
33   - def __new__(cls, *args, **kwargs):
34   - from .image_encoder import CLIPImageEncoder as _Real
35   -
36   - return _Real(*args, **kwargs)
37   -
38   -
39   -__all__ = ["TextEmbeddingEncoder", "BgeEncoder", "CLIPImageEncoder"]
  6 +__all__ = [
  7 + "TextEmbeddingEncoder",
  8 + "CLIPImageEncoder",
  9 +]
... ...
embeddings/clip_as_service_encoder.py
... ... @@ -65,13 +65,21 @@ class ClipAsServiceImageEncoder:
65 65 self._server = server
66 66 self._batch_size = batch_size
67 67 self._show_progress = show_progress
68   - self._client = Client(server)
  68 + try:
  69 + self._client = Client(server)
  70 + except ModuleNotFoundError as e:
  71 + if str(e) == "No module named 'pkg_resources'":
  72 + raise RuntimeError(
  73 + "clip-as-service requires pkg_resources via jina/hubble. "
  74 + "Install compatible setuptools (<82) in current venv."
  75 + ) from e
  76 + raise
69 77  
70 78 def encode_image_urls(
71 79 self,
72 80 urls: List[str],
73 81 batch_size: Optional[int] = None,
74   - ) -> List[Optional[np.ndarray]]:
  82 + ) -> List[np.ndarray]:
75 83 """
76 84 Encode a list of image URLs to vectors.
77 85  
... ... @@ -80,42 +88,36 @@ class ClipAsServiceImageEncoder:
80 88 batch_size: override instance batch_size for this call.
81 89  
82 90 Returns:
83   - List of vectors (1024-dim float32) or None for failed items, same length as urls.
  91 + List of vectors (float32), same length as urls.
84 92 """
85 93 if not urls:
86 94 return []
87 95  
88 96 normalized = [_normalize_image_url(u) for u in urls]
89   - valid_indices = [i for i, u in enumerate(normalized) if u]
90   - if not valid_indices:
91   - return [None] * len(urls)
92   -
93   - valid_urls = [normalized[i] for i in valid_indices]
94 97 bs = batch_size if batch_size is not None else self._batch_size
95   - out: List[Optional[np.ndarray]] = [None] * len(urls)
96   -
97   - try:
98   - # Client.encode(iterable of str) returns np.ndarray [N, D] for string input
99   - arr = self._client.encode(
100   - valid_urls,
101   - batch_size=bs,
102   - show_progress=self._show_progress,
  98 + invalid_indices = [i for i, u in enumerate(normalized) if not u]
  99 + if invalid_indices:
  100 + raise ValueError(f"Invalid empty image URL at indices: {invalid_indices}")
  101 +
  102 + # Client.encode(iterable of str) returns np.ndarray [N, D] for string input
  103 + arr = self._client.encode(
  104 + normalized,
  105 + batch_size=bs,
  106 + show_progress=self._show_progress,
  107 + )
  108 + if arr is None or not hasattr(arr, "shape"):
  109 + raise RuntimeError("clip-as-service encode returned empty result")
  110 + if len(arr) != len(normalized):
  111 + raise RuntimeError(
  112 + f"clip-as-service encode length mismatch: expected {len(normalized)}, got {len(arr)}"
103 113 )
104   - if arr is not None and hasattr(arr, "shape") and len(arr) == len(valid_indices):
105   - for j, idx in enumerate(valid_indices):
106   - row = arr[j]
107   - if row is not None and hasattr(row, "tolist"):
108   - out[idx] = np.asarray(row, dtype=np.float32)
109   - else:
110   - out[idx] = np.array(row, dtype=np.float32)
111   - else:
112   - logger.warning(
113   - "clip-as-service encode returned unexpected shape/length, "
114   - "expected %d vectors", len(valid_indices)
115   - )
116   - except Exception as e:
117   - logger.warning("clip-as-service encode failed: %s", e, exc_info=True)
118 114  
  115 + out: List[np.ndarray] = []
  116 + for row in arr:
  117 + vec = np.asarray(row, dtype=np.float32)
  118 + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all():
  119 + raise RuntimeError("clip-as-service returned invalid embedding vector")
  120 + out.append(vec)
119 121 return out
120 122  
121 123 def encode_image_from_url(self, url: str) -> Optional[np.ndarray]:
... ...
embeddings/image_encoder.py
... ... @@ -48,16 +48,15 @@ class CLIPImageEncoder:
48 48 logger.error(f"CLIPImageEncoder service request failed: {e}", exc_info=True)
49 49 raise
50 50  
51   - def encode_image(self, image: Image.Image) -> Optional[np.ndarray]:
  51 + def encode_image(self, image: Image.Image) -> np.ndarray:
52 52 """
53 53 Encode image to embedding vector using network service.
54 54  
55 55 Note: This method is kept for compatibility but the service only works with URLs.
56 56 """
57   - logger.warning("encode_image with PIL Image not supported by service, returning None")
58   - return None
  57 + raise NotImplementedError("encode_image with PIL Image is not supported by embedding service")
59 58  
60   - def encode_image_from_url(self, url: str) -> Optional[np.ndarray]:
  59 + def encode_image_from_url(self, url: str) -> np.ndarray:
61 60 """
62 61 Generate image embedding via network service using URL.
63 62  
... ... @@ -65,24 +64,21 @@ class CLIPImageEncoder:
65 64 url: Image URL to process
66 65  
67 66 Returns:
68   - Embedding vector or None if failed
  67 + Embedding vector
69 68 """
70   - try:
71   - response_data = self._call_service([url])
72   - if response_data and len(response_data) > 0 and response_data[0] is not None:
73   - return np.array(response_data[0], dtype=np.float32)
74   - logger.warning(f"No embedding for URL {url}")
75   - return None
76   -
77   - except Exception as e:
78   - logger.error(f"Failed to process image from URL {url}: {str(e)}", exc_info=True)
79   - return None
  69 + response_data = self._call_service([url])
  70 + if not response_data or len(response_data) != 1 or response_data[0] is None:
  71 + raise RuntimeError(f"No image embedding returned for URL: {url}")
  72 + vec = np.array(response_data[0], dtype=np.float32)
  73 + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all():
  74 + raise RuntimeError(f"Invalid image embedding returned for URL: {url}")
  75 + return vec
80 76  
81 77 def encode_batch(
82 78 self,
83 79 images: List[Union[str, Image.Image]],
84 80 batch_size: int = 8
85   - ) -> List[Optional[np.ndarray]]:
  81 + ) -> List[np.ndarray]:
86 82 """
87 83 Encode a batch of images efficiently via network service.
88 84  
... ... @@ -91,50 +87,31 @@ class CLIPImageEncoder:
91 87 batch_size: Batch size for processing (used for service requests)
92 88  
93 89 Returns:
94   - List of embeddings (or None for failed images)
  90 + List of embeddings
95 91 """
96   - # Initialize results with None for all images
97   - results = [None] * len(images)
98   -
99   - # Filter out PIL Images since service only supports URLs
100   - url_images = []
101   - url_indices = []
102   -
103 92 for i, img in enumerate(images):
104   - if isinstance(img, str):
105   - url_images.append(img)
106   - url_indices.append(i)
107   - elif isinstance(img, Image.Image):
108   - logger.warning(f"PIL Image at index {i} not supported by service, returning None")
109   - # results[i] is already None
110   -
111   - # Process URLs in batches
112   - for i in range(0, len(url_images), batch_size):
113   - batch_urls = url_images[i:i + batch_size]
114   - batch_indices = url_indices[i:i + batch_size]
115   -
116   - try:
117   - # Call service
118   - response_data = self._call_service(batch_urls)
119   -
120   - # Process response (aligned list)
121   - batch_results = []
122   - for j, url in enumerate(batch_urls):
123   - if response_data and j < len(response_data) and response_data[j] is not None:
124   - batch_results.append(np.array(response_data[j], dtype=np.float32))
125   - else:
126   - logger.warning(f"Failed to encode URL {url}: no embedding")
127   - batch_results.append(None)
128   -
129   - # Insert results at the correct positions
130   - for j, result in enumerate(batch_results):
131   - results[batch_indices[j]] = result
132   -
133   - except Exception as e:
134   - logger.error(f"Batch processing failed: {e}", exc_info=True)
135   - # Fill with None for this batch
136   - for j in range(len(batch_urls)):
137   - results[batch_indices[j]] = None
  93 + if isinstance(img, Image.Image):
  94 + raise NotImplementedError(f"PIL Image at index {i} is not supported by service")
  95 + if not isinstance(img, str) or not img.strip():
  96 + raise ValueError(f"Invalid image URL/path at index {i}: {img!r}")
  97 +
  98 + results: List[np.ndarray] = []
  99 + for i in range(0, len(images), batch_size):
  100 + batch_urls = [str(u).strip() for u in images[i:i + batch_size]]
  101 + response_data = self._call_service(batch_urls)
  102 + if not response_data or len(response_data) != len(batch_urls):
  103 + raise RuntimeError(
  104 + f"Image embedding response length mismatch: expected {len(batch_urls)}, "
  105 + f"got {0 if response_data is None else len(response_data)}"
  106 + )
  107 + for j, url in enumerate(batch_urls):
  108 + embedding = response_data[j]
  109 + if embedding is None:
  110 + raise RuntimeError(f"No image embedding returned for URL: {url}")
  111 + vec = np.array(embedding, dtype=np.float32)
  112 + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all():
  113 + raise RuntimeError(f"Invalid image embedding returned for URL: {url}")
  114 + results.append(vec)
138 115  
139 116 return results
140 117  
... ... @@ -142,7 +119,7 @@ class CLIPImageEncoder:
142 119 self,
143 120 urls: List[str],
144 121 batch_size: Optional[int] = None,
145   - ) -> List[Optional[np.ndarray]]:
  122 + ) -> List[np.ndarray]:
146 123 """
147 124 与 ClipImageModel / ClipAsServiceImageEncoder 一致的接口,供索引器 document_transformer 调用。
148 125  
... ... @@ -151,6 +128,6 @@ class CLIPImageEncoder:
151 128 batch_size: 批大小(默认 8)
152 129  
153 130 Returns:
154   - 与 urls 等长的向量列表,失败为 None
  131 + 与 urls 等长的向量列表
155 132 """
156 133 return self.encode_batch(urls, batch_size=batch_size or 8)
... ...
embeddings/qwen3_model.py
... ... @@ -35,43 +35,15 @@ class Qwen3TextModel(object):
35 35 ) -> np.ndarray:
36 36 if device == "gpu":
37 37 device = "cuda"
38   -
39   - # Try requested device, fallback to CPU if CUDA is unavailable/insufficient.
40   - try:
41   - if device == "cuda":
42   - import torch
43   -
44   - if torch.cuda.is_available():
45   - free_memory = (
46   - torch.cuda.get_device_properties(0).total_memory
47   - - torch.cuda.memory_allocated()
48   - )
49   - if free_memory < 1024 * 1024 * 1024: # 1GB
50   - device = "cpu"
51   - else:
52   - device = "cpu"
53   -
54   - self.model = self.model.to(device)
55   - embeddings = self.model.encode(
56   - sentences,
57   - normalize_embeddings=normalize_embeddings,
58   - device=device,
59   - show_progress_bar=False,
60   - batch_size=batch_size,
61   - )
62   - return embeddings
63   - except Exception:
64   - if device != "cpu":
65   - self.model = self.model.to("cpu")
66   - embeddings = self.model.encode(
67   - sentences,
68   - normalize_embeddings=normalize_embeddings,
69   - device="cpu",
70   - show_progress_bar=False,
71   - batch_size=batch_size,
72   - )
73   - return embeddings
74   - raise
  38 + self.model = self.model.to(device)
  39 + embeddings = self.model.encode(
  40 + sentences,
  41 + normalize_embeddings=normalize_embeddings,
  42 + device=device,
  43 + show_progress_bar=False,
  44 + batch_size=batch_size,
  45 + )
  46 + return embeddings
75 47  
76 48 def encode_batch(
77 49 self,
... ... @@ -86,4 +58,3 @@ class Qwen3TextModel(object):
86 58 device=device,
87 59 normalize_embeddings=normalize_embeddings,
88 60 )
89   -
... ...
embeddings/server.py
1 1 """
2 2 Embedding service (FastAPI).
3 3  
4   -API (simple list-in, list-out; aligned by index; failures -> null):
5   -- POST /embed/text body: ["text1", "text2", ...] -> [[...], null, ...]
6   -- POST /embed/image body: ["url_or_path1", ...] -> [[...], null, ...]
  4 +API (simple list-in, list-out; aligned by index):
  5 +- POST /embed/text body: ["text1", "text2", ...] -> [[...], ...]
  6 +- POST /embed/image body: ["url_or_path1", ...] -> [[...], ...]
7 7 """
8 8  
9 9 import logging
... ... @@ -11,7 +11,7 @@ import threading
11 11 from typing import Any, Dict, List, Optional
12 12  
13 13 import numpy as np
14   -from fastapi import FastAPI
  14 +from fastapi import FastAPI, HTTPException
15 15  
16 16 from embeddings.config import CONFIG
17 17 from embeddings.protocols import ImageEncoderProtocol
... ... @@ -51,9 +51,6 @@ def load_models():
51 51  
52 52  
53 53 # Load image model: clip-as-service (recommended) or local CN-CLIP
54   - # IMPORTANT: failures here should NOT prevent the whole service from starting.
55   - # If image model cannot be loaded, we keep `_image_model` as None and only
56   - # disable /embed/image while keeping /embed/text fully functional.
57 54 if open_image_model:
58 55 try:
59 56 if CONFIG.USE_CLIP_AS_SERVICE:
... ... @@ -75,12 +72,8 @@ def load_models():
75 72 )
76 73 logger.info("Image model (local CN-CLIP) loaded successfully")
77 74 except Exception as e:
78   - logger.error(
79   - "Failed to load image model; image embeddings will be disabled but text embeddings remain available: %s",
80   - e,
81   - exc_info=True,
82   - )
83   - _image_model = None
  75 + logger.error("Failed to load image model: %s", e, exc_info=True)
  76 + raise
84 77  
85 78 logger.info("All embedding models loaded successfully, service ready")
86 79  
... ... @@ -109,70 +102,60 @@ def health() -&gt; Dict[str, Any]:
109 102 def embed_text(texts: List[str]) -> List[Optional[List[float]]]:
110 103 if _text_model is None:
111 104 raise RuntimeError("Text model not loaded")
112   - out: List[Optional[List[float]]] = [None] * len(texts)
113   -
114   - indexed_texts: List[tuple] = []
  105 + normalized: List[str] = []
115 106 for i, t in enumerate(texts):
116   - if t is None:
117   - continue
118 107 if not isinstance(t, str):
119   - t = str(t)
120   - t = t.strip()
121   - if not t:
122   - continue
123   - indexed_texts.append((i, t))
124   -
125   - if not indexed_texts:
126   - return out
127   -
128   - batch_texts = [t for _, t in indexed_texts]
129   - try:
130   - with _text_encode_lock:
131   - embs = _text_model.encode_batch(
132   - batch_texts,
133   - batch_size=int(CONFIG.TEXT_BATCH_SIZE),
134   - device=CONFIG.TEXT_DEVICE,
135   - normalize_embeddings=bool(CONFIG.TEXT_NORMALIZE_EMBEDDINGS),
136   - )
137   - for j, (idx, _t) in enumerate(indexed_texts):
138   - out[idx] = _as_list(embs[j])
139   - except Exception:
140   - # keep Nones
141   - pass
  108 + raise HTTPException(status_code=400, detail=f"Invalid text at index {i}: must be string")
  109 + s = t.strip()
  110 + if not s:
  111 + raise HTTPException(status_code=400, detail=f"Invalid text at index {i}: empty string")
  112 + normalized.append(s)
  113 +
  114 + with _text_encode_lock:
  115 + embs = _text_model.encode_batch(
  116 + normalized,
  117 + batch_size=int(CONFIG.TEXT_BATCH_SIZE),
  118 + device=CONFIG.TEXT_DEVICE,
  119 + normalize_embeddings=bool(CONFIG.TEXT_NORMALIZE_EMBEDDINGS),
  120 + )
  121 + if embs is None or len(embs) != len(normalized):
  122 + raise RuntimeError(
  123 + f"Text model response length mismatch: expected {len(normalized)}, "
  124 + f"got {0 if embs is None else len(embs)}"
  125 + )
  126 + out: List[Optional[List[float]]] = []
  127 + for i, emb in enumerate(embs):
  128 + vec = _as_list(emb)
  129 + if vec is None:
  130 + raise RuntimeError(f"Text model returned empty embedding for index {i}")
  131 + out.append(vec)
142 132 return out
143 133  
144 134  
145 135 @app.post("/embed/image")
146 136 def embed_image(images: List[str]) -> List[Optional[List[float]]]:
147 137 if _image_model is None:
148   - # Graceful degradation: keep API shape but return all None
149   - logger.warning("embed_image called but image model is not loaded; returning all None vectors")
150   - return [None] * len(images)
151   - out: List[Optional[List[float]]] = [None] * len(images)
152   -
153   - # Normalize inputs
154   - urls = []
155   - indices = []
  138 + raise RuntimeError("Image model not loaded")
  139 + urls: List[str] = []
156 140 for i, url_or_path in enumerate(images):
157   - if url_or_path is None:
158   - continue
159 141 if not isinstance(url_or_path, str):
160   - url_or_path = str(url_or_path)
161   - url_or_path = url_or_path.strip()
162   - if url_or_path:
163   - urls.append(url_or_path)
164   - indices.append(i)
165   -
166   - if not urls:
167   - return out
  142 + raise HTTPException(status_code=400, detail=f"Invalid image at index {i}: must be string URL/path")
  143 + s = url_or_path.strip()
  144 + if not s:
  145 + raise HTTPException(status_code=400, detail=f"Invalid image at index {i}: empty URL/path")
  146 + urls.append(s)
168 147  
169 148 with _image_encode_lock:
170   - try:
171   - # Both ClipAsServiceImageEncoder and ClipImageModel implement encode_image_urls(urls, batch_size)
172   - vectors = _image_model.encode_image_urls(urls, batch_size=CONFIG.IMAGE_BATCH_SIZE)
173   - for j, idx in enumerate(indices):
174   - out[idx] = _as_list(vectors[j] if j < len(vectors) else None)
175   - except Exception:
176   - for idx in indices:
177   - out[idx] = None
  149 + vectors = _image_model.encode_image_urls(urls, batch_size=CONFIG.IMAGE_BATCH_SIZE)
  150 + if vectors is None or len(vectors) != len(urls):
  151 + raise RuntimeError(
  152 + f"Image model response length mismatch: expected {len(urls)}, "
  153 + f"got {0 if vectors is None else len(vectors)}"
  154 + )
  155 + out: List[Optional[List[float]]] = []
  156 + for i, vec in enumerate(vectors):
  157 + out_vec = _as_list(vec)
  158 + if out_vec is None:
  159 + raise RuntimeError(f"Image model returned empty embedding for index {i}")
  160 + out.append(out_vec)
178 161 return out
... ...
embeddings/text_encoder.py
... ... @@ -89,9 +89,8 @@ class TextEmbeddingEncoder:
89 89 batch_size: Batch size for processing (used for service requests)
90 90  
91 91 Returns:
92   - numpy array of dtype=object, where each element is either:
93   - - np.ndarray (valid embedding vector) or
94   - - None (no embedding available for that text)
  92 + numpy array of dtype=object,元素均为有效 np.ndarray 向量。
  93 + 若任一输入无法生成向量,将直接抛出异常。
95 94 """
96 95 # Convert single string to list
97 96 if isinstance(sentences, str):
... ... @@ -101,8 +100,6 @@ class TextEmbeddingEncoder:
101 100 uncached_indices: List[int] = []
102 101 uncached_texts: List[str] = []
103 102  
104   - # Process response
105   - # Each element can be np.ndarray or None (表示该文本没有可用的向量)
106 103 embeddings: List[Optional[np.ndarray]] = [None] * len(sentences)
107 104  
108 105 for i, text in enumerate(sentences):
... ... @@ -118,40 +115,27 @@ class TextEmbeddingEncoder:
118 115  
119 116 # If there are uncached texts, call service
120 117 if uncached_texts:
121   - try:
122   - # Call service
123   - response_data = self._call_service(request_data)
  118 + response_data = self._call_service(request_data)
124 119  
125   - # Process response
126   - for i, text in enumerate(uncached_texts):
127   - original_idx = uncached_indices[i]
128   - if response_data and i < len(response_data):
129   - embedding = response_data[i]
130   - else:
131   - embedding = None
  120 + # Process response
  121 + for i, text in enumerate(uncached_texts):
  122 + original_idx = uncached_indices[i]
  123 + if response_data and i < len(response_data):
  124 + embedding = response_data[i]
  125 + else:
  126 + embedding = None
132 127  
133   - if embedding is not None:
134   - embedding_array = np.array(embedding, dtype=np.float32)
135   - # Validate embedding from service - if invalid, treat as no result
136   - if self._is_valid_embedding(embedding_array):
137   - embeddings[original_idx] = embedding_array
138   - # Cache the embedding
139   - self._set_cached_embedding(text, "generic", embedding_array)
140   - else:
141   - logger.warning(
142   - f"Invalid embedding returned from service for text {original_idx} "
143   - f"(contains NaN/Inf or invalid shape), treating as no result. "
144   - f"Text preview: {text[:50]}..."
145   - )
146   - embeddings[original_idx] = None
  128 + if embedding is not None:
  129 + embedding_array = np.array(embedding, dtype=np.float32)
  130 + if self._is_valid_embedding(embedding_array):
  131 + embeddings[original_idx] = embedding_array
  132 + self._set_cached_embedding(text, "generic", embedding_array)
147 133 else:
148   - logger.warning(f"No embedding found for text {original_idx}: {text[:50]}...")
149   - embeddings[original_idx] = None
150   -
151   - except Exception as e:
152   - logger.error(f"Failed to encode texts: {e}", exc_info=True)
153   - # 出错时不要生成兜底全零向量,保持为 None
154   - pass
  134 + raise ValueError(
  135 + f"Invalid embedding returned from service for text index {original_idx}"
  136 + )
  137 + else:
  138 + raise ValueError(f"No embedding found for text index {original_idx}: {text[:50]}...")
155 139  
156 140 # 返回 numpy 数组(dtype=object),元素为 np.ndarray 或 None
157 141 return np.array(embeddings, dtype=object)
... ... @@ -254,7 +238,3 @@ class TextEmbeddingEncoder:
254 238 except Exception as e:
255 239 logger.error(f"Error storing embedding in cache: {e}")
256 240 return False
257   -
258   -
259   -# Backward compatibility for existing imports/usages.
260   -BgeEncoder = TextEmbeddingEncoder
... ...
indexer/document_transformer.py
... ... @@ -406,23 +406,21 @@ class SPUDocumentTransformer:
406 406  
407 407 if not urls:
408 408 return
409   - try:
410   - vectors = self.image_encoder.encode_image_urls(urls, batch_size=8)
411   - if not vectors or len(vectors) != len(urls):
412   - return
413   - out = []
414   - for url, vec in zip(urls, vectors):
415   - if vec is None:
416   - continue
417   - if isinstance(vec, np.ndarray):
418   - vec = vec.astype(np.float32)
419   - out.append({"vector": vec.tolist(), "url": url})
420   - elif hasattr(vec, "tolist"):
421   - out.append({"vector": vec.tolist(), "url": url})
422   - if out:
423   - doc["image_embedding"] = out
424   - except Exception as e:
425   - logger.warning("Failed to generate image_embedding for SPU %s: %s", doc.get("spu_id"), e)
  409 + vectors = self.image_encoder.encode_image_urls(urls, batch_size=8)
  410 + if not vectors or len(vectors) != len(urls):
  411 + raise RuntimeError(
  412 + f"image_embedding response length mismatch for SPU {doc.get('spu_id')}: "
  413 + f"expected {len(urls)}, got {0 if vectors is None else len(vectors)}"
  414 + )
  415 + out = []
  416 + for url, vec in zip(urls, vectors):
  417 + arr = np.asarray(vec, dtype=np.float32)
  418 + if arr.ndim != 1 or arr.size == 0 or not np.isfinite(arr).all():
  419 + raise RuntimeError(
  420 + f"Invalid image embedding for SPU {doc.get('spu_id')} and URL {url}"
  421 + )
  422 + out.append({"vector": arr.tolist(), "url": url})
  423 + doc["image_embedding"] = out
426 424  
427 425 def _process_skus(
428 426 self,
... ... @@ -722,21 +720,14 @@ class SPUDocumentTransformer:
722 720 logger.debug(f"No title text available for embedding, SPU: {doc.get('spu_id')}")
723 721 return
724 722  
725   - try:
726   - # 使用文本向量编码器生成 embedding
727   - # encode方法返回numpy数组,形状为(n, 1024)
728   - embeddings = self.encoder.encode(title_text)
729   -
730   - if embeddings is not None and len(embeddings) > 0:
731   - # 取第一个embedding(因为只传了一个文本)
732   - embedding = embeddings[0]
733   - if not isinstance(embedding, np.ndarray):
734   - logger.warning(f"Embedding is None/invalid for title: {title_text[:50]}...")
735   - return
736   - # 转换为列表格式(ES需要)
737   - doc['title_embedding'] = embedding.tolist()
738   - logger.debug(f"Generated title_embedding for SPU: {doc.get('spu_id')}, title: {title_text[:50]}...")
739   - else:
740   - logger.warning(f"Failed to generate embedding for title: {title_text[:50]}...")
741   - except Exception as e:
742   - logger.error(f"Error generating title_embedding for SPU {doc.get('spu_id')}: {e}", exc_info=True)
  723 + # 使用文本向量编码器生成 embedding
  724 + # encode方法返回numpy数组,形状为(n, d)
  725 + embeddings = self.encoder.encode(title_text)
  726 + if embeddings is None or len(embeddings) == 0:
  727 + raise RuntimeError(f"Failed to generate title embedding for SPU {doc.get('spu_id')}")
  728 +
  729 + embedding = np.asarray(embeddings[0], dtype=np.float32)
  730 + if embedding.ndim != 1 or embedding.size == 0 or not np.isfinite(embedding).all():
  731 + raise RuntimeError(f"Invalid title embedding for SPU {doc.get('spu_id')}")
  732 + doc['title_embedding'] = embedding.tolist()
  733 + logger.debug(f"Generated title_embedding for SPU: {doc.get('spu_id')}, title: {title_text[:50]}...")
... ...
indexer/incremental_service.py
... ... @@ -33,9 +33,9 @@ class IncrementalIndexerService:
33 33 logger.info(f"Preloaded {len(self.category_id_to_name)} category mappings")
34 34  
35 35 # 缓存:避免频繁增量请求重复加载 config / 构造 transformer
36   - # NOTE: 为避免“首请求”懒加载导致超时,尽量在进程启动阶段完成初始化:
  36 + # 启动阶段强校验初始化:
37 37 # - config.yaml 加载
38   - # - translator / embedding / image encoder provider 初始化(best-effort)
  38 + # - translator / embedding / image encoder provider 初始化
39 39 self._config: Optional[Any] = None
40 40 self._config_lock = threading.Lock()
41 41 self._translator: Optional[Any] = None
... ... @@ -50,59 +50,35 @@ class IncrementalIndexerService:
50 50 self._transformer_cache_lock = threading.Lock()
51 51  
52 52 def _eager_init(self) -> None:
53   - """Best-effort eager initialization to reduce first-request latency."""
54   - try:
55   - self._config = ConfigLoader("config/config.yaml").load_config()
56   - except Exception as e:
57   - logger.warning("Failed to eagerly load config/config.yaml: %s", e, exc_info=True)
58   - self._config = None
59   - return
  53 + """Strict eager initialization. Any dependency failure should fail fast."""
  54 + self._config = ConfigLoader("config/config.yaml").load_config()
  55 + self._translation_prompts = getattr(self._config.query_config, "translation_prompts", {}) or {}
  56 + self._searchable_option_dimensions = (
  57 + getattr(self._config.spu_config, "searchable_option_dimensions", None)
  58 + or ["option1", "option2", "option3"]
  59 + )
60 60  
61   - try:
62   - self._translation_prompts = getattr(self._config.query_config, "translation_prompts", {}) or {}
63   - self._searchable_option_dimensions = (
64   - getattr(self._config.spu_config, "searchable_option_dimensions", None)
65   - or ["option1", "option2", "option3"]
66   - )
67   - except Exception:
68   - self._translation_prompts = {}
69   - self._searchable_option_dimensions = ["option1", "option2", "option3"]
  61 + from providers import create_translation_provider
70 62  
71   - # Translator provider (best-effort)
72   - try:
73   - from providers import create_translation_provider
  63 + self._translator = create_translation_provider(self._config.query_config)
74 64  
75   - self._translator = create_translation_provider(self._config.query_config)
76   - except Exception as e:
77   - logger.warning("Failed to initialize translation provider at startup: %s", e)
78   - self._translator = None
79   -
80   - # Text embedding encoder (best-effort)
  65 + # Text embedding encoder (strict when enabled)
81 66 if bool(getattr(self._config.query_config, "enable_text_embedding", False)):
82   - try:
83   - from embeddings.text_encoder import TextEmbeddingEncoder
  67 + from embeddings.text_encoder import TextEmbeddingEncoder
84 68  
85   - self._shared_text_encoder = TextEmbeddingEncoder()
86   - except Exception as e:
87   - logger.warning("Failed to initialize TextEmbeddingEncoder at startup: %s", e)
88   - self._shared_text_encoder = None
  69 + self._shared_text_encoder = TextEmbeddingEncoder()
  70 + else:
  71 + self._shared_text_encoder = None
89 72  
90   - # Image embedding encoder (best-effort; may be unavailable if embedding service not running)
91   - try:
92   - from embeddings.image_encoder import CLIPImageEncoder
  73 + # Image embedding encoder (strict)
  74 + from embeddings.image_encoder import CLIPImageEncoder
93 75  
94   - self._shared_image_encoder = CLIPImageEncoder()
95   - except Exception as e:
96   - logger.debug("Image encoder not available for indexer startup: %s", e)
97   - self._shared_image_encoder = None
  76 + self._shared_image_encoder = CLIPImageEncoder()
98 77  
99 78 def _get_config(self) -> Any:
100 79 """Load config once per process (thread-safe)."""
101   - if self._config is not None:
102   - return self._config
103   - with self._config_lock:
104   - if self._config is None:
105   - self._config = ConfigLoader("config/config.yaml").load_config()
  80 + if self._config is None:
  81 + raise RuntimeError("Indexer config is not initialized")
106 82 return self._config
107 83  
108 84 def _get_transformer_bundle(self, tenant_id: str) -> Tuple[Any, Optional[Any], bool]:
... ... @@ -121,32 +97,13 @@ class IncrementalIndexerService:
121 97 config = self._get_config()
122 98 enable_embedding = bool(getattr(config.query_config, "enable_text_embedding", False))
123 99  
124   - # Use shared encoders/providers preloaded at startup when可用;
125   - # 若启动时初始化失败,则在首次请求时做一次兜底初始化,避免永久禁用。
126 100 encoder: Optional[Any] = self._shared_text_encoder if enable_embedding else None
127 101 if enable_embedding and encoder is None:
128   - try:
129   - from embeddings.text_encoder import TextEmbeddingEncoder
130   -
131   - encoder = TextEmbeddingEncoder()
132   - self._shared_text_encoder = encoder
133   - logger.info("TextEmbeddingEncoder lazily initialized in _get_transformer_bundle")
134   - except Exception as e:
135   - logger.warning("Failed to lazily initialize TextEmbeddingEncoder for tenant_id=%s: %s", tenant_id, e)
136   - encoder = None
137   - enable_embedding = False
  102 + raise RuntimeError("Text embedding is enabled but TextEmbeddingEncoder is not initialized")
138 103  
139 104 image_encoder: Optional[Any] = self._shared_image_encoder
140 105 if image_encoder is None:
141   - try:
142   - from embeddings.image_encoder import CLIPImageEncoder
143   -
144   - image_encoder = CLIPImageEncoder()
145   - self._shared_image_encoder = image_encoder
146   - logger.info("CLIPImageEncoder lazily initialized in _get_transformer_bundle")
147   - except Exception as e:
148   - logger.debug("Image encoder not available for indexer (lazy init): %s", e)
149   - image_encoder = None
  106 + raise RuntimeError("CLIPImageEncoder is not initialized")
150 107  
151 108 transformer = create_document_transformer(
152 109 category_id_to_name=self.category_id_to_name,
... ... @@ -157,7 +114,7 @@ class IncrementalIndexerService:
157 114 encoder=encoder,
158 115 enable_title_embedding=False, # batch fill later
159 116 image_encoder=image_encoder,
160   - enable_image_embedding=(image_encoder is not None),
  117 + enable_image_embedding=True,
161 118 config=config,
162 119 )
163 120  
... ... @@ -236,14 +193,13 @@ class IncrementalIndexerService:
236 193 title_text = str(v)
237 194 break
238 195 if title_text and str(title_text).strip():
239   - try:
240   - embeddings = encoder.encode(title_text)
241   - if embeddings is not None and len(embeddings) > 0:
242   - emb0 = embeddings[0]
243   - if isinstance(emb0, np.ndarray):
244   - doc["title_embedding"] = emb0.tolist()
245   - except Exception as e:
246   - logger.warning(f"Failed to generate embedding for spu_id={spu_id}: {e}")
  196 + embeddings = encoder.encode(title_text)
  197 + if embeddings is None or len(embeddings) == 0:
  198 + raise RuntimeError(f"Failed to generate title embedding for spu_id={spu_id}")
  199 + emb0 = np.asarray(embeddings[0], dtype=np.float32)
  200 + if emb0.ndim != 1 or emb0.size == 0 or not np.isfinite(emb0).all():
  201 + raise RuntimeError(f"Invalid title embedding for spu_id={spu_id}")
  202 + doc["title_embedding"] = emb0.tolist()
247 203  
248 204 return doc
249 205  
... ... @@ -678,14 +634,20 @@ class IncrementalIndexerService:
678 634 title_doc_indices.append(i)
679 635  
680 636 if title_texts:
681   - try:
682   - embeddings = encoder.encode_batch(title_texts, batch_size=32)
683   - for j, emb in enumerate(embeddings):
684   - doc_idx = title_doc_indices[j]
685   - if isinstance(emb, np.ndarray):
686   - documents[doc_idx][1]["title_embedding"] = emb.tolist()
687   - except Exception as e:
688   - logger.warning(f"[IncrementalIndexing] Batch embedding failed for tenant_id={tenant_id}: {e}", exc_info=True)
  637 + embeddings = encoder.encode_batch(title_texts, batch_size=32)
  638 + if embeddings is None or len(embeddings) != len(title_texts):
  639 + raise RuntimeError(
  640 + f"[IncrementalIndexing] Batch embedding length mismatch for tenant_id={tenant_id}: "
  641 + f"expected {len(title_texts)}, got {0 if embeddings is None else len(embeddings)}"
  642 + )
  643 + for j, emb in enumerate(embeddings):
  644 + vec = np.asarray(emb, dtype=np.float32)
  645 + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all():
  646 + raise RuntimeError(
  647 + f"[IncrementalIndexing] Invalid title embedding in batch for tenant_id={tenant_id}, index={j}"
  648 + )
  649 + doc_idx = title_doc_indices[j]
  650 + documents[doc_idx][1]["title_embedding"] = vec.tolist()
689 651  
690 652 logger.info(f"[IncrementalIndexing] Transformed {len(documents)}/{total_count} documents")
691 653  
... ... @@ -789,4 +751,3 @@ class IncrementalIndexerService:
789 751 "index_name": index_name,
790 752 "tenant_id": tenant_id
791 753 }
792   -
... ...
indexer/indexing_utils.py
... ... @@ -92,38 +92,29 @@ def create_document_transformer(
92 92 or (encoder is None and enable_title_embedding)
93 93 or config is None
94 94 ):
95   - try:
96   - if config is None:
97   - config_loader = ConfigLoader()
98   - config = config_loader.load_config()
99   -
100   - if searchable_option_dimensions is None:
101   - searchable_option_dimensions = config.spu_config.searchable_option_dimensions
102   -
103   - index_langs = tenant_config.get("index_languages") or []
104   - need_translator = len(index_langs) > 1
105   - if translator is None and need_translator:
106   - from providers import create_translation_provider
107   - translator = create_translation_provider(config.query_config)
108   -
109   - if translation_prompts is None:
110   - translation_prompts = config.query_config.translation_prompts
111   -
112   - # 初始化encoder(如果启用标题向量化且未提供encoder)
113   - if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding:
114   - try:
115   - from embeddings.text_encoder import TextEmbeddingEncoder
116   - encoder = TextEmbeddingEncoder()
117   - logger.info("TextEmbeddingEncoder initialized for title embedding")
118   - except Exception as e:
119   - logger.warning(f"Failed to initialize TextEmbeddingEncoder: {e}, title embedding will be disabled")
120   - enable_title_embedding = False
121   - except Exception as e:
122   - logger.warning(f"Failed to load config, using defaults: {e}")
123   - if searchable_option_dimensions is None:
124   - searchable_option_dimensions = ['option1', 'option2', 'option3']
125   - if translation_prompts is None:
126   - translation_prompts = {}
  95 + if config is None:
  96 + config_loader = ConfigLoader()
  97 + config = config_loader.load_config()
  98 +
  99 + if searchable_option_dimensions is None:
  100 + searchable_option_dimensions = config.spu_config.searchable_option_dimensions
  101 +
  102 + index_langs = tenant_config.get("index_languages") or []
  103 + need_translator = len(index_langs) > 1
  104 + if translator is None and need_translator:
  105 + from providers import create_translation_provider
  106 +
  107 + translator = create_translation_provider(config.query_config)
  108 +
  109 + if translation_prompts is None:
  110 + translation_prompts = config.query_config.translation_prompts
  111 +
  112 + # 初始化encoder(如果启用标题向量化且未提供encoder)
  113 + if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding:
  114 + from embeddings.text_encoder import TextEmbeddingEncoder
  115 +
  116 + encoder = TextEmbeddingEncoder()
  117 + logger.info("TextEmbeddingEncoder initialized for title embedding")
127 118  
128 119 return SPUDocumentTransformer(
129 120 category_id_to_name=category_id_to_name,
... ...
... ... @@ -22,6 +22,7 @@ from utils import ESClient
22 22 from search import Searcher
23 23 from suggestion import SuggestionIndexBuilder
24 24 from utils.db_connector import create_db_connection
  25 +from context.request_context import create_request_context
25 26  
26 27  
27 28 def cmd_serve(args):
... ... @@ -78,7 +79,8 @@ def cmd_search(args):
78 79 result = searcher.search(
79 80 query=args.query,
80 81 tenant_id=args.tenant_id,
81   - size=args.size
  82 + size=args.size,
  83 + context=create_request_context(),
82 84 )
83 85  
84 86 # Display results
... ...
providers/embedding.py
1   -"""
2   -Embedding provider - HTTP service (vllm reserved).
3   -
4   -Returns text/image encoders configured via services_config.
5   -"""
  1 +"""Embedding provider - HTTP service."""
6 2  
7 3 from __future__ import annotations
8 4  
... ... @@ -14,10 +10,7 @@ def create_embedding_provider() -&gt; &quot;EmbeddingProvider&quot;:
14 10 cfg = get_embedding_config()
15 11 provider = (cfg.provider or "http").strip().lower()
16 12 if provider != "http":
17   - import logging
18   - logging.getLogger(__name__).warning(
19   - "Unsupported embedding provider '%s', fallback to HTTP provider.", provider
20   - )
  13 + raise ValueError(f"Unsupported embedding provider: {provider}")
21 14 return EmbeddingProvider()
22 15  
23 16  
... ...
providers/rerank.py
1   -"""
2   -Rerank provider - HTTP service (vllm reserved).
3   -"""
  1 +"""Rerank provider - HTTP service."""
4 2  
5 3 from __future__ import annotations
6 4  
... ... @@ -61,8 +59,8 @@ def create_rerank_provider() -&gt; HttpRerankProvider:
61 59 cfg = get_rerank_config()
62 60 provider = (cfg.provider or "http").strip().lower()
63 61  
64   - if provider == "vllm":
65   - logger.warning("rerank provider 'vllm' is reserved, using HTTP.")
  62 + if provider != "http":
  63 + raise ValueError(f"Unsupported rerank provider: {provider}")
66 64  
67 65 url = get_rerank_service_url()
68 66 return HttpRerankProvider(service_url=url)
... ...
providers/translation.py
... ... @@ -158,19 +158,7 @@ def create_translation_provider(query_config: Any = None) -&gt; Any:
158 158 translation_context=getattr(qc, "translation_context", "e-commerce product search"),
159 159 )
160 160  
161   - logger.warning(
162   - "Unsupported translation provider '%s', fallback to direct.",
163   - provider,
164   - )
165   - from query.translator import Translator
166   - qc = query_config or _empty_query_config()
167   - return Translator(
168   - model=pc.get("model") or "qwen",
169   - api_key=getattr(qc, "translation_api_key", None),
170   - use_cache=True,
171   - glossary_id=getattr(qc, "translation_glossary_id", None),
172   - translation_context=getattr(qc, "translation_context", "e-commerce product search"),
173   - )
  161 + raise ValueError(f"Unsupported translation provider: {provider}")
174 162  
175 163  
176 164 def _empty_query_config() -> Any:
... ...
query/__init__.py
... ... @@ -2,15 +2,12 @@
2 2  
3 3 from .language_detector import LanguageDetector
4 4 from .translator import Translator
5   -from .translation_client import HttpTranslationClient, create_translation_client
6 5 from .query_rewriter import QueryRewriter, QueryNormalizer
7 6 from .query_parser import QueryParser, ParsedQuery
8 7  
9 8 __all__ = [
10 9 'LanguageDetector',
11 10 'Translator',
12   - 'HttpTranslationClient',
13   - 'create_translation_client',
14 11 'QueryRewriter',
15 12 'QueryNormalizer',
16 13 'QueryParser',
... ...
query/translation_client.py deleted
... ... @@ -1,20 +0,0 @@
1   -"""
2   -Translation client - delegates to providers.
3   -
4   -Deprecated: use providers.create_translation_provider() instead.
5   -Kept for backward compatibility.
6   -"""
7   -
8   -from __future__ import annotations
9   -
10   -from typing import Any
11   -
12   -from providers.translation import (
13   - HttpTranslationProvider as HttpTranslationClient,
14   - create_translation_provider,
15   -)
16   -
17   -
18   -def create_translation_client(query_config: Any) -> Any:
19   - """Backward compat: delegate to create_translation_provider."""
20   - return create_translation_provider(query_config)
requirements.txt
... ... @@ -35,6 +35,7 @@ requests&gt;=2.31.0
35 35 # Utilities
36 36 tqdm>=4.65.0
37 37 click>=8.1.0
  38 +setuptools<82
38 39  
39 40 # Testing
40 41 pytest>=7.4.0
... ...
scripts/create_venv.sh
... ... @@ -44,7 +44,8 @@ fi
44 44 # shellcheck disable=SC1091
45 45 source "${VENV_DIR}/bin/activate"
46 46  
47   -python -m pip install --upgrade pip setuptools wheel
  47 +python -m pip install --upgrade pip wheel
  48 +python -m pip install "setuptools<82"
48 49 python -m pip install -r requirements.txt
49 50  
50 51 if [[ "${INSTALL_ML:-0}" == "1" ]]; then
... ...
scripts/start_embedding_service.sh
... ... @@ -16,10 +16,27 @@ source ./activate.sh
16 16  
17 17 DEFAULT_EMBEDDING_SERVICE_HOST=$(python -c "from embeddings.config import CONFIG; print(CONFIG.HOST)")
18 18 DEFAULT_EMBEDDING_SERVICE_PORT=$(python -c "from embeddings.config import CONFIG; print(CONFIG.PORT)")
  19 +USE_CLIP_AS_SERVICE=$(python -c "from embeddings.config import CONFIG; print('1' if CONFIG.USE_CLIP_AS_SERVICE else '0')")
19 20  
20 21 EMBEDDING_SERVICE_HOST="${EMBEDDING_HOST:-${DEFAULT_EMBEDDING_SERVICE_HOST}}"
21 22 EMBEDDING_SERVICE_PORT="${EMBEDDING_PORT:-${DEFAULT_EMBEDDING_SERVICE_PORT}}"
22 23  
  24 +if [[ "${USE_CLIP_AS_SERVICE}" == "1" ]]; then
  25 + if ! python - <<'PY'
  26 +try:
  27 + import pkg_resources # noqa: F401
  28 +except Exception:
  29 + raise SystemExit(1)
  30 +PY
  31 + then
  32 + echo "ERROR: clip-as-service image embedding requires pkg_resources, but current venv is missing it." >&2
  33 + echo "Fix:" >&2
  34 + echo " python -m pip install 'setuptools<82'" >&2
  35 + echo "Then restart: ./scripts/start_embedding_service.sh" >&2
  36 + exit 1
  37 + fi
  38 +fi
  39 +
23 40 echo "========================================"
24 41 echo "Starting Local Embedding Service"
25 42 echo "========================================"
... ... @@ -36,4 +53,3 @@ exec python -m uvicorn embeddings.server:app \
36 53 --port "${EMBEDDING_SERVICE_PORT}" \
37 54 --workers 1
38 55  
39   -
... ...
scripts/test_build_docs_api.py
... ... @@ -29,6 +29,10 @@ except ImportError:
29 29 def build_sample_request():
30 30 """构造一条完整的 build-docs 请求体(对应 shoplazza_product_spu / sku / option 表结构)。"""
31 31 now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
  32 + sample_image_url = os.getenv(
  33 + "SAMPLE_IMAGE_URL",
  34 + "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg",
  35 + )
32 36 return {
33 37 "tenant_id": "162",
34 38 "items": [
... ... @@ -45,7 +49,7 @@ def build_sample_request():
45 49 "category_level": 2,
46 50 "category_path": "服装/上衣/T恤",
47 51 "fake_sales": 1280,
48   - "image_src": "https://example.com/img/tshirt.jpg",
  52 + "image_src": sample_image_url,
49 53 "tags": "T恤,纯棉,短袖,夏季",
50 54 "create_time": now,
51 55 "update_time": now,
... ...
scripts/trace_indexer_calls.sh 0 → 100755
... ... @@ -0,0 +1,74 @@
  1 +#!/bin/bash
  2 +#
  3 +# 排查「谁在调用索引服务」的脚本
  4 +# 用法: ./scripts/trace_indexer_calls.sh
  5 +#
  6 +
  7 +set -euo pipefail
  8 +
  9 +cd "$(dirname "$0")/.."
  10 +source ./activate.sh 2>/dev/null || true
  11 +
  12 +echo "=========================================="
  13 +echo "索引服务调用方排查"
  14 +echo "=========================================="
  15 +
  16 +INDEXER_PORT="${INDEXER_PORT:-6004}"
  17 +EMBEDDING_PORT="${EMBEDDING_PORT:-6005}"
  18 +
  19 +echo ""
  20 +echo "1. 监听端口 6004 的进程(Indexer 服务)"
  21 +echo "------------------------------------------"
  22 +if command -v lsof >/dev/null 2>&1; then
  23 + lsof -i :"${INDEXER_PORT}" 2>/dev/null || echo " (无进程监听或 lsof 无权限)"
  24 +else
  25 + ss -tlnp 2>/dev/null | grep ":${INDEXER_PORT}" || echo " (无进程监听)"
  26 +fi
  27 +
  28 +echo ""
  29 +echo "2. 连接到 6004 的客户端(谁在请求 Indexer)"
  30 +echo "------------------------------------------"
  31 +if command -v ss >/dev/null 2>&1; then
  32 + ss -tnp 2>/dev/null | grep ":${INDEXER_PORT}" || echo " (当前无活跃连接)"
  33 +elif command -v netstat >/dev/null 2>&1; then
  34 + netstat -tnp 2>/dev/null | grep ":${INDEXER_PORT}" || echo " (当前无活跃连接)"
  35 +else
  36 + echo " 请安装 ss 或 netstat"
  37 +fi
  38 +
  39 +echo ""
  40 +echo "3. 连接到 6005 的客户端(Indexer 会调用 Embedding 服务)"
  41 +echo "------------------------------------------"
  42 +if command -v ss >/dev/null 2>&1; then
  43 + ss -tnp 2>/dev/null | grep ":${EMBEDDING_PORT}" || echo " (当前无活跃连接)"
  44 +fi
  45 +
  46 +echo ""
  47 +echo "4. 检查定时任务(cron)"
  48 +echo "------------------------------------------"
  49 +(crontab -l 2>/dev/null | grep -i indexer) || echo " 当前用户无相关 cron"
  50 +if [ -d /etc/cron.d ]; then
  51 + grep -l -i indexer /etc/cron.d/* 2>/dev/null || true
  52 +fi
  53 +
  54 +echo ""
  55 +echo "5. 端口与逻辑说明"
  56 +echo "------------------------------------------"
  57 +echo " - Indexer 服务: 端口 ${INDEXER_PORT}"
  58 +echo " 启动: ./scripts/start_indexer.sh 或 python main.py serve-indexer"
  59 +echo " 接口: POST /indexer/reindex, POST /indexer/index, POST /indexer/build-docs 等"
  60 +echo ""
  61 +echo " - 调用方(文档说明): 外部 Java 程序或 curl 等 HTTP 客户端"
  62 +echo " 全量: curl -X POST http://localhost:${INDEXER_PORT}/indexer/reindex -d '{\"tenant_id\":\"170\",\"batch_size\":500}'"
  63 +echo " 增量: curl -X POST http://localhost:${INDEXER_PORT}/indexer/index -d '{\"tenant_id\":\"170\",\"spu_ids\":[\"123\"]}'"
  64 +echo ""
  65 +echo " - Indexer 内部会调用:"
  66 +echo " - Embedding 服务 (${EMBEDDING_PORT}): POST /embed/text"
  67 +echo " - Qwen API: dashscope.aliyuncs.com (翻译、LLM 分析)"
  68 +echo " - MySQL: 商品数据"
  69 +echo " - Elasticsearch: 写入索引"
  70 +echo ""
  71 +echo "6. 实时监控连接(按 Ctrl+C 停止)"
  72 +echo "------------------------------------------"
  73 +echo " 运行: watch -n 2 'ss -tnp | grep -E \":${INDEXER_PORT}|:${EMBEDDING_PORT}\"'"
  74 +echo ""
... ...
search/es_query_builder.py
... ... @@ -19,7 +19,6 @@ class ESQueryBuilder:
19 19  
20 20 def __init__(
21 21 self,
22   - index_name: str,
23 22 match_fields: List[str],
24 23 text_embedding_field: Optional[str] = None,
25 24 image_embedding_field: Optional[str] = None,
... ... @@ -33,7 +32,6 @@ class ESQueryBuilder:
33 32 Initialize query builder.
34 33  
35 34 Args:
36   - index_name: ES index name
37 35 match_fields: Fields to search for text matching
38 36 text_embedding_field: Field name for text embeddings
39 37 image_embedding_field: Field name for image embeddings
... ... @@ -43,7 +41,6 @@ class ESQueryBuilder:
43 41 default_language: Default language to use when detection fails or returns "unknown"
44 42 knn_boost: Boost value for KNN (embedding recall)
45 43 """
46   - self.index_name = index_name
47 44 self.match_fields = match_fields
48 45 self.text_embedding_field = text_embedding_field
49 46 self.image_embedding_field = image_embedding_field
... ... @@ -351,7 +348,6 @@ class ESQueryBuilder:
351 348 def _build_text_query(self, query_text: str) -> Dict[str, Any]:
352 349 """
353 350 Build simple text matching query (BM25).
354   - Legacy method - kept for backward compatibility.
355 351  
356 352 Args:
357 353 query_text: Query text
... ...
search/searcher.py
... ... @@ -17,7 +17,7 @@ from .es_query_builder import ESQueryBuilder
17 17 from config import SearchConfig
18 18 from config.tenant_config_loader import get_tenant_config_loader
19 19 from config.utils import get_match_fields_for_index
20   -from context.request_context import RequestContext, RequestContextStage, create_request_context
  20 +from context.request_context import RequestContext, RequestContextStage
21 21 from api.models import FacetResult, FacetValue, FacetConfig
22 22 from api.result_formatter import ResultFormatter
23 23 from indexer.mapping_generator import get_tenant_index_name
... ... @@ -107,9 +107,7 @@ class Searcher:
107 107 self.source_fields = config.query_config.source_fields or []
108 108  
109 109 # Query builder - simplified single-layer architecture
110   - # index_name is no longer needed in query builder since we use tenant-specific indices
111 110 self.query_builder = ESQueryBuilder(
112   - index_name="", # Not used, kept for backward compatibility
113 111 match_fields=self.match_fields,
114 112 text_embedding_field=self.text_embedding_field,
115 113 image_embedding_field=self.image_embedding_field,
... ... @@ -160,9 +158,8 @@ class Searcher:
160 158 Returns:
161 159 SearchResult object with formatted results
162 160 """
163   - # Create context if not provided (backward compatibility)
164 161 if context is None:
165   - context = create_request_context()
  162 + raise ValueError("context is required")
166 163  
167 164 # 根据租户配置决定翻译开关(离线/在线统一)
168 165 tenant_loader = get_tenant_config_loader()
... ...
tests/ci/test_service_api_contracts.py
... ... @@ -179,6 +179,7 @@ def indexer_client(monkeypatch):
179 179 import api.indexer_app as indexer_app
180 180 import api.routes.indexer as indexer_routes
181 181  
  182 + indexer_app.app.router.on_startup.clear()
182 183 monkeypatch.setattr(indexer_app, "init_indexer_service", lambda es_host="": None)
183 184 monkeypatch.setattr(indexer_routes, "get_bulk_indexing_service", lambda: _FakeBulkService())
184 185 monkeypatch.setattr(indexer_routes, "get_incremental_service", lambda: _FakeIncrementalService())
... ...
tests/test_cnclip_service.py
... ... @@ -6,21 +6,35 @@ CN-CLIP 服务测试脚本
6 6 测试 CN-CLIP 服务的文本和图像编码功能(使用 gRPC 协议)
7 7  
8 8 使用方法:
9   - python scripts/test_cnclip_service.py [PORT]
  9 + python tests/test_cnclip_service.py [PORT]
10 10  
11 11 参数:
12 12 PORT: 服务端口(默认:51000)
13 13 """
14 14  
15 15 import sys
  16 +import os
16 17  
17   -import pytest
  18 +import numpy as np
  19 +
  20 +# Skip clip_client version check (it imports pkg_resources in legacy path).
  21 +os.environ.setdefault("NO_VERSION_CHECK", "1")
  22 +
  23 +# Ensure vendored client is importable in direct `python tests/test_cnclip_service.py` mode.
  24 +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
  25 +VENDORED_CLIENT = os.path.join(ROOT, "third-party", "clip-as-service", "client")
  26 +if os.path.isdir(VENDORED_CLIENT) and VENDORED_CLIENT not in sys.path:
  27 + sys.path.insert(0, VENDORED_CLIENT)
18 28  
19 29 try:
20   - import numpy as np
21 30 from clip_client import Client
22   -except ImportError:
23   - pytest.skip("clip_client not installed (optional clip-as-service client)", allow_module_level=True)
  31 +except ImportError as e:
  32 + print("✗ 无法导入 clip_client。请先安装/暴露客户端依赖:")
  33 + print(" 1) pip install -e third-party/clip-as-service/client")
  34 + print(" 或")
  35 + print(" 2) export PYTHONPATH=third-party/clip-as-service/client:$PYTHONPATH")
  36 + print(f" 详细错误: {e}")
  37 + sys.exit(1)
24 38  
25 39  
26 40 def _test_encoding(client, test_name, inputs):
... ... @@ -72,6 +86,15 @@ def main():
72 86 # 创建客户端
73 87 try:
74 88 client = Client(grpc_url)
  89 + except ModuleNotFoundError as e:
  90 + if str(e) == "No module named 'pkg_resources'":
  91 + print("✗ 当前环境缺少 pkg_resources,clip_client/jina 无法初始化。")
  92 + print(" 建议使用专用环境运行:")
  93 + print(" .venv-cnclip/bin/python tests/test_cnclip_service.py 51000")
  94 + print(" 或在当前 .venv 安装兼容 setuptools(包含 pkg_resources)。")
  95 + sys.exit(1)
  96 + print(f"✗ 客户端创建失败: {e}")
  97 + sys.exit(1)
75 98 except Exception as e:
76 99 print(f"✗ 客户端创建失败: {e}")
77 100 sys.exit(1)
... ... @@ -118,4 +141,3 @@ def main():
118 141  
119 142 if __name__ == '__main__':
120 143 main()
121   -
... ...
tests/test_embedding_pipeline.py
... ... @@ -2,6 +2,7 @@ import pickle
2 2 from typing import Any, Dict, List, Optional
3 3  
4 4 import numpy as np
  5 +import pytest
5 6  
6 7 from config import (
7 8 FunctionScoreConfig,
... ... @@ -100,7 +101,7 @@ def test_text_embedding_encoder_response_alignment(monkeypatch):
100 101 def _fake_post(url, json, timeout):
101 102 assert url.endswith("/embed/text")
102 103 assert json == ["hello", "world"]
103   - return _FakeResponse([[0.1, 0.2], None])
  104 + return _FakeResponse([[0.1, 0.2], [0.3, 0.4]])
104 105  
105 106 monkeypatch.setattr("embeddings.text_encoder.requests.post", _fake_post)
106 107  
... ... @@ -110,7 +111,22 @@ def test_text_embedding_encoder_response_alignment(monkeypatch):
110 111 assert len(out) == 2
111 112 assert isinstance(out[0], np.ndarray)
112 113 assert out[0].shape == (2,)
113   - assert out[1] is None
  114 + assert isinstance(out[1], np.ndarray)
  115 + assert out[1].shape == (2,)
  116 +
  117 +
  118 +def test_text_embedding_encoder_raises_on_missing_vector(monkeypatch):
  119 + fake_redis = _FakeRedis()
  120 + monkeypatch.setattr("embeddings.text_encoder.redis.Redis", lambda **kwargs: fake_redis)
  121 +
  122 + def _fake_post(url, json, timeout):
  123 + return _FakeResponse([[0.1, 0.2], None])
  124 +
  125 + monkeypatch.setattr("embeddings.text_encoder.requests.post", _fake_post)
  126 +
  127 + encoder = TextEmbeddingEncoder(service_url="http://127.0.0.1:6005")
  128 + with pytest.raises(ValueError):
  129 + encoder.encode(["hello", "world"])
114 130  
115 131  
116 132 def test_text_embedding_encoder_cache_hit(monkeypatch):
... ... @@ -156,4 +172,3 @@ def test_query_parser_skips_query_vector_when_disabled():
156 172  
157 173 parsed = parser.parse("red dress", tenant_id="162", generate_vector=False)
158 174 assert parsed.query_vector is None
159   -
... ...