Commit ed94866621f769cb80425846a68db4a01a9f5caa
1 parent
950a640e
tidy
Showing
33 changed files
with
440 additions
and
594 deletions
Show diff stats
activate.sh
| 1 | 1 | #!/bin/bash |
| 2 | 2 | # |
| 3 | -# Unified environment activator (venv preferred, conda fallback). | |
| 4 | -# | |
| 5 | 3 | # Usage: |
| 6 | 4 | # source activate.sh |
| 7 | 5 | # |
| 8 | -# Priority: | |
| 6 | +# Required: | |
| 9 | 7 | # 1) ./.venv (Python venv) |
| 10 | -# 2) conda env "searchengine" (legacy) | |
| 11 | 8 | # |
| 12 | 9 | |
| 13 | 10 | # Must be sourced |
| ... | ... | @@ -18,28 +15,16 @@ fi |
| 18 | 15 | |
| 19 | 16 | PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" |
| 20 | 17 | |
| 21 | -# 1) venv (preferred) | |
| 18 | +# 1) venv (required) | |
| 22 | 19 | VENV_ACTIVATE="${PROJECT_ROOT}/.venv/bin/activate" |
| 23 | 20 | if [[ -f "${VENV_ACTIVATE}" ]]; then |
| 24 | 21 | # shellcheck disable=SC1090 |
| 25 | 22 | source "${VENV_ACTIVATE}" |
| 26 | 23 | ENV_KIND="venv" |
| 27 | 24 | else |
| 28 | - # 2) conda fallback (legacy) | |
| 29 | - # 新机器部署:可设置 CONDA_ROOT 指向本机 Conda 路径 | |
| 30 | - # 例如你的 conda 是 ~/anaconda3/bin/conda,则 export CONDA_ROOT=$HOME/anaconda3 | |
| 31 | - CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" | |
| 32 | - if [[ -f "${CONDA_ROOT}/etc/profile.d/conda.sh" ]]; then | |
| 33 | - # shellcheck disable=SC1091 | |
| 34 | - source "${CONDA_ROOT}/etc/profile.d/conda.sh" | |
| 35 | - conda activate searchengine | |
| 36 | - ENV_KIND="conda" | |
| 37 | - else | |
| 38 | - echo "ERROR: No .venv found and conda.sh not found at ${CONDA_ROOT}/etc/profile.d/conda.sh" >&2 | |
| 39 | - echo " - Create venv: ./scripts/create_venv.sh" >&2 | |
| 40 | - echo " - Or set CONDA_ROOT to your conda install path" >&2 | |
| 41 | - return 1 | |
| 42 | - fi | |
| 25 | + echo "ERROR: No .venv found at ${VENV_ACTIVATE}" >&2 | |
| 26 | + echo " - Create venv: ./scripts/create_venv.sh" >&2 | |
| 27 | + return 1 | |
| 43 | 28 | fi |
| 44 | 29 | |
| 45 | 30 | # 如果需要加载 .env 中的环境变量 | ... | ... |
api/indexer_app.py
| ... | ... | @@ -124,12 +124,10 @@ def init_indexer_service(es_host: str = "http://localhost:9200"): |
| 124 | 124 | ] |
| 125 | 125 | if not value |
| 126 | 126 | ] |
| 127 | - logger.warning( | |
| 128 | - "Database config incomplete for indexer, services will not be available. " | |
| 127 | + raise RuntimeError( | |
| 128 | + "Database config incomplete for indexer. " | |
| 129 | 129 | f"Missing: {', '.join(missing)}" |
| 130 | 130 | ) |
| 131 | - _incremental_service = None | |
| 132 | - _bulk_indexing_service = None | |
| 133 | 131 | |
| 134 | 132 | elapsed = time.time() - start_time |
| 135 | 133 | logger.info(f"Indexer service ready! (took {elapsed:.2f}s)") |
| ... | ... | @@ -180,14 +178,17 @@ async def startup_event(): |
| 180 | 178 | # If no explicit tenants configured, skip warmup. |
| 181 | 179 | if tenants: |
| 182 | 180 | warm = _incremental_service.warmup_transformers(tenants) |
| 181 | + if warm.get("failed"): | |
| 182 | + raise RuntimeError(f"Indexer warmup failed: {warm['failed']}") | |
| 183 | 183 | logger.info("Indexer warmup completed: %s", warm) |
| 184 | 184 | else: |
| 185 | 185 | logger.info("Indexer warmup skipped (no tenant ids in config.tenant_config.tenants)") |
| 186 | 186 | except Exception as e: |
| 187 | - logger.warning("Indexer warmup failed (service still starts): %s", e, exc_info=True) | |
| 187 | + logger.error("Indexer warmup failed: %s", e, exc_info=True) | |
| 188 | + raise | |
| 188 | 189 | except Exception as e: |
| 189 | 190 | logger.error(f"Failed to initialize indexer service: {e}", exc_info=True) |
| 190 | - logger.warning("Indexer service will start but may not function correctly") | |
| 191 | + raise | |
| 191 | 192 | |
| 192 | 193 | |
| 193 | 194 | @app.on_event("shutdown") |
| ... | ... | @@ -215,7 +216,7 @@ async def global_exception_handler(request: Request, exc: Exception): |
| 215 | 216 | async def health_check(): |
| 216 | 217 | """Simple health check for indexer service.""" |
| 217 | 218 | try: |
| 218 | - # ensure ES is reachable (best-effort) | |
| 219 | + # ensure ES is reachable | |
| 219 | 220 | if _es_client is None: |
| 220 | 221 | raise RuntimeError("ES client is not initialized") |
| 221 | 222 | return { |
| ... | ... | @@ -262,4 +263,3 @@ if __name__ == "__main__": |
| 262 | 263 | port=args.port, |
| 263 | 264 | reload=args.reload, |
| 264 | 265 | ) |
| 265 | - | ... | ... |
api/routes/indexer.py
| ... | ... | @@ -244,31 +244,19 @@ async def build_docs(request: BuildDocsRequest): |
| 244 | 244 | title_text = str(v) |
| 245 | 245 | break |
| 246 | 246 | if title_text and str(title_text).strip(): |
| 247 | - try: | |
| 248 | - import numpy as np | |
| 249 | - | |
| 250 | - embeddings = encoder.encode(title_text) | |
| 251 | - if embeddings is not None and len(embeddings) > 0: | |
| 252 | - emb0 = embeddings[0] | |
| 253 | - if isinstance(emb0, np.ndarray) and emb0.size > 0: | |
| 254 | - doc["title_embedding"] = emb0.tolist() | |
| 255 | - else: | |
| 256 | - logger.warning( | |
| 257 | - "build-docs: title_embedding skipped (encoder returned None/invalid for title: %s...)", | |
| 258 | - title_text[:50], | |
| 259 | - ) | |
| 260 | - else: | |
| 261 | - logger.warning( | |
| 262 | - "build-docs: title_embedding skipped (encoder returned empty for title: %s...)", | |
| 263 | - title_text[:50], | |
| 264 | - ) | |
| 265 | - except Exception as e: | |
| 266 | - logger.warning( | |
| 267 | - "build-docs: title_embedding failed for spu_id=%s: %s", | |
| 268 | - doc.get("spu_id"), | |
| 269 | - e, | |
| 247 | + import numpy as np | |
| 248 | + | |
| 249 | + embeddings = encoder.encode(title_text) | |
| 250 | + if embeddings is None or len(embeddings) == 0: | |
| 251 | + raise RuntimeError( | |
| 252 | + f"title_embedding empty for spu_id={doc.get('spu_id')}" | |
| 253 | + ) | |
| 254 | + emb0 = np.asarray(embeddings[0], dtype=np.float32) | |
| 255 | + if emb0.ndim != 1 or emb0.size == 0 or not np.isfinite(emb0).all(): | |
| 256 | + raise RuntimeError( | |
| 257 | + f"title_embedding invalid for spu_id={doc.get('spu_id')}" | |
| 270 | 258 | ) |
| 271 | - # 构建 doc 接口不因为 embedding 失败而整体失败 | |
| 259 | + doc["title_embedding"] = emb0.tolist() | |
| 272 | 260 | |
| 273 | 261 | docs.append(doc) |
| 274 | 262 | except Exception as e: |
| ... | ... | @@ -486,4 +474,3 @@ async def indexer_health_check(): |
| 486 | 474 | except Exception as e: |
| 487 | 475 | logger.error(f"Error checking indexer health: {e}", exc_info=True) |
| 488 | 476 | return {"status": "error", "message": str(e)} |
| 489 | - | ... | ... |
config/__init__.py
| ... | ... | @@ -13,19 +13,13 @@ from .config_loader import ( |
| 13 | 13 | FunctionScoreConfig, |
| 14 | 14 | RerankConfig, |
| 15 | 15 | ConfigLoader, |
| 16 | - ConfigurationError, | |
| 17 | - load_tenant_config | |
| 16 | + ConfigurationError | |
| 18 | 17 | ) |
| 19 | 18 | |
| 20 | 19 | from .utils import ( |
| 21 | 20 | get_match_fields_for_index, |
| 22 | 21 | get_domain_fields |
| 23 | 22 | ) |
| 24 | -from .service_endpoints import ( | |
| 25 | - resolve_translation_service_url, | |
| 26 | - resolve_embedding_service_url, | |
| 27 | - resolve_reranker_service_url, | |
| 28 | -) | |
| 29 | 23 | from .services_config import ( |
| 30 | 24 | get_translation_config, |
| 31 | 25 | get_embedding_config, |
| ... | ... | @@ -50,12 +44,8 @@ __all__ = [ |
| 50 | 44 | # Loader and utilities |
| 51 | 45 | 'ConfigLoader', |
| 52 | 46 | 'ConfigurationError', |
| 53 | - 'load_tenant_config', | |
| 54 | 47 | 'get_match_fields_for_index', |
| 55 | 48 | 'get_domain_fields', |
| 56 | - 'resolve_translation_service_url', | |
| 57 | - 'resolve_embedding_service_url', | |
| 58 | - 'resolve_reranker_service_url', | |
| 59 | 49 | 'get_translation_config', |
| 60 | 50 | 'get_embedding_config', |
| 61 | 51 | 'get_rerank_config', | ... | ... |
config/config_loader.py
| ... | ... | @@ -433,17 +433,3 @@ class ConfigLoader: |
| 433 | 433 | result["example"] = index.example |
| 434 | 434 | |
| 435 | 435 | return result |
| 436 | - | |
| 437 | - | |
| 438 | -def load_tenant_config(tenant_id: Optional[str] = None) -> SearchConfig: | |
| 439 | - """ | |
| 440 | - Load tenant configuration (backward compatibility wrapper). | |
| 441 | - | |
| 442 | - Args: | |
| 443 | - tenant_id: Ignored (kept for backward compatibility) | |
| 444 | - | |
| 445 | - Returns: | |
| 446 | - SearchConfig loaded from config/config.yaml | |
| 447 | - """ | |
| 448 | - loader = ConfigLoader() | |
| 449 | - return loader.load_config() | ... | ... |
config/service_endpoints.py deleted
| ... | ... | @@ -1,12 +0,0 @@ |
| 1 | -""" | |
| 2 | -Endpoint resolvers - delegate to services_config. | |
| 3 | - | |
| 4 | -Deprecated: use config.services_config directly. | |
| 5 | -Kept for backward compatibility. | |
| 6 | -""" | |
| 7 | - | |
| 8 | -from .services_config import ( | |
| 9 | - get_translation_base_url as resolve_translation_service_url, | |
| 10 | - get_embedding_base_url as resolve_embedding_service_url, | |
| 11 | - get_rerank_service_url as resolve_reranker_service_url, | |
| 12 | -) |
docs/DEVELOPER_GUIDE.md
| ... | ... | @@ -216,9 +216,9 @@ docs/ # 文档(含本指南) |
| 216 | 216 | - 索引结构以 `mappings/search_products.json` 为唯一来源;indexer 产出的 doc 必须与该 mapping 一致。 |
| 217 | 217 | - 查询侧使用的字段名、多语言后缀(.zh/.en)、嵌套路径等与 mapping 保持一致;新增字段时同步更新 mapping 与查询/分面/过滤逻辑。 |
| 218 | 218 | |
| 219 | -### 5.7 错误与降级 | |
| 219 | +### 5.7 错误处理 | |
| 220 | 220 | |
| 221 | -- 外部能力(翻译、向量、重排)调用失败时,应有明确降级策略(如跳过向量、仅用 BM25、重排失败时保留 ES 顺序),并打日志便于排查;不因单一能力不可用导致整请求失败。 | |
| 221 | +- 外部能力(翻译、向量、重排)调用失败时应立即报错并中止请求,禁止静默降级;通过日志与监控尽早暴露问题并修复。 | |
| 222 | 222 | |
| 223 | 223 | --- |
| 224 | 224 | |
| ... | ... | @@ -315,11 +315,11 @@ services: |
| 315 | 315 | 5. **调用方**:无需修改;仅部署时启动使用新后端的 reranker 服务即可。 |
| 316 | 316 | 6. **文档与依赖**:在 `reranker/README.md` 或 docs 中说明依赖(如 vllm)、显存建议;可选依赖放入 `requirements_ml.txt` 或 extra。 |
| 317 | 317 | |
| 318 | -### 7.7 与现有配置的兼容说明 | |
| 318 | +### 7.7 配置一致性说明 | |
| 319 | 319 | |
| 320 | -- **reranker**:当前 `reranker/config.py` 的 BGE 相关默认值可保留为兜底,或将默认值迁移到 `config.yaml` 的 `services.rerank.backends.bge`,由 config 只读环境变量与 YAML。 | |
| 321 | -- **embeddings**:`embeddings/config.py` 的文本/图片及 clip-as-service 开关与 `services.embedding` 的 URL 分离;后续多种后端可在 `services.embedding.backends` 中增加条目。 | |
| 322 | -- **环境变量**:所有能力均支持环境变量覆盖(如 `RERANKER_SERVICE_URL`、`RERANK_BACKEND`、`EMBEDDING_SERVICE_URL`),便于多环境部署。 | |
| 320 | +- **单一路径**:Provider 和 backend 必须由 `config/config.yaml` 的 `services` 块显式指定;未知配置应直接报错。 | |
| 321 | +- **无兼容回退**:不保留“旧配置自动推导/兜底默认值”机制,避免静默行为偏差。 | |
| 322 | +- **环境变量覆盖**:允许环境变量覆盖(如 `RERANKER_SERVICE_URL`、`RERANK_BACKEND`、`EMBEDDING_SERVICE_URL`),但覆盖后仍需满足合法性校验。 | |
| 323 | 323 | |
| 324 | 324 | --- |
| 325 | 325 | |
| ... | ... | @@ -346,7 +346,7 @@ services: |
| 346 | 346 | |
| 347 | 347 | ### 8.4 日志与可观测性 |
| 348 | 348 | |
| 349 | -- 关键路径(请求入口、外部调用、失败降级)打日志;日志级别合理(如 debug 用于详细参数,info 用于流程,warning 用于降级)。 | |
| 349 | +- 关键路径(请求入口、外部调用、失败报错)打日志;日志级别合理(如 debug 用于详细参数,info 用于流程,error 用于失败)。 | |
| 350 | 350 | - 对外接口的耗时、错误码、租户等可考虑结构化日志或后续接入监控,便于运维与排查。 |
| 351 | 351 | |
| 352 | 352 | --- | ... | ... |
docs/Usage-Guide.md
| ... | ... | @@ -29,7 +29,7 @@ |
| 29 | 29 | |
| 30 | 30 | #### 1. 安装 Python 依赖与激活环境 |
| 31 | 31 | |
| 32 | -**推荐**:使用项目根目录的 `activate.sh` 激活环境(会加载 `.env`)。目前推荐 venv(`.venv`);Conda 仅作为兼容回退(需要 `CONDA_ROOT`)。系统要求、Python 环境、生产凭证与 `.env` 模板见 [QUICKSTART.md](./QUICKSTART.md) §1.4–1.8。 | |
| 32 | +**推荐**:使用项目根目录的 `activate.sh` 激活环境(会加载 `.env`)。当前仅支持 venv(`.venv`),不存在 Conda 回退。系统要求、Python 环境、生产凭证与 `.env` 模板见 [QUICKSTART.md](./QUICKSTART.md) §1.4–1.8。 | |
| 33 | 33 | |
| 34 | 34 | ```bash |
| 35 | 35 | cd /data/saas-search |
| ... | ... | @@ -663,4 +663,3 @@ curl -X POST http://localhost:6002/search/ \ |
| 663 | 663 | |
| 664 | 664 | **文档版本**: v2.0 |
| 665 | 665 | **最后更新**: 2024-12 |
| 666 | - | ... | ... |
embeddings/__init__.py
| 1 | -""" | |
| 2 | -Embeddings module. | |
| 1 | +"""Embeddings module exports.""" | |
| 3 | 2 | |
| 4 | -Important: keep package import lightweight. | |
| 3 | +from .text_encoder import TextEmbeddingEncoder | |
| 4 | +from .image_encoder import CLIPImageEncoder | |
| 5 | 5 | |
| 6 | -Some callers do: | |
| 7 | - - `from embeddings import TextEmbeddingEncoder` | |
| 8 | - - `from embeddings import BgeEncoder` (deprecated alias) | |
| 9 | - - `from embeddings import CLIPImageEncoder` | |
| 10 | - | |
| 11 | -But the underlying implementations may import heavy optional deps (Pillow, torch, etc). | |
| 12 | -To avoid importing those at package import time (and to allow the embedding service to boot | |
| 13 | -without importing client code), we provide small lazy factories here. | |
| 14 | -""" | |
| 15 | - | |
| 16 | - | |
| 17 | -class TextEmbeddingEncoder(object): | |
| 18 | - """Lazy factory for `embeddings.text_encoder.TextEmbeddingEncoder`.""" | |
| 19 | - | |
| 20 | - def __new__(cls, *args, **kwargs): | |
| 21 | - from .text_encoder import TextEmbeddingEncoder as _Real | |
| 22 | - | |
| 23 | - return _Real(*args, **kwargs) | |
| 24 | - | |
| 25 | - | |
| 26 | -class BgeEncoder(TextEmbeddingEncoder): | |
| 27 | - """Deprecated backward-compatible alias for old class name.""" | |
| 28 | - | |
| 29 | - | |
| 30 | -class CLIPImageEncoder(object): | |
| 31 | - """Lazy factory for `embeddings.image_encoder.CLIPImageEncoder`.""" | |
| 32 | - | |
| 33 | - def __new__(cls, *args, **kwargs): | |
| 34 | - from .image_encoder import CLIPImageEncoder as _Real | |
| 35 | - | |
| 36 | - return _Real(*args, **kwargs) | |
| 37 | - | |
| 38 | - | |
| 39 | -__all__ = ["TextEmbeddingEncoder", "BgeEncoder", "CLIPImageEncoder"] | |
| 6 | +__all__ = [ | |
| 7 | + "TextEmbeddingEncoder", | |
| 8 | + "CLIPImageEncoder", | |
| 9 | +] | ... | ... |
embeddings/clip_as_service_encoder.py
| ... | ... | @@ -65,13 +65,21 @@ class ClipAsServiceImageEncoder: |
| 65 | 65 | self._server = server |
| 66 | 66 | self._batch_size = batch_size |
| 67 | 67 | self._show_progress = show_progress |
| 68 | - self._client = Client(server) | |
| 68 | + try: | |
| 69 | + self._client = Client(server) | |
| 70 | + except ModuleNotFoundError as e: | |
| 71 | + if str(e) == "No module named 'pkg_resources'": | |
| 72 | + raise RuntimeError( | |
| 73 | + "clip-as-service requires pkg_resources via jina/hubble. " | |
| 74 | + "Install compatible setuptools (<82) in current venv." | |
| 75 | + ) from e | |
| 76 | + raise | |
| 69 | 77 | |
| 70 | 78 | def encode_image_urls( |
| 71 | 79 | self, |
| 72 | 80 | urls: List[str], |
| 73 | 81 | batch_size: Optional[int] = None, |
| 74 | - ) -> List[Optional[np.ndarray]]: | |
| 82 | + ) -> List[np.ndarray]: | |
| 75 | 83 | """ |
| 76 | 84 | Encode a list of image URLs to vectors. |
| 77 | 85 | |
| ... | ... | @@ -80,42 +88,36 @@ class ClipAsServiceImageEncoder: |
| 80 | 88 | batch_size: override instance batch_size for this call. |
| 81 | 89 | |
| 82 | 90 | Returns: |
| 83 | - List of vectors (1024-dim float32) or None for failed items, same length as urls. | |
| 91 | + List of vectors (float32), same length as urls. | |
| 84 | 92 | """ |
| 85 | 93 | if not urls: |
| 86 | 94 | return [] |
| 87 | 95 | |
| 88 | 96 | normalized = [_normalize_image_url(u) for u in urls] |
| 89 | - valid_indices = [i for i, u in enumerate(normalized) if u] | |
| 90 | - if not valid_indices: | |
| 91 | - return [None] * len(urls) | |
| 92 | - | |
| 93 | - valid_urls = [normalized[i] for i in valid_indices] | |
| 94 | 97 | bs = batch_size if batch_size is not None else self._batch_size |
| 95 | - out: List[Optional[np.ndarray]] = [None] * len(urls) | |
| 96 | - | |
| 97 | - try: | |
| 98 | - # Client.encode(iterable of str) returns np.ndarray [N, D] for string input | |
| 99 | - arr = self._client.encode( | |
| 100 | - valid_urls, | |
| 101 | - batch_size=bs, | |
| 102 | - show_progress=self._show_progress, | |
| 98 | + invalid_indices = [i for i, u in enumerate(normalized) if not u] | |
| 99 | + if invalid_indices: | |
| 100 | + raise ValueError(f"Invalid empty image URL at indices: {invalid_indices}") | |
| 101 | + | |
| 102 | + # Client.encode(iterable of str) returns np.ndarray [N, D] for string input | |
| 103 | + arr = self._client.encode( | |
| 104 | + normalized, | |
| 105 | + batch_size=bs, | |
| 106 | + show_progress=self._show_progress, | |
| 107 | + ) | |
| 108 | + if arr is None or not hasattr(arr, "shape"): | |
| 109 | + raise RuntimeError("clip-as-service encode returned empty result") | |
| 110 | + if len(arr) != len(normalized): | |
| 111 | + raise RuntimeError( | |
| 112 | + f"clip-as-service encode length mismatch: expected {len(normalized)}, got {len(arr)}" | |
| 103 | 113 | ) |
| 104 | - if arr is not None and hasattr(arr, "shape") and len(arr) == len(valid_indices): | |
| 105 | - for j, idx in enumerate(valid_indices): | |
| 106 | - row = arr[j] | |
| 107 | - if row is not None and hasattr(row, "tolist"): | |
| 108 | - out[idx] = np.asarray(row, dtype=np.float32) | |
| 109 | - else: | |
| 110 | - out[idx] = np.array(row, dtype=np.float32) | |
| 111 | - else: | |
| 112 | - logger.warning( | |
| 113 | - "clip-as-service encode returned unexpected shape/length, " | |
| 114 | - "expected %d vectors", len(valid_indices) | |
| 115 | - ) | |
| 116 | - except Exception as e: | |
| 117 | - logger.warning("clip-as-service encode failed: %s", e, exc_info=True) | |
| 118 | 114 | |
| 115 | + out: List[np.ndarray] = [] | |
| 116 | + for row in arr: | |
| 117 | + vec = np.asarray(row, dtype=np.float32) | |
| 118 | + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all(): | |
| 119 | + raise RuntimeError("clip-as-service returned invalid embedding vector") | |
| 120 | + out.append(vec) | |
| 119 | 121 | return out |
| 120 | 122 | |
| 121 | 123 | def encode_image_from_url(self, url: str) -> Optional[np.ndarray]: | ... | ... |
embeddings/image_encoder.py
| ... | ... | @@ -48,16 +48,15 @@ class CLIPImageEncoder: |
| 48 | 48 | logger.error(f"CLIPImageEncoder service request failed: {e}", exc_info=True) |
| 49 | 49 | raise |
| 50 | 50 | |
| 51 | - def encode_image(self, image: Image.Image) -> Optional[np.ndarray]: | |
| 51 | + def encode_image(self, image: Image.Image) -> np.ndarray: | |
| 52 | 52 | """ |
| 53 | 53 | Encode image to embedding vector using network service. |
| 54 | 54 | |
| 55 | 55 | Note: This method is kept for compatibility but the service only works with URLs. |
| 56 | 56 | """ |
| 57 | - logger.warning("encode_image with PIL Image not supported by service, returning None") | |
| 58 | - return None | |
| 57 | + raise NotImplementedError("encode_image with PIL Image is not supported by embedding service") | |
| 59 | 58 | |
| 60 | - def encode_image_from_url(self, url: str) -> Optional[np.ndarray]: | |
| 59 | + def encode_image_from_url(self, url: str) -> np.ndarray: | |
| 61 | 60 | """ |
| 62 | 61 | Generate image embedding via network service using URL. |
| 63 | 62 | |
| ... | ... | @@ -65,24 +64,21 @@ class CLIPImageEncoder: |
| 65 | 64 | url: Image URL to process |
| 66 | 65 | |
| 67 | 66 | Returns: |
| 68 | - Embedding vector or None if failed | |
| 67 | + Embedding vector | |
| 69 | 68 | """ |
| 70 | - try: | |
| 71 | - response_data = self._call_service([url]) | |
| 72 | - if response_data and len(response_data) > 0 and response_data[0] is not None: | |
| 73 | - return np.array(response_data[0], dtype=np.float32) | |
| 74 | - logger.warning(f"No embedding for URL {url}") | |
| 75 | - return None | |
| 76 | - | |
| 77 | - except Exception as e: | |
| 78 | - logger.error(f"Failed to process image from URL {url}: {str(e)}", exc_info=True) | |
| 79 | - return None | |
| 69 | + response_data = self._call_service([url]) | |
| 70 | + if not response_data or len(response_data) != 1 or response_data[0] is None: | |
| 71 | + raise RuntimeError(f"No image embedding returned for URL: {url}") | |
| 72 | + vec = np.array(response_data[0], dtype=np.float32) | |
| 73 | + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all(): | |
| 74 | + raise RuntimeError(f"Invalid image embedding returned for URL: {url}") | |
| 75 | + return vec | |
| 80 | 76 | |
| 81 | 77 | def encode_batch( |
| 82 | 78 | self, |
| 83 | 79 | images: List[Union[str, Image.Image]], |
| 84 | 80 | batch_size: int = 8 |
| 85 | - ) -> List[Optional[np.ndarray]]: | |
| 81 | + ) -> List[np.ndarray]: | |
| 86 | 82 | """ |
| 87 | 83 | Encode a batch of images efficiently via network service. |
| 88 | 84 | |
| ... | ... | @@ -91,50 +87,31 @@ class CLIPImageEncoder: |
| 91 | 87 | batch_size: Batch size for processing (used for service requests) |
| 92 | 88 | |
| 93 | 89 | Returns: |
| 94 | - List of embeddings (or None for failed images) | |
| 90 | + List of embeddings | |
| 95 | 91 | """ |
| 96 | - # Initialize results with None for all images | |
| 97 | - results = [None] * len(images) | |
| 98 | - | |
| 99 | - # Filter out PIL Images since service only supports URLs | |
| 100 | - url_images = [] | |
| 101 | - url_indices = [] | |
| 102 | - | |
| 103 | 92 | for i, img in enumerate(images): |
| 104 | - if isinstance(img, str): | |
| 105 | - url_images.append(img) | |
| 106 | - url_indices.append(i) | |
| 107 | - elif isinstance(img, Image.Image): | |
| 108 | - logger.warning(f"PIL Image at index {i} not supported by service, returning None") | |
| 109 | - # results[i] is already None | |
| 110 | - | |
| 111 | - # Process URLs in batches | |
| 112 | - for i in range(0, len(url_images), batch_size): | |
| 113 | - batch_urls = url_images[i:i + batch_size] | |
| 114 | - batch_indices = url_indices[i:i + batch_size] | |
| 115 | - | |
| 116 | - try: | |
| 117 | - # Call service | |
| 118 | - response_data = self._call_service(batch_urls) | |
| 119 | - | |
| 120 | - # Process response (aligned list) | |
| 121 | - batch_results = [] | |
| 122 | - for j, url in enumerate(batch_urls): | |
| 123 | - if response_data and j < len(response_data) and response_data[j] is not None: | |
| 124 | - batch_results.append(np.array(response_data[j], dtype=np.float32)) | |
| 125 | - else: | |
| 126 | - logger.warning(f"Failed to encode URL {url}: no embedding") | |
| 127 | - batch_results.append(None) | |
| 128 | - | |
| 129 | - # Insert results at the correct positions | |
| 130 | - for j, result in enumerate(batch_results): | |
| 131 | - results[batch_indices[j]] = result | |
| 132 | - | |
| 133 | - except Exception as e: | |
| 134 | - logger.error(f"Batch processing failed: {e}", exc_info=True) | |
| 135 | - # Fill with None for this batch | |
| 136 | - for j in range(len(batch_urls)): | |
| 137 | - results[batch_indices[j]] = None | |
| 93 | + if isinstance(img, Image.Image): | |
| 94 | + raise NotImplementedError(f"PIL Image at index {i} is not supported by service") | |
| 95 | + if not isinstance(img, str) or not img.strip(): | |
| 96 | + raise ValueError(f"Invalid image URL/path at index {i}: {img!r}") | |
| 97 | + | |
| 98 | + results: List[np.ndarray] = [] | |
| 99 | + for i in range(0, len(images), batch_size): | |
| 100 | + batch_urls = [str(u).strip() for u in images[i:i + batch_size]] | |
| 101 | + response_data = self._call_service(batch_urls) | |
| 102 | + if not response_data or len(response_data) != len(batch_urls): | |
| 103 | + raise RuntimeError( | |
| 104 | + f"Image embedding response length mismatch: expected {len(batch_urls)}, " | |
| 105 | + f"got {0 if response_data is None else len(response_data)}" | |
| 106 | + ) | |
| 107 | + for j, url in enumerate(batch_urls): | |
| 108 | + embedding = response_data[j] | |
| 109 | + if embedding is None: | |
| 110 | + raise RuntimeError(f"No image embedding returned for URL: {url}") | |
| 111 | + vec = np.array(embedding, dtype=np.float32) | |
| 112 | + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all(): | |
| 113 | + raise RuntimeError(f"Invalid image embedding returned for URL: {url}") | |
| 114 | + results.append(vec) | |
| 138 | 115 | |
| 139 | 116 | return results |
| 140 | 117 | |
| ... | ... | @@ -142,7 +119,7 @@ class CLIPImageEncoder: |
| 142 | 119 | self, |
| 143 | 120 | urls: List[str], |
| 144 | 121 | batch_size: Optional[int] = None, |
| 145 | - ) -> List[Optional[np.ndarray]]: | |
| 122 | + ) -> List[np.ndarray]: | |
| 146 | 123 | """ |
| 147 | 124 | 与 ClipImageModel / ClipAsServiceImageEncoder 一致的接口,供索引器 document_transformer 调用。 |
| 148 | 125 | |
| ... | ... | @@ -151,6 +128,6 @@ class CLIPImageEncoder: |
| 151 | 128 | batch_size: 批大小(默认 8) |
| 152 | 129 | |
| 153 | 130 | Returns: |
| 154 | - 与 urls 等长的向量列表,失败为 None | |
| 131 | + 与 urls 等长的向量列表 | |
| 155 | 132 | """ |
| 156 | 133 | return self.encode_batch(urls, batch_size=batch_size or 8) | ... | ... |
embeddings/qwen3_model.py
| ... | ... | @@ -35,43 +35,15 @@ class Qwen3TextModel(object): |
| 35 | 35 | ) -> np.ndarray: |
| 36 | 36 | if device == "gpu": |
| 37 | 37 | device = "cuda" |
| 38 | - | |
| 39 | - # Try requested device, fallback to CPU if CUDA is unavailable/insufficient. | |
| 40 | - try: | |
| 41 | - if device == "cuda": | |
| 42 | - import torch | |
| 43 | - | |
| 44 | - if torch.cuda.is_available(): | |
| 45 | - free_memory = ( | |
| 46 | - torch.cuda.get_device_properties(0).total_memory | |
| 47 | - - torch.cuda.memory_allocated() | |
| 48 | - ) | |
| 49 | - if free_memory < 1024 * 1024 * 1024: # 1GB | |
| 50 | - device = "cpu" | |
| 51 | - else: | |
| 52 | - device = "cpu" | |
| 53 | - | |
| 54 | - self.model = self.model.to(device) | |
| 55 | - embeddings = self.model.encode( | |
| 56 | - sentences, | |
| 57 | - normalize_embeddings=normalize_embeddings, | |
| 58 | - device=device, | |
| 59 | - show_progress_bar=False, | |
| 60 | - batch_size=batch_size, | |
| 61 | - ) | |
| 62 | - return embeddings | |
| 63 | - except Exception: | |
| 64 | - if device != "cpu": | |
| 65 | - self.model = self.model.to("cpu") | |
| 66 | - embeddings = self.model.encode( | |
| 67 | - sentences, | |
| 68 | - normalize_embeddings=normalize_embeddings, | |
| 69 | - device="cpu", | |
| 70 | - show_progress_bar=False, | |
| 71 | - batch_size=batch_size, | |
| 72 | - ) | |
| 73 | - return embeddings | |
| 74 | - raise | |
| 38 | + self.model = self.model.to(device) | |
| 39 | + embeddings = self.model.encode( | |
| 40 | + sentences, | |
| 41 | + normalize_embeddings=normalize_embeddings, | |
| 42 | + device=device, | |
| 43 | + show_progress_bar=False, | |
| 44 | + batch_size=batch_size, | |
| 45 | + ) | |
| 46 | + return embeddings | |
| 75 | 47 | |
| 76 | 48 | def encode_batch( |
| 77 | 49 | self, |
| ... | ... | @@ -86,4 +58,3 @@ class Qwen3TextModel(object): |
| 86 | 58 | device=device, |
| 87 | 59 | normalize_embeddings=normalize_embeddings, |
| 88 | 60 | ) |
| 89 | - | ... | ... |
embeddings/server.py
| 1 | 1 | """ |
| 2 | 2 | Embedding service (FastAPI). |
| 3 | 3 | |
| 4 | -API (simple list-in, list-out; aligned by index; failures -> null): | |
| 5 | -- POST /embed/text body: ["text1", "text2", ...] -> [[...], null, ...] | |
| 6 | -- POST /embed/image body: ["url_or_path1", ...] -> [[...], null, ...] | |
| 4 | +API (simple list-in, list-out; aligned by index): | |
| 5 | +- POST /embed/text body: ["text1", "text2", ...] -> [[...], ...] | |
| 6 | +- POST /embed/image body: ["url_or_path1", ...] -> [[...], ...] | |
| 7 | 7 | """ |
| 8 | 8 | |
| 9 | 9 | import logging |
| ... | ... | @@ -11,7 +11,7 @@ import threading |
| 11 | 11 | from typing import Any, Dict, List, Optional |
| 12 | 12 | |
| 13 | 13 | import numpy as np |
| 14 | -from fastapi import FastAPI | |
| 14 | +from fastapi import FastAPI, HTTPException | |
| 15 | 15 | |
| 16 | 16 | from embeddings.config import CONFIG |
| 17 | 17 | from embeddings.protocols import ImageEncoderProtocol |
| ... | ... | @@ -51,9 +51,6 @@ def load_models(): |
| 51 | 51 | |
| 52 | 52 | |
| 53 | 53 | # Load image model: clip-as-service (recommended) or local CN-CLIP |
| 54 | - # IMPORTANT: failures here should NOT prevent the whole service from starting. | |
| 55 | - # If image model cannot be loaded, we keep `_image_model` as None and only | |
| 56 | - # disable /embed/image while keeping /embed/text fully functional. | |
| 57 | 54 | if open_image_model: |
| 58 | 55 | try: |
| 59 | 56 | if CONFIG.USE_CLIP_AS_SERVICE: |
| ... | ... | @@ -75,12 +72,8 @@ def load_models(): |
| 75 | 72 | ) |
| 76 | 73 | logger.info("Image model (local CN-CLIP) loaded successfully") |
| 77 | 74 | except Exception as e: |
| 78 | - logger.error( | |
| 79 | - "Failed to load image model; image embeddings will be disabled but text embeddings remain available: %s", | |
| 80 | - e, | |
| 81 | - exc_info=True, | |
| 82 | - ) | |
| 83 | - _image_model = None | |
| 75 | + logger.error("Failed to load image model: %s", e, exc_info=True) | |
| 76 | + raise | |
| 84 | 77 | |
| 85 | 78 | logger.info("All embedding models loaded successfully, service ready") |
| 86 | 79 | |
| ... | ... | @@ -109,70 +102,60 @@ def health() -> Dict[str, Any]: |
| 109 | 102 | def embed_text(texts: List[str]) -> List[Optional[List[float]]]: |
| 110 | 103 | if _text_model is None: |
| 111 | 104 | raise RuntimeError("Text model not loaded") |
| 112 | - out: List[Optional[List[float]]] = [None] * len(texts) | |
| 113 | - | |
| 114 | - indexed_texts: List[tuple] = [] | |
| 105 | + normalized: List[str] = [] | |
| 115 | 106 | for i, t in enumerate(texts): |
| 116 | - if t is None: | |
| 117 | - continue | |
| 118 | 107 | if not isinstance(t, str): |
| 119 | - t = str(t) | |
| 120 | - t = t.strip() | |
| 121 | - if not t: | |
| 122 | - continue | |
| 123 | - indexed_texts.append((i, t)) | |
| 124 | - | |
| 125 | - if not indexed_texts: | |
| 126 | - return out | |
| 127 | - | |
| 128 | - batch_texts = [t for _, t in indexed_texts] | |
| 129 | - try: | |
| 130 | - with _text_encode_lock: | |
| 131 | - embs = _text_model.encode_batch( | |
| 132 | - batch_texts, | |
| 133 | - batch_size=int(CONFIG.TEXT_BATCH_SIZE), | |
| 134 | - device=CONFIG.TEXT_DEVICE, | |
| 135 | - normalize_embeddings=bool(CONFIG.TEXT_NORMALIZE_EMBEDDINGS), | |
| 136 | - ) | |
| 137 | - for j, (idx, _t) in enumerate(indexed_texts): | |
| 138 | - out[idx] = _as_list(embs[j]) | |
| 139 | - except Exception: | |
| 140 | - # keep Nones | |
| 141 | - pass | |
| 108 | + raise HTTPException(status_code=400, detail=f"Invalid text at index {i}: must be string") | |
| 109 | + s = t.strip() | |
| 110 | + if not s: | |
| 111 | + raise HTTPException(status_code=400, detail=f"Invalid text at index {i}: empty string") | |
| 112 | + normalized.append(s) | |
| 113 | + | |
| 114 | + with _text_encode_lock: | |
| 115 | + embs = _text_model.encode_batch( | |
| 116 | + normalized, | |
| 117 | + batch_size=int(CONFIG.TEXT_BATCH_SIZE), | |
| 118 | + device=CONFIG.TEXT_DEVICE, | |
| 119 | + normalize_embeddings=bool(CONFIG.TEXT_NORMALIZE_EMBEDDINGS), | |
| 120 | + ) | |
| 121 | + if embs is None or len(embs) != len(normalized): | |
| 122 | + raise RuntimeError( | |
| 123 | + f"Text model response length mismatch: expected {len(normalized)}, " | |
| 124 | + f"got {0 if embs is None else len(embs)}" | |
| 125 | + ) | |
| 126 | + out: List[Optional[List[float]]] = [] | |
| 127 | + for i, emb in enumerate(embs): | |
| 128 | + vec = _as_list(emb) | |
| 129 | + if vec is None: | |
| 130 | + raise RuntimeError(f"Text model returned empty embedding for index {i}") | |
| 131 | + out.append(vec) | |
| 142 | 132 | return out |
| 143 | 133 | |
| 144 | 134 | |
| 145 | 135 | @app.post("/embed/image") |
| 146 | 136 | def embed_image(images: List[str]) -> List[Optional[List[float]]]: |
| 147 | 137 | if _image_model is None: |
| 148 | - # Graceful degradation: keep API shape but return all None | |
| 149 | - logger.warning("embed_image called but image model is not loaded; returning all None vectors") | |
| 150 | - return [None] * len(images) | |
| 151 | - out: List[Optional[List[float]]] = [None] * len(images) | |
| 152 | - | |
| 153 | - # Normalize inputs | |
| 154 | - urls = [] | |
| 155 | - indices = [] | |
| 138 | + raise RuntimeError("Image model not loaded") | |
| 139 | + urls: List[str] = [] | |
| 156 | 140 | for i, url_or_path in enumerate(images): |
| 157 | - if url_or_path is None: | |
| 158 | - continue | |
| 159 | 141 | if not isinstance(url_or_path, str): |
| 160 | - url_or_path = str(url_or_path) | |
| 161 | - url_or_path = url_or_path.strip() | |
| 162 | - if url_or_path: | |
| 163 | - urls.append(url_or_path) | |
| 164 | - indices.append(i) | |
| 165 | - | |
| 166 | - if not urls: | |
| 167 | - return out | |
| 142 | + raise HTTPException(status_code=400, detail=f"Invalid image at index {i}: must be string URL/path") | |
| 143 | + s = url_or_path.strip() | |
| 144 | + if not s: | |
| 145 | + raise HTTPException(status_code=400, detail=f"Invalid image at index {i}: empty URL/path") | |
| 146 | + urls.append(s) | |
| 168 | 147 | |
| 169 | 148 | with _image_encode_lock: |
| 170 | - try: | |
| 171 | - # Both ClipAsServiceImageEncoder and ClipImageModel implement encode_image_urls(urls, batch_size) | |
| 172 | - vectors = _image_model.encode_image_urls(urls, batch_size=CONFIG.IMAGE_BATCH_SIZE) | |
| 173 | - for j, idx in enumerate(indices): | |
| 174 | - out[idx] = _as_list(vectors[j] if j < len(vectors) else None) | |
| 175 | - except Exception: | |
| 176 | - for idx in indices: | |
| 177 | - out[idx] = None | |
| 149 | + vectors = _image_model.encode_image_urls(urls, batch_size=CONFIG.IMAGE_BATCH_SIZE) | |
| 150 | + if vectors is None or len(vectors) != len(urls): | |
| 151 | + raise RuntimeError( | |
| 152 | + f"Image model response length mismatch: expected {len(urls)}, " | |
| 153 | + f"got {0 if vectors is None else len(vectors)}" | |
| 154 | + ) | |
| 155 | + out: List[Optional[List[float]]] = [] | |
| 156 | + for i, vec in enumerate(vectors): | |
| 157 | + out_vec = _as_list(vec) | |
| 158 | + if out_vec is None: | |
| 159 | + raise RuntimeError(f"Image model returned empty embedding for index {i}") | |
| 160 | + out.append(out_vec) | |
| 178 | 161 | return out | ... | ... |
embeddings/text_encoder.py
| ... | ... | @@ -89,9 +89,8 @@ class TextEmbeddingEncoder: |
| 89 | 89 | batch_size: Batch size for processing (used for service requests) |
| 90 | 90 | |
| 91 | 91 | Returns: |
| 92 | - numpy array of dtype=object, where each element is either: | |
| 93 | - - np.ndarray (valid embedding vector) or | |
| 94 | - - None (no embedding available for that text) | |
| 92 | + numpy array of dtype=object,元素均为有效 np.ndarray 向量。 | |
| 93 | + 若任一输入无法生成向量,将直接抛出异常。 | |
| 95 | 94 | """ |
| 96 | 95 | # Convert single string to list |
| 97 | 96 | if isinstance(sentences, str): |
| ... | ... | @@ -101,8 +100,6 @@ class TextEmbeddingEncoder: |
| 101 | 100 | uncached_indices: List[int] = [] |
| 102 | 101 | uncached_texts: List[str] = [] |
| 103 | 102 | |
| 104 | - # Process response | |
| 105 | - # Each element can be np.ndarray or None (表示该文本没有可用的向量) | |
| 106 | 103 | embeddings: List[Optional[np.ndarray]] = [None] * len(sentences) |
| 107 | 104 | |
| 108 | 105 | for i, text in enumerate(sentences): |
| ... | ... | @@ -118,40 +115,27 @@ class TextEmbeddingEncoder: |
| 118 | 115 | |
| 119 | 116 | # If there are uncached texts, call service |
| 120 | 117 | if uncached_texts: |
| 121 | - try: | |
| 122 | - # Call service | |
| 123 | - response_data = self._call_service(request_data) | |
| 118 | + response_data = self._call_service(request_data) | |
| 124 | 119 | |
| 125 | - # Process response | |
| 126 | - for i, text in enumerate(uncached_texts): | |
| 127 | - original_idx = uncached_indices[i] | |
| 128 | - if response_data and i < len(response_data): | |
| 129 | - embedding = response_data[i] | |
| 130 | - else: | |
| 131 | - embedding = None | |
| 120 | + # Process response | |
| 121 | + for i, text in enumerate(uncached_texts): | |
| 122 | + original_idx = uncached_indices[i] | |
| 123 | + if response_data and i < len(response_data): | |
| 124 | + embedding = response_data[i] | |
| 125 | + else: | |
| 126 | + embedding = None | |
| 132 | 127 | |
| 133 | - if embedding is not None: | |
| 134 | - embedding_array = np.array(embedding, dtype=np.float32) | |
| 135 | - # Validate embedding from service - if invalid, treat as no result | |
| 136 | - if self._is_valid_embedding(embedding_array): | |
| 137 | - embeddings[original_idx] = embedding_array | |
| 138 | - # Cache the embedding | |
| 139 | - self._set_cached_embedding(text, "generic", embedding_array) | |
| 140 | - else: | |
| 141 | - logger.warning( | |
| 142 | - f"Invalid embedding returned from service for text {original_idx} " | |
| 143 | - f"(contains NaN/Inf or invalid shape), treating as no result. " | |
| 144 | - f"Text preview: {text[:50]}..." | |
| 145 | - ) | |
| 146 | - embeddings[original_idx] = None | |
| 128 | + if embedding is not None: | |
| 129 | + embedding_array = np.array(embedding, dtype=np.float32) | |
| 130 | + if self._is_valid_embedding(embedding_array): | |
| 131 | + embeddings[original_idx] = embedding_array | |
| 132 | + self._set_cached_embedding(text, "generic", embedding_array) | |
| 147 | 133 | else: |
| 148 | - logger.warning(f"No embedding found for text {original_idx}: {text[:50]}...") | |
| 149 | - embeddings[original_idx] = None | |
| 150 | - | |
| 151 | - except Exception as e: | |
| 152 | - logger.error(f"Failed to encode texts: {e}", exc_info=True) | |
| 153 | - # 出错时不要生成兜底全零向量,保持为 None | |
| 154 | - pass | |
| 134 | + raise ValueError( | |
| 135 | + f"Invalid embedding returned from service for text index {original_idx}" | |
| 136 | + ) | |
| 137 | + else: | |
| 138 | + raise ValueError(f"No embedding found for text index {original_idx}: {text[:50]}...") | |
| 155 | 139 | |
| 156 | 140 | # 返回 numpy 数组(dtype=object),元素为 np.ndarray 或 None |
| 157 | 141 | return np.array(embeddings, dtype=object) |
| ... | ... | @@ -254,7 +238,3 @@ class TextEmbeddingEncoder: |
| 254 | 238 | except Exception as e: |
| 255 | 239 | logger.error(f"Error storing embedding in cache: {e}") |
| 256 | 240 | return False |
| 257 | - | |
| 258 | - | |
| 259 | -# Backward compatibility for existing imports/usages. | |
| 260 | -BgeEncoder = TextEmbeddingEncoder | ... | ... |
indexer/document_transformer.py
| ... | ... | @@ -406,23 +406,21 @@ class SPUDocumentTransformer: |
| 406 | 406 | |
| 407 | 407 | if not urls: |
| 408 | 408 | return |
| 409 | - try: | |
| 410 | - vectors = self.image_encoder.encode_image_urls(urls, batch_size=8) | |
| 411 | - if not vectors or len(vectors) != len(urls): | |
| 412 | - return | |
| 413 | - out = [] | |
| 414 | - for url, vec in zip(urls, vectors): | |
| 415 | - if vec is None: | |
| 416 | - continue | |
| 417 | - if isinstance(vec, np.ndarray): | |
| 418 | - vec = vec.astype(np.float32) | |
| 419 | - out.append({"vector": vec.tolist(), "url": url}) | |
| 420 | - elif hasattr(vec, "tolist"): | |
| 421 | - out.append({"vector": vec.tolist(), "url": url}) | |
| 422 | - if out: | |
| 423 | - doc["image_embedding"] = out | |
| 424 | - except Exception as e: | |
| 425 | - logger.warning("Failed to generate image_embedding for SPU %s: %s", doc.get("spu_id"), e) | |
| 409 | + vectors = self.image_encoder.encode_image_urls(urls, batch_size=8) | |
| 410 | + if not vectors or len(vectors) != len(urls): | |
| 411 | + raise RuntimeError( | |
| 412 | + f"image_embedding response length mismatch for SPU {doc.get('spu_id')}: " | |
| 413 | + f"expected {len(urls)}, got {0 if vectors is None else len(vectors)}" | |
| 414 | + ) | |
| 415 | + out = [] | |
| 416 | + for url, vec in zip(urls, vectors): | |
| 417 | + arr = np.asarray(vec, dtype=np.float32) | |
| 418 | + if arr.ndim != 1 or arr.size == 0 or not np.isfinite(arr).all(): | |
| 419 | + raise RuntimeError( | |
| 420 | + f"Invalid image embedding for SPU {doc.get('spu_id')} and URL {url}" | |
| 421 | + ) | |
| 422 | + out.append({"vector": arr.tolist(), "url": url}) | |
| 423 | + doc["image_embedding"] = out | |
| 426 | 424 | |
| 427 | 425 | def _process_skus( |
| 428 | 426 | self, |
| ... | ... | @@ -722,21 +720,14 @@ class SPUDocumentTransformer: |
| 722 | 720 | logger.debug(f"No title text available for embedding, SPU: {doc.get('spu_id')}") |
| 723 | 721 | return |
| 724 | 722 | |
| 725 | - try: | |
| 726 | - # 使用文本向量编码器生成 embedding | |
| 727 | - # encode方法返回numpy数组,形状为(n, 1024) | |
| 728 | - embeddings = self.encoder.encode(title_text) | |
| 729 | - | |
| 730 | - if embeddings is not None and len(embeddings) > 0: | |
| 731 | - # 取第一个embedding(因为只传了一个文本) | |
| 732 | - embedding = embeddings[0] | |
| 733 | - if not isinstance(embedding, np.ndarray): | |
| 734 | - logger.warning(f"Embedding is None/invalid for title: {title_text[:50]}...") | |
| 735 | - return | |
| 736 | - # 转换为列表格式(ES需要) | |
| 737 | - doc['title_embedding'] = embedding.tolist() | |
| 738 | - logger.debug(f"Generated title_embedding for SPU: {doc.get('spu_id')}, title: {title_text[:50]}...") | |
| 739 | - else: | |
| 740 | - logger.warning(f"Failed to generate embedding for title: {title_text[:50]}...") | |
| 741 | - except Exception as e: | |
| 742 | - logger.error(f"Error generating title_embedding for SPU {doc.get('spu_id')}: {e}", exc_info=True) | |
| 723 | + # 使用文本向量编码器生成 embedding | |
| 724 | + # encode方法返回numpy数组,形状为(n, d) | |
| 725 | + embeddings = self.encoder.encode(title_text) | |
| 726 | + if embeddings is None or len(embeddings) == 0: | |
| 727 | + raise RuntimeError(f"Failed to generate title embedding for SPU {doc.get('spu_id')}") | |
| 728 | + | |
| 729 | + embedding = np.asarray(embeddings[0], dtype=np.float32) | |
| 730 | + if embedding.ndim != 1 or embedding.size == 0 or not np.isfinite(embedding).all(): | |
| 731 | + raise RuntimeError(f"Invalid title embedding for SPU {doc.get('spu_id')}") | |
| 732 | + doc['title_embedding'] = embedding.tolist() | |
| 733 | + logger.debug(f"Generated title_embedding for SPU: {doc.get('spu_id')}, title: {title_text[:50]}...") | ... | ... |
indexer/incremental_service.py
| ... | ... | @@ -33,9 +33,9 @@ class IncrementalIndexerService: |
| 33 | 33 | logger.info(f"Preloaded {len(self.category_id_to_name)} category mappings") |
| 34 | 34 | |
| 35 | 35 | # 缓存:避免频繁增量请求重复加载 config / 构造 transformer |
| 36 | - # NOTE: 为避免“首请求”懒加载导致超时,尽量在进程启动阶段完成初始化: | |
| 36 | + # 启动阶段强校验初始化: | |
| 37 | 37 | # - config.yaml 加载 |
| 38 | - # - translator / embedding / image encoder provider 初始化(best-effort) | |
| 38 | + # - translator / embedding / image encoder provider 初始化 | |
| 39 | 39 | self._config: Optional[Any] = None |
| 40 | 40 | self._config_lock = threading.Lock() |
| 41 | 41 | self._translator: Optional[Any] = None |
| ... | ... | @@ -50,59 +50,35 @@ class IncrementalIndexerService: |
| 50 | 50 | self._transformer_cache_lock = threading.Lock() |
| 51 | 51 | |
| 52 | 52 | def _eager_init(self) -> None: |
| 53 | - """Best-effort eager initialization to reduce first-request latency.""" | |
| 54 | - try: | |
| 55 | - self._config = ConfigLoader("config/config.yaml").load_config() | |
| 56 | - except Exception as e: | |
| 57 | - logger.warning("Failed to eagerly load config/config.yaml: %s", e, exc_info=True) | |
| 58 | - self._config = None | |
| 59 | - return | |
| 53 | + """Strict eager initialization. Any dependency failure should fail fast.""" | |
| 54 | + self._config = ConfigLoader("config/config.yaml").load_config() | |
| 55 | + self._translation_prompts = getattr(self._config.query_config, "translation_prompts", {}) or {} | |
| 56 | + self._searchable_option_dimensions = ( | |
| 57 | + getattr(self._config.spu_config, "searchable_option_dimensions", None) | |
| 58 | + or ["option1", "option2", "option3"] | |
| 59 | + ) | |
| 60 | 60 | |
| 61 | - try: | |
| 62 | - self._translation_prompts = getattr(self._config.query_config, "translation_prompts", {}) or {} | |
| 63 | - self._searchable_option_dimensions = ( | |
| 64 | - getattr(self._config.spu_config, "searchable_option_dimensions", None) | |
| 65 | - or ["option1", "option2", "option3"] | |
| 66 | - ) | |
| 67 | - except Exception: | |
| 68 | - self._translation_prompts = {} | |
| 69 | - self._searchable_option_dimensions = ["option1", "option2", "option3"] | |
| 61 | + from providers import create_translation_provider | |
| 70 | 62 | |
| 71 | - # Translator provider (best-effort) | |
| 72 | - try: | |
| 73 | - from providers import create_translation_provider | |
| 63 | + self._translator = create_translation_provider(self._config.query_config) | |
| 74 | 64 | |
| 75 | - self._translator = create_translation_provider(self._config.query_config) | |
| 76 | - except Exception as e: | |
| 77 | - logger.warning("Failed to initialize translation provider at startup: %s", e) | |
| 78 | - self._translator = None | |
| 79 | - | |
| 80 | - # Text embedding encoder (best-effort) | |
| 65 | + # Text embedding encoder (strict when enabled) | |
| 81 | 66 | if bool(getattr(self._config.query_config, "enable_text_embedding", False)): |
| 82 | - try: | |
| 83 | - from embeddings.text_encoder import TextEmbeddingEncoder | |
| 67 | + from embeddings.text_encoder import TextEmbeddingEncoder | |
| 84 | 68 | |
| 85 | - self._shared_text_encoder = TextEmbeddingEncoder() | |
| 86 | - except Exception as e: | |
| 87 | - logger.warning("Failed to initialize TextEmbeddingEncoder at startup: %s", e) | |
| 88 | - self._shared_text_encoder = None | |
| 69 | + self._shared_text_encoder = TextEmbeddingEncoder() | |
| 70 | + else: | |
| 71 | + self._shared_text_encoder = None | |
| 89 | 72 | |
| 90 | - # Image embedding encoder (best-effort; may be unavailable if embedding service not running) | |
| 91 | - try: | |
| 92 | - from embeddings.image_encoder import CLIPImageEncoder | |
| 73 | + # Image embedding encoder (strict) | |
| 74 | + from embeddings.image_encoder import CLIPImageEncoder | |
| 93 | 75 | |
| 94 | - self._shared_image_encoder = CLIPImageEncoder() | |
| 95 | - except Exception as e: | |
| 96 | - logger.debug("Image encoder not available for indexer startup: %s", e) | |
| 97 | - self._shared_image_encoder = None | |
| 76 | + self._shared_image_encoder = CLIPImageEncoder() | |
| 98 | 77 | |
| 99 | 78 | def _get_config(self) -> Any: |
| 100 | 79 | """Load config once per process (thread-safe).""" |
| 101 | - if self._config is not None: | |
| 102 | - return self._config | |
| 103 | - with self._config_lock: | |
| 104 | - if self._config is None: | |
| 105 | - self._config = ConfigLoader("config/config.yaml").load_config() | |
| 80 | + if self._config is None: | |
| 81 | + raise RuntimeError("Indexer config is not initialized") | |
| 106 | 82 | return self._config |
| 107 | 83 | |
| 108 | 84 | def _get_transformer_bundle(self, tenant_id: str) -> Tuple[Any, Optional[Any], bool]: |
| ... | ... | @@ -121,32 +97,13 @@ class IncrementalIndexerService: |
| 121 | 97 | config = self._get_config() |
| 122 | 98 | enable_embedding = bool(getattr(config.query_config, "enable_text_embedding", False)) |
| 123 | 99 | |
| 124 | - # Use shared encoders/providers preloaded at startup when可用; | |
| 125 | - # 若启动时初始化失败,则在首次请求时做一次兜底初始化,避免永久禁用。 | |
| 126 | 100 | encoder: Optional[Any] = self._shared_text_encoder if enable_embedding else None |
| 127 | 101 | if enable_embedding and encoder is None: |
| 128 | - try: | |
| 129 | - from embeddings.text_encoder import TextEmbeddingEncoder | |
| 130 | - | |
| 131 | - encoder = TextEmbeddingEncoder() | |
| 132 | - self._shared_text_encoder = encoder | |
| 133 | - logger.info("TextEmbeddingEncoder lazily initialized in _get_transformer_bundle") | |
| 134 | - except Exception as e: | |
| 135 | - logger.warning("Failed to lazily initialize TextEmbeddingEncoder for tenant_id=%s: %s", tenant_id, e) | |
| 136 | - encoder = None | |
| 137 | - enable_embedding = False | |
| 102 | + raise RuntimeError("Text embedding is enabled but TextEmbeddingEncoder is not initialized") | |
| 138 | 103 | |
| 139 | 104 | image_encoder: Optional[Any] = self._shared_image_encoder |
| 140 | 105 | if image_encoder is None: |
| 141 | - try: | |
| 142 | - from embeddings.image_encoder import CLIPImageEncoder | |
| 143 | - | |
| 144 | - image_encoder = CLIPImageEncoder() | |
| 145 | - self._shared_image_encoder = image_encoder | |
| 146 | - logger.info("CLIPImageEncoder lazily initialized in _get_transformer_bundle") | |
| 147 | - except Exception as e: | |
| 148 | - logger.debug("Image encoder not available for indexer (lazy init): %s", e) | |
| 149 | - image_encoder = None | |
| 106 | + raise RuntimeError("CLIPImageEncoder is not initialized") | |
| 150 | 107 | |
| 151 | 108 | transformer = create_document_transformer( |
| 152 | 109 | category_id_to_name=self.category_id_to_name, |
| ... | ... | @@ -157,7 +114,7 @@ class IncrementalIndexerService: |
| 157 | 114 | encoder=encoder, |
| 158 | 115 | enable_title_embedding=False, # batch fill later |
| 159 | 116 | image_encoder=image_encoder, |
| 160 | - enable_image_embedding=(image_encoder is not None), | |
| 117 | + enable_image_embedding=True, | |
| 161 | 118 | config=config, |
| 162 | 119 | ) |
| 163 | 120 | |
| ... | ... | @@ -236,14 +193,13 @@ class IncrementalIndexerService: |
| 236 | 193 | title_text = str(v) |
| 237 | 194 | break |
| 238 | 195 | if title_text and str(title_text).strip(): |
| 239 | - try: | |
| 240 | - embeddings = encoder.encode(title_text) | |
| 241 | - if embeddings is not None and len(embeddings) > 0: | |
| 242 | - emb0 = embeddings[0] | |
| 243 | - if isinstance(emb0, np.ndarray): | |
| 244 | - doc["title_embedding"] = emb0.tolist() | |
| 245 | - except Exception as e: | |
| 246 | - logger.warning(f"Failed to generate embedding for spu_id={spu_id}: {e}") | |
| 196 | + embeddings = encoder.encode(title_text) | |
| 197 | + if embeddings is None or len(embeddings) == 0: | |
| 198 | + raise RuntimeError(f"Failed to generate title embedding for spu_id={spu_id}") | |
| 199 | + emb0 = np.asarray(embeddings[0], dtype=np.float32) | |
| 200 | + if emb0.ndim != 1 or emb0.size == 0 or not np.isfinite(emb0).all(): | |
| 201 | + raise RuntimeError(f"Invalid title embedding for spu_id={spu_id}") | |
| 202 | + doc["title_embedding"] = emb0.tolist() | |
| 247 | 203 | |
| 248 | 204 | return doc |
| 249 | 205 | |
| ... | ... | @@ -678,14 +634,20 @@ class IncrementalIndexerService: |
| 678 | 634 | title_doc_indices.append(i) |
| 679 | 635 | |
| 680 | 636 | if title_texts: |
| 681 | - try: | |
| 682 | - embeddings = encoder.encode_batch(title_texts, batch_size=32) | |
| 683 | - for j, emb in enumerate(embeddings): | |
| 684 | - doc_idx = title_doc_indices[j] | |
| 685 | - if isinstance(emb, np.ndarray): | |
| 686 | - documents[doc_idx][1]["title_embedding"] = emb.tolist() | |
| 687 | - except Exception as e: | |
| 688 | - logger.warning(f"[IncrementalIndexing] Batch embedding failed for tenant_id={tenant_id}: {e}", exc_info=True) | |
| 637 | + embeddings = encoder.encode_batch(title_texts, batch_size=32) | |
| 638 | + if embeddings is None or len(embeddings) != len(title_texts): | |
| 639 | + raise RuntimeError( | |
| 640 | + f"[IncrementalIndexing] Batch embedding length mismatch for tenant_id={tenant_id}: " | |
| 641 | + f"expected {len(title_texts)}, got {0 if embeddings is None else len(embeddings)}" | |
| 642 | + ) | |
| 643 | + for j, emb in enumerate(embeddings): | |
| 644 | + vec = np.asarray(emb, dtype=np.float32) | |
| 645 | + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all(): | |
| 646 | + raise RuntimeError( | |
| 647 | + f"[IncrementalIndexing] Invalid title embedding in batch for tenant_id={tenant_id}, index={j}" | |
| 648 | + ) | |
| 649 | + doc_idx = title_doc_indices[j] | |
| 650 | + documents[doc_idx][1]["title_embedding"] = vec.tolist() | |
| 689 | 651 | |
| 690 | 652 | logger.info(f"[IncrementalIndexing] Transformed {len(documents)}/{total_count} documents") |
| 691 | 653 | |
| ... | ... | @@ -789,4 +751,3 @@ class IncrementalIndexerService: |
| 789 | 751 | "index_name": index_name, |
| 790 | 752 | "tenant_id": tenant_id |
| 791 | 753 | } |
| 792 | - | ... | ... |
indexer/indexing_utils.py
| ... | ... | @@ -92,38 +92,29 @@ def create_document_transformer( |
| 92 | 92 | or (encoder is None and enable_title_embedding) |
| 93 | 93 | or config is None |
| 94 | 94 | ): |
| 95 | - try: | |
| 96 | - if config is None: | |
| 97 | - config_loader = ConfigLoader() | |
| 98 | - config = config_loader.load_config() | |
| 99 | - | |
| 100 | - if searchable_option_dimensions is None: | |
| 101 | - searchable_option_dimensions = config.spu_config.searchable_option_dimensions | |
| 102 | - | |
| 103 | - index_langs = tenant_config.get("index_languages") or [] | |
| 104 | - need_translator = len(index_langs) > 1 | |
| 105 | - if translator is None and need_translator: | |
| 106 | - from providers import create_translation_provider | |
| 107 | - translator = create_translation_provider(config.query_config) | |
| 108 | - | |
| 109 | - if translation_prompts is None: | |
| 110 | - translation_prompts = config.query_config.translation_prompts | |
| 111 | - | |
| 112 | - # 初始化encoder(如果启用标题向量化且未提供encoder) | |
| 113 | - if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding: | |
| 114 | - try: | |
| 115 | - from embeddings.text_encoder import TextEmbeddingEncoder | |
| 116 | - encoder = TextEmbeddingEncoder() | |
| 117 | - logger.info("TextEmbeddingEncoder initialized for title embedding") | |
| 118 | - except Exception as e: | |
| 119 | - logger.warning(f"Failed to initialize TextEmbeddingEncoder: {e}, title embedding will be disabled") | |
| 120 | - enable_title_embedding = False | |
| 121 | - except Exception as e: | |
| 122 | - logger.warning(f"Failed to load config, using defaults: {e}") | |
| 123 | - if searchable_option_dimensions is None: | |
| 124 | - searchable_option_dimensions = ['option1', 'option2', 'option3'] | |
| 125 | - if translation_prompts is None: | |
| 126 | - translation_prompts = {} | |
| 95 | + if config is None: | |
| 96 | + config_loader = ConfigLoader() | |
| 97 | + config = config_loader.load_config() | |
| 98 | + | |
| 99 | + if searchable_option_dimensions is None: | |
| 100 | + searchable_option_dimensions = config.spu_config.searchable_option_dimensions | |
| 101 | + | |
| 102 | + index_langs = tenant_config.get("index_languages") or [] | |
| 103 | + need_translator = len(index_langs) > 1 | |
| 104 | + if translator is None and need_translator: | |
| 105 | + from providers import create_translation_provider | |
| 106 | + | |
| 107 | + translator = create_translation_provider(config.query_config) | |
| 108 | + | |
| 109 | + if translation_prompts is None: | |
| 110 | + translation_prompts = config.query_config.translation_prompts | |
| 111 | + | |
| 112 | + # 初始化encoder(如果启用标题向量化且未提供encoder) | |
| 113 | + if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding: | |
| 114 | + from embeddings.text_encoder import TextEmbeddingEncoder | |
| 115 | + | |
| 116 | + encoder = TextEmbeddingEncoder() | |
| 117 | + logger.info("TextEmbeddingEncoder initialized for title embedding") | |
| 127 | 118 | |
| 128 | 119 | return SPUDocumentTransformer( |
| 129 | 120 | category_id_to_name=category_id_to_name, | ... | ... |
main.py
| ... | ... | @@ -22,6 +22,7 @@ from utils import ESClient |
| 22 | 22 | from search import Searcher |
| 23 | 23 | from suggestion import SuggestionIndexBuilder |
| 24 | 24 | from utils.db_connector import create_db_connection |
| 25 | +from context.request_context import create_request_context | |
| 25 | 26 | |
| 26 | 27 | |
| 27 | 28 | def cmd_serve(args): |
| ... | ... | @@ -78,7 +79,8 @@ def cmd_search(args): |
| 78 | 79 | result = searcher.search( |
| 79 | 80 | query=args.query, |
| 80 | 81 | tenant_id=args.tenant_id, |
| 81 | - size=args.size | |
| 82 | + size=args.size, | |
| 83 | + context=create_request_context(), | |
| 82 | 84 | ) |
| 83 | 85 | |
| 84 | 86 | # Display results | ... | ... |
providers/embedding.py
| 1 | -""" | |
| 2 | -Embedding provider - HTTP service (vllm reserved). | |
| 3 | - | |
| 4 | -Returns text/image encoders configured via services_config. | |
| 5 | -""" | |
| 1 | +"""Embedding provider - HTTP service.""" | |
| 6 | 2 | |
| 7 | 3 | from __future__ import annotations |
| 8 | 4 | |
| ... | ... | @@ -14,10 +10,7 @@ def create_embedding_provider() -> "EmbeddingProvider": |
| 14 | 10 | cfg = get_embedding_config() |
| 15 | 11 | provider = (cfg.provider or "http").strip().lower() |
| 16 | 12 | if provider != "http": |
| 17 | - import logging | |
| 18 | - logging.getLogger(__name__).warning( | |
| 19 | - "Unsupported embedding provider '%s', fallback to HTTP provider.", provider | |
| 20 | - ) | |
| 13 | + raise ValueError(f"Unsupported embedding provider: {provider}") | |
| 21 | 14 | return EmbeddingProvider() |
| 22 | 15 | |
| 23 | 16 | ... | ... |
providers/rerank.py
| 1 | -""" | |
| 2 | -Rerank provider - HTTP service (vllm reserved). | |
| 3 | -""" | |
| 1 | +"""Rerank provider - HTTP service.""" | |
| 4 | 2 | |
| 5 | 3 | from __future__ import annotations |
| 6 | 4 | |
| ... | ... | @@ -61,8 +59,8 @@ def create_rerank_provider() -> HttpRerankProvider: |
| 61 | 59 | cfg = get_rerank_config() |
| 62 | 60 | provider = (cfg.provider or "http").strip().lower() |
| 63 | 61 | |
| 64 | - if provider == "vllm": | |
| 65 | - logger.warning("rerank provider 'vllm' is reserved, using HTTP.") | |
| 62 | + if provider != "http": | |
| 63 | + raise ValueError(f"Unsupported rerank provider: {provider}") | |
| 66 | 64 | |
| 67 | 65 | url = get_rerank_service_url() |
| 68 | 66 | return HttpRerankProvider(service_url=url) | ... | ... |
providers/translation.py
| ... | ... | @@ -158,19 +158,7 @@ def create_translation_provider(query_config: Any = None) -> Any: |
| 158 | 158 | translation_context=getattr(qc, "translation_context", "e-commerce product search"), |
| 159 | 159 | ) |
| 160 | 160 | |
| 161 | - logger.warning( | |
| 162 | - "Unsupported translation provider '%s', fallback to direct.", | |
| 163 | - provider, | |
| 164 | - ) | |
| 165 | - from query.translator import Translator | |
| 166 | - qc = query_config or _empty_query_config() | |
| 167 | - return Translator( | |
| 168 | - model=pc.get("model") or "qwen", | |
| 169 | - api_key=getattr(qc, "translation_api_key", None), | |
| 170 | - use_cache=True, | |
| 171 | - glossary_id=getattr(qc, "translation_glossary_id", None), | |
| 172 | - translation_context=getattr(qc, "translation_context", "e-commerce product search"), | |
| 173 | - ) | |
| 161 | + raise ValueError(f"Unsupported translation provider: {provider}") | |
| 174 | 162 | |
| 175 | 163 | |
| 176 | 164 | def _empty_query_config() -> Any: | ... | ... |
query/__init__.py
| ... | ... | @@ -2,15 +2,12 @@ |
| 2 | 2 | |
| 3 | 3 | from .language_detector import LanguageDetector |
| 4 | 4 | from .translator import Translator |
| 5 | -from .translation_client import HttpTranslationClient, create_translation_client | |
| 6 | 5 | from .query_rewriter import QueryRewriter, QueryNormalizer |
| 7 | 6 | from .query_parser import QueryParser, ParsedQuery |
| 8 | 7 | |
| 9 | 8 | __all__ = [ |
| 10 | 9 | 'LanguageDetector', |
| 11 | 10 | 'Translator', |
| 12 | - 'HttpTranslationClient', | |
| 13 | - 'create_translation_client', | |
| 14 | 11 | 'QueryRewriter', |
| 15 | 12 | 'QueryNormalizer', |
| 16 | 13 | 'QueryParser', | ... | ... |
query/translation_client.py deleted
| ... | ... | @@ -1,20 +0,0 @@ |
| 1 | -""" | |
| 2 | -Translation client - delegates to providers. | |
| 3 | - | |
| 4 | -Deprecated: use providers.create_translation_provider() instead. | |
| 5 | -Kept for backward compatibility. | |
| 6 | -""" | |
| 7 | - | |
| 8 | -from __future__ import annotations | |
| 9 | - | |
| 10 | -from typing import Any | |
| 11 | - | |
| 12 | -from providers.translation import ( | |
| 13 | - HttpTranslationProvider as HttpTranslationClient, | |
| 14 | - create_translation_provider, | |
| 15 | -) | |
| 16 | - | |
| 17 | - | |
| 18 | -def create_translation_client(query_config: Any) -> Any: | |
| 19 | - """Backward compat: delegate to create_translation_provider.""" | |
| 20 | - return create_translation_provider(query_config) |
requirements.txt
scripts/create_venv.sh
| ... | ... | @@ -44,7 +44,8 @@ fi |
| 44 | 44 | # shellcheck disable=SC1091 |
| 45 | 45 | source "${VENV_DIR}/bin/activate" |
| 46 | 46 | |
| 47 | -python -m pip install --upgrade pip setuptools wheel | |
| 47 | +python -m pip install --upgrade pip wheel | |
| 48 | +python -m pip install "setuptools<82" | |
| 48 | 49 | python -m pip install -r requirements.txt |
| 49 | 50 | |
| 50 | 51 | if [[ "${INSTALL_ML:-0}" == "1" ]]; then | ... | ... |
scripts/start_embedding_service.sh
| ... | ... | @@ -16,10 +16,27 @@ source ./activate.sh |
| 16 | 16 | |
| 17 | 17 | DEFAULT_EMBEDDING_SERVICE_HOST=$(python -c "from embeddings.config import CONFIG; print(CONFIG.HOST)") |
| 18 | 18 | DEFAULT_EMBEDDING_SERVICE_PORT=$(python -c "from embeddings.config import CONFIG; print(CONFIG.PORT)") |
| 19 | +USE_CLIP_AS_SERVICE=$(python -c "from embeddings.config import CONFIG; print('1' if CONFIG.USE_CLIP_AS_SERVICE else '0')") | |
| 19 | 20 | |
| 20 | 21 | EMBEDDING_SERVICE_HOST="${EMBEDDING_HOST:-${DEFAULT_EMBEDDING_SERVICE_HOST}}" |
| 21 | 22 | EMBEDDING_SERVICE_PORT="${EMBEDDING_PORT:-${DEFAULT_EMBEDDING_SERVICE_PORT}}" |
| 22 | 23 | |
| 24 | +if [[ "${USE_CLIP_AS_SERVICE}" == "1" ]]; then | |
| 25 | + if ! python - <<'PY' | |
| 26 | +try: | |
| 27 | + import pkg_resources # noqa: F401 | |
| 28 | +except Exception: | |
| 29 | + raise SystemExit(1) | |
| 30 | +PY | |
| 31 | + then | |
| 32 | + echo "ERROR: clip-as-service image embedding requires pkg_resources, but current venv is missing it." >&2 | |
| 33 | + echo "Fix:" >&2 | |
| 34 | + echo " python -m pip install 'setuptools<82'" >&2 | |
| 35 | + echo "Then restart: ./scripts/start_embedding_service.sh" >&2 | |
| 36 | + exit 1 | |
| 37 | + fi | |
| 38 | +fi | |
| 39 | + | |
| 23 | 40 | echo "========================================" |
| 24 | 41 | echo "Starting Local Embedding Service" |
| 25 | 42 | echo "========================================" |
| ... | ... | @@ -36,4 +53,3 @@ exec python -m uvicorn embeddings.server:app \ |
| 36 | 53 | --port "${EMBEDDING_SERVICE_PORT}" \ |
| 37 | 54 | --workers 1 |
| 38 | 55 | |
| 39 | - | ... | ... |
scripts/test_build_docs_api.py
| ... | ... | @@ -29,6 +29,10 @@ except ImportError: |
| 29 | 29 | def build_sample_request(): |
| 30 | 30 | """构造一条完整的 build-docs 请求体(对应 shoplazza_product_spu / sku / option 表结构)。""" |
| 31 | 31 | now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") |
| 32 | + sample_image_url = os.getenv( | |
| 33 | + "SAMPLE_IMAGE_URL", | |
| 34 | + "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg", | |
| 35 | + ) | |
| 32 | 36 | return { |
| 33 | 37 | "tenant_id": "162", |
| 34 | 38 | "items": [ |
| ... | ... | @@ -45,7 +49,7 @@ def build_sample_request(): |
| 45 | 49 | "category_level": 2, |
| 46 | 50 | "category_path": "服装/上衣/T恤", |
| 47 | 51 | "fake_sales": 1280, |
| 48 | - "image_src": "https://example.com/img/tshirt.jpg", | |
| 52 | + "image_src": sample_image_url, | |
| 49 | 53 | "tags": "T恤,纯棉,短袖,夏季", |
| 50 | 54 | "create_time": now, |
| 51 | 55 | "update_time": now, | ... | ... |
| ... | ... | @@ -0,0 +1,74 @@ |
| 1 | +#!/bin/bash | |
| 2 | +# | |
| 3 | +# 排查「谁在调用索引服务」的脚本 | |
| 4 | +# 用法: ./scripts/trace_indexer_calls.sh | |
| 5 | +# | |
| 6 | + | |
| 7 | +set -euo pipefail | |
| 8 | + | |
| 9 | +cd "$(dirname "$0")/.." | |
| 10 | +source ./activate.sh 2>/dev/null || true | |
| 11 | + | |
| 12 | +echo "==========================================" | |
| 13 | +echo "索引服务调用方排查" | |
| 14 | +echo "==========================================" | |
| 15 | + | |
| 16 | +INDEXER_PORT="${INDEXER_PORT:-6004}" | |
| 17 | +EMBEDDING_PORT="${EMBEDDING_PORT:-6005}" | |
| 18 | + | |
| 19 | +echo "" | |
| 20 | +echo "1. 监听端口 6004 的进程(Indexer 服务)" | |
| 21 | +echo "------------------------------------------" | |
| 22 | +if command -v lsof >/dev/null 2>&1; then | |
| 23 | + lsof -i :"${INDEXER_PORT}" 2>/dev/null || echo " (无进程监听或 lsof 无权限)" | |
| 24 | +else | |
| 25 | + ss -tlnp 2>/dev/null | grep ":${INDEXER_PORT}" || echo " (无进程监听)" | |
| 26 | +fi | |
| 27 | + | |
| 28 | +echo "" | |
| 29 | +echo "2. 连接到 6004 的客户端(谁在请求 Indexer)" | |
| 30 | +echo "------------------------------------------" | |
| 31 | +if command -v ss >/dev/null 2>&1; then | |
| 32 | + ss -tnp 2>/dev/null | grep ":${INDEXER_PORT}" || echo " (当前无活跃连接)" | |
| 33 | +elif command -v netstat >/dev/null 2>&1; then | |
| 34 | + netstat -tnp 2>/dev/null | grep ":${INDEXER_PORT}" || echo " (当前无活跃连接)" | |
| 35 | +else | |
| 36 | + echo " 请安装 ss 或 netstat" | |
| 37 | +fi | |
| 38 | + | |
| 39 | +echo "" | |
| 40 | +echo "3. 连接到 6005 的客户端(Indexer 会调用 Embedding 服务)" | |
| 41 | +echo "------------------------------------------" | |
| 42 | +if command -v ss >/dev/null 2>&1; then | |
| 43 | + ss -tnp 2>/dev/null | grep ":${EMBEDDING_PORT}" || echo " (当前无活跃连接)" | |
| 44 | +fi | |
| 45 | + | |
| 46 | +echo "" | |
| 47 | +echo "4. 检查定时任务(cron)" | |
| 48 | +echo "------------------------------------------" | |
| 49 | +(crontab -l 2>/dev/null | grep -i indexer) || echo " 当前用户无相关 cron" | |
| 50 | +if [ -d /etc/cron.d ]; then | |
| 51 | + grep -l -i indexer /etc/cron.d/* 2>/dev/null || true | |
| 52 | +fi | |
| 53 | + | |
| 54 | +echo "" | |
| 55 | +echo "5. 端口与逻辑说明" | |
| 56 | +echo "------------------------------------------" | |
| 57 | +echo " - Indexer 服务: 端口 ${INDEXER_PORT}" | |
| 58 | +echo " 启动: ./scripts/start_indexer.sh 或 python main.py serve-indexer" | |
| 59 | +echo " 接口: POST /indexer/reindex, POST /indexer/index, POST /indexer/build-docs 等" | |
| 60 | +echo "" | |
| 61 | +echo " - 调用方(文档说明): 外部 Java 程序或 curl 等 HTTP 客户端" | |
| 62 | +echo " 全量: curl -X POST http://localhost:${INDEXER_PORT}/indexer/reindex -d '{\"tenant_id\":\"170\",\"batch_size\":500}'" | |
| 63 | +echo " 增量: curl -X POST http://localhost:${INDEXER_PORT}/indexer/index -d '{\"tenant_id\":\"170\",\"spu_ids\":[\"123\"]}'" | |
| 64 | +echo "" | |
| 65 | +echo " - Indexer 内部会调用:" | |
| 66 | +echo " - Embedding 服务 (${EMBEDDING_PORT}): POST /embed/text" | |
| 67 | +echo " - Qwen API: dashscope.aliyuncs.com (翻译、LLM 分析)" | |
| 68 | +echo " - MySQL: 商品数据" | |
| 69 | +echo " - Elasticsearch: 写入索引" | |
| 70 | +echo "" | |
| 71 | +echo "6. 实时监控连接(按 Ctrl+C 停止)" | |
| 72 | +echo "------------------------------------------" | |
| 73 | +echo " 运行: watch -n 2 'ss -tnp | grep -E \":${INDEXER_PORT}|:${EMBEDDING_PORT}\"'" | |
| 74 | +echo "" | ... | ... |
search/es_query_builder.py
| ... | ... | @@ -19,7 +19,6 @@ class ESQueryBuilder: |
| 19 | 19 | |
| 20 | 20 | def __init__( |
| 21 | 21 | self, |
| 22 | - index_name: str, | |
| 23 | 22 | match_fields: List[str], |
| 24 | 23 | text_embedding_field: Optional[str] = None, |
| 25 | 24 | image_embedding_field: Optional[str] = None, |
| ... | ... | @@ -33,7 +32,6 @@ class ESQueryBuilder: |
| 33 | 32 | Initialize query builder. |
| 34 | 33 | |
| 35 | 34 | Args: |
| 36 | - index_name: ES index name | |
| 37 | 35 | match_fields: Fields to search for text matching |
| 38 | 36 | text_embedding_field: Field name for text embeddings |
| 39 | 37 | image_embedding_field: Field name for image embeddings |
| ... | ... | @@ -43,7 +41,6 @@ class ESQueryBuilder: |
| 43 | 41 | default_language: Default language to use when detection fails or returns "unknown" |
| 44 | 42 | knn_boost: Boost value for KNN (embedding recall) |
| 45 | 43 | """ |
| 46 | - self.index_name = index_name | |
| 47 | 44 | self.match_fields = match_fields |
| 48 | 45 | self.text_embedding_field = text_embedding_field |
| 49 | 46 | self.image_embedding_field = image_embedding_field |
| ... | ... | @@ -351,7 +348,6 @@ class ESQueryBuilder: |
| 351 | 348 | def _build_text_query(self, query_text: str) -> Dict[str, Any]: |
| 352 | 349 | """ |
| 353 | 350 | Build simple text matching query (BM25). |
| 354 | - Legacy method - kept for backward compatibility. | |
| 355 | 351 | |
| 356 | 352 | Args: |
| 357 | 353 | query_text: Query text | ... | ... |
search/searcher.py
| ... | ... | @@ -17,7 +17,7 @@ from .es_query_builder import ESQueryBuilder |
| 17 | 17 | from config import SearchConfig |
| 18 | 18 | from config.tenant_config_loader import get_tenant_config_loader |
| 19 | 19 | from config.utils import get_match_fields_for_index |
| 20 | -from context.request_context import RequestContext, RequestContextStage, create_request_context | |
| 20 | +from context.request_context import RequestContext, RequestContextStage | |
| 21 | 21 | from api.models import FacetResult, FacetValue, FacetConfig |
| 22 | 22 | from api.result_formatter import ResultFormatter |
| 23 | 23 | from indexer.mapping_generator import get_tenant_index_name |
| ... | ... | @@ -107,9 +107,7 @@ class Searcher: |
| 107 | 107 | self.source_fields = config.query_config.source_fields or [] |
| 108 | 108 | |
| 109 | 109 | # Query builder - simplified single-layer architecture |
| 110 | - # index_name is no longer needed in query builder since we use tenant-specific indices | |
| 111 | 110 | self.query_builder = ESQueryBuilder( |
| 112 | - index_name="", # Not used, kept for backward compatibility | |
| 113 | 111 | match_fields=self.match_fields, |
| 114 | 112 | text_embedding_field=self.text_embedding_field, |
| 115 | 113 | image_embedding_field=self.image_embedding_field, |
| ... | ... | @@ -160,9 +158,8 @@ class Searcher: |
| 160 | 158 | Returns: |
| 161 | 159 | SearchResult object with formatted results |
| 162 | 160 | """ |
| 163 | - # Create context if not provided (backward compatibility) | |
| 164 | 161 | if context is None: |
| 165 | - context = create_request_context() | |
| 162 | + raise ValueError("context is required") | |
| 166 | 163 | |
| 167 | 164 | # 根据租户配置决定翻译开关(离线/在线统一) |
| 168 | 165 | tenant_loader = get_tenant_config_loader() | ... | ... |
tests/ci/test_service_api_contracts.py
| ... | ... | @@ -179,6 +179,7 @@ def indexer_client(monkeypatch): |
| 179 | 179 | import api.indexer_app as indexer_app |
| 180 | 180 | import api.routes.indexer as indexer_routes |
| 181 | 181 | |
| 182 | + indexer_app.app.router.on_startup.clear() | |
| 182 | 183 | monkeypatch.setattr(indexer_app, "init_indexer_service", lambda es_host="": None) |
| 183 | 184 | monkeypatch.setattr(indexer_routes, "get_bulk_indexing_service", lambda: _FakeBulkService()) |
| 184 | 185 | monkeypatch.setattr(indexer_routes, "get_incremental_service", lambda: _FakeIncrementalService()) | ... | ... |
tests/test_cnclip_service.py
| ... | ... | @@ -6,21 +6,35 @@ CN-CLIP 服务测试脚本 |
| 6 | 6 | 测试 CN-CLIP 服务的文本和图像编码功能(使用 gRPC 协议) |
| 7 | 7 | |
| 8 | 8 | 使用方法: |
| 9 | - python scripts/test_cnclip_service.py [PORT] | |
| 9 | + python tests/test_cnclip_service.py [PORT] | |
| 10 | 10 | |
| 11 | 11 | 参数: |
| 12 | 12 | PORT: 服务端口(默认:51000) |
| 13 | 13 | """ |
| 14 | 14 | |
| 15 | 15 | import sys |
| 16 | +import os | |
| 16 | 17 | |
| 17 | -import pytest | |
| 18 | +import numpy as np | |
| 19 | + | |
| 20 | +# Skip clip_client version check (it imports pkg_resources in legacy path). | |
| 21 | +os.environ.setdefault("NO_VERSION_CHECK", "1") | |
| 22 | + | |
| 23 | +# Ensure vendored client is importable in direct `python tests/test_cnclip_service.py` mode. | |
| 24 | +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| 25 | +VENDORED_CLIENT = os.path.join(ROOT, "third-party", "clip-as-service", "client") | |
| 26 | +if os.path.isdir(VENDORED_CLIENT) and VENDORED_CLIENT not in sys.path: | |
| 27 | + sys.path.insert(0, VENDORED_CLIENT) | |
| 18 | 28 | |
| 19 | 29 | try: |
| 20 | - import numpy as np | |
| 21 | 30 | from clip_client import Client |
| 22 | -except ImportError: | |
| 23 | - pytest.skip("clip_client not installed (optional clip-as-service client)", allow_module_level=True) | |
| 31 | +except ImportError as e: | |
| 32 | + print("✗ 无法导入 clip_client。请先安装/暴露客户端依赖:") | |
| 33 | + print(" 1) pip install -e third-party/clip-as-service/client") | |
| 34 | + print(" 或") | |
| 35 | + print(" 2) export PYTHONPATH=third-party/clip-as-service/client:$PYTHONPATH") | |
| 36 | + print(f" 详细错误: {e}") | |
| 37 | + sys.exit(1) | |
| 24 | 38 | |
| 25 | 39 | |
| 26 | 40 | def _test_encoding(client, test_name, inputs): |
| ... | ... | @@ -72,6 +86,15 @@ def main(): |
| 72 | 86 | # 创建客户端 |
| 73 | 87 | try: |
| 74 | 88 | client = Client(grpc_url) |
| 89 | + except ModuleNotFoundError as e: | |
| 90 | + if str(e) == "No module named 'pkg_resources'": | |
| 91 | + print("✗ 当前环境缺少 pkg_resources,clip_client/jina 无法初始化。") | |
| 92 | + print(" 建议使用专用环境运行:") | |
| 93 | + print(" .venv-cnclip/bin/python tests/test_cnclip_service.py 51000") | |
| 94 | + print(" 或在当前 .venv 安装兼容 setuptools(包含 pkg_resources)。") | |
| 95 | + sys.exit(1) | |
| 96 | + print(f"✗ 客户端创建失败: {e}") | |
| 97 | + sys.exit(1) | |
| 75 | 98 | except Exception as e: |
| 76 | 99 | print(f"✗ 客户端创建失败: {e}") |
| 77 | 100 | sys.exit(1) |
| ... | ... | @@ -118,4 +141,3 @@ def main(): |
| 118 | 141 | |
| 119 | 142 | if __name__ == '__main__': |
| 120 | 143 | main() |
| 121 | - | ... | ... |
tests/test_embedding_pipeline.py
| ... | ... | @@ -2,6 +2,7 @@ import pickle |
| 2 | 2 | from typing import Any, Dict, List, Optional |
| 3 | 3 | |
| 4 | 4 | import numpy as np |
| 5 | +import pytest | |
| 5 | 6 | |
| 6 | 7 | from config import ( |
| 7 | 8 | FunctionScoreConfig, |
| ... | ... | @@ -100,7 +101,7 @@ def test_text_embedding_encoder_response_alignment(monkeypatch): |
| 100 | 101 | def _fake_post(url, json, timeout): |
| 101 | 102 | assert url.endswith("/embed/text") |
| 102 | 103 | assert json == ["hello", "world"] |
| 103 | - return _FakeResponse([[0.1, 0.2], None]) | |
| 104 | + return _FakeResponse([[0.1, 0.2], [0.3, 0.4]]) | |
| 104 | 105 | |
| 105 | 106 | monkeypatch.setattr("embeddings.text_encoder.requests.post", _fake_post) |
| 106 | 107 | |
| ... | ... | @@ -110,7 +111,22 @@ def test_text_embedding_encoder_response_alignment(monkeypatch): |
| 110 | 111 | assert len(out) == 2 |
| 111 | 112 | assert isinstance(out[0], np.ndarray) |
| 112 | 113 | assert out[0].shape == (2,) |
| 113 | - assert out[1] is None | |
| 114 | + assert isinstance(out[1], np.ndarray) | |
| 115 | + assert out[1].shape == (2,) | |
| 116 | + | |
| 117 | + | |
| 118 | +def test_text_embedding_encoder_raises_on_missing_vector(monkeypatch): | |
| 119 | + fake_redis = _FakeRedis() | |
| 120 | + monkeypatch.setattr("embeddings.text_encoder.redis.Redis", lambda **kwargs: fake_redis) | |
| 121 | + | |
| 122 | + def _fake_post(url, json, timeout): | |
| 123 | + return _FakeResponse([[0.1, 0.2], None]) | |
| 124 | + | |
| 125 | + monkeypatch.setattr("embeddings.text_encoder.requests.post", _fake_post) | |
| 126 | + | |
| 127 | + encoder = TextEmbeddingEncoder(service_url="http://127.0.0.1:6005") | |
| 128 | + with pytest.raises(ValueError): | |
| 129 | + encoder.encode(["hello", "world"]) | |
| 114 | 130 | |
| 115 | 131 | |
| 116 | 132 | def test_text_embedding_encoder_cache_hit(monkeypatch): |
| ... | ... | @@ -156,4 +172,3 @@ def test_query_parser_skips_query_vector_when_disabled(): |
| 156 | 172 | |
| 157 | 173 | parsed = parser.parse("red dress", tenant_id="162", generate_vector=False) |
| 158 | 174 | assert parsed.query_vector is None |
| 159 | - | ... | ... |