Commit ed94866621f769cb80425846a68db4a01a9f5caa
1 parent
950a640e
tidy
Showing
33 changed files
with
440 additions
and
594 deletions
Show diff stats
activate.sh
| 1 | #!/bin/bash | 1 | #!/bin/bash |
| 2 | # | 2 | # |
| 3 | -# Unified environment activator (venv preferred, conda fallback). | ||
| 4 | -# | ||
| 5 | # Usage: | 3 | # Usage: |
| 6 | # source activate.sh | 4 | # source activate.sh |
| 7 | # | 5 | # |
| 8 | -# Priority: | 6 | +# Required: |
| 9 | # 1) ./.venv (Python venv) | 7 | # 1) ./.venv (Python venv) |
| 10 | -# 2) conda env "searchengine" (legacy) | ||
| 11 | # | 8 | # |
| 12 | 9 | ||
| 13 | # Must be sourced | 10 | # Must be sourced |
| @@ -18,28 +15,16 @@ fi | @@ -18,28 +15,16 @@ fi | ||
| 18 | 15 | ||
| 19 | PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" | 16 | PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" |
| 20 | 17 | ||
| 21 | -# 1) venv (preferred) | 18 | +# 1) venv (required) |
| 22 | VENV_ACTIVATE="${PROJECT_ROOT}/.venv/bin/activate" | 19 | VENV_ACTIVATE="${PROJECT_ROOT}/.venv/bin/activate" |
| 23 | if [[ -f "${VENV_ACTIVATE}" ]]; then | 20 | if [[ -f "${VENV_ACTIVATE}" ]]; then |
| 24 | # shellcheck disable=SC1090 | 21 | # shellcheck disable=SC1090 |
| 25 | source "${VENV_ACTIVATE}" | 22 | source "${VENV_ACTIVATE}" |
| 26 | ENV_KIND="venv" | 23 | ENV_KIND="venv" |
| 27 | else | 24 | else |
| 28 | - # 2) conda fallback (legacy) | ||
| 29 | - # 新机器部署:可设置 CONDA_ROOT 指向本机 Conda 路径 | ||
| 30 | - # 例如你的 conda 是 ~/anaconda3/bin/conda,则 export CONDA_ROOT=$HOME/anaconda3 | ||
| 31 | - CONDA_ROOT="${CONDA_ROOT:-/home/tw/miniconda3}" | ||
| 32 | - if [[ -f "${CONDA_ROOT}/etc/profile.d/conda.sh" ]]; then | ||
| 33 | - # shellcheck disable=SC1091 | ||
| 34 | - source "${CONDA_ROOT}/etc/profile.d/conda.sh" | ||
| 35 | - conda activate searchengine | ||
| 36 | - ENV_KIND="conda" | ||
| 37 | - else | ||
| 38 | - echo "ERROR: No .venv found and conda.sh not found at ${CONDA_ROOT}/etc/profile.d/conda.sh" >&2 | ||
| 39 | - echo " - Create venv: ./scripts/create_venv.sh" >&2 | ||
| 40 | - echo " - Or set CONDA_ROOT to your conda install path" >&2 | ||
| 41 | - return 1 | ||
| 42 | - fi | 25 | + echo "ERROR: No .venv found at ${VENV_ACTIVATE}" >&2 |
| 26 | + echo " - Create venv: ./scripts/create_venv.sh" >&2 | ||
| 27 | + return 1 | ||
| 43 | fi | 28 | fi |
| 44 | 29 | ||
| 45 | # 如果需要加载 .env 中的环境变量 | 30 | # 如果需要加载 .env 中的环境变量 |
api/indexer_app.py
| @@ -124,12 +124,10 @@ def init_indexer_service(es_host: str = "http://localhost:9200"): | @@ -124,12 +124,10 @@ def init_indexer_service(es_host: str = "http://localhost:9200"): | ||
| 124 | ] | 124 | ] |
| 125 | if not value | 125 | if not value |
| 126 | ] | 126 | ] |
| 127 | - logger.warning( | ||
| 128 | - "Database config incomplete for indexer, services will not be available. " | 127 | + raise RuntimeError( |
| 128 | + "Database config incomplete for indexer. " | ||
| 129 | f"Missing: {', '.join(missing)}" | 129 | f"Missing: {', '.join(missing)}" |
| 130 | ) | 130 | ) |
| 131 | - _incremental_service = None | ||
| 132 | - _bulk_indexing_service = None | ||
| 133 | 131 | ||
| 134 | elapsed = time.time() - start_time | 132 | elapsed = time.time() - start_time |
| 135 | logger.info(f"Indexer service ready! (took {elapsed:.2f}s)") | 133 | logger.info(f"Indexer service ready! (took {elapsed:.2f}s)") |
| @@ -180,14 +178,17 @@ async def startup_event(): | @@ -180,14 +178,17 @@ async def startup_event(): | ||
| 180 | # If no explicit tenants configured, skip warmup. | 178 | # If no explicit tenants configured, skip warmup. |
| 181 | if tenants: | 179 | if tenants: |
| 182 | warm = _incremental_service.warmup_transformers(tenants) | 180 | warm = _incremental_service.warmup_transformers(tenants) |
| 181 | + if warm.get("failed"): | ||
| 182 | + raise RuntimeError(f"Indexer warmup failed: {warm['failed']}") | ||
| 183 | logger.info("Indexer warmup completed: %s", warm) | 183 | logger.info("Indexer warmup completed: %s", warm) |
| 184 | else: | 184 | else: |
| 185 | logger.info("Indexer warmup skipped (no tenant ids in config.tenant_config.tenants)") | 185 | logger.info("Indexer warmup skipped (no tenant ids in config.tenant_config.tenants)") |
| 186 | except Exception as e: | 186 | except Exception as e: |
| 187 | - logger.warning("Indexer warmup failed (service still starts): %s", e, exc_info=True) | 187 | + logger.error("Indexer warmup failed: %s", e, exc_info=True) |
| 188 | + raise | ||
| 188 | except Exception as e: | 189 | except Exception as e: |
| 189 | logger.error(f"Failed to initialize indexer service: {e}", exc_info=True) | 190 | logger.error(f"Failed to initialize indexer service: {e}", exc_info=True) |
| 190 | - logger.warning("Indexer service will start but may not function correctly") | 191 | + raise |
| 191 | 192 | ||
| 192 | 193 | ||
| 193 | @app.on_event("shutdown") | 194 | @app.on_event("shutdown") |
| @@ -215,7 +216,7 @@ async def global_exception_handler(request: Request, exc: Exception): | @@ -215,7 +216,7 @@ async def global_exception_handler(request: Request, exc: Exception): | ||
| 215 | async def health_check(): | 216 | async def health_check(): |
| 216 | """Simple health check for indexer service.""" | 217 | """Simple health check for indexer service.""" |
| 217 | try: | 218 | try: |
| 218 | - # ensure ES is reachable (best-effort) | 219 | + # ensure ES is reachable |
| 219 | if _es_client is None: | 220 | if _es_client is None: |
| 220 | raise RuntimeError("ES client is not initialized") | 221 | raise RuntimeError("ES client is not initialized") |
| 221 | return { | 222 | return { |
| @@ -262,4 +263,3 @@ if __name__ == "__main__": | @@ -262,4 +263,3 @@ if __name__ == "__main__": | ||
| 262 | port=args.port, | 263 | port=args.port, |
| 263 | reload=args.reload, | 264 | reload=args.reload, |
| 264 | ) | 265 | ) |
| 265 | - |
api/routes/indexer.py
| @@ -244,31 +244,19 @@ async def build_docs(request: BuildDocsRequest): | @@ -244,31 +244,19 @@ async def build_docs(request: BuildDocsRequest): | ||
| 244 | title_text = str(v) | 244 | title_text = str(v) |
| 245 | break | 245 | break |
| 246 | if title_text and str(title_text).strip(): | 246 | if title_text and str(title_text).strip(): |
| 247 | - try: | ||
| 248 | - import numpy as np | ||
| 249 | - | ||
| 250 | - embeddings = encoder.encode(title_text) | ||
| 251 | - if embeddings is not None and len(embeddings) > 0: | ||
| 252 | - emb0 = embeddings[0] | ||
| 253 | - if isinstance(emb0, np.ndarray) and emb0.size > 0: | ||
| 254 | - doc["title_embedding"] = emb0.tolist() | ||
| 255 | - else: | ||
| 256 | - logger.warning( | ||
| 257 | - "build-docs: title_embedding skipped (encoder returned None/invalid for title: %s...)", | ||
| 258 | - title_text[:50], | ||
| 259 | - ) | ||
| 260 | - else: | ||
| 261 | - logger.warning( | ||
| 262 | - "build-docs: title_embedding skipped (encoder returned empty for title: %s...)", | ||
| 263 | - title_text[:50], | ||
| 264 | - ) | ||
| 265 | - except Exception as e: | ||
| 266 | - logger.warning( | ||
| 267 | - "build-docs: title_embedding failed for spu_id=%s: %s", | ||
| 268 | - doc.get("spu_id"), | ||
| 269 | - e, | 247 | + import numpy as np |
| 248 | + | ||
| 249 | + embeddings = encoder.encode(title_text) | ||
| 250 | + if embeddings is None or len(embeddings) == 0: | ||
| 251 | + raise RuntimeError( | ||
| 252 | + f"title_embedding empty for spu_id={doc.get('spu_id')}" | ||
| 253 | + ) | ||
| 254 | + emb0 = np.asarray(embeddings[0], dtype=np.float32) | ||
| 255 | + if emb0.ndim != 1 or emb0.size == 0 or not np.isfinite(emb0).all(): | ||
| 256 | + raise RuntimeError( | ||
| 257 | + f"title_embedding invalid for spu_id={doc.get('spu_id')}" | ||
| 270 | ) | 258 | ) |
| 271 | - # 构建 doc 接口不因为 embedding 失败而整体失败 | 259 | + doc["title_embedding"] = emb0.tolist() |
| 272 | 260 | ||
| 273 | docs.append(doc) | 261 | docs.append(doc) |
| 274 | except Exception as e: | 262 | except Exception as e: |
| @@ -486,4 +474,3 @@ async def indexer_health_check(): | @@ -486,4 +474,3 @@ async def indexer_health_check(): | ||
| 486 | except Exception as e: | 474 | except Exception as e: |
| 487 | logger.error(f"Error checking indexer health: {e}", exc_info=True) | 475 | logger.error(f"Error checking indexer health: {e}", exc_info=True) |
| 488 | return {"status": "error", "message": str(e)} | 476 | return {"status": "error", "message": str(e)} |
| 489 | - |
config/__init__.py
| @@ -13,19 +13,13 @@ from .config_loader import ( | @@ -13,19 +13,13 @@ from .config_loader import ( | ||
| 13 | FunctionScoreConfig, | 13 | FunctionScoreConfig, |
| 14 | RerankConfig, | 14 | RerankConfig, |
| 15 | ConfigLoader, | 15 | ConfigLoader, |
| 16 | - ConfigurationError, | ||
| 17 | - load_tenant_config | 16 | + ConfigurationError |
| 18 | ) | 17 | ) |
| 19 | 18 | ||
| 20 | from .utils import ( | 19 | from .utils import ( |
| 21 | get_match_fields_for_index, | 20 | get_match_fields_for_index, |
| 22 | get_domain_fields | 21 | get_domain_fields |
| 23 | ) | 22 | ) |
| 24 | -from .service_endpoints import ( | ||
| 25 | - resolve_translation_service_url, | ||
| 26 | - resolve_embedding_service_url, | ||
| 27 | - resolve_reranker_service_url, | ||
| 28 | -) | ||
| 29 | from .services_config import ( | 23 | from .services_config import ( |
| 30 | get_translation_config, | 24 | get_translation_config, |
| 31 | get_embedding_config, | 25 | get_embedding_config, |
| @@ -50,12 +44,8 @@ __all__ = [ | @@ -50,12 +44,8 @@ __all__ = [ | ||
| 50 | # Loader and utilities | 44 | # Loader and utilities |
| 51 | 'ConfigLoader', | 45 | 'ConfigLoader', |
| 52 | 'ConfigurationError', | 46 | 'ConfigurationError', |
| 53 | - 'load_tenant_config', | ||
| 54 | 'get_match_fields_for_index', | 47 | 'get_match_fields_for_index', |
| 55 | 'get_domain_fields', | 48 | 'get_domain_fields', |
| 56 | - 'resolve_translation_service_url', | ||
| 57 | - 'resolve_embedding_service_url', | ||
| 58 | - 'resolve_reranker_service_url', | ||
| 59 | 'get_translation_config', | 49 | 'get_translation_config', |
| 60 | 'get_embedding_config', | 50 | 'get_embedding_config', |
| 61 | 'get_rerank_config', | 51 | 'get_rerank_config', |
config/config_loader.py
| @@ -433,17 +433,3 @@ class ConfigLoader: | @@ -433,17 +433,3 @@ class ConfigLoader: | ||
| 433 | result["example"] = index.example | 433 | result["example"] = index.example |
| 434 | 434 | ||
| 435 | return result | 435 | return result |
| 436 | - | ||
| 437 | - | ||
| 438 | -def load_tenant_config(tenant_id: Optional[str] = None) -> SearchConfig: | ||
| 439 | - """ | ||
| 440 | - Load tenant configuration (backward compatibility wrapper). | ||
| 441 | - | ||
| 442 | - Args: | ||
| 443 | - tenant_id: Ignored (kept for backward compatibility) | ||
| 444 | - | ||
| 445 | - Returns: | ||
| 446 | - SearchConfig loaded from config/config.yaml | ||
| 447 | - """ | ||
| 448 | - loader = ConfigLoader() | ||
| 449 | - return loader.load_config() |
config/service_endpoints.py deleted
| @@ -1,12 +0,0 @@ | @@ -1,12 +0,0 @@ | ||
| 1 | -""" | ||
| 2 | -Endpoint resolvers - delegate to services_config. | ||
| 3 | - | ||
| 4 | -Deprecated: use config.services_config directly. | ||
| 5 | -Kept for backward compatibility. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from .services_config import ( | ||
| 9 | - get_translation_base_url as resolve_translation_service_url, | ||
| 10 | - get_embedding_base_url as resolve_embedding_service_url, | ||
| 11 | - get_rerank_service_url as resolve_reranker_service_url, | ||
| 12 | -) |
docs/DEVELOPER_GUIDE.md
| @@ -216,9 +216,9 @@ docs/ # 文档(含本指南) | @@ -216,9 +216,9 @@ docs/ # 文档(含本指南) | ||
| 216 | - 索引结构以 `mappings/search_products.json` 为唯一来源;indexer 产出的 doc 必须与该 mapping 一致。 | 216 | - 索引结构以 `mappings/search_products.json` 为唯一来源;indexer 产出的 doc 必须与该 mapping 一致。 |
| 217 | - 查询侧使用的字段名、多语言后缀(.zh/.en)、嵌套路径等与 mapping 保持一致;新增字段时同步更新 mapping 与查询/分面/过滤逻辑。 | 217 | - 查询侧使用的字段名、多语言后缀(.zh/.en)、嵌套路径等与 mapping 保持一致;新增字段时同步更新 mapping 与查询/分面/过滤逻辑。 |
| 218 | 218 | ||
| 219 | -### 5.7 错误与降级 | 219 | +### 5.7 错误处理 |
| 220 | 220 | ||
| 221 | -- 外部能力(翻译、向量、重排)调用失败时,应有明确降级策略(如跳过向量、仅用 BM25、重排失败时保留 ES 顺序),并打日志便于排查;不因单一能力不可用导致整请求失败。 | 221 | +- 外部能力(翻译、向量、重排)调用失败时应立即报错并中止请求,禁止静默降级;通过日志与监控尽早暴露问题并修复。 |
| 222 | 222 | ||
| 223 | --- | 223 | --- |
| 224 | 224 | ||
| @@ -315,11 +315,11 @@ services: | @@ -315,11 +315,11 @@ services: | ||
| 315 | 5. **调用方**:无需修改;仅部署时启动使用新后端的 reranker 服务即可。 | 315 | 5. **调用方**:无需修改;仅部署时启动使用新后端的 reranker 服务即可。 |
| 316 | 6. **文档与依赖**:在 `reranker/README.md` 或 docs 中说明依赖(如 vllm)、显存建议;可选依赖放入 `requirements_ml.txt` 或 extra。 | 316 | 6. **文档与依赖**:在 `reranker/README.md` 或 docs 中说明依赖(如 vllm)、显存建议;可选依赖放入 `requirements_ml.txt` 或 extra。 |
| 317 | 317 | ||
| 318 | -### 7.7 与现有配置的兼容说明 | 318 | +### 7.7 配置一致性说明 |
| 319 | 319 | ||
| 320 | -- **reranker**:当前 `reranker/config.py` 的 BGE 相关默认值可保留为兜底,或将默认值迁移到 `config.yaml` 的 `services.rerank.backends.bge`,由 config 只读环境变量与 YAML。 | ||
| 321 | -- **embeddings**:`embeddings/config.py` 的文本/图片及 clip-as-service 开关与 `services.embedding` 的 URL 分离;后续多种后端可在 `services.embedding.backends` 中增加条目。 | ||
| 322 | -- **环境变量**:所有能力均支持环境变量覆盖(如 `RERANKER_SERVICE_URL`、`RERANK_BACKEND`、`EMBEDDING_SERVICE_URL`),便于多环境部署。 | 320 | +- **单一路径**:Provider 和 backend 必须由 `config/config.yaml` 的 `services` 块显式指定;未知配置应直接报错。 |
| 321 | +- **无兼容回退**:不保留“旧配置自动推导/兜底默认值”机制,避免静默行为偏差。 | ||
| 322 | +- **环境变量覆盖**:允许环境变量覆盖(如 `RERANKER_SERVICE_URL`、`RERANK_BACKEND`、`EMBEDDING_SERVICE_URL`),但覆盖后仍需满足合法性校验。 | ||
| 323 | 323 | ||
| 324 | --- | 324 | --- |
| 325 | 325 | ||
| @@ -346,7 +346,7 @@ services: | @@ -346,7 +346,7 @@ services: | ||
| 346 | 346 | ||
| 347 | ### 8.4 日志与可观测性 | 347 | ### 8.4 日志与可观测性 |
| 348 | 348 | ||
| 349 | -- 关键路径(请求入口、外部调用、失败降级)打日志;日志级别合理(如 debug 用于详细参数,info 用于流程,warning 用于降级)。 | 349 | +- 关键路径(请求入口、外部调用、失败报错)打日志;日志级别合理(如 debug 用于详细参数,info 用于流程,error 用于失败)。 |
| 350 | - 对外接口的耗时、错误码、租户等可考虑结构化日志或后续接入监控,便于运维与排查。 | 350 | - 对外接口的耗时、错误码、租户等可考虑结构化日志或后续接入监控,便于运维与排查。 |
| 351 | 351 | ||
| 352 | --- | 352 | --- |
docs/Usage-Guide.md
| @@ -29,7 +29,7 @@ | @@ -29,7 +29,7 @@ | ||
| 29 | 29 | ||
| 30 | #### 1. 安装 Python 依赖与激活环境 | 30 | #### 1. 安装 Python 依赖与激活环境 |
| 31 | 31 | ||
| 32 | -**推荐**:使用项目根目录的 `activate.sh` 激活环境(会加载 `.env`)。目前推荐 venv(`.venv`);Conda 仅作为兼容回退(需要 `CONDA_ROOT`)。系统要求、Python 环境、生产凭证与 `.env` 模板见 [QUICKSTART.md](./QUICKSTART.md) §1.4–1.8。 | 32 | +**推荐**:使用项目根目录的 `activate.sh` 激活环境(会加载 `.env`)。当前仅支持 venv(`.venv`),不存在 Conda 回退。系统要求、Python 环境、生产凭证与 `.env` 模板见 [QUICKSTART.md](./QUICKSTART.md) §1.4–1.8。 |
| 33 | 33 | ||
| 34 | ```bash | 34 | ```bash |
| 35 | cd /data/saas-search | 35 | cd /data/saas-search |
| @@ -663,4 +663,3 @@ curl -X POST http://localhost:6002/search/ \ | @@ -663,4 +663,3 @@ curl -X POST http://localhost:6002/search/ \ | ||
| 663 | 663 | ||
| 664 | **文档版本**: v2.0 | 664 | **文档版本**: v2.0 |
| 665 | **最后更新**: 2024-12 | 665 | **最后更新**: 2024-12 |
| 666 | - |
embeddings/__init__.py
| 1 | -""" | ||
| 2 | -Embeddings module. | 1 | +"""Embeddings module exports.""" |
| 3 | 2 | ||
| 4 | -Important: keep package import lightweight. | 3 | +from .text_encoder import TextEmbeddingEncoder |
| 4 | +from .image_encoder import CLIPImageEncoder | ||
| 5 | 5 | ||
| 6 | -Some callers do: | ||
| 7 | - - `from embeddings import TextEmbeddingEncoder` | ||
| 8 | - - `from embeddings import BgeEncoder` (deprecated alias) | ||
| 9 | - - `from embeddings import CLIPImageEncoder` | ||
| 10 | - | ||
| 11 | -But the underlying implementations may import heavy optional deps (Pillow, torch, etc). | ||
| 12 | -To avoid importing those at package import time (and to allow the embedding service to boot | ||
| 13 | -without importing client code), we provide small lazy factories here. | ||
| 14 | -""" | ||
| 15 | - | ||
| 16 | - | ||
| 17 | -class TextEmbeddingEncoder(object): | ||
| 18 | - """Lazy factory for `embeddings.text_encoder.TextEmbeddingEncoder`.""" | ||
| 19 | - | ||
| 20 | - def __new__(cls, *args, **kwargs): | ||
| 21 | - from .text_encoder import TextEmbeddingEncoder as _Real | ||
| 22 | - | ||
| 23 | - return _Real(*args, **kwargs) | ||
| 24 | - | ||
| 25 | - | ||
| 26 | -class BgeEncoder(TextEmbeddingEncoder): | ||
| 27 | - """Deprecated backward-compatible alias for old class name.""" | ||
| 28 | - | ||
| 29 | - | ||
| 30 | -class CLIPImageEncoder(object): | ||
| 31 | - """Lazy factory for `embeddings.image_encoder.CLIPImageEncoder`.""" | ||
| 32 | - | ||
| 33 | - def __new__(cls, *args, **kwargs): | ||
| 34 | - from .image_encoder import CLIPImageEncoder as _Real | ||
| 35 | - | ||
| 36 | - return _Real(*args, **kwargs) | ||
| 37 | - | ||
| 38 | - | ||
| 39 | -__all__ = ["TextEmbeddingEncoder", "BgeEncoder", "CLIPImageEncoder"] | 6 | +__all__ = [ |
| 7 | + "TextEmbeddingEncoder", | ||
| 8 | + "CLIPImageEncoder", | ||
| 9 | +] |
embeddings/clip_as_service_encoder.py
| @@ -65,13 +65,21 @@ class ClipAsServiceImageEncoder: | @@ -65,13 +65,21 @@ class ClipAsServiceImageEncoder: | ||
| 65 | self._server = server | 65 | self._server = server |
| 66 | self._batch_size = batch_size | 66 | self._batch_size = batch_size |
| 67 | self._show_progress = show_progress | 67 | self._show_progress = show_progress |
| 68 | - self._client = Client(server) | 68 | + try: |
| 69 | + self._client = Client(server) | ||
| 70 | + except ModuleNotFoundError as e: | ||
| 71 | + if str(e) == "No module named 'pkg_resources'": | ||
| 72 | + raise RuntimeError( | ||
| 73 | + "clip-as-service requires pkg_resources via jina/hubble. " | ||
| 74 | + "Install compatible setuptools (<82) in current venv." | ||
| 75 | + ) from e | ||
| 76 | + raise | ||
| 69 | 77 | ||
| 70 | def encode_image_urls( | 78 | def encode_image_urls( |
| 71 | self, | 79 | self, |
| 72 | urls: List[str], | 80 | urls: List[str], |
| 73 | batch_size: Optional[int] = None, | 81 | batch_size: Optional[int] = None, |
| 74 | - ) -> List[Optional[np.ndarray]]: | 82 | + ) -> List[np.ndarray]: |
| 75 | """ | 83 | """ |
| 76 | Encode a list of image URLs to vectors. | 84 | Encode a list of image URLs to vectors. |
| 77 | 85 | ||
| @@ -80,42 +88,36 @@ class ClipAsServiceImageEncoder: | @@ -80,42 +88,36 @@ class ClipAsServiceImageEncoder: | ||
| 80 | batch_size: override instance batch_size for this call. | 88 | batch_size: override instance batch_size for this call. |
| 81 | 89 | ||
| 82 | Returns: | 90 | Returns: |
| 83 | - List of vectors (1024-dim float32) or None for failed items, same length as urls. | 91 | + List of vectors (float32), same length as urls. |
| 84 | """ | 92 | """ |
| 85 | if not urls: | 93 | if not urls: |
| 86 | return [] | 94 | return [] |
| 87 | 95 | ||
| 88 | normalized = [_normalize_image_url(u) for u in urls] | 96 | normalized = [_normalize_image_url(u) for u in urls] |
| 89 | - valid_indices = [i for i, u in enumerate(normalized) if u] | ||
| 90 | - if not valid_indices: | ||
| 91 | - return [None] * len(urls) | ||
| 92 | - | ||
| 93 | - valid_urls = [normalized[i] for i in valid_indices] | ||
| 94 | bs = batch_size if batch_size is not None else self._batch_size | 97 | bs = batch_size if batch_size is not None else self._batch_size |
| 95 | - out: List[Optional[np.ndarray]] = [None] * len(urls) | ||
| 96 | - | ||
| 97 | - try: | ||
| 98 | - # Client.encode(iterable of str) returns np.ndarray [N, D] for string input | ||
| 99 | - arr = self._client.encode( | ||
| 100 | - valid_urls, | ||
| 101 | - batch_size=bs, | ||
| 102 | - show_progress=self._show_progress, | 98 | + invalid_indices = [i for i, u in enumerate(normalized) if not u] |
| 99 | + if invalid_indices: | ||
| 100 | + raise ValueError(f"Invalid empty image URL at indices: {invalid_indices}") | ||
| 101 | + | ||
| 102 | + # Client.encode(iterable of str) returns np.ndarray [N, D] for string input | ||
| 103 | + arr = self._client.encode( | ||
| 104 | + normalized, | ||
| 105 | + batch_size=bs, | ||
| 106 | + show_progress=self._show_progress, | ||
| 107 | + ) | ||
| 108 | + if arr is None or not hasattr(arr, "shape"): | ||
| 109 | + raise RuntimeError("clip-as-service encode returned empty result") | ||
| 110 | + if len(arr) != len(normalized): | ||
| 111 | + raise RuntimeError( | ||
| 112 | + f"clip-as-service encode length mismatch: expected {len(normalized)}, got {len(arr)}" | ||
| 103 | ) | 113 | ) |
| 104 | - if arr is not None and hasattr(arr, "shape") and len(arr) == len(valid_indices): | ||
| 105 | - for j, idx in enumerate(valid_indices): | ||
| 106 | - row = arr[j] | ||
| 107 | - if row is not None and hasattr(row, "tolist"): | ||
| 108 | - out[idx] = np.asarray(row, dtype=np.float32) | ||
| 109 | - else: | ||
| 110 | - out[idx] = np.array(row, dtype=np.float32) | ||
| 111 | - else: | ||
| 112 | - logger.warning( | ||
| 113 | - "clip-as-service encode returned unexpected shape/length, " | ||
| 114 | - "expected %d vectors", len(valid_indices) | ||
| 115 | - ) | ||
| 116 | - except Exception as e: | ||
| 117 | - logger.warning("clip-as-service encode failed: %s", e, exc_info=True) | ||
| 118 | 114 | ||
| 115 | + out: List[np.ndarray] = [] | ||
| 116 | + for row in arr: | ||
| 117 | + vec = np.asarray(row, dtype=np.float32) | ||
| 118 | + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all(): | ||
| 119 | + raise RuntimeError("clip-as-service returned invalid embedding vector") | ||
| 120 | + out.append(vec) | ||
| 119 | return out | 121 | return out |
| 120 | 122 | ||
| 121 | def encode_image_from_url(self, url: str) -> Optional[np.ndarray]: | 123 | def encode_image_from_url(self, url: str) -> Optional[np.ndarray]: |
embeddings/image_encoder.py
| @@ -48,16 +48,15 @@ class CLIPImageEncoder: | @@ -48,16 +48,15 @@ class CLIPImageEncoder: | ||
| 48 | logger.error(f"CLIPImageEncoder service request failed: {e}", exc_info=True) | 48 | logger.error(f"CLIPImageEncoder service request failed: {e}", exc_info=True) |
| 49 | raise | 49 | raise |
| 50 | 50 | ||
| 51 | - def encode_image(self, image: Image.Image) -> Optional[np.ndarray]: | 51 | + def encode_image(self, image: Image.Image) -> np.ndarray: |
| 52 | """ | 52 | """ |
| 53 | Encode image to embedding vector using network service. | 53 | Encode image to embedding vector using network service. |
| 54 | 54 | ||
| 55 | Note: This method is kept for compatibility but the service only works with URLs. | 55 | Note: This method is kept for compatibility but the service only works with URLs. |
| 56 | """ | 56 | """ |
| 57 | - logger.warning("encode_image with PIL Image not supported by service, returning None") | ||
| 58 | - return None | 57 | + raise NotImplementedError("encode_image with PIL Image is not supported by embedding service") |
| 59 | 58 | ||
| 60 | - def encode_image_from_url(self, url: str) -> Optional[np.ndarray]: | 59 | + def encode_image_from_url(self, url: str) -> np.ndarray: |
| 61 | """ | 60 | """ |
| 62 | Generate image embedding via network service using URL. | 61 | Generate image embedding via network service using URL. |
| 63 | 62 | ||
| @@ -65,24 +64,21 @@ class CLIPImageEncoder: | @@ -65,24 +64,21 @@ class CLIPImageEncoder: | ||
| 65 | url: Image URL to process | 64 | url: Image URL to process |
| 66 | 65 | ||
| 67 | Returns: | 66 | Returns: |
| 68 | - Embedding vector or None if failed | 67 | + Embedding vector |
| 69 | """ | 68 | """ |
| 70 | - try: | ||
| 71 | - response_data = self._call_service([url]) | ||
| 72 | - if response_data and len(response_data) > 0 and response_data[0] is not None: | ||
| 73 | - return np.array(response_data[0], dtype=np.float32) | ||
| 74 | - logger.warning(f"No embedding for URL {url}") | ||
| 75 | - return None | ||
| 76 | - | ||
| 77 | - except Exception as e: | ||
| 78 | - logger.error(f"Failed to process image from URL {url}: {str(e)}", exc_info=True) | ||
| 79 | - return None | 69 | + response_data = self._call_service([url]) |
| 70 | + if not response_data or len(response_data) != 1 or response_data[0] is None: | ||
| 71 | + raise RuntimeError(f"No image embedding returned for URL: {url}") | ||
| 72 | + vec = np.array(response_data[0], dtype=np.float32) | ||
| 73 | + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all(): | ||
| 74 | + raise RuntimeError(f"Invalid image embedding returned for URL: {url}") | ||
| 75 | + return vec | ||
| 80 | 76 | ||
| 81 | def encode_batch( | 77 | def encode_batch( |
| 82 | self, | 78 | self, |
| 83 | images: List[Union[str, Image.Image]], | 79 | images: List[Union[str, Image.Image]], |
| 84 | batch_size: int = 8 | 80 | batch_size: int = 8 |
| 85 | - ) -> List[Optional[np.ndarray]]: | 81 | + ) -> List[np.ndarray]: |
| 86 | """ | 82 | """ |
| 87 | Encode a batch of images efficiently via network service. | 83 | Encode a batch of images efficiently via network service. |
| 88 | 84 | ||
| @@ -91,50 +87,31 @@ class CLIPImageEncoder: | @@ -91,50 +87,31 @@ class CLIPImageEncoder: | ||
| 91 | batch_size: Batch size for processing (used for service requests) | 87 | batch_size: Batch size for processing (used for service requests) |
| 92 | 88 | ||
| 93 | Returns: | 89 | Returns: |
| 94 | - List of embeddings (or None for failed images) | 90 | + List of embeddings |
| 95 | """ | 91 | """ |
| 96 | - # Initialize results with None for all images | ||
| 97 | - results = [None] * len(images) | ||
| 98 | - | ||
| 99 | - # Filter out PIL Images since service only supports URLs | ||
| 100 | - url_images = [] | ||
| 101 | - url_indices = [] | ||
| 102 | - | ||
| 103 | for i, img in enumerate(images): | 92 | for i, img in enumerate(images): |
| 104 | - if isinstance(img, str): | ||
| 105 | - url_images.append(img) | ||
| 106 | - url_indices.append(i) | ||
| 107 | - elif isinstance(img, Image.Image): | ||
| 108 | - logger.warning(f"PIL Image at index {i} not supported by service, returning None") | ||
| 109 | - # results[i] is already None | ||
| 110 | - | ||
| 111 | - # Process URLs in batches | ||
| 112 | - for i in range(0, len(url_images), batch_size): | ||
| 113 | - batch_urls = url_images[i:i + batch_size] | ||
| 114 | - batch_indices = url_indices[i:i + batch_size] | ||
| 115 | - | ||
| 116 | - try: | ||
| 117 | - # Call service | ||
| 118 | - response_data = self._call_service(batch_urls) | ||
| 119 | - | ||
| 120 | - # Process response (aligned list) | ||
| 121 | - batch_results = [] | ||
| 122 | - for j, url in enumerate(batch_urls): | ||
| 123 | - if response_data and j < len(response_data) and response_data[j] is not None: | ||
| 124 | - batch_results.append(np.array(response_data[j], dtype=np.float32)) | ||
| 125 | - else: | ||
| 126 | - logger.warning(f"Failed to encode URL {url}: no embedding") | ||
| 127 | - batch_results.append(None) | ||
| 128 | - | ||
| 129 | - # Insert results at the correct positions | ||
| 130 | - for j, result in enumerate(batch_results): | ||
| 131 | - results[batch_indices[j]] = result | ||
| 132 | - | ||
| 133 | - except Exception as e: | ||
| 134 | - logger.error(f"Batch processing failed: {e}", exc_info=True) | ||
| 135 | - # Fill with None for this batch | ||
| 136 | - for j in range(len(batch_urls)): | ||
| 137 | - results[batch_indices[j]] = None | 93 | + if isinstance(img, Image.Image): |
| 94 | + raise NotImplementedError(f"PIL Image at index {i} is not supported by service") | ||
| 95 | + if not isinstance(img, str) or not img.strip(): | ||
| 96 | + raise ValueError(f"Invalid image URL/path at index {i}: {img!r}") | ||
| 97 | + | ||
| 98 | + results: List[np.ndarray] = [] | ||
| 99 | + for i in range(0, len(images), batch_size): | ||
| 100 | + batch_urls = [str(u).strip() for u in images[i:i + batch_size]] | ||
| 101 | + response_data = self._call_service(batch_urls) | ||
| 102 | + if not response_data or len(response_data) != len(batch_urls): | ||
| 103 | + raise RuntimeError( | ||
| 104 | + f"Image embedding response length mismatch: expected {len(batch_urls)}, " | ||
| 105 | + f"got {0 if response_data is None else len(response_data)}" | ||
| 106 | + ) | ||
| 107 | + for j, url in enumerate(batch_urls): | ||
| 108 | + embedding = response_data[j] | ||
| 109 | + if embedding is None: | ||
| 110 | + raise RuntimeError(f"No image embedding returned for URL: {url}") | ||
| 111 | + vec = np.array(embedding, dtype=np.float32) | ||
| 112 | + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all(): | ||
| 113 | + raise RuntimeError(f"Invalid image embedding returned for URL: {url}") | ||
| 114 | + results.append(vec) | ||
| 138 | 115 | ||
| 139 | return results | 116 | return results |
| 140 | 117 | ||
| @@ -142,7 +119,7 @@ class CLIPImageEncoder: | @@ -142,7 +119,7 @@ class CLIPImageEncoder: | ||
| 142 | self, | 119 | self, |
| 143 | urls: List[str], | 120 | urls: List[str], |
| 144 | batch_size: Optional[int] = None, | 121 | batch_size: Optional[int] = None, |
| 145 | - ) -> List[Optional[np.ndarray]]: | 122 | + ) -> List[np.ndarray]: |
| 146 | """ | 123 | """ |
| 147 | 与 ClipImageModel / ClipAsServiceImageEncoder 一致的接口,供索引器 document_transformer 调用。 | 124 | 与 ClipImageModel / ClipAsServiceImageEncoder 一致的接口,供索引器 document_transformer 调用。 |
| 148 | 125 | ||
| @@ -151,6 +128,6 @@ class CLIPImageEncoder: | @@ -151,6 +128,6 @@ class CLIPImageEncoder: | ||
| 151 | batch_size: 批大小(默认 8) | 128 | batch_size: 批大小(默认 8) |
| 152 | 129 | ||
| 153 | Returns: | 130 | Returns: |
| 154 | - 与 urls 等长的向量列表,失败为 None | 131 | + 与 urls 等长的向量列表 |
| 155 | """ | 132 | """ |
| 156 | return self.encode_batch(urls, batch_size=batch_size or 8) | 133 | return self.encode_batch(urls, batch_size=batch_size or 8) |
embeddings/qwen3_model.py
| @@ -35,43 +35,15 @@ class Qwen3TextModel(object): | @@ -35,43 +35,15 @@ class Qwen3TextModel(object): | ||
| 35 | ) -> np.ndarray: | 35 | ) -> np.ndarray: |
| 36 | if device == "gpu": | 36 | if device == "gpu": |
| 37 | device = "cuda" | 37 | device = "cuda" |
| 38 | - | ||
| 39 | - # Try requested device, fallback to CPU if CUDA is unavailable/insufficient. | ||
| 40 | - try: | ||
| 41 | - if device == "cuda": | ||
| 42 | - import torch | ||
| 43 | - | ||
| 44 | - if torch.cuda.is_available(): | ||
| 45 | - free_memory = ( | ||
| 46 | - torch.cuda.get_device_properties(0).total_memory | ||
| 47 | - - torch.cuda.memory_allocated() | ||
| 48 | - ) | ||
| 49 | - if free_memory < 1024 * 1024 * 1024: # 1GB | ||
| 50 | - device = "cpu" | ||
| 51 | - else: | ||
| 52 | - device = "cpu" | ||
| 53 | - | ||
| 54 | - self.model = self.model.to(device) | ||
| 55 | - embeddings = self.model.encode( | ||
| 56 | - sentences, | ||
| 57 | - normalize_embeddings=normalize_embeddings, | ||
| 58 | - device=device, | ||
| 59 | - show_progress_bar=False, | ||
| 60 | - batch_size=batch_size, | ||
| 61 | - ) | ||
| 62 | - return embeddings | ||
| 63 | - except Exception: | ||
| 64 | - if device != "cpu": | ||
| 65 | - self.model = self.model.to("cpu") | ||
| 66 | - embeddings = self.model.encode( | ||
| 67 | - sentences, | ||
| 68 | - normalize_embeddings=normalize_embeddings, | ||
| 69 | - device="cpu", | ||
| 70 | - show_progress_bar=False, | ||
| 71 | - batch_size=batch_size, | ||
| 72 | - ) | ||
| 73 | - return embeddings | ||
| 74 | - raise | 38 | + self.model = self.model.to(device) |
| 39 | + embeddings = self.model.encode( | ||
| 40 | + sentences, | ||
| 41 | + normalize_embeddings=normalize_embeddings, | ||
| 42 | + device=device, | ||
| 43 | + show_progress_bar=False, | ||
| 44 | + batch_size=batch_size, | ||
| 45 | + ) | ||
| 46 | + return embeddings | ||
| 75 | 47 | ||
| 76 | def encode_batch( | 48 | def encode_batch( |
| 77 | self, | 49 | self, |
| @@ -86,4 +58,3 @@ class Qwen3TextModel(object): | @@ -86,4 +58,3 @@ class Qwen3TextModel(object): | ||
| 86 | device=device, | 58 | device=device, |
| 87 | normalize_embeddings=normalize_embeddings, | 59 | normalize_embeddings=normalize_embeddings, |
| 88 | ) | 60 | ) |
| 89 | - |
embeddings/server.py
| 1 | """ | 1 | """ |
| 2 | Embedding service (FastAPI). | 2 | Embedding service (FastAPI). |
| 3 | 3 | ||
| 4 | -API (simple list-in, list-out; aligned by index; failures -> null): | ||
| 5 | -- POST /embed/text body: ["text1", "text2", ...] -> [[...], null, ...] | ||
| 6 | -- POST /embed/image body: ["url_or_path1", ...] -> [[...], null, ...] | 4 | +API (simple list-in, list-out; aligned by index): |
| 5 | +- POST /embed/text body: ["text1", "text2", ...] -> [[...], ...] | ||
| 6 | +- POST /embed/image body: ["url_or_path1", ...] -> [[...], ...] | ||
| 7 | """ | 7 | """ |
| 8 | 8 | ||
| 9 | import logging | 9 | import logging |
| @@ -11,7 +11,7 @@ import threading | @@ -11,7 +11,7 @@ import threading | ||
| 11 | from typing import Any, Dict, List, Optional | 11 | from typing import Any, Dict, List, Optional |
| 12 | 12 | ||
| 13 | import numpy as np | 13 | import numpy as np |
| 14 | -from fastapi import FastAPI | 14 | +from fastapi import FastAPI, HTTPException |
| 15 | 15 | ||
| 16 | from embeddings.config import CONFIG | 16 | from embeddings.config import CONFIG |
| 17 | from embeddings.protocols import ImageEncoderProtocol | 17 | from embeddings.protocols import ImageEncoderProtocol |
| @@ -51,9 +51,6 @@ def load_models(): | @@ -51,9 +51,6 @@ def load_models(): | ||
| 51 | 51 | ||
| 52 | 52 | ||
| 53 | # Load image model: clip-as-service (recommended) or local CN-CLIP | 53 | # Load image model: clip-as-service (recommended) or local CN-CLIP |
| 54 | - # IMPORTANT: failures here should NOT prevent the whole service from starting. | ||
| 55 | - # If image model cannot be loaded, we keep `_image_model` as None and only | ||
| 56 | - # disable /embed/image while keeping /embed/text fully functional. | ||
| 57 | if open_image_model: | 54 | if open_image_model: |
| 58 | try: | 55 | try: |
| 59 | if CONFIG.USE_CLIP_AS_SERVICE: | 56 | if CONFIG.USE_CLIP_AS_SERVICE: |
| @@ -75,12 +72,8 @@ def load_models(): | @@ -75,12 +72,8 @@ def load_models(): | ||
| 75 | ) | 72 | ) |
| 76 | logger.info("Image model (local CN-CLIP) loaded successfully") | 73 | logger.info("Image model (local CN-CLIP) loaded successfully") |
| 77 | except Exception as e: | 74 | except Exception as e: |
| 78 | - logger.error( | ||
| 79 | - "Failed to load image model; image embeddings will be disabled but text embeddings remain available: %s", | ||
| 80 | - e, | ||
| 81 | - exc_info=True, | ||
| 82 | - ) | ||
| 83 | - _image_model = None | 75 | + logger.error("Failed to load image model: %s", e, exc_info=True) |
| 76 | + raise | ||
| 84 | 77 | ||
| 85 | logger.info("All embedding models loaded successfully, service ready") | 78 | logger.info("All embedding models loaded successfully, service ready") |
| 86 | 79 | ||
| @@ -109,70 +102,60 @@ def health() -> Dict[str, Any]: | @@ -109,70 +102,60 @@ def health() -> Dict[str, Any]: | ||
| 109 | def embed_text(texts: List[str]) -> List[Optional[List[float]]]: | 102 | def embed_text(texts: List[str]) -> List[Optional[List[float]]]: |
| 110 | if _text_model is None: | 103 | if _text_model is None: |
| 111 | raise RuntimeError("Text model not loaded") | 104 | raise RuntimeError("Text model not loaded") |
| 112 | - out: List[Optional[List[float]]] = [None] * len(texts) | ||
| 113 | - | ||
| 114 | - indexed_texts: List[tuple] = [] | 105 | + normalized: List[str] = [] |
| 115 | for i, t in enumerate(texts): | 106 | for i, t in enumerate(texts): |
| 116 | - if t is None: | ||
| 117 | - continue | ||
| 118 | if not isinstance(t, str): | 107 | if not isinstance(t, str): |
| 119 | - t = str(t) | ||
| 120 | - t = t.strip() | ||
| 121 | - if not t: | ||
| 122 | - continue | ||
| 123 | - indexed_texts.append((i, t)) | ||
| 124 | - | ||
| 125 | - if not indexed_texts: | ||
| 126 | - return out | ||
| 127 | - | ||
| 128 | - batch_texts = [t for _, t in indexed_texts] | ||
| 129 | - try: | ||
| 130 | - with _text_encode_lock: | ||
| 131 | - embs = _text_model.encode_batch( | ||
| 132 | - batch_texts, | ||
| 133 | - batch_size=int(CONFIG.TEXT_BATCH_SIZE), | ||
| 134 | - device=CONFIG.TEXT_DEVICE, | ||
| 135 | - normalize_embeddings=bool(CONFIG.TEXT_NORMALIZE_EMBEDDINGS), | ||
| 136 | - ) | ||
| 137 | - for j, (idx, _t) in enumerate(indexed_texts): | ||
| 138 | - out[idx] = _as_list(embs[j]) | ||
| 139 | - except Exception: | ||
| 140 | - # keep Nones | ||
| 141 | - pass | 108 | + raise HTTPException(status_code=400, detail=f"Invalid text at index {i}: must be string") |
| 109 | + s = t.strip() | ||
| 110 | + if not s: | ||
| 111 | + raise HTTPException(status_code=400, detail=f"Invalid text at index {i}: empty string") | ||
| 112 | + normalized.append(s) | ||
| 113 | + | ||
| 114 | + with _text_encode_lock: | ||
| 115 | + embs = _text_model.encode_batch( | ||
| 116 | + normalized, | ||
| 117 | + batch_size=int(CONFIG.TEXT_BATCH_SIZE), | ||
| 118 | + device=CONFIG.TEXT_DEVICE, | ||
| 119 | + normalize_embeddings=bool(CONFIG.TEXT_NORMALIZE_EMBEDDINGS), | ||
| 120 | + ) | ||
| 121 | + if embs is None or len(embs) != len(normalized): | ||
| 122 | + raise RuntimeError( | ||
| 123 | + f"Text model response length mismatch: expected {len(normalized)}, " | ||
| 124 | + f"got {0 if embs is None else len(embs)}" | ||
| 125 | + ) | ||
| 126 | + out: List[Optional[List[float]]] = [] | ||
| 127 | + for i, emb in enumerate(embs): | ||
| 128 | + vec = _as_list(emb) | ||
| 129 | + if vec is None: | ||
| 130 | + raise RuntimeError(f"Text model returned empty embedding for index {i}") | ||
| 131 | + out.append(vec) | ||
| 142 | return out | 132 | return out |
| 143 | 133 | ||
| 144 | 134 | ||
| 145 | @app.post("/embed/image") | 135 | @app.post("/embed/image") |
| 146 | def embed_image(images: List[str]) -> List[Optional[List[float]]]: | 136 | def embed_image(images: List[str]) -> List[Optional[List[float]]]: |
| 147 | if _image_model is None: | 137 | if _image_model is None: |
| 148 | - # Graceful degradation: keep API shape but return all None | ||
| 149 | - logger.warning("embed_image called but image model is not loaded; returning all None vectors") | ||
| 150 | - return [None] * len(images) | ||
| 151 | - out: List[Optional[List[float]]] = [None] * len(images) | ||
| 152 | - | ||
| 153 | - # Normalize inputs | ||
| 154 | - urls = [] | ||
| 155 | - indices = [] | 138 | + raise RuntimeError("Image model not loaded") |
| 139 | + urls: List[str] = [] | ||
| 156 | for i, url_or_path in enumerate(images): | 140 | for i, url_or_path in enumerate(images): |
| 157 | - if url_or_path is None: | ||
| 158 | - continue | ||
| 159 | if not isinstance(url_or_path, str): | 141 | if not isinstance(url_or_path, str): |
| 160 | - url_or_path = str(url_or_path) | ||
| 161 | - url_or_path = url_or_path.strip() | ||
| 162 | - if url_or_path: | ||
| 163 | - urls.append(url_or_path) | ||
| 164 | - indices.append(i) | ||
| 165 | - | ||
| 166 | - if not urls: | ||
| 167 | - return out | 142 | + raise HTTPException(status_code=400, detail=f"Invalid image at index {i}: must be string URL/path") |
| 143 | + s = url_or_path.strip() | ||
| 144 | + if not s: | ||
| 145 | + raise HTTPException(status_code=400, detail=f"Invalid image at index {i}: empty URL/path") | ||
| 146 | + urls.append(s) | ||
| 168 | 147 | ||
| 169 | with _image_encode_lock: | 148 | with _image_encode_lock: |
| 170 | - try: | ||
| 171 | - # Both ClipAsServiceImageEncoder and ClipImageModel implement encode_image_urls(urls, batch_size) | ||
| 172 | - vectors = _image_model.encode_image_urls(urls, batch_size=CONFIG.IMAGE_BATCH_SIZE) | ||
| 173 | - for j, idx in enumerate(indices): | ||
| 174 | - out[idx] = _as_list(vectors[j] if j < len(vectors) else None) | ||
| 175 | - except Exception: | ||
| 176 | - for idx in indices: | ||
| 177 | - out[idx] = None | 149 | + vectors = _image_model.encode_image_urls(urls, batch_size=CONFIG.IMAGE_BATCH_SIZE) |
| 150 | + if vectors is None or len(vectors) != len(urls): | ||
| 151 | + raise RuntimeError( | ||
| 152 | + f"Image model response length mismatch: expected {len(urls)}, " | ||
| 153 | + f"got {0 if vectors is None else len(vectors)}" | ||
| 154 | + ) | ||
| 155 | + out: List[Optional[List[float]]] = [] | ||
| 156 | + for i, vec in enumerate(vectors): | ||
| 157 | + out_vec = _as_list(vec) | ||
| 158 | + if out_vec is None: | ||
| 159 | + raise RuntimeError(f"Image model returned empty embedding for index {i}") | ||
| 160 | + out.append(out_vec) | ||
| 178 | return out | 161 | return out |
embeddings/text_encoder.py
| @@ -89,9 +89,8 @@ class TextEmbeddingEncoder: | @@ -89,9 +89,8 @@ class TextEmbeddingEncoder: | ||
| 89 | batch_size: Batch size for processing (used for service requests) | 89 | batch_size: Batch size for processing (used for service requests) |
| 90 | 90 | ||
| 91 | Returns: | 91 | Returns: |
| 92 | - numpy array of dtype=object, where each element is either: | ||
| 93 | - - np.ndarray (valid embedding vector) or | ||
| 94 | - - None (no embedding available for that text) | 92 | + numpy array of dtype=object,元素均为有效 np.ndarray 向量。 |
| 93 | + 若任一输入无法生成向量,将直接抛出异常。 | ||
| 95 | """ | 94 | """ |
| 96 | # Convert single string to list | 95 | # Convert single string to list |
| 97 | if isinstance(sentences, str): | 96 | if isinstance(sentences, str): |
| @@ -101,8 +100,6 @@ class TextEmbeddingEncoder: | @@ -101,8 +100,6 @@ class TextEmbeddingEncoder: | ||
| 101 | uncached_indices: List[int] = [] | 100 | uncached_indices: List[int] = [] |
| 102 | uncached_texts: List[str] = [] | 101 | uncached_texts: List[str] = [] |
| 103 | 102 | ||
| 104 | - # Process response | ||
| 105 | - # Each element can be np.ndarray or None (表示该文本没有可用的向量) | ||
| 106 | embeddings: List[Optional[np.ndarray]] = [None] * len(sentences) | 103 | embeddings: List[Optional[np.ndarray]] = [None] * len(sentences) |
| 107 | 104 | ||
| 108 | for i, text in enumerate(sentences): | 105 | for i, text in enumerate(sentences): |
| @@ -118,40 +115,27 @@ class TextEmbeddingEncoder: | @@ -118,40 +115,27 @@ class TextEmbeddingEncoder: | ||
| 118 | 115 | ||
| 119 | # If there are uncached texts, call service | 116 | # If there are uncached texts, call service |
| 120 | if uncached_texts: | 117 | if uncached_texts: |
| 121 | - try: | ||
| 122 | - # Call service | ||
| 123 | - response_data = self._call_service(request_data) | 118 | + response_data = self._call_service(request_data) |
| 124 | 119 | ||
| 125 | - # Process response | ||
| 126 | - for i, text in enumerate(uncached_texts): | ||
| 127 | - original_idx = uncached_indices[i] | ||
| 128 | - if response_data and i < len(response_data): | ||
| 129 | - embedding = response_data[i] | ||
| 130 | - else: | ||
| 131 | - embedding = None | 120 | + # Process response |
| 121 | + for i, text in enumerate(uncached_texts): | ||
| 122 | + original_idx = uncached_indices[i] | ||
| 123 | + if response_data and i < len(response_data): | ||
| 124 | + embedding = response_data[i] | ||
| 125 | + else: | ||
| 126 | + embedding = None | ||
| 132 | 127 | ||
| 133 | - if embedding is not None: | ||
| 134 | - embedding_array = np.array(embedding, dtype=np.float32) | ||
| 135 | - # Validate embedding from service - if invalid, treat as no result | ||
| 136 | - if self._is_valid_embedding(embedding_array): | ||
| 137 | - embeddings[original_idx] = embedding_array | ||
| 138 | - # Cache the embedding | ||
| 139 | - self._set_cached_embedding(text, "generic", embedding_array) | ||
| 140 | - else: | ||
| 141 | - logger.warning( | ||
| 142 | - f"Invalid embedding returned from service for text {original_idx} " | ||
| 143 | - f"(contains NaN/Inf or invalid shape), treating as no result. " | ||
| 144 | - f"Text preview: {text[:50]}..." | ||
| 145 | - ) | ||
| 146 | - embeddings[original_idx] = None | 128 | + if embedding is not None: |
| 129 | + embedding_array = np.array(embedding, dtype=np.float32) | ||
| 130 | + if self._is_valid_embedding(embedding_array): | ||
| 131 | + embeddings[original_idx] = embedding_array | ||
| 132 | + self._set_cached_embedding(text, "generic", embedding_array) | ||
| 147 | else: | 133 | else: |
| 148 | - logger.warning(f"No embedding found for text {original_idx}: {text[:50]}...") | ||
| 149 | - embeddings[original_idx] = None | ||
| 150 | - | ||
| 151 | - except Exception as e: | ||
| 152 | - logger.error(f"Failed to encode texts: {e}", exc_info=True) | ||
| 153 | - # 出错时不要生成兜底全零向量,保持为 None | ||
| 154 | - pass | 134 | + raise ValueError( |
| 135 | + f"Invalid embedding returned from service for text index {original_idx}" | ||
| 136 | + ) | ||
| 137 | + else: | ||
| 138 | + raise ValueError(f"No embedding found for text index {original_idx}: {text[:50]}...") | ||
| 155 | 139 | ||
| 156 | # 返回 numpy 数组(dtype=object),元素为 np.ndarray 或 None | 140 | # 返回 numpy 数组(dtype=object),元素为 np.ndarray 或 None |
| 157 | return np.array(embeddings, dtype=object) | 141 | return np.array(embeddings, dtype=object) |
| @@ -254,7 +238,3 @@ class TextEmbeddingEncoder: | @@ -254,7 +238,3 @@ class TextEmbeddingEncoder: | ||
| 254 | except Exception as e: | 238 | except Exception as e: |
| 255 | logger.error(f"Error storing embedding in cache: {e}") | 239 | logger.error(f"Error storing embedding in cache: {e}") |
| 256 | return False | 240 | return False |
| 257 | - | ||
| 258 | - | ||
| 259 | -# Backward compatibility for existing imports/usages. | ||
| 260 | -BgeEncoder = TextEmbeddingEncoder |
indexer/document_transformer.py
| @@ -406,23 +406,21 @@ class SPUDocumentTransformer: | @@ -406,23 +406,21 @@ class SPUDocumentTransformer: | ||
| 406 | 406 | ||
| 407 | if not urls: | 407 | if not urls: |
| 408 | return | 408 | return |
| 409 | - try: | ||
| 410 | - vectors = self.image_encoder.encode_image_urls(urls, batch_size=8) | ||
| 411 | - if not vectors or len(vectors) != len(urls): | ||
| 412 | - return | ||
| 413 | - out = [] | ||
| 414 | - for url, vec in zip(urls, vectors): | ||
| 415 | - if vec is None: | ||
| 416 | - continue | ||
| 417 | - if isinstance(vec, np.ndarray): | ||
| 418 | - vec = vec.astype(np.float32) | ||
| 419 | - out.append({"vector": vec.tolist(), "url": url}) | ||
| 420 | - elif hasattr(vec, "tolist"): | ||
| 421 | - out.append({"vector": vec.tolist(), "url": url}) | ||
| 422 | - if out: | ||
| 423 | - doc["image_embedding"] = out | ||
| 424 | - except Exception as e: | ||
| 425 | - logger.warning("Failed to generate image_embedding for SPU %s: %s", doc.get("spu_id"), e) | 409 | + vectors = self.image_encoder.encode_image_urls(urls, batch_size=8) |
| 410 | + if not vectors or len(vectors) != len(urls): | ||
| 411 | + raise RuntimeError( | ||
| 412 | + f"image_embedding response length mismatch for SPU {doc.get('spu_id')}: " | ||
| 413 | + f"expected {len(urls)}, got {0 if vectors is None else len(vectors)}" | ||
| 414 | + ) | ||
| 415 | + out = [] | ||
| 416 | + for url, vec in zip(urls, vectors): | ||
| 417 | + arr = np.asarray(vec, dtype=np.float32) | ||
| 418 | + if arr.ndim != 1 or arr.size == 0 or not np.isfinite(arr).all(): | ||
| 419 | + raise RuntimeError( | ||
| 420 | + f"Invalid image embedding for SPU {doc.get('spu_id')} and URL {url}" | ||
| 421 | + ) | ||
| 422 | + out.append({"vector": arr.tolist(), "url": url}) | ||
| 423 | + doc["image_embedding"] = out | ||
| 426 | 424 | ||
| 427 | def _process_skus( | 425 | def _process_skus( |
| 428 | self, | 426 | self, |
| @@ -722,21 +720,14 @@ class SPUDocumentTransformer: | @@ -722,21 +720,14 @@ class SPUDocumentTransformer: | ||
| 722 | logger.debug(f"No title text available for embedding, SPU: {doc.get('spu_id')}") | 720 | logger.debug(f"No title text available for embedding, SPU: {doc.get('spu_id')}") |
| 723 | return | 721 | return |
| 724 | 722 | ||
| 725 | - try: | ||
| 726 | - # 使用文本向量编码器生成 embedding | ||
| 727 | - # encode方法返回numpy数组,形状为(n, 1024) | ||
| 728 | - embeddings = self.encoder.encode(title_text) | ||
| 729 | - | ||
| 730 | - if embeddings is not None and len(embeddings) > 0: | ||
| 731 | - # 取第一个embedding(因为只传了一个文本) | ||
| 732 | - embedding = embeddings[0] | ||
| 733 | - if not isinstance(embedding, np.ndarray): | ||
| 734 | - logger.warning(f"Embedding is None/invalid for title: {title_text[:50]}...") | ||
| 735 | - return | ||
| 736 | - # 转换为列表格式(ES需要) | ||
| 737 | - doc['title_embedding'] = embedding.tolist() | ||
| 738 | - logger.debug(f"Generated title_embedding for SPU: {doc.get('spu_id')}, title: {title_text[:50]}...") | ||
| 739 | - else: | ||
| 740 | - logger.warning(f"Failed to generate embedding for title: {title_text[:50]}...") | ||
| 741 | - except Exception as e: | ||
| 742 | - logger.error(f"Error generating title_embedding for SPU {doc.get('spu_id')}: {e}", exc_info=True) | 723 | + # 使用文本向量编码器生成 embedding |
| 724 | + # encode方法返回numpy数组,形状为(n, d) | ||
| 725 | + embeddings = self.encoder.encode(title_text) | ||
| 726 | + if embeddings is None or len(embeddings) == 0: | ||
| 727 | + raise RuntimeError(f"Failed to generate title embedding for SPU {doc.get('spu_id')}") | ||
| 728 | + | ||
| 729 | + embedding = np.asarray(embeddings[0], dtype=np.float32) | ||
| 730 | + if embedding.ndim != 1 or embedding.size == 0 or not np.isfinite(embedding).all(): | ||
| 731 | + raise RuntimeError(f"Invalid title embedding for SPU {doc.get('spu_id')}") | ||
| 732 | + doc['title_embedding'] = embedding.tolist() | ||
| 733 | + logger.debug(f"Generated title_embedding for SPU: {doc.get('spu_id')}, title: {title_text[:50]}...") |
indexer/incremental_service.py
| @@ -33,9 +33,9 @@ class IncrementalIndexerService: | @@ -33,9 +33,9 @@ class IncrementalIndexerService: | ||
| 33 | logger.info(f"Preloaded {len(self.category_id_to_name)} category mappings") | 33 | logger.info(f"Preloaded {len(self.category_id_to_name)} category mappings") |
| 34 | 34 | ||
| 35 | # 缓存:避免频繁增量请求重复加载 config / 构造 transformer | 35 | # 缓存:避免频繁增量请求重复加载 config / 构造 transformer |
| 36 | - # NOTE: 为避免“首请求”懒加载导致超时,尽量在进程启动阶段完成初始化: | 36 | + # 启动阶段强校验初始化: |
| 37 | # - config.yaml 加载 | 37 | # - config.yaml 加载 |
| 38 | - # - translator / embedding / image encoder provider 初始化(best-effort) | 38 | + # - translator / embedding / image encoder provider 初始化 |
| 39 | self._config: Optional[Any] = None | 39 | self._config: Optional[Any] = None |
| 40 | self._config_lock = threading.Lock() | 40 | self._config_lock = threading.Lock() |
| 41 | self._translator: Optional[Any] = None | 41 | self._translator: Optional[Any] = None |
| @@ -50,59 +50,35 @@ class IncrementalIndexerService: | @@ -50,59 +50,35 @@ class IncrementalIndexerService: | ||
| 50 | self._transformer_cache_lock = threading.Lock() | 50 | self._transformer_cache_lock = threading.Lock() |
| 51 | 51 | ||
| 52 | def _eager_init(self) -> None: | 52 | def _eager_init(self) -> None: |
| 53 | - """Best-effort eager initialization to reduce first-request latency.""" | ||
| 54 | - try: | ||
| 55 | - self._config = ConfigLoader("config/config.yaml").load_config() | ||
| 56 | - except Exception as e: | ||
| 57 | - logger.warning("Failed to eagerly load config/config.yaml: %s", e, exc_info=True) | ||
| 58 | - self._config = None | ||
| 59 | - return | 53 | + """Strict eager initialization. Any dependency failure should fail fast.""" |
| 54 | + self._config = ConfigLoader("config/config.yaml").load_config() | ||
| 55 | + self._translation_prompts = getattr(self._config.query_config, "translation_prompts", {}) or {} | ||
| 56 | + self._searchable_option_dimensions = ( | ||
| 57 | + getattr(self._config.spu_config, "searchable_option_dimensions", None) | ||
| 58 | + or ["option1", "option2", "option3"] | ||
| 59 | + ) | ||
| 60 | 60 | ||
| 61 | - try: | ||
| 62 | - self._translation_prompts = getattr(self._config.query_config, "translation_prompts", {}) or {} | ||
| 63 | - self._searchable_option_dimensions = ( | ||
| 64 | - getattr(self._config.spu_config, "searchable_option_dimensions", None) | ||
| 65 | - or ["option1", "option2", "option3"] | ||
| 66 | - ) | ||
| 67 | - except Exception: | ||
| 68 | - self._translation_prompts = {} | ||
| 69 | - self._searchable_option_dimensions = ["option1", "option2", "option3"] | 61 | + from providers import create_translation_provider |
| 70 | 62 | ||
| 71 | - # Translator provider (best-effort) | ||
| 72 | - try: | ||
| 73 | - from providers import create_translation_provider | 63 | + self._translator = create_translation_provider(self._config.query_config) |
| 74 | 64 | ||
| 75 | - self._translator = create_translation_provider(self._config.query_config) | ||
| 76 | - except Exception as e: | ||
| 77 | - logger.warning("Failed to initialize translation provider at startup: %s", e) | ||
| 78 | - self._translator = None | ||
| 79 | - | ||
| 80 | - # Text embedding encoder (best-effort) | 65 | + # Text embedding encoder (strict when enabled) |
| 81 | if bool(getattr(self._config.query_config, "enable_text_embedding", False)): | 66 | if bool(getattr(self._config.query_config, "enable_text_embedding", False)): |
| 82 | - try: | ||
| 83 | - from embeddings.text_encoder import TextEmbeddingEncoder | 67 | + from embeddings.text_encoder import TextEmbeddingEncoder |
| 84 | 68 | ||
| 85 | - self._shared_text_encoder = TextEmbeddingEncoder() | ||
| 86 | - except Exception as e: | ||
| 87 | - logger.warning("Failed to initialize TextEmbeddingEncoder at startup: %s", e) | ||
| 88 | - self._shared_text_encoder = None | 69 | + self._shared_text_encoder = TextEmbeddingEncoder() |
| 70 | + else: | ||
| 71 | + self._shared_text_encoder = None | ||
| 89 | 72 | ||
| 90 | - # Image embedding encoder (best-effort; may be unavailable if embedding service not running) | ||
| 91 | - try: | ||
| 92 | - from embeddings.image_encoder import CLIPImageEncoder | 73 | + # Image embedding encoder (strict) |
| 74 | + from embeddings.image_encoder import CLIPImageEncoder | ||
| 93 | 75 | ||
| 94 | - self._shared_image_encoder = CLIPImageEncoder() | ||
| 95 | - except Exception as e: | ||
| 96 | - logger.debug("Image encoder not available for indexer startup: %s", e) | ||
| 97 | - self._shared_image_encoder = None | 76 | + self._shared_image_encoder = CLIPImageEncoder() |
| 98 | 77 | ||
| 99 | def _get_config(self) -> Any: | 78 | def _get_config(self) -> Any: |
| 100 | """Load config once per process (thread-safe).""" | 79 | """Load config once per process (thread-safe).""" |
| 101 | - if self._config is not None: | ||
| 102 | - return self._config | ||
| 103 | - with self._config_lock: | ||
| 104 | - if self._config is None: | ||
| 105 | - self._config = ConfigLoader("config/config.yaml").load_config() | 80 | + if self._config is None: |
| 81 | + raise RuntimeError("Indexer config is not initialized") | ||
| 106 | return self._config | 82 | return self._config |
| 107 | 83 | ||
| 108 | def _get_transformer_bundle(self, tenant_id: str) -> Tuple[Any, Optional[Any], bool]: | 84 | def _get_transformer_bundle(self, tenant_id: str) -> Tuple[Any, Optional[Any], bool]: |
| @@ -121,32 +97,13 @@ class IncrementalIndexerService: | @@ -121,32 +97,13 @@ class IncrementalIndexerService: | ||
| 121 | config = self._get_config() | 97 | config = self._get_config() |
| 122 | enable_embedding = bool(getattr(config.query_config, "enable_text_embedding", False)) | 98 | enable_embedding = bool(getattr(config.query_config, "enable_text_embedding", False)) |
| 123 | 99 | ||
| 124 | - # Use shared encoders/providers preloaded at startup when可用; | ||
| 125 | - # 若启动时初始化失败,则在首次请求时做一次兜底初始化,避免永久禁用。 | ||
| 126 | encoder: Optional[Any] = self._shared_text_encoder if enable_embedding else None | 100 | encoder: Optional[Any] = self._shared_text_encoder if enable_embedding else None |
| 127 | if enable_embedding and encoder is None: | 101 | if enable_embedding and encoder is None: |
| 128 | - try: | ||
| 129 | - from embeddings.text_encoder import TextEmbeddingEncoder | ||
| 130 | - | ||
| 131 | - encoder = TextEmbeddingEncoder() | ||
| 132 | - self._shared_text_encoder = encoder | ||
| 133 | - logger.info("TextEmbeddingEncoder lazily initialized in _get_transformer_bundle") | ||
| 134 | - except Exception as e: | ||
| 135 | - logger.warning("Failed to lazily initialize TextEmbeddingEncoder for tenant_id=%s: %s", tenant_id, e) | ||
| 136 | - encoder = None | ||
| 137 | - enable_embedding = False | 102 | + raise RuntimeError("Text embedding is enabled but TextEmbeddingEncoder is not initialized") |
| 138 | 103 | ||
| 139 | image_encoder: Optional[Any] = self._shared_image_encoder | 104 | image_encoder: Optional[Any] = self._shared_image_encoder |
| 140 | if image_encoder is None: | 105 | if image_encoder is None: |
| 141 | - try: | ||
| 142 | - from embeddings.image_encoder import CLIPImageEncoder | ||
| 143 | - | ||
| 144 | - image_encoder = CLIPImageEncoder() | ||
| 145 | - self._shared_image_encoder = image_encoder | ||
| 146 | - logger.info("CLIPImageEncoder lazily initialized in _get_transformer_bundle") | ||
| 147 | - except Exception as e: | ||
| 148 | - logger.debug("Image encoder not available for indexer (lazy init): %s", e) | ||
| 149 | - image_encoder = None | 106 | + raise RuntimeError("CLIPImageEncoder is not initialized") |
| 150 | 107 | ||
| 151 | transformer = create_document_transformer( | 108 | transformer = create_document_transformer( |
| 152 | category_id_to_name=self.category_id_to_name, | 109 | category_id_to_name=self.category_id_to_name, |
| @@ -157,7 +114,7 @@ class IncrementalIndexerService: | @@ -157,7 +114,7 @@ class IncrementalIndexerService: | ||
| 157 | encoder=encoder, | 114 | encoder=encoder, |
| 158 | enable_title_embedding=False, # batch fill later | 115 | enable_title_embedding=False, # batch fill later |
| 159 | image_encoder=image_encoder, | 116 | image_encoder=image_encoder, |
| 160 | - enable_image_embedding=(image_encoder is not None), | 117 | + enable_image_embedding=True, |
| 161 | config=config, | 118 | config=config, |
| 162 | ) | 119 | ) |
| 163 | 120 | ||
| @@ -236,14 +193,13 @@ class IncrementalIndexerService: | @@ -236,14 +193,13 @@ class IncrementalIndexerService: | ||
| 236 | title_text = str(v) | 193 | title_text = str(v) |
| 237 | break | 194 | break |
| 238 | if title_text and str(title_text).strip(): | 195 | if title_text and str(title_text).strip(): |
| 239 | - try: | ||
| 240 | - embeddings = encoder.encode(title_text) | ||
| 241 | - if embeddings is not None and len(embeddings) > 0: | ||
| 242 | - emb0 = embeddings[0] | ||
| 243 | - if isinstance(emb0, np.ndarray): | ||
| 244 | - doc["title_embedding"] = emb0.tolist() | ||
| 245 | - except Exception as e: | ||
| 246 | - logger.warning(f"Failed to generate embedding for spu_id={spu_id}: {e}") | 196 | + embeddings = encoder.encode(title_text) |
| 197 | + if embeddings is None or len(embeddings) == 0: | ||
| 198 | + raise RuntimeError(f"Failed to generate title embedding for spu_id={spu_id}") | ||
| 199 | + emb0 = np.asarray(embeddings[0], dtype=np.float32) | ||
| 200 | + if emb0.ndim != 1 or emb0.size == 0 or not np.isfinite(emb0).all(): | ||
| 201 | + raise RuntimeError(f"Invalid title embedding for spu_id={spu_id}") | ||
| 202 | + doc["title_embedding"] = emb0.tolist() | ||
| 247 | 203 | ||
| 248 | return doc | 204 | return doc |
| 249 | 205 | ||
| @@ -678,14 +634,20 @@ class IncrementalIndexerService: | @@ -678,14 +634,20 @@ class IncrementalIndexerService: | ||
| 678 | title_doc_indices.append(i) | 634 | title_doc_indices.append(i) |
| 679 | 635 | ||
| 680 | if title_texts: | 636 | if title_texts: |
| 681 | - try: | ||
| 682 | - embeddings = encoder.encode_batch(title_texts, batch_size=32) | ||
| 683 | - for j, emb in enumerate(embeddings): | ||
| 684 | - doc_idx = title_doc_indices[j] | ||
| 685 | - if isinstance(emb, np.ndarray): | ||
| 686 | - documents[doc_idx][1]["title_embedding"] = emb.tolist() | ||
| 687 | - except Exception as e: | ||
| 688 | - logger.warning(f"[IncrementalIndexing] Batch embedding failed for tenant_id={tenant_id}: {e}", exc_info=True) | 637 | + embeddings = encoder.encode_batch(title_texts, batch_size=32) |
| 638 | + if embeddings is None or len(embeddings) != len(title_texts): | ||
| 639 | + raise RuntimeError( | ||
| 640 | + f"[IncrementalIndexing] Batch embedding length mismatch for tenant_id={tenant_id}: " | ||
| 641 | + f"expected {len(title_texts)}, got {0 if embeddings is None else len(embeddings)}" | ||
| 642 | + ) | ||
| 643 | + for j, emb in enumerate(embeddings): | ||
| 644 | + vec = np.asarray(emb, dtype=np.float32) | ||
| 645 | + if vec.ndim != 1 or vec.size == 0 or not np.isfinite(vec).all(): | ||
| 646 | + raise RuntimeError( | ||
| 647 | + f"[IncrementalIndexing] Invalid title embedding in batch for tenant_id={tenant_id}, index={j}" | ||
| 648 | + ) | ||
| 649 | + doc_idx = title_doc_indices[j] | ||
| 650 | + documents[doc_idx][1]["title_embedding"] = vec.tolist() | ||
| 689 | 651 | ||
| 690 | logger.info(f"[IncrementalIndexing] Transformed {len(documents)}/{total_count} documents") | 652 | logger.info(f"[IncrementalIndexing] Transformed {len(documents)}/{total_count} documents") |
| 691 | 653 | ||
| @@ -789,4 +751,3 @@ class IncrementalIndexerService: | @@ -789,4 +751,3 @@ class IncrementalIndexerService: | ||
| 789 | "index_name": index_name, | 751 | "index_name": index_name, |
| 790 | "tenant_id": tenant_id | 752 | "tenant_id": tenant_id |
| 791 | } | 753 | } |
| 792 | - |
indexer/indexing_utils.py
| @@ -92,38 +92,29 @@ def create_document_transformer( | @@ -92,38 +92,29 @@ def create_document_transformer( | ||
| 92 | or (encoder is None and enable_title_embedding) | 92 | or (encoder is None and enable_title_embedding) |
| 93 | or config is None | 93 | or config is None |
| 94 | ): | 94 | ): |
| 95 | - try: | ||
| 96 | - if config is None: | ||
| 97 | - config_loader = ConfigLoader() | ||
| 98 | - config = config_loader.load_config() | ||
| 99 | - | ||
| 100 | - if searchable_option_dimensions is None: | ||
| 101 | - searchable_option_dimensions = config.spu_config.searchable_option_dimensions | ||
| 102 | - | ||
| 103 | - index_langs = tenant_config.get("index_languages") or [] | ||
| 104 | - need_translator = len(index_langs) > 1 | ||
| 105 | - if translator is None and need_translator: | ||
| 106 | - from providers import create_translation_provider | ||
| 107 | - translator = create_translation_provider(config.query_config) | ||
| 108 | - | ||
| 109 | - if translation_prompts is None: | ||
| 110 | - translation_prompts = config.query_config.translation_prompts | ||
| 111 | - | ||
| 112 | - # 初始化encoder(如果启用标题向量化且未提供encoder) | ||
| 113 | - if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding: | ||
| 114 | - try: | ||
| 115 | - from embeddings.text_encoder import TextEmbeddingEncoder | ||
| 116 | - encoder = TextEmbeddingEncoder() | ||
| 117 | - logger.info("TextEmbeddingEncoder initialized for title embedding") | ||
| 118 | - except Exception as e: | ||
| 119 | - logger.warning(f"Failed to initialize TextEmbeddingEncoder: {e}, title embedding will be disabled") | ||
| 120 | - enable_title_embedding = False | ||
| 121 | - except Exception as e: | ||
| 122 | - logger.warning(f"Failed to load config, using defaults: {e}") | ||
| 123 | - if searchable_option_dimensions is None: | ||
| 124 | - searchable_option_dimensions = ['option1', 'option2', 'option3'] | ||
| 125 | - if translation_prompts is None: | ||
| 126 | - translation_prompts = {} | 95 | + if config is None: |
| 96 | + config_loader = ConfigLoader() | ||
| 97 | + config = config_loader.load_config() | ||
| 98 | + | ||
| 99 | + if searchable_option_dimensions is None: | ||
| 100 | + searchable_option_dimensions = config.spu_config.searchable_option_dimensions | ||
| 101 | + | ||
| 102 | + index_langs = tenant_config.get("index_languages") or [] | ||
| 103 | + need_translator = len(index_langs) > 1 | ||
| 104 | + if translator is None and need_translator: | ||
| 105 | + from providers import create_translation_provider | ||
| 106 | + | ||
| 107 | + translator = create_translation_provider(config.query_config) | ||
| 108 | + | ||
| 109 | + if translation_prompts is None: | ||
| 110 | + translation_prompts = config.query_config.translation_prompts | ||
| 111 | + | ||
| 112 | + # 初始化encoder(如果启用标题向量化且未提供encoder) | ||
| 113 | + if encoder is None and enable_title_embedding and config.query_config.enable_text_embedding: | ||
| 114 | + from embeddings.text_encoder import TextEmbeddingEncoder | ||
| 115 | + | ||
| 116 | + encoder = TextEmbeddingEncoder() | ||
| 117 | + logger.info("TextEmbeddingEncoder initialized for title embedding") | ||
| 127 | 118 | ||
| 128 | return SPUDocumentTransformer( | 119 | return SPUDocumentTransformer( |
| 129 | category_id_to_name=category_id_to_name, | 120 | category_id_to_name=category_id_to_name, |
main.py
| @@ -22,6 +22,7 @@ from utils import ESClient | @@ -22,6 +22,7 @@ from utils import ESClient | ||
| 22 | from search import Searcher | 22 | from search import Searcher |
| 23 | from suggestion import SuggestionIndexBuilder | 23 | from suggestion import SuggestionIndexBuilder |
| 24 | from utils.db_connector import create_db_connection | 24 | from utils.db_connector import create_db_connection |
| 25 | +from context.request_context import create_request_context | ||
| 25 | 26 | ||
| 26 | 27 | ||
| 27 | def cmd_serve(args): | 28 | def cmd_serve(args): |
| @@ -78,7 +79,8 @@ def cmd_search(args): | @@ -78,7 +79,8 @@ def cmd_search(args): | ||
| 78 | result = searcher.search( | 79 | result = searcher.search( |
| 79 | query=args.query, | 80 | query=args.query, |
| 80 | tenant_id=args.tenant_id, | 81 | tenant_id=args.tenant_id, |
| 81 | - size=args.size | 82 | + size=args.size, |
| 83 | + context=create_request_context(), | ||
| 82 | ) | 84 | ) |
| 83 | 85 | ||
| 84 | # Display results | 86 | # Display results |
providers/embedding.py
| 1 | -""" | ||
| 2 | -Embedding provider - HTTP service (vllm reserved). | ||
| 3 | - | ||
| 4 | -Returns text/image encoders configured via services_config. | ||
| 5 | -""" | 1 | +"""Embedding provider - HTTP service.""" |
| 6 | 2 | ||
| 7 | from __future__ import annotations | 3 | from __future__ import annotations |
| 8 | 4 | ||
| @@ -14,10 +10,7 @@ def create_embedding_provider() -> "EmbeddingProvider": | @@ -14,10 +10,7 @@ def create_embedding_provider() -> "EmbeddingProvider": | ||
| 14 | cfg = get_embedding_config() | 10 | cfg = get_embedding_config() |
| 15 | provider = (cfg.provider or "http").strip().lower() | 11 | provider = (cfg.provider or "http").strip().lower() |
| 16 | if provider != "http": | 12 | if provider != "http": |
| 17 | - import logging | ||
| 18 | - logging.getLogger(__name__).warning( | ||
| 19 | - "Unsupported embedding provider '%s', fallback to HTTP provider.", provider | ||
| 20 | - ) | 13 | + raise ValueError(f"Unsupported embedding provider: {provider}") |
| 21 | return EmbeddingProvider() | 14 | return EmbeddingProvider() |
| 22 | 15 | ||
| 23 | 16 |
providers/rerank.py
| 1 | -""" | ||
| 2 | -Rerank provider - HTTP service (vllm reserved). | ||
| 3 | -""" | 1 | +"""Rerank provider - HTTP service.""" |
| 4 | 2 | ||
| 5 | from __future__ import annotations | 3 | from __future__ import annotations |
| 6 | 4 | ||
| @@ -61,8 +59,8 @@ def create_rerank_provider() -> HttpRerankProvider: | @@ -61,8 +59,8 @@ def create_rerank_provider() -> HttpRerankProvider: | ||
| 61 | cfg = get_rerank_config() | 59 | cfg = get_rerank_config() |
| 62 | provider = (cfg.provider or "http").strip().lower() | 60 | provider = (cfg.provider or "http").strip().lower() |
| 63 | 61 | ||
| 64 | - if provider == "vllm": | ||
| 65 | - logger.warning("rerank provider 'vllm' is reserved, using HTTP.") | 62 | + if provider != "http": |
| 63 | + raise ValueError(f"Unsupported rerank provider: {provider}") | ||
| 66 | 64 | ||
| 67 | url = get_rerank_service_url() | 65 | url = get_rerank_service_url() |
| 68 | return HttpRerankProvider(service_url=url) | 66 | return HttpRerankProvider(service_url=url) |
providers/translation.py
| @@ -158,19 +158,7 @@ def create_translation_provider(query_config: Any = None) -> Any: | @@ -158,19 +158,7 @@ def create_translation_provider(query_config: Any = None) -> Any: | ||
| 158 | translation_context=getattr(qc, "translation_context", "e-commerce product search"), | 158 | translation_context=getattr(qc, "translation_context", "e-commerce product search"), |
| 159 | ) | 159 | ) |
| 160 | 160 | ||
| 161 | - logger.warning( | ||
| 162 | - "Unsupported translation provider '%s', fallback to direct.", | ||
| 163 | - provider, | ||
| 164 | - ) | ||
| 165 | - from query.translator import Translator | ||
| 166 | - qc = query_config or _empty_query_config() | ||
| 167 | - return Translator( | ||
| 168 | - model=pc.get("model") or "qwen", | ||
| 169 | - api_key=getattr(qc, "translation_api_key", None), | ||
| 170 | - use_cache=True, | ||
| 171 | - glossary_id=getattr(qc, "translation_glossary_id", None), | ||
| 172 | - translation_context=getattr(qc, "translation_context", "e-commerce product search"), | ||
| 173 | - ) | 161 | + raise ValueError(f"Unsupported translation provider: {provider}") |
| 174 | 162 | ||
| 175 | 163 | ||
| 176 | def _empty_query_config() -> Any: | 164 | def _empty_query_config() -> Any: |
query/__init__.py
| @@ -2,15 +2,12 @@ | @@ -2,15 +2,12 @@ | ||
| 2 | 2 | ||
| 3 | from .language_detector import LanguageDetector | 3 | from .language_detector import LanguageDetector |
| 4 | from .translator import Translator | 4 | from .translator import Translator |
| 5 | -from .translation_client import HttpTranslationClient, create_translation_client | ||
| 6 | from .query_rewriter import QueryRewriter, QueryNormalizer | 5 | from .query_rewriter import QueryRewriter, QueryNormalizer |
| 7 | from .query_parser import QueryParser, ParsedQuery | 6 | from .query_parser import QueryParser, ParsedQuery |
| 8 | 7 | ||
| 9 | __all__ = [ | 8 | __all__ = [ |
| 10 | 'LanguageDetector', | 9 | 'LanguageDetector', |
| 11 | 'Translator', | 10 | 'Translator', |
| 12 | - 'HttpTranslationClient', | ||
| 13 | - 'create_translation_client', | ||
| 14 | 'QueryRewriter', | 11 | 'QueryRewriter', |
| 15 | 'QueryNormalizer', | 12 | 'QueryNormalizer', |
| 16 | 'QueryParser', | 13 | 'QueryParser', |
query/translation_client.py deleted
| @@ -1,20 +0,0 @@ | @@ -1,20 +0,0 @@ | ||
| 1 | -""" | ||
| 2 | -Translation client - delegates to providers. | ||
| 3 | - | ||
| 4 | -Deprecated: use providers.create_translation_provider() instead. | ||
| 5 | -Kept for backward compatibility. | ||
| 6 | -""" | ||
| 7 | - | ||
| 8 | -from __future__ import annotations | ||
| 9 | - | ||
| 10 | -from typing import Any | ||
| 11 | - | ||
| 12 | -from providers.translation import ( | ||
| 13 | - HttpTranslationProvider as HttpTranslationClient, | ||
| 14 | - create_translation_provider, | ||
| 15 | -) | ||
| 16 | - | ||
| 17 | - | ||
| 18 | -def create_translation_client(query_config: Any) -> Any: | ||
| 19 | - """Backward compat: delegate to create_translation_provider.""" | ||
| 20 | - return create_translation_provider(query_config) |
requirements.txt
scripts/create_venv.sh
| @@ -44,7 +44,8 @@ fi | @@ -44,7 +44,8 @@ fi | ||
| 44 | # shellcheck disable=SC1091 | 44 | # shellcheck disable=SC1091 |
| 45 | source "${VENV_DIR}/bin/activate" | 45 | source "${VENV_DIR}/bin/activate" |
| 46 | 46 | ||
| 47 | -python -m pip install --upgrade pip setuptools wheel | 47 | +python -m pip install --upgrade pip wheel |
| 48 | +python -m pip install "setuptools<82" | ||
| 48 | python -m pip install -r requirements.txt | 49 | python -m pip install -r requirements.txt |
| 49 | 50 | ||
| 50 | if [[ "${INSTALL_ML:-0}" == "1" ]]; then | 51 | if [[ "${INSTALL_ML:-0}" == "1" ]]; then |
scripts/start_embedding_service.sh
| @@ -16,10 +16,27 @@ source ./activate.sh | @@ -16,10 +16,27 @@ source ./activate.sh | ||
| 16 | 16 | ||
| 17 | DEFAULT_EMBEDDING_SERVICE_HOST=$(python -c "from embeddings.config import CONFIG; print(CONFIG.HOST)") | 17 | DEFAULT_EMBEDDING_SERVICE_HOST=$(python -c "from embeddings.config import CONFIG; print(CONFIG.HOST)") |
| 18 | DEFAULT_EMBEDDING_SERVICE_PORT=$(python -c "from embeddings.config import CONFIG; print(CONFIG.PORT)") | 18 | DEFAULT_EMBEDDING_SERVICE_PORT=$(python -c "from embeddings.config import CONFIG; print(CONFIG.PORT)") |
| 19 | +USE_CLIP_AS_SERVICE=$(python -c "from embeddings.config import CONFIG; print('1' if CONFIG.USE_CLIP_AS_SERVICE else '0')") | ||
| 19 | 20 | ||
| 20 | EMBEDDING_SERVICE_HOST="${EMBEDDING_HOST:-${DEFAULT_EMBEDDING_SERVICE_HOST}}" | 21 | EMBEDDING_SERVICE_HOST="${EMBEDDING_HOST:-${DEFAULT_EMBEDDING_SERVICE_HOST}}" |
| 21 | EMBEDDING_SERVICE_PORT="${EMBEDDING_PORT:-${DEFAULT_EMBEDDING_SERVICE_PORT}}" | 22 | EMBEDDING_SERVICE_PORT="${EMBEDDING_PORT:-${DEFAULT_EMBEDDING_SERVICE_PORT}}" |
| 22 | 23 | ||
| 24 | +if [[ "${USE_CLIP_AS_SERVICE}" == "1" ]]; then | ||
| 25 | + if ! python - <<'PY' | ||
| 26 | +try: | ||
| 27 | + import pkg_resources # noqa: F401 | ||
| 28 | +except Exception: | ||
| 29 | + raise SystemExit(1) | ||
| 30 | +PY | ||
| 31 | + then | ||
| 32 | + echo "ERROR: clip-as-service image embedding requires pkg_resources, but current venv is missing it." >&2 | ||
| 33 | + echo "Fix:" >&2 | ||
| 34 | + echo " python -m pip install 'setuptools<82'" >&2 | ||
| 35 | + echo "Then restart: ./scripts/start_embedding_service.sh" >&2 | ||
| 36 | + exit 1 | ||
| 37 | + fi | ||
| 38 | +fi | ||
| 39 | + | ||
| 23 | echo "========================================" | 40 | echo "========================================" |
| 24 | echo "Starting Local Embedding Service" | 41 | echo "Starting Local Embedding Service" |
| 25 | echo "========================================" | 42 | echo "========================================" |
| @@ -36,4 +53,3 @@ exec python -m uvicorn embeddings.server:app \ | @@ -36,4 +53,3 @@ exec python -m uvicorn embeddings.server:app \ | ||
| 36 | --port "${EMBEDDING_SERVICE_PORT}" \ | 53 | --port "${EMBEDDING_SERVICE_PORT}" \ |
| 37 | --workers 1 | 54 | --workers 1 |
| 38 | 55 | ||
| 39 | - |
scripts/test_build_docs_api.py
| @@ -29,6 +29,10 @@ except ImportError: | @@ -29,6 +29,10 @@ except ImportError: | ||
| 29 | def build_sample_request(): | 29 | def build_sample_request(): |
| 30 | """构造一条完整的 build-docs 请求体(对应 shoplazza_product_spu / sku / option 表结构)。""" | 30 | """构造一条完整的 build-docs 请求体(对应 shoplazza_product_spu / sku / option 表结构)。""" |
| 31 | now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") | 31 | now = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") |
| 32 | + sample_image_url = os.getenv( | ||
| 33 | + "SAMPLE_IMAGE_URL", | ||
| 34 | + "https://oss.essa.cn/98532128-cf8e-456c-9e30-6f2a5ea0c19f.jpg", | ||
| 35 | + ) | ||
| 32 | return { | 36 | return { |
| 33 | "tenant_id": "162", | 37 | "tenant_id": "162", |
| 34 | "items": [ | 38 | "items": [ |
| @@ -45,7 +49,7 @@ def build_sample_request(): | @@ -45,7 +49,7 @@ def build_sample_request(): | ||
| 45 | "category_level": 2, | 49 | "category_level": 2, |
| 46 | "category_path": "服装/上衣/T恤", | 50 | "category_path": "服装/上衣/T恤", |
| 47 | "fake_sales": 1280, | 51 | "fake_sales": 1280, |
| 48 | - "image_src": "https://example.com/img/tshirt.jpg", | 52 | + "image_src": sample_image_url, |
| 49 | "tags": "T恤,纯棉,短袖,夏季", | 53 | "tags": "T恤,纯棉,短袖,夏季", |
| 50 | "create_time": now, | 54 | "create_time": now, |
| 51 | "update_time": now, | 55 | "update_time": now, |
| @@ -0,0 +1,74 @@ | @@ -0,0 +1,74 @@ | ||
| 1 | +#!/bin/bash | ||
| 2 | +# | ||
| 3 | +# 排查「谁在调用索引服务」的脚本 | ||
| 4 | +# 用法: ./scripts/trace_indexer_calls.sh | ||
| 5 | +# | ||
| 6 | + | ||
| 7 | +set -euo pipefail | ||
| 8 | + | ||
| 9 | +cd "$(dirname "$0")/.." | ||
| 10 | +source ./activate.sh 2>/dev/null || true | ||
| 11 | + | ||
| 12 | +echo "==========================================" | ||
| 13 | +echo "索引服务调用方排查" | ||
| 14 | +echo "==========================================" | ||
| 15 | + | ||
| 16 | +INDEXER_PORT="${INDEXER_PORT:-6004}" | ||
| 17 | +EMBEDDING_PORT="${EMBEDDING_PORT:-6005}" | ||
| 18 | + | ||
| 19 | +echo "" | ||
| 20 | +echo "1. 监听端口 6004 的进程(Indexer 服务)" | ||
| 21 | +echo "------------------------------------------" | ||
| 22 | +if command -v lsof >/dev/null 2>&1; then | ||
| 23 | + lsof -i :"${INDEXER_PORT}" 2>/dev/null || echo " (无进程监听或 lsof 无权限)" | ||
| 24 | +else | ||
| 25 | + ss -tlnp 2>/dev/null | grep ":${INDEXER_PORT}" || echo " (无进程监听)" | ||
| 26 | +fi | ||
| 27 | + | ||
| 28 | +echo "" | ||
| 29 | +echo "2. 连接到 6004 的客户端(谁在请求 Indexer)" | ||
| 30 | +echo "------------------------------------------" | ||
| 31 | +if command -v ss >/dev/null 2>&1; then | ||
| 32 | + ss -tnp 2>/dev/null | grep ":${INDEXER_PORT}" || echo " (当前无活跃连接)" | ||
| 33 | +elif command -v netstat >/dev/null 2>&1; then | ||
| 34 | + netstat -tnp 2>/dev/null | grep ":${INDEXER_PORT}" || echo " (当前无活跃连接)" | ||
| 35 | +else | ||
| 36 | + echo " 请安装 ss 或 netstat" | ||
| 37 | +fi | ||
| 38 | + | ||
| 39 | +echo "" | ||
| 40 | +echo "3. 连接到 6005 的客户端(Indexer 会调用 Embedding 服务)" | ||
| 41 | +echo "------------------------------------------" | ||
| 42 | +if command -v ss >/dev/null 2>&1; then | ||
| 43 | + ss -tnp 2>/dev/null | grep ":${EMBEDDING_PORT}" || echo " (当前无活跃连接)" | ||
| 44 | +fi | ||
| 45 | + | ||
| 46 | +echo "" | ||
| 47 | +echo "4. 检查定时任务(cron)" | ||
| 48 | +echo "------------------------------------------" | ||
| 49 | +(crontab -l 2>/dev/null | grep -i indexer) || echo " 当前用户无相关 cron" | ||
| 50 | +if [ -d /etc/cron.d ]; then | ||
| 51 | + grep -l -i indexer /etc/cron.d/* 2>/dev/null || true | ||
| 52 | +fi | ||
| 53 | + | ||
| 54 | +echo "" | ||
| 55 | +echo "5. 端口与逻辑说明" | ||
| 56 | +echo "------------------------------------------" | ||
| 57 | +echo " - Indexer 服务: 端口 ${INDEXER_PORT}" | ||
| 58 | +echo " 启动: ./scripts/start_indexer.sh 或 python main.py serve-indexer" | ||
| 59 | +echo " 接口: POST /indexer/reindex, POST /indexer/index, POST /indexer/build-docs 等" | ||
| 60 | +echo "" | ||
| 61 | +echo " - 调用方(文档说明): 外部 Java 程序或 curl 等 HTTP 客户端" | ||
| 62 | +echo " 全量: curl -X POST http://localhost:${INDEXER_PORT}/indexer/reindex -d '{\"tenant_id\":\"170\",\"batch_size\":500}'" | ||
| 63 | +echo " 增量: curl -X POST http://localhost:${INDEXER_PORT}/indexer/index -d '{\"tenant_id\":\"170\",\"spu_ids\":[\"123\"]}'" | ||
| 64 | +echo "" | ||
| 65 | +echo " - Indexer 内部会调用:" | ||
| 66 | +echo " - Embedding 服务 (${EMBEDDING_PORT}): POST /embed/text" | ||
| 67 | +echo " - Qwen API: dashscope.aliyuncs.com (翻译、LLM 分析)" | ||
| 68 | +echo " - MySQL: 商品数据" | ||
| 69 | +echo " - Elasticsearch: 写入索引" | ||
| 70 | +echo "" | ||
| 71 | +echo "6. 实时监控连接(按 Ctrl+C 停止)" | ||
| 72 | +echo "------------------------------------------" | ||
| 73 | +echo " 运行: watch -n 2 'ss -tnp | grep -E \":${INDEXER_PORT}|:${EMBEDDING_PORT}\"'" | ||
| 74 | +echo "" |
search/es_query_builder.py
| @@ -19,7 +19,6 @@ class ESQueryBuilder: | @@ -19,7 +19,6 @@ class ESQueryBuilder: | ||
| 19 | 19 | ||
| 20 | def __init__( | 20 | def __init__( |
| 21 | self, | 21 | self, |
| 22 | - index_name: str, | ||
| 23 | match_fields: List[str], | 22 | match_fields: List[str], |
| 24 | text_embedding_field: Optional[str] = None, | 23 | text_embedding_field: Optional[str] = None, |
| 25 | image_embedding_field: Optional[str] = None, | 24 | image_embedding_field: Optional[str] = None, |
| @@ -33,7 +32,6 @@ class ESQueryBuilder: | @@ -33,7 +32,6 @@ class ESQueryBuilder: | ||
| 33 | Initialize query builder. | 32 | Initialize query builder. |
| 34 | 33 | ||
| 35 | Args: | 34 | Args: |
| 36 | - index_name: ES index name | ||
| 37 | match_fields: Fields to search for text matching | 35 | match_fields: Fields to search for text matching |
| 38 | text_embedding_field: Field name for text embeddings | 36 | text_embedding_field: Field name for text embeddings |
| 39 | image_embedding_field: Field name for image embeddings | 37 | image_embedding_field: Field name for image embeddings |
| @@ -43,7 +41,6 @@ class ESQueryBuilder: | @@ -43,7 +41,6 @@ class ESQueryBuilder: | ||
| 43 | default_language: Default language to use when detection fails or returns "unknown" | 41 | default_language: Default language to use when detection fails or returns "unknown" |
| 44 | knn_boost: Boost value for KNN (embedding recall) | 42 | knn_boost: Boost value for KNN (embedding recall) |
| 45 | """ | 43 | """ |
| 46 | - self.index_name = index_name | ||
| 47 | self.match_fields = match_fields | 44 | self.match_fields = match_fields |
| 48 | self.text_embedding_field = text_embedding_field | 45 | self.text_embedding_field = text_embedding_field |
| 49 | self.image_embedding_field = image_embedding_field | 46 | self.image_embedding_field = image_embedding_field |
| @@ -351,7 +348,6 @@ class ESQueryBuilder: | @@ -351,7 +348,6 @@ class ESQueryBuilder: | ||
| 351 | def _build_text_query(self, query_text: str) -> Dict[str, Any]: | 348 | def _build_text_query(self, query_text: str) -> Dict[str, Any]: |
| 352 | """ | 349 | """ |
| 353 | Build simple text matching query (BM25). | 350 | Build simple text matching query (BM25). |
| 354 | - Legacy method - kept for backward compatibility. | ||
| 355 | 351 | ||
| 356 | Args: | 352 | Args: |
| 357 | query_text: Query text | 353 | query_text: Query text |
search/searcher.py
| @@ -17,7 +17,7 @@ from .es_query_builder import ESQueryBuilder | @@ -17,7 +17,7 @@ from .es_query_builder import ESQueryBuilder | ||
| 17 | from config import SearchConfig | 17 | from config import SearchConfig |
| 18 | from config.tenant_config_loader import get_tenant_config_loader | 18 | from config.tenant_config_loader import get_tenant_config_loader |
| 19 | from config.utils import get_match_fields_for_index | 19 | from config.utils import get_match_fields_for_index |
| 20 | -from context.request_context import RequestContext, RequestContextStage, create_request_context | 20 | +from context.request_context import RequestContext, RequestContextStage |
| 21 | from api.models import FacetResult, FacetValue, FacetConfig | 21 | from api.models import FacetResult, FacetValue, FacetConfig |
| 22 | from api.result_formatter import ResultFormatter | 22 | from api.result_formatter import ResultFormatter |
| 23 | from indexer.mapping_generator import get_tenant_index_name | 23 | from indexer.mapping_generator import get_tenant_index_name |
| @@ -107,9 +107,7 @@ class Searcher: | @@ -107,9 +107,7 @@ class Searcher: | ||
| 107 | self.source_fields = config.query_config.source_fields or [] | 107 | self.source_fields = config.query_config.source_fields or [] |
| 108 | 108 | ||
| 109 | # Query builder - simplified single-layer architecture | 109 | # Query builder - simplified single-layer architecture |
| 110 | - # index_name is no longer needed in query builder since we use tenant-specific indices | ||
| 111 | self.query_builder = ESQueryBuilder( | 110 | self.query_builder = ESQueryBuilder( |
| 112 | - index_name="", # Not used, kept for backward compatibility | ||
| 113 | match_fields=self.match_fields, | 111 | match_fields=self.match_fields, |
| 114 | text_embedding_field=self.text_embedding_field, | 112 | text_embedding_field=self.text_embedding_field, |
| 115 | image_embedding_field=self.image_embedding_field, | 113 | image_embedding_field=self.image_embedding_field, |
| @@ -160,9 +158,8 @@ class Searcher: | @@ -160,9 +158,8 @@ class Searcher: | ||
| 160 | Returns: | 158 | Returns: |
| 161 | SearchResult object with formatted results | 159 | SearchResult object with formatted results |
| 162 | """ | 160 | """ |
| 163 | - # Create context if not provided (backward compatibility) | ||
| 164 | if context is None: | 161 | if context is None: |
| 165 | - context = create_request_context() | 162 | + raise ValueError("context is required") |
| 166 | 163 | ||
| 167 | # 根据租户配置决定翻译开关(离线/在线统一) | 164 | # 根据租户配置决定翻译开关(离线/在线统一) |
| 168 | tenant_loader = get_tenant_config_loader() | 165 | tenant_loader = get_tenant_config_loader() |
tests/ci/test_service_api_contracts.py
| @@ -179,6 +179,7 @@ def indexer_client(monkeypatch): | @@ -179,6 +179,7 @@ def indexer_client(monkeypatch): | ||
| 179 | import api.indexer_app as indexer_app | 179 | import api.indexer_app as indexer_app |
| 180 | import api.routes.indexer as indexer_routes | 180 | import api.routes.indexer as indexer_routes |
| 181 | 181 | ||
| 182 | + indexer_app.app.router.on_startup.clear() | ||
| 182 | monkeypatch.setattr(indexer_app, "init_indexer_service", lambda es_host="": None) | 183 | monkeypatch.setattr(indexer_app, "init_indexer_service", lambda es_host="": None) |
| 183 | monkeypatch.setattr(indexer_routes, "get_bulk_indexing_service", lambda: _FakeBulkService()) | 184 | monkeypatch.setattr(indexer_routes, "get_bulk_indexing_service", lambda: _FakeBulkService()) |
| 184 | monkeypatch.setattr(indexer_routes, "get_incremental_service", lambda: _FakeIncrementalService()) | 185 | monkeypatch.setattr(indexer_routes, "get_incremental_service", lambda: _FakeIncrementalService()) |
tests/test_cnclip_service.py
| @@ -6,21 +6,35 @@ CN-CLIP 服务测试脚本 | @@ -6,21 +6,35 @@ CN-CLIP 服务测试脚本 | ||
| 6 | 测试 CN-CLIP 服务的文本和图像编码功能(使用 gRPC 协议) | 6 | 测试 CN-CLIP 服务的文本和图像编码功能(使用 gRPC 协议) |
| 7 | 7 | ||
| 8 | 使用方法: | 8 | 使用方法: |
| 9 | - python scripts/test_cnclip_service.py [PORT] | 9 | + python tests/test_cnclip_service.py [PORT] |
| 10 | 10 | ||
| 11 | 参数: | 11 | 参数: |
| 12 | PORT: 服务端口(默认:51000) | 12 | PORT: 服务端口(默认:51000) |
| 13 | """ | 13 | """ |
| 14 | 14 | ||
| 15 | import sys | 15 | import sys |
| 16 | +import os | ||
| 16 | 17 | ||
| 17 | -import pytest | 18 | +import numpy as np |
| 19 | + | ||
| 20 | +# Skip clip_client version check (it imports pkg_resources in legacy path). | ||
| 21 | +os.environ.setdefault("NO_VERSION_CHECK", "1") | ||
| 22 | + | ||
| 23 | +# Ensure vendored client is importable in direct `python tests/test_cnclip_service.py` mode. | ||
| 24 | +ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | ||
| 25 | +VENDORED_CLIENT = os.path.join(ROOT, "third-party", "clip-as-service", "client") | ||
| 26 | +if os.path.isdir(VENDORED_CLIENT) and VENDORED_CLIENT not in sys.path: | ||
| 27 | + sys.path.insert(0, VENDORED_CLIENT) | ||
| 18 | 28 | ||
| 19 | try: | 29 | try: |
| 20 | - import numpy as np | ||
| 21 | from clip_client import Client | 30 | from clip_client import Client |
| 22 | -except ImportError: | ||
| 23 | - pytest.skip("clip_client not installed (optional clip-as-service client)", allow_module_level=True) | 31 | +except ImportError as e: |
| 32 | + print("✗ 无法导入 clip_client。请先安装/暴露客户端依赖:") | ||
| 33 | + print(" 1) pip install -e third-party/clip-as-service/client") | ||
| 34 | + print(" 或") | ||
| 35 | + print(" 2) export PYTHONPATH=third-party/clip-as-service/client:$PYTHONPATH") | ||
| 36 | + print(f" 详细错误: {e}") | ||
| 37 | + sys.exit(1) | ||
| 24 | 38 | ||
| 25 | 39 | ||
| 26 | def _test_encoding(client, test_name, inputs): | 40 | def _test_encoding(client, test_name, inputs): |
| @@ -72,6 +86,15 @@ def main(): | @@ -72,6 +86,15 @@ def main(): | ||
| 72 | # 创建客户端 | 86 | # 创建客户端 |
| 73 | try: | 87 | try: |
| 74 | client = Client(grpc_url) | 88 | client = Client(grpc_url) |
| 89 | + except ModuleNotFoundError as e: | ||
| 90 | + if str(e) == "No module named 'pkg_resources'": | ||
| 91 | + print("✗ 当前环境缺少 pkg_resources,clip_client/jina 无法初始化。") | ||
| 92 | + print(" 建议使用专用环境运行:") | ||
| 93 | + print(" .venv-cnclip/bin/python tests/test_cnclip_service.py 51000") | ||
| 94 | + print(" 或在当前 .venv 安装兼容 setuptools(包含 pkg_resources)。") | ||
| 95 | + sys.exit(1) | ||
| 96 | + print(f"✗ 客户端创建失败: {e}") | ||
| 97 | + sys.exit(1) | ||
| 75 | except Exception as e: | 98 | except Exception as e: |
| 76 | print(f"✗ 客户端创建失败: {e}") | 99 | print(f"✗ 客户端创建失败: {e}") |
| 77 | sys.exit(1) | 100 | sys.exit(1) |
| @@ -118,4 +141,3 @@ def main(): | @@ -118,4 +141,3 @@ def main(): | ||
| 118 | 141 | ||
| 119 | if __name__ == '__main__': | 142 | if __name__ == '__main__': |
| 120 | main() | 143 | main() |
| 121 | - |
tests/test_embedding_pipeline.py
| @@ -2,6 +2,7 @@ import pickle | @@ -2,6 +2,7 @@ import pickle | ||
| 2 | from typing import Any, Dict, List, Optional | 2 | from typing import Any, Dict, List, Optional |
| 3 | 3 | ||
| 4 | import numpy as np | 4 | import numpy as np |
| 5 | +import pytest | ||
| 5 | 6 | ||
| 6 | from config import ( | 7 | from config import ( |
| 7 | FunctionScoreConfig, | 8 | FunctionScoreConfig, |
| @@ -100,7 +101,7 @@ def test_text_embedding_encoder_response_alignment(monkeypatch): | @@ -100,7 +101,7 @@ def test_text_embedding_encoder_response_alignment(monkeypatch): | ||
| 100 | def _fake_post(url, json, timeout): | 101 | def _fake_post(url, json, timeout): |
| 101 | assert url.endswith("/embed/text") | 102 | assert url.endswith("/embed/text") |
| 102 | assert json == ["hello", "world"] | 103 | assert json == ["hello", "world"] |
| 103 | - return _FakeResponse([[0.1, 0.2], None]) | 104 | + return _FakeResponse([[0.1, 0.2], [0.3, 0.4]]) |
| 104 | 105 | ||
| 105 | monkeypatch.setattr("embeddings.text_encoder.requests.post", _fake_post) | 106 | monkeypatch.setattr("embeddings.text_encoder.requests.post", _fake_post) |
| 106 | 107 | ||
| @@ -110,7 +111,22 @@ def test_text_embedding_encoder_response_alignment(monkeypatch): | @@ -110,7 +111,22 @@ def test_text_embedding_encoder_response_alignment(monkeypatch): | ||
| 110 | assert len(out) == 2 | 111 | assert len(out) == 2 |
| 111 | assert isinstance(out[0], np.ndarray) | 112 | assert isinstance(out[0], np.ndarray) |
| 112 | assert out[0].shape == (2,) | 113 | assert out[0].shape == (2,) |
| 113 | - assert out[1] is None | 114 | + assert isinstance(out[1], np.ndarray) |
| 115 | + assert out[1].shape == (2,) | ||
| 116 | + | ||
| 117 | + | ||
| 118 | +def test_text_embedding_encoder_raises_on_missing_vector(monkeypatch): | ||
| 119 | + fake_redis = _FakeRedis() | ||
| 120 | + monkeypatch.setattr("embeddings.text_encoder.redis.Redis", lambda **kwargs: fake_redis) | ||
| 121 | + | ||
| 122 | + def _fake_post(url, json, timeout): | ||
| 123 | + return _FakeResponse([[0.1, 0.2], None]) | ||
| 124 | + | ||
| 125 | + monkeypatch.setattr("embeddings.text_encoder.requests.post", _fake_post) | ||
| 126 | + | ||
| 127 | + encoder = TextEmbeddingEncoder(service_url="http://127.0.0.1:6005") | ||
| 128 | + with pytest.raises(ValueError): | ||
| 129 | + encoder.encode(["hello", "world"]) | ||
| 114 | 130 | ||
| 115 | 131 | ||
| 116 | def test_text_embedding_encoder_cache_hit(monkeypatch): | 132 | def test_text_embedding_encoder_cache_hit(monkeypatch): |
| @@ -156,4 +172,3 @@ def test_query_parser_skips_query_vector_when_disabled(): | @@ -156,4 +172,3 @@ def test_query_parser_skips_query_vector_when_disabled(): | ||
| 156 | 172 | ||
| 157 | parsed = parser.parse("red dress", tenant_id="162", generate_vector=False) | 173 | parsed = parser.parse("red dress", tenant_id="162", generate_vector=False) |
| 158 | assert parsed.query_vector is None | 174 | assert parsed.query_vector is None |
| 159 | - |