#!/bin/bash
#
# Launch the reranker HTTP service from the isolated virtualenv that
# matches the configured RERANK_BACKEND, failing fast with actionable
# messages when the venv or backend runtime prerequisites are missing.
#
set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "${PROJECT_ROOT}"

# Pull in .env values and backend helpers without activating the main venv.
# shellcheck source=scripts/lib/load_env.sh
source "${PROJECT_ROOT}/scripts/lib/load_env.sh"
load_env_file "${PROJECT_ROOT}/.env"
# shellcheck source=scripts/lib/reranker_backend_env.sh
source "${PROJECT_ROOT}/scripts/lib/reranker_backend_env.sh"

# Resolve service settings; caller-provided environment values win.
RERANKER_HOST="${RERANKER_HOST:-0.0.0.0}"
RERANKER_PORT="${RERANKER_PORT:-6007}"
RERANK_BACKEND="${RERANK_BACKEND:-$(detect_rerank_backend "${PROJECT_ROOT}")}"
RERANKER_VENV="${RERANKER_VENV:-$(reranker_backend_venv_dir "${PROJECT_ROOT}" "${RERANK_BACKEND}")}"
PYTHON_BIN="${RERANKER_VENV}/bin/python"

if [[ ! -x "${PYTHON_BIN}" ]]; then
  echo "ERROR: reranker venv not found for backend ${RERANK_BACKEND}: ${RERANKER_VENV}" >&2
  echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
  exit 1
fi

# Redirect HOME/cache/tmp locations into the project runtime dir so
# vLLM/triton/torch artifacts never land on the system disk.
RERANKER_RUNTIME_DIR="${RERANKER_RUNTIME_DIR:-${PROJECT_ROOT}/.runtime/reranker}"
for runtime_subdir in home cache config triton torch_compile tmp; do
  mkdir -p "${RERANKER_RUNTIME_DIR}/${runtime_subdir}"
done
export HOME="${RERANKER_RUNTIME_DIR}/home"
export XDG_CACHE_HOME="${RERANKER_RUNTIME_DIR}/cache"
export XDG_CONFIG_HOME="${RERANKER_RUNTIME_DIR}/config"
export TRITON_CACHE_DIR="${RERANKER_RUNTIME_DIR}/triton"
export TORCHINDUCTOR_CACHE_DIR="${RERANKER_RUNTIME_DIR}/torch_compile"
export TMPDIR="${RERANKER_RUNTIME_DIR}/tmp"
export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}"
export PATH="${RERANKER_VENV}/bin:${PATH}"

if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then
  export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}"
fi

if [[ "${RERANK_BACKEND}" == "qwen3_vllm" ]]; then
  # The vLLM backend is GPU-only: require a present AND working nvidia-smi.
  if ! { command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; }; then
    echo "ERROR: qwen3_vllm backend requires NVIDIA GPU, but nvidia-smi is unavailable." >&2
    exit 1
  fi
  # Then confirm the venv actually provides vllm plus a CUDA-enabled torch.
  vllm_probe_rc=0
  "${PYTHON_BIN}" - <<'PY' || vllm_probe_rc=$?
try:
    import vllm  # noqa: F401
    import torch
    if not torch.cuda.is_available():
        raise SystemExit(1)
except Exception:
    raise SystemExit(1)
PY
  if [[ "${vllm_probe_rc}" != "0" ]]; then
    echo "ERROR: qwen3_vllm backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2
    echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND} and verify CUDA is available." >&2
    exit 1
  fi
fi

if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then
  # Probe llama-cpp-python. SystemExit is not an Exception subclass, so the
  # status-2 "CPU-only build" signal escapes the except clause untouched.
  gguf_probe_rc=0
  "${PYTHON_BIN}" - <<'PY' || gguf_probe_rc=$?
try:
    import llama_cpp
    if hasattr(llama_cpp, "llama_supports_gpu_offload") and not llama_cpp.llama_supports_gpu_offload():
        raise SystemExit(2)
except Exception:
    raise SystemExit(1)
PY
  case "${gguf_probe_rc}" in
    0)
      ;;
    2)
      echo "ERROR: ${RERANK_BACKEND} backend detected a CPU-only llama-cpp-python build in ${RERANKER_VENV}." >&2
      echo "Please rerun: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
      exit 1
      ;;
    *)
      echo "ERROR: ${RERANK_BACKEND} backend requires llama-cpp-python in ${RERANKER_VENV}." >&2
      echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
      exit 1
      ;;
  esac
fi

cat <<BANNER
========================================
Starting Reranker Service
========================================
Python: ${PYTHON_BIN}
Host: ${RERANKER_HOST}
Port: ${RERANKER_PORT}
Backend: ${RERANK_BACKEND}
Runtime dir: ${RERANKER_RUNTIME_DIR}

BANNER

# Replace this shell with uvicorn so signals reach the server directly.
exec "${PYTHON_BIN}" -m uvicorn reranker.server:app \
  --host "${RERANKER_HOST}" \
  --port "${RERANKER_PORT}" \
  --workers 1