#!/bin/bash
#
# start_reranker.sh: start the reranker service from its backend-specific isolated venv.
#
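# Usage (illustrative; each variable is an optional override read via ${VAR:-...}):
#   ./scripts/start_reranker.sh
#   RERANK_INSTANCE=default ./scripts/start_reranker.sh
#   RERANK_BACKEND=qwen3_vllm RERANKER_PORT=6008 ./scripts/start_reranker.sh
#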
set -euo pipefail
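# Resolve the project root from this script's location so it can be run from any directory.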
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "${PROJECT_ROOT}"
# Load .env without activating main venv.
# shellcheck source=scripts/lib/load_env.sh
source "${PROJECT_ROOT}/scripts/lib/load_env.sh"
load_env_file "${PROJECT_ROOT}/.env"
# shellcheck source=scripts/lib/reranker_backend_env.sh
source "${PROJECT_ROOT}/scripts/lib/reranker_backend_env.sh"
CONFIG_PYTHON="${PROJECT_ROOT}/.venv/bin/python"
if [[ ! -x "${CONFIG_PYTHON}" ]]; then
    CONFIG_PYTHON="${PYTHON:-python3}"
fi
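# Resolve the target instance and pull its host/port/backend/runtime dir from the app config;
# the helper Python's stdout feeds read through the outer heredoc.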
RERANK_INSTANCE="${RERANK_INSTANCE:-default}"
read -r INSTANCE_HOST INSTANCE_PORT INSTANCE_BACKEND INSTANCE_RUNTIME_DIR <<EOF
$(
PYTHONPATH="${PROJECT_ROOT}${PYTHONPATH:+:${PYTHONPATH}}" "${CONFIG_PYTHON}" - <<'PY'
from config.loader import get_app_config
import os
cfg = get_app_config().services.rerank
name = (os.getenv("RERANK_INSTANCE") or cfg.default_instance).strip() or cfg.default_instance
instance = cfg.get_instance(name)
runtime_dir = instance.runtime_dir or f"./.runtime/reranker/{name}"
print(instance.host, instance.port, instance.backend, runtime_dir)
PY
)
EOF
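# Precedence for each setting: explicit environment variable, then per-instance config, then a built-in default.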
RERANKER_HOST="${RERANKER_HOST:-${INSTANCE_HOST:-0.0.0.0}}"
RERANKER_PORT="${RERANKER_PORT:-${INSTANCE_PORT:-6007}}"
RERANK_BACKEND="${RERANK_BACKEND:-${INSTANCE_BACKEND:-$(detect_rerank_backend "${PROJECT_ROOT}")}}"
RERANKER_VENV="${RERANKER_VENV:-$(reranker_backend_venv_dir "${PROJECT_ROOT}" "${RERANK_BACKEND}")}"
PYTHON_BIN="${RERANKER_VENV}/bin/python"
if [[ ! -x "${PYTHON_BIN}" ]]; then
echo "ERROR: reranker venv not found for backend ${RERANK_BACKEND}: ${RERANKER_VENV}" >&2
echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
exit 1
fi
# Keep vLLM/Triton/Torch caches off the system disk.
RERANKER_RUNTIME_DIR="${RERANKER_RUNTIME_DIR:-${INSTANCE_RUNTIME_DIR:-${PROJECT_ROOT}/.runtime/reranker/${RERANK_INSTANCE}}}"
if [[ "${RERANKER_RUNTIME_DIR}" != /* ]]; then
RERANKER_RUNTIME_DIR="${PROJECT_ROOT}/${RERANKER_RUNTIME_DIR#./}"
fi
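# Pre-create the per-instance runtime tree that HOME and the caches are redirected into below.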
mkdir -p "${RERANKER_RUNTIME_DIR}/home" \
"${RERANKER_RUNTIME_DIR}/cache" \
"${RERANKER_RUNTIME_DIR}/config" \
"${RERANKER_RUNTIME_DIR}/triton" \
"${RERANKER_RUNTIME_DIR}/torch_compile" \
"${RERANKER_RUNTIME_DIR}/tmp"
export HOME="${RERANKER_RUNTIME_DIR}/home"
export XDG_CACHE_HOME="${RERANKER_RUNTIME_DIR}/cache"
export XDG_CONFIG_HOME="${RERANKER_RUNTIME_DIR}/config"
export TRITON_CACHE_DIR="${RERANKER_RUNTIME_DIR}/triton"
export TORCHINDUCTOR_CACHE_DIR="${RERANKER_RUNTIME_DIR}/torch_compile"
export TMPDIR="${RERANKER_RUNTIME_DIR}/tmp"
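# Opt out of vLLM usage telemetry unless the caller already set a value.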
export VLLM_NO_USAGE_STATS="${VLLM_NO_USAGE_STATS:-1}"
# The venv's bin/ must be on PATH before Python starts: the vLLM worker inherits it, and on
# T4/Turing GPUs qwen3_vllm_score now relies on vLLM auto-selecting FLASHINFER, whose JIT
# needs the pip-installed ninja.
export PATH="${RERANKER_VENV}/bin:${PATH}"
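# GGUF backends presumably fetch models via huggingface_hub; HF_HUB_DISABLE_XET=1 skips its
# Xet transfer backend (kept overridable).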
if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then
    export HF_HUB_DISABLE_XET="${HF_HUB_DISABLE_XET:-1}"
fi
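# These backends are GPU-only: confirm the NVIDIA driver responds, then check that torch in
# the venv sees a CUDA device (the vllm import in the probe is best-effort).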
if [[ "${RERANK_BACKEND}" == "qwen3_vllm" || "${RERANK_BACKEND}" == "qwen3_vllm_score" || "${RERANK_BACKEND}" == "qwen3_transformers_packed" ]]; then
    if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then
        echo "ERROR: ${RERANK_BACKEND} backend requires an NVIDIA GPU, but nvidia-smi is unavailable." >&2
        exit 1
    fi
if ! "${PYTHON_BIN}" - <<'PY'
try:
import torch
try:
import vllm # noqa: F401
except Exception:
pass
if not torch.cuda.is_available():
raise SystemExit(1)
except Exception:
raise SystemExit(1)
PY
    then
        if [[ "${RERANK_BACKEND}" == "qwen3_transformers_packed" ]]; then
            echo "ERROR: ${RERANK_BACKEND} backend requires torch + CUDA runtime in ${RERANKER_VENV}." >&2
        else
            echo "ERROR: ${RERANK_BACKEND} backend requires vllm + CUDA runtime in ${RERANKER_VENV}." >&2
        fi
        echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND} and verify CUDA is available." >&2
        exit 1
    fi
fi
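# GGUF backends need llama-cpp-python; the probe exits 2 when the installed build lacks GPU
# offload support, so the error message can distinguish a CPU-only build.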
if [[ "${RERANK_BACKEND}" == qwen3_gguf* ]]; then
    gguf_check_status=0
    "${PYTHON_BIN}" - <<'PY' || gguf_check_status=$?
try:
    import llama_cpp
    if hasattr(llama_cpp, "llama_supports_gpu_offload") and not llama_cpp.llama_supports_gpu_offload():
        raise SystemExit(2)
except Exception:
    raise SystemExit(1)
PY
if [[ "${gguf_check_status}" != "0" ]]; then
if [[ "${gguf_check_status}" == "2" ]]; then
echo "ERROR: ${RERANK_BACKEND} backend detected a CPU-only llama-cpp-python build in ${RERANKER_VENV}." >&2
echo "Please rerun: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
else
echo "ERROR: ${RERANK_BACKEND} backend requires llama-cpp-python in ${RERANKER_VENV}." >&2
echo "Please run: ./scripts/setup_reranker_venv.sh ${RERANK_BACKEND}" >&2
fi
exit 1
fi
fi
echo "========================================"
echo "Starting Reranker Service"
echo "========================================"
echo "Instance: ${RERANK_INSTANCE}"
echo "Python: ${PYTHON_BIN}"
echo "Host: ${RERANKER_HOST}"
echo "Port: ${RERANKER_PORT}"
echo "Backend: ${RERANK_BACKEND}"
echo "Runtime dir: ${RERANKER_RUNTIME_DIR}"
echo
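# exec hands the process over to uvicorn so signals reach it directly; --workers 1 presumably
# avoids loading the model once per worker.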
exec "${PYTHON_BIN}" -m uvicorn reranker.server:app \
    --host "${RERANKER_HOST}" \
    --port "${RERANKER_PORT}" \
    --workers 1