start_tei_service.sh
5.99 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/bin/bash
#
# Launch the Hugging Face TEI service for Qwen3-Embedding-0.6B inside Docker.
#
set -euo pipefail

# The project root is the parent of the directory containing this script;
# everything below runs from there.
PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
cd "${PROJECT_ROOT}"

# Load .env through the project's load_env_file helper.
# shellcheck source=scripts/lib/load_env.sh
source "${PROJECT_ROOT}/scripts/lib/load_env.sh"
load_env_file "${PROJECT_ROOT}/.env"

# Nothing below can work without the docker CLI, so bail out early.
command -v docker >/dev/null 2>&1 || {
  echo "ERROR: docker is required to run TEI service." >&2
  exit 1
}
# Runtime settings. Each one keeps a non-empty value that is already set
# (from the environment or .env) and otherwise falls back to the default here.
: "${TEI_CONTAINER_NAME:=saas-search-tei}"
: "${TEI_PORT:=8080}"
: "${TEI_MODEL_ID:=Qwen/Qwen3-Embedding-0.6B}"
: "${TEI_VERSION:=1.9}"
: "${TEI_MAX_BATCH_TOKENS:=4096}"
: "${TEI_MAX_CLIENT_BATCH_SIZE:=24}"
: "${TEI_DTYPE:=float16}"
: "${HF_CACHE_DIR:=${HOME}/.cache/huggingface}"
: "${TEI_HEALTH_TIMEOUT_SEC:=300}"

# Keep the value as the user typed it for error reporting, and compare on a
# lower-cased copy so that e.g. "CUDA" and "Cpu" are accepted.
TEI_DEVICE_RAW="${TEI_DEVICE:-cuda}"
TEI_DEVICE="$(printf '%s\n' "${TEI_DEVICE_RAW}" | tr '[:upper:]' '[:lower:]')"
case "${TEI_DEVICE}" in
  cuda | cpu) ;;
  *)
    echo "ERROR: invalid TEI_DEVICE=${TEI_DEVICE_RAW}. Use cuda/cpu." >&2
    exit 1
    ;;
esac
#######################################
# Print the TEI GPU image reference matching the first GPU reported by
# nvidia-smi.
#
# GPUs with a compute-capability major version below 8 (pre-Ampere, e.g.
# Tesla T4 with compute capability 7.5) get the "turing-" image; anything
# else, including an unknown or unparsable capability, gets the "cuda-" image.
#
# Globals:   TEI_VERSION (read)
# Arguments: none
# Outputs:   image reference on stdout
# Returns:   0
#######################################
detect_gpu_tei_image() {
  local compute_cap major
  # Only the first reported GPU is considered. A failing nvidia-smi simply
  # leaves the value empty instead of aborting the script.
  compute_cap="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n1 || true)"
  # Drop any padding / stray whitespace around the value.
  compute_cap="${compute_cap//[[:space:]]/}"
  major="${compute_cap%%.*}"
  # Compare numerically only when the major part is a plain decimal number.
  # Feeding a value such as "N/A" to an arithmetic test would be evaluated as
  # an expression and emit errors. 10# forces base 10 for values like "08".
  if [[ "${major}" =~ ^[0-9]+$ ]] && ((10#${major} < 8)); then
    echo "ghcr.io/huggingface/text-embeddings-inference:turing-${TEI_VERSION}"
  else
    echo "ghcr.io/huggingface/text-embeddings-inference:cuda-${TEI_VERSION}"
  fi
}
# Resolve the image, the extra docker arguments and the mode label for the
# requested device. An explicitly set TEI_IMAGE always wins over the default.
case "${TEI_DEVICE}" in
  cuda)
    # The host itself must expose a working NVIDIA driver ...
    if ! { command -v nvidia-smi >/dev/null 2>&1 && nvidia-smi >/dev/null 2>&1; }; then
      echo "ERROR: TEI_DEVICE=cuda but NVIDIA GPU is not available. No CPU fallback." >&2
      exit 1
    fi
    # ... and Docker must list an nvidia runtime.
    if ! docker info --format '{{json .Runtimes}}' 2>/dev/null | grep -q 'nvidia'; then
      echo "ERROR: TEI_DEVICE=cuda but Docker nvidia runtime is not configured." >&2
      echo "Install and configure nvidia-container-toolkit, then restart Docker." >&2
      exit 1
    fi
    TEI_IMAGE="${TEI_IMAGE:-$(detect_gpu_tei_image)}"
    GPU_ARGS=(--gpus all)
    TEI_MODE="cuda"
    ;;
  *)
    TEI_IMAGE="${TEI_IMAGE:-ghcr.io/huggingface/text-embeddings-inference:${TEI_VERSION}}"
    GPU_ARGS=()
    TEI_MODE="cpu"
    ;;
esac
# Make sure the host directory that will be mounted into the container exists.
mkdir -p "${HF_CACHE_DIR}"

# Reconcile with any container that already carries our name:
#   - running with the wanted image and device mode -> reuse it and exit 0
#   - running with anything else                    -> force-remove it
#   - present but stopped                           -> remove it
# The filter is an anchored pattern so that only the exact name matches, and
# it is quoted so the shell passes it to docker as a single, unglobbed word.
container_filter="name=^/${TEI_CONTAINER_NAME}\$"

existing_id="$(docker ps -aq -f "${container_filter}")"
if [[ -n "${existing_id}" ]]; then
  running_id="$(docker ps -q -f "${container_filter}")"
  if [[ -n "${running_id}" ]]; then
    # Inspect failures are tolerated; an empty value simply never matches.
    current_image="$(docker inspect "${TEI_CONTAINER_NAME}" --format '{{.Config.Image}}' 2>/dev/null || true)"
    device_req="$(docker inspect "${TEI_CONTAINER_NAME}" --format '{{json .HostConfig.DeviceRequests}}' 2>/dev/null || true)"

    # GPU images are recognised by their "cuda-" / "turing-" tag prefix.
    current_is_gpu_image=0
    if [[ "${current_image}" == *":cuda-"* || "${current_image}" == *":turing-"* ]]; then
      current_is_gpu_image=1
    fi

    # The running container is reusable only when the image is identical and
    # both the image flavour and the device requests agree with TEI_DEVICE.
    reuse_container=0
    if [[ "${current_image}" == "${TEI_IMAGE}" ]]; then
      if [[ "${TEI_DEVICE}" == "cuda" ]]; then
        if [[ "${current_is_gpu_image}" -eq 1 && "${device_req}" != "null" ]]; then
          reuse_container=1
        fi
      else
        if [[ "${current_is_gpu_image}" -eq 0 && "${device_req}" == "null" ]]; then
          reuse_container=1
        fi
      fi
    fi

    if [[ "${reuse_container}" -eq 1 ]]; then
      if [[ "${TEI_DEVICE}" == "cuda" ]]; then
        echo "TEI already running (CUDA): ${TEI_CONTAINER_NAME}"
      else
        echo "TEI already running (CPU): ${TEI_CONTAINER_NAME}"
      fi
      exit 0
    fi

    echo "TEI running with different mode/image; recreating container ${TEI_CONTAINER_NAME}"
    echo " current_image=${current_image:-unknown}"
    echo " target_image=${TEI_IMAGE}"
    # Best effort: anything left over is handled by the check below.
    docker rm -f "${TEI_CONTAINER_NAME}" >/dev/null 2>&1 || true
  fi

  # A stopped container (or one the forced removal did not get rid of) still
  # blocks the name; removing it here is allowed to fail the script.
  if [[ -n "$(docker ps -aq -f "${container_filter}")" ]]; then
    docker rm "${TEI_CONTAINER_NAME}" >/dev/null
  fi
fi
# Announce the effective settings, then start the container detached.
echo "Starting TEI container: ${TEI_CONTAINER_NAME}"
echo "Image: ${TEI_IMAGE}"
echo "Model: ${TEI_MODEL_ID}"
echo "Port: ${TEI_PORT}"
echo "Mode: ${TEI_MODE}"

# - Host port TEI_PORT is published to port 80 of the container.
# - HF_CACHE_DIR is mounted at /data.
# - HF_TOKEN is forwarded as given, or as an empty string when unset.
# - GPU_ARGS is empty in CPU mode. The ${arr[@]+"${arr[@]}"} form expands to
#   nothing in that case; a plain "${GPU_ARGS[@]}" aborts with "unbound
#   variable" under `set -u` on bash releases older than 4.4.
# The container id printed by `docker run -d` is discarded.
docker run -d \
  --name "${TEI_CONTAINER_NAME}" \
  -p "${TEI_PORT}:80" \
  ${GPU_ARGS[@]+"${GPU_ARGS[@]}"} \
  -v "${HF_CACHE_DIR}:/data" \
  -e HF_TOKEN="${HF_TOKEN:-}" \
  "${TEI_IMAGE}" \
  --model-id "${TEI_MODEL_ID}" \
  --dtype "${TEI_DTYPE}" \
  --max-batch-tokens "${TEI_MAX_BATCH_TOKENS}" \
  --max-client-batch-size "${TEI_MAX_CLIENT_BATCH_SIZE}" >/dev/null
echo "Waiting for TEI health..."

# Poll GET /health once per second for up to TEI_HEALTH_TIMEOUT_SEC attempts.
# The timeout is validated first: with a zero or non-numeric value the loop
# would run zero times and the health gate would be skipped without notice.
if ! [[ "${TEI_HEALTH_TIMEOUT_SEC}" =~ ^[0-9]+$ ]] || ((10#${TEI_HEALTH_TIMEOUT_SEC} < 1)); then
  echo "ERROR: invalid TEI_HEALTH_TIMEOUT_SEC=${TEI_HEALTH_TIMEOUT_SEC}. Use a positive integer." >&2
  exit 1
fi
health_attempts=$((10#${TEI_HEALTH_TIMEOUT_SEC}))

tei_healthy=0
for ((i = 1; i <= health_attempts; i++)); do
  if curl -sf "http://127.0.0.1:${TEI_PORT}/health" >/dev/null 2>&1; then
    tei_healthy=1
    break
  fi
  # Fail fast when docker explicitly reports that the container is no longer
  # running; there is no point in waiting out the full timeout. An inspect
  # error yields an empty value and is treated as "keep waiting".
  container_running="$(docker inspect --format '{{.State.Running}}' "${TEI_CONTAINER_NAME}" 2>/dev/null || true)"
  if [[ "${container_running}" == "false" ]]; then
    echo "ERROR: TEI container exited before becoming healthy." >&2
    docker logs --tail 100 "${TEI_CONTAINER_NAME}" >&2 || true
    exit 1
  fi
  sleep 1
done

if [[ "${tei_healthy}" -ne 1 ]]; then
  echo "ERROR: TEI failed to become healthy in time." >&2
  docker logs --tail 100 "${TEI_CONTAINER_NAME}" >&2 || true
  exit 1
fi
echo "TEI health is ready: http://127.0.0.1:${TEI_PORT}"
echo "Running TEI output probe..."

# Send the same two-input embedding request twice and reject the container
# when a response is empty or contains non-finite-looking values. On failure
# the container logs are dumped and the container is removed.
for probe_idx in 1 2; do
  # `|| true` keeps `set -e` from aborting here; a failed request shows up as
  # an empty response and is reported just below.
  probe_resp="$(curl -sf -X POST "http://127.0.0.1:${TEI_PORT}/embed" \
    -H "Content-Type: application/json" \
    -d '{"inputs":["health check","芭比娃娃 儿童玩具"]}' || true)"
  if [[ -z "${probe_resp}" ]]; then
    echo "ERROR: TEI probe ${probe_idx} failed: empty response" >&2
    docker logs --tail 120 "${TEI_CONTAINER_NAME}" >&2 || true
    docker rm -f "${TEI_CONTAINER_NAME}" >/dev/null 2>&1 || true
    exit 1
  fi
  # Detect non-finite-like payloads (observed as null/NaN on incompatible CUDA image + GPU).
  # grep is used rather than ripgrep so the check cannot be skipped because an
  # optional tool is missing, and the here-string avoids a pipeline whose
  # status could turn non-zero under `pipefail` when grep exits on first match.
  if grep -Eqi 'null|nan|inf' <<<"${probe_resp}"; then
    echo "ERROR: TEI probe ${probe_idx} detected invalid embedding values (null/NaN/Inf)." >&2
    # Only the first 220 characters are shown to keep the message short.
    echo "Response preview: ${probe_resp:0:220}" >&2
    docker logs --tail 120 "${TEI_CONTAINER_NAME}" >&2 || true
    docker rm -f "${TEI_CONTAINER_NAME}" >/dev/null 2>&1 || true
    exit 1
  fi
done

echo "TEI is ready and output probe passed: http://127.0.0.1:${TEI_PORT}"
exit 0