start_tei_service.sh
6.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
#!/bin/bash
#
# Start the Hugging Face TEI (Text Embeddings Inference) service for
# Qwen3-Embedding-0.6B using Docker.
#
set -euo pipefail

# Run from the repository root (the parent of this script's directory) so
# relative paths such as .env resolve consistently.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)"
cd "${PROJECT_ROOT}"
# Load overrides from .env at the project root, if present.
ENV_FILE="${PROJECT_ROOT}/.env"

# load_env_file FILE
#   Minimal KEY=VALUE parser: skips blank lines, comments and lines without
#   '=', trims whitespace around the key and before the value, strips one
#   matching pair of surrounding quotes, and exports each pair.
load_env_file() {
  local file="$1" line key value first last
  while IFS= read -r line || [ -n "${line}" ]; do
    line="${line%$'\r'}"                          # tolerate CRLF files
    [[ -z "${line//[[:space:]]/}" ]] && continue  # blank / whitespace-only
    [[ "${line}" =~ ^[[:space:]]*# ]] && continue # comment
    [[ "${line}" != *=* ]] && continue            # not KEY=VALUE
    key="${line%%=*}"
    value="${line#*=}"
    key="${key#"${key%%[![:space:]]*}"}"
    key="${key%"${key##*[![:space:]]}"}"
    value="${value#"${value%%[![:space:]]*}"}"
    # Skip keys that are not valid shell identifiers: a malformed line
    # (e.g. "export FOO=1" or a dotted key) would otherwise make `export`
    # fail and abort the whole script under `set -e`.
    [[ "${key}" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || continue
    # Strip one matching pair of surrounding single or double quotes.
    if [[ ${#value} -ge 2 ]]; then
      first="${value:0:1}"
      last="${value: -1}"
      if [[ ("${first}" == '"' && "${last}" == '"') || ("${first}" == "'" && "${last}" == "'") ]]; then
        value="${value:1:${#value}-2}"
      fi
    fi
    export "${key}=${value}"
  done < "${file}"
}

if [ -f "${ENV_FILE}" ]; then
  load_env_file "${ENV_FILE}"
fi
# Fail fast on missing host dependencies. docker runs the service; curl is
# needed later for the health wait and the output probe — without this check
# a missing curl would spin for the whole health timeout and then report a
# misleading "failed to become healthy" error.
for tool in docker curl; do
  if ! command -v "${tool}" >/dev/null 2>&1; then
    echo "ERROR: ${tool} is required to run TEI service." >&2
    exit 1
  fi
done
# ---- Configuration (each value overridable via environment / .env) ----
TEI_CONTAINER_NAME="${TEI_CONTAINER_NAME:-saas-search-tei}"
TEI_PORT="${TEI_PORT:-8080}"
TEI_MODEL_ID="${TEI_MODEL_ID:-Qwen/Qwen3-Embedding-0.6B}"
TEI_VERSION="${TEI_VERSION:-1.9}"
TEI_MAX_BATCH_TOKENS="${TEI_MAX_BATCH_TOKENS:-4096}"
TEI_MAX_CLIENT_BATCH_SIZE="${TEI_MAX_CLIENT_BATCH_SIZE:-24}"
TEI_DTYPE="${TEI_DTYPE:-float16}"
HF_CACHE_DIR="${HF_CACHE_DIR:-$HOME/.cache/huggingface}"
TEI_HEALTH_TIMEOUT_SEC="${TEI_HEALTH_TIMEOUT_SEC:-300}"

# Normalize TEI_USE_GPU to exactly "1" or "0"; accept the common
# case-insensitive truthy/falsy spellings, reject anything else.
USE_GPU_RAW="${TEI_USE_GPU:-1}"
USE_GPU="$(echo "${USE_GPU_RAW}" | tr '[:upper:]' '[:lower:]')"
case "${USE_GPU}" in
  1|true|yes)
    USE_GPU="1"
    ;;
  0|false|no)
    USE_GPU="0"
    ;;
  *)
    echo "ERROR: invalid TEI_USE_GPU=${USE_GPU_RAW}. Use 1/0 (or true/false)." >&2
    exit 1
    ;;
esac
detect_gpu_tei_image() {
  # Choose the GPU image for this host. Pre-Ampere GPUs (compute capability
  # < 8, e.g. Tesla T4 at 7.5) need the "turing" TEI build; Ampere and newer
  # (or an undetectable capability) get the generic CUDA build.
  # Prints the image reference on stdout.
  local cap cap_major
  cap="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n1 || true)"
  cap_major="${cap%%.*}"
  if [[ -z "${cap_major}" ]] || (( cap_major >= 8 )); then
    echo "ghcr.io/huggingface/text-embeddings-inference:cuda-${TEI_VERSION}"
  else
    echo "ghcr.io/huggingface/text-embeddings-inference:turing-${TEI_VERSION}"
  fi
}
# Pick the container image and docker GPU arguments for the chosen mode.
if [[ "${USE_GPU}" == "1" ]]; then
  # GPU mode is strict: no silent CPU fallback when the GPU stack is broken.
  if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then
    echo "ERROR: TEI_USE_GPU=1 but NVIDIA GPU is not available. No CPU fallback." >&2
    exit 1
  fi
  # The nvidia runtime must also be registered with the Docker daemon.
  if ! docker info --format '{{json .Runtimes}}' 2>/dev/null | grep -q 'nvidia'; then
    echo "ERROR: TEI_USE_GPU=1 but Docker nvidia runtime is not configured." >&2
    echo "Install and configure nvidia-container-toolkit, then restart Docker." >&2
    exit 1
  fi
  TEI_MODE="gpu"
  GPU_ARGS=("--gpus" "all")
  TEI_IMAGE="${TEI_IMAGE:-$(detect_gpu_tei_image)}"
else
  TEI_MODE="cpu"
  GPU_ARGS=()
  TEI_IMAGE="${TEI_IMAGE:-ghcr.io/huggingface/text-embeddings-inference:${TEI_VERSION}}"
fi
# Ensure the HF model cache exists so the container can persist downloads.
mkdir -p "${HF_CACHE_DIR}"

# Reuse a running container only when it already matches the requested mode
# and image; otherwise remove it so it gets recreated below. Filter values
# are quoted (the original left them unquoted — SC2086).
existing_id="$(docker ps -aq -f "name=^/${TEI_CONTAINER_NAME}$")"
if [[ -n "${existing_id}" ]]; then
  running_id="$(docker ps -q -f "name=^/${TEI_CONTAINER_NAME}$")"
  if [[ -n "${running_id}" ]]; then
    current_image="$(docker inspect "${TEI_CONTAINER_NAME}" --format '{{.Config.Image}}' 2>/dev/null || true)"
    device_req="$(docker inspect "${TEI_CONTAINER_NAME}" --format '{{json .HostConfig.DeviceRequests}}' 2>/dev/null || true)"
    current_is_gpu_image=0
    if [[ "${current_image}" == *":cuda-"* || "${current_image}" == *":turing-"* ]]; then
      current_is_gpu_image=1
    fi
    # The running container matches when the image is identical AND its GPU
    # attachment (image flavor + device requests) agrees with the target mode.
    matches=0
    if [[ "${current_image}" == "${TEI_IMAGE}" ]]; then
      if [[ "${USE_GPU}" == "1" ]]; then
        if [[ "${current_is_gpu_image}" -eq 1 && "${device_req}" != "null" ]]; then
          matches=1
        fi
      elif [[ "${current_is_gpu_image}" -eq 0 && "${device_req}" == "null" ]]; then
        matches=1
      fi
    fi
    if [[ "${matches}" -eq 1 ]]; then
      if [[ "${USE_GPU}" == "1" ]]; then
        echo "TEI already running (GPU): ${TEI_CONTAINER_NAME}"
      else
        echo "TEI already running (CPU): ${TEI_CONTAINER_NAME}"
      fi
      exit 0
    fi
    echo "TEI running with different mode/image; recreating container ${TEI_CONTAINER_NAME}"
    echo " current_image=${current_image:-unknown}"
    echo " target_image=${TEI_IMAGE}"
    docker rm -f "${TEI_CONTAINER_NAME}" >/dev/null 2>&1 || true
  fi
  # Remove any leftover stopped container with the same name.
  if docker ps -aq -f "name=^/${TEI_CONTAINER_NAME}$" | grep -q .; then
    docker rm "${TEI_CONTAINER_NAME}" >/dev/null
  fi
fi
echo "Starting TEI container: ${TEI_CONTAINER_NAME}"
echo "Image: ${TEI_IMAGE}"
echo "Model: ${TEI_MODEL_ID}"
echo "Port: ${TEI_PORT}"
echo "Mode: ${TEI_MODE}"

# Assemble the full docker invocation in one array. Everything after the
# image reference is passed through to the TEI server itself. The container
# listens on 80 internally; the HF cache is mounted at /data so model
# downloads persist across restarts.
run_args=(
  -d
  --name "${TEI_CONTAINER_NAME}"
  -p "${TEI_PORT}:80"
  "${GPU_ARGS[@]}"
  -v "${HF_CACHE_DIR}:/data"
  -e HF_TOKEN="${HF_TOKEN:-}"
  "${TEI_IMAGE}"
  --model-id "${TEI_MODEL_ID}"
  --dtype "${TEI_DTYPE}"
  --max-batch-tokens "${TEI_MAX_BATCH_TOKENS}"
  --max-client-batch-size "${TEI_MAX_CLIENT_BATCH_SIZE}"
)
docker run "${run_args[@]}" >/dev/null
# Poll the /health endpoint once per second until it answers or the
# timeout elapses. On timeout, dump recent container logs and fail.
echo "Waiting for TEI health..."
health_url="http://127.0.0.1:${TEI_PORT}/health"
for (( elapsed = 1; elapsed <= TEI_HEALTH_TIMEOUT_SEC; elapsed++ )); do
  if curl -sf "${health_url}" >/dev/null 2>&1; then
    echo "TEI health is ready: http://127.0.0.1:${TEI_PORT}"
    break
  fi
  sleep 1
  if (( elapsed == TEI_HEALTH_TIMEOUT_SEC )); then
    echo "ERROR: TEI failed to become healthy in time." >&2
    docker logs --tail 100 "${TEI_CONTAINER_NAME}" >&2 || true
    exit 1
  fi
done
# Probe the /embed endpoint to confirm the model produces usable vectors.
# Two identical rounds guard against transient issues right after startup.
echo "Running TEI output probe..."
for probe_idx in 1 2; do
  probe_resp="$(curl -sf -X POST "http://127.0.0.1:${TEI_PORT}/embed" \
    -H "Content-Type: application/json" \
    -d '{"inputs":["health check","芭比娃娃 儿童玩具"]}' || true)"
  if [[ -z "${probe_resp}" ]]; then
    echo "ERROR: TEI probe ${probe_idx} failed: empty response" >&2
    docker logs --tail 120 "${TEI_CONTAINER_NAME}" >&2 || true
    docker rm -f "${TEI_CONTAINER_NAME}" >/dev/null 2>&1 || true
    exit 1
  fi
  # Detect non-finite-like payloads (observed as null/NaN on incompatible
  # CUDA image + GPU). Use grep -E instead of rg so the script does not
  # depend on ripgrep being installed on the host.
  if echo "${probe_resp}" | grep -Eqi '(null|nan|inf)'; then
    echo "ERROR: TEI probe ${probe_idx} detected invalid embedding values (null/NaN/Inf)." >&2
    echo "Response preview: $(echo "${probe_resp}" | head -c 220)" >&2
    docker logs --tail 120 "${TEI_CONTAINER_NAME}" >&2 || true
    docker rm -f "${TEI_CONTAINER_NAME}" >/dev/null 2>&1 || true
    exit 1
  fi
done
echo "TEI is ready and output probe passed: http://127.0.0.1:${TEI_PORT}"
exit 0