#!/bin/bash # # Start Hugging Face TEI service for Qwen3-Embedding-0.6B using Docker. # set -euo pipefail PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)" cd "${PROJECT_ROOT}" # Load .env. # shellcheck source=scripts/lib/load_env.sh source "${PROJECT_ROOT}/scripts/lib/load_env.sh" load_env_file "${PROJECT_ROOT}/.env" if ! command -v docker >/dev/null 2>&1; then echo "ERROR: docker is required to run TEI service." >&2 exit 1 fi TEI_CONTAINER_NAME="${TEI_CONTAINER_NAME:-saas-search-tei}" TEI_PORT="${TEI_PORT:-8080}" TEI_MODEL_ID="${TEI_MODEL_ID:-Qwen/Qwen3-Embedding-0.6B}" TEI_VERSION="${TEI_VERSION:-1.9}" TEI_MAX_BATCH_TOKENS="${TEI_MAX_BATCH_TOKENS:-4096}" TEI_MAX_CLIENT_BATCH_SIZE="${TEI_MAX_CLIENT_BATCH_SIZE:-24}" TEI_DTYPE="${TEI_DTYPE:-float16}" HF_CACHE_DIR="${HF_CACHE_DIR:-$HOME/.cache/huggingface}" TEI_HEALTH_TIMEOUT_SEC="${TEI_HEALTH_TIMEOUT_SEC:-300}" TEI_DEVICE_RAW="${TEI_DEVICE:-cuda}" TEI_DEVICE="$(echo "${TEI_DEVICE_RAW}" | tr '[:upper:]' '[:lower:]')" if [[ "${TEI_DEVICE}" != "cuda" && "${TEI_DEVICE}" != "cpu" ]]; then echo "ERROR: invalid TEI_DEVICE=${TEI_DEVICE_RAW}. Use cuda/cpu." >&2 exit 1 fi detect_gpu_tei_image() { # Prefer turing image for pre-Ampere GPUs (e.g. Tesla T4, compute capability 7.5). local compute_cap major compute_cap="$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader 2>/dev/null | head -n1 || true)" major="${compute_cap%%.*}" if [[ -n "${major}" && "${major}" -lt 8 ]]; then echo "ghcr.io/huggingface/text-embeddings-inference:turing-${TEI_VERSION}" else echo "ghcr.io/huggingface/text-embeddings-inference:cuda-${TEI_VERSION}" fi } if [[ "${TEI_DEVICE}" == "cuda" ]]; then if ! command -v nvidia-smi >/dev/null 2>&1 || ! nvidia-smi >/dev/null 2>&1; then echo "ERROR: TEI_DEVICE=cuda but NVIDIA GPU is not available. No CPU fallback." >&2 exit 1 fi if ! docker info --format '{{json .Runtimes}}' 2>/dev/null | grep -q 'nvidia'; then echo "ERROR: TEI_DEVICE=cuda but Docker nvidia runtime is not configured." >&2 echo "Install and configure nvidia-container-toolkit, then restart Docker." >&2 exit 1 fi TEI_IMAGE="${TEI_IMAGE:-$(detect_gpu_tei_image)}" GPU_ARGS=(--gpus all) TEI_MODE="cuda" else TEI_IMAGE="${TEI_IMAGE:-ghcr.io/huggingface/text-embeddings-inference:${TEI_VERSION}}" GPU_ARGS=() TEI_MODE="cpu" fi mkdir -p "${HF_CACHE_DIR}" existing_id="$(docker ps -aq -f name=^/${TEI_CONTAINER_NAME}$)" if [[ -n "${existing_id}" ]]; then running_id="$(docker ps -q -f name=^/${TEI_CONTAINER_NAME}$)" if [[ -n "${running_id}" ]]; then current_image="$(docker inspect "${TEI_CONTAINER_NAME}" --format '{{.Config.Image}}' 2>/dev/null || true)" device_req="$(docker inspect "${TEI_CONTAINER_NAME}" --format '{{json .HostConfig.DeviceRequests}}' 2>/dev/null || true)" current_is_gpu_image=0 if [[ "${current_image}" == *":cuda-"* || "${current_image}" == *":turing-"* ]]; then current_is_gpu_image=1 fi if [[ "${TEI_DEVICE}" == "cuda" ]]; then if [[ "${current_is_gpu_image}" -eq 1 ]] && [[ "${device_req}" != "null" ]] && [[ "${current_image}" == "${TEI_IMAGE}" ]]; then echo "TEI already running (CUDA): ${TEI_CONTAINER_NAME}" exit 0 fi echo "TEI running with different mode/image; recreating container ${TEI_CONTAINER_NAME}" echo " current_image=${current_image:-unknown}" echo " target_image=${TEI_IMAGE}" docker rm -f "${TEI_CONTAINER_NAME}" >/dev/null 2>&1 || true else if [[ "${current_is_gpu_image}" -eq 0 ]] && [[ "${device_req}" == "null" ]] && [[ "${current_image}" == "${TEI_IMAGE}" ]]; then echo "TEI already running (CPU): ${TEI_CONTAINER_NAME}" exit 0 fi echo "TEI running with different mode/image; recreating container ${TEI_CONTAINER_NAME}" echo " current_image=${current_image:-unknown}" echo " target_image=${TEI_IMAGE}" docker rm -f "${TEI_CONTAINER_NAME}" >/dev/null 2>&1 || true fi fi if docker ps -aq -f name=^/${TEI_CONTAINER_NAME}$ | grep -q .; then docker rm "${TEI_CONTAINER_NAME}" >/dev/null fi fi echo "Starting TEI container: ${TEI_CONTAINER_NAME}" echo "Image: ${TEI_IMAGE}" echo "Model: ${TEI_MODEL_ID}" echo "Port: ${TEI_PORT}" echo "Mode: ${TEI_MODE}" docker run -d \ --name "${TEI_CONTAINER_NAME}" \ -p "${TEI_PORT}:80" \ "${GPU_ARGS[@]}" \ -v "${HF_CACHE_DIR}:/data" \ -e HF_TOKEN="${HF_TOKEN:-}" \ "${TEI_IMAGE}" \ --model-id "${TEI_MODEL_ID}" \ --dtype "${TEI_DTYPE}" \ --max-batch-tokens "${TEI_MAX_BATCH_TOKENS}" \ --max-client-batch-size "${TEI_MAX_CLIENT_BATCH_SIZE}" >/dev/null echo "Waiting for TEI health..." for i in $(seq 1 "${TEI_HEALTH_TIMEOUT_SEC}"); do if curl -sf "http://127.0.0.1:${TEI_PORT}/health" >/dev/null 2>&1; then echo "TEI health is ready: http://127.0.0.1:${TEI_PORT}" break fi sleep 1 if [[ "${i}" == "${TEI_HEALTH_TIMEOUT_SEC}" ]]; then echo "ERROR: TEI failed to become healthy in time." >&2 docker logs --tail 100 "${TEI_CONTAINER_NAME}" >&2 || true exit 1 fi done echo "Running TEI output probe..." for probe_idx in 1 2; do probe_resp="$(curl -sf -X POST "http://127.0.0.1:${TEI_PORT}/embed" \ -H "Content-Type: application/json" \ -d '{"inputs":["health check","芭比娃娃 儿童玩具"]}' || true)" if [[ -z "${probe_resp}" ]]; then echo "ERROR: TEI probe ${probe_idx} failed: empty response" >&2 docker logs --tail 120 "${TEI_CONTAINER_NAME}" >&2 || true docker rm -f "${TEI_CONTAINER_NAME}" >/dev/null 2>&1 || true exit 1 fi # Detect non-finite-like payloads (observed as null/NaN on incompatible CUDA image + GPU). if echo "${probe_resp}" | rg -qi '(null|nan|inf)'; then echo "ERROR: TEI probe ${probe_idx} detected invalid embedding values (null/NaN/Inf)." >&2 echo "Response preview: $(echo "${probe_resp}" | head -c 220)" >&2 docker logs --tail 120 "${TEI_CONTAINER_NAME}" >&2 || true docker rm -f "${TEI_CONTAINER_NAME}" >/dev/null 2>&1 || true exit 1 fi done echo "TEI is ready and output probe passed: http://127.0.0.1:${TEI_PORT}" exit 0