#!/bin/bash
#
# Unified service lifecycle controller for saas-search.
# Supports: up / down / start / stop / restart / status / monitor / monitor-start / monitor-stop / monitor-status
#

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
LOG_DIR="${PROJECT_ROOT}/logs"
LOG_RETENTION_DAYS="${LOG_RETENTION_DAYS:-30}"
mkdir -p "${LOG_DIR}"

# shellcheck source=scripts/lib/load_env.sh
source "${PROJECT_ROOT}/scripts/lib/load_env.sh"

CORE_SERVICES=("backend" "indexer" "frontend")
OPTIONAL_SERVICES=("tei" "cnclip" "embedding" "translator" "reranker")
# Canonical start order: optional dependencies first, then core services.
FULL_SERVICES=("${OPTIONAL_SERVICES[@]}" "${CORE_SERVICES[@]}")
# Stop order: reverse of start order (frontend first, tei last).
STOP_ORDER_SERVICES=("frontend" "indexer" "backend" "reranker" "translator" "embedding" "cnclip" "tei")

# Print every known service name, space separated.
all_services() {
  echo "${FULL_SERVICES[@]}"
}

# Print the TCP port for a service, or empty for unknown services.
# Environment overrides (API_PORT, TEI_PORT, ...) take precedence.
get_port() {
  local service="$1"
  case "${service}" in
    backend) echo "${API_PORT:-6002}" ;;
    indexer) echo "${INDEXER_PORT:-6004}" ;;
    frontend) echo "${FRONTEND_PORT:-6003}" ;;
    embedding) echo "${EMBEDDING_PORT:-6005}" ;;
    translator) echo "${TRANSLATION_PORT:-6006}" ;;
    reranker) echo "${RERANKER_PORT:-6007}" ;;
    tei) echo "${TEI_PORT:-8080}" ;;
    cnclip) echo "${CNCLIP_PORT:-51000}" ;;
    *) echo "" ;;
  esac
}

# Path of the pid file for a service.
pid_file() {
  local service="$1"
  echo "${LOG_DIR}/${service}.pid"
}

# Path of the stable (symlinked) log file for a service.
log_file() {
  local service="$1"
  echo "${LOG_DIR}/${service}.log"
}

# Point <service>.log at today's dated log file via a relative symlink.
prepare_daily_log_target() {
  local service="$1"
  local day
  local today_file
  day="$(date +%F)"
  today_file="${LOG_DIR}/${service}-${day}.log"
  touch "${today_file}"
  ln -sfn "$(basename "${today_file}")" "$(log_file "${service}")"
}

# Print the start script for a service; returns non-zero for unknown names.
service_start_cmd() {
  local service="$1"
  case "${service}" in
    backend) echo "./scripts/start_backend.sh" ;;
    indexer) echo "./scripts/start_indexer.sh" ;;
    frontend) echo "./scripts/start_frontend.sh" ;;
    embedding) echo "./scripts/start_embedding_service.sh" ;;
    translator) echo "./scripts/start_translator.sh" ;;
    reranker) echo "./scripts/start_reranker.sh" ;;
    tei) echo "./scripts/start_tei_service.sh" ;;
    cnclip) echo "./scripts/start_cnclip_service.sh" ;;
    *) return 1 ;;
  esac
}

# Return 0 iff the name is a known service.
service_exists() {
  local service="$1"
  case "${service}" in
    backend|indexer|frontend|embedding|translator|reranker|tei|cnclip) return 0 ;;
    *) return 1 ;;
  esac
}

# Fail with a message if any target in the list is unknown.
validate_targets() {
  local targets="$1"
  local svc
  for svc in ${targets}; do
    if ! service_exists "${svc}"; then
      echo "[error] unknown service: ${svc}" >&2
      return 1
    fi
  done
}

# HTTP health endpoint path for a service; empty when it has none (cnclip, frontend).
health_path_for_service() {
  local service="$1"
  case "${service}" in
    backend|indexer|embedding|translator|reranker|tei) echo "/health" ;;
    *) echo "" ;;
  esac
}

monitor_log_file() {
  echo "${LOG_DIR}/service-monitor.log"
}

monitor_pid_file() {
  echo "${LOG_DIR}/service-monitor.pid"
}

monitor_targets_file() {
  echo "${LOG_DIR}/service-monitor.targets"
}

# Return 0 if the monitor daemon is alive; remove stale state files otherwise.
sync_monitor_daemon_state() {
  local pf
  pf="$(monitor_pid_file)"
  local tf
  tf="$(monitor_targets_file)"
  if [ ! -f "${pf}" ]; then
    return 1
  fi
  local pid
  pid="$(cat "${pf}" 2>/dev/null || true)"
  if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi
  rm -f "${pf}" "${tf}"
  return 1
}

# Print the target list the monitor daemon was started with (may be empty).
monitor_current_targets() {
  if [ -f "$(monitor_targets_file)" ]; then
    cat "$(monitor_targets_file)" 2>/dev/null || true
  fi
}

# Union of two target lists, deduplicated and put into canonical order.
merge_targets() {
  local base="${1:-}"
  local extra="${2:-}"
  local merged=""
  merged="$(normalize_targets "${base} ${extra}")"
  if [ -n "${merged}" ]; then
    merged="$(apply_target_order monitor "${merged}")"
  fi
  echo "${merged}"
}

# Targets in $1 minus targets in $2, in canonical order.
subtract_targets() {
  local base="${1:-}"
  local remove="${2:-}"
  local out=""
  local svc
  declare -A removed=()
  for svc in ${remove}; do
    removed["${svc}"]=1
  done
  for svc in ${base}; do
    if [ "${removed[${svc}]:-0}" != "1" ]; then
      out="${out} ${svc}"
    fi
  done
  out="${out# }"
  if [ -n "${out}" ]; then
    out="$(normalize_targets "${out}")"
    out="$(apply_target_order monitor "${out}")"
  fi
  echo "${out}"
}

# Append a timestamped line to the monitor log (tee to stdout on a TTY).
monitor_log_event() {
  local service="$1"
  local level="$2"
  local message="$3"
  local ts line
  ts="$(date '+%F %T')"
  line="[${ts}] [${level}] [${service}] ${message}"
  if [ -t 1 ]; then
    echo "${line}" | tee -a "$(monitor_log_file)"
  else
    echo "${line}" >> "$(monitor_log_file)"
  fi
}

# Validate that a named tuning value is a positive integer.
require_positive_int() {
  local name="$1"
  local value="$2"
  if ! [[ "${value}" =~ ^[0-9]+$ ]] || [ "${value}" -le 0 ]; then
    echo "[error] invalid ${name}=${value}, must be a positive integer" >&2
    return 1
  fi
}

# Single-shot health probe (no retries). TEI checks container + HTTP;
# cnclip has no health endpoint so pid/port presence counts as healthy.
service_healthy_now() {
  local service="$1"
  local port
  local path
  if [ "${service}" = "tei" ]; then
    port="$(get_port "${service}")"
    is_running_tei_container && curl -sf "http://127.0.0.1:${port}/health" >/dev/null 2>&1
    return
  fi
  if [ "${service}" = "cnclip" ]; then
    is_running_by_pid "${service}" || is_running_by_port "${service}"
    return
  fi
  port="$(get_port "${service}")"
  path="$(health_path_for_service "${service}")"
  if [ -z "${port}" ]; then
    return 1
  fi
  if [ -z "${path}" ]; then
    is_running_by_pid "${service}" || is_running_by_port "${service}"
    return
  fi
  if ! is_running_by_port "${service}"; then
    return 1
  fi
  curl -sf "http://127.0.0.1:${port}${path}" >/dev/null 2>&1
}

# Poll the health endpoint until it answers or retries are exhausted.
# Services without a health path succeed immediately.
wait_for_health() {
  local service="$1"
  local max_retries="${2:-30}"
  local interval_sec="${3:-1}"
  local port
  port="$(get_port "${service}")"
  local path
  path="$(health_path_for_service "${service}")"
  if [ -z "${path}" ]; then
    return 0
  fi
  local i=0
  while [ "${i}" -lt "${max_retries}" ]; do
    if curl -sf "http://127.0.0.1:${port}${path}" >/dev/null 2>&1; then
      return 0
    fi
    i=$((i + 1))
    sleep "${interval_sec}"
  done
  return 1
}

# Require N consecutive successful health checks (guards against a service
# that answers once and then dies right after startup).
wait_for_stable_health() {
  local service="$1"
  local checks="${2:-3}"
  local interval_sec="${3:-1}"
  local port
  port="$(get_port "${service}")"
  local path
  path="$(health_path_for_service "${service}")"
  if [ -z "${path}" ]; then
    return 0
  fi
  local i=0
  while [ "${i}" -lt "${checks}" ]; do
    if ! is_running_by_port "${service}"; then
      return 1
    fi
    if ! curl -sf "http://127.0.0.1:${port}${path}" >/dev/null 2>&1; then
      return 1
    fi
    i=$((i + 1))
    sleep "${interval_sec}"
  done
  return 0
}

# Foreground monitor loop: health-check each target every interval, restart
# after N consecutive failures, subject to a cooldown and an hourly cap.
# Tunables: MONITOR_INTERVAL_SEC, MONITOR_FAIL_THRESHOLD,
# MONITOR_RESTART_COOLDOWN_SEC, MONITOR_MAX_RESTARTS_PER_HOUR.
monitor_services() {
  local targets="$1"
  local interval_sec="${MONITOR_INTERVAL_SEC:-10}"
  local fail_threshold="${MONITOR_FAIL_THRESHOLD:-3}"
  local restart_cooldown_sec="${MONITOR_RESTART_COOLDOWN_SEC:-30}"
  local max_restarts_per_hour="${MONITOR_MAX_RESTARTS_PER_HOUR:-6}"
  local wechat_alert_py="${PROJECT_ROOT}/scripts/wechat_alert.py"

  require_positive_int "MONITOR_INTERVAL_SEC" "${interval_sec}"
  require_positive_int "MONITOR_FAIL_THRESHOLD" "${fail_threshold}"
  require_positive_int "MONITOR_RESTART_COOLDOWN_SEC" "${restart_cooldown_sec}"
  require_positive_int "MONITOR_MAX_RESTARTS_PER_HOUR" "${max_restarts_per_hour}"

  touch "$(monitor_log_file)"

  # Daemon mode: record pid/targets and remove them on exit, but only if the
  # pid file still refers to this process (a newer daemon may have replaced us).
  if [ "${MONITOR_DAEMON:-0}" = "1" ]; then
    echo "$$" > "$(monitor_pid_file)"
    echo "${targets}" > "$(monitor_targets_file)"
    trap '
      current_pid="$(cat "$(monitor_pid_file)" 2>/dev/null || true)"
      if [ "${current_pid}" = "$$" ]; then
        rm -f "$(monitor_pid_file)" "$(monitor_targets_file)"
      fi
    ' EXIT
  fi

  declare -A fail_streak=()
  declare -A last_restart_epoch=()
  declare -A restart_history=()

  monitor_log_event "monitor" "info" "started targets=[${targets}] interval=${interval_sec}s fail_threshold=${fail_threshold} cooldown=${restart_cooldown_sec}s max_restarts_per_hour=${max_restarts_per_hour}"
  trap 'monitor_log_event "monitor" "info" "received stop signal, exiting"; exit 0' INT TERM

  while true; do
    local svc
    for svc in ${targets}; do
      if service_healthy_now "${svc}"; then
        if [ "${fail_streak[${svc}]:-0}" -gt 0 ]; then
          monitor_log_event "${svc}" "info" "health recovered after ${fail_streak[${svc}]} consecutive failures"
        fi
        fail_streak["${svc}"]=0
        continue
      fi

      fail_streak["${svc}"]=$(( ${fail_streak[${svc}]:-0} + 1 ))
      monitor_log_event "${svc}" "warn" "health check failed (${fail_streak[${svc}]}/${fail_threshold})"
      if [ "${fail_streak[${svc}]}" -lt "${fail_threshold}" ]; then
        continue
      fi

      local now
      now="$(date +%s)"
      local last
      last="${last_restart_epoch[${svc}]:-0}"
      if [ $((now - last)) -lt "${restart_cooldown_sec}" ]; then
        monitor_log_event "${svc}" "warn" "restart suppressed by cooldown (${restart_cooldown_sec}s)"
        continue
      fi

      # Keep only restarts from the last hour for the rate-cap decision.
      local t
      local recent_history=""
      local recent_count=0
      for t in ${restart_history[${svc}]:-}; do
        if [ $((now - t)) -lt 3600 ]; then
          recent_history="${recent_history} ${t}"
          recent_count=$((recent_count + 1))
        fi
      done
      restart_history["${svc}"]="${recent_history# }"

      if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then
        monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)"
        if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
          # An alert failure must never kill the monitor loop (set -e).
          python "${wechat_alert_py}" \
            --service "${svc}" \
            --level "error" \
            --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。" || true
        fi
        continue
      fi

      monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures"
      if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
        python "${wechat_alert_py}" \
          --service "${svc}" \
          --level "error" \
          --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。" || true
      fi

      if stop_one "${svc}" && start_one "${svc}"; then
        fail_streak["${svc}"]=0
        last_restart_epoch["${svc}"]="${now}"
        restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}"
        monitor_log_event "${svc}" "info" "restart succeeded"
      else
        last_restart_epoch["${svc}"]="${now}"
        restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}"
        monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")"
        if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
          python "${wechat_alert_py}" \
            --service "${svc}" \
            --level "error" \
            --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")." || true
        fi
      fi
    done
    sleep "${interval_sec}"
  done
}

is_monitor_daemon_running() {
  sync_monitor_daemon_state
}

# Stop the monitor daemon (TERM, then KILL) and remove its state files.
stop_monitor_daemon() {
  local pf
  pf="$(monitor_pid_file)"
  local tf
  tf="$(monitor_targets_file)"
  if ! is_monitor_daemon_running; then
    # Daemon already gone: drop any stale bookkeeping files too.
    rm -f "${pf}" "${tf}"
    return 0
  fi
  local pid
  pid="$(cat "${pf}" 2>/dev/null || true)"
  if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
    echo "[stop] monitor daemon pid=${pid}"
    kill -TERM "${pid}" 2>/dev/null || true
    sleep 1
    if kill -0 "${pid}" 2>/dev/null; then
      kill -KILL "${pid}" 2>/dev/null || true
    fi
  fi
  rm -f "${pf}" "${tf}"
}

# Launch the monitor loop as a background daemon. Idempotent when already
# running with the same targets; restarts it when the targets differ.
start_monitor_daemon() {
  local targets="$1"
  local pf
  pf="$(monitor_pid_file)"
  local tf
  tf="$(monitor_targets_file)"
  local current_targets
  current_targets="$(monitor_current_targets)"
  if is_monitor_daemon_running; then
    if [ "${current_targets}" = "${targets}" ]; then
      echo "[skip] monitor daemon already running (targets=[${targets}])"
      return 0
    fi
    echo "[info] monitor daemon targets changed: [${current_targets}] -> [${targets}]"
    stop_monitor_daemon
  fi
  echo "${targets}" > "${tf}"
  # shellcheck disable=SC2086 -- targets is intentionally word-split into args
  MONITOR_DAEMON=1 nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >> "$(monitor_log_file)" 2>&1 &
  local pid=$!
  echo "${pid}" > "${pf}"
  sleep 1
  if ! kill -0 "${pid}" 2>/dev/null; then
    rm -f "${pf}" "${tf}"
    echo "[error] monitor daemon failed to stay alive, inspect $(monitor_log_file)" >&2
    return 1
  fi
  echo "[ok] monitor daemon started (pid=${pid}, targets=[${targets}], log=$(monitor_log_file))"
}

# Print a one-line status summary for the monitor daemon.
monitor_daemon_status() {
  local running="no"
  local pid="-"
  local targets="-"
  if is_monitor_daemon_running; then
    running="yes"
    pid="$(cat "$(monitor_pid_file)" 2>/dev/null || echo "-")"
    targets="$(monitor_current_targets)"
    [ -z "${targets}" ] && targets="-"
  fi
  printf "%-14s running=%-3s pid=%-8s targets=%s\n" "service-monitor" "${running}" "${pid}" "${targets}"
}

# Return 0 if the service's recorded pid is alive; removes stale pid files.
is_running_by_pid() {
  local service="$1"
  local pf
  pf="$(pid_file "${service}")"
  if [ ! -f "${pf}" ]; then
    return 1
  fi
  local pid
  pid="$(cat "${pf}" 2>/dev/null || true)"
  if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi
  rm -f "${pf}"
  return 1
}

# Return 0 if something is listening on the service's port.
is_running_by_port() {
  local service="$1"
  local port
  port="$(get_port "${service}")"
  [ -n "${port}" ] && lsof -ti:"${port}" >/dev/null 2>&1
}

# Return 0 if the TEI docker container is up.
is_running_tei_container() {
  local tei_name="${TEI_CONTAINER_NAME:-saas-search-tei}"
  local cid
  cid="$(docker ps -q -f "name=^/${tei_name}$" 2>/dev/null || true)"
  [ -n "${cid}" ]
}

# Extract the configured device ('cuda'/'cpu') from the cnclip flow file.
get_cnclip_flow_device() {
  local flow_file="${PROJECT_ROOT}/third-party/clip-as-service/server/torch-flow-temp.yml"
  if [ ! -f "${flow_file}" ]; then
    return 1
  fi
  sed -n "s/^[[:space:]]*device:[[:space:]]*'\\([^']*\\)'.*/\\1/p" "${flow_file}" | head -n 1
}

# Startup health retry budget per service (reranker loads a model, so longer).
start_health_retries_for_service() {
  local service="$1"
  case "${service}" in
    reranker) echo 90 ;;
    *) echo 30 ;;
  esac
}

# Wait for first health, then for stable health; log success or the reason.
wait_for_startup_health() {
  local service="$1"
  local pid="$2"
  local lf="$3"
  local retries
  retries="$(start_health_retries_for_service "${service}")"
  if wait_for_health "${service}" "${retries}"; then
    if wait_for_stable_health "${service}" 5 1; then
      echo "[ok] ${service} healthy (pid=${pid}, log=${lf})"
      return 0
    fi
    echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2
    return 1
  fi
  echo "[error] ${service} health check timeout, inspect ${lf}" >&2
  return 1
}

# Start one service: skip duplicates, route output through the daily log
# router, record a pid file for nohup-managed services, wait for health.
start_one() {
  local service="$1"
  cd "${PROJECT_ROOT}"
  local cmd
  if ! cmd="$(service_start_cmd "${service}")"; then
    echo "[error] unknown service: ${service}" >&2
    return 1
  fi
  local pf lf
  pf="$(pid_file "${service}")"
  lf="$(log_file "${service}")"
  prepare_daily_log_target "${service}"

  if [ "${service}" != "tei" ]; then
    if is_running_by_pid "${service}" || is_running_by_port "${service}"; then
      if [ "${service}" = "cnclip" ]; then
        # cnclip cannot hot-switch devices; reject a device mismatch.
        local expected_device="${CNCLIP_DEVICE:-cuda}"
        expected_device="$(echo "${expected_device}" | tr '[:upper:]' '[:lower:]')"
        if [[ "${expected_device}" != "cuda" && "${expected_device}" != "cpu" ]]; then
          echo "[error] invalid CNCLIP_DEVICE=${CNCLIP_DEVICE}; use cuda/cpu" >&2
          return 1
        fi
        local actual_device
        actual_device="$(get_cnclip_flow_device 2>/dev/null || true)"
        if [ -n "${actual_device}" ] && [ "${actual_device}" != "${expected_device}" ]; then
          echo "[error] cnclip already running with device=${actual_device}, expected=${expected_device}" >&2
          echo "[error] run: ./scripts/service_ctl.sh stop cnclip && CNCLIP_DEVICE=${expected_device} ./scripts/service_ctl.sh start cnclip" >&2
          return 1
        fi
      fi
      echo "[skip] ${service} already running"
      return 0
    fi
  fi

  case "${service}" in
    cnclip|tei)
      echo "[start] ${service} (managed by native script)"
      if [ "${service}" = "cnclip" ]; then
        if ! CNCLIP_DEVICE="${CNCLIP_DEVICE:-cuda}" "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1; then
          echo "[error] ${service} start script failed, inspect ${lf}" >&2
          return 1
        fi
      else
        if ! "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1; then
          echo "[error] ${service} start script failed, inspect ${lf}" >&2
          return 1
        fi
      fi
      if [ "${service}" = "tei" ]; then
        if is_running_tei_container; then
          echo "[ok] ${service} started (log=${lf})"
        else
          echo "[error] ${service} failed to start, inspect ${lf}" >&2
          return 1
        fi
      elif is_running_by_pid "${service}" || is_running_by_port "${service}"; then
        echo "[ok] ${service} started (log=${lf})"
      else
        echo "[error] ${service} failed to start, inspect ${lf}" >&2
        return 1
      fi
      ;;
    backend|indexer|frontend|embedding|translator|reranker)
      echo "[start] ${service}"
      nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
      local pid=$!
      echo "${pid}" > "${pf}"
      wait_for_startup_health "${service}" "${pid}" "${lf}"
      ;;
    *)
      echo "[warn] ${service} unsupported start path"
      ;;
  esac
}

# Kill leftover vLLM engine-core workers that survive a reranker shutdown.
cleanup_reranker_orphans() {
  local engine_pids
  engine_pids="$(pgrep -f 'VLLM::EngineCore' 2>/dev/null || true)"
  if [ -z "${engine_pids}" ]; then
    return 0
  fi
  echo "[stop] reranker orphan engines=${engine_pids}"
  local pid
  for pid in ${engine_pids}; do
    kill -TERM "${pid}" 2>/dev/null || true
  done
  sleep 1
  engine_pids="$(pgrep -f 'VLLM::EngineCore' 2>/dev/null || true)"
  for pid in ${engine_pids}; do
    kill -KILL "${pid}" 2>/dev/null || true
  done
}

# Stop one service: native scripts for cnclip/tei; otherwise kill the pid
# from the pid file and then any stragglers still bound to the port.
stop_one() {
  local service="$1"
  cd "${PROJECT_ROOT}"
  if [ "${service}" = "cnclip" ]; then
    echo "[stop] cnclip (managed by native script)"
    bash -lc "./scripts/stop_cnclip_service.sh" || true
    return 0
  fi
  if [ "${service}" = "tei" ]; then
    echo "[stop] tei (managed by native script)"
    bash -lc "./scripts/stop_tei_service.sh" || true
    return 0
  fi
  local pf
  pf="$(pid_file "${service}")"
  local pid
  if [ -f "${pf}" ]; then
    pid="$(cat "${pf}" 2>/dev/null || true)"
    if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
      echo "[stop] ${service} pid=${pid}"
      kill -TERM "${pid}" 2>/dev/null || true
      sleep 1
      if kill -0 "${pid}" 2>/dev/null; then
        kill -KILL "${pid}" 2>/dev/null || true
      fi
    fi
    rm -f "${pf}"
  fi
  # Also clear any processes still listening on the service's port.
  local port
  port="$(get_port "${service}")"
  if [ -n "${port}" ]; then
    local pids
    pids="$(lsof -ti:"${port}" 2>/dev/null || true)"
    if [ -n "${pids}" ]; then
      echo "[stop] ${service} port=${port} pids=${pids}"
      for pid in ${pids}; do
        kill -TERM "${pid}" 2>/dev/null || true
      done
      sleep 1
      pids="$(lsof -ti:"${port}" 2>/dev/null || true)"
      for pid in ${pids}; do
        kill -KILL "${pid}" 2>/dev/null || true
      done
    fi
  fi
  if [ "${service}" = "reranker" ]; then
    cleanup_reranker_orphans
  fi
}

# Print one status line: running flag, port, pid (or container id), health.
status_one() {
  local service="$1"
  local port
  port="$(get_port "${service}")"
  local running="no"
  local pid_info="-"
  local health="down"
  local health_body=""

  if [ "${service}" = "tei" ]; then
    local cid
    local tei_name="${TEI_CONTAINER_NAME:-saas-search-tei}"
    cid="$(docker ps -q -f "name=^/${tei_name}$" 2>/dev/null || true)"
    if [ -n "${cid}" ]; then
      running="yes"
      pid_info="${cid:0:12}"
      # TEI: only after the container is running do we probe HTTP /health.
      local path
      path="$(health_path_for_service "${service}")"
      if [ -n "${port}" ] && [ -n "${path}" ]; then
        if health_body="$(curl -fsS "http://127.0.0.1:${port}${path}" 2>/dev/null)"; then
          health="ok"
        else
          health="fail"
        fi
      fi
    fi
    if [ -n "${health_body}" ]; then
      printf "%-10s running=%-3s port=%-6s pid=%s health=%-4s body=%s\n" "${service}" "${running}" "${port:--}" "${pid_info}" "${health}" "${health_body}"
    else
      printf "%-10s running=%-3s port=%-6s pid=%s health=%-4s\n" "${service}" "${running}" "${port:--}" "${pid_info}" "${health}"
    fi
    return
  fi

  if is_running_by_pid "${service}"; then
    running="yes"
    pid_info="$(cat "$(pid_file "${service}")" 2>/dev/null || echo "-")"
  elif is_running_by_port "${service}"; then
    running="yes"
    pid_info="$(lsof -ti:"${port}" 2>/dev/null | tr '\n' ',' | sed 's/,$//' || echo "-")"
  fi

  if [ "${running}" = "yes" ]; then
    local path
    path="$(health_path_for_service "${service}")"
    if [ -n "${port}" ] && [ -n "${path}" ]; then
      if health_body="$(curl -fsS "http://127.0.0.1:${port}${path}" 2>/dev/null)"; then
        health="ok"
      else
        health="fail"
      fi
    else
      # No HTTP health endpoint (e.g. cnclip): running counts as ok.
      health="ok"
    fi
  fi
  if [ -n "${health_body}" ]; then
    printf "%-10s running=%-3s port=%-6s pid=%s health=%-4s body=%s\n" "${service}" "${running}" "${port:--}" "${pid_info}" "${health}" "${health_body}"
  else
    printf "%-10s running=%-3s port=%-6s pid=%s health=%-4s\n" "${service}" "${running}" "${port:--}" "${pid_info}" "${health}"
  fi
}

# Return 0 if the service is up (container for tei, pid/port otherwise).
service_is_running() {
  local service="$1"
  case "${service}" in
    tei) is_running_tei_container ;;
    cnclip) is_running_by_pid "${service}" || is_running_by_port "${service}" ;;
    *) is_running_by_pid "${service}" || is_running_by_port "${service}" ;;
  esac
}

# Expand the special token "all" into the full service list.
expand_target_token() {
  local token="$1"
  case "${token}" in
    all) all_services ;;
    *) echo "${token}" ;;
  esac
}

# Expand special tokens and drop duplicates, preserving first-seen order.
normalize_targets() {
  local raw="$1"
  declare -A seen=()
  local out=""
  local token svc
  for token in ${raw}; do
    for svc in $(expand_target_token "${token}"); do
      if [ -z "${seen[${svc}]:-}" ]; then
        seen["${svc}"]=1
        out="${out} ${svc}"
      fi
    done
  done
  echo "${out# }"
}

# Order the targets by a reference order ($2...), unknowns appended last.
sort_targets_by_order() {
  local targets="$1"
  shift || true
  local out=""
  local svc
  declare -A want=()
  for svc in ${targets}; do
    want["${svc}"]=1
  done
  for svc in "$@"; do
    if [ "${want[${svc}]:-0}" = "1" ]; then
      out="${out} ${svc}"
      unset "want[${svc}]"
    fi
  done
  for svc in ${targets}; do
    if [ "${want[${svc}]:-0}" = "1" ]; then
      out="${out} ${svc}"
      unset "want[${svc}]"
    fi
  done
  echo "${out# }"
}

# Pick the per-action canonical ordering (stop order vs start order).
apply_target_order() {
  local action="$1"
  local targets="$2"
  case "${action}" in
    stop|down) sort_targets_by_order "${targets}" "${STOP_ORDER_SERVICES[@]}" ;;
    *) sort_targets_by_order "${targets}" "${FULL_SERVICES[@]}" ;;
  esac
}

# Resolve CLI targets: explicit args win; "status" defaults to everything;
# other actions default to empty (caller then shows usage).
resolve_targets() {
  local scope="$1"
  shift || true
  if [ "$#" -gt 0 ]; then
    echo "$*"
    return
  fi
  case "${scope}" in
    monitor-stop|monitor-status) echo "" ;;
    status) all_services ;;
    *) echo "" ;;
  esac
}

usage() {
  cat <<'EOF'
Usage:
  ./scripts/service_ctl.sh up [all|service...]
  ./scripts/service_ctl.sh down [service...]
  ./scripts/service_ctl.sh start [service...]
  ./scripts/service_ctl.sh stop [service...]
  ./scripts/service_ctl.sh restart [service...]
  ./scripts/service_ctl.sh status [service...]
  ./scripts/service_ctl.sh monitor [service...]
  ./scripts/service_ctl.sh monitor-start [service...]
  ./scripts/service_ctl.sh monitor-stop
  ./scripts/service_ctl.sh monitor-status

Default target set (when no service provided):
  status -> all known services
  up/start/stop/restart/down/monitor/monitor-start -> must specify services or all

Special targets:
  all -> all known services

Examples:
  ./scripts/service_ctl.sh up all
  ./scripts/service_ctl.sh up backend indexer frontend
  ./scripts/service_ctl.sh restart
  ./scripts/service_ctl.sh monitor-start all
  ./scripts/service_ctl.sh monitor-status

Log retention:
  LOG_RETENTION_DAYS=30 ./scripts/service_ctl.sh start

Monitor tuning:
  MONITOR_INTERVAL_SEC=10 MONITOR_FAIL_THRESHOLD=3 MONITOR_RESTART_COOLDOWN_SEC=30 MONITOR_MAX_RESTARTS_PER_HOUR=6
EOF
}

# Entry point: parse the action, resolve/validate targets, dispatch.
main() {
  if [ "$#" -lt 1 ]; then
    usage
    exit 1
  fi
  local action="$1"
  shift || true

  load_env_file "${PROJECT_ROOT}/.env"

  local targets=""
  local monitor_was_running=0
  local monitor_prev_targets=""
  local auto_monitor_on_start="${SERVICE_CTL_AUTO_MONITOR_ON_START:-1}"

  case "${action}" in
    monitor-stop|monitor-status) ;;
    *)
      targets="$(resolve_targets "${action}" "$@")"
      if [ -z "${targets}" ]; then
        usage
        exit 1
      fi
      targets="$(normalize_targets "${targets}")"
      targets="$(apply_target_order "${action}" "${targets}")"
      if [ -z "${targets}" ]; then
        echo "[error] empty targets after expansion" >&2
        exit 1
      fi
      validate_targets "${targets}"
      ;;
  esac

  case "${action}" in
    up)
      for svc in ${targets}; do
        start_one "${svc}"
      done
      start_monitor_daemon "${targets}"
      ;;
    down)
      stop_monitor_daemon
      for svc in ${targets}; do
        stop_one "${svc}"
      done
      ;;
    start)
      for svc in ${targets}; do
        start_one "${svc}"
      done
      if [ "${auto_monitor_on_start}" = "1" ]; then
        start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")"
      fi
      ;;
    stop)
      # Shrink (or stop) the monitor first so it does not resurrect the
      # services we are about to stop manually.
      if is_monitor_daemon_running; then
        local remaining_targets
        remaining_targets="$(subtract_targets "$(monitor_current_targets)" "${targets}")"
        if [ -n "${remaining_targets}" ]; then
          echo "[info] updating monitor daemon targets -> [${remaining_targets}]"
          start_monitor_daemon "${remaining_targets}"
        else
          echo "[info] stopping monitor daemon before manual stop"
          stop_monitor_daemon
        fi
      fi
      for svc in ${targets}; do
        stop_one "${svc}"
      done
      ;;
    restart)
      local restart_stop_targets
      restart_stop_targets="$(apply_target_order stop "${targets}")"
      if is_monitor_daemon_running; then
        monitor_was_running=1
        monitor_prev_targets="$(monitor_current_targets)"
        [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}"
        stop_monitor_daemon
      fi
      for svc in ${restart_stop_targets}; do
        stop_one "${svc}"
      done
      for svc in ${targets}; do
        start_one "${svc}"
      done
      if [ "${monitor_was_running}" -eq 1 ]; then
        monitor_prev_targets="$(normalize_targets "${monitor_prev_targets}")"
        monitor_prev_targets="$(apply_target_order monitor "${monitor_prev_targets}")"
        [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}"
        start_monitor_daemon "${monitor_prev_targets}"
      elif [ "${auto_monitor_on_start}" = "1" ]; then
        start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")"
      fi
      ;;
    status)
      for svc in ${targets}; do
        status_one "${svc}"
      done
      monitor_daemon_status
      ;;
    monitor)
      monitor_services "${targets}"
      ;;
    monitor-start)
      start_monitor_daemon "${targets}"
      ;;
    monitor-stop)
      stop_monitor_daemon
      ;;
    monitor-status)
      monitor_daemon_status
      ;;
    *)
      usage
      exit 1
      ;;
  esac
}

main "$@"