#!/bin/bash
#
# Unified service lifecycle controller for saas-search.
# Supports: up / down / start / stop / restart / status / monitor / monitor-start / monitor-stop / monitor-status
#

set -euo pipefail

PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
LOG_DIR="${PROJECT_ROOT}/logs"
LOG_RETENTION_DAYS="${LOG_RETENTION_DAYS:-30}"
mkdir -p "${LOG_DIR}"

# shellcheck source=scripts/lib/load_env.sh
source "${PROJECT_ROOT}/scripts/lib/load_env.sh"

CORE_SERVICES=("backend" "indexer" "frontend")
OPTIONAL_SERVICES=("tei" "cnclip" "embedding" "translator" "reranker")
# Canonical start order: optional dependencies first, then core services.
FULL_SERVICES=("${OPTIONAL_SERVICES[@]}" "${CORE_SERVICES[@]}")
# Stop order: reverse of start order (frontend first, tei last).
STOP_ORDER_SERVICES=("frontend" "indexer" "backend" "reranker" "translator" "embedding" "cnclip" "tei")

# Print every known service name, space separated.
all_services() {
  echo "${FULL_SERVICES[@]}"
}

# Print the TCP port for a service, or empty for unknown services.
# Environment overrides (API_PORT, TEI_PORT, ...) take precedence.
get_port() {
  local service="$1"
  case "${service}" in
    backend) echo "${API_PORT:-6002}" ;;
    indexer) echo "${INDEXER_PORT:-6004}" ;;
    frontend) echo "${FRONTEND_PORT:-6003}" ;;
    embedding) echo "${EMBEDDING_PORT:-6005}" ;;
    translator) echo "${TRANSLATION_PORT:-6006}" ;;
    reranker) echo "${RERANKER_PORT:-6007}" ;;
    tei) echo "${TEI_PORT:-8080}" ;;
    cnclip) echo "${CNCLIP_PORT:-51000}" ;;
    *) echo "" ;;
  esac
}

# Path of the pid file for a service.
pid_file() {
  local service="$1"
  echo "${LOG_DIR}/${service}.pid"
}

# Path of the stable (symlinked) log file for a service.
log_file() {
  local service="$1"
  echo "${LOG_DIR}/${service}.log"
}

# Point <service>.log at today's dated log file via a relative symlink.
prepare_daily_log_target() {
  local service="$1"
  local day
  local today_file
  day="$(date +%F)"
  today_file="${LOG_DIR}/${service}-${day}.log"
  touch "${today_file}"
  ln -sfn "$(basename "${today_file}")" "$(log_file "${service}")"
}

# Print the start script for a service; returns non-zero for unknown names.
service_start_cmd() {
  local service="$1"
  case "${service}" in
    backend) echo "./scripts/start_backend.sh" ;;
    indexer) echo "./scripts/start_indexer.sh" ;;
    frontend) echo "./scripts/start_frontend.sh" ;;
    embedding) echo "./scripts/start_embedding_service.sh" ;;
    translator) echo "./scripts/start_translator.sh" ;;
    reranker) echo "./scripts/start_reranker.sh" ;;
    tei) echo "./scripts/start_tei_service.sh" ;;
    cnclip) echo "./scripts/start_cnclip_service.sh" ;;
    *) return 1 ;;
  esac
}

# Return 0 iff the name is a known service.
service_exists() {
  local service="$1"
  case "${service}" in
    backend|indexer|frontend|embedding|translator|reranker|tei|cnclip) return 0 ;;
    *) return 1 ;;
  esac
}

# Fail with a message if any target in the list is unknown.
validate_targets() {
  local targets="$1"
  local svc
  for svc in ${targets}; do
    if ! service_exists "${svc}"; then
      echo "[error] unknown service: ${svc}" >&2
      return 1
    fi
  done
}

# HTTP health endpoint path for a service; empty when it has none (cnclip, frontend).
health_path_for_service() {
  local service="$1"
  case "${service}" in
    backend|indexer|embedding|translator|reranker|tei) echo "/health" ;;
    *) echo "" ;;
  esac
}

monitor_log_file() {
  echo "${LOG_DIR}/service-monitor.log"
}

monitor_pid_file() {
  echo "${LOG_DIR}/service-monitor.pid"
}

monitor_targets_file() {
  echo "${LOG_DIR}/service-monitor.targets"
}

# Return 0 if the monitor daemon is alive; remove stale state files otherwise.
sync_monitor_daemon_state() {
  local pf
  pf="$(monitor_pid_file)"
  local tf
  tf="$(monitor_targets_file)"
  if [ ! -f "${pf}" ]; then
    return 1
  fi
  local pid
  pid="$(cat "${pf}" 2>/dev/null || true)"
  if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi
  rm -f "${pf}" "${tf}"
  return 1
}

# Print the target list the monitor daemon was started with (may be empty).
monitor_current_targets() {
  if [ -f "$(monitor_targets_file)" ]; then
    cat "$(monitor_targets_file)" 2>/dev/null || true
  fi
}

# Union of two target lists, deduplicated and put into canonical order.
merge_targets() {
  local base="${1:-}"
  local extra="${2:-}"
  local merged=""
  merged="$(normalize_targets "${base} ${extra}")"
  if [ -n "${merged}" ]; then
    merged="$(apply_target_order monitor "${merged}")"
  fi
  echo "${merged}"
}

# Targets in $1 minus targets in $2, in canonical order.
subtract_targets() {
  local base="${1:-}"
  local remove="${2:-}"
  local out=""
  local svc
  declare -A removed=()
  for svc in ${remove}; do
    removed["${svc}"]=1
  done
  for svc in ${base}; do
    if [ "${removed[${svc}]:-0}" != "1" ]; then
      out="${out} ${svc}"
    fi
  done
  out="${out# }"
  if [ -n "${out}" ]; then
    out="$(normalize_targets "${out}")"
    out="$(apply_target_order monitor "${out}")"
  fi
  echo "${out}"
}

# Append a timestamped line to the monitor log (tee to stdout on a TTY).
monitor_log_event() {
  local service="$1"
  local level="$2"
  local message="$3"
  local ts line
  ts="$(date '+%F %T')"
  line="[${ts}] [${level}] [${service}] ${message}"
  if [ -t 1 ]; then
    echo "${line}" | tee -a "$(monitor_log_file)"
  else
    echo "${line}" >> "$(monitor_log_file)"
  fi
}

# Validate that a named tuning value is a positive integer.
require_positive_int() {
  local name="$1"
  local value="$2"
  if ! [[ "${value}" =~ ^[0-9]+$ ]] || [ "${value}" -le 0 ]; then
    echo "[error] invalid ${name}=${value}, must be a positive integer" >&2
    return 1
  fi
}

# Single-shot health probe (no retries). TEI checks container + HTTP;
# cnclip has no health endpoint so pid/port presence counts as healthy.
service_healthy_now() {
  local service="$1"
  local port
  local path
  if [ "${service}" = "tei" ]; then
    port="$(get_port "${service}")"
    is_running_tei_container && curl -sf "http://127.0.0.1:${port}/health" >/dev/null 2>&1
    return
  fi
  if [ "${service}" = "cnclip" ]; then
    is_running_by_pid "${service}" || is_running_by_port "${service}"
    return
  fi
  port="$(get_port "${service}")"
  path="$(health_path_for_service "${service}")"
  if [ -z "${port}" ]; then
    return 1
  fi
  if [ -z "${path}" ]; then
    is_running_by_pid "${service}" || is_running_by_port "${service}"
    return
  fi
  if ! is_running_by_port "${service}"; then
    return 1
  fi
  curl -sf "http://127.0.0.1:${port}${path}" >/dev/null 2>&1
}

# Poll the health endpoint until it answers or retries are exhausted.
# Services without a health path succeed immediately.
wait_for_health() {
  local service="$1"
  local max_retries="${2:-30}"
  local interval_sec="${3:-1}"
  local port
  port="$(get_port "${service}")"
  local path
  path="$(health_path_for_service "${service}")"
  if [ -z "${path}" ]; then
    return 0
  fi
  local i=0
  while [ "${i}" -lt "${max_retries}" ]; do
    if curl -sf "http://127.0.0.1:${port}${path}" >/dev/null 2>&1; then
      return 0
    fi
    i=$((i + 1))
    sleep "${interval_sec}"
  done
  return 1
}

# Require N consecutive successful health checks (guards against a service
# that answers once and then dies right after startup).
wait_for_stable_health() {
  local service="$1"
  local checks="${2:-3}"
  local interval_sec="${3:-1}"
  local port
  port="$(get_port "${service}")"
  local path
  path="$(health_path_for_service "${service}")"
  if [ -z "${path}" ]; then
    return 0
  fi
  local i=0
  while [ "${i}" -lt "${checks}" ]; do
    if ! is_running_by_port "${service}"; then
      return 1
    fi
    if ! curl -sf "http://127.0.0.1:${port}${path}" >/dev/null 2>&1; then
      return 1
    fi
    i=$((i + 1))
    sleep "${interval_sec}"
  done
  return 0
}

# Foreground monitor loop: health-check each target every interval, restart
# after N consecutive failures, subject to a cooldown and an hourly cap.
# Tunables: MONITOR_INTERVAL_SEC, MONITOR_FAIL_THRESHOLD,
# MONITOR_RESTART_COOLDOWN_SEC, MONITOR_MAX_RESTARTS_PER_HOUR.
monitor_services() {
  local targets="$1"
  local interval_sec="${MONITOR_INTERVAL_SEC:-10}"
  local fail_threshold="${MONITOR_FAIL_THRESHOLD:-3}"
  local restart_cooldown_sec="${MONITOR_RESTART_COOLDOWN_SEC:-30}"
  local max_restarts_per_hour="${MONITOR_MAX_RESTARTS_PER_HOUR:-6}"
  local wechat_alert_py="${PROJECT_ROOT}/scripts/wechat_alert.py"

  require_positive_int "MONITOR_INTERVAL_SEC" "${interval_sec}"
  require_positive_int "MONITOR_FAIL_THRESHOLD" "${fail_threshold}"
  require_positive_int "MONITOR_RESTART_COOLDOWN_SEC" "${restart_cooldown_sec}"
  require_positive_int "MONITOR_MAX_RESTARTS_PER_HOUR" "${max_restarts_per_hour}"

  touch "$(monitor_log_file)"

  # Daemon mode: record pid/targets and remove them on exit, but only if the
  # pid file still refers to this process (a newer daemon may have replaced us).
  if [ "${MONITOR_DAEMON:-0}" = "1" ]; then
    echo "$$" > "$(monitor_pid_file)"
    echo "${targets}" > "$(monitor_targets_file)"
    trap '
      current_pid="$(cat "$(monitor_pid_file)" 2>/dev/null || true)"
      if [ "${current_pid}" = "$$" ]; then
        rm -f "$(monitor_pid_file)" "$(monitor_targets_file)"
      fi
    ' EXIT
  fi

  declare -A fail_streak=()
  declare -A last_restart_epoch=()
  declare -A restart_history=()

  monitor_log_event "monitor" "info" "started targets=[${targets}] interval=${interval_sec}s fail_threshold=${fail_threshold} cooldown=${restart_cooldown_sec}s max_restarts_per_hour=${max_restarts_per_hour}"
  trap 'monitor_log_event "monitor" "info" "received stop signal, exiting"; exit 0' INT TERM

  while true; do
    local svc
    for svc in ${targets}; do
      if service_healthy_now "${svc}"; then
        if [ "${fail_streak[${svc}]:-0}" -gt 0 ]; then
          monitor_log_event "${svc}" "info" "health recovered after ${fail_streak[${svc}]} consecutive failures"
        fi
        fail_streak["${svc}"]=0
        continue
      fi

      fail_streak["${svc}"]=$(( ${fail_streak[${svc}]:-0} + 1 ))
      monitor_log_event "${svc}" "warn" "health check failed (${fail_streak[${svc}]}/${fail_threshold})"
      if [ "${fail_streak[${svc}]}" -lt "${fail_threshold}" ]; then
        continue
      fi

      local now
      now="$(date +%s)"
      local last
      last="${last_restart_epoch[${svc}]:-0}"
      if [ $((now - last)) -lt "${restart_cooldown_sec}" ]; then
        monitor_log_event "${svc}" "warn" "restart suppressed by cooldown (${restart_cooldown_sec}s)"
        continue
      fi

      # Keep only restarts from the last hour for the rate-cap decision.
      local t
      local recent_history=""
      local recent_count=0
      for t in ${restart_history[${svc}]:-}; do
        if [ $((now - t)) -lt 3600 ]; then
          recent_history="${recent_history} ${t}"
          recent_count=$((recent_count + 1))
        fi
      done
      restart_history["${svc}"]="${recent_history# }"

      if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then
        monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)"
        if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
          # An alert failure must never kill the monitor loop (set -e).
          python "${wechat_alert_py}" \
            --service "${svc}" \
            --level "error" \
            --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。" || true
        fi
        continue
      fi

      monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures"
      if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
        python "${wechat_alert_py}" \
          --service "${svc}" \
          --level "error" \
          --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。" || true
      fi

      if stop_one "${svc}" && start_one "${svc}"; then
        fail_streak["${svc}"]=0
        last_restart_epoch["${svc}"]="${now}"
        restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}"
        monitor_log_event "${svc}" "info" "restart succeeded"
      else
        last_restart_epoch["${svc}"]="${now}"
        restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}"
        monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")"
        if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
          python "${wechat_alert_py}" \
            --service "${svc}" \
            --level "error" \
            --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")." || true
        fi
      fi
    done
    sleep "${interval_sec}"
  done
}

is_monitor_daemon_running() {
  sync_monitor_daemon_state
}

# Stop the monitor daemon (TERM, then KILL) and remove its state files.
stop_monitor_daemon() {
  local pf
  pf="$(monitor_pid_file)"
  local tf
  tf="$(monitor_targets_file)"
  if ! is_monitor_daemon_running; then
    # Daemon already gone: drop any stale bookkeeping files too.
    rm -f "${pf}" "${tf}"
    return 0
  fi
  local pid
  pid="$(cat "${pf}" 2>/dev/null || true)"
  if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
    echo "[stop] monitor daemon pid=${pid}"
    kill -TERM "${pid}" 2>/dev/null || true
    sleep 1
    if kill -0 "${pid}" 2>/dev/null; then
      kill -KILL "${pid}" 2>/dev/null || true
    fi
  fi
  rm -f "${pf}" "${tf}"
}

# Launch the monitor loop as a background daemon. Idempotent when already
# running with the same targets; restarts it when the targets differ.
start_monitor_daemon() {
  local targets="$1"
  local pf
  pf="$(monitor_pid_file)"
  local tf
  tf="$(monitor_targets_file)"
  local current_targets
  current_targets="$(monitor_current_targets)"
  if is_monitor_daemon_running; then
    if [ "${current_targets}" = "${targets}" ]; then
      echo "[skip] monitor daemon already running (targets=[${targets}])"
      return 0
    fi
    echo "[info] monitor daemon targets changed: [${current_targets}] -> [${targets}]"
    stop_monitor_daemon
  fi
  echo "${targets}" > "${tf}"
  # shellcheck disable=SC2086 -- targets is intentionally word-split into args
  MONITOR_DAEMON=1 nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >> "$(monitor_log_file)" 2>&1 &
  local pid=$!
  echo "${pid}" > "${pf}"
  sleep 1
  if ! kill -0 "${pid}" 2>/dev/null; then
    rm -f "${pf}" "${tf}"
    echo "[error] monitor daemon failed to stay alive, inspect $(monitor_log_file)" >&2
    return 1
  fi
  echo "[ok] monitor daemon started (pid=${pid}, targets=[${targets}], log=$(monitor_log_file))"
}

# Print a one-line status summary for the monitor daemon.
monitor_daemon_status() {
  local running="no"
  local pid="-"
  local targets="-"
  if is_monitor_daemon_running; then
    running="yes"
    pid="$(cat "$(monitor_pid_file)" 2>/dev/null || echo "-")"
    targets="$(monitor_current_targets)"
    [ -z "${targets}" ] && targets="-"
  fi
  printf "%-14s running=%-3s pid=%-8s targets=%s\n" "service-monitor" "${running}" "${pid}" "${targets}"
}

# Return 0 if the service's recorded pid is alive; removes stale pid files.
is_running_by_pid() {
  local service="$1"
  local pf
  pf="$(pid_file "${service}")"
  if [ ! -f "${pf}" ]; then
    return 1
  fi
  local pid
  pid="$(cat "${pf}" 2>/dev/null || true)"
  if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
    return 0
  fi
  rm -f "${pf}"
  return 1
}

# Return 0 if something is listening on the service's port.
is_running_by_port() {
  local service="$1"
  local port
  port="$(get_port "${service}")"
  [ -n "${port}" ] && lsof -ti:"${port}" >/dev/null 2>&1
}

# Return 0 if the TEI docker container is up.
is_running_tei_container() {
  local tei_name="${TEI_CONTAINER_NAME:-saas-search-tei}"
  local cid
  cid="$(docker ps -q -f "name=^/${tei_name}$" 2>/dev/null || true)"
  [ -n "${cid}" ]
}

# Extract the configured device ('cuda'/'cpu') from the cnclip flow file.
get_cnclip_flow_device() {
  local flow_file="${PROJECT_ROOT}/third-party/clip-as-service/server/torch-flow-temp.yml"
  if [ ! -f "${flow_file}" ]; then
    return 1
  fi
  sed -n "s/^[[:space:]]*device:[[:space:]]*'\\([^']*\\)'.*/\\1/p" "${flow_file}" | head -n 1
}

# Startup health retry budget per service (reranker loads a model, so longer).
start_health_retries_for_service() {
  local service="$1"
  case "${service}" in
    reranker) echo 90 ;;
    *) echo 30 ;;
  esac
}

# Wait for first health, then for stable health; log success or the reason.
wait_for_startup_health() {
  local service="$1"
  local pid="$2"
  local lf="$3"
  local retries
  retries="$(start_health_retries_for_service "${service}")"
  if wait_for_health "${service}" "${retries}"; then
    if wait_for_stable_health "${service}" 5 1; then
      echo "[ok] ${service} healthy (pid=${pid}, log=${lf})"
      return 0
    fi
    echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2
    return 1
  fi
  echo "[error] ${service} health check timeout, inspect ${lf}" >&2
  return 1
}

# Start one service: skip duplicates, route output through the daily log
# router, record a pid file for nohup-managed services, wait for health.
start_one() {
  local service="$1"
  cd "${PROJECT_ROOT}"
  local cmd
  if ! cmd="$(service_start_cmd "${service}")"; then
    echo "[error] unknown service: ${service}" >&2
    return 1
  fi
  local pf lf
  pf="$(pid_file "${service}")"
  lf="$(log_file "${service}")"
  prepare_daily_log_target "${service}"

  if [ "${service}" != "tei" ]; then
    if is_running_by_pid "${service}" || is_running_by_port "${service}"; then
      if [ "${service}" = "cnclip" ]; then
        # cnclip cannot hot-switch devices; reject a device mismatch.
        local expected_device="${CNCLIP_DEVICE:-cuda}"
        expected_device="$(echo "${expected_device}" | tr '[:upper:]' '[:lower:]')"
        if [[ "${expected_device}" != "cuda" && "${expected_device}" != "cpu" ]]; then
          echo "[error] invalid CNCLIP_DEVICE=${CNCLIP_DEVICE}; use cuda/cpu" >&2
          return 1
        fi
        local actual_device
        actual_device="$(get_cnclip_flow_device 2>/dev/null || true)"
        if [ -n "${actual_device}" ] && [ "${actual_device}" != "${expected_device}" ]; then
          echo "[error] cnclip already running with device=${actual_device}, expected=${expected_device}" >&2
          echo "[error] run: ./scripts/service_ctl.sh stop cnclip && CNCLIP_DEVICE=${expected_device} ./scripts/service_ctl.sh start cnclip" >&2
          return 1
        fi
      fi
      echo "[skip] ${service} already running"
      return 0
    fi
  fi

  case "${service}" in
    cnclip|tei)
      echo "[start] ${service} (managed by native script)"
      if [ "${service}" = "cnclip" ]; then
        if ! CNCLIP_DEVICE="${CNCLIP_DEVICE:-cuda}" "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1; then
          echo "[error] ${service} start script failed, inspect ${lf}" >&2
          return 1
        fi
      else
        if ! "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1; then
          echo "[error] ${service} start script failed, inspect ${lf}" >&2
          return 1
        fi
      fi
      if [ "${service}" = "tei" ]; then
        if is_running_tei_container; then
          echo "[ok] ${service} started (log=${lf})"
        else
          echo "[error] ${service} failed to start, inspect ${lf}" >&2
          return 1
        fi
      elif is_running_by_pid "${service}" || is_running_by_port "${service}"; then
        echo "[ok] ${service} started (log=${lf})"
      else
        echo "[error] ${service} failed to start, inspect ${lf}" >&2
        return 1
      fi
      ;;
    backend|indexer|frontend|embedding|translator|reranker)
      echo "[start] ${service}"
      nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
      local pid=$!
      echo "${pid}" > "${pf}"
      wait_for_startup_health "${service}" "${pid}" "${lf}"
      ;;
    *)
      echo "[warn] ${service} unsupported start path"
      ;;
  esac
}

# Kill leftover vLLM engine-core workers that survive a reranker shutdown.
cleanup_reranker_orphans() {
  local engine_pids
  engine_pids="$(pgrep -f 'VLLM::EngineCore' 2>/dev/null || true)"
  if [ -z "${engine_pids}" ]; then
    return 0
  fi
  echo "[stop] reranker orphan engines=${engine_pids}"
  local pid
  for pid in ${engine_pids}; do
    kill -TERM "${pid}" 2>/dev/null || true
  done
  sleep 1
  engine_pids="$(pgrep -f 'VLLM::EngineCore' 2>/dev/null || true)"
  for pid in ${engine_pids}; do
    kill -KILL "${pid}" 2>/dev/null || true
  done
}

# Stop one service: native scripts for cnclip/tei; otherwise kill the pid
# from the pid file and then any stragglers still bound to the port.
stop_one() {
  local service="$1"
  cd "${PROJECT_ROOT}"
  if [ "${service}" = "cnclip" ]; then
    echo "[stop] cnclip (managed by native script)"
    bash -lc "./scripts/stop_cnclip_service.sh" || true
    return 0
  fi
  if [ "${service}" = "tei" ]; then
    echo "[stop] tei (managed by native script)"
    bash -lc "./scripts/stop_tei_service.sh" || true
    return 0
  fi
  local pf
  pf="$(pid_file "${service}")"
  local pid
  if [ -f "${pf}" ]; then
    pid="$(cat "${pf}" 2>/dev/null || true)"
    if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
      echo "[stop] ${service} pid=${pid}"
      kill -TERM "${pid}" 2>/dev/null || true
      sleep 1
      if kill -0 "${pid}" 2>/dev/null; then
        kill -KILL "${pid}" 2>/dev/null || true
      fi
    fi
    rm -f "${pf}"
  fi
  # Also clear any processes still listening on the service's port.
  local port
  port="$(get_port "${service}")"
  if [ -n "${port}" ]; then
    local pids
    pids="$(lsof -ti:"${port}" 2>/dev/null || true)"
    if [ -n "${pids}" ]; then
      echo "[stop] ${service} port=${port} pids=${pids}"
      for pid in ${pids}; do
        kill -TERM "${pid}" 2>/dev/null || true
      done
      sleep 1
      pids="$(lsof -ti:"${port}" 2>/dev/null || true)"
      for pid in ${pids}; do
        kill -KILL "${pid}" 2>/dev/null || true
      done
    fi
  fi
  if [ "${service}" = "reranker" ]; then
    cleanup_reranker_orphans
  fi
}

# Print one status line: running flag, port, pid (or container id), health.
status_one() {
  local service="$1"
  local port
  port="$(get_port "${service}")"
  local running="no"
  local pid_info="-"
  local health="down"
  local health_body=""

  if [ "${service}" = "tei" ]; then
    local cid
    local tei_name="${TEI_CONTAINER_NAME:-saas-search-tei}"
    cid="$(docker ps -q -f "name=^/${tei_name}$" 2>/dev/null || true)"
    if [ -n "${cid}" ]; then
      running="yes"
      pid_info="${cid:0:12}"
      # TEI: only after the container is running do we probe HTTP /health.
      local path
      path="$(health_path_for_service "${service}")"
      if [ -n "${port}" ] && [ -n "${path}" ]; then
        if health_body="$(curl -fsS "http://127.0.0.1:${port}${path}" 2>/dev/null)"; then
          health="ok"
        else
          health="fail"
        fi
      fi
    fi
    if [ -n "${health_body}" ]; then
      printf "%-10s running=%-3s port=%-6s pid=%s health=%-4s body=%s\n" "${service}" "${running}" "${port:--}" "${pid_info}" "${health}" "${health_body}"
    else
      printf "%-10s running=%-3s port=%-6s pid=%s health=%-4s\n" "${service}" "${running}" "${port:--}" "${pid_info}" "${health}"
    fi
    return
  fi

  if is_running_by_pid "${service}"; then
    running="yes"
    pid_info="$(cat "$(pid_file "${service}")" 2>/dev/null || echo "-")"
  elif is_running_by_port "${service}"; then
    running="yes"
    pid_info="$(lsof -ti:"${port}" 2>/dev/null | tr '\n' ',' | sed 's/,$//' || echo "-")"
  fi

  if [ "${running}" = "yes" ]; then
    local path
    path="$(health_path_for_service "${service}")"
    if [ -n "${port}" ] && [ -n "${path}" ]; then
      if health_body="$(curl -fsS "http://127.0.0.1:${port}${path}" 2>/dev/null)"; then
        health="ok"
      else
        health="fail"
      fi
    else
      # No HTTP health endpoint (e.g. cnclip): running counts as ok.
      health="ok"
    fi
  fi
  if [ -n "${health_body}" ]; then
    printf "%-10s running=%-3s port=%-6s pid=%s health=%-4s body=%s\n" "${service}" "${running}" "${port:--}" "${pid_info}" "${health}" "${health_body}"
  else
    printf "%-10s running=%-3s port=%-6s pid=%s health=%-4s\n" "${service}" "${running}" "${port:--}" "${pid_info}" "${health}"
  fi
}

# Return 0 if the service is up (container for tei, pid/port otherwise).
service_is_running() {
  local service="$1"
  case "${service}" in
    tei) is_running_tei_container ;;
    cnclip) is_running_by_pid "${service}" || is_running_by_port "${service}" ;;
    *) is_running_by_pid "${service}" || is_running_by_port "${service}" ;;
  esac
}

# Expand the special token "all" into the full service list.
expand_target_token() {
  local token="$1"
  case "${token}" in
    all) all_services ;;
    *) echo "${token}" ;;
  esac
}

# Expand special tokens and drop duplicates, preserving first-seen order.
normalize_targets() {
  local raw="$1"
  declare -A seen=()
  local out=""
  local token svc
  for token in ${raw}; do
    for svc in $(expand_target_token "${token}"); do
      if [ -z "${seen[${svc}]:-}" ]; then
        seen["${svc}"]=1
        out="${out} ${svc}"
      fi
    done
  done
  echo "${out# }"
}

# Order the targets by a reference order ($2...), unknowns appended last.
sort_targets_by_order() {
  local targets="$1"
  shift || true
  local out=""
  local svc
  declare -A want=()
  for svc in ${targets}; do
    want["${svc}"]=1
  done
  for svc in "$@"; do
    if [ "${want[${svc}]:-0}" = "1" ]; then
      out="${out} ${svc}"
      unset "want[${svc}]"
    fi
  done
  for svc in ${targets}; do
    if [ "${want[${svc}]:-0}" = "1" ]; then
      out="${out} ${svc}"
      unset "want[${svc}]"
    fi
  done
  echo "${out# }"
}

# Pick the per-action canonical ordering (stop order vs start order).
apply_target_order() {
  local action="$1"
  local targets="$2"
  case "${action}" in
    stop|down) sort_targets_by_order "${targets}" "${STOP_ORDER_SERVICES[@]}" ;;
    *) sort_targets_by_order "${targets}" "${FULL_SERVICES[@]}" ;;
  esac
}

# Resolve CLI targets: explicit args win; "status" defaults to everything;
# other actions default to empty (caller then shows usage).
resolve_targets() {
  local scope="$1"
  shift || true
  if [ "$#" -gt 0 ]; then
    echo "$*"
    return
  fi
  case "${scope}" in
    monitor-stop|monitor-status) echo "" ;;
    status) all_services ;;
    *) echo "" ;;
  esac
}

usage() {
  cat <<'EOF'
Usage:
  ./scripts/service_ctl.sh up [all|service...]
  ./scripts/service_ctl.sh down [service...]
  ./scripts/service_ctl.sh start [service...]
  ./scripts/service_ctl.sh stop [service...]
  ./scripts/service_ctl.sh restart [service...]
  ./scripts/service_ctl.sh status [service...]
  ./scripts/service_ctl.sh monitor [service...]
  ./scripts/service_ctl.sh monitor-start [service...]
  ./scripts/service_ctl.sh monitor-stop
  ./scripts/service_ctl.sh monitor-status

Default target set (when no service provided):
  status -> all known services
  up/start/stop/restart/down/monitor/monitor-start -> must specify services or all

Special targets:
  all -> all known services

Examples:
  ./scripts/service_ctl.sh up all
  ./scripts/service_ctl.sh up backend indexer frontend
  ./scripts/service_ctl.sh restart
  ./scripts/service_ctl.sh monitor-start all
  ./scripts/service_ctl.sh monitor-status

Log retention:
  LOG_RETENTION_DAYS=30 ./scripts/service_ctl.sh start

Monitor tuning:
  MONITOR_INTERVAL_SEC=10 MONITOR_FAIL_THRESHOLD=3 MONITOR_RESTART_COOLDOWN_SEC=30 MONITOR_MAX_RESTARTS_PER_HOUR=6
EOF
}

# Entry point: parse the action, resolve/validate targets, dispatch.
main() {
  if [ "$#" -lt 1 ]; then
    usage
    exit 1
  fi
  local action="$1"
  shift || true

  load_env_file "${PROJECT_ROOT}/.env"

  local targets=""
  local monitor_was_running=0
  local monitor_prev_targets=""
  local auto_monitor_on_start="${SERVICE_CTL_AUTO_MONITOR_ON_START:-1}"

  case "${action}" in
    monitor-stop|monitor-status) ;;
    *)
      targets="$(resolve_targets "${action}" "$@")"
      if [ -z "${targets}" ]; then
        usage
        exit 1
      fi
      targets="$(normalize_targets "${targets}")"
      targets="$(apply_target_order "${action}" "${targets}")"
      if [ -z "${targets}" ]; then
        echo "[error] empty targets after expansion" >&2
        exit 1
      fi
      validate_targets "${targets}"
      ;;
  esac

  case "${action}" in
    up)
      for svc in ${targets}; do
        start_one "${svc}"
      done
      start_monitor_daemon "${targets}"
      ;;
    down)
      stop_monitor_daemon
      for svc in ${targets}; do
        stop_one "${svc}"
      done
      ;;
    start)
      for svc in ${targets}; do
        start_one "${svc}"
      done
      if [ "${auto_monitor_on_start}" = "1" ]; then
        start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")"
      fi
      ;;
    stop)
      # Shrink (or stop) the monitor first so it does not resurrect the
      # services we are about to stop manually.
      if is_monitor_daemon_running; then
        local remaining_targets
        remaining_targets="$(subtract_targets "$(monitor_current_targets)" "${targets}")"
        if [ -n "${remaining_targets}" ]; then
          echo "[info] updating monitor daemon targets -> [${remaining_targets}]"
          start_monitor_daemon "${remaining_targets}"
        else
          echo "[info] stopping monitor daemon before manual stop"
          stop_monitor_daemon
        fi
      fi
      for svc in ${targets}; do
        stop_one "${svc}"
      done
      ;;
    restart)
      local restart_stop_targets
      restart_stop_targets="$(apply_target_order stop "${targets}")"
      if is_monitor_daemon_running; then
        monitor_was_running=1
        monitor_prev_targets="$(monitor_current_targets)"
        [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}"
        stop_monitor_daemon
      fi
      for svc in ${restart_stop_targets}; do
        stop_one "${svc}"
      done
      for svc in ${targets}; do
        start_one "${svc}"
      done
      if [ "${monitor_was_running}" -eq 1 ]; then
        monitor_prev_targets="$(normalize_targets "${monitor_prev_targets}")"
        monitor_prev_targets="$(apply_target_order monitor "${monitor_prev_targets}")"
        [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}"
        start_monitor_daemon "${monitor_prev_targets}"
      elif [ "${auto_monitor_on_start}" = "1" ]; then
        start_monitor_daemon "$(merge_targets "$(monitor_current_targets)" "${targets}")"
      fi
      ;;
    status)
      for svc in ${targets}; do
        status_one "${svc}"
      done
      monitor_daemon_status
      ;;
    monitor)
      monitor_services "${targets}"
      ;;
    monitor-start)
      start_monitor_daemon "${targets}"
      ;;
    monitor-stop)
      stop_monitor_daemon
      ;;
    monitor-status)
      monitor_daemon_status
      ;;
    *)
      usage
      exit 1
      ;;
  esac
}

main "$@"