From 7913e2fb4b6fb39cd54c61727066648c316c42cc Mon Sep 17 00:00:00 2001 From: tangwang Date: Thu, 12 Mar 2026 23:31:59 +0800 Subject: [PATCH] 服务管理和监控 --- README.md | 15 +++++++++------ docs/QUICKSTART.md | 17 +++++++++++------ docs/Usage-Guide.md | 61 +++++++++++++++++++++++++++++++------------------------------ restart.sh | 2 +- run.sh | 2 +- scripts/service_ctl.sh | 461 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------- scripts/start.sh | 14 +------------- scripts/stop.sh | 8 +------- 8 files changed, 472 insertions(+), 108 deletions(-) diff --git a/README.md b/README.md index 6284c40..10b93ff 100644 --- a/README.md +++ b/README.md @@ -25,14 +25,17 @@ README 用于给后续开发者建立统一认知:**系统框架、模块边 ./scripts/init_env.sh # 从 .env.example 生成本地 .env(若不存在) source activate.sh -# 启动核心服务(backend/indexer/frontend) -./run.sh +# 推荐:一键拉起全部服务(含监控守护) +./run.sh all # 薄封装:等价于 ./scripts/service_ctl.sh up all -# 可选:附加能力服务(按需开启) -./scripts/service_ctl.sh start tei cnclip embedding translator reranker - -# 查看状态 +# 查看状态(含 monitor daemon 状态) ./scripts/service_ctl.sh status + +# 重启指定服务集合(示例:全部) +./restart.sh all # 薄封装:等价于 ./scripts/service_ctl.sh restart all + +# 一键停止(含 monitor daemon) +./scripts/stop.sh all # 薄封装:等价于 ./scripts/service_ctl.sh down all ``` 服务管理全盘说明(入口职责、默认行为、全量启停方式)见: diff --git a/docs/QUICKSTART.md b/docs/QUICKSTART.md index 63db1b4..cdfb69e 100644 --- a/docs/QUICKSTART.md +++ b/docs/QUICKSTART.md @@ -64,17 +64,20 @@ source activate.sh 启动与停止: ```bash -./run.sh -# 启动全部能力 -# 追加可选能力服务(显式指定) -TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip embedding 
translator reranker +./run.sh all +# 仅为薄封装:等价于 ./scripts/service_ctl.sh up all # 说明: +# - all = tei cnclip embedding translator reranker backend indexer frontend +# - up 会同时启动 monitor daemon(运行期连续失败自动重启) # - reranker 为 GPU 强制模式(资源不足会直接启动失败) # - TEI 默认使用 GPU;当 TEI_DEVICE=cuda 且 GPU 不可用时会直接失败(不会自动降级到 CPU) # - cnclip 默认使用 cuda;若显式配置为 cuda 且 GPU 不可用会直接失败(不会自动降级到 cpu) +./restart.sh all +# 仅为薄封装:等价于 ./scripts/service_ctl.sh restart all + ./scripts/service_ctl.sh status -./scripts/stop.sh +./scripts/stop.sh all # 仅为薄封装:等价于 ./scripts/service_ctl.sh down all ``` 服务管理方式(入口职责、默认行为、全量拉起顺序)见: @@ -313,7 +316,8 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: - `scripts/mock_data.sh`:生成 Tenant1 Mock + Tenant2 CSV 并导入 MySQL - `scripts/create_tenant_index.sh `:创建租户 ES 索引结构 - `POST /indexer/reindex`:从 MySQL 全量导入到 ES -- `run.sh` / `scripts/stop.sh`:服务启停;`scripts/service_ctl.sh`:start/stop/restart/status +- `run.sh` / `restart.sh` / `scripts/stop.sh`:推荐入口(对应 up/restart/down) +- `scripts/service_ctl.sh`:`up/down/start/stop/restart/status/monitor/monitor-start/monitor-stop/monitor-status` 更多脚本与验证命令见 `docs/Usage-Guide.md`。 @@ -516,6 +520,7 @@ curl http://localhost:6007/health - `logs/-YYYY-MM-DD.log`(`service_ctl.sh` 按天写入的真实文件) - `logs/.log`(指向当天文件的软链,推荐 `tail -F`) +- `logs/service-monitor.log`(`service_ctl.sh monitor` 运行期健康检查、失败计数、自动重启日志) - `logs/api.log`(backend 进程内日志,按天轮转) - `logs/backend_verbose.log`(backend 大对象详细日志,按天轮转) - `logs/indexer.log`(索引结构化 JSON 日志,按天轮转) diff --git a/docs/Usage-Guide.md b/docs/Usage-Guide.md index 3837ee9..1926465 100644 --- a/docs/Usage-Guide.md +++ b/docs/Usage-Guide.md @@ -121,14 +121,15 @@ API_PORT=6002 ```bash cd /data/saas-search -./run.sh +./run.sh all ``` 这个脚本会自动: 1. 创建日志目录 -2. 启动核心服务(backend/indexer/frontend) +2. 按目标启动服务(`all`:`tei cnclip embedding translator reranker backend indexer frontend`) 3. 写入 PID 到 `logs/*.pid` 4. 执行健康检查 +5. 
启动 monitor daemon(运行期连续失败自动重启) 启动完成后,访问: - **前端界面**: http://localhost:6003 @@ -136,11 +137,7 @@ cd /data/saas-search - **API文档**: http://localhost:6002/docs - **索引API**: http://localhost:6004/docs -可选:全功能模式(同时启动 embedding/translator/reranker/tei/cnclip): - -```bash -TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip embedding translator reranker -``` +仅启动部分服务示例:`./run.sh backend indexer frontend` ### 方式2: 统一控制脚本(推荐) @@ -148,17 +145,20 @@ TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip emb # 查看状态 ./scripts/service_ctl.sh status -# 启动核心服务(默认) -./scripts/service_ctl.sh start +# 推荐:一键启动 + 监控守护 +./scripts/service_ctl.sh up all # 启动指定服务 +./scripts/service_ctl.sh up backend indexer frontend + +# 启动指定服务(不自动拉起 monitor daemon) ./scripts/service_ctl.sh start backend indexer frontend translator reranker tei cnclip -# 停止全部服务(含可选服务) -./scripts/service_ctl.sh stop +# 停止全部服务(含 monitor daemon) +./scripts/service_ctl.sh down all -# 重启 -./scripts/service_ctl.sh restart +# 重启全部服务 +./scripts/service_ctl.sh restart all ``` ### 方式3: 分步启动(单环境) @@ -246,34 +246,35 @@ python -m http.server 6003 ### 1) 入口脚本职责 -- `./run.sh`:仅启动核心服务(`backend/indexer/frontend`)。 -- `./restart.sh`:重启逻辑为“先停所有已知服务,再启动核心服务”。 -- `./scripts/stop.sh`:停止所有已知服务。 -- `./scripts/service_ctl.sh`:统一控制器,支持 `start/stop/restart/status`,是唯一推荐入口。 +- `./run.sh [all|service...]`:薄封装,直接调用 `./scripts/service_ctl.sh up [all|service...]`。 +- `./restart.sh [all|service...]`:薄封装,直接调用 `./scripts/service_ctl.sh restart [all|service...]`。 +- `./scripts/stop.sh [all|service...]`:薄封装,直接调用 `./scripts/service_ctl.sh down [all|service...]`。 +- `./scripts/service_ctl.sh`:统一控制器,支持 `up/down/start/stop/restart/status/monitor*`(带参数行为完全由此脚本定义)。 ### 2) `service_ctl.sh` 的默认行为 -- `start`(不带服务名):启动核心服务 `backend/indexer/frontend`。 -- `stop`(不带服务名):停止全部已知服务(含可选服务)。 -- `restart`(不带服务名):先停全部,再只启动核心服务。 -- `status`(不带服务名):显示全部已知服务状态。 +- `up`:**必须显式指定** 服务或 `all`,并自动启动 monitor daemon。 +- `down`:**必须显式指定** 服务或 
`all`,并停止 monitor daemon。 +- `start`:**必须显式指定** 服务或 `all`(不自动拉起 monitor daemon)。 +- `stop`:**必须显式指定** 服务或 `all`;若 monitor daemon 运行会先停止它。 +- `restart`:**必须显式指定** 服务或 `all`。 +- `status`(不带服务名):显示全部已知服务状态 + monitor daemon 状态。 ### 3) 全量服务一键拉起 ```bash -TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip embedding translator reranker +./scripts/service_ctl.sh up all ``` 说明: - `TEI_DEVICE` / `CNCLIP_DEVICE` 统一使用 `cuda|cpu`。 -- 显式把 `tei`、`cnclip` 放在前面,避免 `embedding` 因依赖未就绪启动失败。 +- `all` 内部已按依赖顺序处理(先 `tei/cnclip` 再 `embedding`)。 ### 4) 常用运维命令 ```bash -# 先重启核心,再拉起可选服务(最常用) -./restart.sh -TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip embedding translator reranker +# 一键拉起完整栈(推荐) +./scripts/service_ctl.sh up all # 或 ./run.sh all # 查看全量状态 ./scripts/service_ctl.sh status @@ -282,17 +283,17 @@ TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip emb ./scripts/service_ctl.sh restart embedding # 停止全部 -./scripts/service_ctl.sh stop +./scripts/service_ctl.sh down all # 或 ./scripts/stop.sh all ``` ### 停止服务 ```bash -# 推荐:统一停止 -./scripts/stop.sh +# 推荐:统一停止(示例:全部) +./scripts/stop.sh all # 或使用统一控制脚本 -./scripts/service_ctl.sh stop +./scripts/service_ctl.sh down all ``` ### 服务端口 diff --git a/restart.sh b/restart.sh index 49c036f..9551817 100755 --- a/restart.sh +++ b/restart.sh @@ -6,4 +6,4 @@ set -euo pipefail cd "$(dirname "$0")" -./scripts/service_ctl.sh restart +./scripts/service_ctl.sh restart "$@" diff --git a/run.sh b/run.sh index 403554b..6ac1881 100755 --- a/run.sh +++ b/run.sh @@ -6,4 +6,4 @@ set -euo pipefail cd "$(dirname "$0")" -./scripts/service_ctl.sh start +./scripts/service_ctl.sh up "$@" diff --git a/scripts/service_ctl.sh b/scripts/service_ctl.sh index 3dc6c20..38d060c 100755 --- a/scripts/service_ctl.sh +++ b/scripts/service_ctl.sh @@ -1,7 +1,7 @@ #!/bin/bash # # Unified service lifecycle controller for saas-search. 
-# Supports: start / stop / restart / status +# Supports: up / down / start / stop / restart / status / monitor / monitor-start / monitor-stop / monitor-status # set -euo pipefail @@ -16,10 +16,12 @@ mkdir -p "${LOG_DIR}" source "${PROJECT_ROOT}/scripts/lib/load_env.sh" CORE_SERVICES=("backend" "indexer" "frontend") -OPTIONAL_SERVICES=("embedding" "translator" "reranker" "tei" "cnclip") +OPTIONAL_SERVICES=("tei" "cnclip" "embedding" "translator" "reranker") +FULL_SERVICES=("${OPTIONAL_SERVICES[@]}" "${CORE_SERVICES[@]}") +STOP_ORDER_SERVICES=("frontend" "indexer" "backend" "reranker" "translator" "embedding" "cnclip" "tei") all_services() { - echo "${CORE_SERVICES[@]} ${OPTIONAL_SERVICES[@]}" + echo "${FULL_SERVICES[@]}" } get_port() { @@ -90,24 +92,91 @@ validate_targets() { done } +health_path_for_service() { + local service="$1" + case "${service}" in + backend|indexer|embedding|translator|reranker|tei) echo "/health" ;; + frontend) echo "/" ;; + *) echo "" ;; + esac +} + +monitor_log_file() { + echo "${LOG_DIR}/service-monitor.log" +} + +monitor_pid_file() { + echo "${LOG_DIR}/service-monitor.pid" +} + +monitor_targets_file() { + echo "${LOG_DIR}/service-monitor.targets" +} + +monitor_current_targets() { + if [ -f "$(monitor_targets_file)" ]; then + cat "$(monitor_targets_file)" 2>/dev/null || true + fi +} + +monitor_log_event() { + local service="$1" + local level="$2" + local message="$3" + local ts line + ts="$(date '+%F %T')" + line="[${ts}] [${level}] [${service}] ${message}" + echo "${line}" | tee -a "$(monitor_log_file)" +} + +require_positive_int() { + local name="$1" + local value="$2" + if ! 
[[ "${value}" =~ ^[0-9]+$ ]] || [ "${value}" -le 0 ]; then + echo "[error] invalid ${name}=${value}, must be a positive integer" >&2 + return 1 + fi +} + +service_healthy_now() { + local service="$1" + local port + local path + + if [ "${service}" = "tei" ]; then + port="$(get_port "${service}")" + is_running_tei_container && + curl -sf "http://127.0.0.1:${port}/health" >/dev/null 2>&1 + return + fi + + if [ "${service}" = "cnclip" ]; then + is_running_by_pid "${service}" || is_running_by_port "${service}" + return + fi + + port="$(get_port "${service}")" + path="$(health_path_for_service "${service}")" + if [ -z "${port}" ] || [ -z "${path}" ]; then + return 1 + fi + if ! is_running_by_port "${service}"; then + return 1 + fi + curl -sf "http://127.0.0.1:${port}${path}" >/dev/null 2>&1 +} + wait_for_health() { local service="$1" local max_retries="${2:-30}" local interval_sec="${3:-1}" local port port="$(get_port "${service}")" - local path="/health" - - case "${service}" in - backend) path="/health" ;; - indexer) path="/health" ;; - frontend) path="/" ;; - embedding) path="/health" ;; - translator) path="/health" ;; - reranker) path="/health" ;; - tei) path="/health" ;; - *) return 0 ;; - esac + local path + path="$(health_path_for_service "${service}")" + if [ -z "${path}" ]; then + return 0 + fi local i=0 while [ "${i}" -lt "${max_retries}" ]; do @@ -126,7 +195,11 @@ wait_for_stable_health() { local interval_sec="${3:-1}" local port port="$(get_port "${service}")" - local path="/health" + local path + path="$(health_path_for_service "${service}")" + if [ -z "${path}" ]; then + return 0 + fi local i=0 while [ "${i}" -lt "${checks}" ]; do @@ -142,6 +215,161 @@ wait_for_stable_health() { return 0 } +monitor_services() { + local targets="$1" + local interval_sec="${MONITOR_INTERVAL_SEC:-10}" + local fail_threshold="${MONITOR_FAIL_THRESHOLD:-3}" + local restart_cooldown_sec="${MONITOR_RESTART_COOLDOWN_SEC:-30}" + local 
max_restarts_per_hour="${MONITOR_MAX_RESTARTS_PER_HOUR:-6}" + + require_positive_int "MONITOR_INTERVAL_SEC" "${interval_sec}" + require_positive_int "MONITOR_FAIL_THRESHOLD" "${fail_threshold}" + require_positive_int "MONITOR_RESTART_COOLDOWN_SEC" "${restart_cooldown_sec}" + require_positive_int "MONITOR_MAX_RESTARTS_PER_HOUR" "${max_restarts_per_hour}" + + touch "$(monitor_log_file)" + + declare -A fail_streak=() + declare -A last_restart_epoch=() + declare -A restart_history=() + + monitor_log_event "monitor" "info" "started targets=[${targets}] interval=${interval_sec}s fail_threshold=${fail_threshold} cooldown=${restart_cooldown_sec}s max_restarts_per_hour=${max_restarts_per_hour}" + trap 'monitor_log_event "monitor" "info" "received stop signal, exiting"; exit 0' INT TERM + + while true; do + local svc + for svc in ${targets}; do + if service_healthy_now "${svc}"; then + if [ "${fail_streak[${svc}]:-0}" -gt 0 ]; then + monitor_log_event "${svc}" "info" "health recovered after ${fail_streak[${svc}]} consecutive failures" + fi + fail_streak["${svc}"]=0 + continue + fi + + fail_streak["${svc}"]=$(( ${fail_streak[${svc}]:-0} + 1 )) + monitor_log_event "${svc}" "warn" "health check failed (${fail_streak[${svc}]}/${fail_threshold})" + + if [ "${fail_streak[${svc}]}" -lt "${fail_threshold}" ]; then + continue + fi + + local now + now="$(date +%s)" + local last + last="${last_restart_epoch[${svc}]:-0}" + if [ $((now - last)) -lt "${restart_cooldown_sec}" ]; then + monitor_log_event "${svc}" "warn" "restart suppressed by cooldown (${restart_cooldown_sec}s)" + continue + fi + + local t + local recent_history="" + local recent_count=0 + for t in ${restart_history[${svc}]:-}; do + if [ $((now - t)) -lt 3600 ]; then + recent_history="${recent_history} ${t}" + recent_count=$((recent_count + 1)) + fi + done + restart_history["${svc}"]="${recent_history# }" + + if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then + monitor_log_event "${svc}" "error" "restart 
suppressed by hourly cap (${max_restarts_per_hour}/hour)" + continue + fi + + monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures" + if stop_one "${svc}" && start_one "${svc}"; then + fail_streak["${svc}"]=0 + last_restart_epoch["${svc}"]="${now}" + restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" + monitor_log_event "${svc}" "info" "restart succeeded" + else + last_restart_epoch["${svc}"]="${now}" + restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" + monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")" + fi + done + sleep "${interval_sec}" + done +} + +is_monitor_daemon_running() { + local pf + pf="$(monitor_pid_file)" + if [ ! -f "${pf}" ]; then + return 1 + fi + local pid + pid="$(cat "${pf}" 2>/dev/null || true)" + [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null +} + +stop_monitor_daemon() { + local pf + pf="$(monitor_pid_file)" + local tf + tf="$(monitor_targets_file)" + + if ! 
is_monitor_daemon_running; then + rm -f "${pf}" + return 0 + fi + + local pid + pid="$(cat "${pf}" 2>/dev/null || true)" + if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then + echo "[stop] monitor daemon pid=${pid}" + kill -TERM "${pid}" 2>/dev/null || true + sleep 1 + if kill -0 "${pid}" 2>/dev/null; then + kill -KILL "${pid}" 2>/dev/null || true + fi + fi + rm -f "${pf}" "${tf}" +} + +start_monitor_daemon() { + local targets="$1" + local pf + pf="$(monitor_pid_file)" + local tf + tf="$(monitor_targets_file)" + + local current_targets + current_targets="$(monitor_current_targets)" + if is_monitor_daemon_running; then + if [ "${current_targets}" = "${targets}" ]; then + echo "[skip] monitor daemon already running (targets=[${targets}])" + return 0 + fi + echo "[info] monitor daemon targets changed: [${current_targets}] -> [${targets}]" + stop_monitor_daemon + fi + + echo "${targets}" > "${tf}" + nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >/dev/null 2>&1 & + local pid=$! 
+ echo "${pid}" > "${pf}" + echo "[ok] monitor daemon started (pid=${pid}, targets=[${targets}], log=$(monitor_log_file))" +} + +monitor_daemon_status() { + local running="no" + local pid="-" + local targets="-" + + if is_monitor_daemon_running; then + running="yes" + pid="$(cat "$(monitor_pid_file)" 2>/dev/null || echo "-")" + targets="$(monitor_current_targets)" + [ -z "${targets}" ] && targets="-" + fi + + printf "%-14s running=%-3s pid=%-8s targets=%s\n" "service-monitor" "${running}" "${pid}" "${targets}" +} + is_running_by_pid() { local service="$1" local pf @@ -379,6 +607,88 @@ status_one() { printf "%-10s running=%-3s port=%-6s pid=%s\n" "${service}" "${running}" "${port:--}" "${pid_info}" } +service_is_running() { + local service="$1" + case "${service}" in + tei) + is_running_tei_container + ;; + cnclip) + is_running_by_pid "${service}" || is_running_by_port "${service}" + ;; + *) + is_running_by_pid "${service}" || is_running_by_port "${service}" + ;; + esac +} + +expand_target_token() { + local token="$1" + case "${token}" in + all) + echo "$(all_services)" + ;; + *) + echo "${token}" + ;; + esac +} + +normalize_targets() { + local raw="$1" + declare -A seen=() + local out="" + local token svc + for token in ${raw}; do + for svc in $(expand_target_token "${token}"); do + if [ -z "${seen[${svc}]:-}" ]; then + seen["${svc}"]=1 + out="${out} ${svc}" + fi + done + done + echo "${out# }" +} + +sort_targets_by_order() { + local targets="$1" + shift || true + local out="" + local svc + declare -A want=() + for svc in ${targets}; do + want["${svc}"]=1 + done + + for svc in "$@"; do + if [ "${want[${svc}]:-0}" = "1" ]; then + out="${out} ${svc}" + unset "want[${svc}]" + fi + done + + for svc in ${targets}; do + if [ "${want[${svc}]:-0}" = "1" ]; then + out="${out} ${svc}" + unset "want[${svc}]" + fi + done + echo "${out# }" +} + +apply_target_order() { + local action="$1" + local targets="$2" + case "${action}" in + stop|down) + sort_targets_by_order 
"${targets}" "${STOP_ORDER_SERVICES[@]}" + ;; + *) + sort_targets_by_order "${targets}" "${FULL_SERVICES[@]}" + ;; + esac +} + resolve_targets() { local scope="$1" shift || true @@ -389,16 +699,12 @@ resolve_targets() { fi case "${scope}" in - start) - echo "${CORE_SERVICES[@]}" + monitor-stop|monitor-status) + echo "" ;; - stop|status) + status) echo "$(all_services)" ;; - restart) - # Restart with no explicit services uses the same explicit start target order. - echo "$(resolve_targets start)" - ;; *) echo "" ;; @@ -408,24 +714,39 @@ resolve_targets() { usage() { cat <<'EOF' Usage: + ./scripts/service_ctl.sh up [all|service...] + ./scripts/service_ctl.sh down [all|service...] ./scripts/service_ctl.sh start [service...] ./scripts/service_ctl.sh stop [service...] ./scripts/service_ctl.sh restart [service...] ./scripts/service_ctl.sh status [service...] + ./scripts/service_ctl.sh monitor [service...] + ./scripts/service_ctl.sh monitor-start [service...] + ./scripts/service_ctl.sh monitor-stop + ./scripts/service_ctl.sh monitor-status Default target set (when no service provided): - start -> backend indexer frontend - stop -> all known services - restart -> stop all known services, then start with start targets status -> all known services + up/start/stop/restart/down/monitor/monitor-start -> must specify services or all + +Special targets: + all -> all known services -Optional service startup: - ./scripts/service_ctl.sh start tei cnclip embedding translator reranker - TEI_DEVICE=cuda|cpu ./scripts/service_ctl.sh start tei - CNCLIP_DEVICE=cuda|cpu ./scripts/service_ctl.sh start cnclip +Examples: + ./scripts/service_ctl.sh up all + ./scripts/service_ctl.sh up backend indexer frontend + ./scripts/service_ctl.sh restart all + ./scripts/service_ctl.sh monitor-start all + ./scripts/service_ctl.sh monitor-status Log retention: LOG_RETENTION_DAYS=30 ./scripts/service_ctl.sh start + +Monitor tuning: + MONITOR_INTERVAL_SEC=10 + MONITOR_FAIL_THRESHOLD=3 +
MONITOR_RESTART_COOLDOWN_SEC=30 + MONITOR_MAX_RESTARTS_PER_HOUR=6 EOF } @@ -439,43 +760,95 @@ main() { shift || true load_env_file "${PROJECT_ROOT}/.env" - local stop_targets="" - local targets - # For restart without explicit services, stop everything first, then start - # with the default start target order. - if [ "${action}" = "restart" ] && [ "$#" -eq 0 ]; then - stop_targets="$(resolve_targets stop)" - fi - targets="$(resolve_targets "${action}" "$@")" - if [ -z "${targets}" ]; then - usage - exit 1 - fi - validate_targets "${targets}" + local targets="" + local monitor_was_running=0 + local monitor_prev_targets="" + + case "${action}" in + monitor-stop|monitor-status) + ;; + *) + targets="$(resolve_targets "${action}" "$@")" + if [ -z "${targets}" ]; then + usage + exit 1 + fi + targets="$(normalize_targets "${targets}")" + targets="$(apply_target_order "${action}" "${targets}")" + if [ -z "${targets}" ]; then + echo "[error] empty targets after expansion" >&2 + exit 1 + fi + validate_targets "${targets}" + ;; + esac case "${action}" in + up) + for svc in ${targets}; do + start_one "${svc}" + done + start_monitor_daemon "${targets}" + ;; + down) + stop_monitor_daemon + for svc in ${targets}; do + stop_one "${svc}" + done + ;; start) for svc in ${targets}; do start_one "${svc}" done ;; stop) + if is_monitor_daemon_running; then + echo "[info] stopping monitor daemon before manual stop" + stop_monitor_daemon + fi for svc in ${targets}; do stop_one "${svc}" done ;; restart) - for svc in ${stop_targets:-${targets}}; do + local restart_stop_targets + restart_stop_targets="$(apply_target_order stop "${targets}")" + if is_monitor_daemon_running; then + monitor_was_running=1 + monitor_prev_targets="$(monitor_current_targets)" + [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}" + stop_monitor_daemon + fi + for svc in ${restart_stop_targets}; do stop_one "${svc}" done for svc in ${targets}; do start_one "${svc}" done + if [ "${monitor_was_running}" 
-eq 1 ]; then + monitor_prev_targets="$(normalize_targets "${monitor_prev_targets}")" + monitor_prev_targets="$(apply_target_order monitor "${monitor_prev_targets}")" + [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}" + start_monitor_daemon "${monitor_prev_targets}" + fi ;; status) for svc in ${targets}; do status_one "${svc}" done + monitor_daemon_status + ;; + monitor) + monitor_services "${targets}" + ;; + monitor-start) + start_monitor_daemon "${targets}" + ;; + monitor-stop) + stop_monitor_daemon + ;; + monitor-status) + monitor_daemon_status ;; *) usage diff --git a/scripts/start.sh b/scripts/start.sh index 2a1b81d..cc15e75 100755 --- a/scripts/start.sh +++ b/scripts/start.sh @@ -7,16 +7,4 @@ set -euo pipefail cd "$(dirname "$0")/.." -echo "========================================" -echo "saas-search 服务启动" -echo "========================================" -echo "默认启动核心服务: backend/indexer/frontend" -echo "可选服务请显式指定:" -echo " ./scripts/service_ctl.sh start tei cnclip embedding translator reranker" -echo - -./scripts/service_ctl.sh start - -echo -echo "当前服务状态:" -./scripts/service_ctl.sh status backend indexer frontend tei cnclip embedding translator reranker +./scripts/service_ctl.sh up "$@" diff --git a/scripts/stop.sh b/scripts/stop.sh index f4b39da..4b4ee1a 100755 --- a/scripts/stop.sh +++ b/scripts/stop.sh @@ -7,10 +7,4 @@ set -euo pipefail cd "$(dirname "$0")/.." -echo "========================================" -echo "Stopping saas-search services" -echo "========================================" - -./scripts/service_ctl.sh stop - -echo "Done." +./scripts/service_ctl.sh down "$@" -- libgit2 0.21.2