Commit 7913e2fb4b6fb39cd54c61727066648c316c42cc
1 parent
149dad2b
服务管理和监控
Showing
8 changed files
with
472 additions
and
108 deletions
Show diff stats
README.md
| ... | ... | @@ -25,14 +25,17 @@ README 鈭賒撘遣蝡霈斤嚗**蝟餌芋器 |
| 25 | 25 | ./scripts/init_env.sh # 隞 .env.example .env嚗銝嚗 |
| 26 | 26 | source activate.sh |
| 27 | 27 | |
| 28 | -# 敹嚗ackend/indexer/frontend嚗 | |
| 29 | -./run.sh | |
| 28 | +# 絲嚗摰嚗 | |
| 29 | +./run.sh all # 等价于 ./scripts/service_ctl.sh up all | |
| 30 | 30 | |
| 31 | -# 嚗撘嚗 | |
| 32 | -./scripts/service_ctl.sh start tei cnclip embedding translator reranker | |
| 33 | - | |
| 34 | -# | |
| 31 | +# monitor daemon | |
| 35 | 32 | ./scripts/service_ctl.sh status |
| 33 | + | |
| 34 | +# 內靘嚗 | |
| 35 | +./restart.sh all # 等价于 ./scripts/service_ctl.sh restart all | |
| 36 | + | |
| 37 | +# 一键停止(含 monitor daemon) | |
| 38 | +./scripts/stop.sh all # 等价于 ./scripts/service_ctl.sh down all | |
| 36 | 39 | ``` |
| 37 | 40 | |
| 38 | 41 | 蝞∠秩提恕銵蛹撘 | ... | ... |
docs/QUICKSTART.md
| ... | ... | @@ -64,17 +64,20 @@ source activate.sh |
| 64 | 64 | 启动与停止: |
| 65 | 65 | |
| 66 | 66 | ```bash |
| 67 | -./run.sh | |
| 68 | -# 启动全部能力 | |
| 69 | -# 追加可选能力服务(显式指定) | |
| 70 | -TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip embedding translator reranker | |
| 67 | +./run.sh all | |
| 68 | +# 仅为薄封装:等价于 ./scripts/service_ctl.sh up all | |
| 71 | 69 | # 说明: |
| 70 | +# - all = tei cnclip embedding translator reranker backend indexer frontend | |
| 71 | +# - up 会同时启动 monitor daemon(运行期连续失败自动重启) | |
| 72 | 72 | # - reranker 为 GPU 强制模式(资源不足会直接启动失败) |
| 73 | 73 | # - TEI 默认使用 GPU;当 TEI_DEVICE=cuda 且 GPU 不可用时会直接失败(不会自动降级到 CPU) |
| 74 | 74 | # - cnclip 默认使用 cuda;若显式配置为 cuda 且 GPU 不可用会直接失败(不会自动降级到 cpu) |
| 75 | 75 | |
| 76 | +./restart.sh all | |
| 77 | +# 仅为薄封装:等价于 ./scripts/service_ctl.sh restart all | |
| 78 | + | |
| 76 | 79 | ./scripts/service_ctl.sh status |
| 77 | -./scripts/stop.sh | |
| 80 | +./scripts/stop.sh all # 仅为薄封装:等价于 ./scripts/service_ctl.sh down all | |
| 78 | 81 | ``` |
| 79 | 82 | |
| 80 | 83 | 服务管理方式(入口职责、默认行为、全量拉起顺序)见: |
| ... | ... | @@ -313,7 +316,8 @@ saas-search 以 MySQL 中的店匠标准表为权威数据源: |
| 313 | 316 | - `scripts/mock_data.sh`:生成 Tenant1 Mock + Tenant2 CSV 并导入 MySQL |
| 314 | 317 | - `scripts/create_tenant_index.sh <tenant_id>`:创建租户 ES 索引结构 |
| 315 | 318 | - `POST /indexer/reindex`:从 MySQL 全量导入到 ES |
| 316 | -- `run.sh` / `scripts/stop.sh`:服务启停;`scripts/service_ctl.sh`:start/stop/restart/status | |
| 319 | +- `run.sh` / `restart.sh` / `scripts/stop.sh`:推荐入口(对应 up/restart/down) | |
| 320 | +- `scripts/service_ctl.sh`:`up/down/start/stop/restart/status/monitor/monitor-start/monitor-stop/monitor-status` | |
| 317 | 321 | |
| 318 | 322 | 更多脚本与验证命令见 `docs/Usage-Guide.md`。 |
| 319 | 323 | |
| ... | ... | @@ -516,6 +520,7 @@ curl http://localhost:6007/health |
| 516 | 520 | |
| 517 | 521 | - `logs/<service>-YYYY-MM-DD.log`(`service_ctl.sh` 按天写入的真实文件) |
| 518 | 522 | - `logs/<service>.log`(指向当天文件的软链,推荐 `tail -F`) |
| 523 | +- `logs/service-monitor.log`(`service_ctl.sh monitor` 运行期健康检查、失败计数、自动重启日志) | |
| 519 | 524 | - `logs/api.log`(backend 进程内日志,按天轮转) |
| 520 | 525 | - `logs/backend_verbose.log`(backend 大对象详细日志,按天轮转) |
| 521 | 526 | - `logs/indexer.log`(索引结构化 JSON 日志,按天轮转) | ... | ... |
docs/Usage-Guide.md
| ... | ... | @@ -121,14 +121,15 @@ API_PORT=6002 |
| 121 | 121 | |
| 122 | 122 | ```bash |
| 123 | 123 | cd /data/saas-search |
| 124 | -./run.sh | |
| 124 | +./run.sh all | |
| 125 | 125 | ``` |
| 126 | 126 | |
| 127 | 127 | 这个脚本会自动: |
| 128 | 128 | 1. 创建日志目录 |
| 129 | -2. 启动核心服务(backend/indexer/frontend) | |
| 129 | +2. 按目标启动服务(`all`:`tei cnclip embedding translator reranker backend indexer frontend`) | |
| 130 | 130 | 3. 写入 PID 到 `logs/*.pid` |
| 131 | 131 | 4. 执行健康检查 |
| 132 | +5. 启动 monitor daemon(运行期连续失败自动重启) | |
| 132 | 133 | |
| 133 | 134 | 启动完成后,访问: |
| 134 | 135 | - **前端界面**: http://localhost:6003 |
| ... | ... | @@ -136,11 +137,7 @@ cd /data/saas-search |
| 136 | 137 | - **API文档**: http://localhost:6002/docs |
| 137 | 138 | - **索引API**: http://localhost:6004/docs |
| 138 | 139 | |
| 139 | -可选:全功能模式(同时启动 embedding/translator/reranker/tei/cnclip): | |
| 140 | - | |
| 141 | -```bash | |
| 142 | -TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip embedding translator reranker | |
| 143 | -``` | |
| 140 | +仅启动部分服务示例:`./run.sh backend indexer frontend` | |
| 144 | 141 | |
| 145 | 142 | ### 方式2: 统一控制脚本(推荐) |
| 146 | 143 | |
| ... | ... | @@ -148,17 +145,20 @@ TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip emb |
| 148 | 145 | # 查看状态 |
| 149 | 146 | ./scripts/service_ctl.sh status |
| 150 | 147 | |
| 151 | -# 启动核心服务(默认) | |
| 152 | -./scripts/service_ctl.sh start | |
| 148 | +# 推荐:一键启动 + 监控守护 | |
| 149 | +./scripts/service_ctl.sh up all | |
| 153 | 150 | |
| 154 | 151 | # 启动指定服务 |
| 152 | +./scripts/service_ctl.sh up backend indexer frontend | |
| 153 | + | |
| 154 | +# 启动指定服务(不自动拉起 monitor daemon) | |
| 155 | 155 | ./scripts/service_ctl.sh start backend indexer frontend translator reranker tei cnclip |
| 156 | 156 | |
| 157 | -# 停止全部服务(含可选服务) | |
| 158 | -./scripts/service_ctl.sh stop | |
| 157 | +# 停止全部服务(含 monitor daemon) | |
| 158 | +./scripts/service_ctl.sh down all | |
| 159 | 159 | |
| 160 | -# 重启 | |
| 161 | -./scripts/service_ctl.sh restart | |
| 160 | +# 重启全部服务 | |
| 161 | +./scripts/service_ctl.sh restart all | |
| 162 | 162 | ``` |
| 163 | 163 | |
| 164 | 164 | ### 方式3: 分步启动(单环境) |
| ... | ... | @@ -246,34 +246,35 @@ python -m http.server 6003 |
| 246 | 246 | |
| 247 | 247 | ### 1) 入口脚本职责 |
| 248 | 248 | |
| 249 | -- `./run.sh`:仅启动核心服务(`backend/indexer/frontend`)。 | |
| 250 | -- `./restart.sh`:重启逻辑为“先停所有已知服务,再启动核心服务”。 | |
| 251 | -- `./scripts/stop.sh`:停止所有已知服务。 | |
| 252 | -- `./scripts/service_ctl.sh`:统一控制器,支持 `start/stop/restart/status`,是唯一推荐入口。 | |
| 249 | +- `./run.sh [all|service...]`:薄封装,直接调用 `./scripts/service_ctl.sh up [all|service...]`。 | |
| 250 | +- `./restart.sh [all|service...]`:薄封装,直接调用 `./scripts/service_ctl.sh restart [all|service...]`。 | |
| 251 | +- `./scripts/stop.sh [all|service...]`:薄封装,直接调用 `./scripts/service_ctl.sh down [all|service...]`。 | |
| 252 | +- `./scripts/service_ctl.sh`:统一控制器,支持 `up/down/start/stop/restart/status/monitor*`(带参数行为完全由此脚本定义)。 | |
| 253 | 253 | |
| 254 | 254 | ### 2) `service_ctl.sh` 的默认行为 |
| 255 | 255 | |
| 256 | -- `start`(不带服务名):启动核心服务 `backend/indexer/frontend`。 | |
| 257 | -- `stop`(不带服务名):停止全部已知服务(含可选服务)。 | |
| 258 | -- `restart`(不带服务名):先停全部,再只启动核心服务。 | |
| 259 | -- `status`(不带服务名):显示全部已知服务状态。 | |
| 256 | +- `up`:**必须显式指定** 服务或 `all`,并自动启动 monitor daemon。 | |
| 257 | +- `down`:**必须显式指定** 服务或 `all`,并停止 monitor daemon。 | |
| 258 | +- `start`:**必须显式指定** 服务或 `all`(不自动拉起 monitor daemon)。 | |
| 259 | +- `stop`:**必须显式指定** 服务或 `all`;若 monitor daemon 运行会先停止它。 | |
| 260 | +- `restart`:**必须显式指定** 服务或 `all`。 | |
| 261 | +- `status`(不带服务名):显示全部已知服务状态 + monitor daemon 状态。 | |
| 260 | 262 | |
| 261 | 263 | ### 3) 全量服务一键拉起 |
| 262 | 264 | |
| 263 | 265 | ```bash |
| 264 | -TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip embedding translator reranker | |
| 266 | +./scripts/service_ctl.sh up all | |
| 265 | 267 | ``` |
| 266 | 268 | |
| 267 | 269 | 说明: |
| 268 | 270 | - `TEI_DEVICE` / `CNCLIP_DEVICE` 统一使用 `cuda|cpu`。 |
| 269 | -- 显式把 `tei`、`cnclip` 放在前面,避免 `embedding` 因依赖未就绪启动失败。 | |
| 271 | +- `all` 内部已按依赖顺序处理(先 `tei/cnclip` 再 `embedding`)。 | |
| 270 | 272 | |
| 271 | 273 | ### 4) 常用运维命令 |
| 272 | 274 | |
| 273 | 275 | ```bash |
| 274 | -# 先重启核心,再拉起可选服务(最常用) | |
| 275 | -./restart.sh | |
| 276 | -TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip embedding translator reranker | |
| 276 | +# 一键拉起完整栈(推荐) | |
| 277 | +./scripts/service_ctl.sh up all # 或 ./run.sh all | |
| 277 | 278 | |
| 278 | 279 | # 查看全量状态 |
| 279 | 280 | ./scripts/service_ctl.sh status |
| ... | ... | @@ -282,17 +283,17 @@ TEI_DEVICE=cuda CNCLIP_DEVICE=cuda ./scripts/service_ctl.sh start tei cnclip emb |
| 282 | 283 | ./scripts/service_ctl.sh restart embedding |
| 283 | 284 | |
| 284 | 285 | # 停止全部 |
| 285 | -./scripts/service_ctl.sh stop | |
| 286 | +./scripts/service_ctl.sh down all # 或 ./scripts/stop.sh all | |
| 286 | 287 | ``` |
| 287 | 288 | |
| 288 | 289 | ### 停止服务 |
| 289 | 290 | |
| 290 | 291 | ```bash |
| 291 | -# 推荐:统一停止 | |
| 292 | -./scripts/stop.sh | |
| 292 | +# 推荐:统一停止(示例:全部) | |
| 293 | +./scripts/stop.sh all | |
| 293 | 294 | |
| 294 | 295 | # 或使用统一控制脚本 |
| 295 | -./scripts/service_ctl.sh stop | |
| 296 | +./scripts/service_ctl.sh down all | |
| 296 | 297 | ``` |
| 297 | 298 | |
| 298 | 299 | ### 服务端口 | ... | ... |
restart.sh
scripts/service_ctl.sh
| 1 | 1 | #!/bin/bash |
| 2 | 2 | # |
| 3 | 3 | # Unified service lifecycle controller for saas-search. |
| 4 | -# Supports: start / stop / restart / status | |
| 4 | +# Supports: up / down / start / stop / restart / status / monitor / monitor-start / monitor-stop / monitor-status | |
| 5 | 5 | # |
| 6 | 6 | |
| 7 | 7 | set -euo pipefail |
| ... | ... | @@ -16,10 +16,12 @@ mkdir -p "${LOG_DIR}" |
| 16 | 16 | source "${PROJECT_ROOT}/scripts/lib/load_env.sh" |
| 17 | 17 | |
| 18 | 18 | CORE_SERVICES=("backend" "indexer" "frontend") |
| 19 | -OPTIONAL_SERVICES=("embedding" "translator" "reranker" "tei" "cnclip") | |
| 19 | +OPTIONAL_SERVICES=("tei" "cnclip" "embedding" "translator" "reranker") | |
| 20 | +FULL_SERVICES=("${OPTIONAL_SERVICES[@]}" "${CORE_SERVICES[@]}") | |
| 21 | +STOP_ORDER_SERVICES=("frontend" "indexer" "backend" "reranker" "translator" "embedding" "cnclip" "tei") | |
| 20 | 22 | |
| 21 | 23 | all_services() { |
| 22 | - echo "${CORE_SERVICES[@]} ${OPTIONAL_SERVICES[@]}" | |
| 24 | + echo "${FULL_SERVICES[@]}" | |
| 23 | 25 | } |
| 24 | 26 | |
| 25 | 27 | get_port() { |
| ... | ... | @@ -90,24 +92,91 @@ validate_targets() { |
| 90 | 92 | done |
| 91 | 93 | } |
| 92 | 94 | |
| 95 | +health_path_for_service() { | |
| 96 | + local service="$1" | |
| 97 | + case "${service}" in | |
| 98 | + backend|indexer|embedding|translator|reranker|tei) echo "/health" ;; | |
| 99 | + frontend) echo "/" ;; | |
| 100 | + *) echo "" ;; | |
| 101 | + esac | |
| 102 | +} | |
| 103 | + | |
| 104 | +monitor_log_file() { | |
| 105 | + echo "${LOG_DIR}/service-monitor.log" | |
| 106 | +} | |
| 107 | + | |
| 108 | +monitor_pid_file() { | |
| 109 | + echo "${LOG_DIR}/service-monitor.pid" | |
| 110 | +} | |
| 111 | + | |
| 112 | +monitor_targets_file() { | |
| 113 | + echo "${LOG_DIR}/service-monitor.targets" | |
| 114 | +} | |
| 115 | + | |
| 116 | +monitor_current_targets() { | |
| 117 | + if [ -f "$(monitor_targets_file)" ]; then | |
| 118 | + cat "$(monitor_targets_file)" 2>/dev/null || true | |
| 119 | + fi | |
| 120 | +} | |
| 121 | + | |
| 122 | +monitor_log_event() { | |
| 123 | + local service="$1" | |
| 124 | + local level="$2" | |
| 125 | + local message="$3" | |
| 126 | + local ts line | |
| 127 | + ts="$(date '+%F %T')" | |
| 128 | + line="[${ts}] [${level}] [${service}] ${message}" | |
| 129 | + echo "${line}" | tee -a "$(monitor_log_file)" | |
| 130 | +} | |
| 131 | + | |
| 132 | +require_positive_int() { | |
| 133 | + local name="$1" | |
| 134 | + local value="$2" | |
| 135 | + if ! [[ "${value}" =~ ^[0-9]+$ ]] || [ "${value}" -le 0 ]; then | |
| 136 | + echo "[error] invalid ${name}=${value}, must be a positive integer" >&2 | |
| 137 | + return 1 | |
| 138 | + fi | |
| 139 | +} | |
| 140 | + | |
| 141 | +service_healthy_now() { | |
| 142 | + local service="$1" | |
| 143 | + local port | |
| 144 | + local path | |
| 145 | + | |
| 146 | + if [ "${service}" = "tei" ]; then | |
| 147 | + port="$(get_port "${service}")" | |
| 148 | + is_running_tei_container && | |
| 149 | + curl -sf "http://127.0.0.1:${port}/health" >/dev/null 2>&1 | |
| 150 | + return | |
| 151 | + fi | |
| 152 | + | |
| 153 | + if [ "${service}" = "cnclip" ]; then | |
| 154 | + is_running_by_pid "${service}" || is_running_by_port "${service}" | |
| 155 | + return | |
| 156 | + fi | |
| 157 | + | |
| 158 | + port="$(get_port "${service}")" | |
| 159 | + path="$(health_path_for_service "${service}")" | |
| 160 | + if [ -z "${port}" ] || [ -z "${path}" ]; then | |
| 161 | + return 1 | |
| 162 | + fi | |
| 163 | + if ! is_running_by_port "${service}"; then | |
| 164 | + return 1 | |
| 165 | + fi | |
| 166 | + curl -sf "http://127.0.0.1:${port}${path}" >/dev/null 2>&1 | |
| 167 | +} | |
| 168 | + | |
| 93 | 169 | wait_for_health() { |
| 94 | 170 | local service="$1" |
| 95 | 171 | local max_retries="${2:-30}" |
| 96 | 172 | local interval_sec="${3:-1}" |
| 97 | 173 | local port |
| 98 | 174 | port="$(get_port "${service}")" |
| 99 | - local path="/health" | |
| 100 | - | |
| 101 | - case "${service}" in | |
| 102 | - backend) path="/health" ;; | |
| 103 | - indexer) path="/health" ;; | |
| 104 | - frontend) path="/" ;; | |
| 105 | - embedding) path="/health" ;; | |
| 106 | - translator) path="/health" ;; | |
| 107 | - reranker) path="/health" ;; | |
| 108 | - tei) path="/health" ;; | |
| 109 | - *) return 0 ;; | |
| 110 | - esac | |
| 175 | + local path | |
| 176 | + path="$(health_path_for_service "${service}")" | |
| 177 | + if [ -z "${path}" ]; then | |
| 178 | + return 0 | |
| 179 | + fi | |
| 111 | 180 | |
| 112 | 181 | local i=0 |
| 113 | 182 | while [ "${i}" -lt "${max_retries}" ]; do |
| ... | ... | @@ -126,7 +195,11 @@ wait_for_stable_health() { |
| 126 | 195 | local interval_sec="${3:-1}" |
| 127 | 196 | local port |
| 128 | 197 | port="$(get_port "${service}")" |
| 129 | - local path="/health" | |
| 198 | + local path | |
| 199 | + path="$(health_path_for_service "${service}")" | |
| 200 | + if [ -z "${path}" ]; then | |
| 201 | + return 0 | |
| 202 | + fi | |
| 130 | 203 | |
| 131 | 204 | local i=0 |
| 132 | 205 | while [ "${i}" -lt "${checks}" ]; do |
| ... | ... | @@ -142,6 +215,161 @@ wait_for_stable_health() { |
| 142 | 215 | return 0 |
| 143 | 216 | } |
| 144 | 217 | |
| 218 | +monitor_services() { | |
| 219 | + local targets="$1" | |
| 220 | + local interval_sec="${MONITOR_INTERVAL_SEC:-10}" | |
| 221 | + local fail_threshold="${MONITOR_FAIL_THRESHOLD:-3}" | |
| 222 | + local restart_cooldown_sec="${MONITOR_RESTART_COOLDOWN_SEC:-30}" | |
| 223 | + local max_restarts_per_hour="${MONITOR_MAX_RESTARTS_PER_HOUR:-6}" | |
| 224 | + | |
| 225 | + require_positive_int "MONITOR_INTERVAL_SEC" "${interval_sec}" | |
| 226 | + require_positive_int "MONITOR_FAIL_THRESHOLD" "${fail_threshold}" | |
| 227 | + require_positive_int "MONITOR_RESTART_COOLDOWN_SEC" "${restart_cooldown_sec}" | |
| 228 | + require_positive_int "MONITOR_MAX_RESTARTS_PER_HOUR" "${max_restarts_per_hour}" | |
| 229 | + | |
| 230 | + touch "$(monitor_log_file)" | |
| 231 | + | |
| 232 | + declare -A fail_streak=() | |
| 233 | + declare -A last_restart_epoch=() | |
| 234 | + declare -A restart_history=() | |
| 235 | + | |
| 236 | + monitor_log_event "monitor" "info" "started targets=[${targets}] interval=${interval_sec}s fail_threshold=${fail_threshold} cooldown=${restart_cooldown_sec}s max_restarts_per_hour=${max_restarts_per_hour}" | |
| 237 | + trap 'monitor_log_event "monitor" "info" "received stop signal, exiting"; exit 0' INT TERM | |
| 238 | + | |
| 239 | + while true; do | |
| 240 | + local svc | |
| 241 | + for svc in ${targets}; do | |
| 242 | + if service_healthy_now "${svc}"; then | |
| 243 | + if [ "${fail_streak[${svc}]:-0}" -gt 0 ]; then | |
| 244 | + monitor_log_event "${svc}" "info" "health recovered after ${fail_streak[${svc}]} consecutive failures" | |
| 245 | + fi | |
| 246 | + fail_streak["${svc}"]=0 | |
| 247 | + continue | |
| 248 | + fi | |
| 249 | + | |
| 250 | + fail_streak["${svc}"]=$(( ${fail_streak[${svc}]:-0} + 1 )) | |
| 251 | + monitor_log_event "${svc}" "warn" "health check failed (${fail_streak[${svc}]}/${fail_threshold})" | |
| 252 | + | |
| 253 | + if [ "${fail_streak[${svc}]}" -lt "${fail_threshold}" ]; then | |
| 254 | + continue | |
| 255 | + fi | |
| 256 | + | |
| 257 | + local now | |
| 258 | + now="$(date +%s)" | |
| 259 | + local last | |
| 260 | + last="${last_restart_epoch[${svc}]:-0}" | |
| 261 | + if [ $((now - last)) -lt "${restart_cooldown_sec}" ]; then | |
| 262 | + monitor_log_event "${svc}" "warn" "restart suppressed by cooldown (${restart_cooldown_sec}s)" | |
| 263 | + continue | |
| 264 | + fi | |
| 265 | + | |
| 266 | + local t | |
| 267 | + local recent_history="" | |
| 268 | + local recent_count=0 | |
| 269 | + for t in ${restart_history[${svc}]:-}; do | |
| 270 | + if [ $((now - t)) -lt 3600 ]; then | |
| 271 | + recent_history="${recent_history} ${t}" | |
| 272 | + recent_count=$((recent_count + 1)) | |
| 273 | + fi | |
| 274 | + done | |
| 275 | + restart_history["${svc}"]="${recent_history# }" | |
| 276 | + | |
| 277 | + if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then | |
| 278 | + monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)" | |
| 279 | + continue | |
| 280 | + fi | |
| 281 | + | |
| 282 | + monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures" | |
| 283 | + if stop_one "${svc}" && start_one "${svc}"; then | |
| 284 | + fail_streak["${svc}"]=0 | |
| 285 | + last_restart_epoch["${svc}"]="${now}" | |
| 286 | + restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" | |
| 287 | + monitor_log_event "${svc}" "info" "restart succeeded" | |
| 288 | + else | |
| 289 | + last_restart_epoch["${svc}"]="${now}" | |
| 290 | + restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" | |
| 291 | + monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")" | |
| 292 | + fi | |
| 293 | + done | |
| 294 | + sleep "${interval_sec}" | |
| 295 | + done | |
| 296 | +} | |
| 297 | + | |
| 298 | +is_monitor_daemon_running() { | |
| 299 | + local pf | |
| 300 | + pf="$(monitor_pid_file)" | |
| 301 | + if [ ! -f "${pf}" ]; then | |
| 302 | + return 1 | |
| 303 | + fi | |
| 304 | + local pid | |
| 305 | + pid="$(cat "${pf}" 2>/dev/null || true)" | |
| 306 | + [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null | |
| 307 | +} | |
| 308 | + | |
| 309 | +stop_monitor_daemon() { | |
| 310 | + local pf | |
| 311 | + pf="$(monitor_pid_file)" | |
| 312 | + local tf | |
| 313 | + tf="$(monitor_targets_file)" | |
| 314 | + | |
| 315 | + if ! is_monitor_daemon_running; then | |
| 316 | + rm -f "${pf}" | |
| 317 | + return 0 | |
| 318 | + fi | |
| 319 | + | |
| 320 | + local pid | |
| 321 | + pid="$(cat "${pf}" 2>/dev/null || true)" | |
| 322 | + if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then | |
| 323 | + echo "[stop] monitor daemon pid=${pid}" | |
| 324 | + kill -TERM "${pid}" 2>/dev/null || true | |
| 325 | + sleep 1 | |
| 326 | + if kill -0 "${pid}" 2>/dev/null; then | |
| 327 | + kill -KILL "${pid}" 2>/dev/null || true | |
| 328 | + fi | |
| 329 | + fi | |
| 330 | + rm -f "${pf}" "${tf}" | |
| 331 | +} | |
| 332 | + | |
| 333 | +start_monitor_daemon() { | |
| 334 | + local targets="$1" | |
| 335 | + local pf | |
| 336 | + pf="$(monitor_pid_file)" | |
| 337 | + local tf | |
| 338 | + tf="$(monitor_targets_file)" | |
| 339 | + | |
| 340 | + local current_targets | |
| 341 | + current_targets="$(monitor_current_targets)" | |
| 342 | + if is_monitor_daemon_running; then | |
| 343 | + if [ "${current_targets}" = "${targets}" ]; then | |
| 344 | + echo "[skip] monitor daemon already running (targets=[${targets}])" | |
| 345 | + return 0 | |
| 346 | + fi | |
| 347 | + echo "[info] monitor daemon targets changed: [${current_targets}] -> [${targets}]" | |
| 348 | + stop_monitor_daemon | |
| 349 | + fi | |
| 350 | + | |
| 351 | + echo "${targets}" > "${tf}" | |
| 352 | + nohup "${PROJECT_ROOT}/scripts/service_ctl.sh" monitor ${targets} >/dev/null 2>&1 & | |
| 353 | + local pid=$! | |
| 354 | + echo "${pid}" > "${pf}" | |
| 355 | + echo "[ok] monitor daemon started (pid=${pid}, targets=[${targets}], log=$(monitor_log_file))" | |
| 356 | +} | |
| 357 | + | |
| 358 | +monitor_daemon_status() { | |
| 359 | + local running="no" | |
| 360 | + local pid="-" | |
| 361 | + local targets="-" | |
| 362 | + | |
| 363 | + if is_monitor_daemon_running; then | |
| 364 | + running="yes" | |
| 365 | + pid="$(cat "$(monitor_pid_file)" 2>/dev/null || echo "-")" | |
| 366 | + targets="$(monitor_current_targets)" | |
| 367 | + [ -z "${targets}" ] && targets="-" | |
| 368 | + fi | |
| 369 | + | |
| 370 | + printf "%-14s running=%-3s pid=%-8s targets=%s\n" "service-monitor" "${running}" "${pid}" "${targets}" | |
| 371 | +} | |
| 372 | + | |
| 145 | 373 | is_running_by_pid() { |
| 146 | 374 | local service="$1" |
| 147 | 375 | local pf |
| ... | ... | @@ -379,6 +607,88 @@ status_one() { |
| 379 | 607 | printf "%-10s running=%-3s port=%-6s pid=%s\n" "${service}" "${running}" "${port:--}" "${pid_info}" |
| 380 | 608 | } |
| 381 | 609 | |
| 610 | +service_is_running() { | |
| 611 | + local service="$1" | |
| 612 | + case "${service}" in | |
| 613 | + tei) | |
| 614 | + is_running_tei_container | |
| 615 | + ;; | |
| 616 | + cnclip) | |
| 617 | + is_running_by_pid "${service}" || is_running_by_port "${service}" | |
| 618 | + ;; | |
| 619 | + *) | |
| 620 | + is_running_by_pid "${service}" || is_running_by_port "${service}" | |
| 621 | + ;; | |
| 622 | + esac | |
| 623 | +} | |
| 624 | + | |
| 625 | +expand_target_token() { | |
| 626 | + local token="$1" | |
| 627 | + case "${token}" in | |
| 628 | + all) | |
| 629 | + echo "$(all_services)" | |
| 630 | + ;; | |
| 631 | + *) | |
| 632 | + echo "${token}" | |
| 633 | + ;; | |
| 634 | + esac | |
| 635 | +} | |
| 636 | + | |
| 637 | +normalize_targets() { | |
| 638 | + local raw="$1" | |
| 639 | + declare -A seen=() | |
| 640 | + local out="" | |
| 641 | + local token svc | |
| 642 | + for token in ${raw}; do | |
| 643 | + for svc in $(expand_target_token "${token}"); do | |
| 644 | + if [ -z "${seen[${svc}]:-}" ]; then | |
| 645 | + seen["${svc}"]=1 | |
| 646 | + out="${out} ${svc}" | |
| 647 | + fi | |
| 648 | + done | |
| 649 | + done | |
| 650 | + echo "${out# }" | |
| 651 | +} | |
| 652 | + | |
| 653 | +sort_targets_by_order() { | |
| 654 | + local targets="$1" | |
| 655 | + shift || true | |
| 656 | + local out="" | |
| 657 | + local svc | |
| 658 | + declare -A want=() | |
| 659 | + for svc in ${targets}; do | |
| 660 | + want["${svc}"]=1 | |
| 661 | + done | |
| 662 | + | |
| 663 | + for svc in "$@"; do | |
| 664 | + if [ "${want[${svc}]:-0}" = "1" ]; then | |
| 665 | + out="${out} ${svc}" | |
| 666 | + unset "want[${svc}]" | |
| 667 | + fi | |
| 668 | + done | |
| 669 | + | |
| 670 | + for svc in ${targets}; do | |
| 671 | + if [ "${want[${svc}]:-0}" = "1" ]; then | |
| 672 | + out="${out} ${svc}" | |
| 673 | + unset "want[${svc}]" | |
| 674 | + fi | |
| 675 | + done | |
| 676 | + echo "${out# }" | |
| 677 | +} | |
| 678 | + | |
| 679 | +apply_target_order() { | |
| 680 | + local action="$1" | |
| 681 | + local targets="$2" | |
| 682 | + case "${action}" in | |
| 683 | + stop|down) | |
| 684 | + sort_targets_by_order "${targets}" "${STOP_ORDER_SERVICES[@]}" | |
| 685 | + ;; | |
| 686 | + *) | |
| 687 | + sort_targets_by_order "${targets}" "${FULL_SERVICES[@]}" | |
| 688 | + ;; | |
| 689 | + esac | |
| 690 | +} | |
| 691 | + | |
| 382 | 692 | resolve_targets() { |
| 383 | 693 | local scope="$1" |
| 384 | 694 | shift || true |
| ... | ... | @@ -389,16 +699,12 @@ resolve_targets() { |
| 389 | 699 | fi |
| 390 | 700 | |
| 391 | 701 | case "${scope}" in |
| 392 | - start) | |
| 393 | - echo "${CORE_SERVICES[@]}" | |
| 702 | + monitor-stop|monitor-status) | |
| 703 | + echo "" | |
| 394 | 704 | ;; |
| 395 | - stop|status) | |
| 705 | + status) | |
| 396 | 706 | echo "$(all_services)" |
| 397 | 707 | ;; |
| 398 | - restart) | |
| 399 | - # Restart with no explicit services uses the same explicit start target order. | |
| 400 | - echo "$(resolve_targets start)" | |
| 401 | - ;; | |
| 402 | 708 | *) |
| 403 | 709 | echo "" |
| 404 | 710 | ;; |
| ... | ... | @@ -408,24 +714,39 @@ resolve_targets() { |
| 408 | 714 | usage() { |
| 409 | 715 | cat <<'EOF' |
| 410 | 716 | Usage: |
| 717 | + ./scripts/service_ctl.sh up [all|service...] | |
| 718 | + ./scripts/service_ctl.sh down [service...] | |
| 411 | 719 | ./scripts/service_ctl.sh start [service...] |
| 412 | 720 | ./scripts/service_ctl.sh stop [service...] |
| 413 | 721 | ./scripts/service_ctl.sh restart [service...] |
| 414 | 722 | ./scripts/service_ctl.sh status [service...] |
| 723 | + ./scripts/service_ctl.sh monitor [service...] | |
| 724 | + ./scripts/service_ctl.sh monitor-start [service...] | |
| 725 | + ./scripts/service_ctl.sh monitor-stop | |
| 726 | + ./scripts/service_ctl.sh monitor-status | |
| 415 | 727 | |
| 416 | 728 | Default target set (when no service provided): |
| 417 | - start -> backend indexer frontend | |
| 418 | - stop -> all known services | |
| 419 | - restart -> stop all known services, then start with start targets | |
| 420 | 729 | status -> all known services |
| 730 | + up/start/stop/restart/down/monitor/monitor-start -> must specify services or all | |
| 731 | + | |
| 732 | +Special targets: | |
| 733 | + all -> all known services | |
| 421 | 734 | |
| 422 | -Optional service startup: | |
| 423 | - ./scripts/service_ctl.sh start tei cnclip embedding translator reranker | |
| 424 | - TEI_DEVICE=cuda|cpu ./scripts/service_ctl.sh start tei | |
| 425 | - CNCLIP_DEVICE=cuda|cpu ./scripts/service_ctl.sh start cnclip | |
| 735 | +Examples: | |
| 736 | + ./scripts/service_ctl.sh up all | |
| 737 | + ./scripts/service_ctl.sh up backend indexer frontend | |
| 738 | + ./scripts/service_ctl.sh restart | |
| 739 | + ./scripts/service_ctl.sh monitor-start all | |
| 740 | + ./scripts/service_ctl.sh monitor-status | |
| 426 | 741 | |
| 427 | 742 | Log retention: |
| 428 | 743 | LOG_RETENTION_DAYS=30 ./scripts/service_ctl.sh start |
| 744 | + | |
| 745 | +Monitor tuning: | |
| 746 | + MONITOR_INTERVAL_SEC=10 | |
| 747 | + MONITOR_FAIL_THRESHOLD=3 | |
| 748 | + MONITOR_RESTART_COOLDOWN_SEC=30 | |
| 749 | + MONITOR_MAX_RESTARTS_PER_HOUR=6 | |
| 429 | 750 | EOF |
| 430 | 751 | } |
| 431 | 752 | |
| ... | ... | @@ -439,43 +760,95 @@ main() { |
| 439 | 760 | shift || true |
| 440 | 761 | |
| 441 | 762 | load_env_file "${PROJECT_ROOT}/.env" |
| 442 | - local stop_targets="" | |
| 443 | - local targets | |
| 444 | - # For restart without explicit services, stop everything first, then start | |
| 445 | - # with the default start target order. | |
| 446 | - if [ "${action}" = "restart" ] && [ "$#" -eq 0 ]; then | |
| 447 | - stop_targets="$(resolve_targets stop)" | |
| 448 | - fi | |
| 449 | - targets="$(resolve_targets "${action}" "$@")" | |
| 450 | - if [ -z "${targets}" ]; then | |
| 451 | - usage | |
| 452 | - exit 1 | |
| 453 | - fi | |
| 454 | - validate_targets "${targets}" | |
| 763 | + local targets="" | |
| 764 | + local monitor_was_running=0 | |
| 765 | + local monitor_prev_targets="" | |
| 766 | + | |
| 767 | + case "${action}" in | |
| 768 | + monitor-stop|monitor-status) | |
| 769 | + ;; | |
| 770 | + *) | |
| 771 | + targets="$(resolve_targets "${action}" "$@")" | |
| 772 | + if [ -z "${targets}" ]; then | |
| 773 | + usage | |
| 774 | + exit 1 | |
| 775 | + fi | |
| 776 | + targets="$(normalize_targets "${targets}")" | |
| 777 | + targets="$(apply_target_order "${action}" "${targets}")" | |
| 778 | + if [ -z "${targets}" ]; then | |
| 779 | + echo "[error] empty targets after expansion" >&2 | |
| 780 | + exit 1 | |
| 781 | + fi | |
| 782 | + validate_targets "${targets}" | |
| 783 | + ;; | |
| 784 | + esac | |
| 455 | 785 | |
| 456 | 786 | case "${action}" in |
| 787 | + up) | |
| 788 | + for svc in ${targets}; do | |
| 789 | + start_one "${svc}" | |
| 790 | + done | |
| 791 | + start_monitor_daemon "${targets}" | |
| 792 | + ;; | |
| 793 | + down) | |
| 794 | + stop_monitor_daemon | |
| 795 | + for svc in ${targets}; do | |
| 796 | + stop_one "${svc}" | |
| 797 | + done | |
| 798 | + ;; | |
| 457 | 799 | start) |
| 458 | 800 | for svc in ${targets}; do |
| 459 | 801 | start_one "${svc}" |
| 460 | 802 | done |
| 461 | 803 | ;; |
| 462 | 804 | stop) |
| 805 | + if is_monitor_daemon_running; then | |
| 806 | + echo "[info] stopping monitor daemon before manual stop" | |
| 807 | + stop_monitor_daemon | |
| 808 | + fi | |
| 463 | 809 | for svc in ${targets}; do |
| 464 | 810 | stop_one "${svc}" |
| 465 | 811 | done |
| 466 | 812 | ;; |
| 467 | 813 | restart) |
| 468 | - for svc in ${stop_targets:-${targets}}; do | |
| 814 | + local restart_stop_targets | |
| 815 | + restart_stop_targets="$(apply_target_order stop "${targets}")" | |
| 816 | + if is_monitor_daemon_running; then | |
| 817 | + monitor_was_running=1 | |
| 818 | + monitor_prev_targets="$(monitor_current_targets)" | |
| 819 | + [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}" | |
| 820 | + stop_monitor_daemon | |
| 821 | + fi | |
| 822 | + for svc in ${restart_stop_targets}; do | |
| 469 | 823 | stop_one "${svc}" |
| 470 | 824 | done |
| 471 | 825 | for svc in ${targets}; do |
| 472 | 826 | start_one "${svc}" |
| 473 | 827 | done |
| 828 | + if [ "${monitor_was_running}" -eq 1 ]; then | |
| 829 | + monitor_prev_targets="$(normalize_targets "${monitor_prev_targets}")" | |
| 830 | + monitor_prev_targets="$(apply_target_order monitor "${monitor_prev_targets}")" | |
| 831 | + [ -z "${monitor_prev_targets}" ] && monitor_prev_targets="${targets}" | |
| 832 | + start_monitor_daemon "${monitor_prev_targets}" | |
| 833 | + fi | |
| 474 | 834 | ;; |
| 475 | 835 | status) |
| 476 | 836 | for svc in ${targets}; do |
| 477 | 837 | status_one "${svc}" |
| 478 | 838 | done |
| 839 | + monitor_daemon_status | |
| 840 | + ;; | |
| 841 | + monitor) | |
| 842 | + monitor_services "${targets}" | |
| 843 | + ;; | |
| 844 | + monitor-start) | |
| 845 | + start_monitor_daemon "${targets}" | |
| 846 | + ;; | |
| 847 | + monitor-stop) | |
| 848 | + stop_monitor_daemon | |
| 849 | + ;; | |
| 850 | + monitor-status) | |
| 851 | + monitor_daemon_status | |
| 479 | 852 | ;; |
| 480 | 853 | *) |
| 481 | 854 | usage | ... | ... |
scripts/start.sh
| ... | ... | @@ -7,16 +7,4 @@ set -euo pipefail |
| 7 | 7 | |
| 8 | 8 | cd "$(dirname "$0")/.." |
| 9 | 9 | |
| 10 | -echo "========================================" | |
| 11 | -echo "saas-search 服务启动" | |
| 12 | -echo "========================================" | |
| 13 | -echo "默认启动核心服务: backend/indexer/frontend" | |
| 14 | -echo "可选服务请显式指定:" | |
| 15 | -echo " ./scripts/service_ctl.sh start tei cnclip embedding translator reranker" | |
| 16 | -echo | |
| 17 | - | |
| 18 | -./scripts/service_ctl.sh start | |
| 19 | - | |
| 20 | -echo | |
| 21 | -echo "当前服务状态:" | |
| 22 | -./scripts/service_ctl.sh status backend indexer frontend tei cnclip embedding translator reranker | |
| 10 | +./scripts/service_ctl.sh up "$@" | ... | ... |
scripts/stop.sh
| ... | ... | @@ -7,10 +7,4 @@ set -euo pipefail |
| 7 | 7 | |
| 8 | 8 | cd "$(dirname "$0")/.." |
| 9 | 9 | |
| 10 | -echo "========================================" | |
| 11 | -echo "Stopping saas-search services" | |
| 12 | -echo "========================================" | |
| 13 | - | |
| 14 | -./scripts/service_ctl.sh stop | |
| 15 | - | |
| 16 | -echo "Done." | |
| 10 | +./scripts/service_ctl.sh down "$@" | ... | ... |