Blame view

scripts/service_ctl.sh 12.9 KB
d1d356f8   tangwang   脚本优化
1
2
3
4
5
6
7
8
9
10
  #!/bin/bash
  #
  # Unified service lifecycle controller for saas-search.
  # Supports: start / stop / restart / status
  #
  
  # Strict mode: abort on a failing command (-e), on use of an unset variable
  # (-u), and when any stage of a pipeline fails (pipefail).
  set -euo pipefail
  
  # Absolute path of the directory one level above this script's directory.
  PROJECT_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
  # Directory that holds the per-service log and pid files.
  LOG_DIR="${PROJECT_ROOT}/logs"
28e57bb1   tangwang   日志体系优化
11
  # Retention value handed to daily_log_router.sh when a service is started.
  # Taken from the environment when set, otherwise 30.
  LOG_RETENTION_DAYS="${LOG_RETENTION_DAYS:-30}"

  # Make sure the log/pid directory exists before anything writes to it.
  mkdir -p "${LOG_DIR}"

  # NOTE(review): presumably this is where load_env_file (called from main)
  # is defined - confirm against scripts/lib/load_env.sh.
  # shellcheck source=scripts/lib/load_env.sh
  source "${PROJECT_ROOT}/scripts/lib/load_env.sh"

  # Services used as the default target set for `start` and `restart`.
  CORE_SERVICES=("backend" "indexer" "frontend")
  # Services that are only started when named explicitly.
  OPTIONAL_SERVICES=("embedding" "translator" "reranker" "tei" "cnclip")
d1d356f8   tangwang   脚本优化
20
21
  
  # Print every known service name on one line, separated by single spaces:
  # the core services first, then the optional ones.
  # Globals: CORE_SERVICES, OPTIONAL_SERVICES (read)
  all_services() {
    # Collect both arrays as separate words first, instead of mixing two array
    # expansions inside a single quoted string.
    local -a names=("${CORE_SERVICES[@]}" "${OPTIONAL_SERVICES[@]}")
    echo "${names[@]}"
  }
  
d1d356f8   tangwang   脚本优化
25
26
27
28
29
30
31
  # Print the port configured for a service.
  # Arguments: $1 - service name
  # Outputs:   the value of the service's *_PORT environment variable, or the
  #            built-in default when that variable is unset or empty; an empty
  #            line for an unrecognized service name.
  get_port() {
    local service="$1"
    case "${service}" in
      backend) echo "${API_PORT:-6002}" ;;
      indexer) echo "${INDEXER_PORT:-6004}" ;;
      frontend) echo "${FRONTEND_PORT:-6003}" ;;
      embedding) echo "${EMBEDDING_PORT:-6005}" ;;
      translator) echo "${TRANSLATION_PORT:-6006}" ;;
      reranker) echo "${RERANKER_PORT:-6007}" ;;
      tei) echo "${TEI_PORT:-8080}" ;;
      cnclip) echo "${CNCLIP_PORT:-51000}" ;;
      *) echo "" ;;
    esac
  }
  
  # Print the path of the pid file for a service.
  # Globals:   LOG_DIR (read)
  # Arguments: $1 - service name
  pid_file() {
    local service="$1"
    echo "${LOG_DIR}/${service}.pid"
  }
  
  # Print the path of the log file for a service.
  # Globals:   LOG_DIR (read)
  # Arguments: $1 - service name
  log_file() {
    local name="$1"
    printf '%s\n' "${LOG_DIR}/${name}.log"
  }
  
28e57bb1   tangwang   日志体系优化
50
51
52
53
54
55
56
57
58
59
  # Make sure today's dated log file exists for a service and point the
  # service's log_file path at it through a relative symlink.
  # Globals:   LOG_DIR (read)
  # Arguments: $1 - service name
  prepare_daily_log_target() {
    local service="$1"
    local dated_path
    dated_path="${LOG_DIR}/${service}-$(date +%F).log"
    touch "${dated_path}"
    # The link target is the bare file name, so the link stays valid if the
    # log directory is moved.
    ln -sfn "${dated_path##*/}" "$(log_file "${service}")"
  }
  
d1d356f8   tangwang   脚本优化
60
61
62
63
64
65
66
67
68
  # Print the start script (relative to the project root) for a service.
  # Arguments: $1 - service name
  # Returns:   0 and the script path on stdout for a known service;
  #            1 with no output for an unknown one.
  service_start_cmd() {
    local service="$1"
    case "${service}" in
      backend) echo "./scripts/start_backend.sh" ;;
      indexer) echo "./scripts/start_indexer.sh" ;;
      frontend) echo "./scripts/start_frontend.sh" ;;
      embedding) echo "./scripts/start_embedding_service.sh" ;;
      translator) echo "./scripts/start_translator.sh" ;;
      reranker) echo "./scripts/start_reranker.sh" ;;
      tei) echo "./scripts/start_tei_service.sh" ;;
      cnclip) echo "./scripts/start_cnclip_service.sh" ;;
      *) return 1 ;;
    esac
  }
  
af7ee060   tangwang   service_ctl 简化为“显...
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
  # Tell whether a name is one of the services this script manages.
  # Arguments: $1 - candidate service name
  # Returns:   0 when the name is known, 1 otherwise
  service_exists() {
    local candidate="$1"
    local known
    for known in backend indexer frontend embedding translator reranker tei cnclip; do
      if [ "${candidate}" = "${known}" ]; then
        return 0
      fi
    done
    return 1
  }
  
  # Check that every name in a whitespace-separated list is a known service.
  # Arguments: $1 - whitespace-separated service names
  # Outputs:   an error line on stderr for the first unknown name
  # Returns:   0 when all names are known (or the list is empty), 1 otherwise
  validate_targets() {
    local targets="$1"
    local -a names=()
    local svc
    # Split on whitespace without pathname expansion, so a target such as '*'
    # is reported as unknown instead of being expanded to file names.
    # read returns non-zero at end of input with -d '', hence the `|| true`.
    read -r -d '' -a names <<<"${targets}" || true
    if [ "${#names[@]}" -eq 0 ]; then
      return 0
    fi
    for svc in "${names[@]}"; do
      if ! service_exists "${svc}"; then
        echo "[error] unknown service: ${svc}" >&2
        return 1
      fi
    done
  }
  
d1d356f8   tangwang   脚本优化
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
  # Poll a service's HTTP health endpoint on 127.0.0.1 until it answers.
  # Globals:   HEALTH_CHECK_TIMEOUT_SEC (read, optional) - upper bound in
  #            seconds for a single probe; defaults to 5.
  # Arguments: $1 - service name
  #            $2 - maximum number of probes (default 30)
  #            $3 - seconds to sleep between probes (default 1)
  # Returns:   0 as soon as a probe succeeds, or immediately for a service
  #            with no HTTP probe defined here; 1 when all probes failed.
  wait_for_health() {
    local service="$1"
    local max_retries="${2:-30}"
    local interval_sec="${3:-1}"
    local probe_timeout="${HEALTH_CHECK_TIMEOUT_SEC:-5}"
    local port
    port="$(get_port "${service}")"
    local path

    case "${service}" in
      frontend) path="/" ;;
      backend|indexer|embedding|translator|reranker|tei) path="/health" ;;
      # Any other service has no probe here and is treated as healthy.
      *) return 0 ;;
    esac

    local attempt=0
    while [ "${attempt}" -lt "${max_retries}" ]; do
      # --max-time bounds each probe; without it a connection that is accepted
      # but never answered would block forever and defeat the retry budget.
      if curl -sf --max-time "${probe_timeout}" "http://127.0.0.1:${port}${path}" >/dev/null 2>&1; then
        return 0
      fi
      attempt=$((attempt + 1))
      sleep "${interval_sec}"
    done
    return 1
  }
  
9f5994b4   tangwang   reranker
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
  # Require several consecutive successful checks of a service: each round
  # needs is_running_by_port to succeed and GET /health on 127.0.0.1 to
  # succeed, followed by a pause.
  # Arguments: $1 - service name
  #            $2 - number of consecutive checks required (default 3)
  #            $3 - seconds to sleep after each check (default 1)
  # Returns:   0 when every check passed, 1 on the first failed check
  wait_for_stable_health() {
    local service="$1"
    local checks="${2:-3}"
    local interval_sec="${3:-1}"
    local port
    port="$(get_port "${service}")"
    local health_url="http://127.0.0.1:${port}/health"

    local passed=0
    while [ "${passed}" -lt "${checks}" ]; do
      is_running_by_port "${service}" || return 1
      curl -sf "${health_url}" >/dev/null 2>&1 || return 1
      passed=$((passed + 1))
      sleep "${interval_sec}"
    done
    return 0
  }
  
d1d356f8   tangwang   脚本优化
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
  # Tell whether the process recorded in a service's pid file is alive.
  # Arguments: $1 - service name
  # Returns:   0 when the pid file exists, is non-empty, and `kill -0` on the
  #            recorded pid succeeds; non-zero otherwise
  is_running_by_pid() {
    local service="$1"
    local pid_path recorded_pid
    pid_path="$(pid_file "${service}")"
    [ -f "${pid_path}" ] || return 1
    recorded_pid="$(cat "${pid_path}" 2>/dev/null || true)"
    if [ -z "${recorded_pid}" ]; then
      return 1
    fi
    # Signal 0 only checks that the process can be signalled; nothing is sent.
    kill -0 "${recorded_pid}" 2>/dev/null
  }
  
  # Tell whether some process is listening on a service's TCP port.
  # Arguments: $1 - service name
  # Returns:   0 when lsof reports a listener on the port; non-zero when the
  #            service has no port or nothing is listening
  is_running_by_port() {
    local service="$1"
    local port
    port="$(get_port "${service}")"
    # -sTCP:LISTEN restricts the match to listening sockets. A plain -i:PORT
    # also matches client connections to that port, which would report the
    # service as running because some other process is talking to the port.
    [ -n "${port}" ] && lsof -t -iTCP:"${port}" -sTCP:LISTEN >/dev/null 2>&1
  }
  
07cf5a93   tangwang   START_EMBEDDING=...
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
  # Tell whether the TEI docker container is currently running.
  # Globals: TEI_CONTAINER_NAME (read, optional; defaults to saas-search-tei)
  # Returns: 0 when `docker ps` lists a container whose name matches the
  #          anchored filter, non-zero otherwise (including when docker fails)
  is_running_tei_container() {
    local tei_name="${TEI_CONTAINER_NAME:-saas-search-tei}"
    local cid
    # The filter is quoted so it always reaches docker as a single argument,
    # whatever characters the container name contains.
    cid="$(docker ps -q -f "name=^/${tei_name}\$" 2>/dev/null || true)"
    [ -n "${cid}" ]
  }
  
  # Print the first single-quoted `device:` value found in the cnclip flow
  # file under third-party/clip-as-service/server.
  # Globals: PROJECT_ROOT (read)
  # Outputs: the device string, or nothing when no line matches
  # Returns: 1 when the flow file does not exist
  get_cnclip_flow_device() {
    local yaml_path="${PROJECT_ROOT}/third-party/clip-as-service/server/torch-flow-temp.yml"
    [ -f "${yaml_path}" ] || return 1
    # Matches lines of the form:   device: 'VALUE'   (single quotes only).
    local device_re="^[[:space:]]*device:[[:space:]]*'\\([^']*\\)'.*"
    sed -n "s/${device_re}/\\1/p" "${yaml_path}" | head -n 1
  }
  
d1d356f8   tangwang   脚本优化
179
180
181
182
  # Start a single service and report the outcome.
  #
  # Flow:
  #   1. Resolve the start script; an unknown service is an error.
  #   2. Prepare today's log target.
  #   3. For every service except tei: when it is already running (by pid file
  #      or by port) print "[skip]" and return 0. For cnclip, first verify
  #      that the device recorded in the flow file matches CNCLIP_DEVICE.
  #   4. Launch:
  #      - cnclip, tei: run the start script in the foreground, then confirm
  #        it is running (tei via its docker container, cnclip via pid/port).
  #      - backend, indexer, frontend, embedding, translator: run the start
  #        script under nohup in the background, record its pid, wait for the
  #        health endpoint.
  #      - reranker: as above, with a longer health wait followed by a
  #        stability check.
  #   All launch output is routed through scripts/daily_log_router.sh.
  #
  # Globals:   PROJECT_ROOT, LOG_DIR, LOG_RETENTION_DAYS, CNCLIP_DEVICE (read)
  # Arguments: $1 - service name
  # Returns:   0 when the service is running/healthy or was already running;
  #            1 on any failure (messages go to stderr)
  start_one() {
    local service="$1"
    # Start scripts are given as paths relative to the project root.
    cd "${PROJECT_ROOT}"
    local cmd
    if ! cmd="$(service_start_cmd "${service}")"; then
      echo "[error] unknown service: ${service}" >&2
      return 1
    fi
    local pf lf
    pf="$(pid_file "${service}")"
    lf="$(log_file "${service}")"
    prepare_daily_log_target "${service}"

    # tei is excluded from the pid/port check and always goes to its script.
    if [ "${service}" != "tei" ]; then
      if is_running_by_pid "${service}" || is_running_by_port "${service}"; then
        if [ "${service}" = "cnclip" ]; then
          local expected_device="${CNCLIP_DEVICE:-cuda}"
          expected_device="$(echo "${expected_device}" | tr '[:upper:]' '[:lower:]')"
          if [[ "${expected_device}" != "cuda" && "${expected_device}" != "cpu" ]]; then
            echo "[error] invalid CNCLIP_DEVICE=${CNCLIP_DEVICE}; use cuda/cpu" >&2
            return 1
          fi
          local actual_device
          actual_device="$(get_cnclip_flow_device 2>/dev/null || true)"
          # A running cnclip on a different device is an error, not a skip.
          if [ -n "${actual_device}" ] && [ "${actual_device}" != "${expected_device}" ]; then
            echo "[error] cnclip already running with device=${actual_device}, expected=${expected_device}" >&2
            echo "[error] run: ./scripts/service_ctl.sh stop cnclip && CNCLIP_DEVICE=${expected_device} ./scripts/service_ctl.sh start cnclip" >&2
            return 1
          fi
        fi
        echo "[skip] ${service} already running"
        return 0
      fi
    fi

    case "${service}" in
      cnclip|tei)
        echo "[start] ${service} (managed by native script)"
        if [ "${service}" = "cnclip" ]; then
          if ! CNCLIP_DEVICE="${CNCLIP_DEVICE:-cuda}" "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1; then
            echo "[error] ${service} start script failed, inspect ${lf}" >&2
            return 1
          fi
        else
          if ! "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1; then
            echo "[error] ${service} start script failed, inspect ${lf}" >&2
            return 1
          fi
        fi
        if [ "${service}" = "tei" ]; then
          if is_running_tei_container; then
            echo "[ok] ${service} started (log=${lf})"
          else
            echo "[error] ${service} failed to start, inspect ${lf}" >&2
            return 1
          fi
        elif is_running_by_pid "${service}" || is_running_by_port "${service}"; then
          echo "[ok] ${service} started (log=${lf})"
        else
          echo "[error] ${service} failed to start, inspect ${lf}" >&2
          return 1
        fi
        ;;
      backend|indexer|frontend|embedding|translator)
        echo "[start] ${service}"
        nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
        local pid=$!
        echo "${pid}" > "${pf}"
        local retries=30
        if wait_for_health "${service}" "${retries}"; then
          echo "[ok] ${service} healthy (pid=${pid}, log=${lf})"
        else
          echo "[error] ${service} health check timeout, inspect ${lf}" >&2
          return 1
        fi
        ;;
      reranker)
        echo "[start] ${service}"
        # Start reranker directly so pid file points to the script process that
        # will exec uvicorn, avoiding extra shell wrapper lifecycle edge-cases.
        nohup "${cmd}" > >("${PROJECT_ROOT}/scripts/daily_log_router.sh" "${service}" "${LOG_DIR}" "${LOG_RETENTION_DAYS}") 2>&1 &
        local pid=$!
        echo "${pid}" > "${pf}"
        if wait_for_health "${service}" 90; then
          if wait_for_stable_health "${service}" 5 1; then
            echo "[ok] ${service} healthy (pid=${pid}, log=${lf})"
          else
            echo "[error] ${service} became unavailable right after startup, inspect ${lf}" >&2
            return 1
          fi
        else
          echo "[error] ${service} health check timeout, inspect ${lf}" >&2
          return 1
        fi
        ;;
      *)
        echo "[warn] ${service} unsupported start path"
        ;;
    esac
  }
  
af7ee060   tangwang   service_ctl 简化为“显...
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
  # Terminate leftover processes whose command line matches
  # 'VLLM::EngineCore': send TERM, wait one second, then KILL whatever still
  # matches.
  # NOTE(review): the pgrep pattern is not scoped to this project, so any
  # process on the host matching it is signalled - confirm this is intended.
  # Outputs: a "[stop]" line listing the pids found, when there are any
  # Returns: 0 (signal failures are ignored)
  cleanup_reranker_orphans() {
    local engine_pids
    # Declared local so the loops below do not overwrite a caller's `pid`.
    local pid
    engine_pids="$(pgrep -f 'VLLM::EngineCore' 2>/dev/null || true)"
    if [ -z "${engine_pids}" ]; then
      return 0
    fi

    echo "[stop] reranker orphan engines=${engine_pids}"
    for pid in ${engine_pids}; do
      kill -TERM "${pid}" 2>/dev/null || true
    done
    sleep 1
    engine_pids="$(pgrep -f 'VLLM::EngineCore' 2>/dev/null || true)"
    for pid in ${engine_pids}; do
      kill -KILL "${pid}" 2>/dev/null || true
    done
  }
  
d1d356f8   tangwang   脚本优化
298
299
300
  # Stop a single service.
  #
  # cnclip and tei are delegated to their own stop scripts (failures ignored).
  # For every other service:
  #   1. When a pid file exists: TERM the recorded pid if it is alive, wait
  #      one second, KILL it if it is still alive, then remove the pid file.
  #   2. TERM every process still listening on the service's port, wait one
  #      second, and KILL whatever is still listening.
  #   3. For reranker, additionally clean up orphaned engine processes.
  #
  # Globals:   PROJECT_ROOT (read)
  # Arguments: $1 - service name
  # Returns:   0 (signal failures are ignored)
  stop_one() {
    local service="$1"
    # Declared up front so the loop variable never leaks into the caller, even
    # when the pid-file branch is skipped.
    local pf pid pids port
    cd "${PROJECT_ROOT}"
    if [ "${service}" = "cnclip" ]; then
      echo "[stop] cnclip (managed by native script)"
      bash -lc "./scripts/stop_cnclip_service.sh" || true
      return 0
    fi
    if [ "${service}" = "tei" ]; then
      echo "[stop] tei (managed by native script)"
      bash -lc "./scripts/stop_tei_service.sh" || true
      return 0
    fi

    pf="$(pid_file "${service}")"

    if [ -f "${pf}" ]; then
      pid="$(cat "${pf}" 2>/dev/null || true)"
      if [ -n "${pid}" ] && kill -0 "${pid}" 2>/dev/null; then
        echo "[stop] ${service} pid=${pid}"
        kill -TERM "${pid}" 2>/dev/null || true
        sleep 1
        if kill -0 "${pid}" 2>/dev/null; then
          kill -KILL "${pid}" 2>/dev/null || true
        fi
      fi
      rm -f "${pf}"
    fi

    port="$(get_port "${service}")"
    if [ -n "${port}" ]; then
      # Only processes LISTENING on the port are targeted. A plain -i:PORT
      # also returns processes that merely hold a client connection to it, and
      # those would be killed along with the service.
      pids="$(lsof -t -iTCP:"${port}" -sTCP:LISTEN 2>/dev/null || true)"
      if [ -n "${pids}" ]; then
        echo "[stop] ${service} port=${port} pids=${pids}"
        for pid in ${pids}; do
          kill -TERM "${pid}" 2>/dev/null || true
        done
        sleep 1
        pids="$(lsof -t -iTCP:"${port}" -sTCP:LISTEN 2>/dev/null || true)"
        for pid in ${pids}; do
          kill -KILL "${pid}" 2>/dev/null || true
        done
      fi
    fi

    if [ "${service}" = "reranker" ]; then
      cleanup_reranker_orphans
    fi
  }
  
  # Print a one-line status report for a service:
  #   <service> running=<yes|no> port=<port or -> pid=<pid info or ->
  #
  # tei is looked up as a docker container; its pid column shows the first 12
  # characters of the container id. For other services the pid column shows
  # the pid-file value when that process is alive, otherwise the
  # comma-separated pids listening on the service's port.
  #
  # Globals:   TEI_CONTAINER_NAME (read, optional)
  # Arguments: $1 - service name
  status_one() {
    local service="$1"
    local port
    port="$(get_port "${service}")"
    local running="no"
    local pid_info="-"

    if [ "${service}" = "tei" ]; then
      local cid
      local tei_name="${TEI_CONTAINER_NAME:-saas-search-tei}"
      # Quoted so the filter always reaches docker as a single argument.
      cid="$(docker ps -q -f "name=^/${tei_name}\$" 2>/dev/null || true)"
      if [ -n "${cid}" ]; then
        running="yes"
        pid_info="${cid:0:12}"
      fi
      printf "%-10s running=%-3s port=%-6s pid=%s\n" "${service}" "${running}" "${port:--}" "${pid_info}"
      return
    fi

    if is_running_by_pid "${service}"; then
      running="yes"
      pid_info="$(cat "$(pid_file "${service}")" 2>/dev/null || echo "-")"
    elif is_running_by_port "${service}"; then
      running="yes"
      # Report listeners only; without -sTCP:LISTEN the list would also
      # include client processes connected to the port.
      pid_info="$(lsof -t -iTCP:"${port}" -sTCP:LISTEN 2>/dev/null | tr '\n' ',' | sed 's/,$//' || echo "-")"
    fi

    printf "%-10s running=%-3s port=%-6s pid=%s\n" "${service}" "${running}" "${port:--}" "${pid_info}"
  }
  
  # Print the space-separated list of services an action applies to.
  # Arguments: $1  - action (start / stop / restart / status)
  #            $2+ - explicit service names (optional)
  # Outputs:   the explicit names when any were given; otherwise the default
  #            set for the action:
  #              start, restart -> the core services
  #              stop, status   -> all known services
  #              anything else  -> an empty line
  # Globals:   CORE_SERVICES (read)
  resolve_targets() {
    local scope="$1"
    shift || true

    if [ "$#" -gt 0 ]; then
      echo "$*"
      return
    fi

    case "${scope}" in
      # Restart with no explicit services uses the same targets as start.
      start|restart) echo "${CORE_SERVICES[@]}" ;;
      stop|status) all_services ;;
      *) echo "" ;;
    esac
  }
  
  # Print the command-line help text to stdout.
  # printf with one argument per line is used instead of a here-document so
  # the output does not depend on how this function itself is indented.
  usage() {
    printf '%s\n' \
      'Usage:' \
      '  ./scripts/service_ctl.sh start [service...]' \
      '  ./scripts/service_ctl.sh stop [service...]' \
      '  ./scripts/service_ctl.sh restart [service...]' \
      '  ./scripts/service_ctl.sh status [service...]' \
      '' \
      'Default target set (when no service provided):' \
      '  start   -> backend indexer frontend' \
      '  stop    -> all known services' \
      '  restart -> stop all known services, then start with start targets' \
      '  status  -> all known services' \
      '' \
      'Optional service startup:' \
      '  ./scripts/service_ctl.sh start tei cnclip embedding translator reranker' \
      '  TEI_DEVICE=cuda|cpu ./scripts/service_ctl.sh start tei' \
      '  CNCLIP_DEVICE=cuda|cpu ./scripts/service_ctl.sh start cnclip' \
      '' \
      'Log retention:' \
      '  LOG_RETENTION_DAYS=30 ./scripts/service_ctl.sh start'
  }
  
  # Command-line entry point.
  # Arguments: $1  - action: start | stop | restart | status
  #            $2+ - optional service names; when omitted the per-action
  #                  default set from resolve_targets is used
  # Behavior:  loads ${PROJECT_ROOT}/.env, resolves and validates the target
  #            list, then runs the action on each target in order. With no
  #            arguments, an unknown action, or an empty target list it prints
  #            the usage text and exits 1. Exits 1 when a target is unknown.
  main() {
    if [ "$#" -lt 1 ]; then
      usage
      exit 1
    fi

    local action="$1"
    shift || true

    load_env_file "${PROJECT_ROOT}/.env"
    local stop_targets=""
    local targets
    local svc
    # For restart without explicit services, stop everything first, then start
    # with the default start target order.
    if [ "${action}" = "restart" ] && [ "$#" -eq 0 ]; then
      stop_targets="$(resolve_targets stop)"
    fi
    targets="$(resolve_targets "${action}" "$@")"
    if [ -z "${targets}" ]; then
      usage
      exit 1
    fi
    # Exit explicitly instead of relying on `set -e` to abort on a bad target.
    validate_targets "${targets}" || exit 1

    # The target lists are space-separated strings; the unquoted expansions
    # below split them into individual service names on purpose.
    case "${action}" in
      start)
        for svc in ${targets}; do
          start_one "${svc}"
        done
        ;;
      stop)
        for svc in ${targets}; do
          stop_one "${svc}"
        done
        ;;
      restart)
        # stop_targets is only set for a bare `restart`; otherwise stop
        # exactly the services that will be started again.
        for svc in ${stop_targets:-${targets}}; do
          stop_one "${svc}"
        done
        for svc in ${targets}; do
          start_one "${svc}"
        done
        ;;
      status)
        for svc in ${targets}; do
          status_one "${svc}"
        done
        ;;
      *)
        usage
        exit 1
        ;;
    esac
  }
  
  # Run main with the script's command-line arguments, each passed through as
  # a separate word.
  main "$@"