Commit 2260eed2c7a97ae9c09b3af45f455dfd48c1928a
1 parent
a7bb846c
推送报警到微信群webhook
Showing
2 changed files
with
123 additions
and
0 deletions
Show diff stats
scripts/service_ctl.sh
| @@ -284,6 +284,7 @@ monitor_services() { | @@ -284,6 +284,7 @@ monitor_services() { | ||
| 284 | local fail_threshold="${MONITOR_FAIL_THRESHOLD:-3}" | 284 | local fail_threshold="${MONITOR_FAIL_THRESHOLD:-3}" |
| 285 | local restart_cooldown_sec="${MONITOR_RESTART_COOLDOWN_SEC:-30}" | 285 | local restart_cooldown_sec="${MONITOR_RESTART_COOLDOWN_SEC:-30}" |
| 286 | local max_restarts_per_hour="${MONITOR_MAX_RESTARTS_PER_HOUR:-6}" | 286 | local max_restarts_per_hour="${MONITOR_MAX_RESTARTS_PER_HOUR:-6}" |
| 287 | + local wechat_alert_py="${PROJECT_ROOT}/scripts/wechat_alert.py" | ||
| 287 | 288 | ||
| 288 | require_positive_int "MONITOR_INTERVAL_SEC" "${interval_sec}" | 289 | require_positive_int "MONITOR_INTERVAL_SEC" "${interval_sec}" |
| 289 | require_positive_int "MONITOR_FAIL_THRESHOLD" "${fail_threshold}" | 290 | require_positive_int "MONITOR_FAIL_THRESHOLD" "${fail_threshold}" |
| @@ -350,10 +351,22 @@ monitor_services() { | @@ -350,10 +351,22 @@ monitor_services() { | ||
| 350 | 351 | ||
| 351 | if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then | 352 | if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then |
| 352 | monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)" | 353 | monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)" |
| 354 | + if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then | ||
| 355 | + python "${wechat_alert_py}" \ | ||
| 356 | + --service "${svc}" \ | ||
| 357 | + --level "error" \ | ||
| 358 | + --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。" | ||
| 359 | + fi | ||
| 353 | continue | 360 | continue |
| 354 | fi | 361 | fi |
| 355 | 362 | ||
| 356 | monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures" | 363 | monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures" |
| 364 | + if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then | ||
| 365 | + python "${wechat_alert_py}" \ | ||
| 366 | + --service "${svc}" \ | ||
| 367 | + --level "error" \ | ||
| 368 | + --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。" | ||
| 369 | + fi | ||
| 357 | if stop_one "${svc}" && start_one "${svc}"; then | 370 | if stop_one "${svc}" && start_one "${svc}"; then |
| 358 | fail_streak["${svc}"]=0 | 371 | fail_streak["${svc}"]=0 |
| 359 | last_restart_epoch["${svc}"]="${now}" | 372 | last_restart_epoch["${svc}"]="${now}" |
| @@ -363,6 +376,12 @@ monitor_services() { | @@ -363,6 +376,12 @@ monitor_services() { | ||
| 363 | last_restart_epoch["${svc}"]="${now}" | 376 | last_restart_epoch["${svc}"]="${now}" |
| 364 | restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" | 377 | restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" |
| 365 | monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")" | 378 | monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")" |
| 379 | + if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then | ||
| 380 | + python "${wechat_alert_py}" \ | ||
| 381 | + --service "${svc}" \ | ||
| 382 | + --level "error" \ | ||
| 383 | + --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")." | ||
| 384 | + fi | ||
| 366 | fi | 385 | fi |
| 367 | done | 386 | done |
| 368 | sleep "${interval_sec}" | 387 | sleep "${interval_sec}" |
| @@ -0,0 +1,104 @@ | @@ -0,0 +1,104 @@ | ||
| 1 | +#!/usr/bin/env python | ||
| 2 | +""" | ||
| 3 | +Lightweight Enterprise WeChat webhook sender for service monitor alerts. | ||
| 4 | + | ||
| 5 | +This module is intentionally small and focused so that Bash-based monitors | ||
| 6 | +can invoke it without pulling in the full application stack. | ||
| 7 | + | ||
| 8 | +Usage example: | ||
| 9 | + python scripts/wechat_alert.py --service backend --level error --message "backend restarted" | ||
| 10 | +""" | ||
| 11 | + | ||
| 12 | +import argparse | ||
| 13 | +import json | ||
| 14 | +import os | ||
| 15 | +import sys | ||
| 16 | +from datetime import datetime | ||
| 17 | + | ||
| 18 | +import requests | ||
| 19 | + | ||
| 20 | + | ||
def get_webhook_url() -> str:
    """Return the Enterprise WeChat webhook URL that alerts are posted to.

    The ``SERVICE_MONITOR_WECHAT_WEBHOOK`` environment variable takes
    precedence when it holds a non-blank value (surrounding whitespace is
    stripped). Otherwise the built-in default is returned, so the result is
    never an empty string.
    """
    # NOTE(review): this fallback embeds what looks like a live webhook key in
    # source control. Consider rotating it and supplying the URL through the
    # environment only.
    default_url = (
        "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?"
        "key=2d9e38ef-9242-4e2e-82cc-fab060322871"
    )
    configured = os.getenv("SERVICE_MONITOR_WECHAT_WEBHOOK", "").strip()
    return configured or default_url
| 37 | + | ||
| 38 | + | ||
def build_text_payload(message: str) -> dict:
    """Build the JSON body of a WeCom group-robot ``text`` message.

    WeCom caps text content at 2048 bytes of UTF-8. Longer messages are
    truncated here on a character boundary so the alert is still delivered
    instead of being rejected by the API.

    Args:
        message: The alert text to send.

    Returns:
        A dict of the form ``{"msgtype": "text", "text": {"content": ...}}``.
    """
    max_content_bytes = 2048
    # errors="replace" keeps the length check from raising on lone surrogates
    # (possible in argv-derived text); short messages are passed through
    # untouched.
    encoded = message.encode("utf-8", errors="replace")
    if len(encoded) > max_content_bytes:
        # errors="ignore" drops a multi-byte character cut in half by the slice.
        message = encoded[:max_content_bytes].decode("utf-8", errors="ignore")
    return {
        "msgtype": "text",
        "text": {
            "content": message,
        },
    }
| 46 | + | ||
| 47 | + | ||
def send_wechat_message(message: str) -> None:
    """Post *message* to the WeCom webhook on a best-effort basis.

    This function never raises, so a failing alert cannot break the calling
    monitor. Delivery problems (no URL, transport error, non-200 status,
    non-zero ``errcode``) are reported on stderr instead of being dropped
    silently.

    Args:
        message: Fully formatted alert text.
    """

    def _report(reason: str) -> None:
        # Diagnostics are best-effort too: a closed or broken stderr must not
        # turn into an exception for the caller.
        try:
            print(f"wechat_alert: {reason}", file=sys.stderr)
        except (OSError, ValueError):
            pass

    url = get_webhook_url()
    if not url:
        _report("no webhook URL configured; alert not sent")
        return

    payload = build_text_payload(message)
    headers = {"Content-Type": "application/json"}

    try:
        resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=5)
        if resp.status_code != 200:
            _report(f"webhook returned HTTP {resp.status_code}; alert not delivered")
            return
        # errcode == 0 means success per WeCom docs
        errcode = int(resp.json().get("errcode", -1))
    except Exception as exc:
        # Log only the exception type: requests error messages can include the
        # full request URL, which carries the webhook key.
        _report(f"failed to deliver alert ({type(exc).__name__})")
        return

    if errcode != 0:
        _report(f"webhook rejected alert (errcode={errcode})")
| 72 | + | ||
| 73 | + | ||
def main(argv: "list[str] | None" = None) -> int:
    """Parse command-line arguments, format the alert text, and send it.

    The annotation on *argv* is quoted so the module still imports on Python
    versions older than 3.10, where ``list[str] | None`` fails when evaluated
    at definition time.

    Args:
        argv: Arguments to parse. ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        Always 0; sending is best-effort and does not affect the exit code.

    Raises:
        SystemExit: Raised by argparse on invalid arguments (for example a
            missing ``--message``) or ``--help``.
    """
    parser = argparse.ArgumentParser(description="Send Enterprise WeChat alert message")
    parser.add_argument("--service", help="service name", default="")
    parser.add_argument("--level", help="alert level (info|warn|error)", default="info")
    parser.add_argument(
        "--message",
        required=True,
        help="alert message body (short, human-readable)",
    )

    args = parser.parse_args(argv)

    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    parts = [
        "【服务监控告警】",
        f"时间: {ts}",
    ]
    # Service and level lines are optional; empty values are left out.
    if args.service:
        parts.append(f"服务: {args.service}")
    if args.level:
        parts.append(f"级别: {args.level}")
    parts.append(f"详情: {args.message}")

    send_wechat_message("\n".join(parts))
    return 0
| 100 | + | ||
| 101 | + | ||
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())
| 104 | + |