Commit 2260eed2c7a97ae9c09b3af45f455dfd48c1928a
1 parent
a7bb846c
推送报警到微信群webhook
Showing
2 changed files
with
123 additions
and
0 deletions
Show diff stats
scripts/service_ctl.sh
| ... | ... | @@ -284,6 +284,7 @@ monitor_services() { |
| 284 | 284 | local fail_threshold="${MONITOR_FAIL_THRESHOLD:-3}" |
| 285 | 285 | local restart_cooldown_sec="${MONITOR_RESTART_COOLDOWN_SEC:-30}" |
| 286 | 286 | local max_restarts_per_hour="${MONITOR_MAX_RESTARTS_PER_HOUR:-6}" |
| 287 | + local wechat_alert_py="${PROJECT_ROOT}/scripts/wechat_alert.py" | |
| 287 | 288 | |
| 288 | 289 | require_positive_int "MONITOR_INTERVAL_SEC" "${interval_sec}" |
| 289 | 290 | require_positive_int "MONITOR_FAIL_THRESHOLD" "${fail_threshold}" |
| ... | ... | @@ -350,10 +351,22 @@ monitor_services() { |
| 350 | 351 | |
| 351 | 352 | if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then |
| 352 | 353 | monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)" |
| 354 | + if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then | |
| 355 | + python "${wechat_alert_py}" \ | |
| 356 | + --service "${svc}" \ | |
| 357 | + --level "error" \ | |
| 358 | + --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。" | |
| 359 | + fi | |
| 353 | 360 | continue |
| 354 | 361 | fi |
| 355 | 362 | |
| 356 | 363 | monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures" |
| 364 | + if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then | |
| 365 | + python "${wechat_alert_py}" \ | |
| 366 | + --service "${svc}" \ | |
| 367 | + --level "error" \ | |
| 368 | + --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。" | |
| 369 | + fi | |
| 357 | 370 | if stop_one "${svc}" && start_one "${svc}"; then |
| 358 | 371 | fail_streak["${svc}"]=0 |
| 359 | 372 | last_restart_epoch["${svc}"]="${now}" |
| ... | ... | @@ -363,6 +376,12 @@ monitor_services() { |
| 363 | 376 | last_restart_epoch["${svc}"]="${now}" |
| 364 | 377 | restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}" |
| 365 | 378 | monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")" |
| 379 | + if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then | |
| 380 | + python "${wechat_alert_py}" \ | |
| 381 | + --service "${svc}" \ | |
| 382 | + --level "error" \ | |
| 383 | + --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")." | |
| 384 | + fi | |
| 366 | 385 | fi |
| 367 | 386 | done |
| 368 | 387 | sleep "${interval_sec}" | ... | ... |
| ... | ... | @@ -0,0 +1,104 @@ |
| 1 | +#!/usr/bin/env python | |
| 2 | +""" | |
| 3 | +Lightweight Enterprise WeChat webhook sender for service monitor alerts. | |
| 4 | + | |
| 5 | +This module is intentionally small and focused so that Bash-based monitors | |
| 6 | +can invoke it without pulling in the full application stack. | |
| 7 | + | |
| 8 | +Usage example: | |
| 9 | + python scripts/wechat_alert.py --service backend --level error --message "backend restarted" | |
| 10 | +""" | |
| 11 | + | |
| 12 | +import argparse | |
| 13 | +import json | |
| 14 | +import os | |
| 15 | +import sys | |
| 16 | +from datetime import datetime | |
| 17 | + | |
| 18 | +import requests | |
| 19 | + | |
| 20 | + | |
def get_webhook_url() -> str:
    """Return the Enterprise WeChat webhook URL used for monitor alerts.

    The ``SERVICE_MONITOR_WECHAT_WEBHOOK`` environment variable takes
    priority when it holds a non-blank value (surrounding whitespace is
    stripped). Otherwise the built-in default URL is returned, so the result
    is never an empty string.
    """
    # NOTE(review): the fallback embeds a webhook key in source control, so
    # anyone who can read the repository can post to that group. Consider
    # rotating the key and supplying it only through the environment.
    fallback_url = (
        "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?"
        "key=2d9e38ef-9242-4e2e-82cc-fab060322871"
    )
    configured = os.getenv("SERVICE_MONITOR_WECHAT_WEBHOOK", "").strip()
    return configured or fallback_url
| 37 | + | |
| 38 | + | |
def build_text_payload(message: str) -> dict:
    """Wrap *message* in the structure of a webhook ``text`` message.

    The returned dict has ``msgtype`` set to ``"text"`` and carries the
    message unchanged under ``text`` -> ``content``.
    """
    text_section = {"content": message}
    return {"msgtype": "text", "text": text_section}
| 46 | + | |
| 47 | + | |
def send_wechat_message(message: str) -> None:
    """Post *message* to the configured webhook as a text message.

    Delivery is best-effort: errors from the request and from response
    handling are caught here so that a failed alert cannot break the calling
    monitor. Each failure is reported with a short diagnostic on stderr
    instead of being dropped silently.

    Args:
        message: Full text of the alert to deliver.
    """
    url = get_webhook_url()
    if not url:
        # No webhook configured; nothing to send.
        return

    body = json.dumps(build_text_payload(message))
    headers = {"Content-Type": "application/json"}

    try:
        resp = requests.post(url, headers=headers, data=body, timeout=5)
    except Exception as exc:
        # Boundary handler: keep the caller alive whatever goes wrong. Only
        # the exception type is printed because the exception text can
        # contain the request URL, which includes the webhook key.
        print(
            f"wechat_alert: request failed ({type(exc).__name__})",
            file=sys.stderr,
        )
        return

    try:
        if resp.status_code != 200:
            print(
                f"wechat_alert: unexpected HTTP status {resp.status_code}",
                file=sys.stderr,
            )
            return
        data = resp.json()
        # errcode == 0 means success per WeCom docs
        errcode = int(data.get("errcode", -1))
        if errcode != 0:
            print(
                f"wechat_alert: webhook rejected message "
                f"(errcode={errcode}, errmsg={data.get('errmsg')!r})",
                file=sys.stderr,
            )
    except Exception as exc:
        # Covers a non-JSON body, a non-dict JSON document, or a
        # non-numeric errcode.
        print(
            f"wechat_alert: could not interpret webhook response "
            f"({type(exc).__name__})",
            file=sys.stderr,
        )
| 72 | + | |
| 73 | + | |
def main(argv: list[str] | None = None) -> int:
    """Parse command-line options, format the alert text and send it.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read them from the
            process command line.

    Returns:
        Always 0 once argument parsing has succeeded.
    """
    cli = argparse.ArgumentParser(description="Send Enterprise WeChat alert message")
    cli.add_argument("--service", default="", help="service name")
    cli.add_argument("--level", default="info", help="alert level (info|warn|error)")
    cli.add_argument(
        "--message",
        required=True,
        help="alert message body (short, human-readable)",
    )
    opts = cli.parse_args(argv)

    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines = ["【服务监控告警】", f"时间: {stamp}"]
    # Service and level lines are omitted when given as empty strings.
    if opts.service:
        lines.append(f"服务: {opts.service}")
    if opts.level:
        lines.append(f"级别: {opts.level}")
    lines.append(f"详情: {opts.message}")

    send_wechat_message("\n".join(lines))
    return 0
| 100 | + | |
| 101 | + | |
# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())
| 104 | + | ... | ... |