Commit 2260eed2c7a97ae9c09b3af45f455dfd48c1928a

Authored by tangwang
1 parent a7bb846c

推送报警到微信群webhook

Showing 2 changed files with 123 additions and 0 deletions   Show diff stats
scripts/service_ctl.sh
... ... @@ -284,6 +284,7 @@ monitor_services() {
284 284 local fail_threshold="${MONITOR_FAIL_THRESHOLD:-3}"
285 285 local restart_cooldown_sec="${MONITOR_RESTART_COOLDOWN_SEC:-30}"
286 286 local max_restarts_per_hour="${MONITOR_MAX_RESTARTS_PER_HOUR:-6}"
  287 + local wechat_alert_py="${PROJECT_ROOT}/scripts/wechat_alert.py"
287 288  
288 289 require_positive_int "MONITOR_INTERVAL_SEC" "${interval_sec}"
289 290 require_positive_int "MONITOR_FAIL_THRESHOLD" "${fail_threshold}"
... ... @@ -350,10 +351,22 @@ monitor_services() {
350 351  
351 352 if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then
352 353 monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)"
  354 + if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
  355 + python "${wechat_alert_py}" \
  356 + --service "${svc}" \
  357 + --level "error" \
  358 + --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。"
  359 + fi
353 360 continue
354 361 fi
355 362  
356 363 monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures"
  364 + if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
  365 + python "${wechat_alert_py}" \
  366 + --service "${svc}" \
  367 + --level "error" \
  368 + --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。"
  369 + fi
357 370 if stop_one "${svc}" && start_one "${svc}"; then
358 371 fail_streak["${svc}"]=0
359 372 last_restart_epoch["${svc}"]="${now}"
... ... @@ -363,6 +376,12 @@ monitor_services() {
363 376 last_restart_epoch["${svc}"]="${now}"
364 377 restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}"
365 378 monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")"
  379 + if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
  380 + python "${wechat_alert_py}" \
  381 + --service "${svc}" \
  382 + --level "error" \
  383 + --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")."
  384 + fi
366 385 fi
367 386 done
368 387 sleep "${interval_sec}"
... ...
scripts/wechat_alert.py 0 → 100644
... ... @@ -0,0 +1,104 @@
  1 +#!/usr/bin/env python
  2 +"""
  3 +Lightweight Enterprise WeChat webhook sender for service monitor alerts.
  4 +
  5 +This module is intentionally small and focused so that Bash-based monitors
  6 +can invoke it without pulling in the full application stack.
  7 +
  8 +Usage example:
  9 + python scripts/wechat_alert.py --service backend --level error --message "backend restarted"
  10 +"""
  11 +
  12 +import argparse
  13 +import json
  14 +import os
  15 +import sys
  16 +from datetime import datetime
  17 +
  18 +import requests
  19 +
  20 +
def get_webhook_url() -> str:
    """
    Resolve the Enterprise WeChat webhook URL from the environment.

    The URL embeds the robot's secret ``key``, so it is read only from the
    ``SERVICE_MONITOR_WECHAT_WEBHOOK`` environment variable and is never
    stored in the source tree.

    Returns:
        The webhook URL with surrounding whitespace stripped, or an empty
        string when the variable is unset or blank. ``send_wechat_message``
        treats an empty URL as "no webhook configured" and skips sending.
    """
    # NOTE(ops): a webhook key used to be hard-coded here as a fallback.
    # Treat that key as exposed: rotate it and supply the replacement via
    # SERVICE_MONITOR_WECHAT_WEBHOOK in the monitor's environment.
    return os.getenv("SERVICE_MONITOR_WECHAT_WEBHOOK", "").strip()
  37 +
  38 +
def build_text_payload(message: str) -> dict:
    """Wrap *message* in the ``text``-type message envelope for the webhook."""
    text_section = {"content": message}
    envelope = {"msgtype": "text"}
    envelope["text"] = text_section
    return envelope
  46 +
  47 +
def send_wechat_message(message: str) -> None:
    """
    Post *message* as a text message to the configured webhook.

    Delivery is best-effort: this function never raises, so a monitor that
    shells out to this script cannot be broken by an alerting problem.
    Failures are reported as a one-line diagnostic on stderr instead of
    being dropped silently.

    Args:
        message: Full text of the alert to deliver.
    """

    def _warn(reason: str) -> None:
        # Diagnostics are best-effort too: an unusable stderr must not turn
        # into an exception for the caller.
        try:
            print(f"wechat_alert: {reason}", file=sys.stderr)
        except Exception:
            pass

    url = get_webhook_url()
    if not url:
        _warn("no webhook URL configured; alert not sent")
        return

    payload = build_text_payload(message)
    headers = {"Content-Type": "application/json"}

    try:
        resp = requests.post(url, headers=headers, data=json.dumps(payload), timeout=5)
    except Exception as exc:
        # Report only the exception type: the exception text may include the
        # request URL, which carries the secret webhook key.
        _warn(f"request failed ({type(exc).__name__}); alert not sent")
        return

    if resp.status_code != 200:
        _warn(f"webhook returned HTTP {resp.status_code}; alert not delivered")
        return

    try:
        data = resp.json()
        # errcode == 0 means success per WeCom docs
        errcode = int(data.get("errcode", -1))
        errmsg = data.get("errmsg", "")
    except Exception as exc:
        _warn(f"unreadable webhook response ({type(exc).__name__})")
        return

    if errcode != 0:
        _warn(f"webhook rejected the alert: errcode={errcode} errmsg={errmsg!r}")
  72 +
  73 +
def main(argv: "list[str] | None" = None) -> int:
    """
    Parse command-line arguments, format the alert text and send it.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        Always 0. Delivery is best-effort and its outcome is not reflected
        in the exit status. Invalid arguments make argparse exit with its
        own status before anything is sent.
    """
    # The annotation above is quoted so it is never evaluated at definition
    # time: ``list[str] | None`` raises TypeError on interpreters older than
    # Python 3.10, and this script is launched through a bare ``python``.
    parser = argparse.ArgumentParser(description="Send Enterprise WeChat alert message")
    parser.add_argument("--service", help="service name", default="")
    parser.add_argument("--level", help="alert level (info|warn|error)", default="info")
    parser.add_argument(
        "--message",
        required=True,
        help="alert message body (short, human-readable)",
    )

    args = parser.parse_args(argv)

    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    parts = [
        "【服务监控告警】",
        f"时间: {ts}",
    ]
    # Service and level lines are omitted when given as empty strings.
    if args.service:
        parts.append(f"服务: {args.service}")
    if args.level:
        parts.append(f"级别: {args.level}")
    parts.append(f"详情: {args.message}")

    full_message = "\n".join(parts)
    send_wechat_message(full_message)
    return 0
  100 +
  101 +
if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())
  104 +
... ...