diff --git a/scripts/service_ctl.sh b/scripts/service_ctl.sh
index 054047c..084fcfb 100755
--- a/scripts/service_ctl.sh
+++ b/scripts/service_ctl.sh
@@ -284,6 +284,7 @@ monitor_services() {
   local fail_threshold="${MONITOR_FAIL_THRESHOLD:-3}"
   local restart_cooldown_sec="${MONITOR_RESTART_COOLDOWN_SEC:-30}"
   local max_restarts_per_hour="${MONITOR_MAX_RESTARTS_PER_HOUR:-6}"
+  local wechat_alert_py="${PROJECT_ROOT}/scripts/wechat_alert.py"
 
   require_positive_int "MONITOR_INTERVAL_SEC" "${interval_sec}"
   require_positive_int "MONITOR_FAIL_THRESHOLD" "${fail_threshold}"
@@ -350,10 +351,22 @@ monitor_services() {
 
       if [ "${recent_count}" -ge "${max_restarts_per_hour}" ]; then
         monitor_log_event "${svc}" "error" "restart suppressed by hourly cap (${max_restarts_per_hour}/hour)"
+        if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
+          python "${wechat_alert_py}" \
+            --service "${svc}" \
+            --level "error" \
+            --message "监控检测到服务连续多次健康检查失败,且已达到每小时最大重启次数上限(${max_restarts_per_hour} 次/小时),请及时排查。"
+        fi
         continue
       fi
 
       monitor_log_event "${svc}" "error" "triggering restart after ${fail_streak[${svc}]} consecutive failures"
+      if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
+        python "${wechat_alert_py}" \
+          --service "${svc}" \
+          --level "error" \
+          --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,正在尝试自动重启。"
+      fi
       if stop_one "${svc}" && start_one "${svc}"; then
         fail_streak["${svc}"]=0
         last_restart_epoch["${svc}"]="${now}"
@@ -363,6 +376,12 @@ monitor_services() {
         last_restart_epoch["${svc}"]="${now}"
         restart_history["${svc}"]="${restart_history[${svc}]:-} ${now}"
         monitor_log_event "${svc}" "error" "restart failed, inspect $(log_file "${svc}")"
+        if [ -x "${wechat_alert_py}" ] || [ -f "${wechat_alert_py}" ]; then
+          python "${wechat_alert_py}" \
+            --service "${svc}" \
+            --level "error" \
+            --message "监控检测到服务连续 ${fail_streak[${svc}]} 次健康检查失败,自动重启尝试失败,请尽快登录服务器查看日志:$(log_file "${svc}")."
+        fi
       fi
     done
     sleep "${interval_sec}"
diff --git a/scripts/wechat_alert.py b/scripts/wechat_alert.py
new file mode 100644
index 0000000..f4507a3
--- /dev/null
+++ b/scripts/wechat_alert.py
@@ -0,0 +1,142 @@
+#!/usr/bin/env python
+"""
+Lightweight Enterprise WeChat webhook sender for service monitor alerts.
+
+This module is intentionally small and focused so that Bash-based monitors
+can invoke it without pulling in the full application stack.
+
+Delivery is best-effort: delivery failures are reported on stderr and the
+process still exits 0, so a failed alert cannot disturb the calling monitor.
+
+Usage example:
+    python scripts/wechat_alert.py --service backend --level error --message "backend restarted"
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import sys
+from datetime import datetime
+
+try:
+    import requests
+except ImportError:  # stay runnable on hosts where requests is missing
+    requests = None
+
+WEBHOOK_ENV_VAR = "SERVICE_MONITOR_WECHAT_WEBHOOK"
+
+# NOTE(review): this key is a credential committed to the repository. It is
+# kept so existing deployments keep alerting, but it should be rotated and
+# supplied only through SERVICE_MONITOR_WECHAT_WEBHOOK.
+DEFAULT_WEBHOOK_URL = (
+    "https://qyapi.weixin.qq.com/cgi-bin/webhook/send?"
+    "key=2d9e38ef-9242-4e2e-82cc-fab060322871"
+)
+
+REQUEST_TIMEOUT_SEC = 5
+
+
+def _warn(reason: str) -> None:
+    """Report a delivery problem on stderr without raising."""
+    print(f"wechat_alert: {reason}", file=sys.stderr)
+
+
+def get_webhook_url() -> str:
+    """
+    Resolve webhook URL from environment, with optional default.
+
+    Priority:
+    1. SERVICE_MONITOR_WECHAT_WEBHOOK (surrounding whitespace stripped)
+    2. Built-in default (provided by ops)
+    """
+    return os.getenv(WEBHOOK_ENV_VAR, "").strip() or DEFAULT_WEBHOOK_URL
+
+
+def build_text_payload(message: str) -> dict:
+    """Wrap *message* in the body of a "text" type webhook message."""
+    return {"msgtype": "text", "text": {"content": message}}
+
+
+def send_wechat_message(message: str) -> bool:
+    """
+    POST *message* to the configured webhook.
+
+    Never raises: each failure is reported on stderr and turned into a
+    False return value so the caller is not impacted.
+
+    Returns:
+        True when the webhook answered HTTP 200 with errcode 0,
+        False otherwise.
+    """
+    if requests is None:
+        _warn("'requests' is not installed; alert not sent")
+        return False
+
+    url = get_webhook_url()
+    if not url:
+        _warn("no webhook URL configured; alert not sent")
+        return False
+
+    try:
+        resp = requests.post(
+            url,
+            headers={"Content-Type": "application/json"},
+            data=json.dumps(build_text_payload(message)),
+            timeout=REQUEST_TIMEOUT_SEC,
+        )
+    except Exception as exc:
+        # Only the exception type is printed: requests error text can embed
+        # the full URL, and the URL carries the webhook key.
+        _warn(f"request failed ({type(exc).__name__})")
+        return False
+
+    if resp.status_code != 200:
+        _warn(f"webhook returned HTTP {resp.status_code}")
+        return False
+
+    try:
+        errcode = int(resp.json().get("errcode", -1))
+    except Exception as exc:
+        _warn(f"unreadable webhook response ({type(exc).__name__})")
+        return False
+
+    # errcode == 0 means success per WeCom docs
+    if errcode != 0:
+        _warn(f"webhook rejected the message (errcode={errcode})")
+        return False
+    return True
+
+
+def main(argv: list[str] | None = None) -> int:
+    """
+    Parse the command line, format the alert text and send it.
+
+    Returns 0 whether or not delivery succeeded, so a failed alert never
+    changes the exit status seen by the calling monitor script.
+    """
+    parser = argparse.ArgumentParser(description="Send Enterprise WeChat alert message")
+    parser.add_argument("--service", help="service name", default="")
+    parser.add_argument("--level", help="alert level (info|warn|error)", default="info")
+    parser.add_argument(
+        "--message",
+        required=True,
+        help="alert message body (short, human-readable)",
+    )
+    args = parser.parse_args(argv)
+
+    ts = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    parts = ["【服务监控告警】", f"时间: {ts}"]
+    if args.service:
+        parts.append(f"服务: {args.service}")
+    if args.level:
+        parts.append(f"级别: {args.level}")
+    parts.append(f"详情: {args.message}")
+
+    send_wechat_message("\n".join(parts))
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
-- 
libgit2 0.21.2