diff --git a/ops/alert_on_missing_events/README.md b/ops/alert_on_missing_events/README.md
new file mode 100644
index 0000000..7ced0e2
--- /dev/null
+++ b/ops/alert_on_missing_events/README.md
@@ -0,0 +1,30 @@
+---
+title: Alert on missing events
+description: Send alerts when AutoKitteh doesn't receive certain events in time
+integrations: ["Slack"]
+categories: ["Ops"]
+---
+
+# Alert on Missing Events
+
+Send alerts when AutoKitteh doesn't receive certain events in time. This
+detects incidents caused by missing updates from monitored services.
+
+Possible causes include, for example:
+
+- The monitored service is down, or there's a local network outage
+- The integration's callback URL has been modified
+- The connection's event watches have expired
+
+While an incident is ongoing, the workflow keeps waiting for the desired
+events and resends reminder alerts at a shorter interval.
+
+---
+
+You can add this project's configuration and code to existing projects, or run
+it in parallel to them. Either way, all matching triggers in AutoKitteh will
+receive each conformant event.
+
+You can also duplicate or extend this mechanism to handle multiple events and
+connections in a single project. AutoKitteh sessions are isolated from each
+other.
diff --git a/ops/alert_on_missing_events/autokitteh.yaml b/ops/alert_on_missing_events/autokitteh.yaml
new file mode 100644
index 0000000..a69eeeb
--- /dev/null
+++ b/ops/alert_on_missing_events/autokitteh.yaml
@@ -0,0 +1,46 @@
+# This YAML file is a declarative manifest that describes
+# the setup of an AutoKitteh project that sends alerts when
+# AutoKitteh doesn't receive certain events in time.
+
+version: v1
+
+project:
+  name: alert_on_missing_events
+
+  vars:
+    # This should be identical to the connection
+    # name of the monitored service below!
+    - name: CONN_NAME
+      value: monitored_service_conn
+    # This should be identical/equivalent to the "event_type"
+    # and/or "filter" fields of the trigger below!
+    - name: EVENT_FILTER
+      value: ""
+    # Human-readable description of the events that
+    # should be received, displayed in alert messages.
+    - name: EVENT_DESCRIPTION
+      value: ""
+    # It's OK not to receive events up to this amount of time.
+    - name: TIMEOUT_HOURS
+      value: 24
+    # While an incident is ongoing, re-check at a shorter interval.
+    - name: PING_HOURS
+      value: 1
+    # Send incident alerts on this Slack channel name/ID.
+    - name: SLACK_CHANNEL
+      value: autokitteh-alerts
+
+  connections:
+    # Set the "integration" field to the name of the integration that you want to monitor.
+    - name: monitored_service_conn
+      integration: ""
+    - name: slack_conn
+      integration: slack
+
+  triggers:
+    - name: monitor_trigger
+      connection: monitored_service_conn
+      # Set this CEL expression to match only the events you want to monitor.
+      # Also set the EVENT_FILTER project variable above accordingly.
+      filter: event_type == 'TODO' && data.TODO == 'TODO'
+      call: program.py:on_monitor_trigger
diff --git a/ops/alert_on_missing_events/program.py b/ops/alert_on_missing_events/program.py
new file mode 100644
index 0000000..37824ea
--- /dev/null
+++ b/ops/alert_on_missing_events/program.py
@@ -0,0 +1,41 @@
+"""Send alerts when AutoKitteh doesn't receive certain events in time."""
+
+from datetime import datetime, timedelta, UTC
+import os
+
+import autokitteh
+from autokitteh.slack import slack_client
+
+
+CONN_NAME = os.getenv("CONN_NAME", "")
+EVENT_FILTER = os.getenv("EVENT_FILTER", "")
+EVENT_DESCRIPTION = os.getenv("EVENT_DESCRIPTION", "")
+
+TIMEOUT_HOURS = int(os.getenv("TIMEOUT_HOURS", "24"))
+PING_HOURS = int(os.getenv("PING_HOURS", "1"))
+
+SLACK_CHANNEL = os.getenv("SLACK_CHANNEL", "")
+
+
+def on_monitor_trigger(event):
+    """Handle an incoming event from a monitored service."""
+    start_time = datetime.now(UTC)
+    slack = slack_client("slack_conn")
+
+    # Wait for the next conformant event from the monitored service.
+    sub = autokitteh.subscribe(CONN_NAME, filter=EVENT_FILTER)
+    data = autokitteh.next_event(sub, timeout=timedelta(hours=TIMEOUT_HOURS))
+
+    # The monitored service hasn't sent us a conformant event for TIMEOUT_HOURS.
+    # Send a Slack alert once every PING_HOURS, until the incident is resolved.
+    while data is None:
+        description = EVENT_DESCRIPTION or f"`{EVENT_FILTER}` in `{CONN_NAME}`"
+        msg = f"Events not received since {start_time} (UTC): {description}"
+        slack.chat_postMessage(channel=SLACK_CHANNEL, text=msg)
+
+        data = autokitteh.next_event(sub, timeout=timedelta(hours=PING_HOURS))
+
+    # All clear: the monitored service is sending us conformant events again.
+    # Note that another "on_monitor_trigger" workflow is starting to run now, in a
+    # separate AutoKitteh session, waiting for the next event/incident.
+    autokitteh.unsubscribe(sub)
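The README above notes that this mechanism can be duplicated or extended to handle multiple events and connections in a single project. Below is a minimal sketch of one way to do that, not part of this change: the connection name `other_service_conn`, the handler `on_other_service_trigger`, and the `wait_and_alert` helper are hypothetical, and the sketch reuses only the `autokitteh` and Slack calls already present in `program.py`.

```python
"""Hypothetical extension: monitor a second connection in the same project."""

from datetime import datetime, timedelta, UTC

import autokitteh
from autokitteh.slack import slack_client

SLACK_CHANNEL = "autokitteh-alerts"  # Assumed channel, as in autokitteh.yaml above.


def wait_and_alert(conn_name, event_filter, description, timeout_hours, ping_hours):
    """Shared loop: wait for conformant events, alert on Slack while they're missing."""
    start_time = datetime.now(UTC)
    slack = slack_client("slack_conn")

    # Wait for the next conformant event from the monitored connection.
    sub = autokitteh.subscribe(conn_name, filter=event_filter)
    data = autokitteh.next_event(sub, timeout=timedelta(hours=timeout_hours))

    # No conformant event within the timeout: remind on Slack until one arrives.
    while data is None:
        msg = f"Events not received since {start_time} (UTC): {description}"
        slack.chat_postMessage(channel=SLACK_CHANNEL, text=msg)
        data = autokitteh.next_event(sub, timeout=timedelta(hours=ping_hours))

    autokitteh.unsubscribe(sub)


def on_other_service_trigger(event):
    """Entry point for a second (hypothetical) trigger defined in autokitteh.yaml."""
    wait_and_alert(
        conn_name="other_service_conn",  # Hypothetical second connection.
        event_filter="event_type == 'TODO'",
        description="TODO events from the other service",
        timeout_hours=24,
        ping_hours=1,
    )
```

A second trigger in `autokitteh.yaml` would point its `call` field at `program.py:on_other_service_trigger`. Each trigger starts its own AutoKitteh session, so the two monitors run independently of each other, as the README points out.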