diff --git a/README.md b/README.md
index 2406525..646105b 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ demonstrate basic system features, integration APIs, and best practices.
 | [Hacker News Alerts in Slack ](./hackernews/) | Track Hacker News articles by topic and send updates to Slack | slack |
 | [JIRA Assignee From Google Calendar Workflow](./jira_google_calendar/assignee_from_schedule/) | Set Assignee in Jira ticket to the person currently on-call | jira, calendar |
 | [Create calendar due date event for Jira ticket](./jira_google_calendar/deadline_to_event/) | When a new Jira issue is created, the workflow automatically generates a Google Calendar event with a deadline | calendar, jira |
+| [Alert on missing Jira events](./ops/alert_on_missing_events/) | Send Slack alerts when AutoKitteh doesn't receive certain Jira events in time | Jira, Slack |
 | [Pull Request Review Reminder (Purrr)](./purrr/) | Streamline code reviews and cut down turnaround time to merge pull requests | GitHub, Google Sheets, Slack |
 | [Quickstart](./quickstart/) | Sample for quickstart | http |
 | [Monitor PR until completion in Slack](./reviewkitteh/) | Create a Slack channel for each PR, update team leads until completion | slack, github, sheets |
diff --git a/ops/alert_on_missing_events/README.md b/ops/alert_on_missing_events/README.md
new file mode 100644
index 0000000..14cc085
--- /dev/null
+++ b/ops/alert_on_missing_events/README.md
@@ -0,0 +1,73 @@
+---
+title: Alert on missing Jira events
+description: Send Slack alerts when AutoKitteh doesn't receive certain Jira events in time
+integrations: ["Jira", "Slack"]
+categories: ["Ops"]
+---
+
+# Alert on Missing Events
+
+Send alerts when AutoKitteh doesn't receive certain events in time.
+
+This is a detection tool for incidents caused by missing updates from
+monitored services. Possible root causes include:
+
+- The monitored service is down, or there is a local network outage
+- The integration's callback URL has been modified
+- The connection's event watches have expired
+
+While an incident is ongoing, the workflow continues to wait for the desired
+events, and resends reminder alerts at a shorter interval.
+
+## Configuration and Deployment
+
+### Cloud Usage
+
+1. Import/upload the project
+2. Initialize your connections
+
+   > [!TIP]
+   > If you want to monitor a service other than Jira:
+   >
+   > 1. Delete the `monitored_service_conn` connection
+   > 2. Create a new connection with the same name,
+   >    and select the desired integration
+   > 3. Delete the `monitor_trigger` trigger
+   > 4. Create a new trigger, select the new `monitored_service_conn`
+   >    connection from step 2, and configure it
+
+3. Edit the trigger
+
+   - Specifically, select an `Event Type` and/or set the CEL expression in
+     the `Filter` field to match only the events you want to monitor
+
+4. Set/modify these project variables (see the example after this list):
+
+   - `EVENT_FILTER`: must be identical/equivalent to the `Event Type` and/or
+     `Filter` fields of the trigger in step 3!
+   - `EVENT_DESCRIPTION`: human-readable description of the events that should
+     be received, displayed in alert messages
+   - `TIMEOUT_HOURS`: it's OK not to receive events for up to this amount of time
+     (default = `24` hours)
+   - `PING_HOURS`: while an incident is ongoing, re-check at this shorter interval
+     (default = `1` hour)
+   - `SLACK_CHANNEL`: send incident alerts to this Slack channel name/ID
+     (default = `autokitteh-alerts`)
+
+5. Deploy the project
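+
+For example, if you wanted to monitor newly created Jira issues in a specific
+project, the trigger's `Filter` field and the `EVENT_FILTER` variable might
+both end up looking roughly like this. This is a sketch only: the event type
+name and the data field path are hypothetical placeholders, not verified Jira
+event names; check the actual events your integration sends and adjust both
+values accordingly.
+
+```yaml
+  # Hypothetical excerpt from the "project:" section of autokitteh.yaml.
+  # "issue_created" and the data field path are illustrative placeholders.
+  vars:
+    - name: EVENT_FILTER
+      value: event_type == 'issue_created' && data.issue.fields.project.key == 'OPS'
+
+  triggers:
+    - name: monitor_trigger
+      connection: monitored_service_conn
+      filter: event_type == 'issue_created' && data.issue.fields.project.key == 'OPS'
+      call: program.py:on_monitor_trigger
+```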
+
+### Self-Hosted Usage
+
+Follow [these detailed instructions](https://docs.autokitteh.com/get_started/deployment)
+to deploy the project on a self-hosted server.
+
+Also follow the instructions in the [Cloud Usage](#cloud-usage) section above.
+
+### Advanced Usage
+
+You can add this project's configuration and code to existing projects, or run
+it in parallel with them. Either way, all matching triggers in AutoKitteh will
+receive all the events that match the specified criteria.
+
+You can also duplicate or extend this mechanism to handle multiple events and
+connections in a single project. AutoKitteh sessions are isolated from each
+other, so each monitor runs independently.
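+
+For example, monitoring a second service from the same project might look
+roughly like this in `autokitteh.yaml`. This is a sketch only: the GitHub
+connection, its filter, and the extra handler are hypothetical additions, and
+the `on_github_monitor_trigger` handler (plus its own variables) would still
+need to be written in `program.py`.
+
+```yaml
+  # Hypothetical excerpt from the "project:" section of autokitteh.yaml.
+  connections:
+    - name: monitored_service_conn
+      integration: jira
+    - name: github_conn # Hypothetical second monitored service.
+      integration: github
+    - name: slack_conn
+      integration: slack
+
+  triggers:
+    - name: monitor_trigger
+      connection: monitored_service_conn
+      filter: event_type == 'TODO' && data.TODO['TODO'] == 'TODO'
+      call: program.py:on_monitor_trigger
+    - name: github_monitor_trigger # Hypothetical second monitor.
+      connection: github_conn
+      filter: event_type == 'TODO'
+      call: program.py:on_github_monitor_trigger
+```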
diff --git a/ops/alert_on_missing_events/autokitteh.yaml b/ops/alert_on_missing_events/autokitteh.yaml
new file mode 100644
index 0000000..a07f16d
--- /dev/null
+++ b/ops/alert_on_missing_events/autokitteh.yaml
@@ -0,0 +1,40 @@
+# This YAML file is a declarative manifest that describes the
+# setup of an AutoKitteh project that sends Slack alerts when
+# AutoKitteh doesn't receive certain Jira events in time.
+
+version: v1
+
+project:
+  name: alert_on_missing_events
+
+  vars:
+    # This should be identical to the connection
+    # name of the monitored service below!
+    - name: CONN_NAME
+      value: monitored_service_conn
+    # Must be identical/equivalent to the "event_type"
+    # and/or "filter" fields of the trigger below!
+    - name: EVENT_FILTER
+      value: ""
+    - name: EVENT_DESCRIPTION
+      value: ""
+    - name: TIMEOUT_HOURS
+      value: 24
+    - name: PING_HOURS
+      value: 1
+    - name: SLACK_CHANNEL
+      value: autokitteh-alerts
+
+  connections:
+    - name: monitored_service_conn
+      integration: jira
+    - name: slack_conn
+      integration: slack
+
+  triggers:
+    - name: monitor_trigger
+      connection: monitored_service_conn
+      # Set this CEL expression to match the events you want to monitor.
+      # Also set the "EVENT_FILTER" project variable above accordingly!
+      filter: event_type == 'TODO' && data.TODO['TODO'] == 'TODO'
+      call: program.py:on_monitor_trigger
diff --git a/ops/alert_on_missing_events/program.py b/ops/alert_on_missing_events/program.py
new file mode 100644
index 0000000..2ae05d8
--- /dev/null
+++ b/ops/alert_on_missing_events/program.py
@@ -0,0 +1,48 @@
+"""Send Slack alerts when AutoKitteh doesn't receive certain Jira events in time.
+
+See the configuration and deployment instructions in the README.md file.
+"""
+
+from datetime import datetime, timedelta, UTC
+import os
+
+import autokitteh
+from autokitteh.slack import slack_client
+
+
+CONN_NAME = os.getenv("CONN_NAME", "")
+EVENT_FILTER = os.getenv("EVENT_FILTER", "")
+EVENT_DESCRIPTION = os.getenv("EVENT_DESCRIPTION", "")
+
+TIMEOUT_HOURS = int(os.getenv("TIMEOUT_HOURS", "24"))
+PING_HOURS = int(os.getenv("PING_HOURS", "1"))
+
+SLACK_CHANNEL = os.getenv("SLACK_CHANNEL", "")
+
+
+def on_monitor_trigger(event):
+    """Handle an incoming event from a monitored service."""
+    start_time = datetime.now(UTC)
+    # Alerts are sent via the project's Slack connection, not the monitored one.
+    slack = slack_client("slack_conn")
+
+    # Wait for the next conformant event from the monitored service.
+    sub = autokitteh.subscribe(CONN_NAME, filter=EVENT_FILTER)
+    data = autokitteh.next_event(sub, timeout=timedelta(hours=TIMEOUT_HOURS))
+    incident_detected = data is None
+
+    # The monitored service hasn't sent us a conformant event for TIMEOUT_HOURS.
+    # Send a Slack alert once every PING_HOURS, until the incident is resolved.
+    description = EVENT_DESCRIPTION or f"`{EVENT_FILTER}` in `{CONN_NAME}`"
+    while data is None:
+        msg = f"Events not received since {start_time} (UTC): {description}"
+        slack.chat_postMessage(channel=SLACK_CHANNEL, text=msg)
+
+        data = autokitteh.next_event(sub, timeout=timedelta(hours=PING_HOURS))
+
+    # All clear, the monitored service is sending us events still/again.
+    # Note that another "on_monitor_trigger" workflow is starting to run now,
+    # in a separate AutoKitteh session, waiting for the next event/incident.
+    autokitteh.unsubscribe(sub)
+    if incident_detected:
+        msg = f":relieved: Event received again now: {description}"
+        slack.chat_postMessage(channel=SLACK_CHANNEL, text=msg)