From 45a39ebecc311f48f394980854895199934ad20a Mon Sep 17 00:00:00 2001 From: jon-funk Date: Tue, 29 Oct 2024 19:34:39 -0700 Subject: [PATCH] feat: add IaC sysdig alerts (#725) Co-authored-by: afwilcox --- .github/scripts/sysdig_deploy.sh | 27 +++ .github/scripts/sysdig_installed.sh | 34 ++++ .github/workflows/deploy-sysdig.yml | 64 +++++++ .gitignore | 20 +++ terraform/alerts_prod.tf | 259 ++++++++++++++++++++++++++++ terraform/alerts_test.tf | 259 ++++++++++++++++++++++++++++ terraform/backend.tf | 17 ++ terraform/channels.tf | 18 ++ terraform/provider.tf | 12 ++ terraform/variables.tf | 6 + 10 files changed, 716 insertions(+) create mode 100755 .github/scripts/sysdig_deploy.sh create mode 100755 .github/scripts/sysdig_installed.sh create mode 100644 .github/workflows/deploy-sysdig.yml create mode 100644 terraform/alerts_prod.tf create mode 100644 terraform/alerts_test.tf create mode 100644 terraform/backend.tf create mode 100644 terraform/channels.tf create mode 100644 terraform/provider.tf create mode 100644 terraform/variables.tf diff --git a/.github/scripts/sysdig_deploy.sh b/.github/scripts/sysdig_deploy.sh new file mode 100755 index 000000000..aaef72b14 --- /dev/null +++ b/.github/scripts/sysdig_deploy.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Runs terraform init/validate/plan for the sysdig alerts and, when APPLY=true, terraform apply + +set -e # failfast +# ENV (besides the STATE_BACKEND_* values passed to terraform init as backend config): +# APPLY: set to "true" to apply the plan, leave as false for dry-run + +cd terraform || exit 1 +terraform -v +terraform init \ + -backend-config="bucket=${STATE_BACKEND_BUCKET}" \ + -backend-config="key=${STATE_BACKEND_FILEPATH}" \ + -backend-config="access_key=${STATE_BACKEND_ACCESS_KEY}" \ + -backend-config="secret_key=${STATE_BACKEND_SECRET_KEY}" \ + -backend-config="endpoint=${STATE_BACKEND_ENDPOINT}" + +# validate the configuration and print the plan +terraform validate +terraform plan + +if [ "$APPLY" = "true" ]; then + echo "APPLY=true flag provided, attempting to apply changes" + # deploy + terraform apply -auto-approve +else + echo "Dry-run, skipping apply" +fi \ 
No newline at end of file diff --git a/.github/scripts/sysdig_installed.sh b/.github/scripts/sysdig_installed.sh new file mode 100755 index 000000000..9494b069a --- /dev/null +++ b/.github/scripts/sysdig_installed.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Fetches the sysdig team crd and checks at least 1 user is present in the config +# IMPORTANT NOTE: requires a serviceaccount with get/list on sysdig-team +# ENV: +# OC_NAMESPACE +# OC_SERVER +# OC_TOKEN +set -eo pipefail # failfast; pipefail so a failed curl is not masked by jq +if [ -z "$OC_NAMESPACE" ]; then + echo "OC_NAMESPACE not set" + exit 1 +fi +if [ -z "$OC_SERVER" ]; then + echo "OC_SERVER not set" + exit 1 +fi +if [ -z "$OC_TOKEN" ]; then + echo "OC_TOKEN not set" + exit 1 +fi + +OC_TEMP_TOKEN=$(curl -fsS -k -X POST "$OC_SERVER/api/v1/namespaces/$OC_NAMESPACE/serviceaccounts/pipeline/token" --header "Authorization: Bearer $OC_TOKEN" -d '{"spec": {"expirationSeconds": 600}}' -H 'Content-Type: application/json; charset=utf-8' | jq -er '.status.token' ) +oc login --token="$OC_TEMP_TOKEN" --server="$OC_SERVER" +oc project "$OC_NAMESPACE" # Safeguard! 
+ + +sysdig_config=$(oc get sysdig-team -n "$OC_NAMESPACE" -ojson) +num_users=$(printf '%s' "$sysdig_config" | jq -r '.items[0].spec.team.users | length') +if [ "${num_users:-0}" -eq 0 ]; then + echo "No users found in sysdig-team" + exit 1 +fi +echo "Found $num_users users in sysdig-team" +exit 0 diff --git a/.github/workflows/deploy-sysdig.yml b/.github/workflows/deploy-sysdig.yml new file mode 100644 index 000000000..e5e132a5b --- /dev/null +++ b/.github/workflows/deploy-sysdig.yml @@ -0,0 +1,64 @@ +name: Deploy Sysdig Alerts + +on: + push: + paths: + - "terraform/**" + +concurrency: + # Do not interrupt previous workflows + # avoid state corruption from cancels + group: ${{ github.workflow }} + cancel-in-progress: false + +jobs: + installed: + environment: tools + name: Check Sysdig Installed + runs-on: ubuntu-22.04 + timeout-minutes: 1 + steps: + - uses: actions/checkout@v4 + - run: ./.github/scripts/sysdig_installed.sh + env: + OC_NAMESPACE: ${{ secrets.OC_NAMESPACE }} + OC_SERVER: ${{ secrets.OC_SERVER }} + OC_TOKEN: ${{ secrets.OC_TOKEN }} + + validate: + environment: tools + needs: installed + name: Validate Sysdig Terraform + runs-on: ubuntu-22.04 + timeout-minutes: 3 + steps: + - uses: actions/checkout@v4 + - name: Validate Sysdig Terraform + run: APPLY=false ./.github/scripts/sysdig_deploy.sh + env: + STATE_BACKEND_BUCKET: ${{ secrets.STATE_BACKEND_BUCKET }} + STATE_BACKEND_ACCESS_KEY: ${{ secrets.STATE_BACKEND_ACCESS_KEY }} + STATE_BACKEND_SECRET_KEY: ${{ secrets.STATE_BACKEND_SECRET_KEY }} + STATE_BACKEND_FILEPATH: ${{ secrets.STATE_BACKEND_FILEPATH }} + STATE_BACKEND_ENDPOINT: ${{ secrets.STATE_BACKEND_ENDPOINT }} + TF_VAR_sysdig_api_token: ${{ secrets.TF_VAR_SYSDIG_API_TOKEN }} + AWS_NO_SIGN_REQUEST: 1 + deploy: + if: github.ref == 'refs/heads/main' + needs: validate + environment: tools + name: Deploy Sysdig Terraform + runs-on: ubuntu-22.04 + timeout-minutes: 10 + steps: + - uses: actions/checkout@v4 + - name: Apply Sysdig Terraform + run: APPLY=true 
./.github/scripts/sysdig_deploy.sh + env: + STATE_BACKEND_BUCKET: ${{ secrets.STATE_BACKEND_BUCKET }} + STATE_BACKEND_ACCESS_KEY: ${{ secrets.STATE_BACKEND_ACCESS_KEY }} + STATE_BACKEND_SECRET_KEY: ${{ secrets.STATE_BACKEND_SECRET_KEY }} + STATE_BACKEND_FILEPATH: ${{ secrets.STATE_BACKEND_FILEPATH }} + STATE_BACKEND_ENDPOINT: ${{ secrets.STATE_BACKEND_ENDPOINT }} + TF_VAR_sysdig_api_token: ${{ secrets.TF_VAR_SYSDIG_API_TOKEN }} + AWS_NO_SIGN_REQUEST: 1 diff --git a/.gitignore b/.gitignore index b4ca1287a..e8bcf6afc 100644 --- a/.gitignore +++ b/.gitignore @@ -119,3 +119,23 @@ frontend/cypress/screenshots/allegation-details-edit.cy.ts/Complaint Edit Page s frontend/cypress/screenshots/hwcr-details-edit.cy.ts/Complaint Edit Page spec - Edit View -- Navigate to the Complaint Edit page & check inputs (failed).png frontend/cypress/screenshots/hwcr-details-edit.cy.ts/Complaint Edit Page spec - Edit View -- it has a map on screen with a marker at the correct location (failed).png nr-compliance-enforcement.code-workspace + +# Terraform +*.tfstate +*.tfstate.* +crash.log +crash.*.log +override.tf +override.tf.json +*_override.tf +*_override.tf.json +.terraform/ +.terraform.lock.hcl +**/.terraform/* +*.tfvars +*.tfvars.json +# Ignore transient lock info files created by terraform apply +.terraform.tfstate.lock.info +# Ignore CLI configuration files +.terraformrc +terraform.rc \ No newline at end of file diff --git a/terraform/alerts_prod.tf b/terraform/alerts_prod.tf new file mode 100644 index 000000000..077bb8f06 --- /dev/null +++ b/terraform/alerts_prod.tf @@ -0,0 +1,259 @@ +### Backend Alerts +resource "sysdig_monitor_alert_v2_prometheus" "nr_backend_prod_cpu_quota" { + name = "Prod Backend CPU Requests Quota Alert" + description = "Alert when the CPU requests usage is too high" + severity = "medium" + query = 
"sysdig_container_cpu_quota_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-backend\",container_name=\"nr-compliance-enforcement-prod-backend\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Backend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_backend_prod_mem_usage" { + name = "Prod Backend Mem Usage Alert" + description = "Alert when the mem usage is too high" + severity = "medium" + query = "sysdig_container_memory_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-backend\",container_name=\"nr-compliance-enforcement-prod-backend\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Backend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_backend_prod_mem_limit" { + name = "Prod Backend Mem Limit Alert" + description = "Alert when the mem usage is near the limit for too long" + severity = "high" + query = "sysdig_container_memory_limit_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-backend\",container_name=\"nr-compliance-enforcement-prod-backend\"} > 70" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Backend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_backend_prod_uptime_score" { + name = "Prod Backend Uptime Alert" + description = 
"Alert when the backend container has too much downtime" + severity = "high" + query = "sysdig_container_up{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-backend\",container_name=\"nr-compliance-enforcement-prod-backend\"} < 0.5" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Backend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_backend_prod_http_silent" { + name = "Prod Backend Unresponsive Alert" + description = "Alert when the backend container has been unresponsive or silent for too long" + severity = "high" + query = "sysdig_container_net_http_request_count{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-backend\",container_name=\"nr-compliance-enforcement-prod-backend\"} < 0.1" + enabled = true + duration_seconds = 300 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Backend" + app = "NatCom" + } +} +### Frontend Alerts +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_prod_cpu_quota" { + name = "Prod Frontend CPU Requests Quota Alert" + description = "Alert when the CPU requests usage is too high" + severity = "medium" + query = "sysdig_container_cpu_quota_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-frontend\",container_name=\"nr-compliance-enforcement-prod-frontend\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + 
} +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_prod_mem_usage" { + name = "Prod Frontend Mem Usage Alert" + description = "Alert when the mem usage is too high" + severity = "medium" + query = "sysdig_container_memory_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-frontend\",container_name=\"nr-compliance-enforcement-prod-frontend\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_prod_mem_limit" { + name = "Prod Frontend Mem Limit Alert" + description = "Alert when the mem usage is near the limit for too long" + severity = "high" + query = "sysdig_container_memory_limit_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-frontend\",container_name=\"nr-compliance-enforcement-prod-frontend\"} > 70" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_prod_uptime_score" { + name = "Prod Frontend Uptime Alert" + description = "Alert when the frontend container has too much downtime" + severity = "high" + query = "sysdig_container_up{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-frontend\",container_name=\"nr-compliance-enforcement-prod-frontend\"} < 0.5" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + 
renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_prod_http_silent" { + name = "Prod Frontend Unresponsive Alert" + description = "Alert when the frontend container has been unresponsive or silent for too long" + severity = "high" + query = "sysdig_container_net_http_request_count{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-frontend\",container_name=\"nr-compliance-enforcement-prod-frontend\"} < 0.1" + enabled = true + duration_seconds = 300 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_prod_http_error_rate" { + name = "Prod Frontend HTTP Error Rate Alert" + description = "Alert when the frontend container has too many HTTP errors over a period" + severity = "high" + query = "(sysdig_container_net_http_error_count{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-frontend\"} / sysdig_container_net_http_request_count{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_deployment_name=\"nr-compliance-enforcement-prod-frontend\"} ) > 0.05" + enabled = true + duration_seconds = 300 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +### Database Alerts +resource "sysdig_monitor_alert_v2_prometheus" "nr_database_prod_cpu_quota" { + name = "Prod Database CPU Requests Quota Alert" + description = "Alert when the CPU requests usage is too high" + severity = "medium" + query = 
"sysdig_container_cpu_quota_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_statefulset_name=\"nr-compliance-enforcement-prod-bitnami-pg\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Database" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_database_prod_mem_usage" { + name = "Prod Database Mem Usage Alert" + description = "Alert when the mem usage is too high" + severity = "medium" + query = "sysdig_container_memory_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_statefulset_name=\"nr-compliance-enforcement-prod-bitnami-pg\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Database" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_database_prod_mem_limit" { + name = "Prod Database Mem Limit Alert" + description = "Alert when the mem usage is near the limit for too long" + severity = "high" + query = "sysdig_container_memory_limit_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_statefulset_name=\"nr-compliance-enforcement-prod-bitnami-pg\"} > 70" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Database" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_database_prod_uptime_score" { + name = "Prod Database Uptime Alert" + description = "Alert when the database container has too much downtime" + severity = "high" + query = 
"sysdig_container_up{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_statefulset_name=\"nr-compliance-enforcement-prod-bitnami-pg\"} < 0.7" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Database" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_database_prod_storage_usage" { + name = "Prod Database Storage Alert" + description = "Alert when the database storage usage is too high" + severity = "high" + query = "sysdig_fs_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-prod\",kube_statefulset_name=\"nr-compliance-enforcement-prod-bitnami-pg\"} > 70" + enabled = true + duration_seconds = 600 + notification_channels { + id = sysdig_monitor_notification_channel_email.prod_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Database" + app = "NatCom" + } +} \ No newline at end of file diff --git a/terraform/alerts_test.tf b/terraform/alerts_test.tf new file mode 100644 index 000000000..6d6cee20e --- /dev/null +++ b/terraform/alerts_test.tf @@ -0,0 +1,259 @@ +### Backend Alerts +resource "sysdig_monitor_alert_v2_prometheus" "nr_backend_test_cpu_quota" { + name = "Test Backend CPU Requests Quota Alert" + description = "Alert when the CPU requests usage is too high" + severity = "medium" + query = "sysdig_container_cpu_quota_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-backend\",container_name=\"nr-compliance-enforcement-test-backend\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Backend" + app = "NatCom" + } +} +resource 
"sysdig_monitor_alert_v2_prometheus" "nr_backend_test_mem_usage" { + name = "Test Backend Mem Usage Alert" + description = "Alert when the mem usage is too high" + severity = "medium" + query = "sysdig_container_memory_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-backend\",container_name=\"nr-compliance-enforcement-test-backend\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Backend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_backend_test_mem_limit" { + name = "Test Backend Mem Limit Alert" + description = "Alert when the mem usage is near the limit for too long" + severity = "high" + query = "sysdig_container_memory_limit_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-backend\",container_name=\"nr-compliance-enforcement-test-backend\"} > 70" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Backend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_backend_test_uptime_score" { + name = "Test Backend Uptime Alert" + description = "Alert when the backend container has too much downtime" + severity = "high" + query = "sysdig_container_up{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-backend\",container_name=\"nr-compliance-enforcement-test-backend\"} < 0.5" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels 
= { + service = "NatCom Backend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_backend_test_http_silent" { + name = "Test Backend Unresponsive Alert" + description = "Alert when the backend container has been unresponsive or silent for too long" + severity = "high" + query = "sysdig_container_net_http_request_count{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-backend\",container_name=\"nr-compliance-enforcement-test-backend\"} < 0.1" + enabled = true + duration_seconds = 300 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Backend" + app = "NatCom" + } +} +### Frontend Alerts +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_test_cpu_quota" { + name = "Test Frontend CPU Requests Quota Alert" + description = "Alert when the CPU requests usage is too high" + severity = "medium" + query = "sysdig_container_cpu_quota_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-frontend\",container_name=\"nr-compliance-enforcement-test-frontend\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_test_mem_usage" { + name = "Test Frontend Mem Usage Alert" + description = "Alert when the mem usage is too high" + severity = "medium" + query = "sysdig_container_memory_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-frontend\",container_name=\"nr-compliance-enforcement-test-frontend\"} > 98" + enabled = true + duration_seconds = 180 + 
notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_test_mem_limit" { + name = "Test Frontend Mem Limit Alert" + description = "Alert when the mem usage is near the limit for too long" + severity = "high" + query = "sysdig_container_memory_limit_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-frontend\",container_name=\"nr-compliance-enforcement-test-frontend\"} > 70" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_test_uptime_score" { + name = "Test Frontend Uptime Alert" + description = "Alert when the frontend container has too much downtime" + severity = "high" + query = "sysdig_container_up{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-frontend\",container_name=\"nr-compliance-enforcement-test-frontend\"} < 0.5" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_test_http_silent" { + name = "Test Frontend Unresponsive Alert" + description = "Alert when the frontend container has been unresponsive or silent for too long" + severity = "high" + query = 
"sysdig_container_net_http_request_count{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-frontend\",container_name=\"nr-compliance-enforcement-test-frontend\"} < 0.1" + enabled = true + duration_seconds = 300 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_frontend_test_http_error_rate" { + name = "Test Frontend HTTP Error Rate Alert" + description = "Alert when the frontend container has too many HTTP errors over a period" + severity = "high" + query = "(sysdig_container_net_http_error_count{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-frontend\"} / sysdig_container_net_http_request_count{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_deployment_name=\"nr-compliance-enforcement-test-frontend\"} ) > 0.05" + enabled = true + duration_seconds = 300 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Frontend" + app = "NatCom" + } +} +### Database Alerts +resource "sysdig_monitor_alert_v2_prometheus" "nr_database_test_cpu_quota" { + name = "Test Database CPU Requests Quota Alert" + description = "Alert when the CPU requests usage is too high" + severity = "medium" + query = "sysdig_container_cpu_quota_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_statefulset_name=\"nr-compliance-enforcement-test-bitnami-pg\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Database" + app = 
"NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_database_test_mem_usage" { + name = "Test Database Mem Usage Alert" + description = "Alert when the mem usage is too high" + severity = "medium" + query = "sysdig_container_memory_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_statefulset_name=\"nr-compliance-enforcement-test-bitnami-pg\"} > 98" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Database" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_database_test_mem_limit" { + name = "Test Database Mem Limit Alert" + description = "Alert when the mem usage is near the limit for too long" + severity = "high" + query = "sysdig_container_memory_limit_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_statefulset_name=\"nr-compliance-enforcement-test-bitnami-pg\"} > 70" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Database" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" "nr_database_test_uptime_score" { + name = "Test Database Uptime Alert" + description = "Alert when the database container has too much downtime" + severity = "high" + query = "sysdig_container_up{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_statefulset_name=\"nr-compliance-enforcement-test-bitnami-pg\"} < 0.5" + enabled = true + duration_seconds = 180 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Database" + app = "NatCom" + } +} +resource "sysdig_monitor_alert_v2_prometheus" 
"nr_database_test_storage_usage" { + name = "Test Database Storage Alert" + description = "Alert when the database storage usage is too high" + severity = "high" + query = "sysdig_fs_used_percent{kube_cluster_name=\"silver\",kube_namespace_name=\"c1c7ed-test\",kube_statefulset_name=\"nr-compliance-enforcement-test-bitnami-pg\"} > 70" + enabled = true + duration_seconds = 600 + notification_channels { + id = sysdig_monitor_notification_channel_email.test_environment_alerts.id + renotify_every_minutes = 120 + } + labels = { + service = "NatCom Database" + app = "NatCom" + } +} \ No newline at end of file diff --git a/terraform/backend.tf b/terraform/backend.tf new file mode 100644 index 000000000..cf37af6a6 --- /dev/null +++ b/terraform/backend.tf @@ -0,0 +1,17 @@ +terraform { + backend "s3" { + region = "ca-central-1" # dummy value + bucket = "override" + key = "override" + access_key = "override" + secret_key = "override" + endpoint = "override" + # avoid aws-specific api checks + skip_credentials_validation = true + skip_metadata_api_check = true + skip_region_validation = true + skip_requesting_account_id = true + skip_s3_checksum = true + force_path_style = true + } +} \ No newline at end of file diff --git a/terraform/channels.tf b/terraform/channels.tf new file mode 100644 index 000000000..28799ca31 --- /dev/null +++ b/terraform/channels.tf @@ -0,0 +1,18 @@ +resource "sysdig_monitor_notification_channel_email" "test_environment_alerts" { + name = "Team Wolverine - Test Environment Alerts" + recipients = ["jonathan.funk@gov.bc.ca"] + enabled = true + notify_when_ok = true + notify_when_resolved = true + send_test_notification = false + share_with_current_team = true +} +resource "sysdig_monitor_notification_channel_email" "prod_environment_alerts" { + name = "Team Wolverine - Prod Environment Alerts" + recipients = ["jonathan.funk@gov.bc.ca"] + enabled = true + notify_when_ok = true + notify_when_resolved = true + send_test_notification = false + 
share_with_current_team = true +} \ No newline at end of file diff --git a/terraform/provider.tf b/terraform/provider.tf new file mode 100644 index 000000000..7f5d21a37 --- /dev/null +++ b/terraform/provider.tf @@ -0,0 +1,12 @@ +terraform { + required_providers { + sysdig = { + source = "sysdiglabs/sysdig" + version = ">=0.5" + } + } +} +provider "sysdig" { + sysdig_monitor_url = "https://app.sysdigcloud.com" + sysdig_monitor_api_token = var.sysdig_api_token +} \ No newline at end of file diff --git a/terraform/variables.tf b/terraform/variables.tf new file mode 100644 index 000000000..963723623 --- /dev/null +++ b/terraform/variables.tf @@ -0,0 +1,6 @@ +variable "sysdig_api_token" { + description = "Sysdig API Token" + type = string + sensitive = true + nullable = false +} \ No newline at end of file