Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Monitor feature #8

Merged
merged 5 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .editorconfig
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ charset = utf-8
end_of_line = lf
insert_final_newline = true
indent_style = space
indent_size = 4
indent_size = 2
trim_trailing_whitespace = true

[*.{yml,yaml}]
Expand Down
55 changes: 55 additions & 0 deletions .github/workflows/check-ansible.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
---
# Pull-request check workflow.
# Job 1 runs the essentials playbook when essentials-related files changed;
# job 2 then runs each playbook of the matrix in Ansible check mode.
name: Check

on:
  pull_request:
    branches: [main]

env:
  SLACK_INCOMING_WEBHOOK_URL: ${{ secrets.SLACK_INCOMING_WEBHOOK_URL }}

jobs:
  deploy-essentials:
    name: Deploy essentials
    runs-on: ubuntu-latest
    steps:
      # Full history is fetched (fetch-depth: 0) - presumably so the
      # changed-files step below can compare against the base branch.
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0

      # NOTE(review): the action reference below looks garbled by text
      # extraction (e-mail obfuscation); restore the real `owner/repo@ref`
      # from the repository before using this file.
      - name: Verify essentials file changed
        uses: tj-actions/[email protected]
        id: changed_files
        with:
          files: |
            ansible/inventories/hosts.ini
            ansible/playbooks/_essentials/*
            ansible/playbooks/essentials.yml

      # Only runs when one of the files listed above changed.
      - name: Deploy essentials
        id: deploy-essentials
        if: steps.changed_files.outputs.any_changed == 'true'
        uses: dawidd6/action-ansible-playbook@v2
        with:
          directory: ansible
          playbook: playbooks/essentials.yml
          key: "${{ secrets.SSH_PRIVATE_KEY }}"
          options: --user ansible

  check-playbooks:
    name: Check playbook
    runs-on: ubuntu-latest
    needs: [deploy-essentials]
    strategy:
      matrix:
        playbook:
          - snapshots_crab
    steps:
      - uses: actions/checkout@v2

      # --check keeps this a dry run; --diff shows what would change.
      - name: Run playbook
        uses: dawidd6/action-ansible-playbook@v2
        with:
          directory: ansible
          playbook: playbooks/${{ matrix.playbook }}/playbook.yml
          key: "${{ secrets.SSH_PRIVATE_KEY }}"
          options: --user ansible --verbose --diff --check
25 changes: 25 additions & 0 deletions .github/workflows/monitor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
# Scheduled monitoring workflow: every two hours (or on manual dispatch)
# run the monitor playbook against the hosts of the `monitor` group.
name: Monitor

on:
  schedule:
    - cron: "0 */2 * * *"
  workflow_dispatch:

# The monitor playbook's group_vars read SLACK_INCOMING_WEBHOOK_URL from the
# environment of the machine running ansible-playbook (this runner). Without
# this export the webhook renders as an empty string and no alert can be sent.
env:
  SLACK_INCOMING_WEBHOOK_URL: ${{ secrets.SLACK_INCOMING_WEBHOOK_URL }}

jobs:
  monitor:
    name: monitor
    runs-on: ubuntu-latest
    strategy:
      matrix:
        playbook:
          - monitor
    steps:
      - uses: actions/checkout@v2

      # No --check here: the playbook really runs the monitoring script.
      - name: Run playbook
        uses: dawidd6/action-ansible-playbook@v2
        with:
          directory: ansible
          playbook: playbooks/${{ matrix.playbook }}/playbook.yml
          key: "${{ secrets.SSH_PRIVATE_KEY }}"
          options: --user ansible --verbose --diff
23 changes: 23 additions & 0 deletions ansible/inventories/hosts.ini
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,26 @@ g1.crab2.darwinia.network
[darwinia_nodes]
g1.darwinia2.darwinia.network

# Hosts targeted by the monitor playbook (its play uses `hosts: monitor`).
[monitor]
g1.crab2.darwinia.network
c1.crab2.darwinia.network
g1.darwinia2.darwinia.network
c1.darwinia2.darwinia.network
g1.testnets.darwinia.network
g2.testnets.darwinia.network
g3.testnets.darwinia.network
g1.generic.darwinia.network
g2.generic.darwinia.network

c1.darwinia-rpc.itering.io
c2.darwinia-rpc.itering.io
c1.crab-rpc.itering.io
c2.crab-rpc.itering.io

c1.collator.itering.io
c2.collator.itering.io
c3.collator.itering.io
c4.collator.itering.io

c5.collator.itering.io
c6.collator.itering.io
3 changes: 3 additions & 0 deletions ansible/playbooks/monitor/group_vars/monitor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

---
# Variables for hosts in the `monitor` group.
# The webhook is read from the SLACK_INCOMING_WEBHOOK_URL environment
# variable of the process running ansible-playbook (lookups are evaluated on
# the control machine, not on the managed host).
#
# NOTE(review): with Ansible's default hash_behaviour (replace) this dict
# replaces the role's default `monitor` dict as a whole instead of merging
# into it - confirm that ansible.cfg sets `hash_behaviour = merge`, otherwise
# keys such as `monitor.workdir` would be undefined.
monitor:
  notify_slack_webhook: "{{ lookup('env', 'SLACK_INCOMING_WEBHOOK_URL') }}"
3 changes: 3 additions & 0 deletions ansible/playbooks/monitor/playbook.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
---
# Apply the `monitor` role to every host of the `monitor` inventory group.
- hosts: monitor
  roles:
    - monitor
16 changes: 16 additions & 0 deletions ansible/roles/monitor/defaults/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@

---
# Default settings of the monitor role; consumed by the crawl.sh template.
monitor:
  # Directory on the managed host where the rendered script is written.
  workdir: /tmp/monitor

  # Slack delivery settings (the webhook is empty by default).
  notify_slack_webhook: ''
  notify_slack_channel: 'darwinia-alert-notification'

  # Name shown in alerts; when empty the script falls back to `hostname`.
  server_name: ''

  # Device-name prefixes matched against the first column of `df` output.
  check_disks:
    - /dev/sda
    - /dev/sdb

  # Usage limits in percent, compared with `>` by the script.
  # Above *_p2 the metric is reported; above *_p1 the alert is tagged P1.
  # ("thread" presumably stands for "threshold" - key names kept as-is.)
  alert_thread_cpu_p2: 90
  alert_thread_cpu_p1: 98
  alert_thread_ram_p2: 90
  alert_thread_ram_p1: 98
  alert_thread_disk_p2: 90
  alert_thread_disk_p1: 98

15 changes: 15 additions & 0 deletions ansible/roles/monitor/tasks/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@

---
# Tasks of the monitor role: create the work directory, render the
# monitoring script into it, then execute the script once.

# The rendered script embeds the Slack webhook URL, so the directory and the
# script are restricted to the connecting user (the default workdir lives
# under the world-readable /tmp).
- name: Creates workdir
  file:
    path: "{{ monitor.workdir }}"
    state: directory
    mode: "0700"

- name: Generate scripts file
  template:
    src: crawl.sh
    dest: "{{ monitor.workdir }}/crawl.sh"
    # Not executable on purpose: the script is started through `bash` below.
    mode: "0600"

# The script only samples metrics and posts alerts; it does not change the
# host, so the task should not be reported as "changed" on every run.
- name: Run monitor script
  command: bash {{ monitor.workdir }}/crawl.sh
  changed_when: false

168 changes: 168 additions & 0 deletions ansible/roles/monitor/templates/crawl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
#!/bin/bash
#
# Server monitoring script (rendered by Ansible from a template).
# Samples CPU, memory, disk and TCP figures once and posts a Slack message
# when a value exceeds its configured limit.
# All values below are filled in from the `monitor` variable at render time.

# Device-name prefixes checked by disk_usage (space-separated into an array).
DISKS_TO_MONITOR=({{ monitor.check_disks | join(' ') }})

# Name shown in alerts (the script falls back to `hostname` when empty) and
# the Slack delivery settings.
SERVER_NAME='{{ monitor.server_name }}'
NOTIFY_SLACK_WEBHOOK='{{ monitor.notify_slack_webhook }}'
NOTIFY_SLACK_CHANNEL='{{ monitor.notify_slack_channel }}'

# Limits compared with `>` against the sampled percentages: above a *_P2
# value the metric is reported, above a *_P1 value the alert is tagged P1.
# NOTE(review): "THREAD" presumably means "threshold" - confirm.
ALERT_THREAD_CPU_P2={{ monitor.alert_thread_cpu_p2 }}
ALERT_THREAD_CPU_P1={{ monitor.alert_thread_cpu_p1 }}
ALERT_THREAD_RAM_P2={{ monitor.alert_thread_ram_p2 }}
ALERT_THREAD_RAM_P1={{ monitor.alert_thread_ram_p1 }}
ALERT_THREAD_DISK_P2={{ monitor.alert_thread_disk_p2 }}
ALERT_THREAD_DISK_P1={{ monitor.alert_thread_disk_p1 }}

# Print the current local time as "YYYY-MM-DD HH:MM:SS".
timestamp() {
  # %F and %T are shorthand for %Y-%m-%d and %H:%M:%S respectively.
  date '+%F %T'
}

# Print the sum of the 2nd and 4th whitespace-separated fields of the
# "Cpu(s)" summary line from one batch-mode iteration of `top`
# (presumably the user and system CPU percentages - depends on top's layout).
cpu_usage() {
  top -bn1 \
    | awk '/Cpu\(s\)/ { print $2 + $4 }' \
    | sed 's/%//'
}

# Print used memory as a percentage of total memory, computed from the
# 3rd and 2nd fields of the "Mem" line of `free`.
memory_usage() {
  free | awk '/Mem/ { print $3 / $2 * 100.0 }'
}

# Print one "<device> <used-percent>" line per filesystem whose device name
# starts with an entry of DISKS_TO_MONITOR (percent sign removed).
#
# Every matching filesystem gets its own line, so a prefix such as /dev/sda
# reports /dev/sda1, /dev/sda2, ... individually instead of producing a
# multi-line value of which the caller could only parse the first entry.
# When nothing matches a prefix, "<prefix> " (empty usage) is printed, which
# callers skip.
disk_usage() {
  local disk
  for disk in "${DISKS_TO_MONITOR[@]}"; do
    # -P forces the portable format: exactly one line per filesystem, with
    # the capacity (e.g. "42%") in the fifth column.
    df -P | awk -v prefix="$disk" '
      index($1, prefix) == 1 {
        pct = $5
        sub(/%/, "", pct)
        print $1, pct
        found = 1
      }
      END { if (!found) print prefix " " }
    '
  done
}

# Print the 2nd field of the `ss -s` summary line that contains "estab".
# NOTE(review): on common iproute2 output that line reads
# "TCP: <total> (estab <n>, ...)", so this would be the total TCP socket
# count rather than the established count - confirm which one is intended.
request_count() {
  ss -s | awk '/estab/ { print $2 }'
}

# Build the Slack "fields" array for CPU/RAM alerts and print it as JSON.
# Prints "[]" when neither CPU nor RAM exceeds its P2 limit.
# Relies on the external tools `bc` (float comparison) and `jq` (JSON).
# The *_alert helper variables are not declared local, so they are global.
generate_alert_message() {
local cpu=$(cpu_usage)
local ram=$(memory_usage)
local tcp=$(request_count)
local alert_message="[]"
local priority='P2'

# bc prints 1 when the comparison holds, which (( )) treats as true.
if (( $(echo "$cpu > $ALERT_THREAD_CPU_P1" | bc -l) )); then
priority='P1'
fi
if (( $(echo "$ram > $ALERT_THREAD_RAM_P1" | bc -l) )); then
priority='P1'
fi
# The priority pair is only added for P1 and comes first in the array.
if [[ "P1" == "$priority" ]]; then
priority_alert=$(jq -n --arg priority "${priority}" '[{"type":"mrkdwn","text":"*Priority*"},{"type":"plain_text","text":$priority}]')
alert_message=$(echo "$alert_message" | jq --argjson priority_alert "$priority_alert" '. += $priority_alert')
fi

# Each alert is a label/value pair appended to the array.
if (( $(echo "$cpu > $ALERT_THREAD_CPU_P2" | bc -l) )); then
cpu_alert=$(jq -n --arg cpu "${cpu}%" '[{"type":"mrkdwn","text":"*CPU*"},{"type":"plain_text","text":$cpu}]')
alert_message=$(echo "$alert_message" | jq --argjson cpu_alert "$cpu_alert" '. += $cpu_alert')
fi

if (( $(echo "$ram > $ALERT_THREAD_RAM_P2" | bc -l) )); then
ram_alert=$(jq -n --arg ram "${ram}%" '[{"type":"mrkdwn","text":"*RAM*"},{"type":"plain_text","text":$ram}]')
alert_message=$(echo "$alert_message" | jq --argjson ram_alert "$ram_alert" '. += $ram_alert')
fi

# The TCP figure is informational: it is attached only when something else
# already triggered an alert, and never triggers one by itself.
if [[ "$alert_message" != "[]" ]]; then
tcp_alert=$(jq -n --arg tcp "${tcp}" '[{"type":"mrkdwn","text":"*TCP*"},{"type":"plain_text","text":$tcp}]')
alert_message=$(echo "$alert_message" | jq --argjson tcp_alert "$tcp_alert" '. += $tcp_alert')
fi

echo "$alert_message"
}


# Build the Slack "fields" array for disk alerts and print it as JSON.
# Reads "<disk> <usage>" lines from disk_usage; prints "[]" when no disk
# exceeds its P2 limit and none exceeds P1.
# Relies on the external tools `bc` and `jq`.
generate_disk_alert_message() {
local alert_message="[]"
local priority='P2'

while IFS= read -r line; do
local disk=$(echo $line | awk '{print $1}')
local usage=$(echo $line | awk '{print $2}')
# Lines without a usage value (e.g. disk not found by df) are ignored.
if [[ -z "$usage" ]]; then
continue
fi

if (( $(echo "$usage > $ALERT_THREAD_DISK_P1" | bc -l) )); then
priority='P1'
fi
if (( $(echo "$usage > $ALERT_THREAD_DISK_P2" | bc -l) )); then
disk_alert=$(jq -n --arg disk "*DISK* ($disk)" --arg usage "${usage}%" '[{"type":"mrkdwn","text":$disk},{"type":"plain_text","text":$usage}]')
alert_message=$(echo "$alert_message" | jq --argjson disk_alert "$disk_alert" '. += $disk_alert')
fi
# Process substitution keeps the loop in the current shell, so the
# assignments to alert_message and priority survive the loop.
done < <(disk_usage)

# Here the priority pair is appended after the disk entries.
if [[ "P1" == "$priority" ]]; then
priority_alert=$(jq -n --arg priority "${priority}" '[{"type":"mrkdwn","text":"*Priority*"},{"type":"plain_text","text":$priority}]')
alert_message=$(echo "$alert_message" | jq --argjson priority_alert "$priority_alert" '. += $priority_alert')
fi

echo "$alert_message"
}


# Collect the server and disk alert fields, wrap each non-empty set in a
# Slack "section" block and post the result through send_alert.
# Nothing is sent when both field arrays are empty ("[]").
check_and_send_alert() {
local alert_message=$(generate_alert_message)
local disk_alert_message=$(generate_disk_alert_message)
# Uses the configured server name, falling back to the output of `hostname`.
# This local shadows bash's own HOSTNAME variable inside the function.
local HOSTNAME=${SERVER_NAME:-$(hostname)}

local blocks="[]"

if [[ "$alert_message" != "[]" ]]; then
alert_block=$(
jq -n \
--arg warning "[*WARNING*]: New server alert > $HOSTNAME" \
--argjson msg "$alert_message" \
'{ "type": "section", "text": {"type": "mrkdwn", "text": $warning}, "fields": $msg }'
)
blocks=$(echo "$blocks" | jq --argjson block "$alert_block" '. += [$block]')
fi

if [[ "$disk_alert_message" != "[]" ]]; then
disk_block=$(
jq -n \
--arg warning "[*WARNING*]: New disk alert > $HOSTNAME" \
--argjson msg "$disk_alert_message" \
'{ "type": "section", "text": {"type": "mrkdwn", "text": $warning}, "fields": $msg }'
)
blocks=$(echo "$blocks" | jq --argjson block "$disk_block" '. += [$block]')
fi

# Build the final webhook payload only when at least one block exists.
if [[ "$blocks" != "[]" ]]; then
local data=$(
jq -n \
--arg channel "$NOTIFY_SLACK_CHANNEL" \
--argjson blocks "$blocks" \
'{
"username": "ServerBot",
"icon_emoji": ":loudspeaker:",
"channel": $channel,
"blocks": $blocks
}'
)

send_alert "$data"
fi
}

# POST a JSON payload to the Slack incoming webhook.
#
# Arguments:
#   $1 - JSON document to send as the request body.
# Returns:
#   1 when NOTIFY_SLACK_WEBHOOK is empty (nothing is sent), otherwise the
#   exit status of curl.
send_alert() {
  local message=$1

  # Without a webhook curl would be invoked without a URL; fail with a clear
  # message instead.
  if [[ -z "$NOTIFY_SLACK_WEBHOOK" ]]; then
    echo "send_alert: NOTIFY_SLACK_WEBHOOK is empty; alert not sent" >&2
    return 1
  fi

  # --fail turns HTTP error responses into a non-zero exit status so that a
  # rejected alert does not look like a successful delivery. The URL is
  # quoted so that it is always passed as a single argument.
  curl -X POST \
    --fail \
    -H "Content-type: application/json" \
    --data "$message" \
    "$NOTIFY_SLACK_WEBHOOK"
}

# Entry point: sample every metric once, print a one-line summary to stdout,
# then evaluate the limits and send alerts if needed.
main() {
  local cpu ram disk requests

  cpu=$(cpu_usage)
  ram=$(memory_usage)
  disk=$(disk_usage)
  requests=$(request_count)

  echo "$(timestamp) CPU: ${cpu}% RAM: ${ram}% Disk: ${disk}% Requests: ${requests}"

  check_and_send_alert
}

main
Loading