# main.yml (forked from cloudalchemy/ansible-prometheus)
---
# Version string; quoted so a YAML parser can never read it as a number.
prometheus_version: "2.0.0"

# Filesystem paths. prometheus_config_dir is also used further down to build
# the file_sd path of the "node" scrape job.
prometheus_config_dir: "/etc/prometheus"
prometheus_db_dir: "/var/lib/prometheus"
prometheus_root_dir: "/opt/prometheus"

# Web settings; the external URL is an empty string by default.
prometheus_web_listen_address: "0.0.0.0:9090"
prometheus_web_external_url: ""

prometheus_storage_retention: "30d"

# Extra flags as a mapping; empty by default. Commented-out example:
prometheus_config_flags_extra: {}
# prometheus_config_flags_extra:
#   storage.tsdb.retention: 15d
#   alertmanager.timeout: 10s

# Alertmanager entries as a list; empty by default. Commented-out example:
prometheus_alertmanager_config: []
# prometheus_alertmanager_config:
#   - scheme: https
#     path_prefix: /alertmanager
#     basic_auth:
#       username: user
#       password: pass
#     static_configs:
#       - targets: ["127.0.0.1:9093"]
#     proxy_url: "127.0.0.2"
# Global timing settings. The three keys are indented so they are parsed as
# children of prometheus_global rather than as separate top-level variables.
prometheus_global:
  scrape_interval: 60s
  scrape_timeout: 15s
  evaluation_interval: 15s
# Mapping of external labels; only "environment" is set by default.
# The Jinja chain uses ansible_fqdn and falls back to ansible_host, then
# inventory_hostname, when the preceding variable is undefined.
prometheus_external_labels:
  environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"
# Target groups keyed by name. Each key holds a list of groups, and each group
# has a "targets" list plus a "labels" mapping.
# NOTE(review): the "node" key presumably corresponds to the file_sd/node.yml
# file read by the "node" scrape job — confirm against the role's tasks.
prometheus_targets:
  node:
    - targets:
        - "localhost:9100"
      labels:
        env: test
# List of scrape jobs. Two jobs are defined by default:
#   - "prometheus": one static target built from the host name
#     (ansible_fqdn, else ansible_host, else 'localhost') on port 9090.
#   - "node": targets are read from a file under prometheus_config_dir.
# NOTE(review): prometheus_metrics_path is not defined in the visible part of
# this file — confirm it is provided elsewhere (e.g. the role's vars).
prometheus_scrape_configs:
  - job_name: "prometheus"
    metrics_path: "{{ prometheus_metrics_path }}"
    static_configs:
      - targets:
          - "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090"
  - job_name: "node"
    file_sd_configs:
      - files:
          - "{{ prometheus_config_dir }}/file_sd/node.yml"
# Name of the configuration template; an alternative file name can be set
# here and is looked up in the Ansible templates path.
prometheus_config_file: "prometheus.yml.j2"
# Default alerting rules. Each list item is one rule with its expression,
# optional "for" duration, labels and annotations.
# Annotation texts are wrapped in {% raw %}...{% endraw %} so that Jinja leaves
# the {{ $labels.* }} placeholders untouched for Prometheus to expand.
prometheus_alert_rules:
  - alert: InstanceDown
    expr: "up == 0"
    for: 5m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} down{% endraw %}"
  - alert: CriticalCPULoad
    expr: '(100 * (1 - avg(irate(node_cpu{job="node",mode="idle"}[5m])) BY (instance))) > 96'
    for: 2m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has Critical CPU load for more than 2 minutes.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - Critical CPU load{% endraw %}"
  - alert: CriticalDiskSpace
    # The exclusion filters on "mountpoint": node_exporter filesystem metrics
    # carry device/fstype/mountpoint labels, so a matcher on a non-existent
    # "filesystem" label would never exclude the /run mounts.
    expr: 'node_filesystem_free{job="node",mountpoint!~"^/run(/|$)"} / node_filesystem_size{job="node"} < 0.1'
    for: 4m
    labels:
      severity: critical
    annotations:
      description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has less than 10% space remaining.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - Critical disk space usage{% endraw %}"
  # No "for" duration is set on this rule.
  - alert: RebootRequired
    expr: "node_reboot_required > 0"
    labels:
      severity: warning
    annotations:
      description: "{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}"
      summary: "{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}"