Skip to content

Commit

Permalink
Use rules for haproxy's internal exporter
Browse files Browse the repository at this point in the history
Part of osism/issues#1017

Signed-off-by: Jan Horstmann <[email protected]>
  • Loading branch information
janhorstmann committed Oct 7, 2024
1 parent 1a4c672 commit 3b782aa
Showing 1 changed file with 52 additions and 45 deletions.
97 changes: 52 additions & 45 deletions prometheus/haproxy.rules
Original file line number Diff line number Diff line change
@@ -1,20 +1,13 @@
# Taken from https://awesome-prometheus-alerts.grep.to/rules

groups:

- name: HAProxy
rules:

- alert: HaproxyDown
expr: 'haproxy_up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: HAProxy down (instance {{ $labels.instance }})
description: "HAProxy down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
rules:

- alert: HaproxyHighHttp4xxErrorRateBackend
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5'
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
Expand All @@ -23,7 +16,7 @@ groups:
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyHighHttp5xxErrorRateBackend
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5'
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
Expand All @@ -32,7 +25,7 @@ groups:
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyHighHttp4xxErrorRateServer
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
Expand All @@ -41,7 +34,7 @@ groups:
description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyHighHttp5xxErrorRateServer
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
Expand All @@ -50,7 +43,7 @@ groups:
description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyServerResponseErrors
expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5'
for: 1m
labels:
severity: critical
Expand All @@ -59,7 +52,7 @@ groups:
description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyBackendConnectionErrors
expr: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100'
expr: '(sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100'
for: 1m
labels:
severity: critical
Expand All @@ -68,70 +61,65 @@ groups:
description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyServerConnectionErrors
expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
expr: '(sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100'
for: 0m
labels:
severity: critical
annotations:
summary: HAProxy server connection errors (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyBackendMaxActiveSession
expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
- alert: HaproxyBackendMaxActiveSession>80%
expr: '((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy backend max active session (instance {{ $labels.instance }})
description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }})
description: "Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyPendingRequests
expr: 'sum by (backend) (haproxy_backend_current_queue) > 0'
expr: 'sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy pending requests (instance {{ $labels.instance }})
description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# NOTE: The following alert has been changed from upstream because of
# https://github.com/samber/awesome-prometheus-alerts/issues/421
- alert: HaproxyHttpSlowingDown
expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
# expr: 'avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1'
expr: 'haproxy_backend_total_time_average_seconds > 1'
for: 1m
labels:
severity: warning
annotations:
summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
description: "Average request time is increasing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyRetryHigh
expr: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy retry high (instance {{ $labels.instance }})
description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyBackendDown
expr: 'haproxy_backend_up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: HAProxy backend down (instance {{ $labels.instance }})
description: "HAProxy backend is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyServerDown
expr: 'haproxy_server_up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: HAProxy server down (instance {{ $labels.instance }})
description: "HAProxy server is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# NOTE: We do not use the following alert because there are frontends which are redirected internally like 'stats' or 'metrics' without active backends
# - alert: HaproxyHasNoAliveBackends
# expr: 'haproxy_backend_active_servers + haproxy_backend_backup_servers == 0'
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: HAproxy has no alive backends (instance {{ $labels.instance }})
# description: "HAProxy has no alive active or backup backends for {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyFrontendSecurityBlockedRequests
expr: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10'
expr: 'sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10'
for: 2m
labels:
severity: warning
Expand All @@ -147,3 +135,22 @@ groups:
annotations:
summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# NOTE: Instead of alert 'HaproxyHasNoAliveBackends' we alert on server down and backend down
- alert: HaproxyServerStatusDown
expr: 'haproxy_server_status{state="DOWN"} == 1'
for: 1m
labels:
severity: warning
annotations:
summary: HAproxy server down (instance {{ $labels.instance }})
description: "HAProxy server {{ $labels.server }} down for {{ $labels.proxy }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyBackendStatusDown
expr: 'haproxy_backend_status{state="DOWN"} == 1'
for: 1m
labels:
severity: critical
annotations:
summary: HAproxy backend down (instance {{ $labels.instance }})
description: "HAProxy backend {{ $labels.proxy }} down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

0 comments on commit 3b782aa

Please sign in to comment.