Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use rules for haproxy's internal exporter #41

Merged
merged 1 commit into from
Oct 7, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 52 additions & 45 deletions prometheus/haproxy.rules
Original file line number Diff line number Diff line change
@@ -1,20 +1,13 @@
# Taken from https://awesome-prometheus-alerts.grep.to/rules

groups:

- name: HAProxy
rules:

- alert: HaproxyDown
expr: 'haproxy_up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: HAProxy down (instance {{ $labels.instance }})
description: "HAProxy down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
rules:

- alert: HaproxyHighHttp4xxErrorRateBackend
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5'
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
Expand All @@ -23,7 +16,7 @@ groups:
description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyHighHttp5xxErrorRateBackend
expr: 'sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) > 5'
expr: '((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
Expand All @@ -32,7 +25,7 @@ groups:
description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyHighHttp4xxErrorRateServer
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
Expand All @@ -41,7 +34,7 @@ groups:
description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyHighHttp5xxErrorRateServer
expr: 'sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
expr: '((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5'
for: 1m
labels:
severity: critical
Expand All @@ -50,7 +43,7 @@ groups:
description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyServerResponseErrors
expr: 'sum by (server) (rate(haproxy_server_response_errors_total[1m]) * 100) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) > 5'
expr: '(sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5'
for: 1m
labels:
severity: critical
Expand All @@ -59,7 +52,7 @@ groups:
description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyBackendConnectionErrors
expr: 'sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100'
expr: '(sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100'
for: 1m
labels:
severity: critical
Expand All @@ -68,70 +61,65 @@ groups:
description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyServerConnectionErrors
expr: 'sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100'
expr: '(sum by (proxy) (rate(haproxy_server_connection_errors_total[1m]))) > 100'
for: 0m
labels:
severity: critical
annotations:
summary: HAProxy server connection errors (instance {{ $labels.instance }})
description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyBackendMaxActiveSession
expr: '((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m]) * 100) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m])))) > 80'
- alert: HaproxyBackendMaxActiveSession>80%
expr: '((haproxy_server_max_sessions >0) * 100) / (haproxy_server_limit_sessions > 0) > 80'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy backend max active session (instance {{ $labels.instance }})
description: "HAproxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
summary: HAProxy backend max active session > 80% (instance {{ $labels.instance }})
description: "Session limit from backend {{ $labels.proxy }} to server {{ $labels.server }} reached 80% of limit - {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyPendingRequests
expr: 'sum by (backend) (haproxy_backend_current_queue) > 0'
expr: 'sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy pending requests (instance {{ $labels.instance }})
description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Some HAProxy requests are pending on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# NOTE: The following alert has been changed from upstream because of
# https://github.com/samber/awesome-prometheus-alerts/issues/421
- alert: HaproxyHttpSlowingDown
expr: 'avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1'
# expr: 'avg by (instance, proxy) (haproxy_backend_max_total_time_seconds) > 1'
expr: 'haproxy_backend_total_time_average_seconds > 1'
for: 1m
labels:
severity: warning
annotations:
summary: HAProxy HTTP slowing down (instance {{ $labels.instance }})
description: "Average request time is increasing\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "Average request time is increasing - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyRetryHigh
expr: 'sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
expr: 'sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10'
for: 2m
labels:
severity: warning
annotations:
summary: HAProxy retry high (instance {{ $labels.instance }})
description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyBackendDown
expr: 'haproxy_backend_up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: HAProxy backend down (instance {{ $labels.instance }})
description: "HAProxy backend is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyServerDown
expr: 'haproxy_server_up == 0'
for: 0m
labels:
severity: critical
annotations:
summary: HAProxy server down (instance {{ $labels.instance }})
description: "HAProxy server is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
description: "High rate of retry on {{ $labels.proxy }} - {{ $value | printf \"%.2f\"}}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# NOTE: The following alert is intentionally disabled: some frontends, such as 'stats' or 'metrics', are redirected internally and have no active backends.
# - alert: HaproxyHasNoAliveBackends
# expr: 'haproxy_backend_active_servers + haproxy_backend_backup_servers == 0'
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: HAproxy has no alive backends (instance {{ $labels.instance }})
# description: "HAProxy has no alive active or backup backends for {{ $labels.proxy }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyFrontendSecurityBlockedRequests
expr: 'sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10'
expr: 'sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10'
for: 2m
labels:
severity: warning
Expand All @@ -147,3 +135,22 @@ groups:
annotations:
summary: HAProxy server healthcheck failure (instance {{ $labels.instance }})
description: "Some server healthcheck are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

# NOTE: Instead of the 'HaproxyHasNoAliveBackends' alert, we alert separately on server-down and backend-down status.
- alert: HaproxyServerStatusDown
expr: 'haproxy_server_status{state="DOWN"} == 1'
for: 1m
labels:
severity: warning
annotations:
summary: HAproxy server down (instance {{ $labels.instance }})
description: "HAProxy server {{ $labels.server }} down for {{ $labels.proxy }} on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

- alert: HaproxyBackendStatusDown
expr: 'haproxy_backend_status{state="DOWN"} == 1'
for: 1m
labels:
severity: critical
annotations:
summary: HAproxy backend down (instance {{ $labels.instance }})
description: "HAProxy backend {{ $labels.proxy }} down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
Loading