From acc585d0da16b93040db033765e36a60924b5d69 Mon Sep 17 00:00:00 2001 From: Daan Schipper <2778477+daanschipper@users.noreply.github.com> Date: Thu, 7 Nov 2024 16:26:22 +0100 Subject: [PATCH] fix zone aware alertmanager http idle timeout --- .../helm/charts/mimir-distributed/CHANGELOG.md | 1 + .../alertmanager/alertmanager-statefulset.yaml | 2 +- .../alertmanager/alertmanager-statefulset.yaml | 12 ++++++++++++ .../alertmanager/alertmanager-statefulset.yaml | 12 ++++++++++++ 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/operations/helm/charts/mimir-distributed/CHANGELOG.md b/operations/helm/charts/mimir-distributed/CHANGELOG.md index ac7e0e2c37c..5866787dfe3 100644 --- a/operations/helm/charts/mimir-distributed/CHANGELOG.md +++ b/operations/helm/charts/mimir-distributed/CHANGELOG.md @@ -35,6 +35,7 @@ Entries should include a reference to the Pull Request that introduced the chang * [ENHANCEMENT] helm: add `enabled` field for admin-api, compactor, distributor, gateway, ingester, querier, query-frontend and store-gateway components. This helps when deploying the GEM federation-frontend on its own. #9734 * [BUGFIX] Fix PVC template in AlertManager to not show diff in ArgoCD. #9774 * [BUGFIX] Fix how `fullnameOverride` is reflected in generated manifests. #9564 +* [BUGFIX] Alertmanager: Set -server.http-idle-timeout to avoid EOF errors in ruler, also for zone aware Alertmanager #9851 ## 5.5.1 * [BUGFIX] Fix incorrect use of topology spread constraints in `GrafanaAgent` CRD of metamonitoring. #9669 diff --git a/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml b/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml index d312fdf91d3..ae996634ab4 100644 --- a/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml +++ b/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml @@ -147,11 +147,11 @@ spec: {{- if .Values.alertmanager.zoneAwareReplication.enabled }} - "-alertmanager.sharding-ring.instance-availability-zone=zone-default" {{- end }} + {{- end }} # Prometheus HTTP client used to send alerts has a hard-coded idle # timeout of 5 minutes, therefore the server timeout for Alertmanager # needs to be higher to avoid connections being closed abruptly. - "-server.http-idle-timeout=6m" - {{- end }} {{- range $key, $value := .Values.alertmanager.extraArgs }} - "-{{ $key }}={{ $value }}" {{- end }} diff --git a/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml b/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml index 81d87150eab..1997b823720 100644 --- a/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml +++ b/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml @@ -87,6 +87,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-a" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. + - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir @@ -214,6 +218,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-b" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. + - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir @@ -341,6 +349,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-c" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. + - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir diff --git a/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml b/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml index 9f6d1116ed7..d94716ea17d 100644 --- a/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml +++ b/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml @@ -110,6 +110,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-a" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. + - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir @@ -262,6 +266,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-b" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. + - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir @@ -414,6 +422,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-c" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. + - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir