From acc585d0da16b93040db033765e36a60924b5d69 Mon Sep 17 00:00:00 2001
From: Daan Schipper <2778477+daanschipper@users.noreply.github.com>
Date: Thu, 7 Nov 2024 16:26:22 +0100
Subject: [PATCH] Fix HTTP idle timeout for zone-aware Alertmanager

---
 .../helm/charts/mimir-distributed/CHANGELOG.md       |  1 +
 .../alertmanager/alertmanager-statefulset.yaml       |  2 +-
 .../alertmanager/alertmanager-statefulset.yaml       | 12 ++++++++++++
 .../alertmanager/alertmanager-statefulset.yaml       | 12 ++++++++++++
 4 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/operations/helm/charts/mimir-distributed/CHANGELOG.md b/operations/helm/charts/mimir-distributed/CHANGELOG.md
index ac7e0e2c37c..5866787dfe3 100644
--- a/operations/helm/charts/mimir-distributed/CHANGELOG.md
+++ b/operations/helm/charts/mimir-distributed/CHANGELOG.md
@@ -35,6 +35,7 @@ Entries should include a reference to the Pull Request that introduced the chang
 * [ENHANCEMENT] helm: add `enabled` field for admin-api, compactor, distributor, gateway, ingester, querier, query-frontend and store-gateway components. This helps when deploying the GEM federation-frontend on its own. #9734
 * [BUGFIX] Fix PVC template in AlertManager to not show diff in ArgoCD. #9774
 * [BUGFIX] Fix how `fullnameOverride` is reflected in generated manifests. #9564
+* [BUGFIX] Alertmanager: Set `-server.http-idle-timeout` to avoid EOF errors in ruler, also for zone-aware Alertmanager. #9851
 
 ## 5.5.1
 * [BUGFIX] Fix incorrect use of topology spread constraints in `GrafanaAgent` CRD of metamonitoring. #9669
diff --git a/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml b/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml
index d312fdf91d3..ae996634ab4 100644
--- a/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml
+++ b/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml
@@ -147,11 +147,11 @@ spec:
             {{- if .Values.alertmanager.zoneAwareReplication.enabled }}
             - "-alertmanager.sharding-ring.instance-availability-zone=zone-default"
             {{- end }}
+            {{- end }}
             # Prometheus HTTP client used to send alerts has a hard-coded idle
             # timeout of 5 minutes, therefore the server timeout for Alertmanager
             # needs to be higher to avoid connections being closed abruptly.
             - "-server.http-idle-timeout=6m"
-            {{- end }}
             {{- range $key, $value := .Values.alertmanager.extraArgs }}
             - "-{{ $key }}={{ $value }}"
             {{- end }}
diff --git a/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml b/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml
index 81d87150eab..1997b823720 100644
--- a/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml
+++ b/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml
@@ -87,6 +87,10 @@ spec:
             - "-config.expand-env=true"
             - "-config.file=/etc/mimir/mimir.yaml"
             - "-alertmanager.sharding-ring.instance-availability-zone=zone-a"
+            # Prometheus HTTP client used to send alerts has a hard-coded idle
+            # timeout of 5 minutes, therefore the server timeout for Alertmanager
+            # needs to be higher to avoid connections being closed abruptly.
+            - "-server.http-idle-timeout=6m"
           volumeMounts:
             - name: config
               mountPath: /etc/mimir
@@ -214,6 +218,10 @@ spec:
             - "-config.expand-env=true"
             - "-config.file=/etc/mimir/mimir.yaml"
             - "-alertmanager.sharding-ring.instance-availability-zone=zone-b"
+            # Prometheus HTTP client used to send alerts has a hard-coded idle
+            # timeout of 5 minutes, therefore the server timeout for Alertmanager
+            # needs to be higher to avoid connections being closed abruptly.
+            - "-server.http-idle-timeout=6m"
           volumeMounts:
             - name: config
               mountPath: /etc/mimir
@@ -341,6 +349,10 @@ spec:
             - "-config.expand-env=true"
             - "-config.file=/etc/mimir/mimir.yaml"
             - "-alertmanager.sharding-ring.instance-availability-zone=zone-c"
+            # Prometheus HTTP client used to send alerts has a hard-coded idle
+            # timeout of 5 minutes, therefore the server timeout for Alertmanager
+            # needs to be higher to avoid connections being closed abruptly.
+            - "-server.http-idle-timeout=6m"
           volumeMounts:
             - name: config
               mountPath: /etc/mimir
diff --git a/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml b/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml
index 9f6d1116ed7..d94716ea17d 100644
--- a/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml
+++ b/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml
@@ -110,6 +110,10 @@ spec:
             - "-config.expand-env=true"
             - "-config.file=/etc/mimir/mimir.yaml"
             - "-alertmanager.sharding-ring.instance-availability-zone=zone-a"
+            # Prometheus HTTP client used to send alerts has a hard-coded idle
+            # timeout of 5 minutes, therefore the server timeout for Alertmanager
+            # needs to be higher to avoid connections being closed abruptly.
+            - "-server.http-idle-timeout=6m"
           volumeMounts:
             - name: config
               mountPath: /etc/mimir
@@ -262,6 +266,10 @@ spec:
             - "-config.expand-env=true"
             - "-config.file=/etc/mimir/mimir.yaml"
             - "-alertmanager.sharding-ring.instance-availability-zone=zone-b"
+            # Prometheus HTTP client used to send alerts has a hard-coded idle
+            # timeout of 5 minutes, therefore the server timeout for Alertmanager
+            # needs to be higher to avoid connections being closed abruptly.
+            - "-server.http-idle-timeout=6m"
           volumeMounts:
             - name: config
               mountPath: /etc/mimir
@@ -414,6 +422,10 @@ spec:
             - "-config.expand-env=true"
             - "-config.file=/etc/mimir/mimir.yaml"
             - "-alertmanager.sharding-ring.instance-availability-zone=zone-c"
+            # Prometheus HTTP client used to send alerts has a hard-coded idle
+            # timeout of 5 minutes, therefore the server timeout for Alertmanager
+            # needs to be higher to avoid connections being closed abruptly.
+            - "-server.http-idle-timeout=6m"
           volumeMounts:
             - name: config
               mountPath: /etc/mimir