diff --git a/operations/helm/charts/mimir-distributed/CHANGELOG.md b/operations/helm/charts/mimir-distributed/CHANGELOG.md index acf96b39916..2f3b4ce38d0 100644 --- a/operations/helm/charts/mimir-distributed/CHANGELOG.md +++ b/operations/helm/charts/mimir-distributed/CHANGELOG.md @@ -33,6 +33,7 @@ Entries should include a reference to the Pull Request that introduced the chang * [ENHANCEMENT] Add the possibility to create a dedicated serviceAccount for the `alertmanager` component by setting `alertmanager.serviceAcount.create` to true in the values. #9781 * [BUGFIX] Fix PVC template in AlertManager to not show diff in ArgoCD. #9774 * [BUGFIX] Fix how `fullnameOverride` is reflected in generated manifests. #9564 +* [BUGFIX] Alertmanager: Set `-server.http-idle-timeout` to avoid EOF errors in ruler, also for zone-aware Alertmanager. #9851 ## 5.5.1 diff --git a/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml b/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml index 5437fc483db..4e157d904b8 100644 --- a/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml +++ b/operations/helm/charts/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml @@ -147,11 +147,11 @@ spec: {{- if .Values.alertmanager.zoneAwareReplication.enabled }} - "-alertmanager.sharding-ring.instance-availability-zone=zone-default" {{- end }} + {{- end }} # Prometheus HTTP client used to send alerts has a hard-coded idle # timeout of 5 minutes, therefore the server timeout for Alertmanager # needs to be higher to avoid connections being closed abruptly. 
- "-server.http-idle-timeout=6m" - {{- end }} {{- range $key, $value := .Values.alertmanager.extraArgs }} - "-{{ $key }}={{ $value }}" {{- end }} diff --git a/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml b/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml index 81d87150eab..1997b823720 100644 --- a/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml +++ b/operations/helm/tests/test-oss-logical-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml @@ -87,6 +87,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-a" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. + - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir @@ -214,6 +218,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-b" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. 
+ - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir @@ -341,6 +349,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-c" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. + - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir diff --git a/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml b/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml index 9f6d1116ed7..d94716ea17d 100644 --- a/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml +++ b/operations/helm/tests/test-oss-multizone-values-generated/mimir-distributed/templates/alertmanager/alertmanager-statefulset.yaml @@ -110,6 +110,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-a" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. + - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir @@ -262,6 +266,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-b" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. 
+ - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir @@ -414,6 +422,10 @@ spec: - "-config.expand-env=true" - "-config.file=/etc/mimir/mimir.yaml" - "-alertmanager.sharding-ring.instance-availability-zone=zone-c" + # Prometheus HTTP client used to send alerts has a hard-coded idle + # timeout of 5 minutes, therefore the server timeout for Alertmanager + # needs to be higher to avoid connections being closed abruptly. + - "-server.http-idle-timeout=6m" volumeMounts: - name: config mountPath: /etc/mimir