From e0cc0767e4426fea19606a75b65da81de6f25b11 Mon Sep 17 00:00:00 2001
From: Sascha Schwarze
Date: Thu, 29 Aug 2024 17:17:16 +0200
Subject: [PATCH] Ensure ContainerHealthy condition is set back to True

Co-authored-by: Matthias Diester
---
 pkg/reconciler/autoscaling/hpa/hpa_test.go    |  2 +-
 .../revision/reconcile_resources.go           | 57 ++++++++----
 pkg/reconciler/revision/table_test.go         | 80 ++++++++++++++++-
 pkg/testing/functional.go                     | 26 +++++-
 pkg/testing/v1/revision.go                    | 15 +++
 5 files changed, 155 insertions(+), 25 deletions(-)

diff --git a/pkg/reconciler/autoscaling/hpa/hpa_test.go b/pkg/reconciler/autoscaling/hpa/hpa_test.go
index 06d3c8813b81..559c497bc3a9 100644
--- a/pkg/reconciler/autoscaling/hpa/hpa_test.go
+++ b/pkg/reconciler/autoscaling/hpa/hpa_test.go
@@ -331,7 +331,7 @@ func TestReconcile(t *testing.T) {
 		Name: "nop deletion reconcile",
 		// Test that with a DeletionTimestamp we do nothing.
 		Objects: []runtime.Object{
-			pa(testNamespace, testRevision, WithHPAClass, WithPADeletionTimestamp),
+			WithDeletionTimestamp(pa(testNamespace, testRevision, WithHPAClass)),
 			deploy(testNamespace, testRevision),
 		},
 		Key: key(testNamespace, testRevision),
diff --git a/pkg/reconciler/revision/reconcile_resources.go b/pkg/reconciler/revision/reconcile_resources.go
index 3fa6f07d24fe..77905f21a1f7 100644
--- a/pkg/reconciler/revision/reconcile_resources.go
+++ b/pkg/reconciler/revision/reconcile_resources.go
@@ -81,9 +81,10 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision)
 			logger.Errorw("Error getting pods", zap.Error(err))
 			return nil
 		}
-		if len(pods.Items) > 0 {
-			// Arbitrarily grab the very first pod, as they all should be crashing
-			pod := pods.Items[0]
+
+	podsLoop:
+		for i := range pods.Items {
+			pod := pods.Items[i]
 
 			// Update the revision status if pod cannot be scheduled (possibly resource constraints)
 			// If pod cannot be scheduled then we expect the container status to be empty.
@@ -94,29 +95,47 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision)
 				}
 			}
 
+			// If a Pod is already terminating, we do not care that it is not ready; that is expected.
+			if pod.DeletionTimestamp != nil {
+				continue
+			}
+
+			// If a Pod is ready, do not check it any further; its readiness may not yet be reflected in the Deployment status.
+			for _, condition := range pod.Status.Conditions {
+				if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionTrue {
+					continue podsLoop
+				}
+			}
+
+			// Iterate over the container statuses and report the first failure.
 			for _, status := range pod.Status.ContainerStatuses {
-				if status.Name != resources.QueueContainerName {
-					if t := status.LastTerminationState.Terminated; t != nil {
-						logger.Infof("marking exiting with: %d/%s", t.ExitCode, t.Message)
-						if t.ExitCode == 0 && t.Message == "" {
-							// In cases where there is no error message, we should still provide some exit message in the status
-							rev.Status.MarkContainerHealthyFalse(v1.ExitCodeReason(t.ExitCode),
-								v1.RevisionContainerExitingMessage("container exited with no error"))
-							break
-						} else {
-							rev.Status.MarkContainerHealthyFalse(v1.ExitCodeReason(t.ExitCode), v1.RevisionContainerExitingMessage(t.Message))
-							break
-						}
-					} else if w := status.State.Waiting; w != nil && hasDeploymentTimedOut(deployment) {
-						logger.Infof("marking resources unavailable with: %s: %s", w.Reason, w.Message)
-						rev.Status.MarkResourcesAvailableFalse(w.Reason, w.Message)
-						break
+				if status.Name == resources.QueueContainerName {
+					continue
+				}
+
+				if t := status.LastTerminationState.Terminated; t != nil {
+					logger.Infof("marking exiting with: %d/%s", t.ExitCode, t.Message)
+					if t.ExitCode == 0 && t.Message == "" {
+						// In cases where there is no error message, we should still provide some exit message in the status
+						rev.Status.MarkContainerHealthyFalse(v1.ExitCodeReason(t.ExitCode),
+							v1.RevisionContainerExitingMessage("container exited with no error"))
+					} else {
+						rev.Status.MarkContainerHealthyFalse(v1.ExitCodeReason(t.ExitCode), v1.RevisionContainerExitingMessage(t.Message))
 					}
+					break podsLoop
+				} else if w := status.State.Waiting; w != nil && hasDeploymentTimedOut(deployment) {
+					logger.Infof("marking resources unavailable with: %s: %s", w.Reason, w.Message)
+					rev.Status.MarkResourcesAvailableFalse(w.Reason, w.Message)
+					break podsLoop
 				}
 			}
 		}
 	}
 
+	if deployment.Status.ReadyReplicas > 0 {
+		rev.Status.MarkContainerHealthyTrue()
+	}
+
 	return nil
 }
 
diff --git a/pkg/reconciler/revision/table_test.go b/pkg/reconciler/revision/table_test.go
index c534b56e7ed8..80fcc3ac097e 100644
--- a/pkg/reconciler/revision/table_test.go
+++ b/pkg/reconciler/revision/table_test.go
@@ -41,6 +41,7 @@ import (
 	tracingconfig "knative.dev/pkg/tracing/config"
 	autoscalingv1alpha1 "knative.dev/serving/pkg/apis/autoscaling/v1alpha1"
 	defaultconfig "knative.dev/serving/pkg/apis/config"
+	"knative.dev/serving/pkg/apis/serving"
 	v1 "knative.dev/serving/pkg/apis/serving/v1"
 	"knative.dev/serving/pkg/autoscaler/config/autoscalerconfig"
 	servingclient "knative.dev/serving/pkg/client/injection/client"
@@ -544,7 +545,7 @@ func TestReconcile(t *testing.T) {
 				WithRoutingState(v1.RoutingStateActive, fc),
 			),
 			pa("foo", "pull-backoff", WithReachabilityReachable), // pa can't be ready since deployment times out.
-			pod(t, "foo", "pull-backoff", WithWaitingContainer("pull-backoff", "ImagePullBackoff", "can't pull it")),
+			pod(t, "foo", "pull-backoff", WithPodCondition(corev1.PodReady, corev1.ConditionFalse, "ContainersNotReady"), WithWaitingContainer("pull-backoff", "ImagePullBackoff", "can't pull it")),
 			timeoutDeploy(deploy(t, "foo", "pull-backoff"), "Timed out!"),
 			image("foo", "pull-backoff"),
 		},
@@ -570,7 +571,7 @@ func TestReconcile(t *testing.T) {
 				WithRoutingState(v1.RoutingStateActive, fc),
 				WithLogURL, allUnknownConditions, MarkActive),
 			pa("foo", "pod-error", WithReachabilityReachable), // PA can't be ready, since no traffic.
-			pod(t, "foo", "pod-error", WithFailingContainer("pod-error", 5, "I failed man!")),
+			pod(t, "foo", "pod-error", WithPodCondition(corev1.PodReady, corev1.ConditionFalse, "ContainersNotReady"), WithFailingContainer("pod-error", 5, "I failed man!")),
 			deploy(t, "foo", "pod-error"),
 			image("foo", "pod-error"),
 		},
@@ -742,6 +743,61 @@ func TestReconcile(t *testing.T) {
 			PodSpecPersistentVolumeClaim: defaultconfig.Enabled,
 			PodSpecPersistentVolumeWrite: defaultconfig.Enabled,
 		}}),
+	}, {
+		Name: "revision's ContainerHealthy turns back to True if the deployment is healthy",
+		Objects: []runtime.Object{
+			Revision("foo", "container-unhealthy",
+				WithLogURL,
+				MarkRevisionReady,
+				withDefaultContainerStatuses(),
+				WithRevisionLabel(serving.RoutingStateLabelKey, "active"),
+				MarkContainerHealthyFalse("ExitCode137"),
+			),
+			pa("foo", "container-unhealthy",
+				WithPASKSReady,
+				WithScaleTargetInitialized,
+				WithTraffic,
+				WithReachabilityReachable,
+				WithPAStatusService("something"),
+			),
+			readyDeploy(deploy(t, "foo", "container-unhealthy", withReplicas(1))),
+			image("foo", "container-unhealthy"),
+		},
+		Key: "foo/container-unhealthy",
+		WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
+			Object: Revision("foo", "container-unhealthy",
+				WithLogURL,
+				MarkRevisionReady,
+				withDefaultContainerStatuses(),
+				WithRevisionLabel(serving.RoutingStateLabelKey, "active"),
+			),
+		}},
+		WantEvents: []string{
+			Eventf(corev1.EventTypeNormal, "RevisionReady", "Revision becomes ready upon all resources being ready"),
+		},
+	}, {
+		Name: "revision's ContainerHealthy stays True even if the last Pod with restarts terminates",
+		Objects: []runtime.Object{
+			Revision("foo", "container-healthy",
+				WithLogURL,
+				MarkRevisionReady,
+				withDefaultContainerStatuses(),
+				WithRevisionLabel(serving.RoutingStateLabelKey, "active"),
+				MarkContainerHealthyTrue(),
+			),
+			pa("foo", "container-healthy",
+				WithPASKSReady,
+				WithScaleTargetInitialized,
+				WithTraffic,
+				WithReachabilityReachable,
+				WithPAStatusService("something"),
+			),
+			WithDeletionTimestamp(pod(t, "foo", "container-healthy", WithPodCondition(corev1.PodReady, corev1.ConditionFalse, "ContainersNotReady"), WithFailingContainer("crashed-at-some-point", 128, "OOMKilled"))),
+			readyDeploy(deploy(t, "foo", "container-healthy", withReplicas(0))),
+			image("foo", "container-healthy"),
+		},
+		Key: "foo/container-healthy",
+		// No WantStatusUpdates or WantEvents: the terminating Pod is skipped, so ContainerHealthy stays True and the revision is unchanged.
 	}}
 
 	table.Test(t, MakeFactory(func(ctx context.Context, listers *Listers, _ configmap.Watcher) controller.Reconciler {
@@ -851,6 +907,19 @@ func allUnknownConditions(r *v1.Revision) {
 
 type configOption func(*config.Config)
 
+type deploymentOption func(*appsv1.Deployment)
+
+// withReplicas sets the given replica count on both the Deployment spec and status.
+func withReplicas(replicas int32) deploymentOption {
+	return func(d *appsv1.Deployment) {
+		d.Spec.Replicas = &replicas
+		d.Status.AvailableReplicas = replicas
+		d.Status.ReadyReplicas = replicas
+		d.Status.Replicas = replicas
+		d.Status.UpdatedReplicas = replicas
+	}
+}
+
 func deploy(t *testing.T, namespace, name string, opts ...interface{}) *appsv1.Deployment {
 	t.Helper()
 	cfg := reconcilerTestConfig()
@@ -876,6 +945,13 @@ func deploy(t *testing.T, namespace, name string, opts ...interface{}) *appsv1.D
 	if err != nil {
 		t.Fatal("failed to create deployment")
 	}
+
+	for _, opt := range opts {
+		if deploymentOpt, ok := opt.(deploymentOption); ok {
+			deploymentOpt(deployment)
+		}
+	}
+
 	return deployment
 }
 
diff --git a/pkg/testing/functional.go b/pkg/testing/functional.go
index f0ee41eb7658..176d1e73abf2 100644
--- a/pkg/testing/functional.go
+++ b/pkg/testing/functional.go
@@ -132,10 +132,11 @@ func WithNoTraffic(reason, message string) PodAutoscalerOption {
 	}
 }
 
-// WithPADeletionTimestamp will set the DeletionTimestamp on the PodAutoscaler.
-func WithPADeletionTimestamp(r *autoscalingv1alpha1.PodAutoscaler) {
+// WithDeletionTimestamp will set the DeletionTimestamp on the object.
+func WithDeletionTimestamp[T metav1.Object](obj T) T {
 	t := metav1.NewTime(time.Unix(1e9, 0))
-	r.ObjectMeta.SetDeletionTimestamp(&t)
+	obj.SetDeletionTimestamp(&t)
+	return obj
 }
 
 // WithHPAClass updates the PA to add the hpa class annotation.
@@ -288,6 +289,25 @@ func WithEndpointsOwnersRemoved(eps *corev1.Endpoints) {
 // PodOption enables further configuration of a Pod.
 type PodOption func(*corev1.Pod)
 
+// WithPodCondition sets or updates the given condition on the Pod's status.
+func WithPodCondition(conditionType corev1.PodConditionType, status corev1.ConditionStatus, reason string) PodOption {
+	return func(pod *corev1.Pod) {
+		for i, condition := range pod.Status.Conditions {
+			if condition.Type == conditionType {
+				pod.Status.Conditions[i].Status = status
+				pod.Status.Conditions[i].Reason = reason
+				return
+			}
+		}
+
+		pod.Status.Conditions = append(pod.Status.Conditions, corev1.PodCondition{
+			Type:   conditionType,
+			Status: status,
+			Reason: reason,
+		})
+	}
+}
+
 // WithFailingContainer sets the .Status.ContainerStatuses on the pod to
 // include a container named accordingly to fail with the given state.
 func WithFailingContainer(name string, exitCode int, message string) PodOption {
diff --git a/pkg/testing/v1/revision.go b/pkg/testing/v1/revision.go
index a3cd452fd9ef..537230e5796c 100644
--- a/pkg/testing/v1/revision.go
+++ b/pkg/testing/v1/revision.go
@@ -152,12 +152,27 @@ func MarkDeploying(reason string) RevisionOption {
 	}
 }
 
+// MarkContainerHealthyUnknown changes the ContainerHealthy condition to Unknown with the given reason.
 func MarkContainerHealthyUnknown(reason string) RevisionOption {
 	return func(r *v1.Revision) {
 		r.Status.MarkContainerHealthyUnknown(reason, "")
 	}
 }
 
+// MarkContainerHealthyFalse changes the ContainerHealthy condition to False with the given reason.
+func MarkContainerHealthyFalse(reason string) RevisionOption {
+	return func(r *v1.Revision) {
+		r.Status.MarkContainerHealthyFalse(reason, "")
+	}
+}
+
+// MarkContainerHealthyTrue changes the ContainerHealthy condition to True.
+func MarkContainerHealthyTrue() RevisionOption {
+	return func(r *v1.Revision) {
+		r.Status.MarkContainerHealthyTrue()
+	}
+}
+
 // MarkProgressDeadlineExceeded calls the method of the same name on the Revision
 // with the message we expect the Revision Reconciler to pass.
 func MarkProgressDeadlineExceeded(message string) RevisionOption {
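-- 
A minimal sketch (not part of the patch, placed after the signature delimiter
so `git am` ignores it) of how the new test helpers compose; the namespace,
revision, and container names below are placeholders:

	// A terminating, not-yet-ready Pod: reconcileDeployment now skips it
	// instead of reporting its last termination state on the revision.
	p := WithDeletionTimestamp(pod(t, "foo", "my-rev",
		WithPodCondition(corev1.PodReady, corev1.ConditionFalse, "ContainersNotReady"),
		WithFailingContainer("some-container", 137, "OOMKilled")))

	// A Deployment whose status reports a ready replica: with this patch,
	// reconcileDeployment sets the revision's ContainerHealthy back to True.
	d := readyDeploy(deploy(t, "foo", "my-rev", withReplicas(1)))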