Skip to content

Commit

Permalink
Ensure ContainerHealthy condition is set back to True
Browse files Browse the repository at this point in the history
Co-authored-by: Matthias Diester <[email protected]>
  • Loading branch information
SaschaSchwarze0 and HeavyWombat committed Dec 5, 2024
1 parent 5717d19 commit e0cc076
Show file tree
Hide file tree
Showing 5 changed files with 166 additions and 25 deletions.
2 changes: 1 addition & 1 deletion pkg/reconciler/autoscaling/hpa/hpa_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ func TestReconcile(t *testing.T) {
Name: "nop deletion reconcile",
// Test that with a DeletionTimestamp we do nothing.
Objects: []runtime.Object{
pa(testNamespace, testRevision, WithHPAClass, WithPADeletionTimestamp),
WithDeletionTimestamp(pa(testNamespace, testRevision, WithHPAClass)),
deploy(testNamespace, testRevision),
},
Key: key(testNamespace, testRevision),
Expand Down
57 changes: 38 additions & 19 deletions pkg/reconciler/revision/reconcile_resources.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,10 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision)
logger.Errorw("Error getting pods", zap.Error(err))
return nil
}
if len(pods.Items) > 0 {
// Arbitrarily grab the very first pod, as they all should be crashing
pod := pods.Items[0]

podsLoop:
for i := range pods.Items {
pod := pods.Items[i]

// Update the revision status if pod cannot be scheduled (possibly resource constraints)
// If pod cannot be scheduled then we expect the container status to be empty.
Expand All @@ -94,29 +95,47 @@ func (c *Reconciler) reconcileDeployment(ctx context.Context, rev *v1.Revision)
}
}

// if a Pod is terminating already, we do not care it being not ready because this is expected to be the case
if pod.DeletionTimestamp != nil {
continue
}

// if a Pod is ready, then do not check it. The fact it is ready may not yet be reflected on the Deployment status.
for _, condition := range pod.Status.Conditions {
if condition.Type == corev1.PodReady && condition.Status == corev1.ConditionTrue {
continue podsLoop
}
}

// Iterate the container status and report the first failure
for _, status := range pod.Status.ContainerStatuses {
if status.Name != resources.QueueContainerName {
if t := status.LastTerminationState.Terminated; t != nil {
logger.Infof("marking exiting with: %d/%s", t.ExitCode, t.Message)
if t.ExitCode == 0 && t.Message == "" {
// In cases where there is no error message, we should still provide some exit message in the status
rev.Status.MarkContainerHealthyFalse(v1.ExitCodeReason(t.ExitCode),
v1.RevisionContainerExitingMessage("container exited with no error"))
break
} else {
rev.Status.MarkContainerHealthyFalse(v1.ExitCodeReason(t.ExitCode), v1.RevisionContainerExitingMessage(t.Message))
break
}
} else if w := status.State.Waiting; w != nil && hasDeploymentTimedOut(deployment) {
logger.Infof("marking resources unavailable with: %s: %s", w.Reason, w.Message)
rev.Status.MarkResourcesAvailableFalse(w.Reason, w.Message)
break
if status.Name == resources.QueueContainerName {
continue
}

if t := status.LastTerminationState.Terminated; t != nil {
logger.Infof("marking exiting with: %d/%s", t.ExitCode, t.Message)
if t.ExitCode == 0 && t.Message == "" {
// In cases where there is no error message, we should still provide some exit message in the status
rev.Status.MarkContainerHealthyFalse(v1.ExitCodeReason(t.ExitCode),
v1.RevisionContainerExitingMessage("container exited with no error"))
} else {
rev.Status.MarkContainerHealthyFalse(v1.ExitCodeReason(t.ExitCode), v1.RevisionContainerExitingMessage(t.Message))
}
break podsLoop
} else if w := status.State.Waiting; w != nil && hasDeploymentTimedOut(deployment) {
logger.Infof("marking resources unavailable with: %s: %s", w.Reason, w.Message)
rev.Status.MarkResourcesAvailableFalse(w.Reason, w.Message)
break podsLoop
}
}
}
}

if deployment.Status.ReadyReplicas > 0 {
rev.Status.MarkContainerHealthyTrue()
}

return nil
}

Expand Down
91 changes: 89 additions & 2 deletions pkg/reconciler/revision/table_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import (
tracingconfig "knative.dev/pkg/tracing/config"
autoscalingv1alpha1 "knative.dev/serving/pkg/apis/autoscaling/v1alpha1"
defaultconfig "knative.dev/serving/pkg/apis/config"
"knative.dev/serving/pkg/apis/serving"
v1 "knative.dev/serving/pkg/apis/serving/v1"
"knative.dev/serving/pkg/autoscaler/config/autoscalerconfig"
servingclient "knative.dev/serving/pkg/client/injection/client"
Expand Down Expand Up @@ -544,7 +545,7 @@ func TestReconcile(t *testing.T) {
WithRoutingState(v1.RoutingStateActive, fc),
),
pa("foo", "pull-backoff", WithReachabilityReachable), // pa can't be ready since deployment times out.
pod(t, "foo", "pull-backoff", WithWaitingContainer("pull-backoff", "ImagePullBackoff", "can't pull it")),
pod(t, "foo", "pull-backoff", WithPodCondition(corev1.PodReady, corev1.ConditionFalse, "ContainersNotReady"), WithWaitingContainer("pull-backoff", "ImagePullBackoff", "can't pull it")),
timeoutDeploy(deploy(t, "foo", "pull-backoff"), "Timed out!"),
image("foo", "pull-backoff"),
},
Expand All @@ -570,7 +571,7 @@ func TestReconcile(t *testing.T) {
WithRoutingState(v1.RoutingStateActive, fc),
WithLogURL, allUnknownConditions, MarkActive),
pa("foo", "pod-error", WithReachabilityReachable), // PA can't be ready, since no traffic.
pod(t, "foo", "pod-error", WithFailingContainer("pod-error", 5, "I failed man!")),
pod(t, "foo", "pod-error", WithPodCondition(corev1.PodReady, corev1.ConditionFalse, "ContainersNotReady"), WithFailingContainer("pod-error", 5, "I failed man!")),
deploy(t, "foo", "pod-error"),
image("foo", "pod-error"),
},
Expand Down Expand Up @@ -742,6 +743,72 @@ func TestReconcile(t *testing.T) {
PodSpecPersistentVolumeClaim: defaultconfig.Enabled,
PodSpecPersistentVolumeWrite: defaultconfig.Enabled,
}}),
}, {
Name: "revision's ContainerHealthy turns back to True if the deployment is healthy",
Objects: []runtime.Object{
Revision("foo", "container-unhealthy",
WithLogURL,
MarkRevisionReady,
withDefaultContainerStatuses(),
WithRevisionLabel(serving.RoutingStateLabelKey, "active"),
MarkContainerHealthyFalse("ExitCode137"),
),
pa("foo", "container-unhealthy",
WithPASKSReady,
WithScaleTargetInitialized,
WithTraffic,
WithReachabilityReachable,
WithPAStatusService("something"),
),
readyDeploy(deploy(t, "foo", "container-unhealthy", withReplicas(1))),
image("foo", "container-unhealthy"),
},
Key: "foo/container-unhealthy",
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: Revision("foo", "container-unhealthy",
WithLogURL,
MarkRevisionReady,
withDefaultContainerStatuses(),
WithRevisionLabel(serving.RoutingStateLabelKey, "active"),
),
}},
WantEvents: []string{
Eventf(corev1.EventTypeNormal, "RevisionReady", "Revision becomes ready upon all resources being ready"),
},
}, {
Name: "revision's ContainerHealthy stays True even if the last Pod with restarts terminates",
Objects: []runtime.Object{
Revision("foo", "container-healthy",
WithLogURL,
MarkRevisionReady,
withDefaultContainerStatuses(),
WithRevisionLabel(serving.RoutingStateLabelKey, "active"),
MarkContainerHealthyTrue(),
),
pa("foo", "container-healthy",
WithPASKSReady,
WithScaleTargetInitialized,
WithTraffic,
WithReachabilityReachable,
WithPAStatusService("something"),
),
WithDeletionTimestamp(pod(t, "foo", "container-healthy", WithPodCondition(corev1.PodReady, corev1.ConditionFalse, "ContainersNotReady"), WithFailingContainer("crashed-at-some-point", 128, "OOMKilled"))),
readyDeploy(deploy(t, "foo", "container-healthy", withReplicas(0))),
image("foo", "container-healthy"),
},
Key: "foo/container-healthy",
/*
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: Revision("foo", "container-unhealthy",
WithLogURL,
MarkRevisionReady,
withDefaultContainerStatuses(),
WithRevisionLabel(serving.RoutingStateLabelKey, "active"),
),
}},
WantEvents: []string{
Eventf(corev1.EventTypeNormal, "RevisionReady", "Revision becomes ready upon all resources being ready"),
},*/
}}

table.Test(t, MakeFactory(func(ctx context.Context, listers *Listers, _ configmap.Watcher) controller.Reconciler {
Expand Down Expand Up @@ -851,6 +918,19 @@ func allUnknownConditions(r *v1.Revision) {

type configOption func(*config.Config)

type deploymentOption func(*appsv1.Deployment)

// withReplicas returns a deploymentOption that pins the Deployment's desired
// replica count and mirrors the same count into every replica-related status
// field, so the Deployment looks fully rolled out at that size.
func withReplicas(replicas int32) deploymentOption {
	return func(d *appsv1.Deployment) {
		n := replicas
		d.Spec.Replicas = &n
		d.Status.Replicas = n
		d.Status.ReadyReplicas = n
		d.Status.AvailableReplicas = n
		d.Status.UpdatedReplicas = n
	}
}

func deploy(t *testing.T, namespace, name string, opts ...interface{}) *appsv1.Deployment {
t.Helper()
cfg := reconcilerTestConfig()
Expand All @@ -876,6 +956,13 @@ func deploy(t *testing.T, namespace, name string, opts ...interface{}) *appsv1.D
if err != nil {
t.Fatal("failed to create deployment")
}

for _, opt := range opts {
if deploymentOpt, ok := opt.(deploymentOption); ok {
deploymentOpt(deployment)
}
}

return deployment
}

Expand Down
26 changes: 23 additions & 3 deletions pkg/testing/functional.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,10 +132,11 @@ func WithNoTraffic(reason, message string) PodAutoscalerOption {
}
}

// WithPADeletionTimestamp will set the DeletionTimestamp on the PodAutoscaler.
func WithPADeletionTimestamp(r *autoscalingv1alpha1.PodAutoscaler) {
// WithDeletionTimestamp marks the given object as terminating by stamping a
// fixed, deterministic DeletionTimestamp onto its metadata, and returns the
// same object for convenient inline use.
func WithDeletionTimestamp[T metav1.Object](obj T) T {
	ts := metav1.NewTime(time.Unix(1e9, 0))
	obj.SetDeletionTimestamp(&ts)
	return obj
}

// WithHPAClass updates the PA to add the hpa class annotation.
Expand Down Expand Up @@ -288,6 +289,25 @@ func WithEndpointsOwnersRemoved(eps *corev1.Endpoints) {
// PodOption enables further configuration of a Pod.
type PodOption func(*corev1.Pod)

// WithPodCondition returns a PodOption that upserts the given condition on the
// Pod status: an existing condition of the same type has its Status and Reason
// overwritten in place; otherwise a new condition entry is appended.
func WithPodCondition(conditionType corev1.PodConditionType, status corev1.ConditionStatus, reason string) PodOption {
	return func(pod *corev1.Pod) {
		conds := pod.Status.Conditions
		for i := range conds {
			if conds[i].Type != conditionType {
				continue
			}
			// Found an existing entry of this type; update it in place.
			conds[i].Status = status
			conds[i].Reason = reason
			return
		}

		// No condition of this type yet; append a fresh one.
		pod.Status.Conditions = append(conds, corev1.PodCondition{
			Type:   conditionType,
			Status: status,
			Reason: reason,
		})
	}
}

// WithFailingContainer sets the .Status.ContainerStatuses on the pod to
// include a container named accordingly to fail with the given state.
func WithFailingContainer(name string, exitCode int, message string) PodOption {
Expand Down
15 changes: 15 additions & 0 deletions pkg/testing/v1/revision.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,12 +152,27 @@ func MarkDeploying(reason string) RevisionOption {
}
}

// MarkContainerHealthyUnknown changes the ContainerHealthy condition to Unknown with the given reason.
func MarkContainerHealthyUnknown(reason string) RevisionOption {
	return func(r *v1.Revision) {
		r.Status.MarkContainerHealthyUnknown(reason, "")
	}
}

// MarkContainerHealthyFalse changes the ContainerHealthy condition to False with the given reason.
func MarkContainerHealthyFalse(reason string) RevisionOption {
	return func(r *v1.Revision) {
		r.Status.MarkContainerHealthyFalse(reason, "")
	}
}

// MarkContainerHealthyTrue changes the ContainerHealthy condition to True with no reason or message.
func MarkContainerHealthyTrue() RevisionOption {
	return func(r *v1.Revision) {
		r.Status.MarkContainerHealthyTrue()
	}
}

// MarkProgressDeadlineExceeded calls the method of the same name on the Revision
// with the message we expect the Revision Reconciler to pass.
func MarkProgressDeadlineExceeded(message string) RevisionOption {
Expand Down

0 comments on commit e0cc076

Please sign in to comment.