Skip to content

Commit

Permalink
fix: Revisions stuck in restart loop now can scale down once unreachable
Browse files Browse the repository at this point in the history
1. Unreachable pa becomes inactive

PA computeActiveCondition() will MarkInactive
in the case of "Queued" when the pa is
Unreachable.

2. Adding DeadStart e2e tests

TestDeadStartToHealthy: creates a service which is never able to
transition to a ready state by existing immediately.
The failed revision will scale to zero once the service
is updated with a healthy revision.

TestDeadStartFromHealthy: updates a healthy service with an image
that can never reach a ready state.
The healthy revision remains Ready and the DeadStart revision doesn't
not scale down until ProgressDeadline is reached.
  • Loading branch information
andrew-delph committed Mar 15, 2024
1 parent 7e7ddd6 commit 7033fe1
Show file tree
Hide file tree
Showing 10 changed files with 489 additions and 17 deletions.
8 changes: 8 additions & 0 deletions pkg/apis/autoscaling/v1alpha1/pa_lifecycle.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,14 @@ func (pa *PodAutoscaler) IsReady() bool {
pas.GetCondition(PodAutoscalerConditionReady).IsTrue()
}

func (pa *PodAutoscaler) IsUnreachable() bool {
return pa.Spec.Reachability == ReachabilityUnreachable
}

func (pa *PodAutoscaler) IsReachable() bool {
return pa.Spec.Reachability == ReachabilityReachable
}

// IsActive returns true if the pod autoscaler has finished scaling.
func (pas *PodAutoscalerStatus) IsActive() bool {
return pas.GetCondition(PodAutoscalerConditionActive).IsTrue()
Expand Down
9 changes: 7 additions & 2 deletions pkg/reconciler/autoscaling/kpa/kpa.go
Original file line number Diff line number Diff line change
Expand Up @@ -270,6 +270,7 @@ func computeActiveCondition(ctx context.Context, pa *autoscalingv1alpha1.PodAuto

// If the service is not created yet, it is either queued or inctive if initial scale zero.
if !isServiceCreated {

if minReady == 0 {
pa.Status.MarkInactive(noTrafficReason, "The target is not receiving traffic.")
} else {
Expand All @@ -281,8 +282,12 @@ func computeActiveCondition(ctx context.Context, pa *autoscalingv1alpha1.PodAuto

// If the target is not initialized, it will be queued.
if !pa.Status.IsScaleTargetInitialized() {
pa.Status.MarkActivating(
"Queued", "Requests to the target are being buffered as resources are provisioned.")
if pa.IsUnreachable() {
pa.Status.MarkInactive("Unreachable", "The target does not have an active routing state.")
} else {
pa.Status.MarkActivating(
"Queued", "Requests to the target are being buffered as resources are provisioned.")
}
return
}

Expand Down
124 changes: 110 additions & 14 deletions pkg/reconciler/autoscaling/kpa/kpa_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
// These are the fake informers we want setup.
fakenetworkingclient "knative.dev/networking/pkg/client/injection/client/fake"
fakesksinformer "knative.dev/networking/pkg/client/injection/informers/networking/v1alpha1/serverlessservice/fake"
"knative.dev/pkg/apis"
fakekubeclient "knative.dev/pkg/client/injection/kube/client/fake"
fakefilteredpodsinformer "knative.dev/pkg/client/injection/kube/informers/core/v1/pod/filtered/fake"
_ "knative.dev/pkg/client/injection/kube/informers/core/v1/service/fake"
Expand Down Expand Up @@ -174,9 +175,9 @@ func newConfigWatcher() configmap.Watcher {
})
}

func withScales(g, w int32) PodAutoscalerOption {
func withScales(actualScale, desiredScale int32) PodAutoscalerOption {
return func(pa *autoscalingv1alpha1.PodAutoscaler) {
pa.Status.DesiredScale, pa.Status.ActualScale = ptr.Int32(w), ptr.Int32(g)
pa.Status.DesiredScale, pa.Status.ActualScale = ptr.Int32(desiredScale), ptr.Int32(actualScale)
}
}

Expand Down Expand Up @@ -211,8 +212,20 @@ func sks(ns, n string, so ...SKSOption) *nv1a1.ServerlessService {
return s
}

func markOld(pa *autoscalingv1alpha1.PodAutoscaler) {
pa.Status.Conditions[0].LastTransitionTime.Inner.Time = time.Now().Add(-1 * time.Hour)
func sksMarkOld(sks *nv1a1.ServerlessService) {
for i := range sks.Status.Conditions {
sks.Status.Conditions[i].LastTransitionTime = apis.VolatileTime{Inner: metav1.NewTime(time.Now().Add(-time.Hour))}
}
}

func markActivatorEndpointsPopulated(sks *nv1a1.ServerlessService) {
sks.Status.MarkActivatorEndpointsPopulated()
}

func kpaMarkOld(pa *autoscalingv1alpha1.PodAutoscaler) {
for i := range pa.Status.Conditions {
pa.Status.Conditions[i].LastTransitionTime = apis.VolatileTime{Inner: metav1.NewTime(time.Now().Add(-time.Hour))}
}
}

func markScaleTargetInitialized(pa *autoscalingv1alpha1.PodAutoscaler) {
Expand Down Expand Up @@ -467,7 +480,7 @@ func TestReconcile(t *testing.T) {
Key: key,
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithNoTraffic(noTrafficReason, "The target is not receiving traffic."),
withScales(0, defaultScale), WithPAStatusService(testRevision), WithPAMetricsService(privateSvc)),
withScales(0, defaultScale), WithPAStatusService(testRevision), WithPAMetricsService(privateSvc), WithReachabilityReachable),
// SKS is ready here, since its endpoints are populated with Activator endpoints.
sks(testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName), WithSKSReady),
metric(testNamespace, testRevision),
Expand All @@ -480,7 +493,7 @@ func TestReconcile(t *testing.T) {
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithBufferedTraffic, withScales(0, defaultScale),
WithPASKSReady, WithPAMetricsService(privateSvc),
WithPAStatusService(testRevision), WithObservedGeneration(1)),
WithPAStatusService(testRevision), WithObservedGeneration(1), WithReachabilityReachable),
}},
}, {
Name: "sks is still not ready",
Expand Down Expand Up @@ -708,7 +721,7 @@ func TestReconcile(t *testing.T) {
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithScaleTargetInitialized, withScales(0, 0),
WithNoTraffic(noTrafficReason, "The target is not receiving traffic."),
WithPASKSReady, markOld, WithPAStatusService(testRevision),
WithPASKSReady, kpaMarkOld, WithPAStatusService(testRevision),
WithPAMetricsService(privateSvc), WithObservedGeneration(1)),
sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady),
metric(testNamespace, testRevision),
Expand All @@ -724,7 +737,7 @@ func TestReconcile(t *testing.T) {
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithScaleTargetInitialized, withScales(0, 0),
WithNoTraffic(noTrafficReason, "The target is not receiving traffic."),
WithPASKSReady, markOld, WithPAStatusService(testRevision),
WithPASKSReady, kpaMarkOld, WithPAStatusService(testRevision),
WithPAMetricsService(privateSvc), WithObservedGeneration(1)),
sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady),
metric(testNamespace, testRevision),
Expand All @@ -743,7 +756,7 @@ func TestReconcile(t *testing.T) {
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, 0 /* desiredScale */, 0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithPASKSReady, WithTraffic, markOld,
kpa(testNamespace, testRevision, WithPASKSReady, WithTraffic, kpaMarkOld,
withScales(0, 0), WithPAStatusService(testRevision), WithPAMetricsService(privateSvc)),
defaultSKS,
metric(testNamespace, testRevision),
Expand Down Expand Up @@ -776,14 +789,14 @@ func TestReconcile(t *testing.T) {
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, 0 /* desiredScale */, 0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithPASKSReady, WithBufferedTraffic, markOld,
kpa(testNamespace, testRevision, WithPASKSReady, WithBufferedTraffic, kpaMarkOld, WithReachabilityUnreachable,
WithPAStatusService(testRevision), withScales(0, 0),
WithPAMetricsService(privateSvc)),
defaultSKS,
metric(testNamespace, testRevision),
deploy(testNamespace, testRevision), defaultReady},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision, markScaleTargetInitialized, WithPASKSReady, WithPAMetricsService(privateSvc),
Object: kpa(testNamespace, testRevision, markScaleTargetInitialized, WithPASKSReady, WithPAMetricsService(privateSvc), WithReachabilityUnreachable,
WithNoTraffic("TimedOut", "The target could not be activated."), withScales(1, 0),
WithPAStatusService(testRevision), WithPAMetricsService(privateSvc),
WithObservedGeneration(1)),
Expand Down Expand Up @@ -932,13 +945,13 @@ func TestReconcile(t *testing.T) {
Object: activeKPAMinScale(overscale, defaultScale),
}},
}, {
Name: "scaled-to-0-no-scale-data",
Name: "scaled to 0 no scale data",
Key: key,
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, unknownScale, /* desiredScale */
0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithPASKSReady,
kpa(testNamespace, testRevision, WithScaleTargetInitialized, WithReachabilityReachable, WithPASKSReady,
WithNoTraffic(noTrafficReason, "The target is not receiving traffic."), WithPAMetricsService(privateSvc),
withScales(0, -1), WithPAStatusService(testRevision), WithObservedGeneration(1)),
sks(testNamespace, testRevision, WithDeployRef(deployName),
Expand Down Expand Up @@ -1071,6 +1084,28 @@ func TestReconcile(t *testing.T) {
WithPASKSNotReady(""), WithPAStatusService(testRevision),
),
}},
}, {
Name: "initial scale zero: with ready pod",
Key: key,
Ctx: context.WithValue(context.WithValue(context.Background(), asConfigKey{}, initialScaleZeroASConfig()), deciderKey{},
decider(testNamespace, testRevision, 0 /* desiredScale */, -42 /* ebc */)),
Objects: append([]runtime.Object{
kpa(testNamespace, testRevision, markScaleTargetInitialized, withScales(1, 0),
WithReachabilityReachable, WithPAStatusService(testRevision), WithPAMetricsService(privateSvc),
WithPASKSReady, WithNoTraffic(noTrafficReason, "The target is not receiving traffic."),
),
sks(testNamespace, testRevision, WithDeployRef(deployName), WithProxyMode, WithSKSReady, WithNumActivators(defaultAct)),
metric(testNamespace, testRevision),
deploy(testNamespace, testRevision, func(d *appsv1.Deployment) {
d.Spec.Replicas = ptr.Int32(1)
}),
}, makeReadyPods(1, testNamespace, testRevision)...),
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision, WithTraffic, WithPASKSReady, markScaleTargetInitialized,
withScales(1, 0), WithReachabilityReachable, WithPAStatusService(testRevision),
WithPAMetricsService(privateSvc), WithObservedGeneration(1), WithNoTraffic(noTrafficReason, "The target is not receiving traffic."),
),
}},
}, {
Name: "initial scale zero: sks ServiceName empty",
Key: key,
Expand Down Expand Up @@ -1194,7 +1229,68 @@ func TestReconcile(t *testing.T) {
WithPAMetricsService(privateSvc), WithObservedGeneration(1),
WithPAStatusService(testRevision),
),
}}}}
}}},
{
Name: "unreachable crashing - pa becomes Failed.",
Key: key,
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, -1 /* desiredScale */, 0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace,
testRevision, WithReachabilityUnreachable,
WithPAMetricsService(privateSvc), WithPASKSNotReady(noPrivateServiceName)),
sks(
testNamespace, testRevision, WithProxyMode,
WithDeployRef(deployName), WithPrivateService, WithPubService,
),
metric(testNamespace, testRevision),
deploy(testNamespace, testRevision, func(d *appsv1.Deployment) {
d.Spec.Replicas = ptr.Int32(1)
}),
},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision,
WithNoTraffic("Unreachable", "The target does not have an active routing state."), withScales(0, -1), WithReachabilityUnreachable,
WithPAMetricsService(privateSvc), WithObservedGeneration(1),
WithPAStatusService(testRevision), WithPASKSNotReady(""),
),
}},
},
{
Name: "unreachable crashing - deployment becomes 0",
Key: key,
Ctx: context.WithValue(context.Background(), deciderKey{},
decider(testNamespace, testRevision, 0 /* desiredScale */, 0 /* ebc */)),
Objects: []runtime.Object{
kpa(testNamespace, testRevision,
WithNoTraffic("Unreachable", "The target does not have an active routing state."), WithReachabilityUnreachable,
WithObservedGeneration(1), WithPAStatusService(testRevision), WithPASKSNotReady(""), withScales(0, -1),
),
sks(
testNamespace, testRevision, WithProxyMode, WithDeployRef(deployName),
WithPrivateService, WithPubService, markActivatorEndpointsPopulated, sksMarkOld,
),
metric(testNamespace, testRevision),
deploy(testNamespace, testRevision, func(d *appsv1.Deployment) {
d.Spec.Replicas = ptr.Int32(1)
}),
},
WantStatusUpdates: []clientgotesting.UpdateActionImpl{{
Object: kpa(testNamespace, testRevision,
WithNoTraffic("Unreachable", "The target does not have an active routing state."), WithReachabilityUnreachable,
WithPAStatusService(testRevision), WithPAMetricsService(privateSvc),
WithObservedGeneration(1), withScales(0, 0), WithPASKSNotReady(""),
),
}},
WantPatches: []clientgotesting.PatchActionImpl{{
ActionImpl: clientgotesting.ActionImpl{
Namespace: testNamespace,
},
Name: deployName,
Patch: []byte(`[{"op":"replace","path":"/spec/replicas","value":0}]`),
}},
},
}

table.Test(t, MakeFactory(func(ctx context.Context, listers *Listers, cmw configmap.Watcher) controller.Reconciler {
retryAttempted = false
Expand Down
6 changes: 5 additions & 1 deletion pkg/reconciler/autoscaling/kpa/scaler.go
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,11 @@ func (ks *scaler) scale(ctx context.Context, pa *autoscalingv1alpha1.PodAutoscal

if desiredScale < 0 && !pa.Status.IsActivating() {
logger.Debug("Metrics are not yet being collected.")
return desiredScale, nil

// if the target is unreachable and has no metrics. we still want to be able to scale it to 0
if !pa.IsUnreachable() {
return desiredScale, nil
}
}

min, max := pa.ScaleBounds(asConfig)
Expand Down
22 changes: 22 additions & 0 deletions pkg/reconciler/autoscaling/kpa/scaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -409,6 +409,27 @@ func TestScaler(t *testing.T) {
wantScaling: false,
paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) {
paMarkInactive(k, time.Now())
WithReachabilityReachable(k)
},
}, {
label: "negative scale, to zero if unreachable with no metrics",
startReplicas: 1,
scaleTo: -1,
wantReplicas: 0,
wantScaling: true,
paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) {
paMarkInactive(k, time.Now().Add(-time.Hour))
WithReachabilityUnreachable(k)
},
}, {
label: "negative scale does not to zero if reachable with no metrics",
startReplicas: 1,
scaleTo: -1,
wantReplicas: -1,
wantScaling: false,
paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) {
paMarkInactive(k, time.Now().Add(-time.Hour))
WithReachabilityReachable(k)
},
}, {
label: "scales up from zero to desired one",
Expand All @@ -430,6 +451,7 @@ func TestScaler(t *testing.T) {
wantScaling: false,
paMutation: func(k *autoscalingv1alpha1.PodAutoscaler) {
paMarkActive(k, time.Now())
WithReachabilityReachable(k)
},
}, {
label: "initial scale attained, but now time to scale down",
Expand Down
1 change: 1 addition & 0 deletions test/conformance.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ const (
Timeout = "timeout"
Volumes = "volumes"
SlowStart = "slowstart"
DeadStart = "deadstart"

// Constants for test image output.
PizzaPlanetText1 = "What a spaceport!"
Expand Down
Loading

0 comments on commit 7033fe1

Please sign in to comment.