Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Admission webhook: Undo prepare-shutdown calls if last-downscale fails to store #151

Merged
merged 13 commits into from
Jun 21, 2024
Merged
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## main / unreleased

* [ENHANCEMENT] prepare-downscale admission webhook: undo prepare-shutdown calls if adding the `last-downscale` annotation fails. #151

## v0.17.0

* [CHANGE] The docker base images are now based off distroless images rather than Alpine. #149
Expand All @@ -10,7 +12,6 @@
* [ENHANCEMENT] Include unique IDs of webhook requests in logs for easier debugging. #150
* [ENHANCEMENT] Include k8s operation username in request debug logs. #152
* [ENHANCEMENT] `rollout-max-unavailable` annotation can now be specified as percentage, e.g.: `rollout-max-unavailable: 25%`. Resulting value is computed as `floor(replicas * percentage)`, but is never less than 1. #153
* [ENHANCEMENT] Delayed downscale of statefulset can now reduce replicas earlier, if subset of pods at the end of statefulset have already reached their delay. #156
* [BUGFIX] Fix a mangled error log in controller's delayed downscale code. #154

## v0.16.0
Expand Down
56 changes: 33 additions & 23 deletions pkg/admission/prep_downscale.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (

const (
PrepareDownscaleWebhookPath = "/admission/prepare-downscale"
maxPrepareGoroutines = 32
)

func PrepareDownscale(ctx context.Context, logger log.Logger, ar v1.AdmissionReview, api *kubernetes.Clientset, useZoneTracker bool, zoneTrackerConfigMapName string) *v1.AdmissionResponse {
Expand Down Expand Up @@ -158,6 +159,10 @@ func prepareDownscale(ctx context.Context, l log.Logger, ar v1.AdmissionReview,
eps := createEndpoints(ar, oldInfo, newInfo, port, path)

if err := sendPrepareShutdownRequests(ctx, logger, client, eps); err != nil {
// At least one failed. Undo them all.
level.Warn(logger).Log("msg", "failed to prepare hosts for shutdown. unpreparing...", "err", err)
seizethedave marked this conversation as resolved.
Show resolved Hide resolved
undoPrepareShutdownRequests(ctx, logger, client, eps)

// Down-scale operation is disallowed because a pod failed to prepare for shutdown and cannot be deleted
level.Error(logger).Log("msg", "downscale not allowed due to error", "err", err)
return deny(
Expand All @@ -167,7 +172,9 @@ func prepareDownscale(ctx context.Context, l log.Logger, ar v1.AdmissionReview,
}

if err := addDownscaledAnnotationToStatefulSet(ctx, api, ar.Request.Namespace, ar.Request.Name); err != nil {
level.Error(logger).Log("msg", "downscale not allowed due to error while adding annotation", "err", err)
level.Error(logger).Log("msg", "downscale not allowed due to error while adding annotation. unpreparing...", "err", err)
undoPrepareShutdownRequests(ctx, logger, client, eps)

return deny(
"downscale of %s/%s in %s from %d to %d replicas is not allowed because adding an annotation to the statefulset failed.",
ar.Request.Resource.Resource, ar.Request.Name, ar.Request.Namespace, *oldInfo.replicas, *newInfo.replicas,
Expand Down Expand Up @@ -510,13 +517,10 @@ func sendPrepareShutdownRequests(ctx context.Context, logger log.Logger, client
span, ctx := opentracing.StartSpanFromContext(ctx, "admission.sendPrepareShutdownRequests()")
defer span.Finish()

// Attempt to POST to every prepare-shutdown endpoint. If any fail, we'll
// undo them all with a DELETE.

const maxGoroutines = 32
// Attempt to POST to every prepare-shutdown endpoint.

g, ectx := errgroup.WithContext(ctx)
g.SetLimit(maxGoroutines)
g.SetLimit(maxPrepareGoroutines)
for _, ep := range eps {
ep := ep
g.Go(func() error {
Expand All @@ -527,25 +531,31 @@ func sendPrepareShutdownRequests(ctx context.Context, logger log.Logger, client
})
}

err := g.Wait()
if err != nil {
// At least one failed. Undo them all.
level.Warn(logger).Log("msg", "failed to prepare hosts for shutdown. unpreparing...", "err", err)
undoGroup, _ := errgroup.WithContext(ctx)
undoGroup.SetLimit(maxGoroutines)
for _, ep := range eps {
ep := ep
undoGroup.Go(func() error {
if err := invokePrepareShutdown(ctx, http.MethodDelete, logger, client, ep); err != nil {
level.Warn(logger).Log("msg", "failed to undo prepare shutdown request", "url", ep.url, "err", err)
}
return nil
})
}
_ = undoGroup.Wait()
return g.Wait()
}

// undoPrepareShutdownRequests sends an HTTP DELETE to each of the given endpoints.
func undoPrepareShutdownRequests(ctx context.Context, logger log.Logger, client httpClient, eps []endpoint) {
span, ctx := opentracing.StartSpanFromContext(ctx, "admission.undoPrepareShutdownRequests()")
defer span.Finish()

if len(eps) == 0 {
seizethedave marked this conversation as resolved.
Show resolved Hide resolved
return
}
undoGroup, _ := errgroup.WithContext(ctx)
undoGroup.SetLimit(maxPrepareGoroutines)
for _, ep := range eps {
ep := ep
undoGroup.Go(func() error {
if err := invokePrepareShutdown(ctx, http.MethodDelete, logger, client, ep); err != nil {
level.Warn(logger).Log("msg", "failed to undo prepare shutdown request", "url", ep.url, "err", err)
// (We swallow the error so all of the deletes are attempted.)
}
return nil
})
}

return err
_ = undoGroup.Wait()
}

var tenantResolver spanlogger.TenantResolver = noTenantResolver{}
Expand Down
Loading