diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 7435a3828..765efa08f 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -174,7 +174,7 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp } } if machine.Spec.ProviderID == "" || machine.Status.CurrentStatus.Phase == "" || machine.Status.CurrentStatus.Phase == v1alpha1.MachineCrashLoopBackOff { - return c.triggerCreationFlow( + retry, err = c.triggerCreationFlow( ctx, &driver.CreateMachineRequest{ Machine: machine, @@ -182,6 +182,7 @@ func (c *controller) reconcileClusterMachine(ctx context.Context, machine *v1alp Secret: &corev1.Secret{Data: secretData}, }, ) + return c.adjustCreateRetryRequired(machine, retry), err } return machineutils.LongRetry, nil @@ -353,7 +354,9 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque // If node label is not present klog.V(2).Infof("Creating a VM for machine %q, please wait!", machine.Name) klog.V(2).Infof("The machine creation is triggered with timeout of %s", c.getEffectiveCreationTimeout(createMachineRequest.Machine).Duration) - createMachineResponse, err := c.driver.CreateMachine(ctx, createMachineRequest) + createMCCtx, cancelFunc := c.getCreationContext(ctx, machine) + defer cancelFunc() + createMachineResponse, err := c.driver.CreateMachine(createMCCtx, createMachineRequest) if err != nil { // Create call returned an error. klog.Errorf("Error while creating machine %s: %s", machine.Name, err.Error()) diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 0fb700da7..f84ec9a71 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -473,7 +473,7 @@ func SyncMachineTaints( return toBeUpdated } -// machineCreateErrorHandler TODO +// machineCreateErrorHandler handles errors when machine creation does not succeed func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1alpha1.Machine, createMachineResponse *driver.CreateMachineResponse, err error) (machineutils.RetryPeriod, error) { var ( retryRequired = machineutils.MediumRetry @@ -490,7 +490,7 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a lastKnownState = createMachineResponse.LastKnownState } - c.machineStatusUpdate( + statusUpdErr := c.machineStatusUpdate( ctx, machine, v1alpha1.LastOperation{ @@ -505,8 +505,23 @@ func (c *controller) machineCreateErrorHandler(ctx context.Context, machine *v1a }, lastKnownState, ) + return retryRequired, statusUpdErr +} + +// adjustCreateRetryRequired adjusts the retry period if needed so that we don't have a case where the machine is reconciled after machineCreationTimeout +// if the retry period is too large so that, it is adjusted so that it causes a reconcile at the machineCreationTimeout +// if the machineCreationTimeout has already passed, return `ShortRetry` so that the machine is immediately reconciled +func (c *controller) adjustCreateRetryRequired(machine *v1alpha1.Machine, retryRequired machineutils.RetryPeriod) machineutils.RetryPeriod { + adjustedRetry := retryRequired + machineCreationDeadline := machine.CreationTimestamp.Time.Add(c.getEffectiveCreationTimeout(machine).Duration) + if time.Now().After(machineCreationDeadline) { + adjustedRetry = machineutils.ShortRetry + } else if time.Now().Add(time.Duration(retryRequired)).After(machineCreationDeadline) { + // Machine will reconcile after create deadline. Adapt RetryPeriod to reconcile machine at deadline + adjustedRetry = machineutils.RetryPeriod(machineCreationDeadline.Sub(time.Now())) + } - return retryRequired, nil + return adjustedRetry } func (c *controller) machineStatusUpdate( @@ -972,7 +987,7 @@ func isConditionEmpty(condition v1.NodeCondition) bool { return condition == v1.NodeCondition{} } -// initializes err and description with the passed string message +// printLogInitError initializes err and description with the passed string message func printLogInitError(s string, err *error, description *string, machine *v1alpha1.Machine) { klog.Warningf(s+" machine: %q ", machine.Name) *err = fmt.Errorf(s+" %s", machineutils.InitiateVMDeletion) @@ -1302,7 +1317,7 @@ func (c *controller) getEffectiveHealthTimeout(machine *v1alpha1.Machine) *metav return effectiveHealthTimeout } -// getEffectiveHealthTimeout returns the creationTimeout set on the machine-object, otherwise returns the timeout set using the global-flag. +// getEffectiveCreationTimeout returns the creationTimeout set on the machine-object, otherwise returns the timeout set using the global-flag. func (c *controller) getEffectiveCreationTimeout(machine *v1alpha1.Machine) *metav1.Duration { var effectiveCreationTimeout *metav1.Duration if machine.Spec.MachineConfiguration != nil && machine.Spec.MachineConfiguration.MachineCreationTimeout != nil { @@ -1324,6 +1339,11 @@ func (c *controller) getEffectiveNodeConditions(machine *v1alpha1.Machine) *stri return effectiveNodeConditions } +func (c *controller) getCreationContext(parentCtx context.Context, machine *v1alpha1.Machine) (context.Context, context.CancelFunc) { + timeOutDuration := c.getEffectiveCreationTimeout(machine).Duration + return context.WithDeadline(parentCtx, machine.CreationTimestamp.Time.Add(timeOutDuration)) +} + // UpdateNodeTerminationCondition updates termination condition on the node object func (c *controller) UpdateNodeTerminationCondition(ctx context.Context, machine *v1alpha1.Machine) error { if machine.Status.CurrentStatus.Phase == "" || machine.Status.CurrentStatus.Phase == v1alpha1.MachineCrashLoopBackOff { diff --git a/pkg/util/provider/machinecontroller/machine_util_test.go b/pkg/util/provider/machinecontroller/machine_util_test.go index d5bdbe5af..1ab4ddf2d 100644 --- a/pkg/util/provider/machinecontroller/machine_util_test.go +++ b/pkg/util/provider/machinecontroller/machine_util_test.go @@ -2725,4 +2725,71 @@ var _ = Describe("machine_util", func() { }), ) }) + + Describe("#Adjust Machine Create Retry Timeout", func() { + type setup struct { + machine *machinev1.Machine + mcCreateTimeout metav1.Duration + createOpRetry machineutils.RetryPeriod + } + + type expect struct { + retryAdjusted bool + } + + type data struct { + setup setup + expect expect + } + + DescribeTable("##table", + func(data *data) { + stop := make(chan struct{}) + defer close(stop) + + c, trackers := createController(stop, testNamespace, nil, nil, nil, nil) + defer trackers.Stop() + + c.safetyOptions.MachineCreationTimeout = data.setup.mcCreateTimeout + + adjustedRetry := c.adjustCreateRetryRequired(data.setup.machine, machineutils.RetryPeriod(data.setup.createOpRetry)) + + if data.expect.retryAdjusted { + Expect(adjustedRetry).Should(BeNumerically("<=", machineutils.RetryPeriod(c.getEffectiveCreationTimeout(data.setup.machine).Duration))) + } else { + Expect(adjustedRetry).To(Equal(data.setup.createOpRetry)) + } + }, + Entry("Should not adjust machine create retry period if retry is set to occur before machine create deadline", &data{ + setup: setup{ + createOpRetry: machineutils.RetryPeriod(3 * time.Minute), + mcCreateTimeout: metav1.Duration{Duration: 20 * time.Minute}, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + Spec: machinev1.MachineSpec{}, + }, + &machinev1.MachineStatus{}, + nil, nil, map[string]string{v1alpha1.NodeLabelKey: "test-node-0"}, true, metav1.Now()), + }, + expect: expect{ + retryAdjusted: false, + }, + }), + Entry("Should adjust machine create retry period to machine create deadline if retry is set to occur after machine create deadline", &data{ + setup: setup{ + createOpRetry: machineutils.RetryPeriod(4 * time.Minute), + mcCreateTimeout: metav1.Duration{Duration: 3 * time.Minute}, + machine: newMachine( + &machinev1.MachineTemplateSpec{ + Spec: machinev1.MachineSpec{}, + }, + &machinev1.MachineStatus{}, + nil, nil, map[string]string{v1alpha1.NodeLabelKey: "test-node-0"}, true, metav1.Now()), + }, + expect: expect{ + retryAdjusted: true, + }, + }), + ) + }) })