diff --git a/pkg/util/provider/driver/fake.go b/pkg/util/provider/driver/fake.go index ca9cde3d7..8f8326711 100644 --- a/pkg/util/provider/driver/fake.go +++ b/pkg/util/provider/driver/fake.go @@ -92,17 +92,14 @@ func (d *FakeDriver) DeleteMachine(_ context.Context, deleteMachineRequest *Dele // GetMachineStatus makes a gRPC call to the driver to check existance of machine func (d *FakeDriver) GetMachineStatus(_ context.Context, _ *GetMachineStatusRequest) (*GetMachineStatusResponse, error) { - switch { - case !d.VMExists: + if !d.VMExists { errMessage := "Fake plugin is returning no VM instances backing this machine object" return nil, status.Error(codes.NotFound, errMessage) - case d.Err != nil: - return nil, d.Err } return &GetMachineStatusResponse{ ProviderID: d.ProviderID, NodeName: d.NodeName, - }, nil + }, d.Err } // ListMachines have to list machines diff --git a/pkg/util/provider/machinecontroller/machine.go b/pkg/util/provider/machinecontroller/machine.go index 7f2db92f3..0a58b6d75 100644 --- a/pkg/util/provider/machinecontroller/machine.go +++ b/pkg/util/provider/machinecontroller/machine.go @@ -377,7 +377,6 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque Secret: createMachineRequest.Secret, }, ) - if err != nil { // VM with required name is not found. machineErr, ok := status.FromError(err) @@ -387,7 +386,6 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque return machineutils.MediumRetry, err } klog.Warningf("For machine %q, obtained VM error status as: %s", machineName, machineErr) - // Decoding machine error code switch machineErr.Code() { case codes.NotFound, codes.Unimplemented: @@ -404,13 +402,10 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque klog.Errorf("Error while creating machine %s: %s", machine.Name, err.Error()) return c.machineCreateErrorHandler(ctx, machine, createMachineResponse, err) } - nodeName = createMachineResponse.NodeName providerID = createMachineResponse.ProviderID - // Creation was successful klog.V(2).Infof("Created new VM for machine: %q with ProviderID: %q and backing node: %q", machine.Name, providerID, getNodeName(machine)) - // If a node obj already exists by the same nodeName, treat it as a stale node and trigger machine deletion. // TODO: there is a case with Azure where the VM may join the cluster before the CreateMachine call is completed, // and that would make an otherwise healthy node, be marked as stale. @@ -497,6 +492,10 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque case codes.Uninitialized: uninitializedMachine = true klog.Infof("VM instance associated with machine %s was created but not initialized.", machine.Name) + //clean me up. I'm dirty. + //TODO@thiyyakat add a pointer to a boolean variable indicating whether initialization has happened successfully. + nodeName = getMachineStatusResponse.NodeName + providerID = getMachineStatusResponse.ProviderID default: updateRetryPeriod, updateErr := c.machineStatusUpdate( @@ -527,43 +526,23 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque nodeName = getMachineStatusResponse.NodeName providerID = getMachineStatusResponse.ProviderID } - + //Update labels, providerID + var clone *v1alpha1.Machine + clone, err = c.updateLabels(ctx, createMachineRequest.Machine, nodeName, providerID) + //initialize VM if not initialized if uninitializedMachine { - retryPeriod, err := c.initializeMachine(ctx, createMachineRequest.Machine, createMachineRequest.MachineClass, createMachineRequest.Secret) + var retryPeriod machineutils.RetryPeriod + retryPeriod, err = c.initializeMachine(ctx, clone, createMachineRequest.MachineClass, createMachineRequest.Secret) if err != nil { return retryPeriod, err } + // Return error even when machine object is updated + err = fmt.Errorf("machine creation in process. Machine initialization (if required) is successful") + return machineutils.ShortRetry, err } - - _, machineNodeLabelPresent := createMachineRequest.Machine.Labels[v1alpha1.NodeLabelKey] - _, machinePriorityAnnotationPresent := createMachineRequest.Machine.Annotations[machineutils.MachinePriority] - - if !machineNodeLabelPresent || !machinePriorityAnnotationPresent || machine.Spec.ProviderID == "" { - clone := machine.DeepCopy() - if clone.Labels == nil { - clone.Labels = make(map[string]string) - } - clone.Labels[v1alpha1.NodeLabelKey] = nodeName - if clone.Annotations == nil { - clone.Annotations = make(map[string]string) - } - if clone.Annotations[machineutils.MachinePriority] == "" { - clone.Annotations[machineutils.MachinePriority] = "3" - } - - clone.Spec.ProviderID = providerID - _, err := c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) - if err != nil { - klog.Warningf("Machine UPDATE failed for %q. Retrying, error: %s", machine.Name, err) - } else { - klog.V(2).Infof("Machine labels/annotations UPDATE for %q", machine.Name) - - // Return error even when machine object is updated - err = fmt.Errorf("Machine creation in process. Machine UPDATE successful") - } + if err != nil { return machineutils.ShortRetry, err } - if machine.Status.CurrentStatus.Phase == "" || machine.Status.CurrentStatus.Phase == v1alpha1.MachineCrashLoopBackOff { clone := machine.DeepCopy() clone.Status.LastOperation = v1alpha1.LastOperation{ @@ -577,23 +556,49 @@ func (c *controller) triggerCreationFlow(ctx context.Context, createMachineReque TimeoutActive: true, LastUpdateTime: metav1.Now(), } - _, err := c.controlMachineClient.Machines(clone.Namespace).UpdateStatus(ctx, clone, metav1.UpdateOptions{}) if err != nil { klog.Warningf("Machine/status UPDATE failed for %q. Retrying, error: %s", machine.Name, err) } else { klog.V(2).Infof("Machine/status UPDATE for %q during creation", machine.Name) - // Return error even when machine object is updated - err = fmt.Errorf("Machine creation in process. Machine/Status UPDATE successful") + err = fmt.Errorf("machine creation in process. Machine/Status UPDATE successful") } - return machineutils.ShortRetry, err } - return machineutils.LongRetry, nil } +func (c *controller) updateLabels(ctx context.Context, machine *v1alpha1.Machine, nodeName, providerID string) (clone *v1alpha1.Machine, err error) { + machineNodeLabelPresent := metav1.HasLabel(machine.ObjectMeta, v1alpha1.NodeLabelKey) + machinePriorityAnnotationPresent := metav1.HasAnnotation(machine.ObjectMeta, machineutils.MachinePriority) + clone = machine.DeepCopy() + if !machineNodeLabelPresent || !machinePriorityAnnotationPresent || machine.Spec.ProviderID == "" { + if clone.Labels == nil { + clone.Labels = make(map[string]string) + } + clone.Labels[v1alpha1.NodeLabelKey] = nodeName + if clone.Annotations == nil { + clone.Annotations = make(map[string]string) + } + if clone.Annotations[machineutils.MachinePriority] == "" { + clone.Annotations[machineutils.MachinePriority] = "3" + } + clone.Spec.ProviderID = providerID + var updatedMachine *v1alpha1.Machine + updatedMachine, err = c.controlMachineClient.Machines(clone.Namespace).Update(ctx, clone, metav1.UpdateOptions{}) + if err != nil { + klog.Warningf("Machine labels/annotations UPDATE failed for %q. Will retry after VM initialization (if required), error: %s", machine.Name, err) + clone = machine.DeepCopy() + } else { + clone = updatedMachine + klog.V(2).Infof("Machine labels/annotations UPDATE for %q", clone.Name) + err = fmt.Errorf("machine creation in process. Machine labels/annotations update is successful") + } + } + return clone, err +} + func (c *controller) initializeMachine(ctx context.Context, machine *v1alpha1.Machine, machineClass *v1alpha1.MachineClass, secret *corev1.Secret) (machineutils.RetryPeriod, error) { req := &driver.InitializeMachineRequest{ Machine: machine, @@ -608,15 +613,11 @@ func (c *controller) initializeMachine(ctx context.Context, machine *v1alpha1.Ma klog.Errorf("Cannot decode Driver error for machine %q: %s. Unexpected behaviour as Driver errors are expected to be of type status.Status", machine.Name, err) return machineutils.LongRetry, err } - klog.Errorf("Error occurred while initializing VM instance for machine %q: %s", machine.Name, err) - switch errStatus.Code() { - case codes.Unimplemented: + if errStatus.Code() == codes.Unimplemented { klog.V(3).Infof("Provider does not support Driver.InitializeMachine - skipping VM instance initialization for %q.", machine.Name) return 0, nil - case codes.NotFound: - klog.Warningf("No VM instance found for machine %q. Skipping VM instance initialization.", machine.Name) - return 0, nil } + klog.Errorf("Error occurred while initializing VM instance for machine %q: %s", machine.Name, err) updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, machine, @@ -633,14 +634,11 @@ func (c *controller) initializeMachine(ctx context.Context, machine *v1alpha1.Ma }, machine.Status.LastKnownState, ) - if updateErr != nil { return updateRetryPeriod, updateErr } - return machineutils.ShortRetry, err } - klog.V(3).Infof("VM instance %q for machine %q was initialized", resp.ProviderID, machine.Name) return 0, nil } @@ -661,7 +659,7 @@ func (c *controller) triggerDeletionFlow(ctx context.Context, deleteMachineReque return c.setMachineTerminationStatus(ctx, deleteMachineRequest) case strings.Contains(machine.Status.LastOperation.Description, machineutils.GetVMStatus): - return c.getVMStatus( + return c.updateMachineStatusAndNodeLabel( ctx, &driver.GetMachineStatusRequest{ Machine: deleteMachineRequest.Machine, diff --git a/pkg/util/provider/machinecontroller/machine_test.go b/pkg/util/provider/machinecontroller/machine_test.go index 2b87907b7..a658641f4 100644 --- a/pkg/util/provider/machinecontroller/machine_test.go +++ b/pkg/util/provider/machinecontroller/machine_test.go @@ -545,7 +545,7 @@ var _ = Describe("machine", func() { ProviderID: "fakeID", }, }, nil, nil, nil, map[string]string{v1alpha1.NodeLabelKey: "fakeNode-0"}, true, metav1.Now()), - err: fmt.Errorf("Machine creation in process. Machine UPDATE successful"), + err: fmt.Errorf("machine creation in process. Machine initialization (if required) is successful"), retry: machineutils.ShortRetry, }, }), @@ -623,7 +623,7 @@ var _ = Describe("machine", func() { true, metav1.Now(), ), - err: fmt.Errorf("Machine creation in process. Machine/Status UPDATE successful"), + err: fmt.Errorf("machine creation in process. Machine/Status UPDATE successful"), retry: machineutils.ShortRetry, }, }), @@ -1016,7 +1016,7 @@ var _ = Describe("machine", func() { true, metav1.Now(), ), - err: fmt.Errorf("Machine creation in process. Machine/Status UPDATE successful"), + err: fmt.Errorf("machine creation in process. Machine/Status UPDATE successful"), retry: machineutils.ShortRetry, }, }), @@ -1068,6 +1068,7 @@ var _ = Describe("machine", func() { Kind: "MachineClass", Name: "machineClass", }, + ProviderID: "fakeID", }, }, &v1alpha1.MachineStatus{ CurrentStatus: v1alpha1.CurrentStatus{ @@ -1079,7 +1080,7 @@ var _ = Describe("machine", func() { State: v1alpha1.MachineStateFailed, Type: v1alpha1.MachineOperationCreate, }, - }, nil, nil, nil, true, metav1.Now()), + }, nil, nil, map[string]string{v1alpha1.NodeLabelKey: "fakeNode-0"}, true, metav1.Now()), err: status.Error(codes.Uninitialized, "VM instance could not be initialized"), retry: machineutils.ShortRetry, }, @@ -1503,7 +1504,7 @@ var _ = Describe("machine", func() { }, }, expect: expect{ - err: fmt.Errorf("Machine deletion in process. VM with matching ID found"), + err: fmt.Errorf("machine deletion in process. VM with matching ID found"), retry: machineutils.ShortRetry, machine: newMachine( &v1alpha1.MachineTemplateSpec{ diff --git a/pkg/util/provider/machinecontroller/machine_util.go b/pkg/util/provider/machinecontroller/machine_util.go index 0058b6500..7f4886051 100644 --- a/pkg/util/provider/machinecontroller/machine_util.go +++ b/pkg/util/provider/machinecontroller/machine_util.go @@ -944,77 +944,74 @@ func (c *controller) setMachineTerminationStatus(ctx context.Context, deleteMach return machineutils.ShortRetry, err } -// getVMStatus tries to retrive VM status backed by machine -func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *driver.GetMachineStatusRequest) (machineutils.RetryPeriod, error) { +// updateMachineStatusAndNodeLabel tries to update the node name label if it is empty. This is required for drain to happen. +func (c *controller) updateMachineStatusAndNodeLabel(ctx context.Context, getMachineStatusRequest *driver.GetMachineStatusRequest) (machineutils.RetryPeriod, error) { var ( retry machineutils.RetryPeriod description string state v1alpha1.MachineState + err error + nodeName string ) - - statusResp, err := c.driver.GetMachineStatus(ctx, getMachineStatusRequest) - if err == nil { - // VM Found - - // If `node` label is missing on machine obj, then update this label on Machine object with nodeName from status response - nodeName := getMachineStatusRequest.Machine.Labels[v1alpha1.NodeLabelKey] - if nodeName == "" { - err = c.updateMachineNodeLabel(ctx, getMachineStatusRequest.Machine, statusResp.NodeName) - if err != nil { + // Node label is required for drain of node, therefore we try to update machine object before proceeding to drain. + isNodeLabelUpdated := false + //check if node name label is already present in machine object + nodeName = getMachineStatusRequest.Machine.Labels[v1alpha1.NodeLabelKey] + if nodeName != "" { + isNodeLabelUpdated = true + } else { + // Figure out node label either by checking all nodes for label matching machine name or retrieving it using GetMachineStatus + nodeName, err = c.getNodeName(ctx, getMachineStatusRequest) + if err == nil { + if err = c.updateMachineNodeLabel(ctx, getMachineStatusRequest.Machine, nodeName); err != nil { return machineutils.ShortRetry, err } + isNodeLabelUpdated = true + } else { + if machineErr, ok := status.FromError(err); !ok { + // Error occurred with decoding machine error status, aborting without retry. + description = "Error occurred with decoding machine error status while getting VM status, aborting without retry. " + err.Error() + " " + machineutils.GetVMStatus + state = v1alpha1.MachineStateFailed + retry = machineutils.LongRetry + err = fmt.Errorf("machine deletion has failed. %s", description) + } else { + // Decoding machine error code + switch machineErr.Code() { + case codes.Unimplemented: + // GetMachineStatus() call is not implemented + // In this case, try to drain and delete + description = machineutils.InitiateDrain + state = v1alpha1.MachineStateProcessing + retry = machineutils.ShortRetry + case codes.NotFound: + // VM was not found at provider, proceed to initiateDrain to ensure associated orphan resources such as NICs are deleted in the next few steps, before node object is deleted + description = "VM was not found at provider. Moving forward to node drain. " + machineutils.InitiateDrain + state = v1alpha1.MachineStateProcessing + retry = machineutils.ShortRetry + case codes.Unknown, codes.DeadlineExceeded, codes.Aborted, codes.Unavailable: + description = "Error occurred with decoding machine error status while getting VM status, aborting with retry. " + machineutils.GetVMStatus + state = v1alpha1.MachineStateFailed + retry = machineutils.ShortRetry + case codes.Uninitialized: + description = "VM instance was not initialized. Moving forward to node drain. " + machineutils.InitiateDrain + state = v1alpha1.MachineStateProcessing + retry = machineutils.ShortRetry + default: + // Error occurred with decoding machine error status, abort with retry. + description = "Error occurred with decoding machine error status while getting VM status, aborting without retry. machine code: " + err.Error() + " " + machineutils.GetVMStatus + state = v1alpha1.MachineStateFailed + retry = machineutils.MediumRetry + } + } } - + } + if isNodeLabelUpdated { description = machineutils.InitiateDrain state = v1alpha1.MachineStateProcessing retry = machineutils.ShortRetry - // Return error even when machine object is updated to ensure reconcilation is restarted - err = fmt.Errorf("Machine deletion in process. VM with matching ID found") - - } else { - if machineErr, ok := status.FromError(err); !ok { - // Error occurred with decoding machine error status, aborting without retry. - description = "Error occurred with decoding machine error status while getting VM status, aborting without retry. " + err.Error() + " " + machineutils.GetVMStatus - state = v1alpha1.MachineStateFailed - retry = machineutils.LongRetry - - err = fmt.Errorf("Machine deletion has failed. %s", description) - } else { - // Decoding machine error code - switch machineErr.Code() { - - case codes.Unimplemented: - // GetMachineStatus() call is not implemented - // In this case, try to drain and delete - description = machineutils.InitiateDrain - state = v1alpha1.MachineStateProcessing - retry = machineutils.ShortRetry - - case codes.NotFound: - // VM was not found at provder - description = "VM was not found at provider. " + machineutils.InitiateNodeDeletion - state = v1alpha1.MachineStateProcessing - retry = machineutils.ShortRetry - - case codes.Unknown, codes.DeadlineExceeded, codes.Aborted, codes.Unavailable: - description = "Error occurred with decoding machine error status while getting VM status, aborting with retry. " + machineutils.GetVMStatus - state = v1alpha1.MachineStateFailed - retry = machineutils.ShortRetry - case codes.Uninitialized: - description = "VM instance was not initalized. Moving forward to node drain. " + machineutils.InitiateDrain - state = v1alpha1.MachineStateProcessing - retry = machineutils.ShortRetry - - default: - // Error occurred with decoding machine error status, abort with retry. - description = "Error occurred with decoding machine error status while getting VM status, aborting without retry. machine code: " + err.Error() + " " + machineutils.GetVMStatus - state = v1alpha1.MachineStateFailed - retry = machineutils.MediumRetry - } - } + err = fmt.Errorf("machine deletion in process. VM with matching ID found") } - updateRetryPeriod, updateErr := c.machineStatusUpdate( ctx, getMachineStatusRequest.Machine, @@ -1030,19 +1027,12 @@ func (c *controller) getVMStatus(ctx context.Context, getMachineStatusRequest *d getMachineStatusRequest.Machine.Status.CurrentStatus, getMachineStatusRequest.Machine.Status.LastKnownState, ) - if updateErr != nil { return updateRetryPeriod, updateErr } - return retry, err } -// isValidNodeName checks if the nodeName is valid -func isValidNodeName(nodeName string) bool { - return nodeName != "" -} - // isConditionEmpty returns true if passed NodeCondition is empty func isConditionEmpty(condition v1.NodeCondition) bool { return condition == v1.NodeCondition{} @@ -1080,7 +1070,7 @@ func (c *controller) drainNode(ctx context.Context, deleteMachineRequest *driver ReadonlyFilesystem v1.NodeConditionType = "ReadonlyFilesystem" ) - if !isValidNodeName(nodeName) { + if nodeName == "" { message := "Skipping drain as nodeName is not a valid one for machine." printLogInitError(message, &err, &description, machine) skipDrain = true @@ -1730,3 +1720,30 @@ func (c *controller) updateMachineNodeLabel(ctx context.Context, machine *v1alph klog.V(2).Infof("Updated %q label on machine %q to %q", v1alpha1.NodeLabelKey, machine.Name, nodeName) return nil } + +func (c *controller) getNodeName(ctx context.Context, request *driver.GetMachineStatusRequest) (string, error) { + matchingNodeName, err := c.fetchMatchingNodeName(request.Machine.Name) + if err == nil { + return matchingNodeName, nil + } + klog.Errorf("Error trying to get node matching machine %s: %v. Will try to get the node name by calling driver.GetMachineStatus instead.", request.Machine.Name, err) + statusResp, err := c.driver.GetMachineStatus(ctx, request) + if err == nil { + return statusResp.NodeName, nil + } + return "", err +} + +func (c *controller) fetchMatchingNodeName(machineName string) (string, error) { + var nodes []*v1.Node + nodes, err := c.nodeLister.List(labels.Everything()) + if err != nil { + return "", fmt.Errorf("failed to list nodes for machine %q: %w", machineName, err) + } + for _, node := range nodes { + if node.Labels[machineutils.MachineLabelKey] == machineName { + return node.Name, nil + } + } + return "", fmt.Errorf("machine %q not found in node lister for machine %q", machineName, machineName) +}