Skip to content

Commit

Permalink
Add retry and background run to node taint removal
Browse files Browse the repository at this point in the history
Signed-off-by: Connor Catlett <[email protected]>
  • Loading branch information
ConnorJC3 committed Dec 12, 2023
1 parent 1aa09de commit 555b028
Showing 1 changed file with 29 additions and 7 deletions.
36 changes: 29 additions & 7 deletions pkg/driver/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"os"
"path/filepath"
"strings"
"time"

csi "github.com/container-storage-interface/spec/lib/go/csi"
"github.com/kubernetes-sigs/aws-ebs-csi-driver/pkg/cloud"
Expand All @@ -33,6 +34,7 @@ import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
k8stypes "k8s.io/apimachinery/pkg/types"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/klog/v2"
"k8s.io/kubernetes/pkg/volume"
)
Expand All @@ -44,7 +46,7 @@ const (
// VolumeOperationAlreadyExists is message fmt returned to CO when there is another in-flight call on the given volumeID
VolumeOperationAlreadyExists = "An operation with the given volume=%q is already in progress"

//sbeDeviceVolumeAttachmentLimit refers to the maximum number of volumes that can be attached to an instance on snow.
// sbeDeviceVolumeAttachmentLimit refers to the maximum number of volumes that can be attached to an instance on snow.
sbeDeviceVolumeAttachmentLimit = 10
)

Expand All @@ -65,6 +67,13 @@ var (
csi.NodeServiceCapability_RPC_EXPAND_VOLUME,
csi.NodeServiceCapability_RPC_GET_VOLUME_STATS,
}

// taintRemovalBackoff is the exponential backoff configuration for node taint removal
taintRemovalBackoff = wait.Backoff{
Duration: 500 * time.Millisecond,
Factor: 2,
Steps: 10, // Max delay = 0.5 * 2^9 = ~4 minutes
}
)

// nodeService represents the node service of CSI driver
Expand Down Expand Up @@ -94,10 +103,7 @@ func newNodeService(driverOptions *DriverOptions) nodeService {

// Remove taint from node to indicate driver startup success
// This is done at the last possible moment to prevent race conditions or false positive removals
err = removeNotReadyTaint(cloud.DefaultKubernetesAPIClient)
if err != nil {
klog.ErrorS(err, "Unexpected failure when attempting to remove node taint(s)")
}
go removeTaintInBackground(cloud.DefaultKubernetesAPIClient)

return nodeService{
metadata: metadata,
Expand Down Expand Up @@ -849,6 +855,22 @@ type JSONPatch struct {
Value interface{} `json:"value"`
}

// removeTaintInBackground is a goroutine that retries removeNotReadyTaint with exponential backoff
func removeTaintInBackground(k8sClient cloud.KubernetesAPIClient) {
backoffErr := wait.ExponentialBackoff(taintRemovalBackoff, func() (bool, error) {
err := removeNotReadyTaint(k8sClient)
if err != nil {
klog.ErrorS(err, "Unexpected failure when attempting to remove node taint(s)")
return false, err
}
return true, nil
})

if backoffErr != nil {
klog.ErrorS(backoffErr, "Retries exhausted, giving up attempting to remove node taint(s)")
}
}

// removeNotReadyTaint removes the taint ebs.csi.aws.com/agent-not-ready from the local node
// This taint can be optionally applied by users to prevent startup race conditions such as
// https://github.com/kubernetes/kubernetes/issues/95911
Expand All @@ -861,8 +883,8 @@ func removeNotReadyTaint(k8sClient cloud.KubernetesAPIClient) error {

clientset, err := k8sClient()
if err != nil {
klog.V(4).InfoS("Failed to communicate with k8s API, skipping taint removal")
return nil //lint:ignore nilerr Failing to communicate with k8s API is a soft failure
klog.V(4).InfoS("Failed to setup k8s client")
return nil //lint:ignore nilerr If there are no k8s credentials, treat that as a soft failure
}

node, err := clientset.CoreV1().Nodes().Get(context.Background(), nodeName, metav1.GetOptions{})
Expand Down

0 comments on commit 555b028

Please sign in to comment.