From 600526121c352c32e91921909bea726e061ed3ab Mon Sep 17 00:00:00 2001 From: Aaron Francis Fernandes <79958509+aaronfern@users.noreply.github.com> Date: Wed, 13 Dec 2023 16:21:08 +0530 Subject: [PATCH] Add options to `NodeGroupAutoscalingOptions` from `machineDeployment` annotations (#257) * Add node group specific options to NodeGroupAutoscalingOptions from machineDeployment annotations * Updated func and variable names * Added unit tests * Fix unit tests after rebase --- .../cloudprovider/mcm/mcm_cloud_provider.go | 64 +++++++- .../mcm/mcm_cloud_provider_test.go | 140 +++++++++++++++++- .../cloudprovider/mcm/mcm_manager.go | 10 ++ 3 files changed, 210 insertions(+), 4 deletions(-) diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go index 73d14e321a61..abf1fc7adad1 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go @@ -24,6 +24,10 @@ package mcm import ( "context" "fmt" + "strconv" + "strings" + "time" + apiv1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/labels" @@ -34,7 +38,6 @@ import ( "k8s.io/autoscaler/cluster-autoscaler/utils/errors" "k8s.io/klog/v2" schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework" - "strings" ) const ( @@ -44,6 +47,21 @@ const ( // GPULabel is the label added to nodes with GPU resource. // TODO: Align on a GPU Label for Gardener. GPULabel = "gardener.cloud/accelerator" + + // ScaleDownUtilizationThresholdAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownUtilizationThreshold + ScaleDownUtilizationThresholdAnnotation = "autoscaler.gardener.cloud/scale-down-utilization-threshold" + + // ScaleDownGpuUtilizationThresholdAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownGpuUtilizationThreshold + ScaleDownGpuUtilizationThresholdAnnotation = "autoscaler.gardener.cloud/scale-down-gpu-utilization-threshold" + + // ScaleDownUnneededTimeAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownUnneededTime + ScaleDownUnneededTimeAnnotation = "autoscaler.gardener.cloud/scale-down-unneeded-time" + + // ScaleDownUnreadyTimeAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownUnreadyTime + ScaleDownUnreadyTimeAnnotation = "autoscaler.gardener.cloud/scale-down-unready-time" + + // MaxNodeProvisionTimeAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.MaxNodeProvisionTime + MaxNodeProvisionTimeAnnotation = "autoscaler.gardener.cloud/max-node-provision-time" ) // MCMCloudProvider implements the cloud provider interface for machine-controller-manager @@ -439,9 +457,49 @@ func (machinedeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, e // GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular // NodeGroup. Returning a nil will result in using default options. // Implementation optional. -// TODO: add proper implementation func (machinedeployment *MachineDeployment) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) { - return nil, cloudprovider.ErrNotImplemented + mcdAnnotations, err := machinedeployment.mcmManager.GetMachineDeploymentAnnotations(machinedeployment.Name) + if err != nil { + return nil, err + } + + scaleDownUtilThresholdValue := defaults.ScaleDownUtilizationThreshold + if _, ok := mcdAnnotations[ScaleDownUtilizationThresholdAnnotation]; ok { + if floatVal, err := strconv.ParseFloat(mcdAnnotations[ScaleDownUtilizationThresholdAnnotation], 64); err == nil { + scaleDownUtilThresholdValue = floatVal + } + } + scaleDownGPUUtilThresholdValue := defaults.ScaleDownGpuUtilizationThreshold + if _, ok := mcdAnnotations[ScaleDownGpuUtilizationThresholdAnnotation]; ok { + if floatVal, err := strconv.ParseFloat(mcdAnnotations[ScaleDownGpuUtilizationThresholdAnnotation], 64); err == nil { + scaleDownGPUUtilThresholdValue = floatVal + } + } + scaleDownUnneededDuration := defaults.ScaleDownUnneededTime + if _, ok := mcdAnnotations[ScaleDownUnneededTimeAnnotation]; ok { + if durationVal, err := time.ParseDuration(mcdAnnotations[ScaleDownUnneededTimeAnnotation]); err == nil { + scaleDownUnneededDuration = durationVal + } + } + scaleDownUnreadyDuration := defaults.ScaleDownUnreadyTime + if _, ok := mcdAnnotations[ScaleDownUnreadyTimeAnnotation]; ok { + if durationVal, err := time.ParseDuration(mcdAnnotations[ScaleDownUnreadyTimeAnnotation]); err == nil { + scaleDownUnreadyDuration = durationVal + } + } + maxNodeProvisionDuration := defaults.MaxNodeProvisionTime + if _, ok := mcdAnnotations[MaxNodeProvisionTimeAnnotation]; ok { + if durationVal, err := time.ParseDuration(mcdAnnotations[MaxNodeProvisionTimeAnnotation]); err == nil { + maxNodeProvisionDuration = durationVal + } + } + return &config.NodeGroupAutoscalingOptions{ + ScaleDownUtilizationThreshold: scaleDownUtilThresholdValue, + ScaleDownGpuUtilizationThreshold: scaleDownGPUUtilThresholdValue, + ScaleDownUnneededTime: scaleDownUnneededDuration, + ScaleDownUnreadyTime: scaleDownUnreadyDuration, + MaxNodeProvisionTime: maxNodeProvisionDuration, + }, nil } // TemplateNodeInfo returns a node template for this node group. diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go index 51d6c0f68d17..ff82a435e773 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go @@ -20,14 +20,17 @@ import ( "context" "errors" "fmt" - v1 "k8s.io/api/apps/v1" "math" "strings" "testing" + "time" + + v1 "k8s.io/api/apps/v1" machinecodes "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes" "k8s.io/autoscaler/cluster-autoscaler/cloudprovider" customfake "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mcm/fakeclient" + "k8s.io/autoscaler/cluster-autoscaler/config" "github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1" . "github.com/onsi/gomega" @@ -580,3 +583,138 @@ func TestNodes(t *testing.T) { }) } } + +func TestGetOptions(t *testing.T) { + type expect struct { + ngOptions *config.NodeGroupAutoscalingOptions + err error + } + type data struct { + name string + setup setup + expect expect + } + table := []data{ + { + "should throw error if machinedeployment cannot be found", + setup{ + nodeGroups: []string{nodeGroup1}, + }, + expect{ + err: fmt.Errorf("unable to fetch MachineDeployment object machinedeployment-1, Error: machinedeployment.machine.sapcloud.io \"machinedeployment-1\" not found"), + }, + }, + { + "should return default nodegroupautoscalingoptions if none are provided", + setup{ + machineDeployments: newMachineDeployments(1, 2, nil, nil, nil), + nodeGroups: []string{nodeGroup1}, + }, + expect{ + ngOptions: &config.NodeGroupAutoscalingOptions{ + ScaleDownUtilizationThreshold: 0.5, + ScaleDownGpuUtilizationThreshold: 0.5, + ScaleDownUnneededTime: 1 * time.Minute, + ScaleDownUnreadyTime: 1 * time.Minute, + MaxNodeProvisionTime: 1 * time.Minute, + }, + err: nil, + }, + }, + { + "should return nodegroupautoscalingoptions with values from mcd if all annotations are present", + setup{ + machineDeployments: newMachineDeployments( + 1, + 2, + nil, + map[string]string{ + ScaleDownUtilizationThresholdAnnotation: "0.7", + ScaleDownGpuUtilizationThresholdAnnotation: "0.7", + ScaleDownUnneededTimeAnnotation: "5m", + ScaleDownUnreadyTimeAnnotation: "5m", + MaxNodeProvisionTimeAnnotation: "5m", + }, + nil, + ), + nodeGroups: []string{nodeGroup1}, + }, + expect{ + ngOptions: &config.NodeGroupAutoscalingOptions{ + ScaleDownUtilizationThreshold: 0.7, + ScaleDownGpuUtilizationThreshold: 0.7, + ScaleDownUnneededTime: 5 * time.Minute, + ScaleDownUnreadyTime: 5 * time.Minute, + MaxNodeProvisionTime: 5 * time.Minute, + }, + err: nil, + }, + }, + { + "should return nodegroupautoscalingoptions with annotations values from mcd and remaining defaults", + setup{ + machineDeployments: newMachineDeployments( + 1, + 2, + nil, + map[string]string{ + ScaleDownUtilizationThresholdAnnotation: "0.7", + ScaleDownUnneededTimeAnnotation: "5m", + MaxNodeProvisionTimeAnnotation: "2m", + }, + nil, + ), + nodeGroups: []string{nodeGroup1}, + }, + expect{ + ngOptions: &config.NodeGroupAutoscalingOptions{ + ScaleDownUtilizationThreshold: 0.7, + ScaleDownGpuUtilizationThreshold: 0.5, + ScaleDownUnneededTime: 5 * time.Minute, + ScaleDownUnreadyTime: 1 * time.Minute, + MaxNodeProvisionTime: 2 * time.Minute, + }, + err: nil, + }, + }, + } + + for _, entry := range table { + entry := entry // have a shallow copy of the entry for parallelization of tests + t.Run(entry.name, func(t *testing.T) { + t.Parallel() + g := NewWithT(t) + stop := make(chan struct{}) + defer close(stop) + controlMachineObjects, targetCoreObjects, _ := setupEnv(&entry.setup) + m, trackers, hasSyncedCacheFns := createMcmManager(t, stop, testNamespace, nil, controlMachineObjects, targetCoreObjects, nil) + defer trackers.Stop() + waitForCacheSync(t, stop, hasSyncedCacheFns) + + md, err := buildMachineDeploymentFromSpec(entry.setup.nodeGroups[0], m) + g.Expect(err).To(BeNil()) + + ngAutoScalingOpDefaults := config.NodeGroupAutoscalingOptions{ + ScaleDownUtilizationThreshold: 0.5, + ScaleDownGpuUtilizationThreshold: 0.5, + ScaleDownUnneededTime: 1 * time.Minute, + ScaleDownUnreadyTime: 1 * time.Minute, + MaxNodeProvisionTime: 1 * time.Minute, + } + + options, err := md.GetOptions(ngAutoScalingOpDefaults) + + if entry.expect.err != nil { + g.Expect(err).To(Equal(entry.expect.err)) + g.Expect(options).To(BeNil()) + } else { + g.Expect(err).To(BeNil()) + g.Expect(*options).To(HaveField("ScaleDownUtilizationThreshold", entry.expect.ngOptions.ScaleDownUtilizationThreshold)) + g.Expect(*options).To(HaveField("ScaleDownGpuUtilizationThreshold", entry.expect.ngOptions.ScaleDownGpuUtilizationThreshold)) + g.Expect(*options).To(HaveField("ScaleDownUnneededTime", entry.expect.ngOptions.ScaleDownUnneededTime)) + g.Expect(*options).To(HaveField("ScaleDownUnreadyTime", entry.expect.ngOptions.ScaleDownUnreadyTime)) + g.Expect(*options).To(HaveField("MaxNodeProvisionTime", entry.expect.ngOptions.MaxNodeProvisionTime)) + } + }) + } +} diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go index 2b94c18fbe5c..7f662ac19626 100644 --- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go +++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go @@ -707,6 +707,16 @@ func validateNodeTemplate(nodeTemplateAttributes *v1alpha1.NodeTemplate) error { return nil } +// GetMachineDeploymentAnnotations returns the annotations present on the machine deployment for the provided machine deployment name +func (m *McmManager) GetMachineDeploymentAnnotations(machineDeploymentName string) (map[string]string, error) { + md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(machineDeploymentName) + if err != nil { + return nil, fmt.Errorf("unable to fetch MachineDeployment object %s, Error: %v", machineDeploymentName, err) + } + + return md.Annotations, nil +} + // GetMachineDeploymentNodeTemplate returns the NodeTemplate of a node belonging to the same worker pool as the machinedeployment // If no node present then it forms the nodeTemplate using the one present in machineClass func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *MachineDeployment) (*nodeTemplate, error) {