From 600526121c352c32e91921909bea726e061ed3ab Mon Sep 17 00:00:00 2001
From: Aaron Francis Fernandes <79958509+aaronfern@users.noreply.github.com>
Date: Wed, 13 Dec 2023 16:21:08 +0530
Subject: [PATCH] Add options to `NodeGroupAutoscalingOptions` from
 `machineDeployment` annotations (#257)

* Add node group specific options to NodeGroupAutoscalingOptions from machineDeployment annotations

* Updated func and variable names

* Added unit tests

* Fix unit tests after rebase
---
 .../cloudprovider/mcm/mcm_cloud_provider.go   |  64 +++++++-
 .../mcm/mcm_cloud_provider_test.go            | 140 +++++++++++++++++-
 .../cloudprovider/mcm/mcm_manager.go          |  10 ++
 3 files changed, 210 insertions(+), 4 deletions(-)

diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go
index 73d14e321a61..abf1fc7adad1 100644
--- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go
+++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider.go
@@ -24,6 +24,10 @@ package mcm
 import (
 	"context"
 	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
 	apiv1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
 	"k8s.io/apimachinery/pkg/labels"
@@ -34,7 +38,6 @@ import (
 	"k8s.io/autoscaler/cluster-autoscaler/utils/errors"
 	"k8s.io/klog/v2"
 	schedulerframework "k8s.io/kubernetes/pkg/scheduler/framework"
-	"strings"
 )
 
 const (
@@ -44,6 +47,21 @@ const (
 	// GPULabel is the label added to nodes with GPU resource.
 	// TODO: Align on a GPU Label for Gardener.
 	GPULabel = "gardener.cloud/accelerator"
+
+	// ScaleDownUtilizationThresholdAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownUtilizationThreshold
+	ScaleDownUtilizationThresholdAnnotation = "autoscaler.gardener.cloud/scale-down-utilization-threshold"
+
+	// ScaleDownGpuUtilizationThresholdAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownGpuUtilizationThreshold
+	ScaleDownGpuUtilizationThresholdAnnotation = "autoscaler.gardener.cloud/scale-down-gpu-utilization-threshold"
+
+	// ScaleDownUnneededTimeAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownUnneededTime
+	ScaleDownUnneededTimeAnnotation = "autoscaler.gardener.cloud/scale-down-unneeded-time"
+
+	// ScaleDownUnreadyTimeAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.ScaleDownUnreadyTime
+	ScaleDownUnreadyTimeAnnotation = "autoscaler.gardener.cloud/scale-down-unready-time"
+
+	// MaxNodeProvisionTimeAnnotation is the annotation key for the value of NodeGroupAutoscalingOptions.MaxNodeProvisionTime
+	MaxNodeProvisionTimeAnnotation = "autoscaler.gardener.cloud/max-node-provision-time"
 )
 
 // MCMCloudProvider implements the cloud provider interface for machine-controller-manager
@@ -439,9 +457,49 @@ func (machinedeployment *MachineDeployment) Nodes() ([]cloudprovider.Instance, e
 // GetOptions returns NodeGroupAutoscalingOptions that should be used for this particular
 // NodeGroup. Returning a nil will result in using default options.
 // Implementation optional.
-// TODO: add proper implementation
 func (machinedeployment *MachineDeployment) GetOptions(defaults config.NodeGroupAutoscalingOptions) (*config.NodeGroupAutoscalingOptions, error) {
-	return nil, cloudprovider.ErrNotImplemented
+	mcdAnnotations, err := machinedeployment.mcmManager.GetMachineDeploymentAnnotations(machinedeployment.Name)
+	if err != nil {
+		return nil, err
+	}
+
+	scaleDownUtilThresholdValue := defaults.ScaleDownUtilizationThreshold
+	if _, ok := mcdAnnotations[ScaleDownUtilizationThresholdAnnotation]; ok {
+		if floatVal, err := strconv.ParseFloat(mcdAnnotations[ScaleDownUtilizationThresholdAnnotation], 64); err == nil {
+			scaleDownUtilThresholdValue = floatVal
+		}
+	}
+	scaleDownGPUUtilThresholdValue := defaults.ScaleDownGpuUtilizationThreshold
+	if _, ok := mcdAnnotations[ScaleDownGpuUtilizationThresholdAnnotation]; ok {
+		if floatVal, err := strconv.ParseFloat(mcdAnnotations[ScaleDownGpuUtilizationThresholdAnnotation], 64); err == nil {
+			scaleDownGPUUtilThresholdValue = floatVal
+		}
+	}
+	scaleDownUnneededDuration := defaults.ScaleDownUnneededTime
+	if _, ok := mcdAnnotations[ScaleDownUnneededTimeAnnotation]; ok {
+		if durationVal, err := time.ParseDuration(mcdAnnotations[ScaleDownUnneededTimeAnnotation]); err == nil {
+			scaleDownUnneededDuration = durationVal
+		}
+	}
+	scaleDownUnreadyDuration := defaults.ScaleDownUnreadyTime
+	if _, ok := mcdAnnotations[ScaleDownUnreadyTimeAnnotation]; ok {
+		if durationVal, err := time.ParseDuration(mcdAnnotations[ScaleDownUnreadyTimeAnnotation]); err == nil {
+			scaleDownUnreadyDuration = durationVal
+		}
+	}
+	maxNodeProvisionDuration := defaults.MaxNodeProvisionTime
+	if _, ok := mcdAnnotations[MaxNodeProvisionTimeAnnotation]; ok {
+		if durationVal, err := time.ParseDuration(mcdAnnotations[MaxNodeProvisionTimeAnnotation]); err == nil {
+			maxNodeProvisionDuration = durationVal
+		}
+	}
+	return &config.NodeGroupAutoscalingOptions{
+		ScaleDownUtilizationThreshold:    scaleDownUtilThresholdValue,
+		ScaleDownGpuUtilizationThreshold: scaleDownGPUUtilThresholdValue,
+		ScaleDownUnneededTime:            scaleDownUnneededDuration,
+		ScaleDownUnreadyTime:             scaleDownUnreadyDuration,
+		MaxNodeProvisionTime:             maxNodeProvisionDuration,
+	}, nil
 }
 
 // TemplateNodeInfo returns a node template for this node group.
diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go
index 51d6c0f68d17..ff82a435e773 100644
--- a/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go
+++ b/cluster-autoscaler/cloudprovider/mcm/mcm_cloud_provider_test.go
@@ -20,14 +20,17 @@ import (
 	"context"
 	"errors"
 	"fmt"
-	v1 "k8s.io/api/apps/v1"
 	"math"
 	"strings"
 	"testing"
+	"time"
+
+	v1 "k8s.io/api/apps/v1"
 
 	machinecodes "github.com/gardener/machine-controller-manager/pkg/util/provider/machinecodes/codes"
 	"k8s.io/autoscaler/cluster-autoscaler/cloudprovider"
 	customfake "k8s.io/autoscaler/cluster-autoscaler/cloudprovider/mcm/fakeclient"
+	"k8s.io/autoscaler/cluster-autoscaler/config"
 
 	"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
 	. "github.com/onsi/gomega"
@@ -580,3 +583,138 @@ func TestNodes(t *testing.T) {
 		})
 	}
 }
+
+func TestGetOptions(t *testing.T) {
+	type expect struct {
+		ngOptions *config.NodeGroupAutoscalingOptions
+		err       error
+	}
+	type data struct {
+		name   string
+		setup  setup
+		expect expect
+	}
+	table := []data{
+		{
+			"should throw error if machinedeployment cannot be found",
+			setup{
+				nodeGroups: []string{nodeGroup1},
+			},
+			expect{
+				err: fmt.Errorf("unable to fetch MachineDeployment object machinedeployment-1, Error: machinedeployment.machine.sapcloud.io \"machinedeployment-1\" not found"),
+			},
+		},
+		{
+			"should return default nodegroupautoscalingoptions if none are provided",
+			setup{
+				machineDeployments: newMachineDeployments(1, 2, nil, nil, nil),
+				nodeGroups:         []string{nodeGroup1},
+			},
+			expect{
+				ngOptions: &config.NodeGroupAutoscalingOptions{
+					ScaleDownUtilizationThreshold:    0.5,
+					ScaleDownGpuUtilizationThreshold: 0.5,
+					ScaleDownUnneededTime:            1 * time.Minute,
+					ScaleDownUnreadyTime:             1 * time.Minute,
+					MaxNodeProvisionTime:             1 * time.Minute,
+				},
+				err: nil,
+			},
+		},
+		{
+			"should return nodegroupautoscalingoptions with values from mcd if all annotations are present",
+			setup{
+				machineDeployments: newMachineDeployments(
+					1,
+					2,
+					nil,
+					map[string]string{
+						ScaleDownUtilizationThresholdAnnotation:    "0.7",
+						ScaleDownGpuUtilizationThresholdAnnotation: "0.7",
+						ScaleDownUnneededTimeAnnotation:            "5m",
+						ScaleDownUnreadyTimeAnnotation:             "5m",
+						MaxNodeProvisionTimeAnnotation:             "5m",
+					},
+					nil,
+				),
+				nodeGroups: []string{nodeGroup1},
+			},
+			expect{
+				ngOptions: &config.NodeGroupAutoscalingOptions{
+					ScaleDownUtilizationThreshold:    0.7,
+					ScaleDownGpuUtilizationThreshold: 0.7,
+					ScaleDownUnneededTime:            5 * time.Minute,
+					ScaleDownUnreadyTime:             5 * time.Minute,
+					MaxNodeProvisionTime:             5 * time.Minute,
+				},
+				err: nil,
+			},
+		},
+		{
+			"should return nodegroupautoscalingoptions with annotations values from mcd and remaining defaults",
+			setup{
+				machineDeployments: newMachineDeployments(
+					1,
+					2,
+					nil,
+					map[string]string{
+						ScaleDownUtilizationThresholdAnnotation: "0.7",
+						ScaleDownUnneededTimeAnnotation:         "5m",
+						MaxNodeProvisionTimeAnnotation:          "2m",
+					},
+					nil,
+				),
+				nodeGroups: []string{nodeGroup1},
+			},
+			expect{
+				ngOptions: &config.NodeGroupAutoscalingOptions{
+					ScaleDownUtilizationThreshold:    0.7,
+					ScaleDownGpuUtilizationThreshold: 0.5,
+					ScaleDownUnneededTime:            5 * time.Minute,
+					ScaleDownUnreadyTime:             1 * time.Minute,
+					MaxNodeProvisionTime:             2 * time.Minute,
+				},
+				err: nil,
+			},
+		},
+	}
+
+	for _, entry := range table {
+		entry := entry // have a shallow copy of the entry for parallelization of tests
+		t.Run(entry.name, func(t *testing.T) {
+			t.Parallel()
+			g := NewWithT(t)
+			stop := make(chan struct{})
+			defer close(stop)
+			controlMachineObjects, targetCoreObjects, _ := setupEnv(&entry.setup)
+			m, trackers, hasSyncedCacheFns := createMcmManager(t, stop, testNamespace, nil, controlMachineObjects, targetCoreObjects, nil)
+			defer trackers.Stop()
+			waitForCacheSync(t, stop, hasSyncedCacheFns)
+
+			md, err := buildMachineDeploymentFromSpec(entry.setup.nodeGroups[0], m)
+			g.Expect(err).To(BeNil())
+
+			ngAutoScalingOpDefaults := config.NodeGroupAutoscalingOptions{
+				ScaleDownUtilizationThreshold:    0.5,
+				ScaleDownGpuUtilizationThreshold: 0.5,
+				ScaleDownUnneededTime:            1 * time.Minute,
+				ScaleDownUnreadyTime:             1 * time.Minute,
+				MaxNodeProvisionTime:             1 * time.Minute,
+			}
+
+			options, err := md.GetOptions(ngAutoScalingOpDefaults)
+
+			if entry.expect.err != nil {
+				g.Expect(err).To(Equal(entry.expect.err))
+				g.Expect(options).To(BeNil())
+			} else {
+				g.Expect(err).To(BeNil())
+				g.Expect(*options).To(HaveField("ScaleDownUtilizationThreshold", entry.expect.ngOptions.ScaleDownUtilizationThreshold))
+				g.Expect(*options).To(HaveField("ScaleDownGpuUtilizationThreshold", entry.expect.ngOptions.ScaleDownGpuUtilizationThreshold))
+				g.Expect(*options).To(HaveField("ScaleDownUnneededTime", entry.expect.ngOptions.ScaleDownUnneededTime))
+				g.Expect(*options).To(HaveField("ScaleDownUnreadyTime", entry.expect.ngOptions.ScaleDownUnreadyTime))
+				g.Expect(*options).To(HaveField("MaxNodeProvisionTime", entry.expect.ngOptions.MaxNodeProvisionTime))
+			}
+		})
+	}
+}
diff --git a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
index 2b94c18fbe5c..7f662ac19626 100644
--- a/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
+++ b/cluster-autoscaler/cloudprovider/mcm/mcm_manager.go
@@ -707,6 +707,16 @@ func validateNodeTemplate(nodeTemplateAttributes *v1alpha1.NodeTemplate) error {
 	return nil
 }
 
+// GetMachineDeploymentAnnotations returns the annotations present on the machine deployment for the provided machine deployment name
+func (m *McmManager) GetMachineDeploymentAnnotations(machineDeploymentName string) (map[string]string, error) {
+	md, err := m.machineDeploymentLister.MachineDeployments(m.namespace).Get(machineDeploymentName)
+	if err != nil {
+		return nil, fmt.Errorf("unable to fetch MachineDeployment object %s, Error: %v", machineDeploymentName, err)
+	}
+
+	return md.Annotations, nil
+}
+
 // GetMachineDeploymentNodeTemplate returns the NodeTemplate of a node belonging to the same worker pool as the machinedeployment
 // If no node present then it forms the nodeTemplate using the one present in machineClass
 func (m *McmManager) GetMachineDeploymentNodeTemplate(machinedeployment *MachineDeployment) (*nodeTemplate, error) {