[Feature] Add a Filter plugin to ensure that non-GPU pods are not sch… #788

Open · wants to merge 1 commit into base: master
129 changes: 129 additions & 0 deletions pkg/gpumanagementfilter/gpu_management_filter.go
@@ -0,0 +1,129 @@
package gpumanagementfilter
Member:

Can we have a KEP document?

Author:

Sure, let me write a KEP doc first.


import (
"context"
"fmt"
"strings"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
// PluginName is the name of the plugin used in the plugin registry and configurations.
PluginName = "GPUManagementFilter"

// preFilterStateKey is the key in CycleState for GPUManagementFilter pre-computed data.
// Using the name of the plugin helps avoid collisions with other plugins.
preFilterStateKey = "PreFilter" + PluginName

// devicePluginContainerImage is the NVIDIA-provided Kubernetes device plugin image.
devicePluginContainerImage = "nvcr.io/nvidia/k8s-device-plugin"

// ResourceNvidiaGPU is the NVIDIA GPU extended resource name.
ResourceNvidiaGPU = "nvidia.com/gpu"
Comment on lines +21 to +25
Contributor:

I think we should move both of these values so they can be changed via a configuration file. Some clusters may also have more than one GPU resource on the node, so it would be great if we could provide a list of GPU resources.

Member:

+1, using a configuration file is better than hard-coding. Maybe we don't need to limit ourselves to GPU resources; can we extend it to other resources? (A sketch of this follows the const block below.)

)
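To make the reviewers' suggestion concrete, here is a minimal sketch of what configurable plugin args could look like. The GPUManagementFilterArgs type, its field names, and the DecodeInto wiring are hypothetical illustrations, not part of this diff; DecodeInto is the upstream helper from k8s.io/kubernetes/pkg/scheduler/framework/runtime, and metav1 is assumed to be imported as in the test file.

// GPUManagementFilterArgs is a hypothetical configuration type for this
// plugin; the field names are illustrative only.
type GPUManagementFilterArgs struct {
	metav1.TypeMeta `json:",inline"`

	// DevicePluginImages lists device-plugin image prefixes whose pods are
	// exempt from this filter.
	DevicePluginImages []string `json:"devicePluginImages,omitempty"`
	// GPUResourceNames lists extended resource names to treat as GPUs.
	GPUResourceNames []string `json:"gpuResourceNames,omitempty"`
}

// New could then decode the args from the scheduler configuration instead
// of relying on the hard-coded constants:
//
//	func New(obj runtime.Object, _ framework.Handle) (framework.Plugin, error) {
//		args := &GPUManagementFilterArgs{}
//		if err := frameworkruntime.DecodeInto(obj, args); err != nil {
//			return nil, err
//		}
//		return &GPUManagementFilter{args: args}, nil
//	}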

type GPUManagementFilter struct {
profileName string
}

var (
_ framework.FilterPlugin = &GPUManagementFilter{}
_ framework.PreFilterPlugin = &GPUManagementFilter{}
)

// Name returns name of the plugin. It is used in logs, etc.
func (p *GPUManagementFilter) Name() string {
return PluginName
}

// preFilterState computed at PreFilter and used in Filter.
type preFilterState struct {
isPodRequestingGPUs bool
isPodNvidiaDevicePlugin bool
}

// Clone implements the StateData interface.
// It does not actually copy the data, as there is no need to do so.
func (s *preFilterState) Clone() framework.StateData {
return s
}

// PreFilter invoked at the prefilter extension point.
func (p *GPUManagementFilter) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
cycleState.Write(preFilterStateKey, updatePreFilterState(pod))
return nil, nil
}

// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (p *GPUManagementFilter) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}

func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
return nil, fmt.Errorf("error reading %q from cycleState: %w", preFilterStateKey, err)
}

s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("%+v cannot be converted to gpumanagementfilter.preFilterState", c)
}
return s, nil
}

// Filter invoked at the filter extension point.
func (p *GPUManagementFilter) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
s, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}

if (!s.isPodRequestingGPUs) && isGPUNode(nodeInfo) && (!s.isPodNvidiaDevicePlugin) {
return framework.NewStatus(framework.Unschedulable, "gpu node, non-gpu pod")
}

return nil
}

// New initializes a new plugin and returns it.
func New(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) {
return &GPUManagementFilter{}, nil
}

// updatePreFilterState records whether the given pod requests NVIDIA GPUs (by checking container resource requests and limits) and whether it is an NVIDIA device plugin pod, and returns the computed PreFilter state.
func updatePreFilterState(pod *v1.Pod) *preFilterState {
result := &preFilterState{}

if containsNvidiaDevicePluginImage(pod.Spec.Containers) {
result.isPodNvidiaDevicePlugin = true
}

if requestsNvidiaGPU(pod.Spec.InitContainers) || requestsNvidiaGPU(pod.Spec.Containers) {
result.isPodRequestingGPUs = true
}
return result
}

// isGPUNode reports whether the given node advertises the NVIDIA GPU resource in its Allocatable.
func isGPUNode(nodeInfo *framework.NodeInfo) bool {
_, gpuAllocatable := nodeInfo.Allocatable.ScalarResources[ResourceNvidiaGPU]
return gpuAllocatable
}

// containsNvidiaDevicePluginImage checks whether any container image in containerList is the NVIDIA device plugin. The device plugin image is the invariant across different DaemonSet specs (only the version changes); labels or annotations might be changed or added by anyone on their pod spec.
func containsNvidiaDevicePluginImage(containerList []v1.Container) bool {
for _, containerItem := range containerList {
if strings.Contains(containerItem.Image, devicePluginContainerImage) {
return true
}
}
return false
}
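For context, a plugin like this is typically compiled into an out-of-tree scheduler binary and registered through the plugin registry. A minimal sketch, assuming the upstream app.WithPlugin helper and an illustrative import path for this package (neither is confirmed by this diff):

package main

import (
	"os"

	"k8s.io/kubernetes/cmd/kube-scheduler/app"

	// Hypothetical import path for the package added in this PR.
	"sigs.k8s.io/scheduler-plugins/pkg/gpumanagementfilter"
)

func main() {
	// Register GPUManagementFilter so a KubeSchedulerConfiguration profile
	// can enable it at the preFilter and filter extension points.
	command := app.NewSchedulerCommand(
		app.WithPlugin(gpumanagementfilter.PluginName, gpumanagementfilter.New),
	)
	if err := command.Execute(); err != nil {
		os.Exit(1)
	}
}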
140 changes: 140 additions & 0 deletions pkg/gpumanagementfilter/gpu_management_filter_test.go
@@ -0,0 +1,140 @@
package gpumanagementfilter

import (
"context"
"testing"

"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/scheduler/framework"
)

func TestGPUManagementFilter(t *testing.T) {
tests := []struct {
msg string
pod *v1.Pod
node *framework.NodeInfo
wantStatus *framework.Status
}{
{
msg: "gpu node, gpu pod",
pod: makeGPUPod("p1", 2),
node: makeGPUNode("n1", "2"),
wantStatus: nil,
},
{
msg: "non-gpu node, gpu pod",
pod: makeGPUPod("p2", 2),
node: makeNonGPUNode("n2"),
wantStatus: nil,
},
{
msg: "gpu node, non-gpu pod",
pod: makeNonGPUPod("p3"),
node: makeGPUNode("n3", "2"),
wantStatus: framework.NewStatus(framework.Unschedulable, "gpu node, non-gpu pod"),
},
{
msg: "non-gpu node, non-gpu pod",
pod: makeNonGPUPod("p4"),
node: makeNonGPUNode("n4"),
wantStatus: nil,
},
{
msg: "gpu node, gpu pod (requesting 0 GPUs)",
pod: makeGPUPod("p5", 0),
node: makeGPUNode("n5", "2"),
wantStatus: framework.NewStatus(framework.Unschedulable, "gpu node, non-gpu pod"),
},
{
msg: "non-gpu node, gpu pod (requesting 0 GPUs)",
pod: makeGPUPod("p6", 0),
node: makeNonGPUNode("n6"),
wantStatus: nil,
},
{
msg: "gpu node, device plugin pod",
pod: makeNonGPUNvidiaDevicePluginPod("p7"),
node: makeGPUNode("n7", "2"),
wantStatus: nil,
},
}
for _, tt := range tests {
t.Run(tt.msg, func(t *testing.T) {
ctx := context.Background()
cycleState := framework.NewCycleState()
plugin := &GPUManagementFilter{
profileName: v1.DefaultSchedulerName,
}
if _, status := plugin.PreFilter(ctx, cycleState, tt.pod); !status.IsSuccess() {
t.Fatal(status.AsError())
}

status := plugin.Filter(ctx, cycleState, tt.pod, tt.node)
assert.Equal(t, tt.wantStatus, status)
})
}
}

func makeGPUPod(name string, limits int64) *v1.Pod {
p := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: name},
Spec: v1.PodSpec{
Containers: []v1.Container{{Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{v1.ResourceName(ResourceNvidiaGPU): *resource.NewQuantity(limits, resource.DecimalSI)},
}}},
},
}
return p
}

func makeGPUNode(node, gpu string, pods ...*v1.Pod) *framework.NodeInfo {
n := framework.NewNodeInfo(pods...)
n.SetNode(&v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: node},
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{
v1.ResourceName(ResourceNvidiaGPU): resource.MustParse(gpu),
},
},
})
return n
}

func makeNonGPUPod(name string) *v1.Pod {
p := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: name},
Spec: v1.PodSpec{
Containers: []v1.Container{{Resources: v1.ResourceRequirements{}}},
},
}
return p
}

func makeNonGPUNvidiaDevicePluginPod(name string) *v1.Pod {
p := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: name},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Image: "nvcr.io/nvidia/k8s-device-plugin:v0.11.0",
Resources: v1.ResourceRequirements{},
},
},
},
}
return p
}

func makeNonGPUNode(node string, pods ...*v1.Pod) *framework.NodeInfo {
n := framework.NewNodeInfo(pods...)
n.SetNode(&v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: node},
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{},
},
})
return n
}
22 changes: 22 additions & 0 deletions pkg/gpumanagementfilter/utils.go
@@ -0,0 +1,22 @@
package gpumanagementfilter

import v1 "k8s.io/api/core/v1"

// requestsNvidiaGPU checks whether any container in the list requests an NVIDIA GPU.
func requestsNvidiaGPU(containerList []v1.Container) bool {
for _, containerItem := range containerList {
if checkNvidiaGPUResources(containerItem.Resources.Requests) || checkNvidiaGPUResources(containerItem.Resources.Limits) {
return true
}
}
return false
}

// checkNvidiaGPUResources reports whether the resource list contains a positive NVIDIA GPU quantity.
func checkNvidiaGPUResources(containerResources v1.ResourceList) bool {
value, isPresent := containerResources[ResourceNvidiaGPU]
if !isPresent {
return false
}
valueInt, _ := value.AsInt64()
return valueInt > 0
}
66 changes: 66 additions & 0 deletions pkg/gpumanagementfilter/utils_test.go
@@ -0,0 +1,66 @@
package gpumanagementfilter

import (
"testing"

"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)

var cpuResourceList = v1.ResourceList{
"cpu": *resource.NewQuantity(5, resource.DecimalSI),
}

var gpuResourceList = v1.ResourceList{
ResourceNvidiaGPU: *resource.NewQuantity(5, resource.DecimalSI),
}

var zeroGpuResourceList = v1.ResourceList{
ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
}

func TestRequestsNvidiaGPU(t *testing.T) {
tests := []struct {
name string
resourceList v1.ResourceList
expected bool
}{
{
name: "containers requesting CPUs only",
resourceList: cpuResourceList,
expected: false,
},
{
name: "containers requesting GPU resource with 0 value",
resourceList: zeroGpuResourceList,
expected: false,
},
{
name: "containers requesting GPUs",
resourceList: gpuResourceList,
expected: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.expected, requestsNvidiaGPU(getContainerList(tt.resourceList)))
})
}
}

func getContainerList(resourceList v1.ResourceList) []v1.Container {
return []v1.Container{
{
Resources: v1.ResourceRequirements{
Limits: resourceList,
},
},
{
Resources: v1.ResourceRequirements{
Limits: cpuResourceList,
},
},
}
}