[Feature] Add a Filter plugin to ensure that non-GPU pods are not sch… #788

Open · wants to merge 1 commit into base: master
129 changes: 129 additions & 0 deletions pkg/gpumanagementfilter/gpu_management_filter.go
@@ -0,0 +1,129 @@
package gpumanagementfilter
Member:

Can we have a KEP document?

Author:

Sure, let me write a KEP doc first.


import (
"context"
"fmt"
"strings"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/kubernetes/pkg/scheduler/framework"
)

const (
// PluginName is the name of the plugin used in the plugin registry and configurations.
PluginName = "GPUManagementFilter"

// preFilterStateKey is the key in CycleState for GPUManagementFilter pre-computed data.
// Using the name of the plugin helps avoid collisions with other plugins.
preFilterStateKey = "PreFilter" + PluginName

// devicePluginContainerImage is the NVIDIA-provided Kubernetes device plugin image.
devicePluginContainerImage = "nvcr.io/nvidia/k8s-device-plugin"

// ResourceNvidiaGPU is the NVIDIA GPU extended resource name.
ResourceNvidiaGPU = "nvidia.com/gpu"
Comment on lines +21 to +25
Contributor:

I think we should move both of these values so they can be changed via a configuration file. Some clusters may also have more than one GPU resource on the node, so it would be great if we could provide a list of GPU resources.

Member:

+1, using a configuration file is better than hard-coding. Maybe we don't need to limit ourselves to GPU resources; can we extend it to other resources? (A sketch of this follows the const block below.)

)
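To make the reviewers' suggestion concrete, here is a minimal sketch of what configurable plugin args could look like. The GPUManagementFilterArgs type, its field names, and the DecodeInto wiring are hypothetical illustrations, not part of this diff; DecodeInto is the upstream helper from k8s.io/kubernetes/pkg/scheduler/framework/runtime, and metav1 is assumed to be imported as in the test file.

// GPUManagementFilterArgs is a hypothetical configuration type for this
// plugin; the field names are illustrative only.
type GPUManagementFilterArgs struct {
	metav1.TypeMeta `json:",inline"`

	// DevicePluginImages lists device-plugin image prefixes whose pods are
	// exempt from this filter.
	DevicePluginImages []string `json:"devicePluginImages,omitempty"`
	// GPUResourceNames lists extended resource names to treat as GPUs.
	GPUResourceNames []string `json:"gpuResourceNames,omitempty"`
}

// New could then decode the args from the scheduler configuration instead
// of relying on the hard-coded constants:
//
//	func New(obj runtime.Object, _ framework.Handle) (framework.Plugin, error) {
//		args := &GPUManagementFilterArgs{}
//		if err := frameworkruntime.DecodeInto(obj, args); err != nil {
//			return nil, err
//		}
//		return &GPUManagementFilter{args: args}, nil
//	}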

type GPUManagementFilter struct {
profileName string
}

var (
_ framework.FilterPlugin = &GPUManagementFilter{}
_ framework.PreFilterPlugin = &GPUManagementFilter{}
)

// Name returns name of the plugin. It is used in logs, etc.
func (p *GPUManagementFilter) Name() string {
return PluginName
}

// preFilterState computed at PreFilter and used in Filter.
type preFilterState struct {
isPodRequestingGPUs bool
isPodNvidiaDevicePlugin bool
}

// Clone implements the StateData interface.
// It does not actually copy the data, as there is no need to do so.
func (s *preFilterState) Clone() framework.StateData {
return s
}

// PreFilter invoked at the prefilter extension point.
func (p *GPUManagementFilter) PreFilter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod) (*framework.PreFilterResult, *framework.Status) {
cycleState.Write(preFilterStateKey, updatePreFilterState(pod))
return nil, nil
}

// PreFilterExtensions returns prefilter extensions, pod add and remove.
func (p *GPUManagementFilter) PreFilterExtensions() framework.PreFilterExtensions {
return nil
}

func getPreFilterState(cycleState *framework.CycleState) (*preFilterState, error) {
c, err := cycleState.Read(preFilterStateKey)
if err != nil {
return nil, fmt.Errorf("error reading %q from cycleState: %w", preFilterStateKey, err)
}

s, ok := c.(*preFilterState)
if !ok {
return nil, fmt.Errorf("%+v cannot be converted to gpumanagementfilter.preFilterState", c)
}
return s, nil
}

// Filter invoked at the filter extension point.
func (p *GPUManagementFilter) Filter(ctx context.Context, cycleState *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
s, err := getPreFilterState(cycleState)
if err != nil {
return framework.AsStatus(err)
}

if (!s.isPodRequestingGPUs) && isGPUNode(nodeInfo) && (!s.isPodNvidiaDevicePlugin) {
return framework.NewStatus(framework.Unschedulable, "gpu node, non-gpu pod")
}

return nil
}

// New initializes a new plugin and returns it.
func New(_ runtime.Object, _ framework.Handle) (framework.Plugin, error) {
return &GPUManagementFilter{}, nil
}

// updatePreFilterState records whether the given pod requests NVIDIA GPUs (by checking container resource requests and limits) and whether it is an NVIDIA device plugin pod, and returns the computed PreFilter state.
func updatePreFilterState(pod *v1.Pod) *preFilterState {
result := &preFilterState{}

if containsNvidiaDevicePluginImage(pod.Spec.Containers) {
result.isPodNvidiaDevicePlugin = true
}

if requestsNvidiaGPU(pod.Spec.InitContainers) || requestsNvidiaGPU(pod.Spec.Containers) {
result.isPodRequestingGPUs = true
}
return result
}

// isGPUNode reports whether the given node advertises the NVIDIA GPU resource in its Allocatable.
func isGPUNode(nodeInfo *framework.NodeInfo) bool {
_, gpuAllocatable := nodeInfo.Allocatable.ScalarResources[ResourceNvidiaGPU]
return gpuAllocatable
}

// containsNvidiaDevicePluginImage checks whether any container image in containerList is the NVIDIA device plugin. The device plugin image is the invariant across different DaemonSet specs (only the version changes); labels or annotations might be changed or added by anyone on their pod spec.
func containsNvidiaDevicePluginImage(containerList []v1.Container) bool {
for _, containerItem := range containerList {
if strings.Contains(containerItem.Image, devicePluginContainerImage) {
return true
}
}
return false
}
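For context, a plugin like this is typically compiled into an out-of-tree scheduler binary and registered through the plugin registry. A minimal sketch, assuming the upstream app.WithPlugin helper and an illustrative import path for this package (neither is confirmed by this diff):

package main

import (
	"os"

	"k8s.io/kubernetes/cmd/kube-scheduler/app"

	// Hypothetical import path for the package added in this PR.
	"sigs.k8s.io/scheduler-plugins/pkg/gpumanagementfilter"
)

func main() {
	// Register GPUManagementFilter so a KubeSchedulerConfiguration profile
	// can enable it at the preFilter and filter extension points.
	command := app.NewSchedulerCommand(
		app.WithPlugin(gpumanagementfilter.PluginName, gpumanagementfilter.New),
	)
	if err := command.Execute(); err != nil {
		os.Exit(1)
	}
}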
140 changes: 140 additions & 0 deletions pkg/gpumanagementfilter/gpu_management_filter_test.go
@@ -0,0 +1,140 @@
package gpumanagementfilter

import (
"context"
"testing"

"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/kubernetes/pkg/scheduler/framework"
)

func TestGPUManagementFilter(t *testing.T) {
tests := []struct {
msg string
pod *v1.Pod
node *framework.NodeInfo
wantStatus *framework.Status
}{
{
msg: "gpu node, gpu pod",
pod: makeGPUPod("p1", 2),
node: makeGPUNode("n1", "2"),
wantStatus: nil,
},
{
msg: "non-gpu node, gpu pod",
pod: makeGPUPod("p2", 2),
node: makeNonGPUNode("n2"),
wantStatus: nil,
},
{
msg: "gpu node, non-gpu pod",
pod: makeNonGPUPod("p3"),
node: makeGPUNode("n3", "2"),
wantStatus: framework.NewStatus(framework.Unschedulable, "gpu node, non-gpu pod"),
},
{
msg: "non-gpu node, non-gpu pod",
pod: makeNonGPUPod("p4"),
node: makeNonGPUNode("n4"),
wantStatus: nil,
},
{
msg: "gpu node, gpu pod (requesting 0 GPUs)",
pod: makeGPUPod("p5", 0),
node: makeGPUNode("n5", "2"),
wantStatus: framework.NewStatus(framework.Unschedulable, "gpu node, non-gpu pod"),
},
{
msg: "non-gpu node, gpu pod (requesting 0 GPUs)",
pod: makeGPUPod("p6", 0),
node: makeNonGPUNode("n6"),
wantStatus: nil,
},
{
msg: "gpu node, device plugin pod",
pod: makeNonGPUNvidiaDevicePluginPod("p7"),
node: makeGPUNode("n7", "2"),
wantStatus: nil,
},
}
for _, tt := range tests {
t.Run(tt.msg, func(t *testing.T) {
ctx := context.Background()
cycleState := framework.NewCycleState()
plugin := &GPUManagementFilter{
profileName: v1.DefaultSchedulerName,
}
if _, status := plugin.PreFilter(ctx, cycleState, tt.pod); !status.IsSuccess() {
t.Fatal(status.AsError())
}

status := plugin.Filter(ctx, cycleState, tt.pod, tt.node)
assert.Equal(t, tt.wantStatus, status)
})
}
}

func makeGPUPod(name string, limits int64) *v1.Pod {
p := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: name},
Spec: v1.PodSpec{
Containers: []v1.Container{{Resources: v1.ResourceRequirements{
Limits: v1.ResourceList{v1.ResourceName(ResourceNvidiaGPU): *resource.NewQuantity(limits, resource.DecimalSI)},
}}},
},
}
return p
}

func makeGPUNode(node, gpu string, pods ...*v1.Pod) *framework.NodeInfo {
n := framework.NewNodeInfo(pods...)
n.SetNode(&v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: node},
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{
v1.ResourceName(ResourceNvidiaGPU): resource.MustParse(gpu),
},
},
})
return n
}

func makeNonGPUPod(name string) *v1.Pod {
p := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: name},
Spec: v1.PodSpec{
Containers: []v1.Container{{Resources: v1.ResourceRequirements{}}},
},
}
return p
}

func makeNonGPUNvidiaDevicePluginPod(name string) *v1.Pod {
p := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: name},
Spec: v1.PodSpec{
Containers: []v1.Container{
{
Image: "nvcr.io/nvidia/k8s-device-plugin:v0.11.0",
Resources: v1.ResourceRequirements{},
},
},
},
}
return p
}

func makeNonGPUNode(node string, pods ...*v1.Pod) *framework.NodeInfo {
n := framework.NewNodeInfo(pods...)
n.SetNode(&v1.Node{
ObjectMeta: metav1.ObjectMeta{Name: node},
Status: v1.NodeStatus{
Allocatable: v1.ResourceList{},
},
})
return n
}
22 changes: 22 additions & 0 deletions pkg/gpumanagementfilter/utils.go
@@ -0,0 +1,22 @@
package gpumanagementfilter

import v1 "k8s.io/api/core/v1"

// requestsNvidiaGPU checks whether any container in the list requests an NVIDIA GPU.
func requestsNvidiaGPU(containerList []v1.Container) bool {
for _, containerItem := range containerList {
if checkNvidiaGPUResources(containerItem.Resources.Requests) || checkNvidiaGPUResources(containerItem.Resources.Limits) {
return true
}
}
return false
}

// checkNvidiaGPUResources reports whether the resource list contains a positive NVIDIA GPU quantity.
func checkNvidiaGPUResources(containerResources v1.ResourceList) bool {
value, isPresent := containerResources[ResourceNvidiaGPU]
if !isPresent {
return false
}
valueInt, _ := value.AsInt64()
return valueInt > 0
}
66 changes: 66 additions & 0 deletions pkg/gpumanagementfilter/utils_test.go
@@ -0,0 +1,66 @@
package gpumanagementfilter

import (
"testing"

"github.com/stretchr/testify/assert"
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
)

var cpuResourceList = v1.ResourceList{
"cpu": *resource.NewQuantity(5, resource.DecimalSI),
}

var gpuResourceList = v1.ResourceList{
ResourceNvidiaGPU: *resource.NewQuantity(5, resource.DecimalSI),
}

var zeroGpuResourceList = v1.ResourceList{
ResourceNvidiaGPU: *resource.NewQuantity(0, resource.DecimalSI),
}

func TestRequestsNvidiaGPU(t *testing.T) {
tests := []struct {
name string
resourceList v1.ResourceList
expected bool
}{
{
name: "containers requesting CPUs only",
resourceList: cpuResourceList,
expected: false,
},
{
name: "containers requesting GPU resource with 0 value",
resourceList: zeroGpuResourceList,
expected: false,
},
{
name: "containers requesting GPUs",
resourceList: gpuResourceList,
expected: true,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
assert.Equal(t, tt.expected, requestsNvidiaGPU(getContainerList(tt.resourceList)))
})
}
}

func getContainerList(resourceList v1.ResourceList) []v1.Container {
return []v1.Container{
{
Resources: v1.ResourceRequirements{
Limits: resourceList,
},
},
{
Resources: v1.ResourceRequirements{
Limits: cpuResourceList,
},
},
}
}