diff --git a/ray-operator/controllers/ray/common/pod.go b/ray-operator/controllers/ray/common/pod.go index 1994a4f987..8b9e75932c 100644 --- a/ray-operator/controllers/ray/common/pod.go +++ b/ray-operator/controllers/ray/common/pod.go @@ -38,6 +38,7 @@ const ( NeuronCoreRayResourceName = "neuron_cores" TPUContainerResourceName = "google.com/tpu" TPURayResourceName = "TPU" + AliyunGPUContainerResourceName = "aliyun.com/gpu-mem" ) var customAcceleratorToRayResourceMap = map[string]string{ @@ -795,9 +796,22 @@ func addWellKnownAcceleratorResources(rayStartParams map[string]string, resource for _, resourceKeyString := range sortedResourceKeys { resourceValue := resourceLimits[corev1.ResourceName(resourceKeyString)] + // Scan for resource keys that match "aliyun.com/gpu-mem" + if resourceKeyString == AliyunGPUContainerResourceName && !resourceValue.IsZero() { + if existingValue, ok := rayStartParams["num-gpus"]; ok { + existingIntValue, _ := strconv.ParseInt(existingValue, 10, 64) + rayStartParams["num-gpus"] = strconv.FormatInt(existingIntValue+resourceValue.Value(), 10) + } else { + rayStartParams["num-gpus"] = strconv.FormatInt(resourceValue.Value(), 10) + } + } + // Scan for resource keys ending with "gpu" like "nvidia.com/gpu" - if _, ok := rayStartParams["num-gpus"]; !ok { - if strings.HasSuffix(resourceKeyString, "gpu") && !resourceValue.IsZero() { + if strings.HasSuffix(resourceKeyString, "gpu") && !resourceValue.IsZero() { + if existingValue, ok := rayStartParams["num-gpus"]; ok { + existingIntValue, _ := strconv.ParseInt(existingValue, 10, 64) + rayStartParams["num-gpus"] = strconv.FormatInt(existingIntValue+resourceValue.Value(), 10) + } else { rayStartParams["num-gpus"] = strconv.FormatInt(resourceValue.Value(), 10) } } diff --git a/ray-operator/controllers/ray/common/pod_test.go b/ray-operator/controllers/ray/common/pod_test.go index 6384b8405f..e1f4973a15 100644 --- a/ray-operator/controllers/ray/common/pod_test.go +++ b/ray-operator/controllers/ray/common/pod_test.go @@ -1244,6 +1244,17 @@ func TestGenerateRayStartCommand(t *testing.T) { }, expected: `ray start --resources='{"TPU":4}' `, }, + { + name: "WorkerNode with Aliyun's GPU Share", + nodeType: rayv1.WorkerNode, + rayStartParams: map[string]string{}, + resource: corev1.ResourceRequirements{ + Limits: corev1.ResourceList{ + "aliyun.com/gpu-mem": resource.MustParse("4"), + }, + }, + expected: `ray start --num-gpus=4 `, + }, { name: "HeadNode with Neuron Cores", nodeType: rayv1.HeadNode, @@ -1261,11 +1272,12 @@ func TestGenerateRayStartCommand(t *testing.T) { rayStartParams: map[string]string{}, resource: corev1.ResourceRequirements{ Limits: corev1.ResourceList{ + "aliyun.com/gpu-mem": resource.MustParse("3"), "aws.amazon.com/neuroncore": resource.MustParse("4"), "nvidia.com/gpu": resource.MustParse("1"), }, }, - expected: `ray start --head --num-gpus=1 --resources='{"neuron_cores":4}' `, + expected: `ray start --head --num-gpus=4 --resources='{"neuron_cores":4}' `, }, { name: "HeadNode with multiple custom accelerators", @@ -1273,12 +1285,13 @@ func TestGenerateRayStartCommand(t *testing.T) { rayStartParams: map[string]string{}, resource: corev1.ResourceRequirements{ Limits: corev1.ResourceList{ + "aliyun.com/gpu-mem": resource.MustParse("4"), "google.com/tpu": resource.MustParse("8"), "aws.amazon.com/neuroncore": resource.MustParse("4"), "nvidia.com/gpu": resource.MustParse("1"), }, }, - expected: `ray start --head --num-gpus=1 --resources='{"neuron_cores":4}' `, + expected: `ray start --head --num-gpus=5 --resources='{"neuron_cores":4}' `, }, { name: "HeadNode with existing resources",