Skip to content

Commit

Permalink
[Test][Autoscaler][2/n] Add Ray Autoscaler e2e tests for GPU workers
Browse files Browse the repository at this point in the history
Signed-off-by: Rueian <[email protected]>
  • Loading branch information
rueian committed Jun 6, 2024
1 parent c147ad2 commit ec9c0a5
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 2 deletions.
10 changes: 8 additions & 2 deletions ray-operator/test/e2e/create_detached_actor.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,17 @@
# Test helper script: creates a detached Ray actor whose name and resource
# requests are supplied on the command line.
# NOTE(review): this text comes from a rendered diff without +/- markers, so
# pre-change and post-change lines appear side by side and indentation has
# been lost -- confirm against the real file before running it.
import ray
import sys
import argparse

# Command-line interface: a positional actor name plus optional CPU/GPU
# resource requests (parsed as floats, defaulting to 1 CPU and 0 GPUs).
parser = argparse.ArgumentParser()
parser.add_argument('name')
parser.add_argument('--num-cpus', type=float, default=1)
parser.add_argument('--num-gpus', type=float, default=0)
args = parser.parse_args()

# NOTE(review): presumably the pre-change decorator, superseded by the one
# directly below -- confirm which of the two belongs in the file.
@ray.remote(num_cpus=1)
# The actor's CPU/GPU requirements are taken from the parsed arguments.
@ray.remote(num_cpus=args.num_cpus, num_gpus=args.num_gpus)
class Actor:
pass


ray.init(namespace="default_namespace")
# NOTE(review): presumably the pre-change call, which read the actor name
# straight from sys.argv -- confirm it is not meant to run alongside the next line.
Actor.options(name=sys.argv[1], lifetime="detached").remote()
# Instantiate the actor under the name given on the command line, with
# lifetime="detached".
Actor.options(name=args.name, lifetime="detached").remote()
20 changes: 20 additions & 0 deletions ray-operator/test/e2e/raycluster_autoscaler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ func TestRayClusterAutoscaler(t *testing.T) {
WithMaxReplicas(3).
WithGroupName("small-group").
WithRayStartParams(map[string]string{"num-cpus": "1"}).
WithTemplate(workerPodTemplateApplyConfiguration())).
WithWorkerGroupSpecs(rayv1ac.WorkerGroupSpec().
WithReplicas(0).
WithMinReplicas(0).
WithMaxReplicas(3).
WithGroupName("gpu-group").
WithRayStartParams(map[string]string{"num-cpus": "1", "num-gpus": "1"}).
WithTemplate(workerPodTemplateApplyConfiguration()))
rayClusterAC := rayv1ac.RayCluster("ray-cluster", namespace.Name).
WithSpec(apply(rayClusterSpecAC, mountConfigMap[rayv1ac.RayClusterSpecApplyConfiguration](scripts, "/home/ray/test_scripts")))
Expand Down Expand Up @@ -73,5 +80,18 @@ func TestRayClusterAutoscaler(t *testing.T) {
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "actor2"})
test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(0))))

// Create a detached gpu actor, and a worker in the "gpu-group" should be created.
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/create_detached_actor.py", "gpu_actor", "--num-gpus=1"})
test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(1))))
// We don't use real GPU resources of Kubernetes here, therefore we can't test the RayClusterDesiredGPU.
// We test the Pods count of the "gpu-group" instead.
test.Expect(GetGroupPods(test, rayCluster, "gpu-group")).To(HaveLen(1))

// Terminate the gpu detached actor, and the worker should be deleted.
ExecPodCmd(test, headPod, common.RayHeadContainer, []string{"python", "/home/ray/test_scripts/terminate_detached_actor.py", "gpu_actor"})
test.Eventually(RayCluster(test, rayCluster.Namespace, rayCluster.Name), TestTimeoutMedium).
Should(WithTransform(RayClusterDesiredWorkerReplicas, Equal(int32(0))))
})
}
10 changes: 10 additions & 0 deletions ray-operator/test/support/ray.go
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,13 @@ func GetHeadPod(t Test, rayCluster *rayv1.RayCluster) *corev1.Pod {
t.Expect(len(pods.Items)).To(gomega.Equal(1))
return &pods.Items[0]
}

// GetGroupPods lists the Pods in rayCluster's namespace using the list options
// built by common.RayClusterGroupPodsAssociationOptions for the given group
// (presumably selecting the Pods that belong to that worker group; verify
// against the helper), and returns the resulting items.
//
// The test expectation fails if the List call returns an error.
func GetGroupPods(t Test, rayCluster *rayv1.RayCluster, group string) []corev1.Pod {
	t.T().Helper()

	podClient := t.Client().Core().CoreV1().Pods(rayCluster.Namespace)
	ctx := t.Ctx()
	listOptions := common.RayClusterGroupPodsAssociationOptions(rayCluster, group).ToMetaV1ListOptions()

	podList, err := podClient.List(ctx, listOptions)
	t.Expect(err).NotTo(gomega.HaveOccurred())

	return podList.Items
}

0 comments on commit ec9c0a5

Please sign in to comment.