Skip to content

Commit

Permalink
Update V6e TPU Ray Samples (#2448)
Browse files Browse the repository at this point in the history
* Fix Ray CR manifests

Signed-off-by: Ryan O'Leary <[email protected]>

* update rayjob resources

Signed-off-by: Ryan O'Leary <[email protected]>

* fix test

Signed-off-by: Ryan O'Leary <[email protected]>

* fix ray job resources

Signed-off-by: Ryan O'Leary <[email protected]>

---------

Signed-off-by: Ryan O'Leary <[email protected]>
  • Loading branch information
ryanaoleary authored Oct 16, 2024
1 parent 047699f commit 714aea6
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 35 deletions.
72 changes: 72 additions & 0 deletions ray-operator/config/samples/ray-cluster.tpu-v6e-16-multihost.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Sample RayCluster with a CPU-only head group and one TPU worker group.
# The worker group is scheduled onto a tpu-v6e-slice node pool with a 4x4
# topology: numOfHosts is 4 and each worker container requests 4 google.com/tpu.
apiVersion: ray.io/v1
kind: RayCluster
metadata:
  name: tpu-ray-cluster
spec:
  headGroupSpec:
    rayStartParams: {}
    template:
      spec:
        containers:
          - name: ray-head
            image: rayproject/ray:2.37.0-py310
            imagePullPolicy: IfNotPresent
            # Requests equal limits for both CPU and memory.
            resources:
              limits:
                cpu: "8"
                memory: 40G
              requests:
                cpu: "8"
                memory: 40G
            ports:
              - containerPort: 6379
                name: gcs
              - containerPort: 8265
                name: dashboard
              - containerPort: 10001
                name: client
              - containerPort: 8000
                name: serve
              - containerPort: 8888
                name: grpc
  workerGroupSpecs:
    - groupName: tpu-group
      replicas: 1
      minReplicas: 0
      maxReplicas: 1
      numOfHosts: 4
      rayStartParams: {}
      template:
        spec:
          # Worker pods run as root (UID 0).
          securityContext:
            runAsUser: 0
          containers:
            - name: ray-worker
              image: rayproject/ray:2.37.0-py310
              imagePullPolicy: IfNotPresent
              # Requests equal limits, including the extended TPU resource.
              resources:
                limits:
                  cpu: "24"
                  google.com/tpu: "4"
                  memory: 200G
                requests:
                  cpu: "24"
                  google.com/tpu: "4"
                  memory: 200G
              env:
                # NODE_IP is filled from the pod's status.hostIP and must be
                # declared before VBAR_CONTROL_SERVICE_URL, which expands it.
                - name: NODE_IP
                  valueFrom:
                    fieldRef:
                      fieldPath: status.hostIP
                - name: VBAR_CONTROL_SERVICE_URL
                  value: $(NODE_IP):8353
                - name: JAX_PLATFORMS
                  value: tpu,cpu
                # Quoted so the env value stays a string, not a YAML boolean.
                - name: ENABLE_PJRT_COMPATIBILITY
                  value: "true"
              ports:
                - containerPort: 8081
                  name: mxla
          nodeSelector:
            cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
            cloud.google.com/gke-tpu-topology: 4x4
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: ray.io/v1
kind: RayCluster
metadata:
name: tpu-cluster-multi-host
name: tpu-ray-cluster
spec:
headGroupSpec:
rayStartParams: {}
Expand All @@ -14,11 +14,9 @@ spec:
resources:
limits:
cpu: "8"
ephemeral-storage: 5Gi
memory: 40G
requests:
cpu: "8"
ephemeral-storage: 5Gi
memory: 40G
ports:
- containerPort: 6379
Expand All @@ -29,8 +27,6 @@ spec:
name: client
- containerPort: 8000
name: serve
- containerPort: 8081
name: mxla
- containerPort: 8888
name: grpc
workerGroupSpecs:
Expand All @@ -48,20 +44,29 @@ spec:
- name: ray-worker
image: rayproject/ray:2.37.0-py310
imagePullPolicy: IfNotPresent
env:
- name: JAX_PLATFORMS
value: "tpu"
resources:
limits:
cpu: "100"
ephemeral-storage: 20Gi
cpu: "24"
google.com/tpu: "4"
memory: 200G
requests:
cpu: "100"
ephemeral-storage: 20Gi
cpu: "24"
google.com/tpu: "4"
memory: 200G
env:
- name: NODE_IP
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: VBAR_CONTROL_SERVICE_URL
value: $(NODE_IP):8353
- name: JAX_PLATFORMS
value: tpu,cpu
- name: ENABLE_PJRT_COMPATIBILITY
value: "true"
ports:
- containerPort: 8081
name: mxla
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
cloud.google.com/gke-tpu-topology: 16x16
13 changes: 3 additions & 10 deletions ray-operator/config/samples/ray-cluster.tpu-v6e-singlehost.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: ray.io/v1
kind: RayCluster
metadata:
name: tpu-cluster-single-host
name: tpu-ray-cluster
spec:
headGroupSpec:
rayStartParams: {}
Expand All @@ -14,11 +14,9 @@ spec:
resources:
limits:
cpu: "8"
ephemeral-storage: 5Gi
memory: 40G
requests:
cpu: "8"
ephemeral-storage: 5Gi
memory: 40G
ports:
- containerPort: 6379
Expand Down Expand Up @@ -48,18 +46,13 @@ spec:
- name: ray-worker
image: rayproject/ray:2.37.0-py310
imagePullPolicy: IfNotPresent
env:
- name: JAX_PLATFORMS
value: "tpu"
resources:
limits:
cpu: "100"
ephemeral-storage: 20Gi
cpu: "24"
google.com/tpu: "4"
memory: 200G
requests:
cpu: "100"
ephemeral-storage: 20Gi
cpu: "24"
google.com/tpu: "4"
memory: 200G
nodeSelector:
Expand Down
76 changes: 76 additions & 0 deletions ray-operator/config/samples/ray-job.tpu-v6e-16-multihost.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Sample RayJob that runs tpu_list_devices.py on a cluster defined inline in
# rayClusterSpec. The single worker group is scheduled onto a tpu-v6e-slice
# node pool with a 4x4 topology: numOfHosts is 4 and each worker container
# requests 4 google.com/tpu.
apiVersion: ray.io/v1
kind: RayJob
metadata:
  name: v6e-16-job
spec:
  entrypoint: python ray-operator/config/samples/tpu/tpu_list_devices.py
  # Literal block scalar: the body below is passed through as embedded YAML
  # text, so its own indentation is part of the value.
  runtimeEnvYAML: |
    working_dir: "https://github.com/ray-project/kuberay/archive/master.zip"
    pip:
      - jax[tpu]==0.4.33
      - -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
  rayClusterSpec:
    # Quoted so the version is kept as a string.
    rayVersion: '2.37.0'
    headGroupSpec:
      rayStartParams: {}
      template:
        spec:
          containers:
            - name: ray-head
              image: rayproject/ray:2.37.0-py310
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8888
                  name: grpc
              # Requests equal limits for both CPU and memory.
              resources:
                limits:
                  cpu: "8"
                  memory: 40G
                requests:
                  cpu: "8"
                  memory: 40G
    workerGroupSpecs:
      - replicas: 1
        minReplicas: 1
        maxReplicas: 1
        numOfHosts: 4
        groupName: tpu-group
        rayStartParams: {}
        template:
          spec:
            # Worker pods run as root (UID 0).
            securityContext:
              runAsUser: 0
            containers:
              - name: ray-worker
                image: rayproject/ray:2.37.0-py310
                # Requests equal limits, including the extended TPU resource.
                resources:
                  limits:
                    cpu: "24"
                    google.com/tpu: "4"
                    memory: 200G
                  requests:
                    cpu: "24"
                    google.com/tpu: "4"
                    memory: 200G
                env:
                  # NODE_IP is filled from the pod's status.hostIP and must be
                  # declared before VBAR_CONTROL_SERVICE_URL, which expands it.
                  - name: NODE_IP
                    valueFrom:
                      fieldRef:
                        fieldPath: status.hostIP
                  - name: VBAR_CONTROL_SERVICE_URL
                    value: $(NODE_IP):8353
                  - name: JAX_PLATFORMS
                    value: tpu,cpu
                  # Quoted so the env value stays a string, not a YAML boolean.
                  - name: ENABLE_PJRT_COMPATIBILITY
                    value: "true"
                ports:
                  - containerPort: 8081
                    name: mxla
            nodeSelector:
              cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
              cloud.google.com/gke-tpu-topology: 4x4
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@ spec:
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8888
name: grpc
resources:
limits:
cpu: "8"
ephemeral-storage: 5Gi
memory: 40G
requests:
cpu: "8"
ephemeral-storage: 5Gi
memory: 40G
workerGroupSpecs:
- replicas: 1
Expand All @@ -50,15 +50,27 @@ spec:
image: rayproject/ray:2.37.0-py310
resources:
limits:
cpu: "100"
ephemeral-storage: 20Gi
cpu: "24"
google.com/tpu: "4"
memory: 200G
requests:
cpu: "100"
ephemeral-storage: 20Gi
cpu: "24"
google.com/tpu: "4"
memory: 200G
env:
- name: NODE_IP
valueFrom:
fieldRef:
fieldPath: status.hostIP
- name: VBAR_CONTROL_SERVICE_URL
value: $(NODE_IP):8353
- name: JAX_PLATFORMS
value: tpu,cpu
- name: ENABLE_PJRT_COMPATIBILITY
value: "true"
ports:
- containerPort: 8081
name: mxla
nodeSelector:
cloud.google.com/gke-tpu-accelerator: tpu-v6e-slice
cloud.google.com/gke-tpu-topology: 16x16
8 changes: 2 additions & 6 deletions ray-operator/config/samples/ray-job.tpu-v6e-singlehost.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,9 @@ spec:
resources:
limits:
cpu: "8"
ephemeral-storage: 5Gi
memory: 40G
requests:
cpu: "8"
ephemeral-storage: 5Gi
memory: 40G
workerGroupSpecs:
- replicas: 1
Expand All @@ -50,13 +48,11 @@ spec:
image: rayproject/ray:2.37.0-py310
resources:
limits:
cpu: "100"
ephemeral-storage: 20Gi
cpu: "24"
google.com/tpu: "4"
memory: 200G
requests:
cpu: "100"
ephemeral-storage: 20Gi
cpu: "24"
google.com/tpu: "4"
memory: 200G
nodeSelector:
Expand Down
3 changes: 2 additions & 1 deletion tests/test_sample_raycluster_yamls.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ def parse_args():
'ray-cluster.tpu-v4-singlehost.yaml': 'Skip this test because it requires TPU resources.',
'ray-cluster.tpu-v4-multihost.yaml' : 'Skip this test because it requires TPU resources',
'ray-cluster.tpu-v6e-singlehost.yaml' : 'Skip this test because it requires TPU resources',
'ray-cluster.tpu-v6e-multihost.yaml' : 'Skip this test because it requires TPU resources',
'ray-cluster.tpu-v6e-16-multihost.yaml' : 'Skip this test because it requires TPU resources',
'ray-cluster.tpu-v6e-256-multihost.yaml' : 'Skip this test because it requires TPU resources',
'ray-cluster.gke-bucket.yaml': 'Skip this test because it requires GKE and k8s service accounts.',
'ray-service.high-availability-locust.yaml': 'Skip this test because the RayCluster here is only used for testing RayService.',
}
Expand Down

0 comments on commit 714aea6

Please sign in to comment.