-
Notifications
You must be signed in to change notification settings - Fork 435
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RayJob] implement deletion policy API #2643
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ import ( | |
|
||
"github.com/ray-project/kuberay/ray-operator/controllers/ray/common" | ||
"github.com/ray-project/kuberay/ray-operator/controllers/ray/utils" | ||
"github.com/ray-project/kuberay/ray-operator/pkg/features" | ||
|
||
"k8s.io/apimachinery/pkg/runtime" | ||
ctrl "sigs.k8s.io/controller-runtime" | ||
|
@@ -346,11 +347,53 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) | |
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, nil | ||
case rayv1.JobDeploymentStatusComplete, rayv1.JobDeploymentStatusFailed: | ||
// If this RayJob uses an existing RayCluster (i.e., ClusterSelector is set), we should not delete the RayCluster. | ||
logger.Info(string(rayJobInstance.Status.JobDeploymentStatus), "RayJob", rayJobInstance.Name, "ShutdownAfterJobFinishes", rayJobInstance.Spec.ShutdownAfterJobFinishes, "ClusterSelector", rayJobInstance.Spec.ClusterSelector) | ||
if rayJobInstance.Spec.ShutdownAfterJobFinishes && len(rayJobInstance.Spec.ClusterSelector) == 0 { | ||
ttlSeconds := rayJobInstance.Spec.TTLSecondsAfterFinished | ||
nowTime := time.Now() | ||
shutdownTime := rayJobInstance.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second) | ||
ttlSeconds := rayJobInstance.Spec.TTLSecondsAfterFinished | ||
nowTime := time.Now() | ||
shutdownTime := rayJobInstance.Status.EndTime.Add(time.Duration(ttlSeconds) * time.Second) | ||
logger.Info(string(rayJobInstance.Status.JobDeploymentStatus), "RayJob", rayJobInstance.Name, | ||
"ShutdownAfterJobFinishes", rayJobInstance.Spec.ShutdownAfterJobFinishes, | ||
"ClusterSelector", rayJobInstance.Spec.ClusterSelector, | ||
"ttlSecondsAfterFinished", ttlSeconds, | ||
"Status.endTime", rayJobInstance.Status.EndTime, | ||
"Now", nowTime, | ||
"ShutdownTime", shutdownTime) | ||
|
||
if features.Enabled(features.RayJobDeletionPolicy) && | ||
andrewsykim marked this conversation as resolved.
Show resolved
Hide resolved
|
||
rayJobInstance.Spec.DeletionPolicy != nil && | ||
*rayJobInstance.Spec.DeletionPolicy != rayv1.DeleteNoneDeletionPolicy && | ||
len(rayJobInstance.Spec.ClusterSelector) == 0 { | ||
logger.Info( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Move logger.Info(
"RayJob deployment status",
"jobDeploymentStatus", rayJobInstance.Status.JobDeploymentStatus,
"deletionPolicy", rayJobInstance.Spec.DeletionPolicy,
"ttlSecondsAfterFinished", ttlSeconds,
"Status.endTime", rayJobInstance.Status.EndTime,
"Now", nowTime,
"ShutdownTime", shutdownTime)
if shutdownTime.After(nowTime) {
delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds())
logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta)
return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil
} to the above of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The conditions are split here because the keys passed into |
||
"RayJob deployment status", | ||
"jobDeploymentStatus", rayJobInstance.Status.JobDeploymentStatus, | ||
"deletionPolicy", rayJobInstance.Spec.DeletionPolicy, | ||
"ttlSecondsAfterFinished", ttlSeconds, | ||
"Status.endTime", rayJobInstance.Status.EndTime, | ||
"Now", nowTime, | ||
"ShutdownTime", shutdownTime) | ||
if shutdownTime.After(nowTime) { | ||
delta := int32(time.Until(shutdownTime.Add(2 * time.Second)).Seconds()) | ||
logger.Info("shutdownTime not reached, requeue this RayJob for n seconds", "seconds", delta) | ||
return ctrl.Result{RequeueAfter: time.Duration(delta) * time.Second}, nil | ||
} | ||
|
||
switch *rayJobInstance.Spec.DeletionPolicy { | ||
case rayv1.DeleteClusterDeletionPolicy: | ||
logger.Info("Deleting RayCluster", "RayCluster", rayJobInstance.Status.RayClusterName) | ||
_, err = r.deleteClusterResources(ctx, rayJobInstance) | ||
case rayv1.DeleteWorkersDeletionPolicy: | ||
logger.Info("Suspending all worker groups", "RayCluster", rayJobInstance.Status.RayClusterName) | ||
err = r.suspendWorkerGroups(ctx, rayJobInstance) | ||
case rayv1.DeleteSelfDeletionPolicy: | ||
logger.Info("Deleting RayJob") | ||
err = r.Client.Delete(ctx, rayJobInstance) | ||
default: | ||
} | ||
if err != nil { | ||
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err | ||
} | ||
} | ||
|
||
if (!features.Enabled(features.RayJobDeletionPolicy) || rayJobInstance.Spec.DeletionPolicy == nil) && rayJobInstance.Spec.ShutdownAfterJobFinishes && len(rayJobInstance.Spec.ClusterSelector) == 0 { | ||
logger.Info( | ||
"RayJob deployment status", | ||
"jobDeploymentStatus", rayJobInstance.Status.JobDeploymentStatus, | ||
|
@@ -377,6 +420,7 @@ func (r *RayJobReconciler) Reconcile(ctx context.Context, request ctrl.Request) | |
return ctrl.Result{RequeueAfter: RayJobDefaultRequeueDuration}, err | ||
} | ||
} | ||
|
||
// If the RayJob is completed, we should not requeue it. | ||
return ctrl.Result{}, nil | ||
default: | ||
|
@@ -617,6 +661,30 @@ func (r *RayJobReconciler) deleteClusterResources(ctx context.Context, rayJobIns | |
return isClusterDeleted, nil | ||
} | ||
|
||
func (r *RayJobReconciler) suspendWorkerGroups(ctx context.Context, rayJobInstance *rayv1.RayJob) error { | ||
logger := ctrl.LoggerFrom(ctx) | ||
clusterIdentifier := common.RayJobRayClusterNamespacedName(rayJobInstance) | ||
|
||
cluster := rayv1.RayCluster{} | ||
if err := r.Get(ctx, clusterIdentifier, &cluster); err != nil { | ||
return err | ||
} | ||
|
||
for i := range cluster.Spec.WorkerGroupSpecs { | ||
cluster.Spec.WorkerGroupSpecs[i].Suspend = ptr.To[bool](true) | ||
} | ||
|
||
if err := r.Update(ctx, &cluster); err != nil { | ||
r.Recorder.Eventf(rayJobInstance, corev1.EventTypeWarning, string(utils.FailedToUpdateRayCluster), "Failed to update cluster %s/%s: %v", cluster.Namespace, cluster.Name, err) | ||
return err | ||
} | ||
|
||
logger.Info("All worker groups for RayCluster have had `suspend` set to true", "RayCluster", clusterIdentifier) | ||
r.Recorder.Eventf(rayJobInstance, corev1.EventTypeNormal, string(utils.UpdatedRayCluster), "Set the `suspend` field to true for all worker groups in cluster %s/%s", cluster.Namespace, cluster.Name) | ||
|
||
return nil | ||
} | ||
|
||
// SetupWithManager sets up the controller with the Manager. | ||
func (r *RayJobReconciler) SetupWithManager(mgr ctrl.Manager, reconcileConcurrency int) error { | ||
return ctrl.NewControllerManagedBy(mgr). | ||
|
@@ -845,6 +913,9 @@ func validateRayJobSpec(rayJob *rayv1.RayJob) error { | |
if rayJob.Spec.BackoffLimit != nil && *rayJob.Spec.BackoffLimit < 0 { | ||
return fmt.Errorf("backoffLimit must be a positive integer") | ||
} | ||
if rayJob.Spec.ShutdownAfterJobFinishes && rayJob.Spec.DeletionPolicy != nil && *rayJob.Spec.DeletionPolicy == rayv1.DeleteNoneDeletionPolicy { | ||
return fmt.Errorf("shutdownAfterJobFinshes is set to 'true' while deletion policy is 'DeleteNone'") | ||
} | ||
return nil | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
maybe move the check before
if features.Enabled(features.RayJobDeletionPolicy) ...
to simplify the code?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could add a requeue for cases where deletion is not required