Skip to content

Commit

Permalink
Add enable_cudf_spill to LocalCudaCluster (#268)
Browse files Browse the repository at this point in the history
  • Loading branch information
praateekmahajan authored Sep 30, 2024
2 parents 97e8f15 + 7c476ac commit 9af8da1
Showing 1 changed file with 2 additions and 17 deletions.
19 changes: 2 additions & 17 deletions nemo_curator/utils/distributed_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
import psutil
from dask.distributed import Client, LocalCluster, get_worker, performance_report

from nemo_curator.utils.gpu_utils import GPU_INSTALL_STRING, is_cudf_type
from nemo_curator.utils.gpu_utils import is_cudf_type
from nemo_curator.utils.import_utils import gpu_only_import, gpu_only_import_from

cudf = gpu_only_import("cudf")
Expand Down Expand Up @@ -70,14 +70,11 @@ def start_dask_gpu_local_cluster(
rmm_pool_size=rmm_pool_size,
protocol=protocol,
rmm_async=True,
enable_cudf_spill=enable_spilling,
**extra_kwargs,
)
client = Client(cluster)

if enable_spilling:
_enable_spilling()
client.run(_enable_spilling)

if set_torch_to_use_rmm:
_set_torch_to_use_rmm()
client.run(_set_torch_to_use_rmm)
Expand Down Expand Up @@ -193,18 +190,6 @@ def _set_torch_to_use_rmm():
torch.cuda.memory.change_current_allocator(rmm_torch_allocator)


def _enable_spilling():
    """
    Enables automatic spilling (and "unspilling") of buffers from device
    memory to host memory, allowing out-of-memory computation, i.e.,
    computing on objects that occupy more memory than is available on the GPU.

    Note: this sets the cuDF "spill" option at runtime via ``cudf.set_option``;
    it does not read or write an environment variable. It must run in each
    worker process (e.g. via ``client.run``) to take effect cluster-wide.
    """
    # Local import so this helper can be shipped to Dask workers and only
    # requires cudf where it actually executes.
    import cudf

    cudf.set_option("spill", True)


def read_single_partition(
files,
backend="cudf",
Expand Down

0 comments on commit 9af8da1

Please sign in to comment.