Remove special handling for CPU recipe tests
ebsmothers committed Dec 4, 2024
1 parent 7764a6d commit 1b992aa
Showing 5 changed files with 5 additions and 15 deletions.
recipes/full_finetune_distributed.py: 4 changes (1 addition, 3 deletions)

@@ -946,13 +946,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)

     config.log_config(recipe_name="FullFinetuneRecipeDistributed", cfg=cfg)

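For context on the one-line replacement: torch.distributed accepts a device-to-backend mapping string, so a single init_process_group call covers both CPU and CUDA collectives and the earlier per-device branching becomes unnecessary. A minimal standalone sketch of that initialization follows (an illustration assuming a torchrun-style launcher, not code taken from this commit):

import torch
import torch.distributed as dist

def init_distributed() -> None:
    # One mapping string: nccl serves CUDA tensors, gloo serves CPU tensors.
    dist.init_process_group("cuda:nccl,cpu:gloo")
    if torch.cuda.is_available():
        # Pin each rank to a GPU so NCCL collectives have a device to run on.
        torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

if __name__ == "__main__":
    # Run under a distributed launcher, e.g.:
    #   torchrun --nnodes 1 --nproc_per_node 2 init_sketch.py
    init_distributed()
    dist.destroy_process_group()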
recipes/knowledge_distillation_distributed.py: 4 changes (1 addition, 3 deletions)

@@ -971,13 +971,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)

     config.log_config(recipe_name="KDRecipeDistributed", cfg=cfg)

recipes/lora_dpo_distributed.py: 4 changes (1 addition, 3 deletions)

@@ -782,13 +782,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)

     config.log_config(recipe_name="LoRADPORecipeDistributed", cfg=cfg)

recipes/lora_finetune_distributed.py: 4 changes (1 addition, 3 deletions)

@@ -920,13 +920,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)

     config.log_config(recipe_name="LoRAFinetuneRecipeDistributed", cfg=cfg)

recipes/qat_distributed.py: 4 changes (1 addition, 3 deletions)

@@ -935,13 +935,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)

     config.log_config(recipe_name="QATRecipeDistributed", cfg=cfg)


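The same mapping string is why the fsdp_cpu_offload path no longer needs to pick a backend by hand: collectives dispatch on the device of the tensor they are handed, with CPU tensors routed through gloo and CUDA tensors through nccl inside one process group. A short sketch of that dispatch behavior (again an illustration under the assumption that the group was initialized as above, not repository code):

import torch
import torch.distributed as dist

def allreduce_on_both_devices() -> None:
    # Assumes dist.init_process_group("cuda:nccl,cpu:gloo") has already run.
    cpu_tensor = torch.ones(1)
    dist.all_reduce(cpu_tensor)  # CPU tensor: routed to the gloo backend
    if torch.cuda.is_available():
        cuda_tensor = torch.ones(1, device="cuda")
        dist.all_reduce(cuda_tensor)  # CUDA tensor: routed to the nccl backend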