Remove special handling for CPU recipe tests
ebsmothers committed Dec 4, 2024
1 parent 7764a6d commit 1b992aa
Showing 5 changed files with 5 additions and 15 deletions.
recipes/full_finetune_distributed.py: 4 changes (1 addition, 3 deletions)

@@ -946,13 +946,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)

     config.log_config(recipe_name="FullFinetuneRecipeDistributed", cfg=cfg)

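For context on the one-line replacement: torch.distributed accepts a device-to-backend mapping string, so a single init_process_group call covers both CPU and CUDA collectives and the earlier per-device branching becomes unnecessary. A minimal standalone sketch of that initialization follows (an illustration assuming a torchrun-style launcher, not code taken from this commit):

import torch
import torch.distributed as dist

def init_distributed() -> None:
    # One mapping string: nccl serves CUDA tensors, gloo serves CPU tensors.
    dist.init_process_group("cuda:nccl,cpu:gloo")
    if torch.cuda.is_available():
        # Pin each rank to a GPU so NCCL collectives have a device to run on.
        torch.cuda.set_device(dist.get_rank() % torch.cuda.device_count())

if __name__ == "__main__":
    # Run under a distributed launcher, e.g.:
    #   torchrun --nnodes 1 --nproc_per_node 2 init_sketch.py
    init_distributed()
    dist.destroy_process_group()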
recipes/knowledge_distillation_distributed.py: 4 changes (1 addition, 3 deletions)

@@ -971,13 +971,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)

     config.log_config(recipe_name="KDRecipeDistributed", cfg=cfg)

recipes/lora_dpo_distributed.py: 4 changes (1 addition, 3 deletions)

@@ -782,13 +782,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)

     config.log_config(recipe_name="LoRADPORecipeDistributed", cfg=cfg)

recipes/lora_finetune_distributed.py: 4 changes (1 addition, 3 deletions)

@@ -920,13 +920,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)

     config.log_config(recipe_name="LoRAFinetuneRecipeDistributed", cfg=cfg)

recipes/qat_distributed.py: 4 changes (1 addition, 3 deletions)

@@ -935,13 +935,11 @@ def recipe_main(cfg: DictConfig) -> None:
             "Distributed finetune recipe should be run via a distributed launcher."
             "If using tune CLI, please specify --nnodes 1 and --nproc_per_node [num_gpus]"
         )
-    process_group = "gloo" if cfg.device == "cpu" else "nccl"
+    init_process_group("cuda:nccl,cpu:gloo")
     if cfg.get("fsdp_cpu_offload", False):
         # Utilize all available CPU cores for intra-op parallelism. This provides ~2x
         # speed up when benchmarking fused AdamW on CPU
         training.set_torch_num_threads()
-        process_group = "cuda:nccl,cpu:gloo"
-    init_process_group(backend=process_group)

     config.log_config(recipe_name="QATRecipeDistributed", cfg=cfg)


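The same mapping string is why the fsdp_cpu_offload path no longer needs to pick a backend by hand: collectives dispatch on the device of the tensor they are handed, with CPU tensors routed through gloo and CUDA tensors through nccl inside one process group. A short sketch of that dispatch behavior (again an illustration under the assumption that the group was initialized as above, not repository code):

import torch
import torch.distributed as dist

def allreduce_on_both_devices() -> None:
    # Assumes dist.init_process_group("cuda:nccl,cpu:gloo") has already run.
    cpu_tensor = torch.ones(1)
    dist.all_reduce(cpu_tensor)  # CPU tensor: routed to the gloo backend
    if torch.cuda.is_available():
        cuda_tensor = torch.ones(1, device="cuda")
        dist.all_reduce(cuda_tensor)  # CUDA tensor: routed to the nccl backend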