Revert "MCore Partial DistOpt Feature (#10693)"
This reverts commit 84d5fad.
pablo-garay authored Jan 15, 2025 · 1 parent 1626ddd · commit 08c1700
Showing 5 changed files with 0 additions and 22 deletions.
@@ -195,7 +195,6 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True):
virtual_pipeline_model_parallel_size=vp_size,
pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0),
use_tp_pp_dp_mapping=cfg.get('use_tp_pp_dp_mapping', False),
- num_distributed_optimizer_instances=self.cfg.optim.get('num_distributed_optimizer_instances', 1),
context_parallel_size=cfg.get('context_parallel_size', 1),
micro_batch_size=cfg.get('micro_batch_size'),
global_batch_size=cfg.get('global_batch_size'),
@@ -593,7 +593,6 @@ def setup_mcore_distributed_parallel(self):
ddp_config = DistributedDataParallelConfig(
grad_reduce_in_fp32=(self.cfg.optim.get('grad_sync_dtype', 'fp32') == 'fp32'),
overlap_grad_reduce=self.cfg.optim.get('overlap_grad_sync', False),
- num_distributed_optimizer_instances=self.cfg.optim.get('num_distributed_optimizer_instances', 1),
use_distributed_optimizer=True,
check_for_nan_in_grad=self.cfg.optim.get('check_for_nan_in_grad', False),
# mcore bucket_size is based on num of parameters, therefore not
2 changes: 0 additions & 2 deletions nemo/collections/nlp/modules/common/megatron/megatron_init.py
@@ -106,7 +106,6 @@ def initialize_model_parallel_for_nemo(
apex_transformer_log_level=30,
use_tp_pp_dp_mapping=False,
use_te_rng_tracker=False,
- num_distributed_optimizer_instances=1,
):

if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED:
@@ -118,7 +117,6 @@
app_state.world_size = world_size
app_state.local_rank = local_rank
app_state.use_tp_pp_dp_mapping = use_tp_pp_dp_mapping
- app_state.num_distributed_optimizer_instances = num_distributed_optimizer_instances
app_state.expert_model_parallel_size = expert_model_parallel_size
app_state.tensor_model_parallel_size = tensor_model_parallel_size
app_state.pipeline_model_parallel_size = pipeline_model_parallel_size
1 change: 0 additions & 1 deletion nemo/collections/nlp/parts/nlp_overrides.py
@@ -163,7 +163,6 @@ def init_model_parallel(
use_sharp=sharp,
expert_model_parallel_size=app_state.expert_model_parallel_size,
order='tp-pp-dp' if app_state.use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp',
- num_distributed_optimizer_instances=app_state.num_distributed_optimizer_instances,
distributed_timeout_minutes=distributed_timeout_minutes,
)

17 changes: 0 additions & 17 deletions nemo/utils/app_state.py
@@ -58,7 +58,6 @@ def __init__(self):
self._data_parallel_size = None
self._data_parallel_group = None
self._use_tp_pp_dp_mapping = False
- self._num_distributed_optimizer_instances = 1
self._megatron_checkpoint_version = None
self._use_fp8 = False
self._context_parallel_size = None
@@ -243,22 +242,6 @@ def use_tp_pp_dp_mapping(self):
def use_tp_pp_dp_mapping(self, use_new_mapping):
self._use_tp_pp_dp_mapping = use_new_mapping

- @property
- def num_distributed_optimizer_instances(self):
- """Property returns the factor by which the Partial DistOpt is sharded.
- Returns:
- The partial DistOpt shard factor
- """
- return self._num_distributed_optimizer_instances
-
- @num_distributed_optimizer_instances.setter
- def num_distributed_optimizer_instances(self, shard_factor):
- """Property sets the factor by which the Partial DistOpt is sharded.
- Args:
- shard_factor (int): The partial DistOpt shard factor.
- """
- self._num_distributed_optimizer_instances = shard_factor
-
@property
def virtual_pipeline_model_parallel_size(self):
"""Property returns the number of GPUs in each model parallel group.
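Taken together, the hunks above delete a single optimizer-sharding knob, num_distributed_optimizer_instances, that was threaded from the model's optimizer config through initialize_model_parallel_for_nemo, AppState, init_model_parallel, and DistributedDataParallelConfig. Below is a minimal, self-contained sketch of that plumbing for reference only; the AppState stand-in and the literal config values are illustrative assumptions, not the real NeMo classes or recommended settings beyond what the diff itself shows.

# Illustrative sketch of the reverted plumbing (stand-in classes, not NeMo itself).
class AppState:
    """Stand-in for the reverted attribute on nemo.utils.app_state.AppState."""

    def __init__(self):
        # Default of 1 corresponds to a single (non-partial) distributed optimizer instance.
        self._num_distributed_optimizer_instances = 1

    @property
    def num_distributed_optimizer_instances(self):
        """Factor by which the partial distributed optimizer is sharded."""
        return self._num_distributed_optimizer_instances

    @num_distributed_optimizer_instances.setter
    def num_distributed_optimizer_instances(self, shard_factor):
        self._num_distributed_optimizer_instances = shard_factor


# 1) The model read the knob from cfg.optim with a default of 1 ...
optim_cfg = {"num_distributed_optimizer_instances": 2}  # illustrative value only
shard_factor = optim_cfg.get("num_distributed_optimizer_instances", 1)

# 2) ... initialize_model_parallel_for_nemo stored it on the global AppState ...
app_state = AppState()
app_state.num_distributed_optimizer_instances = shard_factor

# 3) ... and init_model_parallel / setup_mcore_distributed_parallel forwarded it to
#    Megatron-Core (represented here by a plain dict rather than the real
#    DistributedDataParallelConfig and initialize_model_parallel calls).
ddp_config_kwargs = {
    "use_distributed_optimizer": True,
    "num_distributed_optimizer_instances": app_state.num_distributed_optimizer_instances,
}
print(ddp_config_kwargs)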
