diff --git a/.github/workflows/mcore-tag-bump-bot.yml b/.github/workflows/mcore-tag-bump-bot.yml
index 1b0712924101..01afb55d4361 100644
--- a/.github/workflows/mcore-tag-bump-bot.yml
+++ b/.github/workflows/mcore-tag-bump-bot.yml
@@ -13,8 +13,8 @@ jobs:
       source-ref: main
       build-arg: MCORE_TAG
       dockerfile: Dockerfile.ci
-      base-branch: main
+      base-branch: weekly-bump
       cicd-label: Run CICD
       pr-reviewers: 'pablo-garay'
     secrets:
-      PAT: ${{ secrets.PAT }}
\ No newline at end of file
+      PAT: ${{ secrets.PAT }}
diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
index abe966229ffe..5f315397584b 100644
--- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
+++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
@@ -44,6 +44,7 @@ def __init__(
         trust_remote_code=False,
         default_dtype=torch.bfloat16,
         load_in_4bit=False,
+        attn_implementation="sdpa",
     ):
         super().__init__()
         self.save_hyperparameters()
@@ -58,6 +59,7 @@ def __init__(
         self.trust_remote_code = trust_remote_code
         self.default_dtype = default_dtype
         self.load_in_4bit = load_in_4bit
+        self.attn_implementation = attn_implementation
 
     @property
     def tokenizer(self):
@@ -82,6 +84,7 @@ def configure_model(self):
                 torch_dtype='auto',
                 trust_remote_code=self.trust_remote_code,
                 load_in_4bit=self.load_in_4bit,
+                attn_implementation=self.attn_implementation,
             )
         else:
             from transformers import AutoConfig
@@ -89,7 +92,10 @@ def configure_model(self):
             config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code)
             dtype = getattr(config, 'torch_dtype', self.default_dtype)
             self.model = AutoModelForCausalLM.from_config(
-                config, torch_dtype=dtype, trust_remote_code=self.trust_remote_code
+                config,
+                torch_dtype=dtype,
+                trust_remote_code=self.trust_remote_code,
+                attn_implementation=self.attn_implementation,
             )
 
         # Apply FSDP2 and TP to the model
diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py
index 6c7e7e93ae8f..a6b7ad288765 100644
--- a/nemo/collections/llm/peft/lora.py
+++ b/nemo/collections/llm/peft/lora.py
@@ -52,7 +52,7 @@ class LinearAdapter(nn.Linear):
         orig_linear (nn.Module): the linear module to augment.
         dim (int): lora's dim in_features -> dim -> out_features.
         alpha (int): lora's scaling alpha.
-        dropout (float): dropout prob (default: 0.1).
+        dropout (float): dropout prob (default: 0.0).
         dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
         lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
         lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
@@ -64,7 +64,7 @@ def __init__(
         orig_linear,
         dim=8,
         alpha=32,
-        dropout=0.1,
+        dropout=0.0,
         dropout_position='post',
         lora_A_init_method='xavier',
         lora_dtype=None,
@@ -82,14 +82,22 @@ def __init__(
         if orig_linear.bias is not None:
             self.bias.data.copy_(orig_linear.bias.data)
         # initialize the adapte
-        LinearAdapter._init_adapter(self)
+        LinearAdapter._init_adapter(
+            self,
+            dim=dim,
+            alpha=alpha,
+            dropout=dropout,
+            dropout_position=dropout_position,
+            lora_A_init_method=lora_A_init_method,
+            lora_dtype=lora_dtype,
+        )
 
     @staticmethod
     def _init_adapter(
         obj,
         dim=8,
         alpha=32,
-        dropout=0.1,
+        dropout=0.0,
         dropout_position='post',
         lora_A_init_method='xavier',
         lora_dtype=None,
@@ -101,7 +109,7 @@ def _init_adapter(
             obj (LinearAdapter | nn.Module): input module to adapt.
             dim (int): lora's dim in_features -> dim -> out_features.
             alpha (int): lora's scaling alpha.
-            dropout (float): dropout prob (default: 0.1).
+            dropout (float): dropout prob (default: 0.0).
             dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post)
             lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform'])
             lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they
@@ -155,7 +163,7 @@ def patch_linear_module(
     orig_linear,
     dim=8,
     alpha=32,
-    dropout=0.1,
+    dropout=0.0,
     dropout_position='post',
     lora_A_init_method='xavier',
     lora_dtype=None,
@@ -175,7 +183,7 @@ def patch_linear_module(
        orig_linear (nn.Linear): the module we add adapter to.
        dim (int, optional): Lora dim. Defaults to 8.
        alpha (int, optional): Lora alpha scale. Defaults to 32.
-       dropout (float, optional): dropout prob. Defaults to 0.1.
+       dropout (float, optional): dropout prob. Defaults to 0.0.
        dropout_position (str, optional): location to apply dropout wrt lora.
            Defaults to 'post' (choices: 'pre', 'post').
        lora_A_init_method (str, optional): lora_a init method. Defaults to 'xavier'.
diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py
index 6d6ddda1fd80..0224d7e9836d 100644
--- a/nemo/lightning/resume.py
+++ b/nemo/lightning/resume.py
@@ -103,23 +103,7 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None):
         if isinstance(trainer, fl.Fabric):
             raise NotImplementedError("Fabric is not supported yet.")
 
-        trainer_ckpt_path = self.get_trainer_ckpt_path(model)
-        if trainer_ckpt_path:
-            trainer.ckpt_path = trainer_ckpt_path
-            trainer.checkpoint_callback.last_model_path = trainer_ckpt_path
-            # Load artifacts
-            if getattr(self.restore_config, 'load_artifacts', False):
-                if isinstance(trainer_ckpt_path, AdapterPath):
-                    # load tokenizer from the base model during peft resume, in case the first peft checkpoint
-                    # is deleted before the current peft checkpoint is saved
-                    context_path = trainer_ckpt_path.base_model_path / "context"
-                    if not context_path.exists():
-                        context_path = trainer_ckpt_path.base_model_path
-                else:
-                    context_path = self.get_context_path(model)
-                model = _try_restore_tokenizer(model, context_path)
-
-        elif self.restore_config:
+        if self.restore_config:
             new_path = self._extract_path(
                 model=model,
                 path=self.restore_config.path,
@@ -139,6 +123,21 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None):
 
             _try_restore_tokenizer(model, context_path)
 
+        elif (trainer_ckpt_path := self.get_trainer_ckpt_path(model)) is not None:
+            trainer.ckpt_path = trainer_ckpt_path
+            trainer.checkpoint_callback.last_model_path = trainer_ckpt_path
+            # Load artifacts
+            if getattr(self.restore_config, 'load_artifacts', False):
+                if isinstance(trainer_ckpt_path, AdapterPath):
+                    # load tokenizer from the base model during peft resume, in case the first peft checkpoint
+                    # is deleted before the current peft checkpoint is saved
+                    context_path = trainer_ckpt_path.base_model_path / "context"
+                    if not context_path.exists():
+                        context_path = trainer_ckpt_path.base_model_path
+                else:
+                    context_path = self.get_context_path(model)
+                model = _try_restore_tokenizer(model, context_path)
+
     def _extract_path(
         self, model: Optional[io.ConnectorMixin], path: str, adapter_path: Optional[str] = None
     ) -> BasePath:
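Note on the hf_auto_model_for_causal_lm.py change: the new `attn_implementation` argument (default `"sdpa"`) is threaded through both the `from_pretrained` and `from_config` loading paths, so callers can pick an attention backend without patching the loader. A minimal usage sketch, assuming `HFAutoModelForCausalLM` is exposed under `nemo.collections.llm` (the import path and checkpoint name here are illustrative):

    from nemo.collections import llm

    # transformers currently understands "eager", "sdpa", and "flash_attention_2";
    # "sdpa" is the new default, so only override it when another backend is needed.
    model = llm.HFAutoModelForCausalLM(
        model_name="meta-llama/Llama-3.2-1B",     # illustrative checkpoint
        attn_implementation="flash_attention_2",  # overrides the "sdpa" default
    )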
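Note on the lora.py change: besides lowering the default dropout from 0.1 to 0.0, the patch fixes a bug in `LinearAdapter.__init__`, which used to call `_init_adapter(self)` without forwarding `dim`, `alpha`, `dropout`, `dropout_position`, `lora_A_init_method`, or `lora_dtype`, so user-supplied values were silently replaced by the defaults. For readers unfamiliar with the adapter math the docstrings describe (`in_features -> dim -> out_features`, scaled by alpha), a self-contained sketch follows; the class is a hypothetical stand-in, not NeMo's implementation:

    import torch.nn as nn

    class TinyLoRALinear(nn.Linear):
        """Hypothetical minimal LoRA linear: y = Wx + (alpha/dim) * B(A(drop(x)))."""

        def __init__(self, orig: nn.Linear, dim=8, alpha=32, dropout=0.0):
            super().__init__(orig.in_features, orig.out_features, bias=orig.bias is not None)
            self.weight.data.copy_(orig.weight.data)
            if orig.bias is not None:
                self.bias.data.copy_(orig.bias.data)
            self.lora_a = nn.Linear(orig.in_features, dim, bias=False)   # in_features -> dim
            self.lora_b = nn.Linear(dim, orig.out_features, bias=False)  # dim -> out_features
            nn.init.zeros_(self.lora_b.weight)  # zero-init B so the adapter starts as a no-op
            self.scale = alpha / dim
            self.drop = nn.Dropout(dropout)

        def forward(self, x):
            # frozen base projection plus the scaled low-rank update
            # (dropout applied before lora_a, i.e. the 'pre' position)
            return super().forward(x) + self.scale * self.lora_b(self.lora_a(self.drop(x)))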
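Note on the resume.py change: the two branches of `setup` are reordered so that an explicit `restore_config` now takes precedence over resuming from an existing trainer checkpoint, and `get_trainer_ckpt_path(model)` is only evaluated, via a walrus assignment, when no `restore_config` is given. A stripped-down sketch of the resulting control flow, with hypothetical names standing in for the real methods:

    def resolve_resume(restore_config, get_trainer_ckpt_path):
        if restore_config:
            # an explicit restore request wins over auto-resume
            return ("restore", restore_config.path)
        elif (ckpt_path := get_trainer_ckpt_path()) is not None:
            # fall back to the last trainer checkpoint; the walrus keeps the
            # path lookup out of the restore_config case entirely
            return ("resume", ckpt_path)
        return ("fresh", None)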