From 74025a6b8312bbe7c8c8090d9f06c3c6853d7ee7 Mon Sep 17 00:00:00 2001 From: "Peter St. John" Date: Tue, 14 Jan 2025 08:55:10 -0700 Subject: [PATCH 01/27] remove tensorstore pin in requirements*.txt (#11777) Signed-off-by: Peter St. John --- requirements/requirements_infer.txt | 2 +- requirements/requirements_nlp.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/requirements_infer.txt b/requirements/requirements_infer.txt index 5f428d91fc56..47daf571d26f 100644 --- a/requirements/requirements_infer.txt +++ b/requirements/requirements_infer.txt @@ -3,6 +3,6 @@ fastapi nvidia-pytriton pydantic-settings -tensorstore==0.1.45 +tensorstore uvicorn zarr diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 6a86dacbfefb..d35b649a46ba 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -21,6 +21,6 @@ rapidfuzz rouge_score sacrebleu # manually install sacrebleu[ja] for Japanese support; MeCab is unsupported in Python 3.11+ sentence_transformers -tensorstore<0.1.46 +tensorstore tiktoken==0.7.0 zarr From cdaf7b141c39b416dc19ce39d89624966f32cf0c Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Tue, 14 Jan 2025 22:50:41 +0530 Subject: [PATCH 02/27] Do not load context for model transform in llm inference (#11751) --- nemo/collections/llm/inference/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index dd53d97b21ad..0a87480f31d9 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -14,7 +14,7 @@ import inspect import json from pathlib import Path -from typing import Optional, Union +from typing import Optional import lightning.pytorch as pl import torch @@ -161,7 +161,7 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. trainer.strategy.trainer = trainer trainer.strategy.selective_restore() - peft: Union[io.TrainerContext, PEFT] = io.load_context(ckpt_to_context_subdir(path), "model.model_transform") + peft: Optional[PEFT] = model.model_transform if isinstance(peft, PEFT): model = peft(model) adapter_sharded_state_dict = {k: v for k, v in model.sharded_state_dict().items() if ".adapter." 
in k} From c1e46eab419223f0d76fdac7767c3025212be778 Mon Sep 17 00:00:00 2001 From: chenrui17 <33319780+chenrui17@users.noreply.github.com> Date: Wed, 15 Jan 2025 01:23:30 +0800 Subject: [PATCH 03/27] add chat sft dataset to support agent tool calling (#11759) * add chat sft dataset to support agent tool calling * Apply isort and black reformatting Signed-off-by: chenrui17 * update docstring * fix typo Signed-off-by: Chen Cui --------- Signed-off-by: chenrui17 Signed-off-by: Chen Cui Co-authored-by: Charlie Chen Co-authored-by: chenrui17 Co-authored-by: Chen Cui --- .../megatron/gpt_sft_chat_dataset.py | 16 ++-- .../collections/nlp/test_chat_sft_dataset.py | 73 +++++++++++++++++++ 2 files changed, 83 insertions(+), 6 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py index 6d71a9d8e014..53d94452a480 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py @@ -110,7 +110,7 @@ def _mask_targets( header_len (int): the system prompt length s_ids (List[Tensor]): array of tokenized ids of each turns tokenizer (TokenizerSpec): tokenizer object - mask_role (str): the speaker id to be masked from loss computation + mask_role (str): the speaker id to be masked from loss computation. If there is more than 1 masked role, `mask_role` is a comma-separated string of the roles gtype (str): either 'TEXT_TO_VALUE' or 'VALUE_TO_TEXT' name_end_token_ids (int): end of name token ids special_tokens (dict): special tokens used for the chat prompt. It has the keys: system_turn_start, turn_start, label_start, end_of_turn @@ -164,13 +164,13 @@ def _mask_targets( if i == 0 and (gtype == 'VALUE_TO_TEXT' or gtype is None): # mask the first turn completely to provide at least one turn as context for the rest target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX - elif speaker == mask_role and i == 1 and gtype == 'TEXT_TO_VALUE': + elif speaker in mask_role and i == 1 and gtype == 'TEXT_TO_VALUE': # leave the first turn start tag unmasked, servers severs as the end of turn signal target[cur_idx + num_turn_start_tokens : cur_idx + tokenized_len] = IGNORE_INDEX - elif speaker == mask_role and (i > 1): + elif speaker in mask_role and (i > 1): # leave the first turn start tag unmasked, which severs as the end of turn signal target[cur_idx + num_turn_start_tokens : cur_idx + tokenized_len] = IGNORE_INDEX - elif speaker == mask_role and (i <= 1): + elif speaker in mask_role and (i <= 1): # mask out everything in the second turn target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX else: @@ -238,7 +238,7 @@ def _add_speaker_and_signal(header, source, mask_role, gtype, special_tokens): ) conversation += sentence["value"] # if the last turn is not masked, add next token start token to the end, which will be included for loss calculation - if sentence_from != mask_role and i == len(source) - 1: + if sentence_from not in mask_role and i == len(source) - 1: conversation += TURN_TOKEN return conversation @@ -276,7 +276,11 @@ def preprocess( ids.append(torch.tensor(tokenized_sentence)) tokenized_lens.append(len(tokenized_sentence)) speakers = [sentence["from"] for sentence in source['conversations']] - assert mask_role in speakers, "mask role not in the conversation" + # assert mask_role in speakers, "mask role not in the conversation" + split_mask = mask_role.split(',') + for s in 
split_mask: + assert s in speakers, "mask role not in the conversation" + target = torch.LongTensor(target) # not going to train on the header target[:header_len] = IGNORE_INDEX diff --git a/tests/collections/nlp/test_chat_sft_dataset.py b/tests/collections/nlp/test_chat_sft_dataset.py index bc44049f8f11..fce07c0f3897 100644 --- a/tests/collections/nlp/test_chat_sft_dataset.py +++ b/tests/collections/nlp/test_chat_sft_dataset.py @@ -76,6 +76,36 @@ def create_data_points(mask_user, turn_num, records, temp_file, t2v, label=True) return data_points +def create_custom_data_points(mask_list, turn_num, records, temp_file): + data_points = [] + with open(temp_file, 'w', encoding='utf-8') as f: + for r in range(records): + record = {} + record['system'] = 'a chat' + record['mask'] = '' + for i, s in enumerate(mask_list): + record['mask'] += s + if i != len(mask_list) - 1: + record['mask'] += ',' + turns = [] + record['conversations'] = turns + for i in range(turn_num): + turn = {} + if i % 4 == 0: + turn['from'] = 'User' + elif i % 4 == 1: + turn['from'] = 'Assistant' + elif i % 4 == 2: + turn['from'] = 'Function' + else: + turn['from'] = 'Assistant' + turn['value'] = get_random_sentence() + turns.append(turn) + f.write(json.dumps(record, ensure_ascii=False) + '\n') + data_points.append(record) + return data_points + + @pytest.mark.skipif(not os.path.exists('/home/TestData'), reason='Not a Jenkins machine') class TestGPTSFTChatDataset: @classmethod @@ -118,6 +148,41 @@ def _mask_user_test(self, tokenizer, ids_to_text): finally: os.remove(temp_file) + def _mask_user_func_test(self, tokenizer, ids_to_text): + random.seed(5) + temp_file = '/tmp/test_file.jsonl' + turn_num = 10 + records = 2 + mask_list = ["User", "Function"] + try: + # create custom data for Agent SFT case + data_points = create_custom_data_points(mask_list, turn_num, records, temp_file) + print(data_points) + d = GPTSFTChatDataset( + temp_file, + tokenizer, + 4096, + 1, + index_mapping_dir='/tmp/', + hf_dataset=True, + special_tokens=self.special_tokens, + ) + for i in range(len(d)): + result = d[i] + input_ids = result['input_ids'] + mask = result['mask'] + text = ids_to_text(input_ids[mask].tolist()) + print("【text】", i) + print(text) + expected_text = '' + for j in range(1, turn_num, 2): + expected_text += data_points[i]['conversations'][j]['value'] + self.suffix + print("【expected text】", i) + print(expected_text) + assert text == expected_text + finally: + os.remove(temp_file) + def _mask_assistant_test(self, tokenizer, ids_to_text): random.seed(3) temp_file = '/tmp/test_file.jsonl' @@ -321,6 +386,14 @@ def test_43B_tokenizer_mask_assistant_nolabel(self): tokenizer = get_nmt_tokenizer(library='sentencepiece', tokenizer_model=TOKENIZER_FILE_43B) self._mask_assistant_nolabel_test(tokenizer, tokenizer.ids_to_text) + @pytest.mark.unit + def test_mpt_tokenizer_mask_user_func(self): + tokenizer = get_nmt_tokenizer( + library='huggingface', model_name='gpt2', merges_file=MERGE_FILE, vocab_file=VOCAB_FILE, use_fast=True + ) + tokenizer.add_special_tokens({'additional_special_tokens': ['', '', '']}) + self._mask_user_func_test(tokenizer, partial(ids_to_text, tokenizer)) + @pytest.mark.unit def test_mpt_tokenizer_mask_user(self): tokenizer = get_nmt_tokenizer( From dc08eddab27ef1da0cbf4f272c25dedf7cb8f11e Mon Sep 17 00:00:00 2001 From: meatybobby Date: Tue, 14 Jan 2025 09:57:20 -0800 Subject: [PATCH 04/27] Fix starcoder2 missing bias in nemo2 config (#11809) --- nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py | 3 +++ 1 file changed, 3 
insertions(+) diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 9adc83c1b82a..1d344fd55735 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -553,6 +553,9 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat elif nemo_model_config["activation"] == "squared_relu": nemo_model_config["activation"] = "squared-relu" + if nemo_model_config.get("add_bias_linear"): + nemo_model_config["bias"] = True + nemo_model_config["mcore_gpt"] = True nemo_model_config["max_position_embeddings"] = nemo_model_config.get("seq_length", 4096) nemo_model_config["rotary_percentage"] = nemo_model_config.get("rotary_percent", 1.0) From c856900f8ef16f144476f5978a2a7e6e99195a2b Mon Sep 17 00:00:00 2001 From: Huiying Date: Tue, 14 Jan 2025 13:58:05 -0800 Subject: [PATCH 05/27] update nemo2 tutorial container verison (#11832) Signed-off-by: Huiying Li --- tutorials/llm/llama-3/nemo2-sft-peft/README.rst | 4 ++-- tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/README.rst b/tutorials/llm/llama-3/nemo2-sft-peft/README.rst index d1bd7b87759c..0dee7c316697 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/README.rst +++ b/tutorials/llm/llama-3/nemo2-sft-peft/README.rst @@ -20,7 +20,7 @@ Requirements * Software Requirements * Use the latest [NeMo Framework Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) . Note that you must be logged in to the container registry to view this page. - * This notebook is tested on the container: `nvcr.io/nvidia/nemo:24.12-rc0`. + * This notebook is tested on the container: `nvcr.io/nvidia/nemo:24.12`. * Get your Hugging Face [access token](https://huggingface.co/docs/hub/en/security-tokens), which will be used to obtain the tokenizer required during training. * NeMo 2.0 and NeMo-Run @@ -42,7 +42,7 @@ Start the NeMo Framework Container --rm -it \ -v ${PWD}:/workspace \ -w /workspace \ - nvcr.io/nvidia/nemo:24.12-rc0 bash + nvcr.io/nvidia/nemo:24.12 bash Once you are inside the container, you can run `nvidia-smi` to verify that the GPUs are accessible. diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb index b3393d133a45..730ffd9ff972 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb @@ -533,7 +533,7 @@ "\n", "2. [NeMo-Run GitHub repo](https://github.com/NVIDIA/NeMo-Run/)\n", "\n", - "3. NeMo Framework Container: `nvcr.io/nvidia/nemo:24.12-rc0`\n", + "3. 
NeMo Framework Container: `nvcr.io/nvidia/nemo:24.12`\n", "\n", "\n", "\n", From 84d5fad2eb2152161c759d1153dfc5d50f11de62 Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Tue, 14 Jan 2025 18:22:22 -0800 Subject: [PATCH 06/27] MCore Partial DistOpt Feature (#10693) * Added interface arg for partial DistOpt Signed-off-by: Selvaraj Anandaraj * Typo fix Signed-off-by: Selvaraj Anandaraj * Apply isort and black reformatting Signed-off-by: sanandaraj5597 * Changed variable name Signed-off-by: Selvaraj Anandaraj --------- Signed-off-by: Selvaraj Anandaraj Signed-off-by: sanandaraj5597 Co-authored-by: Selvaraj Anandaraj Co-authored-by: sanandaraj5597 --- .../language_modeling/megatron_base_model.py | 1 + .../language_modeling/megatron_gpt_model.py | 1 + .../modules/common/megatron/megatron_init.py | 2 ++ nemo/collections/nlp/parts/nlp_overrides.py | 1 + nemo/utils/app_state.py | 17 +++++++++++++++++ 5 files changed, 22 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 330f6ffee05b..cf13a0318ffc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -195,6 +195,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): virtual_pipeline_model_parallel_size=vp_size, pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0), use_tp_pp_dp_mapping=cfg.get('use_tp_pp_dp_mapping', False), + num_distributed_optimizer_instances=self.cfg.optim.get('num_distributed_optimizer_instances', 1), context_parallel_size=cfg.get('context_parallel_size', 1), micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 02ef522dde1f..caa909dc7ead 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -593,6 +593,7 @@ def setup_mcore_distributed_parallel(self): ddp_config = DistributedDataParallelConfig( grad_reduce_in_fp32=(self.cfg.optim.get('grad_sync_dtype', 'fp32') == 'fp32'), overlap_grad_reduce=self.cfg.optim.get('overlap_grad_sync', False), + num_distributed_optimizer_instances=self.cfg.optim.get('num_distributed_optimizer_instances', 1), use_distributed_optimizer=True, check_for_nan_in_grad=self.cfg.optim.get('check_for_nan_in_grad', False), # mcore bucket_size is based on num of parameters, therefore not diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 8b42985a3937..5e44fda2be23 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -106,6 +106,7 @@ def initialize_model_parallel_for_nemo( apex_transformer_log_level=30, use_tp_pp_dp_mapping=False, use_te_rng_tracker=False, + num_distributed_optimizer_instances=1, ): if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: @@ -117,6 +118,7 @@ def initialize_model_parallel_for_nemo( app_state.world_size = world_size app_state.local_rank = local_rank app_state.use_tp_pp_dp_mapping = use_tp_pp_dp_mapping + app_state.num_distributed_optimizer_instances = num_distributed_optimizer_instances 
app_state.expert_model_parallel_size = expert_model_parallel_size app_state.tensor_model_parallel_size = tensor_model_parallel_size app_state.pipeline_model_parallel_size = pipeline_model_parallel_size diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 144583db249a..ee62b80fe1be 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -163,6 +163,7 @@ def init_model_parallel( use_sharp=sharp, expert_model_parallel_size=app_state.expert_model_parallel_size, order='tp-pp-dp' if app_state.use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp', + num_distributed_optimizer_instances=app_state.num_distributed_optimizer_instances, distributed_timeout_minutes=distributed_timeout_minutes, ) diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 37193cfdd8c5..643d5afe5815 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -58,6 +58,7 @@ def __init__(self): self._data_parallel_size = None self._data_parallel_group = None self._use_tp_pp_dp_mapping = False + self._num_distributed_optimizer_instances = 1 self._megatron_checkpoint_version = None self._use_fp8 = False self._context_parallel_size = None @@ -242,6 +243,22 @@ def use_tp_pp_dp_mapping(self): def use_tp_pp_dp_mapping(self, use_new_mapping): self._use_tp_pp_dp_mapping = use_new_mapping + @property + def num_distributed_optimizer_instances(self): + """Property returns the factor by which the Partial DistOpt is sharded. + Returns: + The partial DistOpt shard factor + """ + return self._num_distributed_optimizer_instances + + @num_distributed_optimizer_instances.setter + def num_distributed_optimizer_instances(self, shard_factor): + """Property sets the factor by which the Partial DistOpt is sharded. + Args: + shard_factor (int): The partial DistOpt shard factor. + """ + self._num_distributed_optimizer_instances = shard_factor + @property def virtual_pipeline_model_parallel_size(self): """Property returns the number of GPUs in each model parallel group. 
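
Editor's note on the patch above: the new `num_distributed_optimizer_instances` knob is read straight off the model's optimizer config (see the `cfg.optim.get('num_distributed_optimizer_instances', 1)` calls) and then threaded through `initialize_model_parallel_for_nemo`, `DistributedDataParallelConfig`, and `AppState`. The following is a minimal, hypothetical sketch of that lookup only — the config fragment and the value 2 are illustrative assumptions, not taken from any NeMo recipe:

    from omegaconf import OmegaConf

    # Hypothetical model config fragment; only the new key matters here.
    cfg = OmegaConf.create({"optim": {"num_distributed_optimizer_instances": 2}})

    # Same lookup pattern as megatron_base_model.py / megatron_gpt_model.py in the patch:
    # default to 1 (a single, fully sharded DistOpt instance) when the key is absent,
    # otherwise shard the distributed optimizer state across that many instances.
    shard_factor = cfg.optim.get("num_distributed_optimizer_instances", 1)
    print(shard_factor)  # -> 2
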
From 3591cf8cf9be7aa47a1da3aca2e2a2d7318208a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 15 Jan 2025 19:34:35 +0100 Subject: [PATCH 07/27] ci: Shorter retention period MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/_test_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index f198ffe6af1b..911fcc17e636 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -47,7 +47,7 @@ jobs: steps: - name: Docker system cleanup run: | - docker system prune -a --filter "until=48h" --force || true + docker system prune -a --filter "until=24h" --force || true - name: Docker pull image run: | From 1626ddded63af1fe82b0bee482fba44c57b8203d Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:25:44 -0800 Subject: [PATCH 08/27] Add Seq Packing in NeMo / Neva2 (#11633) * api updates and fixes Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix Signed-off-by: yaoyu-33 * fix arg Signed-off-by: yaoyu-33 * update seq packing in mock ds Signed-off-by: yaoyu-33 * save Signed-off-by: yaoyu-33 * update preprocess_data Signed-off-by: yaoyu-33 * update seq packing Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix sp Signed-off-by: yaoyu-33 * save Signed-off-by: yaoyu-33 * fix seq packing Signed-off-by: yaoyu-33 * add truncation and padding Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Fix issues Signed-off-by: yaoyu-33 * change LLaVATemplateConfig variables to class variables * change to use field with default attributes * Apply isort and black reformatting Signed-off-by: yashaswikarnati * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Add seq packing option in energon Signed-off-by: yaoyu-33 * Fix energon conversation Signed-off-by: yaoyu-33 * add energon option in neva training script Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * add ci test for packed seq Signed-off-by: yaoyu-33 * fix mock dataset seq packing Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix mock dataset seq packing Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix lint and update seq pack func Signed-off-by: yaoyu-33 * fix energon module Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix comments Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * address lightning issues Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update sequence_packing.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * update energon requirements Signed-off-by: yaoyu-33 * Fix for energon update Signed-off-by: yaoyu-33 * fix for test Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Signed-off-by: yashaswikarnati Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: yaoyu-33 Co-authored-by: ykarnati Co-authored-by: yashaswikarnati --- .github/workflows/cicd-main.yml | 18 +- nemo/collections/llm/peft/api.py | 4 +- .../multimodal/data/energon/base.py | 5 + .../multimodal/data/energon/config.py | 24 ++- 
.../multimodal/data/energon/conversation.py | 2 +- .../multimodal/data/energon/task_encoder.py | 170 +++++++++++++++--- nemo/collections/vlm/inference/base.py | 2 +- nemo/collections/vlm/neva/data/config.py | 6 +- nemo/collections/vlm/neva/data/lazy.py | 115 ++++++------ nemo/collections/vlm/neva/data/mock.py | 69 ++++++- .../vlm/neva/data/sequence_packing.py | 157 ++++++++++++++++ nemo/collections/vlm/neva/model/base.py | 38 +++- nemo/collections/vlm/recipes/llava15_13b.py | 2 +- nemo/collections/vlm/recipes/llava15_7b.py | 2 +- nemo/collections/vlm/recipes/llava_next_7b.py | 2 +- nemo/lightning/megatron_parallel.py | 5 +- requirements/requirements_multimodal.txt | 2 +- scripts/vlm/llava_next_finetune.py | 2 +- scripts/vlm/llava_next_pretrain.py | 2 +- scripts/vlm/mllama_finetune.py | 2 +- scripts/vlm/neva_finetune.py | 109 ++++++++--- .../data/energon/test_data_module.py | 4 +- .../{mllama_train.py => test_mllama_train.py} | 0 .../vlm/{neva_train.py => test_neva_train.py} | 7 + 24 files changed, 611 insertions(+), 138 deletions(-) create mode 100644 nemo/collections/vlm/neva/data/sequence_packing.py rename tests/collections/vlm/{mllama_train.py => test_mllama_train.py} (100%) rename tests/collections/vlm/{neva_train.py => test_neva_train.py} (95%) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 16037920d080..a815be7bdc2f 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4329,11 +4329,24 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - python tests/collections/vlm/neva_train.py \ + python tests/collections/vlm/test_neva_train.py \ --devices=1 \ --max-steps=5 \ --experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }} + L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + python tests/collections/vlm/test_neva_train.py \ + --devices=1 \ + --max-steps=5 \ + --experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }} \ + --use_packed_sequence + L2_NeMo_2_MLLAMA_MOCK_TRAINING: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4342,7 +4355,7 @@ jobs: RUNNER: self-hosted-azure SCRIPT: | TRANSFORMERS_OFFLINE=1 \ - python tests/collections/vlm/mllama_train.py \ + python tests/collections/vlm/test_mllama_train.py \ --devices=1 \ --max-steps=5 \ --experiment-dir=/tmp/nemo2_mllama_results/${{ github.run_id }} @@ -5060,6 +5073,7 @@ jobs: - Speech_Checkpoints_tests - L2_Stable_Diffusion_Training - L2_NeMo_2_NEVA_MOCK_TRAINING + - L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING - L2_NeMo_2_MLLAMA_MOCK_TRAINING - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - L2_NeMo_2_GPT_DDP_Param_Parity_check diff --git a/nemo/collections/llm/peft/api.py b/nemo/collections/llm/peft/api.py index c05fd0b8edde..b70601faf7a3 100644 --- a/nemo/collections/llm/peft/api.py +++ b/nemo/collections/llm/peft/api.py @@ -16,10 +16,10 @@ from pathlib import Path from typing import Tuple, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch +from lightning.pytorch.trainer.states import TrainerFn from megatron.core import dist_checkpointing -from pytorch_lightning.trainer.states import TrainerFn from rich.console import Console from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import 
AutoTokenizer diff --git a/nemo/collections/multimodal/data/energon/base.py b/nemo/collections/multimodal/data/energon/base.py index 3dfd495edd82..c29935880889 100644 --- a/nemo/collections/multimodal/data/energon/base.py +++ b/nemo/collections/multimodal/data/energon/base.py @@ -68,6 +68,7 @@ def __init__( multimodal_sample_config: Optional[MultiModalSampleConfig] = MultiModalSampleConfig(), task_encoder: Optional[MultiModalTaskEncoder] = None, decoder_seq_length: Optional[int] = None, + packing_buffer_size: Optional[int] = None, ) -> None: """ Initialize the EnergonMultiModalDataModule. @@ -84,6 +85,8 @@ def __init__( Defaults to MultiModalSampleConfig(). task_encoder (MultiModalTaskEncoder, optional): Encoder responsible for encoding and batching samples. If not provided, a default (MultimodalTaskEncoder) encoder will be created. Defaults to None. + decoder_seq_length (int, optional): The maximum sequence length for the decoder. Used in encoder-decoder models. + packing_buffer_size (int, optional): Size of the packing buffer for batched samples. Defaults to None. """ super().__init__() @@ -113,6 +116,7 @@ def __init__( ) self.train_dataloader_object = None self.val_dataloader_object = None + self.packing_buffer_size = packing_buffer_size def io_init(self, **kwargs) -> fdl.Config[Self]: @@ -146,6 +150,7 @@ def datasets_provider(self, worker_config, split: Literal['train', 'val'] = 'val task_encoder=self.task_encoder, worker_config=worker_config, max_samples_per_sequence=None, + packing_buffer_size=self.packing_buffer_size, shuffle_buffer_size=100, split_part=split, ) diff --git a/nemo/collections/multimodal/data/energon/config.py b/nemo/collections/multimodal/data/energon/config.py index c145c5e51019..abbfd874880f 100644 --- a/nemo/collections/multimodal/data/energon/config.py +++ b/nemo/collections/multimodal/data/energon/config.py @@ -13,8 +13,11 @@ # limitations under the License. from dataclasses import dataclass, field -from typing import List +from typing import List, Tuple, Union + import torch +from megatron.core.packed_seq_params import PackedSeqParams + from nemo.collections.multimodal.data.energon.conversation import LLaVATemplateConfig @@ -34,7 +37,7 @@ class ImageToken(MultiModalToken): @dataclass class ImageTextSample: - '''Sample type for template formatted raw image text sample''' + """Sample type for template formatted raw image text sample""" __key__: str = '' images: torch.Tensor = field(default_factory=lambda: torch.empty(0)) @@ -43,6 +46,15 @@ class ImageTextSample: loss_mask: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float)) +@dataclass +class PackedImageTextSample(ImageTextSample): + """Sample type for packed image text sample""" + + __restore_key__: Tuple[Union[str, int, tuple], ...] 
= () + position_ids: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float)) + packed_seq_params: PackedSeqParams = field(default_factory=lambda: PackedSeqParams()) + + @dataclass class ImageTextRawBatch: """Sample type for image text raw batch""" @@ -56,6 +68,14 @@ class ImageTextRawBatch: loss_mask: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float)) +@dataclass +class PackedImageTextRawBatch(ImageTextRawBatch): + """Sample type for image text raw batch""" + + position_ids: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float)) + packed_seq_params: PackedSeqParams = field(default_factory=lambda: PackedSeqParams()) + + @dataclass class MultiModalSampleConfig: image_token: ImageToken = field(default_factory=ImageToken) diff --git a/nemo/collections/multimodal/data/energon/conversation.py b/nemo/collections/multimodal/data/energon/conversation.py index 31019ae9c615..95b0ad184f8c 100644 --- a/nemo/collections/multimodal/data/energon/conversation.py +++ b/nemo/collections/multimodal/data/energon/conversation.py @@ -30,7 +30,7 @@ class LLaVATemplateConfig(BaseConversationTemplateConfig): """LLava-specific template configuration which extends the base config""" system: str = field( - default="A chat between a curious user and artificial assistant agent. " + default="A chat between a curious user and an artificial intelligence assistant. " "The assistant gives helpful, detailed and polite answers to user's questions." ) roles: List[str] = field(default_factory=lambda: ['user', 'assistant']) diff --git a/nemo/collections/multimodal/data/energon/task_encoder.py b/nemo/collections/multimodal/data/energon/task_encoder.py index 7a8d0f0ab033..80b6e156f4a1 100644 --- a/nemo/collections/multimodal/data/energon/task_encoder.py +++ b/nemo/collections/multimodal/data/energon/task_encoder.py @@ -25,14 +25,21 @@ batch_list, batch_pad_stack, ) +from megatron.energon.task_encoder.base import stateless -from nemo.collections.multimodal.data.energon.config import ImageTextRawBatch, ImageTextSample +from nemo.collections.multimodal.data.energon.config import ( + ImageTextRawBatch, + ImageTextSample, + PackedImageTextRawBatch, + PackedImageTextSample, +) from nemo.collections.multimodal.data.energon.sample_encoder import ( InterleavedSampleEncoder, SampleEncoder, SimilarityInterleavedEncoder, VQASampleEncoder, ) +from nemo.utils import logging class MultiModalTaskEncoder( @@ -54,16 +61,34 @@ class MultiModalTaskEncoder( for model input. """ - def __init__(self, tokenizer, image_processor, multimodal_sample_config): + def __init__( + self, + tokenizer, + image_processor, + multimodal_sample_config, + packed_sequence=False, + packed_sequence_size=-1, + num_image_embeddings_per_tile=576, + ): """ Initialize the MultiModalTaskEncoder with specific encoders for different sample types. Parameters: - tokenizer (Tokenizer): The tokenizer used for processing text across different sample types. - image_processor (ImageProcessor): The image processor used for preprocessing images. - multimodal_sample_config (MultiModalSampleConfig): MultiModalSampleConfig object. + tokenizer (Tokenizer): The tokenizer used for processing textual components across sample types. + image_processor (ImageProcessor): The image processor responsible for preprocessing image data. + multimodal_sample_config (MultiModalSampleConfig): Configuration object defining properties and + requirements for multimodal samples. 
+ packed_sequence (bool, optional): Flag indicating whether packed sequences are used. Default is False. + packed_sequence_size (int, optional): The size of packed sequences, used when `packed_sequence` is True. + Default is -1. + num_image_embeddings_per_tile (int, optional): Number of image embeddings per image tile. Determines + the granularity of image features. Default is 576. """ self.tokenizer = tokenizer + self.sample_config = multimodal_sample_config + self.packed_sequence = packed_sequence + self.num_image_embeddings_per_tile = num_image_embeddings_per_tile # only used with seq packing + self.packed_sequence_size = packed_sequence_size self.encoders: Dict[str, SampleEncoder] = { VQASample.__name__: VQASampleEncoder( tokenizer=tokenizer, @@ -92,6 +117,7 @@ def register_encoder(self, sample_type: str, encoder: SampleEncoder) -> None: """ self.encoders[sample_type] = encoder + @stateless def encode_sample( self, sample: Union[VQASample, InterleavedSample, SimilarityInterleavedSample, CaptioningSample] ) -> ImageTextSample: @@ -118,7 +144,9 @@ def encode_sample( encoded_sample = encoder.encode(input_sample=sample, output_sample=ImageTextSample()) return encoded_sample - def batch(self, samples: List[ImageTextSample]) -> ImageTextRawBatch: + def batch( + self, samples: List[Union[ImageTextSample, PackedImageTextSample]] + ) -> Union[ImageTextRawBatch, PackedImageTextRawBatch]: """ Batch a list of encoded samples into a single raw batch. @@ -131,26 +159,51 @@ def batch(self, samples: List[ImageTextSample]) -> ImageTextRawBatch: ImageTextRawBatch: The batched data, including images, tokens, labels, and loss masks. """ - keys, images, tokens, labels, loss_mask = [], [], [], [], [] - for sample in samples: - keys.append(sample.__key__) - images.append(sample.images) - tokens.append(sample.tokens) - labels.append(sample.labels) - loss_mask.append(sample.loss_mask) - - batch_keys = batch_list(keys) - batch_images = batch_pad_stack(images) - batch_prompt_tokens = batch_pad_stack(tokens) - batch_labels = batch_pad_stack(labels) - batch_loss_mask = batch_pad_stack(loss_mask) - return ImageTextRawBatch( - __keys__=batch_keys, - images=batch_images, - tokens=batch_prompt_tokens, - labels=batch_labels, - loss_mask=batch_loss_mask, - ) + if self.packed_sequence: + if len(samples) > 1: + raise ValueError( + "Micro batch size should be 1 when training with packed sequence, but your micro batch size " + f"is {len(samples)}. \nThe following config is equivalent to your current setting for " + f"a packed dataset. Please update your config to the following: \n" + f"Set micro batch size to 1 (currently {len(samples)})\n" + f"Set global batch size to `global_batch_size // {len(samples)}` " + f"Set packed sequence length to `original_sample_seq_len * {len(samples)}` " + f"(currently {self.packed_sequence_size}) \n" + f"For details please visit " + f"https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/packed_sequence.html" + ) + # The batching are taken care by packing. 
+ sample = samples[0] + return PackedImageTextRawBatch( + __keys__=sample.__key__, + images=sample.images, + tokens=sample.tokens, + labels=sample.labels, + loss_mask=sample.loss_mask, + position_ids=sample.position_ids, + packed_seq_params=sample.packed_seq_params, + ) + else: + keys, images, tokens, labels, loss_mask = [], [], [], [], [] + for sample in samples: + keys.append(sample.__key__) + images.append(sample.images) + tokens.append(sample.tokens) + labels.append(sample.labels) + loss_mask.append(sample.loss_mask) + + batch_keys = batch_list(keys) + batch_images = batch_pad_stack(images) + batch_prompt_tokens = batch_pad_stack(tokens) + batch_labels = batch_pad_stack(labels) + batch_loss_mask = batch_pad_stack(loss_mask) + return ImageTextRawBatch( + __keys__=batch_keys, + images=batch_images, + tokens=batch_prompt_tokens, + labels=batch_labels, + loss_mask=batch_loss_mask, + ) def encode_batch(self, batch_data: ImageTextRawBatch) -> dict: """ @@ -165,7 +218,7 @@ def encode_batch(self, batch_data: ImageTextRawBatch) -> dict: Returns: dict: A dictionary containing the encoded batch data, ready for model input. """ - batch_dict = dataclasses.asdict(batch_data) + batch_dict = batch_data.__dict__ if 'images' in batch_dict: batch_dict['media'] = batch_dict['images'] del batch_dict['images'] @@ -177,3 +230,66 @@ def encode_batch(self, batch_data: ImageTextRawBatch) -> dict: if 'attention_mask' not in batch_dict: batch_dict['attention_mask'] = None return batch_dict + + def select_samples_to_pack(self, samples): + """Selects which samples will be packed together. + + NOTE: Energon dataloader calls this method internally if packing is used. + Please see https://nvidia.github.io/Megatron-Energon/packing.html + """ + from nemo.collections.vlm.neva.data.sequence_packing import greedy_knapsack, predict_seq_len + + media_token_id = self.sample_config.image_token.token_id + lengths = [ + predict_seq_len( + sample.tokens, + media_token_index=media_token_id, + num_image_embeddings_per_tile=self.num_image_embeddings_per_tile, + ) + for sample in samples + ] + packed_samples = greedy_knapsack(lengths, samples, self.packed_sequence_size) + avg_samples_per_bin = round(len(lengths) / len(packed_samples)) + logging.info( + f"[Seq Packing Info] - Packing seq len: {self.packed_sequence_size}, " + f"Buffered samples: {len(lengths)}, Total number of bins: {len(packed_samples)}, " + f"Average samples per bin: {avg_samples_per_bin}" + ) + return packed_samples + + @stateless + def pack_selected_samples(self, samples): + """ + Function to pack a list of ImageTaskSample into a single ImageTaskSamplePacked. + + NOTE: Energon dataloader calls this method internally if packing is used. + Please see https://nvidia.github.io/Megatron-Energon/packing.html + + Args: + samples: List of ImageTaskSample instances to pack into one sample. + + Returns: + ImageTaskSamplePacked instance. 
+ """ + from nemo.collections.vlm.neva.data.sequence_packing import convert_to_packed + + packed_images = torch.stack([sample.images for sample in samples]) + media_token_id = self.sample_config.image_token.token_id + packed_tokens, packed_labels, packed_position_ids, packed_loss_mask, packed_seq_params = convert_to_packed( + tokens=[sample.tokens for sample in samples], + labels=[sample.labels for sample in samples], + num_image_embeddings_per_tile=self.num_image_embeddings_per_tile, + media_token_index=media_token_id, + ignore_index=self.sample_config.ignore_place_holder, + ) + + return PackedImageTextSample( + __key__=",".join([s.__key__ for s in samples]), + __restore_key__=(), # Will be set by energon based on `samples` + tokens=packed_tokens, + labels=packed_labels, + images=packed_images, + position_ids=packed_position_ids, + loss_mask=packed_loss_mask, + packed_seq_params=packed_seq_params, + ) diff --git a/nemo/collections/vlm/inference/base.py b/nemo/collections/vlm/inference/base.py index 77918bae26b9..bbceb851edae 100644 --- a/nemo/collections/vlm/inference/base.py +++ b/nemo/collections/vlm/inference/base.py @@ -14,7 +14,7 @@ from typing import List, Optional, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import torch.distributed from megatron.core.inference.common_inference_params import CommonInferenceParams diff --git a/nemo/collections/vlm/neva/data/config.py b/nemo/collections/vlm/neva/data/config.py index 3b22d5a493b3..2cf3dd80f47d 100644 --- a/nemo/collections/vlm/neva/data/config.py +++ b/nemo/collections/vlm/neva/data/config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from .multimodal_tokens import ImageToken, MultiModalToken, VideoToken @@ -31,7 +31,7 @@ class DataConfig: @dataclass class ImageDataConfig(DataConfig): media_type: str = "image" - media_token: MultiModalToken = ImageToken + media_token: MultiModalToken = field(default_factory=lambda: ImageToken()) image_folder: Optional[str] = None image_process_mode: str = 'pad' @@ -39,7 +39,7 @@ class ImageDataConfig(DataConfig): @dataclass class VideoDataConfig(DataConfig): media_type: str = "video" - media_token: MultiModalToken = VideoToken + media_token: MultiModalToken = VideoToken() splice_single_frame: Optional[str] = None # 'first', 'middle', 'last' will represent video as first / middle / last frame only, all other frames discarded. 
num_frames: int = 8 # Selects the number of frames to use from the video diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/lazy.py index 066310867777..90199d3c6d30 100644 --- a/nemo/collections/vlm/neva/data/lazy.py +++ b/nemo/collections/vlm/neva/data/lazy.py @@ -251,7 +251,6 @@ def __init__( data_config, tokenizer, image_processor, - sequence_length=None, ): super().__init__() if data_path is not None: @@ -269,8 +268,6 @@ def __init__( self.tokenizer = self.tokenizer.tokenizer self.image_processor = image_processor - self.sequence_length = sequence_length - self.conv_template = data_config.conv_template self.conv = supported_conv_templates[self.conv_template] self.image_process_mode = data_config.image_process_mode @@ -381,6 +378,8 @@ def __init__( data_config, tokenizer, image_processor, + packed_sequence=False, + num_image_embeddings_per_tile=576, ): if data_path.endswith(".json"): @@ -414,29 +413,12 @@ def __init__( else: raise ValueError(f"Formatting of {data_path} is not supported in Neva.") + self.packed_sequence = packed_sequence + self.num_image_embeddings_per_tile = num_image_embeddings_per_tile def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: data_config = self.data_config - packed_sequence = "cu_seqlens" in instances[0] - max_len = max(instance['tokens'].shape[0] for instance in instances) - for instance in instances: - pad_len = max_len - instance['tokens'].shape[0] - instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) - instance['labels'] = F.pad(instance['labels'], (0, pad_len), 'constant', IGNORE_INDEX) - if packed_sequence and instance["cu_seqlens"][-1] != max_len: - instance["cu_seqlens"] = torch.cat((instance["cu_seqlens"], torch.IntTensor([max_len])), 0) - - if packed_sequence: - max_len_cu = max(instance['cu_seqlens'].shape[0] for instance in instances) - max_len_image = max(instance['image'].shape[0] for instance in instances) - for instance in instances: - pad_len_cu = max_len_cu - instance['cu_seqlens'].shape[0] - instance['cu_seqlens'] = F.pad(instance['cu_seqlens'], (0, pad_len_cu), 'constant', max_len) - - x = instance['image'] - num_pad = max_len_image - x.shape[0] - pad_tensor = torch.zeros(num_pad, *x.shape[1:], dtype=x.dtype, device=x.device) - instance['image'] = torch.cat((x, pad_tensor), dim=0) + packed_sequence = self.packed_sequence media_type = data_config.media_type if media_type == 'image': @@ -447,24 +429,30 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: else: raise ValueError(f"Unsupported media type {media_type}") - batch = default_collate(instances) - tokenizer = self.tokenizer + if packed_sequence: + from nemo.collections.vlm.neva.data.sequence_packing import convert_to_packed + + media_token_id = self.data_config.media_token.token_index + tokens, labels, position_ids, loss_mask, packed_seq_params = convert_to_packed( + tokens=[instance['tokens'] for instance in instances], + labels=[instance['labels'] for instance in instances], + num_image_embeddings_per_tile=self.num_image_embeddings_per_tile, + media_token_index=media_token_id, + ignore_index=IGNORE_INDEX, + ) + attention_mask = None + else: # regular dataset + max_len = max(instance['tokens'].shape[0] for instance in instances) + for instance in instances: + pad_len = max_len - instance['tokens'].shape[0] + instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) + instance['labels'] = F.pad(instance['labels'], (0, pad_len), 'constant', IGNORE_INDEX) - 
tokens = batch['tokens'] - labels = batch['labels'] + batch = default_collate(instances) + tokenizer = self.tokenizer - if packed_sequence: - cu_seqlens = batch["cu_seqlens"] - position_ids = [] - for cu_seqlen in cu_seqlens: - position_ids.append([]) - for ind in range(0, len(cu_seqlen) - 1): - seqlen = cu_seqlen[ind + 1] - cu_seqlen[ind] - position_ids[-1].extend(list(range(seqlen))) - position_ids = torch.LongTensor(position_ids) - loss_mask = torch.ones(tokens.size(), dtype=torch.float, device=tokens.device) - attention_mask = torch.ones(tokens.size(), dtype=torch.long, device=tokens.device) - else: + tokens = batch['tokens'] + labels = batch['labels'] attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( data=tokens, eod_token=tokenizer.eos_token_id, @@ -472,8 +460,7 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: reset_attention_mask=data_config.reset_attention_mask, reset_position_ids=data_config.reset_position_ids, ) - - loss_mask[labels < 0] = 0.0 + loss_mask[labels < 0] = 0.0 batch = { 'tokens': tokens, @@ -484,7 +471,7 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: 'media': media, } if packed_sequence: - batch["cu_seqlens"] = cu_seqlens + batch["packed_seq_params"] = packed_seq_params return batch @@ -506,7 +493,8 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, - use_packed_sequence: bool = False, + packed_sequence: bool = False, + num_image_embeddings_per_tile: int = 576, seed: int = 1234, ) -> None: super().__init__() @@ -534,7 +522,8 @@ def __init__( self.pin_memory = pin_memory self.persistent_workers = persistent_workers self.seed = seed - self.use_packed_sequence = use_packed_sequence + self.packed_sequence = packed_sequence + self.num_image_embeddings_per_tile = num_image_embeddings_per_tile self.init_global_step = 0 if tokenizer is None or image_processor is None: @@ -546,6 +535,20 @@ def __init__( self.tokenizer = tokenizer or AutoTokenizer("llava-hf/llava-1.5-7b-hf") self.image_processor = image_processor or processor.image_processor + if self.packed_sequence: + import dataclasses + + def custom_on_megatron_step_start(self, step): + return dataclasses.replace( + step, + seq_length=self.seq_len, + micro_batch_size=1, # Override the micro_batch_size to 1 (used in PP) + num_microbatches=self.num_microbatches, + decoder_seq_length=self.decoder_seq_len, + ) + + MegatronDataSampler.on_megatron_step_start = custom_on_megatron_step_start + self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, decoder_seq_len=self.decoder_seq_length, @@ -556,14 +559,22 @@ def __init__( def setup(self, stage: str = "") -> None: assert len(self.paths) == 1, "not yet support blend dataset in Neva 2.0!" 
- if self.use_packed_sequence: - pass # TODO - else: - # TODO: - # rng = torch.Generator().manual_seed(self.seed) - # train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], generator=rng) - self._train_ds = NevaDataset(self.paths[0], self.data_config, self.tokenizer, self.image_processor) - self._validation_ds = NevaDataset(self.paths[0], self.data_config, self.tokenizer, self.image_processor) + self._train_ds = NevaDataset( + self.paths[0], + self.data_config, + self.tokenizer, + self.image_processor, + packed_sequence=self.packed_sequence, + num_image_embeddings_per_tile=self.num_image_embeddings_per_tile, + ) + self._validation_ds = NevaDataset( + self.paths[0], + self.data_config, + self.tokenizer, + self.image_processor, + packed_sequence=self.packed_sequence, + num_image_embeddings_per_tile=self.num_image_embeddings_per_tile, + ) def train_dataloader(self) -> TRAIN_DATALOADERS: return self._create_dataloader(self._train_ds) diff --git a/nemo/collections/vlm/neva/data/mock.py b/nemo/collections/vlm/neva/data/mock.py index 7533bf56ac46..495bd9f0dee5 100644 --- a/nemo/collections/vlm/neva/data/mock.py +++ b/nemo/collections/vlm/neva/data/mock.py @@ -42,6 +42,7 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, + packed_sequence: bool = False, ): super().__init__() self.seq_length = seq_length @@ -54,6 +55,7 @@ def __init__( self.num_workers = num_workers self.pin_memory = pin_memory self.persistent_workers = persistent_workers + self.packed_sequence = packed_sequence if tokenizer is None or image_processor is None: logging.warning(f"Processor or tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.") @@ -72,14 +74,36 @@ def __init__( ) def setup(self, stage: str = "") -> None: + seq_length = self.seq_length + if self.packed_sequence and self.micro_batch_size > 1: + seq_length = seq_length // self.micro_batch_size + logging.warning( + f"Packed sequence is used with mock dataset. Sequence length for each " + f"sample is update to `seq_length // self.micro_batch_size = {seq_length}`!" 
+ ) self._train_ds = _MockNevaDataset( - self.tokenizer, self.image_processor, "train", self.num_train_samples, self.seq_length + self.tokenizer, + self.image_processor, + "train", + self.num_train_samples, + seq_length, + packed_sequence=self.packed_sequence, ) self._validation_ds = _MockNevaDataset( - self.tokenizer, self.image_processor, "valid", self.num_val_samples, self.seq_length + self.tokenizer, + self.image_processor, + "valid", + self.num_val_samples, + seq_length, + packed_sequence=self.packed_sequence, ) self._test_ds = _MockNevaDataset( - self.tokenizer, self.image_processor, "test", self.num_test_samples, self.seq_length + self.tokenizer, + self.image_processor, + "test", + self.num_test_samples, + seq_length, + packed_sequence=self.packed_sequence, ) def train_dataloader(self) -> TRAIN_DATALOADERS: @@ -117,6 +141,8 @@ def __init__( num_samples: int, seq_length: int, seed: int = 42, + packed_sequence: bool = False, + num_image_embeddings_per_tile=576, ) -> None: super().__init__() self.name = name @@ -129,8 +155,10 @@ def __init__( self.length = num_samples self.seed = seed + self.packed_sequence = packed_sequence + self.num_image_embeddings_per_tile = num_image_embeddings_per_tile - self.loss_mask = torch.ones(self.seq_length, dtype=torch.float) + self.loss_mask = torch.ones(self.seq_length + 1 - num_image_embeddings_per_tile, dtype=torch.float) self.position_ids = torch.arange(self.seq_length, dtype=torch.int64) def __len__(self) -> int: @@ -143,7 +171,11 @@ def _get_text(self, idx: int) -> np.ndarray: def __getitem__(self, idx) -> Dict[str, torch.Tensor]: # Generate data of the expected size and datatype (based on GPTDataset). np_gen = np.random.default_rng(seed=(self.seed + idx)) - tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length + 1], dtype=np.int64)) + tokens = torch.from_numpy( + np_gen.integers( + self.vocab_size, size=[self.seq_length + 2 - self.num_image_embeddings_per_tile], dtype=np.int64 + ) + ) tokens[2] = IMAGE_TOKEN_INDEX # ImageToken token index labels = tokens.clone() images = torch.from_numpy(np_gen.random(size=[3, self.image_height, self.image_width], dtype=np.float32)) @@ -164,6 +196,33 @@ def _collate_fn(self, batch): """ collated_batch = data.dataloader.default_collate(batch) collated_batch["attention_mask"] = None + if self.packed_sequence: + from megatron.core.packed_seq_params import PackedSeqParams + + tokens = collated_batch["tokens"] + batch_size = tokens.shape[0] + valid_seqlen = self.seq_length + cu_seqlens = torch.arange( + 0, (batch_size + 1) * (valid_seqlen), step=(valid_seqlen), dtype=torch.int32, device=tokens.device + ) + cu_seqlens_padded = torch.arange( + 0, (batch_size + 1) * (valid_seqlen), step=(valid_seqlen), dtype=torch.int32, device=tokens.device + ) + qkv_format = 'thd' + packed_seq_params = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + cu_seqlens_q_padded=cu_seqlens_padded, + cu_seqlens_kv_padded=cu_seqlens_padded, + max_seqlen_q=valid_seqlen, + max_seqlen_kv=valid_seqlen, + qkv_format=qkv_format, + ) + collated_batch["packed_seq_params"] = packed_seq_params + + for key in ["tokens", "labels", "loss_mask", "position_ids"]: + collated_batch[key] = collated_batch[key].reshape(1, -1) + return collated_batch def collate_fn(self, batch): diff --git a/nemo/collections/vlm/neva/data/sequence_packing.py b/nemo/collections/vlm/neva/data/sequence_packing.py new file mode 100644 index 000000000000..1ddfe80c5797 --- /dev/null +++ b/nemo/collections/vlm/neva/data/sequence_packing.py 
@@ -0,0 +1,157 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bisect +from typing import List + +import torch +import torch.nn.functional as F +from megatron.core.packed_seq_params import PackedSeqParams + + +# pylint:disable=line-too-long +# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L19 +# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0. +def search_for_fit(numbers: List[int], capacity: int) -> int: + """Finds the index of largest number that fits into the knapsack with the given capacity.""" + index = bisect.bisect(numbers, capacity) + return -1 if index == 0 else (index - 1) + + +# pylint: disable=line-too-long +# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L27 +# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0. +def greedy_knapsack(item_sizes: List[int], samples: List, max_capacity: int) -> List: + """Greedy algorithm with binary search for the knapsack problem. + + Pack as many samples as possible given a maximum capacity and capacities of individual samples. + Used if sequence packing is enabled. + """ + assert len(item_sizes) == len(samples), "sample lengths and samples must have the same length." + + knapsacks = [] + + if len(item_sizes) == 0: + return knapsacks + + # Sort sample lengths and samples together. + sorted_item_sizes, sorted_samples = zip(*sorted(zip(item_sizes, samples), key=lambda x: x[0])) + sorted_item_sizes = list(sorted_item_sizes) + sorted_samples = list(sorted_samples) + + # Check if all samples fit in the knapsack capacity. + if sorted_item_sizes[-1] > max_capacity: + raise ValueError( + f"knapsack: A sample is larger {sorted_item_sizes[-1]} than the max_sequence_length {max_capacity}." + ) + + while sorted_item_sizes: + current_knapsack = [] + remaining_capacity = max_capacity + + while True: + idx = search_for_fit(sorted_item_sizes, remaining_capacity) + if idx == -1: + break # Can't fit more samples. + + remaining_capacity -= sorted_item_sizes[idx] + + sorted_item_sizes.pop(idx) + sample = sorted_samples.pop(idx) + current_knapsack.append(sample) + + knapsacks.append(current_knapsack) + + return knapsacks + + +def predict_seq_len(instance_tokens: torch.Tensor, num_image_embeddings_per_tile: int, media_token_index: int) -> int: + """ + Predict the effective sequence length, accounting for media embeddings. + + Args: + instance_tokens (torch.Tensor): Token tensor for a single instance. + num_image_embeddings_per_tile (int): Number of image embeddings per tile. + media_token_index (int): Token ID representing media. + + Returns: + int: Effective sequence length. 
+ """ + num_images = torch.sum(instance_tokens == media_token_index).item() + seqlen = len(instance_tokens) + (num_image_embeddings_per_tile - 1) * num_images + return seqlen + + +def convert_to_packed( + tokens: List[torch.Tensor], + labels: List[torch.Tensor], + num_image_embeddings_per_tile: int, + media_token_index: int, + ignore_index: int, + pad_to_multiple_of: int = 64, +): + """ + Convert tokens, labels, and associated inputs into a packed version with padded sequence parameters. + + Args: + tokens (list[torch.Tensor]): List of token tensors for each instance. + labels (list[torch.Tensor]): List of label tensors for each instance. + num_image_embeddings_per_tile (int): Number of image embeddings per tile. + media_token_index (int): Token ID representing media. + ignore_index (int): Value to use for padding labels. + pad_to_multiple_of (int): Sequence length will be padded to a multiple of this value. Default is 8. + """ + packed_tokens = [] + packed_labels = [] + packed_position_ids = [] + seqlens_padded = [] + cu_seqlens = [0] + cu_seqlens_padded = [0] + + for instance_tokens, instance_labels in zip(tokens, labels): + seqlen = predict_seq_len(instance_tokens, num_image_embeddings_per_tile, media_token_index) + seqlen_padded = (seqlen + pad_to_multiple_of - 1) // pad_to_multiple_of * pad_to_multiple_of + pad_len = seqlen_padded - seqlen + + if pad_len > 0: + instance_tokens = F.pad(instance_tokens, (0, pad_len), 'constant', 0) + instance_labels = F.pad(instance_labels, (0, pad_len), 'constant', ignore_index) + + packed_tokens.append(instance_tokens) + packed_labels.append(instance_labels) + packed_position_ids.append(torch.arange(len(instance_tokens), dtype=torch.int, device=instance_tokens.device)) + seqlens_padded.append(seqlen_padded) + cu_seqlens.append(cu_seqlens[-1] + seqlen) + cu_seqlens_padded.append(cu_seqlens_padded[-1] + seqlen_padded) + + packed_tokens = torch.cat(packed_tokens, dim=0).unsqueeze(0) + packed_labels = torch.cat(packed_labels, dim=0).unsqueeze(0) + packed_position_ids = torch.cat(packed_position_ids, dim=0).unsqueeze(0) + packed_loss_mask = torch.ones_like(packed_labels, dtype=torch.float, device=packed_labels.device) + packed_loss_mask[packed_labels < 0] = 0.0 + + cu_seqlens = torch.IntTensor(cu_seqlens) + cu_seqlens_padded = torch.IntTensor(cu_seqlens_padded) + + packed_seq_params = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + cu_seqlens_q_padded=cu_seqlens_padded, + cu_seqlens_kv_padded=cu_seqlens_padded, + max_seqlen_q=int(max(seqlens_padded)), + max_seqlen_kv=int(max(seqlens_padded)), + qkv_format='thd', + ) + + return packed_tokens, packed_labels, packed_position_ids, packed_loss_mask, packed_seq_params diff --git a/nemo/collections/vlm/neva/model/base.py b/nemo/collections/vlm/neva/model/base.py index 388078484a56..8cead72b4832 100644 --- a/nemo/collections/vlm/neva/model/base.py +++ b/nemo/collections/vlm/neva/model/base.py @@ -121,14 +121,19 @@ def neva_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: ) ) + packed_seq_params = _batch.get("packed_seq_params", None) _batch = { key: val.cuda(non_blocking=True) if key in required_keys and val is not None else None for key, val in _batch.items() } - # slice batch along sequence dimension for context parallelism - output = get_batch_on_this_context_parallel_rank(_batch) + if packed_seq_params is not None: + for attr in ["cu_seqlens_q", "cu_seqlens_kv", "cu_seqlens_q_padded", "cu_seqlens_kv_padded"]: + value = getattr(packed_seq_params, attr, None) + if value is not 
None: + setattr(packed_seq_params, attr, value.cuda(non_blocking=True)) + _batch["packed_seq_params"] = packed_seq_params - return output + return _batch def neva_forward_step(model, batch) -> torch.Tensor: @@ -596,6 +601,7 @@ def forward( image_token_index, num_image_tiles, attention_mask, + packed_seq_params, ) # [combined_seq_len, b, h_language], [b, combined_seq_len], [b, combined_seq_len] output = self.language_model( @@ -642,6 +648,7 @@ def _preprocess_data( image_token_index, num_image_tiles, attention_mask, + packed_seq_params, ): """Preprocess input data before input to language model. @@ -698,6 +705,8 @@ def _preprocess_data( labels.shape == loss_mask.shape ), f"mismatching labels shape {labels.shape} and loss mask shape {loss_mask.shape}" + packed_sequence = packed_seq_params is not None and packed_seq_params.qkv_format == "thd" + # Create indices for new text and label positions. with torch.no_grad(): image_token_mask = input_ids == image_token_index @@ -715,6 +724,16 @@ def _preprocess_data( # Pipeline parallel expects fixed input size. Check if we need to pad. if self._language_is_pipeline_parallel and max_seq_len < self._language_max_sequence_length: max_seq_len = self._language_max_sequence_length + if packed_sequence: + last_seqlen = packed_seq_params.cu_seqlens_q[-1] - packed_seq_params.cu_seqlens_q[-2] + last_seqlen_padded = max_seq_len - packed_seq_params.cu_seqlens_q_padded[-2] + assert ( + last_seqlen_padded >= last_seqlen + ), "`language_max_sequence_length` needs to increase for sequence packing to work properly." + packed_seq_params.cu_seqlens_q_padded[-1] = max_seq_len + packed_seq_params.cu_seqlens_kv_padded[-1] = max_seq_len + packed_seq_params.max_seqlen_q = max(last_seqlen_padded, packed_seq_params.max_seqlen_q) + packed_seq_params.max_seqlen_kv = max(last_seqlen_padded, packed_seq_params.max_seqlen_kv) if self.sequence_parallel_lm: if self.tp_comm_overlap_lm: @@ -835,7 +854,17 @@ def _preprocess_data( # Truncate if exceeding the language model's max sequence length. if final_embedding.shape[0] > self._language_max_sequence_length: final_embedding = final_embedding[: self._language_max_sequence_length] - if self.sequence_parallel_lm: + if packed_sequence: + truncate_len = packed_seq_params.cu_seqlens_q_padded[-1] - self._language_max_sequence_length + packed_seq_params.cu_seqlens_q_padded[-1] = self._language_max_sequence_length + packed_seq_params.cu_seqlens_kv_padded[-1] = self._language_max_sequence_length + packed_seq_params.cu_seqlens_q[-1] -= truncate_len + packed_seq_params.cu_seqlens_kv[-1] -= truncate_len + assert ( + packed_seq_params.cu_seqlens_q[-1] >= packed_seq_params.cu_seqlens_q[-2] + ), "with packed sequence, the truncation can only truncate on the last sequence." + + if self.sequence_parallel_lm and not packed_sequence: # Create an attention mask. This ensures correct computation. # This is done even when no padding was done as we set mask_type to # 'padding' or 'padding_causal' when using SP. 
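For reference, a minimal sketch of how the convert_to_packed helper added earlier in this patch produces the PackedSeqParams consumed by the hunks above (illustrative only, not part of the patch: the media token id, tile embedding count, and sample tensors are assumptions, and the import path is hypothetical since the new file's location is not shown in this excerpt):

import torch

# Hypothetical import: assumes the new packing module above is saved locally as packing.py.
from packing import convert_to_packed

tokens = [torch.tensor([1, 2, -200, 3]), torch.tensor([4, 5, 6])]        # -200 marks a media token
labels = [torch.tensor([-100, 2, -100, 3]), torch.tensor([-100, 5, 6])]  # -100 is the ignore index

packed_tokens, packed_labels, position_ids, loss_mask, seq_params = convert_to_packed(
    tokens,
    labels,
    num_image_embeddings_per_tile=576,  # e.g. CLIP ViT-L/14 at 336 px -> 24 x 24 patches
    media_token_index=-200,
    ignore_index=-100,
    pad_to_multiple_of=64,
)

# The first sample's effective length is 4 + (576 - 1) * 1 = 579 tokens once the media
# token is expanded, so with pad_to_multiple_of=64 the sequence boundaries come out as:
#   seq_params.cu_seqlens_q        -> [0, 579, 582]
#   seq_params.cu_seqlens_q_padded -> [0, 640, 704]
print(seq_params.cu_seqlens_q.tolist(), seq_params.cu_seqlens_q_padded.tolist())

These are the THD-format ('qkv_format="thd"') parameters that neva_data_step moves to the GPU in the hunk above, and whose padded/truncated last entries the _preprocess_data changes keep consistent with the language model's maximum sequence length.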
@@ -858,6 +887,7 @@ def _preprocess_data( # Attention mask True/False meaning flipped in 1.7.0 attention_mask = attention_mask < 0.5 + if self.sequence_parallel_lm: final_embedding = tensor_parallel.scatter_to_sequence_parallel_region(final_embedding) return final_embedding, final_labels, final_loss_mask, attention_mask diff --git a/nemo/collections/vlm/recipes/llava15_13b.py b/nemo/collections/vlm/recipes/llava15_13b.py index d85ba6f2752b..40bc8cc44682 100644 --- a/nemo/collections/vlm/recipes/llava15_13b.py +++ b/nemo/collections/vlm/recipes/llava15_13b.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from megatron.core.distributed import DistributedDataParallelConfig diff --git a/nemo/collections/vlm/recipes/llava15_7b.py b/nemo/collections/vlm/recipes/llava15_7b.py index 2abb50db6c11..9de60e671e38 100644 --- a/nemo/collections/vlm/recipes/llava15_7b.py +++ b/nemo/collections/vlm/recipes/llava15_7b.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from megatron.core.distributed import DistributedDataParallelConfig diff --git a/nemo/collections/vlm/recipes/llava_next_7b.py b/nemo/collections/vlm/recipes/llava_next_7b.py index d23159125823..53609fe589c8 100644 --- a/nemo/collections/vlm/recipes/llava_next_7b.py +++ b/nemo/collections/vlm/recipes/llava_next_7b.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo import lightning as nl diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 1b1f5c790b61..e3c6c77f4cda 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -1711,7 +1711,10 @@ def masked_token_loss(tensor: Tensor, mask: Tensor): """ losses = tensor.float() loss_mask = mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() # sequence level nll + num_valid_tokens = loss_mask.sum() + if num_valid_tokens < 0.5: # no valid tokens + num_valid_tokens += 1.0 + loss = torch.sum(losses.view(-1) * loss_mask) / num_valid_tokens # sequence level nll return loss diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt index 92ae32659dac..585e277be72a 100644 --- a/requirements/requirements_multimodal.txt +++ b/requirements/requirements_multimodal.txt @@ -6,7 +6,7 @@ diffusers>=0.19.3 einops_exts imageio kornia -megatron-energon<3.0.0 +megatron-energon==4.0.0 nerfacc>=0.5.3 open_clip_torch==2.24.0 PyMCubes diff --git a/scripts/vlm/llava_next_finetune.py b/scripts/vlm/llava_next_finetune.py index 91df8a39452d..9d3e5053c0c1 100644 --- a/scripts/vlm/llava_next_finetune.py +++ b/scripts/vlm/llava_next_finetune.py @@ -25,8 +25,8 @@ import argparse import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl from nemo.collections import llm, vlm diff --git a/scripts/vlm/llava_next_pretrain.py b/scripts/vlm/llava_next_pretrain.py index 0beb9b5b08d0..19bdf47bb668 100644 --- a/scripts/vlm/llava_next_pretrain.py +++ b/scripts/vlm/llava_next_pretrain.py @@ -25,8 +25,8 @@ import argparse import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.optimizer import OptimizerConfig -from 
pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl from nemo.collections import llm, vlm diff --git a/scripts/vlm/mllama_finetune.py b/scripts/vlm/mllama_finetune.py index 15cd8078fd32..9e37d9c3fc0c 100644 --- a/scripts/vlm/mllama_finetune.py +++ b/scripts/vlm/mllama_finetune.py @@ -15,8 +15,8 @@ import argparse import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import WandbLogger from transformers import AutoProcessor from nemo import lightning as nl diff --git a/scripts/vlm/neva_finetune.py b/scripts/vlm/neva_finetune.py index 4069fb2d9278..3bf0084ea60d 100644 --- a/scripts/vlm/neva_finetune.py +++ b/scripts/vlm/neva_finetune.py @@ -21,11 +21,12 @@ import argparse import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl from nemo.collections import llm, vlm +from nemo.collections.multimodal.data.energon.task_encoder import MultiModalTaskEncoder from nemo.collections.vlm import ImageDataConfig from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.lightning.pytorch.optim import CosineAnnealingScheduler @@ -42,6 +43,33 @@ def main(args): max_steps = args.max_steps decoder_seq_length = 4096 + if args.use_packed_sequence: + decoder_seq_length = 8192 + + # Submodules configurations + language_transformer_config = llm.Llama2Config7B( + seq_length=decoder_seq_length, + ) + vision_transformer_config = vlm.HFCLIPVisionConfig( + pretrained_model_name_or_path="openai/clip-vit-large-patch14-336" + ) + vision_projection_config = vlm.MultimodalProjectorConfig( + projector_type=args.projector_type, + input_size=vision_transformer_config.hidden_size, + hidden_size=language_transformer_config.hidden_size, + ffn_hidden_size=language_transformer_config.hidden_size, + ) + + # NEVA model configuration + neva_config = vlm.NevaConfig( + language_transformer_config=language_transformer_config, + vision_transformer_config=vision_transformer_config, + vision_projection_config=vision_projection_config, + language_model_from_pretrained=args.language_model_path, + freeze_language_model=False, + freeze_vision_model=True, + ) + num_image_embeddings_per_tile = vision_transformer_config.num_image_embeddings_per_tile if args.data_type == "llava": # Data configuration @@ -60,7 +88,50 @@ def main(args): micro_batch_size=mbs, tokenizer=None, image_processor=None, - num_workers=8, + num_workers=4, + packed_sequence=args.use_packed_sequence, + num_image_embeddings_per_tile=num_image_embeddings_per_tile, + ) + elif args.data_type == "energon": + from transformers import AutoProcessor + + from nemo.collections.multimodal.data.energon import ( + EnergonMultiModalDataModule, + ImageToken, + LLaVATemplateConfig, + MultiModalSampleConfig, + ) + + processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + tokenizer = processor.tokenizer + image_processor = processor.image_processor + + # Configure multimodal samples + config = MultiModalSampleConfig( + image_token=ImageToken(token_str="", token_id=-200), + ignore_place_holder=-100, + conversation_template_config=LLaVATemplateConfig(), + ) + + # Initialize the data module + data = EnergonMultiModalDataModule( + path=args.data_path, + tokenizer=tokenizer, + image_processor=image_processor, + seq_length=decoder_seq_length, + micro_batch_size=mbs, + 
global_batch_size=gbs, + num_workers=0, + multimodal_sample_config=config, + task_encoder=MultiModalTaskEncoder( + tokenizer=tokenizer, + image_processor=image_processor, + multimodal_sample_config=config, + packed_sequence=args.use_packed_sequence, + packed_sequence_size=decoder_seq_length, + num_image_embeddings_per_tile=num_image_embeddings_per_tile, + ), + packing_buffer_size=200 if args.use_packed_sequence else None, ) elif args.data_type == "mock": data = vlm.NevaMockDataModule( @@ -70,36 +141,11 @@ def main(args): tokenizer=None, image_processor=None, num_workers=4, + packed_sequence=args.use_packed_sequence, ) else: raise ValueError(f"Data type {args.data_type} not supported") - # Submodules configurations - language_transformer_config = llm.Llama2Config7B( - seq_length=decoder_seq_length, - ) - vision_transformer_config = vlm.HFCLIPVisionConfig( - pretrained_model_name_or_path="openai/clip-vit-large-patch14-336" - ) - vision_projection_config = vlm.MultimodalProjectorConfig( - projector_type=args.projector_type, - input_size=vision_transformer_config.hidden_size, - hidden_size=language_transformer_config.hidden_size, - ffn_hidden_size=language_transformer_config.hidden_size, - ) - - # NEVA model configuration - neva_config = vlm.NevaConfig( - language_transformer_config=language_transformer_config, - vision_transformer_config=vision_transformer_config, - vision_projection_config=vision_projection_config, - language_model_from_pretrained=args.language_model_path, - freeze_language_model=False, - freeze_vision_model=True, - ) - - model = vlm.NevaModel(neva_config, tokenizer=data.tokenizer) - from megatron.core.distributed import DistributedDataParallelConfig # Training strategy setup @@ -118,6 +164,8 @@ def main(args): ), ) + model = vlm.NevaModel(neva_config, tokenizer=data.tokenizer) + # Checkpoint callback setup checkpoint_callback = nl.ModelCheckpoint( save_last=True, @@ -231,6 +279,9 @@ def main(args): parser.add_argument("--gbs", type=int, required=False, default=128, help="Global batch size") parser.add_argument("--mbs", type=int, required=False, default=2, help="Micro batch size") parser.add_argument("--lr", type=float, required=False, default=2.0e-06, help="Learning rate") - + parser.add_argument( + "--use_packed_sequence", + action="store_true", + ) args = parser.parse_args() main(args) diff --git a/tests/collections/multimodal/data/energon/test_data_module.py b/tests/collections/multimodal/data/energon/test_data_module.py index c499ecfe9ca4..dff153388f31 100644 --- a/tests/collections/multimodal/data/energon/test_data_module.py +++ b/tests/collections/multimodal/data/energon/test_data_module.py @@ -21,7 +21,7 @@ import numpy as np import webdataset as wds -from megatron.energon.flavors import BaseWebdataset +from megatron.energon.flavors import BaseWebdatasetFactory from PIL import Image from transformers import AutoProcessor @@ -159,7 +159,7 @@ def create_vqa_test_dataset(self, path: Path, num_samples: int): ) total_shards = shard_writer.shard - BaseWebdataset.prepare_dataset( + BaseWebdatasetFactory.prepare_dataset( path, [f"data-{{0..{total_shards-1}}}.tar"], split_parts_ratio=[("train", 1.0), ("val", 1.0)], diff --git a/tests/collections/vlm/mllama_train.py b/tests/collections/vlm/test_mllama_train.py similarity index 100% rename from tests/collections/vlm/mllama_train.py rename to tests/collections/vlm/test_mllama_train.py diff --git a/tests/collections/vlm/neva_train.py b/tests/collections/vlm/test_neva_train.py similarity index 95% rename from 
tests/collections/vlm/neva_train.py rename to tests/collections/vlm/test_neva_train.py index f1ddf961cb10..e12ce27702c2 100644 --- a/tests/collections/vlm/neva_train.py +++ b/tests/collections/vlm/test_neva_train.py @@ -37,6 +37,10 @@ def get_args(): parser.add_argument( '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to" ) + parser.add_argument( + "--use_packed_sequence", + action="store_true", + ) return parser.parse_args() @@ -49,6 +53,8 @@ def get_args(): mbs = 2 seq_length = 576 decoder_seq_length = 1024 + if args.use_packed_sequence: + decoder_seq_length = 2048 data = vlm.NevaMockDataModule( seq_length=decoder_seq_length, @@ -57,6 +63,7 @@ def get_args(): tokenizer=None, image_processor=None, num_workers=2, + packed_sequence=args.use_packed_sequence, ) # Transformer configurations From 2db5dbbc304742b04814f2e6058b1984113a55f6 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 15 Jan 2025 22:03:24 -0800 Subject: [PATCH 09/27] Temp change: Flaky test optional --- .github/workflows/cicd-main.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a815be7bdc2f..92cd1f8a6261 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -128,14 +128,15 @@ jobs: SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads - L0_Unit_Tests_GPU_LLM: + OPTIONAL_L0_Unit_Tests_GPU_LLM: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads + IS_OPTIONAL: true L0_Unit_Tests_GPU_Multimodal: needs: [cicd-test-container-setup] @@ -4966,7 +4967,7 @@ jobs: - L0_Unit_Tests_GPU_ASR - L0_Unit_Tests_GPU_Audio - L0_Unit_Tests_GPU_Common - - L0_Unit_Tests_GPU_LLM + #- OPTIONAL_L0_Unit_Tests_GPU_LLM - L0_Unit_Tests_GPU_Multimodal - L0_Unit_Tests_GPU_NLP - L0_Unit_Tests_GPU_TTS From fe2ae82fdb2db6c9bea52198641e2bc9d59a8768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 16 Jan 2025 16:31:16 +0100 Subject: [PATCH 10/27] Revert "Revert Mcore update since it caused regression (#11791)" (#11799) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Revert "Revert Mcore update since it caused regression (#11791)" This reverts commit 84b2bf0989a1cfde0258acb5804cd5bdcd357449. 
* Fix Gemma2 Attention init args (#11792) * Use _get_mlp_module_spec from Megatron Core rather than redefine locally (#11834) * Use _get_mlp_module_spec from MCore rather than redefine Signed-off-by: Jan Lasek * Apply isort and black reformatting Signed-off-by: janekl * Update nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py Co-authored-by: oliver könig Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Co-authored-by: janekl Co-authored-by: oliver könig * Bugfix for output_generation_logits in tensorrtllm (#11820) (#11833) Signed-off-by: Abhishree Signed-off-by: Jan Lasek Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Signed-off-by: Abhishree Co-authored-by: Ao Tang Co-authored-by: Jan Lasek Co-authored-by: janekl Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --- .github/workflows/cicd-main.yml | 5 +- .github/workflows/import-test.yml | 79 +++++++------------ Dockerfile.ci | 30 +++---- docs/source/nlp/information_retrieval.rst | 3 +- nemo/collections/diffusion/scripts/train.sh | 1 - nemo/collections/llm/gpt/model/gemma.py | 3 + nemo/collections/llm/recipes/gemma_2b.py | 2 - nemo/collections/llm/recipes/gemma_7b.py | 4 - .../megatron/gpt_layer_modelopt_spec.py | 39 +-------- .../language_modeling/megatron_base_model.py | 5 ++ .../language_modeling/megatron_retro_model.py | 3 + nemo/collections/vlm/mllama/model/language.py | 4 +- nemo/lightning/pytorch/callbacks/peft.py | 2 +- pyproject.toml | 3 - requirements/requirements_nlp.txt | 1 - .../convert_bert_hf_to_nemo.py | 3 +- .../bitexact/mixtral/pretrain_mini_mixtral.py | 2 + tests/collections/llm/bitexact/mixtral/run.sh | 4 +- .../llm/gpt/model/test_model_import.py | 5 ++ tests/collections/llm/hf/peft_nemorun.py | 1 - tests/collections/llm/hf/sft_nemorun.py | 1 - .../llm/megatron_mixtral_pretraining.py | 2 + tests/conftest.py | 14 ++++ tests/core/test_exp_manager.py | 4 +- tests/lightning/test_nemo_resume_from_ckpt.py | 10 +-- .../llama-3/nemo2-sft-peft/nemo2-peft.ipynb | 2 - .../llama-3/nemo2-sft-peft/nemo2-sft.ipynb | 2 - tutorials/llm/mamba/mamba.rst | 9 +-- 28 files changed, 97 insertions(+), 146 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 92cd1f8a6261..75b9e9e7befd 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -2938,7 +2938,7 @@ jobs: with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | - CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + CUDA_DEVICE_MAX_CONNECTIONS=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ trainer.devices=2 \ trainer.log_every_n_steps=1 \ trainer.max_epochs=9999 \ @@ -2966,6 +2966,7 @@ jobs: +model.tp_comm_overlap_ag=False \ +model.tp_comm_overlap_rs=False \ +model.tp_comm_overlap_disable_qkv=True \ + +model.attention_backend="unfused" \ model.peft.peft_scheme="lora" \ model.peft.lora_tuning.adapter_dim=16 \ model.peft.lora_tuning.alpha=32 \ @@ -4368,7 +4369,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \ + python3 tests/collections/llm/megatron_mixtral_pretraining.py \ --experiment-dir=/tmp/mixtral_pretrain_results \ 
--data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document diff --git a/.github/workflows/import-test.yml b/.github/workflows/import-test.yml index 3af15294b2a2..47d4657dfe4f 100644 --- a/.github/workflows/import-test.yml +++ b/.github/workflows/import-test.yml @@ -1,73 +1,52 @@ name: CI-Import-Check on: - push: pull_request: paths: - "**" # Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags jobs: - - test-asr-imports: - runs-on: ubuntu-latest - container: - image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime + test-imports: + name: test-${{ matrix.collection }}-import-${{ matrix.os }}-py${{ matrix.python }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + collection: + - asr + # - nlp # Currently broken + - tts + python: ['3.10', '3.11', '3.12'] steps: - name: Checkout repo uses: actions/checkout@v2 - - name: Update base dependencies - run: | - apt-get update && apt-get install -y build-essential - apt-get install -y libsndfile1 make - - name: Install nemo dependencies + - uses: actions/setup-python@v5 + with: + python-version: '${{ matrix.python }}' + - name: Build wheel id: nemo-wheel run: | - pip install Cython - # install test requirements - pip install -r requirements/requirements_test.txt # Build nemo as a wheel pip install build - python -m build --no-isolation --wheel + python -m build --wheel + # Preserve wheel location DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) - echo "::set-output name=DIST_FILE::${DIST_FILE}" - - name: Test ASR Domain Imports - run: | - # Install NeMo Domain - pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[asr]" - # Run import checks - python tests/core_ptl/check_imports.py --domain "asr" - # Uninstall NeMo - pip uninstall -y nemo_toolkit - test-tts-imports: - runs-on: ubuntu-latest - container: - image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime - steps: - - name: Checkout repo - uses: actions/checkout@v2 - - name: Update base dependencies + echo "DIST_FILE=${DIST_FILE}" | tee -a "$GITHUB_OUTPUT" + + - name: Install NeMo + test dependencies run: | - apt-get update && apt-get install -y build-essential - apt-get install -y libsndfile1 make - - name: Install nemo dependencies - id: nemo-wheel - run: | - pip install Cython # install test requirements pip install -r requirements/requirements_test.txt - # Build nemo as a wheel - pip install build - python -m build --no-isolation --wheel - # Preserve wheel location - DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) - echo "::set-output name=DIST_FILE::${DIST_FILE}" - - name: Test TTS Domain Imports - run: | + # Install NeMo Domain - pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[tts]" + pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[test,${{ matrix.collection }}]" + + - name: Run ${{ matrix.collection }} checks + run: | # Run import checks - python tests/core_ptl/check_imports.py --domain "tts" - # Uninstall NeMo - pip uninstall -y nemo_toolkit + python tests/core_ptl/check_imports.py --domain "${{ matrix.collection }}" + + \ No newline at end of file diff --git a/Dockerfile.ci b/Dockerfile.ci index 3d2f0c76b6ea..f7e637442158 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -34,17 +34,12 @@ EOF WORKDIR /workspace # Install Mamba Dependancy -ARG CAUSAL_CONV_TAG=v1.2.2.post1 +ARG CAUSAL_CONV_TAG=v1.2.2.post1 +ARG MAMBA_TAG=v2.2.0 RUN <<"EOF" bash -ex # Mamba dependancy installation - -git clone --depth 1 --branch ${CAUSAL_CONV_TAG} 
https://github.com/Dao-AILab/causal-conv1d && \ - cd causal-conv1d && \ - python setup.py install && \ - cd .. && \ - rm -rf causal-conv1d - +MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 install --no-cache-dir -v git+https://github.com/Dao-AILab/causal-conv1d.git@${CAUSAL_CONV_TAG} git+https://github.com/state-spaces/mamba.git@${MAMBA_TAG} EOF RUN pip install hatchling # needed to install nemo-run @@ -54,8 +49,6 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.21.0 -ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa - ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ @@ -65,7 +58,6 @@ RUN \ --mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \ "transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \ -"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \ "nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \ "apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \ "unstructured==0.14.9" \ @@ -73,15 +65,15 @@ pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.n "onnxscript @ git+https://github.com/microsoft/onnxscript" \ -r tools/ctc_segmentation/requirements.txt \ ".[all]" +EOF -# Megatron Core installation -git clone https://github.com/NVIDIA/Megatron-LM.git && \ -pushd Megatron-LM && \ -git checkout ${MCORE_TAG} && \ - pushd megatron/core/datasets && \ - make && \ - popd && \ -popd +ARG MCORE_TAG=4dc8977167d71f86bdec47a60a98e85c4cfa0031 +RUN <<"EOF" bash -ex +# Megatron-LM installation +git clone https://github.com/NVIDIA/Megatron-LM.git +pushd Megatron-LM +git checkout ${MCORE_TAG} +pip install -e . 
export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" # Install nvidia-resiliency-ext diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst index 26732283e8f4..69f1c3219093 100644 --- a/docs/source/nlp/information_retrieval.rst +++ b/docs/source/nlp/information_retrieval.rst @@ -70,9 +70,7 @@ Then you can fine-tune the sentence-BERT model using the following script: VALIDATION_DATASET_PATH= # Path to validation dataset SAVE_DIR= # where the checkpoint and logs are saved mkdir -p $SAVE_DIR - export NVTE_FLASH_ATTN=0 export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 - export NVTE_FUSED_ATTN=0 python NeMo/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ --config-path=${CONFIG_PATH} \ @@ -87,6 +85,7 @@ Then you can fine-tune the sentence-BERT model using the following script: model.post_process=False \ model.global_batch_size=8 \ # should be NUM_DEVICES * model.micro_batch_size model.micro_batch_size=8 \ + model.attention_backend="unfused" \ model.optim.lr=0.000005 \ model.optim.sched.min_lr=0.00000001 \ model.optim.sched.warmup_steps=100 \ diff --git a/nemo/collections/diffusion/scripts/train.sh b/nemo/collections/diffusion/scripts/train.sh index 2150458e9376..ced479e32526 100644 --- a/nemo/collections/diffusion/scripts/train.sh +++ b/nemo/collections/diffusion/scripts/train.sh @@ -20,7 +20,6 @@ export WANDB_PROJECT=xxx export WANDB_RUN_ID=xxx export WANDB_RESUME=allow -export NVTE_FUSED_ATTN=0 export CUDA_DEVICE_MAX_CONNECTIONS=1 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index bf828bb66277..4d8d541deaa8 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -18,6 +18,7 @@ import torch from megatron.core import parallel_state +from megatron.core.transformer.enums import AttnBackend from torch import nn from nemo.collections.llm.fn.activation import openai_gelu @@ -53,6 +54,8 @@ class GemmaConfig(GPTConfig): # Legacy NeMo does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script # The present implementation is more in line with the official implementation layernorm_zero_centered_gamma: bool = True + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + attention_backend: AttnBackend = AttnBackend.flash @dataclass diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py index 3b43bbdb0e62..64af8192929c 100644 --- a/nemo/collections/llm/recipes/gemma_2b.py +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -51,8 +51,6 @@ def model() -> run.Config[pl.LightningModule]: >>> model_config = model() >>> print(model_config) """ - # Disable cuDNN attention since TE 1.8 does not support head dim > 128 - os.environ['NVTE_FUSED_ATTN'] = "0" return run.Config(GemmaModel, config=run.Config(GemmaConfig2B)) diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py index 40e43bda4d5e..2ac3419d6587 100644 --- a/nemo/collections/llm/recipes/gemma_7b.py +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -51,8 +51,6 @@ def model() -> run.Config[pl.LightningModule]: >>> model_config = model() >>> print(model_config) """ - # Disable cuDNN attention since TE 1.8 does not support head dim > 128 - os.environ['NVTE_FUSED_ATTN'] = "0" return run.Config(GemmaModel, config=run.Config(GemmaConfig7B)) @@ -173,8 +171,6 @@ def pretrain_recipe( For more details on pre-training 
LLMs with NeMo, see the pre-training guide in the `examples/llm/pretrain/` directory. """ - # Disable cuDNN attention since TE 1.8 does not support head dim > 128 - os.environ['NVTE_FUSED_ATTN'] = "0" return run.Partial( fn, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py index 514ef62a9ff3..aa68273a414a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py @@ -17,13 +17,11 @@ try: from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add + from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp - from megatron.core.transformer.mlp import MLP, MLPSubmodules - from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules - from megatron.core.transformer.moe.shared_experts import SharedExpertMLP from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules @@ -57,7 +55,8 @@ def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec if not HAVE_MEGATRON_CORE: raise IMPORT_ERROR - mlp = _get_mlp_module_spec(num_experts=num_experts) + mlp = _get_mlp_module_spec(use_te=False, num_experts=num_experts, moe_grouped_gemm=False) + return ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( @@ -84,35 +83,3 @@ def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec }, ), ) - - -# Helper function to get module spec for MLP/MoE -def _get_mlp_module_spec(num_experts: Optional[int] = None) -> ModuleSpec: - if num_experts is None: - # Dense MLP w/ or w/o TE modules. - return ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=ColumnParallelLinear, - linear_fc2=RowParallelLinear, - ), - ) - else: - # Mixture of experts with modules in megatron core. 
- return ModuleSpec( - module=MoELayer, - submodules=MoESubmodules( - experts=MLPSubmodules( - linear_fc1=ColumnParallelLinear, - linear_fc2=RowParallelLinear, - ), - shared_experts=ModuleSpec( - module=SharedExpertMLP, - params={"gate": False}, - submodules=MLPSubmodules( - linear_fc1=ColumnParallelLinear, - linear_fc2=RowParallelLinear, - ), - ), - ), - ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index cf13a0318ffc..122c86614311 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -50,6 +50,7 @@ try: from megatron.core import ModelParallelConfig, parallel_state from megatron.core.distributed import DistributedDataParallel as McoreDDP + from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal @@ -538,6 +539,9 @@ def build_transformer_config(self) -> TransformerConfig: tp_only_amax_red = self.cfg.get('tp_only_amax_red', False) + attention_backend = self.cfg.get('attention_backend', "auto") + attention_backend = AttnBackend[attention_backend] + # any configs that are not in the nemo model config will be added here config_mapping = { 'apply_query_key_layer_scaling': apply_query_key_layer_scaling, @@ -562,6 +566,7 @@ def build_transformer_config(self) -> TransformerConfig: 'rotary_interleaved': rotary_interleaved, 'deallocate_pipeline_outputs': True, 'tp_only_amax_red': tp_only_amax_red, + 'attention_backend': attention_backend, } # populate the transformer config dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py index 493d512fd30e..b3fd7b11c6eb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py @@ -76,6 +76,7 @@ from megatron.core.models.retro.utils import get_config_path as get_retro_config_path from megatron.core.models.retro.utils import get_gpt_data_dir as get_retro_data_dir from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal @@ -431,6 +432,8 @@ def build_retro_config(self) -> RetroConfig: te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.3"): + if HAVE_MEGATRON_CORE: + retro_config.attention_backend = AttnBackend.unfused try: os.environ["NVTE_FLASH_ATTN"] = "0" os.environ["NVTE_FUSED_ATTN"] = "0" diff --git a/nemo/collections/vlm/mllama/model/language.py b/nemo/collections/vlm/mllama/model/language.py index bec3ec526f6e..3edc6706defb 100644 --- a/nemo/collections/vlm/mllama/model/language.py +++ b/nemo/collections/vlm/mllama/model/language.py @@ -390,7 +390,7 @@ def sharded_state_dict( layer_prefix = f'{prefix}layers.' 
num_layers = self.config.num_layers for layer in self.layers: - offset = layer._get_layer_offset() + offset = layer._get_layer_offset(layer.config) global_layer_offset = layer.layer_number - 1 # self.layer_number starts at 1 state_dict_prefix = f'{layer_prefix}{global_layer_offset - offset}.' # module list index in TransformerBlock # pylint: disable=line-too-long sharded_prefix = layer_prefix @@ -403,7 +403,7 @@ def sharded_state_dict( for xlayer in self.xattn_layers: if isinstance(xlayer, DummyCrossAttentionTransformerLayer): continue - offset = xlayer._get_layer_offset() + offset = xlayer._get_layer_offset(xlayer.config) global_layer_offset = xlayer.layer_number - 1 state_dict_prefix = f'{xlayer_prefix}{global_layer_offset - offset}.' # module list index in TransformerBlock # pylint: disable=line-too-long sharded_prefix = f'{xlayer_prefix}{global_layer_offset}.' diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index a71d6792d457..399b4e9e5293 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -480,7 +480,7 @@ def load_checkpoint( if getattr(path, "base_model_path", None): ## PEFT Resume, FIRST TIME self.adapter_ckpt_path = Path(str(path)) - adapter_ckpt = self.checkpoint_io.load_checkpoint(path) # Loads only metadata + adapter_ckpt = self.checkpoint_io.load_checkpoint(path, sharded_state_dict={}) # Loads only metadata # path is adapter path to restore the training metadata, but switch to loading base model here. path = self.model_ckpt_path = path.base_model_path elif adapter_meta_path.exists(): diff --git a/pyproject.toml b/pyproject.toml index bdddfef27dc6..af5555f9d0dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,9 +60,6 @@ classifiers = [ "Topic :: Utilities", ] -[tool.setuptools.dynamic] -dependencies = { file = ["requirements/requirements.txt"] } - [tool.setuptools] py-modules = ["nemo"] diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index d35b649a46ba..6a0ae8adf66c 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -8,7 +8,6 @@ gdown h5py ijson jieba -mamba-ssm==2.2.2; sys_platform == 'linux' markdown2 matplotlib>=3.3.2 #megatron_core>0.6.0 # add back once mcore on pypi is compatible again diff --git a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py index 14baca53f165..8265da57f656 100644 --- a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py @@ -84,6 +84,8 @@ def convert(args): nemo_config.model = adjust_nemo_config(nemo_config.model, hf_model.config.to_dict(), mcore_bert=args.mcore) nemo_config.trainer["precision"] = args.precision + # Bert doesn't support FLASH_ATTN + nemo_config.model["attention_backend"] = "fused" trainer = MegatronTrainerBuilder(nemo_config).create_trainer() model = MegatronBertModel(nemo_config.model, trainer) @@ -288,6 +290,5 @@ def convert(args): if __name__ == '__main__': - os.environ['NVTE_FLASH_ATTN'] = '0' # Bert doesn't support FLASH_ATTN args = get_args() convert(args) diff --git a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py index b4f95879bad5..654a2a9e05a8 100644 --- a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py +++ b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py @@ -17,6 +17,7 @@ import torch from 
megatron.core.distributed import DistributedDataParallelConfig as McoreDDPConfig +from megatron.core.transformer.enums import AttnBackend from megatron.core.utils import init_method_normal, scaled_init_method_normal from nemo.collections.llm import MixtralConfig8x7B, MixtralModel, PreTrainingDataModule @@ -102,6 +103,7 @@ def main(args): bias_dropout_fusion=True, apply_rope_fusion=True, distribute_saved_activations=False, + attention_backend=AttnBackend.unfused, ) data = PreTrainingDataModule( diff --git a/tests/collections/llm/bitexact/mixtral/run.sh b/tests/collections/llm/bitexact/mixtral/run.sh index 87bf7c382b99..0f6612b3d21b 100644 --- a/tests/collections/llm/bitexact/mixtral/run.sh +++ b/tests/collections/llm/bitexact/mixtral/run.sh @@ -8,7 +8,7 @@ MCORE_OUTPUT_PATH="/tmp/bex_mixtral_mcore_output/" NEMO_OUTPUT_PATH="/tmp/bex_mixtral_nemo_output/" # Run Mcore -CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 \ +CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 \ torchrun --nproc-per-node 1 --nnodes 1 /workspace/Megatron-LM/pretrain_gpt.py \ --apply-layernorm-1p --rotary-percent 1.0 --rotary-base 1000000 \ --no-position-embedding --position-embedding-type rope \ @@ -30,7 +30,7 @@ torchrun --nproc-per-node 1 --nnodes 1 /workspace/Megatron-LM/pretrain_gpt.py \ --split 99,1,0 --log-interval 10 --save-interval 20000 --eval-interval 1000 --eval-iters 32 \ --save "$MCORE_OUTPUT_PATH" \ --log-num-zeros-in-grad --distributed-timeout-minutes 6000 --moe-router-topk 1 --num-experts 2 \ - --moe-router-pre-softmax --expert-model-parallel-size 1 --eval-iters=0 + --moe-router-pre-softmax --expert-model-parallel-size 1 --eval-iters=0 --attention-backend unfused # Run NeMo CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 \ diff --git a/tests/collections/llm/gpt/model/test_model_import.py b/tests/collections/llm/gpt/model/test_model_import.py index 9edc235e454f..b49885718837 100644 --- a/tests/collections/llm/gpt/model/test_model_import.py +++ b/tests/collections/llm/gpt/model/test_model_import.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os + import torch torch.set_grad_enabled(False) @@ -95,5 +97,8 @@ def import_from_hf(config_name, hf_path): if __name__ == '__main__': for config_name, hf_id in config_name_to_hf_id.items(): + for env_var in ['NVTE_FLASH_ATTN', 'NVTE_FUSED_ATTN', 'NVTE_UNFUSED_ATTN']: + if env_var in os.environ: + del os.environ[env_var] src = f'hf:///home/TestData/nemo2_ckpt/{config_name}' import_from_hf(config_name, src) diff --git a/tests/collections/llm/hf/peft_nemorun.py b/tests/collections/llm/hf/peft_nemorun.py index ef34d4d39a11..3a135b2346be 100644 --- a/tests/collections/llm/hf/peft_nemorun.py +++ b/tests/collections/llm/hf/peft_nemorun.py @@ -28,7 +28,6 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecut "NCCL_NVLS_ENABLE": "0", "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/sft_nemorun.py b/tests/collections/llm/hf/sft_nemorun.py index a3daa66ca774..b559c04f6cbd 100644 --- a/tests/collections/llm/hf/sft_nemorun.py +++ b/tests/collections/llm/hf/sft_nemorun.py @@ -29,7 +29,6 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecut "NCCL_NVLS_ENABLE": "0", "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/megatron_mixtral_pretraining.py b/tests/collections/llm/megatron_mixtral_pretraining.py index 4123c7b37987..2a7b1fdfdad6 100644 --- a/tests/collections/llm/megatron_mixtral_pretraining.py +++ b/tests/collections/llm/megatron_mixtral_pretraining.py @@ -18,6 +18,7 @@ import torch from megatron.core.distributed import DistributedDataParallelConfig as McoreDDPConfig +from megatron.core.transformer.enums import AttnBackend from nemo.collections.llm import MixtralConfig8x3B, MixtralModel, PreTrainingDataModule from nemo.collections.llm.api import train @@ -117,6 +118,7 @@ def main(args): bf16=True, params_dtype=torch.bfloat16, pipeline_dtype=torch.bfloat16, + attention_backend=AttnBackend.unfused, ) mixtral_config.overlap_param_gather_with_optimizer_step = True diff --git a/tests/conftest.py b/tests/conftest.py index 118e978e63c7..989c937ab499 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging +import os import os.path import shutil import tarfile @@ -122,6 +123,19 @@ def reset_singletons(): Singleton._Singleton__instances = {} +@pytest.fixture(autouse=True) +def reset_env_vars(): + # Store the original environment variables before the test + original_env = dict(os.environ) + + # Run the test + yield + + # After the test, restore the original environment + os.environ.clear() + os.environ.update(original_env) + + @pytest.fixture(scope="session") def test_data_dir(): """ diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index 32d401b2051f..9dbdaa66a25e 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -280,7 +280,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path): assert Path(tmp_path).exists() assert Path(tmp_path / "test_no_name" / "default" / "957").exists() - monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION) + monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION, raising=False) # Checks that use_datetime_version False toggle works test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) log_dir = exp_manager(test_trainer, {"exp_dir": str(tmp_path / "test_no_name"), "use_datetime_version": False}) @@ -288,7 +288,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path): assert Path(tmp_path).exists() assert Path(tmp_path / "test_no_name" / "default" / "version_0").exists() - monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION) + monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION, raising=False) # Checks that use_datetime_version False toggle works and version increments test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) log_dir = exp_manager(test_trainer, {"exp_dir": str(tmp_path / "test_no_name"), "use_datetime_version": False}) diff --git a/tests/lightning/test_nemo_resume_from_ckpt.py b/tests/lightning/test_nemo_resume_from_ckpt.py index e876e6965000..37ea326ad621 100644 --- a/tests/lightning/test_nemo_resume_from_ckpt.py +++ b/tests/lightning/test_nemo_resume_from_ckpt.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os +from typing import List, Optional import pytest def set_env(): - os.environ['NVTE_FLASH_ATTN'] = '0' - os.environ['NVTE_FUSED_ATTN'] = '0' os.environ['NVTE_APPLY_QK_LAYER_SCALING'] = '0' @@ -28,6 +27,7 @@ def set_env(): import pytest import torch from megatron.core.optimizer import OptimizerConfig +from megatron.core.transformer.enums import AttnBackend import nemo.lightning as nl from nemo.collections import llm @@ -68,7 +68,8 @@ def load_dcp(ckpt_dir, torch_tensor=True): return state_dict -def compare_ckpts(a, b, path=[]): +def compare_ckpts(a, b, path: Optional[List[str]] = None): + path = path if path is not None else [] if isinstance(a, dict): assert isinstance(b, dict) assert set(a.keys()) == set(b.keys()) @@ -125,6 +126,7 @@ def setup_model_optim(log_dir, n_steps, tokenizer, gbs=2, mbs=1): make_vocab_size_divisible_by=128, normalization='RMSNorm', masked_softmax_fusion=False, + attention_backend=AttnBackend.local, ) model = llm.GPTModel(gpt_config, tokenizer=tokenizer) @@ -269,8 +271,6 @@ def train(n_steps, resume): trainer._teardown() set_env() - assert os.environ['NVTE_FLASH_ATTN'] == '0' - assert os.environ['NVTE_FUSED_ATTN'] == '0' assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '0' # Train for 40 steps diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb index 730ffd9ff972..c983b277e72a 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb @@ -341,7 +341,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", @@ -457,7 +456,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb index e84ff916fc4e..0bb4367d50e9 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb @@ -482,7 +482,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", @@ -565,7 +564,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", diff --git a/tutorials/llm/mamba/mamba.rst b/tutorials/llm/mamba/mamba.rst index 197825c27d58..7f5e901659a4 100644 --- a/tutorials/llm/mamba/mamba.rst +++ b/tutorials/llm/mamba/mamba.rst @@ -103,9 +103,6 @@ Run Fine-Tuning CONFIG_NAME="megatron_mamba_finetuning_config" SAVE_DIR= - export NVTE_FUSED_ATTN=1 - export NVTE_FLASH_ATTN=0 - torchrun --nproc_per_node=${NUM_DEVICES} \ /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py \ --config-path=${CONFIG_PATH} \ @@ -129,6 +126,7 @@ Run Fine-Tuning 
model.peft.peft_scheme='none' \ model.megatron_amp_O2=True \ model.encoder_seq_length=${SEQ_LEN} \ + model.attention_backend='fused' \ model.data.validation_ds.pad_to_max_length=True \ model.data.train_ds.pad_to_max_length=True \ model.optim.name="distributed_fused_adam" \ @@ -162,10 +160,6 @@ Evaluating the Fine-Tuned Model CONFIG_NAME="megatron_mamba_finetuning_config" SAVE_DIR= - export NVTE_FUSED_ATTN=1 - export NVTE_FLASH_ATTN=0 - - CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/tuning/conf/" CONFIG_NAME="megatron_mamba_generate_config" @@ -185,6 +179,7 @@ Evaluating the Fine-Tuned Model exp_manager.exp_dir=${SAVE_DIR} \ exp_manager.resume_if_exists=False \ exp_manager.create_wandb_logger=False \ + model.attention_backend='fused' \ model.megatron_amp_O2=True \ model.peft.restore_from_path=False \ +model.peft.restore_from_ckpt.checkpoint_dir=False \ From a6116fa05caf689c20c282f42218295ae261dfb4 Mon Sep 17 00:00:00 2001 From: "L.B." Date: Thu, 16 Jan 2025 13:26:49 -0500 Subject: [PATCH 11/27] Latest News updated for Cosmos (#11806) * Latest News updated for Cosmos Signed-off-by: Lawrence Lane * Moved Gen AI Models news to LLM section Signed-off-by: Lawrence Lane * Cleanup of news items Signed-off-by: Lawrence Lane * Added getting started section for Cosmos Signed-off-by: Lawrence Lane * Moved getting started section for Cosmos Signed-off-by: Lawrence Lane * remove unneeded section Signed-off-by: Lawrence Lane * remove unneeded section Signed-off-by: Lawrence Lane * added updated get started with cosmos Signed-off-by: Lawrence Lane --------- Signed-off-by: Lawrence Lane Co-authored-by: Pablo Garay --- README.md | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 195b8293babd..5c33d1da735f 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,39 @@
NeMo 2.0 We've released NeMo 2.0, an update on the NeMo Framework which prioritizes modularity and ease-of-use. Please refer to the NeMo Framework User Guide to get started. +
+
+ New Cosmos World Foundation Models Support +
+ Advancing Physical AI with NVIDIA Cosmos World Foundation Model Platform (2025-01-09) + + The end-to-end NVIDIA Cosmos platform accelerates world model development for physical AI systems. Built on CUDA, Cosmos combines state-of-the-art world foundation models, video tokenizers, and AI-accelerated data processing pipelines. Developers can accelerate world model development by fine-tuning Cosmos world foundation models or building new ones from the ground up. These models create realistic synthetic videos of environments and interactions, providing a scalable foundation for training complex systems, from simulating humanoid robots performing advanced actions to developing end-to-end autonomous driving models. +

-
- +
+ + + Accelerate Custom Video Foundation Model Pipelines with New NVIDIA NeMo Framework Capabilities + (2025-01-07) + + The NeMo Framework now supports training and customizing the NVIDIA Cosmos collection of world foundation models. Cosmos leverages advanced text-to-world generation techniques to create fluid, coherent video content from natural language prompts. +

+ You can also now accelerate your video processing step using the NeMo Curator library, which provides optimized video processing and captioning features that can deliver up to 89x faster video processing when compared to an unoptimized CPU pipeline. +

+
+
Large Language Models and Multimodal Models -
+
+ + + State-of-the-Art Multimodal Generative AI Model Development with NVIDIA NeMo + (2024-11-06) + + NVIDIA recently announced significant enhancements to the NeMo platform, focusing on multimodal generative AI models. The update includes NeMo Curator and the Cosmos tokenizer, which streamline the data curation process and enhance the quality of visual data. These tools are designed to handle large-scale data efficiently, making it easier to develop high-quality AI models for various applications, including robotics and autonomous driving. The Cosmos tokenizers, in particular, efficiently map visual data into compact, semantic tokens, which is crucial for training large-scale generative models. The tokenizer is available now on the NVIDIA/cosmos-tokenizer GitHub repo and on Hugging Face. +

+
+
New Llama 3.1 Support @@ -81,7 +108,6 @@

-
Speech Recognition
@@ -163,6 +189,10 @@ Overall, these enhancements make NeMo 2.0 a powerful, scalable, and user-friendl - For an in-depth exploration of the main features of NeMo 2.0, see the [Feature Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/features/index.html#feature-guide). - To transition from NeMo 1.0 to 2.0, see the [Migration Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/migration/index.html#migration-guide) for step-by-step instructions. +### Get Started with Cosmos + +NeMo Curator and NeMo Framework support video curation and post-training of the Cosmos World Foundation Models, which are open and available on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/cosmos/collections/cosmos) and [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6). For more information on video datasets, refer to [NeMo Curator](https://developer.nvidia.com/nemo-curator). To post-train World Foundation Models using the NeMo Framework for your custom physical AI tasks, see the [Cosmos Diffusion models](https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/diffusion/nemo/post_training/README.md) and the [Cosmos Autoregressive models](https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/autoregressive/nemo/post_training/README.md). + ## LLMs and MMs Training, Alignment, and Customization All NeMo models are trained with From 4c5f0510fc5ae5a384d67749abbd3f57db317a96 Mon Sep 17 00:00:00 2001 From: "Peter St. John" Date: Thu, 16 Jan 2025 12:00:32 -0700 Subject: [PATCH 12/27] Removes tensorstore 0.1.45 pin from requirements_deploy.txt (#11858) Signed-off-by: Peter St. John --- requirements/requirements_deploy.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_deploy.txt b/requirements/requirements_deploy.txt index 5380398c278b..a65b651a76c8 100644 --- a/requirements/requirements_deploy.txt +++ b/requirements/requirements_deploy.txt @@ -1,6 +1,6 @@ fastapi nvidia-pytriton pydantic-settings -tensorstore==0.1.45 +tensorstore uvicorn zarr From 7167e5e8176c2651114546e088e8fc78e2888213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 17 Jan 2025 15:17:45 +0100 Subject: [PATCH 13/27] ci: Prune dangling images (#11885) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/_test_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 911fcc17e636..87bf71ff5c4e 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -47,7 +47,7 @@ jobs: steps: - name: Docker system cleanup run: | - docker system prune -a --filter "until=24h" --force || true + docker system prune -af --filter "until=24h" --force || true - name: Docker pull image run: | From 8786345739f4aa7dceeb881c20d7362dc4602d75 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 17 Jan 2025 08:58:13 -0800 Subject: [PATCH 14/27] Disable tests that download datasets from web (#11878) * disable tests that download datasets from web Signed-off-by: Alexandros Koumparoulis * re-enable llm tests Signed-off-by: Alexandros Koumparoulis * Update cicd-main.yml Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Alexandros Koumparoulis 
<153118171+akoumpa@users.noreply.github.com> --- .github/workflows/cicd-main.yml | 7 +++---- tests/collections/llm/test_mnist_model_nemo2.py | 1 + tests/collections/llm/test_mnist_model_nemo2_fsdp.py | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 75b9e9e7befd..3f7c5e8c3933 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -128,15 +128,14 @@ jobs: SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads - OPTIONAL_L0_Unit_Tests_GPU_LLM: + L0_Unit_Tests_GPU_LLM: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true L0_Unit_Tests_GPU_Multimodal: needs: [cicd-test-container-setup] @@ -4968,7 +4967,7 @@ jobs: - L0_Unit_Tests_GPU_ASR - L0_Unit_Tests_GPU_Audio - L0_Unit_Tests_GPU_Common - #- OPTIONAL_L0_Unit_Tests_GPU_LLM + - L0_Unit_Tests_GPU_LLM - L0_Unit_Tests_GPU_Multimodal - L0_Unit_Tests_GPU_NLP - L0_Unit_Tests_GPU_TTS diff --git a/tests/collections/llm/test_mnist_model_nemo2.py b/tests/collections/llm/test_mnist_model_nemo2.py index 92cffc2a35bb..06afcd0fc0e0 100644 --- a/tests/collections/llm/test_mnist_model_nemo2.py +++ b/tests/collections/llm/test_mnist_model_nemo2.py @@ -480,6 +480,7 @@ def reset_megatron_parallel_state() -> Iterator[None]: @pytest.mark.run_only_on("GPU") @pytest.mark.integration +@pytest.mark.pleasefixme def test_train_mnist_litautoencoder_with_megatron_strategy_single_gpu(): path = os.path.abspath(__file__) call = f"python {path}" diff --git a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py index 9418ee7e5e90..68f2dc726306 100644 --- a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py +++ b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py @@ -502,6 +502,7 @@ def reset_megatron_parallel_state() -> Iterator[None]: @pytest.mark.run_only_on("GPU") @pytest.mark.integration +@pytest.mark.pleasefixme def test_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu(): path = os.path.abspath(__file__) call = f"python {path}" From 0cd990d97f01ab946fc53108ed43c7be7140a0d8 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Fri, 17 Jan 2025 20:04:06 +0200 Subject: [PATCH 15/27] fix checkpoint load issue (#11859) * fix checkpoint load issue Signed-off-by: Dmytro Pykhtar * Apply isort and black reformatting Signed-off-by: dimapihtar * set weights_only to False Signed-off-by: dimapihtar --------- Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Signed-off-by: dimapihtar Co-authored-by: dimapihtar --- nemo/collections/llm/gpt/model/ssm.py | 2 +- .../text_to_image/controlnet/controlnet.py | 2 +- .../text_to_image/imagen/imagen_pipeline.py | 2 +- .../instruct_pix2pix/ldm/ddpm_edit.py | 17 +++++++++++++---- .../stable_diffusion/ldm/autoencoder.py | 6 +++--- .../text_to_image/stable_diffusion/ldm/ddpm.py | 4 ++-- .../diffusionmodules/openaimodel.py | 2 +- .../speech_llm/models/modular_models.py | 
4 ++-- .../speech_llm/parts/mixins/adapter_mixin.py | 2 +- .../parts/mixins/multimodal_adapter_mixins.py | 2 +- .../nlp/parts/mixins/nlp_adapter_mixins.py | 2 +- 11 files changed, 27 insertions(+), 18 deletions(-) diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py index f4190114042e..09681648cb73 100644 --- a/nemo/collections/llm/gpt/model/ssm.py +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -120,7 +120,7 @@ def init(self) -> GPTModel: def apply(self, output_path: Path) -> Path: - source = torch.load(str(self), map_location='cpu') + source = torch.load(str(self), map_location='cpu', weights_only=False) if 'model' in source: source = source['model'] diff --git a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py index 981600fcc3a1..0e48305c4b1f 100644 --- a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py +++ b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py @@ -547,7 +547,7 @@ def load_from_unet(self, from_pretrained_unet, from_NeMo=True): else: print("Loading unet blocks from sd") - state_dict = torch.load(from_pretrained_unet, map_location='cpu') + state_dict = torch.load(from_pretrained_unet, map_location='cpu', weights_only=False) if 'state_dict' in state_dict.keys(): state_dict = state_dict['state_dict'] model_state_dict = self.state_dict() diff --git a/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py b/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py index 63963321fcf7..f6ae5829c907 100644 --- a/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py +++ b/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py @@ -83,7 +83,7 @@ def _load_model(model_ckpt: str, model_cfg: str, eval_mode: bool = True, trainer model_cfg.model.micro_batch_size = 1 model_cfg.model.global_batch_size = 1 model = MegatronImagen(cfg=model_cfg.model, trainer=trainer) - checkpoint = torch.load(model_ckpt, map_location=lambda storage, loc: storage) + checkpoint = torch.load(model_ckpt, map_location=lambda storage, loc: storage, weights_only=False) # Change weight keys if training using TorchInductor state_dict = checkpoint['state_dict'] diff --git a/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py b/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py index 9bb490fb8fc8..3ced0cbad87a 100644 --- a/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py +++ b/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py @@ -41,9 +41,15 @@ class LatentDiffusionEdit(LatentDiffusion): def init_from_ckpt( - self, path, ignore_keys=list(), only_model=False, load_vae=True, load_unet=True, load_encoder=True, + self, + path, + ignore_keys=list(), + only_model=False, + load_vae=True, + load_unet=True, + load_encoder=True, ): - pl_sd = torch.load(path, map_location="cpu") + pl_sd = torch.load(path, map_location="cpu", weights_only=False) if "state_dict" in list(pl_sd.keys()): pl_sd = pl_sd["state_dict"] sd = {} @@ -144,7 +150,7 @@ def model_provider_func(self, pre_process=True, post_process=True): return model def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. + """PTL hook that is executed after DDP spawns. We setup datasets here as megatron datasets require DDP to instantiate. 
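The change repeated throughout this patch is the explicit weights_only=False argument to torch.load. A minimal sketch of the loading pattern follows, assuming a trusted checkpoint that pickles config objects alongside tensors; the helper name is invented for illustration:

    import torch

    def load_full_checkpoint(path: str, map_location: str = "cpu") -> dict:
        # Recent PyTorch releases default to weights_only=True, which refuses to
        # unpickle arbitrary Python objects (hyperparameters, config entries).
        # weights_only=False keeps the legacy behaviour for trusted files.
        ckpt = torch.load(path, map_location=map_location, weights_only=False)
        # Many checkpoints nest the actual tensors under a 'state_dict' key.
        return ckpt.get("state_dict", ckpt)
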
See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. Args: @@ -260,5 +266,8 @@ def build_pretraining_data_loader(self, dataset, consumed_samples, drop_last=Tru # Torch dataloader. return torch.utils.data.DataLoader( - dataset, batch_sampler=batch_sampler, num_workers=self._cfg.data.num_workers, pin_memory=True, + dataset, + batch_sampler=batch_sampler, + num_workers=self._cfg.data.num_workers, + pin_memory=True, ) diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py index 311ebc0f06f5..50ca205d9acd 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py @@ -88,7 +88,7 @@ def ema_scope(self, context=None): print(f"{context}: Restored training weights") def init_from_ckpt(self, path, ignore_keys=list()): - sd = torch.load(path, map_location="cpu")["state_dict"] + sd = torch.load(path, map_location="cpu", weights_only=False)["state_dict"] keys = list(sd.keys()) for k in keys: for ik in ignore_keys: @@ -345,7 +345,7 @@ def __init__( state_dict = load_safetensors(from_pretrained) else: - state_dict = torch.load(from_pretrained) + state_dict = torch.load(from_pretrained, weights_only=False) if 'state_dict' in state_dict: state_dict = state_dict['state_dict'] missing_key, unexpected_key, _, _ = self._load_pretrained_model(state_dict, from_NeMo=from_NeMo) @@ -476,7 +476,7 @@ def load(module: torch.nn.Module, prefix=""): return error_msgs def init_from_ckpt(self, path, ignore_keys=list()): - sd = torch.load(path, map_location="cpu")["state_dict"] + sd = torch.load(path, map_location="cpu", weights_only=False)["state_dict"] keys = list(sd.keys()) for k in keys: for ik in ignore_keys: diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 163b2fb27e0f..80184baa53fa 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -246,7 +246,7 @@ def init_from_ckpt( load_unet=True, load_encoder=True, ): - pl_sd = torch.load(path, map_location="cpu") + pl_sd = torch.load(path, map_location="cpu", weights_only=False) if "state_dict" in list(pl_sd.keys()): pl_sd = pl_sd["state_dict"] @@ -2340,7 +2340,7 @@ def _modify_state_dict(state_dict): if filepath.endswith('.nemo'): conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location) elif filepath.endswith('.ckpt'): - state_dict = torch.load(filepath, map_location)['state_dict'] + state_dict = torch.load(filepath, map_location, weights_only=False)['state_dict'] else: raise RuntimeError(f"{filepath} is not nemo file or ckpt file") if not peft_cfgs: diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index 528048b04950..b6f57b259af3 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -959,7 +959,7 @@ def __init__( state_dict = load_safetensors(from_pretrained) else: - state_dict = torch.load(from_pretrained, 
map_location='cpu') + state_dict = torch.load(from_pretrained, map_location='cpu', weights_only=False) if 'state_dict' in state_dict.keys(): state_dict = state_dict['state_dict'] missing_key, unexpected_keys, _, _ = self._load_pretrained_model(state_dict, from_NeMo=from_NeMo) diff --git a/nemo/collections/multimodal/speech_llm/models/modular_models.py b/nemo/collections/multimodal/speech_llm/models/modular_models.py index a9ee87e9a9de..8517400ee6ef 100644 --- a/nemo/collections/multimodal/speech_llm/models/modular_models.py +++ b/nemo/collections/multimodal/speech_llm/models/modular_models.py @@ -1077,7 +1077,7 @@ def load_adapters_for_inference(cls, cfg: DictConfig, model_cfg: DictConfig, mod peft_cfg_cls = PEFT_CONFIG_MAP[model_cfg.peft.peft_scheme] model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg), map_location="cpu") else: - torch_state_dict = torch.load(cfg.model.peft.restore_from_path)['state_dict'] + torch_state_dict = torch.load(cfg.model.peft.restore_from_path, weights_only=False)['state_dict'] model.load_state_dict(torch_state_dict, strict=False) elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: checkpoint_path = os.path.join( @@ -1096,7 +1096,7 @@ def load_adapters_for_inference(cls, cfg: DictConfig, model_cfg: DictConfig, mod peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls(model_cfg), map_location="cpu") else: - model.load_state_dict(torch.load(checkpoint_path), strict=False) + model.load_state_dict(torch.load(checkpoint_path, weights_only=False), strict=False) else: raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") elif model_cfg.peft.get("peft_scheme", None): diff --git a/nemo/collections/multimodal/speech_llm/parts/mixins/adapter_mixin.py b/nemo/collections/multimodal/speech_llm/parts/mixins/adapter_mixin.py index 4cdce4ac59c4..506adbc30d73 100644 --- a/nemo/collections/multimodal/speech_llm/parts/mixins/adapter_mixin.py +++ b/nemo/collections/multimodal/speech_llm/parts/mixins/adapter_mixin.py @@ -55,7 +55,7 @@ def load_adapters( if filepath.endswith('.nemo'): conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location) elif filepath.endswith('.ckpt'): - state_dict = torch.load(filepath, map_location)['state_dict'] + state_dict = torch.load(filepath, map_location, weights_only=False)['state_dict'] else: raise RuntimeError(f"{filepath} is not nemo file or ckpt file") if not peft_cfgs: diff --git a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py index 00552cb7f96e..b9485f3f2dc5 100644 --- a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py @@ -130,7 +130,7 @@ def load_adapters( sharded_state_dict = self.sharded_state_dict(prefix="model.") conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location, sharded_state_dict) elif filepath.endswith('.ckpt'): - state_dict = torch.load(filepath, map_location)['state_dict'] + state_dict = torch.load(filepath, map_location, weights_only=False)['state_dict'] else: raise RuntimeError(f"{filepath} is not nemo file or ckpt file") if not self.ptuning_only_and_non_first_stage: diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 8f7870b7d4c7..080db1fa4254 100644 --- 
a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -368,7 +368,7 @@ def load_adapters( if filepath.endswith('.nemo'): conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location) elif filepath.endswith('.ckpt'): - state_dict = torch.load(filepath, map_location)['state_dict'] + state_dict = torch.load(filepath, map_location, weights_only=False)['state_dict'] else: raise RuntimeError(f"{filepath} is not nemo file or ckpt file") if not peft_cfgs: From ca4e4f0d7ce9f11be7bb79d8dba42ee53b7991ad Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Fri, 17 Jan 2025 11:49:43 -0800 Subject: [PATCH 16/27] Add context_logits for eval accuracy calculation in case of multi token prediction tasks (#11753) * Add server ready check before evaluation Uses bool generation_logits_available as inputs dict does not contain it Signed-off-by: Abhishree * Add context logits Signed-off-by: Abhishree * Remove max_tokens_to_generate and add more comments Signed-off-by: Abhishree * Apply isort and black reformatting Signed-off-by: athitten * Get context_logits for multi token prediction tasks Signed-off-by: Abhishree * Fix bug with single/multi token condition check Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply isort and black reformatting Signed-off-by: athitten * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Bugfix with output_context_logits Signed-off-by: Abhishree --------- Signed-off-by: Abhishree Signed-off-by: athitten Co-authored-by: athitten Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/llm/api.py | 13 +++--- nemo/collections/llm/deploy/base.py | 2 + nemo/collections/llm/evaluation/base.py | 61 +++++++++++++++++++------ nemo/deploy/nlp/query_llm.py | 6 +++ nemo/export/tensorrt_llm.py | 18 +++++++- nemo/export/trt_llm/tensorrt_llm_run.py | 3 ++ nemo/export/vllm_exporter.py | 5 ++ requirements/requirements_eval.txt | 2 + 8 files changed, 88 insertions(+), 22 deletions(-) create mode 100644 requirements/requirements_eval.txt diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 386b08cc7813..83201e78283d 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -337,6 +337,7 @@ def deploy( max_input_len: int = 256, max_output_len: int = 256, max_batch_size: int = 8, + output_context_logits: bool = True, output_generation_logits: bool = True, ): """ @@ -364,8 +365,11 @@ def deploy( Needs to be True to be able to run evaluation. Default: True. openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to be True while running evaluation. Default: True. + output_context_logits (bool): If True builds trtllm engine with gather_context_logits set to True. Default: True. + context_logits are used to compute the logProb of the output token in case of multi token prediction benchmarks. output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True. - generation_logits are used to compute the logProb of the output token. Default: True. + generation_logits are used to compute the logProb of the output token in case of single token prediction + benchmarks (like MMLU, lambada). Default: True. 
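A usage sketch based on the signature added in this hunk; the checkpoint path is a placeholder, the import assumes the entrypoint is called directly from nemo.collections.llm.api, and the remaining arguments keep their defaults:

    from nemo.collections.llm.api import deploy

    deploy(
        nemo_checkpoint="/checkpoints/llama_nemo2",  # placeholder path
        model_type="llama",
        triton_model_name="triton_model",
        # context logits serve multi-token benchmarks, generation logits serve
        # single-token benchmarks such as MMLU or LAMBADA
        output_context_logits=True,
        output_generation_logits=True,
    )
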
""" from nemo.collections.llm.deploy.base import get_trtllm_deployable, unset_environment_variables from nemo.deploy import DeployPyTriton @@ -383,6 +387,7 @@ def deploy( max_output_len, max_batch_size, dtype, + output_context_logits, output_generation_logits, ) @@ -425,7 +430,6 @@ def evaluate( limit: Optional[Union[int, float]] = None, bootstrap_iters: int = 100000, # inference params - max_tokens_to_generate: Optional[int] = 256, temperature: Optional[float] = 0.000000001, top_p: Optional[float] = 0.0, top_k: Optional[int] = 1, @@ -454,7 +458,6 @@ def evaluate( bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. Set to 0 for no stderr calculations to be performed. Default: 100000. # inference params - max_tokens_to_generate (int): max tokens to generate. Default: 256. temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token with highest prob is chosen. Temperature can't be set to 0.0 currently, due to a bug with TRTLLM (# TODO to be investigated). Hence using a very samll value as the default. Default: 0.000000001. @@ -480,9 +483,7 @@ def evaluate( # Wait for server to be ready before starting evaluation evaluation.wait_for_server_ready(url=url, triton_http_port=triton_http_port, model_name=model_name) # Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate - model = evaluation.NeMoFWLMEval( - model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos - ) + model = evaluation.NeMoFWLMEval(model_name, url, tokenizer, temperature, top_p, top_k, add_bos) results = evaluator.simple_evaluate( model=model, tasks=eval_task, diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py index 4b0065271604..fd82c94effb3 100644 --- a/nemo/collections/llm/deploy/base.py +++ b/nemo/collections/llm/deploy/base.py @@ -63,6 +63,7 @@ def get_trtllm_deployable( max_output_len, max_batch_size, dtype, + output_context_logits, output_generation_logits, ): """ @@ -109,6 +110,7 @@ def get_trtllm_deployable( max_output_len=max_output_len, max_batch_size=max_batch_size, dtype=dtype, + gather_context_logits=output_context_logits, gather_generation_logits=output_generation_logits, ) except Exception as error: diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py index aa415cb1022a..96d99b445433 100644 --- a/nemo/collections/llm/evaluation/base.py +++ b/nemo/collections/llm/evaluation/base.py @@ -33,38 +33,51 @@ class NeMoFWLMEval(LM): Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md """ - def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos): + def __init__(self, model_name, api_url, tokenizer, temperature, top_p, top_k, add_bos): self.model_name = model_name self.api_url = api_url self.tokenizer = tokenizer - self.max_tokens_to_generate = max_tokens_to_generate self.temperature = temperature self.top_p = top_p self.top_k = top_k self.add_bos = add_bos super().__init__() - def _generate_tokens_logits(self, payload, return_text: bool = False, return_logits: bool = False): + def _generate_tokens_logits( + self, payload, single_prediction_token, return_text: bool = False, return_logits: bool = False + ): """ A private method that sends post request to the model on PyTriton server and returns either generated text or logits. 
""" nq = NemoQueryLLM(url=self.api_url, model_name=payload['model']) + output_context_logits = False + output_generation_logits = False + if single_prediction_token: + # In case of single token prediction return the generation logits + output_generation_logits = True + else: + # In case of multiple token prediction return the context logits + output_context_logits = True response = nq.query_llm( prompts=payload['prompt'] if isinstance(payload['prompt'], list) else [payload['prompt']], max_output_len=payload['max_tokens'], top_k=payload['top_k'], top_p=payload['top_p'], temperature=payload['temperature'], - output_generation_logits=True, + output_context_logits=output_context_logits, + output_generation_logits=output_generation_logits, openai_format_response=True, ) if return_text: return response["choices"][0]["text"] # shape[batch_size, 1] - if return_logits: - return response["choices"][0]["generation_logits"] # shape[batch_size, 1, num_tokens, vocab_size] + elif return_logits: + if output_context_logits: + return response["choices"][0]["context_logits"] + else: + return response["choices"][0]["generation_logits"] def tokenizer_type(self, tokenizer): """ @@ -93,6 +106,16 @@ def loglikelihood(self, requests: list[Instance]): elif tokenizer_type == "AutoTokenizer": special_tokens_kwargs['add_special_tokens'] = self.add_bos + single_prediction_token = False + # Assuming evaluating on only one benchmark/task at a time, hence all instances in requests are of the same + # task. + mmlu_regex_pattern = r"^mmlu_" + lambada_regex_pattern = r"^lambada_" + if re.match(mmlu_regex_pattern, requests[0].task_name) or re.match( + lambada_regex_pattern, requests[0].task_name + ): + single_prediction_token = True + results = [] for request in tqdm(requests): # get the input prompt from the request @@ -105,31 +128,39 @@ def loglikelihood(self, requests: list[Instance]): if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": continuation_enc = continuation_enc[1:] num_cont_tokens = len(continuation_enc) - # Update self.max_tokens_to_generate with number of continuation tokens (or output tokens) in the request - self.max_tokens_to_generate = num_cont_tokens + # Hard code max_tokens_to_generate to 1 to always generate just 1 token + self.max_tokens_to_generate = 1 + # Delete the last token from continuation before passing it to the ip prompt by replacing with empty string + prompt = context + continuation.replace(self.tokenizer.tokenizer.decode(continuation_enc[-1]), "") # Create payload to query the model deployed on PyTriton server payload = { "model": self.model_name, - "prompt": context, + "prompt": prompt, "max_tokens": self.max_tokens_to_generate, "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, } # Get the logits from the model - generation_logits = self._generate_tokens_logits(payload, return_logits=True) - # Convert generation_logits to torch tensor to easily get logprobs wo manual implementation of log_softmax - multi_logits = F.log_softmax(torch.tensor(generation_logits[0]), dim=-1) + logits = self._generate_tokens_logits(payload, single_prediction_token, return_logits=True) + # In case of multiple token prediction where full context logits are returned, get only logits + # corresponding to the continuation tokens from the context logits tensor.context_logits contains logits + # for all tokens in the ip prompt along with the logit for the next token prediction after the final token + # in the prompt. 
Shape of context_logits: [1, #tokens_in_prompt+1, vocab_size] + if not single_prediction_token: + logits = logits[:, -num_cont_tokens:, :] + # Convert logits to torch tensor to easily get logprobs wo manual implementation of log_softmax + logProbs = F.log_softmax(torch.tensor(logits), dim=-1) # Convert encoded continuation tokens to torch tensor cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0) # Get the greedy token from the logits (i.e token with the highest prob) - greedy_tokens = multi_logits.argmax(dim=-1) + greedy_tokens = logProbs.argmax(dim=-1) # Check if all greedy_tokens match the the actual continuation tokens is_greedy = (greedy_tokens == cont_toks).all() # Get the logits corresponding to the actual continuation tokens - logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) + logProbs_actual = torch.gather(logProbs, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # result is tuple of logProb of generating the continuation token and is_greedy - result = (float(logits.sum()), bool(is_greedy)) + result = (float(logProbs_actual.sum()), bool(is_greedy)) results.append(result) diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 8b65a278ff41..93443d47a6a8 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -198,6 +198,7 @@ def query_llm( end_strings=None, init_timeout=60.0, openai_format_response: bool = False, + output_context_logits: bool = False, output_generation_logits: bool = False, ): """ @@ -275,6 +276,9 @@ def query_llm( if end_strings is not None: inputs["end_strings"] = str_list2numpy(end_strings) + if output_context_logits is not None: + inputs["output_context_logits"] = np.full(prompts.shape, output_context_logits, dtype=np.bool_) + if output_generation_logits is not None: inputs["output_generation_logits"] = np.full(prompts.shape, output_generation_logits, dtype=np.bool_) @@ -301,6 +305,8 @@ def query_llm( } if output_generation_logits: openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"] + if output_context_logits: + openai_response["choices"][0]["context_logits"] = result_dict["context_logits"] return openai_response else: return sentences diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index f2bb9d36b377..192b8bc86f65 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -959,6 +959,7 @@ def forward( prompt_embeddings_checkpoint_path: str = None, streaming: bool = False, output_log_probs: bool = False, + output_context_logits: bool = False, output_generation_logits: bool = False, **sampling_kwargs, ): @@ -1049,6 +1050,7 @@ def forward( no_repeat_ngram_size=no_repeat_ngram_size, output_log_probs=output_log_probs, multiprocessed_env=multiprocessed_env, + output_context_logits=output_context_logits, output_generation_logits=output_generation_logits, **sampling_kwargs, ) @@ -1133,6 +1135,7 @@ def get_triton_input(self): Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True), Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), + Tensor(name="output_context_logits", shape=(-1,), dtype=np.bool_, optional=False), Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False), ) return inputs @@ -1142,6 +1145,7 @@ def get_triton_output(self): outputs = ( Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="generation_logits", shape=(-1,), dtype=np.single), + 
Tensor(name="context_logits", shape=(-1,), dtype=np.single), ) return outputs @@ -1149,6 +1153,7 @@ def get_triton_output(self): def triton_infer_fn(self, **inputs: np.ndarray): """Triton infer function for streaming""" output_dict = {} + context_logits_available = False generation_logits_available = False try: infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))} @@ -1179,10 +1184,21 @@ def triton_infer_fn(self, **inputs: np.ndarray): if "output_generation_logits" in inputs: generation_logits_available = inputs["output_generation_logits"][0][0] infer_input["output_generation_logits"] = inputs.pop("output_generation_logits")[0][0] + if "output_context_logits" in inputs: + context_logits_available = inputs["output_context_logits"][0][0] + infer_input["output_context_logits"] = inputs.pop("output_context_logits")[0][0] if generation_logits_available: output_texts, generation_logits = self.forward(**infer_input) - output_dict["generation_logits"] = np.array(generation_logits.cpu().numpy()) + # generation_logits is a 4d tensor of dim [1,1,#generated_tokens, vocab_size], return just the 3d tensor + # in output dict. + output_dict["generation_logits"] = np.array(generation_logits[0].cpu().numpy()) + elif context_logits_available: + output_texts, context_logits = self.forward(**infer_input) + # convert context logits to 3d tensor from list since its avaiable as a list of tensor shaped + # [#tokens, vocab_size] + context_logits = context_logits[0].unsqueeze(0) + output_dict["context_logits"] = np.array(context_logits.cpu().numpy()) else: output_texts = self.forward(**infer_input) output_dict["outputs"] = cast_output(output_texts, np.bytes_) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index ef67c918290f..8be537f840e8 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -647,6 +647,7 @@ def generate( streaming: bool = False, output_log_probs=False, multiprocessed_env=False, + output_context_logits=False, output_generation_logits=False, **sampling_kwargs, ) -> Optional[List[List[str]]]: @@ -709,6 +710,8 @@ def generate( if output_generation_logits: return output_lines_list, outputs['generation_logits'] + elif output_context_logits: + return output_lines_list, outputs['context_logits'] return output_lines_list diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py index 97575058bd1c..b32f5c0a76fc 100644 --- a/nemo/export/vllm_exporter.py +++ b/nemo/export/vllm_exporter.py @@ -403,6 +403,7 @@ def get_triton_input(self): Tensor(name="temperature", shape=(-1,), dtype=numpy.single, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), Tensor(name="output_generation_logits", shape=(-1,), dtype=numpy.bool_, optional=True), + Tensor(name="output_context_logits", shape=(-1,), dtype=numpy.bool_, optional=True), ) return inputs @@ -456,6 +457,7 @@ def forward( streaming: bool = False, output_log_probs: bool = False, output_generation_logits: bool = False, + output_context_logits: bool = False, ) -> Union[List[List[str]], Iterable[List[List[str]]]]: """ The forward function performs LLM evaluation on the provided array of prompts with other parameters shared, @@ -488,6 +490,9 @@ def forward( if output_generation_logits: raise NotImplementedError("output_generation_logits is not supported") + if output_context_logits: + raise NotImplementedError("output_context_logits is not supported") + request_ids = [] for index in range(len(input_texts)): prompt = 
input_texts[index] diff --git a/requirements/requirements_eval.txt b/requirements/requirements_eval.txt new file mode 100644 index 000000000000..60828395c199 --- /dev/null +++ b/requirements/requirements_eval.txt @@ -0,0 +1,2 @@ +# Installs EleutherAI's lm-evaluation-harness https://github.com/EleutherAI/lm-evaluation-harness/tree/main +lm-eval From ad807ae56821c638923f20a251694c1fdac6272f Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Fri, 17 Jan 2025 17:09:14 -0500 Subject: [PATCH 17/27] add dataset_root (#11837) --- nemo/collections/llm/bert/data/specter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/llm/bert/data/specter.py b/nemo/collections/llm/bert/data/specter.py index 7784b32e6bd9..0c477851d22f 100644 --- a/nemo/collections/llm/bert/data/specter.py +++ b/nemo/collections/llm/bert/data/specter.py @@ -43,6 +43,7 @@ class SpecterDataModule(FineTuningDataModule, IOMixin): def __init__( self, + dataset_root: str = None, seq_length: int = 512, tokenizer: Optional["TokenizerSpec"] = None, micro_batch_size: int = 4, @@ -61,7 +62,7 @@ def __init__( self.delete_raw = delete_raw super().__init__( - dataset_root=get_dataset_root("specter"), + dataset_root=get_dataset_root("specter") if dataset_root is None else dataset_root, seq_length=seq_length, tokenizer=tokenizer, micro_batch_size=micro_batch_size, From 8bf5144873413c9ad653365ff1ebc049ba74d5b8 Mon Sep 17 00:00:00 2001 From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:47:12 -0800 Subject: [PATCH 18/27] Support both Path and str for APIs (#11865) * support both path and str for APIs Signed-off-by: Maanu Grover * cleanup Signed-off-by: Maanu Grover * fix cleanup Signed-off-by: Maanu Grover --------- Signed-off-by: Maanu Grover --- nemo/collections/llm/api.py | 41 +++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 83201e78283d..475982b0f746 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import json -import os import warnings from copy import deepcopy from pathlib import Path @@ -47,6 +46,7 @@ TokenizerType = Any +AnyPath = Union[Path, str] @run.cli.entrypoint(namespace="llm") @@ -322,14 +322,14 @@ def ptq( @run.cli.entrypoint(namespace="llm") def deploy( - nemo_checkpoint: Path = None, + nemo_checkpoint: AnyPath = None, model_type: str = "llama", triton_model_name: str = "triton_model", triton_model_version: Optional[int] = 1, triton_http_port: int = 8000, triton_grpc_port: int = 8001, triton_http_address: str = "0.0.0.0", - triton_model_repository: Path = None, + triton_model_repository: AnyPath = None, num_gpus: int = 1, tensor_parallelism_size: int = 1, pipeline_parallelism_size: int = 1, @@ -376,6 +376,11 @@ def deploy( unset_environment_variables() + if not isinstance(nemo_checkpoint, Path): + nemo_checkpoint = Path(nemo_checkpoint) + if not isinstance(triton_model_repository, Path): + triton_model_repository = Path(triton_model_repository) + triton_deployable = get_trtllm_deployable( nemo_checkpoint, model_type, @@ -421,7 +426,7 @@ def deploy( def evaluate( - nemo_checkpoint_path: Path, + nemo_checkpoint_path: AnyPath, url: str = "grpc://0.0.0.0:8001", triton_http_port: int = 8000, model_name: str = "triton_model", @@ -442,7 +447,8 @@ def evaluate( Args: nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which is required to tokenize the evaluation input and output prompts. - url (str): grpc service url that were used in the deploy method above in the format: grpc://{grpc_service_ip}:{grpc_port}. + url (str): grpc service url that were used in the deploy method above + in the format: grpc://{grpc_service_ip}:{grpc_port}. triton_http_port (int): HTTP port that was used for the PyTriton server in the deploy method. Default: 8000. Please pass the triton_http_port if using a custom port in the deploy method. model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as @@ -478,6 +484,9 @@ def evaluate( from nemo.collections.llm import evaluation + if not isinstance(nemo_checkpoint_path, Path): + nemo_checkpoint_path = Path(nemo_checkpoint_path) + # Get tokenizer from nemo ckpt. This works only with NeMo 2.0 ckpt. tokenizer = io.load_context(nemo_checkpoint_path + "/context", subpath="model.tokenizer") # Wait for server to be ready before starting evaluation @@ -499,7 +508,7 @@ def evaluate( def import_ckpt( model: pl.LightningModule, source: str, - output_path: Optional[Path] = None, + output_path: Optional[AnyPath] = None, overwrite: bool = False, ) -> Path: """ @@ -557,6 +566,9 @@ def import_ckpt( ValueError: If the model does not implement ConnectorMixin, indicating a lack of necessary importer functionality. 
""" + if output_path and not isinstance(output_path, Path): + output_path = Path(output_path) + output = io.import_ckpt(model=model, source=source, output_path=output_path, overwrite=overwrite) console = Console() @@ -569,15 +581,17 @@ def import_ckpt( return output -def load_connector_from_trainer_ckpt(path: Path, target: str) -> io.ModelConnector: +def load_connector_from_trainer_ckpt(path: AnyPath, target: str) -> io.ModelConnector: + if not isinstance(path, Path): + path = Path(path) return io.load_context(path, subpath="model").exporter(target, path) @run.cli.entrypoint(name="export", namespace="llm") def export_ckpt( - path: Path, + path: AnyPath, target: str, - output_path: Optional[Path] = None, + output_path: Optional[AnyPath] = None, overwrite: bool = False, load_connector: Callable[[Path, str], io.ModelConnector] = load_connector_from_trainer_ckpt, ) -> Path: @@ -628,6 +642,11 @@ def export_ckpt( ValueError: If the model does not implement ConnectorMixin, indicating a lack of necessary exporter functionality. """ + if not isinstance(path, Path): + path = Path(path) + if output_path and not isinstance(output_path, Path): + output_path = Path(output_path) + output = io.export_ckpt(path, target, output_path, overwrite, load_connector) console = Console() @@ -638,7 +657,7 @@ def export_ckpt( @run.cli.entrypoint(name="generate", namespace="llm") def generate( - path: Union[Path, str], + path: AnyPath, trainer: nl.Trainer, prompts: Optional[list[str]] = None, encoder_prompts: Optional[list[str]] = None, @@ -650,7 +669,7 @@ def generate( inference_batch_times_seqlen_threshold: int = 1000, inference_params: Optional["CommonInferenceParams"] = None, text_only: bool = False, - output_path: Optional[Union[Path, str]] = None, + output_path: Optional[AnyPath] = None, ) -> list[Union["InferenceRequest", str]]: """ Generates text using a NeMo LLM model. 
From 4df3fe5460af0c559b08b482688d6272d9c3821d Mon Sep 17 00:00:00 2001 From: Vince Xu Date: Sat, 18 Jan 2025 08:58:57 +0800 Subject: [PATCH 19/27] fix tensor dimensions are not compatible for FP8 issue in sft (#8787) Co-authored-by: yuhuang Co-authored-by: Eric Harper --- .../nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py index 53d94452a480..349321d3570a 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py @@ -373,7 +373,7 @@ def collate_fn(self, batch): if self.pad_to_max_length: max_length = self.max_seq_length else: - max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 8)) + max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) assert max_length <= self.max_seq_length if not self.get_attention_mask_from_fusion: From bd58e14392c47118ca65a014f5e3d6e0bd66e9cc Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 17 Jan 2025 19:56:15 -0800 Subject: [PATCH 20/27] Run nsys callback on GBS not on MBS (#11861) * fix nsys callback running on each mbs Signed-off-by: Alexandros Koumparoulis * Update test Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/pytorch/callbacks/nsys.py | 7 +++++-- tests/lightning/pytorch/callbacks/test_nsys.py | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py index 13b059011426..0368b2d52773 100644 --- a/nemo/lightning/pytorch/callbacks/nsys.py +++ b/nemo/lightning/pytorch/callbacks/nsys.py @@ -78,6 +78,7 @@ def __init__( f'Nsys profiling setup with start_step: {self._nsys_profile_start_step},' f'and end_step: {self._nsys_profile_end_step}' ) + self._has_nsys_enabled = False def _rank_is_active(self, trainer): # TODO(@akoumparouli): is this function cache-able? 
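The fix in this patch hinges on a boolean guard so that the CUDA profiler is started and stopped once per profiling window rather than on every micro-batch. A simplified sketch of that pattern follows; the class is an invented stand-in for the patched NsysCallback:

    import torch

    class ProfileWindow:
        def __init__(self, start_step: int, end_step: int):
            self.start_step = start_step
            self.end_step = end_step
            self.active = False  # plays the role of _has_nsys_enabled

        def on_step_start(self, step: int) -> None:
            if step == self.start_step and not self.active:
                self.active = True
                torch.cuda.cudart().cudaProfilerStart()

        def on_step_end(self, step: int) -> None:
            if step == self.end_step and self.active:
                torch.cuda.cudart().cudaProfilerStop()
                self.active = False
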
@@ -98,7 +99,8 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Opt return current_step = get_current_epoch_step(trainer) - if current_step == self._nsys_profile_start_step: + if current_step == self._nsys_profile_start_step and not self._has_nsys_enabled: + self._has_nsys_enabled = True torch.cuda.cudart().cudaProfilerStart() if self._nsys_profile_gen_shape: torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__() @@ -114,6 +116,7 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int) return current_step = get_current_epoch_step(trainer) - if current_step == self._nsys_profile_end_step: + if current_step == self._nsys_profile_end_step and self._has_nsys_enabled: torch.cuda.cudart().cudaProfilerStop() torch.autograd.profiler.emit_nvtx().__exit__(None, None, None) + self._has_nsys_enabled = False diff --git a/tests/lightning/pytorch/callbacks/test_nsys.py b/tests/lightning/pytorch/callbacks/test_nsys.py index 9653e707198e..04ca7be718d9 100644 --- a/tests/lightning/pytorch/callbacks/test_nsys.py +++ b/tests/lightning/pytorch/callbacks/test_nsys.py @@ -111,6 +111,8 @@ def test_on_train_batch_end_profiling( callback = NsysCallback(start_step=10, end_step=20, ranks=[0]) mock_trainer.strategy.current_epoch_step = 20 + assert callback._has_nsys_enabled == False + callback._has_nsys_enabled = True callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20) mock_cudart().cudaProfilerStop.assert_called_once() From fcd4807d882e706a5f95a6c651ce066e08dbbc50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 18 Jan 2025 21:33:05 +0100 Subject: [PATCH 21/27] ci: Set bump-branch to weekly (#11889) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/mcore-tag-bump-bot.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mcore-tag-bump-bot.yml b/.github/workflows/mcore-tag-bump-bot.yml index 1b0712924101..467e89c4144e 100644 --- a/.github/workflows/mcore-tag-bump-bot.yml +++ b/.github/workflows/mcore-tag-bump-bot.yml @@ -10,10 +10,10 @@ jobs: uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_bump_dockerfile.yml@v0.11.0 with: source-repository: NVIDIA/Megatron-LM - source-ref: main + source-ref: weekly-bump-2025-03 build-arg: MCORE_TAG dockerfile: Dockerfile.ci - base-branch: main + base-branch: weekly-bump-2025-03 cicd-label: Run CICD pr-reviewers: 'pablo-garay' secrets: From 102bac65380d93fef0cd1722445081220158932f Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Sat, 18 Jan 2025 18:11:40 -0800 Subject: [PATCH 22/27] surface attn_implementation option (#11873) Signed-off-by: Alexandros Koumparoulis --- .../llm/gpt/model/hf_auto_model_for_causal_lm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py index abe966229ffe..5f315397584b 100644 --- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -44,6 +44,7 @@ def __init__( trust_remote_code=False, default_dtype=torch.bfloat16, load_in_4bit=False, + attn_implementation="sdpa", ): super().__init__() self.save_hyperparameters() @@ -58,6 +59,7 @@ def __init__( self.trust_remote_code = trust_remote_code self.default_dtype = default_dtype 
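For reference, the new attn_implementation option is simply forwarded to Hugging Face transformers; a hedged sketch of the equivalent direct call, with a placeholder model id:

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "gpt2",                      # placeholder model id
        torch_dtype="auto",
        attn_implementation="sdpa",  # "eager" and "flash_attention_2" are common alternatives
    )
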
self.load_in_4bit = load_in_4bit + self.attn_implementation = attn_implementation @property def tokenizer(self): @@ -82,6 +84,7 @@ def configure_model(self): torch_dtype='auto', trust_remote_code=self.trust_remote_code, load_in_4bit=self.load_in_4bit, + attn_implementation=self.attn_implementation, ) else: from transformers import AutoConfig @@ -89,7 +92,10 @@ def configure_model(self): config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code) dtype = getattr(config, 'torch_dtype', self.default_dtype) self.model = AutoModelForCausalLM.from_config( - config, torch_dtype=dtype, trust_remote_code=self.trust_remote_code + config, + torch_dtype=dtype, + trust_remote_code=self.trust_remote_code, + attn_implementation=self.attn_implementation, ) # Apply FSDP2 and TP to the model From 1218a040c6a95451dc9567068e5ae5de8ff5633c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 19 Jan 2025 07:43:49 +0100 Subject: [PATCH 23/27] chore: Update mcore-tag-bump-bot.yml (#11891) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/mcore-tag-bump-bot.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mcore-tag-bump-bot.yml b/.github/workflows/mcore-tag-bump-bot.yml index 467e89c4144e..6914a8a217d2 100644 --- a/.github/workflows/mcore-tag-bump-bot.yml +++ b/.github/workflows/mcore-tag-bump-bot.yml @@ -10,11 +10,11 @@ jobs: uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_bump_dockerfile.yml@v0.11.0 with: source-repository: NVIDIA/Megatron-LM - source-ref: weekly-bump-2025-03 + source-ref: main build-arg: MCORE_TAG dockerfile: Dockerfile.ci base-branch: weekly-bump-2025-03 cicd-label: Run CICD pr-reviewers: 'pablo-garay' secrets: - PAT: ${{ secrets.PAT }} \ No newline at end of file + PAT: ${{ secrets.PAT }} From aa4f9fb98ecf11455115face11ae9f9213d8d252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 20 Jan 2025 11:07:43 +0100 Subject: [PATCH 24/27] ci: Bump Mcore in weekly PR (#11897) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/mcore-tag-bump-bot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mcore-tag-bump-bot.yml b/.github/workflows/mcore-tag-bump-bot.yml index 6914a8a217d2..01afb55d4361 100644 --- a/.github/workflows/mcore-tag-bump-bot.yml +++ b/.github/workflows/mcore-tag-bump-bot.yml @@ -13,7 +13,7 @@ jobs: source-ref: main build-arg: MCORE_TAG dockerfile: Dockerfile.ci - base-branch: weekly-bump-2025-03 + base-branch: weekly-bump cicd-label: Run CICD pr-reviewers: 'pablo-garay' secrets: From 0075ed0fd30ee0b8cd7f10265b9593f66ce162d1 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 20 Jan 2025 11:14:04 -0800 Subject: [PATCH 25/27] check restore_config first (#11890) Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/resume.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index 6d6ddda1fd80..0224d7e9836d 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -103,23 +103,7 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None): if isinstance(trainer, fl.Fabric): raise NotImplementedError("Fabric is not supported yet.") - trainer_ckpt_path = 
self.get_trainer_ckpt_path(model) - if trainer_ckpt_path: - trainer.ckpt_path = trainer_ckpt_path - trainer.checkpoint_callback.last_model_path = trainer_ckpt_path - # Load artifacts - if getattr(self.restore_config, 'load_artifacts', False): - if isinstance(trainer_ckpt_path, AdapterPath): - # load tokenizer from the base model during peft resume, in case the first peft checkpoint - # is deleted before the current peft checkpoint is saved - context_path = trainer_ckpt_path.base_model_path / "context" - if not context_path.exists(): - context_path = trainer_ckpt_path.base_model_path - else: - context_path = self.get_context_path(model) - model = _try_restore_tokenizer(model, context_path) - - elif self.restore_config: + if self.restore_config: new_path = self._extract_path( model=model, path=self.restore_config.path, @@ -139,6 +123,21 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None): _try_restore_tokenizer(model, context_path) + elif (trainer_ckpt_path := self.get_trainer_ckpt_path(model)) is not None: + trainer.ckpt_path = trainer_ckpt_path + trainer.checkpoint_callback.last_model_path = trainer_ckpt_path + # Load artifacts + if getattr(self.restore_config, 'load_artifacts', False): + if isinstance(trainer_ckpt_path, AdapterPath): + # load tokenizer from the base model during peft resume, in case the first peft checkpoint + # is deleted before the current peft checkpoint is saved + context_path = trainer_ckpt_path.base_model_path / "context" + if not context_path.exists(): + context_path = trainer_ckpt_path.base_model_path + else: + context_path = self.get_context_path(model) + model = _try_restore_tokenizer(model, context_path) + def _extract_path( self, model: Optional[io.ConnectorMixin], path: str, adapter_path: Optional[str] = None ) -> BasePath: From 499161e6e173a25eb642e9c9ef6a28b73d3ea6ec Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:17:30 -0800 Subject: [PATCH 26/27] LinearAdapter: propagate args to _init_adapter (#11902) * propagate defaults Signed-off-by: Alexandros Koumparoulis * switch dropout default to 0.0 Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/collections/llm/peft/lora.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 6c7e7e93ae8f..a6b7ad288765 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -52,7 +52,7 @@ class LinearAdapter(nn.Linear): orig_linear (nn.Module): the linear module to augment. dim (int): lora's dim in_features -> dim -> out_features. alpha (int): lora's scaling alpha. - dropout (float): dropout prob (default: 0.1). + dropout (float): dropout prob (default: 0.0). dropout_position (str): where to apply dropout rel. 
to lora (choices= ['pre', 'post'], default=post) lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform']) lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they @@ -64,7 +64,7 @@ def __init__( orig_linear, dim=8, alpha=32, - dropout=0.1, + dropout=0.0, dropout_position='post', lora_A_init_method='xavier', lora_dtype=None, @@ -82,14 +82,22 @@ def __init__( if orig_linear.bias is not None: self.bias.data.copy_(orig_linear.bias.data) # initialize the adapte - LinearAdapter._init_adapter(self) + LinearAdapter._init_adapter( + self, + dim=dim, + alpha=alpha, + dropout=dropout, + dropout_position=dropout_position, + lora_A_init_method=lora_A_init_method, + lora_dtype=lora_dtype, + ) @staticmethod def _init_adapter( obj, dim=8, alpha=32, - dropout=0.1, + dropout=0.0, dropout_position='post', lora_A_init_method='xavier', lora_dtype=None, @@ -101,7 +109,7 @@ def _init_adapter( obj (LinearAdapter | nn.Module): input module to adapt. dim (int): lora's dim in_features -> dim -> out_features. alpha (int): lora's scaling alpha. - dropout (float): dropout prob (default: 0.1). + dropout (float): dropout prob (default: 0.0). dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post) lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform']) lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they @@ -155,7 +163,7 @@ def patch_linear_module( orig_linear, dim=8, alpha=32, - dropout=0.1, + dropout=0.0, dropout_position='post', lora_A_init_method='xavier', lora_dtype=None, @@ -175,7 +183,7 @@ def patch_linear_module( orig_linear (nn.Linear): the module we add adapter to. dim (int, optional): Lora dim. Defaults to 8. alpha (int, optional): Lora alpha scale. Defaults to 32. - dropout (float, optional): dropout prob. Defaults to 0.1. + dropout (float, optional): dropout prob. Defaults to 0.0. dropout_position (str, optional): location to apply dropout wrt lora. Defaults to 'post' (choices: 'pre', 'post'). lora_A_init_method (str, optional): lora_a init method. Defaults to 'xavier'. From b4f89c50bda173a9ec2b2883aeee4dc291465031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Kami=C5=84ski?= <67481570+Laplasjan107@users.noreply.github.com> Date: Tue, 21 Jan 2025 11:10:54 +0100 Subject: [PATCH 27/27] NeMo 2.0 fp8 conversion (#11845) * initial commit Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * code review: reuse nemo1 model config loader Signed-off-by: Piotr Kaminski * remove unused import Signed-off-by: Piotr Kaminski --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Co-authored-by: Laplasjan107 --- .../trt_llm/nemo_ckpt_loader/nemo_file.py | 29 +++++++++++++++---- .../convert_nemo1_to_nemo2.py | 26 +++++++++++++++-- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 1d344fd55735..f3c9812555bc 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -480,6 +480,28 @@ def get_model_type(nemo_ckpt: Union[str, Path]) -> Optional[str]: return model_type +def load_distributed_model_weights( + weights_directory: Union[Path, TarPath], mcore_scales_format: bool +) -> Dict[str, Any]: + """ + Loads model weights in `torch_dist` format directly from weights directory. 
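A short usage sketch of the helper introduced here, assuming a local torch_dist weights directory; the path is a placeholder:

    from pathlib import Path
    from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights

    # mcore_scales_format=False rewrites the FP8 scaling factors for local export,
    # as described in the rest of this docstring; True keeps the megatron.core layout.
    state_dict = load_distributed_model_weights(Path("/checkpoints/weights"), mcore_scales_format=False)
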
+ Preprocesses the scaling factors for local export if mcore_scales_format is set to False. + + Args: + weights_directory (Path | TarPath): Path to the weights directory. + mcore_scales_format (bool): Flag for local vs megatron.core export. + + Returns: + dict: Model state dictionary + """ + model = load_sharded_metadata(weights_directory) + if not mcore_scales_format: + model.update({k: v[0] for k, v in model.items() if EXTRA_STATE in k and isinstance(v, list)}) + model = preprocess_scaling_factors_for_local_export(model) + + return model + + def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Path], mcore_scales_format: bool = True): if not os.path.exists(nemo_ckpt): raise TypeError("%s does not exist", nemo_ckpt) @@ -496,10 +518,7 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat if (nemo_dir / "model_weights").exists(): dist_ckpt_folder = nemo_dir / "model_weights" - model = load_sharded_metadata(dist_ckpt_folder) - if not mcore_scales_format: - model.update({k: v[0] for k, v in model.items() if EXTRA_STATE in k and isinstance(v, list)}) - model = preprocess_scaling_factors_for_local_export(model) + model = load_distributed_model_weights(dist_ckpt_folder, mcore_scales_format) nemo_model_config = unpacked_checkpoint_dir.model_config @@ -515,7 +534,7 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat tokenizer = build_tokenizer(tokenizer_config) elif (nemo_dir / "weights").exists(): dist_ckpt_folder = nemo_dir / "weights" - model = load_sharded_metadata(dist_ckpt_folder) + model = load_distributed_model_weights(dist_ckpt_folder, mcore_scales_format) io_folder = nemo_dir / "context" if (io_folder / "model.yaml").exists(): diff --git a/scripts/checkpoint_converters/convert_nemo1_to_nemo2.py b/scripts/checkpoint_converters/convert_nemo1_to_nemo2.py index 5ce814d6cff3..4052678ccef3 100644 --- a/scripts/checkpoint_converters/convert_nemo1_to_nemo2.py +++ b/scripts/checkpoint_converters/convert_nemo1_to_nemo2.py @@ -40,6 +40,7 @@ import tempfile from argparse import ArgumentParser from pathlib import Path +from typing import Any, Dict import torch from megatron.core.dist_checkpointing.dict_utils import dict_list_map_inplace @@ -56,6 +57,7 @@ from nemo.lightning.ckpt_utils import ckpt_to_context_subdir from nemo.lightning.io.pl import TrainerContext, ckpt_to_weights_subdir from nemo.utils import logging +from nemo.utils.model_utils import load_config MODEL_CONFIG_MAPPING = { "meta-llama/Llama-2-7b-hf": (llm.LlamaModel, llm.Llama2Config7B), @@ -116,7 +118,23 @@ def get_args(): return args -def get_nemo2_model(model_id, tokenizer) -> llm.GPTModel: +def load_fp8_config(model_path: str) -> Dict[str, Any]: + """ + Loads fp8 configuration of the NeMo 1.0 model. + + Args: + model_path (str): Path to NeMo 1.0 checkpoint. + + Returns: + (dict): NeMo 1.0 model fp8 settings. + """ + fp8_params = ['fp8', 'fp8_amax_history_len', 'fp8_interval', 'fp8_margin', 'fp8_amax_compute_algo'] + config = load_config(model_path) + fp8_config = {key: config[key] for key in fp8_params if key in config} + return fp8_config + + +def get_nemo2_model(model_id, tokenizer, input_path) -> llm.GPTModel: """ Get NeMo 2.0 model class from model_id and tokenizer. Use bf16 for NeMo 1.0 ckpts. @@ -135,8 +153,10 @@ def get_nemo2_model(model_id, tokenizer) -> llm.GPTModel: valid_ids = "\n- ".join([""] + list(MODEL_CONFIG_MAPPING.keys())) raise ValueError(f"Unsupported model_id: {model_id}. 
Please provide a valid model_id from {valid_ids}") model_cls, config_cls = MODEL_CONFIG_MAPPING[model_id] + + fp8_config = load_fp8_config(input_path) # nemo1 ckpts are bf16 - return model_cls(config_cls(bf16=True, params_dtype=torch.bfloat16), tokenizer=tokenizer) + return model_cls(config_cls(bf16=True, params_dtype=torch.bfloat16, **fp8_config), tokenizer=tokenizer) def get_tokenizer(input_path: Path, tokenizer_tmp_dir: Path) -> AutoTokenizer: @@ -183,7 +203,7 @@ def main() -> None: tokenizer_tmp_dir = Path("/tmp/nemo_tokenizer") tokenizer_tmp_dir.mkdir(parents=True, exist_ok=True) tokenizer = get_tokenizer(Path(args.input_path), tokenizer_tmp_dir) - model = get_nemo2_model(args.model_id, tokenizer=tokenizer) + model = get_nemo2_model(args.model_id, tokenizer=tokenizer, input_path=args.input_path) model.optim = None trainer = Trainer(