From 74025a6b8312bbe7c8c8090d9f06c3c6853d7ee7 Mon Sep 17 00:00:00 2001 From: "Peter St. John" Date: Tue, 14 Jan 2025 08:55:10 -0700 Subject: [PATCH 01/27] remove tensorstore pin in requirements*.txt (#11777) Signed-off-by: Peter St. John --- requirements/requirements_infer.txt | 2 +- requirements/requirements_nlp.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/requirements_infer.txt b/requirements/requirements_infer.txt index 5f428d91fc56..47daf571d26f 100644 --- a/requirements/requirements_infer.txt +++ b/requirements/requirements_infer.txt @@ -3,6 +3,6 @@ fastapi nvidia-pytriton pydantic-settings -tensorstore==0.1.45 +tensorstore uvicorn zarr diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index 6a86dacbfefb..d35b649a46ba 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -21,6 +21,6 @@ rapidfuzz rouge_score sacrebleu # manually install sacrebleu[ja] for Japanese support; MeCab is unsupported in Python 3.11+ sentence_transformers -tensorstore<0.1.46 +tensorstore tiktoken==0.7.0 zarr From cdaf7b141c39b416dc19ce39d89624966f32cf0c Mon Sep 17 00:00:00 2001 From: Hemil Desai Date: Tue, 14 Jan 2025 22:50:41 +0530 Subject: [PATCH 02/27] Do not load context for model transform in llm inference (#11751) --- nemo/collections/llm/inference/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nemo/collections/llm/inference/base.py b/nemo/collections/llm/inference/base.py index dd53d97b21ad..0a87480f31d9 100644 --- a/nemo/collections/llm/inference/base.py +++ b/nemo/collections/llm/inference/base.py @@ -14,7 +14,7 @@ import inspect import json from pathlib import Path -from typing import Optional, Union +from typing import Optional import lightning.pytorch as pl import torch @@ -161,7 +161,7 @@ def _setup_trainer_and_restore_model(path: Path, trainer: nl.Trainer, model: pl. trainer.strategy.trainer = trainer trainer.strategy.selective_restore() - peft: Union[io.TrainerContext, PEFT] = io.load_context(ckpt_to_context_subdir(path), "model.model_transform") + peft: Optional[PEFT] = model.model_transform if isinstance(peft, PEFT): model = peft(model) adapter_sharded_state_dict = {k: v for k, v in model.sharded_state_dict().items() if ".adapter." 
in k} From c1e46eab419223f0d76fdac7767c3025212be778 Mon Sep 17 00:00:00 2001 From: chenrui17 <33319780+chenrui17@users.noreply.github.com> Date: Wed, 15 Jan 2025 01:23:30 +0800 Subject: [PATCH 03/27] add chat sft dataset to support agent tool calling (#11759) * add chat sft dataset to support agent tool calling * Apply isort and black reformatting Signed-off-by: chenrui17 * update docstring * fix typo Signed-off-by: Chen Cui --------- Signed-off-by: chenrui17 Signed-off-by: Chen Cui Co-authored-by: Charlie Chen Co-authored-by: chenrui17 Co-authored-by: Chen Cui --- .../megatron/gpt_sft_chat_dataset.py | 16 ++-- .../collections/nlp/test_chat_sft_dataset.py | 73 +++++++++++++++++++ 2 files changed, 83 insertions(+), 6 deletions(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py index 6d71a9d8e014..53d94452a480 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py @@ -110,7 +110,7 @@ def _mask_targets( header_len (int): the system prompt length s_ids (List[Tensor]): array of tokenized ids of each turns tokenizer (TokenizerSpec): tokenizer object - mask_role (str): the speaker id to be masked from loss computation + mask_role (str): the speaker id to be masked from loss computation. If there is more than 1 masked role, `mask_role` is a comma-separated string of the roles gtype (str): either 'TEXT_TO_VALUE' or 'VALUE_TO_TEXT' name_end_token_ids (int): end of name token ids special_tokens (dict): special tokens used for the chat prompt. It has the keys: system_turn_start, turn_start, label_start, end_of_turn @@ -164,13 +164,13 @@ def _mask_targets( if i == 0 and (gtype == 'VALUE_TO_TEXT' or gtype is None): # mask the first turn completely to provide at least one turn as context for the rest target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX - elif speaker == mask_role and i == 1 and gtype == 'TEXT_TO_VALUE': + elif speaker in mask_role and i == 1 and gtype == 'TEXT_TO_VALUE': # leave the first turn start tag unmasked, servers severs as the end of turn signal target[cur_idx + num_turn_start_tokens : cur_idx + tokenized_len] = IGNORE_INDEX - elif speaker == mask_role and (i > 1): + elif speaker in mask_role and (i > 1): # leave the first turn start tag unmasked, which severs as the end of turn signal target[cur_idx + num_turn_start_tokens : cur_idx + tokenized_len] = IGNORE_INDEX - elif speaker == mask_role and (i <= 1): + elif speaker in mask_role and (i <= 1): # mask out everything in the second turn target[cur_idx : cur_idx + tokenized_len] = IGNORE_INDEX else: @@ -238,7 +238,7 @@ def _add_speaker_and_signal(header, source, mask_role, gtype, special_tokens): ) conversation += sentence["value"] # if the last turn is not masked, add next token start token to the end, which will be included for loss calculation - if sentence_from != mask_role and i == len(source) - 1: + if sentence_from not in mask_role and i == len(source) - 1: conversation += TURN_TOKEN return conversation @@ -276,7 +276,11 @@ def preprocess( ids.append(torch.tensor(tokenized_sentence)) tokenized_lens.append(len(tokenized_sentence)) speakers = [sentence["from"] for sentence in source['conversations']] - assert mask_role in speakers, "mask role not in the conversation" + # assert mask_role in speakers, "mask role not in the conversation" + split_mask = mask_role.split(',') + for s in 
split_mask: + assert s in speakers, "mask role not in the conversation" + target = torch.LongTensor(target) # not going to train on the header target[:header_len] = IGNORE_INDEX diff --git a/tests/collections/nlp/test_chat_sft_dataset.py b/tests/collections/nlp/test_chat_sft_dataset.py index bc44049f8f11..fce07c0f3897 100644 --- a/tests/collections/nlp/test_chat_sft_dataset.py +++ b/tests/collections/nlp/test_chat_sft_dataset.py @@ -76,6 +76,36 @@ def create_data_points(mask_user, turn_num, records, temp_file, t2v, label=True) return data_points +def create_custom_data_points(mask_list, turn_num, records, temp_file): + data_points = [] + with open(temp_file, 'w', encoding='utf-8') as f: + for r in range(records): + record = {} + record['system'] = 'a chat' + record['mask'] = '' + for i, s in enumerate(mask_list): + record['mask'] += s + if i != len(mask_list) - 1: + record['mask'] += ',' + turns = [] + record['conversations'] = turns + for i in range(turn_num): + turn = {} + if i % 4 == 0: + turn['from'] = 'User' + elif i % 4 == 1: + turn['from'] = 'Assistant' + elif i % 4 == 2: + turn['from'] = 'Function' + else: + turn['from'] = 'Assistant' + turn['value'] = get_random_sentence() + turns.append(turn) + f.write(json.dumps(record, ensure_ascii=False) + '\n') + data_points.append(record) + return data_points + + @pytest.mark.skipif(not os.path.exists('/home/TestData'), reason='Not a Jenkins machine') class TestGPTSFTChatDataset: @classmethod @@ -118,6 +148,41 @@ def _mask_user_test(self, tokenizer, ids_to_text): finally: os.remove(temp_file) + def _mask_user_func_test(self, tokenizer, ids_to_text): + random.seed(5) + temp_file = '/tmp/test_file.jsonl' + turn_num = 10 + records = 2 + mask_list = ["User", "Function"] + try: + # create custom data for Agent SFT case + data_points = create_custom_data_points(mask_list, turn_num, records, temp_file) + print(data_points) + d = GPTSFTChatDataset( + temp_file, + tokenizer, + 4096, + 1, + index_mapping_dir='/tmp/', + hf_dataset=True, + special_tokens=self.special_tokens, + ) + for i in range(len(d)): + result = d[i] + input_ids = result['input_ids'] + mask = result['mask'] + text = ids_to_text(input_ids[mask].tolist()) + print("【text】", i) + print(text) + expected_text = '' + for j in range(1, turn_num, 2): + expected_text += data_points[i]['conversations'][j]['value'] + self.suffix + print("【expected text】", i) + print(expected_text) + assert text == expected_text + finally: + os.remove(temp_file) + def _mask_assistant_test(self, tokenizer, ids_to_text): random.seed(3) temp_file = '/tmp/test_file.jsonl' @@ -321,6 +386,14 @@ def test_43B_tokenizer_mask_assistant_nolabel(self): tokenizer = get_nmt_tokenizer(library='sentencepiece', tokenizer_model=TOKENIZER_FILE_43B) self._mask_assistant_nolabel_test(tokenizer, tokenizer.ids_to_text) + @pytest.mark.unit + def test_mpt_tokenizer_mask_user_func(self): + tokenizer = get_nmt_tokenizer( + library='huggingface', model_name='gpt2', merges_file=MERGE_FILE, vocab_file=VOCAB_FILE, use_fast=True + ) + tokenizer.add_special_tokens({'additional_special_tokens': ['', '', '']}) + self._mask_user_func_test(tokenizer, partial(ids_to_text, tokenizer)) + @pytest.mark.unit def test_mpt_tokenizer_mask_user(self): tokenizer = get_nmt_tokenizer( From dc08eddab27ef1da0cbf4f272c25dedf7cb8f11e Mon Sep 17 00:00:00 2001 From: meatybobby Date: Tue, 14 Jan 2025 09:57:20 -0800 Subject: [PATCH 04/27] Fix starcoder2 missing bias in nemo2 config (#11809) --- nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py | 3 +++ 1 file changed, 3 
insertions(+) diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 9adc83c1b82a..1d344fd55735 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -553,6 +553,9 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat elif nemo_model_config["activation"] == "squared_relu": nemo_model_config["activation"] = "squared-relu" + if nemo_model_config.get("add_bias_linear"): + nemo_model_config["bias"] = True + nemo_model_config["mcore_gpt"] = True nemo_model_config["max_position_embeddings"] = nemo_model_config.get("seq_length", 4096) nemo_model_config["rotary_percentage"] = nemo_model_config.get("rotary_percent", 1.0) From c856900f8ef16f144476f5978a2a7e6e99195a2b Mon Sep 17 00:00:00 2001 From: Huiying Date: Tue, 14 Jan 2025 13:58:05 -0800 Subject: [PATCH 05/27] update nemo2 tutorial container verison (#11832) Signed-off-by: Huiying Li --- tutorials/llm/llama-3/nemo2-sft-peft/README.rst | 4 ++-- tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/README.rst b/tutorials/llm/llama-3/nemo2-sft-peft/README.rst index d1bd7b87759c..0dee7c316697 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/README.rst +++ b/tutorials/llm/llama-3/nemo2-sft-peft/README.rst @@ -20,7 +20,7 @@ Requirements * Software Requirements * Use the latest [NeMo Framework Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo/tags) . Note that you must be logged in to the container registry to view this page. - * This notebook is tested on the container: `nvcr.io/nvidia/nemo:24.12-rc0`. + * This notebook is tested on the container: `nvcr.io/nvidia/nemo:24.12`. * Get your Hugging Face [access token](https://huggingface.co/docs/hub/en/security-tokens), which will be used to obtain the tokenizer required during training. * NeMo 2.0 and NeMo-Run @@ -42,7 +42,7 @@ Start the NeMo Framework Container --rm -it \ -v ${PWD}:/workspace \ -w /workspace \ - nvcr.io/nvidia/nemo:24.12-rc0 bash + nvcr.io/nvidia/nemo:24.12 bash Once you are inside the container, you can run `nvidia-smi` to verify that the GPUs are accessible. diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb index b3393d133a45..730ffd9ff972 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb @@ -533,7 +533,7 @@ "\n", "2. [NeMo-Run GitHub repo](https://github.com/NVIDIA/NeMo-Run/)\n", "\n", - "3. NeMo Framework Container: `nvcr.io/nvidia/nemo:24.12-rc0`\n", + "3. 
NeMo Framework Container: `nvcr.io/nvidia/nemo:24.12`\n", "\n", "\n", "\n", From 84d5fad2eb2152161c759d1153dfc5d50f11de62 Mon Sep 17 00:00:00 2001 From: Selvaraj Anandaraj Date: Tue, 14 Jan 2025 18:22:22 -0800 Subject: [PATCH 06/27] MCore Partial DistOpt Feature (#10693) * Added interface arg for partial DistOpt Signed-off-by: Selvaraj Anandaraj * Typo fix Signed-off-by: Selvaraj Anandaraj * Apply isort and black reformatting Signed-off-by: sanandaraj5597 * Changed variable name Signed-off-by: Selvaraj Anandaraj --------- Signed-off-by: Selvaraj Anandaraj Signed-off-by: sanandaraj5597 Co-authored-by: Selvaraj Anandaraj Co-authored-by: sanandaraj5597 --- .../language_modeling/megatron_base_model.py | 1 + .../language_modeling/megatron_gpt_model.py | 1 + .../modules/common/megatron/megatron_init.py | 2 ++ nemo/collections/nlp/parts/nlp_overrides.py | 1 + nemo/utils/app_state.py | 17 +++++++++++++++++ 5 files changed, 22 insertions(+) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index 330f6ffee05b..cf13a0318ffc 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -195,6 +195,7 @@ def __init__(self, cfg: DictConfig, trainer: Trainer, no_lm_init=True): virtual_pipeline_model_parallel_size=vp_size, pipeline_model_parallel_split_rank=cfg.get('pipeline_model_parallel_split_rank', 0), use_tp_pp_dp_mapping=cfg.get('use_tp_pp_dp_mapping', False), + num_distributed_optimizer_instances=self.cfg.optim.get('num_distributed_optimizer_instances', 1), context_parallel_size=cfg.get('context_parallel_size', 1), micro_batch_size=cfg.get('micro_batch_size'), global_batch_size=cfg.get('global_batch_size'), diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py index 02ef522dde1f..caa909dc7ead 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_model.py @@ -593,6 +593,7 @@ def setup_mcore_distributed_parallel(self): ddp_config = DistributedDataParallelConfig( grad_reduce_in_fp32=(self.cfg.optim.get('grad_sync_dtype', 'fp32') == 'fp32'), overlap_grad_reduce=self.cfg.optim.get('overlap_grad_sync', False), + num_distributed_optimizer_instances=self.cfg.optim.get('num_distributed_optimizer_instances', 1), use_distributed_optimizer=True, check_for_nan_in_grad=self.cfg.optim.get('check_for_nan_in_grad', False), # mcore bucket_size is based on num of parameters, therefore not diff --git a/nemo/collections/nlp/modules/common/megatron/megatron_init.py b/nemo/collections/nlp/modules/common/megatron/megatron_init.py index 8b42985a3937..5e44fda2be23 100644 --- a/nemo/collections/nlp/modules/common/megatron/megatron_init.py +++ b/nemo/collections/nlp/modules/common/megatron/megatron_init.py @@ -106,6 +106,7 @@ def initialize_model_parallel_for_nemo( apex_transformer_log_level=30, use_tp_pp_dp_mapping=False, use_te_rng_tracker=False, + num_distributed_optimizer_instances=1, ): if virtual_pipeline_model_parallel_size is not None and not HAVE_INTERLEAVED: @@ -117,6 +118,7 @@ def initialize_model_parallel_for_nemo( app_state.world_size = world_size app_state.local_rank = local_rank app_state.use_tp_pp_dp_mapping = use_tp_pp_dp_mapping + app_state.num_distributed_optimizer_instances = num_distributed_optimizer_instances 
app_state.expert_model_parallel_size = expert_model_parallel_size app_state.tensor_model_parallel_size = tensor_model_parallel_size app_state.pipeline_model_parallel_size = pipeline_model_parallel_size diff --git a/nemo/collections/nlp/parts/nlp_overrides.py b/nemo/collections/nlp/parts/nlp_overrides.py index 144583db249a..ee62b80fe1be 100644 --- a/nemo/collections/nlp/parts/nlp_overrides.py +++ b/nemo/collections/nlp/parts/nlp_overrides.py @@ -163,6 +163,7 @@ def init_model_parallel( use_sharp=sharp, expert_model_parallel_size=app_state.expert_model_parallel_size, order='tp-pp-dp' if app_state.use_tp_pp_dp_mapping else 'tp-cp-ep-dp-pp', + num_distributed_optimizer_instances=app_state.num_distributed_optimizer_instances, distributed_timeout_minutes=distributed_timeout_minutes, ) diff --git a/nemo/utils/app_state.py b/nemo/utils/app_state.py index 37193cfdd8c5..643d5afe5815 100644 --- a/nemo/utils/app_state.py +++ b/nemo/utils/app_state.py @@ -58,6 +58,7 @@ def __init__(self): self._data_parallel_size = None self._data_parallel_group = None self._use_tp_pp_dp_mapping = False + self._num_distributed_optimizer_instances = 1 self._megatron_checkpoint_version = None self._use_fp8 = False self._context_parallel_size = None @@ -242,6 +243,22 @@ def use_tp_pp_dp_mapping(self): def use_tp_pp_dp_mapping(self, use_new_mapping): self._use_tp_pp_dp_mapping = use_new_mapping + @property + def num_distributed_optimizer_instances(self): + """Property returns the factor by which the Partial DistOpt is sharded. + Returns: + The partial DistOpt shard factor + """ + return self._num_distributed_optimizer_instances + + @num_distributed_optimizer_instances.setter + def num_distributed_optimizer_instances(self, shard_factor): + """Property sets the factor by which the Partial DistOpt is sharded. + Args: + shard_factor (int): The partial DistOpt shard factor. + """ + self._num_distributed_optimizer_instances = shard_factor + @property def virtual_pipeline_model_parallel_size(self): """Property returns the number of GPUs in each model parallel group. 
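
Editor's note on the patch above: the new `num_distributed_optimizer_instances` knob is read straight off the model's optimizer config (see the `cfg.optim.get('num_distributed_optimizer_instances', 1)` calls) and then threaded through `initialize_model_parallel_for_nemo`, `DistributedDataParallelConfig`, and `AppState`. The following is a minimal, hypothetical sketch of that lookup only — the config fragment and the value 2 are illustrative assumptions, not taken from any NeMo recipe:

    from omegaconf import OmegaConf

    # Hypothetical model config fragment; only the new key matters here.
    cfg = OmegaConf.create({"optim": {"num_distributed_optimizer_instances": 2}})

    # Same lookup pattern as megatron_base_model.py / megatron_gpt_model.py in the patch:
    # default to 1 (a single, fully sharded DistOpt instance) when the key is absent,
    # otherwise shard the distributed optimizer state across that many instances.
    shard_factor = cfg.optim.get("num_distributed_optimizer_instances", 1)
    print(shard_factor)  # -> 2
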
From 3591cf8cf9be7aa47a1da3aca2e2a2d7318208a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Wed, 15 Jan 2025 19:34:35 +0100 Subject: [PATCH 07/27] ci: Shorter retention period MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/_test_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index f198ffe6af1b..911fcc17e636 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -47,7 +47,7 @@ jobs: steps: - name: Docker system cleanup run: | - docker system prune -a --filter "until=48h" --force || true + docker system prune -a --filter "until=24h" --force || true - name: Docker pull image run: | From 1626ddded63af1fe82b0bee482fba44c57b8203d Mon Sep 17 00:00:00 2001 From: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:25:44 -0800 Subject: [PATCH 08/27] Add Seq Packing in NeMo / Neva2 (#11633) * api updates and fixes Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix Signed-off-by: yaoyu-33 * fix arg Signed-off-by: yaoyu-33 * update seq packing in mock ds Signed-off-by: yaoyu-33 * save Signed-off-by: yaoyu-33 * update preprocess_data Signed-off-by: yaoyu-33 * update seq packing Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix sp Signed-off-by: yaoyu-33 * save Signed-off-by: yaoyu-33 * fix seq packing Signed-off-by: yaoyu-33 * add truncation and padding Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Fix issues Signed-off-by: yaoyu-33 * change LLaVATemplateConfig variables to class variables * change to use field with default attributes * Apply isort and black reformatting Signed-off-by: yashaswikarnati * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Add seq packing option in energon Signed-off-by: yaoyu-33 * Fix energon conversation Signed-off-by: yaoyu-33 * add energon option in neva training script Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * add ci test for packed seq Signed-off-by: yaoyu-33 * fix mock dataset seq packing Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix mock dataset seq packing Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix lint and update seq pack func Signed-off-by: yaoyu-33 * fix energon module Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * fix comments Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * address lightning issues Signed-off-by: yaoyu-33 * Apply isort and black reformatting Signed-off-by: yaoyu-33 * Update sequence_packing.py Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> * update energon requirements Signed-off-by: yaoyu-33 * Fix for energon update Signed-off-by: yaoyu-33 * fix for test Signed-off-by: yaoyu-33 --------- Signed-off-by: yaoyu-33 Signed-off-by: yaoyu-33 Signed-off-by: yashaswikarnati Signed-off-by: Yu Yao <54727607+yaoyu-33@users.noreply.github.com> Co-authored-by: yaoyu-33 Co-authored-by: ykarnati Co-authored-by: yashaswikarnati --- .github/workflows/cicd-main.yml | 18 +- nemo/collections/llm/peft/api.py | 4 +- .../multimodal/data/energon/base.py | 5 + .../multimodal/data/energon/config.py | 24 ++- 
.../multimodal/data/energon/conversation.py | 2 +- .../multimodal/data/energon/task_encoder.py | 170 +++++++++++++++--- nemo/collections/vlm/inference/base.py | 2 +- nemo/collections/vlm/neva/data/config.py | 6 +- nemo/collections/vlm/neva/data/lazy.py | 115 ++++++------ nemo/collections/vlm/neva/data/mock.py | 69 ++++++- .../vlm/neva/data/sequence_packing.py | 157 ++++++++++++++++ nemo/collections/vlm/neva/model/base.py | 38 +++- nemo/collections/vlm/recipes/llava15_13b.py | 2 +- nemo/collections/vlm/recipes/llava15_7b.py | 2 +- nemo/collections/vlm/recipes/llava_next_7b.py | 2 +- nemo/lightning/megatron_parallel.py | 5 +- requirements/requirements_multimodal.txt | 2 +- scripts/vlm/llava_next_finetune.py | 2 +- scripts/vlm/llava_next_pretrain.py | 2 +- scripts/vlm/mllama_finetune.py | 2 +- scripts/vlm/neva_finetune.py | 109 ++++++++--- .../data/energon/test_data_module.py | 4 +- .../{mllama_train.py => test_mllama_train.py} | 0 .../vlm/{neva_train.py => test_neva_train.py} | 7 + 24 files changed, 611 insertions(+), 138 deletions(-) create mode 100644 nemo/collections/vlm/neva/data/sequence_packing.py rename tests/collections/vlm/{mllama_train.py => test_mllama_train.py} (100%) rename tests/collections/vlm/{neva_train.py => test_neva_train.py} (95%) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 16037920d080..a815be7bdc2f 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -4329,11 +4329,24 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - python tests/collections/vlm/neva_train.py \ + python tests/collections/vlm/test_neva_train.py \ --devices=1 \ --max-steps=5 \ --experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }} + L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING: + needs: [cicd-test-container-setup] + uses: ./.github/workflows/_test_template.yml + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING') || needs.cicd-test-container-setup.outputs.all == 'true' + with: + RUNNER: self-hosted-azure + SCRIPT: | + python tests/collections/vlm/test_neva_train.py \ + --devices=1 \ + --max-steps=5 \ + --experiment-dir=/tmp/nemo2_neva_results/${{ github.run_id }} \ + --use_packed_sequence + L2_NeMo_2_MLLAMA_MOCK_TRAINING: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4342,7 +4355,7 @@ jobs: RUNNER: self-hosted-azure SCRIPT: | TRANSFORMERS_OFFLINE=1 \ - python tests/collections/vlm/mllama_train.py \ + python tests/collections/vlm/test_mllama_train.py \ --devices=1 \ --max-steps=5 \ --experiment-dir=/tmp/nemo2_mllama_results/${{ github.run_id }} @@ -5060,6 +5073,7 @@ jobs: - Speech_Checkpoints_tests - L2_Stable_Diffusion_Training - L2_NeMo_2_NEVA_MOCK_TRAINING + - L2_NeMo_2_NEVA_MOCK_PACKED_TRAINING - L2_NeMo_2_MLLAMA_MOCK_TRAINING - L2_NeMo_2_GPT_Pretraining_no_transformer_engine - L2_NeMo_2_GPT_DDP_Param_Parity_check diff --git a/nemo/collections/llm/peft/api.py b/nemo/collections/llm/peft/api.py index c05fd0b8edde..b70601faf7a3 100644 --- a/nemo/collections/llm/peft/api.py +++ b/nemo/collections/llm/peft/api.py @@ -16,10 +16,10 @@ from pathlib import Path from typing import Tuple, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch +from lightning.pytorch.trainer.states import TrainerFn from megatron.core import dist_checkpointing -from pytorch_lightning.trainer.states import TrainerFn from rich.console import Console from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import 
AutoTokenizer diff --git a/nemo/collections/multimodal/data/energon/base.py b/nemo/collections/multimodal/data/energon/base.py index 3dfd495edd82..c29935880889 100644 --- a/nemo/collections/multimodal/data/energon/base.py +++ b/nemo/collections/multimodal/data/energon/base.py @@ -68,6 +68,7 @@ def __init__( multimodal_sample_config: Optional[MultiModalSampleConfig] = MultiModalSampleConfig(), task_encoder: Optional[MultiModalTaskEncoder] = None, decoder_seq_length: Optional[int] = None, + packing_buffer_size: Optional[int] = None, ) -> None: """ Initialize the EnergonMultiModalDataModule. @@ -84,6 +85,8 @@ def __init__( Defaults to MultiModalSampleConfig(). task_encoder (MultiModalTaskEncoder, optional): Encoder responsible for encoding and batching samples. If not provided, a default (MultimodalTaskEncoder) encoder will be created. Defaults to None. + decoder_seq_length (int, optional): The maximum sequence length for the decoder. Used in encoder-decoder models. + packing_buffer_size (int, optional): Size of the packing buffer for batched samples. Defaults to None. """ super().__init__() @@ -113,6 +116,7 @@ def __init__( ) self.train_dataloader_object = None self.val_dataloader_object = None + self.packing_buffer_size = packing_buffer_size def io_init(self, **kwargs) -> fdl.Config[Self]: @@ -146,6 +150,7 @@ def datasets_provider(self, worker_config, split: Literal['train', 'val'] = 'val task_encoder=self.task_encoder, worker_config=worker_config, max_samples_per_sequence=None, + packing_buffer_size=self.packing_buffer_size, shuffle_buffer_size=100, split_part=split, ) diff --git a/nemo/collections/multimodal/data/energon/config.py b/nemo/collections/multimodal/data/energon/config.py index c145c5e51019..abbfd874880f 100644 --- a/nemo/collections/multimodal/data/energon/config.py +++ b/nemo/collections/multimodal/data/energon/config.py @@ -13,8 +13,11 @@ # limitations under the License. from dataclasses import dataclass, field -from typing import List +from typing import List, Tuple, Union + import torch +from megatron.core.packed_seq_params import PackedSeqParams + from nemo.collections.multimodal.data.energon.conversation import LLaVATemplateConfig @@ -34,7 +37,7 @@ class ImageToken(MultiModalToken): @dataclass class ImageTextSample: - '''Sample type for template formatted raw image text sample''' + """Sample type for template formatted raw image text sample""" __key__: str = '' images: torch.Tensor = field(default_factory=lambda: torch.empty(0)) @@ -43,6 +46,15 @@ class ImageTextSample: loss_mask: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float)) +@dataclass +class PackedImageTextSample(ImageTextSample): + """Sample type for packed image text sample""" + + __restore_key__: Tuple[Union[str, int, tuple], ...] 
= () + position_ids: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float)) + packed_seq_params: PackedSeqParams = field(default_factory=lambda: PackedSeqParams()) + + @dataclass class ImageTextRawBatch: """Sample type for image text raw batch""" @@ -56,6 +68,14 @@ class ImageTextRawBatch: loss_mask: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float)) +@dataclass +class PackedImageTextRawBatch(ImageTextRawBatch): + """Sample type for image text raw batch""" + + position_ids: torch.Tensor = field(default_factory=lambda: torch.empty(0, dtype=torch.float)) + packed_seq_params: PackedSeqParams = field(default_factory=lambda: PackedSeqParams()) + + @dataclass class MultiModalSampleConfig: image_token: ImageToken = field(default_factory=ImageToken) diff --git a/nemo/collections/multimodal/data/energon/conversation.py b/nemo/collections/multimodal/data/energon/conversation.py index 31019ae9c615..95b0ad184f8c 100644 --- a/nemo/collections/multimodal/data/energon/conversation.py +++ b/nemo/collections/multimodal/data/energon/conversation.py @@ -30,7 +30,7 @@ class LLaVATemplateConfig(BaseConversationTemplateConfig): """LLava-specific template configuration which extends the base config""" system: str = field( - default="A chat between a curious user and artificial assistant agent. " + default="A chat between a curious user and an artificial intelligence assistant. " "The assistant gives helpful, detailed and polite answers to user's questions." ) roles: List[str] = field(default_factory=lambda: ['user', 'assistant']) diff --git a/nemo/collections/multimodal/data/energon/task_encoder.py b/nemo/collections/multimodal/data/energon/task_encoder.py index 7a8d0f0ab033..80b6e156f4a1 100644 --- a/nemo/collections/multimodal/data/energon/task_encoder.py +++ b/nemo/collections/multimodal/data/energon/task_encoder.py @@ -25,14 +25,21 @@ batch_list, batch_pad_stack, ) +from megatron.energon.task_encoder.base import stateless -from nemo.collections.multimodal.data.energon.config import ImageTextRawBatch, ImageTextSample +from nemo.collections.multimodal.data.energon.config import ( + ImageTextRawBatch, + ImageTextSample, + PackedImageTextRawBatch, + PackedImageTextSample, +) from nemo.collections.multimodal.data.energon.sample_encoder import ( InterleavedSampleEncoder, SampleEncoder, SimilarityInterleavedEncoder, VQASampleEncoder, ) +from nemo.utils import logging class MultiModalTaskEncoder( @@ -54,16 +61,34 @@ class MultiModalTaskEncoder( for model input. """ - def __init__(self, tokenizer, image_processor, multimodal_sample_config): + def __init__( + self, + tokenizer, + image_processor, + multimodal_sample_config, + packed_sequence=False, + packed_sequence_size=-1, + num_image_embeddings_per_tile=576, + ): """ Initialize the MultiModalTaskEncoder with specific encoders for different sample types. Parameters: - tokenizer (Tokenizer): The tokenizer used for processing text across different sample types. - image_processor (ImageProcessor): The image processor used for preprocessing images. - multimodal_sample_config (MultiModalSampleConfig): MultiModalSampleConfig object. + tokenizer (Tokenizer): The tokenizer used for processing textual components across sample types. + image_processor (ImageProcessor): The image processor responsible for preprocessing image data. + multimodal_sample_config (MultiModalSampleConfig): Configuration object defining properties and + requirements for multimodal samples. 
+ packed_sequence (bool, optional): Flag indicating whether packed sequences are used. Default is False. + packed_sequence_size (int, optional): The size of packed sequences, used when `packed_sequence` is True. + Default is -1. + num_image_embeddings_per_tile (int, optional): Number of image embeddings per image tile. Determines + the granularity of image features. Default is 576. """ self.tokenizer = tokenizer + self.sample_config = multimodal_sample_config + self.packed_sequence = packed_sequence + self.num_image_embeddings_per_tile = num_image_embeddings_per_tile # only used with seq packing + self.packed_sequence_size = packed_sequence_size self.encoders: Dict[str, SampleEncoder] = { VQASample.__name__: VQASampleEncoder( tokenizer=tokenizer, @@ -92,6 +117,7 @@ def register_encoder(self, sample_type: str, encoder: SampleEncoder) -> None: """ self.encoders[sample_type] = encoder + @stateless def encode_sample( self, sample: Union[VQASample, InterleavedSample, SimilarityInterleavedSample, CaptioningSample] ) -> ImageTextSample: @@ -118,7 +144,9 @@ def encode_sample( encoded_sample = encoder.encode(input_sample=sample, output_sample=ImageTextSample()) return encoded_sample - def batch(self, samples: List[ImageTextSample]) -> ImageTextRawBatch: + def batch( + self, samples: List[Union[ImageTextSample, PackedImageTextSample]] + ) -> Union[ImageTextRawBatch, PackedImageTextRawBatch]: """ Batch a list of encoded samples into a single raw batch. @@ -131,26 +159,51 @@ def batch(self, samples: List[ImageTextSample]) -> ImageTextRawBatch: ImageTextRawBatch: The batched data, including images, tokens, labels, and loss masks. """ - keys, images, tokens, labels, loss_mask = [], [], [], [], [] - for sample in samples: - keys.append(sample.__key__) - images.append(sample.images) - tokens.append(sample.tokens) - labels.append(sample.labels) - loss_mask.append(sample.loss_mask) - - batch_keys = batch_list(keys) - batch_images = batch_pad_stack(images) - batch_prompt_tokens = batch_pad_stack(tokens) - batch_labels = batch_pad_stack(labels) - batch_loss_mask = batch_pad_stack(loss_mask) - return ImageTextRawBatch( - __keys__=batch_keys, - images=batch_images, - tokens=batch_prompt_tokens, - labels=batch_labels, - loss_mask=batch_loss_mask, - ) + if self.packed_sequence: + if len(samples) > 1: + raise ValueError( + "Micro batch size should be 1 when training with packed sequence, but your micro batch size " + f"is {len(samples)}. \nThe following config is equivalent to your current setting for " + f"a packed dataset. Please update your config to the following: \n" + f"Set micro batch size to 1 (currently {len(samples)})\n" + f"Set global batch size to `global_batch_size // {len(samples)}` " + f"Set packed sequence length to `original_sample_seq_len * {len(samples)}` " + f"(currently {self.packed_sequence_size}) \n" + f"For details please visit " + f"https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/packed_sequence.html" + ) + # The batching are taken care by packing. 
+ sample = samples[0] + return PackedImageTextRawBatch( + __keys__=sample.__key__, + images=sample.images, + tokens=sample.tokens, + labels=sample.labels, + loss_mask=sample.loss_mask, + position_ids=sample.position_ids, + packed_seq_params=sample.packed_seq_params, + ) + else: + keys, images, tokens, labels, loss_mask = [], [], [], [], [] + for sample in samples: + keys.append(sample.__key__) + images.append(sample.images) + tokens.append(sample.tokens) + labels.append(sample.labels) + loss_mask.append(sample.loss_mask) + + batch_keys = batch_list(keys) + batch_images = batch_pad_stack(images) + batch_prompt_tokens = batch_pad_stack(tokens) + batch_labels = batch_pad_stack(labels) + batch_loss_mask = batch_pad_stack(loss_mask) + return ImageTextRawBatch( + __keys__=batch_keys, + images=batch_images, + tokens=batch_prompt_tokens, + labels=batch_labels, + loss_mask=batch_loss_mask, + ) def encode_batch(self, batch_data: ImageTextRawBatch) -> dict: """ @@ -165,7 +218,7 @@ def encode_batch(self, batch_data: ImageTextRawBatch) -> dict: Returns: dict: A dictionary containing the encoded batch data, ready for model input. """ - batch_dict = dataclasses.asdict(batch_data) + batch_dict = batch_data.__dict__ if 'images' in batch_dict: batch_dict['media'] = batch_dict['images'] del batch_dict['images'] @@ -177,3 +230,66 @@ def encode_batch(self, batch_data: ImageTextRawBatch) -> dict: if 'attention_mask' not in batch_dict: batch_dict['attention_mask'] = None return batch_dict + + def select_samples_to_pack(self, samples): + """Selects which samples will be packed together. + + NOTE: Energon dataloader calls this method internally if packing is used. + Please see https://nvidia.github.io/Megatron-Energon/packing.html + """ + from nemo.collections.vlm.neva.data.sequence_packing import greedy_knapsack, predict_seq_len + + media_token_id = self.sample_config.image_token.token_id + lengths = [ + predict_seq_len( + sample.tokens, + media_token_index=media_token_id, + num_image_embeddings_per_tile=self.num_image_embeddings_per_tile, + ) + for sample in samples + ] + packed_samples = greedy_knapsack(lengths, samples, self.packed_sequence_size) + avg_samples_per_bin = round(len(lengths) / len(packed_samples)) + logging.info( + f"[Seq Packing Info] - Packing seq len: {self.packed_sequence_size}, " + f"Buffered samples: {len(lengths)}, Total number of bins: {len(packed_samples)}, " + f"Average samples per bin: {avg_samples_per_bin}" + ) + return packed_samples + + @stateless + def pack_selected_samples(self, samples): + """ + Function to pack a list of ImageTaskSample into a single ImageTaskSamplePacked. + + NOTE: Energon dataloader calls this method internally if packing is used. + Please see https://nvidia.github.io/Megatron-Energon/packing.html + + Args: + samples: List of ImageTaskSample instances to pack into one sample. + + Returns: + ImageTaskSamplePacked instance. 
+ """ + from nemo.collections.vlm.neva.data.sequence_packing import convert_to_packed + + packed_images = torch.stack([sample.images for sample in samples]) + media_token_id = self.sample_config.image_token.token_id + packed_tokens, packed_labels, packed_position_ids, packed_loss_mask, packed_seq_params = convert_to_packed( + tokens=[sample.tokens for sample in samples], + labels=[sample.labels for sample in samples], + num_image_embeddings_per_tile=self.num_image_embeddings_per_tile, + media_token_index=media_token_id, + ignore_index=self.sample_config.ignore_place_holder, + ) + + return PackedImageTextSample( + __key__=",".join([s.__key__ for s in samples]), + __restore_key__=(), # Will be set by energon based on `samples` + tokens=packed_tokens, + labels=packed_labels, + images=packed_images, + position_ids=packed_position_ids, + loss_mask=packed_loss_mask, + packed_seq_params=packed_seq_params, + ) diff --git a/nemo/collections/vlm/inference/base.py b/nemo/collections/vlm/inference/base.py index 77918bae26b9..bbceb851edae 100644 --- a/nemo/collections/vlm/inference/base.py +++ b/nemo/collections/vlm/inference/base.py @@ -14,7 +14,7 @@ from typing import List, Optional, Union -import pytorch_lightning as pl +import lightning.pytorch as pl import torch import torch.distributed from megatron.core.inference.common_inference_params import CommonInferenceParams diff --git a/nemo/collections/vlm/neva/data/config.py b/nemo/collections/vlm/neva/data/config.py index 3b22d5a493b3..2cf3dd80f47d 100644 --- a/nemo/collections/vlm/neva/data/config.py +++ b/nemo/collections/vlm/neva/data/config.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Optional from .multimodal_tokens import ImageToken, MultiModalToken, VideoToken @@ -31,7 +31,7 @@ class DataConfig: @dataclass class ImageDataConfig(DataConfig): media_type: str = "image" - media_token: MultiModalToken = ImageToken + media_token: MultiModalToken = field(default_factory=lambda: ImageToken()) image_folder: Optional[str] = None image_process_mode: str = 'pad' @@ -39,7 +39,7 @@ class ImageDataConfig(DataConfig): @dataclass class VideoDataConfig(DataConfig): media_type: str = "video" - media_token: MultiModalToken = VideoToken + media_token: MultiModalToken = VideoToken() splice_single_frame: Optional[str] = None # 'first', 'middle', 'last' will represent video as first / middle / last frame only, all other frames discarded. 
num_frames: int = 8 # Selects the number of frames to use from the video diff --git a/nemo/collections/vlm/neva/data/lazy.py b/nemo/collections/vlm/neva/data/lazy.py index 066310867777..90199d3c6d30 100644 --- a/nemo/collections/vlm/neva/data/lazy.py +++ b/nemo/collections/vlm/neva/data/lazy.py @@ -251,7 +251,6 @@ def __init__( data_config, tokenizer, image_processor, - sequence_length=None, ): super().__init__() if data_path is not None: @@ -269,8 +268,6 @@ def __init__( self.tokenizer = self.tokenizer.tokenizer self.image_processor = image_processor - self.sequence_length = sequence_length - self.conv_template = data_config.conv_template self.conv = supported_conv_templates[self.conv_template] self.image_process_mode = data_config.image_process_mode @@ -381,6 +378,8 @@ def __init__( data_config, tokenizer, image_processor, + packed_sequence=False, + num_image_embeddings_per_tile=576, ): if data_path.endswith(".json"): @@ -414,29 +413,12 @@ def __init__( else: raise ValueError(f"Formatting of {data_path} is not supported in Neva.") + self.packed_sequence = packed_sequence + self.num_image_embeddings_per_tile = num_image_embeddings_per_tile def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: data_config = self.data_config - packed_sequence = "cu_seqlens" in instances[0] - max_len = max(instance['tokens'].shape[0] for instance in instances) - for instance in instances: - pad_len = max_len - instance['tokens'].shape[0] - instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) - instance['labels'] = F.pad(instance['labels'], (0, pad_len), 'constant', IGNORE_INDEX) - if packed_sequence and instance["cu_seqlens"][-1] != max_len: - instance["cu_seqlens"] = torch.cat((instance["cu_seqlens"], torch.IntTensor([max_len])), 0) - - if packed_sequence: - max_len_cu = max(instance['cu_seqlens'].shape[0] for instance in instances) - max_len_image = max(instance['image'].shape[0] for instance in instances) - for instance in instances: - pad_len_cu = max_len_cu - instance['cu_seqlens'].shape[0] - instance['cu_seqlens'] = F.pad(instance['cu_seqlens'], (0, pad_len_cu), 'constant', max_len) - - x = instance['image'] - num_pad = max_len_image - x.shape[0] - pad_tensor = torch.zeros(num_pad, *x.shape[1:], dtype=x.dtype, device=x.device) - instance['image'] = torch.cat((x, pad_tensor), dim=0) + packed_sequence = self.packed_sequence media_type = data_config.media_type if media_type == 'image': @@ -447,24 +429,30 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: else: raise ValueError(f"Unsupported media type {media_type}") - batch = default_collate(instances) - tokenizer = self.tokenizer + if packed_sequence: + from nemo.collections.vlm.neva.data.sequence_packing import convert_to_packed + + media_token_id = self.data_config.media_token.token_index + tokens, labels, position_ids, loss_mask, packed_seq_params = convert_to_packed( + tokens=[instance['tokens'] for instance in instances], + labels=[instance['labels'] for instance in instances], + num_image_embeddings_per_tile=self.num_image_embeddings_per_tile, + media_token_index=media_token_id, + ignore_index=IGNORE_INDEX, + ) + attention_mask = None + else: # regular dataset + max_len = max(instance['tokens'].shape[0] for instance in instances) + for instance in instances: + pad_len = max_len - instance['tokens'].shape[0] + instance['tokens'] = F.pad(instance['tokens'], (0, pad_len), 'constant', 0) + instance['labels'] = F.pad(instance['labels'], (0, pad_len), 'constant', IGNORE_INDEX) - 
tokens = batch['tokens'] - labels = batch['labels'] + batch = default_collate(instances) + tokenizer = self.tokenizer - if packed_sequence: - cu_seqlens = batch["cu_seqlens"] - position_ids = [] - for cu_seqlen in cu_seqlens: - position_ids.append([]) - for ind in range(0, len(cu_seqlen) - 1): - seqlen = cu_seqlen[ind + 1] - cu_seqlen[ind] - position_ids[-1].extend(list(range(seqlen))) - position_ids = torch.LongTensor(position_ids) - loss_mask = torch.ones(tokens.size(), dtype=torch.float, device=tokens.device) - attention_mask = torch.ones(tokens.size(), dtype=torch.long, device=tokens.device) - else: + tokens = batch['tokens'] + labels = batch['labels'] attention_mask, loss_mask, position_ids = get_ltor_masks_and_position_ids( data=tokens, eod_token=tokenizer.eos_token_id, @@ -472,8 +460,7 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: reset_attention_mask=data_config.reset_attention_mask, reset_position_ids=data_config.reset_position_ids, ) - - loss_mask[labels < 0] = 0.0 + loss_mask[labels < 0] = 0.0 batch = { 'tokens': tokens, @@ -484,7 +471,7 @@ def collate_fn(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: 'media': media, } if packed_sequence: - batch["cu_seqlens"] = cu_seqlens + batch["packed_seq_params"] = packed_seq_params return batch @@ -506,7 +493,8 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, - use_packed_sequence: bool = False, + packed_sequence: bool = False, + num_image_embeddings_per_tile: int = 576, seed: int = 1234, ) -> None: super().__init__() @@ -534,7 +522,8 @@ def __init__( self.pin_memory = pin_memory self.persistent_workers = persistent_workers self.seed = seed - self.use_packed_sequence = use_packed_sequence + self.packed_sequence = packed_sequence + self.num_image_embeddings_per_tile = num_image_embeddings_per_tile self.init_global_step = 0 if tokenizer is None or image_processor is None: @@ -546,6 +535,20 @@ def __init__( self.tokenizer = tokenizer or AutoTokenizer("llava-hf/llava-1.5-7b-hf") self.image_processor = image_processor or processor.image_processor + if self.packed_sequence: + import dataclasses + + def custom_on_megatron_step_start(self, step): + return dataclasses.replace( + step, + seq_length=self.seq_len, + micro_batch_size=1, # Override the micro_batch_size to 1 (used in PP) + num_microbatches=self.num_microbatches, + decoder_seq_length=self.decoder_seq_len, + ) + + MegatronDataSampler.on_megatron_step_start = custom_on_megatron_step_start + self.data_sampler = MegatronDataSampler( seq_len=self.seq_length, decoder_seq_len=self.decoder_seq_length, @@ -556,14 +559,22 @@ def __init__( def setup(self, stage: str = "") -> None: assert len(self.paths) == 1, "not yet support blend dataset in Neva 2.0!" 
- if self.use_packed_sequence: - pass # TODO - else: - # TODO: - # rng = torch.Generator().manual_seed(self.seed) - # train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size], generator=rng) - self._train_ds = NevaDataset(self.paths[0], self.data_config, self.tokenizer, self.image_processor) - self._validation_ds = NevaDataset(self.paths[0], self.data_config, self.tokenizer, self.image_processor) + self._train_ds = NevaDataset( + self.paths[0], + self.data_config, + self.tokenizer, + self.image_processor, + packed_sequence=self.packed_sequence, + num_image_embeddings_per_tile=self.num_image_embeddings_per_tile, + ) + self._validation_ds = NevaDataset( + self.paths[0], + self.data_config, + self.tokenizer, + self.image_processor, + packed_sequence=self.packed_sequence, + num_image_embeddings_per_tile=self.num_image_embeddings_per_tile, + ) def train_dataloader(self) -> TRAIN_DATALOADERS: return self._create_dataloader(self._train_ds) diff --git a/nemo/collections/vlm/neva/data/mock.py b/nemo/collections/vlm/neva/data/mock.py index 7533bf56ac46..495bd9f0dee5 100644 --- a/nemo/collections/vlm/neva/data/mock.py +++ b/nemo/collections/vlm/neva/data/mock.py @@ -42,6 +42,7 @@ def __init__( num_workers: int = 8, pin_memory: bool = True, persistent_workers: bool = False, + packed_sequence: bool = False, ): super().__init__() self.seq_length = seq_length @@ -54,6 +55,7 @@ def __init__( self.num_workers = num_workers self.pin_memory = pin_memory self.persistent_workers = persistent_workers + self.packed_sequence = packed_sequence if tokenizer is None or image_processor is None: logging.warning(f"Processor or tokenizer are not provided! Fall back to `llava-hf/llava-1.5-7b-hf`.") @@ -72,14 +74,36 @@ def __init__( ) def setup(self, stage: str = "") -> None: + seq_length = self.seq_length + if self.packed_sequence and self.micro_batch_size > 1: + seq_length = seq_length // self.micro_batch_size + logging.warning( + f"Packed sequence is used with mock dataset. Sequence length for each " + f"sample is update to `seq_length // self.micro_batch_size = {seq_length}`!" 
+ ) self._train_ds = _MockNevaDataset( - self.tokenizer, self.image_processor, "train", self.num_train_samples, self.seq_length + self.tokenizer, + self.image_processor, + "train", + self.num_train_samples, + seq_length, + packed_sequence=self.packed_sequence, ) self._validation_ds = _MockNevaDataset( - self.tokenizer, self.image_processor, "valid", self.num_val_samples, self.seq_length + self.tokenizer, + self.image_processor, + "valid", + self.num_val_samples, + seq_length, + packed_sequence=self.packed_sequence, ) self._test_ds = _MockNevaDataset( - self.tokenizer, self.image_processor, "test", self.num_test_samples, self.seq_length + self.tokenizer, + self.image_processor, + "test", + self.num_test_samples, + seq_length, + packed_sequence=self.packed_sequence, ) def train_dataloader(self) -> TRAIN_DATALOADERS: @@ -117,6 +141,8 @@ def __init__( num_samples: int, seq_length: int, seed: int = 42, + packed_sequence: bool = False, + num_image_embeddings_per_tile=576, ) -> None: super().__init__() self.name = name @@ -129,8 +155,10 @@ def __init__( self.length = num_samples self.seed = seed + self.packed_sequence = packed_sequence + self.num_image_embeddings_per_tile = num_image_embeddings_per_tile - self.loss_mask = torch.ones(self.seq_length, dtype=torch.float) + self.loss_mask = torch.ones(self.seq_length + 1 - num_image_embeddings_per_tile, dtype=torch.float) self.position_ids = torch.arange(self.seq_length, dtype=torch.int64) def __len__(self) -> int: @@ -143,7 +171,11 @@ def _get_text(self, idx: int) -> np.ndarray: def __getitem__(self, idx) -> Dict[str, torch.Tensor]: # Generate data of the expected size and datatype (based on GPTDataset). np_gen = np.random.default_rng(seed=(self.seed + idx)) - tokens = torch.from_numpy(np_gen.integers(self.vocab_size, size=[self.seq_length + 1], dtype=np.int64)) + tokens = torch.from_numpy( + np_gen.integers( + self.vocab_size, size=[self.seq_length + 2 - self.num_image_embeddings_per_tile], dtype=np.int64 + ) + ) tokens[2] = IMAGE_TOKEN_INDEX # ImageToken token index labels = tokens.clone() images = torch.from_numpy(np_gen.random(size=[3, self.image_height, self.image_width], dtype=np.float32)) @@ -164,6 +196,33 @@ def _collate_fn(self, batch): """ collated_batch = data.dataloader.default_collate(batch) collated_batch["attention_mask"] = None + if self.packed_sequence: + from megatron.core.packed_seq_params import PackedSeqParams + + tokens = collated_batch["tokens"] + batch_size = tokens.shape[0] + valid_seqlen = self.seq_length + cu_seqlens = torch.arange( + 0, (batch_size + 1) * (valid_seqlen), step=(valid_seqlen), dtype=torch.int32, device=tokens.device + ) + cu_seqlens_padded = torch.arange( + 0, (batch_size + 1) * (valid_seqlen), step=(valid_seqlen), dtype=torch.int32, device=tokens.device + ) + qkv_format = 'thd' + packed_seq_params = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + cu_seqlens_q_padded=cu_seqlens_padded, + cu_seqlens_kv_padded=cu_seqlens_padded, + max_seqlen_q=valid_seqlen, + max_seqlen_kv=valid_seqlen, + qkv_format=qkv_format, + ) + collated_batch["packed_seq_params"] = packed_seq_params + + for key in ["tokens", "labels", "loss_mask", "position_ids"]: + collated_batch[key] = collated_batch[key].reshape(1, -1) + return collated_batch def collate_fn(self, batch): diff --git a/nemo/collections/vlm/neva/data/sequence_packing.py b/nemo/collections/vlm/neva/data/sequence_packing.py new file mode 100644 index 000000000000..1ddfe80c5797 --- /dev/null +++ b/nemo/collections/vlm/neva/data/sequence_packing.py 
@@ -0,0 +1,157 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import bisect +from typing import List + +import torch +import torch.nn.functional as F +from megatron.core.packed_seq_params import PackedSeqParams + + +# pylint:disable=line-too-long +# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L19 +# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0. +def search_for_fit(numbers: List[int], capacity: int) -> int: + """Finds the index of largest number that fits into the knapsack with the given capacity.""" + index = bisect.bisect(numbers, capacity) + return -1 if index == 0 else (index - 1) + + +# pylint: disable=line-too-long +# Based on https://github.com/hiyouga/LLaMA-Factory/blob/641d0dab08d96a93c34657742213d8994d9ed476/src/llamafactory/data/processors/processor_utils.py#L27 +# Copyright (c) 2024 LLaMA-Factory. Apache license 2.0. +def greedy_knapsack(item_sizes: List[int], samples: List, max_capacity: int) -> List: + """Greedy algorithm with binary search for the knapsack problem. + + Pack as many samples as possible given a maximum capacity and capacities of individual samples. + Used if sequence packing is enabled. + """ + assert len(item_sizes) == len(samples), "sample lengths and samples must have the same length." + + knapsacks = [] + + if len(item_sizes) == 0: + return knapsacks + + # Sort sample lengths and samples together. + sorted_item_sizes, sorted_samples = zip(*sorted(zip(item_sizes, samples), key=lambda x: x[0])) + sorted_item_sizes = list(sorted_item_sizes) + sorted_samples = list(sorted_samples) + + # Check if all samples fit in the knapsack capacity. + if sorted_item_sizes[-1] > max_capacity: + raise ValueError( + f"knapsack: A sample is larger {sorted_item_sizes[-1]} than the max_sequence_length {max_capacity}." + ) + + while sorted_item_sizes: + current_knapsack = [] + remaining_capacity = max_capacity + + while True: + idx = search_for_fit(sorted_item_sizes, remaining_capacity) + if idx == -1: + break # Can't fit more samples. + + remaining_capacity -= sorted_item_sizes[idx] + + sorted_item_sizes.pop(idx) + sample = sorted_samples.pop(idx) + current_knapsack.append(sample) + + knapsacks.append(current_knapsack) + + return knapsacks + + +def predict_seq_len(instance_tokens: torch.Tensor, num_image_embeddings_per_tile: int, media_token_index: int) -> int: + """ + Predict the effective sequence length, accounting for media embeddings. + + Args: + instance_tokens (torch.Tensor): Token tensor for a single instance. + num_image_embeddings_per_tile (int): Number of image embeddings per tile. + media_token_index (int): Token ID representing media. + + Returns: + int: Effective sequence length. 
+ """ + num_images = torch.sum(instance_tokens == media_token_index).item() + seqlen = len(instance_tokens) + (num_image_embeddings_per_tile - 1) * num_images + return seqlen + + +def convert_to_packed( + tokens: List[torch.Tensor], + labels: List[torch.Tensor], + num_image_embeddings_per_tile: int, + media_token_index: int, + ignore_index: int, + pad_to_multiple_of: int = 64, +): + """ + Convert tokens, labels, and associated inputs into a packed version with padded sequence parameters. + + Args: + tokens (list[torch.Tensor]): List of token tensors for each instance. + labels (list[torch.Tensor]): List of label tensors for each instance. + num_image_embeddings_per_tile (int): Number of image embeddings per tile. + media_token_index (int): Token ID representing media. + ignore_index (int): Value to use for padding labels. + pad_to_multiple_of (int): Sequence length will be padded to a multiple of this value. Default is 8. + """ + packed_tokens = [] + packed_labels = [] + packed_position_ids = [] + seqlens_padded = [] + cu_seqlens = [0] + cu_seqlens_padded = [0] + + for instance_tokens, instance_labels in zip(tokens, labels): + seqlen = predict_seq_len(instance_tokens, num_image_embeddings_per_tile, media_token_index) + seqlen_padded = (seqlen + pad_to_multiple_of - 1) // pad_to_multiple_of * pad_to_multiple_of + pad_len = seqlen_padded - seqlen + + if pad_len > 0: + instance_tokens = F.pad(instance_tokens, (0, pad_len), 'constant', 0) + instance_labels = F.pad(instance_labels, (0, pad_len), 'constant', ignore_index) + + packed_tokens.append(instance_tokens) + packed_labels.append(instance_labels) + packed_position_ids.append(torch.arange(len(instance_tokens), dtype=torch.int, device=instance_tokens.device)) + seqlens_padded.append(seqlen_padded) + cu_seqlens.append(cu_seqlens[-1] + seqlen) + cu_seqlens_padded.append(cu_seqlens_padded[-1] + seqlen_padded) + + packed_tokens = torch.cat(packed_tokens, dim=0).unsqueeze(0) + packed_labels = torch.cat(packed_labels, dim=0).unsqueeze(0) + packed_position_ids = torch.cat(packed_position_ids, dim=0).unsqueeze(0) + packed_loss_mask = torch.ones_like(packed_labels, dtype=torch.float, device=packed_labels.device) + packed_loss_mask[packed_labels < 0] = 0.0 + + cu_seqlens = torch.IntTensor(cu_seqlens) + cu_seqlens_padded = torch.IntTensor(cu_seqlens_padded) + + packed_seq_params = PackedSeqParams( + cu_seqlens_q=cu_seqlens, + cu_seqlens_kv=cu_seqlens, + cu_seqlens_q_padded=cu_seqlens_padded, + cu_seqlens_kv_padded=cu_seqlens_padded, + max_seqlen_q=int(max(seqlens_padded)), + max_seqlen_kv=int(max(seqlens_padded)), + qkv_format='thd', + ) + + return packed_tokens, packed_labels, packed_position_ids, packed_loss_mask, packed_seq_params diff --git a/nemo/collections/vlm/neva/model/base.py b/nemo/collections/vlm/neva/model/base.py index 388078484a56..8cead72b4832 100644 --- a/nemo/collections/vlm/neva/model/base.py +++ b/nemo/collections/vlm/neva/model/base.py @@ -121,14 +121,19 @@ def neva_data_step(dataloader_iter) -> Dict[str, torch.Tensor]: ) ) + packed_seq_params = _batch.get("packed_seq_params", None) _batch = { key: val.cuda(non_blocking=True) if key in required_keys and val is not None else None for key, val in _batch.items() } - # slice batch along sequence dimension for context parallelism - output = get_batch_on_this_context_parallel_rank(_batch) + if packed_seq_params is not None: + for attr in ["cu_seqlens_q", "cu_seqlens_kv", "cu_seqlens_q_padded", "cu_seqlens_kv_padded"]: + value = getattr(packed_seq_params, attr, None) + if value is not 
None: + setattr(packed_seq_params, attr, value.cuda(non_blocking=True)) + _batch["packed_seq_params"] = packed_seq_params - return output + return _batch def neva_forward_step(model, batch) -> torch.Tensor: @@ -596,6 +601,7 @@ def forward( image_token_index, num_image_tiles, attention_mask, + packed_seq_params, ) # [combined_seq_len, b, h_language], [b, combined_seq_len], [b, combined_seq_len] output = self.language_model( @@ -642,6 +648,7 @@ def _preprocess_data( image_token_index, num_image_tiles, attention_mask, + packed_seq_params, ): """Preprocess input data before input to language model. @@ -698,6 +705,8 @@ def _preprocess_data( labels.shape == loss_mask.shape ), f"mismatching labels shape {labels.shape} and loss mask shape {loss_mask.shape}" + packed_sequence = packed_seq_params is not None and packed_seq_params.qkv_format == "thd" + # Create indices for new text and label positions. with torch.no_grad(): image_token_mask = input_ids == image_token_index @@ -715,6 +724,16 @@ def _preprocess_data( # Pipeline parallel expects fixed input size. Check if we need to pad. if self._language_is_pipeline_parallel and max_seq_len < self._language_max_sequence_length: max_seq_len = self._language_max_sequence_length + if packed_sequence: + last_seqlen = packed_seq_params.cu_seqlens_q[-1] - packed_seq_params.cu_seqlens_q[-2] + last_seqlen_padded = max_seq_len - packed_seq_params.cu_seqlens_q_padded[-2] + assert ( + last_seqlen_padded >= last_seqlen + ), "`language_max_sequence_length` needs to increase for sequence packing to work properly." + packed_seq_params.cu_seqlens_q_padded[-1] = max_seq_len + packed_seq_params.cu_seqlens_kv_padded[-1] = max_seq_len + packed_seq_params.max_seqlen_q = max(last_seqlen_padded, packed_seq_params.max_seqlen_q) + packed_seq_params.max_seqlen_kv = max(last_seqlen_padded, packed_seq_params.max_seqlen_kv) if self.sequence_parallel_lm: if self.tp_comm_overlap_lm: @@ -835,7 +854,17 @@ def _preprocess_data( # Truncate if exceeding the language model's max sequence length. if final_embedding.shape[0] > self._language_max_sequence_length: final_embedding = final_embedding[: self._language_max_sequence_length] - if self.sequence_parallel_lm: + if packed_sequence: + truncate_len = packed_seq_params.cu_seqlens_q_padded[-1] - self._language_max_sequence_length + packed_seq_params.cu_seqlens_q_padded[-1] = self._language_max_sequence_length + packed_seq_params.cu_seqlens_kv_padded[-1] = self._language_max_sequence_length + packed_seq_params.cu_seqlens_q[-1] -= truncate_len + packed_seq_params.cu_seqlens_kv[-1] -= truncate_len + assert ( + packed_seq_params.cu_seqlens_q[-1] >= packed_seq_params.cu_seqlens_q[-2] + ), "with packed sequence, the truncation can only truncate on the last sequence." + + if self.sequence_parallel_lm and not packed_sequence: # Create an attention mask. This ensures correct computation. # This is done even when no padding was done as we set mask_type to # 'padding' or 'padding_causal' when using SP. 
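For reference, a minimal sketch of how the convert_to_packed helper added earlier in this patch produces the PackedSeqParams consumed by the hunks above (illustrative only, not part of the patch: the media token id, tile embedding count, and sample tensors are assumptions, and the import path is hypothetical since the new file's location is not shown in this excerpt):

import torch

# Hypothetical import: assumes the new packing module above is saved locally as packing.py.
from packing import convert_to_packed

tokens = [torch.tensor([1, 2, -200, 3]), torch.tensor([4, 5, 6])]        # -200 marks a media token
labels = [torch.tensor([-100, 2, -100, 3]), torch.tensor([-100, 5, 6])]  # -100 is the ignore index

packed_tokens, packed_labels, position_ids, loss_mask, seq_params = convert_to_packed(
    tokens,
    labels,
    num_image_embeddings_per_tile=576,  # e.g. CLIP ViT-L/14 at 336 px -> 24 x 24 patches
    media_token_index=-200,
    ignore_index=-100,
    pad_to_multiple_of=64,
)

# The first sample's effective length is 4 + (576 - 1) * 1 = 579 tokens once the media
# token is expanded, so with pad_to_multiple_of=64 the sequence boundaries come out as:
#   seq_params.cu_seqlens_q        -> [0, 579, 582]
#   seq_params.cu_seqlens_q_padded -> [0, 640, 704]
print(seq_params.cu_seqlens_q.tolist(), seq_params.cu_seqlens_q_padded.tolist())

These are the THD-format ('qkv_format="thd"') parameters that neva_data_step moves to the GPU in the hunk above, and whose padded/truncated last entries the _preprocess_data changes keep consistent with the language model's maximum sequence length.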
@@ -858,6 +887,7 @@ def _preprocess_data( # Attention mask True/False meaning flipped in 1.7.0 attention_mask = attention_mask < 0.5 + if self.sequence_parallel_lm: final_embedding = tensor_parallel.scatter_to_sequence_parallel_region(final_embedding) return final_embedding, final_labels, final_loss_mask, attention_mask diff --git a/nemo/collections/vlm/recipes/llava15_13b.py b/nemo/collections/vlm/recipes/llava15_13b.py index d85ba6f2752b..40bc8cc44682 100644 --- a/nemo/collections/vlm/recipes/llava15_13b.py +++ b/nemo/collections/vlm/recipes/llava15_13b.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from megatron.core.distributed import DistributedDataParallelConfig diff --git a/nemo/collections/vlm/recipes/llava15_7b.py b/nemo/collections/vlm/recipes/llava15_7b.py index 2abb50db6c11..9de60e671e38 100644 --- a/nemo/collections/vlm/recipes/llava15_7b.py +++ b/nemo/collections/vlm/recipes/llava15_7b.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from megatron.core.distributed import DistributedDataParallelConfig diff --git a/nemo/collections/vlm/recipes/llava_next_7b.py b/nemo/collections/vlm/recipes/llava_next_7b.py index d23159125823..53609fe589c8 100644 --- a/nemo/collections/vlm/recipes/llava_next_7b.py +++ b/nemo/collections/vlm/recipes/llava_next_7b.py @@ -15,8 +15,8 @@ from typing import Optional +import lightning.pytorch as pl import nemo_run as run -import pytorch_lightning as pl import torch from nemo import lightning as nl diff --git a/nemo/lightning/megatron_parallel.py b/nemo/lightning/megatron_parallel.py index 1b1f5c790b61..e3c6c77f4cda 100644 --- a/nemo/lightning/megatron_parallel.py +++ b/nemo/lightning/megatron_parallel.py @@ -1711,7 +1711,10 @@ def masked_token_loss(tensor: Tensor, mask: Tensor): """ losses = tensor.float() loss_mask = mask.view(-1).float() - loss = torch.sum(losses.view(-1) * loss_mask) / loss_mask.sum() # sequence level nll + num_valid_tokens = loss_mask.sum() + if num_valid_tokens < 0.5: # no valid tokens + num_valid_tokens += 1.0 + loss = torch.sum(losses.view(-1) * loss_mask) / num_valid_tokens # sequence level nll return loss diff --git a/requirements/requirements_multimodal.txt b/requirements/requirements_multimodal.txt index 92ae32659dac..585e277be72a 100644 --- a/requirements/requirements_multimodal.txt +++ b/requirements/requirements_multimodal.txt @@ -6,7 +6,7 @@ diffusers>=0.19.3 einops_exts imageio kornia -megatron-energon<3.0.0 +megatron-energon==4.0.0 nerfacc>=0.5.3 open_clip_torch==2.24.0 PyMCubes diff --git a/scripts/vlm/llava_next_finetune.py b/scripts/vlm/llava_next_finetune.py index 91df8a39452d..9d3e5053c0c1 100644 --- a/scripts/vlm/llava_next_finetune.py +++ b/scripts/vlm/llava_next_finetune.py @@ -25,8 +25,8 @@ import argparse import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl from nemo.collections import llm, vlm diff --git a/scripts/vlm/llava_next_pretrain.py b/scripts/vlm/llava_next_pretrain.py index 0beb9b5b08d0..19bdf47bb668 100644 --- a/scripts/vlm/llava_next_pretrain.py +++ b/scripts/vlm/llava_next_pretrain.py @@ -25,8 +25,8 @@ import argparse import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.optimizer import OptimizerConfig -from 
pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl from nemo.collections import llm, vlm diff --git a/scripts/vlm/mllama_finetune.py b/scripts/vlm/mllama_finetune.py index 15cd8078fd32..9e37d9c3fc0c 100644 --- a/scripts/vlm/mllama_finetune.py +++ b/scripts/vlm/mllama_finetune.py @@ -15,8 +15,8 @@ import argparse import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import WandbLogger from transformers import AutoProcessor from nemo import lightning as nl diff --git a/scripts/vlm/neva_finetune.py b/scripts/vlm/neva_finetune.py index 4069fb2d9278..3bf0084ea60d 100644 --- a/scripts/vlm/neva_finetune.py +++ b/scripts/vlm/neva_finetune.py @@ -21,11 +21,12 @@ import argparse import torch +from lightning.pytorch.loggers import WandbLogger from megatron.core.optimizer import OptimizerConfig -from pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl from nemo.collections import llm, vlm +from nemo.collections.multimodal.data.energon.task_encoder import MultiModalTaskEncoder from nemo.collections.vlm import ImageDataConfig from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback from nemo.lightning.pytorch.optim import CosineAnnealingScheduler @@ -42,6 +43,33 @@ def main(args): max_steps = args.max_steps decoder_seq_length = 4096 + if args.use_packed_sequence: + decoder_seq_length = 8192 + + # Submodules configurations + language_transformer_config = llm.Llama2Config7B( + seq_length=decoder_seq_length, + ) + vision_transformer_config = vlm.HFCLIPVisionConfig( + pretrained_model_name_or_path="openai/clip-vit-large-patch14-336" + ) + vision_projection_config = vlm.MultimodalProjectorConfig( + projector_type=args.projector_type, + input_size=vision_transformer_config.hidden_size, + hidden_size=language_transformer_config.hidden_size, + ffn_hidden_size=language_transformer_config.hidden_size, + ) + + # NEVA model configuration + neva_config = vlm.NevaConfig( + language_transformer_config=language_transformer_config, + vision_transformer_config=vision_transformer_config, + vision_projection_config=vision_projection_config, + language_model_from_pretrained=args.language_model_path, + freeze_language_model=False, + freeze_vision_model=True, + ) + num_image_embeddings_per_tile = vision_transformer_config.num_image_embeddings_per_tile if args.data_type == "llava": # Data configuration @@ -60,7 +88,50 @@ def main(args): micro_batch_size=mbs, tokenizer=None, image_processor=None, - num_workers=8, + num_workers=4, + packed_sequence=args.use_packed_sequence, + num_image_embeddings_per_tile=num_image_embeddings_per_tile, + ) + elif args.data_type == "energon": + from transformers import AutoProcessor + + from nemo.collections.multimodal.data.energon import ( + EnergonMultiModalDataModule, + ImageToken, + LLaVATemplateConfig, + MultiModalSampleConfig, + ) + + processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + tokenizer = processor.tokenizer + image_processor = processor.image_processor + + # Configure multimodal samples + config = MultiModalSampleConfig( + image_token=ImageToken(token_str="", token_id=-200), + ignore_place_holder=-100, + conversation_template_config=LLaVATemplateConfig(), + ) + + # Initialize the data module + data = EnergonMultiModalDataModule( + path=args.data_path, + tokenizer=tokenizer, + image_processor=image_processor, + seq_length=decoder_seq_length, + micro_batch_size=mbs, + 
global_batch_size=gbs, + num_workers=0, + multimodal_sample_config=config, + task_encoder=MultiModalTaskEncoder( + tokenizer=tokenizer, + image_processor=image_processor, + multimodal_sample_config=config, + packed_sequence=args.use_packed_sequence, + packed_sequence_size=decoder_seq_length, + num_image_embeddings_per_tile=num_image_embeddings_per_tile, + ), + packing_buffer_size=200 if args.use_packed_sequence else None, ) elif args.data_type == "mock": data = vlm.NevaMockDataModule( @@ -70,36 +141,11 @@ def main(args): tokenizer=None, image_processor=None, num_workers=4, + packed_sequence=args.use_packed_sequence, ) else: raise ValueError(f"Data type {args.data_type} not supported") - # Submodules configurations - language_transformer_config = llm.Llama2Config7B( - seq_length=decoder_seq_length, - ) - vision_transformer_config = vlm.HFCLIPVisionConfig( - pretrained_model_name_or_path="openai/clip-vit-large-patch14-336" - ) - vision_projection_config = vlm.MultimodalProjectorConfig( - projector_type=args.projector_type, - input_size=vision_transformer_config.hidden_size, - hidden_size=language_transformer_config.hidden_size, - ffn_hidden_size=language_transformer_config.hidden_size, - ) - - # NEVA model configuration - neva_config = vlm.NevaConfig( - language_transformer_config=language_transformer_config, - vision_transformer_config=vision_transformer_config, - vision_projection_config=vision_projection_config, - language_model_from_pretrained=args.language_model_path, - freeze_language_model=False, - freeze_vision_model=True, - ) - - model = vlm.NevaModel(neva_config, tokenizer=data.tokenizer) - from megatron.core.distributed import DistributedDataParallelConfig # Training strategy setup @@ -118,6 +164,8 @@ def main(args): ), ) + model = vlm.NevaModel(neva_config, tokenizer=data.tokenizer) + # Checkpoint callback setup checkpoint_callback = nl.ModelCheckpoint( save_last=True, @@ -231,6 +279,9 @@ def main(args): parser.add_argument("--gbs", type=int, required=False, default=128, help="Global batch size") parser.add_argument("--mbs", type=int, required=False, default=2, help="Micro batch size") parser.add_argument("--lr", type=float, required=False, default=2.0e-06, help="Learning rate") - + parser.add_argument( + "--use_packed_sequence", + action="store_true", + ) args = parser.parse_args() main(args) diff --git a/tests/collections/multimodal/data/energon/test_data_module.py b/tests/collections/multimodal/data/energon/test_data_module.py index c499ecfe9ca4..dff153388f31 100644 --- a/tests/collections/multimodal/data/energon/test_data_module.py +++ b/tests/collections/multimodal/data/energon/test_data_module.py @@ -21,7 +21,7 @@ import numpy as np import webdataset as wds -from megatron.energon.flavors import BaseWebdataset +from megatron.energon.flavors import BaseWebdatasetFactory from PIL import Image from transformers import AutoProcessor @@ -159,7 +159,7 @@ def create_vqa_test_dataset(self, path: Path, num_samples: int): ) total_shards = shard_writer.shard - BaseWebdataset.prepare_dataset( + BaseWebdatasetFactory.prepare_dataset( path, [f"data-{{0..{total_shards-1}}}.tar"], split_parts_ratio=[("train", 1.0), ("val", 1.0)], diff --git a/tests/collections/vlm/mllama_train.py b/tests/collections/vlm/test_mllama_train.py similarity index 100% rename from tests/collections/vlm/mllama_train.py rename to tests/collections/vlm/test_mllama_train.py diff --git a/tests/collections/vlm/neva_train.py b/tests/collections/vlm/test_neva_train.py similarity index 95% rename from 
tests/collections/vlm/neva_train.py rename to tests/collections/vlm/test_neva_train.py index f1ddf961cb10..e12ce27702c2 100644 --- a/tests/collections/vlm/neva_train.py +++ b/tests/collections/vlm/test_neva_train.py @@ -37,6 +37,10 @@ def get_args(): parser.add_argument( '--experiment-dir', type=str, default=None, help="directory to write results and checkpoints to" ) + parser.add_argument( + "--use_packed_sequence", + action="store_true", + ) return parser.parse_args() @@ -49,6 +53,8 @@ def get_args(): mbs = 2 seq_length = 576 decoder_seq_length = 1024 + if args.use_packed_sequence: + decoder_seq_length = 2048 data = vlm.NevaMockDataModule( seq_length=decoder_seq_length, @@ -57,6 +63,7 @@ def get_args(): tokenizer=None, image_processor=None, num_workers=2, + packed_sequence=args.use_packed_sequence, ) # Transformer configurations From 2db5dbbc304742b04814f2e6058b1984113a55f6 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Wed, 15 Jan 2025 22:03:24 -0800 Subject: [PATCH 09/27] Temp change: Flaky test optional --- .github/workflows/cicd-main.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index a815be7bdc2f..92cd1f8a6261 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -128,14 +128,15 @@ jobs: SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads - L0_Unit_Tests_GPU_LLM: + OPTIONAL_L0_Unit_Tests_GPU_LLM: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads + IS_OPTIONAL: true L0_Unit_Tests_GPU_Multimodal: needs: [cicd-test-container-setup] @@ -4966,7 +4967,7 @@ jobs: - L0_Unit_Tests_GPU_ASR - L0_Unit_Tests_GPU_Audio - L0_Unit_Tests_GPU_Common - - L0_Unit_Tests_GPU_LLM + #- OPTIONAL_L0_Unit_Tests_GPU_LLM - L0_Unit_Tests_GPU_Multimodal - L0_Unit_Tests_GPU_NLP - L0_Unit_Tests_GPU_TTS From fe2ae82fdb2db6c9bea52198641e2bc9d59a8768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Thu, 16 Jan 2025 16:31:16 +0100 Subject: [PATCH 10/27] Revert "Revert Mcore update since it caused regression (#11791)" (#11799) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Revert "Revert Mcore update since it caused regression (#11791)" This reverts commit 84b2bf0989a1cfde0258acb5804cd5bdcd357449. 
* Fix Gemma2 Attention init args (#11792) * Use _get_mlp_module_spec from Megatron Core rather than redefine locally (#11834) * Use _get_mlp_module_spec from MCore rather than redefine Signed-off-by: Jan Lasek * Apply isort and black reformatting Signed-off-by: janekl * Update nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py Co-authored-by: oliver könig Signed-off-by: Jan Lasek --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Co-authored-by: janekl Co-authored-by: oliver könig * Bugfix for output_generation_logits in tensorrtllm (#11820) (#11833) Signed-off-by: Abhishree Signed-off-by: Jan Lasek Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --------- Signed-off-by: Jan Lasek Signed-off-by: janekl Signed-off-by: Abhishree Co-authored-by: Ao Tang Co-authored-by: Jan Lasek Co-authored-by: janekl Co-authored-by: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> --- .github/workflows/cicd-main.yml | 5 +- .github/workflows/import-test.yml | 79 +++++++------------ Dockerfile.ci | 30 +++---- docs/source/nlp/information_retrieval.rst | 3 +- nemo/collections/diffusion/scripts/train.sh | 1 - nemo/collections/llm/gpt/model/gemma.py | 3 + nemo/collections/llm/recipes/gemma_2b.py | 2 - nemo/collections/llm/recipes/gemma_7b.py | 4 - .../megatron/gpt_layer_modelopt_spec.py | 39 +-------- .../language_modeling/megatron_base_model.py | 5 ++ .../language_modeling/megatron_retro_model.py | 3 + nemo/collections/vlm/mllama/model/language.py | 4 +- nemo/lightning/pytorch/callbacks/peft.py | 2 +- pyproject.toml | 3 - requirements/requirements_nlp.txt | 1 - .../convert_bert_hf_to_nemo.py | 3 +- .../bitexact/mixtral/pretrain_mini_mixtral.py | 2 + tests/collections/llm/bitexact/mixtral/run.sh | 4 +- .../llm/gpt/model/test_model_import.py | 5 ++ tests/collections/llm/hf/peft_nemorun.py | 1 - tests/collections/llm/hf/sft_nemorun.py | 1 - .../llm/megatron_mixtral_pretraining.py | 2 + tests/conftest.py | 14 ++++ tests/core/test_exp_manager.py | 4 +- tests/lightning/test_nemo_resume_from_ckpt.py | 10 +-- .../llama-3/nemo2-sft-peft/nemo2-peft.ipynb | 2 - .../llama-3/nemo2-sft-peft/nemo2-sft.ipynb | 2 - tutorials/llm/mamba/mamba.rst | 9 +-- 28 files changed, 97 insertions(+), 146 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 92cd1f8a6261..75b9e9e7befd 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -2938,7 +2938,7 @@ jobs: with: RUNNER: self-hosted-azure-gpus-2-h100 SCRIPT: | - CUDA_DEVICE_MAX_CONNECTIONS=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + CUDA_DEVICE_MAX_CONNECTIONS=1 python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ trainer.devices=2 \ trainer.log_every_n_steps=1 \ trainer.max_epochs=9999 \ @@ -2966,6 +2966,7 @@ jobs: +model.tp_comm_overlap_ag=False \ +model.tp_comm_overlap_rs=False \ +model.tp_comm_overlap_disable_qkv=True \ + +model.attention_backend="unfused" \ model.peft.peft_scheme="lora" \ model.peft.lora_tuning.adapter_dim=16 \ model.peft.lora_tuning.alpha=32 \ @@ -4368,7 +4369,7 @@ jobs: with: RUNNER: self-hosted-azure SCRIPT: | - NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python3 tests/collections/llm/megatron_mixtral_pretraining.py \ + python3 tests/collections/llm/megatron_mixtral_pretraining.py \ --experiment-dir=/tmp/mixtral_pretrain_results \ 
--data-path=/home/TestData/nlp/megatron_t5/data/pile_val_small_bert_tokenizer_text_document diff --git a/.github/workflows/import-test.yml b/.github/workflows/import-test.yml index 3af15294b2a2..47d4657dfe4f 100644 --- a/.github/workflows/import-test.yml +++ b/.github/workflows/import-test.yml @@ -1,73 +1,52 @@ name: CI-Import-Check on: - push: pull_request: paths: - "**" # Check https://hub.docker.com/r/pytorch/pytorch/tags for latest tags jobs: - - test-asr-imports: - runs-on: ubuntu-latest - container: - image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime + test-imports: + name: test-${{ matrix.collection }}-import-${{ matrix.os }}-py${{ matrix.python }} + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, macos-latest] + collection: + - asr + # - nlp # Currently broken + - tts + python: ['3.10', '3.11', '3.12'] steps: - name: Checkout repo uses: actions/checkout@v2 - - name: Update base dependencies - run: | - apt-get update && apt-get install -y build-essential - apt-get install -y libsndfile1 make - - name: Install nemo dependencies + - uses: actions/setup-python@v5 + with: + python-version: '${{ matrix.python }}' + - name: Build wheel id: nemo-wheel run: | - pip install Cython - # install test requirements - pip install -r requirements/requirements_test.txt # Build nemo as a wheel pip install build - python -m build --no-isolation --wheel + python -m build --wheel + # Preserve wheel location DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) - echo "::set-output name=DIST_FILE::${DIST_FILE}" - - name: Test ASR Domain Imports - run: | - # Install NeMo Domain - pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[asr]" - # Run import checks - python tests/core_ptl/check_imports.py --domain "asr" - # Uninstall NeMo - pip uninstall -y nemo_toolkit - test-tts-imports: - runs-on: ubuntu-latest - container: - image: pytorch/pytorch:2.4.0-cuda11.8-cudnn9-runtime - steps: - - name: Checkout repo - uses: actions/checkout@v2 - - name: Update base dependencies + echo "DIST_FILE=${DIST_FILE}" | tee -a "$GITHUB_OUTPUT" + + - name: Install NeMo + test dependencies run: | - apt-get update && apt-get install -y build-essential - apt-get install -y libsndfile1 make - - name: Install nemo dependencies - id: nemo-wheel - run: | - pip install Cython # install test requirements pip install -r requirements/requirements_test.txt - # Build nemo as a wheel - pip install build - python -m build --no-isolation --wheel - # Preserve wheel location - DIST_FILE=$(find ./dist -name "*.whl" | head -n 1) - echo "::set-output name=DIST_FILE::${DIST_FILE}" - - name: Test TTS Domain Imports - run: | + # Install NeMo Domain - pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[tts]" + pip install "${{ steps.nemo-wheel.outputs.DIST_FILE }}[test,${{ matrix.collection }}]" + + - name: Run ${{ matrix.collection }} checks + run: | # Run import checks - python tests/core_ptl/check_imports.py --domain "tts" - # Uninstall NeMo - pip uninstall -y nemo_toolkit + python tests/core_ptl/check_imports.py --domain "${{ matrix.collection }}" + + \ No newline at end of file diff --git a/Dockerfile.ci b/Dockerfile.ci index 3d2f0c76b6ea..f7e637442158 100644 --- a/Dockerfile.ci +++ b/Dockerfile.ci @@ -34,17 +34,12 @@ EOF WORKDIR /workspace # Install Mamba Dependancy -ARG CAUSAL_CONV_TAG=v1.2.2.post1 +ARG CAUSAL_CONV_TAG=v1.2.2.post1 +ARG MAMBA_TAG=v2.2.0 RUN <<"EOF" bash -ex # Mamba dependancy installation - -git clone --depth 1 --branch ${CAUSAL_CONV_TAG} 
https://github.com/Dao-AILab/causal-conv1d && \ - cd causal-conv1d && \ - python setup.py install && \ - cd .. && \ - rm -rf causal-conv1d - +MAMBA_FORCE_BUILD=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE pip3 install --no-cache-dir -v git+https://github.com/Dao-AILab/causal-conv1d.git@${CAUSAL_CONV_TAG} git+https://github.com/state-spaces/mamba.git@${MAMBA_TAG} EOF RUN pip install hatchling # needed to install nemo-run @@ -54,8 +49,6 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T # Install NeMo requirements ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea ARG MODELOPT_VERSION=0.21.0 -ARG MCORE_TAG=bd677bfb13ac2f19deaa927adc6da6f9201d66aa - ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c RUN \ --mount=type=bind,source=requirements,target=requirements \ @@ -65,7 +58,6 @@ RUN \ --mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \ "transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \ -"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \ "nvidia-modelopt[torch]~=${MODELOPT_VERSION}" \ "apex @ git+https://github.com/NVIDIA/apex.git@${APEX_TAG}" \ "unstructured==0.14.9" \ @@ -73,15 +65,15 @@ pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.n "onnxscript @ git+https://github.com/microsoft/onnxscript" \ -r tools/ctc_segmentation/requirements.txt \ ".[all]" +EOF -# Megatron Core installation -git clone https://github.com/NVIDIA/Megatron-LM.git && \ -pushd Megatron-LM && \ -git checkout ${MCORE_TAG} && \ - pushd megatron/core/datasets && \ - make && \ - popd && \ -popd +ARG MCORE_TAG=4dc8977167d71f86bdec47a60a98e85c4cfa0031 +RUN <<"EOF" bash -ex +# Megatron-LM installation +git clone https://github.com/NVIDIA/Megatron-LM.git +pushd Megatron-LM +git checkout ${MCORE_TAG} +pip install -e . 
export PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM" # Install nvidia-resiliency-ext diff --git a/docs/source/nlp/information_retrieval.rst b/docs/source/nlp/information_retrieval.rst index 26732283e8f4..69f1c3219093 100644 --- a/docs/source/nlp/information_retrieval.rst +++ b/docs/source/nlp/information_retrieval.rst @@ -70,9 +70,7 @@ Then you can fine-tune the sentence-BERT model using the following script: VALIDATION_DATASET_PATH= # Path to validation dataset SAVE_DIR= # where the checkpoint and logs are saved mkdir -p $SAVE_DIR - export NVTE_FLASH_ATTN=0 export NVTE_ALLOW_NONDETERMINISTIC_ALGO=0 - export NVTE_FUSED_ATTN=0 python NeMo/examples/nlp/information_retrieval/megatron_bert_embedding_finetuning.py \ --config-path=${CONFIG_PATH} \ @@ -87,6 +85,7 @@ Then you can fine-tune the sentence-BERT model using the following script: model.post_process=False \ model.global_batch_size=8 \ # should be NUM_DEVICES * model.micro_batch_size model.micro_batch_size=8 \ + model.attention_backend="unfused" \ model.optim.lr=0.000005 \ model.optim.sched.min_lr=0.00000001 \ model.optim.sched.warmup_steps=100 \ diff --git a/nemo/collections/diffusion/scripts/train.sh b/nemo/collections/diffusion/scripts/train.sh index 2150458e9376..ced479e32526 100644 --- a/nemo/collections/diffusion/scripts/train.sh +++ b/nemo/collections/diffusion/scripts/train.sh @@ -20,7 +20,6 @@ export WANDB_PROJECT=xxx export WANDB_RUN_ID=xxx export WANDB_RESUME=allow -export NVTE_FUSED_ATTN=0 export CUDA_DEVICE_MAX_CONNECTIONS=1 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True diff --git a/nemo/collections/llm/gpt/model/gemma.py b/nemo/collections/llm/gpt/model/gemma.py index bf828bb66277..4d8d541deaa8 100644 --- a/nemo/collections/llm/gpt/model/gemma.py +++ b/nemo/collections/llm/gpt/model/gemma.py @@ -18,6 +18,7 @@ import torch from megatron.core import parallel_state +from megatron.core.transformer.enums import AttnBackend from torch import nn from nemo.collections.llm.fn.activation import openai_gelu @@ -53,6 +54,8 @@ class GemmaConfig(GPTConfig): # Legacy NeMo does not set layernorm_zero_centered_gamma and instead adds 1 in the HF -> NeMo conversion script # The present implementation is more in line with the official implementation layernorm_zero_centered_gamma: bool = True + # Disable cuDNN attention since TE 1.8 does not support head dim > 128 + attention_backend: AttnBackend = AttnBackend.flash @dataclass diff --git a/nemo/collections/llm/recipes/gemma_2b.py b/nemo/collections/llm/recipes/gemma_2b.py index 3b43bbdb0e62..64af8192929c 100644 --- a/nemo/collections/llm/recipes/gemma_2b.py +++ b/nemo/collections/llm/recipes/gemma_2b.py @@ -51,8 +51,6 @@ def model() -> run.Config[pl.LightningModule]: >>> model_config = model() >>> print(model_config) """ - # Disable cuDNN attention since TE 1.8 does not support head dim > 128 - os.environ['NVTE_FUSED_ATTN'] = "0" return run.Config(GemmaModel, config=run.Config(GemmaConfig2B)) diff --git a/nemo/collections/llm/recipes/gemma_7b.py b/nemo/collections/llm/recipes/gemma_7b.py index 40e43bda4d5e..2ac3419d6587 100644 --- a/nemo/collections/llm/recipes/gemma_7b.py +++ b/nemo/collections/llm/recipes/gemma_7b.py @@ -51,8 +51,6 @@ def model() -> run.Config[pl.LightningModule]: >>> model_config = model() >>> print(model_config) """ - # Disable cuDNN attention since TE 1.8 does not support head dim > 128 - os.environ['NVTE_FUSED_ATTN'] = "0" return run.Config(GemmaModel, config=run.Config(GemmaConfig7B)) @@ -173,8 +171,6 @@ def pretrain_recipe( For more details on pre-training 
LLMs with NeMo, see the pre-training guide in the `examples/llm/pretrain/` directory. """ - # Disable cuDNN attention since TE 1.8 does not support head dim > 128 - os.environ['NVTE_FUSED_ATTN'] = "0" return run.Partial( fn, diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py index 514ef62a9ff3..aa68273a414a 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_layer_modelopt_spec.py @@ -17,13 +17,11 @@ try: from megatron.core.extensions.transformer_engine import TEDotProductAttention, TENorm from megatron.core.fusions.fused_bias_dropout import get_bias_dropout_add + from megatron.core.models.gpt.gpt_layer_specs import _get_mlp_module_spec from megatron.core.tensor_parallel.layers import ColumnParallelLinear, RowParallelLinear from megatron.core.transformer.attention import SelfAttention, SelfAttentionSubmodules from megatron.core.transformer.enums import AttnMaskType from megatron.core.transformer.identity_op import IdentityOp - from megatron.core.transformer.mlp import MLP, MLPSubmodules - from megatron.core.transformer.moe.moe_layer import MoELayer, MoESubmodules - from megatron.core.transformer.moe.shared_experts import SharedExpertMLP from megatron.core.transformer.spec_utils import ModuleSpec from megatron.core.transformer.transformer_layer import TransformerLayer, TransformerLayerSubmodules @@ -57,7 +55,8 @@ def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec if not HAVE_MEGATRON_CORE: raise IMPORT_ERROR - mlp = _get_mlp_module_spec(num_experts=num_experts) + mlp = _get_mlp_module_spec(use_te=False, num_experts=num_experts, moe_grouped_gemm=False) + return ModuleSpec( module=TransformerLayer, submodules=TransformerLayerSubmodules( @@ -84,35 +83,3 @@ def get_gpt_layer_modelopt_spec(num_experts: Optional[int] = None) -> ModuleSpec }, ), ) - - -# Helper function to get module spec for MLP/MoE -def _get_mlp_module_spec(num_experts: Optional[int] = None) -> ModuleSpec: - if num_experts is None: - # Dense MLP w/ or w/o TE modules. - return ModuleSpec( - module=MLP, - submodules=MLPSubmodules( - linear_fc1=ColumnParallelLinear, - linear_fc2=RowParallelLinear, - ), - ) - else: - # Mixture of experts with modules in megatron core. 
- return ModuleSpec( - module=MoELayer, - submodules=MoESubmodules( - experts=MLPSubmodules( - linear_fc1=ColumnParallelLinear, - linear_fc2=RowParallelLinear, - ), - shared_experts=ModuleSpec( - module=SharedExpertMLP, - params={"gate": False}, - submodules=MLPSubmodules( - linear_fc1=ColumnParallelLinear, - linear_fc2=RowParallelLinear, - ), - ), - ), - ) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py index cf13a0318ffc..122c86614311 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_model.py @@ -50,6 +50,7 @@ try: from megatron.core import ModelParallelConfig, parallel_state from megatron.core.distributed import DistributedDataParallel as McoreDDP + from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal @@ -538,6 +539,9 @@ def build_transformer_config(self) -> TransformerConfig: tp_only_amax_red = self.cfg.get('tp_only_amax_red', False) + attention_backend = self.cfg.get('attention_backend', "auto") + attention_backend = AttnBackend[attention_backend] + # any configs that are not in the nemo model config will be added here config_mapping = { 'apply_query_key_layer_scaling': apply_query_key_layer_scaling, @@ -562,6 +566,7 @@ def build_transformer_config(self) -> TransformerConfig: 'rotary_interleaved': rotary_interleaved, 'deallocate_pipeline_outputs': True, 'tp_only_amax_red': tp_only_amax_red, + 'attention_backend': attention_backend, } # populate the transformer config dict diff --git a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py index 493d512fd30e..b3fd7b11c6eb 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_retro_model.py @@ -76,6 +76,7 @@ from megatron.core.models.retro.utils import get_config_path as get_retro_config_path from megatron.core.models.retro.utils import get_gpt_data_dir as get_retro_data_dir from megatron.core.pipeline_parallel.schedules import get_forward_backward_func + from megatron.core.transformer.enums import AttnBackend from megatron.core.transformer.module import Float16Module as MCoreFloat16Module from megatron.core.transformer.transformer_config import TransformerConfig from megatron.core.utils import init_method_normal, scaled_init_method_normal @@ -431,6 +432,8 @@ def build_retro_config(self) -> RetroConfig: te_version = packaging.version.Version(version("transformer-engine")) if te_version >= packaging.version.Version("1.3"): + if HAVE_MEGATRON_CORE: + retro_config.attention_backend = AttnBackend.unfused try: os.environ["NVTE_FLASH_ATTN"] = "0" os.environ["NVTE_FUSED_ATTN"] = "0" diff --git a/nemo/collections/vlm/mllama/model/language.py b/nemo/collections/vlm/mllama/model/language.py index bec3ec526f6e..3edc6706defb 100644 --- a/nemo/collections/vlm/mllama/model/language.py +++ b/nemo/collections/vlm/mllama/model/language.py @@ -390,7 +390,7 @@ def sharded_state_dict( layer_prefix = f'{prefix}layers.' 
num_layers = self.config.num_layers for layer in self.layers: - offset = layer._get_layer_offset() + offset = layer._get_layer_offset(layer.config) global_layer_offset = layer.layer_number - 1 # self.layer_number starts at 1 state_dict_prefix = f'{layer_prefix}{global_layer_offset - offset}.' # module list index in TransformerBlock # pylint: disable=line-too-long sharded_prefix = layer_prefix @@ -403,7 +403,7 @@ def sharded_state_dict( for xlayer in self.xattn_layers: if isinstance(xlayer, DummyCrossAttentionTransformerLayer): continue - offset = xlayer._get_layer_offset() + offset = xlayer._get_layer_offset(xlayer.config) global_layer_offset = xlayer.layer_number - 1 state_dict_prefix = f'{xlayer_prefix}{global_layer_offset - offset}.' # module list index in TransformerBlock # pylint: disable=line-too-long sharded_prefix = f'{xlayer_prefix}{global_layer_offset}.' diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index a71d6792d457..399b4e9e5293 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -480,7 +480,7 @@ def load_checkpoint( if getattr(path, "base_model_path", None): ## PEFT Resume, FIRST TIME self.adapter_ckpt_path = Path(str(path)) - adapter_ckpt = self.checkpoint_io.load_checkpoint(path) # Loads only metadata + adapter_ckpt = self.checkpoint_io.load_checkpoint(path, sharded_state_dict={}) # Loads only metadata # path is adapter path to restore the training metadata, but switch to loading base model here. path = self.model_ckpt_path = path.base_model_path elif adapter_meta_path.exists(): diff --git a/pyproject.toml b/pyproject.toml index bdddfef27dc6..af5555f9d0dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -60,9 +60,6 @@ classifiers = [ "Topic :: Utilities", ] -[tool.setuptools.dynamic] -dependencies = { file = ["requirements/requirements.txt"] } - [tool.setuptools] py-modules = ["nemo"] diff --git a/requirements/requirements_nlp.txt b/requirements/requirements_nlp.txt index d35b649a46ba..6a0ae8adf66c 100644 --- a/requirements/requirements_nlp.txt +++ b/requirements/requirements_nlp.txt @@ -8,7 +8,6 @@ gdown h5py ijson jieba -mamba-ssm==2.2.2; sys_platform == 'linux' markdown2 matplotlib>=3.3.2 #megatron_core>0.6.0 # add back once mcore on pypi is compatible again diff --git a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py index 14baca53f165..8265da57f656 100644 --- a/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py +++ b/scripts/checkpoint_converters/convert_bert_hf_to_nemo.py @@ -84,6 +84,8 @@ def convert(args): nemo_config.model = adjust_nemo_config(nemo_config.model, hf_model.config.to_dict(), mcore_bert=args.mcore) nemo_config.trainer["precision"] = args.precision + # Bert doesn't support FLASH_ATTN + nemo_config.model["attention_backend"] = "fused" trainer = MegatronTrainerBuilder(nemo_config).create_trainer() model = MegatronBertModel(nemo_config.model, trainer) @@ -288,6 +290,5 @@ def convert(args): if __name__ == '__main__': - os.environ['NVTE_FLASH_ATTN'] = '0' # Bert doesn't support FLASH_ATTN args = get_args() convert(args) diff --git a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py index b4f95879bad5..654a2a9e05a8 100644 --- a/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py +++ b/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.py @@ -17,6 +17,7 @@ import torch from 
megatron.core.distributed import DistributedDataParallelConfig as McoreDDPConfig +from megatron.core.transformer.enums import AttnBackend from megatron.core.utils import init_method_normal, scaled_init_method_normal from nemo.collections.llm import MixtralConfig8x7B, MixtralModel, PreTrainingDataModule @@ -102,6 +103,7 @@ def main(args): bias_dropout_fusion=True, apply_rope_fusion=True, distribute_saved_activations=False, + attention_backend=AttnBackend.unfused, ) data = PreTrainingDataModule( diff --git a/tests/collections/llm/bitexact/mixtral/run.sh b/tests/collections/llm/bitexact/mixtral/run.sh index 87bf7c382b99..0f6612b3d21b 100644 --- a/tests/collections/llm/bitexact/mixtral/run.sh +++ b/tests/collections/llm/bitexact/mixtral/run.sh @@ -8,7 +8,7 @@ MCORE_OUTPUT_PATH="/tmp/bex_mixtral_mcore_output/" NEMO_OUTPUT_PATH="/tmp/bex_mixtral_nemo_output/" # Run Mcore -CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 \ +CUDA_DEVICE_MAX_CONNECTIONS=1 CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 \ torchrun --nproc-per-node 1 --nnodes 1 /workspace/Megatron-LM/pretrain_gpt.py \ --apply-layernorm-1p --rotary-percent 1.0 --rotary-base 1000000 \ --no-position-embedding --position-embedding-type rope \ @@ -30,7 +30,7 @@ torchrun --nproc-per-node 1 --nnodes 1 /workspace/Megatron-LM/pretrain_gpt.py \ --split 99,1,0 --log-interval 10 --save-interval 20000 --eval-interval 1000 --eval-iters 32 \ --save "$MCORE_OUTPUT_PATH" \ --log-num-zeros-in-grad --distributed-timeout-minutes 6000 --moe-router-topk 1 --num-experts 2 \ - --moe-router-pre-softmax --expert-model-parallel-size 1 --eval-iters=0 + --moe-router-pre-softmax --expert-model-parallel-size 1 --eval-iters=0 --attention-backend unfused # Run NeMo CUDA_LAUNCH_BLOCKING=1 TORCH_COMPILE_DISABLE=1 NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 \ diff --git a/tests/collections/llm/gpt/model/test_model_import.py b/tests/collections/llm/gpt/model/test_model_import.py index 9edc235e454f..b49885718837 100644 --- a/tests/collections/llm/gpt/model/test_model_import.py +++ b/tests/collections/llm/gpt/model/test_model_import.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import os + import torch torch.set_grad_enabled(False) @@ -95,5 +97,8 @@ def import_from_hf(config_name, hf_path): if __name__ == '__main__': for config_name, hf_id in config_name_to_hf_id.items(): + for env_var in ['NVTE_FLASH_ATTN', 'NVTE_FUSED_ATTN', 'NVTE_UNFUSED_ATTN']: + if env_var in os.environ: + del os.environ[env_var] src = f'hf:///home/TestData/nemo2_ckpt/{config_name}' import_from_hf(config_name, src) diff --git a/tests/collections/llm/hf/peft_nemorun.py b/tests/collections/llm/hf/peft_nemorun.py index ef34d4d39a11..3a135b2346be 100644 --- a/tests/collections/llm/hf/peft_nemorun.py +++ b/tests/collections/llm/hf/peft_nemorun.py @@ -28,7 +28,6 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecut "NCCL_NVLS_ENABLE": "0", "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/hf/sft_nemorun.py b/tests/collections/llm/hf/sft_nemorun.py index a3daa66ca774..b559c04f6cbd 100644 --- a/tests/collections/llm/hf/sft_nemorun.py +++ b/tests/collections/llm/hf/sft_nemorun.py @@ -29,7 +29,6 @@ def local_executor_torchrun(nodes: int = 1, devices: int = 2) -> run.LocalExecut "NCCL_NVLS_ENABLE": "0", "NVTE_DP_AMAX_REDUCE_INTERVAL": "0", "NVTE_ASYNC_AMAX_REDUCTION": "1", - "NVTE_FUSED_ATTN": "0", } executor = run.LocalExecutor(ntasks_per_node=devices, launcher="torchrun", env_vars=env_vars) diff --git a/tests/collections/llm/megatron_mixtral_pretraining.py b/tests/collections/llm/megatron_mixtral_pretraining.py index 4123c7b37987..2a7b1fdfdad6 100644 --- a/tests/collections/llm/megatron_mixtral_pretraining.py +++ b/tests/collections/llm/megatron_mixtral_pretraining.py @@ -18,6 +18,7 @@ import torch from megatron.core.distributed import DistributedDataParallelConfig as McoreDDPConfig +from megatron.core.transformer.enums import AttnBackend from nemo.collections.llm import MixtralConfig8x3B, MixtralModel, PreTrainingDataModule from nemo.collections.llm.api import train @@ -117,6 +118,7 @@ def main(args): bf16=True, params_dtype=torch.bfloat16, pipeline_dtype=torch.bfloat16, + attention_backend=AttnBackend.unfused, ) mixtral_config.overlap_param_gather_with_optimizer_step = True diff --git a/tests/conftest.py b/tests/conftest.py index 118e978e63c7..989c937ab499 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import logging +import os import os.path import shutil import tarfile @@ -122,6 +123,19 @@ def reset_singletons(): Singleton._Singleton__instances = {} +@pytest.fixture(autouse=True) +def reset_env_vars(): + # Store the original environment variables before the test + original_env = dict(os.environ) + + # Run the test + yield + + # After the test, restore the original environment + os.environ.clear() + os.environ.update(original_env) + + @pytest.fixture(scope="session") def test_data_dir(): """ diff --git a/tests/core/test_exp_manager.py b/tests/core/test_exp_manager.py index 32d401b2051f..9dbdaa66a25e 100644 --- a/tests/core/test_exp_manager.py +++ b/tests/core/test_exp_manager.py @@ -280,7 +280,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path): assert Path(tmp_path).exists() assert Path(tmp_path / "test_no_name" / "default" / "957").exists() - monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION) + monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION, raising=False) # Checks that use_datetime_version False toggle works test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) log_dir = exp_manager(test_trainer, {"exp_dir": str(tmp_path / "test_no_name"), "use_datetime_version": False}) @@ -288,7 +288,7 @@ def test_log_dir_overrides(self, monkeypatch, tmp_path): assert Path(tmp_path).exists() assert Path(tmp_path / "test_no_name" / "default" / "version_0").exists() - monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION) + monkeypatch.delenv(NEMO_ENV_VARNAME_VERSION, raising=False) # Checks that use_datetime_version False toggle works and version increments test_trainer = pl.Trainer(accelerator='cpu', enable_checkpointing=False, logger=False) log_dir = exp_manager(test_trainer, {"exp_dir": str(tmp_path / "test_no_name"), "use_datetime_version": False}) diff --git a/tests/lightning/test_nemo_resume_from_ckpt.py b/tests/lightning/test_nemo_resume_from_ckpt.py index e876e6965000..37ea326ad621 100644 --- a/tests/lightning/test_nemo_resume_from_ckpt.py +++ b/tests/lightning/test_nemo_resume_from_ckpt.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import os +from typing import List, Optional import pytest def set_env(): - os.environ['NVTE_FLASH_ATTN'] = '0' - os.environ['NVTE_FUSED_ATTN'] = '0' os.environ['NVTE_APPLY_QK_LAYER_SCALING'] = '0' @@ -28,6 +27,7 @@ def set_env(): import pytest import torch from megatron.core.optimizer import OptimizerConfig +from megatron.core.transformer.enums import AttnBackend import nemo.lightning as nl from nemo.collections import llm @@ -68,7 +68,8 @@ def load_dcp(ckpt_dir, torch_tensor=True): return state_dict -def compare_ckpts(a, b, path=[]): +def compare_ckpts(a, b, path: Optional[List[str]] = None): + path = path if path is not None else [] if isinstance(a, dict): assert isinstance(b, dict) assert set(a.keys()) == set(b.keys()) @@ -125,6 +126,7 @@ def setup_model_optim(log_dir, n_steps, tokenizer, gbs=2, mbs=1): make_vocab_size_divisible_by=128, normalization='RMSNorm', masked_softmax_fusion=False, + attention_backend=AttnBackend.local, ) model = llm.GPTModel(gpt_config, tokenizer=tokenizer) @@ -269,8 +271,6 @@ def train(n_steps, resume): trainer._teardown() set_env() - assert os.environ['NVTE_FLASH_ATTN'] == '0' - assert os.environ['NVTE_FUSED_ATTN'] == '0' assert os.environ['NVTE_APPLY_QK_LAYER_SCALING'] == '0' # Train for 40 steps diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb index 730ffd9ff972..c983b277e72a 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-peft.ipynb @@ -341,7 +341,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", @@ -457,7 +456,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", diff --git a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb index e84ff916fc4e..0bb4367d50e9 100644 --- a/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb +++ b/tutorials/llm/llama-3/nemo2-sft-peft/nemo2-sft.ipynb @@ -482,7 +482,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", @@ -565,7 +564,6 @@ " \"NCCL_NVLS_ENABLE\": \"0\",\n", " \"NVTE_DP_AMAX_REDUCE_INTERVAL\": \"0\",\n", " \"NVTE_ASYNC_AMAX_REDUCTION\": \"1\",\n", - " \"NVTE_FUSED_ATTN\": \"0\",\n", " }\n", "\n", " executor = run.LocalExecutor(ntasks_per_node=devices, launcher=\"torchrun\", env_vars=env_vars)\n", diff --git a/tutorials/llm/mamba/mamba.rst b/tutorials/llm/mamba/mamba.rst index 197825c27d58..7f5e901659a4 100644 --- a/tutorials/llm/mamba/mamba.rst +++ b/tutorials/llm/mamba/mamba.rst @@ -103,9 +103,6 @@ Run Fine-Tuning CONFIG_NAME="megatron_mamba_finetuning_config" SAVE_DIR= - export NVTE_FUSED_ATTN=1 - export NVTE_FLASH_ATTN=0 - torchrun --nproc_per_node=${NUM_DEVICES} \ /opt/NeMo/examples/nlp/language_modeling/tuning/megatron_mamba_finetuning.py \ --config-path=${CONFIG_PATH} \ @@ -129,6 +126,7 @@ Run Fine-Tuning 
model.peft.peft_scheme='none' \ model.megatron_amp_O2=True \ model.encoder_seq_length=${SEQ_LEN} \ + model.attention_backend='fused' \ model.data.validation_ds.pad_to_max_length=True \ model.data.train_ds.pad_to_max_length=True \ model.optim.name="distributed_fused_adam" \ @@ -162,10 +160,6 @@ Evaluating the Fine-Tuned Model CONFIG_NAME="megatron_mamba_finetuning_config" SAVE_DIR= - export NVTE_FUSED_ATTN=1 - export NVTE_FLASH_ATTN=0 - - CONFIG_PATH="/opt/NeMo/examples/nlp/language_modeling/tuning/conf/" CONFIG_NAME="megatron_mamba_generate_config" @@ -185,6 +179,7 @@ Evaluating the Fine-Tuned Model exp_manager.exp_dir=${SAVE_DIR} \ exp_manager.resume_if_exists=False \ exp_manager.create_wandb_logger=False \ + model.attention_backend='fused' \ model.megatron_amp_O2=True \ model.peft.restore_from_path=False \ +model.peft.restore_from_ckpt.checkpoint_dir=False \ From a6116fa05caf689c20c282f42218295ae261dfb4 Mon Sep 17 00:00:00 2001 From: "L.B." Date: Thu, 16 Jan 2025 13:26:49 -0500 Subject: [PATCH 11/27] Latest News updated for Cosmos (#11806) * Latest News updated for Cosmos Signed-off-by: Lawrence Lane * Moved Gen AI Models news to LLM section Signed-off-by: Lawrence Lane * Cleanup of news items Signed-off-by: Lawrence Lane * Added getting started section for Cosmos Signed-off-by: Lawrence Lane * Moved getting started section for Cosmos Signed-off-by: Lawrence Lane * remove unneeded section Signed-off-by: Lawrence Lane * remove unneeded section Signed-off-by: Lawrence Lane * added updated get started with cosmos Signed-off-by: Lawrence Lane --------- Signed-off-by: Lawrence Lane Co-authored-by: Pablo Garay --- README.md | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 195b8293babd..5c33d1da735f 100644 --- a/README.md +++ b/README.md @@ -15,12 +15,39 @@
NeMo 2.0 We've released NeMo 2.0, an update on the NeMo Framework which prioritizes modularity and ease-of-use. Please refer to the NeMo Framework User Guide to get started. +
+
+ New Cosmos World Foundation Models Support +
+ Advancing Physical AI with NVIDIA Cosmos World Foundation Model Platform (2025-01-09) + + The end-to-end NVIDIA Cosmos platform accelerates world model development for physical AI systems. Built on CUDA, Cosmos combines state-of-the-art world foundation models, video tokenizers, and AI-accelerated data processing pipelines. Developers can accelerate world model development by fine-tuning Cosmos world foundation models or building new ones from the ground up. These models create realistic synthetic videos of environments and interactions, providing a scalable foundation for training complex systems, from simulating humanoid robots performing advanced actions to developing end-to-end autonomous driving models. +

-
- +
+ + + Accelerate Custom Video Foundation Model Pipelines with New NVIDIA NeMo Framework Capabilities + (2025-01-07) + + The NeMo Framework now supports training and customizing the NVIDIA Cosmos collection of world foundation models. Cosmos leverages advanced text-to-world generation techniques to create fluid, coherent video content from natural language prompts. +

+ You can also now accelerate your video processing step using the NeMo Curator library, which provides optimized video processing and captioning features that can deliver up to 89x faster video processing when compared to an unoptimized CPU pipeline. +

+
+
Large Language Models and Multimodal Models -
+
+ + + State-of-the-Art Multimodal Generative AI Model Development with NVIDIA NeMo + (2024-11-06) + + NVIDIA recently announced significant enhancements to the NeMo platform, focusing on multimodal generative AI models. The update includes NeMo Curator and the Cosmos tokenizer, which streamline the data curation process and enhance the quality of visual data. These tools are designed to handle large-scale data efficiently, making it easier to develop high-quality AI models for various applications, including robotics and autonomous driving. The Cosmos tokenizers, in particular, efficiently map visual data into compact, semantic tokens, which is crucial for training large-scale generative models. The tokenizer is available now on the NVIDIA/cosmos-tokenizer GitHub repo and on Hugging Face. +

+
+
New Llama 3.1 Support @@ -81,7 +108,6 @@

-
Speech Recognition
@@ -163,6 +189,10 @@ Overall, these enhancements make NeMo 2.0 a powerful, scalable, and user-friendl - For an in-depth exploration of the main features of NeMo 2.0, see the [Feature Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/features/index.html#feature-guide). - To transition from NeMo 1.0 to 2.0, see the [Migration Guide](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/migration/index.html#migration-guide) for step-by-step instructions. +### Get Started with Cosmos + +NeMo Curator and NeMo Framework support video curation and post-training of the Cosmos World Foundation Models, which are open and available on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/cosmos/collections/cosmos) and [Hugging Face](https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6). For more information on video datasets, refer to [NeMo Curator](https://developer.nvidia.com/nemo-curator). To post-train World Foundation Models using the NeMo Framework for your custom physical AI tasks, see the [Cosmos Diffusion models](https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/diffusion/nemo/post_training/README.md) and the [Cosmos Autoregressive models](https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/autoregressive/nemo/post_training/README.md). + ## LLMs and MMs Training, Alignment, and Customization All NeMo models are trained with From 4c5f0510fc5ae5a384d67749abbd3f57db317a96 Mon Sep 17 00:00:00 2001 From: "Peter St. John" Date: Thu, 16 Jan 2025 12:00:32 -0700 Subject: [PATCH 12/27] Removes tensorstore 0.1.45 pin from requirements_deploy.txt (#11858) Signed-off-by: Peter St. John --- requirements/requirements_deploy.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements/requirements_deploy.txt b/requirements/requirements_deploy.txt index 5380398c278b..a65b651a76c8 100644 --- a/requirements/requirements_deploy.txt +++ b/requirements/requirements_deploy.txt @@ -1,6 +1,6 @@ fastapi nvidia-pytriton pydantic-settings -tensorstore==0.1.45 +tensorstore uvicorn zarr From 7167e5e8176c2651114546e088e8fc78e2888213 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Fri, 17 Jan 2025 15:17:45 +0100 Subject: [PATCH 13/27] ci: Prune dangling images (#11885) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/_test_template.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml index 911fcc17e636..87bf71ff5c4e 100644 --- a/.github/workflows/_test_template.yml +++ b/.github/workflows/_test_template.yml @@ -47,7 +47,7 @@ jobs: steps: - name: Docker system cleanup run: | - docker system prune -a --filter "until=24h" --force || true + docker system prune -af --filter "until=24h" --force || true - name: Docker pull image run: | From 8786345739f4aa7dceeb881c20d7362dc4602d75 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 17 Jan 2025 08:58:13 -0800 Subject: [PATCH 14/27] Disable tests that download datasets from web (#11878) * disable tests that download datasets from web Signed-off-by: Alexandros Koumparoulis * re-enable llm tests Signed-off-by: Alexandros Koumparoulis * Update cicd-main.yml Signed-off-by: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: Alexandros Koumparoulis 
<153118171+akoumpa@users.noreply.github.com> --- .github/workflows/cicd-main.yml | 7 +++---- tests/collections/llm/test_mnist_model_nemo2.py | 1 + tests/collections/llm/test_mnist_model_nemo2_fsdp.py | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 75b9e9e7befd..3f7c5e8c3933 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -128,15 +128,14 @@ jobs: SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/collections/common -m "not pleasefixme" --with_downloads - OPTIONAL_L0_Unit_Tests_GPU_LLM: + L0_Unit_Tests_GPU_LLM: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml - if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'OPTIONAL_L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' + if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L0_Unit_Tests_GPU_LLM') || needs.cicd-test-container-setup.outputs.all == 'true' with: RUNNER: self-hosted-azure SCRIPT: | NEMO_NUMBA_MINVER=0.53 pytest tests/collections/llm -m "not pleasefixme" --with_downloads - IS_OPTIONAL: true L0_Unit_Tests_GPU_Multimodal: needs: [cicd-test-container-setup] @@ -4968,7 +4967,7 @@ jobs: - L0_Unit_Tests_GPU_ASR - L0_Unit_Tests_GPU_Audio - L0_Unit_Tests_GPU_Common - #- OPTIONAL_L0_Unit_Tests_GPU_LLM + - L0_Unit_Tests_GPU_LLM - L0_Unit_Tests_GPU_Multimodal - L0_Unit_Tests_GPU_NLP - L0_Unit_Tests_GPU_TTS diff --git a/tests/collections/llm/test_mnist_model_nemo2.py b/tests/collections/llm/test_mnist_model_nemo2.py index 92cffc2a35bb..06afcd0fc0e0 100644 --- a/tests/collections/llm/test_mnist_model_nemo2.py +++ b/tests/collections/llm/test_mnist_model_nemo2.py @@ -480,6 +480,7 @@ def reset_megatron_parallel_state() -> Iterator[None]: @pytest.mark.run_only_on("GPU") @pytest.mark.integration +@pytest.mark.pleasefixme def test_train_mnist_litautoencoder_with_megatron_strategy_single_gpu(): path = os.path.abspath(__file__) call = f"python {path}" diff --git a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py index 9418ee7e5e90..68f2dc726306 100644 --- a/tests/collections/llm/test_mnist_model_nemo2_fsdp.py +++ b/tests/collections/llm/test_mnist_model_nemo2_fsdp.py @@ -502,6 +502,7 @@ def reset_megatron_parallel_state() -> Iterator[None]: @pytest.mark.run_only_on("GPU") @pytest.mark.integration +@pytest.mark.pleasefixme def test_train_mnist_litautoencoder_with_fsdp_strategy_single_gpu(): path = os.path.abspath(__file__) call = f"python {path}" From 0cd990d97f01ab946fc53108ed43c7be7140a0d8 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Fri, 17 Jan 2025 20:04:06 +0200 Subject: [PATCH 15/27] fix checkpoint load issue (#11859) * fix checkpoint load issue Signed-off-by: Dmytro Pykhtar * Apply isort and black reformatting Signed-off-by: dimapihtar * set weights_only to False Signed-off-by: dimapihtar --------- Signed-off-by: Dmytro Pykhtar Signed-off-by: dimapihtar Signed-off-by: dimapihtar Co-authored-by: dimapihtar --- nemo/collections/llm/gpt/model/ssm.py | 2 +- .../text_to_image/controlnet/controlnet.py | 2 +- .../text_to_image/imagen/imagen_pipeline.py | 2 +- .../instruct_pix2pix/ldm/ddpm_edit.py | 17 +++++++++++++---- .../stable_diffusion/ldm/autoencoder.py | 6 +++--- .../text_to_image/stable_diffusion/ldm/ddpm.py | 4 ++-- .../diffusionmodules/openaimodel.py | 2 +- .../speech_llm/models/modular_models.py | 
4 ++-- .../speech_llm/parts/mixins/adapter_mixin.py | 2 +- .../parts/mixins/multimodal_adapter_mixins.py | 2 +- .../nlp/parts/mixins/nlp_adapter_mixins.py | 2 +- 11 files changed, 27 insertions(+), 18 deletions(-) diff --git a/nemo/collections/llm/gpt/model/ssm.py b/nemo/collections/llm/gpt/model/ssm.py index f4190114042e..09681648cb73 100644 --- a/nemo/collections/llm/gpt/model/ssm.py +++ b/nemo/collections/llm/gpt/model/ssm.py @@ -120,7 +120,7 @@ def init(self) -> GPTModel: def apply(self, output_path: Path) -> Path: - source = torch.load(str(self), map_location='cpu') + source = torch.load(str(self), map_location='cpu', weights_only=False) if 'model' in source: source = source['model'] diff --git a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py index 981600fcc3a1..0e48305c4b1f 100644 --- a/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py +++ b/nemo/collections/multimodal/models/text_to_image/controlnet/controlnet.py @@ -547,7 +547,7 @@ def load_from_unet(self, from_pretrained_unet, from_NeMo=True): else: print("Loading unet blocks from sd") - state_dict = torch.load(from_pretrained_unet, map_location='cpu') + state_dict = torch.load(from_pretrained_unet, map_location='cpu', weights_only=False) if 'state_dict' in state_dict.keys(): state_dict = state_dict['state_dict'] model_state_dict = self.state_dict() diff --git a/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py b/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py index 63963321fcf7..f6ae5829c907 100644 --- a/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py +++ b/nemo/collections/multimodal/models/text_to_image/imagen/imagen_pipeline.py @@ -83,7 +83,7 @@ def _load_model(model_ckpt: str, model_cfg: str, eval_mode: bool = True, trainer model_cfg.model.micro_batch_size = 1 model_cfg.model.global_batch_size = 1 model = MegatronImagen(cfg=model_cfg.model, trainer=trainer) - checkpoint = torch.load(model_ckpt, map_location=lambda storage, loc: storage) + checkpoint = torch.load(model_ckpt, map_location=lambda storage, loc: storage, weights_only=False) # Change weight keys if training using TorchInductor state_dict = checkpoint['state_dict'] diff --git a/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py b/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py index 9bb490fb8fc8..3ced0cbad87a 100644 --- a/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py +++ b/nemo/collections/multimodal/models/text_to_image/instruct_pix2pix/ldm/ddpm_edit.py @@ -41,9 +41,15 @@ class LatentDiffusionEdit(LatentDiffusion): def init_from_ckpt( - self, path, ignore_keys=list(), only_model=False, load_vae=True, load_unet=True, load_encoder=True, + self, + path, + ignore_keys=list(), + only_model=False, + load_vae=True, + load_unet=True, + load_encoder=True, ): - pl_sd = torch.load(path, map_location="cpu") + pl_sd = torch.load(path, map_location="cpu", weights_only=False) if "state_dict" in list(pl_sd.keys()): pl_sd = pl_sd["state_dict"] sd = {} @@ -144,7 +150,7 @@ def model_provider_func(self, pre_process=True, post_process=True): return model def setup(self, stage=None): - """ PTL hook that is executed after DDP spawns. + """PTL hook that is executed after DDP spawns. We setup datasets here as megatron datasets require DDP to instantiate. 
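The change repeated throughout this patch is the explicit weights_only=False argument to torch.load. A minimal sketch of the loading pattern follows, assuming a trusted checkpoint that pickles config objects alongside tensors; the helper name is invented for illustration:

    import torch

    def load_full_checkpoint(path: str, map_location: str = "cpu") -> dict:
        # Recent PyTorch releases default to weights_only=True, which refuses to
        # unpickle arbitrary Python objects (hyperparameters, config entries).
        # weights_only=False keeps the legacy behaviour for trusted files.
        ckpt = torch.load(path, map_location=map_location, weights_only=False)
        # Many checkpoints nest the actual tensors under a 'state_dict' key.
        return ckpt.get("state_dict", ckpt)
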
See https://pytorch-lightning.readthedocs.io/en/latest/common/lightning_module.html#setup for more information. Args: @@ -260,5 +266,8 @@ def build_pretraining_data_loader(self, dataset, consumed_samples, drop_last=Tru # Torch dataloader. return torch.utils.data.DataLoader( - dataset, batch_sampler=batch_sampler, num_workers=self._cfg.data.num_workers, pin_memory=True, + dataset, + batch_sampler=batch_sampler, + num_workers=self._cfg.data.num_workers, + pin_memory=True, ) diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py index 311ebc0f06f5..50ca205d9acd 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/autoencoder.py @@ -88,7 +88,7 @@ def ema_scope(self, context=None): print(f"{context}: Restored training weights") def init_from_ckpt(self, path, ignore_keys=list()): - sd = torch.load(path, map_location="cpu")["state_dict"] + sd = torch.load(path, map_location="cpu", weights_only=False)["state_dict"] keys = list(sd.keys()) for k in keys: for ik in ignore_keys: @@ -345,7 +345,7 @@ def __init__( state_dict = load_safetensors(from_pretrained) else: - state_dict = torch.load(from_pretrained) + state_dict = torch.load(from_pretrained, weights_only=False) if 'state_dict' in state_dict: state_dict = state_dict['state_dict'] missing_key, unexpected_key, _, _ = self._load_pretrained_model(state_dict, from_NeMo=from_NeMo) @@ -476,7 +476,7 @@ def load(module: torch.nn.Module, prefix=""): return error_msgs def init_from_ckpt(self, path, ignore_keys=list()): - sd = torch.load(path, map_location="cpu")["state_dict"] + sd = torch.load(path, map_location="cpu", weights_only=False)["state_dict"] keys = list(sd.keys()) for k in keys: for ik in ignore_keys: diff --git a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py index 163b2fb27e0f..80184baa53fa 100644 --- a/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py +++ b/nemo/collections/multimodal/models/text_to_image/stable_diffusion/ldm/ddpm.py @@ -246,7 +246,7 @@ def init_from_ckpt( load_unet=True, load_encoder=True, ): - pl_sd = torch.load(path, map_location="cpu") + pl_sd = torch.load(path, map_location="cpu", weights_only=False) if "state_dict" in list(pl_sd.keys()): pl_sd = pl_sd["state_dict"] @@ -2340,7 +2340,7 @@ def _modify_state_dict(state_dict): if filepath.endswith('.nemo'): conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location) elif filepath.endswith('.ckpt'): - state_dict = torch.load(filepath, map_location)['state_dict'] + state_dict = torch.load(filepath, map_location, weights_only=False)['state_dict'] else: raise RuntimeError(f"{filepath} is not nemo file or ckpt file") if not peft_cfgs: diff --git a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py index 528048b04950..b6f57b259af3 100644 --- a/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py +++ b/nemo/collections/multimodal/modules/stable_diffusion/diffusionmodules/openaimodel.py @@ -959,7 +959,7 @@ def __init__( state_dict = load_safetensors(from_pretrained) else: - state_dict = torch.load(from_pretrained, 
map_location='cpu') + state_dict = torch.load(from_pretrained, map_location='cpu', weights_only=False) if 'state_dict' in state_dict.keys(): state_dict = state_dict['state_dict'] missing_key, unexpected_keys, _, _ = self._load_pretrained_model(state_dict, from_NeMo=from_NeMo) diff --git a/nemo/collections/multimodal/speech_llm/models/modular_models.py b/nemo/collections/multimodal/speech_llm/models/modular_models.py index a9ee87e9a9de..8517400ee6ef 100644 --- a/nemo/collections/multimodal/speech_llm/models/modular_models.py +++ b/nemo/collections/multimodal/speech_llm/models/modular_models.py @@ -1077,7 +1077,7 @@ def load_adapters_for_inference(cls, cfg: DictConfig, model_cfg: DictConfig, mod peft_cfg_cls = PEFT_CONFIG_MAP[model_cfg.peft.peft_scheme] model.load_adapters(cfg.model.peft.restore_from_path, peft_cfg_cls(model_cfg), map_location="cpu") else: - torch_state_dict = torch.load(cfg.model.peft.restore_from_path)['state_dict'] + torch_state_dict = torch.load(cfg.model.peft.restore_from_path, weights_only=False)['state_dict'] model.load_state_dict(torch_state_dict, strict=False) elif cfg.model.peft.restore_from_ckpt.checkpoint_dir and cfg.model.peft.restore_from_ckpt.checkpoint_name: checkpoint_path = os.path.join( @@ -1096,7 +1096,7 @@ def load_adapters_for_inference(cls, cfg: DictConfig, model_cfg: DictConfig, mod peft_cfg_cls = PEFT_CONFIG_MAP[cfg.model.peft.peft_scheme] model.load_adapters(checkpoint_path, peft_cfgs=peft_cfg_cls(model_cfg), map_location="cpu") else: - model.load_state_dict(torch.load(checkpoint_path), strict=False) + model.load_state_dict(torch.load(checkpoint_path, weights_only=False), strict=False) else: raise NotImplementedError("distributed checkpointing of PEFT weights is not supported") elif model_cfg.peft.get("peft_scheme", None): diff --git a/nemo/collections/multimodal/speech_llm/parts/mixins/adapter_mixin.py b/nemo/collections/multimodal/speech_llm/parts/mixins/adapter_mixin.py index 4cdce4ac59c4..506adbc30d73 100644 --- a/nemo/collections/multimodal/speech_llm/parts/mixins/adapter_mixin.py +++ b/nemo/collections/multimodal/speech_llm/parts/mixins/adapter_mixin.py @@ -55,7 +55,7 @@ def load_adapters( if filepath.endswith('.nemo'): conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location) elif filepath.endswith('.ckpt'): - state_dict = torch.load(filepath, map_location)['state_dict'] + state_dict = torch.load(filepath, map_location, weights_only=False)['state_dict'] else: raise RuntimeError(f"{filepath} is not nemo file or ckpt file") if not peft_cfgs: diff --git a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py index 00552cb7f96e..b9485f3f2dc5 100644 --- a/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/multimodal_adapter_mixins.py @@ -130,7 +130,7 @@ def load_adapters( sharded_state_dict = self.sharded_state_dict(prefix="model.") conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location, sharded_state_dict) elif filepath.endswith('.ckpt'): - state_dict = torch.load(filepath, map_location)['state_dict'] + state_dict = torch.load(filepath, map_location, weights_only=False)['state_dict'] else: raise RuntimeError(f"{filepath} is not nemo file or ckpt file") if not self.ptuning_only_and_non_first_stage: diff --git a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py index 8f7870b7d4c7..080db1fa4254 100644 --- 
a/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py +++ b/nemo/collections/nlp/parts/mixins/nlp_adapter_mixins.py @@ -368,7 +368,7 @@ def load_adapters( if filepath.endswith('.nemo'): conf, state_dict = self._get_config_and_state_dict_from_nemo(filepath, map_location) elif filepath.endswith('.ckpt'): - state_dict = torch.load(filepath, map_location)['state_dict'] + state_dict = torch.load(filepath, map_location, weights_only=False)['state_dict'] else: raise RuntimeError(f"{filepath} is not nemo file or ckpt file") if not peft_cfgs: From ca4e4f0d7ce9f11be7bb79d8dba42ee53b7991ad Mon Sep 17 00:00:00 2001 From: Abhishree Thittenamane <47577437+athitten@users.noreply.github.com> Date: Fri, 17 Jan 2025 11:49:43 -0800 Subject: [PATCH 16/27] Add context_logits for eval accuracy calculation in case of multi token prediction tasks (#11753) * Add server ready check before evaluation Uses bool generation_logits_available as inputs dict does not contain it Signed-off-by: Abhishree * Add context logits Signed-off-by: Abhishree * Remove max_tokens_to_generate and add more comments Signed-off-by: Abhishree * Apply isort and black reformatting Signed-off-by: athitten * Get context_logits for multi token prediction tasks Signed-off-by: Abhishree * Fix bug with single/multi token condition check Signed-off-by: Abhishree * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Apply isort and black reformatting Signed-off-by: athitten * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Bugfix with output_context_logits Signed-off-by: Abhishree --------- Signed-off-by: Abhishree Signed-off-by: athitten Co-authored-by: athitten Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- nemo/collections/llm/api.py | 13 +++--- nemo/collections/llm/deploy/base.py | 2 + nemo/collections/llm/evaluation/base.py | 61 +++++++++++++++++++------ nemo/deploy/nlp/query_llm.py | 6 +++ nemo/export/tensorrt_llm.py | 18 +++++++- nemo/export/trt_llm/tensorrt_llm_run.py | 3 ++ nemo/export/vllm_exporter.py | 5 ++ requirements/requirements_eval.txt | 2 + 8 files changed, 88 insertions(+), 22 deletions(-) create mode 100644 requirements/requirements_eval.txt diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 386b08cc7813..83201e78283d 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -337,6 +337,7 @@ def deploy( max_input_len: int = 256, max_output_len: int = 256, max_batch_size: int = 8, + output_context_logits: bool = True, output_generation_logits: bool = True, ): """ @@ -364,8 +365,11 @@ def deploy( Needs to be True to be able to run evaluation. Default: True. openai_format_response (bool): Return the response from PyTriton server in OpenAI compatible format. Needs to be True while running evaluation. Default: True. + output_context_logits (bool): If True builds trtllm engine with gather_context_logits set to True. Default: True. + context_logits are used to compute the logProb of the output token in case of multi token prediction benchmarks. output_generation_logits (bool): If True builds trtllm engine with gather_generation_logits set to True. - generation_logits are used to compute the logProb of the output token. Default: True. + generation_logits are used to compute the logProb of the output token in case of single token prediction + benchmarks (like MMLU, lambada). Default: True. 
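A usage sketch based on the signature added in this hunk; the checkpoint path is a placeholder, the import assumes the entrypoint is called directly from nemo.collections.llm.api, and the remaining arguments keep their defaults:

    from nemo.collections.llm.api import deploy

    deploy(
        nemo_checkpoint="/checkpoints/llama_nemo2",  # placeholder path
        model_type="llama",
        triton_model_name="triton_model",
        # context logits serve multi-token benchmarks, generation logits serve
        # single-token benchmarks such as MMLU or LAMBADA
        output_context_logits=True,
        output_generation_logits=True,
    )
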
""" from nemo.collections.llm.deploy.base import get_trtllm_deployable, unset_environment_variables from nemo.deploy import DeployPyTriton @@ -383,6 +387,7 @@ def deploy( max_output_len, max_batch_size, dtype, + output_context_logits, output_generation_logits, ) @@ -425,7 +430,6 @@ def evaluate( limit: Optional[Union[int, float]] = None, bootstrap_iters: int = 100000, # inference params - max_tokens_to_generate: Optional[int] = 256, temperature: Optional[float] = 0.000000001, top_p: Optional[float] = 0.0, top_k: Optional[int] = 1, @@ -454,7 +458,6 @@ def evaluate( bootstrap_iters (int): Number of iterations for bootstrap statistics, used when calculating stderrs. Set to 0 for no stderr calculations to be performed. Default: 100000. # inference params - max_tokens_to_generate (int): max tokens to generate. Default: 256. temperature: Optional[float]: float value between 0 and 1. temp of 0 indicates greedy decoding, where the token with highest prob is chosen. Temperature can't be set to 0.0 currently, due to a bug with TRTLLM (# TODO to be investigated). Hence using a very samll value as the default. Default: 0.000000001. @@ -480,9 +483,7 @@ def evaluate( # Wait for server to be ready before starting evaluation evaluation.wait_for_server_ready(url=url, triton_http_port=triton_http_port, model_name=model_name) # Create an object of the NeMoFWLM which is passed as a model to evaluator.simple_evaluate - model = evaluation.NeMoFWLMEval( - model_name, url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos - ) + model = evaluation.NeMoFWLMEval(model_name, url, tokenizer, temperature, top_p, top_k, add_bos) results = evaluator.simple_evaluate( model=model, tasks=eval_task, diff --git a/nemo/collections/llm/deploy/base.py b/nemo/collections/llm/deploy/base.py index 4b0065271604..fd82c94effb3 100644 --- a/nemo/collections/llm/deploy/base.py +++ b/nemo/collections/llm/deploy/base.py @@ -63,6 +63,7 @@ def get_trtllm_deployable( max_output_len, max_batch_size, dtype, + output_context_logits, output_generation_logits, ): """ @@ -109,6 +110,7 @@ def get_trtllm_deployable( max_output_len=max_output_len, max_batch_size=max_batch_size, dtype=dtype, + gather_context_logits=output_context_logits, gather_generation_logits=output_generation_logits, ) except Exception as error: diff --git a/nemo/collections/llm/evaluation/base.py b/nemo/collections/llm/evaluation/base.py index aa415cb1022a..96d99b445433 100644 --- a/nemo/collections/llm/evaluation/base.py +++ b/nemo/collections/llm/evaluation/base.py @@ -33,38 +33,51 @@ class NeMoFWLMEval(LM): Created based on: https://github.com/EleutherAI/lm-evaluation-harness/blob/v0.4.4/docs/model_guide.md """ - def __init__(self, model_name, api_url, tokenizer, max_tokens_to_generate, temperature, top_p, top_k, add_bos): + def __init__(self, model_name, api_url, tokenizer, temperature, top_p, top_k, add_bos): self.model_name = model_name self.api_url = api_url self.tokenizer = tokenizer - self.max_tokens_to_generate = max_tokens_to_generate self.temperature = temperature self.top_p = top_p self.top_k = top_k self.add_bos = add_bos super().__init__() - def _generate_tokens_logits(self, payload, return_text: bool = False, return_logits: bool = False): + def _generate_tokens_logits( + self, payload, single_prediction_token, return_text: bool = False, return_logits: bool = False + ): """ A private method that sends post request to the model on PyTriton server and returns either generated text or logits. 
""" nq = NemoQueryLLM(url=self.api_url, model_name=payload['model']) + output_context_logits = False + output_generation_logits = False + if single_prediction_token: + # In case of single token prediction return the generation logits + output_generation_logits = True + else: + # In case of multiple token prediction return the context logits + output_context_logits = True response = nq.query_llm( prompts=payload['prompt'] if isinstance(payload['prompt'], list) else [payload['prompt']], max_output_len=payload['max_tokens'], top_k=payload['top_k'], top_p=payload['top_p'], temperature=payload['temperature'], - output_generation_logits=True, + output_context_logits=output_context_logits, + output_generation_logits=output_generation_logits, openai_format_response=True, ) if return_text: return response["choices"][0]["text"] # shape[batch_size, 1] - if return_logits: - return response["choices"][0]["generation_logits"] # shape[batch_size, 1, num_tokens, vocab_size] + elif return_logits: + if output_context_logits: + return response["choices"][0]["context_logits"] + else: + return response["choices"][0]["generation_logits"] def tokenizer_type(self, tokenizer): """ @@ -93,6 +106,16 @@ def loglikelihood(self, requests: list[Instance]): elif tokenizer_type == "AutoTokenizer": special_tokens_kwargs['add_special_tokens'] = self.add_bos + single_prediction_token = False + # Assuming evaluating on only one benchmark/task at a time, hence all instances in requests are of the same + # task. + mmlu_regex_pattern = r"^mmlu_" + lambada_regex_pattern = r"^lambada_" + if re.match(mmlu_regex_pattern, requests[0].task_name) or re.match( + lambada_regex_pattern, requests[0].task_name + ): + single_prediction_token = True + results = [] for request in tqdm(requests): # get the input prompt from the request @@ -105,31 +128,39 @@ def loglikelihood(self, requests: list[Instance]): if self.tokenizer_type(self.tokenizer) == "SentencePieceTokenizer": continuation_enc = continuation_enc[1:] num_cont_tokens = len(continuation_enc) - # Update self.max_tokens_to_generate with number of continuation tokens (or output tokens) in the request - self.max_tokens_to_generate = num_cont_tokens + # Hard code max_tokens_to_generate to 1 to always generate just 1 token + self.max_tokens_to_generate = 1 + # Delete the last token from continuation before passing it to the ip prompt by replacing with empty string + prompt = context + continuation.replace(self.tokenizer.tokenizer.decode(continuation_enc[-1]), "") # Create payload to query the model deployed on PyTriton server payload = { "model": self.model_name, - "prompt": context, + "prompt": prompt, "max_tokens": self.max_tokens_to_generate, "temperature": self.temperature, "top_p": self.top_p, "top_k": self.top_k, } # Get the logits from the model - generation_logits = self._generate_tokens_logits(payload, return_logits=True) - # Convert generation_logits to torch tensor to easily get logprobs wo manual implementation of log_softmax - multi_logits = F.log_softmax(torch.tensor(generation_logits[0]), dim=-1) + logits = self._generate_tokens_logits(payload, single_prediction_token, return_logits=True) + # In case of multiple token prediction where full context logits are returned, get only logits + # corresponding to the continuation tokens from the context logits tensor.context_logits contains logits + # for all tokens in the ip prompt along with the logit for the next token prediction after the final token + # in the prompt. 
Shape of context_logits: [1, #tokens_in_prompt+1, vocab_size] + if not single_prediction_token: + logits = logits[:, -num_cont_tokens:, :] + # Convert logits to torch tensor to easily get logprobs wo manual implementation of log_softmax + logProbs = F.log_softmax(torch.tensor(logits), dim=-1) # Convert encoded continuation tokens to torch tensor cont_toks = torch.tensor(continuation_enc, dtype=torch.long).unsqueeze(0) # Get the greedy token from the logits (i.e token with the highest prob) - greedy_tokens = multi_logits.argmax(dim=-1) + greedy_tokens = logProbs.argmax(dim=-1) # Check if all greedy_tokens match the the actual continuation tokens is_greedy = (greedy_tokens == cont_toks).all() # Get the logits corresponding to the actual continuation tokens - logits = torch.gather(multi_logits, 2, cont_toks.unsqueeze(-1)).squeeze(-1) + logProbs_actual = torch.gather(logProbs, 2, cont_toks.unsqueeze(-1)).squeeze(-1) # result is tuple of logProb of generating the continuation token and is_greedy - result = (float(logits.sum()), bool(is_greedy)) + result = (float(logProbs_actual.sum()), bool(is_greedy)) results.append(result) diff --git a/nemo/deploy/nlp/query_llm.py b/nemo/deploy/nlp/query_llm.py index 8b65a278ff41..93443d47a6a8 100644 --- a/nemo/deploy/nlp/query_llm.py +++ b/nemo/deploy/nlp/query_llm.py @@ -198,6 +198,7 @@ def query_llm( end_strings=None, init_timeout=60.0, openai_format_response: bool = False, + output_context_logits: bool = False, output_generation_logits: bool = False, ): """ @@ -275,6 +276,9 @@ def query_llm( if end_strings is not None: inputs["end_strings"] = str_list2numpy(end_strings) + if output_context_logits is not None: + inputs["output_context_logits"] = np.full(prompts.shape, output_context_logits, dtype=np.bool_) + if output_generation_logits is not None: inputs["output_generation_logits"] = np.full(prompts.shape, output_generation_logits, dtype=np.bool_) @@ -301,6 +305,8 @@ def query_llm( } if output_generation_logits: openai_response["choices"][0]["generation_logits"] = result_dict["generation_logits"] + if output_context_logits: + openai_response["choices"][0]["context_logits"] = result_dict["context_logits"] return openai_response else: return sentences diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py index f2bb9d36b377..192b8bc86f65 100644 --- a/nemo/export/tensorrt_llm.py +++ b/nemo/export/tensorrt_llm.py @@ -959,6 +959,7 @@ def forward( prompt_embeddings_checkpoint_path: str = None, streaming: bool = False, output_log_probs: bool = False, + output_context_logits: bool = False, output_generation_logits: bool = False, **sampling_kwargs, ): @@ -1049,6 +1050,7 @@ def forward( no_repeat_ngram_size=no_repeat_ngram_size, output_log_probs=output_log_probs, multiprocessed_env=multiprocessed_env, + output_context_logits=output_context_logits, output_generation_logits=output_generation_logits, **sampling_kwargs, ) @@ -1133,6 +1135,7 @@ def get_triton_input(self): Tensor(name="no_repeat_ngram_size", shape=(-1,), dtype=np.single, optional=True), Tensor(name="task_id", shape=(-1,), dtype=bytes, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), + Tensor(name="output_context_logits", shape=(-1,), dtype=np.bool_, optional=False), Tensor(name="output_generation_logits", shape=(-1,), dtype=np.bool_, optional=False), ) return inputs @@ -1142,6 +1145,7 @@ def get_triton_output(self): outputs = ( Tensor(name="outputs", shape=(-1,), dtype=bytes), Tensor(name="generation_logits", shape=(-1,), dtype=np.single), + 
Tensor(name="context_logits", shape=(-1,), dtype=np.single), ) return outputs @@ -1149,6 +1153,7 @@ def get_triton_output(self): def triton_infer_fn(self, **inputs: np.ndarray): """Triton infer function for streaming""" output_dict = {} + context_logits_available = False generation_logits_available = False try: infer_input = {"input_texts": str_ndarray2list(inputs.pop("prompts"))} @@ -1179,10 +1184,21 @@ def triton_infer_fn(self, **inputs: np.ndarray): if "output_generation_logits" in inputs: generation_logits_available = inputs["output_generation_logits"][0][0] infer_input["output_generation_logits"] = inputs.pop("output_generation_logits")[0][0] + if "output_context_logits" in inputs: + context_logits_available = inputs["output_context_logits"][0][0] + infer_input["output_context_logits"] = inputs.pop("output_context_logits")[0][0] if generation_logits_available: output_texts, generation_logits = self.forward(**infer_input) - output_dict["generation_logits"] = np.array(generation_logits.cpu().numpy()) + # generation_logits is a 4d tensor of dim [1,1,#generated_tokens, vocab_size], return just the 3d tensor + # in output dict. + output_dict["generation_logits"] = np.array(generation_logits[0].cpu().numpy()) + elif context_logits_available: + output_texts, context_logits = self.forward(**infer_input) + # convert context logits to 3d tensor from list since its avaiable as a list of tensor shaped + # [#tokens, vocab_size] + context_logits = context_logits[0].unsqueeze(0) + output_dict["context_logits"] = np.array(context_logits.cpu().numpy()) else: output_texts = self.forward(**infer_input) output_dict["outputs"] = cast_output(output_texts, np.bytes_) diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py index ef67c918290f..8be537f840e8 100644 --- a/nemo/export/trt_llm/tensorrt_llm_run.py +++ b/nemo/export/trt_llm/tensorrt_llm_run.py @@ -647,6 +647,7 @@ def generate( streaming: bool = False, output_log_probs=False, multiprocessed_env=False, + output_context_logits=False, output_generation_logits=False, **sampling_kwargs, ) -> Optional[List[List[str]]]: @@ -709,6 +710,8 @@ def generate( if output_generation_logits: return output_lines_list, outputs['generation_logits'] + elif output_context_logits: + return output_lines_list, outputs['context_logits'] return output_lines_list diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py index 97575058bd1c..b32f5c0a76fc 100644 --- a/nemo/export/vllm_exporter.py +++ b/nemo/export/vllm_exporter.py @@ -403,6 +403,7 @@ def get_triton_input(self): Tensor(name="temperature", shape=(-1,), dtype=numpy.single, optional=True), Tensor(name="lora_uids", shape=(-1,), dtype=bytes, optional=True), Tensor(name="output_generation_logits", shape=(-1,), dtype=numpy.bool_, optional=True), + Tensor(name="output_context_logits", shape=(-1,), dtype=numpy.bool_, optional=True), ) return inputs @@ -456,6 +457,7 @@ def forward( streaming: bool = False, output_log_probs: bool = False, output_generation_logits: bool = False, + output_context_logits: bool = False, ) -> Union[List[List[str]], Iterable[List[List[str]]]]: """ The forward function performs LLM evaluation on the provided array of prompts with other parameters shared, @@ -488,6 +490,9 @@ def forward( if output_generation_logits: raise NotImplementedError("output_generation_logits is not supported") + if output_context_logits: + raise NotImplementedError("output_context_logits is not supported") + request_ids = [] for index in range(len(input_texts)): prompt = 
input_texts[index] diff --git a/requirements/requirements_eval.txt b/requirements/requirements_eval.txt new file mode 100644 index 000000000000..60828395c199 --- /dev/null +++ b/requirements/requirements_eval.txt @@ -0,0 +1,2 @@ +# Installs EleutherAI's lm-evaluation-harness https://github.com/EleutherAI/lm-evaluation-harness/tree/main +lm-eval From ad807ae56821c638923f20a251694c1fdac6272f Mon Sep 17 00:00:00 2001 From: Ao Tang Date: Fri, 17 Jan 2025 17:09:14 -0500 Subject: [PATCH 17/27] add dataset_root (#11837) --- nemo/collections/llm/bert/data/specter.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/nemo/collections/llm/bert/data/specter.py b/nemo/collections/llm/bert/data/specter.py index 7784b32e6bd9..0c477851d22f 100644 --- a/nemo/collections/llm/bert/data/specter.py +++ b/nemo/collections/llm/bert/data/specter.py @@ -43,6 +43,7 @@ class SpecterDataModule(FineTuningDataModule, IOMixin): def __init__( self, + dataset_root: str = None, seq_length: int = 512, tokenizer: Optional["TokenizerSpec"] = None, micro_batch_size: int = 4, @@ -61,7 +62,7 @@ def __init__( self.delete_raw = delete_raw super().__init__( - dataset_root=get_dataset_root("specter"), + dataset_root=get_dataset_root("specter") if dataset_root is None else dataset_root, seq_length=seq_length, tokenizer=tokenizer, micro_batch_size=micro_batch_size, From 8bf5144873413c9ad653365ff1ebc049ba74d5b8 Mon Sep 17 00:00:00 2001 From: Maanu Grover <109391026+maanug-nv@users.noreply.github.com> Date: Fri, 17 Jan 2025 16:47:12 -0800 Subject: [PATCH 18/27] Support both Path and str for APIs (#11865) * support both path and str for APIs Signed-off-by: Maanu Grover * cleanup Signed-off-by: Maanu Grover * fix cleanup Signed-off-by: Maanu Grover --------- Signed-off-by: Maanu Grover --- nemo/collections/llm/api.py | 41 +++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/nemo/collections/llm/api.py b/nemo/collections/llm/api.py index 83201e78283d..475982b0f746 100644 --- a/nemo/collections/llm/api.py +++ b/nemo/collections/llm/api.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import json -import os import warnings from copy import deepcopy from pathlib import Path @@ -47,6 +46,7 @@ TokenizerType = Any +AnyPath = Union[Path, str] @run.cli.entrypoint(namespace="llm") @@ -322,14 +322,14 @@ def ptq( @run.cli.entrypoint(namespace="llm") def deploy( - nemo_checkpoint: Path = None, + nemo_checkpoint: AnyPath = None, model_type: str = "llama", triton_model_name: str = "triton_model", triton_model_version: Optional[int] = 1, triton_http_port: int = 8000, triton_grpc_port: int = 8001, triton_http_address: str = "0.0.0.0", - triton_model_repository: Path = None, + triton_model_repository: AnyPath = None, num_gpus: int = 1, tensor_parallelism_size: int = 1, pipeline_parallelism_size: int = 1, @@ -376,6 +376,11 @@ def deploy( unset_environment_variables() + if not isinstance(nemo_checkpoint, Path): + nemo_checkpoint = Path(nemo_checkpoint) + if not isinstance(triton_model_repository, Path): + triton_model_repository = Path(triton_model_repository) + triton_deployable = get_trtllm_deployable( nemo_checkpoint, model_type, @@ -421,7 +426,7 @@ def deploy( def evaluate( - nemo_checkpoint_path: Path, + nemo_checkpoint_path: AnyPath, url: str = "grpc://0.0.0.0:8001", triton_http_port: int = 8000, model_name: str = "triton_model", @@ -442,7 +447,8 @@ def evaluate( Args: nemo_checkpoint_path (Path): Path for nemo 2.0 checkpoint. This is used to get the tokenizer from the ckpt which is required to tokenize the evaluation input and output prompts. - url (str): grpc service url that were used in the deploy method above in the format: grpc://{grpc_service_ip}:{grpc_port}. + url (str): grpc service url that were used in the deploy method above + in the format: grpc://{grpc_service_ip}:{grpc_port}. triton_http_port (int): HTTP port that was used for the PyTriton server in the deploy method. Default: 8000. Please pass the triton_http_port if using a custom port in the deploy method. model_name (str): Name of the model that is deployed on PyTriton server. It should be the same as @@ -478,6 +484,9 @@ def evaluate( from nemo.collections.llm import evaluation + if not isinstance(nemo_checkpoint_path, Path): + nemo_checkpoint_path = Path(nemo_checkpoint_path) + # Get tokenizer from nemo ckpt. This works only with NeMo 2.0 ckpt. tokenizer = io.load_context(nemo_checkpoint_path + "/context", subpath="model.tokenizer") # Wait for server to be ready before starting evaluation @@ -499,7 +508,7 @@ def evaluate( def import_ckpt( model: pl.LightningModule, source: str, - output_path: Optional[Path] = None, + output_path: Optional[AnyPath] = None, overwrite: bool = False, ) -> Path: """ @@ -557,6 +566,9 @@ def import_ckpt( ValueError: If the model does not implement ConnectorMixin, indicating a lack of necessary importer functionality. 
""" + if output_path and not isinstance(output_path, Path): + output_path = Path(output_path) + output = io.import_ckpt(model=model, source=source, output_path=output_path, overwrite=overwrite) console = Console() @@ -569,15 +581,17 @@ def import_ckpt( return output -def load_connector_from_trainer_ckpt(path: Path, target: str) -> io.ModelConnector: +def load_connector_from_trainer_ckpt(path: AnyPath, target: str) -> io.ModelConnector: + if not isinstance(path, Path): + path = Path(path) return io.load_context(path, subpath="model").exporter(target, path) @run.cli.entrypoint(name="export", namespace="llm") def export_ckpt( - path: Path, + path: AnyPath, target: str, - output_path: Optional[Path] = None, + output_path: Optional[AnyPath] = None, overwrite: bool = False, load_connector: Callable[[Path, str], io.ModelConnector] = load_connector_from_trainer_ckpt, ) -> Path: @@ -628,6 +642,11 @@ def export_ckpt( ValueError: If the model does not implement ConnectorMixin, indicating a lack of necessary exporter functionality. """ + if not isinstance(path, Path): + path = Path(path) + if output_path and not isinstance(output_path, Path): + output_path = Path(output_path) + output = io.export_ckpt(path, target, output_path, overwrite, load_connector) console = Console() @@ -638,7 +657,7 @@ def export_ckpt( @run.cli.entrypoint(name="generate", namespace="llm") def generate( - path: Union[Path, str], + path: AnyPath, trainer: nl.Trainer, prompts: Optional[list[str]] = None, encoder_prompts: Optional[list[str]] = None, @@ -650,7 +669,7 @@ def generate( inference_batch_times_seqlen_threshold: int = 1000, inference_params: Optional["CommonInferenceParams"] = None, text_only: bool = False, - output_path: Optional[Union[Path, str]] = None, + output_path: Optional[AnyPath] = None, ) -> list[Union["InferenceRequest", str]]: """ Generates text using a NeMo LLM model. 
From 4df3fe5460af0c559b08b482688d6272d9c3821d Mon Sep 17 00:00:00 2001 From: Vince Xu Date: Sat, 18 Jan 2025 08:58:57 +0800 Subject: [PATCH 19/27] fix tensor dimensions are not compatible for FP8 issue in sft (#8787) Co-authored-by: yuhuang Co-authored-by: Eric Harper --- .../nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py index 53d94452a480..349321d3570a 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_chat_dataset.py @@ -373,7 +373,7 @@ def collate_fn(self, batch): if self.pad_to_max_length: max_length = self.max_seq_length else: - max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 8)) + max_length = min(self.max_seq_length, self._ceil_to_nearest(max_length, 16)) assert max_length <= self.max_seq_length if not self.get_attention_mask_from_fusion: From bd58e14392c47118ca65a014f5e3d6e0bd66e9cc Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Fri, 17 Jan 2025 19:56:15 -0800 Subject: [PATCH 20/27] Run nsys callback on GBS not on MBS (#11861) * fix nsys callback running on each mbs Signed-off-by: Alexandros Koumparoulis * Update test Signed-off-by: Alexandros Koumparoulis --------- Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/pytorch/callbacks/nsys.py | 7 +++++-- tests/lightning/pytorch/callbacks/test_nsys.py | 2 ++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/nemo/lightning/pytorch/callbacks/nsys.py b/nemo/lightning/pytorch/callbacks/nsys.py index 13b059011426..0368b2d52773 100644 --- a/nemo/lightning/pytorch/callbacks/nsys.py +++ b/nemo/lightning/pytorch/callbacks/nsys.py @@ -78,6 +78,7 @@ def __init__( f'Nsys profiling setup with start_step: {self._nsys_profile_start_step},' f'and end_step: {self._nsys_profile_end_step}' ) + self._has_nsys_enabled = False def _rank_is_active(self, trainer): # TODO(@akoumparouli): is this function cache-able? 
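The fix in this patch hinges on a boolean guard so that the CUDA profiler is started and stopped once per profiling window rather than on every micro-batch. A simplified sketch of that pattern follows; the class is an invented stand-in for the patched NsysCallback:

    import torch

    class ProfileWindow:
        def __init__(self, start_step: int, end_step: int):
            self.start_step = start_step
            self.end_step = end_step
            self.active = False  # plays the role of _has_nsys_enabled

        def on_step_start(self, step: int) -> None:
            if step == self.start_step and not self.active:
                self.active = True
                torch.cuda.cudart().cudaProfilerStart()

        def on_step_end(self, step: int) -> None:
            if step == self.end_step and self.active:
                torch.cuda.cudart().cudaProfilerStop()
                self.active = False
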
@@ -98,7 +99,8 @@ def on_train_batch_start(self, trainer, pl_module, batch, batch_idx: int) -> Opt return current_step = get_current_epoch_step(trainer) - if current_step == self._nsys_profile_start_step: + if current_step == self._nsys_profile_start_step and not self._has_nsys_enabled: + self._has_nsys_enabled = True torch.cuda.cudart().cudaProfilerStart() if self._nsys_profile_gen_shape: torch.autograd.profiler.emit_nvtx(record_shapes=True).__enter__() @@ -114,6 +116,7 @@ def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx: int) return current_step = get_current_epoch_step(trainer) - if current_step == self._nsys_profile_end_step: + if current_step == self._nsys_profile_end_step and self._has_nsys_enabled: torch.cuda.cudart().cudaProfilerStop() torch.autograd.profiler.emit_nvtx().__exit__(None, None, None) + self._has_nsys_enabled = False diff --git a/tests/lightning/pytorch/callbacks/test_nsys.py b/tests/lightning/pytorch/callbacks/test_nsys.py index 9653e707198e..04ca7be718d9 100644 --- a/tests/lightning/pytorch/callbacks/test_nsys.py +++ b/tests/lightning/pytorch/callbacks/test_nsys.py @@ -111,6 +111,8 @@ def test_on_train_batch_end_profiling( callback = NsysCallback(start_step=10, end_step=20, ranks=[0]) mock_trainer.strategy.current_epoch_step = 20 + assert callback._has_nsys_enabled == False + callback._has_nsys_enabled = True callback.on_train_batch_end(mock_trainer, mock_pl_module, None, None, 20) mock_cudart().cudaProfilerStop.assert_called_once() From fcd4807d882e706a5f95a6c651ce066e08dbbc50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sat, 18 Jan 2025 21:33:05 +0100 Subject: [PATCH 21/27] ci: Set bump-branch to weekly (#11889) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/mcore-tag-bump-bot.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mcore-tag-bump-bot.yml b/.github/workflows/mcore-tag-bump-bot.yml index 1b0712924101..467e89c4144e 100644 --- a/.github/workflows/mcore-tag-bump-bot.yml +++ b/.github/workflows/mcore-tag-bump-bot.yml @@ -10,10 +10,10 @@ jobs: uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_bump_dockerfile.yml@v0.11.0 with: source-repository: NVIDIA/Megatron-LM - source-ref: main + source-ref: weekly-bump-2025-03 build-arg: MCORE_TAG dockerfile: Dockerfile.ci - base-branch: main + base-branch: weekly-bump-2025-03 cicd-label: Run CICD pr-reviewers: 'pablo-garay' secrets: From 102bac65380d93fef0cd1722445081220158932f Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Sat, 18 Jan 2025 18:11:40 -0800 Subject: [PATCH 22/27] surface attn_implementation option (#11873) Signed-off-by: Alexandros Koumparoulis --- .../llm/gpt/model/hf_auto_model_for_causal_lm.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py index abe966229ffe..5f315397584b 100644 --- a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py +++ b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py @@ -44,6 +44,7 @@ def __init__( trust_remote_code=False, default_dtype=torch.bfloat16, load_in_4bit=False, + attn_implementation="sdpa", ): super().__init__() self.save_hyperparameters() @@ -58,6 +59,7 @@ def __init__( self.trust_remote_code = trust_remote_code self.default_dtype = default_dtype 
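For reference, the new attn_implementation option is simply forwarded to Hugging Face transformers; a hedged sketch of the equivalent direct call, with a placeholder model id:

    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "gpt2",                      # placeholder model id
        torch_dtype="auto",
        attn_implementation="sdpa",  # "eager" and "flash_attention_2" are common alternatives
    )
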
self.load_in_4bit = load_in_4bit + self.attn_implementation = attn_implementation @property def tokenizer(self): @@ -82,6 +84,7 @@ def configure_model(self): torch_dtype='auto', trust_remote_code=self.trust_remote_code, load_in_4bit=self.load_in_4bit, + attn_implementation=self.attn_implementation, ) else: from transformers import AutoConfig @@ -89,7 +92,10 @@ def configure_model(self): config = AutoConfig.from_pretrained(self.model_name, trust_remote_code=self.trust_remote_code) dtype = getattr(config, 'torch_dtype', self.default_dtype) self.model = AutoModelForCausalLM.from_config( - config, torch_dtype=dtype, trust_remote_code=self.trust_remote_code + config, + torch_dtype=dtype, + trust_remote_code=self.trust_remote_code, + attn_implementation=self.attn_implementation, ) # Apply FSDP2 and TP to the model From 1218a040c6a95451dc9567068e5ae5de8ff5633c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Sun, 19 Jan 2025 07:43:49 +0100 Subject: [PATCH 23/27] chore: Update mcore-tag-bump-bot.yml (#11891) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/mcore-tag-bump-bot.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/mcore-tag-bump-bot.yml b/.github/workflows/mcore-tag-bump-bot.yml index 467e89c4144e..6914a8a217d2 100644 --- a/.github/workflows/mcore-tag-bump-bot.yml +++ b/.github/workflows/mcore-tag-bump-bot.yml @@ -10,11 +10,11 @@ jobs: uses: NVIDIA/NeMo-FW-CI-templates/.github/workflows/_bump_dockerfile.yml@v0.11.0 with: source-repository: NVIDIA/Megatron-LM - source-ref: weekly-bump-2025-03 + source-ref: main build-arg: MCORE_TAG dockerfile: Dockerfile.ci base-branch: weekly-bump-2025-03 cicd-label: Run CICD pr-reviewers: 'pablo-garay' secrets: - PAT: ${{ secrets.PAT }} \ No newline at end of file + PAT: ${{ secrets.PAT }} From aa4f9fb98ecf11455115face11ae9f9213d8d252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?oliver=20k=C3=B6nig?= Date: Mon, 20 Jan 2025 11:07:43 +0100 Subject: [PATCH 24/27] ci: Bump Mcore in weekly PR (#11897) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: oliver könig --- .github/workflows/mcore-tag-bump-bot.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/mcore-tag-bump-bot.yml b/.github/workflows/mcore-tag-bump-bot.yml index 6914a8a217d2..01afb55d4361 100644 --- a/.github/workflows/mcore-tag-bump-bot.yml +++ b/.github/workflows/mcore-tag-bump-bot.yml @@ -13,7 +13,7 @@ jobs: source-ref: main build-arg: MCORE_TAG dockerfile: Dockerfile.ci - base-branch: weekly-bump-2025-03 + base-branch: weekly-bump cicd-label: Run CICD pr-reviewers: 'pablo-garay' secrets: From 0075ed0fd30ee0b8cd7f10265b9593f66ce162d1 Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 20 Jan 2025 11:14:04 -0800 Subject: [PATCH 25/27] check restore_config first (#11890) Signed-off-by: Alexandros Koumparoulis --- nemo/lightning/resume.py | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/nemo/lightning/resume.py b/nemo/lightning/resume.py index 6d6ddda1fd80..0224d7e9836d 100644 --- a/nemo/lightning/resume.py +++ b/nemo/lightning/resume.py @@ -103,23 +103,7 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None): if isinstance(trainer, fl.Fabric): raise NotImplementedError("Fabric is not supported yet.") - trainer_ckpt_path = 
self.get_trainer_ckpt_path(model) - if trainer_ckpt_path: - trainer.ckpt_path = trainer_ckpt_path - trainer.checkpoint_callback.last_model_path = trainer_ckpt_path - # Load artifacts - if getattr(self.restore_config, 'load_artifacts', False): - if isinstance(trainer_ckpt_path, AdapterPath): - # load tokenizer from the base model during peft resume, in case the first peft checkpoint - # is deleted before the current peft checkpoint is saved - context_path = trainer_ckpt_path.base_model_path / "context" - if not context_path.exists(): - context_path = trainer_ckpt_path.base_model_path - else: - context_path = self.get_context_path(model) - model = _try_restore_tokenizer(model, context_path) - - elif self.restore_config: + if self.restore_config: new_path = self._extract_path( model=model, path=self.restore_config.path, @@ -139,6 +123,21 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], model=None): _try_restore_tokenizer(model, context_path) + elif (trainer_ckpt_path := self.get_trainer_ckpt_path(model)) is not None: + trainer.ckpt_path = trainer_ckpt_path + trainer.checkpoint_callback.last_model_path = trainer_ckpt_path + # Load artifacts + if getattr(self.restore_config, 'load_artifacts', False): + if isinstance(trainer_ckpt_path, AdapterPath): + # load tokenizer from the base model during peft resume, in case the first peft checkpoint + # is deleted before the current peft checkpoint is saved + context_path = trainer_ckpt_path.base_model_path / "context" + if not context_path.exists(): + context_path = trainer_ckpt_path.base_model_path + else: + context_path = self.get_context_path(model) + model = _try_restore_tokenizer(model, context_path) + def _extract_path( self, model: Optional[io.ConnectorMixin], path: str, adapter_path: Optional[str] = None ) -> BasePath: From 499161e6e173a25eb642e9c9ef6a28b73d3ea6ec Mon Sep 17 00:00:00 2001 From: Alexandros Koumparoulis <153118171+akoumpa@users.noreply.github.com> Date: Mon, 20 Jan 2025 15:17:30 -0800 Subject: [PATCH 26/27] LinearAdapter: propagate args to _init_adapter (#11902) * propagate defaults Signed-off-by: Alexandros Koumparoulis * switch dropout default to 0.0 Signed-off-by: Alexandros Koumparoulis * Apply isort and black reformatting Signed-off-by: akoumpa --------- Signed-off-by: Alexandros Koumparoulis Signed-off-by: akoumpa Co-authored-by: akoumpa --- nemo/collections/llm/peft/lora.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/nemo/collections/llm/peft/lora.py b/nemo/collections/llm/peft/lora.py index 6c7e7e93ae8f..a6b7ad288765 100644 --- a/nemo/collections/llm/peft/lora.py +++ b/nemo/collections/llm/peft/lora.py @@ -52,7 +52,7 @@ class LinearAdapter(nn.Linear): orig_linear (nn.Module): the linear module to augment. dim (int): lora's dim in_features -> dim -> out_features. alpha (int): lora's scaling alpha. - dropout (float): dropout prob (default: 0.1). + dropout (float): dropout prob (default: 0.0). dropout_position (str): where to apply dropout rel. 
to lora (choices= ['pre', 'post'], default=post) lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform']) lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they @@ -64,7 +64,7 @@ def __init__( orig_linear, dim=8, alpha=32, - dropout=0.1, + dropout=0.0, dropout_position='post', lora_A_init_method='xavier', lora_dtype=None, @@ -82,14 +82,22 @@ def __init__( if orig_linear.bias is not None: self.bias.data.copy_(orig_linear.bias.data) # initialize the adapte - LinearAdapter._init_adapter(self) + LinearAdapter._init_adapter( + self, + dim=dim, + alpha=alpha, + dropout=dropout, + dropout_position=dropout_position, + lora_A_init_method=lora_A_init_method, + lora_dtype=lora_dtype, + ) @staticmethod def _init_adapter( obj, dim=8, alpha=32, - dropout=0.1, + dropout=0.0, dropout_position='post', lora_A_init_method='xavier', lora_dtype=None, @@ -101,7 +109,7 @@ def _init_adapter( obj (LinearAdapter | nn.Module): input module to adapt. dim (int): lora's dim in_features -> dim -> out_features. alpha (int): lora's scaling alpha. - dropout (float): dropout prob (default: 0.1). + dropout (float): dropout prob (default: 0.0). dropout_position (str): where to apply dropout rel. to lora (choices= ['pre', 'post'], default=post) lora_A_init_method (str): init method for lora_A (choices= ['xavier', 'uniform']) lora_dtype (torch.dtype): weight's dtype, by default will use orig_linear's but if they @@ -155,7 +163,7 @@ def patch_linear_module( orig_linear, dim=8, alpha=32, - dropout=0.1, + dropout=0.0, dropout_position='post', lora_A_init_method='xavier', lora_dtype=None, @@ -175,7 +183,7 @@ def patch_linear_module( orig_linear (nn.Linear): the module we add adapter to. dim (int, optional): Lora dim. Defaults to 8. alpha (int, optional): Lora alpha scale. Defaults to 32. - dropout (float, optional): dropout prob. Defaults to 0.1. + dropout (float, optional): dropout prob. Defaults to 0.0. dropout_position (str, optional): location to apply dropout wrt lora. Defaults to 'post' (choices: 'pre', 'post'). lora_A_init_method (str, optional): lora_a init method. Defaults to 'xavier'. From b4f89c50bda173a9ec2b2883aeee4dc291465031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Kami=C5=84ski?= <67481570+Laplasjan107@users.noreply.github.com> Date: Tue, 21 Jan 2025 11:10:54 +0100 Subject: [PATCH 27/27] NeMo 2.0 fp8 conversion (#11845) * initial commit Signed-off-by: Piotr Kaminski * Apply isort and black reformatting Signed-off-by: Laplasjan107 * code review: reuse nemo1 model config loader Signed-off-by: Piotr Kaminski * remove unused import Signed-off-by: Piotr Kaminski --------- Signed-off-by: Piotr Kaminski Signed-off-by: Laplasjan107 Co-authored-by: Laplasjan107 --- .../trt_llm/nemo_ckpt_loader/nemo_file.py | 29 +++++++++++++++---- .../convert_nemo1_to_nemo2.py | 26 +++++++++++++++-- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py index 1d344fd55735..f3c9812555bc 100644 --- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py +++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py @@ -480,6 +480,28 @@ def get_model_type(nemo_ckpt: Union[str, Path]) -> Optional[str]: return model_type +def load_distributed_model_weights( + weights_directory: Union[Path, TarPath], mcore_scales_format: bool +) -> Dict[str, Any]: + """ + Loads model weights in `torch_dist` format directly from weights directory. 
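A short usage sketch of the helper introduced here, assuming a local torch_dist weights directory; the path is a placeholder:

    from pathlib import Path
    from nemo.export.trt_llm.nemo_ckpt_loader.nemo_file import load_distributed_model_weights

    # mcore_scales_format=False rewrites the FP8 scaling factors for local export,
    # as described in the rest of this docstring; True keeps the megatron.core layout.
    state_dict = load_distributed_model_weights(Path("/checkpoints/weights"), mcore_scales_format=False)
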
+ Preprocesses the scaling factors for local export if mcore_scales_format is set to False. + + Args: + weights_directory (Path | TarPath): Path to the weights directory. + mcore_scales_format (bool): Flag for local vs megatron.core export. + + Returns: + dict: Model state dictionary + """ + model = load_sharded_metadata(weights_directory) + if not mcore_scales_format: + model.update({k: v[0] for k, v in model.items() if EXTRA_STATE in k and isinstance(v, list)}) + model = preprocess_scaling_factors_for_local_export(model) + + return model + + def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Path], mcore_scales_format: bool = True): if not os.path.exists(nemo_ckpt): raise TypeError("%s does not exist", nemo_ckpt) @@ -496,10 +518,7 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat if (nemo_dir / "model_weights").exists(): dist_ckpt_folder = nemo_dir / "model_weights" - model = load_sharded_metadata(dist_ckpt_folder) - if not mcore_scales_format: - model.update({k: v[0] for k, v in model.items() if EXTRA_STATE in k and isinstance(v, list)}) - model = preprocess_scaling_factors_for_local_export(model) + model = load_distributed_model_weights(dist_ckpt_folder, mcore_scales_format) nemo_model_config = unpacked_checkpoint_dir.model_config @@ -515,7 +534,7 @@ def load_nemo_model(nemo_ckpt: Union[str, Path], nemo_export_dir: Union[str, Pat tokenizer = build_tokenizer(tokenizer_config) elif (nemo_dir / "weights").exists(): dist_ckpt_folder = nemo_dir / "weights" - model = load_sharded_metadata(dist_ckpt_folder) + model = load_distributed_model_weights(dist_ckpt_folder, mcore_scales_format) io_folder = nemo_dir / "context" if (io_folder / "model.yaml").exists(): diff --git a/scripts/checkpoint_converters/convert_nemo1_to_nemo2.py b/scripts/checkpoint_converters/convert_nemo1_to_nemo2.py index 5ce814d6cff3..4052678ccef3 100644 --- a/scripts/checkpoint_converters/convert_nemo1_to_nemo2.py +++ b/scripts/checkpoint_converters/convert_nemo1_to_nemo2.py @@ -40,6 +40,7 @@ import tempfile from argparse import ArgumentParser from pathlib import Path +from typing import Any, Dict import torch from megatron.core.dist_checkpointing.dict_utils import dict_list_map_inplace @@ -56,6 +57,7 @@ from nemo.lightning.ckpt_utils import ckpt_to_context_subdir from nemo.lightning.io.pl import TrainerContext, ckpt_to_weights_subdir from nemo.utils import logging +from nemo.utils.model_utils import load_config MODEL_CONFIG_MAPPING = { "meta-llama/Llama-2-7b-hf": (llm.LlamaModel, llm.Llama2Config7B), @@ -116,7 +118,23 @@ def get_args(): return args -def get_nemo2_model(model_id, tokenizer) -> llm.GPTModel: +def load_fp8_config(model_path: str) -> Dict[str, Any]: + """ + Loads fp8 configuration of the NeMo 1.0 model. + + Args: + model_path (str): Path to NeMo 1.0 checkpoint. + + Returns: + (dict): NeMo 1.0 model fp8 settings. + """ + fp8_params = ['fp8', 'fp8_amax_history_len', 'fp8_interval', 'fp8_margin', 'fp8_amax_compute_algo'] + config = load_config(model_path) + fp8_config = {key: config[key] for key in fp8_params if key in config} + return fp8_config + + +def get_nemo2_model(model_id, tokenizer, input_path) -> llm.GPTModel: """ Get NeMo 2.0 model class from model_id and tokenizer. Use bf16 for NeMo 1.0 ckpts. @@ -135,8 +153,10 @@ def get_nemo2_model(model_id, tokenizer) -> llm.GPTModel: valid_ids = "\n- ".join([""] + list(MODEL_CONFIG_MAPPING.keys())) raise ValueError(f"Unsupported model_id: {model_id}. 
Please provide a valid model_id from {valid_ids}") model_cls, config_cls = MODEL_CONFIG_MAPPING[model_id] + + fp8_config = load_fp8_config(input_path) # nemo1 ckpts are bf16 - return model_cls(config_cls(bf16=True, params_dtype=torch.bfloat16), tokenizer=tokenizer) + return model_cls(config_cls(bf16=True, params_dtype=torch.bfloat16, **fp8_config), tokenizer=tokenizer) def get_tokenizer(input_path: Path, tokenizer_tmp_dir: Path) -> AutoTokenizer: @@ -183,7 +203,7 @@ def main() -> None: tokenizer_tmp_dir = Path("/tmp/nemo_tokenizer") tokenizer_tmp_dir.mkdir(parents=True, exist_ok=True) tokenizer = get_tokenizer(Path(args.input_path), tokenizer_tmp_dir) - model = get_nemo2_model(args.model_id, tokenizer=tokenizer) + model = get_nemo2_model(args.model_id, tokenizer=tokenizer, input_path=args.input_path) model.optim = None trainer = Trainer(