diff --git a/nemo/export/multimodal/build.py b/nemo/export/multimodal/build.py
index 988c6b50da04..fe1a42c27804 100644
--- a/nemo/export/multimodal/build.py
+++ b/nemo/export/multimodal/build.py
@@ -527,12 +527,12 @@ def extract_lora_ckpt(
     elif os.path.exists(os.path.join(lora_ckpt, "mp_rank_00", "model_weights.ckpt")):
         model_weight = torch.load(os.path.join(lora_ckpt, "mp_rank_00", "model_weights.ckpt"))
     else:
-        raise RuntimeError(f"Imcompatible lora checkpoint format")
+        raise RuntimeError("Incompatible lora checkpoint format")
 
     model_config = os.path.join(lora_ckpt, "model_config.yaml")
 
     if not os.path.exists(model_config):
-        raise RuntimeError(f"Imcompatible lora checkpoint format")
+        raise RuntimeError("Incompatible lora checkpoint format")
 
     llm_lora_weight = {}
diff --git a/nemo/export/sentencepiece_tokenizer.py b/nemo/export/sentencepiece_tokenizer.py
index e47b1c665af5..e04099e6183f 100644
--- a/nemo/export/sentencepiece_tokenizer.py
+++ b/nemo/export/sentencepiece_tokenizer.py
@@ -248,7 +248,7 @@ def vocab(self):
         ]
         return main_vocab + special_tokens
 
-    ### Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM
+    # Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False):
         return self.ids_to_tokens(ids)  # TODO: support skip_special_tokens
diff --git a/nemo/export/tensorrt_lazy_compiler.py b/nemo/export/tensorrt_lazy_compiler.py
index ab40278efa94..50b609087250 100644
--- a/nemo/export/tensorrt_lazy_compiler.py
+++ b/nemo/export/tensorrt_lazy_compiler.py
@@ -291,8 +291,8 @@ def parse_groups(
     """
     groups: Tuple[Union[torch.Tensor, List[torch.Tensor]], ...] = tuple()
     cur = 0
-    for l in range(len(output_lists)):
-        gl = output_lists[l]
+    for i in range(len(output_lists)):
+        gl = output_lists[i]
         assert len(gl) == 0 or len(gl) == 1
         if len(gl) == 0 or gl[0] == 0:
             groups = (*groups, ret[cur])
@@ -303,7 +303,7 @@
         elif gl[0] == -1:
             rev_groups: Tuple[Union[torch.Tensor, List[torch.Tensor]], ...] = tuple()
             rcur = len(ret)
-            for rl in range(len(output_lists) - 1, l, -1):
+            for rl in range(len(output_lists) - 1, i, -1):
                 rgl = output_lists[rl]
                 assert len(rgl) == 0 or len(rgl) == 1
                 if len(rgl) == 0 or rgl[0] == 0:
diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
index 7d95bcca5709..3bb6222adc06 100644
--- a/nemo/export/tensorrt_llm.py
+++ b/nemo/export/tensorrt_llm.py
@@ -63,7 +63,7 @@
 use_deploy = True
 try:
     from nemo.deploy.utils import cast_output, str_ndarray2list
-except Exception as e:
+except Exception:
     use_deploy = False
 
 LOGGER = logging.getLogger("NeMo")
@@ -663,7 +663,7 @@ def get_layer_num(param_name):
             reshard_model = True
         else:
             raise NotImplementedError(
-                f"NeMo currently only supports PP>1 -> PP=1 resharding, other types of resharding will come in future releases."
+                "NeMo currently only supports PP>1 -> PP=1 resharding, other types of resharding will come in future releases."
             )
 
         num_layers = model_config["num_layers"]
diff --git a/nemo/export/trt_llm/converter/model_converter.py b/nemo/export/trt_llm/converter/model_converter.py
index e459dc31d0fb..aef3c44e6cac 100755
--- a/nemo/export/trt_llm/converter/model_converter.py
+++ b/nemo/export/trt_llm/converter/model_converter.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-import csv
 import logging
 from typing import Any, Dict, List, Optional, Tuple
 
diff --git a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
index ca725f74d2ef..6406bd678f4d 100644
--- a/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
+++ b/nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
@@ -152,9 +152,6 @@ def convert_model_to_trt_llm_ckpt(
     has_position_embedding = get_layer_name("position_embedding", prefix) in model_state_dict
     has_lm_head = get_layer_name("output_layer", prefix) in model_state_dict
 
-    share_embeddings_and_output = nemo_model_config.get("share_embeddings_and_output_weights", False)
-    embedding_scaling = nemo_model_config.get("apply_embedding_scaling", False)
-
     hidden_size = nemo_model_config["hidden_size"]
     num_layers = nemo_model_config["num_layers"]
     training_tp_size = 1
diff --git a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
index 518a5bad8883..790cbc11ba2d 100644
--- a/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
+++ b/nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
@@ -30,7 +30,7 @@
 from torch.distributed.checkpoint import FileSystemReader
 from torch.distributed.checkpoint.metadata import BytesStorageMetadata, TensorStorageMetadata
 from torch.distributed.checkpoint.state_dict_loader import load_state_dict
-from transformers import AutoTokenizer, PreTrainedTokenizer
+from transformers import AutoTokenizer, GPT2Tokenizer, PreTrainedTokenizer
 
 from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
 from nemo.export.tarutils import TarPath, ZarrPathStore
diff --git a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
index f601c8cb1c5a..7fd554a66d14 100644
--- a/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
+++ b/nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
@@ -95,7 +95,7 @@ def qnemo_to_tensorrt_llm(
     build_cmd += f"--use_fused_mlp {'enable' if use_fused_mlp else 'disable'} "
 
     if not use_qdq:
-        build_cmd += f"--gemm_plugin auto "
+        build_cmd += "--gemm_plugin auto "
 
     if max_seq_len is not None:
         build_cmd += f"--max_seq_len {max_seq_len} "
diff --git a/nemo/export/trt_llm/tensorrt_llm_build.py b/nemo/export/trt_llm/tensorrt_llm_build.py
index b2b761483700..a0c8d52b9895 100755
--- a/nemo/export/trt_llm/tensorrt_llm_build.py
+++ b/nemo/export/trt_llm/tensorrt_llm_build.py
@@ -16,11 +16,11 @@
 import logging
 import tensorrt_llm
 from tensorrt_llm._common import check_max_num_tokens
-from tensorrt_llm.builder import BuildConfig, Builder
+from tensorrt_llm.builder import BuildConfig
 from tensorrt_llm.commands.build import build as build_trtllm
 from tensorrt_llm.logger import logger
 from tensorrt_llm.lora_manager import LoraConfig
-from tensorrt_llm.models.modeling_utils import add_lora, optimize_model, preprocess_weights
+from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights
 from tensorrt_llm.plugin import PluginConfig
 
 MODEL_NAME = "NeMo"
@@ -60,7 +60,7 @@ def build_and_save_engine(
     architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture
     try:
         model_cls = getattr(tensorrt_llm.models, architecture)
-    except:
+    except Exception:
         raise AttributeError(f"Could not find TRTLLM model type: {model_type}!")
 
     logger.set_level("info")
diff --git a/nemo/export/trt_llm/tensorrt_llm_run.py b/nemo/export/trt_llm/tensorrt_llm_run.py
index ef67c918290f..789d6c7b274e 100644
--- a/nemo/export/trt_llm/tensorrt_llm_run.py
+++ b/nemo/export/trt_llm/tensorrt_llm_run.py
@@ -27,10 +27,8 @@
 import tensorrt_llm
 import torch
 from mpi4py.futures import MPIPoolExecutor
-from tensorrt_llm._utils import mpi_comm
 from tensorrt_llm.builder import Engine
 from tensorrt_llm.lora_manager import LoraManager
-from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.quantization import QuantMode
 from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig
 from transformers import PreTrainedTokenizer
@@ -40,7 +38,7 @@
 use_trtllm_bindings = True
 try:
     from tensorrt_llm.bindings import GptJsonConfig
-except Exception as e:
+except Exception:
     use_trtllm_bindings = False
 
 TRTLLM_SUPPORTS_DEVICE_DISABLE = True
@@ -465,13 +463,10 @@ def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):
     json_config = GptJsonConfig.parse_file(config_path)
     model_config = json_config.model_config
 
-    max_beam_width = model_config.max_beam_width
     max_batch_size = model_config.max_batch_size
     max_input_len = model_config.max_input_len
-    max_seq_len = model_config.max_seq_len
 
     tp_size = json_config.tensor_parallelism
-    pp_size = json_config.pipeline_parallelism
 
     assert tp_size <= gpus_per_node, "Multinode TP is not unsupported"
     # TRTLLM asserts that rank equals the device num however this
@@ -483,9 +478,9 @@ def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):
     for _ in range(offset):
         device_ids.append(device_ids.pop(0))
     engine_index = model_parallel_rank
-    mpi_rank = mpi_comm().Get_rank()
+    # mpi_rank = mpi_comm().Get_rank()
     # Copied from worldConfig.h (getDevice())
-    mpi_device = mpi_rank % gpus_per_node
+    # mpi_device = mpi_rank % gpus_per_node
 
     # TODO: Consider re-enabling
     # assert torch.cuda.current_device() == mpi_device
@@ -503,11 +498,11 @@
 
     if not TRTLLM_SUPPORTS_DEVICE_DISABLE:
         raise RuntimeError(
-            f"TensorRT-LLM does not support torch device disabling. Please upgrade TensorRT-LLM to make use of this feature."
+            "TensorRT-LLM does not support torch device disabling. Please upgrade TensorRT-LLM to make use of this feature."
         )
     elif not DISABLE_TORCH_DEVICE_SET:
         raise RuntimeError(
-            f"To use TensorRT-LLM's python ModelRunner API in load_distributed(...) you must set the env var DISABLE_TORCH_DEVICE_SET=1"
+            "To use TensorRT-LLM's python ModelRunner API in load_distributed(...) you must set the env var DISABLE_TORCH_DEVICE_SET=1"
         )
     decoder = ModelRunner.from_engine(
         engine=engine,
@@ -568,7 +563,7 @@ def refit(weights_dict: dict):
         logging.warning(f"Weights dict did not contain weights for these named TRT weights: {remaining_refit_weights}")
 
     if not refitter.refit_cuda_engine():
-        raise ValueError(f"Refit failed!")
+        raise ValueError("Refit failed!")
 
 
 def unload_engine():
diff --git a/nemo/export/utils/lora_converter.py b/nemo/export/utils/lora_converter.py
index 530dea55370b..cd229317bf23 100644
--- a/nemo/export/utils/lora_converter.py
+++ b/nemo/export/utils/lora_converter.py
@@ -177,14 +177,14 @@ def convert_lora_nemo_to_canonical(lora_nemo, save_path, hf_format=False, donor_
             ckpt_file = archive / f"tp_rank_{tp:02d}_pp_rank_{pp:03d}/model_weights.ckpt"
 
             with ckpt_file.open("rb") as f:
-                l = torch.load(f, map_location=torch.device('cpu'))
+                weights = torch.load(f, map_location=torch.device('cpu'))
 
             if pp == 0:
-                lora_state_dict[tp] = l
+                lora_state_dict[tp] = weights
             else:
                 # calculate layer offset
                 layer_offset = lora_config['num_layers'] // pp_size * pp
-                for key, value in l.items():
+                for key, value in weights.items():
                     new_key = replace_number_add_offset(key, layer_offset)
                     lora_state_dict[tp][new_key] = value
 
diff --git a/nemo/export/vllm/model_converters.py b/nemo/export/vllm/model_converters.py
index 595ceecf0b18..b451dc7b0e88 100644
--- a/nemo/export/vllm/model_converters.py
+++ b/nemo/export/vllm/model_converters.py
@@ -330,7 +330,7 @@ def convert_weights(self, nemo_model_config, state_dict):
             # Attention dense
             yield (
                 f'model.layers.{layer}.self_attn.o_proj.weight',
-                state_dict[f'model.decoder.layers.self_attention.linear_proj.weight'][layer],
+                state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer],
             )
             if has_bias:
                 yield (
diff --git a/nemo/export/vllm_exporter.py b/nemo/export/vllm_exporter.py
index 97575058bd1c..7709240e2b7a 100644
--- a/nemo/export/vllm_exporter.py
+++ b/nemo/export/vllm_exporter.py
@@ -168,7 +168,7 @@ def export(
 
         # Dynamic online FP8 quantization currently does not support in-memory conversion [TODO]
         if quantization is not None and weight_storage in {'auto', 'memory'}:
-            LOGGER.warning(f'Setting weight_storage = "file" for FP8 quantization')
+            LOGGER.warning('Setting weight_storage = "file" for FP8 quantization')
             weight_storage = 'file'
 
         # See if we have an up-to-date safetensors file