Run Flake8 for nemo.export module #11728

Open · wants to merge 1 commit into main
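The commit below applies routine Flake8 cleanups across nemo.export and appears to address a handful of recurring findings: f-strings without placeholders (F541), unused imports (F401), unused exception bindings and bare excepts (F841/E722), and the ambiguous single-letter loop variable l (E741). The following is a minimal, self-contained sketch of those patterns for orientation only; the function and variable names are illustrative and are not taken from the NeMo sources.

# Sketch of the Flake8 findings addressed below (illustrative names only).

def load_weights(path):
    if not path:
        # F541: an f-string with no placeholders drops the "f" prefix.
        raise RuntimeError("Incompatible checkpoint format")
    return {}

def first_groups(output_lists):
    # E741: a loop variable named "l" is easily misread; use "i" instead.
    groups = []
    for i in range(len(output_lists)):
        groups.append(output_lists[i])
    return groups

try:
    # F401: imports that are never referenced are deleted outright.
    # E722 / F841: catch a named exception class, and do not bind it
    # ("as e") unless the exception object is actually used.
    from some_optional_package import helper  # hypothetical dependency
except Exception:
    helper = None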
4 changes: 2 additions & 2 deletions nemo/export/multimodal/build.py
@@ -527,12 +527,12 @@ def extract_lora_ckpt(
elif os.path.exists(os.path.join(lora_ckpt, "mp_rank_00", "model_weights.ckpt")):
model_weight = torch.load(os.path.join(lora_ckpt, "mp_rank_00", "model_weights.ckpt"))
else:
raise RuntimeError(f"Imcompatible lora checkpoint format")
raise RuntimeError("Imcompatible lora checkpoint format")

model_config = os.path.join(lora_ckpt, "model_config.yaml")

if not os.path.exists(model_config):
raise RuntimeError(f"Imcompatible lora checkpoint format")
raise RuntimeError("Imcompatible lora checkpoint format")

llm_lora_weight = {}

2 changes: 1 addition & 1 deletion nemo/export/sentencepiece_tokenizer.py
@@ -248,7 +248,7 @@ def vocab(self):
]
return main_vocab + special_tokens

### Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM
# Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM

def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False):
return self.ids_to_tokens(ids) # TODO: support skip_special_tokens
6 changes: 3 additions & 3 deletions nemo/export/tensorrt_lazy_compiler.py
@@ -291,8 +291,8 @@ def parse_groups(
"""
groups: Tuple[Union[torch.Tensor, List[torch.Tensor]], ...] = tuple()
cur = 0
for l in range(len(output_lists)):
gl = output_lists[l]
for i in range(len(output_lists)):
gl = output_lists[i]
assert len(gl) == 0 or len(gl) == 1
if len(gl) == 0 or gl[0] == 0:
groups = (*groups, ret[cur])
@@ -303,7 +303,7 @@
elif gl[0] == -1:
rev_groups: Tuple[Union[torch.Tensor, List[torch.Tensor]], ...] = tuple()
rcur = len(ret)
for rl in range(len(output_lists) - 1, l, -1):
for rl in range(len(output_lists) - 1, i, -1):
rgl = output_lists[rl]
assert len(rgl) == 0 or len(rgl) == 1
if len(rgl) == 0 or rgl[0] == 0:
4 changes: 2 additions & 2 deletions nemo/export/tensorrt_llm.py
@@ -63,7 +63,7 @@
use_deploy = True
try:
from nemo.deploy.utils import cast_output, str_ndarray2list
except Exception as e:
except Exception:
use_deploy = False

LOGGER = logging.getLogger("NeMo")
@@ -663,7 +663,7 @@ def get_layer_num(param_name):
reshard_model = True
else:
raise NotImplementedError(
f"NeMo currently only supports PP>1 -> PP=1 resharding, other types of resharding will come in future releases."
"NeMo currently only supports PP>1 -> PP=1 resharding, other types of resharding will come in future releases."
)

num_layers = model_config["num_layers"]
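For reference, the change from "except Exception as e:" to "except Exception:" in this file (and again in tensorrt_llm_run.py below) keeps the existing optional-import guard intact and only drops the unused binding flagged by Flake8 (F841). The pattern, shown here in simplified form with explanatory comments added, is the diff's own code from tensorrt_llm.py:

use_deploy = True
try:
    # nemo.deploy is an optional dependency for export-only installs.
    from nemo.deploy.utils import cast_output, str_ndarray2list
except Exception:
    # The exception object is never logged or re-raised, so it is not bound.
    use_deploy = False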
2 changes: 0 additions & 2 deletions nemo/export/trt_llm/converter/model_converter.py
@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.


import csv
import logging
from typing import Any, Dict, List, Optional, Tuple

3 changes: 0 additions & 3 deletions nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
@@ -152,9 +152,6 @@ def convert_model_to_trt_llm_ckpt(

has_position_embedding = get_layer_name("position_embedding", prefix) in model_state_dict
has_lm_head = get_layer_name("output_layer", prefix) in model_state_dict
share_embeddings_and_output = nemo_model_config.get("share_embeddings_and_output_weights", False)
embedding_scaling = nemo_model_config.get("apply_embedding_scaling", False)
hidden_size = nemo_model_config["hidden_size"]

num_layers = nemo_model_config["num_layers"]
training_tp_size = 1
2 changes: 1 addition & 1 deletion nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
@@ -30,7 +30,7 @@
from torch.distributed.checkpoint import FileSystemReader
from torch.distributed.checkpoint.metadata import BytesStorageMetadata, TensorStorageMetadata
from torch.distributed.checkpoint.state_dict_loader import load_state_dict
from transformers import AutoTokenizer, PreTrainedTokenizer
from transformers import AutoTokenizer, GPT2Tokenizer, PreTrainedTokenizer

from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
from nemo.export.tarutils import TarPath, ZarrPathStore
2 changes: 1 addition & 1 deletion nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
@@ -95,7 +95,7 @@ def qnemo_to_tensorrt_llm(
build_cmd += f"--use_fused_mlp {'enable' if use_fused_mlp else 'disable'} "

if not use_qdq:
build_cmd += f"--gemm_plugin auto "
build_cmd += "--gemm_plugin auto "

if max_seq_len is not None:
build_cmd += f"--max_seq_len {max_seq_len} "
6 changes: 3 additions & 3 deletions nemo/export/trt_llm/tensorrt_llm_build.py
@@ -16,11 +16,11 @@
import logging
import tensorrt_llm
from tensorrt_llm._common import check_max_num_tokens
from tensorrt_llm.builder import BuildConfig, Builder
from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.commands.build import build as build_trtllm
from tensorrt_llm.logger import logger
from tensorrt_llm.lora_manager import LoraConfig
from tensorrt_llm.models.modeling_utils import add_lora, optimize_model, preprocess_weights
from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights
from tensorrt_llm.plugin import PluginConfig

MODEL_NAME = "NeMo"
@@ -60,7 +60,7 @@ def build_and_save_engine(
architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture
try:
model_cls = getattr(tensorrt_llm.models, architecture)
except:
except Exception:
raise AttributeError(f"Could not find TRTLLM model type: {model_type}!")

logger.set_level("info")
17 changes: 6 additions & 11 deletions nemo/export/trt_llm/tensorrt_llm_run.py
@@ -27,10 +27,8 @@
import tensorrt_llm
import torch
from mpi4py.futures import MPIPoolExecutor
from tensorrt_llm._utils import mpi_comm
from tensorrt_llm.builder import Engine
from tensorrt_llm.lora_manager import LoraManager
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig
from transformers import PreTrainedTokenizer
@@ -40,7 +38,7 @@
use_trtllm_bindings = True
try:
from tensorrt_llm.bindings import GptJsonConfig
except Exception as e:
except Exception:
use_trtllm_bindings = False

TRTLLM_SUPPORTS_DEVICE_DISABLE = True
@@ -465,13 +463,10 @@ def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):
json_config = GptJsonConfig.parse_file(config_path)
model_config = json_config.model_config

max_beam_width = model_config.max_beam_width
max_batch_size = model_config.max_batch_size
max_input_len = model_config.max_input_len
max_seq_len = model_config.max_seq_len

tp_size = json_config.tensor_parallelism
pp_size = json_config.pipeline_parallelism
assert tp_size <= gpus_per_node, "Multinode TP is not unsupported"

# TRTLLM asserts that rank equals the device num however this
@@ -483,9 +478,9 @@ def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):
for _ in range(offset):
device_ids.append(device_ids.pop(0))
engine_index = model_parallel_rank
mpi_rank = mpi_comm().Get_rank()
# mpi_rank = mpi_comm().Get_rank()
# Copied from worldConfig.h (getDevice())
mpi_device = mpi_rank % gpus_per_node
# mpi_device = mpi_rank % gpus_per_node
# TODO: Consider re-enabling
# assert torch.cuda.current_device() == mpi_device

@@ -503,11 +498,11 @@ def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):

if not TRTLLM_SUPPORTS_DEVICE_DISABLE:
raise RuntimeError(
f"TensorRT-LLM does not support torch device disabling. Please upgrade TensorRT-LLM to make use of this feature."
"TensorRT-LLM does not support torch device disabling. Please upgrade TensorRT-LLM to make use of this feature."
)
elif not DISABLE_TORCH_DEVICE_SET:
raise RuntimeError(
f"To use TensorRT-LLM's python ModelRunner API in load_distributed(...) you must set the env var DISABLE_TORCH_DEVICE_SET=1"
"To use TensorRT-LLM's python ModelRunner API in load_distributed(...) you must set the env var DISABLE_TORCH_DEVICE_SET=1"
)
decoder = ModelRunner.from_engine(
engine=engine,
@@ -568,7 +563,7 @@ def refit(weights_dict: dict):
logging.warning(f"Weights dict did not contain weights for these named TRT weights: {remaining_refit_weights}")

if not refitter.refit_cuda_engine():
raise ValueError(f"Refit failed!")
raise ValueError("Refit failed!")


def unload_engine():
6 changes: 3 additions & 3 deletions nemo/export/utils/lora_converter.py
@@ -177,14 +177,14 @@ def convert_lora_nemo_to_canonical(lora_nemo, save_path, hf_format=False, donor_
ckpt_file = archive / f"tp_rank_{tp:02d}_pp_rank_{pp:03d}/model_weights.ckpt"

with ckpt_file.open("rb") as f:
l = torch.load(f, map_location=torch.device('cpu'))
weights = torch.load(f, map_location=torch.device('cpu'))

if pp == 0:
lora_state_dict[tp] = l
lora_state_dict[tp] = weights
else:
# calculate layer offset
layer_offset = lora_config['num_layers'] // pp_size * pp
for key, value in l.items():
for key, value in weights.items():
new_key = replace_number_add_offset(key, layer_offset)
lora_state_dict[tp][new_key] = value

2 changes: 1 addition & 1 deletion nemo/export/vllm/model_converters.py
@@ -330,7 +330,7 @@ def convert_weights(self, nemo_model_config, state_dict):
# Attention dense
yield (
f'model.layers.{layer}.self_attn.o_proj.weight',
state_dict[f'model.decoder.layers.self_attention.linear_proj.weight'][layer],
state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer],
)
if has_bias:
yield (
2 changes: 1 addition & 1 deletion nemo/export/vllm_exporter.py
@@ -168,7 +168,7 @@ def export(

# Dynamic online FP8 quantization currently does not support in-memory conversion [TODO]
if quantization is not None and weight_storage in {'auto', 'memory'}:
LOGGER.warning(f'Setting weight_storage = "file" for FP8 quantization')
LOGGER.warning('Setting weight_storage = "file" for FP8 quantization')
weight_storage = 'file'

# See if we have an up-to-date safetensors file