Run Flake8 for nemo.export module #11728

Open · wants to merge 1 commit into main
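The commit below applies routine Flake8 cleanups across nemo.export and appears to address a handful of recurring findings: f-strings without placeholders (F541), unused imports (F401), unused exception bindings and bare excepts (F841/E722), and the ambiguous single-letter loop variable l (E741). The following is a minimal, self-contained sketch of those patterns for orientation only; the function and variable names are illustrative and are not taken from the NeMo sources.

# Sketch of the Flake8 findings addressed below (illustrative names only).

def load_weights(path):
    if not path:
        # F541: an f-string with no placeholders drops the "f" prefix.
        raise RuntimeError("Incompatible checkpoint format")
    return {}

def first_groups(output_lists):
    # E741: a loop variable named "l" is easily misread; use "i" instead.
    groups = []
    for i in range(len(output_lists)):
        groups.append(output_lists[i])
    return groups

try:
    # F401: imports that are never referenced are deleted outright.
    # E722 / F841: catch a named exception class, and do not bind it
    # ("as e") unless the exception object is actually used.
    from some_optional_package import helper  # hypothetical dependency
except Exception:
    helper = None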
4 changes: 2 additions & 2 deletions nemo/export/multimodal/build.py
@@ -527,12 +527,12 @@ def extract_lora_ckpt(
elif os.path.exists(os.path.join(lora_ckpt, "mp_rank_00", "model_weights.ckpt")):
model_weight = torch.load(os.path.join(lora_ckpt, "mp_rank_00", "model_weights.ckpt"))
else:
raise RuntimeError(f"Imcompatible lora checkpoint format")
raise RuntimeError("Imcompatible lora checkpoint format")

model_config = os.path.join(lora_ckpt, "model_config.yaml")

if not os.path.exists(model_config):
raise RuntimeError(f"Imcompatible lora checkpoint format")
raise RuntimeError("Imcompatible lora checkpoint format")

llm_lora_weight = {}

2 changes: 1 addition & 1 deletion nemo/export/sentencepiece_tokenizer.py
@@ -248,7 +248,7 @@ def vocab(self):
]
return main_vocab + special_tokens

### Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM
# Below are a few methods that mimic transformers.PreTrainedTokenizer for vLLM

def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False):
return self.ids_to_tokens(ids) # TODO: support skip_special_tokens
6 changes: 3 additions & 3 deletions nemo/export/tensorrt_lazy_compiler.py
@@ -291,8 +291,8 @@ def parse_groups(
"""
groups: Tuple[Union[torch.Tensor, List[torch.Tensor]], ...] = tuple()
cur = 0
for l in range(len(output_lists)):
gl = output_lists[l]
for i in range(len(output_lists)):
gl = output_lists[i]
assert len(gl) == 0 or len(gl) == 1
if len(gl) == 0 or gl[0] == 0:
groups = (*groups, ret[cur])
@@ -303,7 +303,7 @@
elif gl[0] == -1:
rev_groups: Tuple[Union[torch.Tensor, List[torch.Tensor]], ...] = tuple()
rcur = len(ret)
for rl in range(len(output_lists) - 1, l, -1):
for rl in range(len(output_lists) - 1, i, -1):
rgl = output_lists[rl]
assert len(rgl) == 0 or len(rgl) == 1
if len(rgl) == 0 or rgl[0] == 0:
4 changes: 2 additions & 2 deletions nemo/export/tensorrt_llm.py
@@ -63,7 +63,7 @@
use_deploy = True
try:
from nemo.deploy.utils import cast_output, str_ndarray2list
except Exception as e:
except Exception:
use_deploy = False

LOGGER = logging.getLogger("NeMo")
@@ -663,7 +663,7 @@ def get_layer_num(param_name):
reshard_model = True
else:
raise NotImplementedError(
f"NeMo currently only supports PP>1 -> PP=1 resharding, other types of resharding will come in future releases."
"NeMo currently only supports PP>1 -> PP=1 resharding, other types of resharding will come in future releases."
)

num_layers = model_config["num_layers"]
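For reference, the change from "except Exception as e:" to "except Exception:" in this file (and again in tensorrt_llm_run.py below) keeps the existing optional-import guard intact and only drops the unused binding flagged by Flake8 (F841). The pattern, shown here in simplified form with explanatory comments added, is the diff's own code from tensorrt_llm.py:

use_deploy = True
try:
    # nemo.deploy is an optional dependency for export-only installs.
    from nemo.deploy.utils import cast_output, str_ndarray2list
except Exception:
    # The exception object is never logged or re-raised, so it is not bound.
    use_deploy = False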
2 changes: 0 additions & 2 deletions nemo/export/trt_llm/converter/model_converter.py
@@ -12,8 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.


import csv
import logging
from typing import Any, Dict, List, Optional, Tuple

3 changes: 0 additions & 3 deletions nemo/export/trt_llm/converter/model_to_trt_llm_ckpt.py
@@ -152,9 +152,6 @@ def convert_model_to_trt_llm_ckpt(

has_position_embedding = get_layer_name("position_embedding", prefix) in model_state_dict
has_lm_head = get_layer_name("output_layer", prefix) in model_state_dict
share_embeddings_and_output = nemo_model_config.get("share_embeddings_and_output_weights", False)
embedding_scaling = nemo_model_config.get("apply_embedding_scaling", False)
hidden_size = nemo_model_config["hidden_size"]

num_layers = nemo_model_config["num_layers"]
training_tp_size = 1
2 changes: 1 addition & 1 deletion nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
@@ -30,7 +30,7 @@
from torch.distributed.checkpoint import FileSystemReader
from torch.distributed.checkpoint.metadata import BytesStorageMetadata, TensorStorageMetadata
from torch.distributed.checkpoint.state_dict_loader import load_state_dict
from transformers import AutoTokenizer, PreTrainedTokenizer
from transformers import AutoTokenizer, GPT2Tokenizer, PreTrainedTokenizer

from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer
from nemo.export.tarutils import TarPath, ZarrPathStore
2 changes: 1 addition & 1 deletion nemo/export/trt_llm/qnemo/qnemo_to_tensorrt_llm.py
@@ -95,7 +95,7 @@ def qnemo_to_tensorrt_llm(
build_cmd += f"--use_fused_mlp {'enable' if use_fused_mlp else 'disable'} "

if not use_qdq:
build_cmd += f"--gemm_plugin auto "
build_cmd += "--gemm_plugin auto "

if max_seq_len is not None:
build_cmd += f"--max_seq_len {max_seq_len} "
6 changes: 3 additions & 3 deletions nemo/export/trt_llm/tensorrt_llm_build.py
@@ -16,11 +16,11 @@
import logging
import tensorrt_llm
from tensorrt_llm._common import check_max_num_tokens
from tensorrt_llm.builder import BuildConfig, Builder
from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.commands.build import build as build_trtllm
from tensorrt_llm.logger import logger
from tensorrt_llm.lora_manager import LoraConfig
from tensorrt_llm.models.modeling_utils import add_lora, optimize_model, preprocess_weights
from tensorrt_llm.models.modeling_utils import optimize_model, preprocess_weights
from tensorrt_llm.plugin import PluginConfig

MODEL_NAME = "NeMo"
@@ -60,7 +60,7 @@ def build_and_save_engine(
architecture = "LLaMAForCausalLM" if model_config.architecture == "LlamaForCausalLM" else model_config.architecture
try:
model_cls = getattr(tensorrt_llm.models, architecture)
except:
except Exception:
raise AttributeError(f"Could not find TRTLLM model type: {model_type}!")

logger.set_level("info")
17 changes: 6 additions & 11 deletions nemo/export/trt_llm/tensorrt_llm_run.py
@@ -27,10 +27,8 @@
import tensorrt_llm
import torch
from mpi4py.futures import MPIPoolExecutor
from tensorrt_llm._utils import mpi_comm
from tensorrt_llm.builder import Engine
from tensorrt_llm.lora_manager import LoraManager
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime import ModelConfig, ModelRunner, ModelRunnerCpp, SamplingConfig
from transformers import PreTrainedTokenizer
@@ -40,7 +38,7 @@
use_trtllm_bindings = True
try:
from tensorrt_llm.bindings import GptJsonConfig
except Exception as e:
except Exception:
use_trtllm_bindings = False

TRTLLM_SUPPORTS_DEVICE_DISABLE = True
@@ -465,13 +463,10 @@ def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):
json_config = GptJsonConfig.parse_file(config_path)
model_config = json_config.model_config

max_beam_width = model_config.max_beam_width
max_batch_size = model_config.max_batch_size
max_input_len = model_config.max_input_len
max_seq_len = model_config.max_seq_len

tp_size = json_config.tensor_parallelism
pp_size = json_config.pipeline_parallelism
assert tp_size <= gpus_per_node, "Multinode TP is not unsupported"

# TRTLLM asserts that rank equals the device num however this
@@ -483,9 +478,9 @@ def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):
for _ in range(offset):
device_ids.append(device_ids.pop(0))
engine_index = model_parallel_rank
mpi_rank = mpi_comm().Get_rank()
# mpi_rank = mpi_comm().Get_rank()
# Copied from worldConfig.h (getDevice())
mpi_device = mpi_rank % gpus_per_node
# mpi_device = mpi_rank % gpus_per_node
# TODO: Consider re-enabling
# assert torch.cuda.current_device() == mpi_device

@@ -503,11 +498,11 @@ def load_distributed(engine_dir, model_parallel_rank, gpus_per_node):

if not TRTLLM_SUPPORTS_DEVICE_DISABLE:
raise RuntimeError(
f"TensorRT-LLM does not support torch device disabling. Please upgrade TensorRT-LLM to make use of this feature."
"TensorRT-LLM does not support torch device disabling. Please upgrade TensorRT-LLM to make use of this feature."
)
elif not DISABLE_TORCH_DEVICE_SET:
raise RuntimeError(
f"To use TensorRT-LLM's python ModelRunner API in load_distributed(...) you must set the env var DISABLE_TORCH_DEVICE_SET=1"
"To use TensorRT-LLM's python ModelRunner API in load_distributed(...) you must set the env var DISABLE_TORCH_DEVICE_SET=1"
)
decoder = ModelRunner.from_engine(
engine=engine,
@@ -568,7 +563,7 @@ def refit(weights_dict: dict):
logging.warning(f"Weights dict did not contain weights for these named TRT weights: {remaining_refit_weights}")

if not refitter.refit_cuda_engine():
raise ValueError(f"Refit failed!")
raise ValueError("Refit failed!")


def unload_engine():
6 changes: 3 additions & 3 deletions nemo/export/utils/lora_converter.py
@@ -177,14 +177,14 @@ def convert_lora_nemo_to_canonical(lora_nemo, save_path, hf_format=False, donor_
ckpt_file = archive / f"tp_rank_{tp:02d}_pp_rank_{pp:03d}/model_weights.ckpt"

with ckpt_file.open("rb") as f:
l = torch.load(f, map_location=torch.device('cpu'))
weights = torch.load(f, map_location=torch.device('cpu'))

if pp == 0:
lora_state_dict[tp] = l
lora_state_dict[tp] = weights
else:
# calculate layer offset
layer_offset = lora_config['num_layers'] // pp_size * pp
for key, value in l.items():
for key, value in weights.items():
new_key = replace_number_add_offset(key, layer_offset)
lora_state_dict[tp][new_key] = value

2 changes: 1 addition & 1 deletion nemo/export/vllm/model_converters.py
@@ -330,7 +330,7 @@ def convert_weights(self, nemo_model_config, state_dict):
# Attention dense
yield (
f'model.layers.{layer}.self_attn.o_proj.weight',
state_dict[f'model.decoder.layers.self_attention.linear_proj.weight'][layer],
state_dict['model.decoder.layers.self_attention.linear_proj.weight'][layer],
)
if has_bias:
yield (
2 changes: 1 addition & 1 deletion nemo/export/vllm_exporter.py
@@ -168,7 +168,7 @@ def export(

# Dynamic online FP8 quantization currently does not support in-memory conversion [TODO]
if quantization is not None and weight_storage in {'auto', 'memory'}:
LOGGER.warning(f'Setting weight_storage = "file" for FP8 quantization')
LOGGER.warning('Setting weight_storage = "file" for FP8 quantization')
weight_storage = 'file'

# See if we have an up-to-date safetensors file