From 23f8574ee540e7e4b16ed6537098630ba1c404f4 Mon Sep 17 00:00:00 2001 From: Longjie Zheng <32992656+zhenglongjiepheonix@users.noreply.github.com> Date: Thu, 29 Aug 2024 11:45:19 -0400 Subject: [PATCH 01/73] Add Param Cache For Recompilation (#2000) add param cache --- optimum/fx/parallelization/core.py | 6 ++++++ optimum/fx/parallelization/passes.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/optimum/fx/parallelization/core.py b/optimum/fx/parallelization/core.py index cba7d454441..1d13b00b468 100644 --- a/optimum/fx/parallelization/core.py +++ b/optimum/fx/parallelization/core.py @@ -125,6 +125,11 @@ class ParallelExecutionCtx: because we have to make sure we don't initiate new parameters and replace original ones when recompilation happens in training process. + - param_cache (`Dict[str, nn.Parameter]`): + Cache which keeps record of newly created parameters. Similar to `parallel_layer_cache`, we + need to make sure all the newly created parameters in the first compilation will still be used + when recompilation happens. + - weight_map (`Dict[str, str]`): Mapping between parameter names and their locations on disk, useful when loading weights from disk. 
@@ -140,6 +145,7 @@ class ParallelExecutionCtx: current_device: torch.device example_inputs: List[Any] = field(default_factory=list) parallel_layer_cache: Dict[str, nn.Module] = field(default_factory=dict) + param_cache: Dict[str, nn.Parameter] = field(default_factory=dict) weight_map: Dict[str, str] = field(default_factory=dict) last_optimized_graph_module: Optional[GraphModule] = None compile_times: int = 0 diff --git a/optimum/fx/parallelization/passes.py b/optimum/fx/parallelization/passes.py index 1b25e9e1233..379b027d400 100644 --- a/optimum/fx/parallelization/passes.py +++ b/optimum/fx/parallelization/passes.py @@ -480,18 +480,21 @@ def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Conf class InitializeOrLoadWeightsPass(PassBase): """ - Make weights loading/initialization a seperate pass for cleaner logic and easier extensibility. This - pass will only run once in the very first compilation step. + Weights loading and intialization pass, will initialize parameters on current rank and load weights from disk + if necessary. 
""" - need_rerun_when_recompile = False - def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Config) -> GraphModule: world_size = dist.get_world_size(ctx.tp_group) tp_rank = dist.get_rank(ctx.tp_group) - new_parameters, tied_parameters = [], {} + new_parameters, tied_parameters, param_cache = [], {}, ctx.param_cache for name, param in sorted(graph_module.named_parameters(remove_duplicate=False)): + # skip initializing new params when recompilation happens + if name in param_cache: + new_parameters.append((name, param_cache[name])) + continue + param_meta: ParameterMeta = getattr(param, "meta") # skip already initialized/loaded tied parameters if param_meta.is_tied and id(param) in tied_parameters: @@ -569,6 +572,8 @@ def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Conf else: parent_mod = graph_module field = name + if name not in param_cache: + param_cache[name] = new_param setattr(parent_mod, field, new_param) return graph_module From 3b5587569d2ad21d2ca53f375e1e958f16f67f4f Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:15:44 +0200 Subject: [PATCH 02/73] Follow up the diffusers task refactoring (#1999) * fix * fix style --- optimum/exporters/tasks.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index f02f1769233..97053040879 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -1937,12 +1937,6 @@ def standardize_model_attributes(cls, model: Union["PreTrainedModel", "TFPreTrai if inferred_model_type is not None: break - if inferred_model_type is None: - raise ValueError( - f"The export of a DiffusionPipeline model with the class name {model.__class__.__name__} is currently not supported in Optimum. " - "Please open an issue or submit a PR to add the support." 
- ) - # `model_type` is a class attribute in Transformers, let's avoid modifying it. model.config.export_model_type = inferred_model_type @@ -2068,9 +2062,16 @@ def get_model_from_task( if original_task == "auto" and config.architectures is not None: model_class_name = config.architectures[0] - model_class = TasksManager.get_model_class_for_task( - task, framework, model_type=model_type, model_class_name=model_class_name, library=library_name - ) + if library_name == "diffusers": + config = DiffusionPipeline.load_config(model_name_or_path, **kwargs) + class_name = config.get("_class_name", None) + loaded_library = importlib.import_module(library_name) + model_class = getattr(loaded_library, class_name) + else: + model_class = TasksManager.get_model_class_for_task( + task, framework, model_type=model_type, model_class_name=model_class_name, library=library_name + ) + if library_name == "timm": model = model_class(f"hf_hub:{model_name_or_path}", pretrained=True, exportable=True) model = model.to(torch_dtype).to(device) From 7cc57e40f84e00f8ebc2849da303e40575fb23b4 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 2 Sep 2024 15:35:14 +0200 Subject: [PATCH 03/73] Transformers 4.44 support (#1996) * test * fix conll2003 dataset with remote code * sdpa for new bloom attention block * style * fix bloom modeling * better version ranges to reflect max and min transformers support * pin right version * use input dims --- optimum/bettertransformer/models/attention.py | 218 ++++++++++++------ .../models/decoder_models.py | 2 + optimum/exporters/onnx/model_configs.py | 38 +-- optimum/onnxruntime/modeling_decoder.py | 21 +- optimum/utils/input_generators.py | 36 +-- .../preprocessing/token_classification.py | 2 +- setup.py | 11 +- tests/bettertransformer/test_decoder.py | 6 +- 8 files changed, 210 insertions(+), 124 deletions(-) diff --git a/optimum/bettertransformer/models/attention.py 
b/optimum/bettertransformer/models/attention.py index 6c8f16f057c..9dfa57844d4 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -15,6 +15,9 @@ from typing import Optional, Tuple import torch +import torch.nn.functional as F + +from ...utils import check_if_transformers_greater # TODO (CRITICAL): Layer-wise attention scaling is broken for several archs. @@ -23,7 +26,7 @@ def raise_on_head_mask(head_mask: Optional[torch.Tensor]): if head_mask is not None: raise ValueError( - "layer_head_mask different than None is unsupported for now with BetterTransformer, please" + "layer_head_mask (or head_mask) different than None is unsupported for now with BetterTransformer, please" "open a PR or an issue at https://github.com/huggingface/optimum." ) @@ -534,88 +537,159 @@ def bart_forward( return attn_output, None, past_key_value -# Adapted from transformers.models.bloom.modeling_bloom.BloomAttention.forward -def bloom_forward( - self, - hidden_states: torch.Tensor, - residual: torch.Tensor, - alibi: torch.Tensor, - attention_mask: torch.Tensor, - layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, - head_mask: Optional[torch.Tensor] = None, - use_cache: bool = False, - output_attentions: bool = False, - **kwargs, -): - raise_on_head_mask(head_mask) +if check_if_transformers_greater("4.44"): + from transformers.cache_utils import Cache + from transformers.models.bloom.modeling_bloom import dropout_add + + # Adapted from transformers.models.bloom.modeling_bloom.BloomAttention.forward + def bloom_forward( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor, + alibi: torch.Tensor, + attention_mask: torch.Tensor, + layer_past: Optional[Cache] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ): + raise_on_head_mask(head_mask) + + if output_attentions is True: + raise 
ValueError("output_attentions=True can not be supported with BetterTransformer.") + + batch_size, q_length, _ = hidden_states.shape + # [batch_size, seq_length, 3 x hidden_size] + fused_qkv = self.query_key_value(hidden_states) + # 3 x [batch_size, num_heads, seq_length, head_dim] + query_layer, key_layer, value_layer = self._reshape(fused_qkv) + + if layer_past is not None: + cache_kwargs = {"cache_position": cache_position} + key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs) + + alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:]) + + if attention_mask is not None: # no matter the length, we just slice it + kv_length = cache_position[-1] + 1 # cache position is 0-indexed while length should start from 1 + causal_mask = attention_mask[:, :, :, :kv_length] + alibi = torch.masked_fill(alibi, causal_mask.bool(), torch.finfo(alibi.dtype).min) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=alibi, + dropout_p=self.dropout_prob_attn if self.training else 0.0, + ) - if output_attentions is True: - raise ValueError("output_attentions=True can not be supported with BetterTransformer.") + # Transform [batch_size, num_heads, seq_length, head_dim] to [batch_size, seq_length, num_heads * head_dim] + context_layer = context_layer.transpose(1, 2) + context_layer = context_layer.reshape(batch_size, q_length, self.hidden_size) + + # aggregate results across tp ranks. 
See here: https://github.com/pytorch/pytorch/issues/76232 + if self.pretraining_tp > 1 and self.slow_but_exact: + slices = self.hidden_size / self.pretraining_tp + output_tensor = torch.zeros_like(context_layer) + for i in range(self.pretraining_tp): + output_tensor = output_tensor + F.linear( + context_layer[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], + ) + else: + output_tensor = self.dense(context_layer) - fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training) - # 3 x [batch_size, seq_length, num_heads, head_dim] - (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) + outputs = (output_tensor, layer_past) - batch_size, q_length, _, _ = query_layer.shape + return outputs - # Permute to [batch_size, num_heads, seq_length, head_dim] - query_layer = query_layer.transpose(1, 2) +else: + # Adapted from transformers.models.bloom.modeling_bloom.BloomAttention.forward + def bloom_forward( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor, + alibi: torch.Tensor, + attention_mask: torch.Tensor, + layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + **kwargs, + ): + raise_on_head_mask(head_mask) - if layer_past is not None: - past_key, past_value = layer_past - past_key = past_key.transpose(1, 2) + if output_attentions is True: + raise ValueError("output_attentions=True can not be supported with BetterTransformer.") - key_layer = key_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) - value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) + # [batch_size, seq_length, 3 x hidden_size] + fused_qkv = self.query_key_value(hidden_states) - # concatenate 
along seq_length dimension - key_layer = torch.cat((past_key, key_layer), dim=1) - value_layer = torch.cat((past_value, value_layer), dim=1) + # 3 x [batch_size, seq_length, num_heads, head_dim] + (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv) - # untangle batch_size from self.num_heads - key_layer = key_layer.reshape(batch_size, self.num_heads, *key_layer.shape[1:]) - value_layer = value_layer.reshape(batch_size, self.num_heads, *value_layer.shape[1:]) - else: - key_layer = key_layer.transpose(1, 2) - value_layer = value_layer.transpose(1, 2) - - alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:]) - alibi = torch.masked_fill(alibi, attention_mask, torch.finfo(alibi.dtype).min) - - context_layer = torch.nn.functional.scaled_dot_product_attention( - query_layer, - key_layer, - value_layer, - attn_mask=alibi, - dropout_p=self.dropout_prob_attn if self.training else 0.0, - ) + batch_size, q_length, _, _ = query_layer.shape - # Transform [batch_size, num_heads, seq_length, head_dim] to [batch_size, seq_length, num_heads * head_dim] - context_layer = context_layer.transpose(1, 2) - context_layer = context_layer.reshape(*context_layer.shape[:2], -1) - - # aggregate results across tp ranks. 
See here: https://github.com/pytorch/pytorch/issues/76232 - if self.pretraining_tp > 1 and self.slow_but_exact: - slices = self.hidden_size / self.pretraining_tp - output_tensor = torch.zeros_like(context_layer) - for i in range(self.pretraining_tp): - output_tensor = output_tensor + torch.nn.functional.linear( - context_layer[:, :, int(i * slices) : int((i + 1) * slices)], - self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], - ) - else: - output_tensor = self.dense(context_layer) + # Permute to [batch_size, num_heads, seq_length, head_dim] + query_layer = query_layer.transpose(1, 2) + + if layer_past is not None: + past_key, past_value = layer_past + past_key = past_key.transpose(1, 2) - output_tensor = torch.nn.functional.dropout(output_tensor, p=self.hidden_dropout, training=self.training) - output_tensor = residual + output_tensor + key_layer = key_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) + value_layer = value_layer.transpose(1, 2).reshape(batch_size * self.num_heads, q_length, self.head_dim) - if use_cache is True: - present = ( - key_layer.reshape(-1, *key_layer.shape[2:]).transpose(1, 2), - value_layer.reshape(-1, *value_layer.shape[2:]), + # concatenate along seq_length dimension + key_layer = torch.cat((past_key, key_layer), dim=1) + value_layer = torch.cat((past_value, value_layer), dim=1) + + # untangle batch_size from self.num_heads + key_layer = key_layer.reshape(batch_size, self.num_heads, *key_layer.shape[1:]) + value_layer = value_layer.reshape(batch_size, self.num_heads, *value_layer.shape[1:]) + else: + key_layer = key_layer.transpose(1, 2) + value_layer = value_layer.transpose(1, 2) + + alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:]) + alibi = torch.masked_fill(alibi, attention_mask, torch.finfo(alibi.dtype).min) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=alibi, + dropout_p=self.dropout_prob_attn if 
self.training else 0.0, ) - else: - present = None - return (output_tensor, present) + # Transform [batch_size, num_heads, seq_length, head_dim] to [batch_size, seq_length, num_heads * head_dim] + context_layer = context_layer.transpose(1, 2) + context_layer = context_layer.reshape(*context_layer.shape[:2], -1) + + # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232 + if self.pretraining_tp > 1 and self.slow_but_exact: + slices = self.hidden_size / self.pretraining_tp + output_tensor = torch.zeros_like(context_layer) + for i in range(self.pretraining_tp): + output_tensor = output_tensor + torch.nn.functional.linear( + context_layer[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], + ) + else: + output_tensor = self.dense(context_layer) + + output_tensor = torch.nn.functional.dropout(output_tensor, p=self.hidden_dropout, training=self.training) + output_tensor = residual + output_tensor + + if use_cache is True: + present = ( + key_layer.reshape(-1, *key_layer.shape[2:]).transpose(1, 2), + value_layer.reshape(-1, *value_layer.shape[2:]), + ) + else: + present = None + + return (output_tensor, present) diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py index 4bcc057373a..b64b7f5a1eb 100644 --- a/optimum/bettertransformer/models/decoder_models.py +++ b/optimum/bettertransformer/models/decoder_models.py @@ -216,6 +216,8 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.dropout_prob_attn = config.attention_dropout self.module_mapping = None + self.layer_idx = getattr(layer, "layer_idx", None) + submodules = ["query_key_value", "dense", "attention_dropout"] for attr in submodules: setattr(self, attr, getattr(layer, attr)) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 3e11c7e614a..d4b15b2968b 100644 --- 
a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -338,27 +338,31 @@ class BloomOnnxConfig(TextDecoderOnnxConfig): ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES DUMMY_PKV_GENERATOR_CLASS = BloomDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_layers="n_layer", num_attention_heads="n_head") + DEFAULT_ONNX_OPSET = 14 # Bloom uses aten::triu that requires opset>=14, and F.scaled_dot_product_attention def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): - if direction not in ["inputs", "outputs"]: - raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') - - if direction == "inputs": - decoder_sequence_name = "past_sequence_length" - name = "past_key_values" + if check_if_transformers_greater("4.44"): + super().add_past_key_values(inputs_or_outputs, direction) else: - decoder_sequence_name = "past_sequence_length + 1" - name = "present" + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') - for i in range(self._normalized_config.num_layers): - inputs_or_outputs[f"{name}.{i}.key"] = { - 0: "batch_size x num_heads", - 2: decoder_sequence_name, - } - inputs_or_outputs[f"{name}.{i}.value"] = { - 0: "batch_size x num_heads", - 1: decoder_sequence_name, - } + if direction == "inputs": + decoder_sequence_name = "past_sequence_length" + name = "past_key_values" + else: + decoder_sequence_name = "past_sequence_length + 1" + name = "present" + + for i in range(self._normalized_config.num_layers): + inputs_or_outputs[f"{name}.{i}.key"] = { + 0: "batch_size x num_heads", + 2: decoder_sequence_name, + } + inputs_or_outputs[f"{name}.{i}.value"] = { + 0: "batch_size x num_heads", + 1: decoder_sequence_name, + } class GPTBigCodeOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): diff --git a/optimum/onnxruntime/modeling_decoder.py 
b/optimum/onnxruntime/modeling_decoder.py index 6a0dcbba2f0..f6d4b7e20ab 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -336,8 +336,7 @@ def prepare_past_key_values( dtype = constructor.float16 if self.use_fp16 else constructor.float32 # TODO: find a way to better handle this controlflow, this is EXTREMELY UGLY. - # "1" is the dummy sequence length - if self.model_type == "bloom": + if self.__class__.__name__ == "ORTBloomForCausalLM": shape_value = (batch_size * num_attention_heads, 0, embed_size_per_head) shape_key = (batch_size * num_attention_heads, embed_size_per_head, 0) key = constructor.zeros(shape_key, dtype=dtype) @@ -354,9 +353,9 @@ def prepare_past_key_values( for name, value in zip(self.key_value_output_names, past_key_values): shape = [*value.shape] index = 1 if "value" in name else 2 - shape[index] += sequence_length pkv_output_shape[name] = shape + elif self.model_type == "gpt_bigcode": # GPT BigCode uses muti-query attention, and has the specificity of putting both key and value in the same cache tensor. 
shape_key_and_value = (batch_size, 0, embed_size_per_head * 2) @@ -371,9 +370,9 @@ def prepare_past_key_values( shape = [*value.shape] shape[1] += sequence_length pkv_output_shape[name] = shape + else: num_key_value_heads = self.num_key_value_heads if self.model_type == "falcon" else num_attention_heads - shape = (batch_size, num_key_value_heads, 0, embed_size_per_head) key_or_value = constructor.zeros(shape, dtype=dtype) @@ -534,9 +533,9 @@ def _from_pretrained( # Since https://github.com/huggingface/optimum/pull/871/ # changed axis notation/naming during export, we need to update the dims - for dim in input_dims.keys(): - if "past" in dim and input_dims[dim][2] == "past_sequence_length + sequence_length": - input_dims[dim][2] = "past_sequence_length" + for input_name in input_dims.keys(): + if "past" in input_name and input_dims[input_name][2] == "past_sequence_length + sequence_length": + input_dims[input_name][2] = "past_sequence_length" override_dims = True if override_dims: @@ -559,6 +558,12 @@ def _from_pretrained( size_threshold=0, ) + # Since transformers 4.44, the bloom model has been updated to use the standard cache format + use_old_bloom_modeling = not check_if_transformers_greater("4.44") + for input_name in input_dims.keys(): + if input_dims[input_name][0] == "batch_size x num_heads": + use_old_bloom_modeling = True + del onnx_model model = ORTModel.load_model( @@ -568,7 +573,7 @@ def _from_pretrained( provider_options=provider_options, ) - if config.model_type == "bloom": + if config.model_type == "bloom" and use_old_bloom_modeling: init_cls = ORTBloomForCausalLM elif config.model_type == "falcon": init_cls = ORTFalconForCausalLM diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index 36913f652a8..dac14a38114 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -22,6 +22,7 @@ import numpy as np from transformers.utils import is_tf_available, is_torch_available +from ..utils import 
check_if_transformers_greater from .normalized_config import ( NormalizedConfig, NormalizedEncoderDecoderConfig, @@ -1026,23 +1027,26 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class BloomDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - past_key_shape = ( - self.batch_size * self.num_attention_heads, - self.hidden_size // self.num_attention_heads, - self.sequence_length, - ) - past_value_shape = ( - self.batch_size * self.num_attention_heads, - self.sequence_length, - self.hidden_size // self.num_attention_heads, - ) - return [ - ( - self.random_float_tensor(past_key_shape, framework=framework, dtype=float_dtype), - self.random_float_tensor(past_value_shape, framework=framework, dtype=float_dtype), + if check_if_transformers_greater("4.44"): + return super().generate(input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype) + else: + past_key_shape = ( + self.batch_size * self.num_attention_heads, + self.hidden_size // self.num_attention_heads, + self.sequence_length, ) - for _ in range(self.num_layers) - ] + past_value_shape = ( + self.batch_size * self.num_attention_heads, + self.sequence_length, + self.hidden_size // self.num_attention_heads, + ) + return [ + ( + self.random_float_tensor(past_key_shape, framework=framework, dtype=float_dtype), + self.random_float_tensor(past_value_shape, framework=framework, dtype=float_dtype), + ) + for _ in range(self.num_layers) + ] class MultiQueryPastKeyValuesGenerator(DummyPastKeyValuesGenerator): diff --git a/optimum/utils/preprocessing/token_classification.py b/optimum/utils/preprocessing/token_classification.py index 1c59aa2285b..64a0bf2da8a 100644 --- a/optimum/utils/preprocessing/token_classification.py +++ b/optimum/utils/preprocessing/token_classification.py @@ -28,7 +28,7 @@ class TokenClassificationProcessing(TaskProcessor): 
ACCEPTED_PREPROCESSOR_CLASSES = (PreTrainedTokenizerBase,) - DEFAULT_DATASET_ARGS = "conll2003" + DEFAULT_DATASET_ARGS = {"path": "conll2003", "trust_remote_code": True} DEFAUL_DATASET_DATA_KEYS = {"primary": "tokens"} ALLOWED_DATA_KEY_NAMES = {"primary"} DEFAULT_REF_KEYS = ["ner_tags", "pos_tags", "chunk_tags"] diff --git a/setup.py b/setup.py index 2e8c9489a89..3ac4315321b 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.29.0,<4.44.0", + "transformers[sentencepiece]>=4.29,<4.45.0", "torch>=1.11", "packaging", "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 @@ -24,6 +24,7 @@ ] # TODO: unpin pytest once https://github.com/huggingface/transformers/pull/29154 is merged & released +# pytest>=8.0.0 also fails with the transformers version pinned for exporters-tf TESTS_REQUIRE = [ "accelerate", "pytest<=8.0.0", @@ -72,7 +73,7 @@ "timm", "h5py", "numpy<1.24.0", - "transformers[sentencepiece]>=4.26.0,<4.38.0", + "transformers[sentencepiece]>=4.26,<4.38", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", @@ -80,9 +81,9 @@ "nncf": "optimum-intel[nncf]>=1.18.0", "neural-compressor": "optimum-intel[neural-compressor]>=1.18.0", "ipex": "optimum-intel[ipex]>=1.18.0", - "habana": ["optimum-habana", "transformers >= 4.43.0, < 4.44.0"], - "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], - "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers >= 4.36.2, < 4.42.0"], + "habana": ["optimum-habana", "transformers>=4.43.0,<4.44.0"], + "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers>=4.36.2,<4.42.0"], + "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers>=4.36.2,<4.42.0"], "graphcore": "optimum-graphcore", "furiosa": "optimum-furiosa", "amd": "optimum-amd", diff --git a/tests/bettertransformer/test_decoder.py b/tests/bettertransformer/test_decoder.py index 42340d3b3aa..bab8f376fcc 
100644 --- a/tests/bettertransformer/test_decoder.py +++ b/tests/bettertransformer/test_decoder.py @@ -23,7 +23,6 @@ from optimum.bettertransformer import BetterTransformer from optimum.utils import ( - BloomDummyPastKeyValuesGenerator, DummyPastKeyValuesGenerator, NormalizedConfigManager, ) @@ -136,10 +135,7 @@ def test_logits_with_cache(self, test_name: str, model_type: str, batch_size: in normalized_config = NormalizedConfigManager.get_normalized_config_class(model.config.model_type)(model.config) - if model_type == "bloom": - pkv_generator_class = BloomDummyPastKeyValuesGenerator - else: - pkv_generator_class = DummyPastKeyValuesGenerator + pkv_generator_class = DummyPastKeyValuesGenerator pkv_generator = pkv_generator_class( task="", normalized_config=normalized_config, batch_size=batch_size, sequence_length=seq_length From ad98dc944be4308f405ab34e78fa85b16c7d3709 Mon Sep 17 00:00:00 2001 From: Longjie Zheng <32992656+zhenglongjiepheonix@users.noreply.github.com> Date: Mon, 2 Sep 2024 10:40:14 -0400 Subject: [PATCH 04/73] Modify Parallelization Strategy to Make it More General (#1988) * modify parallelization strategy * only support model id in api now * more comments * more comments * address comments * remove idle runner * fix * format * more comments * nit --- .../workflows/test_fx_automatic_parallel.yml | 2 +- optimum/fx/parallelization/api.py | 87 ++-- optimum/fx/parallelization/core.py | 5 + optimum/fx/parallelization/decomp.py | 225 +++++++++ .../parallelization/op_registry/__init__.py | 15 + .../op_registry/op_handlers.py | 450 ++++++++++++++++++ optimum/fx/parallelization/passes.py | 350 +++++--------- optimum/fx/parallelization/utils.py | 29 +- 8 files changed, 878 insertions(+), 285 deletions(-) create mode 100644 optimum/fx/parallelization/decomp.py create mode 100644 optimum/fx/parallelization/op_registry/__init__.py create mode 100644 optimum/fx/parallelization/op_registry/op_handlers.py diff --git 
a/.github/workflows/test_fx_automatic_parallel.yml b/.github/workflows/test_fx_automatic_parallel.yml index 3c913e3f7ed..d8af6e40caa 100644 --- a/.github/workflows/test_fx_automatic_parallel.yml +++ b/.github/workflows/test_fx_automatic_parallel.yml @@ -24,7 +24,7 @@ jobs: config: - name: GPU-enabled Optimum Test Suite image: nvidia/cuda:12.4.1-devel-ubuntu22.04 - gpu_target: ["nvidia-multi-gpu-l4-runners", "nvidia-multi-gpu-a10-runners"] + gpu_target: ["nvidia-multi-gpu-a10-runners"] name: ${{ matrix.config.name }} runs-on: diff --git a/optimum/fx/parallelization/api.py b/optimum/fx/parallelization/api.py index bd307bd93c1..9700b491e52 100644 --- a/optimum/fx/parallelization/api.py +++ b/optimum/fx/parallelization/api.py @@ -15,10 +15,11 @@ import importlib import os from functools import partial -from typing import List, Union +from typing import Callable, List import torch from torch.fx import GraphModule +from transformers import AutoConfig from .core import Config, ParallelExecutionCtx from .passes import build_parallel_pass_pipeline @@ -43,30 +44,31 @@ def parallelize_backend( def parallelize_model( - model: Union[torch.nn.Module, str], + model: str, parallel_ctx: ParallelExecutionCtx, *model_args, **kwargs, -): +) -> Callable: """ API for automatic model parallelism through Pytorch FX. Args: - model (Union[torch.nn.Module, str]): - Model to parallelize, could either be a module or a model id on the Huggingface Hub. - parallel_ctx (ParallelExecutionCtx): + model (`str`): + Model to parallelize, a model id on the Huggingface Hub or path to a local directory containing config and weights + of the model. + parallel_ctx (`ParallelExecutionCtx`): Parallel execution context containing process groups the current process belongs to. - *model_args (Any): + *model_args (`Any`): Additional postional arguments for intializing the model if a model id is passed. 
- revision (str, defaults to `main`): + revision (`str`, defaults to `main`): Model revision for weights downloading if a model id is passed. - cache_dir (Optional[str], defaults to `None`): + cache_dir (`Optional[str]`, defaults to `None`): Cache directory to store downloaded weights. Defaults to None. - local_files_only (bool, defaults to `False`): + local_files_only (`bool`, defaults to `False`): Whether to use local files only, will avoid downloading from remote if set to `True`. - skip_load_weights (bool, defaults to `False`): + skip_load_weights (`bool`, defaults to `False`): Whether to skip loading weights from disk to model. - **kwargs (Dict[str, Any]): + **kwargs (`Dict[str, Any]`): Addtional keyword arguments for overriding fields in parallel config, model config and `Model.__init__`. """ revision = kwargs.pop("revision", "main") @@ -80,44 +82,41 @@ def parallelize_model( setattr(parallel_config, k, v) kwargs.pop(k) - if isinstance(model, str): - from transformers import AutoConfig - - is_local = os.path.isdir(model) - if not is_local: - hf_folder = download_model_from_hf( - model_name_or_path=model, - cache_dir=cache_dir, - revision=revision, - local_files_only=local_files_only, - skip_download_weights=skip_load_weights, - ) - else: - hf_folder = model - - # should be able to load config using only local files - model_config, kwargs = AutoConfig.from_pretrained( - hf_folder, revision=revision, local_files_only=True, return_unused_kwargs=True, **kwargs + is_local = os.path.isdir(model) + if not is_local: + hf_folder = download_model_from_hf( + model_name_or_path=model, + cache_dir=cache_dir, + revision=revision, + local_files_only=local_files_only, + skip_download_weights=skip_load_weights, ) + else: + hf_folder = model - # try getting model class info from config - model_arch = model_config.architectures - model_cls = getattr(importlib.import_module("transformers"), model_arch[0]) + # should be able to load config using only local files + model_config, 
kwargs = AutoConfig.from_pretrained( + hf_folder, revision=revision, local_files_only=True, return_unused_kwargs=True, **kwargs + ) - if not skip_load_weights: - parallel_ctx.weight_map = try_collect_weight_map(model, cache_dir, hf_folder) + # try getting model class info from config + model_arch = model_config.architectures + model_cls = getattr(importlib.import_module("transformers"), model_arch[0]) - torch_dtype, dtype_orig = kwargs.pop("torch_dtype", None), None - if torch_dtype is not None: - dtype_orig = model_cls._set_default_torch_dtype(torch_dtype) + if not skip_load_weights: + parallel_ctx.weight_map = try_collect_weight_map(model, cache_dir, hf_folder) - with MetaAwareMethodsPatcher(): - model = model_cls(model_config, *model_args, **kwargs) - # TODO: remove this once support training-time trace - model.eval() + torch_dtype, dtype_orig = kwargs.pop("torch_dtype", None), None + if torch_dtype is not None: + dtype_orig = model_cls._set_default_torch_dtype(torch_dtype) - if dtype_orig is not None: - torch.set_default_dtype(dtype_orig) + with MetaAwareMethodsPatcher(): + model = model_cls(model_config, *model_args, **kwargs) + # TODO: remove this once support training-time trace + model.eval() + + if dtype_orig is not None: + torch.set_default_dtype(dtype_orig) move_model_to_device(model, device=parallel_ctx.current_device) initialize_parameter_meta(model) diff --git a/optimum/fx/parallelization/core.py b/optimum/fx/parallelization/core.py index 1d13b00b468..84737292f07 100644 --- a/optimum/fx/parallelization/core.py +++ b/optimum/fx/parallelization/core.py @@ -166,8 +166,13 @@ class Config: - weight_init_fn (`Callable`, defaults to `partial(nn.init.normal_, std=0.02)`) Initialization function of weights in `nn.Linear` and `nn.Embedding` layers, if not provided weights loading path. + + - enable_sequence_parallel (`bool`, defaults to `False`): + Whether to enable Megatron-style sequence parallelism in searching parallelization + strategies. 
""" lint_and_recompile: bool = True clean_markers_after_all_passes: bool = True weight_init_fn: Callable = partial(nn.init.normal_, std=0.02) + enable_sequence_parallel: bool = False diff --git a/optimum/fx/parallelization/decomp.py b/optimum/fx/parallelization/decomp.py new file mode 100644 index 00000000000..26258d451bf --- /dev/null +++ b/optimum/fx/parallelization/decomp.py @@ -0,0 +1,225 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import contextlib +from typing import Callable, Dict, List + +import torch +import torch.nn.functional as F +import torch.utils._pytree as pytree +from torch import SymBool, SymFloat, SymInt +from torch._decomp import core_aten_decompositions +from torch._functorch._aot_autograd.functional_utils import from_fun, to_fun +from torch._subclasses.functional_tensor import FunctionalTensor, FunctionalTensorMode, disable_functional_mode +from torch.fx import Graph, GraphModule, Interpreter, Proxy, traceback +from torch.fx.experimental.proxy_tensor import ( + ProxyTorchDispatchMode, + _ProxyTensor, + _SymNodeDict, + decompose, + disable_proxy_modes_tracing, + fetch_object_proxy, + fetch_sym_proxy, + get_proxy_slot, + track_tensor_tree, +) +from torch.fx.proxy import GraphAppendingTracer +from torch.utils.weak import WeakTensorKeyDictionary + + +def is_leaf_module(m): + return (m.__module__.startswith("torch.nn") or m.__module__.startswith("torch.ao.nn")) and not isinstance( + m, torch.nn.Sequential + ) + + +@contextlib.contextmanager +def trace_decomp_origin(): + creat_node = Graph.create_node + + def create_node_(*args, **kwargs): + node = creat_node(*args, **kwargs) + node.meta["traced_from"] = traceback.get_current_meta()["from_node"] + return node + + try: + Graph.create_node = create_node_ + yield + finally: + Graph.create_node = creat_node + + +class DecompTracer(GraphAppendingTracer): + """ + DecompTracer is a tracer class which works together with `DecompositionInterpreter`, it keeps track of tensors and their + corresponding proxy objects during execution process. When invoked with `create_proxy`, it creates a node in the containing + graph and associate the output tensor of the node with the created proxy. + + See https://github.com/pytorch/pytorch/blob/main/torch/fx/experimental/proxy_tensor.py for more details. 
+ """ + + def __init__(self, graph: Graph): + super().__init__(graph) + self.tensor_tracker = WeakTensorKeyDictionary() + self.symnode_tracker = _SymNodeDict() + + +class DecompositionInterpreter(Interpreter): + """ + DecompositionInterpreter takes the high-level graph module, run the iternal nodes following the topo order, and decompose + high-level pytorch operators into core aten operators by utilizing torch dispatch infrastructure along the way. + + Notes: + - Certain primitive layers(like `nn.Linear`, `nn.Embedding`, and activation layers) are preserved because we have specific + heuristic based parallelization strategy for them so that we can conveniently replace them into their parallelized counterparts + in the orignal graph module. + + - The traced graph is a low-level equivalent representation of the original graph module, and is only used for + parallel axis propagation and analysis, the original graph module is still used for real execution. + """ + + def __init__( + self, module: GraphModule, new_graph: Graph, decomposition_table=None, leaf_function_targets=None, **kwargs + ): + super().__init__(module, **kwargs) + self.new_graph = new_graph + self.tracer = DecompTracer(new_graph) + + self.decomposition_table = decomposition_table + if self.decomposition_table is None: + self.decomposition_table = {} + + self.leaf_function_targets = leaf_function_targets + if self.leaf_function_targets is None: + self.leaf_function_targets = [] + + self.fun_mode = FunctionalTensorMode() + self.mode = ProxyTorchDispatchMode(self.tracer, tracing_mode="real") + + def placeholder(self, target, args, kwargs): + out = super().placeholder(target, args, kwargs) + out = pytree.tree_map_only(FunctionalTensor, lambda x: from_fun(x), out) + proxy = self.tracer.create_proxy("placeholder", target, args, kwargs) + + with disable_proxy_modes_tracing(): + track_tensor_tree(out, proxy, constant=None, tracer=self.tracer) + + out = pytree.tree_map_only(torch.Tensor, lambda x: to_fun(x), 
out) + return out + + def call_function(self, target, args, kwargs): + if target in self.leaf_function_targets: + args = pytree.tree_map_only(FunctionalTensor, lambda x: from_fun(x), args) + kwargs = pytree.tree_map_only(FunctionalTensor, lambda x: from_fun(x), kwargs) + + with disable_proxy_modes_tracing(), disable_functional_mode(): + out = target(*args, **kwargs) + + args, kwargs = pytree.tree_map_only((torch.Tensor,), fetch_object_proxy(self.tracer), (args, kwargs)) + proxy_args, proxy_kwargs = pytree.tree_map_only( + (SymInt, SymFloat, SymBool), + fetch_sym_proxy(self.tracer), + pytree.tree_map_only(_ProxyTensor, lambda e: e.proxy, (args, kwargs)), + ) + proxy = self.tracer.create_proxy("call_function", target, proxy_args, proxy_kwargs) + + with disable_proxy_modes_tracing(): + track_tensor_tree(out, proxy, constant=None, tracer=self.tracer) + + out = pytree.tree_map_only(torch.Tensor, lambda x: to_fun(x), out) + return out + + return super().call_function(target, args, kwargs) + + def call_module(self, target, args, kwargs): + assert isinstance(target, str) + submod = self.fetch_attr(target) + if not is_leaf_module(submod): + return super().call_module(target, args, kwargs) + + args = pytree.tree_map_only(FunctionalTensor, lambda x: from_fun(x), args) + kwargs = pytree.tree_map_only(FunctionalTensor, lambda x: from_fun(x), kwargs) + + with disable_proxy_modes_tracing(), disable_functional_mode(): + out = submod(*args, **kwargs) + + args, kwargs = pytree.tree_map_only((torch.Tensor,), fetch_object_proxy(self.tracer), (args, kwargs)) + proxy_args, proxy_kwargs = pytree.tree_map_only( + (SymInt, SymFloat, SymBool), + fetch_sym_proxy(self.tracer), + pytree.tree_map_only(_ProxyTensor, lambda e: e.proxy, (args, kwargs)), + ) + proxy = self.tracer.create_proxy("call_module", target, proxy_args, proxy_kwargs) + + with disable_proxy_modes_tracing(): + track_tensor_tree(out, proxy, constant=None, tracer=self.tracer) + + out = pytree.tree_map_only(torch.Tensor, lambda 
x: to_fun(x), out) + return out + + def get_attr(self, target, args, kwargs): + out = super().get_attr(target, args, kwargs) + proxy = Proxy(self.new_graph.get_attr(target), self.tracer) + with disable_proxy_modes_tracing(): + track_tensor_tree(out, proxy, constant=None, tracer=self.tracer) + return out + + def output(self, target, args, kwargs): + args = pytree.tree_map_only(FunctionalTensor, lambda x: from_fun(x), args) + kwargs = pytree.tree_map_only(FunctionalTensor, lambda x: from_fun(x), kwargs) + out = super().output(target, args, kwargs) + + def unwrap(e): + return get_proxy_slot(e, self.tracer, e, lambda x: x.proxy.node) + + self.new_graph.output(pytree.tree_map(unwrap, out)) + return out + + def run(self, *args, **kwargs): + with self.fun_mode: + args = pytree.tree_map_only(torch.Tensor, lambda x: to_fun(x), args) + kwargs = pytree.tree_map_only(torch.Tensor, lambda x: to_fun(x), kwargs) + with traceback.preserve_node_meta(), trace_decomp_origin(), decompose(self.decomposition_table), self.mode: + return super().run(*args, **kwargs) + + +def decompose_and_functionalize( + graph_module: GraphModule, + decomposition_table: Dict[torch._ops.OperatorBase, Callable] = core_aten_decompositions(), + leaf_function_targets: List[Callable] = [F.scaled_dot_product_attention], +) -> Callable: + """ + API to decompose and functionalize a high-level graph module. + + Args: + graph_module (`GraphModule`): + The high-level graph module to be decomposed and functionalized. + decomposition_table (`Dict[torch._ops.OperatorBase, Callable]`, defaults to `core_aten_decompostions()`): + The lookup table which maps high-level torch op to their equivalent low-level implementation. 
+ leaf_function_targets (`List[Callable]`, defaults to `[F.scaled_dot_product_attention]`): + Functions which will not be traced through for convenience, `F.scaled_dot_product_attention` is + treated as a leaf function by default so that we don't have to deal with all detailed version of + sdpas in the traced graph. + + Returns: + Callable: a wrapper which returns the traced low-level graph when called with concrete arguments. + """ + new_graph = Graph(owning_module=graph_module) + interp = DecompositionInterpreter(graph_module, new_graph, decomposition_table, leaf_function_targets) + + def wrapper(*args, **kwargs): + interp.run(*args, **kwargs) + return new_graph + + return wrapper diff --git a/optimum/fx/parallelization/op_registry/__init__.py b/optimum/fx/parallelization/op_registry/__init__.py new file mode 100644 index 00000000000..8f8df0f7bd0 --- /dev/null +++ b/optimum/fx/parallelization/op_registry/__init__.py @@ -0,0 +1,15 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .op_handlers import REGISTRY, FallbackParallelAxisPropagateHandler diff --git a/optimum/fx/parallelization/op_registry/op_handlers.py b/optimum/fx/parallelization/op_registry/op_handlers.py new file mode 100644 index 00000000000..56b8fc16bc0 --- /dev/null +++ b/optimum/fx/parallelization/op_registry/op_handlers.py @@ -0,0 +1,450 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from abc import abstractmethod +from typing import Any, List, Optional + +import torch +from torch.fx import Node + +from ..core import Config +from ..utils import is_activation, is_embedding, is_linear + + +class Registry: + """ + Registry class handles registration of parallel axis propagation handlers of different aten ops. + To support a new aten op, you need to register the corresponding handler class by decorating it with `register` function. 
+ """ + + def __init__(self) -> None: + self.mapping = {} + + def register(self, op_types): + def wrapper(cls): + if isinstance(op_types, (list, tuple)): + for op_type in op_types: + self.mapping[op_type] = cls + else: + self.mapping[op_types] = cls + return cls + + return wrapper + + def is_supported(self, op_type) -> bool: + return op_type in self.mapping + + +REGISTRY = Registry() + + +class OpParallelAxisPropagateHandler: + def __init__(self, node: Node, meta_key: str, config: Config) -> None: + self.node = node + self.meta_key = meta_key + self.config = config + + def extract_axis(self, arg: Any) -> Optional[int]: + if not isinstance(arg, Node): + return None + return arg.meta[self.meta_key].get("parallel_axis", None) + + @abstractmethod + def propagate(self) -> List[int]: + raise NotImplementedError + + +@REGISTRY.register( + [ + torch.ops.aten.pow.Tensor_Scalar, + torch.ops.aten.rsqrt.default, + torch.ops.aten.clone.default, + torch.ops.aten.bitwise_not.default, + torch.ops.aten.abs.default, + torch.ops.aten._to_copy.default, + torch.ops.aten.acos.default, + torch.ops.aten.acosh.default, + torch.ops.aten.alias.default, + torch.ops.aten.asin.default, + torch.ops.aten.asinh.default, + torch.ops.aten.atan.default, + torch.ops.aten.atanh.default, + torch.ops.aten.ceil.default, + torch.ops.aten.clamp.default, + torch.ops.aten.cos.default, + torch.ops.aten.cosh.default, + torch.ops.aten.erf.default, + torch.ops.aten.exp.default, + torch.ops.aten.trunc.default, + torch.ops.aten.tanh.default, + torch.ops.aten.tan.default, + torch.ops.aten.add.Scalar, + torch.ops.aten.sub.Scalar, + torch.ops.aten.sqrt.default, + torch.ops.aten.sin.default, + torch.ops.aten.sinh.default, + torch.ops.aten.sign.default, + torch.ops.aten.sigmoid.default, + torch.ops.aten.round.default, + torch.ops.aten.remainder.Scalar, + torch.ops.aten.relu.default, + torch.ops.aten.reciprocal.default, + torch.ops.aten.neg.default, + torch.ops.aten.ne.Scalar, + torch.ops.aten.native_dropout.default, + 
torch.ops.aten.mul.Scalar, + torch.ops.aten.logical_not.default, + torch.ops.aten.lt.Scalar, + torch.ops.aten.le.Scalar, + torch.ops.aten.log.default, + torch.ops.aten.log10.default, + torch.ops.aten.log2.default, + torch.ops.aten.log1p.default, + torch.ops.aten.leaky_relu.default, + torch.ops.aten.isnan.default, + torch.ops.aten.isinf.default, + torch.ops.aten.hardtanh.default, + torch.ops.aten.gt.Scalar, + torch.ops.aten.gelu.default, + torch.ops.aten.ge.Scalar, + torch.ops.aten.fmod.Scalar, + torch.ops.aten.floor.default, + torch.ops.aten.fill.Scalar, + torch.ops.aten.div.Scalar_mode, + torch.ops.aten.div.Scalar, + torch.ops.aten.bitwise_and.Scalar, + torch.ops.aten.bitwise_or.Scalar, + torch.ops.aten.bitwise_xor.Scalar, + ] +) +class UnaryOpParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + arg = self.node.all_input_nodes[0] + axis = self.extract_axis(arg) + return [axis] + + +@REGISTRY.register( + [ + torch.ops.aten.atan2.default, + torch.ops.aten.add.Tensor, + torch.ops.aten.bitwise_and.Tensor, + torch.ops.aten.bitwise_or.Tensor, + torch.ops.aten.bitwise_xor.Tensor, + torch.ops.aten.div.Tensor, + torch.ops.aten.div.Tensor_mode, + torch.ops.aten.eq.Tensor, + torch.ops.aten.fmod.Tensor, + torch.ops.aten.ge.Tensor, + torch.ops.aten.gt.Tensor, + torch.ops.aten.le.Tensor, + torch.ops.aten.logical_and.default, + torch.ops.aten.logical_or.default, + torch.ops.aten.logical_xor.default, + torch.ops.aten.lt.Tensor, + torch.ops.aten.maximum.default, + torch.ops.aten.minimum.default, + torch.ops.aten.mul.Tensor, + torch.ops.aten.ne.Tensor, + torch.ops.aten.pow.Tensor_Tensor, + torch.ops.aten.remainder.Tensor, + torch.ops.aten.sub.Tensor, + ] +) +class BinaryOpParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + input_nodes = self.node.all_input_nodes + # only one node + if len(input_nodes) == 1: + return UnaryOpParallelAxisPropagateHandler(self.node, self.meta_key, 
self.config).propagate() + + assert len(input_nodes) == 2, "binary op should have exact two nodes as inputs" + lhs_shape, rhs_shape = input_nodes[0].meta["val"].shape, input_nodes[1].meta["val"].shape + lhs_axis = self.extract_axis(input_nodes[0]) + rhs_axis = self.extract_axis(input_nodes[1]) + i, j = len(lhs_shape) - 1, len(rhs_shape) - 1 + while i >= 0 and j >= 0: + k = max(lhs_shape[i], rhs_shape[j]) + assert ( + k % min(lhs_shape[i], rhs_shape[j]) == 0 + ), f"shape {lhs_shape} and {rhs_shape} are not broadcastable!" + i -= 1 + j -= 1 + + if i < 0 and lhs_axis is not None: + lhs_axis += j + 1 + if j < 0 and rhs_axis is not None: + rhs_axis += i + 1 + + if lhs_axis is None: + return [rhs_axis] + elif rhs_axis is None: + return [lhs_axis] + elif lhs_axis != rhs_axis: + return [] + return [lhs_axis] + + +@REGISTRY.register( + [ + torch.ops.aten.amax.default, + torch.ops.aten.amin.default, + torch.ops.aten.any.dim, + torch.ops.aten._log_softmax.default, + torch.ops.aten._softmax.default, + torch.ops.aten.cumsum.default, + torch.ops.aten.mean.dim, + # torch.ops.aten.min.dim, + # torch.ops.aten.max.dim, + torch.ops.aten.var.dim, + torch.ops.aten.sum.dim_IntList, + torch.ops.aten.prod.dim_int, + ] +) +class ReductionOpParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def extract_dims( + self, + ) -> List[int]: + ndim = self.node.meta["val"].ndim + dims = None + if "dim" in self.node.kwargs: + dims = self.node.kwargs["dim"] + elif len(self.node.args) > 1 and isinstance(self.node.args[1], (int, list)): + dims = self.node.args[1] + + if isinstance(dims, int): + dims = [dims] + if not dims: + dims = list(range(ndim)) + dims = [(dim + ndim) % ndim for dim in dims] + + keepdim = False + if "keepdim" in self.node.kwargs: + keepdim = self.node.kwargs + elif len(self.node.args) > 2 and isinstance(self.node.args[2], bool): + keepdim = self.node.args[2] + + return dims, keepdim + + def propagate(self) -> List[int]: + dims, keepdim = self.extract_dims() + arg = 
self.node.all_input_nodes[0] + axis = self.extract_axis(arg) + if axis in dims: + return [] + if axis is None: + return [None] + if keepdim: + return [axis] + return [axis - sum([1 if dim < axis else 0 for dim in dims])] + + +@REGISTRY.register(torch.ops.aten.view.default) +class ViewLikeOpParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + arg = self.node.args[0] + axis = self.extract_axis(arg) + if axis is None: + return [None] + shape_before, shape_after = arg.meta["val"].shape, self.node.meta["val"].shape + size = 1 + for i in range(len(shape_before) - 1, axis - 1, -1): + size *= shape_before[i] + + cur, i, res = 1, len(shape_after) - 1, [] + while cur <= size and i >= 0: + cur *= shape_after[i] + if cur == size: + res.append(i) + i -= 1 + + return res + + +@REGISTRY.register(torch.ops.aten.unsqueeze.default) +class UnsqueezeParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + arg, dim = self.node.args[0], self.node.args[1] + ndim = arg.meta["val"].ndim + axis = self.extract_axis(arg) + if axis is None: + return [None] + dim = (dim + ndim) % ndim + if dim <= axis: + return [axis + 1] + return [axis] + + +@REGISTRY.register( + [ + torch.ops.aten.squeeze.dim, + torch.ops.aten.squeeze.dims, + ] +) +class SqueezeParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + arg, dims = self.node.args[0], self.node.args[1] + axis = self.extract_axis(arg) + if axis is None: + return [None] + + ndim = self.node.args[0].meta["val"].ndim + if isinstance(dims, int): + dims = [dims] + dims = [(dim + ndim) % ndim for dim in dims] + if axis in dims: + # being conservative + return [] + return [axis - sum([1 if dim < axis else 0 for dim in dims])] + + +@REGISTRY.register(torch.ops.aten.permute.default) +class PermuteParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + arg, dims = self.node.args[0], 
self.node.args[1] + ndim = arg.meta["val"].ndim + axis = self.extract_axis(arg) + if axis is None: + return [None] + + for i, dim in enumerate(dims): + if (dim + ndim) % ndim == axis: + return [i] + return [] + + +@REGISTRY.register(torch.ops.aten.slice.Tensor) +class SliceParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + arg, slice_dim = self.node.args[0], self.node.args[1] + axis = self.extract_axis(arg) + if axis is None: + return [None] + ndim = arg.meta["val"].ndim + slice_dim = (slice_dim + ndim) % ndim + if slice_dim == axis: + # slice on the parallel axis is not allowed + return [] + return [axis] + + +@REGISTRY.register(torch.ops.aten.expand.default) +class ExpandParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + arg, size = self.node.args[0], self.node.args[1] + axis = self.extract_axis(arg) + if axis is None: + return [None] + assert len(size) >= arg.meta["val"].ndim, "input size must be broadcastable to the target size in expand" + return [axis + len(size) - arg.meta["val"].ndim] + + +@REGISTRY.register(torch.ops.aten.cat.default) +class CatParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + nodes, cat_axis = self.node.all_input_nodes, self.node.args[1] + axis, ndim = self.extract_axis(nodes[0]), nodes[0].meta["val"].ndim + cat_axis = (cat_axis + ndim) % ndim + if cat_axis == axis: + return [] + for i in range(1, len(nodes)): + if self.extract_axis(nodes[i]) != axis: + return [] + return [axis] + + +@REGISTRY.register(torch.ops.aten.constant_pad_nd.default) +class PadParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + pad, ndim = self.node.args[1], self.node.args[0].meta["val"].ndim + axis = self.extract_axis(self.node.args[0]) + if axis is None: + return [None] + if axis >= ndim - pad // 2: + return [] + return [axis] + + 
+@REGISTRY.register(torch.ops.aten.copy.default) +class CopyParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + dst, src = self.node.all_input_nodes + axis_dst = self.extract_axis(dst) + axis_src = self.extract_axis(src) + if axis_dst != axis_src: + return [] + return [axis_dst] + + +@REGISTRY.register(torch.nn.functional.scaled_dot_product_attention) +class SpdaAttnParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + q, k, v = self.node.args[:3] + q_axis = self.extract_axis(q) + # parallel axis must be the head dimension if being parallelized + if q_axis != self.extract_axis(k) or q_axis != self.extract_axis(v) or q_axis not in {None, 1}: + return [] + return [q_axis] + + +class FallbackParallelAxisPropagateHandler(OpParallelAxisPropagateHandler): + def propagate(self) -> List[int]: + # by default we don't parallelize inputs and constants(except parameters embeded in modules) + if self.node.op in ["placeholder", "get_attr"]: + return [None] + elif self.node.op == "output": + for node in self.node.all_input_nodes: + # TODO: allow parallelized nodes in output, and append comm ops in graph tp all-gather + # parallelized output if intructed + if self.extract_axis(node) is not None: + return [] + return [None] + elif is_linear(self.node): + input_arg = self.node.all_input_nodes[0] + axis = self.extract_axis(input_arg) + if axis is None: + # with input being not parallelized, output can be parallelized on the head dimension, + # i.e., `ColumnLinear`, or not being parallelized by all-gather at the end + return [2, None] + elif self.config.enable_sequence_parallel and axis == 1: + # with input being parallelized on sequence dimension, output can be parallelized on + # the head dimension, i.e., `ColumnLinear` with sequence parallel, or not being parallelized + # by all-gather at the end + return [2, None] + elif axis == 2: + # with input being parallelized on head dimension, 
output can be parallelized on the + # sequence dimension or not parallelized by all-reduce at the end, i.e., `RowLinear` + # when sp is not enabled + return [1, None] if self.config.enable_sequence_parallel else [None] + else: + return [] + elif is_embedding(self.node): + input_arg = self.node.all_input_nodes[0] + axis = self.extract_axis(input_arg) + if axis is None: + # only support the embedding parameter being parallelized on `vocab` dim or not parallelized for now, + # the output can be parallelized on sequence dim or not parallelized + return [1, None] if self.config.enable_sequence_parallel else [None] + else: + return [] + elif is_activation(self.node): + return UnaryOpParallelAxisPropagateHandler(self.node, self.meta_key, self.config).propagate() + + # last resort, if no input is being parallelized, then we make output also not parallelized, + # this will give us relief on writing policies for strange ops which don't actually need + # parallelization in most cases + if all(self.extract_axis(arg) is None for arg in self.node.all_input_nodes): + return [None] + + raise NotImplementedError(f"don't know how to propagate axis for {self.node.target}") diff --git a/optimum/fx/parallelization/passes.py b/optimum/fx/parallelization/passes.py index 379b027d400..14b652fff73 100644 --- a/optimum/fx/parallelization/passes.py +++ b/optimum/fx/parallelization/passes.py @@ -23,15 +23,14 @@ from torch.fx import Graph, GraphModule, Node from .core import Config, ParallelExecutionCtx, ParameterMeta +from .decomp import decompose_and_functionalize from .distributed import scatter +from .op_registry import REGISTRY, FallbackParallelAxisPropagateHandler from .parallel_layers import ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding from .utils import ( is_embedding, is_linear, - is_permute, is_shape_consumer, - is_shape_generator, - is_transpose, stable_topological_sort, ) @@ -135,238 +134,151 @@ def clean_all(self, graph_module: GraphModule) -> None: 
self.clear_marker_per_node(node) -class ParallelLayerAnnotatePass(AnalyzeBase): +class ParallelAxisSolverPass(AnalyzeBase): """ - A pass which tries to automatically identify parallel layers in the graph. Note that for simplicity - we only consider classical ways of parallelizing layers in transformers architecture for now, we are not - solving an optimization problem which tries to give a best solution of parallelizing any model under - memory/hardware constraints. - - For `nn.Embedding` layers, we parallelize them on the vocabulary dim by default, because they are often tied - to the `lm_head` of the model, which is usually a `ColumnLinear`(parallelized on vocab dim). - - For `nn.Linear` layers, we parallelize them by grouping them as `upstream` nodes and `downstream` nodes, and - `upstream` nodes are marked as `ColumnLinear`, `downstream` nodes are marked as `RowLinear`. - - Typical examples in transformer models: - - Attention Bert-style MLP Llama-style MLP - __________________________________________________________________________ - Linear Linear Linear Linear - \\ / | \\ --> upstream - Matmul Linear Activation Activation Linear - __________________________________________________________________________ - \\ / | \\ / - \\ / ___________ \\ / - Matmul / Linear \ Mul - | / \ | - _______________________________/ \___________________________ - Linear Linear --> downstream - - Note that there are some patterns that can not be clearly marked, like this one: - - Linear - | \\ - | Linear <-- which label should we mark for the intermediate linear, `upstream` or `downstream` - | / - Add - | - Linear - - For patterns like this we will be conservative and raise errors directly because we don't know how to parallelize - it. 
Another concern is about the correctness, it's possible that we might end up with a wrong parallelization solution - even if the pattern itself is clear, but for now we are mainly targeting on transformer models and the current solution - should work fairly well. + A pass which tries to automatically identify parallel layers in the graph. There are three steps + involved to find a possible parallel solution given the traced graph module and process group. + + - Decomposition & Functionalization + The vanilla graph traced by dynamo frontend is a high-level graph which contains high-level + pytorch ops, and there could be thousands of them, which makes graph analysis hard in order + to cover all cases. So we decompose the high-level graph into low-level graph which only + contains core aten ops, which is a much smaller set. And functionalization is also needed + to remove inplace ops in the graph so that we get `aten.Add` instead of `aten.Add_` in the + graph, which further reduces the op set we need to consider. + + - Parallel Axis Propagation + We need to write parallel axis propagation rules for aten ops in the decomposed and functionalized + graph, note that we don't need to cover every possible parallelization strategy because in general + only certain ops(usually involves computation) can be parallelized in transformer models. And we just + need to write rules for a subset of core aten op set in order to support most of the transformer models. + + - Backtracking Search + After we have defined parallel axis propagation rules for each op in the graph, we do a brute force + backtracking search to try to find a possible solution which respects the propagation rule of every + op in the graph. + + + Note that there are several practical concerns + + - Time Complexity. Although brute force backtracking introduces an exponential time complexity, we reduce + the search space by injecting human heuristics. 
First, we only consider parallelization on the head dimension + (for tensor parallel) or the sequence dimension(to support sequence parallel), then at any time the tensor is + parallelized on at most one dimension. Second, we only allow axis switch around certain layers(like `nn.Linear` + or `nn.Embedding`), and all other ops fall into their places by the parallel axis of their input and rules we write. + + - Optimal Solution. Note that since we return the first solution we find, then it might not be optimal in terms of + memory consumption and communication overhead. But again we can adjust the order of search and try to parallelize + as much as we can first before falling back to non-parallelized search paths. And we don't pay too much attention + on calculating communication overhead because in practice they are bounded under the constraint that only certain + layers are allowed to communicate. + + Our goal is not to solve an optimization problem which tries to give a best solution of parallelizing any model under memory/hardware + constraints, but rather a cheap solution which relieves you from writing boilerplate code for parallelizing layers of different models. """ - def try_form_parallel_linear_groups(self, linear: Node) -> None: - """ - We try to form linears by forming closures in a greedy way, we start with an unmarked linear node, and traverses down - recusively to find all the potential `downstream` linears, note that once we have reached a linear, the recursion stops. - And the newly found `downstream` linears are used as new seeds to traverse upwards to find all the potential `upstream` - linears, the process goes on until number of linears on both sides converges. - Args: - linear (Node): the first linear node used as `upstream` node seed to form closure. - - Raises: - RuntimeError: - raises runtime error when the pattern itself is not clear, there are no clear boundaries that can be drawn. 
- """ - upstream_nodes, downstream_nodes = {linear}, set() - - seeds, next_seeds = [(linear, "down")], [] - - def traverse(start: Node, cur: Node, direction: str = "down"): - if is_linear(cur) and cur is not start: - if direction == "up" and cur not in upstream_nodes: - upstream_nodes.add(cur) - next_seeds.append((cur, "down")) - elif direction == "down" and cur not in downstream_nodes: - downstream_nodes.add(cur) - next_seeds.append((cur, "up")) - return - - next_nodes = cur.all_input_nodes if direction == "up" else cur.users - for node in next_nodes: - # we should ignore shape-related dependencies - if is_shape_generator(node): - continue - traverse(start, node, direction) - - while seeds: - next_seeds = [] - for node, direction in seeds: - traverse(start=node, cur=node, direction=direction) - seeds = next_seeds - - if any(self.already_executed_per_node(node) for node in (upstream_nodes | downstream_nodes)) or ( - upstream_nodes & downstream_nodes - ): - raise RuntimeError( - "Failed to automatically group and parallelize ops in graph in greedy way: " - "no clear boudaries between `upstream` and `downstream` ops." 
- ) - - for node in upstream_nodes: - self.place_marker_per_node(node, {"axis": "column", "gather_output": False if downstream_nodes else True}) - - for node in downstream_nodes: - self.place_marker_per_node(node, {"axis": "row", "input_is_parallel": True}) + def trace_back(self, graph_module: GraphModule, decomp_graph: Graph) -> None: + node_map = {node.name: node for node in graph_module.graph.nodes} + + for node in decomp_graph.nodes: + if "traced_from" in node.meta: + node_name, _ = node.meta["traced_from"][0] + assert node_name in node_map, f"un-recognized node origin {node_name} not in graph being traced" + orig_node = node_map[node_name] + self.clear_marker_per_node(orig_node) + self.place_marker_per_node( + orig_node, {"parallel_axis": self.get_stored_field_info(node, field="parallel_axis")} + ) def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Config) -> GraphModule: - graph: Graph = graph_module.graph + graph: Graph = decompose_and_functionalize(graph_module)(*ctx.example_inputs) stable_topological_sort(graph) - for node in graph.nodes: - if is_linear(node) and not self.already_executed_per_node(node): - self.try_form_parallel_linear_groups(node) - elif is_embedding(node): - # directly mark `nn.Embedding` layers - self.place_marker_per_node(node, {"axis": "vocab"}) - return graph_module + nodes = list(graph.nodes) + def search(idx: int): + if idx == len(nodes): + return True -class ParallelAxisPropagationPass(AnalyzeBase): - """ - A pass which tries to track which axis is being parallelized in the dataflow. For transformer models, the - axis being paralled for tensor parallism is almost always 2, i.e., the attention head axis, except for - Q and K matrices which need to swap the sequence length axis and head axis to do the attention computation, - so we focus on operations like `transpose` or `permute` which swaps axis, and try inducting the parallel - axis after these operations. 
- """ + node = nodes[idx] + if node.op == "call_function" and REGISTRY.is_supported(node.target): + prop_cls = REGISTRY.mapping[node.target] + else: + prop_cls = FallbackParallelAxisPropagateHandler - def propagate_transpose(self, node: Node, parallel_axis: int) -> bool: - dims = node.meta["example_value"].dim() - if "dim0" in node.kwargs and "dim1" in node.kwargs: - dim0, dim1 = node.kwargs["dim0"], node.kwargs["dim1"] - elif len(node.args) == 3: - dim0, dim1 = node.args[1:] - - dim0 = (dim0 + dims) % dims - dim1 = (dim1 + dims) % dims - - if dim0 == parallel_axis: - self.place_marker_per_node(node, {"parallel_axis": dim1}) - return True - elif dim1 == parallel_axis: - self.place_marker_per_node(node, {"parallel_axis": dim0}) - return True - return False - - def propagate_permute(self, node: Node, parallel_axis: int) -> bool: - if "dims" in node.kwargs: - dims = node.kwargs["dims"] - else: - dims = ( - list(node.args[1]) - if isinstance(node.args[1], tuple) - else [arg for arg in node.args if isinstance(arg, int)] - ) + prop = prop_cls(node, self.meta_key(), config) + axis_candidates = prop.propagate() + for axis in axis_candidates: + self.place_marker_per_node(node, {"parallel_axis": axis}) + if search(idx + 1): + return True + self.clear_marker_per_node(node) - dim_len = node.meta["example_value"].dim() - dims = [dim + dim_len if dim < 0 else dim for dim in dims] + return False - for i, dim in enumerate(dims): - if dim == parallel_axis: - self.place_marker_per_node(node, {"parallel_axis": i}) - return True - return False - - def propagate_getitem(self, node: Node, parallel_axis: int) -> bool: - slices = node.args[1] - dims = node.meta["example_value"].dim() - assert parallel_axis < dims - inc, i, j = 0, 0, 0 - - while i < parallel_axis and j < len(slices): - if isinstance(slices[j], int): - inc -= 1 - i += 1 - elif slices[j] is None: - inc += 1 - elif slices[j] is Ellipsis: - i = dims - k = j - while k < len(slices): - if slices[k] is not Ellipsis: - i -= 1 - k 
+= 1 - else: - i += 1 - j += 1 + if not search(0): + raise RuntimeError("Failed to find a solution to automatically parallelize ops in graph in greedy way.") - if inc != 0: - assert parallel_axis + inc < dims and parallel_axis + inc >= 0 - self.place_marker_per_node(node, {"parallel_axis": parallel_axis + inc}) - return True - return False + self.trace_back(graph_module, graph) + return graph_module + + +class ParallelLayerAnnotatePass(AnalyzeBase): + """ + This pass annotates layers which have different parallel axis(requires communication inside the layer) in their + input and output tensors. Since heuristics applied during the searching process respect traditional classical ways of + parallelizing layers(like Megatron-style `ColumnLinear` or `RowLinear`), we are guaranteed to match a valid replacement + annotation according to parallelization strategy of input and output tensors. + """ def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Config) -> GraphModule: - g: Graph = graph_module.graph - stable_topological_sort(g) + for node in graph_module.graph.nodes: + if is_linear(node): + axis_before = ParallelAxisSolverPass.get_stored_field_info(node.args[0], "parallel_axis") + axis_after = ParallelAxisSolverPass.get_stored_field_info(node, "parallel_axis") + info = {} + if axis_before is None: + info["axis"] = "column" + info["gather_output"] = True if axis_after is None else False + elif axis_before == 1: + assert ( + config.enable_sequence_parallel + ), "illegal parallel axis for sequence parallelism deactivated setting" + info["axis"] = "column" + info["sequence_parallel"] = True + info["gather_output"] = True if axis_after is None else False + elif axis_before == 2: + info["axis"] = "row" + info["input_is_parallel"] = True + if axis_after == 1: + assert ( + config.enable_sequence_parallel + ), "illegal parallel axis for sequence parallelism deactivated setting" + info["sequence_parallel"] = True + else: + info["sequence_parallel"] = False 
+ self.place_marker_per_node(node, info) - for node in g.nodes: - if ParallelLayerAnnotatePass.already_executed_per_node(node): - # start propagating at ColumnLinear, marking the beginning of parallelized region - axis = ParallelLayerAnnotatePass.get_stored_field_info(node, field="axis", must_have=True) - gather_output = ParallelLayerAnnotatePass.get_stored_field_info(node, field="gather_output") - if axis == "column" and not gather_output: - self.place_marker_per_node(node, {"parallel_axis": 2}) - # stop propagating at RowLinear, concluding the ending of parallelized region + elif is_embedding(node): + axis_before = ParallelAxisSolverPass.get_stored_field_info(node.args[0], "parallel_axis") + axis_after = ParallelAxisSolverPass.get_stored_field_info(node, "parallel_axis") + assert axis_before is None and axis_after in [1, None] + info = {"axis": "vocab"} + if axis_after == 1: + assert ( + config.enable_sequence_parallel + ), "illegal parallel axis for sequence parallelism deactivated setting" + info["sequence_parallel"] = True else: - continue - else: - already_marked_args, parallel_axis = [], None - for arg in node.all_input_nodes: - if not self.already_executed_per_node(arg): - continue - if parallel_axis is None: - parallel_axis = self.get_stored_field_info(arg, field="parallel_axis", must_have=True) - else: - assert parallel_axis == self.get_stored_field_info( - arg, field="parallel_axis", must_have=True - ), "`parallel_axis` should be equal for all arguments in any related ops" - already_marked_args.append(arg) - - if not already_marked_args: - continue - - marked = False - if is_transpose(node): - marked = self.propagate_transpose(node, parallel_axis) - elif is_permute(node): - marked = self.propagate_permute(node, parallel_axis) - - # fall back - if not marked: - self.place_marker_per_node(node, {"parallel_axis": parallel_axis}) + info["sequence_parallel"] = False + self.place_marker_per_node(node, info) + return graph_module class 
ParallelLayerReplacePass(PassBase): """ - A pass which modifies graph according to information provided by previous analytical passes, - in general it does two things for now: + A pass which modifies graph according to information provided by previous analytical passes, in general it does two things for now: 1. replaces linears and embedding layers with their parallel counterparts. 2. modifies hard-coded arguments like the number of attention heads in the graph by dividing it by parallelism level. """ @@ -453,7 +365,7 @@ def update(node: Node, new_shape: List[Any], parallel_axis: int): else: node.update_arg(parallel_axis + 1, shape[parallel_axis]) - parallel_axis = ParallelAxisPropagationPass.get_stored_field_info(node, field="parallel_axis") + parallel_axis = ParallelAxisSolverPass.get_stored_field_info(node, field="parallel_axis") if parallel_axis is None: return @@ -582,18 +494,18 @@ def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Conf def build_parallel_pass_pipeline() -> PassPipeline: """ Ensemble a pass pipeline which contains the following passes: - 1. `ParallelLayerAnnotatePass` to annoate which linears are `ColumnLinear`, which are `RowLinear` - 2. `ParallelAxisPropagationPass` to propate parallel axis along the data flow - 3. `ParallelLinearReplacePass` to do the actual replacement and modification of hard-coded attributes - 4. `InitializeOrLoadWeightsPass` to load or initialize weights for parameters + 1. `ParallelAxisSolverPass` to find a parallelization solution of tensors in the graph. + 2. `ParallelLayerAnnotatePass` to annotate parallelized layers according to the solution found in the first step. + 3. `ParallelLinearReplacePass` to do the actual replacement and modification of hard-coded attributes. + 4. `InitializeOrLoadWeightsPass` to load or initialize weights for parameters. Returns: PassPipeline: the pipeline used for automatic parallelism. 
""" return PassPipeline( [ + ParallelAxisSolverPass(), ParallelLayerAnnotatePass(), - ParallelAxisPropagationPass(), ParallelLayerReplacePass(), InitializeOrLoadWeightsPass(), ] diff --git a/optimum/fx/parallelization/utils.py b/optimum/fx/parallelization/utils.py index f129ffbd402..b7b1ccd41c8 100644 --- a/optimum/fx/parallelization/utils.py +++ b/optimum/fx/parallelization/utils.py @@ -17,7 +17,6 @@ import hashlib import importlib import json -import operator import os import re import tempfile @@ -45,6 +44,14 @@ def ensure_divisibility(numerator: int, denominator: int) -> None: ) +def is_activation(node: Node) -> bool: + # only consider leaf Module activations + if node.op != "call_module": + return False + mod = node.graph.owning_module + return getattr(mod.get_submodule(node.target), "__module__", "").startswith("torch.nn.modules.activation") + + def is_linear(node: Node) -> bool: if node.op != "call_module": return False @@ -67,26 +74,6 @@ def is_shape_consumer(node: Node) -> bool: return False -def is_transpose(node: Node) -> bool: - if node.op == "call_method": - return node.target in {"transpose", "transpose_"} - elif node.op == "call_function": - return node.target is torch.transpose - return False - - -def is_permute(node: Node) -> bool: - if node.op == "call_method": - return node.target in {"permute"} - elif node.op == "call_function": - return node.target is torch.permute - return False - - -def is_getitem(node: Node) -> bool: - return node.op == "call_function" and node.target is operator.getitem - - def is_output(node: Node) -> bool: return node.op == "output" From bb46ebea547a2545c33c36f77067406f687187b8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 2 Sep 2024 19:23:07 +0200 Subject: [PATCH 05/73] Modify token classification processor default dataset args (#2005) --- optimum/utils/preprocessing/token_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/optimum/utils/preprocessing/token_classification.py b/optimum/utils/preprocessing/token_classification.py index 64a0bf2da8a..1c59aa2285b 100644 --- a/optimum/utils/preprocessing/token_classification.py +++ b/optimum/utils/preprocessing/token_classification.py @@ -28,7 +28,7 @@ class TokenClassificationProcessing(TaskProcessor): ACCEPTED_PREPROCESSOR_CLASSES = (PreTrainedTokenizerBase,) - DEFAULT_DATASET_ARGS = {"path": "conll2003", "trust_remote_code": True} + DEFAULT_DATASET_ARGS = "conll2003" DEFAUL_DATASET_DATA_KEYS = {"primary": "tokens"} ALLOWED_DATA_KEY_NAMES = {"primary"} DEFAULT_REF_KEYS = ["ner_tags", "pos_tags", "chunk_tags"] From 8cb6832a2797f54ec1221ff5014a81d961016b6b Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Tue, 3 Sep 2024 11:54:59 +0200 Subject: [PATCH 06/73] Fix TFLite tests (#2007) downgrade datasets --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 3ac4315321b..98ee4f36a3f 100644 --- a/setup.py +++ b/setup.py @@ -73,6 +73,7 @@ "timm", "h5py", "numpy<1.24.0", + "datasets<=2.16", "transformers[sentencepiece]>=4.26,<4.38", ], "diffusers": ["diffusers"], From 1de4e2522ddf40ba449296877fe6de44a7650f0c Mon Sep 17 00:00:00 2001 From: Ju Hoon Park Date: Thu, 5 Sep 2024 22:02:01 +0900 Subject: [PATCH 07/73] fix attribute name from `inputs_names` to `input_names` (#2010) fix attribute name `inputs_names` fix attribute name of ORTmodel from `inputs_names` to `input_names` --- optimum/onnxruntime/io_binding/io_binding_helper.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/optimum/onnxruntime/io_binding/io_binding_helper.py b/optimum/onnxruntime/io_binding/io_binding_helper.py index 31da5379184..f32ecc56e6e 100644 --- a/optimum/onnxruntime/io_binding/io_binding_helper.py +++ b/optimum/onnxruntime/io_binding/io_binding_helper.py @@ -157,9 +157,9 @@ def prepare_io_binding(ort_model: "ORTModel", **inputs) -> ort.IOBinding: Returns 
an IOBinding object for an inference session. This method is for general purpose, if the inputs and outputs are determined, you can prepare data buffers directly to avoid tensor transfers across frameworks. """ - if not all(input_name in inputs.keys() for input_name in ort_model.inputs_names): + if not all(input_name in inputs.keys() for input_name in ort_model.input_names): raise ValueError( - f"The ONNX model takes {ort_model.inputs_names.keys()} as inputs, but only {inputs.keys()} are given." + f"The ONNX model takes {ort_model.input_names.keys()} as inputs, but only {inputs.keys()} are given." ) name_to_np_type = TypeHelper.get_io_numpy_type_map(ort_model.model) @@ -168,7 +168,7 @@ def prepare_io_binding(ort_model: "ORTModel", **inputs) -> ort.IOBinding: io_binding = ort_model.model.io_binding() # Bind inputs - for input_name in ort_model.inputs_names: + for input_name in ort_model.input_names: onnx_input = inputs.pop(input_name) onnx_input = onnx_input.contiguous() From c0d9111775709aac6f1451e59911abe36b4b6c37 Mon Sep 17 00:00:00 2001 From: nikkie Date: Thu, 5 Sep 2024 23:01:33 +0900 Subject: [PATCH 08/73] Fix typo in BetterTransformer's overview docs (#2015) --- docs/source/bettertransformer/overview.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/bettertransformer/overview.mdx b/docs/source/bettertransformer/overview.mdx index 3d575c93c25..1a525ba4c8b 100644 --- a/docs/source/bettertransformer/overview.mdx +++ b/docs/source/bettertransformer/overview.mdx @@ -24,7 +24,7 @@ In the 2.0 version, PyTorch includes a native scaled dot-product attention opera We provide an integration with these optimizations out of the box in 🤗 Optimum, so that you can convert any supported 🤗 Transformers model so as to use the optimized paths & `scaled_dot_product_attention` function when relevant. 
-PyTorch-native `scaled_dot_product_attention` is slowly being natively [made default and integrated in 🤗 Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention). For models that do support SDPA in Transformers, we deprecate BetterTransformer and recommend you to use directly Transformers and PyTorc latest version for the attention optimizations (Flash Attention, memory-efficient attention) through SDPA. +PyTorch-native `scaled_dot_product_attention` is slowly being natively [made default and integrated in 🤗 Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention). For models that do support SDPA in Transformers, we deprecate BetterTransformer and recommend you to use directly Transformers and PyTorch latest version for the attention optimizations (Flash Attention, memory-efficient attention) through SDPA. 
From 29f23f1fa9dbcb148718ff852a60a495a87471ad Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Fri, 6 Sep 2024 02:57:48 -0400 Subject: [PATCH 09/73] Apply deprecated `evaluation_strategy` (#1819) Apply deprecation `evaluation_strategy` --- .../training/image-classification/README.md | 2 +- optimum/onnxruntime/training_args.py | 22 +++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/examples/onnxruntime/training/image-classification/README.md b/examples/onnxruntime/training/image-classification/README.md index bf4bed8ee43..967942e7a93 100644 --- a/examples/onnxruntime/training/image-classification/README.md +++ b/examples/onnxruntime/training/image-classification/README.md @@ -39,7 +39,7 @@ torchrun --nproc_per_node=NUM_GPUS_YOU_HAVE run_image_classification.py \ --per_device_eval_batch_size 32 \ --logging_strategy steps \ --logging_steps 10 \ - --evaluation_strategy epoch \ + --eval_strategy epoch \ --seed 1337 ``` diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index 6aec362c07c..6135abc1376 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -117,32 +117,32 @@ def __post_init__(self): if self.disable_tqdm is None: self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN - if isinstance(self.evaluation_strategy, EvaluationStrategy): + if isinstance(self.eval_strategy, EvaluationStrategy): warnings.warn( - "using `EvaluationStrategy` for `evaluation_strategy` is deprecated and will be removed in version 5" + "using `EvaluationStrategy` for `eval_strategy` is deprecated and will be removed in version 5" " of 🤗 Transformers. Use `IntervalStrategy` instead", FutureWarning, ) # Go back to the underlying string or we won't be able to instantiate `IntervalStrategy` on it. 
- self.evaluation_strategy = self.evaluation_strategy.value + self.eval_strategy = self.eval_strategy.value - self.evaluation_strategy = IntervalStrategy(self.evaluation_strategy) + self.eval_strategy = IntervalStrategy(self.eval_strategy) self.logging_strategy = IntervalStrategy(self.logging_strategy) self.save_strategy = IntervalStrategy(self.save_strategy) self.hub_strategy = HubStrategy(self.hub_strategy) self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) - if self.do_eval is False and self.evaluation_strategy != IntervalStrategy.NO: + if self.do_eval is False and self.eval_strategy != IntervalStrategy.NO: self.do_eval = True # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero - if self.evaluation_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0): + if self.eval_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0): if self.logging_steps > 0: logger.info(f"using `logging_steps` to initialize `eval_steps` to {self.logging_steps}") self.eval_steps = self.logging_steps else: raise ValueError( - f"evaluation strategy {self.evaluation_strategy} requires either non-zero --eval_steps or" + f"evaluation strategy {self.eval_strategy} requires either non-zero --eval_steps or" " --logging_steps" ) @@ -154,7 +154,7 @@ def __post_init__(self): if self.logging_steps != int(self.logging_steps): raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}") self.logging_steps = int(self.logging_steps) - if self.evaluation_strategy == IntervalStrategy.STEPS and self.eval_steps > 1: + if self.eval_strategy == IntervalStrategy.STEPS and self.eval_steps > 1: if self.eval_steps != int(self.eval_steps): raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}") self.eval_steps = int(self.eval_steps) @@ -165,13 +165,13 @@ def __post_init__(self): # Sanity checks for load_best_model_at_end: we 
require save and eval strategies to be compatible. if self.load_best_model_at_end: - if self.evaluation_strategy != self.save_strategy: + if self.eval_strategy != self.save_strategy: raise ValueError( "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps " f"{self.save_steps} and eval_steps {self.eval_steps}." ) - if self.evaluation_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: + if self.eval_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: if self.eval_steps < 1 or self.save_steps < 1: if not (self.eval_steps < 1 and self.save_steps < 1): raise ValueError( @@ -244,7 +244,7 @@ def __post_init__(self): ) if self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU: - if self.evaluation_strategy == IntervalStrategy.NO: + if self.eval_strategy == IntervalStrategy.NO: raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires an eval strategy") if not is_torch_available(): raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires torch>=0.2.0") From 2335ec258132881c8b56fcfb27ad2bd5d09367b6 Mon Sep 17 00:00:00 2001 From: Rohan Potdar <66227218+Rohan138@users.noreply.github.com> Date: Sat, 7 Sep 2024 06:17:34 -0400 Subject: [PATCH 10/73] update transformers imports for `deepspeed` and `is_torch_xla_available` (#2012) * change deepspeed to integrations.deepspeed * add version check and change tpu to xla * add version check --- optimum/onnxruntime/trainer.py | 24 ++++++++++++++++++++---- optimum/onnxruntime/trainer_seq2seq.py | 7 ++++++- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 9bc2bb5134d..86c333adb3f 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -55,7 +55,6 @@ from torch.utils.data import Dataset, RandomSampler from 
transformers.data.data_collator import DataCollator from transformers.debug_utils import DebugOption, DebugUnderflowOverflow -from transformers.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_zero3_enabled from transformers.modeling_utils import PreTrainedModel, unwrap_model from transformers.tokenization_utils_base import PreTrainedTokenizerBase from transformers.trainer import Trainer @@ -81,10 +80,10 @@ is_apex_available, is_sagemaker_dp_enabled, is_sagemaker_mp_enabled, - is_torch_tpu_available, ) from ..utils import logging +from ..utils.import_utils import check_if_transformers_greater from .training_args import ORTOptimizerNames, ORTTrainingArguments from .utils import ( is_onnxruntime_training_available, @@ -94,8 +93,25 @@ if is_apex_available(): from apex import amp -if is_torch_tpu_available(check_device=False): - import torch_xla.core.xla_model as xm +if check_if_transformers_greater("4.33"): + from transformers.integrations.deepspeed import ( + deepspeed_init, + deepspeed_load_checkpoint, + is_deepspeed_zero3_enabled, + ) +else: + from transformers.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_zero3_enabled + +if check_if_transformers_greater("4.39"): + from transformers.utils import is_torch_xla_available + + if is_torch_xla_available(): + import torch_xla.core.xla_model as xm +else: + from transformers.utils import is_torch_tpu_available + + if is_torch_tpu_available(check_device=False): + import torch_xla.core.xla_model as xm if TYPE_CHECKING: import optuna diff --git a/optimum/onnxruntime/trainer_seq2seq.py b/optimum/onnxruntime/trainer_seq2seq.py index 2e43ee89e00..1565ffa6acb 100644 --- a/optimum/onnxruntime/trainer_seq2seq.py +++ b/optimum/onnxruntime/trainer_seq2seq.py @@ -19,10 +19,10 @@ import torch from torch import nn from torch.utils.data import Dataset -from transformers.deepspeed import is_deepspeed_zero3_enabled from transformers.trainer_utils import PredictionOutput from 
transformers.utils import is_accelerate_available, logging +from ..utils.import_utils import check_if_transformers_greater from .trainer import ORTTrainer @@ -33,6 +33,11 @@ "The package `accelerate` is required to use the ORTTrainer. Please install it following https://huggingface.co/docs/accelerate/basic_tutorials/install." ) +if check_if_transformers_greater("4.33"): + from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +else: + from transformers.deepspeed import is_deepspeed_zero3_enabled + logger = logging.get_logger(__name__) From e604af32fcd054cdeafcfb5553d02e92e0787fd3 Mon Sep 17 00:00:00 2001 From: David Corvoysier Date: Mon, 9 Sep 2024 17:36:05 +0200 Subject: [PATCH 11/73] Add quanto install and instructions (#1976) * chore: add quanto install option * docs: add quanto to README * Apply suggestions from code review Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- README.md | 31 +++++++++++++++++++++++++++++++ setup.py | 1 + 2 files changed, 32 insertions(+) diff --git a/README.md b/README.md index 9a6403cdacb..9a81e69e126 100644 --- a/README.md +++ b/README.md @@ -268,3 +268,34 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op ``` You can find more examples in the [documentation](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer) and in the [examples](https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/training). + + +### Quanto + +[Quanto](https://github.com/huggingface/optimum-quanto) is a pytorch quantization backend. + +You can quantize a model either using the python API or the `optimum-cli`. 
+ +```python +from transformers import AutoModelForCausalLM +from optimum.quanto import QuantizedModelForCausalLM, qint4 + +model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3.1-8B') +qmodel = QuantizedModelForCausalLM.quantize(model, weights=qint4, exclude='lm_head') +``` + +The quantized model can be saved using `save_pretrained`: + +```python +qmodel.save_pretrained('./Llama-3.1-8B-quantized') +``` + +It can later be reloaded using `from_pretrained`: + +```python +from optimum.quanto import QuantizedModelForCausalLM + +qmodel = QuantizedModelForCausalLM.from_pretrained('Llama-3.1-8B-quantized') +``` + +You can see more details and [examples](https://github.com/huggingface/optimum-quanto/tree/main/examples) in the [Quanto](https://github.com/huggingface/optimum-quanto) repository. diff --git a/setup.py b/setup.py index 98ee4f36a3f..ac5db71a74b 100644 --- a/setup.py +++ b/setup.py @@ -88,6 +88,7 @@ "graphcore": "optimum-graphcore", "furiosa": "optimum-furiosa", "amd": "optimum-amd", + "quanto": ["optimum-quanto>=0.2.4"], "dev": TESTS_REQUIRE + QUALITY_REQUIRE, "tests": TESTS_REQUIRE, "quality": QUALITY_REQUIRE, From 26949f5853ae3e6e0325057fa68cf1a68c8e9398 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 11 Sep 2024 09:41:00 +0200 Subject: [PATCH 12/73] Dev version (#2022) --- optimum/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/version.py b/optimum/version.py index 8eeeb9d05a7..4a8a7edab63 100644 --- a/optimum/version.py +++ b/optimum/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.22.0.dev0" +__version__ = "1.23.0.dev0" From f1b708c4e29a392d84d69f820bcc45bfd89cc221 Mon Sep 17 00:00:00 2001 From: Tom Savage Date: Mon, 16 Sep 2024 09:04:31 +0100 Subject: [PATCH 13/73] Fixes detection of CuPy installed with pre-built wheels (#1965) The CuPy library ships both a source distribution (`cupy`) as well as versions containing pre-built wheels (`cupy-cuda11x`, `cupy-cuda12x`, `cupy-rocm-5-0`, `cupy-rocm-4-3`). Use of `_is_package_available` to detect CuPy only works for the source distribution of CuPy and fails when using the pre-built wheels versions. This is because the `_is_package_available` will always attempt to resolve version information (even if it's not required) and in doing so assumes that the _importable_ package name matches the _installed_ distribution name. While this is usually the case, it doesn't work for CuPy and several other libraries. ONNX Runtime for example might be installed as `onnxruntime` or `onnxruntime-gpu` and thus Optimum just uses `importlib.util.find_spec` to work around the same problem. This commit replicates the same solution for CuPy. --- optimum/onnxruntime/utils.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index ad40af92b9d..985980e31b0 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -13,6 +13,7 @@ # limitations under the License. """Utility functions, classes and constants for ONNX Runtime.""" +import importlib import os import re from enum import Enum @@ -31,7 +32,6 @@ import onnxruntime as ort from ..exporters.onnx import OnnxConfig, OnnxConfigWithLoss -from ..utils.import_utils import _is_package_available if TYPE_CHECKING: @@ -91,9 +91,11 @@ def is_onnxruntime_training_available(): def is_cupy_available(): """ - Checks if onnxruntime-training is available. + Checks if CuPy is available. 
""" - return _is_package_available("cupy") + # Don't use _is_package_available as it doesn't work with CuPy installed + # with `cupy-cuda*` and `cupy-rocm-*` package name (prebuilt wheels). + return importlib.util.find_spec("cupy") is not None class ORTConfigManager: From ca36fc4f66577cd4ac2e6cedcc204d830a1f4985 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 16 Sep 2024 11:08:34 +0200 Subject: [PATCH 14/73] Adding `ORTPipelineForxxx` entrypoints (#1960) * created auto task mappings * added correct auto classes * created auto task mappings * added correct auto classes * added ort/auto diffusion classes * fix ORTPipeline detection * start test refactoring * dynamic dtype * support torch random numbers generator * compact diffusion testing suite * fix * test * test * test * use latent-consistency architecture name instead of lcm * fix * add ort diffusion pipeline tests * added dummy objects * remove duplicate code * support testing without diffusers * remove unnecessary * revert * style * remove model parts from optimum.onnxruntime --- optimum/exporters/tasks.py | 2 +- optimum/modeling_base.py | 9 +- optimum/onnxruntime/__init__.py | 16 + optimum/onnxruntime/base.py | 50 +- optimum/onnxruntime/modeling_diffusion.py | 338 ++++++-- optimum/onnxruntime/modeling_seq2seq.py | 68 -- .../diffusers/pipeline_latent_consistency.py | 6 +- .../diffusers/pipeline_stable_diffusion.py | 16 +- .../pipeline_stable_diffusion_img2img.py | 83 +- .../pipeline_stable_diffusion_inpaint.py | 22 +- .../diffusers/pipeline_stable_diffusion_xl.py | 20 +- .../pipeline_stable_diffusion_xl_img2img.py | 28 +- optimum/pipelines/diffusers/pipeline_utils.py | 8 +- optimum/utils/dummy_diffusers_objects.py | 44 + tests/exporters/exporters_utils.py | 2 +- tests/onnxruntime/test_diffusion.py | 793 ++++++++++++++++++ tests/onnxruntime/test_modeling.py | 47 +- .../test_stable_diffusion_pipeline.py | 562 ------------- 18 files changed, 1287 
insertions(+), 827 deletions(-) create mode 100644 tests/onnxruntime/test_diffusion.py delete mode 100644 tests/onnxruntime/test_stable_diffusion_pipeline.py diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 97053040879..a489f34fb06 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -308,9 +308,9 @@ class TasksManager: "image-feature-extraction": "feature-extraction", # for backward compatibility and testing (where # model task and model type are still the same) - "lcm": "text-to-image", "stable-diffusion": "text-to-image", "stable-diffusion-xl": "text-to-image", + "latent-consistency": "text-to-image", } _CUSTOM_CLASSES = { diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 5bab0622de4..3da2d9d0d21 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -85,7 +85,6 @@ class PreTrainedModel(ABC): # noqa: F811 class OptimizedModel(PreTrainedModel): config_class = AutoConfig - load_tf_weights = None base_model_prefix = "optimized_model" config_name = CONFIG_NAME @@ -378,10 +377,14 @@ def from_pretrained( ) model_id, revision = model_id.split("@") - library_name = TasksManager.infer_library_from_model(model_id, subfolder, revision, cache_dir, token=token) + library_name = TasksManager.infer_library_from_model( + model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) if library_name == "timm": - config = PretrainedConfig.from_pretrained(model_id, subfolder, revision) + config = PretrainedConfig.from_pretrained( + model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + ) if config is None: if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME: diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index f1d4f63a9ff..09a48ec955c 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -79,6 +79,10 @@ "ORTStableDiffusionXLPipeline", 
"ORTStableDiffusionXLImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + "ORTPipelineForText2Image", + "ORTDiffusionPipeline", ] else: _import_structure["modeling_diffusion"] = [ @@ -88,6 +92,10 @@ "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + "ORTPipelineForText2Image", + "ORTDiffusionPipeline", ] @@ -137,7 +145,11 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( + ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, @@ -146,7 +158,11 @@ ) else: from .modeling_diffusion import ( + ORTDiffusionPipeline, ORTLatentConsistencyModelPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index d9877670ba8..0e54bafed78 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -41,17 +41,11 @@ class ORTModelPart: _prepare_onnx_inputs = ORTModel._prepare_onnx_inputs _prepare_onnx_outputs = ORTModel._prepare_onnx_outputs - def __init__( - self, - session: InferenceSession, - parent_model: "ORTModel", - ): + def __init__(self, session: InferenceSession, parent_model: "ORTModel"): self.session = session self.parent_model = parent_model - self.normalized_config = NormalizedConfigManager.get_normalized_config_class( - self.parent_model.config.model_type - )(self.parent_model.config) self.main_input_name = self.parent_model.main_input_name + self.input_names = {input_key.name: idx for idx, input_key in 
enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} self.input_dtypes = {input_key.name: input_key.type for input_key in session.get_inputs()} @@ -90,12 +84,18 @@ class ORTEncoder(ORTModelPart): Encoder part of the encoder-decoder model for ONNX Runtime inference. """ - def forward( - self, - input_ids: torch.LongTensor, - attention_mask: torch.LongTensor, - **kwargs, - ) -> BaseModelOutput: + def __init__(self, session: InferenceSession, parent_model: "ORTModel"): + super().__init__(session, parent_model) + + config = ( + self.parent_model.config.encoder + if hasattr(self.parent_model.config, "encoder") + else self.parent_model.config + ) + + self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) + + def forward(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor, **kwargs) -> BaseModelOutput: use_torch = isinstance(input_ids, torch.Tensor) self.parent_model.raise_on_numpy_input_io_binding(use_torch) @@ -138,6 +138,14 @@ def __init__( ): super().__init__(session, parent_model) + config = ( + self.parent_model.config.decoder + if hasattr(self.parent_model.config, "decoder") + else self.parent_model.config + ) + + self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) + # TODO: make this less hacky. 
self.key_value_input_names = [key for key in self.input_names if (".key" in key) or (".value" in key)] self.key_value_output_names = [key for key in self.output_names if (".key" in key) or (".value" in key)] @@ -153,11 +161,7 @@ def __init__( self.use_past_in_outputs = len(self.key_value_output_names) > 0 self.use_past_in_inputs = len(self.key_value_input_names) > 0 - self.use_fp16 = False - for inp in session.get_inputs(): - if "past_key_values" in inp.name and inp.type == "tensor(float16)": - self.use_fp16 = True - break + self.use_fp16 = self.dtype == torch.float16 # We may use ORTDecoderForSeq2Seq for vision-encoder-decoder models, where models as gpt2 # can be used but do not support KV caching for the cross-attention key/values, see: @@ -461,11 +465,3 @@ def prepare_inputs_for_merged( cache_position = cache_position.to(self.device) return use_cache_branch_tensor, past_key_values, cache_position - - -class ORTDecoder(ORTDecoderForSeq2Seq): - def __init__(self, *args, **kwargs): - logger.warning( - "The class `ORTDecoder` is deprecated and will be removed in optimum v1.15.0, please use `ORTDecoderForSeq2Seq` instead." 
- ) - super().__init__(*args, **kwargs) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 4bbfb2eda2a..18cd38c5f29 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -17,7 +17,7 @@ import os import shutil import warnings -from abc import abstractmethod +from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory from typing import Any, Dict, Optional, Union @@ -25,18 +25,28 @@ import numpy as np import torch from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + ConfigMixin, DDIMScheduler, + LatentConsistencyModelPipeline, LMSDiscreteScheduler, PNDMScheduler, + StableDiffusionImg2ImgPipeline, + StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLPipeline, ) from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available from huggingface_hub import snapshot_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils import validate_hf_hub_args from transformers import CLIPFeatureExtractor, CLIPTokenizer from transformers.file_utils import add_end_docstrings +from transformers.modeling_outputs import ModelOutput import onnxruntime as ort @@ -56,9 +66,10 @@ DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) +from .base import ORTModelPart +from .io_binding import TypeHelper from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel from .utils import ( - _ORT_TO_NP_TYPE, ONNX_WEIGHTS_NAME, get_provider_for_device, parse_device, @@ -69,23 +80,23 @@ logger = logging.getLogger(__name__) -class ORTStableDiffusionPipelineBase(ORTModel): - auto_model_class = StableDiffusionPipeline - main_input_name = "input_ids" - base_model_prefix = "onnx_model" +class 
ORTPipeline(ORTModel): + auto_model_class = None + model_type = "onnx_pipeline" + config_name = "model_index.json" sub_component_config_name = "config.json" def __init__( self, vae_decoder_session: ort.InferenceSession, - text_encoder_session: ort.InferenceSession, unet_session: ort.InferenceSession, - config: Dict[str, Any], tokenizer: CLIPTokenizer, + config: Dict[str, Any], scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], feature_extractor: Optional[CLIPFeatureExtractor] = None, vae_encoder_session: Optional[ort.InferenceSession] = None, + text_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_2_session: Optional[ort.InferenceSession] = None, tokenizer_2: Optional[CLIPTokenizer] = None, use_io_binding: Optional[bool] = None, @@ -94,23 +105,28 @@ def __init__( """ Args: vae_decoder_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the VAE decoder. - text_encoder_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the text encoder. + The ONNX Runtime inference session associated to the VAE decoder unet_session (`ort.InferenceSession`): The ONNX Runtime inference session associated to the U-NET. + tokenizer (`CLIPTokenizer`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + for the text encoder. config (`Dict[str, Any]`): A config dictionary from which the model components will be instantiated. Make sure to only load configuration files of compatible classes. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`): A scheduler to be used in combination with the U-NET component to denoise the encoded image latents. 
feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`): A model extracting features from generated images to be used as inputs for the `safety_checker` vae_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): The ONNX Runtime inference session associated to the VAE encoder. + text_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): + The ONNX Runtime inference session associated to the text encoder. + tokenizer_2 (`Optional[CLIPTokenizer]`, defaults to `None`): + Tokenizer of class + [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) + for the second text encoder. use_io_binding (`Optional[bool]`, defaults to `None`): Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to `True` if the device is CUDA, otherwise defaults to `False`. @@ -118,7 +134,7 @@ def __init__( The directory under which the model exported to ONNX was saved. 
""" self.shared_attributes_init( - vae_decoder_session, + model=vae_decoder_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, ) @@ -350,9 +366,9 @@ def _from_pretrained( text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - text_encoder_2_path=new_model_save_dir - / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER - / text_encoder_2_file_name, + text_encoder_2_path=( + new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name + ), provider=provider, session_options=session_options, provider_options=provider_options, @@ -399,7 +415,7 @@ def _from_transformers( provider_options: Optional[Dict[str, Any]] = None, use_io_binding: Optional[bool] = None, task: Optional[str] = None, - ) -> "ORTStableDiffusionPipeline": + ) -> "ORTPipeline": if use_auth_token is not None: warnings.warn( "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", @@ -480,131 +496,142 @@ def _save_config(self, save_directory): self.save_config(save_directory) -# TODO : Use ORTModelPart once IOBinding support is added -class _ORTDiffusionModelPart: - """ - For multi-file ONNX models, represents a part of the model. - It has its own `onnxruntime.InferenceSession`, and can perform a forward pass. 
- """ - +class ORTPipelinePart(ORTModelPart): CONFIG_NAME = "config.json" - def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): - self.session = session - self.parent_model = parent_model - self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} - self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): config_path = Path(session._model_path).parent / self.CONFIG_NAME - self.config = self.parent_model._dict_from_json_file(config_path) if config_path.is_file() else {} - self.input_dtype = {inputs.name: _ORT_TO_NP_TYPE[inputs.type] for inputs in self.session.get_inputs()} + + if config_path.is_file(): + # TODO: use FrozenDict + self.config = parent_model._dict_from_json_file(config_path) + else: + self.config = {} + + super().__init__(session, parent_model) @property - def device(self): - return self.parent_model.device + def input_dtype(self): + # for backward compatibility and diffusion mixins (will be standardized in the future) + return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()} - @abstractmethod - def forward(self, *args, **kwargs): - pass - def __call__(self, *args, **kwargs): - return self.forward(*args, **kwargs) +class ORTModelTextEncoder(ORTPipelinePart): + def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(input_ids, torch.Tensor) + model_inputs = {"input_ids": input_ids} -class ORTModelTextEncoder(_ORTDiffusionModelPart): - def forward(self, input_ids: np.ndarray): - onnx_inputs = { - "input_ids": input_ids, - } - outputs = self.session.run(None, onnx_inputs) - return outputs + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + 
return ModelOutput(**model_outputs) -class ORTModelUnet(_ORTDiffusionModelPart): - def __init__(self, session: ort.InferenceSession, parent_model: ORTModel): - super().__init__(session, parent_model) +class ORTModelUnet(ORTPipelinePart): def forward( self, - sample: np.ndarray, - timestep: np.ndarray, - encoder_hidden_states: np.ndarray, - text_embeds: Optional[np.ndarray] = None, - time_ids: Optional[np.ndarray] = None, - timestep_cond: Optional[np.ndarray] = None, + sample: Union[np.ndarray, torch.Tensor], + timestep: Union[np.ndarray, torch.Tensor], + encoder_hidden_states: Union[np.ndarray, torch.Tensor], + text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, + time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, + timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, ): - onnx_inputs = { + use_torch = isinstance(sample, torch.Tensor) + + model_inputs = { "sample": sample, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, + "text_embeds": text_embeds, + "time_ids": time_ids, + "timestep_cond": timestep_cond, } - if text_embeds is not None: - onnx_inputs["text_embeds"] = text_embeds - if time_ids is not None: - onnx_inputs["time_ids"] = time_ids - if timestep_cond is not None: - onnx_inputs["timestep_cond"] = timestep_cond - outputs = self.session.run(None, onnx_inputs) - return outputs + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + return ModelOutput(**model_outputs) -class ORTModelVaeDecoder(_ORTDiffusionModelPart): - def forward(self, latent_sample: np.ndarray): - onnx_inputs = { - "latent_sample": latent_sample, - } - outputs = self.session.run(None, onnx_inputs) - return outputs +class ORTModelVaeDecoder(ORTPipelinePart): + def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(latent_sample, torch.Tensor) -class 
ORTModelVaeEncoder(_ORTDiffusionModelPart): - def forward(self, sample: np.ndarray): - onnx_inputs = { - "sample": sample, - } - outputs = self.session.run(None, onnx_inputs) - return outputs + model_inputs = {"latent_sample": latent_sample} + + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + return ModelOutput(**model_outputs) + + +class ORTModelVaeEncoder(ORTPipelinePart): + def forward(self, sample: Union[np.ndarray, torch.Tensor]): + use_torch = isinstance(sample, torch.Tensor) + + model_inputs = {"sample": sample} + + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + + return ModelOutput(**model_outputs) @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTStableDiffusionPipelineBase, StableDiffusionPipelineMixin): +class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). """ + main_input_name = "prompt" + auto_model_class = StableDiffusionPipeline + __call__ = StableDiffusionPipelineMixin.__call__ @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTStableDiffusionPipelineBase, StableDiffusionImg2ImgPipelineMixin): +class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). 
""" + main_input_name = "prompt" + auto_model_class = StableDiffusionImg2ImgPipeline + __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTStableDiffusionPipelineBase, StableDiffusionInpaintPipelineMixin): +class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ + main_input_name = "prompt" + auto_model_class = StableDiffusionInpaintPipeline + __call__ = StableDiffusionInpaintPipelineMixin.__call__ @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTStableDiffusionPipelineBase, LatentConsistencyPipelineMixin): +class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMixin): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). 
""" - __call__ = LatentConsistencyPipelineMixin.__call__ + main_input_name = "prompt" + auto_model_class = LatentConsistencyModelPipeline + __call__ = LatentConsistencyPipelineMixin.__call__ -class ORTStableDiffusionXLPipelineBase(ORTStableDiffusionPipelineBase): - auto_model_class = StableDiffusionXLImg2ImgPipeline +class ORTStableDiffusionXLPipelineBase(ORTPipeline): def __init__( self, vae_decoder_session: ort.InferenceSession, @@ -657,6 +684,9 @@ class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffu ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). """ + main_input_name = "prompt" + auto_model_class = StableDiffusionXLPipeline + __call__ = StableDiffusionXLPipelineMixin.__call__ @@ -666,4 +696,140 @@ class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, Stab ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). 
""" + main_input_name = "prompt" + auto_model_class = StableDiffusionXLImg2ImgPipeline + __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + + +SUPPORTED_ORT_PIPELINES = [ + ORTStableDiffusionPipeline, + ORTStableDiffusionImg2ImgPipeline, + ORTStableDiffusionInpaintPipeline, + ORTLatentConsistencyModelPipeline, + ORTStableDiffusionXLPipeline, + ORTStableDiffusionXLImg2ImgPipeline, +] + + +def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): + for ort_pipeline_class in SUPPORTED_ORT_PIPELINES: + if ( + ort_pipeline_class.__name__ == pipeline_class_name + or ort_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): + return ort_pipeline_class + + if throw_error_if_not_exist: + raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}") + + +class ORTDiffusionPipeline(ConfigMixin): + config_name = "model_index.json" + + @classmethod + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + load_config_kwargs = { + "force_download": kwargs.get("force_download", False), + "resume_download": kwargs.get("resume_download", None), + "local_files_only": kwargs.get("local_files_only", False), + "cache_dir": kwargs.get("cache_dir", None), + "revision": kwargs.get("revision", None), + "proxies": kwargs.get("proxies", None), + "token": kwargs.get("token", None), + } + + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + config = config[0] if isinstance(config, tuple) else config + class_name = config["_class_name"] + + ort_pipeline_class = _get_pipeline_class(class_name) + + return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) + + +ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLPipeline), + ("latent-consistency", ORTLatentConsistencyModelPipeline), + ] +) + +ORT_IMAGE2IMAGE_PIPELINES_MAPPING = 
OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), + ] +) + +ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( + [ + ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ] +) + +SUPPORTED_ORT_PIPELINES_MAPPINGS = [ + ORT_TEXT2IMAGE_PIPELINES_MAPPING, + ORT_IMAGE2IMAGE_PIPELINES_MAPPING, + ORT_INPAINT_PIPELINES_MAPPING, +] + + +def _get_task_class(mapping, pipeline_class_name): + def _get_model_name(pipeline_class_name): + for ort_pipelines_mapping in SUPPORTED_ORT_PIPELINES_MAPPINGS: + for model_name, ort_pipeline_class in ort_pipelines_mapping.items(): + if ( + ort_pipeline_class.__name__ == pipeline_class_name + or ort_pipeline_class.auto_model_class.__name__ == pipeline_class_name + ): + return model_name + + model_name = _get_model_name(pipeline_class_name) + + if model_name is not None: + task_class = mapping.get(model_name, None) + if task_class is not None: + return task_class + + raise ValueError(f"ORTPipelineForTask can't find a pipeline linked to {pipeline_class_name} for {model_name}") + + +class ORTPipelineForTask(ConfigMixin): + config_name = "model_index.json" + + @classmethod + def from_pretrained(cls, pretrained_model_or_path, **kwargs): + load_config_kwargs = { + "force_download": kwargs.get("force_download", False), + "resume_download": kwargs.get("resume_download", None), + "local_files_only": kwargs.get("local_files_only", False), + "cache_dir": kwargs.get("cache_dir", None), + "revision": kwargs.get("revision", None), + "proxies": kwargs.get("proxies", None), + "token": kwargs.get("token", None), + } + config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) + config = config[0] if isinstance(config, tuple) else config + class_name = config["_class_name"] + + ort_pipeline_class = _get_task_class(cls.ort_pipelines_mapping, class_name) + + return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) + + +class 
ORTPipelineForText2Image(ORTPipelineForTask): + auto_model_class = AutoPipelineForText2Image + ort_pipelines_mapping = ORT_TEXT2IMAGE_PIPELINES_MAPPING + + +class ORTPipelineForImage2Image(ORTPipelineForTask): + auto_model_class = AutoPipelineForImage2Image + ort_pipelines_mapping = ORT_IMAGE2IMAGE_PIPELINES_MAPPING + + +class ORTPipelineForInpainting(ORTPipelineForTask): + auto_model_class = AutoPipelineForInpainting + ort_pipelines_mapping = ORT_INPAINT_PIPELINES_MAPPING diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 4ce3e4707ed..3cecadafe3e 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -46,7 +46,6 @@ from ..onnx.utils import _get_external_data_paths from ..utils import check_if_transformers_greater from ..utils.file_utils import validate_file_exists -from ..utils.normalized_config import NormalizedConfigManager from ..utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors from .base import ORTDecoderForSeq2Seq, ORTEncoder from .constants import ( @@ -72,16 +71,6 @@ from transformers.generation_utils import GenerationMixin -# if check_if_transformers_greater("4.37.0"): -# # starting from transformers v4.37.0, the whisper generation loop is implemented in the `WhisperGenerationMixin` -# # and it implements many new features including short and long form generation, and starts with 2 init tokens -# from transformers.models.whisper.generation_whisper import WhisperGenerationMixin -# else: - -# class WhisperGenerationMixin(WhisperForConditionalGeneration, GenerationMixin): -# pass - - if check_if_transformers_greater("4.43.0"): from transformers.cache_utils import EncoderDecoderCache else: @@ -1165,49 +1154,6 @@ class ORTModelForSeq2SeqLM(ORTModelForConditionalGeneration, GenerationMixin): auto_model_class = AutoModelForSeq2SeqLM main_input_name = "input_ids" - def __init__( - self, - encoder_session: ort.InferenceSession, - 
decoder_session: ort.InferenceSession, - config: "PretrainedConfig", - onnx_paths: List[str], - decoder_with_past_session: Optional[ort.InferenceSession] = None, - use_cache: bool = True, - use_io_binding: Optional[bool] = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - preprocessors: Optional[List] = None, - generation_config: Optional[GenerationConfig] = None, - **kwargs, - ): - super().__init__( - encoder_session, - decoder_session, - config, - onnx_paths, - decoder_with_past_session, - use_cache, - use_io_binding, - model_save_dir, - preprocessors, - generation_config, - **kwargs, - ) - - # The normalized_config initialization in ORTModelPart is unfortunately wrong as the top level config is initialized. - if config.model_type == "encoder-decoder": - self.encoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.encoder.model_type - )(config.encoder) - - self.decoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - - if self.decoder_with_past is not None: - self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: return ORTEncoder(session, self) @@ -1521,20 +1467,6 @@ def __init__( **kwargs, ) - # The normalized_config initialization in ORTModelPart is unfortunately wrong as the top level config is initialized. 
- self.encoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.encoder.model_type - )(config.encoder) - - self.decoder.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - - if self.decoder_with_past is not None: - self.decoder_with_past.normalized_config = NormalizedConfigManager.get_normalized_config_class( - config.decoder.model_type - )(config.decoder) - def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: return ORTEncoderForVisionEncoderDecoder(session, self) diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py index 41c85b5b6ac..630d463de73 100644 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ b/optimum/pipelines/diffusers/pipeline_latent_consistency.py @@ -36,7 +36,7 @@ def __call__( original_inference_steps: int = None, guidance_scale: float = 8.5, num_images_per_prompt: int = 1, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, output_type: str = "pil", @@ -66,7 +66,7 @@ def __call__( usually at the expense of lower image quality. num_images_per_prompt (`int`, defaults to 1): The number of images to generate per prompt. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): A np.random.RandomState to make generation deterministic. 
latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -121,7 +121,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() prompt_embeds = self._encode_prompt( prompt, diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py index 98bff0de44d..6cc47fab1b9 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py @@ -189,7 +189,15 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype ) if latents is None: - latents = generator.randn(*shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*shape).astype(dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) elif latents.shape != shape: raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") @@ -209,7 +217,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -248,7 +256,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. 
- generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): A np.random.RandomState to make generation deterministic. latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -303,7 +311,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py index 81a6ffa1e04..a66035a789b 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py @@ -16,10 +16,9 @@ from typing import Callable, List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import deprecate from .pipeline_stable_diffusion import StableDiffusionPipelineMixin @@ -72,6 +71,43 @@ def check_inputs( f" {negative_prompt_embeds.shape}." 
) + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + else: + init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = np.concatenate([init_latents], axis=0) + + # add noise to latents using the timesteps + if isinstance(generator, np.random.RandomState): + noise = generator.randn(*init_latents.shape).astype(dtype) + elif isinstance(generator, torch.Generator): + noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." 
+ ) + + init_latents = self.scheduler.add_noise( + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) + ).numpy() + + return init_latents + # Adapted from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.__call__ def __call__( self, @@ -83,7 +119,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, output_type: str = "pil", @@ -125,7 +161,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): A np.random.RandomState to make generation deterministic. prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. 
If not @@ -168,7 +204,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -191,31 +227,7 @@ def __call__( latents_dtype = prompt_embeds.dtype image = image.astype(latents_dtype) - # encode the init image into latents and scale the latents - init_latents = self.vae_encoder(sample=image)[0] - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - init_latents = scaling_factor * init_latents - - if isinstance(prompt, str): - prompt = [prompt] - if len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] == 0: - # expand init_latents for batch_size - deprecation_message = ( - f"You have passed {len(prompt)} text prompts (`prompt`), but only {init_latents.shape[0]} initial" - " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" - " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" - " your script to pass as many initial images as text prompts to suppress this warning." - ) - deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) - additional_image_per_prompt = len(prompt) // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt * num_images_per_prompt, axis=0) - elif len(prompt) > init_latents.shape[0] and len(prompt) % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {len(prompt)} text prompts." 
- ) - else: - init_latents = np.concatenate([init_latents] * num_images_per_prompt, axis=0) # get the original timestep using init_timestep offset = self.scheduler.config.get("steps_offset", 0) @@ -225,12 +237,8 @@ def __call__( timesteps = self.scheduler.timesteps.numpy()[-init_timestep] timesteps = np.array([timesteps] * batch_size * num_images_per_prompt) - # add noise to latents using the timesteps - noise = generator.randn(*init_latents.shape).astype(latents_dtype) - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ) - init_latents = init_latents.numpy() + # 5. Prepare latent variables + latents = self.prepare_latents(image, timesteps, batch_size, num_images_per_prompt, latents_dtype, generator) # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. @@ -241,8 +249,6 @@ def __call__( if accepts_eta: extra_step_kwargs["eta"] = eta - latents = init_latents - t_start = max(num_inference_steps - init_timestep + offset, 0) timesteps = self.scheduler.timesteps[t_start:].numpy() @@ -276,7 +282,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py index 19de793ccd0..cb3c7db96e9 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py @@ -16,7 +16,7 @@ from typing import Callable, List, Optional, Union import numpy as np -import 
PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput from diffusers.utils import PIL_INTERPOLATION @@ -108,7 +108,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -200,7 +200,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # set timesteps self.scheduler.set_timesteps(num_inference_steps) @@ -229,11 +229,19 @@ def __call__( width // self.vae_scale_factor, ) latents_dtype = prompt_embeds.dtype + if latents is None: - latents = generator.randn(*latents_shape).astype(latents_dtype) - else: - if latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*latents_shape).astype(latents_dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*latents_shape, generator=generator).numpy().astype(latents_dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." 
+ ) + elif latents.shape != latents_shape: + raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") # prepare mask and masked_image mask, masked_image = prepare_mask_and_masked_image( diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py index 2a5e7bf78b0..0407c16a77a 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py @@ -235,7 +235,15 @@ def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype ) if latents is None: - latents = generator.randn(*shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + latents = generator.randn(*shape).astype(dtype) + elif isinstance(generator, torch.Generator): + latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." + ) elif latents.shape != shape: raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") @@ -270,7 +278,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -315,7 +323,7 @@ def __call__( eta (`float`, defaults to 0.0): Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to [`schedulers.DDIMScheduler`], will be ignored for others. 
- generator (`Optional[np.random.RandomState]`, defaults to `None`):: + generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: A np.random.RandomState to make generation deterministic. latents (`Optional[np.ndarray]`, defaults to `None`): Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image @@ -383,7 +391,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` @@ -440,6 +448,7 @@ def __call__( timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order for i, t in enumerate(self.progress_bar(timesteps)): # expand the latents if we are doing classifier free guidance @@ -475,7 +484,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py index a07903a735e..19988599b64 100644 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py @@ -17,7 +17,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput @@ -222,7 +222,7 @@ def 
get_timesteps(self, num_inference_steps, strength): return timesteps, num_inference_steps - t_start # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, generator=None): + def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): batch_size = batch_size * num_images_per_prompt if image.shape[1] == 4: @@ -242,11 +242,22 @@ def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dt init_latents = np.concatenate([init_latents], axis=0) # add noise to latents using the timesteps - noise = generator.randn(*init_latents.shape).astype(dtype) + if isinstance(generator, np.random.RandomState): + noise = generator.randn(*init_latents.shape).astype(dtype) + elif isinstance(generator, torch.Generator): + noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) + else: + raise ValueError( + f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" + f" {type(generator)}." 
+ ) + init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timestep) + torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) ) - return init_latents.numpy() + init_latents = init_latents.numpy() + + return init_latents def _get_add_time_ids( self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype @@ -274,7 +285,7 @@ def __call__( negative_prompt: Optional[Union[str, List[str]]] = None, num_images_per_prompt: int = 1, eta: float = 0.0, - generator: Optional[np.random.RandomState] = None, + generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, latents: Optional[np.ndarray] = None, prompt_embeds: Optional[np.ndarray] = None, negative_prompt_embeds: Optional[np.ndarray] = None, @@ -375,7 +386,7 @@ def __call__( batch_size = prompt_embeds.shape[0] if generator is None: - generator = np.random + generator = np.random.RandomState() # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` @@ -482,7 +493,8 @@ def __call__( # call the callback, if provided if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): if callback is not None and i % callback_steps == 0: - callback(i, t, latents) + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) if output_type == "latent": image = latents diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py index 869b91ffe59..e9d5986b61c 100644 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ b/optimum/pipelines/diffusers/pipeline_utils.py @@ -17,7 +17,7 @@ from typing import List, Optional, Union import numpy as np -import PIL +import PIL.Image import torch from diffusers import ConfigMixin from diffusers.image_processor import VaeImageProcessor as DiffusersVaeImageProcessor @@ -206,7 +206,7 @@ def postprocess( def get_height_width( self, - image: [PIL.Image.Image, np.ndarray], + image: Union[PIL.Image.Image, np.ndarray], height: Optional[int] = None, width: Optional[int] = None, ): @@ -264,10 +264,10 @@ def reshape(images: np.ndarray) -> np.ndarray: # TODO : remove after diffusers v0.21.0 release def resize( self, - image: [PIL.Image.Image, np.ndarray, torch.Tensor], + image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], height: Optional[int] = None, width: Optional[int] = None, - ) -> [PIL.Image.Image, np.ndarray, torch.Tensor]: + ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: """ Resize image. 
""" diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py index f6914bbcd3a..35d1ffe9fc7 100644 --- a/optimum/utils/dummy_diffusers_objects.py +++ b/optimum/utils/dummy_diffusers_objects.py @@ -79,3 +79,47 @@ def __init__(self, *args, **kwargs): @classmethod def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) + + +class ORTDiffusionPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForText2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForImage2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForInpainting(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index a55c7a124df..c8a33b0be35 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -298,7 +298,7 @@ PYTORCH_DIFFUSION_MODEL = { "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", - "lcm": "echarlaix/tiny-random-latent-consistency", + "latent-consistency": "echarlaix/tiny-random-latent-consistency", } 
PYTORCH_TIMM_MODEL = { diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py new file mode 100644 index 00000000000..9f480b2d1a0 --- /dev/null +++ b/tests/onnxruntime/test_diffusion.py @@ -0,0 +1,793 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import numpy as np +import PIL +import pytest +import torch +from diffusers import ( + AutoPipelineForImage2Image, + AutoPipelineForInpainting, + AutoPipelineForText2Image, + DiffusionPipeline, +) +from diffusers.utils import load_image +from parameterized import parameterized +from transformers.testing_utils import require_torch_gpu +from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin + +from optimum.onnxruntime import ( + ORTDiffusionPipeline, + ORTPipelineForImage2Image, + ORTPipelineForInpainting, + ORTPipelineForText2Image, +) +from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor +from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm + + +def get_generator(framework, seed): + if framework == "np": + return np.random.RandomState(seed) + elif framework == "pt": + return torch.Generator().manual_seed(seed) + else: + raise ValueError(f"Unknown framework: {framework}") + + +def _generate_prompts(batch_size=1): + inputs = { + "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, + 
"num_inference_steps": 3, + "guidance_scale": 7.5, + "output_type": "np", + } + return inputs + + +def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type="pil"): + if input_type == "pil": + image = load_image( + "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" + "/in_paint/overture-creations-5sI6fQgYIuo.png" + ).resize((width, height)) + elif input_type == "np": + image = np.random.rand(height, width, channel) + elif input_type == "pt": + image = torch.rand((channel, height, width)) + + return [image] * batch_size + + +def to_np(image): + if isinstance(image[0], PIL.Image.Image): + return np.stack([np.array(i) for i in image], axis=0) + elif isinstance(image, torch.Tensor): + return image.cpu().numpy().transpose(0, 2, 3, 1) + return image + + +class ORTPipelineForText2ImageTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = ["latent-consistency", "stable-diffusion", "stable-diffusion-xl"] + + ORTMODEL_CLASS = ORTPipelineForText2Image + AUTOMODEL_CLASS = AutoPipelineForText2Image + + TASK = "text-to-image" + + def generate_inputs(self, height=128, width=128, batch_size=1): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["height"] = height + inputs["width"] = width + + return inputs + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn( + f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) + ) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_ort_pipeline_class_dispatch(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ort_pipeline = 
self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) + + self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_num_images_per_prompt(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + self.assertEqual(pipeline.vae_scale_factor, 2) + self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) + self.assertEqual(pipeline.unet.config["in_channels"], 4) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + for num_images in [1, 3]: + outputs = pipeline(**inputs, num_images_per_prompt=num_images).images + self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_compare_to_diffusers_pipeline(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 128, 128, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + if model_arch == "latent-consistency": + # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step + # TODO: Investigate why this is the case + inputs["num_inference_steps"] = 1 + + for output_type in ["latent", "np"]: + 
inputs["output_type"] = output_type + + ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images + + self.assertTrue( + np.allclose(ort_output, diffusers_output, atol=1e-4), + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), + ) + self.assertEqual(ort_pipeline.device, diffusers_pipeline.device) + + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) + ) + @require_torch_gpu + @pytest.mark.cuda_ep_test + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + @require_diffusers + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + height, width, batch_size = 64, 32, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs).images + # Verify model devices + 
self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_callback(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 64, 128, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + class Callback: + def __init__(self): + self.has_been_called = False + self.number_of_steps = 0 + + def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + self.has_been_called = True + self.number_of_steps += 1 + + ort_callback = Callback() + auto_callback = Callback() + + ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + # callback_steps=1 to trigger callback every step + ort_pipe(**inputs, callback=ort_callback, callback_steps=1) + auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + self.assertTrue(ort_callback.has_been_called) + self.assertTrue(auto_callback.has_been_called) + self.assertEqual(auto_callback.number_of_steps, ort_callback.number_of_steps) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_shape(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + height, width, batch_size = 128, 64, 1 + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + for output_type in ["np", "pil", "latent"]: + inputs["output_type"] = output_type + outputs = pipeline(**inputs).images + if output_type == "pil": + self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, 
width)) + elif output_type == "np": + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + else: + self.assertEqual( + outputs.shape, + (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_image_reproducibility(self, model_arch: str): + if model_arch in ["latent-consistency"]: + pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs") + + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + for generator_framework in ["np", "pt"]: + ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) + + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_negative_prompt(self, model_arch: str): + if model_arch in ["latent-consistency"]: + pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") + + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + negative_prompt = ["This is a negative prompt"] + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + image_slice_1 = pipeline( + **inputs, negative_prompt=negative_prompt, generator=np.random.RandomState(SEED) + ).images[0, 
-3:, -3:, -1] + prompt = inputs.pop("prompt") + + if model_arch == "stable-diffusion-xl": + ( + inputs["prompt_embeds"], + inputs["negative_prompt_embeds"], + inputs["pooled_prompt_embeds"], + inputs["negative_pooled_prompt_embeds"], + ) = pipeline._encode_prompt(prompt, 1, False, negative_prompt) + else: + text_ids = pipeline.tokenizer( + prompt, + max_length=pipeline.tokenizer.model_max_length, + padding="max_length", + return_tensors="np", + truncation=True, + ).input_ids + negative_text_ids = pipeline.tokenizer( + negative_prompt, + max_length=pipeline.tokenizer.model_max_length, + padding="max_length", + return_tensors="np", + truncation=True, + ).input_ids + inputs["prompt_embeds"] = pipeline.text_encoder(text_ids)[0] + inputs["negative_prompt_embeds"] = pipeline.text_encoder(negative_text_ids)[0] + + image_slice_2 = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1] + + self.assertTrue(np.allclose(image_slice_1, image_slice_2, rtol=1e-1)) + + +class ORTPipelineForImage2ImageTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + + AUTOMODEL_CLASS = AutoPipelineForImage2Image + ORTMODEL_CLASS = ORTPipelineForImage2Image + + TASK = "image-to-image" + + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="np"): + inputs = _generate_prompts(batch_size=batch_size) + + inputs["image"] = _generate_images( + height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type + ) + + inputs["strength"] = 0.75 + + return inputs + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn( + f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) + ) + + @parameterized.expand(list(SUPPORTED_ARCHITECTURES)) + @require_diffusers + def 
test_ort_pipeline_class_dispatch(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) + + # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_num_images_per_prompt(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + self.assertEqual(pipeline.vae_scale_factor, 2) + self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) + self.assertEqual(pipeline.unet.config["in_channels"], 4) + + batch_size, height = 1, 32 + for width in [64, 32]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + for num_images in [1, 3]: + outputs = pipeline(**inputs, num_images_per_prompt=num_images).images + self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) + ) + @require_torch_gpu + @pytest.mark.cuda_ep_test + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = 
self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + @require_diffusers + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_callback(self, model_arch: str): + if model_arch in ["stable-diffusion"]: + pytest.skip( + "Stable Diffusion For Img2Img doesn't behave as expected with callbacks (doesn't call it every step with callback_steps=1)" + ) + + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + inputs["num_inference_steps"] = 3 + + class Callback: + def __init__(self): + self.has_been_called = False + self.number_of_steps = 0 + + def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + self.has_been_called = True + 
self.number_of_steps += 1 + + ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + ort_callback = Callback() + auto_callback = Callback() + # callback_steps=1 to trigger callback every step + ort_pipe(**inputs, callback=ort_callback, callback_steps=1) + auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + self.assertTrue(ort_callback.has_been_called) + self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_shape(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + height, width, batch_size = 32, 64, 1 + + for input_type in ["np", "pil", "pt"]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) + + for output_type in ["np", "pil", "latent"]: + inputs["output_type"] = output_type + outputs = pipeline(**inputs).images + if output_type == "pil": + self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) + elif output_type == "np": + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + else: + self.assertEqual( + outputs.shape, + (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_compare_to_diffusers_pipeline(self, model_arch: str): + pytest.skip("Img2Img models do not support support output reproducibility for some reason") + + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 128, 128, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_pipeline = 
self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images + + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images + + self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_image_reproducibility(self, model_arch: str): + pytest.skip("Img2Img models do not support support output reproducibility for some reason") + + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + for generator_framework in ["np", "pt"]: + ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) + + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + +class ORTPipelineForInpaintingTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = ["stable-diffusion"] + + AUTOMODEL_CLASS = AutoPipelineForInpainting + ORTMODEL_CLASS = ORTPipelineForInpainting + + TASK = "inpainting" + + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): + assert batch_size == 1, "Inpainting models only support batch_size=1" + assert input_type == "pil", "Inpainting models only support input_type='pil'" + + inputs = _generate_prompts(batch_size=batch_size) + + inputs["image"] = 
_generate_images( + height=height, width=width, batch_size=1, channel=channel, input_type="pil" + )[0] + inputs["mask_image"] = _generate_images( + height=height, width=width, batch_size=1, channel=channel, input_type="pil" + )[0] + + inputs["height"] = height + inputs["width"] = width + + return inputs + + @require_diffusers + def test_load_vanilla_model_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn( + f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) + ) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_ort_pipeline_class_dispatch(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + auto_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) + # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) + + # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_num_images_per_prompt(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + self.assertEqual(pipeline.vae_scale_factor, 2) + self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) + self.assertEqual(pipeline.unet.config["in_channels"], 4) + + batch_size, height = 1, 32 + for width in [64, 32]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + for num_images 
in [1, 3]: + outputs = pipeline(**inputs, num_images_per_prompt=num_images).images + self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) + ) + @require_torch_gpu + @pytest.mark.cuda_ep_test + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand( + grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) + ) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + @require_diffusers + def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + outputs = pipeline(**inputs).images + # Verify model devices + self.assertEqual(pipeline.device.type.lower(), "cuda") + # Verify model outptus + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_callback(self, 
model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + inputs["num_inference_steps"] = 3 + + class Callback: + def __init__(self): + self.has_been_called = False + self.number_of_steps = 0 + + def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + self.has_been_called = True + self.number_of_steps += 1 + + ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + ort_callback = Callback() + auto_callback = Callback() + # callback_steps=1 to trigger callback every step + ort_pipe(**inputs, callback=ort_callback, callback_steps=1) + auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + self.assertTrue(ort_callback.has_been_called) + self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_shape(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + height, width, batch_size = 32, 64, 1 + + for input_type in ["pil"]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) + + for output_type in ["np", "pil", "latent"]: + inputs["output_type"] = output_type + outputs = pipeline(**inputs).images + if output_type == "pil": + self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) + elif output_type == "np": + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + else: + self.assertEqual( + outputs.shape, + (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ) + + 
@parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_compare_to_diffusers_pipeline(self, model_arch: str): + if model_arch in ["stable-diffusion"]: + pytest.skip( + "Stable Diffusion For Inpainting fails, it was used to be compared to StableDiffusionPipeline for some reason which is the text-to-image variant" + ) + + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + latents_shape = ( + batch_size, + ort_pipeline.vae_decoder.config["latent_channels"], + height // ort_pipeline.vae_scale_factor, + width // ort_pipeline.vae_scale_factor, + ) + + np_latents = np.random.rand(*latents_shape).astype(np.float32) + torch_latents = torch.from_numpy(np_latents) + + ort_output = ort_pipeline(**inputs, latents=np_latents).images + diffusers_output = diffusers_pipeline(**inputs, latents=torch_latents).images + + self.assertTrue( + np.allclose(ort_output, diffusers_output, atol=1e-4), + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), + ) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_diffusers + def test_image_reproducibility(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 64, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + for generator_framework in ["np", "pt"]: + ort_outputs_1 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) + ort_outputs_3 = 
pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) + + self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) + self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + + +class ImageProcessorTest(unittest.TestCase): + def test_vae_image_processor_pt(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_pt = torch.stack(_generate_images(height=8, width=8, batch_size=1, input_type="pt")) + input_np = to_np(input_pt) + + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) + out_np = to_np(out) + in_np = (input_np * 255).round() if output_type == "pil" else input_np + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + + def test_vae_image_processor_np(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_np = np.stack(_generate_images(height=8, width=8, input_type="np")) + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) + out_np = to_np(out) + in_np = (input_np * 255).round() if output_type == "pil" else input_np + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + + def test_vae_image_processor_pil(self): + image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) + input_pil = _generate_images(height=8, width=8, batch_size=1, input_type="pil") + + for output_type in ["np", "pil"]: + out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) + for i, o in zip(input_pil, out): + in_np = np.array(i) + out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() + self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 4b44acb38ab..199b96342e7 100644 --- a/tests/onnxruntime/test_modeling.py +++ 
b/tests/onnxruntime/test_modeling.py @@ -89,15 +89,8 @@ ORTModelForSpeechSeq2Seq, ORTModelForTokenClassification, ORTModelForVision2Seq, - ORTStableDiffusionPipeline, ) from optimum.onnxruntime.base import ORTDecoderForSeq2Seq, ORTEncoder -from optimum.onnxruntime.modeling_diffusion import ( - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, -) from optimum.onnxruntime.modeling_ort import ORTModel from optimum.pipelines import pipeline from optimum.utils import ( @@ -108,7 +101,24 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, logging, ) -from optimum.utils.testing_utils import grid_parameters, remove_directory, require_hf_token, require_ort_rocm +from optimum.utils.import_utils import is_diffusers_available +from optimum.utils.testing_utils import ( + grid_parameters, + remove_directory, + require_diffusers, + require_hf_token, + require_ort_rocm, +) + + +if is_diffusers_available(): + from optimum.onnxruntime.modeling_diffusion import ( + ORTModelTextEncoder, + ORTModelUnet, + ORTModelVaeDecoder, + ORTModelVaeEncoder, + ORTStableDiffusionPipeline, + ) logger = logging.get_logger() @@ -205,6 +215,7 @@ def test_load_seq2seq_model_from_empty_cache(self): with self.assertRaises(Exception): _ = ORTModelForSeq2SeqLM.from_pretrained(self.TINY_ONNX_SEQ2SEQ_MODEL_ID, local_files_only=True) + @require_diffusers def test_load_stable_diffusion_model_from_cache(self): _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching @@ -218,6 +229,7 @@ def test_load_stable_diffusion_model_from_cache(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + @require_diffusers def test_load_stable_diffusion_model_from_empty_cache(self): dirpath = os.path.join( default_cache_path, "models--" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID.replace("/", "--") @@ -300,6 +312,7 @@ def test_load_seq2seq_model_unknown_provider(self): with self.assertRaises(ValueError): 
ORTModelForSeq2SeqLM.from_pretrained(self.ONNX_SEQ2SEQ_MODEL_ID, provider="FooExecutionProvider") + @require_diffusers def test_load_stable_diffusion_model_from_hub(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) @@ -308,6 +321,7 @@ def test_load_stable_diffusion_model_from_hub(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_load_stable_diffusion_model_cuda_provider(self): @@ -321,6 +335,7 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test @@ -335,6 +350,7 @@ def test_load_stable_diffusion_model_rocm_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, provider="CPUExecutionProvider" @@ -346,6 +362,7 @@ def test_load_stable_diffusion_model_cpu_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cpu")) + @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): with self.assertRaises(ValueError): ORTStableDiffusionPipeline.from_pretrained( @@ -478,6 +495,7 @@ def test_passing_session_options_seq2seq(self): self.assertEqual(model.encoder.session.get_session_options().intra_op_num_threads, 3) self.assertEqual(model.decoder.session.get_session_options().intra_op_num_threads, 3) + @require_diffusers def 
test_passing_session_options_stable_diffusion(self): options = onnxruntime.SessionOptions() options.intra_op_num_threads = 3 @@ -772,6 +790,7 @@ def test_seq2seq_model_on_rocm_ep_str(self): self.assertEqual(model.decoder_with_past.session.get_providers()[0], "ROCMExecutionProvider") self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_passing_provider_options_stable_diffusion(self): @@ -810,6 +829,7 @@ def test_passing_provider_options_stable_diffusion(self): model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["do_copy_in_default_stream"], "0" ) + @require_diffusers def test_stable_diffusion_model_on_cpu(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) cpu = torch.device("cpu") @@ -825,7 +845,7 @@ def test_stable_diffusion_model_on_cpu(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) - # test string device input for to() + @require_diffusers def test_stable_diffusion_model_on_cpu_str(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) cpu = torch.device("cpu") @@ -841,6 +861,7 @@ def test_stable_diffusion_model_on_cpu_str(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CPUExecutionProvider") self.assertListEqual(model.providers, ["CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu(self): @@ -858,6 +879,7 @@ def test_stable_diffusion_model_on_gpu(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test @@ -876,6 +898,7 @@ def 
test_stable_diffusion_model_on_rocm_ep(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "ROCMExecutionProvider") self.assertListEqual(model.providers, ["ROCMExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @unittest.skipIf(get_gpu_count() <= 1, "this test requires multi-gpu") def test_stable_diffusion_model_on_gpu_id(self): model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) @@ -899,7 +922,7 @@ def test_stable_diffusion_model_on_gpu_id(self): self.assertEqual(model.vae_decoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") self.assertEqual(model.vae_encoder.session.get_provider_options()["CUDAExecutionProvider"]["device_id"], "1") - # test string device input for to() + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test def test_stable_diffusion_model_on_gpu_str(self): @@ -916,6 +939,7 @@ def test_stable_diffusion_model_on_gpu_str(self): self.assertEqual(model.vae_encoder.session.get_providers()[0], "CUDAExecutionProvider") self.assertListEqual(model.providers, ["CUDAExecutionProvider", "CPUExecutionProvider"]) + @require_diffusers @require_torch_gpu @require_ort_rocm @pytest.mark.rocm_ep_test @@ -975,6 +999,7 @@ def test_save_seq2seq_model_without_past(self): self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents) self.assertTrue(CONFIG_NAME in folder_contents) + @require_diffusers def test_save_stable_diffusion_model(self): with tempfile.TemporaryDirectory() as tmpdirname: model = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) @@ -1050,6 +1075,7 @@ def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool): os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") remove_directory(tmpdirname) + @require_diffusers def test_save_load_stable_diffusion_model_with_external_data(self): with tempfile.TemporaryDirectory() as tmpdirname: os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1" # force exporting 
small model with external data @@ -1180,6 +1206,7 @@ def test_push_seq2seq_model_with_external_data_to_hub(self): ) os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") + @require_diffusers @require_hf_token def test_push_stable_diffusion_model_with_external_data_to_hub(self): with tempfile.TemporaryDirectory() as tmpdirname: diff --git a/tests/onnxruntime/test_stable_diffusion_pipeline.py b/tests/onnxruntime/test_stable_diffusion_pipeline.py deleted file mode 100644 index 44cd22ffecc..00000000000 --- a/tests/onnxruntime/test_stable_diffusion_pipeline.py +++ /dev/null @@ -1,562 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import random -import unittest -from typing import Dict - -import numpy as np -import PIL -import pytest -import torch -from diffusers import ( - OnnxStableDiffusionImg2ImgPipeline, - StableDiffusionPipeline, - StableDiffusionXLPipeline, -) -from diffusers.utils import load_image -from diffusers.utils.testing_utils import floats_tensor -from packaging.version import Version, parse -from parameterized import parameterized -from transformers.testing_utils import require_torch_gpu -from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin - -from optimum.onnxruntime import ( - ORTLatentConsistencyModelPipeline, - ORTStableDiffusionImg2ImgPipeline, - ORTStableDiffusionInpaintPipeline, - ORTStableDiffusionPipeline, - ORTStableDiffusionXLImg2ImgPipeline, - ORTStableDiffusionXLPipeline, -) -from optimum.onnxruntime.modeling_diffusion import ( - ORTModelTextEncoder, - ORTModelUnet, - ORTModelVaeDecoder, - ORTModelVaeEncoder, -) -from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor -from optimum.utils.import_utils import _diffusers_version -from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm - - -if parse(_diffusers_version) > Version("0.21.4"): - from diffusers import LatentConsistencyModelPipeline - - -def _generate_inputs(batch_size=1): - inputs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 3, - "guidance_scale": 7.5, - "output_type": "np", - } - return inputs - - -def _create_image(height=128, width=128, batch_size=1, channel=3, input_type="pil"): - if input_type == "pil": - image = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ).resize((width, height)) - elif input_type == "np": - image = np.random.rand(height, width, channel) - elif input_type == "pt": - image = torch.rand((channel, height, width)) - - return [image] * batch_size - - 
-def to_np(image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image - - -class ORTStableDiffusionPipelineBase(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionPipeline - TASK = "text-to-image" - - @require_diffusers - def test_load_vanilla_model_which_is_not_supported(self): - with self.assertRaises(Exception) as context: - _ = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES["bert"], export=True) - - self.assertIn( - f"does not appear to have a file named {self.ORTMODEL_CLASS.config_name}", str(context.exception) - ) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_num_images_per_prompt(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], 
provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_callback(self, model_arch: str): - def callback_fn(step: int, timestep: int, latents: np.ndarray) -> None: - callback_fn.has_been_called = True - callback_fn.number_of_steps += 1 - - pipe = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - callback_fn.has_been_called = False - callback_fn.number_of_steps = 0 - inputs = self.generate_inputs(height=64, width=64) - pipe(**inputs, callback=callback_fn, callback_steps=1) - self.assertTrue(callback_fn.has_been_called) - self.assertEqual(callback_fn.number_of_steps, inputs["num_inference_steps"]) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - 
@require_diffusers - def test_shape(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - height, width, batch_size = 128, 64, 1 - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - - if self.TASK == "image-to-image": - input_types = ["np", "pil", "pt"] - elif self.TASK == "text-to-image": - input_types = ["np"] - else: - input_types = ["pil"] - - for input_type in input_types: - if self.TASK == "image-to-image": - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - else: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for output_type in ["np", "pil", "latent"]: - inputs["output_type"] = output_type - outputs = pipeline(**inputs).images - if output_type == "pil": - self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) - elif output_type == "np": - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - else: - self.assertEqual( - outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), - ) - - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = _generate_inputs(batch_size=batch_size) - inputs["height"] = height - inputs["width"] = width - return inputs - - -class ORTStableDiffusionImg2ImgPipelineTest(ORTStableDiffusionPipelineBase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionImg2ImgPipeline - TASK = "image-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_diffusers_pipeline(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - height, width = 128, 128 - - inputs = self.generate_inputs(height=height, width=width) - inputs["prompt"] = "A painting of a squirrel eating a burger" - inputs["image"] 
= floats_tensor((1, 3, height, width), rng=random.Random(SEED)) - - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - ort_output = ort_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - - diffusers_onnx_pipeline = OnnxStableDiffusionImg2ImgPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - diffusers_onnx_output = diffusers_onnx_pipeline(**inputs, generator=np.random.RandomState(SEED)).images - - self.assertTrue(np.allclose(ort_output, diffusers_onnx_output, atol=1e-1)) - - def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): - inputs = _generate_inputs(batch_size=batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, input_type=input_type) - inputs["strength"] = 0.75 - return inputs - - -class ORTStableDiffusionPipelineTest(unittest.TestCase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionPipeline - TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = StableDiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - pipeline.safety_checker = None - batch_size, num_images_per_prompt, height, width = 1, 2, 64, 32 - - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": "sailing ship in storm by Leonardo 
da Vinci", - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_rescale": 0.1, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - self.assertIsInstance(ort_outputs, np.ndarray) - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_image_reproducibility(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - height, width = 64, 32 - np.random.seed(0) - ort_outputs_1 = pipeline(**inputs, height=height, width=width) - np.random.seed(0) - ort_outputs_2 = pipeline(**inputs, height=height, width=width) - ort_outputs_3 = pipeline(**inputs, height=height, width=width) - # Compare model outputs - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) - self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_negative_prompt(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - inputs["height"], inputs["width"] = 64, 32 - negative_prompt = ["This is a negative prompt"] - np.random.seed(0) - image_slice_1 = pipeline(**inputs, negative_prompt=negative_prompt).images[0, -3:, -3:, -1] - prompt = inputs.pop("prompt") - embeds = [] - for p in [prompt, negative_prompt]: - text_inputs = pipeline.tokenizer( - p, - padding="max_length", - max_length=pipeline.tokenizer.model_max_length, - truncation=True, - 
return_tensors="np", - ) - text_inputs = text_inputs["input_ids"].astype(pipeline.text_encoder.input_dtype.get("input_ids", np.int32)) - embeds.append(pipeline.text_encoder(text_inputs)[0]) - - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = embeds - np.random.seed(0) - image_slice_2 = pipeline(**inputs).images[0, -3:, -3:, -1] - self.assertTrue(np.allclose(image_slice_1, image_slice_2, atol=1e-4)) - - -class ORTStableDiffusionXLPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion-xl", - ] - ORTMODEL_CLASS = ORTStableDiffusionXLPipeline - TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.text_encoder_2, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = StableDiffusionXLPipeline.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_rescale": 0.1, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ort_outputs, np.ndarray) - with 
torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_image_reproducibility(self, model_arch: str): - pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - inputs = _generate_inputs() - height, width = 64, 32 - np.random.seed(0) - ort_outputs_1 = pipeline(**inputs, height=height, width=width) - np.random.seed(0) - ort_outputs_2 = pipeline(**inputs, height=height, width=width) - ort_outputs_3 = pipeline(**inputs, height=height, width=width) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) - self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) - - -class ORTStableDiffusionInpaintPipelineTest(ORTStableDiffusionPipelineBase): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion", - ] - ORTMODEL_CLASS = ORTStableDiffusionInpaintPipeline - TASK = "inpainting" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_compare_diffusers_pipeline(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - diffusers_pipeline = self.ORTMODEL_CLASS.auto_model_class.from_pretrained(MODEL_NAMES[model_arch]) - height, width = 64, 64 - latents_shape = ( - 1, - ort_pipeline.vae_decoder.config["latent_channels"], - height // ort_pipeline.vae_scale_factor, - width // ort_pipeline.vae_scale_factor, - ) - inputs = self.generate_inputs(height=height, width=width) - - np_latents = np.random.rand(*latents_shape).astype(np.float32) - torch_latents = torch.from_numpy(np_latents) - - ort_outputs = 
ort_pipeline(**inputs, latents=np_latents).images - self.assertEqual(ort_outputs.shape, (1, height, width, 3)) - - diffusers_outputs = diffusers_pipeline(**inputs, latents=torch_latents).images - self.assertEqual(diffusers_outputs.shape, (1, height, width, 3)) - - self.assertTrue(np.allclose(ort_outputs, diffusers_outputs, atol=1e-4)) - - def generate_inputs(self, height=128, width=128, batch_size=1): - inputs = super(ORTStableDiffusionInpaintPipelineTest, self).generate_inputs(height, width) - inputs["image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] - inputs["mask_image"] = _create_image(height=height, width=width, batch_size=1, input_type="pil")[0] - return inputs - - -class ORTStableDiffusionXLImg2ImgPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "stable-diffusion-xl", - ] - ORTMODEL_CLASS = ORTStableDiffusionXLImg2ImgPipeline - TASK = "image-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - def test_inference(self, model_arch: str): - model_args = {"test_name": model_arch, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - - height, width = 128, 128 - inputs = self.generate_inputs(height=height, width=width) - inputs["image"] = load_image( - "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" - "/in_paint/overture-creations-5sI6fQgYIuo.png" - ).resize((width, height)) - output = pipeline(**inputs, generator=np.random.RandomState(0)).images[0, -3:, -3:, -1] - expected_slice = np.array([0.6515, 0.5405, 0.4858, 0.5632, 0.5174, 0.5681, 0.4948, 0.4253, 0.5080]) - - self.assertTrue(np.allclose(output.flatten(), expected_slice, atol=1e-1)) - - def generate_inputs(self, height=128, width=128, batch_size=1, input_type="np"): - inputs = _generate_inputs(batch_size=batch_size) - inputs["image"] = _create_image(height=height, width=width, batch_size=batch_size, 
input_type=input_type) - inputs["strength"] = 0.75 - return inputs - - -class ImageProcessorTest(unittest.TestCase): - def test_vae_image_processor_pt(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pt = torch.stack(_create_image(height=8, width=8, batch_size=1, input_type="pt")) - input_np = to_np(input_pt) - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_np = np.stack(_create_image(height=8, width=8, input_type="np")) - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pil = _create_image(height=8, width=8, batch_size=1, input_type="pil") - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - -class ORTLatentConsistencyModelPipelineTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = [ - "latent-consistency", - ] - ORTMODEL_CLASS = ORTLatentConsistencyModelPipeline - TASK = "text-to-image" - - @parameterized.expand(SUPPORTED_ARCHITECTURES) - @require_diffusers - @unittest.skipIf( - parse(_diffusers_version) <= Version("0.21.4"), - "not 
supported with this diffusers version, needs diffusers>=v0.22.0", - ) - def test_compare_to_diffusers(self, model_arch: str): - ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], export=True) - self.assertIsInstance(ort_pipeline.text_encoder, ORTModelTextEncoder) - self.assertIsInstance(ort_pipeline.vae_decoder, ORTModelVaeDecoder) - self.assertIsInstance(ort_pipeline.vae_encoder, ORTModelVaeEncoder) - self.assertIsInstance(ort_pipeline.unet, ORTModelUnet) - self.assertIsInstance(ort_pipeline.config, Dict) - - pipeline = LatentConsistencyModelPipeline.from_pretrained(MODEL_NAMES[model_arch]) - batch_size, num_images_per_prompt, height, width = 2, 2, 64, 32 - latents = ort_pipeline.prepare_latents( - batch_size * num_images_per_prompt, - ort_pipeline.unet.config["in_channels"], - height, - width, - dtype=np.float32, - generator=np.random.RandomState(0), - ) - - kwargs = { - "prompt": ["sailing ship in storm by Leonardo da Vinci"] * batch_size, - "num_inference_steps": 1, - "num_images_per_prompt": num_images_per_prompt, - "height": height, - "width": width, - "guidance_scale": 8.5, - } - - for output_type in ["latent", "np"]: - ort_outputs = ort_pipeline(latents=latents, output_type=output_type, **kwargs).images - self.assertIsInstance(ort_outputs, np.ndarray) - with torch.no_grad(): - outputs = pipeline(latents=torch.from_numpy(latents), output_type=output_type, **kwargs).images - - # Compare model outputs - self.assertTrue(np.allclose(ort_outputs, outputs, atol=1e-4)) - # Compare model devices - self.assertEqual(pipeline.device, ort_pipeline.device) From 2179d33a5a9539f065f3c80ca548a05859481688 Mon Sep 17 00:00:00 2001 From: yuanwu2017 Date: Wed, 18 Sep 2024 16:30:52 +0800 Subject: [PATCH 15/73] Disable the exllama on all non-cuda devices. (#2003) * Disable the exllama on all non-cuda devices. 1. Disable the exllama on all non-cuda devices. 2. Don't raise the error when running on non-cuda device. 
Signed-off-by: yuanwu * Refine the code Signed-off-by: yuanwu * Fix errors of make style Signed-off-by: yuanwu * Add hpu device Signed-off-by: yuanwu * Update optimum/gptq/constants.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Fix error of make style Signed-off-by: yuanwu --------- Signed-off-by: yuanwu Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- optimum/gptq/quantizer.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 902af87bbb0..949d4d260df 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -546,7 +546,7 @@ def tmp(_, input, output): if self.bits == 4: # device not on gpu - if device == torch.device("cpu") or (has_device_map and any(d in devices for d in ["cpu", "disk"])): + if device.type != "cuda" or (has_device_map and any(d in devices for d in ["cpu", "disk", "hpu"])): if not self.disable_exllama: logger.warning( "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. 
Setting `disable_exllama=True`" @@ -589,13 +589,14 @@ def post_init_model(self, model): The input model """ if self.bits == 4 and not self.disable_exllama: - if get_device(model) == torch.device("cpu") or ( - hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk"]) + if get_device(model).type != "cuda" or ( + hasattr(model, "hf_device_map") and any(d in model.hf_device_map for d in ["cpu", "disk", "hpu"]) ): - raise ValueError( - "Found modules on cpu/disk. Using Exllama or Exllamav2 backend requires all the modules to be on GPU." - "You can deactivate exllama backend by setting `disable_exllama=True` in the quantization config object" - ) + if not self.disable_exllama: + logger.warning( + "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting `disable_exllama=True`" + ) + self.disable_exllama = True class StoreAttr(object): pass From bf1befdf7076c12a904eddfef167bfeb3e4fa0f2 Mon Sep 17 00:00:00 2001 From: Longjie Zheng <32992656+zhenglongjiepheonix@users.noreply.github.com> Date: Wed, 18 Sep 2024 08:10:23 -0400 Subject: [PATCH 16/73] Add Parallel Cross Entropy (#2017) --- optimum/fx/parallelization/decomp.py | 2 +- .../op_registry/op_handlers.py | 35 +++- .../parallel_layers/__init__.py | 1 + .../parallelization/parallel_layers/loss.py | 163 ++++++++++++++++++ optimum/fx/parallelization/passes.py | 45 ++++- optimum/fx/parallelization/utils.py | 34 ++++ .../parallelization/test_tensor_parallel.py | 20 +-- 7 files changed, 280 insertions(+), 20 deletions(-) create mode 100644 optimum/fx/parallelization/parallel_layers/loss.py diff --git a/optimum/fx/parallelization/decomp.py b/optimum/fx/parallelization/decomp.py index 26258d451bf..5410818e929 100644 --- a/optimum/fx/parallelization/decomp.py +++ b/optimum/fx/parallelization/decomp.py @@ -197,7 +197,7 @@ def run(self, *args, **kwargs): def decompose_and_functionalize( graph_module: GraphModule, decomposition_table: 
Dict[torch._ops.OperatorBase, Callable] = core_aten_decompositions(), - leaf_function_targets: List[Callable] = [F.scaled_dot_product_attention], + leaf_function_targets: List[Callable] = [F.scaled_dot_product_attention, F.cross_entropy], ) -> Callable: """ API to decompose and functionalize a high-level graph module. diff --git a/optimum/fx/parallelization/op_registry/op_handlers.py b/optimum/fx/parallelization/op_registry/op_handlers.py index 56b8fc16bc0..4a9c55e3764 100644 --- a/optimum/fx/parallelization/op_registry/op_handlers.py +++ b/optimum/fx/parallelization/op_registry/op_handlers.py @@ -19,7 +19,7 @@ from torch.fx import Node from ..core import Config -from ..utils import is_activation, is_embedding, is_linear +from ..utils import is_activation, is_cross_entropy, is_cross_entropy_parallel_compatible, is_embedding, is_linear class Registry: @@ -334,7 +334,16 @@ def propagate(self) -> List[int]: ndim = arg.meta["val"].ndim slice_dim = (slice_dim + ndim) % ndim if slice_dim == axis: - # slice on the parallel axis is not allowed + # slice on the parallel axis is not allowed, except it's a nop + start, stop, step = 0, arg.meta["val"].shape[axis], 1 + if len(self.node.args) > 2: + start = self.node.args[2] + elif len(self.node.args) > 3: + stop = self.node.args[3] + elif len(self.node.args) > 4: + step = self.node.args[4] + if start == 0 and stop >= arg.meta["val"].shape[axis] and step == 1: + return [axis] return [] return [axis] @@ -404,12 +413,12 @@ def propagate(self) -> List[int]: if self.node.op in ["placeholder", "get_attr"]: return [None] elif self.node.op == "output": - for node in self.node.all_input_nodes: - # TODO: allow parallelized nodes in output, and append comm ops in graph tp all-gather - # parallelized output if intructed - if self.extract_axis(node) is not None: - return [] - return [None] + # does not care about if output is being parallelized right now, because if the output is loss, + # then it must be not parallelized as long as it 
comes from sharded cross entropy. + # TODO: append all-gather comm ops before all parallelized output nodes if instructed. + input_arg = self.node.all_input_nodes[0] + axis = self.extract_axis(input_arg) + return [axis] elif is_linear(self.node): input_arg = self.node.all_input_nodes[0] axis = self.extract_axis(input_arg) @@ -438,6 +447,16 @@ def propagate(self) -> List[int]: return [1, None] if self.config.enable_sequence_parallel else [None] else: return [] + elif is_cross_entropy(self.node): + logits = self.node.all_input_nodes[0] + axis = self.extract_axis(logits) + if axis is None or ( + is_cross_entropy_parallel_compatible(self.node) and axis == logits.meta["val"].ndim - 1 + ): + # for cross entropy, the input logits parallel axis can only be the last axis or None + return [None] + else: + return [] elif is_activation(self.node): return UnaryOpParallelAxisPropagateHandler(self.node, self.meta_key, self.config).propagate() diff --git a/optimum/fx/parallelization/parallel_layers/__init__.py b/optimum/fx/parallelization/parallel_layers/__init__.py index 9bfb13afdf6..474ae7f7eef 100644 --- a/optimum/fx/parallelization/parallel_layers/__init__.py +++ b/optimum/fx/parallelization/parallel_layers/__init__.py @@ -14,3 +14,4 @@ # limitations under the License. from .embedding import VocabParallelEmbedding from .linear import ColumnParallelLinear, RowParallelLinear +from .loss import VocabParallelCrossEntropyLoss, sharded_cross_entropy_wrapper_fn diff --git a/optimum/fx/parallelization/parallel_layers/loss.py b/optimum/fx/parallelization/parallel_layers/loss.py new file mode 100644 index 00000000000..0a11e33c08e --- /dev/null +++ b/optimum/fx/parallelization/parallel_layers/loss.py @@ -0,0 +1,163 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from functools import wraps +from typing import Optional + +import torch +import torch.distributed as dist +import torch.nn as nn + +from ..core import ParallelExecutionCtx + + +# Adapted from https://github.com/huggingface/nanotron/blob/main/src/nanotron/parallel/tensor_parallel/functional.py +class _ShardedCrossEntropy(torch.autograd.Function): + @staticmethod + def forward( + ctx, + sharded_logits: torch.Tensor, # (batch_size, length, sharded_hidden_size) + target: torch.Tensor, # (batch_size, length) + group: dist.ProcessGroup, + ): + # Maximum value along last dimension across all GPUs. + logits_max = torch.max(sharded_logits, dim=-1)[0] + dist.all_reduce(logits_max, op=dist.ReduceOp.MAX, group=group) + # Subtract the maximum value. + sharded_logits = sharded_logits - logits_max.unsqueeze(dim=-1) + + # Get the shard's indices + sharded_hidden_size = sharded_logits.shape[-1] + rank = dist.get_rank(group) + start_index = rank * sharded_hidden_size + end_index = start_index + sharded_hidden_size + + # Create a mask of valid ids (1 means it needs to be masked). + target_mask = (target < start_index) | (target >= end_index) + masked_target = target.clone() - start_index + masked_target[target_mask] = 0 + + # Get predicted-logits = logits[target]. + # For Simplicity, we convert logits to a 2-D tensor with size + # [*, shard-size] and target to a 1-D tensor of size [*]. 
+ logits_2d = sharded_logits.view(-1, sharded_hidden_size) + masked_target_1d = masked_target.view(-1) + arange_1d = torch.arange(start=0, end=logits_2d.shape[0], device=logits_2d.device) + predicted_logits_1d = logits_2d[arange_1d, masked_target_1d] + if predicted_logits_1d.is_contiguous(): + predicted_logits_1d = predicted_logits_1d.clone() + else: + predicted_logits_1d = predicted_logits_1d.contiguous() + predicted_logits = predicted_logits_1d.view_as(target) + predicted_logits[target_mask] = 0.0 + # All reduce is needed to get the chunks from other GPUs. + dist.all_reduce(predicted_logits, op=dist.ReduceOp.SUM, group=group) + + # Sum of exponential of logits along vocab dimension across all GPUs. + exp_logits = sharded_logits + torch.exp(sharded_logits, out=exp_logits) + sum_exp_logits = exp_logits.sum(dim=-1) + dist.all_reduce(sum_exp_logits, op=dist.ReduceOp.SUM, group=group) + + # Loss = log(sum(exp(logits))) - predicted-logit. + loss = torch.log(sum_exp_logits) - predicted_logits + + # Normalize and optionally smooth logits + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + + # Store softmax, target-mask and masked-target for backward pass. + ctx.save_for_backward(exp_logits, target_mask, masked_target_1d) + + return loss + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + # Retrieve tensors from the forward path. + softmax, target_mask, masked_target_1d = ctx.saved_tensors + + # All the inputs have softmax as their gradient. + grad_input = softmax + # For simplicity, work with the 2D gradient. + sharded_hidden_size = softmax.size()[-1] + grad_2d = grad_input.view(-1, sharded_hidden_size) + + # Add the gradient from matching classes. + arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_2d.device) + grad_2d[arange_1d, masked_target_1d] -= 1.0 - target_mask.view(-1).float() + + # Finally elementwise multiplication with the output gradients. 
+ grad_input.mul_(grad_output.unsqueeze(dim=-1)) + + return grad_input, None, None + + +def sharded_cross_entropy(sharded_logits: torch.Tensor, target: torch.Tensor, process_group: dist.ProcessGroup): + return _ShardedCrossEntropy.apply(sharded_logits, target, process_group) + + +def sharded_cross_entropy_wrapper_fn(process_group: dist.ProcessGroup): + @wraps(sharded_cross_entropy) + def wrapper( + sharded_logits: torch.Tensor, + target: torch.Tensor, + weight: Optional[torch.Tensor] = None, + size_average: Optional[bool] = None, + ignore_index: int = -100, + reduce: Optional[bool] = None, + reduction: str = "mean", + label_smoothing: float = 0.0, + ): + if weight is not None or ignore_index != -100 or label_smoothing != 0.0: + raise ValueError( + "Does not support weighted mode, index ignoring and label smoothing in current parallel cross entropy implementation." + ) + loss: torch.Tensor = sharded_cross_entropy(sharded_logits, target, process_group) + + if size_average is not None or reduce is not None: + size_average = True if size_average is None else size_average + reduce = True if reduce is None else reduce + + if size_average and reduce: + reduction = "mean" + elif reduce: + reduction = "sum" + else: + reduction = "none" + + if reduction == "mean": + return loss.mean() + elif reduction == "sum": + return loss.sum() + return loss + + return wrapper + + +class VocabParallelCrossEntropyLoss(nn.Module): + """ + Simple parallel cross entropy implementation which does not support weighted mode and label smoothing yet. 
+ """ + + def __init__(self, ctx: ParallelExecutionCtx, reduction: str = "mean") -> None: + super(VocabParallelCrossEntropyLoss, self).__init__() + self.process_group = ctx.tp_group + self.reduction = reduction + + def forward(self, sharded_logits: torch.Tensor, target: torch.Tensor): + loss: torch.Tensor = _ShardedCrossEntropy.apply(sharded_logits, target, self.process_group) + if self.reduction == "mean": + return loss.mean() + elif self.reduction == "sum": + return loss.sum() + return loss diff --git a/optimum/fx/parallelization/passes.py b/optimum/fx/parallelization/passes.py index 14b652fff73..90155263281 100644 --- a/optimum/fx/parallelization/passes.py +++ b/optimum/fx/parallelization/passes.py @@ -26,8 +26,15 @@ from .decomp import decompose_and_functionalize from .distributed import scatter from .op_registry import REGISTRY, FallbackParallelAxisPropagateHandler -from .parallel_layers import ColumnParallelLinear, RowParallelLinear, VocabParallelEmbedding +from .parallel_layers import ( + ColumnParallelLinear, + RowParallelLinear, + VocabParallelCrossEntropyLoss, + VocabParallelEmbedding, + sharded_cross_entropy_wrapper_fn, +) from .utils import ( + is_cross_entropy, is_embedding, is_linear, is_shape_consumer, @@ -273,6 +280,11 @@ def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Conf info["sequence_parallel"] = False self.place_marker_per_node(node, info) + elif is_cross_entropy(node): + axis_before = ParallelAxisSolverPass.get_stored_field_info(node.args[0], "parallel_axis") + if axis_before is not None: + self.place_marker_per_node(node, {"axis": "vocab"}) + return graph_module @@ -343,6 +355,35 @@ def handle_embedding(node: Node, ctx: ParallelExecutionCtx) -> None: layer_cache[key] = new_mod setattr(parent_mod, field, new_mod) + @staticmethod + def handle_cross_entropy(node: Node, ctx: ParallelExecutionCtx) -> None: + axis = ParallelLayerAnnotatePass.get_stored_field_info(node, field="axis") + if axis is None: + return + + 
assert axis in {"vocab"}, "Only support parallelization on vocab dim for now." + if node.op == "call_module": + graph_module = node.graph.owning_module + prefix_and_field = node.target.rsplit(".", maxsplit=1) + if len(prefix_and_field) == 2: + parent_mod = graph_module.get_submodule(prefix_and_field[0]) + field = prefix_and_field[1] + else: + parent_mod = graph_module + field = node.target + + mod: nn.CrossEntropyLoss = graph_module.get_submodule(node.target) + key, layer_cache = node.target, ctx.parallel_layer_cache + if key in layer_cache: + new_mod = layer_cache[key] + else: + assert ctx.compile_times == 0, "illegal path for recompilation" + new_mod = VocabParallelCrossEntropyLoss(ctx, reduction=mod.reduction) + layer_cache[key] = new_mod + setattr(parent_mod, field, new_mod) + else: + node.target = sharded_cross_entropy_wrapper_fn(process_group=ctx.tp_group) + @staticmethod def handle_hard_coded_axis_param(node: Node, ctx: ParallelExecutionCtx) -> None: def extract_shape_from_node(node: Node) -> List[Any]: @@ -384,6 +425,8 @@ def run(self, graph_module: GraphModule, ctx: ParallelExecutionCtx, config: Conf self.handle_linear(node, ctx) elif is_embedding(node): self.handle_embedding(node, ctx) + elif is_cross_entropy(node): + self.handle_cross_entropy(node, ctx) # correct the attention head num in parallel setting elif is_shape_consumer(node): self.handle_hard_coded_axis_param(node, ctx) diff --git a/optimum/fx/parallelization/utils.py b/optimum/fx/parallelization/utils.py index b7b1ccd41c8..3074638737f 100644 --- a/optimum/fx/parallelization/utils.py +++ b/optimum/fx/parallelization/utils.py @@ -82,6 +82,40 @@ def is_shape_generator(node: Node) -> bool: return node.op == "call_method" and node.target == "size" +def is_cross_entropy(node: Node) -> bool: + if node.op == "call_function": + return node.target is F.cross_entropy + elif node.op == "call_module": + mod = node.graph.owning_module + return isinstance(mod.get_submodule(node.target), nn.CrossEntropyLoss) + 
return False + + +def is_cross_entropy_parallel_compatible(node: Node) -> bool: + """ + For now `VocabParallelCrossEntropyLoss` does not support weighted mode, index ignoring and label smoothing. + """ + if node.op == "call_function": + weight = node.kwargs.get("weight", None) + ignore_index = node.kwargs.get("ignore_index", -100) + label_smoothing = node.kwargs.get("label_smoothing", 0.0) + if len(node.args) > 2 and weight is None: + weight = node.args[2] + if len(node.args) > 4 and ignore_index == -100: + ignore_index = node.args[4] + if len(node.args) > 7 and label_smoothing == 0.0: + label_smoothing = node.args[7] + + return weight is None and ignore_index == -100 and label_smoothing == 0.0 + + elif node.op == "call_module": + mod: nn.CrossEntropyLoss = node.graph.owning_module.get_submodule(node.target) + weight, label_smoothing, ignore_index = mod.weight, mod.label_smoothing, mod.ignore_index + return weight is None and ignore_index == -100 and label_smoothing == 0.0 + + return False + + def stable_topological_sort(graph: Graph): def _args(n: torch.fx.Node) -> List[torch.fx.node.Argument]: args: List[torch.fx.node.Argument] = [] diff --git a/tests/fx/parallelization/test_tensor_parallel.py b/tests/fx/parallelization/test_tensor_parallel.py index 9626fccec3b..8a00393c4d7 100644 --- a/tests/fx/parallelization/test_tensor_parallel.py +++ b/tests/fx/parallelization/test_tensor_parallel.py @@ -36,6 +36,7 @@ "output_attentions": False, "output_hidden_states": False, "tie_word_embeddings": True, + "return_dict": True, } DUMMY_MODELS_TO_TEST = ( @@ -64,11 +65,10 @@ def prepare_dummy_inputs( seq_len: int = 10, device: Union[str, torch.device] = "cuda", ): - return { - "input_ids": torch.randint(low=1, high=model_config.vocab_size, size=(batch_size, seq_len), device=device), - "attention_mask": torch.ones((batch_size, seq_len), dtype=torch.int64, device=device), - "position_ids": torch.arange(0, seq_len, device=device).unsqueeze(0).expand(batch_size, -1), - } + 
input_ids = torch.randint(low=1, high=model_config.vocab_size, size=(batch_size, seq_len), device=device) + attention_mask = torch.ones((batch_size, seq_len), dtype=torch.int64, device=device) + labels = input_ids.clone() + return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} def run_test_all_rank_results_match(rank: int, world_size: int, model_id: str, model_kwargs: Dict[str, Any]): @@ -82,8 +82,8 @@ def run_test_all_rank_results_match(rank: int, world_size: int, model_id: str, m model = parallelize_model(model_id, ctx, skip_load_weights=True, **model_kwargs) inputs = prepare_dummy_inputs(model.config) - logits = model(**inputs)[0] - tensors = gather_at_main_process(tensor=logits, group=tp_group, rank=rank, world_size=world_size) + loss = model(**inputs).loss + tensors = gather_at_main_process(tensor=loss, group=tp_group, rank=rank, world_size=world_size) # check results at main worker process if rank == 0: @@ -145,7 +145,7 @@ def run_test_parallel_results_matches_non_parallel( inputs = prepare_dummy_inputs(model.config) set_seed(SEED) - logits = model(**inputs)[0] + loss = model(**inputs).loss torch._dynamo.reset() del model @@ -154,9 +154,9 @@ def run_test_parallel_results_matches_non_parallel( set_seed(SEED) ctx = ParallelExecutionCtx(tp_group=tp_group, current_device=device) model = parallelize_model(model_id, ctx, skip_load_weights=True, **model_kwargs) - parallel_logits = model(**inputs)[0] + parallel_loss = model(**inputs).loss - torch.testing.assert_close(logits.cpu(), parallel_logits.cpu(), rtol=1e-4, atol=1e-4) + torch.testing.assert_close(loss.cpu(), parallel_loss.cpu(), rtol=1e-4, atol=1e-4) dist.barrier(tp_group) tearDown() From 2fb5ea5ca7ca8ea887af2851cce80ab2545d3f4f Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 18 Sep 2024 22:48:34 +0200 Subject: [PATCH 17/73] Fix `is_torch_tpu_available` in ORT Trainer (#2028) --- optimum/onnxruntime/trainer.py | 10 +++++----- 1 file 
changed, 5 insertions(+), 5 deletions(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 86c333adb3f..66273cbcf96 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -103,14 +103,14 @@ from transformers.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_zero3_enabled if check_if_transformers_greater("4.39"): - from transformers.utils import is_torch_xla_available + from transformers.utils import is_torch_xla_available as is_torch_tpu_xla_available - if is_torch_xla_available(): + if is_torch_tpu_xla_available(): import torch_xla.core.xla_model as xm else: - from transformers.utils import is_torch_tpu_available + from transformers.utils import is_torch_tpu_available as is_torch_tpu_xla_available - if is_torch_tpu_available(check_device=False): + if is_torch_tpu_xla_available(check_device=False): import torch_xla.core.xla_model as xm if TYPE_CHECKING: @@ -735,7 +735,7 @@ def get_dataloader_sampler(dataloader): if ( args.logging_nan_inf_filter - and not is_torch_tpu_available() + and not is_torch_tpu_xla_available() and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) ): # if loss is nan or inf simply add the average of previous logged losses From fd638d20046a73a7221083b23c69b98445e2d321 Mon Sep 17 00:00:00 2001 From: Vijay Date: Thu, 26 Sep 2024 12:59:19 +0530 Subject: [PATCH 18/73] Added image-to-image task for ORT Pipeline (#2031) * Add ORTModelForImageToImage for image-to-image task SwinSR * Added image-to-image task to optimum pipeline * Add Tests fpr ORTModelForImageToImage for image-to-image task SwinSR * Use export=True for models from transformers, self._setup and more * Code Refactor * Refactor ORTModelForImageToImageIntegrationTest --- optimum/onnxruntime/__init__.py | 2 + optimum/onnxruntime/modeling_ort.py | 73 ++++++++++ optimum/pipelines/pipelines_base.py | 8 ++ tests/onnxruntime/test_modeling.py | 136 ++++++++++++++++++- 
tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 5 files changed, 219 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 09a48ec955c..1cb5b7c47b9 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -44,6 +44,7 @@ "ORTModelForSemanticSegmentation", "ORTModelForSequenceClassification", "ORTModelForTokenClassification", + "ORTModelForImageToImage", ], "modeling_seq2seq": [ "ORTModelForSeq2SeqLM", @@ -112,6 +113,7 @@ ORTModelForCustomTasks, ORTModelForFeatureExtraction, ORTModelForImageClassification, + ORTModelForImageToImage, ORTModelForMaskedLM, ORTModelForMultipleChoice, ORTModelForQuestionAnswering, diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 254b771e334..9166f7c2cbe 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -34,6 +34,7 @@ AutoModelForAudioXVector, AutoModelForCTC, AutoModelForImageClassification, + AutoModelForImageToImage, AutoModelForMaskedLM, AutoModelForMultipleChoice, AutoModelForQuestionAnswering, @@ -47,6 +48,7 @@ BaseModelOutput, CausalLMOutput, ImageClassifierOutput, + ImageSuperResolutionOutput, MaskedLMOutput, ModelOutput, MultipleChoiceModelOutput, @@ -2183,6 +2185,77 @@ def forward( return TokenClassifierOutput(logits=logits) +IMAGE_TO_IMAGE_EXAMPLE = r""" + Example of image-to-image (Super Resolution): + + ```python + >>> from transformers import {processor_class} + >>> from optimum.onnxruntime import {model_class} + >>> from PIL import Image + + >>> image = Image.open("path/to/image.jpg") + + >>> image_processor = {processor_class}.from_pretrained("{checkpoint}") + >>> model = {model_class}.from_pretrained("{checkpoint}") + + >>> inputs = image_processor(images=image, return_tensors="pt") + + >>> with torch.no_grad(): + ... 
reconstruction = model(**inputs).reconstruction + ``` +""" + + +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTModelForImageToImage(ORTModel): + """ + ONNX Model for image-to-image tasks. This class officially supports swin2sr. + """ + + auto_model_class = AutoModelForImageToImage + + @add_start_docstrings_to_model_forward( + ONNX_IMAGE_INPUTS_DOCSTRING.format("batch_size, num_channels, height, width") + + IMAGE_TO_IMAGE_EXAMPLE.format( + processor_class=_PROCESSOR_FOR_DOC, + model_class="ORTModelForImageToImage", + checkpoint="caidas/swin2SR-realworld-sr-x4-64-bsrgan-psnr", + ) + ) + def forward( + self, + pixel_values: Union[torch.Tensor, np.ndarray], + **kwargs, + ): + use_torch = isinstance(pixel_values, torch.Tensor) + self.raise_on_numpy_input_io_binding(use_torch) + if self.device.type == "cuda" and self.use_io_binding: + input_shapes = pixel_values.shape + io_binding, output_shapes, output_buffers = self.prepare_io_binding( + pixel_values, + ordered_input_names=self._ordered_input_names, + known_output_shapes={ + "reconstruction": [ + input_shapes[0], + input_shapes[1], + input_shapes[2] * self.config.upscale, + input_shapes[3] * self.config.upscale, + ] + }, + ) + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() + reconstruction = output_buffers["reconstruction"].view(output_shapes["reconstruction"]) + else: + model_inputs = {"pixel_values": pixel_values} + onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.model.run(None, onnx_inputs) + model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + reconstruction = model_outputs["reconstruction"] + return ImageSuperResolutionOutput(reconstruction=reconstruction) + + CUSTOM_TASKS_EXAMPLE = r""" Example of custom tasks(e.g.
a sentence transformers taking `pooler_output` as output): diff --git a/optimum/pipelines/pipelines_base.py b/optimum/pipelines/pipelines_base.py index a08ab8782a3..7690143f13f 100644 --- a/optimum/pipelines/pipelines_base.py +++ b/optimum/pipelines/pipelines_base.py @@ -24,6 +24,7 @@ FillMaskPipeline, ImageClassificationPipeline, ImageSegmentationPipeline, + ImageToImagePipeline, ImageToTextPipeline, Pipeline, PreTrainedTokenizer, @@ -55,6 +56,7 @@ ORTModelForCausalLM, ORTModelForFeatureExtraction, ORTModelForImageClassification, + ORTModelForImageToImage, ORTModelForMaskedLM, ORTModelForQuestionAnswering, ORTModelForSemanticSegmentation, @@ -157,6 +159,12 @@ "default": "superb/hubert-base-superb-ks", "type": "audio", }, + "image-to-image": { + "impl": ImageToImagePipeline, + "class": (ORTModelForImageToImage,), + "default": "caidas/swin2SR-classical-sr-x2-64", + "type": "image", + }, } else: ORT_SUPPORTED_TASKS = {} diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 199b96342e7..f6771ce7618 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -42,6 +42,7 @@ AutoModelForCausalLM, AutoModelForCTC, AutoModelForImageClassification, + AutoModelForImageToImage, AutoModelForMaskedLM, AutoModelForMultipleChoice, AutoModelForQuestionAnswering, @@ -57,7 +58,9 @@ PretrainedConfig, set_seed, ) +from transformers.modeling_outputs import ImageSuperResolutionOutput from transformers.modeling_utils import no_init_weights +from transformers.models.swin2sr.configuration_swin2sr import Swin2SRConfig from transformers.onnx.utils import get_preprocessor from transformers.testing_utils import get_gpu_count, require_torch_gpu, slow from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin @@ -79,6 +82,7 @@ ORTModelForCustomTasks, ORTModelForFeatureExtraction, ORTModelForImageClassification, + ORTModelForImageToImage, ORTModelForMaskedLM, ORTModelForMultipleChoice, ORTModelForPix2Struct, @@ 
-4704,6 +4708,136 @@ def test_compare_generation_to_io_binding( gc.collect() +class ORTModelForImageToImageIntegrationTest(ORTModelTestMixin): + SUPPORTED_ARCHITECTURES = ["swin2sr"] + + ORTMODEL_CLASS = ORTModelForImageToImage + + TASK = "image-to-image" + + def _get_sample_image(self): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + return image + + def _get_preprocessors(self, model_id): + image_processor = AutoImageProcessor.from_pretrained(model_id) + + return image_processor + + def test_load_vanilla_transformers_which_is_not_supported(self): + with self.assertRaises(Exception) as context: + _ = ORTModelForImageToImage.from_pretrained(MODEL_NAMES["bert"], export=True) + + self.assertIn("only supports the tasks", str(context.exception)) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_compare_to_transformers(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + self.assertIsInstance(onnx_model.config, Swin2SRConfig) + set_seed(SEED) + + transformers_model = AutoModelForImageToImage.from_pretrained(model_id) + image_processor = self._get_preprocessors(model_id) + + data = self._get_sample_image() + features = image_processor(data, return_tensors="pt") + + with torch.no_grad(): + transformers_outputs = transformers_model(**features) + + onnx_outputs = onnx_model(**features) + self.assertIsInstance(onnx_outputs, ImageSuperResolutionOutput) + self.assertTrue("reconstruction" in onnx_outputs) + self.assertIsInstance(onnx_outputs.reconstruction, torch.Tensor) + self.assertTrue(torch.allclose(onnx_outputs.reconstruction, transformers_outputs.reconstruction, atol=1e-4)) + + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_generate_utils(self, model_arch: str): + 
model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + + data = self._get_sample_image() + features = image_processor(data, return_tensors="pt") + + outputs = onnx_model(**features) + self.assertIsInstance(outputs, ImageSuperResolutionOutput) + + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + def test_pipeline_image_to_image(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + pipe = pipeline( + "image-to-image", + model=onnx_model, + feature_extractor=image_processor, + ) + data = self._get_sample_image() + outputs = pipe(data) + self.assertEqual(pipe.device, onnx_model.device) + self.assertIsInstance(outputs, Image.Image) + + gc.collect() + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_torch_gpu + @pytest.mark.cuda_ep_test + def test_pipeline_on_gpu(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + pipe = pipeline( + "image-to-image", + model=onnx_model, + feature_extractor=image_processor, + device=0, + ) + + data = self._get_sample_image() + outputs = pipe(data) + + self.assertEqual(pipe.model.device.type.lower(), "cuda") + self.assertIsInstance(outputs, Image.Image) + + @parameterized.expand(SUPPORTED_ARCHITECTURES) + @require_torch_gpu + @require_ort_rocm + @pytest.mark.rocm_ep_test + def test_pipeline_on_rocm(self, model_arch: 
str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + model_id = MODEL_NAMES[model_arch] + onnx_model = ORTModelForImageToImage.from_pretrained(self.onnx_model_dirs[model_arch]) + image_processor = self._get_preprocessors(model_id) + pipe = pipeline( + "image-to-image", + model=onnx_model, + feature_extractor=image_processor, + device=0, + ) + + data = self._get_sample_image() + outputs = pipe(data) + + self.assertEqual(pipe.model.device.type.lower(), "cuda") + self.assertIsInstance(outputs, Image.Image) + + class ORTModelForVision2SeqIntegrationTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = ["vision-encoder-decoder", "trocr", "donut"] @@ -4831,7 +4965,6 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach len(onnx_outputs["past_key_values"][0]), len(transformers_outputs["past_key_values"][0]) ) for i in range(len(onnx_outputs["past_key_values"])): - print(onnx_outputs["past_key_values"][i]) for ort_pkv, trfs_pkv in zip( onnx_outputs["past_key_values"][i], transformers_outputs["past_key_values"][i] ): @@ -5517,6 +5650,7 @@ class TestBothExportersORTModel(unittest.TestCase): ["automatic-speech-recognition", ORTModelForCTCIntegrationTest], ["audio-xvector", ORTModelForAudioXVectorIntegrationTest], ["audio-frame-classification", ORTModelForAudioFrameClassificationIntegrationTest], + ["image-to-image", ORTModelForImageToImageIntegrationTest], ] ) def test_find_untested_architectures(self, task: str, test_class): diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index bb6935461d7..0790f6329dc 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -144,6 +144,7 @@ "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224", + "swin2sr": 
"hf-internal-testing/tiny-random-Swin2SRForImageSuperResolution", "t5": "hf-internal-testing/tiny-random-t5", "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel", "trocr": "microsoft/trocr-small-handwritten", From f7c3a7fa766f06af63a15e94b162ada56d021b16 Mon Sep 17 00:00:00 2001 From: Guillaume LEGENDRE Date: Fri, 27 Sep 2024 14:12:14 +0200 Subject: [PATCH 19/73] CI - update runner type (#2033) update runner type --- .github/workflows/test_fx_automatic_parallel.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test_fx_automatic_parallel.yml b/.github/workflows/test_fx_automatic_parallel.yml index d8af6e40caa..05ebf7ea9e5 100644 --- a/.github/workflows/test_fx_automatic_parallel.yml +++ b/.github/workflows/test_fx_automatic_parallel.yml @@ -24,7 +24,7 @@ jobs: config: - name: GPU-enabled Optimum Test Suite image: nvidia/cuda:12.4.1-devel-ubuntu22.04 - gpu_target: ["nvidia-multi-gpu-a10-runners"] + gpu_target: ["aws-g5-12xlarge-plus"] name: ${{ matrix.config.name }} runs-on: From c6b46786ce12b3a9d2e8be2b8f41342ec314f46a Mon Sep 17 00:00:00 2001 From: rbrugaro Date: Mon, 30 Sep 2024 02:50:23 -0700 Subject: [PATCH 20/73] Add ipex to documentation (#2027) * adding ipex reference in optimum docs * minor fix --- docs/source/index.mdx | 2 +- docs/source/installation.mdx | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 7eb79c33ed2..06133664ca8 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -36,7 +36,7 @@ The packages below enable you to get the best of the 🤗 Hugging Face ecosystem
Intel
-

Optimize your model to speedup inference with OpenVINO and Neural Compressor

+

Optimize your model to speed up inference with OpenVINO, Neural Compressor and IPEX

AWS Trainium/Inferentia
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index c08b3f92e5c..27733574c80 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -25,6 +25,7 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can | [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade --upgrade-strategy eager optimum[onnxruntime]` | | [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[neural-compressor]` | | [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[openvino]` | +| [IPEX](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[ipex]` | | [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` | | [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade --upgrade-strategy eager optimum[amd]` | | [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade --upgrade-strategy eager optimum[neuronx]` | From 049b00f61c9bb17bd2b20a3b77d04cc4c0f20d86 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 30 Sep 2024 18:51:01 +0200 Subject: [PATCH 21/73] Add Transformers v4.45 support (#2023) * transformers v4.45 support * fix transformers v4.45 compatibility * update opset * update model * Add generation config saving * fix codegen * bump default opset m2m100 * fix codegen * fix bettertransformers * add warnign deprecation bettertransformer * bettertransformers fixes * disable transformers 4.45 for onnx export * update model ID --- Makefile | 4 +- optimum/bettertransformer/models/attention.py | 84 +++++++++++++++++-- .../models/decoder_models.py | 
35 +++++++- optimum/bettertransformer/transformation.py | 4 + optimum/exporters/onnx/convert.py | 18 ++++ optimum/exporters/onnx/model_configs.py | 11 +-- optimum/modeling_base.py | 3 + optimum/onnxruntime/modeling_decoder.py | 58 ++++++++----- optimum/onnxruntime/modeling_ort.py | 3 - optimum/onnxruntime/modeling_seq2seq.py | 64 +++++++------- optimum/onnxruntime/optimization.py | 12 ++- setup.py | 9 +- tests/bettertransformer/testing_utils.py | 4 +- tests/onnxruntime/utils_onnxruntime_tests.py | 6 +- 14 files changed, 223 insertions(+), 92 deletions(-) diff --git a/Makefile b/Makefile index e2c21263031..824ef3d0cf3 100644 --- a/Makefile +++ b/Makefile @@ -23,11 +23,11 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL)) # Run code quality checks style_check: black --check . - ruff . + ruff check . style: black . - ruff . --fix + ruff check . --fix # Run tests for the library test: diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 9dfa57844d4..22b8faf1c21 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -92,6 +92,71 @@ def gpt2_wrapped_scaled_dot_product( return sdpa_result, None +# Adapted from transformers.models.gptj.modeling_gptj.GPTJAttention._attn +def gptj_wrapped_scaled_dot_product( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, +): + raise_on_head_mask(head_mask) + batch_size = query.shape[0] + + mask_value = torch.finfo(value.dtype).min + mask_value = torch.full([], mask_value, dtype=value.dtype) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + query = query.to(value.dtype) + key = key.to(value.dtype) + + if batch_size == 1 and attention_mask is not None and attention_mask[0, 0, -1, -1] < -1: + raise 
ValueError("BetterTransformer does not support padding='max_length' with a batch size of 1.") + + dropout_p = self.dropout_prob_attn if self.training else 0.0 + if batch_size == 1 or self.training: + if query.shape[2] > 1: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True + ) + else: + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=False + ) + else: + query_length, key_length = query.size(-2), key.size(-2) + + # causal_mask is always [True, ..., True] otherwise, so executing this + # is unnecessary + if query_length > 1: + if not check_if_transformers_greater("4.44.99"): + causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + + causal_mask = torch.where(causal_mask, 0, mask_value) + + # torch.Tensor.expand does no memory copy + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + if attention_mask is not None: + attention_mask = causal_mask + attention_mask + + else: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] + + sdpa_result = torch.nn.functional.scaled_dot_product_attention( + query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False + ) + + # in gpt-neo-x and gpt-j the query and keys are always in fp32 + # thus we need to cast them to the value dtype + if self.downcast_qk: + sdpa_result = sdpa_result.to(value.dtype) + + return sdpa_result, None + + # Adapted from transformers.models.bark.modeling_bark.BarkSelfAttention._attn def bark_wrapped_scaled_dot_product( self, @@ -195,7 +260,7 @@ def codegen_wrapped_scaled_dot_product( query, key, value, attn_mask=None, dropout_p=dropout_p, is_causal=True ) else: - # in this case, which is the later decoding steps, the `causal_mask`` in + # in this case, which is the later decoding steps, the `causal_mask` in # 
https://github.com/huggingface/transformers/blob/ae54e3c3b18bac0832ad62ea9b896dfd52a09850/src/transformers/models/gpt2/modeling_gpt2.py#L195 # is [True, ..., True] so actually not causal sdpa_result = torch.nn.functional.scaled_dot_product_attention( @@ -207,15 +272,20 @@ def codegen_wrapped_scaled_dot_product( # causal_mask is always [True, ..., True] otherwise, so executing this # is unnecessary if query_length > 1: - causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to(torch.bool) + if not check_if_transformers_greater("4.44.99"): + causal_mask = self.causal_mask[:, :, key_length - query_length : key_length, :key_length].to( + torch.bool + ) - causal_mask = torch.where(causal_mask, 0, mask_value) + causal_mask = torch.where(causal_mask, 0, mask_value) - # torch.Tensor.expand does no memory copy - causal_mask = causal_mask.expand(batch_size, -1, -1, -1) + # torch.Tensor.expand does no memory copy + causal_mask = causal_mask.expand(batch_size, -1, -1, -1) - # we use torch.min to avoid having tensor(-inf) - attention_mask = torch.min(causal_mask, attention_mask) + # we use torch.min to avoid having tensor(-inf) + attention_mask = torch.min(causal_mask, attention_mask) + else: + attention_mask = attention_mask[:, :, :, : key.shape[-2]] sdpa_result = torch.nn.functional.scaled_dot_product_attention( query, key, value, attn_mask=attention_mask, dropout_p=dropout_p, is_causal=False diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py index b64b7f5a1eb..52d28d076d3 100644 --- a/optimum/bettertransformer/models/decoder_models.py +++ b/optimum/bettertransformer/models/decoder_models.py @@ -44,6 +44,7 @@ codegen_wrapped_scaled_dot_product, gpt2_wrapped_scaled_dot_product, gpt_neo_wrapped_scaled_dot_product, + gptj_wrapped_scaled_dot_product, opt_forward, t5_forward, ) @@ -82,7 +83,7 @@ def forward(self, *args, **kwargs): class 
GPTJAttentionLayerBetterTransformer(BetterTransformerBaseLayer, GPTJAttention, nn.Module): - _attn = gpt2_wrapped_scaled_dot_product + _attn = gptj_wrapped_scaled_dot_product def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): super().__init__(config) @@ -96,14 +97,22 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): "out_proj", "attn_dropout", "resid_dropout", - "bias", "scale_attn", - "masked_bias", ] # Attribute only for transformers>=4.28 if hasattr(layer, "embed_positions"): submodules.append("embed_positions") + # Attribute only for transformers<4.45 + if hasattr(layer, "bias"): + submodules.append("bias") + if hasattr(layer, "masked_bias"): + submodules.append("masked_bias") + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -127,6 +136,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None submodules = ["rotary_emb", "query_key_value", "dense", "bias", "masked_bias", "norm_factor"] + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -155,6 +169,11 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): self.module_mapping = None submodules = ["attn_dropout", "resid_dropout", "k_proj", "v_proj", "q_proj", "out_proj", "bias", "masked_bias"] + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_id"): + submodules.append("layer_id") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) @@ -238,12 +257,20 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): super(BetterTransformerBaseLayer, self).__init__(config) self.module_mapping = None - submodules = ["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "causal_mask", "scale_attn"] + submodules = 
["attn_dropout", "resid_dropout", "qkv_proj", "out_proj", "scale_attn"] # Attribute only for transformers>=4.28 if hasattr(layer, "embed_positions"): submodules.append("embed_positions") + # Attribute only for transformers<4.45 + if hasattr(layer, "causal_mask"): + submodules.append("causal_mask") + + # Attribute only for transformers>=4.45 + if hasattr(layer, "layer_idx"): + submodules.append("layer_idx") + for attr in submodules: setattr(self, attr, getattr(layer, attr)) diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py index 2105e199870..a101757b6fa 100644 --- a/optimum/bettertransformer/transformation.py +++ b/optimum/bettertransformer/transformation.py @@ -206,6 +206,10 @@ def transform( The converted model if the conversion has been successful. """ + logger.warning( + "The class `optimum.bettertransformers.transformation.BetterTransformer` is deprecated and will be removed in a future release." + ) + hf_config = model.config if hf_config.model_type in ["falcon", "gpt_bigcode", "llama", "whisper"]: raise ValueError( diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 63a9067b90c..f2bf95f3e3c 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -26,6 +26,7 @@ import numpy as np import onnx +import transformers from transformers.modeling_utils import get_parameter_dtype from transformers.utils import is_tf_available, is_torch_available @@ -34,6 +35,7 @@ DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, TORCH_MINIMUM_VERSION, + check_if_transformers_greater, is_diffusers_available, is_torch_onnx_support_available, logging, @@ -999,6 +1001,10 @@ def onnx_export_from_model( >>> onnx_export_from_model(model, output="gpt2_onnx/") ``` """ + if check_if_transformers_greater("4.44.99"): + raise ImportError( + f"ONNX conversion disabled for now for transformers version greater than v4.45, found {transformers.__version__}" + ) 
TasksManager.standardize_model_attributes(model) @@ -1120,6 +1126,18 @@ def onnx_export_from_model( if isinstance(atol, dict): atol = atol[task.replace("-with-past", "")] + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = model.config._get_non_default_generation_parameters() + if model.can_generate() and len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model.generation_config, param_name, param_value) + setattr(model.config, param_name, None) + # Saving the model config and preprocessor as this is needed sometimes. model.config.save_pretrained(output) generation_config = getattr(model, "generation_config", None) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index d4b15b2968b..36963a986d0 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -119,7 +119,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class AlbertOnnxConfig(BertOnnxConfig): - DEFAULT_ONNX_OPSET = 11 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class ConvBertOnnxConfig(BertOnnxConfig): @@ -171,11 +171,11 @@ class MPNetOnnxConfig(DistilBertOnnxConfig): class RobertaOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class CamembertOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
class FlaubertOnnxConfig(BertOnnxConfig): @@ -187,7 +187,7 @@ class IBertOnnxConfig(DistilBertOnnxConfig): class XLMRobertaOnnxConfig(DistilBertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class DebertaOnnxConfig(BertOnnxConfig): @@ -257,7 +257,7 @@ class ImageGPTOnnxConfig(GPT2OnnxConfig): class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_attention_heads="num_heads") @@ -564,6 +564,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class M2M100OnnxConfig(TextSeq2SeqOnnxConfig): + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( encoder_num_layers="encoder_layers", decoder_num_layers="decoder_layers", diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 3da2d9d0d21..29521b7c0c6 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -371,6 +371,9 @@ def from_pretrained( export = from_transformers if len(model_id.split("@")) == 2: + logger.warning( + f"Specifying the `revision` as @{model_id.split('@')[1]} is deprecated and will be removed in v1.23, please use the `revision` argument instead." 
+ ) if revision is not None: logger.warning( f"The argument `revision` was set to {revision} but will be ignored for {model_id.split('@')[1]}" diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index f6d4b7e20ab..bda3ec98d9a 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -14,7 +14,6 @@ """Classes handling causal-lm related architectures in ONNX Runtime.""" import logging -import warnings from pathlib import Path from tempfile import TemporaryDirectory from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union @@ -149,6 +148,19 @@ def __init__( generation_config = GenerationConfig.from_model_config(config) self.generation_config = generation_config + + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + self.onnx_paths = [self.model_path] self.use_merged = "use_cache_branch" in self.input_names self.model_type = self.config.model_type @@ -393,7 +405,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -410,15 +421,7 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ) -> "ORTModelForCausalLM": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -586,6 +589,22 @@ def _from_pretrained( else: init_cls = ORTModelForCausalLM + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." 
+ ) + return init_cls( model=model, config=config, @@ -593,6 +612,7 @@ def _from_pretrained( model_save_dir=model_save_dir, preprocessors=preprocessors, use_cache=use_cache, + generation_config=generation_config, ) @classmethod @@ -600,7 +620,6 @@ def _from_transformers( cls, model_id: str, config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, @@ -616,15 +635,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTModelForCausalLM": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - file_name = ONNX_WEIGHTS_NAME if use_merged: @@ -655,8 +665,6 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( @@ -712,6 +720,10 @@ def _reorder_cache(past: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor) -> for layer_past in past ) + def _save_pretrained(self, save_directory: Union[str, Path]): + super()._save_pretrained(save_directory) + self.generation_config.save_pretrained(save_directory) + class ORTGPTBigCodeForCausalLM(ORTModelForCausalLM): # Adapted from transformers.models.gpt_bigcode.modeling_gpt_bigcode.GPTBigCodeForCausalLM.prepare_inputs_for_generation diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 9166f7c2cbe..17bd3e2a4e7 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -663,8 +663,6 @@ def _export( force_download=force_download, 
trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( @@ -1171,7 +1169,6 @@ def _export( library_name="transformers", ) - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 3cecadafe3e..fda3ca82bbe 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -18,7 +18,6 @@ import logging import shutil -import warnings from abc import ABC, abstractmethod from pathlib import Path from tempfile import TemporaryDirectory @@ -706,6 +705,18 @@ def show_deprecated_argument(arg_name): generation_config = GenerationConfig.from_model_config(config) self.generation_config = generation_config + if check_if_transformers_greater("4.44.99"): + misplaced_generation_parameters = self.config._get_non_default_generation_parameters() + if len(misplaced_generation_parameters) > 0: + logger.warning( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(self.generation_config, param_name, param_value) + setattr(self.config, param_name, None) + @abstractmethod def _initialize_encoder(self, session: ort.InferenceSession) -> ORTEncoder: pass @@ -780,7 +791,6 @@ def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, @@ -799,15 +809,7 @@ def _from_pretrained( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + generation_config = kwargs.pop("generation_config", None) model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -996,19 +998,21 @@ def _from_pretrained( if model_save_dir is None: model_save_dir = new_model_save_dir - generation_config = None - try: - generation_config = GenerationConfig.from_pretrained( - model_id, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - token=token, - revision=revision, - subfolder=subfolder, - ) - except OSError: - logger.info("Generation config file not found, using a generation config created from the model config.") + if generation_config is None: + try: + generation_config = GenerationConfig.from_pretrained( + model_id, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + 
token=token, + revision=revision, + subfolder=subfolder, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) onnx_paths = [encoder_path] if use_merged is False: @@ -1035,7 +1039,6 @@ def _from_transformers( cls, model_id: str, config: "PretrainedConfig", - use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, @@ -1051,15 +1054,6 @@ def _from_transformers( use_io_binding: Optional[bool] = None, task: Optional[str] = None, ) -> "ORTModelForConditionalGeneration": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - if use_cache is False and use_merged is True: raise ValueError( "The incompatible arguments use_cache=False, use_merged=True were passed to" @@ -1091,8 +1085,6 @@ def _from_transformers( force_download=force_download, trust_remote_code=trust_remote_code, ) - - config.save_pretrained(save_dir_path) maybe_save_preprocessors(model_id, save_dir_path, src_subfolder=subfolder) return cls._from_pretrained( diff --git a/optimum/onnxruntime/optimization.py b/optimum/onnxruntime/optimization.py index 9e62a3f324c..fd6958bba7d 100644 --- a/optimum/onnxruntime/optimization.py +++ b/optimum/onnxruntime/optimization.py @@ -20,6 +20,7 @@ import onnx from onnx import load_model +from transformers import GenerationConfig from transformers.models.auto.configuration_auto import AutoConfig from onnxruntime.transformers.onnx_model_bert import BertOnnxModel @@ -152,10 +153,6 @@ def optimize( save_dir = Path(save_dir) save_dir.mkdir(parents=True, exist_ok=True) ORTConfigManager.check_optimization_supported_model(self.model_type, 
optimization_config) - - self.config.save_pretrained(save_dir) - maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir) - model_type = ORTConfigManager.get_model_ort_type(self.config.model_type) optimization_options = optimization_config.create_fusion_options(model_type) @@ -236,6 +233,13 @@ def optimize( # Save the model configuration self.config.save_pretrained(save_dir) ort_config.save_pretrained(save_dir) + maybe_save_preprocessors(self.onnx_model_path[0].parent, save_dir) + + try: + generation_config = GenerationConfig.from_pretrained(self.onnx_model_path[0].parent) + generation_config.save_pretrained(save_dir) + except Exception: + pass logger.info( f"Optimized model saved at: {save_dir} (external data format: " diff --git a/setup.py b/setup.py index ac5db71a74b..24c1ae1cd4d 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.29,<4.45.0", + "transformers[sentencepiece]>=4.29,<4.46.0", "torch>=1.11", "packaging", "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 @@ -54,6 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", + "transformers<4.45.0", ], "onnxruntime-gpu": [ "onnx", @@ -62,9 +63,10 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. 
+ "transformers<4.45.0", ], - "exporters": ["onnx", "onnxruntime", "timm"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], + "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.45.0"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.45.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", @@ -75,6 +77,7 @@ "numpy<1.24.0", "datasets<=2.16", "transformers[sentencepiece]>=4.26,<4.38", + "transformers<4.45.0", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index e9e2edd9790..098882180aa 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -59,12 +59,12 @@ # "llama": "fxmarty/tiny-llama-fast-tokenizer", # "llama-gqa": "noamwies/llama-test-gqa-with-better-transformer", "m2m_100": "hf-internal-testing/tiny-random-nllb", - "marian": "fxmarty/tiny-marian", # the other tiny ones have a too small max_position_embeddings + "marian": "optimum-internal-testing/tiny-random-marian", # the other tiny ones have a too small max_position_embeddings "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "hf-internal-testing/tiny-random-mbart", "opt": "hf-internal-testing/tiny-random-OPTModel", "pegasus": "hf-internal-testing/tiny-random-PegasusModel", - "prophetnet": "hirotasoshu/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings + "prophetnet": "optimum-internal-testing/tiny-random-prophetnet", # the other tiny ones have a too small max_position_embeddings "rembert": "hf-internal-testing/tiny-random-RemBertModel", "roberta": "hf-internal-testing/tiny-random-RobertaModel", "rocbert": "hf-internal-testing/tiny-random-RoCBertModel", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 0790f6329dc..17f3b391b04 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py 
+++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -112,9 +112,9 @@ "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel", "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model", "longt5": "hf-internal-testing/tiny-random-LongT5Model", - "llama": "fxmarty/tiny-llama-fast-tokenizer", + "llama": "optimum-internal-testing/tiny-random-llama", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", - "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken + "marian": "echarlaix/tiny-random-marian", "mbart": "hf-internal-testing/tiny-random-mbart", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", @@ -152,7 +152,7 @@ "unispeech_sat": "hf-internal-testing/tiny-random-UnispeechSatModel", "vision-encoder-decoder": "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2", "vit": "hf-internal-testing/tiny-random-vit", - "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken + "whisper": "optimum-internal-testing/tiny-random-whisper", "wav2vec2": "hf-internal-testing/tiny-random-Wav2Vec2Model", "wav2vec2-conformer": "hf-internal-testing/tiny-random-wav2vec2-conformer", "wavlm": "hf-internal-testing/tiny-random-WavlmModel", From d9754abdd973a69829dda191c495c4e70359d8dc Mon Sep 17 00:00:00 2001 From: Vijay Date: Tue, 8 Oct 2024 15:58:24 +0530 Subject: [PATCH 22/73] Remove numpy version constraint in setup.py (#2039) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 24c1ae1cd4d..0e2f0fd1bb6 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ "transformers[sentencepiece]>=4.29,<4.46.0", "torch>=1.11", "packaging", - "numpy<2.0", # transformers requires numpy<2.0 https://github.com/huggingface/transformers/pull/31569 + "numpy", "huggingface_hub>=0.8.0", "datasets", ] From d3c56cd55444de15499c8d72a501d07631eff5ae Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil 
<57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Wed, 9 Oct 2024 14:17:43 +0200 Subject: [PATCH 23/73] Update/Fix Pipeline Mixins and ORT Pipelines (#2021) * created auto task mappings * added correct auto classes * created auto task mappings * added correct auto classes * added ort/auto diffusion classes * fix ORTPipeline detection * start test refactoring * dynamic dtype * support torch random numbers generator * compact diffusion testing suite * fix * test * test * test * use latent-consistency architecture name instead of lcm * fix * add ort diffusion pipeline tests * added dummy objects * remove duplicate code * update stable diffusion mixin * update latent consistency * update sd for img2img * update latent consistency * update model parts to use frozen dict * update tests and utils * updated all mixins, enabled all tests ; all are passing except some reproducibility and comparison tests (7 failed, 35 passed) * fix sd xl hidden states * style * support testing without diffusers * remove unnecessary * revert * export vae encoder by returning its latent distribution parameters * fix the modeling to handle distributions * create vae class to minimize changes in pipeline mixins * remove unnecessary tests * style * style * update diffusion models export test * style * fall back for when block_out_channels is not in vae config * remove model parts from optimum.onnxruntime * added .to to model parts * remove custom mixins * style * Update optimum/exporters/onnx/model_configs.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * Update optimum/exporters/onnx/model_configs.py * conversion to numpy always works * test adding two new pipelines * remove duplicated tests * match diffusers numpy input * simplify model saving * extend tests and only translate generators * cleanup * reduce parent model usage in model parts * fix * new tiny onnx diffusion model with configs * model_save_path * Update 
optimum/onnxruntime/modeling_diffusion.py Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> * migrate tiny-stable-diffusion-onnx * resolve breaking change and mandatory arguments * overwrite _get_add_time_ids * fix * remove inference calls from loading tests * misc * better compatibility between model parts and parent pipeline * remove subfolder * misc * update * support passing safety checker * dummies * remove the need for ORTPipeline --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/onnx/model_configs.py | 6 +- optimum/exporters/utils.py | 24 +- optimum/onnx/utils.py | 16 + optimum/onnxruntime/__init__.py | 8 + optimum/onnxruntime/base.py | 19 + optimum/onnxruntime/modeling_diffusion.py | 967 ++++++++++-------- optimum/onnxruntime/modeling_ort.py | 2 +- optimum/onnxruntime/modeling_seq2seq.py | 2 +- optimum/onnxruntime/utils.py | 15 + .../diffusers/pipeline_latent_consistency.py | 230 ----- .../diffusers/pipeline_stable_diffusion.py | 427 -------- .../pipeline_stable_diffusion_img2img.py | 309 ------ .../pipeline_stable_diffusion_inpaint.py | 353 ------- .../diffusers/pipeline_stable_diffusion_xl.py | 506 --------- .../pipeline_stable_diffusion_xl_img2img.py | 515 ---------- optimum/pipelines/diffusers/pipeline_utils.py | 282 ----- optimum/pipelines/diffusers/watermark.py | 31 - tests/exporters/onnx/test_onnx_export.py | 15 +- tests/onnxruntime/test_diffusion.py | 578 +++++------ tests/onnxruntime/test_modeling.py | 14 +- tests/onnxruntime/utils_onnxruntime_tests.py | 5 + 21 files changed, 914 insertions(+), 3410 deletions(-) delete mode 100644 optimum/pipelines/diffusers/pipeline_latent_consistency.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py delete mode 100644 
optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py delete mode 100644 optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py delete mode 100644 optimum/pipelines/diffusers/pipeline_utils.py delete mode 100644 optimum/pipelines/diffusers/watermark.py diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 36963a986d0..e77f649f69b 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -1112,7 +1112,7 @@ def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]: class VaeEncoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-2 + ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -1132,12 +1132,12 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: return { - "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, + "latent_parameters": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, } class VaeDecoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-3 + ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index e2125736c4d..949b54f4685 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -46,11 +46,6 @@ from diffusers import ( DiffusionPipeline, - LatentConsistencyModelImg2ImgPipeline, - LatentConsistencyModelPipeline, - StableDiffusionImg2ImgPipeline, - StableDiffusionInpaintPipeline, - StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, @@ -92,27 +87,13 @@ def _get_submodels_for_export_diffusion( Returns the 
components of a Stable Diffusion model. """ - is_stable_diffusion = isinstance( - pipeline, (StableDiffusionPipeline, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline) - ) is_stable_diffusion_xl = isinstance( pipeline, (StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline) ) - is_latent_consistency_model = isinstance( - pipeline, (LatentConsistencyModelPipeline, LatentConsistencyModelImg2ImgPipeline) - ) - if is_stable_diffusion_xl: projection_dim = pipeline.text_encoder_2.config.projection_dim - elif is_stable_diffusion: - projection_dim = pipeline.text_encoder.config.projection_dim - elif is_latent_consistency_model: - projection_dim = pipeline.text_encoder.config.projection_dim else: - raise ValueError( - f"The export of a DiffusionPipeline model with the class name {pipeline.__class__.__name__} is currently not supported in Optimum. " - "Please open an issue or submit a PR to add the support." - ) + projection_dim = pipeline.text_encoder.config.projection_dim models_for_export = {} @@ -139,7 +120,8 @@ def _get_submodels_for_export_diffusion( vae_encoder = copy.deepcopy(pipeline.vae) if not is_torch_greater_or_equal_than_2_1: vae_encoder = override_diffusers_2_0_attn_processors(vae_encoder) - vae_encoder.forward = lambda sample: {"latent_sample": vae_encoder.encode(x=sample)["latent_dist"].sample()} + # we return the distribution parameters to be able to recreate it in the decoder + vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} models_for_export["vae_encoder"] = vae_encoder # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 diff --git a/optimum/onnx/utils.py b/optimum/onnx/utils.py index b52c4f4cdac..c014c1b3429 100644 --- a/optimum/onnx/utils.py +++ b/optimum/onnx/utils.py @@ -71,6 +71,22 @@ def _get_external_data_paths(src_paths: List[Path], dst_paths: List[Path]) -> Tu return src_paths, 
dst_paths +def _get_model_external_data_paths(model_path: Path) -> List[Path]: + """ + Gets external data paths from the model. + """ + + onnx_model = onnx.load(str(model_path), load_external_data=False) + model_tensors = _get_initializer_tensors(onnx_model) + # filter out tensors that are not external data + model_tensors_ext = [ + ExternalDataInfo(tensor).location + for tensor in model_tensors + if tensor.HasField("data_location") and tensor.data_location == onnx.TensorProto.EXTERNAL + ] + return [model_path.parent / tensor_name for tensor_name in model_tensors_ext] + + def check_model_uses_external_data(model: onnx.ModelProto) -> bool: """ Checks if the model uses external data. diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 1cb5b7c47b9..4e25a436909 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -79,7 +79,9 @@ "ORTStableDiffusionInpaintPipeline", "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", + "ORTStableDiffusionXLInpaintPipeline", "ORTLatentConsistencyModelPipeline", + "ORTLatentConsistencyModelImg2ImgPipeline", "ORTPipelineForImage2Image", "ORTPipelineForInpainting", "ORTPipelineForText2Image", @@ -92,6 +94,8 @@ "ORTStableDiffusionInpaintPipeline", "ORTStableDiffusionXLPipeline", "ORTStableDiffusionXLImg2ImgPipeline", + "ORTStableDiffusionXLInpaintPipeline", + "ORTLatentConsistencyModelImg2ImgPipeline", "ORTLatentConsistencyModelPipeline", "ORTPipelineForImage2Image", "ORTPipelineForInpainting", @@ -148,6 +152,7 @@ except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( ORTDiffusionPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, ORTPipelineForImage2Image, ORTPipelineForInpainting, @@ -156,11 +161,13 @@ ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: from 
.modeling_diffusion import ( ORTDiffusionPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, ORTPipelineForImage2Image, ORTPipelineForInpainting, @@ -169,6 +176,7 @@ ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: diff --git a/optimum/onnxruntime/base.py b/optimum/onnxruntime/base.py index 0e54bafed78..845780cafad 100644 --- a/optimum/onnxruntime/base.py +++ b/optimum/onnxruntime/base.py @@ -71,6 +71,25 @@ def dtype(self): return None + def to(self, *args, device: Optional[Union[torch.device, str, int]] = None, dtype: Optional[torch.dtype] = None): + for arg in args: + if isinstance(arg, torch.device): + device = arg + elif isinstance(arg, torch.dtype): + dtype = arg + + if device is not None and device != self.device: + raise ValueError( + "Cannot change the device of a model part without changing the device of the parent model. " + "Please use the `to` method of the parent model to change the device." + ) + + if dtype is not None and dtype != self.dtype: + raise NotImplementedError( + f"Cannot change the dtype of the model from {self.dtype} to {dtype}. " + f"Please export the model with the desired dtype." + ) + @abstractmethod def forward(self, *args, **kwargs): pass diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 18cd38c5f29..87fcb68c7e9 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -13,10 +13,11 @@ # limitations under the License. 
import importlib +import inspect import logging import os import shutil -import warnings +from abc import abstractmethod from collections import OrderedDict from pathlib import Path from tempfile import TemporaryDirectory @@ -24,23 +25,25 @@ import numpy as np import torch -from diffusers import ( +from diffusers.configuration_utils import ConfigMixin +from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution +from diffusers.pipelines import ( AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image, - ConfigMixin, - DDIMScheduler, + LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline, - LMSDiscreteScheduler, - PNDMScheduler, StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline, StableDiffusionPipeline, StableDiffusionXLImg2ImgPipeline, + StableDiffusionXLInpaintPipeline, StableDiffusionXLPipeline, ) +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.schedulers import SchedulerMixin from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME -from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available +from diffusers.utils.constants import CONFIG_NAME from huggingface_hub import snapshot_download from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from huggingface_hub.utils import validate_hf_hub_args @@ -51,14 +54,7 @@ import onnxruntime as ort from ..exporters.onnx import main_export -from ..onnx.utils import _get_external_data_paths -from ..pipelines.diffusers.pipeline_latent_consistency import LatentConsistencyPipelineMixin -from ..pipelines.diffusers.pipeline_stable_diffusion import StableDiffusionPipelineMixin -from ..pipelines.diffusers.pipeline_stable_diffusion_img2img import StableDiffusionImg2ImgPipelineMixin -from ..pipelines.diffusers.pipeline_stable_diffusion_inpaint import StableDiffusionInpaintPipelineMixin -from ..pipelines.diffusers.pipeline_stable_diffusion_xl import StableDiffusionXLPipelineMixin -from 
..pipelines.diffusers.pipeline_stable_diffusion_xl_img2img import StableDiffusionXLImg2ImgPipelineMixin -from ..pipelines.diffusers.pipeline_utils import VaeImageProcessor +from ..onnx.utils import _get_model_external_data_paths from ..utils import ( DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, @@ -66,12 +62,12 @@ DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, ) -from .base import ORTModelPart from .io_binding import TypeHelper from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel from .utils import ( ONNX_WEIGHTS_NAME, get_provider_for_device, + np_to_pt_generators, parse_device, validate_provider_availability, ) @@ -80,380 +76,287 @@ logger = logging.getLogger(__name__) -class ORTPipeline(ORTModel): - auto_model_class = None - model_type = "onnx_pipeline" - +# TODO: support from_pipe() +# TODO: Instead of ORTModel, it makes sense to have a compositional ORTMixin +# TODO: instead of one bloated __init__, we should consider an __init__ per pipeline +class ORTDiffusionPipeline(ORTModel, DiffusionPipeline): config_name = "model_index.json" - sub_component_config_name = "config.json" + auto_model_class = DiffusionPipeline def __init__( self, - vae_decoder_session: ort.InferenceSession, + scheduler: "SchedulerMixin", unet_session: ort.InferenceSession, - tokenizer: CLIPTokenizer, - config: Dict[str, Any], - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - feature_extractor: Optional[CLIPFeatureExtractor] = None, + vae_decoder_session: ort.InferenceSession, + # optional pipeline models vae_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_2_session: Optional[ort.InferenceSession] = None, - tokenizer_2: Optional[CLIPTokenizer] = None, + # optional pipeline submodels + tokenizer: Optional["CLIPTokenizer"] = None, + tokenizer_2: Optional["CLIPTokenizer"] = None, + feature_extractor: 
Optional["CLIPFeatureExtractor"] = None, + # stable diffusion xl specific arguments + force_zeros_for_empty_prompt: bool = True, + requires_aesthetics_score: bool = False, + add_watermarker: Optional[bool] = None, + # onnxruntime specific arguments use_io_binding: Optional[bool] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + **kwargs, ): - """ - Args: - vae_decoder_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the VAE decoder - unet_session (`ort.InferenceSession`): - The ONNX Runtime inference session associated to the U-NET. - tokenizer (`CLIPTokenizer`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) - for the text encoder. - config (`Dict[str, Any]`): - A config dictionary from which the model components will be instantiated. Make sure to only load - configuration files of compatible classes. - scheduler (`Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler]`): - A scheduler to be used in combination with the U-NET component to denoise the encoded image latents. - feature_extractor (`Optional[CLIPFeatureExtractor]`, defaults to `None`): - A model extracting features from generated images to be used as inputs for the `safety_checker` - vae_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): - The ONNX Runtime inference session associated to the VAE encoder. - text_encoder_session (`Optional[ort.InferenceSession]`, defaults to `None`): - The ONNX Runtime inference session associated to the text encoder. - tokenizer_2 (`Optional[CLIPTokenizer]`, defaults to `None`): - Tokenizer of class - [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer) - for the second text encoder. - use_io_binding (`Optional[bool]`, defaults to `None`): - Whether to use IOBinding during inference to avoid memory copy between the host and devices. 
Defaults to - `True` if the device is CUDA, otherwise defaults to `False`. - model_save_dir (`Optional[str]`, defaults to `None`): - The directory under which the model exported to ONNX was saved. - """ - self.shared_attributes_init( - model=vae_decoder_session, - use_io_binding=use_io_binding, - model_save_dir=model_save_dir, - ) - self._internal_dict = config - self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) - self.vae_decoder_model_path = Path(vae_decoder_session._model_path) self.unet = ORTModelUnet(unet_session, self) - self.unet_model_path = Path(unet_session._model_path) - - if text_encoder_session is not None: - self.text_encoder_model_path = Path(text_encoder_session._model_path) - self.text_encoder = ORTModelTextEncoder(text_encoder_session, self) - else: - self.text_encoder_model_path = None - self.text_encoder = None - - if vae_encoder_session is not None: - self.vae_encoder_model_path = Path(vae_encoder_session._model_path) - self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) - else: - self.vae_encoder_model_path = None - self.vae_encoder = None + self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) + self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None + self.text_encoder = ( + ORTModelTextEncoder(text_encoder_session, self) if text_encoder_session is not None else None + ) + self.text_encoder_2 = ( + ORTModelTextEncoder(text_encoder_2_session, self) if text_encoder_2_session is not None else None + ) + # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API + self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) - if text_encoder_2_session is not None: - self.text_encoder_2_model_path = Path(text_encoder_2_session._model_path) - self.text_encoder_2 = ORTModelTextEncoder(text_encoder_2_session, self) - else: - self.text_encoder_2_model_path = None - self.text_encoder_2 = None + # we allow passing these as torch models for 
now + self.image_encoder = kwargs.pop("image_encoder", None) # TODO: maybe implement ORTModelImageEncoder + self.safety_checker = kwargs.pop("safety_checker", None) # TODO: maybe implement ORTModelSafetyChecker + self.scheduler = scheduler self.tokenizer = tokenizer self.tokenizer_2 = tokenizer_2 - self.scheduler = scheduler self.feature_extractor = feature_extractor - self.safety_checker = None - - sub_models = { - DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER: self.text_encoder, - DIFFUSION_MODEL_UNET_SUBFOLDER: self.unet, - DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER: self.vae_decoder, - DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER: self.vae_encoder, - DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER: self.text_encoder_2, - } - - # Modify config to keep the resulting model compatible with diffusers pipelines - for name in sub_models.keys(): - self._internal_dict[name] = ( - ("diffusers", "OnnxRuntimeModel") if sub_models[name] is not None else (None, None) - ) - self._internal_dict.pop("vae", None) - - if "block_out_channels" in self.vae_decoder.config: - self.vae_scale_factor = 2 ** (len(self.vae_decoder.config["block_out_channels"]) - 1) - else: - self.vae_scale_factor = 8 - - self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) - - @staticmethod - def load_model( - vae_decoder_path: Union[str, Path], - text_encoder_path: Union[str, Path], - unet_path: Union[str, Path], - vae_encoder_path: Optional[Union[str, Path]] = None, - text_encoder_2_path: Optional[Union[str, Path]] = None, - provider: str = "CPUExecutionProvider", - session_options: Optional[ort.SessionOptions] = None, - provider_options: Optional[Dict] = None, - ): - """ - Creates three inference sessions for respectively the VAE decoder, the text encoder and the U-NET models. - The default provider is `CPUExecutionProvider` to match the default behaviour in PyTorch/TensorFlow/JAX. - Args: - vae_decoder_path (`Union[str, Path]`): - The path to the VAE decoder ONNX model. 
- text_encoder_path (`Union[str, Path]`): - The path to the text encoder ONNX model. - unet_path (`Union[str, Path]`): - The path to the U-NET ONNX model. - vae_encoder_path (`Union[str, Path]`, defaults to `None`): - The path to the VAE encoder ONNX model. - text_encoder_2_path (`Union[str, Path]`, defaults to `None`): - The path to the second text decoder ONNX model. - provider (`str`, defaults to `"CPUExecutionProvider"`): - ONNX Runtime provider to use for loading the model. See https://onnxruntime.ai/docs/execution-providers/ - for possible providers. - session_options (`Optional[ort.SessionOptions]`, defaults to `None`): - ONNX Runtime session options to use for loading the model. Defaults to `None`. - provider_options (`Optional[Dict]`, defaults to `None`): - Provider option dictionary corresponding to the provider used. See available options - for each provider: https://onnxruntime.ai/docs/api/c/group___global.html . Defaults to `None`. - """ - vae_decoder = ORTModel.load_model(vae_decoder_path, provider, session_options, provider_options) - unet = ORTModel.load_model(unet_path, provider, session_options, provider_options) - - sessions = { - "vae_encoder": vae_encoder_path, - "text_encoder": text_encoder_path, - "text_encoder_2": text_encoder_2_path, + all_pipeline_init_args = { + "vae": self.vae, + "unet": self.unet, + "text_encoder": self.text_encoder, + "text_encoder_2": self.text_encoder_2, + "safety_checker": self.safety_checker, + "image_encoder": self.image_encoder, + "scheduler": self.scheduler, + "tokenizer": self.tokenizer, + "tokenizer_2": self.tokenizer_2, + "feature_extractor": self.feature_extractor, + "requires_aesthetics_score": requires_aesthetics_score, + "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, + "add_watermarker": add_watermarker, } - for key, value in sessions.items(): - if value is not None and value.is_file(): - sessions[key] = ORTModel.load_model(value, provider, session_options, provider_options) - else: - 
sessions[key] = None + diffusers_pipeline_args = {} + for key in inspect.signature(self.auto_model_class).parameters.keys(): + if key in all_pipeline_init_args: + diffusers_pipeline_args[key] = all_pipeline_init_args[key] + # inits diffusers pipeline specific attributes (registers modules and config) + self.auto_model_class.__init__(self, **diffusers_pipeline_args) - return vae_decoder, sessions["text_encoder"], unet, sessions["vae_encoder"], sessions["text_encoder_2"] + # inits ort specific attributes + self.shared_attributes_init( + model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, **kwargs + ) def _save_pretrained(self, save_directory: Union[str, Path]): save_directory = Path(save_directory) - src_to_dst_path = { - self.vae_decoder_model_path: save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.text_encoder_model_path: save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / ONNX_WEIGHTS_NAME, - self.unet_model_path: save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER / ONNX_WEIGHTS_NAME, - } - sub_models_to_save = { - self.vae_encoder_model_path: DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, - self.text_encoder_2_model_path: DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + models_to_save_paths = { + (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER), + (self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER), + (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), + (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), + (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), } - for path, subfolder in sub_models_to_save.items(): - if path is not None: - src_to_dst_path[path] = save_directory / subfolder / ONNX_WEIGHTS_NAME - - # TODO: Modify _get_external_data_paths to give dictionnary - src_paths = list(src_to_dst_path.keys()) - dst_paths = list(src_to_dst_path.values()) - # Add external data paths in case of 
large models - src_paths, dst_paths = _get_external_data_paths(src_paths, dst_paths) - - for src_path, dst_path in zip(src_paths, dst_paths): - dst_path.parent.mkdir(parents=True, exist_ok=True) - shutil.copyfile(src_path, dst_path) - config_path = src_path.parent / self.sub_component_config_name - if config_path.is_file(): - shutil.copyfile(config_path, dst_path.parent / self.sub_component_config_name) + for model, save_path in models_to_save_paths: + if model is not None: + model_path = Path(model.session._model_path) + save_path.mkdir(parents=True, exist_ok=True) + # copy onnx model + shutil.copyfile(model_path, save_path / ONNX_WEIGHTS_NAME) + # copy external onnx data + external_data_paths = _get_model_external_data_paths(model_path) + for external_data_path in external_data_paths: + shutil.copyfile(external_data_path, save_path / external_data_path.name) + # copy model config + config_path = model_path.parent / CONFIG_NAME + if config_path.is_file(): + config_save_path = save_path / CONFIG_NAME + shutil.copyfile(config_path, config_save_path) self.scheduler.save_pretrained(save_directory / "scheduler") - if self.feature_extractor is not None: - self.feature_extractor.save_pretrained(save_directory / "feature_extractor") if self.tokenizer is not None: self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.feature_extractor is not None: + self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @classmethod def _from_pretrained( cls, model_id: Union[str, Path], config: Dict[str, Any], - use_auth_token: Optional[Union[bool, str]] = None, - token: Optional[Union[bool, str]] = None, + subfolder: str = "", + force_download: bool = False, + local_files_only: bool = False, revision: Optional[str] = None, + trust_remote_code: bool = False, cache_dir: str = HUGGINGFACE_HUB_CACHE, - vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, - 
text_encoder_file_name: str = ONNX_WEIGHTS_NAME, + token: Optional[Union[bool, str]] = None, unet_file_name: str = ONNX_WEIGHTS_NAME, + vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, vae_encoder_file_name: str = ONNX_WEIGHTS_NAME, + text_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_2_file_name: str = ONNX_WEIGHTS_NAME, - local_files_only: bool = False, + use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", - session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, - use_io_binding: Optional[bool] = None, + session_options: Optional[ort.SessionOptions] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, + if use_io_binding: + raise ValueError( + "IOBinding is not yet available for diffusion pipelines, please set `use_io_binding` to False." 
) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - - if provider == "TensorrtExecutionProvider": - raise ValueError("The provider `'TensorrtExecutionProvider'` is not supported") - model_id = str(model_id) - patterns = set(config.keys()) - sub_models_to_load = patterns.intersection({"feature_extractor", "tokenizer", "tokenizer_2", "scheduler"}) - - if not os.path.isdir(model_id): - patterns.update({"vae_encoder", "vae_decoder"}) - allow_patterns = {os.path.join(k, "*") for k in patterns if not k.startswith("_")} + if not os.path.isdir(str(model_id)): + all_components = {key for key in config.keys() if not key.startswith("_")} | {"vae_encoder", "vae_decoder"} + allow_patterns = {os.path.join(component, "*") for component in all_components} allow_patterns.update( { - vae_decoder_file_name, - text_encoder_file_name, unet_file_name, + vae_decoder_file_name, vae_encoder_file_name, + text_encoder_file_name, text_encoder_2_file_name, SCHEDULER_CONFIG_NAME, - CONFIG_NAME, cls.config_name, + CONFIG_NAME, } ) - # Downloads all repo's files matching the allowed patterns - model_id = snapshot_download( + model_save_folder = snapshot_download( model_id, cache_dir=cache_dir, + force_download=force_download, local_files_only=local_files_only, - token=token, revision=revision, + token=token, allow_patterns=allow_patterns, ignore_patterns=["*.msgpack", "*.safetensors", "*.bin", "*.xml"], ) - new_model_save_dir = Path(model_id) + else: + model_save_folder = str(model_id) + + model_save_path = Path(model_save_folder) + + if model_save_dir is None: + model_save_dir = model_save_path + + model_paths = { + "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, + "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, + 
"text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, + "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + } + + sessions = {} + for model, path in model_paths.items(): + if kwargs.get(model, None) is not None: + # this allows passing a model directly to from_pretrained + sessions[f"{model}_session"] = kwargs.pop(model) + else: + sessions[f"{model}_session"] = ( + ORTModel.load_model(path, provider, session_options, provider_options) if path.is_file() else None + ) - sub_models = {} - for name in sub_models_to_load: - library_name, library_classes = config[name] - if library_classes is not None: + submodels = {} + for submodel in {"scheduler", "tokenizer", "tokenizer_2", "feature_extractor"}: + if kwargs.get(submodel, None) is not None: + submodels[submodel] = kwargs.pop(submodel) + elif config.get(submodel, (None, None))[0] is not None: + library_name, library_classes = config.get(submodel) library = importlib.import_module(library_name) class_obj = getattr(library, library_classes) load_method = getattr(class_obj, "from_pretrained") # Check if the module is in a subdirectory - if (new_model_save_dir / name).is_dir(): - sub_models[name] = load_method(new_model_save_dir / name) + if (model_save_path / submodel).is_dir(): + submodels[submodel] = load_method(model_save_path / submodel) else: - sub_models[name] = load_method(new_model_save_dir) - - vae_decoder, text_encoder, unet, vae_encoder, text_encoder_2 = cls.load_model( - vae_decoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, - text_encoder_path=new_model_save_dir / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, - unet_path=new_model_save_dir / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, - vae_encoder_path=new_model_save_dir / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, - text_encoder_2_path=( - new_model_save_dir / 
DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name - ), - provider=provider, - session_options=session_options, - provider_options=provider_options, - ) - - if model_save_dir is None: - model_save_dir = new_model_save_dir + submodels[submodel] = load_method(model_save_path) - if use_io_binding: - raise ValueError( - "IOBinding is not yet available for stable diffusion model, please set `use_io_binding` to False." - ) + # same as DiffusionPipeline.from_pretrained, if called directly, it loads the class in the config + if cls.__name__ == "ORTDiffusionPipeline": + class_name = config["_class_name"] + ort_pipeline_class = _get_ort_class(class_name) + else: + ort_pipeline_class = cls - return cls( - vae_decoder_session=vae_decoder, - text_encoder_session=text_encoder, - unet_session=unet, - config=config, - tokenizer=sub_models.get("tokenizer", None), - scheduler=sub_models.get("scheduler"), - feature_extractor=sub_models.get("feature_extractor", None), - tokenizer_2=sub_models.get("tokenizer_2", None), - vae_encoder_session=vae_encoder, - text_encoder_2_session=text_encoder_2, + ort_pipeline = ort_pipeline_class( + **sessions, + **submodels, use_io_binding=use_io_binding, model_save_dir=model_save_dir, + **kwargs, ) + # same as in DiffusionPipeline.from_pretrained, we save where the model was instantiated from + ort_pipeline.register_to_config(_name_or_path=config.get("_name_or_path", str(model_id))) + + return ort_pipeline + @classmethod - def _from_transformers( + def _export( cls, model_id: str, - config: Optional[str] = None, - use_auth_token: Optional[Union[bool, str]] = None, - token: Optional[Union[bool, str]] = None, - revision: str = "main", - force_download: bool = True, - cache_dir: str = HUGGINGFACE_HUB_CACHE, + config: Dict[str, Any], subfolder: str = "", + force_download: bool = False, local_files_only: bool = False, + revision: Optional[str] = None, trust_remote_code: bool = False, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + token:
Optional[Union[bool, str]] = None, + use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", session_options: Optional[ort.SessionOptions] = None, provider_options: Optional[Dict[str, Any]] = None, - use_io_binding: Optional[bool] = None, task: Optional[str] = None, - ) -> "ORTPipeline": - if use_auth_token is not None: - warnings.warn( - "The `use_auth_token` argument is deprecated and will be removed soon. Please use the `token` argument instead.", - FutureWarning, - ) - if token is not None: - raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") - token = use_auth_token - + **kwargs, + ) -> "ORTDiffusionPipeline": if task is None: task = cls._auto_model_to_task(cls.auto_model_class) - save_dir = TemporaryDirectory() - save_dir_path = Path(save_dir.name) + # we continue passing the model_save_dir from here on to avoid it being cleaned up + # might be better to use a persistent temporary directory such as the one implemented in + # https://gist.github.com/twolfson/2929dc1163b0a76d2c2b66d51f9bc808 + model_save_dir = TemporaryDirectory() + model_save_path = Path(model_save_dir.name) main_export( - model_name_or_path=model_id, - output=save_dir_path, - task=task, + model_id, + output=model_save_path, do_validation=False, no_post_process=True, - subfolder=subfolder, + token=token, revision=revision, cache_dir=cache_dir, - token=token, - local_files_only=local_files_only, + subfolder=subfolder, force_download=force_download, + local_files_only=local_files_only, trust_remote_code=trust_remote_code, + library_name="diffusers", + task=task, ) return cls._from_pretrained( - save_dir_path, + model_save_path, config=config, provider=provider, - session_options=session_options, provider_options=provider_options, + session_options=session_options, use_io_binding=use_io_binding, - model_save_dir=save_dir, + model_save_dir=model_save_dir, + **kwargs, ) def to(self, device: Union[torch.device, str, int]): @@ 
-471,19 +374,22 @@ def to(self, device: Union[torch.device, str, int]): device, provider_options = parse_device(device) provider = get_provider_for_device(device) - validate_provider_availability(provider) # raise error if the provider is not available + validate_provider_availability(provider) if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider": return self - self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) - self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) self.unet.session.set_providers([provider], provider_options=[provider_options]) + self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) if self.vae_encoder is not None: self.vae_encoder.session.set_providers([provider], provider_options=[provider_options]) + if self.text_encoder is not None: + self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) + if self.text_encoder_2 is not None: + self.text_encoder_2.session.set_providers([provider], provider_options=[provider_options]) - self.providers = self.vae_decoder.session.get_providers() + self.providers = self.unet.session.get_providers() self._device = device return self @@ -495,41 +401,142 @@ def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): def _save_config(self, save_directory): self.save_config(save_directory) + @property + def components(self) -> Dict[str, Any]: + components = { + "vae": self.vae, + "unet": self.unet, + "text_encoder": self.text_encoder, + "text_encoder_2": self.text_encoder_2, + "safety_checker": self.safety_checker, + "image_encoder": self.image_encoder, + } + components = {k: v for k, v in components.items() if v is not None} + return components -class ORTPipelinePart(ORTModelPart): - CONFIG_NAME = "config.json" + def __call__(self, *args, **kwargs): + # we do this to keep numpy random states support for now + # TODO: deprecate and add 
warnings when a random state is passed - def __init__(self, session: ort.InferenceSession, parent_model: ORTPipeline): - config_path = Path(session._model_path).parent / self.CONFIG_NAME + args = list(args) + for i in range(len(args)): + args[i] = np_to_pt_generators(args[i], self.device) - if config_path.is_file(): - # TODO: use FrozenDict - self.config = parent_model._dict_from_json_file(config_path) - else: - self.config = {} + for k, v in kwargs.items(): + kwargs[k] = np_to_pt_generators(v, self.device) + + return self.auto_model_class.__call__(self, *args, **kwargs) - super().__init__(session, parent_model) + +class ORTPipelinePart(ConfigMixin): + config_name: str = CONFIG_NAME + + def __init__(self, session: ort.InferenceSession, parent_pipeline: ORTDiffusionPipeline): + self.session = session + self.parent_pipeline = parent_pipeline + + self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} + self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in self.session.get_inputs()} + self.output_dtypes = {output_key.name: output_key.type for output_key in self.session.get_outputs()} + + config_file_path = Path(session._model_path).parent / self.config_name + if not config_file_path.is_file(): + # config is mandatory for the model part to be used for inference + raise ValueError(f"Configuration file for {self.__class__.__name__} not found at {config_file_path}") + config_dict = self._dict_from_json_file(config_file_path) + self.register_to_config(**config_dict) @property - def input_dtype(self): - # for backward compatibility and diffusion mixins (will be standardized in the future) - return {name: TypeHelper.ort_type_to_numpy_type(ort_type) for name, ort_type in self.input_dtypes.items()} + def device(self): + return self.parent_pipeline.device + @property + def dtype(self): + for dtype in 
self.input_dtypes.values(): + torch_dtype = TypeHelper.ort_type_to_torch_type(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + for dtype in self.output_dtypes.values(): + torch_dtype = TypeHelper.ort_type_to_torch_type(dtype) + if torch_dtype.is_floating_point: + return torch_dtype + + return None + + def to(self, *args, device: Optional[Union[torch.device, str, int]] = None, dtype: Optional[torch.dtype] = None): + for arg in args: + if isinstance(arg, torch.device): + device = arg + elif isinstance(arg, (int, str)): + device = torch.device(arg) + elif isinstance(arg, torch.dtype): + dtype = arg + + if device is not None and device != self.device: + raise ValueError( + "Cannot change the device of a pipeline part without changing the device of the parent pipeline. " + "Please use the `to` method of the parent pipeline to change the device." + ) -class ORTModelTextEncoder(ORTPipelinePart): - def forward(self, input_ids: Union[np.ndarray, torch.Tensor]): - use_torch = isinstance(input_ids, torch.Tensor) + if dtype is not None and dtype != self.dtype: + raise NotImplementedError( + f"Cannot change the dtype of the pipeline from {self.dtype} to {dtype}. " + f"Please export the pipeline with the desired dtype." 
+ ) - model_inputs = {"input_ids": input_ids} + def prepare_onnx_inputs(self, use_torch: bool, **inputs: Union[torch.Tensor, np.ndarray]) -> Dict[str, np.ndarray]: + onnx_inputs = {} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) - onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + # converts pytorch inputs into numpy inputs for onnx + for input_name in self.input_names.keys(): + onnx_inputs[input_name] = inputs.pop(input_name) - return ModelOutput(**model_outputs) + if use_torch: + onnx_inputs[input_name] = onnx_inputs[input_name].numpy(force=True) + + if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]: + onnx_inputs[input_name] = onnx_inputs[input_name].astype( + TypeHelper.ort_type_to_numpy_type(self.input_dtypes[input_name]) + ) + + return onnx_inputs + + def prepare_onnx_outputs( + self, use_torch: bool, *onnx_outputs: np.ndarray + ) -> Dict[str, Union[torch.Tensor, np.ndarray]]: + model_outputs = {} + + # converts onnxruntime outputs into tensor for standard outputs + for output_name, idx in self.output_names.items(): + model_outputs[output_name] = onnx_outputs[idx] + + if use_torch: + model_outputs[output_name] = torch.from_numpy(model_outputs[output_name]).to(self.device) + + return model_outputs + + @abstractmethod + def forward(self, *args, **kwargs): + raise NotImplementedError + + def __call__(self, *args, **kwargs): + return self.forward(*args, **kwargs) class ORTModelUnet(ORTPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "time_cond_proj_dim"): + logger.warning( + "The `time_cond_proj_dim` attribute is missing from the UNet configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(time_cond_proj_dim=None) + def forward( self, sample: Union[np.ndarray, torch.Tensor], @@ -538,9 +545,15 @@ def forward( text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = False, ): use_torch = isinstance(sample, torch.Tensor) + if len(timestep.shape) == 0: + timestep = timestep.unsqueeze(0) + model_inputs = { "sample": sample, "timestep": timestep, @@ -548,171 +561,323 @@ def forward( "text_embeds": text_embeds, "time_ids": time_ids, "timestep_cond": timestep_cond, + **(cross_attention_kwargs or {}), + **(added_cond_kwargs or {}), } - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) -class ORTModelVaeDecoder(ORTPipelinePart): - def forward(self, latent_sample: Union[np.ndarray, torch.Tensor]): - use_torch = isinstance(latent_sample, torch.Tensor) +class ORTModelTextEncoder(ORTPipelinePart): + def forward( + self, + input_ids: Union[np.ndarray, torch.Tensor], + attention_mask: Optional[Union[np.ndarray, torch.Tensor]] = None, + output_hidden_states: Optional[bool] = None, + return_dict: bool = False, + ): + use_torch = isinstance(input_ids, torch.Tensor) - model_inputs = {"latent_sample": latent_sample} + model_inputs = {"input_ids": input_ids} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, 
onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if output_hidden_states: + model_outputs["hidden_states"] = [] + for i in range(self.config.num_hidden_layers): + model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) + model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) + else: + for i in range(self.config.num_hidden_layers): + model_outputs.pop(f"hidden_states.{i}", None) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) class ORTModelVaeEncoder(ORTPipelinePart): - def forward(self, sample: Union[np.ndarray, torch.Tensor]): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "scaling_factor"): + logger.warning( + "The `scaling_factor` attribute is missing from the VAE encoder configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) + + def forward( + self, + sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): use_torch = isinstance(sample, torch.Tensor) model_inputs = {"sample": sample} - onnx_inputs = self._prepare_onnx_inputs(use_torch, **model_inputs) + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) onnx_outputs = self.session.run(None, onnx_inputs) - model_outputs = self._prepare_onnx_outputs(use_torch, *onnx_outputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if "latent_sample" in model_outputs: + model_outputs["latents"] = model_outputs.pop("latent_sample") + + if "latent_parameters" in model_outputs: + model_outputs["latent_dist"] = DiagonalGaussianDistribution( + parameters=model_outputs.pop("latent_parameters") + ) + + if return_dict: + return model_outputs return ModelOutput(**model_outputs) +class ORTModelVaeDecoder(ORTPipelinePart): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # can be missing from models exported long ago + if not hasattr(self.config, "scaling_factor"): + logger.warning( + "The `scaling_factor` attribute is missing from the VAE decoder configuration. " + "Please re-export the model with newer version of optimum and diffusers." 
+ ) + self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) + + def forward( + self, + latent_sample: Union[np.ndarray, torch.Tensor], + generator: Optional[torch.Generator] = None, + return_dict: bool = False, + ): + use_torch = isinstance(latent_sample, torch.Tensor) + + model_inputs = {"latent_sample": latent_sample} + + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if "latent_sample" in model_outputs: + model_outputs["latents"] = model_outputs.pop("latent_sample") + + if return_dict: + return model_outputs + + return ModelOutput(**model_outputs) + + +class ORTWrapperVae(ORTPipelinePart): + def __init__(self, encoder: ORTModelVaeEncoder, decoder: ORTModelVaeDecoder): + self.decoder = decoder + self.encoder = encoder + + @property + def config(self): + return self.decoder.config + + @property + def dtype(self): + return self.decoder.dtype + + @property + def device(self): + return self.decoder.device + + def decode(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + def encode(self, *args, **kwargs): + return self.encoder(*args, **kwargs) + + def to(self, *args, **kwargs): + self.decoder.to(*args, **kwargs) + if self.encoder is not None: + self.encoder.to(*args, **kwargs) + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionPipeline(ORTPipeline, StableDiffusionPipelineMixin): +class ORTStableDiffusionPipeline(ORTDiffusionPipeline, StableDiffusionPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusionPipeline). 
""" main_input_name = "prompt" + export_feature = "text-to-image" auto_model_class = StableDiffusionPipeline - __call__ = StableDiffusionPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionImg2ImgPipeline(ORTPipeline, StableDiffusionImg2ImgPipelineMixin): +class ORTStableDiffusionImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionImg2ImgPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusionImg2ImgPipeline). """ - main_input_name = "prompt" + main_input_name = "image" + export_feature = "image-to-image" auto_model_class = StableDiffusionImg2ImgPipeline - __call__ = StableDiffusionImg2ImgPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionInpaintPipeline(ORTPipeline, StableDiffusionInpaintPipelineMixin): +class ORTStableDiffusionInpaintPipeline(ORTDiffusionPipeline, StableDiffusionInpaintPipeline): """ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusionInpaintPipeline). """ main_input_name = "prompt" + export_feature = "inpainting" auto_model_class = StableDiffusionInpaintPipeline - __call__ = StableDiffusionInpaintPipelineMixin.__call__ - @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTLatentConsistencyModelPipeline(ORTPipeline, LatentConsistencyPipelineMixin): +class ORTStableDiffusionXLPipeline(ORTDiffusionPipeline, StableDiffusionXLPipeline): """ - ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). 
+ ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). """ main_input_name = "prompt" - auto_model_class = LatentConsistencyModelPipeline - - __call__ = LatentConsistencyPipelineMixin.__call__ - + export_feature = "text-to-image" + auto_model_class = StableDiffusionXLPipeline -class ORTStableDiffusionXLPipelineBase(ORTPipeline): - def __init__( + def _get_add_time_ids( self, - vae_decoder_session: ort.InferenceSession, - text_encoder_session: ort.InferenceSession, - unet_session: ort.InferenceSession, - config: Dict[str, Any], - tokenizer: CLIPTokenizer, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - feature_extractor: Optional[CLIPFeatureExtractor] = None, - vae_encoder_session: Optional[ort.InferenceSession] = None, - text_encoder_2_session: Optional[ort.InferenceSession] = None, - tokenizer_2: Optional[CLIPTokenizer] = None, - use_io_binding: Optional[bool] = None, - model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, - add_watermarker: Optional[bool] = None, + original_size, + crops_coords_top_left, + target_size, + dtype, + text_encoder_projection_dim=None, ): - super().__init__( - vae_decoder_session=vae_decoder_session, - text_encoder_session=text_encoder_session, - unet_session=unet_session, - config=config, - tokenizer=tokenizer, - scheduler=scheduler, - feature_extractor=feature_extractor, - vae_encoder_session=vae_encoder_session, - text_encoder_2_session=text_encoder_2_session, - tokenizer_2=tokenizer_2, - use_io_binding=use_io_binding, - model_save_dir=model_save_dir, - ) + add_time_ids = list(original_size + crops_coords_top_left + target_size) - add_watermarker = add_watermarker if add_watermarker is not None else is_invisible_watermark_available() + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + return add_time_ids - if 
add_watermarker: - if not is_invisible_watermark_available(): - raise ImportError( - "`add_watermarker` requires invisible-watermark to be installed, which can be installed with `pip install invisible-watermark`." - ) - from ..pipelines.diffusers.watermark import StableDiffusionXLWatermarker +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTStableDiffusionXLImg2ImgPipeline(ORTDiffusionPipeline, StableDiffusionXLImg2ImgPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). + """ - self.watermark = StableDiffusionXLWatermarker() + main_input_name = "prompt" + export_feature = "image-to-image" + auto_model_class = StableDiffusionXLImg2ImgPipeline + + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) else: - self.watermark = None + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) + + return add_time_ids, add_neg_time_ids @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLPipelineMixin): +class ORTStableDiffusionXLInpaintPipeline(ORTDiffusionPipeline, 
StableDiffusionXLInpaintPipeline): """ - ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLPipeline). + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusionXLInpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLInpaintPipeline). """ - main_input_name = "prompt" - auto_model_class = StableDiffusionXLPipeline + main_input_name = "image" + export_feature = "inpainting" + auto_model_class = StableDiffusionXLInpaintPipeline + + def _get_add_time_ids( + self, + original_size, + crops_coords_top_left, + target_size, + aesthetic_score, + negative_aesthetic_score, + negative_original_size, + negative_crops_coords_top_left, + negative_target_size, + dtype, + text_encoder_projection_dim=None, + ): + if self.config.requires_aesthetics_score: + add_time_ids = list(original_size + crops_coords_top_left + (aesthetic_score,)) + add_neg_time_ids = list( + negative_original_size + negative_crops_coords_top_left + (negative_aesthetic_score,) + ) + else: + add_time_ids = list(original_size + crops_coords_top_left + target_size) + add_neg_time_ids = list(negative_original_size + crops_coords_top_left + negative_target_size) + + add_time_ids = torch.tensor([add_time_ids], dtype=dtype) + add_neg_time_ids = torch.tensor([add_neg_time_ids], dtype=dtype) - __call__ = StableDiffusionXLPipelineMixin.__call__ + return add_time_ids, add_neg_time_ids @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTStableDiffusionXLImg2ImgPipeline(ORTStableDiffusionXLPipelineBase, StableDiffusionXLImg2ImgPipelineMixin): +class ORTLatentConsistencyModelPipeline(ORTDiffusionPipeline, LatentConsistencyModelPipeline): """ - ONNX Runtime-powered stable diffusion pipeline corresponding to 
[diffusers.StableDiffusionXLImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#diffusers.StableDiffusionXLImg2ImgPipeline). + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency#diffusers.LatentConsistencyModelPipeline). """ main_input_name = "prompt" - auto_model_class = StableDiffusionXLImg2ImgPipeline + export_feature = "text-to-image" + auto_model_class = LatentConsistencyModelPipeline + + +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTLatentConsistencyModelImg2ImgPipeline(ORTDiffusionPipeline, LatentConsistencyModelImg2ImgPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.LatentConsistencyModelImg2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/latent_consistency_img2img#diffusers.LatentConsistencyModelImg2ImgPipeline). 
+ """ - __call__ = StableDiffusionXLImg2ImgPipelineMixin.__call__ + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = LatentConsistencyModelImg2ImgPipeline SUPPORTED_ORT_PIPELINES = [ ORTStableDiffusionPipeline, ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, - ORTLatentConsistencyModelPipeline, ORTStableDiffusionXLPipeline, ORTStableDiffusionXLImg2ImgPipeline, + ORTStableDiffusionXLInpaintPipeline, + ORTLatentConsistencyModelPipeline, + ORTLatentConsistencyModelImg2ImgPipeline, ] -def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): +def _get_ort_class(pipeline_class_name: str, throw_error_if_not_exist: bool = True): for ort_pipeline_class in SUPPORTED_ORT_PIPELINES: if ( ort_pipeline_class.__name__ == pipeline_class_name @@ -724,31 +889,6 @@ def _get_pipeline_class(pipeline_class_name: str, throw_error_if_not_exist: bool raise ValueError(f"ORTDiffusionPipeline can't find a pipeline linked to {pipeline_class_name}") -class ORTDiffusionPipeline(ConfigMixin): - config_name = "model_index.json" - - @classmethod - @validate_hf_hub_args - def from_pretrained(cls, pretrained_model_or_path, **kwargs): - load_config_kwargs = { - "force_download": kwargs.get("force_download", False), - "resume_download": kwargs.get("resume_download", None), - "local_files_only": kwargs.get("local_files_only", False), - "cache_dir": kwargs.get("cache_dir", None), - "revision": kwargs.get("revision", None), - "proxies": kwargs.get("proxies", None), - "token": kwargs.get("token", None), - } - - config = cls.load_config(pretrained_model_or_path, **load_config_kwargs) - config = config[0] if isinstance(config, tuple) else config - class_name = config["_class_name"] - - ort_pipeline_class = _get_pipeline_class(class_name) - - return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) - - ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionPipeline), 
@@ -761,12 +901,14 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): [ ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), + ("latent-consistency", ORTLatentConsistencyModelImg2ImgPipeline), ] ) ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ("stable-diffusion-xl", ORTStableDiffusionXLInpaintPipeline), ] ) @@ -777,7 +919,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): ] -def _get_task_class(mapping, pipeline_class_name): +def _get_task_ort_class(mapping, pipeline_class_name): def _get_model_name(pipeline_class_name): for ort_pipelines_mapping in SUPPORTED_ORT_PIPELINES_MAPPINGS: for model_name, ort_pipeline_class in ort_pipelines_mapping.items(): @@ -801,7 +943,8 @@ class ORTPipelineForTask(ConfigMixin): config_name = "model_index.json" @classmethod - def from_pretrained(cls, pretrained_model_or_path, **kwargs): + @validate_hf_hub_args + def from_pretrained(cls, pretrained_model_or_path, **kwargs) -> ORTDiffusionPipeline: load_config_kwargs = { "force_download": kwargs.get("force_download", False), "resume_download": kwargs.get("resume_download", None), @@ -815,7 +958,7 @@ def from_pretrained(cls, pretrained_model_or_path, **kwargs): config = config[0] if isinstance(config, tuple) else config class_name = config["_class_name"] - ort_pipeline_class = _get_task_class(cls.ort_pipelines_mapping, class_name) + ort_pipeline_class = _get_task_ort_class(cls.ort_pipelines_mapping, class_name) return ort_pipeline_class.from_pretrained(pretrained_model_or_path, **kwargs) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 17bd3e2a4e7..9b29afa566b 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -938,7 +938,7 @@ def _prepare_onnx_inputs( onnx_inputs[input_name] = inputs.pop(input_name) if use_torch: - onnx_inputs[input_name] = 
onnx_inputs[input_name].cpu().detach().numpy() + onnx_inputs[input_name] = onnx_inputs[input_name].numpy(force=True) if onnx_inputs[input_name].dtype != self.input_dtypes[input_name]: onnx_inputs[input_name] = onnx_inputs[input_name].astype( diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index fda3ca82bbe..27e0dc01b4c 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -67,7 +67,7 @@ if check_if_transformers_greater("4.25.0"): from transformers.generation import GenerationMixin else: - from transformers.generation_utils import GenerationMixin + from transformers.generation_utils import GenerationMixin # type: ignore if check_if_transformers_greater("4.43.0"): diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 985980e31b0..128e2406f11 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -403,3 +403,18 @@ def evaluation_loop( metrics = {} return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=len(dataset)) + + +def np_to_pt_generators(np_object, device): + if isinstance(np_object, np.random.RandomState): + return torch.Generator(device=device).manual_seed(int(np_object.get_state()[1][0])) + elif isinstance(np_object, np.random.Generator): + return torch.Generator(device=device).manual_seed(int(np_object.bit_generator.state[1][0])) + elif isinstance(np_object, list) and isinstance(np_object[0], (np.random.RandomState, np.random.Generator)): + return [np_to_pt_generators(a, device) for a in np_object] + elif isinstance(np_object, dict) and isinstance( + next(iter(np_object.values())), (np.random.RandomState, np.random.Generator) + ): + return {k: np_to_pt_generators(v, device) for k, v in np_object.items()} + else: + return np_object diff --git a/optimum/pipelines/diffusers/pipeline_latent_consistency.py b/optimum/pipelines/diffusers/pipeline_latent_consistency.py deleted file 
mode 100644 index 630d463de73..00000000000 --- a/optimum/pipelines/diffusers/pipeline_latent_consistency.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import logging -from typing import Callable, List, Optional, Union - -import numpy as np -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -logger = logging.getLogger(__name__) - - -class LatentConsistencyPipelineMixin(StableDiffusionPipelineMixin): - # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264 - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 4, - original_inference_steps: int = None, - guidance_scale: float = 8.5, - num_images_per_prompt: int = 1, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - ): - r""" - Function invoked when calling the pipeline for generation. 
- - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. 
Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor - - # Don't need to get negative prompts due to LCM guided distillation - negative_prompt = None - negative_prompt_embeds = None - - # check inputs. 
Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - False, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps, original_inference_steps=original_inference_steps) - timesteps = self.scheduler.timesteps - - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - self.unet.config["in_channels"], - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - bs = batch_size * num_images_per_prompt - # get Guidance Scale Embedding - w = np.full(bs, guidance_scale - 1, dtype=prompt_embeds.dtype) - w_embedding = self.get_guidance_scale_embedding( - w, embedding_dim=self.unet.config["time_cond_proj_dim"], dtype=prompt_embeds.dtype - ) - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latents, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - timestep_cond=w_embedding, - )[0] - - # compute the previous noisy sample x_t -> x_t-1 - latents, denoised = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), return_dict=False - ) - latents, denoised = latents.numpy(), denoised.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > 
num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = denoised - has_nsfw_concept = None - else: - denoised /= self.vae_decoder.config["scaling_factor"] - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=denoised[i : i + 1])[0] for i in range(denoised.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - # Adapted from https://github.com/huggingface/diffusers/blob/v0.22.0/src/diffusers/pipelines/latent_consistency/pipeline_latent_consistency.py#L264 - def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=None): - """ - See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 - - Args: - timesteps (`torch.Tensor`): - generate embedding vectors at these timesteps - embedding_dim (`int`, *optional*, defaults to 512): - dimension of the embeddings to generate - dtype: - data type of the generated embeddings - - Returns: - `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` - """ - w = w * 1000 - half_dim = embedding_dim // 2 - emb = np.log(10000.0) / (half_dim - 1) - emb = np.exp(np.arange(half_dim, dtype=dtype) * -emb) - emb = w[:, None] * emb[None, :] - emb = np.concatenate([np.sin(emb), np.cos(emb)], axis=1) - - if embedding_dim % 2 == 1: # zero pad - emb = np.pad(emb, [(0, 0), (0, 1)]) - - assert emb.shape == 
(w.shape[0], embedding_dim) - return emb diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion.py deleted file mode 100644 index 6cc47fab1b9..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion.py +++ /dev/null @@ -1,427 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import logging -from typing import Callable, List, Optional, Union - -import numpy as np -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput - -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg - - -logger = logging.getLogger(__name__) - - -class StableDiffusionPipelineMixin(DiffusionPipelineMixin): - # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L114 - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. 
- - Args: - prompt (`Union[str, List[str]]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if prompt_embeds is None: - # get prompt text embeddings - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = self.tokenizer(prompt, padding="max_length", return_tensors="np").input_ids - - if not np.array_equal(text_input_ids, untruncated_ids): - removed_text = self.tokenizer.batch_decode( - untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] - ) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {self.tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = self.text_encoder(input_ids=text_input_ids.astype(np.int32))[0] - - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) - - 
# get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance and negative_prompt_embeds is None: - uncond_tokens: List[str] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] * batch_size - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." - ) - else: - uncond_tokens = negative_prompt - - max_length = prompt_embeds.shape[1] - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - negative_prompt_embeds = self.text_encoder(input_ids=uncond_input.input_ids.astype(np.int32))[0] - - if do_classifier_free_guidance: - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - prompt_embeds = np.concatenate([negative_prompt_embeds, prompt_embeds]) - - return prompt_embeds - - # Copied from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L217 - def check_inputs( - self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
- ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*shape).astype(dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
- ) - elif latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - - return latents - - # Adapted from https://github.com/huggingface/diffusers/blob/v0.17.1/src/diffusers/pipelines/stable_diffusion/pipeline_onnx_stable_diffusion.py#L264 - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - guidance_rescale: float = 0.0, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. 
of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. 
Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.0): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - - # check inputs. 
Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - self.unet.config.get("in_channels", 4), - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) - - def run_safety_checker(self, image: np.ndarray): - if self.safety_checker is None: - has_nsfw_concept = None - else: - feature_extractor_input = self.image_processor.numpy_to_pil(image) - safety_checker_input = self.feature_extractor( - feature_extractor_input, return_tensors="np" - ).pixel_values.astype(image.dtype) - images, has_nsfw_concept = [], [] - for i in range(image.shape[0]): - image_i, has_nsfw_concept_i = self.safety_checker( - clip_input=safety_checker_input[i : i + 1], images=image[i : i + 1] - ) - images.append(image_i) - 
has_nsfw_concept.append(has_nsfw_concept_i[0]) - image = np.concatenate(images) - - return image, has_nsfw_concept diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py deleted file mode 100644 index a66035a789b..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_img2img.py +++ /dev/null @@ -1,309 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -class StableDiffusionImg2ImgPipelineMixin(StableDiffusionPipelineMixin): - # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - strength: float, - callback_steps: int, - negative_prompt: Optional[Union[str, List[str]]] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
- ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): - batch_size = batch_size * num_images_per_prompt - - if image.shape[1] == 4: - init_latents = image - else: - init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = np.concatenate([init_latents], axis=0) - - # add noise to latents using the timesteps - if isinstance(generator, np.random.RandomState): - noise = generator.randn(*init_latents.shape).astype(dtype) - elif isinstance(generator, torch.Generator): - noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
- ) - - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ).numpy() - - return init_latents - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionImg2ImgPipeline.__call__ - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - image: Union[np.ndarray, PIL.Image.Image] = None, - strength: float = 0.8, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`Union[np.ndarray, PIL.Image.Image]`): - `Image`, or tensor representing an image batch which will be upscaled. - strength (`float`, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`): - A np.random.RandomState to make generation deterministic. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
- return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # check inputs. Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - # define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - - image = self.image_processor.preprocess(image) - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. 
- do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - latents_dtype = prompt_embeds.dtype - image = image.astype(latents_dtype) - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - - # get the original timestep using init_timestep - offset = self.scheduler.config.get("steps_offset", 0) - init_timestep = int(num_inference_steps * strength) + offset - init_timestep = min(init_timestep, num_inference_steps) - - timesteps = self.scheduler.timesteps.numpy()[-init_timestep] - timesteps = np.array([timesteps] * batch_size * num_images_per_prompt) - - # 5. Prepare latent variables - latents = self.prepare_latents(image, timesteps, batch_size, num_images_per_prompt, latents_dtype, generator) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
- # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - t_start = max(num_inference_steps - init_timestep + offset, 0) - timesteps = self.scheduler.timesteps[t_start:].numpy() - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ - 0 - ] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - else: - latents 
/= scaling_factor - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py deleted file mode 100644 index cb3c7db96e9..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_inpaint.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import inspect -from typing import Callable, List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput -from diffusers.utils import PIL_INTERPOLATION - -from .pipeline_stable_diffusion import StableDiffusionPipelineMixin - - -def prepare_mask_and_masked_image(image, mask, latents_shape, vae_scale_factor): - image = np.array( - image.convert("RGB").resize((latents_shape[1] * vae_scale_factor, latents_shape[0] * vae_scale_factor)) - ) - image = image[None].transpose(0, 3, 1, 2) - image = image.astype(np.float32) / 127.5 - 1.0 - - image_mask = np.array( - mask.convert("L").resize((latents_shape[1] * vae_scale_factor, latents_shape[0] * vae_scale_factor)) - ) - masked_image = image * (image_mask < 127.5) - - mask = mask.resize((latents_shape[1], latents_shape[0]), PIL_INTERPOLATION["nearest"]) - mask = np.array(mask.convert("L")) - mask = mask.astype(np.float32) / 255.0 - mask = mask[None, None] - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - - return mask, masked_image - - -class StableDiffusionInpaintPipelineMixin(StableDiffusionPipelineMixin): - # Copied from diffusers.pipelines.stable_diffusion.pipeline_onnx_stable_diffusion.OnnxStableDiffusionPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." 
- ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
- ) - - @torch.no_grad() - def __call__( - self, - prompt: Union[str, List[str]], - image: PIL.Image.Image, - mask_image: PIL.Image.Image, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 7.5, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`Union[str, List[str]]`): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`PIL.Image.Image`): - `Image`, or tensor representing an image batch which will be upscaled. - mask_image (`PIL.Image.Image`): - `Image`, or tensor representing a masked image batch which will be upscaled. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 7.5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. 
Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. 
- return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - height = height or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - width = width or self.unet.config.get("sample_size", 64) * self.vae_scale_factor - - # check inputs. Raise error if not correct - self.check_inputs( - prompt, height, width, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds - ) - - # define call parameters - if prompt is not None and isinstance(prompt, str): - batch_size = 1 - elif prompt is not None and isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # set timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . 
`guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - prompt_embeds = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - ) - - num_channels_latents = self.vae_decoder.config.get("latent_channels", 4) - num_channels_unet = self.unet.config.get("in_channels", 9) - latents_shape = ( - batch_size * num_images_per_prompt, - num_channels_latents, - height // self.vae_scale_factor, - width // self.vae_scale_factor, - ) - latents_dtype = prompt_embeds.dtype - - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*latents_shape).astype(latents_dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*latents_shape, generator=generator).numpy().astype(latents_dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
- ) - elif latents.shape != latents_shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}") - - # prepare mask and masked_image - mask, masked_image = prepare_mask_and_masked_image( - image, mask_image, latents_shape[-2:], self.vae_scale_factor - ) - mask = mask.astype(latents.dtype) - masked_image = masked_image.astype(latents.dtype) - - masked_image_latents = self.vae_encoder(sample=masked_image)[0] - - scaling_factor = self.vae_decoder.config.get("scaling_factor", 0.18215) - masked_image_latents = scaling_factor * masked_image_latents - - # duplicate mask and masked_image_latents for each generation per prompt - mask = mask.repeat(batch_size * num_images_per_prompt, 0) - masked_image_latents = masked_image_latents.repeat(batch_size * num_images_per_prompt, 0) - - mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask - masked_image_latents = ( - np.concatenate([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents - ) - - # check that sizes of mask, masked image and latents match - if num_channels_unet == 9: - # default case for runwayml/stable-diffusion-inpainting - num_channels_mask = mask.shape[1] - num_channels_masked_image = masked_image_latents.shape[1] - if num_channels_latents + num_channels_mask + num_channels_masked_image != num_channels_unet: - raise ValueError( - f"Incorrect configuration settings! The config of `pipeline.unet`: expects" - f" {num_channels_unet} but received `num_channels_latents`: {num_channels_latents} +" - f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" - f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" - " `pipeline.unet` or your `mask_image` or `image` input." - ) - elif num_channels_unet != 4: - raise ValueError( - f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {num_channels_unet}." 
- ) - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - # concat latents, mask, masked_image_latnets in the channel dimension - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - if num_channels_unet == 9: - latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1) - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet(sample=latent_model_input, timestep=timestep, encoder_hidden_states=prompt_embeds)[ - 0 - ] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents 
= scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - callback(i, t, latents) - - if output_type == "latent": - image = latents - has_nsfw_concept = None - else: - latents /= scaling_factor - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - image, has_nsfw_concept = self.run_safety_checker(image) - - if has_nsfw_concept is None: - do_denormalize = [True] * image.shape[0] - else: - do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] - - image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) - - if not return_dict: - return (image, has_nsfw_concept) - - return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py deleted file mode 100644 index 0407c16a77a..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl.py +++ /dev/null @@ -1,506 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import inspect -import logging -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import torch -from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput - -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg - - -logger = logging.getLogger(__name__) - - -class StableDiffusionXLPipelineMixin(DiffusionPipelineMixin): - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`Union[str, List[str]]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. 
- pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - - if prompt_embeds is None: - prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - # get prompt text embeddings - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.array_equal( - text_input_ids, untruncated_ids - ): - removed_text = tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = text_encoder( - input_ids=text_input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - pooled_prompt_embeds = prompt_embeds[0] - 
prompt_embeds = prompt_embeds[-2] - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] - if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = np.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = np.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - negative_prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - negative_prompt_embeds = text_encoder( - input_ids=uncond_input.input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - negative_pooled_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = negative_prompt_embeds[-2] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) - - pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, num_images_per_prompt, axis=0) - negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) - - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - height: Optional[int], - width: Optional[int], - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - ): - if height % 8 != 0 or width % 8 != 0: - raise ValueError(f"`height` and 
`width` have to be divisible by 8 but are {height} and {width}.") - - if (callback_steps is None) or ( - callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." - ) - - if prompt_embeds is not None and pooled_prompt_embeds is None: - raise ValueError( - "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`." 
- ) - - if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None: - raise ValueError( - "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`." - ) - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, generator, latents=None): - shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) - if isinstance(generator, list) and len(generator) != batch_size: - raise ValueError( - f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" - f" size of {batch_size}. Make sure the batch size matches the length of the generators." - ) - - if latents is None: - if isinstance(generator, np.random.RandomState): - latents = generator.randn(*shape).astype(dtype) - elif isinstance(generator, torch.Generator): - latents = torch.randn(*shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
- ) - elif latents.shape != shape: - raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}") - - # scale the initial noise by the standard deviation required by the scheduler - latents = latents * np.float64(self.scheduler.init_noise_sigma) - - return latents - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs - def prepare_extra_step_kwargs(self, generator, eta): - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - - extra_step_kwargs = {} - - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_eta: - extra_step_kwargs["eta"] = eta - - return extra_step_kwargs - - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - height: Optional[int] = None, - width: Optional[int] = None, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: 
Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = (0, 0), - target_size: Optional[Tuple[int, int]] = None, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - height (`Optional[int]`, defaults to None): - The height in pixels of the generated image. - width (`Optional[int]`, defaults to None): - The width in pixels of the generated image. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). - num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[Union[np.random.RandomState, torch.Generator]]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. 
- latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. - callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.7): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). 
- Guidance rescale factor should fix overexposure when using zero terminal SNR. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - - # 0. Default height and width to unet - height = height or self.unet.config["sample_size"] * self.vae_scale_factor - width = width or self.unet.config["sample_size"] * self.vae_scale_factor - - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 1. Check inputs. Raise error if not correct - self.check_inputs( - prompt, - height, - width, - callback_steps, - negative_prompt, - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) - - # 2. Define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 3. 
Encode input prompt - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - ) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - timesteps = self.scheduler.timesteps - - # 5. Prepare latent variables - latents = self.prepare_latents( - batch_size * num_images_per_prompt, - self.unet.config.get("in_channels", 4), - height, - width, - prompt_embeds.dtype, - generator, - latents, - ) - - # 6. Prepare extra step kwargs - extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) - - # 7. Prepare added time ids & embeddings - add_text_embeds = pooled_prompt_embeds - add_time_ids = (original_size + crops_coords_top_left + target_size,) - add_time_ids = np.array(add_time_ids, dtype=prompt_embeds.dtype) - - if do_classifier_free_guidance: - prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) - add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) - add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) - add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) - - # Adapted from diffusers to extend it for other runtimes than ORT - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - # 8. 
Denoising loop - - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latent_model_input, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - text_embeds=add_text_embeds, - time_ids=add_time_ids, - ) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": - image = latents - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) - - if not return_dict: - return (image,) - - return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py b/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py deleted file mode 100644 index 19988599b64..00000000000 --- a/optimum/pipelines/diffusers/pipeline_stable_diffusion_xl_img2img.py +++ /dev/null @@ -1,515 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import inspect -import logging -from typing import Any, Callable, Dict, List, Optional, Tuple, Union - -import numpy as np -import PIL.Image -import torch -from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput - -from .pipeline_utils import DiffusionPipelineMixin, rescale_noise_cfg - - -logger = logging.getLogger(__name__) - - -class StableDiffusionXLImg2ImgPipelineMixin(DiffusionPipelineMixin): - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.encode_prompt - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int, - do_classifier_free_guidance: bool, - negative_prompt: Optional[Union[str, list]], - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - ): - r""" - Encodes the prompt into text encoder hidden states. - - Args: - prompt (`Union[str, List[str]]`): - prompt to be encoded - num_images_per_prompt (`int`): - number of images that should be generated per prompt - do_classifier_free_guidance (`bool`): - whether to use classifier free guidance or not - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored - if `guidance_scale` is less than `1`). - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. 
Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. - If not provided, pooled text embeddings will be generated from `prompt` input argument. - negative_pooled_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt` - input argument. - """ - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - # Define tokenizers and text encoders - tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2] - text_encoders = ( - [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2] - ) - - if prompt_embeds is None: - prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - # get prompt text embeddings - text_inputs = tokenizer( - prompt, - padding="max_length", - max_length=tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - untruncated_ids = tokenizer(prompt, padding="longest", return_tensors="np").input_ids - - if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not np.array_equal( - text_input_ids, untruncated_ids - ): - removed_text = 
tokenizer.batch_decode(untruncated_ids[:, tokenizer.model_max_length - 1 : -1]) - logger.warning( - "The following part of your input was truncated because CLIP can only handle sequences up to" - f" {tokenizer.model_max_length} tokens: {removed_text}" - ) - - prompt_embeds = text_encoder( - input_ids=text_input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - pooled_prompt_embeds = prompt_embeds[0] - prompt_embeds = prompt_embeds[-2] - prompt_embeds = np.repeat(prompt_embeds, num_images_per_prompt, axis=0) - prompt_embeds_list.append(prompt_embeds) - - prompt_embeds = np.concatenate(prompt_embeds_list, axis=-1) - - # get unconditional embeddings for classifier free guidance - zero_out_negative_prompt = negative_prompt is None and self.config["force_zeros_for_empty_prompt"] - if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt: - negative_prompt_embeds = np.zeros_like(prompt_embeds) - negative_pooled_prompt_embeds = np.zeros_like(pooled_prompt_embeds) - elif do_classifier_free_guidance and negative_prompt_embeds is None: - negative_prompt = negative_prompt or "" - if prompt is not None and type(prompt) is not type(negative_prompt): - raise TypeError( - f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" - f" {type(prompt)}." - ) - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - elif batch_size != len(negative_prompt): - raise ValueError( - f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" - f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" - " the batch size of `prompt`." 
- ) - else: - uncond_tokens = negative_prompt - - negative_prompt_embeds_list = [] - for tokenizer, text_encoder in zip(tokenizers, text_encoders): - max_length = prompt_embeds.shape[1] - uncond_input = tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - negative_prompt_embeds = text_encoder( - input_ids=uncond_input.input_ids.astype(text_encoder.input_dtype.get("input_ids", np.int32)) - ) - negative_pooled_prompt_embeds = negative_prompt_embeds[0] - negative_prompt_embeds = negative_prompt_embeds[-2] - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - negative_prompt_embeds = np.repeat(negative_prompt_embeds, num_images_per_prompt, axis=0) - # For classifier free guidance, we need to do two forward passes. - # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - negative_prompt_embeds_list.append(negative_prompt_embeds) - negative_prompt_embeds = np.concatenate(negative_prompt_embeds_list, axis=-1) - - pooled_prompt_embeds = np.repeat(pooled_prompt_embeds, num_images_per_prompt, axis=0) - negative_pooled_prompt_embeds = np.repeat(negative_pooled_prompt_embeds, num_images_per_prompt, axis=0) - - return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds - - # Copied from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img.StableDiffusionXLImg2ImgPipeline.check_inputs - def check_inputs( - self, - prompt: Union[str, List[str]], - strength: float, - callback_steps: int, - negative_prompt: Optional[str] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - ): - if strength < 0 or strength > 1: - raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}") - - if (callback_steps is None) or ( - callback_steps is not None and (not 
isinstance(callback_steps, int) or callback_steps <= 0) - ): - raise ValueError( - f"`callback_steps` has to be a positive integer but is {callback_steps} of type" - f" {type(callback_steps)}." - ) - - if prompt is not None and prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" - " only forward one of the two." - ) - elif prompt is None and prompt_embeds is None: - raise ValueError( - "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." - ) - elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): - raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") - - if negative_prompt is not None and negative_prompt_embeds is not None: - raise ValueError( - f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" - f" {negative_prompt_embeds}. Please make sure to only forward one of the two." - ) - - if prompt_embeds is not None and negative_prompt_embeds is not None: - if prompt_embeds.shape != negative_prompt_embeds.shape: - raise ValueError( - "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" - f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" - f" {negative_prompt_embeds.shape}." 
- ) - - def get_timesteps(self, num_inference_steps, strength): - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :].numpy() - - return timesteps, num_inference_steps - t_start - - # Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents - def prepare_latents(self, image, timesteps, batch_size, num_images_per_prompt, dtype, generator=None): - batch_size = batch_size * num_images_per_prompt - - if image.shape[1] == 4: - init_latents = image - else: - init_latents = self.vae_encoder(sample=image)[0] * self.vae_decoder.config.get("scaling_factor", 0.18215) - - if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: - # expand init_latents for batch_size - additional_image_per_prompt = batch_size // init_latents.shape[0] - init_latents = np.concatenate([init_latents] * additional_image_per_prompt, axis=0) - elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: - raise ValueError( - f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." - ) - else: - init_latents = np.concatenate([init_latents], axis=0) - - # add noise to latents using the timesteps - if isinstance(generator, np.random.RandomState): - noise = generator.randn(*init_latents.shape).astype(dtype) - elif isinstance(generator, torch.Generator): - noise = torch.randn(*init_latents.shape, generator=generator).numpy().astype(dtype) - else: - raise ValueError( - f"Expected `generator` to be of type `np.random.RandomState` or `torch.Generator`, but got" - f" {type(generator)}." 
- ) - - init_latents = self.scheduler.add_noise( - torch.from_numpy(init_latents), torch.from_numpy(noise), torch.from_numpy(timesteps) - ) - init_latents = init_latents.numpy() - - return init_latents - - def _get_add_time_ids( - self, original_size, crops_coords_top_left, target_size, aesthetic_score, negative_aesthetic_score, dtype - ): - if self.config.get("requires_aesthetics_score"): - add_time_ids = (original_size + crops_coords_top_left + (aesthetic_score,),) - add_neg_time_ids = (original_size + crops_coords_top_left + (negative_aesthetic_score,),) - else: - add_time_ids = (original_size + crops_coords_top_left + target_size,) - add_neg_time_ids = (original_size + crops_coords_top_left + target_size,) - - add_time_ids = np.array(add_time_ids, dtype=dtype) - add_neg_time_ids = np.array(add_neg_time_ids, dtype=dtype) - - return add_time_ids, add_neg_time_ids - - # Adapted from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl.StableDiffusionXLPipeline.__call__ - def __call__( - self, - prompt: Optional[Union[str, List[str]]] = None, - image: Union[np.ndarray, PIL.Image.Image] = None, - strength: float = 0.3, - num_inference_steps: int = 50, - guidance_scale: float = 5.0, - negative_prompt: Optional[Union[str, List[str]]] = None, - num_images_per_prompt: int = 1, - eta: float = 0.0, - generator: Optional[Union[np.random.RandomState, torch.Generator]] = None, - latents: Optional[np.ndarray] = None, - prompt_embeds: Optional[np.ndarray] = None, - negative_prompt_embeds: Optional[np.ndarray] = None, - pooled_prompt_embeds: Optional[np.ndarray] = None, - negative_pooled_prompt_embeds: Optional[np.ndarray] = None, - output_type: str = "pil", - return_dict: bool = True, - callback: Optional[Callable[[int, int, np.ndarray], None]] = None, - callback_steps: int = 1, - cross_attention_kwargs: Optional[Dict[str, Any]] = None, - guidance_rescale: float = 0.0, - original_size: Optional[Tuple[int, int]] = None, - crops_coords_top_left: Tuple[int, int] = 
(0, 0), - target_size: Optional[Tuple[int, int]] = None, - aesthetic_score: float = 6.0, - negative_aesthetic_score: float = 2.5, - ): - r""" - Function invoked when calling the pipeline for generation. - - Args: - prompt (`Optional[Union[str, List[str]]]`, defaults to None): - The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`. - instead. - image (`Union[np.ndarray, PIL.Image.Image]`): - `Image`, or tensor representing an image batch which will be upscaled. - strength (`float`, defaults to 0.8): - Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image` - will be used as a starting point, adding more noise to it the larger the `strength`. The number of - denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will - be maximum and the denoising process will run for the full number of iterations specified in - `num_inference_steps`. A value of 1, therefore, essentially ignores `image`. - num_inference_steps (`int`, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (`float`, defaults to 5): - Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598). - `guidance_scale` is defined as `w` of equation 2. of [Imagen - Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale > - 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`, - usually at the expense of lower image quality. - negative_prompt (`Optional[Union[str, list]]`): - The prompt or prompts not to guide the image generation. If not defined, one has to pass - `negative_prompt_embeds`. instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` - is less than `1`). 
- num_images_per_prompt (`int`, defaults to 1): - The number of images to generate per prompt. - eta (`float`, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [`schedulers.DDIMScheduler`], will be ignored for others. - generator (`Optional[np.random.RandomState]`, defaults to `None`):: - A np.random.RandomState to make generation deterministic. - latents (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image - generation. Can be used to tweak the same generation with different prompts. If not provided, a latents - tensor will ge generated by sampling using the supplied random `generator`. - prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not - provided, text embeddings will be generated from `prompt` input argument. - negative_prompt_embeds (`Optional[np.ndarray]`, defaults to `None`): - Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt - weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input - argument. - output_type (`str`, defaults to `"pil"`): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. - return_dict (`bool`, defaults to `True`): - Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] instead of a - plain tuple. - callback (Optional[Callable], defaults to `None`): - A function that will be called every `callback_steps` steps during inference. The function will be - called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. 
- callback_steps (`int`, defaults to 1): - The frequency at which the `callback` function will be called. If not specified, the callback will be - called at every step. - guidance_rescale (`float`, defaults to 0.7): - Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are - Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of - [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). - Guidance rescale factor should fix overexposure when using zero terminal SNR. - - Returns: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] or `tuple`: - [`~pipelines.stable_diffusion.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a `tuple. - When returning a tuple, the first element is a list with the generated images, and the second element is a - list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work" - (nsfw) content, according to the `safety_checker`. - """ - # 0. Check inputs. Raise error if not correct - self.check_inputs(prompt, strength, callback_steps, negative_prompt, prompt_embeds, negative_prompt_embeds) - - # 1. Define call parameters - if isinstance(prompt, str): - batch_size = 1 - elif isinstance(prompt, list): - batch_size = len(prompt) - else: - batch_size = prompt_embeds.shape[0] - - if generator is None: - generator = np.random.RandomState() - - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - - # 2. 
Encode input prompt - ( - prompt_embeds, - negative_prompt_embeds, - pooled_prompt_embeds, - negative_pooled_prompt_embeds, - ) = self._encode_prompt( - prompt, - num_images_per_prompt, - do_classifier_free_guidance, - negative_prompt, - prompt_embeds=prompt_embeds, - negative_prompt_embeds=negative_prompt_embeds, - pooled_prompt_embeds=pooled_prompt_embeds, - negative_pooled_prompt_embeds=negative_pooled_prompt_embeds, - ) - - # 3. Preprocess image - image = self.image_processor.preprocess(image) - - # 4. Prepare timesteps - self.scheduler.set_timesteps(num_inference_steps) - - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength) - latent_timestep = np.repeat(timesteps[:1], batch_size * num_images_per_prompt, axis=0) - timestep_dtype = self.unet.input_dtype.get("timestep", np.float32) - - latents_dtype = prompt_embeds.dtype - image = image.astype(latents_dtype) - - # 5. Prepare latent variables - latents = self.prepare_latents( - image, latent_timestep, batch_size, num_images_per_prompt, latents_dtype, generator - ) - - # 6. Prepare extra step kwargs - extra_step_kwargs = {} - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - if accepts_eta: - extra_step_kwargs["eta"] = eta - - height, width = latents.shape[-2:] - height = height * self.vae_scale_factor - width = width * self.vae_scale_factor - original_size = original_size or (height, width) - target_size = target_size or (height, width) - - # 8. 
Prepare added time ids & embeddings - add_text_embeds = pooled_prompt_embeds - add_time_ids, add_neg_time_ids = self._get_add_time_ids( - original_size, - crops_coords_top_left, - target_size, - aesthetic_score, - negative_aesthetic_score, - dtype=prompt_embeds.dtype, - ) - - if do_classifier_free_guidance: - prompt_embeds = np.concatenate((negative_prompt_embeds, prompt_embeds), axis=0) - add_text_embeds = np.concatenate((negative_pooled_prompt_embeds, add_text_embeds), axis=0) - add_time_ids = np.concatenate((add_time_ids, add_time_ids), axis=0) - add_time_ids = np.repeat(add_time_ids, batch_size * num_images_per_prompt, axis=0) - - # 8. Denoising loop - num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order - for i, t in enumerate(self.progress_bar(timesteps)): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(torch.from_numpy(latent_model_input), t) - latent_model_input = latent_model_input.cpu().numpy() - - # predict the noise residual - timestep = np.array([t], dtype=timestep_dtype) - noise_pred = self.unet( - sample=latent_model_input, - timestep=timestep, - encoder_hidden_states=prompt_embeds, - text_embeds=add_text_embeds, - time_ids=add_time_ids, - ) - noise_pred = noise_pred[0] - - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = np.split(noise_pred, 2) - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - if guidance_rescale > 0.0: - # Based on 3.4. 
in https://arxiv.org/pdf/2305.08891.pdf - noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=guidance_rescale) - - # compute the previous noisy sample x_t -> x_t-1 - scheduler_output = self.scheduler.step( - torch.from_numpy(noise_pred), t, torch.from_numpy(latents), **extra_step_kwargs - ) - latents = scheduler_output.prev_sample.numpy() - - # call the callback, if provided - if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): - if callback is not None and i % callback_steps == 0: - step_idx = i // getattr(self.scheduler, "order", 1) - callback(step_idx, t, latents) - - if output_type == "latent": - image = latents - else: - latents /= self.vae_decoder.config.get("scaling_factor", 0.18215) - # it seems likes there is a strange result for using half-precision vae decoder if batchsize>1 - image = np.concatenate( - [self.vae_decoder(latent_sample=latents[i : i + 1])[0] for i in range(latents.shape[0])] - ) - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) - - if not return_dict: - return (image,) - - return StableDiffusionXLPipelineOutput(images=image) diff --git a/optimum/pipelines/diffusers/pipeline_utils.py b/optimum/pipelines/diffusers/pipeline_utils.py deleted file mode 100644 index e9d5986b61c..00000000000 --- a/optimum/pipelines/diffusers/pipeline_utils.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import warnings -from typing import List, Optional, Union - -import numpy as np -import PIL.Image -import torch -from diffusers import ConfigMixin -from diffusers.image_processor import VaeImageProcessor as DiffusersVaeImageProcessor -from diffusers.utils.pil_utils import PIL_INTERPOLATION -from PIL import Image -from tqdm.auto import tqdm - - -class DiffusionPipelineMixin(ConfigMixin): - # Copied from https://github.com/huggingface/diffusers/blob/v0.12.1/src/diffusers/pipelines/pipeline_utils.py#L812 - @staticmethod - def numpy_to_pil(images): - """ - Converts a numpy image or a batch of images to a PIL image. - """ - if images.ndim == 3: - images = images[None, ...] - images = (images * 255).round().astype("uint8") - if images.shape[-1] == 1: - # special case for grayscale (single channel) images - pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images] - else: - pil_images = [Image.fromarray(image) for image in images] - - return pil_images - - # Copied from https://github.com/huggingface/diffusers/blob/v0.12.1/src/diffusers/pipelines/pipeline_utils.py#L827 - def progress_bar(self, iterable=None, total=None): - if not hasattr(self, "_progress_bar_config"): - self._progress_bar_config = {} - elif not isinstance(self._progress_bar_config, dict): - raise ValueError( - f"`self._progress_bar_config` should be of type `dict`, but is {type(self._progress_bar_config)}." 
- ) - - if iterable is not None: - return tqdm(iterable, **self._progress_bar_config) - elif total is not None: - return tqdm(total=total, **self._progress_bar_config) - else: - raise ValueError("Either `total` or `iterable` has to be defined.") - - -# Adapted from https://github.com/huggingface/diffusers/blob/v0.18.1/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L58 -def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0): - """ - Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and - Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4 - """ - std_text = np.std(noise_pred_text, axis=tuple(range(1, noise_pred_text.ndim)), keepdims=True) - std_cfg = np.std(noise_cfg, axis=tuple(range(1, noise_cfg.ndim)), keepdims=True) - # rescale the results from guidance (fixes overexposure) - noise_pred_rescaled = noise_cfg * (std_text / std_cfg) - # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images - noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg - return noise_cfg - - -class VaeImageProcessor(DiffusersVaeImageProcessor): - # Adapted from diffusers.VaeImageProcessor.denormalize - @staticmethod - def denormalize(images: np.ndarray): - """ - Denormalize an image array to [0,1]. - """ - return np.clip(images / 2 + 0.5, 0, 1) - - # Adapted from diffusers.VaeImageProcessor.preprocess - def preprocess( - self, - image: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray], - height: Optional[int] = None, - width: Optional[int] = None, - ) -> np.ndarray: - """ - Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors. 
- """ - supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor) - - do_convert_grayscale = getattr(self.config, "do_convert_grayscale", False) - # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image - if do_convert_grayscale and isinstance(image, (torch.Tensor, np.ndarray)) and image.ndim == 3: - if isinstance(image, torch.Tensor): - # if image is a pytorch tensor could have 2 possible shapes: - # 1. batch x height x width: we should insert the channel dimension at position 1 - # 2. channnel x height x width: we should insert batch dimension at position 0, - # however, since both channel and batch dimension has same size 1, it is same to insert at position 1 - # for simplicity, we insert a dimension of size 1 at position 1 for both cases - image = image.unsqueeze(1) - else: - # if it is a numpy array, it could have 2 possible shapes: - # 1. batch x height x width: insert channel dimension on last position - # 2. height x width x channel: insert batch dimension on first position - if image.shape[-1] == 1: - image = np.expand_dims(image, axis=0) - else: - image = np.expand_dims(image, axis=-1) - - if isinstance(image, supported_formats): - image = [image] - elif not (isinstance(image, list) and all(isinstance(i, supported_formats) for i in image)): - raise ValueError( - f"Input is in incorrect format: {[type(i) for i in image]}. 
Currently, we only support {', '.join(supported_formats)}" - ) - - if isinstance(image[0], PIL.Image.Image): - if self.config.do_convert_rgb: - image = [self.convert_to_rgb(i) for i in image] - elif do_convert_grayscale: - image = [self.convert_to_grayscale(i) for i in image] - if self.config.do_resize: - height, width = self.get_height_width(image[0], height, width) - image = [self.resize(i, height, width) for i in image] - image = self.reshape(self.pil_to_numpy(image)) - else: - if isinstance(image[0], torch.Tensor): - image = [self.pt_to_numpy(elem) for elem in image] - image = np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0) - else: - image = self.reshape(np.concatenate(image, axis=0) if image[0].ndim == 4 else np.stack(image, axis=0)) - - if do_convert_grayscale and image.ndim == 3: - image = np.expand_dims(image, 1) - - # don't need any preprocess if the image is latents - if image.shape[1] == 4: - return image - - if self.config.do_resize: - height, width = self.get_height_width(image, height, width) - image = self.resize(image, height, width) - - # expected range [0,1], normalize to [-1,1] - do_normalize = self.config.do_normalize - if image.min() < 0 and do_normalize: - warnings.warn( - "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] " - f"when passing as pytorch tensor or numpy Array. 
You passed `image` with value range [{image.min()},{image.max()}]", - FutureWarning, - ) - do_normalize = False - - if do_normalize: - image = self.normalize(image) - - if getattr(self.config, "do_binarize", False): - image = self.binarize(image) - - return image - - # Adapted from diffusers.VaeImageProcessor.postprocess - def postprocess( - self, - image: np.ndarray, - output_type: str = "pil", - do_denormalize: Optional[List[bool]] = None, - ): - if not isinstance(image, np.ndarray): - raise ValueError( - f"Input for postprocessing is in incorrect format: {type(image)}. We only support np array" - ) - if output_type not in ["latent", "np", "pil"]: - deprecation_message = ( - f"the output_type {output_type} is outdated and has been set to `np`. Please make sure to set it to one of these instead: " - "`pil`, `np`, `pt`, `latent`" - ) - warnings.warn(deprecation_message, FutureWarning) - output_type = "np" - - if output_type == "latent": - return image - - if do_denormalize is None: - do_denormalize = [self.config.do_normalize] * image.shape[0] - - image = np.stack( - [self.denormalize(image[i]) if do_denormalize[i] else image[i] for i in range(image.shape[0])], axis=0 - ) - - image = image.transpose((0, 2, 3, 1)) - - if output_type == "pil": - image = self.numpy_to_pil(image) - - return image - - def get_height_width( - self, - image: Union[PIL.Image.Image, np.ndarray], - height: Optional[int] = None, - width: Optional[int] = None, - ): - """ - This function return the height and width that are downscaled to the next integer multiple of - `vae_scale_factor`. - - Args: - image(`PIL.Image.Image`, `np.ndarray`): - The image input, can be a PIL image, numpy array or pytorch tensor. if it is a numpy array, should have - shape `[batch, height, width]` or `[batch, height, width, channel]` if it is a pytorch tensor, should - have shape `[batch, channel, height, width]`. - height (`int`, *optional*, defaults to `None`): - The height in preprocessed image. 
If `None`, will use the height of `image` input. - width (`int`, *optional*`, defaults to `None`): - The width in preprocessed. If `None`, will use the width of the `image` input. - """ - height = height or (image.height if isinstance(image, PIL.Image.Image) else image.shape[-2]) - width = width or (image.width if isinstance(image, PIL.Image.Image) else image.shape[-1]) - # resize to integer multiple of vae_scale_factor - width, height = (x - x % self.config.vae_scale_factor for x in (width, height)) - return height, width - - # Adapted from diffusers.VaeImageProcessor.numpy_to_pt - @staticmethod - def numpy_to_pt(images: np.ndarray) -> torch.FloatTensor: - """ - Convert a NumPy image to a PyTorch tensor. - """ - if images.ndim == 3: - images = images[..., None] - - images = torch.from_numpy(images) - return images - - # Adapted from diffusers.VaeImageProcessor.pt_to_numpy - @staticmethod - def pt_to_numpy(images: torch.FloatTensor) -> np.ndarray: - """ - Convert a PyTorch tensor to a NumPy image. - """ - images = images.cpu().float().numpy() - return images - - @staticmethod - def reshape(images: np.ndarray) -> np.ndarray: - """ - Reshape inputs to expected shape. - """ - if images.ndim == 3: - images = images[..., None] - - return images.transpose(0, 3, 1, 2) - - # TODO : remove after diffusers v0.21.0 release - def resize( - self, - image: Union[PIL.Image.Image, np.ndarray, torch.Tensor], - height: Optional[int] = None, - width: Optional[int] = None, - ) -> Union[PIL.Image.Image, np.ndarray, torch.Tensor]: - """ - Resize image. 
- """ - if isinstance(image, PIL.Image.Image): - image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample]) - elif isinstance(image, torch.Tensor): - image = torch.nn.functional.interpolate(image, size=(height, width)) - elif isinstance(image, np.ndarray): - image = self.numpy_to_pt(image) - image = torch.nn.functional.interpolate(image, size=(height, width)) - image = self.pt_to_numpy(image) - return image diff --git a/optimum/pipelines/diffusers/watermark.py b/optimum/pipelines/diffusers/watermark.py deleted file mode 100644 index b3cd622edac..00000000000 --- a/optimum/pipelines/diffusers/watermark.py +++ /dev/null @@ -1,31 +0,0 @@ -import numpy as np -from imwatermark import WatermarkEncoder - - -WATERMARK_MESSAGE = 0b101100111110110010010000011110111011000110011110 -WATERMARK_BITS = [int(bit) for bit in bin(WATERMARK_MESSAGE)[2:]] - - -# Adapted from https://github.com/huggingface/diffusers/blob/v0.18.1/src/diffusers/pipelines/stable_diffusion_xl/watermark.py#L12 -class StableDiffusionXLWatermarker: - def __init__(self): - self.watermark = WATERMARK_BITS - self.encoder = WatermarkEncoder() - self.encoder.set_watermark("bits", self.watermark) - - def apply_watermark(self, images: np.array): - # can't encode images that are smaller than 256 - if images.shape[-1] < 256: - return images - - # cv2 doesn't support float16 - if images.dtype == np.float16: - images = images.astype(np.float32) - - images = (255 * (images / 2 + 0.5)).transpose((0, 2, 3, 1)) - - images = np.array([self.encoder.encode(image, "dwtDct") for image in images]).transpose((0, 3, 1, 2)) - - np.clip(2 * (images / 255 - 0.5), -1.0, 1.0, out=images) - - return images diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index d1471aa218a..7671d6cd2e6 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -43,7 +43,7 @@ from optimum.exporters.onnx.constants import 
SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED from optimum.exporters.onnx.model_configs import WhisperOnnxConfig from optimum.exporters.onnx.utils import get_speecht5_models_for_export -from optimum.utils import ONNX_WEIGHTS_NAME, DummyPastKeyValuesGenerator, NormalizedTextConfig +from optimum.utils import DummyPastKeyValuesGenerator, NormalizedTextConfig from optimum.utils.save_utils import maybe_load_preprocessors from optimum.utils.testing_utils import grid_parameters, require_diffusers @@ -292,27 +292,22 @@ def _onnx_export( gc.collect() - def _onnx_export_sd(self, model_type: str, model_name: str, device="cpu"): + def _onnx_export_diffusion_models(self, model_type: str, model_name: str, device="cpu"): pipeline = TasksManager.get_model_from_task(model_type, model_name, device=device) models_and_onnx_configs = get_diffusion_models_for_export(pipeline) - output_names = [os.path.join(name_dir, ONNX_WEIGHTS_NAME) for name_dir in models_and_onnx_configs] - model, _ = models_and_onnx_configs["vae_encoder"] - model.forward = lambda sample: {"latent_sample": model.encode(x=sample)["latent_dist"].parameters} with TemporaryDirectory() as tmpdirname: _, onnx_outputs = export_models( models_and_onnx_configs=models_and_onnx_configs, opset=14, output_dir=Path(tmpdirname), - output_names=output_names, device=device, ) validate_models_outputs( models_and_onnx_configs=models_and_onnx_configs, onnx_named_outputs=onnx_outputs, output_dir=Path(tmpdirname), - atol=1e-3, - onnx_files_subpaths=output_names, + atol=1e-4, use_subprocess=False, ) @@ -403,7 +398,7 @@ def test_tensorflow_export( @require_vision @require_diffusers def test_pytorch_export_for_diffusion_models(self, model_type, model_name): - self._onnx_export_sd(model_type, model_name) + self._onnx_export_diffusion_models(model_type, model_name) @parameterized.expand(PYTORCH_DIFFUSION_MODEL.items()) @require_torch @@ -414,7 +409,7 @@ def test_pytorch_export_for_diffusion_models(self, model_type, model_name): @pytest.mark.run_slow 
@pytest.mark.gpu_test def test_pytorch_export_for_diffusion_models_cuda(self, model_type, model_name): - self._onnx_export_sd(model_type, model_name, device="cuda") + self._onnx_export_diffusion_models(model_type, model_name, device="cuda") class CustomWhisperOnnxConfig(WhisperOnnxConfig): diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 9f480b2d1a0..956566f0e1f 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -12,10 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import unittest import numpy as np -import PIL import pytest import torch from diffusers import ( @@ -24,6 +22,7 @@ AutoPipelineForText2Image, DiffusionPipeline, ) +from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker from diffusers.utils import load_image from parameterized import parameterized from transformers.testing_utils import require_torch_gpu @@ -35,8 +34,7 @@ ORTPipelineForInpainting, ORTPipelineForText2Image, ) -from optimum.pipelines.diffusers.pipeline_utils import VaeImageProcessor -from optimum.utils.testing_utils import grid_parameters, require_diffusers, require_ort_rocm +from optimum.utils.testing_utils import grid_parameters, require_diffusers def get_generator(framework, seed): @@ -72,16 +70,8 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= return [image] * batch_size -def to_np(image): - if isinstance(image[0], PIL.Image.Image): - return np.stack([np.array(i) for i in image], axis=0) - elif isinstance(image, torch.Tensor): - return image.cpu().numpy().transpose(0, 2, 3, 1) - return image - - class ORTPipelineForText2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["latent-consistency", "stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", 
"stable-diffusion-xl", "latent-consistency"] ORTMODEL_CLASS = ORTPipelineForText2Image AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -126,17 +116,16 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): def test_num_images_per_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - height, width, batch_size = 64, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -150,61 +139,13 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - if model_arch == "latent-consistency": - # Latent Consistency Model (LCM) doesn't support deterministic outputs beyond the first inference step - # TODO: Investigate why this is the case - inputs["num_inference_steps"] = 1 - - for output_type in ["latent", "np"]: + for output_type in ["latent", 
"np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - - self.assertTrue( - np.allclose(ort_output, diffusers_output, atol=1e-4), - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), - ) - self.assertEqual(ort_pipeline.device, diffusers_pipeline.device) + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - height, 
width, batch_size = 64, 32, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -220,7 +161,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -243,17 +184,21 @@ def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: def test_shape(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) - height, width, batch_size = 128, 64, 1 + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + + height, width, batch_size = 128, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -263,9 +208,6 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): - if model_arch in 
["latent-consistency"]: - pytest.skip("Latent Consistency Model (LCM) doesn't support deterministic outputs") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -279,14 +221,11 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): - if model_arch in ["latent-consistency"]: - pytest.skip("Latent Consistency Model (LCM) does not support negative prompts") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -295,9 +234,8 @@ def test_negative_prompt(self, model_arch: str): negative_prompt = ["This is a negative prompt"] pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - image_slice_1 = pipeline( - **inputs, negative_prompt=negative_prompt, generator=np.random.RandomState(SEED) - ).images[0, -3:, -3:, -1] + + images_1 = pipeline(**inputs, negative_prompt=negative_prompt, generator=get_generator("pt", SEED)).images prompt = inputs.pop("prompt") if model_arch == "stable-diffusion-xl": @@ -306,39 +244,96 @@ def test_negative_prompt(self, model_arch: str): inputs["negative_prompt_embeds"], inputs["pooled_prompt_embeds"], inputs["negative_pooled_prompt_embeds"], - ) = pipeline._encode_prompt(prompt, 1, False, negative_prompt) + ) = pipeline.encode_prompt( + prompt=prompt, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) else: - text_ids = 
pipeline.tokenizer( - prompt, - max_length=pipeline.tokenizer.model_max_length, - padding="max_length", - return_tensors="np", - truncation=True, - ).input_ids - negative_text_ids = pipeline.tokenizer( - negative_prompt, - max_length=pipeline.tokenizer.model_max_length, - padding="max_length", - return_tensors="np", - truncation=True, - ).input_ids - inputs["prompt_embeds"] = pipeline.text_encoder(text_ids)[0] - inputs["negative_prompt_embeds"] = pipeline.text_encoder(negative_text_ids)[0] - - image_slice_2 = pipeline(**inputs, generator=np.random.RandomState(SEED)).images[0, -3:, -3:, -1] - - self.assertTrue(np.allclose(image_slice_1, image_slice_2, rtol=1e-1)) + inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( + prompt=prompt, + num_images_per_prompt=1, + device=torch.device("cpu"), + do_classifier_free_guidance=True, + negative_prompt=negative_prompt, + ) + + images_2 = pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(images_1, images_2, atol=1e-4, rtol=1e-2) + + @parameterized.expand( + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) + ) + @pytest.mark.rocm_ep_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test + @require_torch_gpu + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + + outputs = pipeline(**inputs).images + + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(["stable-diffusion", 
"latent-consistency"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) + + ort_images = ort_output.images + diffusers_images = diffusers_output.images + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) class ORTPipelineForImage2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] AUTOMODEL_CLASS = AutoPipelineForImage2Image ORTMODEL_CLASS = ORTPipelineForImage2Image TASK = "image-to-image" - def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="np"): + def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): 
inputs = _generate_prompts(batch_size=batch_size) inputs["image"] = _generate_images( @@ -369,11 +364,6 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - - # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): @@ -381,68 +371,18 @@ def test_num_images_per_prompt(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - 
self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): - if model_arch in ["stable-diffusion"]: - pytest.skip( - "Stable Diffusion For Img2Img doesn't behave as expected with callbacks (doesn't call it every step with callback_steps=1)" - ) - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -455,7 +395,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> 
None: + def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -478,18 +418,21 @@ def test_shape(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - height, width, batch_size = 32, 64, 1 - for input_type in ["np", "pil", "pt"]: + height, width, batch_size = 128, 64, 1 + + for input_type in ["pil", "np", "pt"]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -499,27 +442,26 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): - pytest.skip("Img2Img models do not support support output reproducibility for some reason") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) height, width, batch_size = 128, 128, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - ort_output = ort_pipeline(**inputs, generator=torch.Generator().manual_seed(SEED)).images - diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - diffusers_output = diffusers_pipeline(**inputs, 
generator=torch.Generator().manual_seed(SEED)).images + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type - self.assertTrue(np.allclose(ort_output, diffusers_output, rtol=1e-2)) + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_image_reproducibility(self, model_arch: str): - pytest.skip("Img2Img models do not support support output reproducibility for some reason") - model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -533,12 +475,73 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) + + @parameterized.expand( + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + ) + ) + @pytest.mark.rocm_ep_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test + @require_torch_gpu + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = 
self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device.type, "cuda") + + outputs = pipeline(**inputs).images + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(["stable-diffusion", "latent-consistency"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) + + ort_images = ort_output.images + diffusers_images = diffusers_output.images + + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) class ORTPipelineForInpaintingTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion"] + SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] 
AUTOMODEL_CLASS = AutoPipelineForInpainting ORTMODEL_CLASS = ORTPipelineForInpainting @@ -546,18 +549,16 @@ class ORTPipelineForInpaintingTest(ORTModelTestMixin): TASK = "inpainting" def generate_inputs(self, height=128, width=128, batch_size=1, channel=3, input_type="pil"): - assert batch_size == 1, "Inpainting models only support batch_size=1" - assert input_type == "pil", "Inpainting models only support input_type='pil'" - inputs = _generate_prompts(batch_size=batch_size) inputs["image"] = _generate_images( - height=height, width=width, batch_size=1, channel=channel, input_type="pil" - )[0] + height=height, width=width, batch_size=batch_size, channel=channel, input_type=input_type + ) inputs["mask_image"] = _generate_images( - height=height, width=width, batch_size=1, channel=channel, input_type="pil" - )[0] + height=height, width=width, batch_size=batch_size, channel=1, input_type=input_type + ) + inputs["strength"] = 0.75 inputs["height"] = height inputs["width"] = width @@ -583,11 +584,6 @@ def test_ort_pipeline_class_dispatch(self, model_arch: str): self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - # auto_pipeline = DiffusionPipeline.from_pretrained(MODEL_NAMES[model_arch]) - # ort_pipeline = ORTDiffusionPipeline.from_pretrained(self.onnx_model_dirs[model_arch]) - - # self.assertEqual(ort_pipeline.auto_model_class, auto_pipeline.__class__) - @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_num_images_per_prompt(self, model_arch: str): @@ -595,59 +591,14 @@ def test_num_images_per_prompt(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertEqual(pipeline.vae_scale_factor, 2) - self.assertEqual(pipeline.vae_decoder.config["latent_channels"], 4) - self.assertEqual(pipeline.unet.config["in_channels"], 4) - batch_size, height = 1, 32 - for width in [64, 32]: - inputs = self.generate_inputs(height=height, width=width, 
batch_size=batch_size) - for num_images in [1, 3]: - outputs = pipeline(**inputs, num_images_per_prompt=num_images).images - self.assertEqual(outputs.shape, (batch_size * num_images, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["CUDAExecutionProvider"]}) - ) - @require_torch_gpu - @pytest.mark.cuda_ep_test - @require_diffusers - def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) - - @parameterized.expand( - grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "provider": ["ROCMExecutionProvider"]}) - ) - @require_torch_gpu - @require_ort_rocm - @pytest.mark.rocm_ep_test - @require_diffusers - def test_pipeline_on_rocm_ep(self, test_name: str, model_arch: str, provider: str): - model_args = {"test_name": test_name, "model_arch": model_arch} - self._setup(model_args) - - height, width, batch_size = 32, 64, 1 - inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - outputs = pipeline(**inputs).images - # Verify model devices - self.assertEqual(pipeline.device.type.lower(), "cuda") - # Verify model outptus - self.assertIsInstance(outputs, np.ndarray) - self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + for batch_size in [1, 3]: + for height in [64, 128]: + 
for width in [64, 128]: + for num_images_per_prompt in [1, 3]: + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images + self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -664,7 +615,7 @@ def __init__(self): self.has_been_called = False self.number_of_steps = 0 - def __call__(self, step: int, timestep: int, latents: np.ndarray) -> None: + def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 @@ -687,18 +638,21 @@ def test_shape(self, model_arch: str): self._setup(model_args) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) - height, width, batch_size = 32, 64, 1 - for input_type in ["pil"]: + height, width, batch_size = 128, 64, 1 + + for input_type in ["pil", "np", "pt"]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size, input_type=input_type) - for output_type in ["np", "pil", "latent"]: + for output_type in ["pil", "np", "pt", "latent"]: inputs["output_type"] = output_type outputs = pipeline(**inputs).images if output_type == "pil": self.assertEqual((len(outputs), outputs[0].height, outputs[0].width), (batch_size, height, width)) elif output_type == "np": self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + elif output_type == "pt": + self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: self.assertEqual( outputs.shape, @@ -708,11 +662,6 @@ def test_shape(self, model_arch: str): @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers def test_compare_to_diffusers_pipeline(self, model_arch: str): - if model_arch in ["stable-diffusion"]: - pytest.skip( - "Stable Diffusion For Inpainting fails, it was used to be compared to StableDiffusionPipeline for some reason which is the text-to-image variant" - ) - 
model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) @@ -722,23 +671,13 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) - latents_shape = ( - batch_size, - ort_pipeline.vae_decoder.config["latent_channels"], - height // ort_pipeline.vae_scale_factor, - width // ort_pipeline.vae_scale_factor, - ) + for output_type in ["latent", "np", "pt"]: + inputs["output_type"] = output_type - np_latents = np.random.rand(*latents_shape).astype(np.float32) - torch_latents = torch.from_numpy(np_latents) + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - ort_output = ort_pipeline(**inputs, latents=np_latents).images - diffusers_output = diffusers_pipeline(**inputs, latents=torch_latents).images - - self.assertTrue( - np.allclose(ort_output, diffusers_output, atol=1e-4), - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4), - ) + np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -756,38 +695,65 @@ def test_image_reproducibility(self, model_arch: str): ort_outputs_2 = pipeline(**inputs, generator=get_generator(generator_framework, SEED)) ort_outputs_3 = pipeline(**inputs, generator=get_generator(generator_framework, SEED + 1)) - self.assertTrue(np.array_equal(ort_outputs_1.images[0], ort_outputs_2.images[0])) self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) + np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) + + @parameterized.expand( + grid_parameters( + { + "model_arch": SUPPORTED_ARCHITECTURES, + "provider": ["CUDAExecutionProvider", "ROCMExecutionProvider", "TensorrtExecutionProvider"], + } + 
) + ) + @pytest.mark.rocm_ep_test + @pytest.mark.cuda_ep_test + @pytest.mark.trt_ep_test + @require_torch_gpu + @require_diffusers + def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): + model_args = {"test_name": test_name, "model_arch": model_arch} + self._setup(model_args) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device, "cuda") + + outputs = pipeline(**inputs).images + self.assertIsInstance(outputs, np.ndarray) + self.assertEqual(outputs.shape, (batch_size, height, width, 3)) + + @parameterized.expand(["stable-diffusion"]) + @require_diffusers + def test_safety_checker(self, model_arch: str): + model_args = {"test_name": model_arch, "model_arch": model_arch} + self._setup(model_args) + + safety_checker = StableDiffusionSafetyChecker.from_pretrained("CompVis/stable-diffusion-safety-checker") + + pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch], safety_checker=safety_checker) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained( + self.onnx_model_dirs[model_arch], safety_checker=safety_checker + ) + + self.assertIsInstance(pipeline.safety_checker, StableDiffusionSafetyChecker) + self.assertIsInstance(ort_pipeline.safety_checker, StableDiffusionSafetyChecker) + + height, width, batch_size = 32, 64, 1 + inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + + ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)) + diffusers_output = pipeline(**inputs, generator=get_generator("pt", SEED)) + + ort_nsfw_content_detected = ort_output.nsfw_content_detected + diffusers_nsfw_content_detected = diffusers_output.nsfw_content_detected + self.assertTrue(ort_nsfw_content_detected is not None) + self.assertTrue(diffusers_nsfw_content_detected is not None) + 
self.assertEqual(ort_nsfw_content_detected, diffusers_nsfw_content_detected) -class ImageProcessorTest(unittest.TestCase): - def test_vae_image_processor_pt(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pt = torch.stack(_generate_images(height=8, width=8, batch_size=1, input_type="pt")) - input_np = to_np(input_pt) - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pt), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_np(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_np = np.stack(_generate_images(height=8, width=8, input_type="np")) - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_np), output_type=output_type) - out_np = to_np(out) - in_np = (input_np * 255).round() if output_type == "pil" else input_np - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) - - def test_vae_image_processor_pil(self): - image_processor = VaeImageProcessor(do_resize=False, do_normalize=True) - input_pil = _generate_images(height=8, width=8, batch_size=1, input_type="pil") - - for output_type in ["np", "pil"]: - out = image_processor.postprocess(image_processor.preprocess(input_pil), output_type=output_type) - for i, o in zip(input_pil, out): - in_np = np.array(i) - out_np = to_np(out) if output_type == "pil" else (to_np(out) * 255).round() - self.assertTrue(np.allclose(in_np, out_np, atol=1e-6)) + ort_images = ort_output.images + diffusers_images = diffusers_output.images + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index f6771ce7618..665f253c480 100644 --- a/tests/onnxruntime/test_modeling.py +++ 
b/tests/onnxruntime/test_modeling.py @@ -148,7 +148,7 @@ def __init__(self, *args, **kwargs): self.ONNX_SEQ2SEQ_MODEL_ID = "optimum/t5-small" self.LARGE_ONNX_SEQ2SEQ_MODEL_ID = "facebook/mbart-large-en-ro" self.TINY_ONNX_SEQ2SEQ_MODEL_ID = "fxmarty/sshleifer-tiny-mbart-onnx" - self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID = "hf-internal-testing/tiny-random-OnnxStableDiffusionPipeline" + self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID = "optimum-internal-testing/tiny-stable-diffusion-onnx" def test_load_model_from_local_path(self): model = ORTModel.from_pretrained(self.LOCAL_MODEL_PATH) @@ -222,17 +222,17 @@ def test_load_seq2seq_model_from_empty_cache(self): @require_diffusers def test_load_stable_diffusion_model_from_cache(self): _ = ORTStableDiffusionPipeline.from_pretrained(self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID) # caching - model = ORTStableDiffusionPipeline.from_pretrained( self.TINY_ONNX_STABLE_DIFFUSION_MODEL_ID, local_files_only=True ) - self.assertIsInstance(model.text_encoder, ORTModelTextEncoder) self.assertIsInstance(model.vae_decoder, ORTModelVaeDecoder) self.assertIsInstance(model.vae_encoder, ORTModelVaeEncoder) self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_from_empty_cache(self): dirpath = os.path.join( @@ -325,6 +325,8 @@ def test_load_stable_diffusion_model_from_hub(self): self.assertIsInstance(model.unet, ORTModelUnet) self.assertIsInstance(model.config, Dict) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers @require_torch_gpu @pytest.mark.cuda_ep_test @@ -339,6 +341,8 @@ def test_load_stable_diffusion_model_cuda_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + 
@require_diffusers @require_torch_gpu @require_ort_rocm @@ -354,6 +358,8 @@ def test_load_stable_diffusion_model_rocm_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cuda:0")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_cpu_provider(self): model = ORTStableDiffusionPipeline.from_pretrained( @@ -366,6 +372,8 @@ def test_load_stable_diffusion_model_cpu_provider(self): self.assertListEqual(model.vae_encoder.session.get_providers(), model.providers) self.assertEqual(model.device, torch.device("cpu")) + model(prompt="This is a sanity test prompt", num_inference_steps=2) + @require_diffusers def test_load_stable_diffusion_model_unknown_provider(self): with self.assertRaises(ValueError): diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 17f3b391b04..5071d0081af 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -171,6 +171,11 @@ class ORTModelTestMixin(unittest.TestCase): "np": np.ndarray, } + TASK = None + + ORTMODEL_CLASS = None + AUTOMODEL_CLASS = None + @classmethod def setUpClass(cls): cls.onnx_model_dirs = {} From 2c0476eda1398b9a81cb966c817a460ed6e53413 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 9 Oct 2024 18:49:38 +0200 Subject: [PATCH 24/73] Enable ONNX export for transformers 4.45 (#2045) * Enable ONNX export for transformers 4.45 * add comment * update setup --- optimum/exporters/onnx/convert.py | 11 +++++------ setup.py | 7 ++----- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index f2bf95f3e3c..d72fd7eb21a 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -26,7 +26,6 @@ import numpy 
as np import onnx -import transformers from transformers.modeling_utils import get_parameter_dtype from transformers.utils import is_tf_available, is_torch_available @@ -531,6 +530,11 @@ def export_pytorch( logger.info(f"Using framework PyTorch: {torch.__version__}") FORCE_ONNX_EXTERNAL_DATA = os.getenv("FORCE_ONNX_EXTERNAL_DATA", "0") == "1" + model_kwargs = model_kwargs or {} + # num_logits_to_keep was added in transformers 4.45 and isn't added as inputs when exporting the model + if check_if_transformers_greater("4.44.99") and "num_logits_to_keep" in signature(model.forward).parameters.keys(): + model_kwargs["num_logits_to_keep"] = 0 + with torch.no_grad(): model.config.return_dict = True model = model.eval() @@ -1001,11 +1005,6 @@ def onnx_export_from_model( >>> onnx_export_from_model(model, output="gpt2_onnx/") ``` """ - if check_if_transformers_greater("4.44.99"): - raise ImportError( - f"ONNX conversion disabled for now for transformers version greater than v4.45, found {transformers.__version__}" - ) - TasksManager.standardize_model_attributes(model) if hasattr(model.config, "export_model_type"): diff --git a/setup.py b/setup.py index 0e2f0fd1bb6..63f202faa6e 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,6 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "transformers<4.45.0", ], "onnxruntime-gpu": [ "onnx", @@ -63,10 +62,9 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. 
- "transformers<4.45.0", ], - "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.45.0"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.45.0"], + "exporters": ["onnx", "onnxruntime", "timm"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", @@ -77,7 +75,6 @@ "numpy<1.24.0", "datasets<=2.16", "transformers[sentencepiece]>=4.26,<4.38", - "transformers<4.45.0", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", From 1b5a63da593599b1e6e178754146e0109d3305d9 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:52:35 +0200 Subject: [PATCH 25/73] Remove the need for the config to be in the subfolder (#2044) * remove the need for the config to be in the subfolder * fix * fix for offline mode * add log * fix * enable load local model in subfolder * fix windows --- optimum/modeling_base.py | 36 ++++++++++++++++++----------- optimum/onnxruntime/modeling_ort.py | 6 ++--- tests/onnxruntime/test_modeling.py | 15 ++++++++++++ 3 files changed, 39 insertions(+), 18 deletions(-) diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 29521b7c0c6..48c738514ae 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -380,27 +380,35 @@ def from_pretrained( ) model_id, revision = model_id.split("@") + all_files, _ = TasksManager.get_model_files( + model_id, + subfolder=subfolder, + cache_dir=cache_dir, + revision=revision, + token=token, + ) + + config_folder = subfolder + if cls.config_name not in all_files: + logger.info( + f"{cls.config_name} not found in the specified subfolder {subfolder}. Using the top level {cls.config_name}." 
+ ) + config_folder = "" + library_name = TasksManager.infer_library_from_model( - model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + model_id, subfolder=config_folder, revision=revision, cache_dir=cache_dir, token=token ) if library_name == "timm": config = PretrainedConfig.from_pretrained( - model_id, subfolder=subfolder, revision=revision, cache_dir=cache_dir, token=token + model_id, subfolder=config_folder, revision=revision, cache_dir=cache_dir, token=token ) if config is None: - if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME: - if CONFIG_NAME in os.listdir(os.path.join(model_id, subfolder)): - config = AutoConfig.from_pretrained( - os.path.join(model_id, subfolder), trust_remote_code=trust_remote_code - ) - elif CONFIG_NAME in os.listdir(model_id): + if os.path.isdir(os.path.join(model_id, config_folder)) and cls.config_name == CONFIG_NAME: + if CONFIG_NAME in os.listdir(os.path.join(model_id, config_folder)): config = AutoConfig.from_pretrained( - os.path.join(model_id, CONFIG_NAME), trust_remote_code=trust_remote_code - ) - logger.info( - f"config.json not found in the specified subfolder {subfolder}. Using the top level config.json." 
+ os.path.join(model_id, config_folder), trust_remote_code=trust_remote_code ) else: raise OSError(f"config.json not found in {model_id} local folder") @@ -411,7 +419,7 @@ def from_pretrained( cache_dir=cache_dir, token=token, force_download=force_download, - subfolder=subfolder, + subfolder=config_folder, trust_remote_code=trust_remote_code, ) elif isinstance(config, (str, os.PathLike)): @@ -421,7 +429,7 @@ def from_pretrained( cache_dir=cache_dir, token=token, force_download=force_download, - subfolder=subfolder, + subfolder=config_folder, trust_remote_code=trust_remote_code, ) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 9b29afa566b..ce1d68536ac 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -510,13 +510,12 @@ def _from_pretrained( if file_name is None: if model_path.is_dir(): - onnx_files = list(model_path.glob("*.onnx")) + onnx_files = list((model_path / subfolder).glob("*.onnx")) else: repo_files, _ = TasksManager.get_model_files( model_id, revision=revision, cache_dir=cache_dir, token=token ) repo_files = map(Path, repo_files) - pattern = "*.onnx" if subfolder == "" else f"{subfolder}/*.onnx" onnx_files = [p for p in repo_files if p.match(pattern)] @@ -983,10 +982,9 @@ def _cached_file( token = use_auth_token model_path = Path(model_path) - # locates a file in a local folder and repo, downloads and cache it if necessary. 
if model_path.is_dir(): - model_cache_path = model_path / file_name + model_cache_path = model_path / subfolder / file_name preprocessors = maybe_load_preprocessors(model_path.as_posix()) else: model_cache_path = hf_hub_download( diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 665f253c480..501c7dac240 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -28,6 +28,7 @@ import requests import timm import torch +from huggingface_hub import HfApi from huggingface_hub.constants import default_cache_path from parameterized import parameterized from PIL import Image @@ -1263,6 +1264,20 @@ def test_trust_remote_code(self): torch.allclose(pt_logits, ort_logits, atol=1e-4), f" Maxdiff: {torch.abs(pt_logits - ort_logits).max()}" ) + @parameterized.expand(("", "onnx")) + def test_loading_with_config_not_from_subfolder(self, subfolder): + # config.json file in the root directory and not in the subfolder + model_id = "sentence-transformers-testing/stsb-bert-tiny-onnx" + # hub model + ORTModelForFeatureExtraction.from_pretrained(model_id, subfolder=subfolder, export=subfolder == "") + # local model + api = HfApi() + with tempfile.TemporaryDirectory() as tmpdirname: + local_dir = Path(tmpdirname) / "model" + api.snapshot_download(repo_id=model_id, local_dir=local_dir) + ORTModelForFeatureExtraction.from_pretrained(local_dir, subfolder=subfolder, export=subfolder == "") + remove_directory(tmpdirname) + class ORTModelForQuestionAnsweringIntegrationTest(ORTModelTestMixin): SUPPORTED_ARCHITECTURES = [ From 851f04b13aab9f17f1c3b5080767a2cc440bb2b1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 11:54:01 +0200 Subject: [PATCH 26/73] Remove upper transformers version limit (#2048) --- setup.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/setup.py b/setup.py index 63f202faa6e..fb290274a3b 100644 --- a/setup.py +++ 
b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.29,<4.46.0", + "transformers[sentencepiece]>=4.29", "torch>=1.11", "packaging", "numpy", @@ -54,6 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", + "transformers<4.46.0", ], "onnxruntime-gpu": [ "onnx", @@ -62,9 +63,10 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. + "transformers<4.46.0", ], - "exporters": ["onnx", "onnxruntime", "timm"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], + "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.46.0"], + "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.46.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", From 4ce73646eb13dff14503092eeb92f94d6a1ee7b1 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:57:31 +0200 Subject: [PATCH 27/73] Dev version (#2049) --- optimum/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/version.py b/optimum/version.py index 4a8a7edab63..4fff28e5c97 100644 --- a/optimum/version.py +++ b/optimum/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.23.0.dev0" +__version__ = "1.24.0.dev0" From 6172e96914d6f49aec253db05c98d827c158caab Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 11 Oct 2024 10:24:00 +0200 Subject: [PATCH 28/73] Fix doc build (#2050) * Fix doc build * Trigger PR doc build when the PR doc build workflow is modified * Fix issue with torch-xla and ubuntu-latest --- .github/workflows/build_main_documentation.yml | 8 ++++++-- .github/workflows/build_pr_documentation.yml | 6 +++++- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index 20face917ab..11e36ed57f3 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -10,7 +10,7 @@ on: jobs: build_documentation: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 steps: - uses: actions/checkout@v2 @@ -66,7 +66,7 @@ jobs: sudo apt-get purge -y '^mysql.*' sudo apt-get purge -y '^java.*' sudo apt-get purge -y '^openjdk.*' - sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel + sudo apt-get purge -y microsoft-edge-stable azure-cli google-chrome-stable firefox mono-devel df -h sudo apt-get autoremove -y >/dev/null 2>&1 sudo apt-get clean @@ -110,6 +110,8 @@ jobs: - name: Setup environment run: | + python -m venv venv-doc + source venv-doc/bin/activate pip uninstall -y doc-builder cd doc-builder git pull origin main @@ -135,6 +137,7 @@ jobs: - name: Make Furiosa documentation run: | + source venv-doc/bin/activate cd optimum-furiosa pip install . sudo apt install software-properties-common @@ -159,6 +162,7 @@ jobs: - name: Make TPU documentation run: | sudo docker system prune -a -f + source venv-doc/bin/activate cd optimum-tpu pip install -U pip pip install . 
-f https://storage.googleapis.com/libtpu-releases/index.html diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index e5f2dcb0d18..6eb09aff304 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -8,6 +8,7 @@ on: - "optimum/**.py" - "docs/**.mdx" - "docs/**.yml" + - ".github/workflows/build_pr_documentation.yml" concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -15,7 +16,7 @@ concurrency: jobs: build_documentation: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 env: COMMIT_SHA: ${{ github.event.pull_request.head.sha }} PR_NUMBER: ${{ github.event.number }} @@ -60,6 +61,8 @@ jobs: - name: Setup environment run: | + python -m venv venv-doc + source venv-doc/bin/activate pip uninstall -y doc-builder cd doc-builder git pull origin main @@ -99,6 +102,7 @@ jobs: - name: Make TPU documentation run: | sudo docker system prune -a -f + source venv-doc/bin/activate cd optimum-tpu pip install -U pip pip install . -f https://storage.googleapis.com/libtpu-releases/index.html From eb6f5de5ce3eb69f73d7a0ee0da30f9bd8ca2a08 Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Fri, 11 Oct 2024 10:27:47 +0200 Subject: [PATCH 29/73] Don't hardcode the logger level to INFO; let users set TRANSFORMERS_VERBOSITY (#2047) And keep the default as Warning, i.e. 
the expected for Python modules --- optimum/exporters/onnx/__main__.py | 1 - optimum/exporters/tflite/__main__.py | 1 - optimum/onnx/transformations_utils.py | 1 - 3 files changed, 3 deletions(-) diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index 703e98df3e2..6a2cc6834a6 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -43,7 +43,6 @@ from .base import OnnxConfig logger = logging.get_logger() -logger.setLevel(logging.INFO) def main_export( diff --git a/optimum/exporters/tflite/__main__.py b/optimum/exporters/tflite/__main__.py index b3c90cb63f2..0c4c7b994fa 100644 --- a/optimum/exporters/tflite/__main__.py +++ b/optimum/exporters/tflite/__main__.py @@ -28,7 +28,6 @@ logger = logging.get_logger() -logger.setLevel(logging.INFO) def main(): diff --git a/optimum/onnx/transformations_utils.py b/optimum/onnx/transformations_utils.py index 1f0765112e8..fe55a5a5770 100644 --- a/optimum/onnx/transformations_utils.py +++ b/optimum/onnx/transformations_utils.py @@ -29,7 +29,6 @@ logger = logging.get_logger() -logger.setLevel(logging.INFO) def _find_duplicate_initializers( From 690d35b1ab31f375f5a4b74bf6eba37517656c05 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 11 Oct 2024 10:37:45 +0200 Subject: [PATCH 30/73] Add workflow to mark issues as stale (#2051) --- .github/workflows/stale.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/stale.yml diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml new file mode 100644 index 00000000000..a5e50a795b6 --- /dev/null +++ b/.github/workflows/stale.yml @@ -0,0 +1,14 @@ +name: 'Close stale issues and PRs' +on: + schedule: + - cron: '30 1 * * *' + +jobs: + stale: + runs-on: ubuntu-latest + steps: + - uses: actions/stale@v8 + with: + stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. 
Remove stale label or comment or this will be closed in 5 days.' + days-before-stale: 30 + days-before-close: 5 From b42db7ee6b5fa43e41adcbd501a3bd183b589991 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 11 Oct 2024 11:04:50 +0200 Subject: [PATCH 31/73] Fix onnx export CLI for transformers >= 4.45 (#2053) * fix onnx export * add test --- optimum/exporters/onnx/convert.py | 3 ++- tests/exporters/onnx/test_exporters_onnx_cli.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index d72fd7eb21a..565183b38fc 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -26,6 +26,7 @@ import numpy as np import onnx +from transformers.generation import GenerationMixin from transformers.modeling_utils import get_parameter_dtype from transformers.utils import is_tf_available, is_torch_available @@ -1127,7 +1128,7 @@ def onnx_export_from_model( if check_if_transformers_greater("4.44.99"): misplaced_generation_parameters = model.config._get_non_default_generation_parameters() - if model.can_generate() and len(misplaced_generation_parameters) > 0: + if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: logger.warning( "Moving the following attributes in the config to the generation config: " f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " diff --git a/tests/exporters/onnx/test_exporters_onnx_cli.py b/tests/exporters/onnx/test_exporters_onnx_cli.py index ed611ade04e..8b186e9307b 100644 --- a/tests/exporters/onnx/test_exporters_onnx_cli.py +++ b/tests/exporters/onnx/test_exporters_onnx_cli.py @@ -602,6 +602,14 @@ def test_diffusion(self): check=True, ) + def test_sentence_transformers(self): + with TemporaryDirectory() as tmpdirname: + subprocess.run( + f"python3 -m optimum.exporters.onnx --model sentence-transformers-testing/stsb-bert-tiny-onnx --task feature-extraction {tmpdirname}", + shell=True, + check=True, + ) + def test_legacy(self): with TemporaryDirectory() as tmpdirname: subprocess.run( From 94201540ac41b0a86042b04df0a3b374793761b8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 11 Oct 2024 12:18:30 +0200 Subject: [PATCH 32/73] Fix onnx export for transformers>=4.45 (#2054) * fix onnx export for transformers>=4.45 * fix tets * style --- optimum/exporters/onnx/convert.py | 6 +++++- tests/exporters/onnx/test_exporters_onnx_cli.py | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 565183b38fc..2661d835979 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -1128,7 +1128,11 @@ def onnx_export_from_model( if check_if_transformers_greater("4.44.99"): misplaced_generation_parameters = model.config._get_non_default_generation_parameters() - if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0: + if ( + isinstance(model, GenerationMixin) + and model.can_generate() + and len(misplaced_generation_parameters) > 0 + ): logger.warning( "Moving the following attributes in the config to the generation config: " f"{misplaced_generation_parameters}. 
You are seeing this warning because you've set " diff --git a/tests/exporters/onnx/test_exporters_onnx_cli.py b/tests/exporters/onnx/test_exporters_onnx_cli.py index 8b186e9307b..9ac7832aa7d 100644 --- a/tests/exporters/onnx/test_exporters_onnx_cli.py +++ b/tests/exporters/onnx/test_exporters_onnx_cli.py @@ -602,6 +602,7 @@ def test_diffusion(self): check=True, ) + @require_sentence_transformers def test_sentence_transformers(self): with TemporaryDirectory() as tmpdirname: subprocess.run( From 0d808ade96b01e35e5c8a38b0b156ce4b241f433 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 11 Oct 2024 16:23:17 +0200 Subject: [PATCH 33/73] Upgrade macOS image for tests compatibility with numpy v2 (#2055) * update runner environment * fix * downgrade * Update .github/workflows/test_bettertransformer.yml Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update .github/workflows/test_bettertransformer.yml Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * fix --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- .github/workflows/test_bettertransformer.yml | 5 ++--- .github/workflows/test_onnx.yml | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test_bettertransformer.yml b/.github/workflows/test_bettertransformer.yml index 080d8272dfc..b023fa4bd1b 100644 --- a/.github/workflows/test_bettertransformer.yml +++ b/.github/workflows/test_bettertransformer.yml @@ -15,9 +15,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-13] - exclude: [{ python-version: 3.8, os: macos-13 }] + python-version: [3.9] + os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 9aa8b307235..22a11720798 100644 --- a/.github/workflows/test_onnx.yml +++ 
b/.github/workflows/test_onnx.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-13] + os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} steps: From 058593927303ec94726c15a0cebf4da96ee628ec Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 14 Oct 2024 10:45:17 +0200 Subject: [PATCH 34/73] Fix main doc build (#2057) --- .github/workflows/build_main_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index 11e36ed57f3..efd61c1fd4f 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -57,6 +57,7 @@ jobs: - name: Free disk space run: | df -h + sudo apt-get update sudo apt-get purge -y '^apache.*' sudo apt-get purge -y '^imagemagick.*' sudo apt-get purge -y '^dotnet.*' From 9fd9ca5505e83f67c2a5e4c4f6f56d5fcf28442f Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 14 Oct 2024 12:54:15 +0200 Subject: [PATCH 35/73] Enter venv before pushing doc in main doc build workflow (#2058) --- .github/workflows/build_main_documentation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index efd61c1fd4f..c922f5097da 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -197,6 +197,7 @@ jobs: - name: Push to repositories run: | + source venv-doc/bin/activate cd optimum/optimum-doc-build sudo chmod -R ugo+rwx optimum doc-builder push optimum --doc_build_repo_id "hf-doc-build/doc-build" --token "${{ secrets.HF_DOC_BUILD_PUSH }}" --commit_msg "Updated with commit ${{ github.sha }} See: https://github.com/huggingface/optimum/commit/${{ github.sha }}" --n_retries 5 --upload_version_yml From 
d11e2850158694258875b6e4fcba1c5db61af42a Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:12:05 +0200 Subject: [PATCH 36/73] Fix tests expected environment variable name (#2059) * fix env variable name * fix test * comment * load onnx revision --- .github/workflows/test_onnxruntime.yml | 2 +- tests/onnxruntime/test_modeling.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 291a3b08335..a72bedb1ab7 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -51,7 +51,7 @@ jobs: - name: Test with pytest (in parallel) env: - FXMARTYCLONE_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} working-directory: tests run: | pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 501c7dac240..33243da278a 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -974,9 +974,13 @@ def test_load_model_from_hub_private(self): token = os.environ.get("HF_HUB_READ_TOKEN", None) if token is None: - self.skipTest("Test requires a token for fxmartyclone in the environment variable `HF_HUB_READ_TOKEN`.") + self.skipTest( + "Test requires a read access token for optimum-internal-testing in the environment variable `HF_HUB_READ_TOKEN`." 
+ ) - model = ORTModelForCustomTasks.from_pretrained("optimum-internal-testing/tiny-random-phi-private", token=token) + model = ORTModelForCustomTasks.from_pretrained( + "optimum-internal-testing/tiny-random-phi-private", revision="onnx", token=token + ) self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) From 1e5014e70f17e0437c4b0a7f4e65e170688d8ab0 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 15 Oct 2024 11:12:17 +0200 Subject: [PATCH 37/73] Remove unused HF_TOKEN environment variable (#2061) remove unused HF_TOKEN environment variable --- .github/workflows/test_fx_automatic_parallel.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test_fx_automatic_parallel.yml b/.github/workflows/test_fx_automatic_parallel.yml index 05ebf7ea9e5..c5d82be38b3 100644 --- a/.github/workflows/test_fx_automatic_parallel.yml +++ b/.github/workflows/test_fx_automatic_parallel.yml @@ -35,7 +35,6 @@ jobs: options: --mount type=tmpfs,destination=/tmp --shm-size 64gb --gpus all --ipc host -v /mnt/hf_cache:/mnt/cache/ env: NCCL_DEBUG: INFO - HF_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} defaults: run: shell: bash From 8e54205b3b6b45f10f6360c05bb3a560a27354fe Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 18 Oct 2024 11:06:48 +0200 Subject: [PATCH 38/73] Fix compatibility with diffusers < 0.25.0 (#2063) * Fix compatibility with diffusers < 0.25.0 * fix import --- optimum/onnxruntime/modeling_diffusion.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 87fcb68c7e9..3899a7b36b6 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -26,7 +26,6 @@ import numpy as np import torch from diffusers.configuration_utils import ConfigMixin 
-from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution from diffusers.pipelines import ( AutoPipelineForImage2Image, AutoPipelineForInpainting, @@ -52,6 +51,7 @@ from transformers.modeling_outputs import ModelOutput import onnxruntime as ort +from optimum.utils import check_if_diffusers_greater from ..exporters.onnx import main_export from ..onnx.utils import _get_model_external_data_paths @@ -73,6 +73,12 @@ ) +if check_if_diffusers_greater("0.25.0"): + from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution +else: + from diffusers.models.vae import DiagonalGaussianDistribution + + logger = logging.getLogger(__name__) From 59d6f7f04e390fb13fcba62bf22cea6ff2030623 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:59:22 +0200 Subject: [PATCH 39/73] Clean up ORT documentation (#2065) * refactor ort doc * fix links * fix --- .../package_reference/modeling_ort.mdx | 5 + .../onnxruntime/usage_guides/models.mdx | 275 +++--------------- 2 files changed, 48 insertions(+), 232 deletions(-) diff --git a/docs/source/onnxruntime/package_reference/modeling_ort.mdx b/docs/source/onnxruntime/package_reference/modeling_ort.mdx index 65b2b60195a..2c93ab3ac0d 100644 --- a/docs/source/onnxruntime/package_reference/modeling_ort.mdx +++ b/docs/source/onnxruntime/package_reference/modeling_ort.mdx @@ -119,6 +119,11 @@ The following ORT classes are available for the following custom tasks. 
## Stable Diffusion +#### ORTDiffusionPipeline + +[[autodoc]] onnxruntime.ORTDiffusionPipeline + - __call__ + #### ORTStableDiffusionPipeline [[autodoc]] onnxruntime.ORTStableDiffusionPipeline diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 131822e9568..1292e755c06 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -4,263 +4,74 @@ Optimum is a utility package for building and running inference with accelerated Optimum can be used to load optimized models from the [Hugging Face Hub](hf.co/models) and create pipelines to run accelerated inference without rewriting your APIs. -## Switching from Transformers to Optimum -The `optimum.onnxruntime.ORTModelForXXX` model classes are API compatible with Hugging Face Transformers models. This -means you can just replace your `AutoModelForXXX` class with the corresponding `ORTModelForXXX` class in `optimum.onnxruntime`. +## Loading -You do not need to adapt your code to get it to work with `ORTModelForXXX` classes: +### Transformers models -```diff -from transformers import AutoTokenizer, pipeline --from transformers import AutoModelForQuestionAnswering -+from optimum.onnxruntime import ORTModelForQuestionAnswering - --model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2") # PyTorch checkpoint -+model = ORTModelForQuestionAnswering.from_pretrained("optimum/roberta-base-squad2") # ONNX checkpoint -tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2") - -onnx_qa = pipeline("question-answering",model=model,tokenizer=tokenizer) - -question = "What's my name?" -context = "My name is Philipp and I live in Nuremberg." 
-pred = onnx_qa(question, context) -``` - -### Loading a vanilla Transformers model - -Because the model you want to work with might not be already converted to ONNX, [`~optimum.onnxruntime.ORTModel`] -includes a method to convert vanilla Transformers models to ONNX ones. Simply pass `export=True` to the -[`~optimum.onnxruntime.ORTModel.from_pretrained`] method, and your model will be loaded and converted to ONNX on-the-fly: - -```python ->>> from optimum.onnxruntime import ORTModelForSequenceClassification - ->>> # Load the model from the hub and export it to the ONNX format ->>> model = ORTModelForSequenceClassification.from_pretrained( -... "distilbert-base-uncased-finetuned-sst-2-english", export=True -... ) -``` - -### Pushing ONNX models to the Hugging Face Hub - -It is also possible, just as with regular [`~transformers.PreTrainedModel`]s, to push your `ORTModelForXXX` to the -[Hugging Face Model Hub](https://hf.co/models): - -```python ->>> from optimum.onnxruntime import ORTModelForSequenceClassification - ->>> # Load the model from the hub and export it to the ONNX format ->>> model = ORTModelForSequenceClassification.from_pretrained( -... "distilbert-base-uncased-finetuned-sst-2-english", export=True -... ) - ->>> # Save the converted model ->>> model.save_pretrained("a_local_path_for_convert_onnx_model") - -# Push the onnx model to HF Hub ->>> model.push_to_hub( # doctest: +SKIP -... "a_local_path_for_convert_onnx_model", repository_id="my-onnx-repo", use_auth_token=True -... ) -``` - -## Sequence-to-sequence models - -Sequence-to-sequence (Seq2Seq) models can also be used when running inference with ONNX Runtime. 
When Seq2Seq models -are exported to the ONNX format, they are decomposed into three parts that are later combined during inference: -- The encoder part of the model -- The decoder part of the model + the language modeling head -- The same decoder part of the model + language modeling head but taking and using pre-computed key / values as inputs and -outputs. This makes inference faster. - -Here is an example of how you can load a T5 model to the ONNX format and run inference for a translation task: - - -```python ->>> from transformers import AutoTokenizer, pipeline ->>> from optimum.onnxruntime import ORTModelForSeq2SeqLM - -# Load the model from the hub and export it to the ONNX format ->>> model_name = "t5-small" ->>> model = ORTModelForSeq2SeqLM.from_pretrained(model_name, export=True) ->>> tokenizer = AutoTokenizer.from_pretrained(model_name) - -# Create a pipeline ->>> onnx_translation = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer) ->>> text = "He never went out without a book under his arm, and he often came back with two." ->>> result = onnx_translation(text) ->>> # [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}] -``` - -## Stable Diffusion - -Stable Diffusion models can also be used when running inference with ONNX Runtime. When Stable Diffusion models -are exported to the ONNX format, they are split into four components that are later combined during inference: -- The text encoder -- The U-NET -- The VAE encoder -- The VAE decoder - -Make sure you have 🤗 Diffusers installed. 
- -To install `diffusers`: -```bash -pip install diffusers -``` - -### Text-to-Image - -Here is an example of how you can load an ONNX Stable Diffusion model and run inference using ONNX Runtime: - -```python -from optimum.onnxruntime import ORTStableDiffusionPipeline - -model_id = "runwayml/stable-diffusion-v1-5" -pipeline = ORTStableDiffusionPipeline.from_pretrained(model_id, revision="onnx") -prompt = "sailing ship in storm by Leonardo da Vinci" -image = pipeline(prompt).images[0] -``` +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `AutoModelForXxx` class with the corresponding `ORTModelForXxx`. -To load your PyTorch model and convert it to ONNX on-the-fly, you can set `export=True`. - -```python -pipeline = ORTStableDiffusionPipeline.from_pretrained(model_id, export=True) - -# Don't forget to save the ONNX model -save_directory = "a_local_path" -pipeline.save_pretrained(save_directory) -``` - -
- -
- -### Image-to-Image - -```python -import requests -import torch -from PIL import Image -from io import BytesIO -from optimum.onnxruntime import ORTStableDiffusionImg2ImgPipeline - -model_id = "runwayml/stable-diffusion-v1-5" -pipeline = ORTStableDiffusionImg2ImgPipeline.from_pretrained(model_id, revision="onnx") - -url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg" - -response = requests.get(url) -init_image = Image.open(BytesIO(response.content)).convert("RGB") -init_image = init_image.resize((768, 512)) - -prompt = "A fantasy landscape, trending on artstation" - -image = pipeline(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5).images[0] -image.save("fantasy_landscape.png") -``` - -### Inpaint - -```python -import PIL -import requests -import torch -from io import BytesIO -from optimum.onnxruntime import ORTStableDiffusionInpaintPipeline - -model_id = "runwayml/stable-diffusion-inpainting" -pipeline = ORTStableDiffusionInpaintPipeline.from_pretrained(model_id, revision="onnx") - -def download_image(url): - response = requests.get(url) - return PIL.Image.open(BytesIO(response.content)).convert("RGB") - -img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" -mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" +```diff + from transformers import AutoTokenizer, pipeline +- from transformers import AutoModelForQuestionAnswering ++ from optimum.onnxruntime import ORTModelForQuestionAnswering -init_image = download_image(img_url).resize((512, 512)) -mask_image = download_image(mask_url).resize((512, 512)) +- model = AutoModelForQuestionAnswering.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint ++ model = ORTModelForQuestionAnswering.from_pretrained("onnx-community/Llama-3.2-1B", 
subfolder="onnx") # ONNX checkpoint + tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B") -prompt = "Face of a yellow cat, high resolution, sitting on a park bench" -image = pipeline(prompt=prompt, image=init_image, mask_image=mask_image).images[0] + pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + result = pipe("He never went out without a book under his arm") ``` +More information for all the supported `ORTModelForXxx` in our [documentation](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort) -## Stable Diffusion XL - -Before using `ORTStableDiffusionXLPipeline` make sure to have `diffusers` and `invisible_watermark` installed. You can install the libraries as follows: -```bash -pip install diffusers -pip install invisible-watermark>=0.2.0 -``` - -### Text-to-Image - -Here is an example of how you can load a SDXL ONNX model from [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) and run inference using ONNX Runtime : +### Diffusers models -```python -from optimum.onnxruntime import ORTStableDiffusionXLPipeline +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `DiffusionPipeline` class with the corresponding `ORTDiffusionPipeline`. 
-model_id = "stabilityai/stable-diffusion-xl-base-1.0" -base = ORTStableDiffusionXLPipeline.from_pretrained(model_id) -prompt = "sailing ship in storm by Leonardo da Vinci" -image = base(prompt).images[0] -# Don't forget to save the ONNX model -save_directory = "sd_xl_base" -base.save_pretrained(save_directory) +```diff +- from diffusers import DiffusionPipeline ++ from optimum.onnxruntime import ORTDiffusionPipeline + + model_id = "runwayml/stable-diffusion-v1-5" +- pipeline = DiffusionPipeline.from_pretrained(model_id) ++ pipeline = ORTDiffusionPipeline.from_pretrained(model_id, revision="onnx") + prompt = "sailing ship in storm by Leonardo da Vinci" + image = pipeline(prompt).images[0] ``` +## Converting your model to ONNX on-the-fly -### Image-to-Image - -Here is an example of how you can load a PyTorch SDXL model, convert it to ONNX on-the-fly and run inference using ONNX Runtime for *image-to-image* : +In case your model wasn't already [converted to ONNX](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), [`~optimum.onnxruntime.ORTModel`] includes a method to convert your model to ONNX on-the-fly. 
+Simply pass `export=True` to the [`~optimum.onnxruntime.ORTModel.from_pretrained`] method, and your model will be loaded and converted to ONNX on-the-fly: ```python -from optimum.onnxruntime import ORTStableDiffusionXLImg2ImgPipeline -from diffusers.utils import load_image - -model_id = "stabilityai/stable-diffusion-xl-refiner-1.0" -pipeline = ORTStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) +>>> from optimum.onnxruntime import ORTModelForSequenceClassification -url = "https://huggingface.co/datasets/optimum/documentation-images/resolve/main/intel/openvino/sd_xl/castle_friedrich.png" -image = load_image(url).convert("RGB") -prompt = "medieval castle by Caspar David Friedrich" -image = pipeline(prompt, image=image).images[0] -image.save("medieval_castle.png") +>>> # Load the model from the hub and export it to the ONNX format +>>> model_id = "distilbert-base-uncased-finetuned-sst-2-english" +>>> model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True) ``` -### Refining the image output - -The image can be refined by making use of a model like [stabilityai/stable-diffusion-xl-refiner-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0). In this case, you only have to output the latents from the base model. +## Pushing your model to the Hub +You can also call `push_to_hub` directly on your model to upload it to the [Hub](https://hf.co/models). 
```python -from optimum.onnxruntime import ORTStableDiffusionXLImg2ImgPipeline - -model_id = "stabilityai/stable-diffusion-xl-refiner-1.0" -refiner = ORTStableDiffusionXLImg2ImgPipeline.from_pretrained(model_id, export=True) - -image = base(prompt=prompt, output_type="latent").images[0] -image = refiner(prompt=prompt, image=image[None, :]).images[0] -image.save("sailing_ship.png") -``` - - - -## Latent Consistency Models - -### Text-to-Image +>>> from optimum.onnxruntime import ORTModelForSequenceClassification -Here is an example of how you can load a Latent Consistency Models (LCMs) from [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7) and run inference using ONNX Runtime : +>>> # Load the model from the hub and export it to the ONNX format +>>> model_id = "distilbert-base-uncased-finetuned-sst-2-english" +>>> model = ORTModelForSequenceClassification.from_pretrained(model_id, export=True) -```python -from optimum.onnxruntime import ORTLatentConsistencyModelPipeline +>>> # Save the converted model locally +>>> output_dir = "a_local_path_for_convert_onnx_model" +>>> model.save_pretrained(output_dir) -model_id = "SimianLuo/LCM_Dreamshaper_v7" -pipeline = ORTLatentConsistencyModelPipeline.from_pretrained(model_id, export=True) -prompt = "sailing ship in storm by Leonardo da Vinci" -images = pipeline(prompt, num_inference_steps=4, guidance_scale=8.0).images -``` +# Push the onnx model to HF Hub +>>> model.push_to_hub(output_dir, repository_id="my-onnx-repo") # doctest: +SKIP +``` \ No newline at end of file From 8af46e53bd1321d325ea4e712e7da8aca98df49f Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Sat, 19 Oct 2024 20:37:48 +0200 Subject: [PATCH 40/73] Fix ort documentation code snippet (#2070) fix code snippet --- docs/source/onnxruntime/usage_guides/models.mdx | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git 
a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 1292e755c06..905e6632c05 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -13,11 +13,11 @@ Once your model was [exported to the ONNX format](https://huggingface.co/docs/op ```diff from transformers import AutoTokenizer, pipeline -- from transformers import AutoModelForQuestionAnswering -+ from optimum.onnxruntime import ORTModelForQuestionAnswering +- from transformers import AutoModelForCausalLM ++ from optimum.onnxruntime import ORTModelForCausalLM -- model = AutoModelForQuestionAnswering.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint -+ model = ORTModelForQuestionAnswering.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint +- model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B) # PyTorch checkpoint ++ model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B") pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) @@ -74,4 +74,4 @@ You can also call `push_to_hub` directly on your model to upload it to the [Hub] # Push the onnx model to HF Hub >>> model.push_to_hub(output_dir, repository_id="my-onnx-repo") # doctest: +SKIP -``` \ No newline at end of file +``` From 58c3571156466c13ebc7e22d3405b376ab2d222b Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Tue, 22 Oct 2024 16:48:29 +0200 Subject: [PATCH 41/73] Update the habana extra (#2077) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index fb290274a3b..822d8be1b80 100644 --- a/setup.py +++ b/setup.py @@ -84,7 +84,7 @@ "nncf": "optimum-intel[nncf]>=1.18.0", "neural-compressor": "optimum-intel[neural-compressor]>=1.18.0", "ipex": 
"optimum-intel[ipex]>=1.18.0", - "habana": ["optimum-habana", "transformers>=4.43.0,<4.44.0"], + "habana": ["optimum-habana", "transformers>=4.45.0,<4.46.0"], "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers>=4.36.2,<4.42.0"], "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers>=4.36.2,<4.42.0"], "graphcore": "optimum-graphcore", From 2e637be5d6b3e15c2b300130599bcec0f3e12ec8 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 25 Oct 2024 13:41:10 +0200 Subject: [PATCH 42/73] Add sentence-transformers and timm documentation example (#2072) * add sentence-transformers and timm example to documentation * replace with onnx models * rephrase --- .../onnxruntime/usage_guides/models.mdx | 58 ++++++++++++++++++- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx index 905e6632c05..27ac446096b 100644 --- a/docs/source/onnxruntime/usage_guides/models.mdx +++ b/docs/source/onnxruntime/usage_guides/models.mdx @@ -9,7 +9,7 @@ to run accelerated inference without rewriting your APIs. ### Transformers models -Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `AutoModelForXxx` class with the corresponding `ORTModelForXxx`. +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `AutoModelForXxx` with the corresponding `ORTModelForXxx` class. 
```diff from transformers import AutoTokenizer, pipeline @@ -29,7 +29,7 @@ More information for all the supported `ORTModelForXxx` in our [documentation](h ### Diffusers models -Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `DiffusionPipeline` class with the corresponding `ORTDiffusionPipeline`. +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `DiffusionPipeline` with the corresponding `ORTDiffusionPipeline` class. ```diff @@ -43,6 +43,60 @@ Once your model was [exported to the ONNX format](https://huggingface.co/docs/op image = pipeline(prompt).images[0] ``` + +### Sentence Transformers models + +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `AutoModel` with the corresponding `ORTModelForFeatureExtraction` class. + +```diff + from transformers import AutoTokenizer +- from transformers import AutoModel ++ from optimum.onnxruntime import ORTModelForFeatureExtraction + + tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") +- model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2") ++ model = ORTModelForFeatureExtraction.from_pretrained("optimum/all-MiniLM-L6-v2") + inputs = tokenizer("This is an example sentence", return_tensors="pt") + outputs = model(**inputs) +``` + +You can also load your ONNX model directly using the [`sentence_transformers.SentenceTransformer`](https://sbert.net/docs/sentence_transformer/usage/efficiency.html#onnx) class, just make sure to have `sentence-transformers>=3.2` installed. If the model wasn't already converted to ONNX, it will be converted automatically on-the-fly. 
+ +```diff + from sentence_transformers import SentenceTransformer + + model_id = "sentence-transformers/all-MiniLM-L6-v2" +- model = SentenceTransformer(model_id) ++ model = SentenceTransformer(model_id, backend="onnx") + + sentences = ["This is an example sentence", "Each sentence is converted"] + embeddings = model.encode(sentences) +``` + + +### Timm models + +Once your model was [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing the `create_model` with the corresponding `ORTModelForImageClassification` class. + + +```diff + import requests + from PIL import Image +- from timm import create_model + from timm.data import resolve_data_config, create_transform ++ from optimum.onnxruntime import ORTModelForImageClassification + +- model = create_model("timm/mobilenetv3_large_100.ra_in1k", pretrained=True) ++ model = ORTModelForImageClassification.from_pretrained("optimum/mobilenetv3_large_100.ra_in1k") + transform = create_transform(**resolve_data_config(model.config.pretrained_cfg, model=model)) + url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png" + image = Image.open(requests.get(url, stream=True).raw) + inputs = transform(image).unsqueeze(0) + outputs = model(inputs) +``` + + + ## Converting your model to ONNX on-the-fly In case your model wasn't already [converted to ONNX](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), [`~optimum.onnxruntime.ORTModel`] includes a method to convert your model to ONNX on-the-fly. 
From 4a39ae0b1de05601c8a33f4a13c244bdd016db24 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 29 Oct 2024 10:20:31 +0100 Subject: [PATCH 43/73] Create token type ids when not provided (#2081) * create token type ids when needed * add test --- optimum/onnxruntime/modeling_ort.py | 19 ++++++++++++++++++- tests/onnxruntime/test_modeling.py | 12 ++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index ce1d68536ac..8e5a814b689 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -931,7 +931,6 @@ def _prepare_onnx_inputs( self, use_torch: bool, **inputs: Union[torch.Tensor, np.ndarray] ) -> Dict[str, np.ndarray]: onnx_inputs = {} - # converts pytorch inputs into numpy inputs for onnx for input_name in self.input_names.keys(): onnx_inputs[input_name] = inputs.pop(input_name) @@ -1086,6 +1085,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1241,6 +1243,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1330,6 +1335,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and 
"token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1437,6 +1445,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1527,6 +1538,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, @@ -1610,6 +1624,9 @@ def forward( use_torch = isinstance(input_ids, torch.Tensor) self.raise_on_numpy_input_io_binding(use_torch) + if token_type_ids is None and "token_type_ids" in self.input_names: + token_type_ids = torch.zeros_like(input_ids) if use_torch else np.zeros_like(input_ids) + if self.device.type == "cuda" and self.use_io_binding: io_binding, output_shapes, output_buffers = self.prepare_io_binding( input_ids, diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 33243da278a..da450b8e31c 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2192,6 +2192,18 @@ def test_compare_to_io_binding(self, model_arch): gc.collect() + def test_default_token_type_ids(self): + model_id = MODEL_NAMES["bert"] + model = ORTModelForFeatureExtraction.from_pretrained(model_id, export=True) + 
tokenizer = AutoTokenizer.from_pretrained(model_id) + tokens = tokenizer("this is a simple input", return_tensors="np") + self.assertTrue("token_type_ids" in model.input_names) + token_type_ids = tokens.pop("token_type_ids") + outs = model(token_type_ids=token_type_ids, **tokens) + outs_without_token_type_ids = model(**tokens) + self.assertTrue(np.allclose(outs.last_hidden_state, outs_without_token_type_ids.last_hidden_state)) + gc.collect() + class ORTModelForMultipleChoiceIntegrationTest(ORTModelTestMixin): # Multiple Choice tests are conducted on different models due to mismatch size in model's classifier From 6802a0c4e9868041aa825f629c5e983df96e3cab Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:56:28 +0100 Subject: [PATCH 44/73] Add transformers 4.46 compatiblity (#2078) * transformers 4.46 * setup * uupdate setup * fix t5 * update python (3.8 eol) * fix onnx test * fixed deberta, onnxruntime tests in series passing * fix bt * fixed t5_forward for real, because it's also used by blip-2 as well * fix Phi3 * fix opt * vision encoder decoder * fix setup * style * fix encoder decoder * fixed transformers branch * branch * allow 4.47 * remove patch * add opt * add test * fix OPT ONNX export and inference * add test * update setup * style * merge tests * update tes num beams * add test transformers version * add architectures depending on transformers * add warning * revert * update test generation length * style --------- Co-authored-by: IlyasMoutawwakil --- .github/workflows/check_code_quality.yml | 2 +- .github/workflows/test_benckmark.yml | 30 +- .github/workflows/test_cli.yml | 4 +- .github/workflows/test_export_onnx.yml | 44 +-- .github/workflows/test_export_onnx_cli.yml | 30 +- .../workflows/test_export_onnx_cli_timm.yml | 26 +- .github/workflows/test_export_onnx_timm.yml | 27 +- .github/workflows/test_exporters_common.yml | 2 +- .github/workflows/test_exporters_slow.yml | 2 +- 
.github/workflows/test_fx.yml | 2 +- .github/workflows/test_offline.yml | 2 +- .github/workflows/test_onnx.yml | 2 +- .github/workflows/test_onnxruntime.yml | 13 +- .github/workflows/test_onnxruntime_slow.yml | 2 +- .github/workflows/test_optimum_common.yml | 39 +-- .github/workflows/test_utils.yml | 2 +- optimum/bettertransformer/models/attention.py | 326 ++++++++++++------ .../models/decoder_models.py | 4 +- optimum/bettertransformer/transformation.py | 36 +- optimum/exporters/onnx/model_configs.py | 49 ++- optimum/exporters/onnx/model_patcher.py | 3 +- optimum/exporters/onnx/utils.py | 6 +- optimum/onnxruntime/modeling_decoder.py | 4 +- optimum/utils/__init__.py | 1 + optimum/utils/import_utils.py | 16 + setup.py | 24 +- tests/bettertransformer/test_audio.py | 20 +- tests/bettertransformer/test_common.py | 12 +- tests/bettertransformer/test_decoder.py | 8 +- tests/bettertransformer/test_encoder.py | 4 +- .../bettertransformer/test_encoder_decoder.py | 2 +- tests/bettertransformer/test_gpu.py | 4 +- tests/bettertransformer/testing_utils.py | 18 +- tests/onnx/test_onnx_export_custom_module.py | 17 +- tests/onnxruntime/test_modeling.py | 61 ++-- tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 36 files changed, 541 insertions(+), 304 deletions(-) diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml index c429b706bff..861684cfa4d 100644 --- a/.github/workflows/check_code_quality.yml +++ b/.github/workflows/check_code_quality.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml index 7f7f2ace329..e859e845d64 100644 --- a/.github/workflows/test_benckmark.yml +++ b/.github/workflows/test_benckmark.yml @@ -4,9 +4,9 @@ name: Benchmark suite / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main 
] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -17,20 +17,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install wheel - pip install .[tests,onnxruntime,benchmark] - - name: Test with unittest - run: | - python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install wheel + pip install .[tests,onnxruntime,benchmark] + - name: Test with unittest + run: | + python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml index ecb19d23aa3..2efab40aab6 100644 --- a/.github/workflows/test_cli.yml +++ b/.github/workflows/test_cli.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} @@ -34,7 +34,7 @@ jobs: run: | pip install --upgrade pip pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu - pip install .[tests,exporters,exporters-tf] + pip install .[tests,exporters-tf] - name: Test with pytest run: | diff --git a/.github/workflows/test_export_onnx.yml b/.github/workflows/test_export_onnx.yml index 56ef674cb41..0cd19a1724c 100644 --- a/.github/workflows/test_export_onnx.yml +++ b/.github/workflows/test_export_onnx.yml @@ -2,9 +2,9 @@ name: Exporters ONNX / Python - Test on: push: - branches: [ 
main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -15,27 +15,27 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export - run: | - pip install .[tests,exporters] - - name: Test with unittest - working-directory: tests - run: | - pytest exporters/onnx/test_onnx_*.py -s -n auto -m "not tensorflow_test and not timm_test" --durations=0 - - name: Install dependencies for tensorflow export - run: | - pip install .[tests,exporters-tf] - - name: Test with unittest - working-directory: tests - run: | - pytest exporters/onnx/test_onnx_*.py -n auto -m "tensorflow_test" -s --durations=0 + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for pytorch export + run: | + pip install .[tests,exporters] + - name: Test with unittest + working-directory: tests + run: | + pytest exporters/onnx/test_onnx_*.py -s -n auto -m "not tensorflow_test and not timm_test" --durations=0 + - name: Install dependencies for tensorflow export + run: | + pip install .[tests,exporters-tf] + - name: Test with unittest + working-directory: tests + run: | + pytest exporters/onnx/test_onnx_*.py -n auto -m "tensorflow_test" -s --durations=0 diff --git a/.github/workflows/test_export_onnx_cli.yml b/.github/workflows/test_export_onnx_cli.yml index 8fa4ebb045f..618a140c147 100644 --- a/.github/workflows/test_export_onnx_cli.yml +++ b/.github/workflows/test_export_onnx_cli.yml @@ -2,9 +2,9 @@ name: Exporters ONNX CLI / Python - Test on: push: - 
branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -15,20 +15,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export - run: | - pip install .[tests,exporters] - - name: Test with unittest - working-directory: tests - run: | - pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0 + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for pytorch export + run: | + pip install .[tests,exporters] + - name: Test with unittest + working-directory: tests + run: | + pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -m "not tensorflow_test and not timm_test" -s --durations=0 diff --git a/.github/workflows/test_export_onnx_cli_timm.yml b/.github/workflows/test_export_onnx_cli_timm.yml index 76a535fcebd..b92d5551ba1 100644 --- a/.github/workflows/test_export_onnx_cli_timm.yml +++ b/.github/workflows/test_export_onnx_cli_timm.yml @@ -14,20 +14,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export - run: | - pip install .[tests,exporters] - - name: Test with unittest - working-directory: tests - run: | - RUN_SLOW=1 pytest 
exporters/onnx/test_exporters_onnx_cli.py -n auto -k "timm" -s --durations=0 + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for pytorch export + run: | + pip install .[tests,exporters] + - name: Test with unittest + working-directory: tests + run: | + RUN_SLOW=1 pytest exporters/onnx/test_exporters_onnx_cli.py -n auto -k "timm" -s --durations=0 diff --git a/.github/workflows/test_export_onnx_timm.yml b/.github/workflows/test_export_onnx_timm.yml index 339e3e93dec..c16d20fbc18 100644 --- a/.github/workflows/test_export_onnx_timm.yml +++ b/.github/workflows/test_export_onnx_timm.yml @@ -14,21 +14,20 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies for pytorch export - run: | - pip install .[tests,exporters] - - name: Test with unittest - working-directory: tests - run: | - RUN_SLOW=1 pytest exporters/onnx/ -s -n auto -k "timm" --durations=0 - + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for pytorch export + run: | + pip install .[tests,exporters] + - name: Test with unittest + working-directory: tests + run: | + RUN_SLOW=1 pytest exporters/onnx/ -s -n auto -k "timm" --durations=0 diff --git a/.github/workflows/test_exporters_common.yml b/.github/workflows/test_exporters_common.yml index 8e8c3360c1f..11f6038afe4 100644 --- a/.github/workflows/test_exporters_common.yml +++ b/.github/workflows/test_exporters_common.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false 
matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_exporters_slow.yml b/.github/workflows/test_exporters_slow.yml index b22fdd7fd2a..453389d63fa 100644 --- a/.github/workflows/test_exporters_slow.yml +++ b/.github/workflows/test_exporters_slow.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml index f0366cf0d1e..a4e6dd3cd29 100644 --- a/.github/workflows/test_fx.yml +++ b/.github/workflows/test_fx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_offline.yml b/.github/workflows/test_offline.yml index 90b0108e512..20911fe6db8 100644 --- a/.github/workflows/test_offline.yml +++ b/.github/workflows/test_offline.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 22a11720798..dd1f3bee63d 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index a72bedb1ab7..0ab95752d01 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -17,8 +17,11 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + transformers-version: ["latest"] os: [ubuntu-20.04, windows-2019, macos-13] + include: + - transformers-version: 
"4.45.*" + os: ubuntu-20.04 runs-on: ${{ matrix.os }} steps: @@ -33,10 +36,10 @@ jobs: - name: Checkout code uses: actions/checkout@v4 - - name: Setup Python ${{ matrix.python-version }} + - name: Setup Python uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: 3.9 - name: Install dependencies run: | @@ -44,6 +47,10 @@ jobs: pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu pip install .[tests,onnxruntime] + - name: Install transformers ${{ matrix.transformers-version }} + if: ${{ matrix.transformers-version != 'latest' }} + run: pip install transformers==${{ matrix.transformers-version }} + - name: Test with pytest (in series) working-directory: tests run: | diff --git a/.github/workflows/test_onnxruntime_slow.yml b/.github/workflows/test_onnxruntime_slow.yml index 20371f79150..c5679e5b307 100644 --- a/.github/workflows/test_onnxruntime_slow.yml +++ b/.github/workflows/test_onnxruntime_slow.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_optimum_common.yml b/.github/workflows/test_optimum_common.yml index ded149c9b69..5ad42807a5f 100644 --- a/.github/workflows/test_optimum_common.yml +++ b/.github/workflows/test_optimum_common.yml @@ -4,9 +4,9 @@ name: Optimum common / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -17,25 +17,24 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] + python-version: [3.9] os: [ubuntu-20.04, windows-2019, macos-13] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version 
}} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install .[tests] - ls -l optimum/ - - name: Test with unittest - shell: bash - run: | - # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel. - export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }} - pytest tests/test_*.py - + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[tests] + ls -l optimum/ + - name: Test with unittest + shell: bash + run: | + # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel. + export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }} + pytest tests/test_*.py diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml index 1ef33ced086..b5f2e27fc6a 100644 --- a/.github/workflows/test_utils.yml +++ b/.github/workflows/test_utils.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-20.04, macos-13] - python-version: [3.8, 3.9] + python-version: [3.9] runs-on: ${{ matrix.os }} steps: diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py index 22b8faf1c21..c8c91a04e4e 100644 --- a/optimum/bettertransformer/models/attention.py +++ b/optimum/bettertransformer/models/attention.py @@ -387,137 +387,243 @@ def opt_forward( # Adapted from transformers.models.t5.modeling_t5.T5Attention.forward -def t5_forward( - self, - hidden_states, - mask=None, - key_value_states=None, - position_bias=None, - past_key_value=None, - layer_head_mask=None, - query_length=None, - use_cache=False, - output_attentions=False, - **kwargs, -): - raise_on_head_mask(layer_head_mask) +if 
check_if_transformers_greater("4.45.99"): - if output_attentions is True: - raise ValueError("output_attentions=True can not be supported with BetterTransformer.") - if len(self.pruned_heads) > 0: - raise ValueError(f"Setting `pruned_heads` is unsupported with BetterTransformer, found {self.pruned_heads}.") - batch_size, seq_length = hidden_states.shape[:2] - - real_seq_length = seq_length - - if past_key_value is not None: - assert ( - len(past_key_value) == 2 - ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" - real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length - - key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] - - def shape(states): - """projection""" - return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) - - def unshape(states): - """reshape""" - return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) - - def project(hidden_states, proj_layer, key_value_states, past_key_value): - """projects hidden states correctly to key/query states""" - if key_value_states is None: - # self-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(hidden_states)) - elif past_key_value is None: - # cross-attn - # (batch_size, n_heads, seq_length, dim_per_head) - hidden_states = shape(proj_layer(key_value_states)) + def t5_forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + cache_position=None, + ): + """ + Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states). 
+ """ + # Input is (batch_size, seq_length, dim) + # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder) + batch_size, seq_length = hidden_states.shape[:2] + + # if key_value_states are provided this layer is used as a cross-attention layer for the decoder + is_cross_attention = key_value_states is not None + + query_states = self.q(hidden_states) + query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) if past_key_value is not None: + is_updated = past_key_value.is_updated.get(self.layer_idx) + if is_cross_attention: + # after the first generated id, we can subsequently re-use all key/value_states from cache + curr_past_key_value = past_key_value.cross_attention_cache + else: + curr_past_key_value = past_key_value.self_attention_cache + + current_states = key_value_states if is_cross_attention else hidden_states + if is_cross_attention and past_key_value is not None and is_updated: + # reuse k,v, cross_attentions + key_states = curr_past_key_value.key_cache[self.layer_idx] + value_states = curr_past_key_value.value_cache[self.layer_idx] + else: + key_states = self.k(current_states) + value_states = self.v(current_states) + key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + if past_key_value is not None: + # save all key/value_states to cache to be re-used for fast auto-regressive generation + cache_position = cache_position if not is_cross_attention else None + key_states, value_states = curr_past_key_value.update( + key_states, value_states, self.layer_idx, {"cache_position": cache_position} + ) + # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls + if is_cross_attention: + past_key_value.is_updated[self.layer_idx] = True + + if position_bias is 
None: + key_length = key_states.shape[-2] + # cache position is 0-indexed so we add 1 to get the real length of queries (aka with past) + real_seq_length = query_length if query_length is not None else cache_position[-1] + 1 + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, seq_length, key_length), device=query_states.device, dtype=query_states.dtype + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias( + real_seq_length, key_length, device=query_states.device, cache_position=cache_position + ) + position_bias = position_bias[:, :, -seq_length:, :] + + if mask is not None: + causal_mask = mask[:, :, :, : key_states.shape[-2]] + position_bias = position_bias + causal_mask + + if self.pruned_heads: + mask = torch.ones(position_bias.shape[1]) + mask[list(self.pruned_heads)] = 0 + position_bias_masked = position_bias[:, mask.bool()] + else: + position_bias_masked = position_bias + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=position_bias_masked, + dropout_p=self.dropout if self.training else 0.0, + is_causal=False, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(batch_size, -1, self.inner_dim) + attn_output = self.o(attn_output) + + outputs = (attn_output, past_key_value, position_bias) + + return outputs + +else: + + def t5_forward( + self, + hidden_states, + mask=None, + key_value_states=None, + position_bias=None, + past_key_value=None, + layer_head_mask=None, + query_length=None, + use_cache=False, + output_attentions=False, + **kwargs, + ): + raise_on_head_mask(layer_head_mask) + + if output_attentions is True: + raise ValueError("output_attentions=True can not be supported with BetterTransformer.") + if len(self.pruned_heads) > 0: + raise ValueError( + f"Setting `pruned_heads` is unsupported with 
BetterTransformer, found {self.pruned_heads}." + ) + + batch_size, seq_length = hidden_states.shape[:2] + + real_seq_length = seq_length + + if past_key_value is not None: + assert ( + len(past_key_value) == 2 + ), f"past_key_value should have 2 past states: keys and values. Got { len(past_key_value)} past states" + real_seq_length += past_key_value[0].shape[2] if query_length is None else query_length + + key_length = real_seq_length if key_value_states is None else key_value_states.shape[1] + + def shape(states): + """projection""" + return states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2) + + def unshape(states): + """reshape""" + return states.transpose(1, 2).contiguous().view(batch_size, -1, self.inner_dim) + + def project(hidden_states, proj_layer, key_value_states, past_key_value): + """projects hidden states correctly to key/query states""" if key_value_states is None: # self-attn - # (batch_size, n_heads, key_length, dim_per_head) - hidden_states = torch.cat([past_key_value, hidden_states], dim=2) - elif past_key_value.shape[2] != key_value_states.shape[1]: - # checking that the `sequence_length` of the `past_key_value` is the same as - # the provided `key_value_states` to support prefix tuning + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(hidden_states)) + elif past_key_value is None: # cross-attn # (batch_size, n_heads, seq_length, dim_per_head) hidden_states = shape(proj_layer(key_value_states)) - else: - # cross-attn - hidden_states = past_key_value - return hidden_states - - # get query states - query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) - # get key/value states - key_states = project( - hidden_states, - self.k, - key_value_states, - past_key_value[0] if past_key_value is not None else None, - ) - value_states = project( - hidden_states, - self.v, - key_value_states, - past_key_value[1] if past_key_value is not None else None, - 
) + if past_key_value is not None: + if key_value_states is None: + # self-attn + # (batch_size, n_heads, key_length, dim_per_head) + hidden_states = torch.cat([past_key_value, hidden_states], dim=2) + elif past_key_value.shape[2] != key_value_states.shape[1]: + # checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + # cross-attn + # (batch_size, n_heads, seq_length, dim_per_head) + hidden_states = shape(proj_layer(key_value_states)) + else: + # cross-attn + hidden_states = past_key_value + return hidden_states + + # get query states + query_states = shape(self.q(hidden_states)) # (batch_size, n_heads, seq_length, dim_per_head) + + # get key/value states + key_states = project( + hidden_states, + self.k, + key_value_states, + past_key_value[0] if past_key_value is not None else None, + ) + value_states = project( + hidden_states, + self.v, + key_value_states, + past_key_value[1] if past_key_value is not None else None, + ) - dropout_p = self.dropout if self.training else 0.0 - query_states = self.scale * query_states - if position_bias is None and not self.has_relative_attention_bias: - if mask is None: - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, key_states, value_states, attn_mask=None, dropout_p=dropout_p, is_causal=False - ) - elif mask is not None: + dropout_p = self.dropout if self.training else 0.0 + query_states = self.scale * query_states + if position_bias is None and not self.has_relative_attention_bias: attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=mask, dropout_p=dropout_p, is_causal=False ) - if position_bias is None: - if not self.has_relative_attention_bias: - position_bias = torch.zeros( - (1, self.n_heads, real_seq_length, key_length), - device=value_states.device, - dtype=value_states.dtype, - ) - if self.gradient_checkpointing and self.training: - 
position_bias.requires_grad = True + if position_bias is None: + if not self.has_relative_attention_bias: + position_bias = torch.zeros( + (1, self.n_heads, real_seq_length, key_length), + device=value_states.device, + dtype=value_states.dtype, + ) + if self.gradient_checkpointing and self.training: + position_bias.requires_grad = True + else: + position_bias = self.compute_bias(real_seq_length, key_length, device=value_states.device) + + # if key and values are already calculated + # we want only the last query position bias + if past_key_value is not None: + position_bias = position_bias[:, :, -hidden_states.size(1) :, :] + + if mask is not None: + position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) + + if self.has_relative_attention_bias: + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=position_bias, + dropout_p=dropout_p, + is_causal=False, + ) else: - position_bias = self.compute_bias(real_seq_length, key_length, device=value_states.device) - - # if key and values are already calculated - # we want only the last query position bias - if past_key_value is not None: - position_bias = position_bias[:, :, -hidden_states.size(1) :, :] - - if mask is not None: - position_bias = position_bias + mask # (batch_size, n_heads, seq_length, key_length) - - if self.has_relative_attention_bias: attn_output = torch.nn.functional.scaled_dot_product_attention( query_states, key_states, value_states, attn_mask=position_bias, dropout_p=dropout_p, is_causal=False ) - else: - attn_output = torch.nn.functional.scaled_dot_product_attention( - query_states, key_states, value_states, attn_mask=position_bias, dropout_p=dropout_p, is_causal=False - ) - attn_output = unshape(attn_output) # (batch_size, seq_length, dim) - attn_output = self.o(attn_output) + attn_output = unshape(attn_output) # (batch_size, seq_length, dim) + attn_output = self.o(attn_output) - 
present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None - outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) + present_key_value_state = (key_states, value_states) if (self.is_decoder and use_cache) else None + outputs = (attn_output,) + (present_key_value_state,) + (position_bias,) - return outputs + return outputs # Adapted from transformers.models.bart.modeling_bart.BartAttention.forward diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py index 52d28d076d3..e8045e695c1 100644 --- a/optimum/bettertransformer/models/decoder_models.py +++ b/optimum/bettertransformer/models/decoder_models.py @@ -327,9 +327,9 @@ def __init__(self, layer: "nn.Module", config: "PretrainedConfig"): setattr(self, "relative_attention_bias", layer.relative_attention_bias) self.original_layers_mapping["relative_attention_bias"] = "relative_attention_bias" - self.module_mapping = None - + self.layer_idx = getattr(layer, "layer_idx", None) self.is_decoder = layer.is_decoder + self.module_mapping = None def forward(self, *args, **kwargs): return t5_forward(self, *args, **kwargs) diff --git a/optimum/bettertransformer/transformation.py b/optimum/bettertransformer/transformation.py index a101757b6fa..b138862752e 100644 --- a/optimum/bettertransformer/transformation.py +++ b/optimum/bettertransformer/transformation.py @@ -20,7 +20,13 @@ import torch from packaging.version import parse -from ..utils import check_if_pytorch_greater, is_accelerate_available, recurse_getattr, recurse_setattr +from ..utils import ( + check_if_pytorch_greater, + check_if_torch_greater, + is_accelerate_available, + recurse_getattr, + recurse_setattr, +) from .models import BetterTransformerManager @@ -213,15 +219,18 @@ def transform( hf_config = model.config if hf_config.model_type in ["falcon", "gpt_bigcode", "llama", "whisper"]: raise ValueError( - f"Transformers now supports natively 
BetterTransformer optimizations (torch.nn.functional.scaled_dot_product_attention) for the model type {hf_config.model_type}. As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. Please upgrade to transformers>=4.36 and torch>=2.1.1 to use it. Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention." + f"Transformers now supports natively BetterTransformer optimizations (torch.nn.functional.scaled_dot_product_attention) for the model type {hf_config.model_type}. " + "As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. " + "Please upgrade to transformers>=4.36 and torch>=2.1.1 to use it. " + "Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention." ) - # Check if we have to load the model using `accelerate` - if hasattr(model, "hf_device_map"): - load_accelerate = True - hf_device_map = model.hf_device_map - else: - load_accelerate = False + if hasattr(hf_config, "_attn_implementation") and hf_config._attn_implementation == "sdpa": + raise ValueError( + "This model already uses BetterTransformer optimizations from Transformers (torch.nn.functional.scaled_dot_product_attention). " + "As such, there is no need to use `model.to_bettertransformers()` or `BetterTransformer.transform(model)` from the Optimum library. " + "Details: https://huggingface.co/docs/transformers/perf_infer_gpu_one#pytorch-scaled-dot-product-attention." + ) if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True: raise Exception( @@ -241,11 +250,20 @@ def transform( f" Currently supported models are: {BetterTransformerManager.MODEL_MAPPING.keys()}." 
) - if parse(torch.__version__) <= parse("1.14"): + if not check_if_torch_greater("2.0"): raise ValueError( f"BetterTransformer requires torch>=2.0 but {torch.__version__} is installed. Please upgrade PyTorch." ) + hf_config = model.config + + # Check if we have to load the model using `accelerate` + if hasattr(model, "hf_device_map"): + load_accelerate = True + hf_device_map = model.hf_device_map + else: + load_accelerate = False + if load_accelerate: # Remove the hooks from the original model to avoid weights being on `meta` device. remove_hook_from_module(model, recurse=True) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index e77f649f69b..9e57128c272 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -155,7 +155,7 @@ class SplinterOnnxConfig(BertOnnxConfig): class DistilBertOnnxConfig(BertOnnxConfig): - DEFAULT_ONNX_OPSET = 11 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for transformers>=4.46.0 @property def inputs(self) -> Dict[str, Dict[int, str]]: @@ -266,10 +266,18 @@ class GPTNeoXOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig -class OPTOnnxConfig(TextDecoderOnnxConfig): - # OPT does not require position_ids input. - DEFAULT_ONNX_OPSET = 13 - NORMALIZED_CONFIG_CLASS = NormalizedTextConfig +# OPT does not take position_ids as input for transformers < v4.46, needs it for transformers >= v4.46 +if check_if_transformers_greater("4.45.99"): + + class OPTOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + +else: + + class OPTOnnxConfig(TextDecoderOnnxConfig): + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14.
+ NORMALIZED_CONFIG_CLASS = NormalizedTextConfig class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): @@ -304,6 +312,15 @@ class Phi3OnnxConfig(PhiOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfigWithGQA MIN_TRANSFORMERS_VERSION = version.parse("4.41.0") + def __init__(self, *args, **kwargs): + # TODO : replace check_if_transformers_greater with is_transformers_available + if check_if_transformers_greater("4.46.0") and not check_if_transformers_greater("4.46.1"): + logger.error( + "Found transformers v4.46.0 while trying to exporting a Phi3 model, this specific version of transformers is not supported. " + "Please upgrade to v4.46.1 or higher, or downgrade your transformers version" + ) + super().__init__(*args, **kwargs) + class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35 @@ -480,7 +497,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int class T5OnnxConfig(TextSeq2SeqOnnxConfig): - DEFAULT_ONNX_OPSET = 13 + DEFAULT_ONNX_OPSET = 14 # T5 uses aten::triu that requires opset>=14 DUMMY_INPUT_GENERATOR_CLASSES = TextSeq2SeqOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES[:-1] + ( T5DummySeq2SeqPastKeyValuesGenerator, ) @@ -2027,6 +2044,7 @@ class TrOCROnnxConfig(TextSeq2SeqOnnxConfig): class VisionEncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig ATOL_FOR_VALIDATION = 1e-3 + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. 
DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator) @@ -2156,8 +2174,21 @@ class Pix2StructOnnxConfig(OnnxSeq2SeqConfigWithPast): DummySeq2SeqPastKeyValuesGenerator, DummyPix2StructInputGenerator, ) - # Min operator needs to support int64, which is the case for opset>=12 - DEFAULT_ONNX_OPSET = 12 + + DEFAULT_ONNX_OPSET = 14 # use 'aten::triu' now which is opset 14 + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # TODO : replace check_if_transformers_greater with is_transformers_available + if ( + check_if_transformers_greater("4.46.0") + and not check_if_transformers_greater("4.46.1") + and self._behavior is ConfigBehavior.DECODER + ): + logger.error( + "Found transformers v4.46.0 while trying to exporting a Pix2Struct model, this specific version of transformers is not supported. " + "Please upgrade to v4.46.1 or higher, or downgrade your transformers version" + ) @property def inputs(self): @@ -2310,3 +2341,5 @@ def overwrite_shape_and_generate_input( class EncoderDecoderOnnxConfig(EncoderDecoderBaseOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig + + DEFAULT_ONNX_OPSET = 14 # uses SDPA in Transformers, hence opset>=14. 
diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 34ed5fcae46..fdfb0e280f5 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -34,11 +34,10 @@ if _transformers_version > version.parse("4.34.99"): - from transformers.modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask + from transformers.modeling_attn_mask_utils import AttentionMaskConverter if _transformers_version >= version.parse("4.36"): from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa else: - _prepare_4d_causal_attention_mask = None _prepare_4d_causal_attention_mask_for_sdpa = None AttentionMaskConverter = None diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 675566ba23e..56249bbf5c3 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -27,7 +27,7 @@ is_diffusers_available, logging, ) -from ...utils.import_utils import _diffusers_version +from ...utils.import_utils import _diffusers_version, check_if_transformers_greater from ..utils import ( _get_submodels_and_export_configs, ) @@ -89,6 +89,10 @@ } +if check_if_transformers_greater("4.45.99"): + MODEL_TYPES_REQUIRING_POSITION_IDS.add("opt") + + def check_onnxruntime_requirements(minimum_version: version.Version): """ Checks that ONNX Runtime is installed and if version is recent enough. 
diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index bda3ec98d9a..984d7f22ebf 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -582,7 +582,8 @@ def _from_pretrained( init_cls = ORTFalconForCausalLM elif config.model_type == "mpt": init_cls = ORTMPTForCausalLM - elif config.model_type == "opt": + # if model was exported with position_ids it means the model was exported with transformers >= v4.46 + elif config.model_type == "opt" and "position_ids" not in input_dims: init_cls = ORTOPTForCausalLM elif config.model_type == "gpt_bigcode": init_cls = ORTGPTBigCodeForCausalLM @@ -839,7 +840,6 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwarg attention_mask = kwargs.get("attention_mask", None) use_cache = kwargs.get("use_cache", None) - return { "input_ids": input_ids, "past_key_values": past_key_values, diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 5d5044e63e1..db7d1f6975d 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -29,6 +29,7 @@ TRANSFORMERS_MINIMUM_VERSION, check_if_diffusers_greater, check_if_pytorch_greater, + check_if_torch_greater, check_if_transformers_greater, is_accelerate_available, is_auto_gptq_available, diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 4a57fda79ce..35a6294ab52 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -193,6 +193,22 @@ def check_if_diffusers_greater(target_version: str) -> bool: return version.parse(_diffusers_version) >= version.parse(target_version) +def check_if_torch_greater(target_version: str) -> bool: + """ + Checks whether the current install of torch is greater than or equal to the target version. + + Args: + target_version (str): version used as the reference for comparison. + + Returns: + bool: whether the check is True or not. 
+ """ + if not is_torch_available(): + return False + + return torch_version >= version.parse(target_version) + + @contextmanager def require_numpy_strictly_lower(package_version: str, message: str): if not version.parse(np.__version__) < version.parse(package_version): diff --git a/setup.py b/setup.py index 822d8be1b80..82892bfcc8c 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.29", + "transformers>=4.29", "torch>=1.11", "packaging", "numpy", @@ -37,9 +37,9 @@ "diffusers>=0.17.0", "torchaudio", "einops", - "invisible-watermark", "timm", "scikit-learn", + "sentencepiece", "rjieba", ] @@ -54,7 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "transformers<4.46.0", + "transformers<4.47.0", ], "onnxruntime-gpu": [ "onnx", @@ -63,10 +63,20 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. - "transformers<4.46.0", + "transformers<4.47.0", + ], + "exporters": [ + "onnx", + "onnxruntime", + "timm", + "transformers<4.47.0", + ], + "exporters-gpu": [ + "onnx", + "onnxruntime-gpu", + "timm", + "transformers<4.47.0", ], - "exporters": ["onnx", "onnxruntime", "timm", "transformers<4.46.0"], - "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm", "transformers<4.46.0"], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", "tf2onnx", @@ -76,7 +86,7 @@ "h5py", "numpy<1.24.0", "datasets<=2.16", - "transformers[sentencepiece]>=4.26,<4.38", + "transformers>=4.26,<4.38", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", diff --git a/tests/bettertransformer/test_audio.py b/tests/bettertransformer/test_audio.py index be01a92d447..caca91e27ca 100644 --- a/tests/bettertransformer/test_audio.py +++ b/tests/bettertransformer/test_audio.py @@ -35,7 +35,7 @@ class TestsWhisper(unittest.TestCase): def test_error_message(self): - model = AutoModel.from_pretrained("openai/whisper-tiny") + model = AutoModel.from_pretrained("openai/whisper-tiny", 
attn_implementation="eager") with self.assertRaises(ValueError) as cm: model = BetterTransformer.transform(model) @@ -82,15 +82,19 @@ def _test_fp16_inference( set_seed(0) if not use_to_operator: - hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0) + hf_random_model = automodel_class.from_pretrained( + model_id, torch_dtype=torch.float16, attn_implementation="eager" + ).to(0) converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=False) - hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0) + hf_random_model = automodel_class.from_pretrained( + model_id, torch_dtype=torch.float16, attn_implementation="eager" + ).to(0) else: - hf_random_model = automodel_class.from_pretrained(model_id).to(0) + hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0) converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=False) - hf_random_model = automodel_class.from_pretrained(model_id).to(0) + hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0) hf_random_model = hf_random_model.to(torch.float16) converted_model = converted_model.to(torch.float16) @@ -147,7 +151,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int): model_id = MODELS_DICT[model_type] processor = AutoProcessor.from_pretrained(model_id) - model = AutoModel.from_pretrained(model_id) + model = AutoModel.from_pretrained(model_id, attn_implementation="eager") text = ["This is me and me"] if batch_size > 1: @@ -217,14 +221,14 @@ def test_logits(self, model_type: str): inputs = self.prepare_inputs_for_class(model_id, model_type) torch.manual_seed(0) - hf_random_model = AutoModel.from_pretrained(model_id).eval() + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() random_config = hf_random_model.config torch.manual_seed(0) converted_model = 
BetterTransformer.transform(hf_random_model) torch.manual_seed(0) - hf_random_model = AutoModel.from_pretrained(model_id).eval() + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() random_config = hf_random_model.config self.assertFalse( diff --git a/tests/bettertransformer/test_common.py b/tests/bettertransformer/test_common.py index 35b89d2ed2e..b8bc0a3b3d9 100644 --- a/tests/bettertransformer/test_common.py +++ b/tests/bettertransformer/test_common.py @@ -28,7 +28,7 @@ class BetterTransformerIntegrationTests(unittest.TestCase): def test_raise_error_on_double_transform_call(self): - model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-BertModel") + model = AutoModel.from_pretrained("hf-internal-testing/tiny-random-BertModel", attn_implementation="eager") with self.assertRaises(Exception) as cm: bt_model = BetterTransformer.transform(model) @@ -59,7 +59,7 @@ def test_raise_on_save(self, model_type: str): ) for model_id in model_ids: with self.assertRaises(ValueError), tempfile.TemporaryDirectory() as tmpdirname: - hf_model = AutoModel.from_pretrained(model_id).eval() + hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() bt_model = BetterTransformer.transform(hf_model, keep_original_model=False) bt_model.save_pretrained(tmpdirname) @@ -73,7 +73,7 @@ def test_conversion(self, model_type: str): MODELS_DICT[model_type] if isinstance(MODELS_DICT[model_type], tuple) else (MODELS_DICT[model_type],) ) for model_id in model_ids: - hf_random_model = AutoModel.from_pretrained(model_id) + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager") converted_model = BetterTransformer.transform(hf_random_model) self.assertTrue( @@ -99,7 +99,7 @@ def test_raise_save_pretrained_error(self, test_name: str, model_type: str, keep ) for model_id in model_ids: # get hf and bt model - hf_model = AutoModel.from_pretrained(model_id) + hf_model = AutoModel.from_pretrained(model_id, 
attn_implementation="eager") # get bt model and invert it bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model) @@ -145,9 +145,11 @@ def test_raise_activation_fun(self, model_type: str): )() # random config class for the model to test hf_random_config.hidden_act = "silu" - hf_random_model = AutoModel.from_config(hf_random_config).eval() + hf_random_model = AutoModel.from_config(hf_random_config, attn_implementation="eager").eval() + with self.assertRaises(ValueError) as cm: _ = BetterTransformer.transform(hf_random_model, keep_original_model=True) + self.assertTrue("Activation function" in str(cm.exception)) def test_dict_class_consistency(self): diff --git a/tests/bettertransformer/test_decoder.py b/tests/bettertransformer/test_decoder.py index bab8f376fcc..e2bc6ddc2fb 100644 --- a/tests/bettertransformer/test_decoder.py +++ b/tests/bettertransformer/test_decoder.py @@ -131,7 +131,7 @@ def test_logits_with_cache(self, test_name: str, model_type: str, batch_size: in model_id = MODELS_DICT[model_type] - model = AutoModelForCausalLM.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager") normalized_config = NormalizedConfigManager.get_normalized_config_class(model.config.model_type)(model.config) @@ -167,7 +167,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int, padd model_id = MODELS_DICT[model_type] tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModelForCausalLM.from_pretrained(model_id) + model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="eager") if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: if tokenizer.eos_token != "": @@ -224,7 +224,9 @@ def test_invert_model_logits(self, test_name: str, model_type: str, keep_origina @require_torch_gpu @require_accelerate def test_accelerate_compatibility_cpu_gpu(self, keep_original_model=True, max_memory=None): - hf_model = 
AutoModelForCausalLM.from_pretrained("gpt2", device_map="auto", max_memory=max_memory).eval() + hf_model = AutoModelForCausalLM.from_pretrained( + "gpt2", device_map="auto", max_memory=max_memory, attn_implementation="eager" + ).eval() bt_model = BetterTransformer.transform( hf_model, keep_original_model=keep_original_model, max_memory=max_memory ) diff --git a/tests/bettertransformer/test_encoder.py b/tests/bettertransformer/test_encoder.py index 74aacaed58c..7dd42c43b05 100644 --- a/tests/bettertransformer/test_encoder.py +++ b/tests/bettertransformer/test_encoder.py @@ -181,7 +181,9 @@ def check_accelerate_compatibility_cpu_gpu(self, keep_original_model=True, max_m If this works for roberta, it should work for all other models too. """ - hf_model = AutoModel.from_pretrained("xlm-roberta-base", device_map="auto", max_memory=max_memory).eval() + hf_model = AutoModel.from_pretrained( + "xlm-roberta-base", device_map="auto", max_memory=max_memory, attn_implementation="eager" + ).eval() bt_model = BetterTransformer.transform( hf_model, keep_original_model=keep_original_model, max_memory=max_memory ) diff --git a/tests/bettertransformer/test_encoder_decoder.py b/tests/bettertransformer/test_encoder_decoder.py index 8d05923522a..5ce4d62b12c 100644 --- a/tests/bettertransformer/test_encoder_decoder.py +++ b/tests/bettertransformer/test_encoder_decoder.py @@ -153,7 +153,7 @@ def test_generation(self, test_name: str, model_type: str, batch_size: int, padd model_id = MODELS_DICT[model_type] tokenizer = AutoTokenizer.from_pretrained(model_id) - model = AutoModelForSeq2SeqLM.from_pretrained(model_id) + model = AutoModelForSeq2SeqLM.from_pretrained(model_id, attn_implementation="eager") if not hasattr(tokenizer, "pad_token") or tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.eos_token diff --git a/tests/bettertransformer/test_gpu.py b/tests/bettertransformer/test_gpu.py index b992b90d3c8..ada38e408fa 100644 --- a/tests/bettertransformer/test_gpu.py +++ 
b/tests/bettertransformer/test_gpu.py @@ -26,7 +26,9 @@ def timing_cuda(model, num_batches, input_ids, masks, decoder_input_ids): def benchmark(model_name: str, num_batches: int, batch_size: int, max_seqlen: int, is_half: bool): - hf_model = AutoModel.from_pretrained(model_name, torch_dtype=torch.float16 if is_half else None).eval() + hf_model = AutoModel.from_pretrained( + model_name, torch_dtype=torch.float16 if is_half else None, attn_implementation="eager" + ).eval() hf_model = hf_model.to("cuda:0") bt_model = BetterTransformer.transform(hf_model, keep_original_model=True) diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index 098882180aa..f79cbb34512 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -136,10 +136,12 @@ def _test_fp16_inference( torch.manual_seed(0) if not use_to_operator: - hf_random_model = automodel_class.from_pretrained(model_id, torch_dtype=torch.float16).to(0) + hf_random_model = automodel_class.from_pretrained( + model_id, torch_dtype=torch.float16, attn_implementation="eager" + ).to(0) converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=True) else: - hf_random_model = automodel_class.from_pretrained(model_id).to(0) + hf_random_model = automodel_class.from_pretrained(model_id, attn_implementation="eager").to(0) converted_model = BetterTransformer.transform(hf_random_model, keep_original_model=True) hf_random_model = hf_random_model.to(torch.float16) converted_model = converted_model.to(torch.float16) @@ -169,7 +171,7 @@ def _test_fp16_inference( def _test_logits_backward(self, model_id: str, model_type: str, **preprocessor_kwargs): inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **preprocessor_kwargs) - hf_random_model = AutoModel.from_pretrained(model_id).eval() + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() random_config = 
hf_random_model.config # I could not obtain reproducible results with `torch.manual_seed` nor with @@ -309,7 +311,7 @@ def _test_train_decoder(self, model_id: str, model_type: str, **kwargs): """ inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **kwargs) - hf_random_model = AutoModel.from_pretrained(model_id).eval() + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() bt_model = BetterTransformer.transform(hf_random_model, keep_original_model=True) bt_model.train() @@ -328,7 +330,7 @@ def _test_invert_modules(self, model_id, keep_original_model=False): r""" Test that the inverse converted model and hf model have the same modules """ - hf_model = AutoModel.from_pretrained(model_id) + hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager") hf_modules = list(hf_model.modules()) bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model) @@ -349,7 +351,7 @@ def _test_invert_modules(self, model_id, keep_original_model=False): def _test_save_load_invertible(self, model_id, keep_original_model=True): with tempfile.TemporaryDirectory() as tmpdirname: - hf_model = AutoModel.from_pretrained(model_id).eval() + hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() hf_model_state_dict = copy.deepcopy(hf_model.state_dict()) bt_model = BetterTransformer.transform(hf_model, keep_original_model=keep_original_model) @@ -362,7 +364,7 @@ def _test_save_load_invertible(self, model_id, keep_original_model=True): # saving a normal transformers bark model fails because of shared tensors bt_model.save_pretrained(tmpdirname, safe_serialization=hf_model.config.model_type != "bark") - bt_model_from_load = AutoModel.from_pretrained(tmpdirname) + bt_model_from_load = AutoModel.from_pretrained(tmpdirname, attn_implementation="eager") self.assertEqual( set(bt_model.state_dict().keys()), @@ -397,7 +399,7 @@ def _test_invert_model_logits( """ inputs = 
self.prepare_inputs_for_class(model_id, model_type=model_type, **preprocessor_kwargs) - hf_model = AutoModel.from_pretrained(model_id) + hf_model = AutoModel.from_pretrained(model_id, attn_implementation="eager") hf_model = hf_model.eval() with torch.inference_mode(): diff --git a/tests/onnx/test_onnx_export_custom_module.py b/tests/onnx/test_onnx_export_custom_module.py index a144d5cd840..4398c14f01d 100644 --- a/tests/onnx/test_onnx_export_custom_module.py +++ b/tests/onnx/test_onnx_export_custom_module.py @@ -24,6 +24,8 @@ import torch from transformers.models.deberta import modeling_deberta + from optimum.utils import check_if_torch_greater + class StableDropoutTestCase(TestCase): """Tests export of StableDropout module.""" @@ -50,8 +52,8 @@ def test_training(self): training=training, ) - # Expected to fail with opset_version < 12 - with self.assertRaises(Exception): + if check_if_torch_greater("2.5"): + # Expected to pass with opset_version < 12 on torch >= 2.5 torch.onnx.export( sd, input, @@ -60,3 +62,14 @@ def test_training(self): do_constant_folding=do_constant_folding, training=training, ) + else: + # Expected to fail with opset_version < 12 on torch < 2.5 + with self.assertRaises(Exception): + torch.onnx.export( + sd, + input, + devnull, + opset_version=11, + do_constant_folding=do_constant_folding, + training=training, + ) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index da450b8e31c..597eb581e2a 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -54,6 +54,7 @@ AutoModelForTokenClassification, AutoModelForVision2Seq, AutoTokenizer, + GenerationConfig, MBartForConditionalGeneration, Pix2StructForConditionalGeneration, # Pix2Struct does not work with AutoModel PretrainedConfig, @@ -106,7 +107,7 @@ DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, logging, ) -from optimum.utils.import_utils import is_diffusers_available +from optimum.utils.import_utils import 
check_if_transformers_greater, is_diffusers_available from optimum.utils.testing_utils import ( grid_parameters, remove_directory, @@ -2326,10 +2327,12 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "llama", "mistral", "mpt", - "phi3", - "qwen2", + "opt", ] + if check_if_transformers_greater("4.40"): + SUPPORTED_ARCHITECTURES.extend(["gemma", "phi3", "qwen2"]) + FULL_GRID = { "model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [False, True], @@ -2338,7 +2341,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): ORTMODEL_CLASS = ORTModelForCausalLM TASK = "text-generation" - GENERATION_LENGTH = 100 + GENERATION_LENGTH = 90 SPEEDUP_CACHE = 1.1 @parameterized.expand([(False,), (True,)]) @@ -2411,7 +2414,7 @@ def test_merge_from_onnx_and_save(self, model_arch): self.assertNotIn(ONNX_DECODER_WITH_PAST_NAME, folder_contents) self.assertNotIn(ONNX_WEIGHTS_NAME, folder_contents) - @parameterized.expand(grid_parameters({**FULL_GRID, "num_beams": [1, 3]})) + @parameterized.expand(grid_parameters({**FULL_GRID, "num_beams": [1, 4]})) def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cache: bool, num_beams: int): use_io_binding = None if use_cache is False: @@ -2474,25 +2477,39 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach # TODO: remove once https://github.com/huggingface/transformers/pull/26873 is released, falcon is broken in transformers new_tokens = 5 - onnx_outputs = onnx_model.generate( - **tokens, - num_beams=num_beams, - do_sample=False, - min_new_tokens=new_tokens, - max_new_tokens=new_tokens, - eos_token_id=None, - ) + gen_kwargs = { + "max_new_tokens": new_tokens, + "min_new_tokens": new_tokens, + "eos_token_id": None, + "num_beams": num_beams, + } - transformers_outputs = transformers_model.generate( - **tokens, - num_beams=num_beams, - do_sample=False, - min_new_tokens=new_tokens, - max_new_tokens=new_tokens, - eos_token_id=None, - ) + beam_search_gen_config = 
GenerationConfig(do_sample=False, **gen_kwargs) + + if use_cache and num_beams == 4: + beam_sample_gen_config = GenerationConfig(do_sample=True, **gen_kwargs) + group_beam_search_gen_config = GenerationConfig( + do_sample=False, num_beam_groups=2, diversity_penalty=0.0000001, **gen_kwargs + ) + gen_configs = ( + beam_search_gen_config, + beam_sample_gen_config, + group_beam_search_gen_config, + ) + else: + gen_configs = (beam_search_gen_config,) - self.assertTrue(torch.allclose(onnx_outputs, transformers_outputs)) + for gen_config in gen_configs: + set_seed(SEED) + with torch.no_grad(): + transformers_outputs = transformers_model.generate(**tokens, generation_config=gen_config) + set_seed(SEED) + onnx_outputs = onnx_model.generate(**tokens, generation_config=gen_config) + + self.assertTrue( + torch.equal(onnx_outputs, transformers_outputs), + f"Failed with generation config : {gen_config}, transformers outputs {transformers_outputs}, ONNX model outputs {onnx_outputs}", + ) gc.collect() diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 5071d0081af..e3d54237857 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -125,6 +125,7 @@ "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", "mt5": "lewtun/tiny-random-mt5", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", + "opt": "hf-internal-testing/tiny-random-OPTModel", "pegasus": "hf-internal-testing/tiny-random-PegasusModel", "perceiver_text": "hf-internal-testing/tiny-random-language_perceiver", "perceiver_vision": "hf-internal-testing/tiny-random-vision_perceiver_conv", From 7e8d857d1ed6be32046324bf8f424690f116b4e9 Mon Sep 17 00:00:00 2001 From: Gabe Goodhart Date: Thu, 31 Oct 2024 14:54:05 -0600 Subject: [PATCH 45/73] Add ONNX export support for granite models (#2043) * feat(exporters/onnx): Add GraniteOnnxConfig and task support list Branch: OnnxGranite Signed-off-by: Gabe 
Goodhart * feat: Add granite's normalized config for inference Branch: OnnxGranite Signed-off-by: Gabe Goodhart * feat(onnx opt): Add onnx optimization support for granite Branch: OnnxGranite Signed-off-by: Gabe Goodhart * fix(onnx/granite): Use LlamaOnnxConfig as the base for GraniteOnnxConfig Branch: OnnxGranite Signed-off-by: Gabe Goodhart * fix(onnxruntime): Add "granite" to list of model types with grouped attention Branch: OnnxGranite Signed-off-by: Gabe Goodhart * fix: Add granite to the list of models that require position_ids Branch: OnnxGranite Signed-off-by: Gabe Goodhart * fix(granite): Add MIN_TORCH_VERSION for recently fixed torch bug https://github.com/huggingface/optimum/pull/2043#issuecomment-2427975461 Branch: OnnxGranite Signed-off-by: Gabe Goodhart * test(granite): Add tiny random granite test for onnx exporter Branch: OnnxGranite Signed-off-by: Gabe Goodhart * tests(onnxruntime): Add granite to onnxruntime tests Branch: OnnxGranite Signed-off-by: Gabe Goodhart --------- Signed-off-by: Gabe Goodhart --- optimum/exporters/onnx/model_configs.py | 5 +++++ optimum/exporters/onnx/utils.py | 1 + optimum/exporters/tasks.py | 7 +++++++ optimum/onnxruntime/modeling_decoder.py | 2 +- optimum/onnxruntime/utils.py | 1 + optimum/utils/normalized_config.py | 1 + tests/exporters/exporters_utils.py | 1 + tests/onnxruntime/test_modeling.py | 1 + tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 9 files changed, 19 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 9e57128c272..cc752779d30 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -298,6 +298,11 @@ class GemmaOnnxConfig(LlamaOnnxConfig): pass +class GraniteOnnxConfig(LlamaOnnxConfig): + MIN_TRANSFORMERS_VERSION = version.parse("4.45.0") + MIN_TORCH_VERSION = version.parse("2.5.0") + + class PhiOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 # Phi now 
uses F.scaled_dot_product_attention by default for torch>=2.1.1. NORMALIZED_CONFIG_CLASS = NormalizedTextConfig diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 56249bbf5c3..19e24f88743 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -86,6 +86,7 @@ "phi", "phi3", "qwen2", + "granite", } diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index a489f34fb06..fdc8bfcb539 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -915,6 +915,13 @@ class TasksManager: "text-classification", onnx="LlamaOnnxConfig", ), + "granite": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + onnx="GraniteOnnxConfig", + ), "pegasus": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index 984d7f22ebf..8f1d062221a 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -340,7 +340,7 @@ def prepare_past_key_values( if self.model_type == "gemma": num_attention_heads = self.normalized_config.num_key_value_heads embed_size_per_head = self.normalized_config.head_dim - elif self.model_type in {"mistral", "llama", "qwen2"}: + elif self.model_type in {"mistral", "llama", "qwen2", "granite"}: num_attention_heads = self.normalized_config.num_key_value_heads else: num_attention_heads = self.normalized_config.num_attention_heads diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 128e2406f11..9e92e0bd325 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -128,6 +128,7 @@ class ORTConfigManager: "gpt-neo": "gpt2", "gpt-neox": "gpt2", "gptj": "gpt2", + "granite": "gpt2", # longt5 with O4 results in segmentation fault "longt5": "bert", "llama": "gpt2", diff --git 
a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 81207b76496..9ceed24c2dd 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -281,6 +281,7 @@ class NormalizedConfigManager: "xlm-roberta": NormalizedTextConfig, "yolos": NormalizedVisionConfig, "qwen2": NormalizedTextConfig, + "granite": NormalizedTextConfigWithGQA, } @classmethod diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index c8a33b0be35..ccccb5510bf 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -100,6 +100,7 @@ "gpt-neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt-neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", "ibert": "hf-internal-testing/tiny-random-IBertModel", "imagegpt": "hf-internal-testing/tiny-random-ImageGPTModel", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 597eb581e2a..a335e014478 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2324,6 +2324,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "gpt_neo", "gpt_neox", "gptj", + "granite", "llama", "mistral", "mpt", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index e3d54237857..9f200e69b3d 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -104,6 +104,7 @@ "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", "gptj": "hf-internal-testing/tiny-random-GPTJForCausalLM", + "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", "hubert": 
"hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", From 35eebfe62bf721bbab365f569bd0c73057239732 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 5 Nov 2024 10:01:46 +0100 Subject: [PATCH 46/73] Drop python 3.8 (#2086) * drop python 3.8 * fix * add python 3.11 --- .github/workflows/dev_test_benckmark.yml | 8 ++------ .github/workflows/dev_test_bettertransformer.yml | 6 ++---- .github/workflows/dev_test_dummy_inputs.yml | 4 +--- .github/workflows/dev_test_exporters.yml | 8 ++------ .github/workflows/dev_test_fx.yml | 4 +--- .github/workflows/dev_test_onnx.yml | 4 +--- .github/workflows/dev_test_onnxruntime.yml | 4 +--- .github/workflows/dev_test_optimum_common.yml | 5 +---- .github/workflows/test_export_onnx.yml | 2 +- .github/workflows/test_export_tflite.yml | 5 ++--- .github/workflows/test_export_tflite_cli.yml | 5 ++--- .../test_export_tflite_cli_dynamic_quantization_int8.yml | 5 ++--- .../test_export_tflite_cli_quantization_fp16.yml | 5 ++--- .../test_export_tflite_cli_quantization_full_int8.yml | 5 ++--- ...export_tflite_cli_quantization_int8_custom_dataset.yml | 5 ++--- ...xport_tflite_cli_quantization_int8_default_dataset.yml | 5 ++--- .../test_export_tflite_cli_quantization_int8x16.yml | 5 ++--- .github/workflows/test_exporters_common.yml | 5 ++--- .github/workflows/test_exporters_slow.yml | 5 ++--- .github/workflows/test_fx.yml | 2 +- .github/workflows/test_offline.yml | 5 ++--- .github/workflows/test_onnx.yml | 2 +- .github/workflows/test_onnxruntime.yml | 2 +- .github/workflows/test_onnxruntime_slow.yml | 2 +- .github/workflows/test_optimum_common.yml | 4 ++-- .github/workflows/test_utils.yml | 2 +- setup.py | 7 ++++--- 27 files changed, 45 insertions(+), 76 deletions(-) diff --git a/.github/workflows/dev_test_benckmark.yml b/.github/workflows/dev_test_benckmark.yml index 5f6fc825021..a898d288625 100644 --- a/.github/workflows/dev_test_benckmark.yml +++ 
b/.github/workflows/dev_test_benckmark.yml @@ -12,12 +12,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 - os: - - ubuntu-20.04 - runs-on: ${{ matrix.os }} + python-version: ['3.9', '3.11'] + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/dev_test_bettertransformer.yml b/.github/workflows/dev_test_bettertransformer.yml index e4c999ca6da..e75b5e3bf98 100644 --- a/.github/workflows/dev_test_bettertransformer.yml +++ b/.github/workflows/dev_test_bettertransformer.yml @@ -12,18 +12,16 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 os: - ubuntu-20.04 - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} + - name: Setup Python uses: actions/setup-python@v2 with: - python-version: ${{ matrix.python-version }} + python-version: '3.9' - name: Install dependencies run: | pip install .[tests] diff --git a/.github/workflows/dev_test_dummy_inputs.yml b/.github/workflows/dev_test_dummy_inputs.yml index 49baa49c418..72a4763e432 100644 --- a/.github/workflows/dev_test_dummy_inputs.yml +++ b/.github/workflows/dev_test_dummy_inputs.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - macos-13 diff --git a/.github/workflows/dev_test_exporters.yml b/.github/workflows/dev_test_exporters.yml index 5d967d125f5..b2dee3ed3a9 100644 --- a/.github/workflows/dev_test_exporters.yml +++ b/.github/workflows/dev_test_exporters.yml @@ -12,12 +12,8 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 - os: - - ubuntu-20.04 - runs-on: ${{ matrix.os }} + python-version: ['3.9', '3.11'] + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/dev_test_fx.yml 
b/.github/workflows/dev_test_fx.yml index 0b8633282f7..a0c54c78365 100644 --- a/.github/workflows/dev_test_fx.yml +++ b/.github/workflows/dev_test_fx.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - macos-13 diff --git a/.github/workflows/dev_test_onnx.yml b/.github/workflows/dev_test_onnx.yml index 48052cfded3..f7514e1c5e5 100644 --- a/.github/workflows/dev_test_onnx.yml +++ b/.github/workflows/dev_test_onnx.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - macos-13 diff --git a/.github/workflows/dev_test_onnxruntime.yml b/.github/workflows/dev_test_onnxruntime.yml index 857028ab2db..c9104ebbd6c 100644 --- a/.github/workflows/dev_test_onnxruntime.yml +++ b/.github/workflows/dev_test_onnxruntime.yml @@ -12,9 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - windows-2019 diff --git a/.github/workflows/dev_test_optimum_common.yml b/.github/workflows/dev_test_optimum_common.yml index 807ed0b1dab..117db50437b 100644 --- a/.github/workflows/dev_test_optimum_common.yml +++ b/.github/workflows/dev_test_optimum_common.yml @@ -12,10 +12,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: - - 3.7 - - 3.8 - - 3.9 + python-version: ['3.9', '3.11'] os: - ubuntu-20.04 - windows-2019 diff --git a/.github/workflows/test_export_onnx.yml b/.github/workflows/test_export_onnx.yml index 0cd19a1724c..d1fd4a9723f 100644 --- a/.github/workflows/test_export_onnx.yml +++ b/.github/workflows/test_export_onnx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_export_tflite.yml b/.github/workflows/test_export_tflite.yml index 362390b166d..225a28c1cba 
100644 --- a/.github/workflows/test_export_tflite.yml +++ b/.github/workflows/test_export_tflite.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9', '3.11'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli.yml b/.github/workflows/test_export_tflite_cli.yml index e14e4cde325..cfca58cf9c1 100644 --- a/.github/workflows/test_export_tflite_cli.yml +++ b/.github/workflows/test_export_tflite_cli.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9', '3.11'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml b/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml index 7e4a83b3b7b..9cebe8ac0f6 100644 --- a/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml +++ b/.github/workflows/test_export_tflite_cli_dynamic_quantization_int8.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_fp16.yml b/.github/workflows/test_export_tflite_cli_quantization_fp16.yml index 981dd005e52..ca35ad8b3eb 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_fp16.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_fp16.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ 
matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml b/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml index 9064bfaf315..1531ffa5c9c 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_full_int8.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml b/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml index 824e8933a08..7274d09c0f8 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_int8_custom_dataset.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml b/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml index e975997e379..6c8639ebfe0 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_int8_default_dataset.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup 
Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml b/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml index ef59cff0b92..39902d0dd50 100644 --- a/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml +++ b/.github/workflows/test_export_tflite_cli_quantization_int8x16.yml @@ -20,10 +20,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.8, 3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_exporters_common.yml b/.github/workflows/test_exporters_common.yml index 11f6038afe4..801e0bebc55 100644 --- a/.github/workflows/test_exporters_common.yml +++ b/.github/workflows/test_exporters_common.yml @@ -15,10 +15,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] - os: [ubuntu-20.04] + python-version: ['3.9', '3.11'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_exporters_slow.yml b/.github/workflows/test_exporters_slow.yml index 453389d63fa..b5f142fc7dc 100644 --- a/.github/workflows/test_exporters_slow.yml +++ b/.github/workflows/test_exporters_slow.yml @@ -14,10 +14,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v2 - name: Setup Python ${{ matrix.python-version }} diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml index a4e6dd3cd29..0a1890cc715 100644 --- a/.github/workflows/test_fx.yml +++ b/.github/workflows/test_fx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04, macos-13] runs-on: ${{ 
matrix.os }} diff --git a/.github/workflows/test_offline.yml b/.github/workflows/test_offline.yml index 20911fe6db8..29b7b183bd7 100644 --- a/.github/workflows/test_offline.yml +++ b/.github/workflows/test_offline.yml @@ -15,10 +15,9 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] - os: [ubuntu-20.04] + python-version: ['3.9'] - runs-on: ${{ matrix.os }} + runs-on: ubuntu-20.04 steps: - name: Checkout code uses: actions/checkout@v4 diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index dd1f3bee63d..418a9e42c1a 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -15,7 +15,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04, macos-14] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 0ab95752d01..089300f7cd9 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -39,7 +39,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: '3.9' - name: Install dependencies run: | diff --git a/.github/workflows/test_onnxruntime_slow.yml b/.github/workflows/test_onnxruntime_slow.yml index c5679e5b307..89d44e57ad1 100644 --- a/.github/workflows/test_onnxruntime_slow.yml +++ b/.github/workflows/test_onnxruntime_slow.yml @@ -14,7 +14,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04] runs-on: ${{ matrix.os }} diff --git a/.github/workflows/test_optimum_common.yml b/.github/workflows/test_optimum_common.yml index 5ad42807a5f..9aab45e4b71 100644 --- a/.github/workflows/test_optimum_common.yml +++ b/.github/workflows/test_optimum_common.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [3.9] + python-version: ['3.9'] os: [ubuntu-20.04, windows-2019, macos-13] runs-on: ${{ matrix.os 
}} @@ -36,5 +36,5 @@ jobs: shell: bash run: | # Setting HUGGINGFACE_CO_STAGING to true for only one job of the matrix as the staging tests cannot run in parallel. - export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.8' && matrix.os == 'ubuntu-20.04' }} + export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == '3.9' && matrix.os == 'ubuntu-20.04' }} pytest tests/test_*.py diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml index b5f2e27fc6a..0126b023c60 100644 --- a/.github/workflows/test_utils.yml +++ b/.github/workflows/test_utils.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-20.04, macos-13] - python-version: [3.9] + python-version: ['3.9'] runs-on: ${{ matrix.os }} steps: diff --git a/setup.py b/setup.py index 82892bfcc8c..7ea0da56c29 100644 --- a/setup.py +++ b/setup.py @@ -123,9 +123,10 @@ "Intended Audience :: Education", "Intended Audience :: Science/Research", "Operating System :: OS Independent", - "Programming Language :: Python :: 3.7", - "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3", "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", "Topic :: Scientific/Engineering :: Artificial Intelligence", ], keywords="transformers, quantization, pruning, optimization, training, inference, onnx, onnx runtime, intel, " @@ -137,7 +138,7 @@ packages=find_namespace_packages(include=["optimum*"]), install_requires=REQUIRED_PKGS, extras_require=EXTRAS_REQUIRE, - python_requires=">=3.7.0", + python_requires=">=3.9.0", include_package_data=True, zip_safe=False, entry_points={"console_scripts": ["optimum-cli=optimum.commands.optimum_cli:main"]}, From e8b03321035ea19001bcbb773444e3f0574d4150 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Fri, 15 Nov 2024 17:15:33 +0100 Subject: [PATCH 47/73] Update Dockerfile base image (#2089) upgrade base image --- 
docs/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/Dockerfile b/docs/Dockerfile index 29ea0f916ce..d76dc50c556 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -1,4 +1,4 @@ -FROM nikolaik/python-nodejs:python3.8-nodejs18 +FROM nikolaik/python-nodejs:python3.9-nodejs18 ARG commit_sha ARG clone_url From c513437511e51ccedb4f28c30e6aea9c0cf76a4a Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Mon, 18 Nov 2024 13:47:29 +0100 Subject: [PATCH 48/73] Add transformers 4.36 tests (#2085) * add transformers 4.36 tests * add test depending on tranformers version * add min transformers required version for gemma * update macos * fix whisper test * add opt * fix mpt * add comment * add granite testwhen supported by transformers --- .github/workflows/test_onnxruntime.yml | 4 ++- optimum/exporters/onnx/model_configs.py | 4 ++- setup.py | 10 +++---- tests/onnxruntime/test_modeling.py | 37 +++++++++++++++---------- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 089300f7cd9..fec5c7e5b27 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -18,8 +18,10 @@ jobs: fail-fast: false matrix: transformers-version: ["latest"] - os: [ubuntu-20.04, windows-2019, macos-13] + os: [ubuntu-20.04, windows-2019, macos-15] include: + - transformers-version: "4.36.*" + os: ubuntu-20.04 - transformers-version: "4.45.*" os: ubuntu-20.04 diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index cc752779d30..6b92109b7b6 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -295,7 +295,7 @@ class Qwen2OnnxConfig(LlamaOnnxConfig): class GemmaOnnxConfig(LlamaOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) 
DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator - pass + MIN_TRANSFORMERS_VERSION = version.parse("4.38.0") class GraniteOnnxConfig(LlamaOnnxConfig): @@ -348,6 +348,8 @@ def patch_model_for_export( class MPTOnnxConfig(TextDecoderOnnxConfig): # MPT does not require position_ids input. DEFAULT_ONNX_OPSET = 13 + # TODO: fix inference for transformers < v4.41 for beam_search > 1 + MIN_TRANSFORMERS_VERSION = version.parse("4.41.0") NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="n_heads", hidden_size="d_model", num_layers="n_layers" ) diff --git a/setup.py b/setup.py index 7ea0da56c29..29f97b604e0 100644 --- a/setup.py +++ b/setup.py @@ -54,7 +54,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "onnxruntime-gpu": [ "onnx", @@ -63,19 +63,19 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "exporters": [ "onnx", "onnxruntime", "timm", - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "exporters-gpu": [ "onnx", "onnxruntime-gpu", "timm", - "transformers<4.47.0", + "transformers>=4.36,<4.47.0", ], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", @@ -86,7 +86,7 @@ "h5py", "numpy<1.24.0", "datasets<=2.16", - "transformers>=4.26,<4.38", + "transformers>=4.36,<4.38", ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index a335e014478..84ac27029f9 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2318,21 +2318,28 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "bloom", "codegen", "falcon", - "gemma", "gpt2", "gpt_bigcode", "gpt_neo", "gpt_neox", "gptj", - "granite", "llama", "mistral", - "mpt", "opt", ] - if check_if_transformers_greater("4.40"): - SUPPORTED_ARCHITECTURES.extend(["gemma", "phi3", "qwen2"]) + if 
check_if_transformers_greater("4.37"): + SUPPORTED_ARCHITECTURES.append("qwen2") + + if check_if_transformers_greater("4.38"): + SUPPORTED_ARCHITECTURES.append("gemma") + + # TODO: fix "mpt" for which inference fails for transformers < v4.41 + if check_if_transformers_greater("4.41"): + SUPPORTED_ARCHITECTURES.extend(["phi3", "mpt"]) + + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES.append("granite") FULL_GRID = { "model_arch": SUPPORTED_ARCHITECTURES, @@ -2445,7 +2452,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach transformers_model = AutoModelForCausalLM.from_pretrained(model_id) transformers_model = transformers_model.eval() tokenizer = get_preprocessor(model_id) - tokens = tokenizer("This is a sample output", return_tensors="pt") + tokens = tokenizer("This is a sample input", return_tensors="pt") position_ids = None if model_arch.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: input_shape = tokens["input_ids"].shape @@ -2467,7 +2474,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach # Compare batched generation. 
tokenizer.pad_token_id = tokenizer.eos_token_id tokenizer.padding_side = "left" - tokens = tokenizer(["Today is a nice day and I am longer", "This is me"], return_tensors="pt", padding=True) + tokens = tokenizer(["This is", "This is a sample input"], return_tensors="pt", padding=True) onnx_model.generation_config.eos_token_id = None transformers_model.generation_config.eos_token_id = None onnx_model.config.eos_token_id = None @@ -4598,14 +4605,14 @@ def test_compare_with_and_without_past_key_values(self, model_arch: str): ) self.assertTrue(torch.equal(outputs_model_with_pkv, outputs_model_without_pkv)) - self.assertEqual( - outputs_model_with_pkv.shape[1], - self.GENERATION_LENGTH + 2 if model_arch == "whisper" else self.GENERATION_LENGTH + 1, - ) - self.assertEqual( - outputs_model_without_pkv.shape[1], - self.GENERATION_LENGTH + 2 if model_arch == "whisper" else self.GENERATION_LENGTH + 1, - ) + + if model_arch == "whisper" and check_if_transformers_greater("4.43"): + gen_length = self.GENERATION_LENGTH + 2 + else: + gen_length = self.GENERATION_LENGTH + 1 + + self.assertEqual(outputs_model_with_pkv.shape[1], gen_length) + self.assertEqual(outputs_model_without_pkv.shape[1], gen_length) self.GENERATION_LENGTH = generation_length if os.environ.get("TEST_LEVEL", 0) == "1": From 400bb82f312016b0a31b342d48b00d031786417d Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Mon, 18 Nov 2024 15:05:37 +0100 Subject: [PATCH 49/73] [`fix`] Allow ORTQuantizer over models with subfolder ONNX files (#2094) * Allow ORTQuantizer over models with subfolder ONNX files * Also catch ValueError as that seems a common fail when AutoConfig.from_pretrained("does/not/exist") * Use test case that previously failed --- optimum/onnxruntime/quantization.py | 9 +++++---- tests/onnxruntime/test_quantization.py | 8 ++++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/optimum/onnxruntime/quantization.py 
b/optimum/onnxruntime/quantization.py index 056123f8d8e..f637916dcd2 100644 --- a/optimum/onnxruntime/quantization.py +++ b/optimum/onnxruntime/quantization.py @@ -100,7 +100,7 @@ def __init__(self, onnx_model_path: Path, config: Optional["PretrainedConfig"] = if self.config is None: try: self.config = AutoConfig.from_pretrained(self.onnx_model_path.parent) - except OSError: + except (OSError, ValueError): LOGGER.warning( f"Could not load the config for {self.onnx_model_path} automatically, this might make " "the quantized model harder to use because it will not be able to be loaded by an ORTModel without " @@ -134,6 +134,7 @@ def from_pretrained( model_or_path = Path(model_or_path) path = None + config = None if isinstance(model_or_path, ORTModelForConditionalGeneration): raise NotImplementedError(ort_quantizer_error_message) elif isinstance(model_or_path, Path) and file_name is None: @@ -147,13 +148,13 @@ def from_pretrained( file_name = onnx_files[0].name if isinstance(model_or_path, ORTModel): - if path is None: - path = Path(model_or_path.model._model_path) + path = Path(model_or_path.model._model_path) + config = model_or_path.config elif os.path.isdir(model_or_path): path = Path(model_or_path) / file_name else: raise ValueError(f"Unable to load model from {model_or_path}.") - return cls(path) + return cls(path, config=config) def fit( self, diff --git a/tests/onnxruntime/test_quantization.py b/tests/onnxruntime/test_quantization.py index b6f1ebb70f6..34a9504f95a 100644 --- a/tests/onnxruntime/test_quantization.py +++ b/tests/onnxruntime/test_quantization.py @@ -30,6 +30,7 @@ AutoQuantizationConfig, ORTConfig, ORTModelForCausalLM, + ORTModelForFeatureExtraction, ORTModelForSeq2SeqLM, ORTModelForSequenceClassification, ORTQuantizer, @@ -52,6 +53,13 @@ class ORTQuantizerTest(unittest.TestCase): "optimum/distilbert-base-uncased-finetuned-sst-2-english" ) }, + "ort_model_with_onnx_model_in_subfolder": { + "model_or_path": 
ORTModelForFeatureExtraction.from_pretrained( + "sentence-transformers/all-MiniLM-L6-v2", + subfolder="onnx", + file_name="model.onnx", + ) + }, } @parameterized.expand(LOAD_CONFIGURATION.items()) From a7a807c9e712fd9669865358e34c1de072b78d8e Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Tue, 19 Nov 2024 13:10:57 +0100 Subject: [PATCH 50/73] SD3 and Flux support (#2073) * sd3 support * unsupported cli model types * flux transformer support, unet export fixes, updated callback test, updated negative prompt test, flux and sd3 tests * fixes * move input generators * dummy diffusers * style * sd3 support * unsupported cli model types * flux transformer support, unet export fixes, updated callback test, updated negative prompt test, flux and sd3 tests * fixes * move input generators * dummy diffusers * style * distribute ort tests * fix * fix * fix * test num images * single process to reduce re-exports * test * revert unnecessary changes * T5Encoder inherits from TextEncoder * style * fix typo in timestep * style * only test sd3 and flux on latest transformers * conditional sd3 and flux modeling * forgot sd3 inpaint --- .github/workflows/test_onnxruntime.yml | 13 +- optimum/exporters/onnx/base.py | 1 + optimum/exporters/onnx/convert.py | 4 + optimum/exporters/onnx/model_configs.py | 123 +++++++++-- optimum/exporters/tasks.py | 29 ++- optimum/exporters/utils.py | 190 +++++++++++------ optimum/onnxruntime/__init__.py | 72 +++++-- optimum/onnxruntime/modeling_diffusion.py | 202 +++++++++++++++++-- optimum/utils/__init__.py | 7 + optimum/utils/constant.py | 4 +- optimum/utils/dummy_diffusers_objects.py | 74 ++++++- optimum/utils/input_generators.py | 81 +++++++- tests/exporters/exporters_utils.py | 4 +- tests/exporters/onnx/test_onnx_export.py | 2 - tests/onnxruntime/test_diffusion.py | 192 +++++++++++------- tests/onnxruntime/test_modeling.py | 2 +- tests/onnxruntime/test_quantization.py | 4 +- 
tests/onnxruntime/utils_onnxruntime_tests.py | 4 +- 18 files changed, 791 insertions(+), 217 deletions(-) diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index fec5c7e5b27..b20a3b46f88 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -26,14 +26,11 @@ jobs: os: ubuntu-20.04 runs-on: ${{ matrix.os }} + steps: - name: Free Disk Space (Ubuntu) if: matrix.os == 'ubuntu-20.04' uses: jlumbroso/free-disk-space@main - with: - tool-cache: false - swap-storage: false - large-packages: false - name: Checkout code uses: actions/checkout@v4 @@ -54,13 +51,11 @@ jobs: run: pip install transformers==${{ matrix.transformers-version }} - name: Test with pytest (in series) - working-directory: tests run: | - pytest onnxruntime -m "run_in_series" --durations=0 -vvvv -s + pytest tests/onnxruntime -m "run_in_series" --durations=0 -vvvv -s - name: Test with pytest (in parallel) + run: | + pytest tests/onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto env: HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} - working-directory: tests - run: | - pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index 8cd94194ffe..7e35691d54b 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -319,6 +319,7 @@ def fix_dynamic_axes( input_shapes = {} dummy_inputs = self.generate_dummy_inputs(framework="np", **input_shapes) dummy_inputs = self.generate_dummy_inputs_for_validation(dummy_inputs, onnx_input_names=onnx_input_names) + dummy_inputs = self.rename_ambiguous_inputs(dummy_inputs) onnx_inputs = {} for name, value in dummy_inputs.items(): diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 2661d835979..c12a9ac222a 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -1183,6 +1183,10 @@ def 
onnx_export_from_model( if tokenizer_2 is not None: tokenizer_2.save_pretrained(output.joinpath("tokenizer_2")) + tokenizer_3 = getattr(model, "tokenizer_3", None) + if tokenizer_3 is not None: + tokenizer_3.save_pretrained(output.joinpath("tokenizer_3")) + model.save_config(output) if float_dtype == "bf16": diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 6b92109b7b6..8984162ee8c 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. """Model specific ONNX configurations.""" + import random from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union @@ -28,6 +29,8 @@ DummyCodegenDecoderTextInputGenerator, DummyDecoderTextInputGenerator, DummyEncodecInputGenerator, + DummyFluxTransformerTextInputGenerator, + DummyFluxTransformerVisionInputGenerator, DummyInputGenerator, DummyIntGenerator, DummyPastKeyValuesGenerator, @@ -38,6 +41,9 @@ DummySpeechT5InputGenerator, DummyTextInputGenerator, DummyTimestepInputGenerator, + DummyTransformerTextInputGenerator, + DummyTransformerTimestepInputGenerator, + DummyTransformerVisionInputGenerator, DummyVisionEmbeddingsGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator, DummyVisionInputGenerator, @@ -53,6 +59,7 @@ NormalizedTextConfig, NormalizedTextConfigWithGQA, NormalizedVisionConfig, + check_if_diffusers_greater, check_if_transformers_greater, is_diffusers_available, logging, @@ -1039,22 +1046,13 @@ def outputs(self) -> Dict[str, Dict[int, str]]: "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, "pooler_output": {0: "batch_size"}, } + if self._normalized_config.output_hidden_states: for i in range(self._normalized_config.num_layers + 1): common_outputs[f"hidden_states.{i}"] = {0: "batch_size", 1: "sequence_length"} return common_outputs - def 
generate_dummy_inputs(self, framework: str = "pt", **kwargs): - dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs) - - # TODO: fix should be by casting inputs during inference and not export - if framework == "pt": - import torch - - dummy_inputs["input_ids"] = dummy_inputs["input_ids"].to(dtype=torch.int32) - return dummy_inputs - def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], @@ -1064,7 +1062,7 @@ def patch_model_for_export( class UNetOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-3 + ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -1087,17 +1085,19 @@ class UNetOnnxConfig(VisionOnnxConfig): def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = { "sample": {0: "batch_size", 2: "height", 3: "width"}, - "timestep": {0: "steps"}, + "timestep": {}, # a scalar with no dimension "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, } - # TODO : add text_image, image and image_embeds + # TODO : add addition_embed_type == text_image, image and image_embeds + # https://github.com/huggingface/diffusers/blob/9366c8f84bfe47099ff047272661786ebb54721d/src/diffusers/models/unets/unet_2d_condition.py#L671 if getattr(self._normalized_config, "addition_embed_type", None) == "text_time": common_inputs["text_embeds"] = {0: "batch_size"} common_inputs["time_ids"] = {0: "batch_size"} if getattr(self._normalized_config, "time_cond_proj_dim", None) is not None: common_inputs["timestep_cond"] = {0: "batch_size"} + return common_inputs @property @@ -1136,7 +1136,7 @@ def ordered_inputs(self, model) -> Dict[str, Dict[int, str]]: class VaeEncoderOnnxConfig(VisionOnnxConfig): - ATOL_FOR_VALIDATION = 1e-4 + ATOL_FOR_VALIDATION = 3e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu # 
operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 @@ -1184,6 +1184,101 @@ def outputs(self) -> Dict[str, Dict[int, str]]: } +class T5EncoderOnnxConfig(TextEncoderOnnxConfig): + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 12 # int64 was supported since opset 12 + + @property + def inputs(self): + return { + "input_ids": {0: "batch_size", 1: "sequence_length"}, + } + + @property + def outputs(self): + return { + "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, + } + + +class SD3TransformerOnnxConfig(VisionOnnxConfig): + ATOL_FOR_VALIDATION = 1e-4 + # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu + # operator support, available since opset 14 + DEFAULT_ONNX_OPSET = 14 + + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTransformerTimestepInputGenerator, + DummyTransformerVisionInputGenerator, + DummyTransformerTextInputGenerator, + ) + + NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( + image_size="sample_size", + num_channels="in_channels", + vocab_size="attention_head_dim", + hidden_size="joint_attention_dim", + projection_size="pooled_projection_dim", + allow_new=True, + ) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + common_inputs = { + "hidden_states": {0: "batch_size", 2: "height", 3: "width"}, + "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, + "pooled_projections": {0: "batch_size"}, + "timestep": {0: "step"}, + } + + return common_inputs + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "out_hidden_states": {0: "batch_size", 2: "height", 3: "width"}, + } + + @property + def torch_to_onnx_output_map(self) -> Dict[str, str]: + return { + "sample": "out_hidden_states", + } + + +class FluxTransformerOnnxConfig(SD3TransformerOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTransformerTimestepInputGenerator, + DummyFluxTransformerVisionInputGenerator, + 
DummyFluxTransformerTextInputGenerator, + ) + + @property + def inputs(self): + common_inputs = super().inputs + common_inputs["hidden_states"] = {0: "batch_size", 1: "packed_height_width"} + common_inputs["txt_ids"] = ( + {0: "sequence_length"} if check_if_diffusers_greater("0.31.0") else {0: "batch_size", 1: "sequence_length"} + ) + common_inputs["img_ids"] = ( + {0: "packed_height_width"} + if check_if_diffusers_greater("0.31.0") + else {0: "batch_size", 1: "packed_height_width"} + ) + + if getattr(self._normalized_config, "guidance_embeds", False): + common_inputs["guidance"] = {0: "batch_size"} + + return common_inputs + + @property + def outputs(self): + return { + "out_hidden_states": {0: "batch_size", 1: "packed_height_width"}, + } + + class GroupViTOnnxConfig(CLIPOnnxConfig): pass diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index fdc8bfcb539..b4bce4696f3 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -335,7 +335,11 @@ class TasksManager: } _DIFFUSERS_SUPPORTED_MODEL_TYPE = { - "clip-text-model": supported_tasks_mapping( + "t5-encoder": supported_tasks_mapping( + "feature-extraction", + onnx="T5EncoderOnnxConfig", + ), + "clip-text": supported_tasks_mapping( "feature-extraction", onnx="CLIPTextOnnxConfig", ), @@ -343,7 +347,15 @@ class TasksManager: "feature-extraction", onnx="CLIPTextWithProjectionOnnxConfig", ), - "unet": supported_tasks_mapping( + "flux-transformer-2d": supported_tasks_mapping( + "semantic-segmentation", + onnx="FluxTransformerOnnxConfig", + ), + "sd3-transformer-2d": supported_tasks_mapping( + "semantic-segmentation", + onnx="SD3TransformerOnnxConfig", + ), + "unet-2d-condition": supported_tasks_mapping( "semantic-segmentation", onnx="UNetOnnxConfig", ), @@ -1177,12 +1189,17 @@ class TasksManager: "transformers": _SUPPORTED_MODEL_TYPE, } _UNSUPPORTED_CLI_MODEL_TYPE = { - "unet", + # diffusers model types + "clip-text", + "clip-text-with-projection", + "flux-transformer-2d", + 
"sd3-transformer-2d", + "t5-encoder", + "unet-2d-condition", "vae-encoder", "vae-decoder", - "clip-text-model", - "clip-text-with-projection", - "trocr", # supported through the vision-encoder-decoder model type + # redundant model types + "trocr", # same as vision-encoder-decoder } _SUPPORTED_CLI_MODEL_TYPE = ( set(_SUPPORTED_MODEL_TYPE.keys()) diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index 949b54f4685..60de169de5e 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -15,7 +15,6 @@ """Utilities for model preparation to export.""" - import copy from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union @@ -44,12 +43,7 @@ "Please update diffusers by running `pip install --upgrade diffusers`" ) - from diffusers import ( - DiffusionPipeline, - StableDiffusionXLImg2ImgPipeline, - StableDiffusionXLInpaintPipeline, - StableDiffusionXLPipeline, - ) + from diffusers import DiffusionPipeline from diffusers.models.attention_processor import ( Attention, AttnAddedKVProcessor, @@ -80,6 +74,20 @@ DECODER_MERGED_NAME = "decoder_model_merged" +_DIFFUSERS_CLASS_NAME_TO_SUBMODEL_TYPE = { + "CLIPTextModel": "clip-text", + "CLIPTextModelWithProjection": "clip-text-with-projection", + "FluxTransformer2DModel": "flux-transformer-2d", + "SD3Transformer2DModel": "sd3-transformer-2d", + "UNet2DConditionModel": "unet-2d-condition", + "T5EncoderModel": "t5-encoder", +} + + +def _get_diffusers_submodel_type(submodel): + return _DIFFUSERS_CLASS_NAME_TO_SUBMODEL_TYPE.get(submodel.__class__.__name__) + + def _get_submodels_for_export_diffusion( pipeline: "DiffusionPipeline", ) -> Dict[str, Union["PreTrainedModel", "ModelMixin"]]: @@ -87,56 +95,87 @@ def _get_submodels_for_export_diffusion( Returns the components of a Stable Diffusion model. 
""" - is_stable_diffusion_xl = isinstance( - pipeline, (StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipeline, StableDiffusionXLInpaintPipeline) - ) - if is_stable_diffusion_xl: - projection_dim = pipeline.text_encoder_2.config.projection_dim - else: - projection_dim = pipeline.text_encoder.config.projection_dim - models_for_export = {} + is_torch_greater_or_equal_than_2_1 = version.parse(torch.__version__) >= version.parse("2.1.0") + is_sdxl = pipeline.__class__.__name__.startswith("StableDiffusionXL") + is_sd3 = pipeline.__class__.__name__.startswith("StableDiffusion3") + # Text encoder text_encoder = getattr(pipeline, "text_encoder", None) if text_encoder is not None: - if is_stable_diffusion_xl: + if is_sdxl or is_sd3: text_encoder.config.output_hidden_states = True + text_encoder.text_model.config.output_hidden_states = True + + text_encoder.config.export_model_type = _get_diffusers_submodel_type(text_encoder) models_for_export["text_encoder"] = text_encoder - # U-NET - # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 - is_torch_greater_or_equal_than_2_1 = version.parse(torch.__version__) >= version.parse("2.1.0") - if not is_torch_greater_or_equal_than_2_1: - pipeline.unet.set_attn_processor(AttnProcessor()) + # Text encoder 2 + text_encoder_2 = getattr(pipeline, "text_encoder_2", None) + if text_encoder_2 is not None: + if is_sdxl or is_sd3: + text_encoder_2.config.output_hidden_states = True + text_encoder_2.text_model.config.output_hidden_states = True - pipeline.unet.config.text_encoder_projection_dim = projection_dim - # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` - # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 - pipeline.unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) - models_for_export["unet"] = pipeline.unet + 
text_encoder_2.config.export_model_type = _get_diffusers_submodel_type(text_encoder_2) + models_for_export["text_encoder_2"] = text_encoder_2 - # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + # Text encoder 3 + text_encoder_3 = getattr(pipeline, "text_encoder_3", None) + if text_encoder_3 is not None: + text_encoder_3.config.export_model_type = _get_diffusers_submodel_type(text_encoder_3) + models_for_export["text_encoder_3"] = text_encoder_3 + + # U-NET + unet = getattr(pipeline, "unet", None) + if unet is not None: + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 + if not is_torch_greater_or_equal_than_2_1: + unet.set_attn_processor(AttnProcessor()) + + # The U-NET time_ids inputs shapes depends on the value of `requires_aesthetics_score` + # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 + unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + unet.config.time_cond_proj_dim = getattr(pipeline.unet.config, "time_cond_proj_dim", None) + unet.config.text_encoder_projection_dim = pipeline.text_encoder.config.projection_dim + unet.config.export_model_type = _get_diffusers_submodel_type(unet) + models_for_export["unet"] = unet + + # Transformer + transformer = getattr(pipeline, "transformer", None) + if transformer is not None: + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 + if not is_torch_greater_or_equal_than_2_1: + transformer.set_attn_processor(AttnProcessor()) + + transformer.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) + transformer.config.time_cond_proj_dim = getattr(pipeline.transformer.config, "time_cond_proj_dim", None) + transformer.config.text_encoder_projection_dim = pipeline.text_encoder.config.projection_dim + 
transformer.config.export_model_type = _get_diffusers_submodel_type(transformer) + models_for_export["transformer"] = transformer + + # VAE Encoder vae_encoder = copy.deepcopy(pipeline.vae) + + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 if not is_torch_greater_or_equal_than_2_1: vae_encoder = override_diffusers_2_0_attn_processors(vae_encoder) + # we return the distribution parameters to be able to recreate it in the decoder vae_encoder.forward = lambda sample: {"latent_parameters": vae_encoder.encode(x=sample)["latent_dist"].parameters} models_for_export["vae_encoder"] = vae_encoder - # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + # VAE Decoder vae_decoder = copy.deepcopy(pipeline.vae) + + # ONNX export of torch.nn.functional.scaled_dot_product_attention not supported for < v2.1.0 if not is_torch_greater_or_equal_than_2_1: vae_decoder = override_diffusers_2_0_attn_processors(vae_decoder) + vae_decoder.forward = lambda latent_sample: vae_decoder.decode(z=latent_sample) models_for_export["vae_decoder"] = vae_decoder - text_encoder_2 = getattr(pipeline, "text_encoder_2", None) - if text_encoder_2 is not None: - text_encoder_2.config.output_hidden_states = True - text_encoder_2.text_model.config.output_hidden_states = True - models_for_export["text_encoder_2"] = text_encoder_2 - return models_for_export @@ -294,33 +333,59 @@ def get_diffusion_models_for_export( `Dict[str, Tuple[Union[`PreTrainedModel`, `TFPreTrainedModel`], `ExportConfig`]: A Dict containing the model and export configs for the different components of the model. 
""" + models_for_export = _get_submodels_for_export_diffusion(pipeline) # Text encoder if "text_encoder" in models_for_export: + text_encoder = models_for_export["text_encoder"] text_encoder_config_constructor = TasksManager.get_exporter_config_constructor( - model=pipeline.text_encoder, - exporter=exporter, - library_name="diffusers", - task="feature-extraction", + model=text_encoder, exporter=exporter, library_name="diffusers", task="feature-extraction" ) text_encoder_export_config = text_encoder_config_constructor( - pipeline.text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + text_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype ) models_for_export["text_encoder"] = (models_for_export["text_encoder"], text_encoder_export_config) + # Text encoder 2 + if "text_encoder_2" in models_for_export: + text_encoder_2 = models_for_export["text_encoder_2"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_2, exporter=exporter, library_name="diffusers", task="feature-extraction" + ) + export_config = export_config_constructor(text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["text_encoder_2"] = (models_for_export["text_encoder_2"], export_config) + + # Text encoder 3 + if "text_encoder_3" in models_for_export: + text_encoder_3 = models_for_export["text_encoder_3"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=text_encoder_3, exporter=exporter, library_name="diffusers", task="feature-extraction" + ) + export_config = export_config_constructor(text_encoder_3.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["text_encoder_3"] = (models_for_export["text_encoder_3"], export_config) + # U-NET - export_config_constructor = TasksManager.get_exporter_config_constructor( - model=pipeline.unet, - exporter=exporter, - library_name="diffusers", - task="semantic-segmentation", - model_type="unet", - ) - 
unet_export_config = export_config_constructor(pipeline.unet.config, int_dtype=int_dtype, float_dtype=float_dtype) - models_for_export["unet"] = (models_for_export["unet"], unet_export_config) + if "unet" in models_for_export: + unet = models_for_export["unet"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=unet, exporter=exporter, library_name="diffusers", task="semantic-segmentation" + ) + unet_export_config = export_config_constructor(unet.config, int_dtype=int_dtype, float_dtype=float_dtype) + models_for_export["unet"] = (models_for_export["unet"], unet_export_config) - # VAE Encoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L565 + # Transformer + if "transformer" in models_for_export: + transformer = models_for_export["transformer"] + export_config_constructor = TasksManager.get_exporter_config_constructor( + model=transformer, exporter=exporter, library_name="diffusers", task="semantic-segmentation" + ) + transformer_export_config = export_config_constructor( + transformer.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["transformer"] = (models_for_export["transformer"], transformer_export_config) + + # VAE Encoder vae_encoder = models_for_export["vae_encoder"] vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_encoder, @@ -329,10 +394,12 @@ def get_diffusion_models_for_export( task="semantic-segmentation", model_type="vae-encoder", ) - vae_export_config = vae_config_constructor(vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype) - models_for_export["vae_encoder"] = (vae_encoder, vae_export_config) + vae_encoder_export_config = vae_config_constructor( + vae_encoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_encoder"] = (vae_encoder, vae_encoder_export_config) - # VAE Decoder https://github.com/huggingface/diffusers/blob/v0.11.1/src/diffusers/models/vae.py#L600 + # VAE 
Decoder vae_decoder = models_for_export["vae_decoder"] vae_config_constructor = TasksManager.get_exporter_config_constructor( model=vae_decoder, @@ -341,21 +408,10 @@ def get_diffusion_models_for_export( task="semantic-segmentation", model_type="vae-decoder", ) - vae_export_config = vae_config_constructor(vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype) - models_for_export["vae_decoder"] = (vae_decoder, vae_export_config) - - if "text_encoder_2" in models_for_export: - export_config_constructor = TasksManager.get_exporter_config_constructor( - model=pipeline.text_encoder_2, - exporter=exporter, - library_name="diffusers", - task="feature-extraction", - model_type="clip-text-with-projection", - ) - export_config = export_config_constructor( - pipeline.text_encoder_2.config, int_dtype=int_dtype, float_dtype=float_dtype - ) - models_for_export["text_encoder_2"] = (models_for_export["text_encoder_2"], export_config) + vae_decoder_export_config = vae_config_constructor( + vae_decoder.config, int_dtype=int_dtype, float_dtype=float_dtype + ) + models_for_export["vae_decoder"] = (vae_decoder, vae_decoder_export_config) return models_for_export diff --git a/optimum/onnxruntime/__init__.py b/optimum/onnxruntime/__init__.py index 4e25a436909..f3f1535fd45 100644 --- a/optimum/onnxruntime/__init__.py +++ b/optimum/onnxruntime/__init__.py @@ -74,33 +74,51 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: _import_structure[".utils.dummy_diffusers_objects"] = [ - "ORTStableDiffusionPipeline", + "ORTDiffusionPipeline", + "ORTPipelineForText2Image", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + # flux + "ORTFluxPipeline", + # lcm + "ORTLatentConsistencyModelImg2ImgPipeline", + "ORTLatentConsistencyModelPipeline", + # sd3 + "ORTStableDiffusion3Img2ImgPipeline", + "ORTStableDiffusion3InpaintPipeline", + "ORTStableDiffusion3Pipeline", + # sd "ORTStableDiffusionImg2ImgPipeline", "ORTStableDiffusionInpaintPipeline", - 
"ORTStableDiffusionXLPipeline", + "ORTStableDiffusionPipeline", + # xl "ORTStableDiffusionXLImg2ImgPipeline", "ORTStableDiffusionXLInpaintPipeline", - "ORTLatentConsistencyModelPipeline", - "ORTLatentConsistencyModelImg2ImgPipeline", - "ORTPipelineForImage2Image", - "ORTPipelineForInpainting", - "ORTPipelineForText2Image", - "ORTDiffusionPipeline", + "ORTStableDiffusionXLPipeline", ] else: _import_structure["modeling_diffusion"] = [ - "ORTStableDiffusionPipeline", + "ORTDiffusionPipeline", + "ORTPipelineForText2Image", + "ORTPipelineForImage2Image", + "ORTPipelineForInpainting", + # flux + "ORTFluxPipeline", + # lcm + "ORTLatentConsistencyModelImg2ImgPipeline", + "ORTLatentConsistencyModelPipeline", + # sd3 + "ORTStableDiffusion3Img2ImgPipeline", + "ORTStableDiffusion3InpaintPipeline", + "ORTStableDiffusion3Pipeline", + # sd "ORTStableDiffusionImg2ImgPipeline", "ORTStableDiffusionInpaintPipeline", - "ORTStableDiffusionXLPipeline", + "ORTStableDiffusionPipeline", + # xl "ORTStableDiffusionXLImg2ImgPipeline", "ORTStableDiffusionXLInpaintPipeline", - "ORTLatentConsistencyModelImg2ImgPipeline", - "ORTLatentConsistencyModelPipeline", - "ORTPipelineForImage2Image", - "ORTPipelineForInpainting", - "ORTPipelineForText2Image", - "ORTDiffusionPipeline", + "ORTStableDiffusionXLPipeline", ] @@ -151,30 +169,52 @@ raise OptionalDependencyNotAvailable() except OptionalDependencyNotAvailable: from ..utils.dummy_diffusers_objects import ( + # generic entrypoint ORTDiffusionPipeline, + # flux + ORTFluxPipeline, + # lcm ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, + # task-specific entrypoints ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, + # sd3 + ORTStableDiffusion3Img2ImgPipeline, + ORTStableDiffusion3InpaintPipeline, + ORTStableDiffusion3Pipeline, + # sd ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, + # xl ORTStableDiffusionXLImg2ImgPipeline, 
ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, ) else: from .modeling_diffusion import ( + # generic entrypoint ORTDiffusionPipeline, + # flux + ORTFluxPipeline, + # lcm ORTLatentConsistencyModelImg2ImgPipeline, ORTLatentConsistencyModelPipeline, + # task-specific entrypoints ORTPipelineForImage2Image, ORTPipelineForInpainting, ORTPipelineForText2Image, + # sd3 + ORTStableDiffusion3Img2ImgPipeline, + ORTStableDiffusion3InpaintPipeline, + ORTStableDiffusion3Pipeline, + # sd ORTStableDiffusionImg2ImgPipeline, ORTStableDiffusionInpaintPipeline, ORTStableDiffusionPipeline, + # xl ORTStableDiffusionXLImg2ImgPipeline, ORTStableDiffusionXLInpaintPipeline, ORTStableDiffusionXLPipeline, diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 3899a7b36b6..79d302be449 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -57,7 +57,9 @@ from ..onnx.utils import _get_model_external_data_paths from ..utils import ( DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, + DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER, DIFFUSION_MODEL_UNET_SUBFOLDER, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, @@ -76,7 +78,7 @@ if check_if_diffusers_greater("0.25.0"): from diffusers.models.autoencoders.vae import DiagonalGaussianDistribution else: - from diffusers.models.vae import DiagonalGaussianDistribution + from diffusers.models.vae import DiagonalGaussianDistribution # type: ignore logger = logging.getLogger(__name__) @@ -92,15 +94,18 @@ class ORTDiffusionPipeline(ORTModel, DiffusionPipeline): def __init__( self, scheduler: "SchedulerMixin", - unet_session: ort.InferenceSession, vae_decoder_session: ort.InferenceSession, # optional pipeline models + unet_session: Optional[ort.InferenceSession] = None, + transformer_session: Optional[ort.InferenceSession] = None, 
vae_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_session: Optional[ort.InferenceSession] = None, text_encoder_2_session: Optional[ort.InferenceSession] = None, + text_encoder_3_session: Optional[ort.InferenceSession] = None, # optional pipeline submodels tokenizer: Optional["CLIPTokenizer"] = None, tokenizer_2: Optional["CLIPTokenizer"] = None, + tokenizer_3: Optional["CLIPTokenizer"] = None, feature_extractor: Optional["CLIPFeatureExtractor"] = None, # stable diffusion xl specific arguments force_zeros_for_empty_prompt: bool = True, @@ -111,16 +116,20 @@ def __init__( model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, ): - self.unet = ORTModelUnet(unet_session, self) - self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) - self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None + self.unet = ORTModelUnet(unet_session, self) if unet_session is not None else None + self.transformer = ORTModelTransformer(transformer_session, self) if transformer_session is not None else None self.text_encoder = ( ORTModelTextEncoder(text_encoder_session, self) if text_encoder_session is not None else None ) self.text_encoder_2 = ( ORTModelTextEncoder(text_encoder_2_session, self) if text_encoder_2_session is not None else None ) + self.text_encoder_3 = ( + ORTModelTextEncoder(text_encoder_3_session, self) if text_encoder_3_session is not None else None + ) # We wrap the VAE Decoder & Encoder in a single object to simulate diffusers API + self.vae_encoder = ORTModelVaeEncoder(vae_encoder_session, self) if vae_encoder_session is not None else None + self.vae_decoder = ORTModelVaeDecoder(vae_decoder_session, self) if vae_decoder_session is not None else None self.vae = ORTWrapperVae(self.vae_encoder, self.vae_decoder) # we allow passing these as torch models for now @@ -130,18 +139,22 @@ def __init__( self.scheduler = scheduler self.tokenizer = tokenizer 
self.tokenizer_2 = tokenizer_2 + self.tokenizer_3 = tokenizer_3 self.feature_extractor = feature_extractor all_pipeline_init_args = { "vae": self.vae, "unet": self.unet, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": self.text_encoder_3, "safety_checker": self.safety_checker, "image_encoder": self.image_encoder, "scheduler": self.scheduler, "tokenizer": self.tokenizer, "tokenizer_2": self.tokenizer_2, + "tokenizer_3": self.tokenizer_3, "feature_extractor": self.feature_extractor, "requires_aesthetics_score": requires_aesthetics_score, "force_zeros_for_empty_prompt": force_zeros_for_empty_prompt, @@ -157,7 +170,10 @@ def __init__( # inits ort specific attributes self.shared_attributes_init( - model=unet_session, use_io_binding=use_io_binding, model_save_dir=model_save_dir, **kwargs + model=unet_session if unet_session is not None else transformer_session, + use_io_binding=use_io_binding, + model_save_dir=model_save_dir, + **kwargs, ) def _save_pretrained(self, save_directory: Union[str, Path]): @@ -165,10 +181,12 @@ def _save_pretrained(self, save_directory: Union[str, Path]): models_to_save_paths = { (self.unet, save_directory / DIFFUSION_MODEL_UNET_SUBFOLDER), + (self.transformer, save_directory / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER), (self.vae_decoder, save_directory / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER), (self.vae_encoder, save_directory / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER), (self.text_encoder, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER), (self.text_encoder_2, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER), + (self.text_encoder_3, save_directory / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER), } for model, save_path in models_to_save_paths: if model is not None: @@ -192,6 +210,8 @@ def _save_pretrained(self, save_directory: Union[str, Path]): self.tokenizer.save_pretrained(save_directory / "tokenizer") if self.tokenizer_2 is not None: 
self.tokenizer_2.save_pretrained(save_directory / "tokenizer_2") + if self.tokenizer_3 is not None: + self.tokenizer_3.save_pretrained(save_directory / "tokenizer_3") if self.feature_extractor is not None: self.feature_extractor.save_pretrained(save_directory / "feature_extractor") @@ -208,10 +228,12 @@ def _from_pretrained( cache_dir: str = HUGGINGFACE_HUB_CACHE, token: Optional[Union[bool, str]] = None, unet_file_name: str = ONNX_WEIGHTS_NAME, + transformer_file_name: str = ONNX_WEIGHTS_NAME, vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, vae_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_2_file_name: str = ONNX_WEIGHTS_NAME, + text_encoder_3_file_name: str = ONNX_WEIGHTS_NAME, use_io_binding: Optional[bool] = None, provider: str = "CPUExecutionProvider", provider_options: Optional[Dict[str, Any]] = None, @@ -230,10 +252,12 @@ def _from_pretrained( allow_patterns.update( { unet_file_name, + transformer_file_name, vae_decoder_file_name, vae_encoder_file_name, text_encoder_file_name, text_encoder_2_file_name, + text_encoder_3_file_name, SCHEDULER_CONFIG_NAME, cls.config_name, CONFIG_NAME, @@ -259,10 +283,12 @@ def _from_pretrained( model_paths = { "unet": model_save_path / DIFFUSION_MODEL_UNET_SUBFOLDER / unet_file_name, + "transformer": model_save_path / DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER / transformer_file_name, "vae_decoder": model_save_path / DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER / vae_decoder_file_name, "vae_encoder": model_save_path / DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER / vae_encoder_file_name, "text_encoder": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER / text_encoder_file_name, "text_encoder_2": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER / text_encoder_2_file_name, + "text_encoder_3": model_save_path / DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER / text_encoder_3_file_name, } sessions = {} @@ -276,7 +302,7 @@ def _from_pretrained( ) submodels = {} - for submodel in 
{"scheduler", "tokenizer", "tokenizer_2", "feature_extractor"}: + for submodel in {"scheduler", "tokenizer", "tokenizer_2", "tokenizer_3", "feature_extractor"}: if kwargs.get(submodel, None) is not None: submodels[submodel] = kwargs.pop(submodel) elif config.get(submodel, (None, None))[0] is not None: @@ -385,17 +411,24 @@ def to(self, device: Union[torch.device, str, int]): if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider": return self - self.unet.session.set_providers([provider], provider_options=[provider_options]) self.vae_decoder.session.set_providers([provider], provider_options=[provider_options]) + if self.unet is not None: + self.unet.session.set_providers([provider], provider_options=[provider_options]) + if self.transformer is not None: + self.transformer.session.set_providers([provider], provider_options=[provider_options]) if self.vae_encoder is not None: self.vae_encoder.session.set_providers([provider], provider_options=[provider_options]) if self.text_encoder is not None: self.text_encoder.session.set_providers([provider], provider_options=[provider_options]) if self.text_encoder_2 is not None: self.text_encoder_2.session.set_providers([provider], provider_options=[provider_options]) + if self.text_encoder_3 is not None: + self.text_encoder_3.session.set_providers([provider], provider_options=[provider_options]) - self.providers = self.unet.session.get_providers() + self.providers = ( + self.unet.session.get_providers() if self.unet is not None else self.transformer.session.get_providers() + ) self._device = device return self @@ -412,8 +445,10 @@ def components(self) -> Dict[str, Any]: components = { "vae": self.vae, "unet": self.unet, + "transformer": self.transformer, "text_encoder": self.text_encoder, "text_encoder_2": self.text_encoder_2, + "text_encoder_3": self.text_encoder_3, "safety_checker": self.safety_checker, "image_encoder": self.image_encoder, } @@ -443,9 +478,13 @@ def __init__(self, session: 
ort.InferenceSession, parent_pipeline: ORTDiffusionP self.input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} self.output_names = {output_key.name: idx for idx, output_key in enumerate(self.session.get_outputs())} + self.input_dtypes = {input_key.name: input_key.type for input_key in self.session.get_inputs()} self.output_dtypes = {output_key.name: output_key.type for output_key in self.session.get_outputs()} + self.input_shapes = {input_key.name: input_key.shape for input_key in self.session.get_inputs()} + self.output_shapes = {output_key.name: output_key.shape for output_key in self.session.get_outputs()} + config_file_path = Path(session._model_path).parent / self.config_name if not config_file_path.is_file(): # config is mandatory for the model part to be used for inference @@ -543,13 +582,18 @@ def __init__(self, *args, **kwargs): ) self.register_to_config(time_cond_proj_dim=None) + if len(self.input_shapes["timestep"]) > 0: + logger.warning( + "The exported unet onnx model expects a non scalar timestep input. " + "We will have to unsqueeze the timestep input at each iteration which might be inefficient. " + "Please re-export the pipeline with newer version of optimum and diffusers to avoid this warning." 
+ ) + def forward( self, sample: Union[np.ndarray, torch.Tensor], timestep: Union[np.ndarray, torch.Tensor], encoder_hidden_states: Union[np.ndarray, torch.Tensor], - text_embeds: Optional[Union[np.ndarray, torch.Tensor]] = None, - time_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, timestep_cond: Optional[Union[np.ndarray, torch.Tensor]] = None, cross_attention_kwargs: Optional[Dict[str, Any]] = None, added_cond_kwargs: Optional[Dict[str, Any]] = None, @@ -557,15 +601,13 @@ def forward( ): use_torch = isinstance(sample, torch.Tensor) - if len(timestep.shape) == 0: + if len(self.input_shapes["timestep"]) > 0: timestep = timestep.unsqueeze(0) model_inputs = { "sample": sample, "timestep": timestep, "encoder_hidden_states": encoder_hidden_states, - "text_embeds": text_embeds, - "time_ids": time_ids, "timestep_cond": timestep_cond, **(cross_attention_kwargs or {}), **(added_cond_kwargs or {}), @@ -581,6 +623,42 @@ def forward( return ModelOutput(**model_outputs) +class ORTModelTransformer(ORTPipelinePart): + def forward( + self, + hidden_states: Union[np.ndarray, torch.Tensor], + encoder_hidden_states: Union[np.ndarray, torch.Tensor], + pooled_projections: Union[np.ndarray, torch.Tensor], + timestep: Union[np.ndarray, torch.Tensor], + guidance: Optional[Union[np.ndarray, torch.Tensor]] = None, + txt_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, + img_ids: Optional[Union[np.ndarray, torch.Tensor]] = None, + joint_attention_kwargs: Optional[Dict[str, Any]] = None, + return_dict: bool = False, + ): + use_torch = isinstance(hidden_states, torch.Tensor) + + model_inputs = { + "hidden_states": hidden_states, + "encoder_hidden_states": encoder_hidden_states, + "pooled_projections": pooled_projections, + "timestep": timestep, + "guidance": guidance, + "txt_ids": txt_ids, + "img_ids": img_ids, + **(joint_attention_kwargs or {}), + } + + onnx_inputs = self.prepare_onnx_inputs(use_torch, **model_inputs) + onnx_outputs = self.session.run(None, onnx_inputs) + 
model_outputs = self.prepare_onnx_outputs(use_torch, *onnx_outputs) + + if return_dict: + return model_outputs + + return ModelOutput(**model_outputs) + + class ORTModelTextEncoder(ORTPipelinePart): def forward( self, @@ -599,11 +677,13 @@ def forward( if output_hidden_states: model_outputs["hidden_states"] = [] - for i in range(self.config.num_hidden_layers): + num_layers = self.num_hidden_layers if hasattr(self, "num_hidden_layers") else self.num_decoder_layers + for i in range(num_layers): model_outputs["hidden_states"].append(model_outputs.pop(f"hidden_states.{i}")) model_outputs["hidden_states"].append(model_outputs.get("last_hidden_state")) else: - for i in range(self.config.num_hidden_layers): + num_layers = self.num_hidden_layers if hasattr(self, "num_hidden_layers") else self.num_decoder_layers + for i in range(num_layers): model_outputs.pop(f"hidden_states.{i}", None) if return_dict: @@ -620,7 +700,7 @@ def __init__(self, *args, **kwargs): if not hasattr(self.config, "scaling_factor"): logger.warning( "The `scaling_factor` attribute is missing from the VAE encoder configuration. " - "Please re-export the model with newer version of optimum and diffusers." + "Please re-export the model with newer version of optimum and diffusers to avoid this warning." ) self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) @@ -660,7 +740,7 @@ def __init__(self, *args, **kwargs): if not hasattr(self.config, "scaling_factor"): logger.warning( "The `scaling_factor` attribute is missing from the VAE decoder configuration. " - "Please re-export the model with newer version of optimum and diffusers." + "Please re-export the model with newer version of optimum and diffusers to avoid this warning." 
) self.register_to_config(scaling_factor=2 ** (len(self.config.block_out_channels) - 1)) @@ -871,6 +951,80 @@ class ORTLatentConsistencyModelImg2ImgPipeline(ORTDiffusionPipeline, LatentConsi auto_model_class = LatentConsistencyModelImg2ImgPipeline +class ORTUnavailablePipeline: + MIN_VERSION = None + + def __init__(self, *args, **kwargs): + raise NotImplementedError( + f"The pipeline {self.__class__.__name__} is not available in the current version of `diffusers`. " + f"Please upgrade `diffusers` to {self.MIN_VERSION} or later." + ) + + +if check_if_diffusers_greater("0.29.0"): + from diffusers import StableDiffusion3Img2ImgPipeline, StableDiffusion3Pipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTStableDiffusion3Pipeline(ORTDiffusionPipeline, StableDiffusion3Pipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusion3Pipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img#diffusers.StableDiffusion3Pipeline). + """ + + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = StableDiffusion3Pipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTStableDiffusion3Img2ImgPipeline(ORTDiffusionPipeline, StableDiffusion3Img2ImgPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusion3Img2ImgPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/img2img#diffusers.StableDiffusion3Img2ImgPipeline). 
+ """ + + main_input_name = "image" + export_feature = "image-to-image" + auto_model_class = StableDiffusion3Img2ImgPipeline + +else: + + class ORTStableDiffusion3Pipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.29.0" + + class ORTStableDiffusion3Img2ImgPipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.29.0" + + +if check_if_diffusers_greater("0.30.0"): + from diffusers import FluxPipeline, StableDiffusion3InpaintPipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTStableDiffusion3InpaintPipeline(ORTDiffusionPipeline, StableDiffusion3InpaintPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.StableDiffusion3InpaintPipeline](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/inpaint#diffusers.StableDiffusion3InpaintPipeline). + """ + + main_input_name = "prompt" + export_feature = "inpainting" + auto_model_class = StableDiffusion3InpaintPipeline + + @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) + class ORTFluxPipeline(ORTDiffusionPipeline, FluxPipeline): + """ + ONNX Runtime-powered stable diffusion pipeline corresponding to [diffusers.FluxPipeline](https://huggingface.co/docs/diffusers/api/pipelines/flux/text2img#diffusers.FluxPipeline). 
+ """ + + main_input_name = "prompt" + export_feature = "text-to-image" + auto_model_class = FluxPipeline + +else: + + class ORTStableDiffusion3InpaintPipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.30.0" + + class ORTFluxPipeline(ORTUnavailablePipeline): + MIN_VERSION = "0.30.0" + + SUPPORTED_ORT_PIPELINES = [ ORTStableDiffusionPipeline, ORTStableDiffusionImg2ImgPipeline, @@ -880,6 +1034,10 @@ class ORTLatentConsistencyModelImg2ImgPipeline(ORTDiffusionPipeline, LatentConsi ORTStableDiffusionXLInpaintPipeline, ORTLatentConsistencyModelPipeline, ORTLatentConsistencyModelImg2ImgPipeline, + ORTStableDiffusion3Pipeline, + ORTStableDiffusion3Img2ImgPipeline, + ORTStableDiffusion3InpaintPipeline, + ORTFluxPipeline, ] @@ -897,23 +1055,27 @@ def _get_ort_class(pipeline_class_name: str, throw_error_if_not_exist: bool = Tr ORT_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict( [ + ("flux", ORTFluxPipeline), + ("latent-consistency", ORTLatentConsistencyModelPipeline), ("stable-diffusion", ORTStableDiffusionPipeline), + ("stable-diffusion-3", ORTStableDiffusion3Pipeline), ("stable-diffusion-xl", ORTStableDiffusionXLPipeline), - ("latent-consistency", ORTLatentConsistencyModelPipeline), ] ) ORT_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict( [ + ("latent-consistency", ORTLatentConsistencyModelImg2ImgPipeline), ("stable-diffusion", ORTStableDiffusionImg2ImgPipeline), + ("stable-diffusion-3", ORTStableDiffusion3Img2ImgPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLImg2ImgPipeline), - ("latent-consistency", ORTLatentConsistencyModelImg2ImgPipeline), ] ) ORT_INPAINT_PIPELINES_MAPPING = OrderedDict( [ ("stable-diffusion", ORTStableDiffusionInpaintPipeline), + ("stable-diffusion-3", ORTStableDiffusion3InpaintPipeline), ("stable-diffusion-xl", ORTStableDiffusionXLInpaintPipeline), ] ) diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index db7d1f6975d..40d93d298e4 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -16,7 +16,9 @@ from 
.constant import ( CONFIG_NAME, DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER, + DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER, DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER, + DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER, DIFFUSION_MODEL_UNET_SUBFOLDER, DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER, DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER, @@ -52,6 +54,8 @@ DummyCodegenDecoderTextInputGenerator, DummyDecoderTextInputGenerator, DummyEncodecInputGenerator, + DummyFluxTransformerTextInputGenerator, + DummyFluxTransformerVisionInputGenerator, DummyInputGenerator, DummyIntGenerator, DummyLabelsGenerator, @@ -63,6 +67,9 @@ DummySpeechT5InputGenerator, DummyTextInputGenerator, DummyTimestepInputGenerator, + DummyTransformerTextInputGenerator, + DummyTransformerTimestepInputGenerator, + DummyTransformerVisionInputGenerator, DummyVisionEmbeddingsGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator, DummyVisionInputGenerator, diff --git a/optimum/utils/constant.py b/optimum/utils/constant.py index 4497b5246d4..eb7a67e9ece 100644 --- a/optimum/utils/constant.py +++ b/optimum/utils/constant.py @@ -15,8 +15,10 @@ CONFIG_NAME = "config.json" DIFFUSION_MODEL_UNET_SUBFOLDER = "unet" -DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER = "text_encoder" +DIFFUSION_MODEL_TRANSFORMER_SUBFOLDER = "transformer" DIFFUSION_MODEL_VAE_DECODER_SUBFOLDER = "vae_decoder" DIFFUSION_MODEL_VAE_ENCODER_SUBFOLDER = "vae_encoder" +DIFFUSION_MODEL_TEXT_ENCODER_SUBFOLDER = "text_encoder" DIFFUSION_MODEL_TEXT_ENCODER_2_SUBFOLDER = "text_encoder_2" +DIFFUSION_MODEL_TEXT_ENCODER_3_SUBFOLDER = "text_encoder_3" ONNX_WEIGHTS_NAME = "model.onnx" diff --git a/optimum/utils/dummy_diffusers_objects.py b/optimum/utils/dummy_diffusers_objects.py index 35d1ffe9fc7..ff8b587e19f 100644 --- a/optimum/utils/dummy_diffusers_objects.py +++ b/optimum/utils/dummy_diffusers_objects.py @@ -15,6 +15,50 @@ from .import_utils import DummyObject, requires_backends +class ORTDiffusionPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, 
*args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForText2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForImage2Image(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTPipelineForInpainting(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + class ORTStableDiffusionPipeline(metaclass=DummyObject): _backends = ["diffusers"] @@ -70,6 +114,17 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) +class ORTStableDiffusionXLInpaintPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + class ORTLatentConsistencyModelPipeline(metaclass=DummyObject): _backends = ["diffusers"] @@ -81,7 +136,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) -class ORTDiffusionPipeline(metaclass=DummyObject): +class ORTLatentConsistencyModelImg2ImgPipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): @@ -92,7 +147,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) -class ORTPipelineForText2Image(metaclass=DummyObject): +class 
ORTStableDiffusion3Pipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): @@ -103,7 +158,7 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) -class ORTPipelineForImage2Image(metaclass=DummyObject): +class ORTStableDiffusion3Img2ImgPipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): @@ -114,7 +169,18 @@ def from_pretrained(cls, *args, **kwargs): requires_backends(cls, ["diffusers"]) -class ORTPipelineForInpainting(metaclass=DummyObject): +class ORTStableDiffusion3InpaintPipeline(metaclass=DummyObject): + _backends = ["diffusers"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["diffusers"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["diffusers"]) + + +class ORTFluxPipeline(metaclass=DummyObject): _backends = ["diffusers"] def __init__(self, *args, **kwargs): diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index dac14a38114..148072aa0b4 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -22,7 +22,7 @@ import numpy as np from transformers.utils import is_tf_available, is_torch_available -from ..utils import check_if_transformers_greater +from ..utils import check_if_diffusers_greater, check_if_transformers_greater from .normalized_config import ( NormalizedConfig, NormalizedEncoderDecoderConfig, @@ -36,7 +36,7 @@ import torch if is_tf_available(): - import tensorflow as tf + import tensorflow as tf # type: ignore def check_framework_is_available(func): @@ -871,8 +871,8 @@ def __init__( def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "timestep": - shape = [self.batch_size] - return self.random_int_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=int_dtype) + shape = [] # a scalar with no dimension (it can be int or 
float depending on the sd architecture) + return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) if input_name == "text_embeds": dim = self.text_encoder_projection_dim @@ -1411,3 +1411,76 @@ def generate( float_dtype: str = "fp32", ): return self.random_int_tensor(shape=(1,), min_value=20, max_value=22, framework=framework, dtype=int_dtype) + + +class DummyTransformerTimestepInputGenerator(DummyTimestepInputGenerator): + SUPPORTED_INPUT_NAMES = ("timestep",) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "timestep": + shape = [self.batch_size] # With transformer diffusers, timestep is a 1D tensor + return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + +class DummyTransformerVisionInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ("hidden_states",) + + +class DummyTransformerTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "encoder_hidden_states", + "pooled_projection", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "encoder_hidden_states": + return super().generate(input_name, framework, int_dtype, float_dtype)[0] + + elif input_name == "pooled_projections": + return self.random_float_tensor( + [self.batch_size, self.normalized_config.projection_size], framework=framework, dtype=float_dtype + ) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + +class DummyFluxTransformerVisionInputGenerator(DummyTransformerVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "hidden_states", + "img_ids", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name 
== "hidden_states": + shape = [self.batch_size, (self.height // 2) * (self.width // 2), self.num_channels] + return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) + elif input_name == "img_ids": + shape = ( + [(self.height // 2) * (self.width // 2), 3] + if check_if_diffusers_greater("0.31.0") + else [self.batch_size, (self.height // 2) * (self.width // 2), 3] + ) + return self.random_int_tensor(shape, max_value=1, framework=framework, dtype=int_dtype) + + return super().generate(input_name, framework, int_dtype, float_dtype) + + +class DummyFluxTransformerTextInputGenerator(DummyTransformerTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "encoder_hidden_states", + "pooled_projections", + "txt_ids", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "txt_ids": + shape = ( + [self.sequence_length, 3] + if check_if_diffusers_greater("0.31.0") + else [self.batch_size, self.sequence_length, 3] + ) + return self.random_int_tensor(shape, max_value=1, framework=framework, dtype=int_dtype) + + return super().generate(input_name, framework, int_dtype, float_dtype) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index ccccb5510bf..31059c403de 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -297,9 +297,11 @@ } PYTORCH_DIFFUSION_MODEL = { + "flux": "optimum-internal-testing/tiny-random-flux", + "latent-consistency": "echarlaix/tiny-random-latent-consistency", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", + "stable-diffusion-3": "yujiepan/stable-diffusion-3-tiny-random", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", - "latent-consistency": "echarlaix/tiny-random-latent-consistency", } PYTORCH_TIMM_MODEL = { diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index 7671d6cd2e6..88288547c95 
100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -299,7 +299,6 @@ def _onnx_export_diffusion_models(self, model_type: str, model_name: str, device with TemporaryDirectory() as tmpdirname: _, onnx_outputs = export_models( models_and_onnx_configs=models_and_onnx_configs, - opset=14, output_dir=Path(tmpdirname), device=device, ) @@ -307,7 +306,6 @@ def _onnx_export_diffusion_models(self, model_type: str, model_name: str, device models_and_onnx_configs=models_and_onnx_configs, onnx_named_outputs=onnx_outputs, output_dir=Path(tmpdirname), - atol=1e-4, use_subprocess=False, ) diff --git a/tests/onnxruntime/test_diffusion.py b/tests/onnxruntime/test_diffusion.py index 956566f0e1f..07f90e8984e 100644 --- a/tests/onnxruntime/test_diffusion.py +++ b/tests/onnxruntime/test_diffusion.py @@ -34,6 +34,7 @@ ORTPipelineForInpainting, ORTPipelineForText2Image, ) +from optimum.utils import check_if_transformers_greater from optimum.utils.testing_utils import grid_parameters, require_diffusers @@ -71,7 +72,29 @@ def _generate_images(height=128, width=128, batch_size=1, channel=3, input_type= class ORTPipelineForText2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES += ["stable-diffusion-3", "flux"] + + NEGATIVE_PROMPT_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + NEGATIVE_PROMPT_SUPPORTED_ARCHITECTURES += ["stable-diffusion-3"] + + CALLBACK_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + CALLBACK_SUPPORTED_ARCHITECTURES += ["flux"] ORTMODEL_CLASS = ORTPipelineForText2Image 
AUTOMODEL_CLASS = AutoPipelineForText2Image @@ -120,8 +143,8 @@ def test_num_images_per_prompt(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for batch_size in [1, 3]: - for height in [64, 128]: - for width in [64, 128]: + for height in [16, 32]: + for width in [16, 32]: for num_images_per_prompt in [1, 3]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images @@ -142,12 +165,12 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -164,6 +187,7 @@ def __init__(self): def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 + return kwargs ort_callback = Callback() auto_callback = Callback() @@ -171,9 +195,8 @@ def __call__(self, *args, **kwargs) -> None: ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - # callback_steps=1 to trigger callback every step - ort_pipe(**inputs, callback=ort_callback, callback_steps=1) - 
auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + ort_pipe(**inputs, callback_on_step_end=ort_callback) + auto_pipe(**inputs, callback_on_step_end=auto_callback) self.assertTrue(ort_callback.has_been_called) self.assertTrue(auto_callback.has_been_called) @@ -200,10 +223,20 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: - self.assertEqual( - outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), - ) + expected_height = height // pipeline.vae_scale_factor + expected_width = width // pipeline.vae_scale_factor + + if model_arch == "flux": + channels = pipeline.transformer.config.in_channels + expected_shape = (batch_size, expected_height * expected_width, channels) + elif model_arch == "stable-diffusion-3": + out_channels = pipeline.transformer.config.out_channels + expected_shape = (batch_size, out_channels, expected_height, expected_width) + else: + out_channels = pipeline.unet.config.out_channels + expected_shape = (batch_size, out_channels, expected_height, expected_width) + + self.assertEqual(outputs.shape, expected_shape) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -224,45 +257,22 @@ def test_image_reproducibility(self, model_arch: str): self.assertFalse(np.array_equal(ort_outputs_1.images[0], ort_outputs_3.images[0])) np.testing.assert_allclose(ort_outputs_1.images[0], ort_outputs_2.images[0], atol=1e-4, rtol=1e-2) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(NEGATIVE_PROMPT_SUPPORTED_ARCHITECTURES) def test_negative_prompt(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} self._setup(model_args) height, width, batch_size = 64, 64, 1 inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) + inputs["negative_prompt"] = ["This is a negative prompt"] * batch_size - negative_prompt = ["This is a 
negative prompt"] - pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + ort_pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) + diffusers_pipeline = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) - images_1 = pipeline(**inputs, negative_prompt=negative_prompt, generator=get_generator("pt", SEED)).images - prompt = inputs.pop("prompt") - - if model_arch == "stable-diffusion-xl": - ( - inputs["prompt_embeds"], - inputs["negative_prompt_embeds"], - inputs["pooled_prompt_embeds"], - inputs["negative_pooled_prompt_embeds"], - ) = pipeline.encode_prompt( - prompt=prompt, - num_images_per_prompt=1, - device=torch.device("cpu"), - do_classifier_free_guidance=True, - negative_prompt=negative_prompt, - ) - else: - inputs["prompt_embeds"], inputs["negative_prompt_embeds"] = pipeline.encode_prompt( - prompt=prompt, - num_images_per_prompt=1, - device=torch.device("cpu"), - do_classifier_free_guidance=True, - negative_prompt=negative_prompt, - ) - - images_2 = pipeline(**inputs, generator=get_generator("pt", SEED)).images - - np.testing.assert_allclose(images_1, images_2, atol=1e-4, rtol=1e-2) + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) @parameterized.expand( grid_parameters( @@ -285,9 +295,9 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) + self.assertEqual(pipeline.device.type, "cuda") outputs = pipeline(**inputs).images - self.assertIsInstance(outputs, np.ndarray) self.assertEqual(outputs.shape, (batch_size, height, width, 3)) @@ -326,7 +336,19 @@ def test_safety_checker(self, 
model_arch: str): class ORTPipelineForImage2ImageTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl", "latent-consistency"] + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES += ["stable-diffusion-3"] + + CALLBACK_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + "latent-consistency", + ] AUTOMODEL_CLASS = AutoPipelineForImage2Image ORTMODEL_CLASS = ORTPipelineForImage2Image @@ -373,14 +395,14 @@ def test_num_images_per_prompt(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for batch_size in [1, 3]: - for height in [64, 128]: - for width in [64, 128]: + for height in [16, 32]: + for width in [16, 32]: for num_images_per_prompt in [1, 3]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -398,15 +420,16 @@ def __init__(self): def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 + return kwargs ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_callback = Callback() auto_callback = Callback() - # callback_steps=1 to trigger callback every step - ort_pipe(**inputs, callback=ort_callback, callback_steps=1) - auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + ort_pipe(**inputs, callback_on_step_end=ort_callback) + 
auto_pipe(**inputs, callback_on_step_end=auto_callback) self.assertTrue(ort_callback.has_been_called) self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) @@ -434,9 +457,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - (batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -454,10 +487,10 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -541,7 +574,17 @@ def test_safety_checker(self, model_arch: str): class ORTPipelineForInpaintingTest(ORTModelTestMixin): - SUPPORTED_ARCHITECTURES = ["stable-diffusion", "stable-diffusion-xl"] + SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + ] + if check_if_transformers_greater("4.45"): + SUPPORTED_ARCHITECTURES += ["stable-diffusion-3"] + + CALLBACK_SUPPORTED_ARCHITECTURES = [ + "stable-diffusion", + "stable-diffusion-xl", + ] AUTOMODEL_CLASS = 
AutoPipelineForInpainting ORTMODEL_CLASS = ORTPipelineForInpainting @@ -593,14 +636,14 @@ def test_num_images_per_prompt(self, model_arch: str): pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) for batch_size in [1, 3]: - for height in [64, 128]: - for width in [64, 128]: + for height in [16, 32]: + for width in [16, 32]: for num_images_per_prompt in [1, 3]: inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) outputs = pipeline(**inputs, num_images_per_prompt=num_images_per_prompt).images self.assertEqual(outputs.shape, (batch_size * num_images_per_prompt, height, width, 3)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) + @parameterized.expand(CALLBACK_SUPPORTED_ARCHITECTURES) @require_diffusers def test_callback(self, model_arch: str): model_args = {"test_name": model_arch, "model_arch": model_arch} @@ -618,15 +661,16 @@ def __init__(self): def __call__(self, *args, **kwargs) -> None: self.has_been_called = True self.number_of_steps += 1 + return kwargs ort_pipe = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[model_arch]) auto_pipe = self.AUTOMODEL_CLASS.from_pretrained(MODEL_NAMES[model_arch]) ort_callback = Callback() auto_callback = Callback() - # callback_steps=1 to trigger callback every step - ort_pipe(**inputs, callback=ort_callback, callback_steps=1) - auto_pipe(**inputs, callback=auto_callback, callback_steps=1) + + ort_pipe(**inputs, callback_on_step_end=ort_callback) + auto_pipe(**inputs, callback_on_step_end=auto_callback) self.assertTrue(ort_callback.has_been_called) self.assertEqual(ort_callback.number_of_steps, auto_callback.number_of_steps) @@ -654,9 +698,19 @@ def test_shape(self, model_arch: str): elif output_type == "pt": self.assertEqual(outputs.shape, (batch_size, 3, height, width)) else: + out_channels = ( + pipeline.unet.config.out_channels + if pipeline.unet is not None + else pipeline.transformer.config.out_channels + ) self.assertEqual( outputs.shape, - 
(batch_size, 4, height // pipeline.vae_scale_factor, width // pipeline.vae_scale_factor), + ( + batch_size, + out_channels, + height // pipeline.vae_scale_factor, + width // pipeline.vae_scale_factor, + ), ) @parameterized.expand(SUPPORTED_ARCHITECTURES) @@ -674,10 +728,10 @@ def test_compare_to_diffusers_pipeline(self, model_arch: str): for output_type in ["latent", "np", "pt"]: inputs["output_type"] = output_type - ort_output = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images - diffusers_output = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images + ort_images = ort_pipeline(**inputs, generator=get_generator("pt", SEED)).images + diffusers_images = diffusers_pipeline(**inputs, generator=get_generator("pt", SEED)).images - np.testing.assert_allclose(ort_output, diffusers_output, atol=1e-4, rtol=1e-2) + np.testing.assert_allclose(ort_images, diffusers_images, atol=1e-4, rtol=1e-2) @parameterized.expand(SUPPORTED_ARCHITECTURES) @require_diffusers @@ -719,7 +773,7 @@ def test_pipeline_on_gpu(self, test_name: str, model_arch: str, provider: str): inputs = self.generate_inputs(height=height, width=width, batch_size=batch_size) pipeline = self.ORTMODEL_CLASS.from_pretrained(self.onnx_model_dirs[test_name], provider=provider) - self.assertEqual(pipeline.device, "cuda") + self.assertEqual(pipeline.device.type, "cuda") outputs = pipeline(**inputs).images self.assertIsInstance(outputs, np.ndarray) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 84ac27029f9..c4340dcd8b6 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -143,7 +143,7 @@ class ORTModelIntegrationTest(unittest.TestCase): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.TEST_MODEL_ID = "sshleifer/tiny-distilbert-base-cased-distilled-squad" - self.LOCAL_MODEL_PATH = "assets/onnx" + self.LOCAL_MODEL_PATH = "tests/assets/onnx" self.ONNX_MODEL_ID = 
"philschmid/distilbert-onnx" self.TINY_ONNX_MODEL_ID = "fxmarty/resnet-tiny-beans" self.FAIL_ONNX_MODEL_ID = "sshleifer/tiny-distilbert-base-cased-distilled-squad" diff --git a/tests/onnxruntime/test_quantization.py b/tests/onnxruntime/test_quantization.py index 34a9504f95a..cf451590fbd 100644 --- a/tests/onnxruntime/test_quantization.py +++ b/tests/onnxruntime/test_quantization.py @@ -42,10 +42,10 @@ class ORTQuantizerTest(unittest.TestCase): LOAD_CONFIGURATION = { "local_asset": { - "model_or_path": "assets/onnx", + "model_or_path": "tests/assets/onnx", }, "local_asset_different_name": { - "model_or_path": "assets/onnx", + "model_or_path": "tests/assets/onnx", "file_name": "different_name.onnx", }, "ort_model_class": { diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 9f200e69b3d..ba8f6cc4abc 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -98,6 +98,7 @@ }, "falcon": "fxmarty/really-tiny-falcon-testing", "flaubert": "hf-internal-testing/tiny-random-flaubert", + "flux": "optimum-internal-testing/tiny-random-flux", "gemma": "fxmarty/tiny-random-GemmaForCausalLM", "gpt2": "hf-internal-testing/tiny-random-gpt2", "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", @@ -108,10 +109,10 @@ "groupvit": "hf-internal-testing/tiny-random-groupvit", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", - "levit": "hf-internal-testing/tiny-random-LevitModel", "latent-consistency": "echarlaix/tiny-random-latent-consistency", "layoutlm": "hf-internal-testing/tiny-random-LayoutLMModel", "layoutlmv3": "hf-internal-testing/tiny-random-LayoutLMv3Model", + "levit": "hf-internal-testing/tiny-random-LevitModel", "longt5": "hf-internal-testing/tiny-random-LongT5Model", "llama": "optimum-internal-testing/tiny-random-llama", "m2m_100": "hf-internal-testing/tiny-random-m2m_100", @@ -143,6 +144,7 
@@ "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "speech_to_text": "hf-internal-testing/tiny-random-Speech2TextModel", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", + "stable-diffusion-3": "optimum-internal-testing/tiny-random-stable-diffusion-3", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224", From d2a5a6aa2adbe9561527a85c4a4947a6d7fcfa58 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 21 Nov 2024 15:03:14 +0100 Subject: [PATCH 51/73] Remove datasets as required dependency (#2087) * remove datasets required dependency * install datasets when needed * add datasets installed when needed * style * add require dataset * divide datasets tests * import datasets only when needed --- .github/workflows/dev_test_benckmark.yml | 2 +- .github/workflows/test_benckmark.yml | 2 +- .github/workflows/test_utils.yml | 11 ++++++++++- optimum/gptq/data.py | 16 ++++++++++++++- optimum/gptq/quantizer.py | 2 +- optimum/onnxruntime/configuration.py | 15 +++++++++----- optimum/onnxruntime/model.py | 9 ++++++--- optimum/onnxruntime/quantization.py | 17 ++++++++++------ optimum/onnxruntime/runs/calibrator.py | 10 ++++++---- optimum/runs_base.py | 8 +++++--- optimum/utils/__init__.py | 1 + optimum/utils/import_utils.py | 12 ++++++++++++ optimum/utils/preprocessing/base.py | 19 +++++++++++++----- optimum/utils/testing_utils.py | 5 +++++ pyproject.toml | 1 + setup.py | 3 --- tests/utils/test_task_processors.py | 25 +++++++++++++++++++++++- 17 files changed, 123 insertions(+), 35 deletions(-) diff --git a/.github/workflows/dev_test_benckmark.yml b/.github/workflows/dev_test_benckmark.yml index a898d288625..381197b129a 100644 --- a/.github/workflows/dev_test_benckmark.yml +++ b/.github/workflows/dev_test_benckmark.yml @@ -23,7 +23,7 @@ jobs: - name: Install 
dependencies run: | pip install wheel - pip install .[tests,onnxruntime,benchmark] + pip install .[tests,onnxruntime,benchmark] datasets pip install -U git+https://github.com/huggingface/evaluate pip install -U git+https://github.com/huggingface/diffusers pip install -U git+https://github.com/huggingface/transformers diff --git a/.github/workflows/test_benckmark.yml b/.github/workflows/test_benckmark.yml index e859e845d64..fe7df1a20cc 100644 --- a/.github/workflows/test_benckmark.yml +++ b/.github/workflows/test_benckmark.yml @@ -30,7 +30,7 @@ jobs: - name: Install dependencies run: | pip install wheel - pip install .[tests,onnxruntime,benchmark] + pip install .[tests,onnxruntime,benchmark] datasets - name: Test with unittest run: | python -m unittest discover --start-directory tests/benchmark --pattern 'test_*.py' diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml index 0126b023c60..bbe00e62841 100644 --- a/.github/workflows/test_utils.yml +++ b/.github/workflows/test_utils.yml @@ -37,4 +37,13 @@ jobs: - name: Test with pytest working-directory: tests run: | - python -m pytest -s -vvvv utils + pytest utils -s -n auto -m "not datasets_test" --durations=0 + + - name: Install datasets + run: | + pip install datasets + + - name: Tests needing datasets + working-directory: tests + run: | + pytest utils -s -n auto -m "datasets_test" --durations=0 \ No newline at end of file diff --git a/optimum/gptq/data.py b/optimum/gptq/data.py index b8734da478e..7e5fc0b43db 100644 --- a/optimum/gptq/data.py +++ b/optimum/gptq/data.py @@ -18,7 +18,12 @@ import numpy as np import torch -from datasets import load_dataset + +from optimum.utils.import_utils import DATASETS_IMPORT_ERROR, is_datasets_available + + +if is_datasets_available(): + from datasets import load_dataset """ @@ -113,6 +118,9 @@ def pad_block(block, pads): def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): + if not is_datasets_available(): + raise 
ImportError(DATASETS_IMPORT_ERROR.format("get_wikitext2")) + if split == "train": data = load_dataset("wikitext", "wikitext-2-raw-v1", split="train") elif split == "validation": @@ -132,6 +140,9 @@ def get_wikitext2(tokenizer: Any, seqlen: int, nsamples: int, split: str = "trai def get_c4(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4")) + if split == "train": data = load_dataset("allenai/c4", split="train", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}) elif split == "validation": @@ -157,6 +168,9 @@ def get_c4(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): def get_c4_new(tokenizer: Any, seqlen: int, nsamples: int, split: str = "train"): + if not is_datasets_available(): + raise ImportError(DATASETS_IMPORT_ERROR.format("get_c4_new")) + if split == "train": data = load_dataset("allenai/c4", split="train", data_files={"train": "en/c4-train.00000-of-01024.json.gz"}) elif split == "validation": diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 949d4d260df..849d8821ebf 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -88,7 +88,7 @@ def __init__( dataset (`Union[List[str], str, Any]`, defaults to `None`): The dataset used for quantization. You can provide your own dataset in a list of string or in a list of tokenized data (e.g. [{ "input_ids": [ 1, 100, 15, ... ],"attention_mask": [ 1, 1, 1, ... ]},...]) - or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new','ptb','ptb-new']. + or just use the original datasets used in GPTQ paper ['wikitext2','c4','c4-new']. group_size (int, defaults to 128): The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization. 
damp_percent (`float`, defaults to `0.1`): diff --git a/optimum/onnxruntime/configuration.py b/optimum/onnxruntime/configuration.py index 2e3d9f32d6a..adc1984795a 100644 --- a/optimum/onnxruntime/configuration.py +++ b/optimum/onnxruntime/configuration.py @@ -18,9 +18,8 @@ from dataclasses import asdict, dataclass, field from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union -from datasets import Dataset from packaging.version import Version, parse from onnxruntime import __version__ as ort_version @@ -33,6 +32,10 @@ from ..utils import logging +if TYPE_CHECKING: + from datasets import Dataset + + logger = logging.get_logger(__name__) # This value is used to indicate ORT which axis it should use to quantize an operator "per-channel" @@ -117,7 +120,9 @@ def create_calibrator( class AutoCalibrationConfig: @staticmethod - def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: float = 0.01) -> CalibrationConfig: + def minmax( + dataset: "Dataset", moving_average: bool = False, averaging_constant: float = 0.01 + ) -> CalibrationConfig: """ Args: dataset (`Dataset`): @@ -151,7 +156,7 @@ def minmax(dataset: Dataset, moving_average: bool = False, averaging_constant: f @staticmethod def entropy( - dataset: Dataset, + dataset: "Dataset", num_bins: int = 128, num_quantized_bins: int = 128, ) -> CalibrationConfig: @@ -188,7 +193,7 @@ def entropy( ) @staticmethod - def percentiles(dataset: Dataset, num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig: + def percentiles(dataset: "Dataset", num_bins: int = 2048, percentile: float = 99.999) -> CalibrationConfig: """ Args: dataset (`Dataset`): diff --git a/optimum/onnxruntime/model.py b/optimum/onnxruntime/model.py index caa662f3824..4182abc925f 100644 --- a/optimum/onnxruntime/model.py +++ b/optimum/onnxruntime/model.py @@ -14,10 +14,9 @@ import logging import os 
-from typing import Callable, Dict, List, Optional, Union +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union import numpy as np -from datasets import Dataset from transformers import EvalPrediction from transformers.trainer_pt_utils import nested_concat from transformers.trainer_utils import EvalLoopOutput @@ -25,6 +24,10 @@ from onnxruntime import InferenceSession +if TYPE_CHECKING: + from datasets import Dataset + + logger = logging.getLogger(__name__) @@ -59,7 +62,7 @@ def __init__( self.session = InferenceSession(str(model_path), providers=[execution_provider]) self.onnx_input_names = {input_key.name: idx for idx, input_key in enumerate(self.session.get_inputs())} - def evaluation_loop(self, dataset: Dataset): + def evaluation_loop(self, dataset: "Dataset"): """ Run evaluation and returns metrics and predictions. diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py index f637916dcd2..054a2310a6b 100644 --- a/optimum/onnxruntime/quantization.py +++ b/optimum/onnxruntime/quantization.py @@ -21,7 +21,6 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple, Union import onnx -from datasets import Dataset, load_dataset from packaging.version import Version, parse from transformers import AutoConfig @@ -29,6 +28,7 @@ from onnxruntime.quantization import CalibrationDataReader, QuantFormat, QuantizationMode, QuantType from onnxruntime.quantization.onnx_quantizer import ONNXQuantizer from onnxruntime.quantization.qdq_quantizer import QDQQuantizer +from optimum.utils.import_utils import requires_backends from ..quantization_base import OptimumQuantizer from ..utils.save_utils import maybe_save_preprocessors @@ -40,6 +40,7 @@ if TYPE_CHECKING: + from datasets import Dataset from transformers import PretrainedConfig LOGGER = logging.getLogger(__name__) @@ -48,7 +49,7 @@ class ORTCalibrationDataReader(CalibrationDataReader): __slots__ = ["batch_size", "dataset", "_dataset_iter"] - def 
__init__(self, dataset: Dataset, batch_size: int = 1): + def __init__(self, dataset: "Dataset", batch_size: int = 1): if dataset is None: raise ValueError("Provided dataset is None.") @@ -158,7 +159,7 @@ def from_pretrained( def fit( self, - dataset: Dataset, + dataset: "Dataset", calibration_config: CalibrationConfig, onnx_augmented_model_name: Union[str, Path] = "augmented_model.onnx", operators_to_quantize: Optional[List[str]] = None, @@ -212,7 +213,7 @@ def fit( def partial_fit( self, - dataset: Dataset, + dataset: "Dataset", calibration_config: CalibrationConfig, onnx_augmented_model_name: Union[str, Path] = "augmented_model.onnx", operators_to_quantize: Optional[List[str]] = None, @@ -428,7 +429,7 @@ def get_calibration_dataset( seed: int = 2016, use_auth_token: Optional[Union[bool, str]] = None, token: Optional[Union[bool, str]] = None, - ) -> Dataset: + ) -> "Dataset": """ Creates the calibration `datasets.Dataset` to use for the post-training static quantization calibration step. @@ -474,6 +475,10 @@ def get_calibration_dataset( "provided." 
) + requires_backends(self, ["datasets"]) + + from datasets import load_dataset + calib_dataset = load_dataset( dataset_name, name=dataset_config_name, @@ -492,7 +497,7 @@ def get_calibration_dataset( return self.clean_calibration_dataset(processed_calib_dataset) - def clean_calibration_dataset(self, dataset: Dataset) -> Dataset: + def clean_calibration_dataset(self, dataset: "Dataset") -> "Dataset": model = onnx.load(self.onnx_model_path) model_inputs = {input.name for input in model.graph.input} ignored_columns = list(set(dataset.column_names) - model_inputs) diff --git a/optimum/onnxruntime/runs/calibrator.py b/optimum/onnxruntime/runs/calibrator.py index c493a943747..bfdcd64d92e 100644 --- a/optimum/onnxruntime/runs/calibrator.py +++ b/optimum/onnxruntime/runs/calibrator.py @@ -1,6 +1,4 @@ -from typing import Dict, List - -from datasets import Dataset +from typing import TYPE_CHECKING, Dict, List from ...runs_base import Calibrator from .. import ORTQuantizer @@ -9,10 +7,14 @@ from ..preprocessors.passes import ExcludeGeLUNodes, ExcludeLayerNormNodes, ExcludeNodeAfter, ExcludeNodeFollowedBy +if TYPE_CHECKING: + from datasets import Dataset + + class OnnxRuntimeCalibrator(Calibrator): def __init__( self, - calibration_dataset: Dataset, + calibration_dataset: "Dataset", quantizer: ORTQuantizer, model_path: str, qconfig: QuantizationConfig, diff --git a/optimum/runs_base.py b/optimum/runs_base.py index 3a1d164c602..dadd445818f 100644 --- a/optimum/runs_base.py +++ b/optimum/runs_base.py @@ -2,13 +2,12 @@ import subprocess from contextlib import contextmanager from time import perf_counter_ns -from typing import Set +from typing import TYPE_CHECKING, Set import numpy as np import optuna import torch import transformers -from datasets import Dataset from tqdm import trange from . 
import version as optimum_version @@ -21,6 +20,9 @@ from .utils.runs import RunConfig, cpu_info_command +if TYPE_CHECKING: + from datasets import Dataset + os.environ["TOKENIZERS_PARALLELISM"] = "false" @@ -34,7 +36,7 @@ def get_autoclass_name(task): class Calibrator: def __init__( - self, calibration_dataset: Dataset, quantizer, model_path, qconfig, calibration_params, node_exclusion + self, calibration_dataset: "Dataset", quantizer, model_path, qconfig, calibration_params, node_exclusion ): self.calibration_dataset = calibration_dataset self.quantizer = quantizer diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 40d93d298e4..fb1794af49c 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -35,6 +35,7 @@ check_if_transformers_greater, is_accelerate_available, is_auto_gptq_available, + is_datasets_available, is_diffusers_available, is_onnx_available, is_onnxruntime_available, diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 35a6294ab52..405e3815b33 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -69,6 +69,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _auto_gptq_available = _is_package_available("auto_gptq") _timm_available = _is_package_available("timm") _sentence_transformers_available = _is_package_available("sentence_transformers") +_datasets_available = _is_package_available("datasets") torch_version = None if is_torch_available(): @@ -131,6 +132,10 @@ def is_sentence_transformers_available(): return _sentence_transformers_available +def is_datasets_available(): + return _datasets_available + + def is_auto_gptq_available(): if _auto_gptq_available: version_autogptq = version.parse(importlib_metadata.version("auto_gptq")) @@ -230,6 +235,12 @@ def require_numpy_strictly_lower(package_version: str, message: str): -U transformers`. Please note that you may need to restart your runtime after installation. 
""" +DATASETS_IMPORT_ERROR = """ +{0} requires the datasets library but it was not found in your environment. You can install it with pip: +`pip install datasets`. Please note that you may need to restart your runtime after installation. +""" + + BACKENDS_MAPPING = OrderedDict( [ ("diffusers", (is_diffusers_available, DIFFUSERS_IMPORT_ERROR)), @@ -245,6 +256,7 @@ def require_numpy_strictly_lower(package_version: str, message: str): "transformers_434", (lambda: check_if_transformers_greater("4.34"), "{0} " + TRANSFORMERS_IMPORT_ERROR.format("4.34")), ), + ("datasets", (is_datasets_available, DATASETS_IMPORT_ERROR)), ] ) diff --git a/optimum/utils/preprocessing/base.py b/optimum/utils/preprocessing/base.py index dc995ccc50b..7cfda13ba7d 100644 --- a/optimum/utils/preprocessing/base.py +++ b/optimum/utils/preprocessing/base.py @@ -20,15 +20,16 @@ from abc import ABC, abstractmethod from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union -from datasets import Dataset, DatasetDict -from datasets import load_dataset as datasets_load_dataset from transformers import PreTrainedTokenizerBase from transformers.image_processing_utils import BaseImageProcessor +from optimum.utils.import_utils import requires_backends + from .. 
import logging if TYPE_CHECKING: + from datasets import Dataset, DatasetDict from transformers import PretrainedConfig @@ -102,11 +103,14 @@ def create_dataset_processing_func( def prepare_dataset( self, - dataset: Union[DatasetDict, Dataset], + dataset: Union["DatasetDict", "Dataset"], data_keys: Dict[str, str], ref_keys: Optional[List[str]] = None, split: Optional[str] = None, - ) -> Union[DatasetDict, Dataset]: + ) -> Union["DatasetDict", "Dataset"]: + requires_backends(self, ["datasets"]) + from datasets import Dataset + if isinstance(dataset, Dataset) and split is not None: raise ValueError("A Dataset and a split name were provided, but splits are for DatasetDict.") elif split is not None: @@ -131,7 +135,12 @@ def load_dataset( num_samples: Optional[int] = None, shuffle: bool = False, **load_dataset_kwargs, - ) -> Union[DatasetDict, Dataset]: + ) -> Union["DatasetDict", "Dataset"]: + requires_backends(self, ["datasets"]) + + from datasets import Dataset, DatasetDict + from datasets import load_dataset as datasets_load_dataset + dataset = datasets_load_dataset(path, **load_dataset_kwargs) if isinstance(dataset, DatasetDict) and load_smallest_split: diff --git a/optimum/utils/testing_utils.py b/optimum/utils/testing_utils.py index 76fe9a05b13..88b1acdb780 100644 --- a/optimum/utils/testing_utils.py +++ b/optimum/utils/testing_utils.py @@ -28,6 +28,7 @@ from . 
import ( is_accelerate_available, is_auto_gptq_available, + is_datasets_available, is_diffusers_available, is_sentence_transformers_available, is_timm_available, @@ -146,6 +147,10 @@ def require_sentence_transformers(test_case): return unittest.skipUnless(is_sentence_transformers_available(), "test requires sentence-transformers")(test_case) +def require_datasets(test_case): + return unittest.skipUnless(is_datasets_available(), "test requires datasets")(test_case) + + def grid_parameters( parameters: Dict[str, Iterable[Any]], yield_dict: bool = False, diff --git a/pyproject.toml b/pyproject.toml index 99a0f1c85fa..17bcd90e066 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ markers = [ "rocm_ep_test", "tensorflow_test", "timm_test", + "datasets_test", "run_in_series", "run_slow", "accelerate_test", diff --git a/setup.py b/setup.py index 29f97b604e0..6736085943a 100644 --- a/setup.py +++ b/setup.py @@ -13,14 +13,11 @@ REQUIRED_PKGS = [ - "coloredlogs", - "sympy", "transformers>=4.29", "torch>=1.11", "packaging", "numpy", "huggingface_hub>=0.8.0", - "datasets", ] # TODO: unpin pytest once https://github.com/huggingface/transformers/pull/29154 is merged & released diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py index 16567048073..1a9f352a79f 100644 --- a/tests/utils/test_task_processors.py +++ b/tests/utils/test_task_processors.py @@ -19,16 +19,21 @@ from typing import TYPE_CHECKING, Any, Dict, Tuple, Union from unittest import TestCase -from datasets import DatasetDict +import pytest from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer +from optimum.utils.import_utils import is_datasets_available from optimum.utils.preprocessing import TaskProcessorsManager +from optimum.utils.testing_utils import require_datasets if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedTokenizerBase from transformers.image_processing_utils import BaseImageProcessor +if is_datasets_available(): 
+ from datasets import DatasetDict + TEXT_MODEL_NAME = "bert-base-uncased" CONFIG = AutoConfig.from_pretrained(TEXT_MODEL_NAME) @@ -122,6 +127,8 @@ def test_create_defaults_and_kwargs_from_preprocessor_kwargs_does_not_mutate_pre ) self.assertDictEqual(preprocessor_kwargs, clone) + @require_datasets + @pytest.mark.datasets_test def test_load_dataset_unallowed_data_keys(self): task_processor = TaskProcessorsManager.get_task_processor_class_for_task(self.TASK_NAME)( self.CONFIG, self.PREPROCESSOR @@ -188,15 +195,23 @@ def _test_load_dataset( return dataset + @require_datasets + @pytest.mark.datasets_test def test_load_dataset(self): return self._test_load_dataset(False, False, False) + @require_datasets + @pytest.mark.datasets_test def test_load_dataset_by_guessing_data_keys(self): return self._test_load_dataset(False, True, False) + @require_datasets + @pytest.mark.datasets_test def test_load_dataset_and_only_keep_necessary_columns(self): return self._test_load_dataset(False, False, True) + @require_datasets + @pytest.mark.datasets_test def test_load_default_dataset(self): return self._test_load_dataset(True, False, False) @@ -207,6 +222,8 @@ class TextClassificationProcessorTest(TestCase, TaskProcessorTestBase): PREPROCESSOR = TOKENIZER WRONG_PREPROCESSOR = IMAGE_PROCESSOR + @require_datasets + @pytest.mark.datasets_test def test_load_dataset_with_max_length(self): max_length = random.randint(4, 16) dataset = self._test_load_dataset(False, False, True, max_length=max_length) @@ -223,6 +240,8 @@ class TokenClassificationProcessorTest(TestCase, TaskProcessorTestBase): PREPROCESSOR = TOKENIZER WRONG_PREPROCESSOR = IMAGE_PROCESSOR + @require_datasets + @pytest.mark.datasets_test def test_load_dataset_with_max_length(self): max_length = random.randint(4, 16) dataset = self._test_load_dataset(False, False, True, max_length=max_length) @@ -232,6 +251,8 @@ def test_load_dataset_with_max_length(self): input_ids = dataset[0]["input_ids"] self.assertEqual(len(input_ids), 
max_length) + @require_datasets + @pytest.mark.datasets_test def test_load_default_dataset(self): self.skipTest( "Skipping so as not to execute conll2003 remote code (test would require trust_remote_code=True)" @@ -244,6 +265,8 @@ class QuestionAnsweringProcessorTest(TestCase, TaskProcessorTestBase): PREPROCESSOR = TOKENIZER WRONG_PREPROCESSOR = IMAGE_PROCESSOR + @require_datasets + @pytest.mark.datasets_test def test_load_dataset_with_max_length(self): max_length = 384 dataset = self._test_load_dataset(False, False, True, max_length=max_length) From 65a8a94adaf136dd677d28cfc837c0acfe993031 Mon Sep 17 00:00:00 2001 From: Raghu Ramarao Date: Mon, 25 Nov 2024 18:30:00 +0530 Subject: [PATCH 52/73] Add ONNX Support for Decision Transformer Model (#2038) * Decision Transformer to ONNX V0.1 * Decision Transformer to ONNX V0.2 * Update optimum/exporters/onnx/model_configs.py * Apply suggestions from code review * Update optimum/exporters/onnx/base.py * Update optimum/exporters/onnx/model_configs.py * Update optimum/utils/input_generators.py * Update optimum/exporters/onnx/model_configs.py * Apply suggestions from code review * Update optimum/exporters/tasks.py * ONNXToDT: changes to order of OrderedDict elements * make style changes * test * remove custom normalized config * remove unncessary dynamic axes --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Co-authored-by: IlyasMoutawwakil --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 25 +++++++++++++++++ optimum/exporters/tasks.py | 9 ++++++ optimum/utils/__init__.py | 1 + optimum/utils/input_generators.py | 37 +++++++++++++++++++++++++ tests/exporters/exporters_utils.py | 1 + 6 files changed, 74 insertions(+) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 747e1396fb4..2eaada7dadd 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx 
@@ -36,6 +36,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Data2VecVision - Deberta - Deberta-v2 +- Decision Transformer - Deit - Detr - DistilBert diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 8984162ee8c..bca7cf24acf 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -27,6 +27,7 @@ BloomDummyPastKeyValuesGenerator, DummyAudioInputGenerator, DummyCodegenDecoderTextInputGenerator, + DummyDecisionTransformerInputGenerator, DummyDecoderTextInputGenerator, DummyEncodecInputGenerator, DummyFluxTransformerTextInputGenerator, @@ -263,6 +264,30 @@ class ImageGPTOnnxConfig(GPT2OnnxConfig): pass +class DecisionTransformerOnnxConfig(OnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (DummyDecisionTransformerInputGenerator,) + NORMALIZED_CONFIG_CLASS = NormalizedConfig + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "states": {0: "batch_size", 1: "sequence_length"}, + "actions": {0: "batch_size", 1: "sequence_length"}, + "timesteps": {0: "batch_size", 1: "sequence_length"}, + "returns_to_go": {0: "batch_size", 1: "sequence_length"}, + "attention_mask": {0: "batch_size", 1: "sequence_length"}, + } + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "state_preds": {0: "batch_size", 1: "sequence_length"}, + "action_preds": {0: "batch_size", 1: "sequence_length"}, + "return_preds": {0: "batch_size", 1: "sequence_length"}, + "last_hidden_state": {0: "batch_size", 1: "sequence_length"}, + } + + class GPTNeoOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_attention_heads="num_heads") diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index b4bce4696f3..8f28ec42ce9 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -217,6 +217,7 @@ class TasksManager: 
"multiple-choice": "AutoModelForMultipleChoice", "object-detection": "AutoModelForObjectDetection", "question-answering": "AutoModelForQuestionAnswering", + "reinforcement-learning": "AutoModel", "semantic-segmentation": "AutoModelForSemanticSegmentation", "text-to-audio": ("AutoModelForTextToSpectrogram", "AutoModelForTextToWaveform"), "text-generation": "AutoModelForCausalLM", @@ -574,6 +575,11 @@ class TasksManager: onnx="DebertaV2OnnxConfig", tflite="DebertaV2TFLiteConfig", ), + "decision-transformer": supported_tasks_mapping( + "feature-extraction", + "reinforcement-learning", + onnx="DecisionTransformerOnnxConfig", + ), "deit": supported_tasks_mapping( "feature-extraction", "image-classification", @@ -2085,6 +2091,9 @@ def get_model_from_task( if original_task == "automatic-speech-recognition" or task == "automatic-speech-recognition": if original_task == "auto" and config.architectures is not None: model_class_name = config.architectures[0] + elif original_task == "reinforcement-learning" or task == "reinforcement-learning": + if config.architectures is not None: + model_class_name = config.architectures[0] if library_name == "diffusers": config = DiffusionPipeline.load_config(model_name_or_path, **kwargs) diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index fb1794af49c..2aa90253d08 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -53,6 +53,7 @@ DummyAudioInputGenerator, DummyBboxInputGenerator, DummyCodegenDecoderTextInputGenerator, + DummyDecisionTransformerInputGenerator, DummyDecoderTextInputGenerator, DummyEncodecInputGenerator, DummyFluxTransformerTextInputGenerator, diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index 148072aa0b4..0ac1805f97d 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -507,6 +507,43 @@ class DummyDecoderTextInputGenerator(DummyTextInputGenerator): ) +class 
DummyDecisionTransformerInputGenerator(DummyTextInputGenerator): + """ + Generates dummy decision transformer inputs. + """ + + SUPPORTED_INPUT_NAMES = ( + "states", + "actions", + "timesteps", + "returns_to_go", + "attention_mask", + ) + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.act_dim = self.normalized_config.config.act_dim + self.state_dim = self.normalized_config.config.state_dim + self.max_ep_len = self.normalized_config.config.max_ep_len + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "states": + shape = [self.batch_size, self.sequence_length, self.state_dim] + elif input_name == "actions": + shape = [self.batch_size, self.sequence_length, self.act_dim] + elif input_name == "rewards": + shape = [self.batch_size, self.sequence_length, 1] + elif input_name == "returns_to_go": + shape = [self.batch_size, self.sequence_length, 1] + elif input_name == "attention_mask": + shape = [self.batch_size, self.sequence_length] + elif input_name == "timesteps": + shape = [self.batch_size, self.sequence_length] + return self.random_int_tensor(shape=shape, max_value=self.max_ep_len, framework=framework, dtype=int_dtype) + + return self.random_float_tensor(shape, min_value=-2.0, max_value=2.0, framework=framework, dtype=float_dtype) + + class DummySeq2SeqDecoderTextInputGenerator(DummyDecoderTextInputGenerator): SUPPORTED_INPUT_NAMES = ( "decoder_input_ids", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 31059c403de..c56132c384c 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -67,6 +67,7 @@ "data2vec-audio": "hf-internal-testing/tiny-random-Data2VecAudioModel", "deberta": "hf-internal-testing/tiny-random-DebertaModel", "deberta-v2": "hf-internal-testing/tiny-random-DebertaV2Model", + "decision-transformer": "edbeeching/decision-transformer-gym-hopper-medium", 
"deit": "hf-internal-testing/tiny-random-DeiTModel", "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder", "donut-swin": "hf-internal-testing/tiny-random-DonutSwinModel", From a6c696c7de105e7691d432dd80102beec78d8fd4 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Tue, 26 Nov 2024 20:52:43 +0100 Subject: [PATCH 53/73] Generate guidance for flux (#2104) generate guidance --- optimum/onnxruntime/modeling_diffusion.py | 17 +++++++++++++++-- optimum/utils/input_generators.py | 4 ++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index 79d302be449..66b08e1ef66 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -437,8 +437,21 @@ def to(self, device: Union[torch.device, str, int]): def _load_config(cls, config_name_or_path: Union[str, os.PathLike], **kwargs): return cls.load_config(config_name_or_path, **kwargs) - def _save_config(self, save_directory): - self.save_config(save_directory) + def _save_config(self, save_directory: Union[str, Path]): + model_dir = ( + self.model_save_dir + if not isinstance(self.model_save_dir, TemporaryDirectory) + else self.model_save_dir.name + ) + save_dir = Path(save_directory) + original_config = Path(model_dir) / self.config_name + if original_config.exists(): + if not save_dir.exists(): + save_dir.mkdir(parents=True) + + shutil.copy(original_config, save_dir) + else: + self.save_config(save_directory) @property def components(self) -> Dict[str, Any]: diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index 0ac1805f97d..fbb77e6800a 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -1508,6 +1508,7 @@ class DummyFluxTransformerTextInputGenerator(DummyTransformerTextInputGenerator) SUPPORTED_INPUT_NAMES = ( "encoder_hidden_states", "pooled_projections", 
+ "guidance", "txt_ids", ) @@ -1519,5 +1520,8 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int else [self.batch_size, self.sequence_length, 3] ) return self.random_int_tensor(shape, max_value=1, framework=framework, dtype=int_dtype) + elif input_name == "guidance": + shape = [self.batch_size] + return self.random_float_tensor(shape, min_value=0, max_value=1, framework=framework, dtype=float_dtype) return super().generate(input_name, framework, int_dtype, float_dtype) From bd08f12d2d4ebffdb2a25e32eabab759e4de88e5 Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Thu, 28 Nov 2024 15:13:11 +0100 Subject: [PATCH 54/73] Unbundle inputs generated by `DummyTimestepInputGenerator` (#2107) unbundle --- optimum/utils/input_generators.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index fbb77e6800a..18a2a5a3fd1 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -897,14 +897,14 @@ def __init__( ): self.task = task self.vocab_size = normalized_config.vocab_size - self.text_encoder_projection_dim = normalized_config.text_encoder_projection_dim - self.time_ids = 5 if normalized_config.requires_aesthetics_score else 6 + self.text_encoder_projection_dim = getattr(normalized_config, "text_encoder_projection_dim", None) + self.time_ids = 5 if getattr(normalized_config, "requires_aesthetics_score", False) else 6 if random_batch_size_range: low, high = random_batch_size_range self.batch_size = random.randint(low, high) else: self.batch_size = batch_size - self.time_cond_proj_dim = normalized_config.config.time_cond_proj_dim + self.time_cond_proj_dim = getattr(normalized_config.config, "time_cond_proj_dim", None) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "timestep": @@ 
-912,8 +912,16 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) if input_name == "text_embeds": + if self.text_encoder_projection_dim is None: + raise ValueError( + "Unable to infer the value of `text_encoder_projection_dim` for generating `text_embeds`, please double check the config of your model." + ) dim = self.text_encoder_projection_dim elif input_name == "timestep_cond": + if self.time_cond_proj_dim is None: + raise ValueError( + "Unable to infer the value of `time_cond_proj_dim` for generating `timestep_cond`, please double check the config of your model." + ) dim = self.time_cond_proj_dim else: dim = self.time_ids From 28bd0ad8fccfb6dd8019cd2882a88d69386a134c Mon Sep 17 00:00:00 2001 From: Brando Tovar <44623235+bndos@users.noreply.github.com> Date: Thu, 28 Nov 2024 10:13:05 -0500 Subject: [PATCH 55/73] Pass the revision to SentenceTransformer models (#2105) feat: pass revision to SentenceTransformers --- optimum/exporters/tasks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 8f28ec42ce9..c50fa5cdfa4 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -2128,6 +2128,7 @@ def get_model_from_task( device=device, cache_folder=cache_folder, token=token, + revision=revision, trust_remote_code=trust_remote_code, ) else: From f22655c036e4e61a7b09748e7aa7e146a16ae64d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Mlyn=C3=A1=C5=99?= <47664722+mlynatom@users.noreply.github.com> Date: Mon, 2 Dec 2024 14:54:08 +0100 Subject: [PATCH 56/73] Add RemBERT ONNX support (#2108) * ONNX config for RemBERT added * added RemBERT to TasksManager * rembert added to exporters_utils * RemBERT added to test modelling tasks * changed rembert model * added RemBERT to test utils * Added RemBERT to documentation * Apply suggestions from code review 
--------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 4 ++++ optimum/exporters/tasks.py | 9 +++++++++ tests/exporters/exporters_utils.py | 3 ++- tests/onnxruntime/test_modeling.py | 5 +++++ tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 6 files changed, 22 insertions(+), 1 deletion(-) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 2eaada7dadd..57005b85678 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -83,6 +83,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - PoolFormer - Qwen2(Qwen1.5) - RegNet +- RemBERT - ResNet - Roberta - Roformer diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index bca7cf24acf..b39d19ec782 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -162,6 +162,10 @@ class SplinterOnnxConfig(BertOnnxConfig): DEFAULT_ONNX_OPSET = 11 +class RemBertOnnxConfig(BertOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + + class DistilBertOnnxConfig(BertOnnxConfig): DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for transformers>=4.46.0 diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index c50fa5cdfa4..0a3758e97cf 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -431,6 +431,15 @@ class TasksManager: onnx="BertOnnxConfig", tflite="BertTFLiteConfig", ), + "rembert": supported_tasks_mapping( + "fill-mask", + "feature-extraction", + "text-classification", + "multiple-choice", + "token-classification", + "question-answering", + onnx="RemBertOnnxConfig", + ), # For big-bird and bigbird-pegasus being unsupported, refer to model_configs.py # "big-bird": supported_tasks_mapping( # "feature-extraction", diff --git 
a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index c56132c384c..32156d9eebf 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -138,6 +138,7 @@ "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", # "rembert": "google/rembert", + "rembert": "hf-internal-testing/tiny-random-RemBertModel", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen2": "fxmarty/tiny-dummy-qwen2", "regnet": "hf-internal-testing/tiny-random-RegNetModel", @@ -257,7 +258,7 @@ "owlv2": "google/owlv2-base-patch16", "owlvit": "google/owlvit-base-patch32", "perceiver": "hf-internal-testing/tiny-random-PerceiverModel", # Not using deepmind/language-perceiver because it takes too much time for testing. - # "rembert": "google/rembert", + "rembert": "google/rembert", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "regnet": "facebook/regnet-y-040", "resnet": "microsoft/resnet-50", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index c4340dcd8b6..8f52ef45180 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -1312,6 +1312,7 @@ class ORTModelForQuestionAnsweringIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm_qa", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -1502,6 +1503,7 @@ class ORTModelForMaskedLMIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -1682,6 +1684,7 @@ class ORTModelForSequenceClassificationIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ -1882,6 +1885,7 @@ class ORTModelForTokenClassificationIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} @@ 
-2227,6 +2231,7 @@ class ORTModelForMultipleChoiceIntegrationTest(ORTModelTestMixin): "squeezebert", "xlm", "xlm_roberta", + "rembert", ] FULL_GRID = {"model_arch": SUPPORTED_ARCHITECTURES} diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index ba8f6cc4abc..cccecd53817 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -135,6 +135,7 @@ "pix2struct": "fxmarty/pix2struct-tiny-random", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen2": "fxmarty/tiny-dummy-qwen2", + "rembert": "hf-internal-testing/tiny-random-RemBertModel", "resnet": "hf-internal-testing/tiny-random-resnet", "roberta": "hf-internal-testing/tiny-random-RobertaModel", "roformer": "hf-internal-testing/tiny-random-RoFormerModel", From 3ba10576e755f8e0740251c891082ee96e722afa Mon Sep 17 00:00:00 2001 From: "Tang, Wenyi" Date: Mon, 2 Dec 2024 22:55:04 +0800 Subject: [PATCH 57/73] Fix `ModelPatcher` returns empty outputs (#2109) * fix bug `ModelPatcher` returns empty outputs When model's output is tuple or list, `filtered_outputs` doesn't get assigned and hence always a empty dict * typo --------- Co-authored-by: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> --- optimum/exporters/onnx/model_patcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index fdfb0e280f5..2c0f9aeba67 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -168,7 +168,7 @@ def patched_forward(*args, **kwargs): filterd_outputs[name] = value elif isinstance(outputs, (list, tuple)): outputs_list = list(config.outputs.keys()) - dict(zip(outputs_list, outputs)) + filterd_outputs = dict(zip(outputs_list, outputs)) else: if len(config.outputs) > 1: num_outputs = len(config.outputs) From ff8c8fc95cb03b6ce72e0812bf0294bb2ae4463a Mon Sep 17 
00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 17:00:05 +0100 Subject: [PATCH 58/73] Fix workflow to mark issues as stale (#2110) * add permissions * update stale message --- .github/workflows/stale.yml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index a5e50a795b6..7b3eb5feb0c 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -6,9 +6,12 @@ on: jobs: stale: runs-on: ubuntu-latest + permissions: + issues: write steps: - - uses: actions/stale@v8 + - uses: actions/stale@v9 with: - stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.' + stale-issue-message: 'This issue has been marked as stale because it has been open for 30 days with no activity. This thread will be automatically closed in 5 days if no further activity occurs.' 
+ exempt-issue-labels: 'bug,exporters,good first issue,onnx,onnxruntime,quantization' days-before-stale: 30 days-before-close: 5 From 01110adf076c94e395d1472a760eafac2c0a73aa Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 17:11:16 +0100 Subject: [PATCH 59/73] Remove doc-build (#2111) --- .github/workflows/build_main_documentation.yml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index c922f5097da..d38274f320a 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -18,12 +18,6 @@ jobs: repository: 'huggingface/doc-builder' path: doc-builder - - uses: actions/checkout@v2 - with: - repository: 'huggingface/doc-build' - path: doc-build - token: ${{ secrets.HUGGINGFACE_PUSH }} - - uses: actions/checkout@v2 with: repository: 'huggingface/optimum' From 7f2605ea94071f5495eac110ba240e2651ea8053 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 19:19:57 +0100 Subject: [PATCH 60/73] Downgrade stale bot to v8 and fix permissions (#2112) --- .github/workflows/stale.yml | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 7b3eb5feb0c..28cf3ad9dc2 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -3,13 +3,14 @@ on: schedule: - cron: '30 1 * * *' +permissions: + issues: write + jobs: stale: runs-on: ubuntu-latest - permissions: - issues: write steps: - - uses: actions/stale@v9 + - uses: actions/stale@v8 with: stale-issue-message: 'This issue has been marked as stale because it has been open for 30 days with no activity. This thread will be automatically closed in 5 days if no further activity occurs.' 
exempt-issue-labels: 'bug,exporters,good first issue,onnx,onnxruntime,quantization' From d6de6762e0e4bf8136f0435211a0e777f5bf2f33 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Tue, 3 Dec 2024 19:20:09 +0100 Subject: [PATCH 61/73] Update documentation color from google tpu section (#2113) * Update documentation color from google tpu section * fix --- docs/source/index.mdx | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 06133664ca8..1b54570ea80 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -43,7 +43,7 @@ The packages below enable you to get the best of the 🤗 Hugging Face ecosystem

Accelerate your training and inference workflows with AWS Trainium and AWS Inferentia

Google TPUs
+ >
Google TPUs

Accelerate your training and inference workflows with Google TPUs

-> [!TIP] -> Some packages provide hardware-agnostic features (e.g. INC interface in Optimum Intel). - - ## Open-source integrations 🤗 Optimum also supports a variety of open-source frameworks to make model optimization very easy. From 4a7cb298140ee9bed968d98a780a950d15bb2935 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 4 Dec 2024 17:04:37 +0100 Subject: [PATCH 62/73] Fix workflow to mark PRs as stale (#2116) --- .github/workflows/stale.yml | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 28cf3ad9dc2..6dc3ff2bbd9 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -5,6 +5,7 @@ on: permissions: issues: write + pull-requests: write jobs: stale: @@ -13,6 +14,10 @@ jobs: - uses: actions/stale@v8 with: stale-issue-message: 'This issue has been marked as stale because it has been open for 30 days with no activity. This thread will be automatically closed in 5 days if no further activity occurs.' + stale-pr-message: 'This PR has been marked as stale because it has been open for 90 days with no activity. This thread will be automatically closed in 30 days if no further activity occurs.' 
exempt-issue-labels: 'bug,exporters,good first issue,onnx,onnxruntime,quantization' - days-before-stale: 30 - days-before-close: 5 + days-before-issue-stale: 30 + days-before-issue-close: 5 + days-before-pr-stale: 90 + days-before-pr-close: 30 + exempt-all-pr-assignees: true \ No newline at end of file From 12b3b35366bbc2282c45407eae642cdab4c1e894 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 12 Dec 2024 13:35:56 +0100 Subject: [PATCH 63/73] Enable transformers v4.47 support (#2119) * enable latest transformers release * fix custom module test * adapt config push to hub tests --- setup.py | 8 +++---- tests/onnx/test_onnx_export_custom_module.py | 4 ++-- tests/test_configuration_utils.py | 24 +++++++------------- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/setup.py b/setup.py index 6736085943a..28b6941ebe8 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ "datasets>=1.2.1", "evaluate", "protobuf>=3.20.1", - "transformers>=4.36,<4.47.0", + "transformers>=4.36,<4.48.0", ], "onnxruntime-gpu": [ "onnx", @@ -60,19 +60,19 @@ "evaluate", "protobuf>=3.20.1", "accelerate", # ORTTrainer requires it. 
- "transformers>=4.36,<4.47.0", + "transformers>=4.36,<4.48.0", ], "exporters": [ "onnx", "onnxruntime", "timm", - "transformers>=4.36,<4.47.0", + "transformers>=4.36,<4.48.0", ], "exporters-gpu": [ "onnx", "onnxruntime-gpu", "timm", - "transformers>=4.36,<4.47.0", + "transformers>=4.36,<4.48.0", ], "exporters-tf": [ "tensorflow>=2.4,<=2.12.1", diff --git a/tests/onnx/test_onnx_export_custom_module.py b/tests/onnx/test_onnx_export_custom_module.py index 4398c14f01d..9416093c841 100644 --- a/tests/onnx/test_onnx_export_custom_module.py +++ b/tests/onnx/test_onnx_export_custom_module.py @@ -22,7 +22,7 @@ if is_torch_available(): import torch - from transformers.models.deberta import modeling_deberta + from transformers.models.sew_d import modeling_sew_d from optimum.utils import check_if_torch_greater @@ -36,7 +36,7 @@ def test_training(self): """Tests export of StableDropout in training mode.""" devnull = open(os.devnull, "wb") # drop_prob must be > 0 for the test to be meaningful - sd = modeling_deberta.StableDropout(0.1) + sd = modeling_sew_d.StableDropout(0.1) # Avoid warnings in training mode do_constant_folding = False # Dropout is a no-op in inference mode diff --git a/tests/test_configuration_utils.py b/tests/test_configuration_utils.py index 4c721f089d7..d70b01fe7e1 100644 --- a/tests/test_configuration_utils.py +++ b/tests/test_configuration_utils.py @@ -12,13 +12,12 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
-import os import tempfile import unittest from huggingface_hub import HfFolder, delete_repo from requests.exceptions import HTTPError -from transformers.testing_utils import TOKEN, USER, is_staging_test +from transformers.testing_utils import TOKEN, TemporaryHubRepo, is_staging_test from optimum.configuration_utils import BaseConfig @@ -69,12 +68,11 @@ def tearDownClass(cls): def test_push_to_hub(self): config = FakeConfig(attribute=15) - with tempfile.TemporaryDirectory() as tmp_dir: - config.save_pretrained( - os.path.join(tmp_dir, "optimum-test-base-config"), push_to_hub=True, token=self._token - ) - new_config = FakeConfig.from_pretrained(f"{USER}/optimum-test-base-config") + with TemporaryHubRepo(token=self._token) as tmp_repo: + config.push_to_hub(tmp_repo.repo_id, token=self._token) + + new_config = FakeConfig.from_pretrained(tmp_repo.repo_id) for k, v in config.to_dict().items(): if k != "optimum_version" and k != "transformers_version": self.assertEqual(v, getattr(new_config, k)) @@ -82,15 +80,9 @@ def test_push_to_hub(self): def test_push_to_hub_in_organization(self): config = FakeConfig(attribute=15) - with tempfile.TemporaryDirectory() as tmp_dir: - config.save_pretrained( - os.path.join(tmp_dir, "optimum-test-base-config-org"), - push_to_hub=True, - token=self._token, - organization="valid_org", - ) - - new_config = FakeConfig.from_pretrained("valid_org/optimum-test-base-config-org") + with TemporaryHubRepo(namespace="valid_org", token=self._token) as tmp_repo: + config.push_to_hub(tmp_repo.repo_id, token=self._token) + new_config = FakeConfig.from_pretrained(tmp_repo.repo_id) for k, v in config.to_dict().items(): if k != "optimum_version" and k != "transformers_version": self.assertEqual(v, getattr(new_config, k)) From 22d93e74ceffba796d8fb0dd47d99680be4b5608 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Thu, 12 Dec 2024 15:17:18 +0200 Subject: [PATCH 64/73] Add ONNX export support for MGP-STR (#2099) * Enable mpg-str ONNX export * No longer 
needed * Improve model patcher * Formatting * `ruff` * Also support image-to-text task * Add unit tests * Add listed support for MGP-STR --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 16 ++++++++++++ optimum/exporters/onnx/model_patcher.py | 26 ++++++++++++++++++++ optimum/exporters/tasks.py | 7 +++++- tests/exporters/exporters_utils.py | 2 ++ tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 6 files changed, 52 insertions(+), 1 deletion(-) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 57005b85678..46ab3cb8a64 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -65,6 +65,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Marian - MarkupLM - MBart +- MGP-STR - Mistral - MobileBert - MobileVit diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index b39d19ec782..85e235f9a96 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -82,6 +82,7 @@ from .model_patcher import ( CLIPModelPatcher, FalconModelPatcher, + MgpstrModelPatcher, MistralModelPatcher, MusicgenModelPatcher, SAMModelPatcher, @@ -933,6 +934,21 @@ def torch_to_onnx_input_map(self) -> Dict[str, str]: return {"x": "pixel_values"} +class MgpstrOnnxConfig(ViTOnnxConfig): + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "char_logits": {0: "batch_size"}, + "bpe_logits": {0: "batch_size"}, + "wp_logits": {0: "batch_size"}, + } + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return MgpstrModelPatcher(self, model, model_kwargs=model_kwargs) + + class SentenceTransformersTransformerOnnxConfig(TextEncoderOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig DEFAULT_ONNX_OPSET = 14 # Some bottleneck 
transformers models require a specific ONNX opset to be successfully exported. We put a rather high opset here for the export to work for all architectures. diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 2c0f9aeba67..083bc127999 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -509,6 +509,32 @@ def patched_forward(*args, **kwargs): self.patched_forward = patched_forward +class MgpstrModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + @functools.wraps(self.orig_forward) + def patched_forward(*args, **kwargs): + signature = inspect.signature(self.orig_forward) + args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) + + # logits is a tuple, so we unpack it and return them as separate outputs + char_logits, bpe_logits, wp_logits = self.orig_forward(*args, **kwargs).logits + + return { + "char_logits": char_logits, + "bpe_logits": bpe_logits, + "wp_logits": wp_logits, + } + + self.patched_forward = patched_forward + + class SAMModelPatcher(ModelPatcher): def __init__( self, diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 0a3758e97cf..ba17730f9d9 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -211,7 +211,7 @@ class TasksManager: "image-classification": "AutoModelForImageClassification", "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"), "image-to-image": "AutoModelForImageToImage", - "image-to-text": "AutoModelForVision2Seq", + "image-to-text": ("AutoModelForVision2Seq", "AutoModel"), "mask-generation": "AutoModel", "masked-im": "AutoModelForMaskedImageModeling", "multiple-choice": "AutoModelForMultipleChoice", @@ -824,6 +824,11 @@ class TasksManager: 
"question-answering", onnx="MBartOnnxConfig", ), + "mgp-str": supported_tasks_mapping( + "feature-extraction", + "image-to-text", + onnx="MgpstrOnnxConfig", + ), "mistral": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 32156d9eebf..5f071e0f9eb 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -116,6 +116,7 @@ "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "hf-internal-testing/tiny-random-mbart", + "mgp-str": "hf-internal-testing/tiny-random-MgpstrForSceneTextRecognition", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet-v2": "hf-internal-testing/tiny-random-MobileNetV2Model", @@ -247,6 +248,7 @@ "marian": "Helsinki-NLP/opus-mt-en-de", "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "sshleifer/tiny-mbart", + "mgp-str": "alibaba-damo/mgp-str-base", "mobilebert": "google/mobilebert-uncased", # "mobilenet_v1": "google/mobilenet_v1_0.75_192", # "mobilenet_v2": "google/mobilenet_v2_0.35_96", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index cccecd53817..c33c07fc7b1 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -118,6 +118,7 @@ "m2m_100": "hf-internal-testing/tiny-random-m2m_100", "marian": "echarlaix/tiny-random-marian", "mbart": "hf-internal-testing/tiny-random-mbart", + "mgp-str": "hf-internal-testing/tiny-random-MgpstrForSceneTextRecognition", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", "mobilenet_v1": "google/mobilenet_v1_0.75_192", From 3f007661377956402439f2ea0567d28b930ca38c Mon Sep 17 00:00:00 2001 From: Joshua Lochner 
Date: Fri, 13 Dec 2024 17:58:40 +0200 Subject: [PATCH 65/73] Add ONNX export support for OLMo and OLMo2 (#2121) Add support for OLMo and OLMo2 Co-authored-by: Ella Charlaix --- docs/source/exporters/onnx/overview.mdx | 2 ++ optimum/exporters/onnx/model_configs.py | 9 +++++++++ optimum/exporters/tasks.py | 14 ++++++++++++++ tests/exporters/exporters_utils.py | 2 ++ 4 files changed, 27 insertions(+) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 46ab3cb8a64..fbe7b42c44f 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -75,6 +75,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - MT5 - Musicgen (text-conditional only) - Nystromformer +- OLMo +- OLMo2 - OWL-ViT - Pegasus - Perceiver diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 85e235f9a96..1c838408807 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -325,6 +325,15 @@ class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig +class OlmoOnnxConfig(LlamaOnnxConfig): + ATOL_FOR_VALIDATION = 1e-4 + MIN_TRANSFORMERS_VERSION = version.parse("4.40.0") + + +class Olmo2OnnxConfig(OlmoOnnxConfig): + MIN_TRANSFORMERS_VERSION = version.parse("4.47.0") + + class Qwen2OnnxConfig(LlamaOnnxConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.37.0") diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index ba17730f9d9..32e90c7da19 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -954,6 +954,20 @@ class TasksManager: "text-generation-with-past", onnx="GraniteOnnxConfig", ), + "olmo": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + onnx="OlmoOnnxConfig", + ), + "olmo2": supported_tasks_mapping( + 
"feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + onnx="Olmo2OnnxConfig", + ), "pegasus": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 5f071e0f9eb..e04a850bc8c 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -127,6 +127,8 @@ "mt5": "lewtun/tiny-random-mt5", "musicgen": "hf-internal-testing/tiny-random-MusicgenForConditionalGeneration", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", + "olmo": "hf-internal-testing/tiny-random-OlmoForCausalLM", + "olmo2": "hf-internal-testing/tiny-random-Olmo2ForCausalLM", "opt": "hf-internal-testing/tiny-random-OPTModel", "owlv2": "hf-internal-testing/tiny-random-Owlv2Model", "owlvit": "hf-tiny-model-private/tiny-random-OwlViTModel", From 4daa40896f693649e21696c509cd98c7e0c40e3c Mon Sep 17 00:00:00 2001 From: Sebastian Husch Lee Date: Fri, 13 Dec 2024 17:03:26 +0100 Subject: [PATCH 66/73] Pass on `model_kwargs` when loading a sentence-transformers model before export (#2126) --- optimum/exporters/tasks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 32e90c7da19..4db4130302d 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -2141,6 +2141,7 @@ def get_model_from_task( use_auth_token = model_kwargs.pop("use_auth_token", None) token = model_kwargs.pop("token", None) trust_remote_code = model_kwargs.pop("trust_remote_code", False) + model_kwargs["torch_dtype"] = torch_dtype if use_auth_token is not None: warnings.warn( @@ -2158,6 +2159,7 @@ def get_model_from_task( token=token, revision=revision, trust_remote_code=trust_remote_code, + model_kwargs=model_kwargs, ) else: try: From 0c42291f9dbdcaf52dc1cab44c25452d853a96b7 Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Thu, 19 Dec 2024 10:47:39 
+0200 Subject: [PATCH 67/73] Add ONNX export support for DinoV2, Hiera, Maskformer, PVT, SigLIP, SwinV2, VitMAE, and VitMSN models (#2001) * Add support for siglip models * cleanup * remove submodule * Add ONNX export for DinoV2 models * Use height and width from preprocessor * formatting * Remove attention mask from model input * Add ONNX export support for Hiera models * Add ONNX export support for SwinV2 * Upgrade Siglip to opset=14 * Add VQA task * Add ONNX export support for Maskformer * Add ONNX export support for PVT * Add ONNX export support for ViTMAE and ViTMSN * Add siglip unit tests * Add vit-mae unit tests * Code formatting * Add maskformer to list of supported models * Formatting * fix typo * remove vit-mae masked-im task * remove vit-msn masked-im task * fix output names for maskformer export --------- Co-authored-by: Ella Charlaix --- docs/source/exporters/onnx/overview.mdx | 8 ++ optimum/exporters/onnx/model_configs.py | 118 +++++++++++++++++++ optimum/exporters/tasks.py | 70 ++++++++++- optimum/onnxruntime/modeling_ort.py | 4 +- optimum/onnxruntime/utils.py | 1 + optimum/utils/normalized_config.py | 4 + tests/exporters/exporters_utils.py | 16 +++ tests/onnxruntime/test_modeling.py | 1 + tests/onnxruntime/utils_onnxruntime_tests.py | 7 +- 9 files changed, 224 insertions(+), 5 deletions(-) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index fbe7b42c44f..b5129c23f21 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -39,6 +39,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Decision Transformer - Deit - Detr +- DINOv2 - DistilBert - Donut-Swin - Electra @@ -53,6 +54,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - GPT-NeoX - OPT - GroupVit +- Hiera - Hubert - IBert - LayoutLM @@ -64,6 +66,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - M2-M100 - 
Marian - MarkupLM +- MaskFormer - MBart - MGP-STR - Mistral @@ -84,6 +87,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Phi3 - Pix2Struct - PoolFormer +- PVT - Qwen2(Qwen1.5) - RegNet - RemBERT @@ -95,10 +99,12 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - SEW - SEW-D - Speech2Text +- SigLIP - SpeechT5 - Splinter - SqueezeBert - Swin +- SwinV2 - T5 - Table Transformer - TROCR @@ -106,6 +112,8 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - UniSpeech SAT - Vision Encoder Decoder - Vit +- VitMAE +- VitMSN - Wav2Vec2 - Wav2Vec2 Conformer - WavLM diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 1c838408807..4c5a727a183 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -847,6 +847,65 @@ class ConvNextV2OnnxConfig(ViTOnnxConfig): DEFAULT_ONNX_OPSET = 11 +class HieraOnnxConfig(ViTOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + + +class PvtOnnxConfig(ViTOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + + +class VitMAEOnnxConfig(ViTOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 14, try exporting with this version. + DEFAULT_ONNX_OPSET = 14 + + +class VitMSNOnnxConfig(ViTOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 14, try exporting with this version. 
+ DEFAULT_ONNX_OPSET = 14 + + +class Dinov2DummyInputGenerator(DummyVisionInputGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = DEFAULT_DUMMY_SHAPES["width"], + height: int = DEFAULT_DUMMY_SHAPES["height"], + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + num_channels=num_channels, + width=width, + height=height, + **kwargs, + ) + + from transformers.onnx.utils import get_preprocessor + + preprocessor = get_preprocessor(normalized_config._name_or_path) + if preprocessor is not None and hasattr(preprocessor, "crop_size"): + self.height = preprocessor.crop_size.get("height", self.height) + self.width = preprocessor.crop_size.get("width", self.width) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + input_ = super().generate( + input_name=input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype + ) + return input_ + + +class Dinov2OnnxConfig(ViTOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = (Dinov2DummyInputGenerator,) + + class MobileViTOnnxConfig(ViTOnnxConfig): ATOL_FOR_VALIDATION = 1e-4 DEFAULT_ONNX_OPSET = 11 @@ -888,6 +947,10 @@ class SwinOnnxConfig(ViTOnnxConfig): DEFAULT_ONNX_OPSET = 11 +class SwinV2OnnxConfig(SwinOnnxConfig): + pass + + class Swin2srOnnxConfig(SwinOnnxConfig): pass @@ -923,6 +986,28 @@ class MobileNetV2OnnxConfig(MobileNetV1OnnxConfig): pass +class MaskFormerOnnxConfig(ViTOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::einsum' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 12, try exporting with this version. 
+ DEFAULT_ONNX_OPSET = 12 + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self.task == "image-segmentation": + return { + "class_queries_logits": {0: "batch_size", 1: "num_queries"}, + "masks_queries_logits": {0: "batch_size", 1: "num_queries", 2: "height", 3: "width"}, + } + else: + return super().outputs + + @property + def torch_to_onnx_output_map(self) -> Dict[str, str]: + return { + "transformer_decoder_last_hidden_state": "last_hidden_state", + } + + class DonutSwinOnnxConfig(ViTOnnxConfig): DEFAULT_ONNX_OPSET = 11 @@ -1115,6 +1200,39 @@ def patch_model_for_export( return CLIPModelPatcher(self, model, model_kwargs=model_kwargs) +class SiglipNormalizedConfig(CLIPNormalizedConfig): + pass + + +class SiglipOnnxConfig(CLIPOnnxConfig): + NORMALIZED_CONFIG_CLASS = SiglipNormalizedConfig + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 13 is not supported. + # Support for this operator was added in version 14, try exporting with this version. + DEFAULT_ONNX_OPSET = 14 + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "text_batch_size", 1: "sequence_length"}, + "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, + # NOTE: No attention_mask + } + + +class SiglipTextWithProjectionOnnxConfig(CLIPTextWithProjectionOnnxConfig): + pass + + +class SiglipTextOnnxConfig(CLIPTextOnnxConfig): + pass + + +class SiglipVisionModelOnnxConfig(CLIPVisionModelOnnxConfig): + # torch.onnx.errors.UnsupportedOperatorError: Exporting the operator 'aten::scaled_dot_product_attention' to ONNX opset version 11 is not supported. + # Support for this operator was added in version 14, try exporting with this version. 
+ DEFAULT_ONNX_OPSET = 14 + + class UNetOnnxConfig(VisionOnnxConfig): ATOL_FOR_VALIDATION = 1e-4 # The ONNX export of a CLIPText architecture, an other Stable Diffusion component, needs the Trilu diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 4db4130302d..7cb5a31d2d5 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -209,7 +209,12 @@ class TasksManager: "feature-extraction": "AutoModel", "fill-mask": "AutoModelForMaskedLM", "image-classification": "AutoModelForImageClassification", - "image-segmentation": ("AutoModelForImageSegmentation", "AutoModelForSemanticSegmentation"), + "image-segmentation": ( + "AutoModelForImageSegmentation", + "AutoModelForSemanticSegmentation", + "AutoModelForInstanceSegmentation", + "AutoModelForUniversalSegmentation", + ), "image-to-image": "AutoModelForImageToImage", "image-to-text": ("AutoModelForVision2Seq", "AutoModel"), "mask-generation": "AutoModel", @@ -224,6 +229,7 @@ class TasksManager: "text2text-generation": "AutoModelForSeq2SeqLM", "text-classification": "AutoModelForSequenceClassification", "token-classification": "AutoModelForTokenClassification", + "visual-question-answering": "AutoModelForVisualQuestionAnswering", "zero-shot-image-classification": "AutoModelForZeroShotImageClassification", "zero-shot-object-detection": "AutoModelForZeroShotObjectDetection", } @@ -307,6 +313,7 @@ class TasksManager: "vision2seq-lm": "image-to-text", "zero-shot-classification": "text-classification", "image-feature-extraction": "feature-extraction", + "pretraining": "feature-extraction", # for backward compatibility and testing (where # model task and model type are still the same) "stable-diffusion": "text-to-image", @@ -601,6 +608,11 @@ class TasksManager: "image-segmentation", onnx="DetrOnnxConfig", ), + "dinov2": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="Dinov2OnnxConfig", + ), "distilbert": supported_tasks_mapping( "feature-extraction", 
"fill-mask", @@ -732,6 +744,11 @@ class TasksManager: "feature-extraction", onnx="GroupViTOnnxConfig", ), + "hiera": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="HieraOnnxConfig", + ), "hubert": supported_tasks_mapping( "feature-extraction", "automatic-speech-recognition", @@ -813,6 +830,11 @@ class TasksManager: "question-answering", onnx="MarkupLMOnnxConfig", ), + "maskformer": supported_tasks_mapping( + "feature-extraction", + "image-segmentation", + onnx="MaskFormerOnnxConfig", + ), "mbart": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -1011,6 +1033,11 @@ class TasksManager: "image-classification", onnx="PoolFormerOnnxConfig", ), + "pvt": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="PvtOnnxConfig", + ), "regnet": supported_tasks_mapping( "feature-extraction", "image-classification", @@ -1070,6 +1097,23 @@ class TasksManager: "audio-classification", onnx="SEWDOnnxConfig", ), + "siglip": supported_tasks_mapping( + "feature-extraction", + "zero-shot-image-classification", + onnx="SiglipOnnxConfig", + ), + "siglip-text-model": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextOnnxConfig", + ), + "siglip-text-with-projection": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipTextWithProjectionOnnxConfig", + ), + "siglip-vision-model": supported_tasks_mapping( + "feature-extraction", + onnx="SiglipVisionModelOnnxConfig", + ), "speech-to-text": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", @@ -1102,6 +1146,12 @@ class TasksManager: "masked-im", onnx="SwinOnnxConfig", ), + "swinv2": supported_tasks_mapping( + "feature-extraction", + "image-classification", + "masked-im", + onnx="SwinV2OnnxConfig", + ), "swin2sr": supported_tasks_mapping( "feature-extraction", "image-to-image", @@ -1148,7 +1198,19 @@ class TasksManager: onnx="VisionEncoderDecoderOnnxConfig", ), "vit": supported_tasks_mapping( - 
"feature-extraction", "image-classification", "masked-im", onnx="ViTOnnxConfig" + "feature-extraction", + "image-classification", + "masked-im", + onnx="ViTOnnxConfig", + ), + "vit-mae": supported_tasks_mapping( + "feature-extraction", + onnx="VitMAEOnnxConfig", + ), + "vit-msn": supported_tasks_mapping( + "feature-extraction", + "image-classification", + onnx="VitMSNOnnxConfig", ), "vits": supported_tasks_mapping( "text-to-audio", @@ -1232,6 +1294,10 @@ class TasksManager: "unet-2d-condition", "vae-encoder", "vae-decoder", + "clip-text-model", + "clip-text-with-projection", + "siglip-text-model", + "siglip-text-with-projection", # redundant model types "trocr", # same as vision-encoder-decoder } diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 8e5a814b689..a55eb064fa3 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -1696,7 +1696,7 @@ def forward( @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForImageClassification(ORTModel): """ - ONNX Model for image-classification tasks. This class officially supports beit, convnext, convnextv2, data2vec_vision, deit, levit, mobilenet_v1, mobilenet_v2, mobilevit, poolformer, resnet, segformer, swin, vit. + ONNX Model for image-classification tasks. This class officially supports beit, convnext, convnextv2, data2vec_vision, deit, dinov2, levit, mobilenet_v1, mobilenet_v2, mobilevit, poolformer, resnet, segformer, swin, swinv2, vit. """ auto_model_class = AutoModelForImageClassification @@ -1784,7 +1784,7 @@ def forward( @add_end_docstrings(ONNX_MODEL_END_DOCSTRING) class ORTModelForSemanticSegmentation(ORTModel): """ - ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes. This class officially supports segformer. + ONNX Model for semantic-segmentation, with an all-MLP decode head on top e.g. for ADE20k, CityScapes. This class officially supports maskformer, segformer. 
""" auto_model_class = AutoModelForSemanticSegmentation diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 9e92e0bd325..79375d958ff 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -178,6 +178,7 @@ def check_optimization_supported_model(cls, model_type: str, optimization_config "clip", "vit", "swin", + "swinv2", ] model_type = model_type.replace("_", "-") if (model_type not in cls._conf) or (cls._conf[model_type] not in supported_model_types_for_optimization): diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 9ceed24c2dd..9fde2bd4696 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -204,8 +204,10 @@ class NormalizedConfigManager: 'data2vec-text', 'data2vec-vision', 'detr', + 'dinov2', 'flaubert', 'groupvit', + 'hiera', 'ibert', 'layoutlm', 'layoutlmv3', @@ -216,6 +218,8 @@ class NormalizedConfigManager: 'owlvit', 'perceiver', 'roformer', + 'segformer', + 'siglip', 'squeezebert', 'table-transformer', """ diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index e04a850bc8c..900b5f3b5ce 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -69,6 +69,7 @@ "deberta-v2": "hf-internal-testing/tiny-random-DebertaV2Model", "decision-transformer": "edbeeching/decision-transformer-gym-hopper-medium", "deit": "hf-internal-testing/tiny-random-DeiTModel", + "dinov2": "hf-internal-testing/tiny-random-Dinov2Model", "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder", "donut-swin": "hf-internal-testing/tiny-random-DonutSwinModel", "detr": "hf-internal-testing/tiny-random-DetrModel", # hf-internal-testing/tiny-random-detr is larger @@ -103,6 +104,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJModel", "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", + "hiera": 
"hf-internal-testing/tiny-random-HieraForImageClassification", "ibert": "hf-internal-testing/tiny-random-IBertModel", "imagegpt": "hf-internal-testing/tiny-random-ImageGPTModel", "levit": "hf-internal-testing/tiny-random-LevitModel", @@ -115,6 +117,7 @@ "m2m-100": "hf-internal-testing/tiny-random-m2m_100", "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", + "maskformer": "hf-internal-testing/tiny-random-MaskFormerForInstanceSegmentation", "mbart": "hf-internal-testing/tiny-random-mbart", "mgp-str": "hf-internal-testing/tiny-random-MgpstrForSceneTextRecognition", "mistral": "echarlaix/tiny-random-mistral", @@ -143,6 +146,7 @@ # "rembert": "google/rembert", "rembert": "hf-internal-testing/tiny-random-RemBertModel", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", "qwen2": "fxmarty/tiny-dummy-qwen2", "regnet": "hf-internal-testing/tiny-random-RegNetModel", "resnet": "hf-internal-testing/tiny-random-resnet", @@ -150,13 +154,18 @@ "roformer": "hf-internal-testing/tiny-random-RoFormerModel", "sam": "fxmarty/sam-vit-tiny-random", "segformer": "hf-internal-testing/tiny-random-SegformerModel", + "siglip": "hf-internal-testing/tiny-random-SiglipModel", + "siglip-vision-model": "hf-internal-testing/tiny-random-SiglipVisionModel", "splinter": "hf-internal-testing/tiny-random-SplinterModel", "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "swin": "hf-internal-testing/tiny-random-SwinModel", + "swinv2": "hf-internal-testing/tiny-random-Swinv2Model", "swin2sr": "hf-internal-testing/tiny-random-Swin2SRModel", "t5": "hf-internal-testing/tiny-random-t5", "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel", "vit": "hf-internal-testing/tiny-random-vit", + "vit-mae": "hf-internal-testing/tiny-random-ViTMAEModel", + "vit-msn": 
"hf-internal-testing/tiny-random-ViTMSNForImageClassification", "vits": "echarlaix/tiny-random-vits", "yolos": "hf-internal-testing/tiny-random-YolosModel", "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken @@ -237,6 +246,7 @@ "gpt-neox": "EleutherAI/gpt-neox-20b", "gptj": "anton-l/gpt-j-tiny-random", # TODO "groupvit": "nvidia/groupvit-gcc-yfcc", + "hiera": "facebook/hiera-tiny-224-in1k-hf", "ibert": "kssteven/ibert-roberta-base", "imagegpt": "openai/imagegpt-small", "levit": "facebook/levit-128S", @@ -249,6 +259,7 @@ "m2m-100": "hf-internal-testing/tiny-random-m2m_100", # Not using facebook/m2m100_418M because it takes too much time for testing. "marian": "Helsinki-NLP/opus-mt-en-de", "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", + "maskformer": "facebook/maskformer-swin-tiny-coco", "mbart": "sshleifer/tiny-mbart", "mgp-str": "alibaba-damo/mgp-str-base", "mobilebert": "google/mobilebert-uncased", @@ -264,18 +275,23 @@ "perceiver": "hf-internal-testing/tiny-random-PerceiverModel", # Not using deepmind/language-perceiver because it takes too much time for testing. 
"rembert": "google/rembert", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", "regnet": "facebook/regnet-y-040", "resnet": "microsoft/resnet-50", "roberta": "roberta-base", "roformer": "junnyu/roformer_chinese_base", "sam": "facebook/sam-vit-base", "segformer": "nvidia/segformer-b0-finetuned-ade-512-512", + "siglip": "google/siglip-base-patch16-224", "splinter": "hf-internal-testing/tiny-random-SplinterModel", "squeezebert": "squeezebert/squeezebert-uncased", "swin": "microsoft/swin-tiny-patch4-window7-224", + "swinv2": "microsoft/swinv2-tiny-patch4-window16-256", "t5": "t5-small", "table-transformer": "microsoft/table-transformer-detection", "vit": "google/vit-base-patch16-224", + "vit-mae": "facebook/vit-mae-base", + "vit-msn": "facebook/vit-msn-small", "yolos": "hustvl/yolos-tiny", "whisper": "openai/whisper-tiny.en", "hubert": "facebook/hubert-base-ls960", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 8f52ef45180..255c0d9d0e7 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2827,6 +2827,7 @@ class ORTModelForImageClassificationIntegrationTest(ORTModelTestMixin): "convnextv2", "data2vec_vision", "deit", + "dinov2", "levit", "mobilenet_v1", "mobilenet_v2", diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index c33c07fc7b1..02ced3be3aa 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -87,8 +87,9 @@ "deit": "hf-internal-testing/tiny-random-DeiTModel", "donut": "fxmarty/tiny-doc-qa-vision-encoder-decoder", "detr": "hf-internal-testing/tiny-random-detr", - "dpt": "hf-internal-testing/tiny-random-DPTModel", + "dinov2": "hf-internal-testing/tiny-random-Dinov2Model", "distilbert": "hf-internal-testing/tiny-random-DistilBertModel", + "dpt": 
"hf-internal-testing/tiny-random-DPTModel", "electra": "hf-internal-testing/tiny-random-ElectraModel", "encoder-decoder": { "hf-internal-testing/tiny-random-EncoderDecoderModel-bert-bert": [ @@ -107,6 +108,7 @@ "gptj": "hf-internal-testing/tiny-random-GPTJForCausalLM", "granite": "hf-internal-testing/tiny-random-GraniteForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", + "hiera": "hf-internal-testing/tiny-random-HieraForImageClassification", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel", "latent-consistency": "echarlaix/tiny-random-latent-consistency", @@ -135,6 +137,7 @@ "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", + "pvt": "hf-internal-testing/tiny-random-PvtForImageClassification", "qwen2": "fxmarty/tiny-dummy-qwen2", "rembert": "hf-internal-testing/tiny-random-RemBertModel", "resnet": "hf-internal-testing/tiny-random-resnet", @@ -143,12 +146,14 @@ "segformer": "hf-internal-testing/tiny-random-SegformerModel", "sew": "hf-internal-testing/tiny-random-SEWModel", "sew_d": "asapp/sew-d-tiny-100k-ft-ls100h", + "siglip": "hf-internal-testing/tiny-random-SiglipModel", "squeezebert": "hf-internal-testing/tiny-random-SqueezeBertModel", "speech_to_text": "hf-internal-testing/tiny-random-Speech2TextModel", "stable-diffusion": "hf-internal-testing/tiny-stable-diffusion-torch", "stable-diffusion-3": "optimum-internal-testing/tiny-random-stable-diffusion-3", "stable-diffusion-xl": "echarlaix/tiny-random-stable-diffusion-xl", "swin": "hf-internal-testing/tiny-random-SwinModel", + "swinv2": "hf-internal-testing/tiny-random-Swinv2Model", "swin-window": "yujiepan/tiny-random-swin-patch4-window7-224", "swin2sr": "hf-internal-testing/tiny-random-Swin2SRForImageSuperResolution", "t5": "hf-internal-testing/tiny-random-t5", From 35d35bd3da52273cdd8fd2a8300b4598b2d96cc7 Mon Sep 17 00:00:00 
2001 From: Ekaterina Aidova Date: Thu, 19 Dec 2024 14:14:56 +0400 Subject: [PATCH 68/73] Move check_dummy_inputs_allowed to common export utils (#2114) * move check_dummy_inputs_allowed to common export utils * move decoder_merge import * Update optimum/exporters/utils.py * Update optimum/exporters/utils.py * avoid onnx import if not necessary * move merge decoders import * fix style * add comment --------- Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Co-authored-by: Ella Charlaix --- optimum/exporters/onnx/base.py | 12 ++++++---- optimum/exporters/onnx/config.py | 6 ++++- optimum/exporters/onnx/convert.py | 29 ++++--------------------- optimum/exporters/onnx/model_configs.py | 6 ++++- optimum/exporters/utils.py | 27 ++++++++++++++++++++++- 5 files changed, 48 insertions(+), 32 deletions(-) diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index 7e35691d54b..b5adb4522a2 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -27,16 +27,12 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union import numpy as np -import onnx from transformers.utils import is_accelerate_available, is_torch_available -from ...onnx import remove_duplicate_weights_from_tied_info - if is_torch_available(): import torch.nn as nn -from ...onnx import merge_decoders from ...utils import ( DEFAULT_DUMMY_SHAPES, DummyInputGenerator, @@ -54,6 +50,8 @@ from .model_patcher import ModelPatcher, Seq2SeqModelPatcher +# TODO : moved back onnx imports applied in https://github.com/huggingface/optimum/pull/2114/files after refactorization + if is_accelerate_available(): from accelerate.utils import find_tied_parameters @@ -542,6 +540,10 @@ def post_process_exported_models( first_key = next(iter(models_and_onnx_configs)) if is_torch_available() and isinstance(models_and_onnx_configs[first_key][0], nn.Module): if is_accelerate_available(): + import onnx + + from ...onnx import 
remove_duplicate_weights_from_tied_info + logger.info("Deduplicating shared (tied) weights...") for subpath, key in zip(onnx_files_subpaths, models_and_onnx_configs): torch_model = models_and_onnx_configs[key][0] @@ -934,6 +936,8 @@ def post_process_exported_models( decoder_with_past_path = Path(path, onnx_files_subpaths[2]) decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx") try: + from ...onnx import merge_decoders + # The decoder with past does not output the cross attention past key values as they are constant, # hence the need for strict=False merge_decoders( diff --git a/optimum/exporters/onnx/config.py b/optimum/exporters/onnx/config.py index 9e808e392b9..69366d6be13 100644 --- a/optimum/exporters/onnx/config.py +++ b/optimum/exporters/onnx/config.py @@ -20,7 +20,6 @@ from transformers.utils import is_tf_available -from ...onnx import merge_decoders from ...utils import ( DummyAudioInputGenerator, DummyBboxInputGenerator, @@ -38,6 +37,9 @@ from .model_patcher import DecoderModelPatcher +# TODO : moved back onnx imports applied in https://github.com/huggingface/optimum/pull/2114/files after refactorization + + if TYPE_CHECKING: from transformers import PretrainedConfig, PreTrainedModel @@ -129,6 +131,8 @@ def post_process_exported_models( # Attempt to merge only if the decoder-only was exported separately without/with past if self.use_past is True and len(models_and_onnx_configs) == 2: + from ...onnx import merge_decoders + decoder_path = Path(path, onnx_files_subpaths[0]) decoder_with_past_path = Path(path, onnx_files_subpaths[1]) decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx") diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index c12a9ac222a..80d945580c7 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -22,7 +22,7 @@ from inspect import signature from itertools import chain from pathlib import Path -from typing import Any, Callable, Dict, 
Iterable, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union import numpy as np import onnx @@ -45,6 +45,7 @@ from ...utils.save_utils import maybe_save_preprocessors from ..error_utils import AtolError, MinimumVersionError, OutputMatchError, ShapeError from ..tasks import TasksManager +from ..utils import check_dummy_inputs_are_allowed from .base import OnnxConfig from .constants import UNPICKABLE_ARCHS from .model_configs import SpeechT5OnnxConfig @@ -56,6 +57,8 @@ ) +# TODO : moved back onnx imports applied in https://github.com/huggingface/optimum/pull/2114/files after refactorization + if is_torch_available(): import torch import torch.nn as nn @@ -75,30 +78,6 @@ class DynamicAxisNameError(ValueError): pass -def check_dummy_inputs_are_allowed( - model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], dummy_input_names: Iterable[str] -): - """ - Checks that the dummy inputs from the ONNX config is a subset of the allowed inputs for `model`. - Args: - model (`Union[transformers.PreTrainedModel, transformers.TFPreTrainedModel`]): - The model instance. - model_inputs (`Iterable[str]`): - The model input names. 
- """ - - forward = model.forward if is_torch_available() and isinstance(model, nn.Module) else model.call - forward_parameters = signature(forward).parameters - forward_inputs_set = set(forward_parameters.keys()) - dummy_input_names = set(dummy_input_names) - - # We are fine if config_inputs has more keys than model_inputs - if not dummy_input_names.issubset(forward_inputs_set): - raise ValueError( - f"Config dummy inputs are not a subset of the model inputs: {dummy_input_names} vs {forward_inputs_set}" - ) - - def validate_models_outputs( models_and_onnx_configs: Dict[ str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"] diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 4c5a727a183..3a48a579c2c 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -21,7 +21,6 @@ from packaging import version from transformers.utils import is_tf_available -from ...onnx import merge_decoders from ...utils import ( DEFAULT_DUMMY_SHAPES, BloomDummyPastKeyValuesGenerator, @@ -94,6 +93,9 @@ ) +# TODO : moved back onnx imports applied in https://github.com/huggingface/optimum/pull/2114/files after refactorization + + if TYPE_CHECKING: from transformers import PretrainedConfig from transformers.modeling_utils import PreTrainedModel @@ -2018,6 +2020,8 @@ def post_process_exported_models( decoder_with_past_path = Path(path, onnx_files_subpaths[3]) decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx") try: + from ...onnx import merge_decoders + # The decoder with past does not output the cross attention past key values as they are constant, # hence the need for strict=False merge_decoders( diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index 60de169de5e..d4a4111075d 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -16,7 +16,8 @@ """Utilities for model preparation to export.""" import copy -from 
typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union +from inspect import signature +from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import torch from packaging import version @@ -675,3 +676,27 @@ def _get_submodels_and_export_configs( export_config = next(iter(models_and_export_configs.values()))[1] return export_config, models_and_export_configs + + +def check_dummy_inputs_are_allowed( + model: Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], dummy_input_names: Iterable[str] +): + """ + Checks that the dummy inputs from the ONNX config is a subset of the allowed inputs for `model`. + Args: + model (`Union[transformers.PreTrainedModel, transformers.TFPreTrainedModel`]): + The model instance. + model_inputs (`Iterable[str]`): + The model input names. + """ + + forward = model.forward if is_torch_available() and isinstance(model, torch.nn.Module) else model.call + forward_parameters = signature(forward).parameters + forward_inputs_set = set(forward_parameters.keys()) + dummy_input_names = set(dummy_input_names) + + # We are fine if config_inputs has more keys than model_inputs + if not dummy_input_names.issubset(forward_inputs_set): + raise ValueError( + f"Config dummy inputs are not a subset of the model inputs: {dummy_input_names} vs {forward_inputs_set}" + ) From 0ea269fb714877b5006e3293026d397de8d53767 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 19 Dec 2024 11:15:28 +0100 Subject: [PATCH 69/73] Remove CI macos runners (#2129) remove macos runners --- .github/workflows/test_bettertransformer.yml | 2 +- .github/workflows/test_onnxruntime.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_bettertransformer.yml b/.github/workflows/test_bettertransformer.yml index b023fa4bd1b..016e97304ad 100644 --- a/.github/workflows/test_bettertransformer.yml +++ 
b/.github/workflows/test_bettertransformer.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.9] - os: [ubuntu-20.04, macos-14] + os: [ubuntu-20.04] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index b20a3b46f88..a0c5893d62c 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: transformers-version: ["latest"] - os: [ubuntu-20.04, windows-2019, macos-15] + os: [ubuntu-20.04, windows-2019] # TODO : add macos-15 after mps fix include: - transformers-version: "4.36.*" os: ubuntu-20.04 From 21de42f05c297e6d165def90a5db95d5637b6d6c Mon Sep 17 00:00:00 2001 From: jiqing-feng Date: Thu, 19 Dec 2024 18:30:09 +0800 Subject: [PATCH 70/73] Enable GPTQModel (#2064) * align gptq check to transformers for supporting cpu * fix comment * gptqmodel Signed-off-by: jiqing-feng * compatible with auto-gptq Signed-off-by: jiqing-feng * fix compatible with auto-gptq Signed-off-by: jiqing-feng * fix compatible with auto-gptq linear Signed-off-by: jiqing-feng * revert unrelated changes Signed-off-by: jiqing-feng * gptqmodel need use checkpoint_format (#1) * need checkpoint_format * default value of checkpoint_format is gptq * fix quantize * fix quantize * fix quantize * Update quantizer.py * need convert to v1 before gptqmodel save * back checkpoint_format to gptq after convert * cleanup code * sym=False is not supported with auto-gptq * add comments * cleanup code * Update quantizer.py * always convert v2 to v1 if checkpoint_format = "gptq" * Update quantizer.py --------- Co-authored-by: ZX-ModelCloud Co-authored-by: Qubitium-ModelCloud * Mod backend code (#2) * keep gptq_v2 if sym is false * use hf_convert_gptq_v1_to_v2_format, hf_convert_gptq_v2_to_v1_format, and hf_gptqmodel_post_init * no need check backend * use device_map * cleanup * Update quantizer.py * move import --------- Co-authored-by: 
Qubitium-ModelCloud * fix format and log Signed-off-by: jiqing-feng * fix version check Signed-off-by: jiqing-feng * enable gptqmodel tests Signed-off-by: jiqing-feng * update check quant type Signed-off-by: jiqing-feng * Fix optimum compat (#3) * add meta info * cleanup * cleanup * The value of quantizer should be an array * Update quantizer.py * If is_auto_gptq_available() also writes "auto_gptq:version" to "quantizer" * If is_auto_gptq_available() also writes "auto_gptq:version" to "quantizer" * Update quantizer.py * cleanup * comment on meta * hf_select_quant_linear pass checkpoint_format * add todo fix * move convert code to quantizer.save() * Update quantizer.py * Optimize hf_convert_gptq_v2_to_v1_format() * Optimize hf_convert_gptq_v1_to_v2_format() * fix GPTQTestCUDA * hf_select_quant_linear() always set pack=True * gptqmodel.hf_select_quant_linear() now does not select ExllamaV2 * gptqmodel.hf_select_quant_linear() now does not select ExllamaV2 * GPTQQuantizer add backend * lower checkpoint_format and backend * cleanup * move backend to bottom * no need to check gptqmodel version for ipex support * Update import_utils.py * Update quantizer.py * fix UnboundLocalError: cannot access local variable 'version' where it is not associated with a value * make version var short * Update import_utils.py * fix unittest * use assertLessEqual --------- Co-authored-by: Qubitium-ModelCloud Co-authored-by: LRL * fix format and convert v2 to v1 Signed-off-by: jiqing-feng * [Fix] all tensors not same device (#5) * fix device error * update gptqmodel version * fix test * fix format Signed-off-by: jiqing-feng * add gptqmodel tests which contains cpu Signed-off-by: jiqing-feng * fix all auto-gptq tests Signed-off-by: jiqing-feng * revert tests Signed-off-by: jiqing-feng * rm gptqmodel yaml Signed-off-by: jiqing-feng * fix comment Signed-off-by: jiqing-feng * enable real cpu tests by fp32 Signed-off-by: jiqing-feng * fix test model name Signed-off-by: jiqing-feng * keep the 
original device setting when using auto-gptq Signed-off-by: jiqing-feng * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> * Update optimum/gptq/quantizer.py Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --------- Signed-off-by: jiqing-feng Co-authored-by: LRL-ModelCloud <165116337+LRL-ModelCloud@users.noreply.github.com> Co-authored-by: ZX-ModelCloud Co-authored-by: Qubitium-ModelCloud Co-authored-by: ZX-ModelCloud <165115237+ZX-ModelCloud@users.noreply.github.com> Co-authored-by: LRL Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> --- optimum/gptq/quantizer.py | 253 ++++++++++++++++++++++++++-------- optimum/gptq/utils.py | 15 ++ optimum/utils/__init__.py | 1 + optimum/utils/import_utils.py | 19 ++- 4 files changed, 227 insertions(+), 61 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 849d8821ebf..844da3e3157 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import importlib import json import os from enum import Enum @@ -19,17 +20,26 @@ from typing import Any, Dict, List, Optional, Tuple, Union import torch +from packaging import version from torch import nn from tqdm.auto import tqdm from transformers import AutoTokenizer from transformers.pytorch_utils import Conv1D from transformers.utils.quantization_config import QuantizationMethod -from ..utils import is_accelerate_available, is_auto_gptq_available +from ..utils import is_accelerate_available, is_auto_gptq_available, is_gptqmodel_available from ..utils.modeling_utils import recurse_getattr +from ..version import __version__ as optimum_version from .constants import GPTQ_CONFIG from .data import get_dataset, prepare_dataset -from .utils import get_block_name_with_pattern, get_device, get_layers, get_preceding_modules, get_seqlen +from .utils import ( + get_block_name_with_pattern, + get_device, + get_layers, + get_preceding_modules, + get_seqlen, + nested_move_to, +) if is_accelerate_available(): @@ -40,14 +50,27 @@ from accelerate.hooks import remove_hook_from_module if is_auto_gptq_available(): + from auto_gptq import __version__ as autogptq_version from auto_gptq import exllama_set_max_input_length - from auto_gptq.modeling._utils import autogptq_post_init + from auto_gptq.modeling._utils import autogptq_post_init as gptq_post_init from auto_gptq.quantization import GPTQ - from auto_gptq.utils.import_utils import dynamically_import_QuantLinear + from auto_gptq.utils.import_utils import dynamically_import_QuantLinear as hf_select_quant_linear + +if is_gptqmodel_available(): + from gptqmodel import exllama_set_max_input_length + from gptqmodel.quantization import GPTQ + from gptqmodel.utils.importer import hf_select_quant_linear + from gptqmodel.utils.model import hf_convert_gptq_v1_to_v2_format, hf_convert_gptq_v2_to_v1_format + from gptqmodel.utils.model import hf_gptqmodel_post_init as gptq_post_init + from gptqmodel.version import __version__ as 
gptqmodel_version logger = getLogger(__name__) +def has_device_more_than_cpu(): + return torch.cuda.is_available() or (hasattr(torch, "xpu") and torch.xpu.is_available()) + + class ExllamaVersion(int, Enum): ONE = 1 TWO = 2 @@ -74,10 +97,13 @@ def __init__( batch_size: int = 1, pad_token_id: Optional[int] = None, disable_exllama: bool = False, - exllama_config: Dict[str, Any] = None, + exllama_config: Optional[Dict[str, Any]] = None, max_input_length: Optional[int] = None, cache_block_outputs: Optional[bool] = True, modules_in_block_to_quantize: Optional[List[List[str]]] = None, + checkpoint_format: str = "gptq", + meta: Optional[Dict[str, any]] = None, + backend: Optional[str] = None, *args, **kwargs, ): @@ -129,6 +155,13 @@ def __init__( List list of module names to quantize in the block specified. This argument is useful to exclude certain linear modules from being quantized. The block to quantize can be specified by setting `block_name_to_quantize`. We will quantize each list sequentially. If not set, we will quantize all linear layers. Example: `inside_layer_modules=[["self_attention.query_key_value"], ["mlp.dense_h_to_4h"]]` + checkpoint_format (`str`, *optional*, defaults to `gptq`): + GPTQ weight format. `gptq`(v1) is supported by both gptqmodel and auto-gptq. `gptq_v2` is gptqmodel only. + meta (`Dict[str, any]`, *optional*): + Properties, such as tooling:version, that do not directly contributes to quantization or quant inference are stored in meta. + i.e. `meta.quantizer`: ["optimum:_version_", "gptqmodel:_version_"] + backend (`str`, *optional*): + Controls which gptq kernel to be used. Valid values for gptqmodel are `auto`, `auto_trainable` and more. For auto-gptq, only valid value is None and `auto_trainable`. 
Ref gptqmodel backends: https://github.com/ModelCloud/GPTQModel/blob/main/gptqmodel/utils/backend.py """ self.bits = bits @@ -138,6 +171,9 @@ def __init__( self.desc_act = desc_act self.sym = sym self.true_sequential = true_sequential + self.checkpoint_format = checkpoint_format.lower() + self.meta = meta + self.backend = backend.lower() if backend is not None else None self.use_cuda_fp16 = use_cuda_fp16 self.model_seqlen = model_seqlen self.block_name_to_quantize = block_name_to_quantize @@ -161,6 +197,8 @@ def __init__( "true_sequential", "quant_method", "modules_in_block_to_quantize", + "checkpoint_format", + "meta", ] if self.bits not in [2, 3, 4, 8]: @@ -182,6 +220,28 @@ def __init__( ) self.exllama_version = self.exllama_config["version"] + def select_quant_linear(self, device_map: Union[str, dict]): + if is_gptqmodel_available(): + self.quant_linear = hf_select_quant_linear( + bits=self.bits, + group_size=self.group_size, + desc_act=self.desc_act, + sym=self.sym, + checkpoint_format=self.checkpoint_format, + meta=self.meta, + device_map=device_map, + backend=self.backend, + ) + else: + self.quant_linear = hf_select_quant_linear( + use_triton=False, + desc_act=self.desc_act, + group_size=self.group_size, + bits=self.bits, + disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE, + disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO, + ) + def to_dict(self): """ Returns the args in dict format. 
@@ -189,6 +249,20 @@ def to_dict(self): gptq_dict = {} for key in self.serialization_keys: gptq_dict[key] = getattr(self, key) + + if gptq_dict.get("meta") is None: + gptq_dict["meta"] = {} + + meta = gptq_dict["meta"] + # store both optimum:version and gptq_lib:version into quantize_config.meta.quantizer + if meta.get("quantizer") is None: + meta["quantizer"] = [f"optimum:{optimum_version}"] + + if is_gptqmodel_available(): + meta["quantizer"].append(f"gptqmodel:{gptqmodel_version}") + elif is_auto_gptq_available(): + meta["quantizer"].append(f"auto_gptq:{autogptq_version}") + return gptq_dict @classmethod @@ -205,7 +279,7 @@ def from_dict(cls, config_dict: Dict[str, Any]): """ return cls(**config_dict) - def convert_model(self, model: nn.Module): + def convert_model(self, model: nn.Module, **kwargs): """ Convert the model to a GPTQ model by getting and replacing the layers. @@ -226,7 +300,11 @@ def convert_model(self, model: nn.Module): f"Quantization disabled for {name} (only modules_in_block_to_quantize={self.modules_in_block_to_quantize} are quantized)" ) del layers_to_be_replaced[name] + + self.select_quant_linear(device_map=kwargs.get("device_map", None)) + self._replace_by_quant_layers(model, layers_to_be_replaced) + return model def get_no_split_module_classes(self, model): @@ -253,15 +331,7 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st name (`str`, defaults to `""`): To keep track of the name of the current module """ - QuantLinear = dynamically_import_QuantLinear( - use_triton=False, - desc_act=self.desc_act, - group_size=self.group_size, - bits=self.bits, - disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE, - disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO, - ) - if isinstance(module, QuantLinear): + if isinstance(module, self.quant_linear): return for attr in dir(module): layer = getattr(module, attr) @@ -279,20 +349,37 @@ def 
_replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st in_features = layer.weight.shape[0] out_features = layer.weight.shape[1] bias = layer.bias is not None - if not (self.desc_act) or self.group_size == -1: - new_layer = QuantLinear( + if is_gptqmodel_available(): + new_layer = self.quant_linear( self.bits, self.group_size, + self.desc_act, + self.sym, in_features, out_features, bias, - use_cuda_fp16=self.use_cuda_fp16, weight_dtype=layer.weight.dtype, ) else: - new_layer = QuantLinear( - self.bits, self.group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype - ) + if not (self.desc_act) or self.group_size == -1: + new_layer = self.quant_linear( + self.bits, + self.group_size, + in_features, + out_features, + bias, + use_cuda_fp16=self.use_cuda_fp16, + weight_dtype=layer.weight.dtype, + ) + else: + new_layer = self.quant_linear( + self.bits, + self.group_size, + in_features, + out_features, + bias, + weight_dtype=layer.weight.dtype, + ) new_layer.device = device setattr(module, attr, new_layer.to(device)) for name1, child in module.named_children(): @@ -318,13 +405,41 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None): `nn.Module`: The quantized model """ - if not is_auto_gptq_available(): - raise RuntimeError("auto-gptq is required in order to perform quantzation : `pip install auto-gptq`") - if not torch.cuda.is_available(): - raise RuntimeError("No GPU found. A GPU is needed to quantize model.") + if not is_auto_gptq_available() and not is_gptqmodel_available(): + raise RuntimeError( + "gptqmodel or auto-gptq is required in order to perform gptq quantzation: `pip install gptqmodel` or `pip install auto-gptq`. Please notice that auto-gptq will be deprecated in the future." + ) + elif is_gptqmodel_available() and is_auto_gptq_available(): + logger.warning( + "Detected gptqmodel and auto-gptq, will use gptqmodel. The auto_gptq will be deprecated in the future." 
+ ) + + gptq_supports_cpu = ( + is_auto_gptq_available() + and version.parse(importlib.metadata.version("auto-gptq")) > version.parse("0.4.2") + ) or is_gptqmodel_available() + + if not gptq_supports_cpu and not torch.cuda.is_available(): + raise RuntimeError( + "No cuda gpu or cpu support using Intel/IPEX found. A gpu or cpu with Intel/IPEX is required for quantization." + ) + + if not self.sym and not is_gptqmodel_available(): + raise ValueError( + "Asymmetric sym=False quantization is not supported with auto-gptq. Please use gptqmodel: `pip install gptqmodel`" + ) + + if self.checkpoint_format == "gptq_v2" and not is_gptqmodel_available(): + raise ValueError( + "gptq_v2 format only supported with gptqmodel. Please install gptqmodel: `pip install gptqmodel`" + ) model.eval() + # gptqmodel internal is gptq_v2 for asym support, gptq(v1) can only support sym=True + if is_gptqmodel_available() and self.checkpoint_format != "gptq_v2": + self.checkpoint_format = "gptq_v2" + # For Transformer model has_config = False has_device_map = False @@ -403,27 +518,32 @@ def quantize_model(self, model: nn.Module, tokenizer: Optional[Any] = None): blocks = recurse_getattr(model, self.block_name_to_quantize) + cur_layer_device = get_device(blocks[0]) + if not is_gptqmodel_available(): + cur_layer_device = 0 + if not has_device_map: - # put modules from module_name_preceding_first_block on cuda + # put modules from module_name_preceding_first_block on cuda or xpu or cpu + to_device = cur_layer_device for module_name in self.module_name_preceding_first_block: module = recurse_getattr(model, module_name) if module is None: raise ValueError(f"Module {module_name} was not found in model") - module = module.to(0) - blocks[0] = blocks[0].to(0) + module = module.to(to_device) + blocks[0] = blocks[0].to(to_device) def store_input_hook(_, input, *args): kwargs = args[0] if input is None: if "hidden_states" in kwargs: - input = (kwargs["hidden_states"],) + input = 
(nested_move_to(kwargs["hidden_states"], cur_layer_device),) else: raise ValueError("No input value found in the foward pass") layer_inputs.append(input) other_kwargs = {} for k, v in kwargs.items(): # make sure other arguments also be captured if k not in ["hidden_states"]: - other_kwargs[k] = v + other_kwargs[k] = nested_move_to(v, cur_layer_device) layer_input_kwargs.append(other_kwargs) raise ValueError @@ -431,11 +551,7 @@ def store_input_hook(_, input, *args): handle = blocks[0].register_forward_pre_hook(store_input_hook, with_kwargs=True) for data in dataset: for k, v in data.items(): - # put the data on gpu, we won't put them back to cpu - if not has_device_map or device.type == "cpu": - data[k] = v.to(0) - else: - data[k] = v.to(device) + data[k] = nested_move_to(v, cur_layer_device) try: model(**data) except ValueError: @@ -450,6 +566,8 @@ def store_input_hook(_, input, *args): raise ValueError(f"Module {module_name} was not found in model") torch.cuda.empty_cache() + if hasattr(torch, "xpu") and torch.xpu.is_available(): + torch.xpu.empty_cache() # Step 3: Quantize the blocks quantizers = {} @@ -460,11 +578,7 @@ def store_input_hook(_, input, *args): handle = block.register_forward_pre_hook(store_input_hook, with_kwargs=True) for data in dataset: for k, v in data.items(): - # put the data on gpu, we won't put them back to cpu - if not has_device_map or device.type == "cpu": - data[k] = v.to(0) - else: - data[k] = v.to(device) + data[k] = nested_move_to(v, cur_layer_device) try: model(**data) except ValueError: @@ -473,9 +587,12 @@ def store_input_hook(_, input, *args): # move block to cuda if needed # in case we have offload modules, we need to put them on cuda because of GPTQ object - if not has_device_map or get_device(block) == torch.device("cpu"): + if (not has_device_map or get_device(block) == torch.device("cpu")) and has_device_more_than_cpu(): block = block.to(0) layers = get_layers(block) + block_device = get_device(block) + if not 
is_gptqmodel_available(): + block_device = 0 if isinstance(self.modules_in_block_to_quantize, list) and len(self.modules_in_block_to_quantize) > 0: if self.true_sequential: layers_name_list = self.modules_in_block_to_quantize @@ -509,15 +626,20 @@ def tmp(_, input, output): for j in range(len(dataset)): # the args are already on the gpu # don't need to store the output + layer_inputs[j] = nested_move_to(layer_inputs[j], block_device) + for k, v in layer_input_kwargs[j].items(): + layer_input_kwargs[j][k] = nested_move_to(v, block_device) + block(*layer_inputs[j], **layer_input_kwargs[j]) # remove hook for h in handles: h.remove() for name in subset_name_list: logger.info(f"Quantizing {name} in block {i + 1}/{len(blocks)}...") - scale, zero, g_idx = gptq[name].fasterquant( + quant_outputs = gptq[name].fasterquant( percdamp=self.damp_percent, group_size=self.group_size, actorder=self.desc_act ) + scale, zero, g_idx = quant_outputs[0], quant_outputs[1], quant_outputs[2] quantizers[f"{self.block_name_to_quantize}.{i}.{name}"] = ( gptq[name].quantizer, scale, @@ -543,11 +665,13 @@ def tmp(_, input, output): del layer_inputs layer_inputs = [] torch.cuda.empty_cache() + if hasattr(torch, "xpu") and torch.xpu.is_available(): + torch.xpu.empty_cache() if self.bits == 4: # device not on gpu if device.type != "cuda" or (has_device_map and any(d in devices for d in ["cpu", "disk", "hpu"])): - if not self.disable_exllama: + if not self.disable_exllama and not is_gptqmodel_available(): logger.warning( "Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. 
Setting `disable_exllama=True`" ) @@ -578,6 +702,8 @@ def tmp(_, input, output): model = self.post_init_model(model) torch.cuda.empty_cache() + if hasattr(torch, "xpu") and torch.xpu.is_available(): + torch.xpu.empty_cache() return model def post_init_model(self, model): @@ -601,9 +727,14 @@ def post_init_model(self, model): class StoreAttr(object): pass + if is_gptqmodel_available(): + model, _ = hf_convert_gptq_v1_to_v2_format( + model, self.bits, self.quant_linear, self.checkpoint_format, self.meta + ) + model.quantize_config = StoreAttr() model.quantize_config.desc_act = self.desc_act - model = autogptq_post_init(model, use_act_order=self.desc_act) + model = gptq_post_init(model, use_act_order=self.desc_act) if ( self.desc_act and (not self.disable_exllama and self.exllama_version == ExllamaVersion.ONE) @@ -626,19 +757,14 @@ def pack_model( quantizers (`Dict[str,Tuple]`): A mapping of the layer name and the data needed to pack the layer """ - QuantLinear = dynamically_import_QuantLinear( - use_triton=False, - desc_act=self.desc_act, - group_size=self.group_size, - bits=self.bits, - disable_exllama=self.disable_exllama or self.exllama_version != ExllamaVersion.ONE, - disable_exllamav2=self.disable_exllama or self.exllama_version != ExllamaVersion.TWO, - ) logger.info("Packing model...") layers = get_layers(model) layers = {n: layers[n] for n in quantizers} + + self.select_quant_linear(device_map=model.hf_device_map) + self._replace_by_quant_layers(model, quantizers) - qlayers = get_layers(model, [QuantLinear]) + qlayers = get_layers(model, [self.quant_linear]) for name in qlayers: logger.info(name) quantizers[name], scale, zero, g_idx = quantizers[name] @@ -673,6 +799,15 @@ def save(self, model: nn.Module, save_dir: str, max_shard_size: str = "10GB", sa Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). 
""" + + # convert gptqmodel internal gptq_v2 format to v1 for max compatibility + if is_gptqmodel_available(): + model, converted = hf_convert_gptq_v2_to_v1_format( + model, self.sym, self.bits, self.quant_linear, self.checkpoint_format, self.meta + ) + if converted: + self.checkpoint_format = "gptq" + os.makedirs(save_dir, exist_ok=True) model.save_pretrained(save_dir, max_shard_size=max_shard_size, safe_serialization=safe_serialization) with open(os.path.join(save_dir, GPTQ_CONFIG), "w", encoding="utf-8") as f: @@ -736,10 +871,12 @@ def load_quantized_model( Returns: `nn.Module`: The quantized model """ - if not torch.cuda.is_available(): - raise RuntimeError("No GPU found. A GPU is needed to run quantized model.") - if not is_auto_gptq_available(): - raise RuntimeError("auto-gptq is required in order to load quantized weights : `pip install auto-gptq`") + if not torch.cuda.is_available() and not is_gptqmodel_available(): + raise RuntimeError("No GPU found. A GPU is needed to run quantized model by auto_gptq.") + if not is_auto_gptq_available() and not is_gptqmodel_available(): + raise RuntimeError( + "gptqmodel (`pip install gptqmodel`) or auto-gptq (`pip install auto-gptq`) is required in order to load quantized weights. Please notice that auto-gptq will be deprecated in the future." 
+ ) if not is_accelerate_available(): raise RuntimeError( "You need to install accelerate in order to load and dispatch weights to" @@ -777,7 +914,7 @@ def load_quantized_model( quantizer.exllama_version = quantizer.exllama_config["version"] quantizer.max_input_length = max_input_length - model = quantizer.convert_model(model) + model = quantizer.convert_model(model, device_map=device_map) if no_split_module_classes is None: no_split_module_classes = quantizer.get_no_split_module_classes(model) diff --git a/optimum/gptq/utils.py b/optimum/gptq/utils.py index a5f9afdaaef..732ecbd66b9 100644 --- a/optimum/gptq/utils.py +++ b/optimum/gptq/utils.py @@ -113,3 +113,18 @@ def get_seqlen(model: nn.Module): "We couldn't get the model sequence length. Setting it to 2048. You can overwrite this value by passing `model_seqlen` in` GPTQQuantizer`" ) return 2048 + + +def move_to(obj: torch.Tensor, device: torch.device): + if get_device(obj) != device: + obj = obj.to(device) + return obj + + +def nested_move_to(v, device): + if isinstance(v, torch.Tensor): + return move_to(v, device) + elif isinstance(v, (list, tuple)): + return type(v)([nested_move_to(e, device) for e in v]) + else: + return v diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 2aa90253d08..e2b53a7dbc7 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -37,6 +37,7 @@ is_auto_gptq_available, is_datasets_available, is_diffusers_available, + is_gptqmodel_available, is_onnx_available, is_onnxruntime_available, is_pydantic_available, diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py index 405e3815b33..d0f4c85db2b 100644 --- a/optimum/utils/import_utils.py +++ b/optimum/utils/import_utils.py @@ -52,6 +52,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ TRANSFORMERS_MINIMUM_VERSION = version.parse("4.25.0") DIFFUSERS_MINIMUM_VERSION = version.parse("0.22.0") AUTOGPTQ_MINIMUM_VERSION = version.parse("0.4.99") # 
Allows 0.5.0.dev0 +GPTQMODEL_MINIMUM_VERSION = version.parse("1.4.2") # This is the minimal required version to support some ONNX Runtime features @@ -67,6 +68,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _accelerate_available = _is_package_available("accelerate") _diffusers_available = _is_package_available("diffusers") _auto_gptq_available = _is_package_available("auto_gptq") +_gptqmodel_available = _is_package_available("gptqmodel") _timm_available = _is_package_available("timm") _sentence_transformers_available = _is_package_available("sentence_transformers") _datasets_available = _is_package_available("datasets") @@ -138,12 +140,23 @@ def is_datasets_available(): def is_auto_gptq_available(): if _auto_gptq_available: - version_autogptq = version.parse(importlib_metadata.version("auto_gptq")) - if AUTOGPTQ_MINIMUM_VERSION < version_autogptq: + v = version.parse(importlib_metadata.version("auto_gptq")) + if v >= AUTOGPTQ_MINIMUM_VERSION: return True else: raise ImportError( - f"Found an incompatible version of auto-gptq. Found version {version_autogptq}, but only version above {AUTOGPTQ_MINIMUM_VERSION} are supported" + f"Found an incompatible version of auto-gptq. Found version {v}, but only version >= {AUTOGPTQ_MINIMUM_VERSION} are supported" + ) + + +def is_gptqmodel_available(): + if _gptqmodel_available: + v = version.parse(importlib_metadata.version("gptqmodel")) + if v >= GPTQMODEL_MINIMUM_VERSION: + return True + else: + raise ImportError( + f"Found an incompatible version of gptqmodel. 
Found version {v}, but only version >= {GPTQMODEL_MINIMUM_VERSION} are supported" ) From 34b3d8bdfebe94ca34d61d5aeadcbc49eee6f95d Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 19 Dec 2024 16:21:13 +0100 Subject: [PATCH 71/73] Skip private model loading for external contributors (#2130) --- tests/onnxruntime/test_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index 255c0d9d0e7..456ad73505e 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -974,7 +974,7 @@ def test_stable_diffusion_model_on_rocm_ep_str(self): def test_load_model_from_hub_private(self): token = os.environ.get("HF_HUB_READ_TOKEN", None) - if token is None: + if not token: self.skipTest( "Test requires a read access token for optimum-internal-testing in the environment variable `HF_HUB_READ_TOKEN`." ) From 984012142a62b34300966da2a7c98e9e851bc6ee Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Fri, 20 Dec 2024 15:20:27 +0400 Subject: [PATCH 72/73] fix sdxl refiner export (#2133) --- optimum/exporters/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index d4a4111075d..02b1d0fe3af 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -139,7 +139,11 @@ def _get_submodels_for_export_diffusion( # https://github.com/huggingface/diffusers/blob/v0.18.2/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py#L571 unet.config.requires_aesthetics_score = getattr(pipeline.config, "requires_aesthetics_score", False) unet.config.time_cond_proj_dim = getattr(pipeline.unet.config, "time_cond_proj_dim", None) - unet.config.text_encoder_projection_dim = pipeline.text_encoder.config.projection_dim + unet.config.text_encoder_projection_dim = ( + 
pipeline.text_encoder.config.projection_dim + if not is_sdxl + else pipeline.text_encoder_2.config.projection_dim + ) unet.config.export_model_type = _get_diffusers_submodel_type(unet) models_for_export["unet"] = unet From d21256c2964945fc3fe4623f7befb21082b69a25 Mon Sep 17 00:00:00 2001 From: Guang Yang <42389959+guangy10@users.noreply.github.com> Date: Fri, 20 Dec 2024 05:05:38 -0800 Subject: [PATCH 73/73] Export to ExecuTorch: Initial Integration (#2090) Co-authored-by: Guang Yang Co-authored-by: Michael Benayoun Co-authored-by: Github Executorch --- .github/workflows/test_executorch_export.yml | 35 ++ .github/workflows/test_executorch_runtime.yml | 42 ++ docs/Dockerfile | 4 +- docs/source/_toctree.yml | 17 + docs/source/exporters/executorch/overview.mdx | 26 + .../package_reference/configuration.mdx | 54 ++ .../executorch/package_reference/export.mdx | 26 + .../executorch/usage_guides/contribute.mdx | 57 +++ .../usage_guides/export_a_model.mdx | 124 +++++ docs/source/exporters/overview.mdx | 2 +- optimum/commands/__init__.py | 2 +- optimum/commands/export/__init__.py | 1 + optimum/commands/export/base.py | 6 + optimum/commands/export/executorch.py | 67 +++ optimum/executorchruntime/__init__.py | 29 ++ .../executorchruntime/modeling_executorch.py | 460 ++++++++++++++++++ optimum/exporters/__init__.py | 1 + optimum/exporters/executorch/__init__.py | 50 ++ optimum/exporters/executorch/__main__.py | 160 ++++++ optimum/exporters/executorch/convert.py | 90 ++++ .../exporters/executorch/recipe_registry.py | 68 +++ .../exporters/executorch/recipes/__init__.py | 13 + .../exporters/executorch/recipes/xnnpack.py | 97 ++++ optimum/exporters/executorch/task_registry.py | 68 +++ .../exporters/executorch/tasks/__init__.py | 13 + .../exporters/executorch/tasks/causal_lm.py | 66 +++ setup.py | 4 + tests/executorch/export/__init__.py | 14 + .../export/test_exporters_executorch.py | 115 +++++ tests/executorch/runtime/__init__.py | 14 + tests/executorch/runtime/test_modeling.py | 
70 +++ .../executorch/runtime/test_modeling_gemma.py | 54 ++ .../runtime/test_modeling_gemma2.py | 56 +++ .../executorch/runtime/test_modeling_llama.py | 83 ++++ .../executorch/runtime/test_modeling_olmo.py | 54 ++ .../executorch/runtime/test_modeling_qwen2.py | 52 ++ 36 files changed, 2090 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/test_executorch_export.yml create mode 100644 .github/workflows/test_executorch_runtime.yml create mode 100644 docs/source/exporters/executorch/overview.mdx create mode 100644 docs/source/exporters/executorch/package_reference/configuration.mdx create mode 100644 docs/source/exporters/executorch/package_reference/export.mdx create mode 100644 docs/source/exporters/executorch/usage_guides/contribute.mdx create mode 100644 docs/source/exporters/executorch/usage_guides/export_a_model.mdx create mode 100644 optimum/commands/export/executorch.py create mode 100644 optimum/executorchruntime/__init__.py create mode 100644 optimum/executorchruntime/modeling_executorch.py create mode 100644 optimum/exporters/executorch/__init__.py create mode 100644 optimum/exporters/executorch/__main__.py create mode 100644 optimum/exporters/executorch/convert.py create mode 100644 optimum/exporters/executorch/recipe_registry.py create mode 100644 optimum/exporters/executorch/recipes/__init__.py create mode 100644 optimum/exporters/executorch/recipes/xnnpack.py create mode 100644 optimum/exporters/executorch/task_registry.py create mode 100644 optimum/exporters/executorch/tasks/__init__.py create mode 100644 optimum/exporters/executorch/tasks/causal_lm.py create mode 100644 tests/executorch/export/__init__.py create mode 100644 tests/executorch/export/test_exporters_executorch.py create mode 100644 tests/executorch/runtime/__init__.py create mode 100644 tests/executorch/runtime/test_modeling.py create mode 100644 tests/executorch/runtime/test_modeling_gemma.py create mode 100644 tests/executorch/runtime/test_modeling_gemma2.py create 
mode 100644 tests/executorch/runtime/test_modeling_llama.py create mode 100644 tests/executorch/runtime/test_modeling_olmo.py create mode 100644 tests/executorch/runtime/test_modeling_qwen2.py diff --git a/.github/workflows/test_executorch_export.yml b/.github/workflows/test_executorch_export.yml new file mode 100644 index 00000000000..1571cd0cffb --- /dev/null +++ b/.github/workflows/test_executorch_export.yml @@ -0,0 +1,35 @@ +name: ExecuTorch Export / Python - Test + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + strategy: + fail-fast: false + matrix: + python-version: ['3.10', '3.11', '3.12'] + os: [macos-15] + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for ExecuTorch + run: | + pip install .[tests,exporters-executorch] + pip list + - name: Run tests + working-directory: tests + run: | + RUN_SLOW=1 pytest executorch/export/test_*.py -s -vvvv --durations=0 diff --git a/.github/workflows/test_executorch_runtime.yml b/.github/workflows/test_executorch_runtime.yml new file mode 100644 index 00000000000..d5bbc0f8eaa --- /dev/null +++ b/.github/workflows/test_executorch_runtime.yml @@ -0,0 +1,42 @@ +name: ExecuTorch Runtime / Python - Test + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + strategy: + fail-fast: false + matrix: + python-version: ['3.10', '3.11', '3.12'] + os: [macos-15] + test-modeling: + - test_modeling_gemma2.py + - test_modeling_gemma.py + - test_modeling_llama.py + - test_modeling_olmo.py + - test_modeling.py + - test_modeling_qwen2.py + + runs-on: ${{ 
matrix.os }} + steps: + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for ExecuTorch + run: | + pip install .[tests,exporters-executorch] + pip list + - name: Run tests + working-directory: tests + run: | + RUN_SLOW=1 pytest executorch/runtime/${{ matrix.test-modeling }} -s -vvvv --durations=0 diff --git a/docs/Dockerfile b/docs/Dockerfile index d76dc50c556..5181177f0db 100644 --- a/docs/Dockerfile +++ b/docs/Dockerfile @@ -1,4 +1,4 @@ -FROM nikolaik/python-nodejs:python3.9-nodejs18 +FROM nikolaik/python-nodejs:python3.11-nodejs23 ARG commit_sha ARG clone_url @@ -8,4 +8,4 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/doc-builder.git RUN git clone $clone_url && cd optimum && git checkout $commit_sha -RUN python3 -m pip install --no-cache-dir ./optimum[onnxruntime,benchmark,quality,exporters-tf,doc-build,diffusers] +RUN python3 -m pip install --no-cache-dir ./optimum[onnxruntime,benchmark,quality,exporters-executorch,doc-build,diffusers] diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml index 8444da1b9a9..dc69564b045 100644 --- a/docs/source/_toctree.yml +++ b/docs/source/_toctree.yml @@ -81,6 +81,23 @@ title: Reference isExpanded: false title: "ONNX" + - sections: + - local: exporters/executorch/overview + title: Overview + - sections: + - local: exporters/executorch/usage_guides/export_a_model + title: Export a model to ExecuTorch + - local: exporters/executorch/usage_guides/contribute + title: Add support for exporting an architecture to ExecuTorch + title: How-to guides + - sections: + - local: exporters/executorch/package_reference/configuration + title: ExecuTorch configurations + - local: exporters/executorch/package_reference/export + title: Export functions + title: Reference + isExpanded: false + title:
"ExecuTorch" - sections: - local: exporters/tflite/overview title: Overview diff --git a/docs/source/exporters/executorch/overview.mdx b/docs/source/exporters/executorch/overview.mdx new file mode 100644 index 00000000000..0e880968bf7 --- /dev/null +++ b/docs/source/exporters/executorch/overview.mdx @@ -0,0 +1,26 @@ + + +# Overview + +🤗 Optimum handles the export of PyTorch models to ExecuTorch in the `exporters.executorch` module. It provides classes, functions, and a command line interface to perform the export easily. + +Supported architectures from [🤗 Transformers](https://huggingface.co/docs/transformers/index): + +- Gemma +- Gemma2 +- Llama2 +- Llama3(Llama3.2) +- OLMo +- Qwen2(Qwen2.5) + +Many more models are supported by ExecuTorch; we will add those models to Optimum over time. Read more at [pytorch/executorch/examples/](https://github.com/pytorch/executorch/tree/main/examples) diff --git a/docs/source/exporters/executorch/package_reference/configuration.mdx b/docs/source/exporters/executorch/package_reference/configuration.mdx new file mode 100644 index 00000000000..b7a10b80419 --- /dev/null +++ b/docs/source/exporters/executorch/package_reference/configuration.mdx @@ -0,0 +1,54 @@ + + +# Configuration for ExecuTorch Export + +ExecuTorch export provides a flexible configuration mechanism through dynamic registration, enabling users to have +complete control over the export process. The configuration system is divided into task configurations and recipe +configurations, each addressing specific aspects of the export pipeline. + + +## Task Configurations + +Task configurations determine how a Hugging Face model should be loaded and prepared for export, tailored to specific tasks. + +For instance, when exporting a model for a text generation task, the provided configuration utilizes **static caching** and +**SDPA (Scaled Dot-Product Attention)** for inference optimization.
+ +By leveraging task configurations, users can ensure that their models are appropriately prepared for efficient execution on +the ExecuTorch backend. + +[[autodoc]] exporters.executorch.task_registry.discover_tasks + +[[autodoc]] exporters.executorch.task_registry.register_task + +[[autodoc]] exporters.executorch.tasks.causal_lm.load_causal_lm_model + + +## Recipe Configurations + +Recipe configurations control the specifics of lowering an eager PyTorch module to the ExecuTorch backend. These +configurations allow users to: + +- Specify whether and how to **quantize** the model. +- Delegate computation to various accelerators, such as **CPU**, **GPU**, **NPU**, **DSP**, and others. +- Define **custom transformation passes**. +- Implement advanced techniques like memory planning algorithms to optimize resource utilization. + +[[autodoc]] exporters.executorch.recipe_registry.discover_recipes + +[[autodoc]] exporters.executorch.recipe_registry.register_recipe + +[[autodoc]] exporters.executorch.recipes.xnnpack.export_to_executorch_with_xnnpack + +The combination of task and recipe configurations ensures that users can customize both the high-level task setup +and the low-level export details to suit their deployment requirements. diff --git a/docs/source/exporters/executorch/package_reference/export.mdx b/docs/source/exporters/executorch/package_reference/export.mdx new file mode 100644 index 00000000000..6663eb5278e --- /dev/null +++ b/docs/source/exporters/executorch/package_reference/export.mdx @@ -0,0 +1,26 @@ + + +# Export functions + +## Main functions + +[[autodoc]] exporters.executorch.convert.export_to_executorch + +The primary export function is designed to be **model- and task-independent** as well as **optimization-agnostic**, providing a +highly flexible and modular interface for exporting Hugging Face models to the ExecuTorch backend. 
+ +This approach highlights the **composability** of the ExecuTorch export pipeline, where dynamically registered **task configurations** +specify how a Hugging Face model is prepared, and **recipe configurations** encapsulate device-specific optimizations during export. This +separation allows users to customize the export process without altering the core function. + +For more details on task and recipe configurations, see the [Configuration for ExecuTorch Export](./configuration.mdx). diff --git a/docs/source/exporters/executorch/usage_guides/contribute.mdx b/docs/source/exporters/executorch/usage_guides/contribute.mdx new file mode 100644 index 00000000000..2c6c1593169 --- /dev/null +++ b/docs/source/exporters/executorch/usage_guides/contribute.mdx @@ -0,0 +1,57 @@ + + +# Adding support for an unsupported architecture + +We welcome contributions to extend the functionality of ExecuTorch export. This guide provides high-level instructions for contributors who want to: + +1. Export a new model that is not currently supported. +2. Add new recipes or support a new task for export. + +--- + +## Exporting a New Model + +If you want to export a model that is not already supported by the library, follow these steps: + +### Step 1: Export and Test the Model +1. Attempt to export and lower the model using an existing task and recipe. On success, it will store the exported model in a `.pte` file. +2. Add a test case for the model in the appropriate test suite. + - For example, you can make sure tests pass for the new `my_new_model` by running: + ```bash + pytest tests/executorch/export/test_*.py -k "test_my_new_model" # doctest: +SKIP + pytest tests/executorch/runtime/test_*.py -k "test_my_new_model" # doctest: +SKIP + ``` + +### Step 2: Handle Export Failures +1. If the export fails in Step 1, report the issue by opening a GitHub issue. +2.
If the issue requires changes to the model’s architecture or its Hugging Face implementation, these modifications may be made upstream in the Hugging Face Transformers library. + +--- + +## Adding New Recipes or Tasks + +To extend ExecuTorch with new recipes or tasks, follow these guidelines: + +### Registering a New Recipe +You can add a custom recipe to define specific optimizations or configurations for exporting models. Below is an example: + +```python +from optimum.exporters.executorch import register_recipe + +@register_recipe("my_custom_recipe") +def export_with_custom_recipe(model, config, *args, **kwargs): + ... # Example: Apply a custom quantization +``` + +### Registering a Task +The task registration process is the same as adding a recipe. In addition, you may need to implement a new `ExecuTorchModelForXXX` class. diff --git a/docs/source/exporters/executorch/usage_guides/export_a_model.mdx b/docs/source/exporters/executorch/usage_guides/export_a_model.mdx new file mode 100644 index 00000000000..7993188cbd5 --- /dev/null +++ b/docs/source/exporters/executorch/usage_guides/export_a_model.mdx @@ -0,0 +1,124 @@ + + +# Export a model to ExecuTorch with optimum.exporters.executorch + +If you need to deploy 🤗 Transformers models for on-device use cases, we recommend +exporting them to a serialized format that can be distributed and executed on specialized +runtimes and hardware. In this guide, we'll show you how to export these +models to [ExecuTorch](https://pytorch.org/executorch/main/intro-overview.html). + + +## Why ExecuTorch? + +ExecuTorch is the ideal solution for deploying PyTorch models on edge devices, offering a streamlined process from +export to deployment without leaving the PyTorch ecosystem. + +Supporting on-device AI presents unique challenges with diverse hardware, critical power requirements, low/no internet +connectivity, and realtime processing needs.
These constraints have historically prevented or slowed down the creation +of scalable and performant on-device AI solutions. We designed ExecuTorch, backed by our industry partners like Meta, +Arm, Apple, Qualcomm, MediaTek, etc. to be highly portable and provide superior developer productivity without losing on +performance. + + +## Summary + +Exporting a PyTorch model to ExecuTorch is as simple as + +```bash +optimum-cli export executorch --model "meta-llama/Llama-3.2-1B" --task "text-generation" --recipe "xnnpack" --output_dir "meta_llama3_2_1b" +``` + +Check out the help for more options: + +```bash +optimum-cli export executorch --help +``` + + +## Exporting a model to ExecuTorch using the CLI + +To export a 🤗 Transformers model to ExecuTorch, you'll first need to install some extra +dependencies: + +```bash +pip install optimum[exporters-executorch] +``` + +The Optimum ExecuTorch export can be used through Optimum command-line: + +```bash +optimum-cli export executorch --help + +usage: optimum-cli export executorch [-h] -m MODEL [-o OUTPUT_DIR] [--task TASK] [--recipe RECIPE] + +options: + -h, --help show this help message and exit + +Required arguments: + -m MODEL, --model MODEL + Model ID on huggingface.co or path on disk to load model from. + -o OUTPUT_DIR, --output_dir OUTPUT_DIR + Path indicating the directory where to store the generated ExecuTorch model. + --task TASK The task to export the model for. 
Available tasks depend on the model, but are among: ['audio-classification', 'feature-extraction', 'image-to-text', + 'sentence-similarity', 'depth-estimation', 'image-segmentation', 'audio-frame-classification', 'masked-im', 'semantic-segmentation', 'text-classification', + 'audio-xvector', 'mask-generation', 'question-answering', 'text-to-audio', 'automatic-speech-recognition', 'image-to-image', 'multiple-choice', 'image- + classification', 'text2text-generation', 'token-classification', 'object-detection', 'zero-shot-object-detection', 'zero-shot-image-classification', 'text- + generation', 'fill-mask']. + --recipe RECIPE Pre-defined recipes for export to ExecuTorch. Defaults to "xnnpack". + +``` + +Exporting a checkpoint can be done as follows: + +```bash +optimum-cli export executorch --model "meta-llama/Llama-3.2-1B" --task "text-generation" --recipe "xnnpack" --output_dir "meta_llama3_2_1b" +``` + +You should see that a `model.pte` file is stored under "./meta_llama3_2_1b/": + +```bash +meta_llama3_2_1b/ +└── model.pte +``` + +This will fetch the model from the Hub and export the PyTorch model with the specialized recipe. The resulting `model.pte` file can then be run on the [XNNPACK backend](https://pytorch.org/executorch/main/tutorial-xnnpack-delegate-lowering.html), or on many +other ExecuTorch-supported backends if exported with different recipes, e.g. Apple's [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html) or [MPS](https://pytorch.org/executorch/main/build-run-mps.html), [Qualcomm's SoCs](https://pytorch.org/executorch/main/build-run-qualcomm-ai-engine-direct-backend.html), [ARM's Ethos-U](https://pytorch.org/executorch/main/executorch-arm-delegate-tutorial.html), [Xtensa HiFi4 DSP](https://pytorch.org/executorch/main/build-run-xtensa.html), [Vulkan GPU](https://pytorch.org/executorch/main/build-run-vulkan.html), [MediaTek](https://pytorch.org/executorch/main/build-run-mediatek-backend.html), etc.
+ +For example, we can load and run the model with [ExecuTorch +Runtime](https://pytorch.org/executorch/main/runtime-overview.html) using the `optimum.executorchruntime` package as follows: + +```python +>>> from transformers import AutoTokenizer +>>> from optimum.executorchruntime import ExecuTorchModelForCausalLM + +>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B") # doctest: +SKIP +>>> model = ExecuTorchModelForCausalLM.from_pretrained("meta_llama3_2_1b/", export=False) # doctest: +SKIP + +>>> generated_text = model.text_generation(tokenizer=tokenizer, prompt="Simply put, the theory of relativity states that", max_seq_len=45) # doctest: +SKIP +``` + +Printing the `generated_text` would give that: + +``` +"Simply put, the theory of relativity states that the laws of physics are the same in all inertial frames of reference. In other words, the laws of physics are the same in all inertial frames of reference." +``` + +As you can see, converting a model to ExecuTorch does not mean leaving the Hugging Face ecosystem. You end up with a similar API as regular 🤗 Transformers models! + +It is also possible to export the model to ExecuTorch directly from the `ExecuTorchModelForCausalLM` class by doing the following: + +```python +>>> from optimum.executorchruntime import ExecuTorchModelForCausalLM + +>>> model = ExecuTorchModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", export=True, task="text-generation", recipe="xnnpack") +``` diff --git a/docs/source/exporters/overview.mdx b/docs/source/exporters/overview.mdx index 6fd7bd9d916..2b4c2e11792 100644 --- a/docs/source/exporters/overview.mdx +++ b/docs/source/exporters/overview.mdx @@ -12,4 +12,4 @@ specific language governing permissions and limitations under the License. # Overview -🤗 Optimum enables exporting models from PyTorch or TensorFlow to different formats through its `exporters` module. For now, two exporting format are supported: ONNX and TFLite (TensorFlow Lite). 
+🤗 Optimum enables exporting models from PyTorch or TensorFlow to different formats through its `exporters` module. For now, three export formats are supported: ONNX, TFLite (TensorFlow Lite), and ExecuTorch. diff --git a/optimum/commands/__init__.py b/optimum/commands/__init__.py index 8a2a276d1c5..a31344ed133 100644 --- a/optimum/commands/__init__.py +++ b/optimum/commands/__init__.py @@ -14,5 +14,5 @@ from .base import BaseOptimumCLICommand, CommandInfo, RootOptimumCLICommand from .env import EnvironmentCommand -from .export import ExportCommand, ONNXExportCommand, TFLiteExportCommand +from .export import ExecuTorchExportCommand, ExportCommand, ONNXExportCommand, TFLiteExportCommand from .optimum_cli import optimum_cli_subcommand diff --git a/optimum/commands/export/__init__.py b/optimum/commands/export/__init__.py index 19da68a60d2..b72cd5dbc8d 100644 --- a/optimum/commands/export/__init__.py +++ b/optimum/commands/export/__init__.py @@ -14,5 +14,6 @@ from .base import ExportCommand +from .executorch import ExecuTorchExportCommand from .onnx import ONNXExportCommand from .tflite import TFLiteExportCommand diff --git a/optimum/commands/export/base.py b/optimum/commands/export/base.py index 07737cb8eaf..e5ed4c90ff5 100644 --- a/optimum/commands/export/base.py +++ b/optimum/commands/export/base.py @@ -15,6 +15,7 @@ """optimum.exporters command-line interface base classes.""" from ..
import BaseOptimumCLICommand, CommandInfo +from .executorch import ExecuTorchExportCommand from .onnx import ONNXExportCommand from .tflite import TFLiteExportCommand @@ -25,6 +26,11 @@ class ExportCommand(BaseOptimumCLICommand): help="Export PyTorch and TensorFlow models to several format.", ) SUBCOMMANDS = ( + CommandInfo( + name="executorch", + help="Export PyTorch model to ExecuTorch.", + subcommand_class=ExecuTorchExportCommand, + ), CommandInfo( name="onnx", help="Export PyTorch and TensorFlow to ONNX.", diff --git a/optimum/commands/export/executorch.py b/optimum/commands/export/executorch.py new file mode 100644 index 00000000000..2bf2f1d3054 --- /dev/null +++ b/optimum/commands/export/executorch.py @@ -0,0 +1,67 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +"""Defines the command line for the export with ExecuTorch.""" + +from pathlib import Path +from typing import TYPE_CHECKING + +from ...exporters import TasksManager +from ..base import BaseOptimumCLICommand + + +if TYPE_CHECKING: + from argparse import ArgumentParser + + +def parse_args_executorch(parser): + required_group = parser.add_argument_group("Required arguments") + required_group.add_argument( + "-m", "--model", type=str, required=True, help="Model ID on huggingface.co or path on disk to load model from." 
+ ) + required_group.add_argument( + "-o", + "--output_dir", + type=Path, + help="Path indicating the directory where to store the generated ExecuTorch model.", + ) + required_group.add_argument( + "--task", + type=str, + default="text-generation", + help=( + "The task to export the model for. Available tasks depend on the model, but are among:" + f" {str(TasksManager.get_all_tasks())}." + ), + ) + required_group.add_argument( + "--recipe", + type=str, + default="xnnpack", + help='Pre-defined recipes for export to ExecuTorch. Defaults to "xnnpack".', + ) + + +class ExecuTorchExportCommand(BaseOptimumCLICommand): + @staticmethod + def parse_args(parser: "ArgumentParser"): + return parse_args_executorch(parser) + + def run(self): + from ...exporters.executorch import main_export + + main_export( + model_name_or_path=self.args.model, + task=self.args.task, + recipe=self.args.recipe, + output_dir=self.args.output_dir, + ) diff --git a/optimum/executorchruntime/__init__.py b/optimum/executorchruntime/__init__.py new file mode 100644 index 00000000000..0a84c3a139b --- /dev/null +++ b/optimum/executorchruntime/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +from typing import TYPE_CHECKING + +from transformers.utils import _LazyModule + + +_import_structure = { + "modeling_executorch": [ + "ExecuTorchModelForCausalLM", + ], +} + +if TYPE_CHECKING: + from .modeling_executorch import ExecuTorchModelForCausalLM +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/optimum/executorchruntime/modeling_executorch.py b/optimum/executorchruntime/modeling_executorch.py new file mode 100644 index 00000000000..b93309f6a48 --- /dev/null +++ b/optimum/executorchruntime/modeling_executorch.py @@ -0,0 +1,460 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +"""ExecuTorchModelForXXX classes, allowing to run ExecuTorch Models with ExecuTorch Runtime using the same API as Transformers.""" + +import logging +import os +import warnings +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import List, Optional, Union + +import torch +from executorch.extension.pybindings.portable_lib import ( + ExecuTorchModule, + _load_for_executorch, +) +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from transformers import ( + AutoModelForCausalLM, + PretrainedConfig, + PreTrainedTokenizer, +) + +from ..exporters.executorch import main_export +from ..modeling_base import OptimizedModel + + +logger = logging.getLogger(__name__) + + +class ExecuTorchModelForCausalLM(OptimizedModel): + """ + ExecuTorch model with a causal language modeling head for inference using the ExecuTorch Runtime. + + This class provides an interface for loading, running, and generating outputs from a causal language model + optimized for ExecuTorch Runtime. It includes utilities for exporting and loading pre-trained models + compatible with ExecuTorch runtime. + + Attributes: + auto_model_class (`Type`): + Associated Transformers class, `AutoModelForCausalLM`. + et_model (`ExecuTorchModule`): + The loaded ExecuTorch model. + use_kv_cache (`bool`): + Whether key-value caching is enabled. For performance reasons, the exported model is + optimized to use a static cache. + max_cache_size (`int`): + Maximum sequence length supported by the cache. + max_batch_size (`int`): + Maximum supported batch size. + dtype (`str`): + Data type of the model parameters. + bos_token_id (`int`): + Beginning-of-sequence token ID. + eos_token_id (`int`): + End-of-sequence token ID. + vocab_size (`int`): + Size of the model vocabulary. 
+ """ + + auto_model_class = AutoModelForCausalLM + + def __init__( + self, + model: "ExecuTorchModule", + config: "PretrainedConfig", + ): + super().__init__(model, config) + self.et_model = model + metadata = self.et_model.method_names() + logging.info(f"Load all static methods: {metadata}") + if "use_kv_cache" in metadata: + self.use_kv_cache = self.et_model.run_method("use_kv_cache")[0] + if "get_max_seq_len" in metadata: + self.max_cache_size = self.et_model.run_method("get_max_seq_len")[0] + if "get_max_batch_size" in metadata: + self.max_batch_size = self.et_model.run_method("get_max_batch_size")[0] + if "get_dtype" in metadata: + self.dtype = self.et_model.run_method("get_dtype")[0] + if "get_bos_id" in metadata: + self.bos_token_id = self.et_model.run_method("get_bos_id")[0] + if "get_eos_id" in metadata: + self.eos_token_id = self.et_model.run_method("get_eos_id")[0] + if "get_vocab_size" in metadata: + self.vocab_size = self.et_model.run_method("get_vocab_size")[0] + + def forward( + self, + input_ids: torch.Tensor, + cache_position: torch.Tensor, + ) -> torch.Tensor: + """ + Forward pass of the model, which is compatible with the ExecuTorch runtime for LLM. + + Args: + input_ids (`torch.Tensor`): Tensor representing current input token id to the model. + cache_position (`torch.Tensor`): Tensor representing current input position in the cache. + + Returns: + torch.Tensor: Logits output from the model. 
+ """ + return self.et_model.forward((input_ids, cache_position))[0] + + @classmethod + def from_pretrained( + cls, + model_name_or_path: Union[str, Path], + export: bool = True, + task: str = "", + recipe: str = "", + config: "PretrainedConfig" = None, + subfolder: str = "", + revision: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + force_download: bool = False, + local_files_only: bool = False, + use_auth_token: Optional[Union[bool, str]] = None, + token: Optional[Union[bool, str]] = None, + **kwargs, + ) -> "ExecuTorchModelForCausalLM": + """ + Load a pre-trained ExecuTorch model. + + Args: + model_name_or_path (`Union[str, Path]`): + Model ID on huggingface.co or path on disk to the model repository to export. Example: `model_name_or_path="meta-llama/Llama-3.2-1B"` or `model_name_or_path="/path/to/model_folder"`. + export (`bool`, *optional*, defaults to `True`): + If `True`, the model will be exported from eager to ExecuTorch after being fetched from huggingface.co. `model_name_or_path` must be a valid model ID on huggingface.co. + If `False`, the previously exported ExecuTorch model will be loaded from a local path. `model_name_or_path` must be a valid local directory where a `model.pte` is stored. + task (`str`, defaults to `""`): + The task to export the model for, e.g. "text-generation". It is required to specify a task when `export` is `True`. + recipe (`str`, defaults to `""`): + The recipe to use to do the export, e.g. "xnnpack". It is required to specify a recipe when `export` is `True`. + config (`PretrainedConfig`, *optional*): + Configuration of the pre-trained model. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id.
+ cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[Union[bool,str]]`, defaults to `None`): + Deprecated. Please use the `token` argument instead. + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). + **kwargs: + Additional configuration options to tasks and recipes. + + Returns: + `ExecuTorchModelForCausalLM`: An instance of the ExecuTorch model for text generation task. + """ + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") + token = use_auth_token + + if export: + # Fetch the model from huggingface.co and export it to ExecuTorch + if task == "": + raise ValueError("Please specify a task to export the model for.") + if recipe == "": + raise ValueError("Please specify a recipe to export the model for.") + return cls._export( + model_id=model_name_or_path, + task=task, + recipe=recipe, + config=config, + **kwargs, + ) + else: + # Load the ExecuTorch model from a local path + return cls._from_pretrained( + model_dir_path=model_name_or_path, + config=config, + ) + + @classmethod + def _from_pretrained( + cls, + model_dir_path: Union[str, Path], + config: PretrainedConfig, + subfolder: str = "", + revision: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + force_download: bool = False, + local_files_only: bool = False, + use_auth_token: Optional[Union[bool, str]] = None, + token: Optional[Union[bool, str]] = None, + ) -> "ExecuTorchModelForCausalLM": + """ + Load a pre-trained ExecuTorch model from a local directory. + + Args: + model_dir_path (`Union[str, Path]`): + Path to the directory containing the ExecuTorch model file (`model.pte`). + config (`PretrainedConfig`, *optional*): + Configuration of the pre-trained model. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. 
+ force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[Union[bool,str]]`, defaults to `None`): + Deprecated. Please use the `token` argument instead. + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). + + Returns: + `ExecuTorchModelForCausalLM`: The initialized ExecuTorch model. + + """ + full_path = os.path.join(f"{model_dir_path}", "model.pte") + model = _load_for_executorch(full_path) + logging.info(f"Loaded model from {full_path}") + logging.debug(f"{model.method_meta('forward')}") + return cls( + model=model, + config=config, + ) + + def _save_pretrained(self, save_directory): + """ + Saves the model weights into a directory, so that the model can be re-loaded using the + [`from_pretrained`] class method. + """ + raise NotImplementedError + + @classmethod + def _export( + cls, + model_id: str, + task: str, + recipe: str, + config: PretrainedConfig, + cache_dir: str = HUGGINGFACE_HUB_CACHE, + trust_remote_code: bool = False, + subfolder: str = "", + revision: Optional[str] = None, + force_download: bool = False, + local_files_only: bool = False, + use_auth_token: Optional[Union[bool, str]] = None, + token: Optional[Union[bool, str]] = None, + **kwargs, + ): + """ + Fetch a model from the Hugging Face Hub and export it to ExecuTorch format. + + Args: + model_id (`str`): + Model ID on huggingface.co, for example: `model_id="meta-llama/Llama-3.2-1B"`. + task (`str`): + The task to export the model for, e.g. "text-generation". 
+ recipe (`str`): + The recipe to use to do the export, e.g. "xnnpack". + config (`PretrainedConfig`, *optional*): + Configuration of the pre-trained model. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + trust_remote_code (`bool`, defaults to `False`): + Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories + you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the + model repository. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[Union[bool,str]]`, defaults to `None`): + Deprecated. Please use the `token` argument instead. + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). + **kwargs: + Additional configuration options to tasks and recipes. + + Returns: + `ExecuTorchModelForCausalLM`: The loaded and exported ExecuTorch model. + + """ + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") + token = use_auth_token + + save_dir = TemporaryDirectory() + save_dir_path = Path(save_dir.name) + + # Export to ExecuTorch and save the pte file to the temporary directory + main_export( + model_name_or_path=model_id, + output_dir=save_dir_path, + task=task, + recipe=recipe, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + local_files_only=local_files_only, + force_download=force_download, + trust_remote_code=trust_remote_code, + **kwargs, + ) + + return cls._from_pretrained( + model_dir_path=save_dir_path, + config=config, + use_auth_token=use_auth_token, + subfolder=subfolder, + revision=revision, + cache_dir=cache_dir, + token=token, + local_files_only=local_files_only, + force_download=force_download, + ) + + def generate( + self, + prompt_tokens: List[int], + echo: bool = False, + pos_base: int = 0, + max_seq_len: Optional[int] = None, + ) -> List[int]: + """ + Generate tokens from a prompt using the ExecuTorch model. + + Args: + prompt_tokens (List[int]): + List of token IDs representing the prompt. + echo (`bool`, *optional*): + Whether to include prompt tokens in the generated output. Defaults to `False`. + pos_base (`int`, *optional*): + Base position for the prompt tokens. Defaults to 0. + max_seq_len (`int`, *optional*): + Maximum sequence length for the generated output. + Defaults to None and uses the model's `max_cache_size` attribute. + Will be truncated to maximal cache size if larger than `max_cache_size`. + + Returns: + List[int]: List of generated token IDs. + + Note: + Temporarily implemented this method in Python due to limited access to ExecuTorch's c++ LLM runner via pybind. + Expect improvements to the pybind interface in ExecuTorch version 0.4.1. 
+ """ + self.device = torch.device("cpu") + if max_seq_len is None: + # Default to max_cache_size if max_seq_len is not specified + max_seq_len = self.max_cache_size + elif max_seq_len > self.max_cache_size: + logging.warning( + f"max_seq_len={max_seq_len} is larger than max_cache_size={self.max_cache_size}. Generating tokens will be truncated to max_cache_size." + ) + max_seq_len = self.max_cache_size + generated_tokens = [] + + # prefill + for i, prompt_token in enumerate(prompt_tokens): + logits = self.forward( + input_ids=torch.tensor([prompt_token], dtype=torch.long, device=self.device).unsqueeze(0), + cache_position=torch.tensor([i], dtype=torch.long, device=self.device), + ) + + next_token = torch.argmax(logits, dim=-1).item() + generated_tokens = prompt_tokens + [next_token] + + while len(generated_tokens) < max_seq_len: + logits = self.forward( + input_ids=torch.tensor([next_token], dtype=torch.long, device=self.device).unsqueeze(0), + cache_position=torch.tensor( + [pos_base + len(generated_tokens) - 1], + dtype=torch.long, + device=self.device, + ), + ) + next_token = torch.argmax(logits, dim=-1).item() + generated_tokens.append(next_token) + if next_token == self.eos_token_id: + break + + return generated_tokens if echo else generated_tokens[len(prompt_tokens) :] + + def text_generation( + self, + tokenizer: "PreTrainedTokenizer", + prompt: str, + echo: bool = True, + max_seq_len: Optional[int] = None, + ): + """ + Perform text generation task for a given prompt using the ExecuTorch model. + + Args: + tokenizer (`PreTrainedTokenizer`): + The tokenizer used to encode and decode the prompt and output. + prompt (`str`): + The text prompt to complete. + echo (`bool`, *optional*): + Whether to include prompt tokens in the generated output. Defaults to `True`. + max_seq_len (`int`, *optional*): + Maximum sequence length for the generated output. + Defaults to None and uses the model's `max_cache_size` attribute. 
+ Will be truncated to maximal cache size if larger than `max_cache_size`. + """ + self.tokenizer = tokenizer + + # Sanity check + if self.tokenizer.bos_token_id is not None and self.tokenizer.bos_token_id != self.bos_token_id: + raise ValueError( + f"The tokenizer's bos_token_id={self.tokenizer.bos_token_id} must be the same as the model's bos_token_id={self.bos_token_id}." + ) + if self.tokenizer.eos_token_id is not None and self.tokenizer.eos_token_id != self.eos_token_id: + raise ValueError( + f"The tokenizer's eos_token_id={self.tokenizer.eos_token_id} must be the same as the model's eos_token_id={self.eos_token_id}." + ) + + prompt_tokens = self.tokenizer.encode(prompt) + generated_tokens = self.generate( + prompt_tokens=prompt_tokens, + echo=echo, + max_seq_len=max_seq_len, + ) + return self.tokenizer.decode(generated_tokens, skip_special_tokens=True) diff --git a/optimum/exporters/__init__.py b/optimum/exporters/__init__.py index eef17dac7f7..7b08812a569 100644 --- a/optimum/exporters/__init__.py +++ b/optimum/exporters/__init__.py @@ -13,4 +13,5 @@ # See the License for the specific language governing permissions and # limitations under the License. from . import onnx # noqa +from . import executorch # noqa from .tasks import TasksManager # noqa diff --git a/optimum/exporters/executorch/__init__.py b/optimum/exporters/executorch/__init__.py new file mode 100644 index 00000000000..3409e69fcfb --- /dev/null +++ b/optimum/exporters/executorch/__init__.py @@ -0,0 +1,50 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +from typing import TYPE_CHECKING + +from transformers.utils import _LazyModule + + +_import_structure = { + "convert": [ + "export_to_executorch", + ], + "recipe_registry": [ + "discover_recipes", + "register_recipe", + ], + "task_registry": [ + "discover_tasks", + "register_task", + ], + "tasks": [ + "causal_lm", + ], + "recipes": [ + "xnnpack", + ], + "__main__": ["main_export"], +} + +if TYPE_CHECKING: + from .__main__ import main_export + from .convert import export_to_executorch +else: + import sys + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + _import_structure, + module_spec=__spec__, + ) diff --git a/optimum/exporters/executorch/__main__.py b/optimum/exporters/executorch/__main__.py new file mode 100644 index 00000000000..33a668b0674 --- /dev/null +++ b/optimum/exporters/executorch/__main__.py @@ -0,0 +1,160 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +"""Entry point to the optimum.exporters.executorch command line.""" + +import argparse +import os +import warnings +from pathlib import Path + +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from transformers.utils import is_torch_available + +from optimum.utils.import_utils import check_if_transformers_greater + +from ...commands.export.executorch import parse_args_executorch +from .convert import export_to_executorch +from .task_registry import discover_tasks, task_registry + + +if is_torch_available(): + pass + +from typing import Optional, Union + + +def main_export( + model_name_or_path: str, + task: str, + recipe: str, + output_dir: Union[str, Path], + cache_dir: str = HUGGINGFACE_HUB_CACHE, + trust_remote_code: bool = False, + pad_token_id: Optional[int] = None, + subfolder: str = "", + revision: str = "main", + force_download: bool = False, + local_files_only: bool = False, + use_auth_token: Optional[Union[bool, str]] = None, + token: Optional[Union[bool, str]] = None, + **kwargs, +): + """ + Full-suite ExecuTorch export function, exporting **from a model ID on Hugging Face Hub or a local model repository**. + + Args: + model_name_or_path (`str`): + Model ID on huggingface.co or path on disk to the model repository to export. Example: `model_name_or_path="meta-llama/Llama-3.2-1B"` or `model_name_or_path="/path/to/model_folder"`. + task (`str`): + The task to export the model for, e.g. "text-generation". + recipe (`str`): + The recipe to use to do the export, e.g. "xnnpack". + output_dir (`Union[str, Path]`): + Path indicating the directory where to store the generated ExecuTorch model. + cache_dir (`Optional[str]`, defaults to `None`): + Path indicating where to store cache. The default Hugging Face cache path will be used by default. + trust_remote_code (`bool`, defaults to `False`): + Allows to use custom code for the modeling hosted in the model repository. 
This option should only be set for repositories + you trust and in which you have read the code, as it will execute on your local machine arbitrary code present in the + model repository. + pad_token_id (`Optional[int]`, defaults to `None`): + This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it. + subfolder (`str`, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo either locally or on huggingface.co, you can + specify the folder name here. + revision (`str`, defaults to `"main"`): + Revision is the specific model version to use. It can be a branch name, a tag name, or a commit id. + force_download (`bool`, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + local_files_only (`Optional[bool]`, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + use_auth_token (`Optional[Union[bool,str]]`, defaults to `None`): + Deprecated. Please use the `token` argument instead. + token (`Optional[Union[bool,str]]`, defaults to `None`): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `huggingface_hub.constants.HF_TOKEN_PATH`). + **kwargs: + Additional configuration options to tasks and recipes. + + Example usage: + ```python + >>> from optimum.exporters.executorch import main_export + + >>> main_export("meta-llama/Llama-3.2-1B", "text-generation", "xnnpack", "meta_llama3_2_1b/") + ``` + """ + + if not check_if_transformers_greater("4.46"): + raise ValueError( + "The minimum Transformers version compatible with ExecuTorch is 4.46.0. Please upgrade to Transformers 4.46.0 or later." + ) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed soon. 
Please use the `token` argument instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("You cannot use both `use_auth_token` and `token` arguments at the same time.") + token = use_auth_token + + # Dynamically discover and import registered tasks + discover_tasks() + + # Load the model for specific task + try: + task_func = task_registry.get(task) + except KeyError as e: + raise RuntimeError(f"The task '{task}' isn't registered. Detailed error: {e}") + + model = task_func(model_name_or_path, **kwargs) + + if task == "text-generation": + from transformers.integrations.executorch import TorchExportableModuleWithStaticCache + + model = TorchExportableModuleWithStaticCache(model) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + return export_to_executorch( + model=model, + task=task, + recipe=recipe, + output_dir=output_dir, + **kwargs, + ) + + +def main(): + parser = argparse.ArgumentParser("Hugging Face Optimum ExecuTorch exporter") + + parse_args_executorch(parser) + + # Retrieve CLI arguments + args = parser.parse_args() + + main_export( + model_name_or_path=args.model, + output_dir=args.output_dir, + task=args.task, + recipe=args.recipe, + cache_dir=args.cache_dir, + trust_remote_code=args.trust_remote_code, + pad_token_id=args.pad_token_id, + ) + + +if __name__ == "__main__": + main() diff --git a/optimum/exporters/executorch/convert.py b/optimum/exporters/executorch/convert.py new file mode 100644 index 00000000000..f50a4b54a96 --- /dev/null +++ b/optimum/exporters/executorch/convert.py @@ -0,0 +1,90 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +"""ExecuTorch model check and export functions.""" + +import logging +import os +from pathlib import Path +from typing import Union + +from transformers.utils import is_torch_available + +from optimum.utils.import_utils import check_if_transformers_greater + +from .recipe_registry import discover_recipes, recipe_registry + + +if is_torch_available(): + from transformers.modeling_utils import PreTrainedModel + +if check_if_transformers_greater("4.46"): + from transformers.integrations.executorch import ( + TorchExportableModuleWithStaticCache, + ) + +logger = logging.getLogger(__name__) + + +def export_to_executorch( + model: Union["PreTrainedModel", "TorchExportableModuleWithStaticCache"], + task: str, + recipe: str, + output_dir: Union[str, Path], + **kwargs, +): + """ + Export a pre-trained PyTorch model to the ExecuTorch format using a specified recipe. + + This function facilitates the transformation of a PyTorch model into an optimized ExecuTorch program. + + Args: + model (`Union["PreTrainedModel", "TorchExportableModuleWithStaticCache"]`): + A PyTorch model to be exported. This can be a standard HuggingFace `PreTrainedModel` or a wrapped + module like `TorchExportableModuleWithStaticCache` for text generation task. + task (`str`): + The specific task the exported model will perform, e.g., "text-generation". + recipe (`str`): + The recipe to guide the export process, e.g., "xnnpack". Recipes define the optimization and lowering steps. + Will raise an exception if the specified recipe is not registered in the recipe registry. 
+ output_dir (`Union[str, Path]`): + Path to the directory where the resulting ExecuTorch model will be saved. + **kwargs: + Additional configuration options passed to the recipe. + + Returns: + `ExecuTorchProgram`: + The lowered ExecuTorch program object. + + Notes: + - The function uses a dynamic recipe discovery mechanism to identify and import the specified recipe. + - The exported model is stored in the specified output directory with the fixed filename `model.pte`. + - The resulting ExecuTorch program is serialized and saved to the output directory. + """ + + # Dynamically discover and import registered recipes + discover_recipes() + + # Export and lower the model to ExecuTorch with the recipe + try: + recipe_func = recipe_registry.get(recipe) + except KeyError as e: + raise RuntimeError(f"The recipe '{recipe}' isn't registered. Detailed error: {e}") + + executorch_prog = recipe_func(model, task, **kwargs) + + full_path = os.path.join(f"{output_dir}", "model.pte") + with open(full_path, "wb") as f: + executorch_prog.write_to_file(f) + logging.info(f"Saved exported program to {full_path}") + + return executorch_prog diff --git a/optimum/exporters/executorch/recipe_registry.py b/optimum/exporters/executorch/recipe_registry.py new file mode 100644 index 00000000000..2eb728b7573 --- /dev/null +++ b/optimum/exporters/executorch/recipe_registry.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. + +import importlib +import logging +import pkgutil + + +logger = logging.getLogger(__name__) + +recipe_registry = {} + +package_name = "optimum.exporters.executorch.recipes" + + +def register_recipe(recipe_name): + """ + Decorator to register a recipe for exporting and lowering an ExecuTorch model under a specific name. + + Args: + recipe_name (`str`): + The name of the recipe to associate with a callable recipe. + + Returns: + `Callable`: + The original function wrapped as a registered recipe. + + Example: + ```python + @register_recipe("my_new_recipe") + def my_new_recipe(...): + ... + ``` + """ + + def decorator(func): + recipe_registry[recipe_name] = func + return func + + return decorator + + +def discover_recipes(): + """ + Dynamically discovers and imports all recipe modules within the `optimum.exporters.executorch.recipes` package. + + Ensures recipes under `./recipes` directory are dynamically loaded without requiring manual imports. + + Notes: + New recipes **must** be added to the `./recipes` directory to be discovered and used by `main_export`. + Failure to do so will prevent dynamic discovery and registration. Recipes must also use the + `@register_recipe` decorator to be properly registered in the `recipe_registry`. + """ + package = importlib.import_module(package_name) + package_path = package.__path__ + + for _, module_name, _ in pkgutil.iter_modules(package_path): + logger.info(f"Importing {package_name}.{module_name}") + importlib.import_module(f"{package_name}.{module_name}") diff --git a/optimum/exporters/executorch/recipes/__init__.py b/optimum/exporters/executorch/recipes/__init__.py new file mode 100644 index 00000000000..a2e21cf3970 --- /dev/null +++ b/optimum/exporters/executorch/recipes/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +from . import xnnpack diff --git a/optimum/exporters/executorch/recipes/xnnpack.py b/optimum/exporters/executorch/recipes/xnnpack.py new file mode 100644 index 00000000000..d3b3a5d52aa --- /dev/null +++ b/optimum/exporters/executorch/recipes/xnnpack.py @@ -0,0 +1,97 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
+ +from typing import Union + +import torch +import torch.export._trace +from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner +from executorch.exir import ( + EdgeCompileConfig, + ExecutorchBackendConfig, + to_edge_transform_and_lower, +) +from torch.nn.attention import SDPBackend +from transformers import PreTrainedModel, TorchExportableModuleWithStaticCache + +from ..recipe_registry import register_recipe + + +@register_recipe("xnnpack") +def export_to_executorch_with_xnnpack( + model: Union[PreTrainedModel, TorchExportableModuleWithStaticCache], + task: str, + **kwargs, +): + """ + Export a PyTorch model to ExecuTorch w/ delegation to XNNPACK backend. + + This function also writes metadata required by the ExecuTorch runtime to the model. + + Args: + model (Union[PreTrainedModel, TorchExportableModuleWithStaticCache]): + The PyTorch model to be exported to ExecuTorch. + task (str): + The task name to export the model for (e.g., "text-generation"). + **kwargs: + Additional keyword arguments for recipe-specific configurations. + + Returns: + ExecuTorchProgram: + The exported and optimized program for ExecuTorch. 
+ """ + metadata = {} + if task == "text-generation": + example_input_ids = torch.tensor([[1]], dtype=torch.long) + example_cache_position = torch.tensor([0], dtype=torch.long) + + def _get_constant_methods(model: PreTrainedModel): + metadata = { + "get_dtype": 5 if model.config.torch_dtype == torch.float16 else 6, + "get_bos_id": model.config.bos_token_id, + "get_eos_id": model.config.eos_token_id, + "get_head_dim": model.config.hidden_size / model.config.num_attention_heads, + "get_max_batch_size": model.generation_config.cache_config.batch_size, + "get_max_seq_len": model.generation_config.cache_config.max_cache_len, + "get_n_kv_heads": model.config.num_key_value_heads, + "get_n_layers": model.config.num_hidden_layers, + "get_vocab_size": model.config.vocab_size, + "use_kv_cache": model.generation_config.use_cache, + } + return {k: v for k, v in metadata.items() if v is not None} + + metadata = _get_constant_methods(model if isinstance(model, PreTrainedModel) else model.model) + else: + # TODO: Prepare model inputs for other tasks + raise ValueError(f"Unsupported task '{task}'.") + + with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad(): + exported_program = torch.export._trace._export( + model, + args=(example_input_ids,), + kwargs={"cache_position": example_cache_position}, + pre_dispatch=False, + strict=True, + ) + + return to_edge_transform_and_lower( + exported_program, + partitioner=[XnnpackPartitioner()], + compile_config=EdgeCompileConfig( + _skip_dim_order=True, + ), + constant_methods=metadata, + ).to_executorch( + config=ExecutorchBackendConfig( + extract_delegate_segments=True, + ), + ) diff --git a/optimum/exporters/executorch/task_registry.py b/optimum/exporters/executorch/task_registry.py new file mode 100644 index 00000000000..fdc34f0359a --- /dev/null +++ b/optimum/exporters/executorch/task_registry.py @@ -0,0 +1,68 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import importlib +import logging +import pkgutil + + +logger = logging.getLogger(__name__) + +task_registry = {} + +package_name = "optimum.exporters.executorch.tasks" + + +def register_task(task_name): + """ + Decorator to register a task under a specific name. + + Args: + task_name (`str`): + The name of the task to associate with a callable task. + + Returns: + `Callable`: + The original function wrapped as a registered task. + + Example: + ```python + @register_task("my_new_task") + def my_new_task(...): + ... + ``` + """ + + def decorator(func): + task_registry[task_name] = func + return func + + return decorator + + +def discover_tasks(): + """ + Dynamically discovers and imports all task modules within the `optimum.exporters.executorch.tasks` package. + + Ensures tasks under `./tasks` directory are dynamically loaded without requiring manual imports. + + Notes: + New tasks **must** be added to the `./tasks` directory to be discovered and used by `main_export`. + Failure to do so will prevent dynamic discovery and registration. Tasks must also use the + `@register_task` decorator to be properly registered in the `task_registry`. 
+ """ + package = importlib.import_module(package_name) + package_path = package.__path__ + + for _, module_name, _ in pkgutil.iter_modules(package_path): + logger.info(f"Importing {package_name}.{module_name}") + importlib.import_module(f"{package_name}.{module_name}") diff --git a/optimum/exporters/executorch/tasks/__init__.py b/optimum/exporters/executorch/tasks/__init__.py new file mode 100644 index 00000000000..754a8241ca3 --- /dev/null +++ b/optimum/exporters/executorch/tasks/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +from . import causal_lm diff --git a/optimum/exporters/executorch/tasks/causal_lm.py b/optimum/exporters/executorch/tasks/causal_lm.py new file mode 100644 index 00000000000..b02da8b319e --- /dev/null +++ b/optimum/exporters/executorch/tasks/causal_lm.py @@ -0,0 +1,66 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on +# an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +# specific language governing permissions and limitations under the License. + +from transformers import AutoModelForCausalLM, GenerationConfig + +from ..task_registry import register_task + + +@register_task("text-generation") +def load_causal_lm_model(model_name_or_path: str, **kwargs): + """ + Loads a causal language model for text generation and registers it under the task + 'text-generation' using Hugging Face's AutoModelForCausalLM. + + Args: + model_name_or_path (str): + Model ID on huggingface.co or path on disk to the model repository to export. For example: + `model_name_or_path="meta-llama/Llama-3.2-1B"` or `model_name_or_path="/path/to/model_folder"` + **kwargs: + Additional configuration options for the model: + - dtype (str, optional): + Data type for model weights (default: "float32"). + Options include "float16" and "bfloat16". + - attn_implementation (str, optional): + Attention mechanism implementation (default: "sdpa"). + - cache_implementation (str, optional): + Cache management strategy (default: "static"). + - max_length (int, optional): + Maximum sequence length for generation (default: 2048). + + Returns: + transformers.PreTrainedModel: + An instance of a model subclass (e.g., Llama, Gemma) with the configuration for exporting + and lowering to ExecuTorch. 
+ """ + device = "cpu" + batch_size = 1 + dtype = kwargs.get("dtype", "float32") + attn_implementation = kwargs.get("attn_implementation", "sdpa") + cache_implementation = kwargs.get("cache_implementation", "static") + max_length = kwargs.get("max_length", 2048) + + return AutoModelForCausalLM.from_pretrained( + model_name_or_path, + device_map=device, + torch_dtype=dtype, + attn_implementation=attn_implementation, + generation_config=GenerationConfig( + use_cache=True, + cache_implementation=cache_implementation, + max_length=max_length, + cache_config={ + "batch_size": batch_size, + "max_cache_len": max_length, + }, + ), + ) diff --git a/setup.py b/setup.py index 28b6941ebe8..555580528fe 100644 --- a/setup.py +++ b/setup.py @@ -85,6 +85,10 @@ "datasets<=2.16", "transformers>=4.36,<4.38", ], + "exporters-executorch": [ + "executorch>=0.4.0", + "transformers>=4.46", + ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.18.0", "openvino": "optimum-intel[openvino]>=1.18.0", diff --git a/tests/executorch/export/__init__.py b/tests/executorch/export/__init__.py new file mode 100644 index 00000000000..fdc02578672 --- /dev/null +++ b/tests/executorch/export/__init__.py @@ -0,0 +1,14 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
diff --git a/tests/executorch/export/test_exporters_executorch.py b/tests/executorch/export/test_exporters_executorch.py new file mode 100644 index 00000000000..f2467105e4f --- /dev/null +++ b/tests/executorch/export/test_exporters_executorch.py @@ -0,0 +1,115 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import subprocess +import tempfile +import unittest + +import pytest +from transformers.testing_utils import slow + + +class TestExportToExecuTorchCLI(unittest.TestCase): + def test_helps_no_raise(self): + subprocess.run( + "optimum-cli export executorch --help", + shell=True, + check=True, + ) + + @slow + @pytest.mark.run_slow + def test_llama3_2_1b_export_to_executorch(self): + model_id = "NousResearch/Llama-3.2-1B" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) + + @slow + @pytest.mark.run_slow + def test_llama3_2_3b_export_to_executorch(self): + model_id = "NousResearch/Hermes-3-Llama-3.2-3B" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe 
{recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) + + @slow + @pytest.mark.run_slow + def test_qwen2_5_export_to_executorch(self): + model_id = "Qwen/Qwen2.5-0.5B" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) + + @slow + @pytest.mark.run_slow + def test_gemma2_export_to_executorch(self): + model_id = "unsloth/gemma-2-2b-it" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) + + @slow + @pytest.mark.run_slow + def test_gemma_export_to_executorch(self): + model_id = "weqweasdas/RM-Gemma-2B" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) + + @slow + @pytest.mark.run_slow + def test_olmo_export_to_executorch(self): + model_id = "allenai/OLMo-1B-hf" + task = "text-generation" + recipe = "xnnpack" + with tempfile.TemporaryDirectory() as tempdir: + subprocess.run( + f"optimum-cli export executorch --model {model_id} --task {task} --recipe {recipe} --output_dir {tempdir}/executorch", + shell=True, + check=True, + ) + self.assertTrue(os.path.exists(f"{tempdir}/executorch/model.pte")) diff --git 
a/tests/executorch/runtime/__init__.py b/tests/executorch/runtime/__init__.py new file mode 100644 index 00000000000..fdc02578672 --- /dev/null +++ b/tests/executorch/runtime/__init__.py @@ -0,0 +1,14 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/executorch/runtime/test_modeling.py b/tests/executorch/runtime/test_modeling.py new file mode 100644 index 00000000000..c97b461403c --- /dev/null +++ b/tests/executorch/runtime/test_modeling.py @@ -0,0 +1,70 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import tempfile +import unittest + +import pytest +from executorch.extension.pybindings.portable_lib import ExecuTorchModule +from transformers.testing_utils import ( + slow, +) + +from optimum.executorchruntime import ExecuTorchModelForCausalLM + + +class ExecuTorchModelIntegrationTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @slow + @pytest.mark.run_slow + def test_load_model_from_hub(self): + model = ExecuTorchModelForCausalLM.from_pretrained( + model_name_or_path="NousResearch/Llama-3.2-1B", + export=True, + task="text-generation", + recipe="xnnpack", + ) + self.assertIsInstance(model, ExecuTorchModelForCausalLM) + self.assertIsInstance(model.model, ExecuTorchModule) + + @slow + @pytest.mark.run_slow + def test_load_model_from_local_path(self): + from optimum.exporters.executorch import main_export + + model_id = "NousResearch/Llama-3.2-1B" + task = "text-generation" + recipe = "xnnpack" + + with tempfile.TemporaryDirectory() as tempdir: + # Export to a local dir + main_export( + model_name_or_path=model_id, + task=task, + recipe=recipe, + output_dir=tempdir, + ) + self.assertTrue(os.path.exists(f"{tempdir}/model.pte")) + + # Load the exported model from a local dir + model = ExecuTorchModelForCausalLM.from_pretrained( + model_name_or_path=tempdir, + export=False, + ) + self.assertIsInstance(model, ExecuTorchModelForCausalLM) + self.assertIsInstance(model.model, ExecuTorchModule) diff --git a/tests/executorch/runtime/test_modeling_gemma.py b/tests/executorch/runtime/test_modeling_gemma.py new file mode 100644 index 00000000000..0e4238bf8ee --- /dev/null +++ b/tests/executorch/runtime/test_modeling_gemma.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import pytest +from executorch.extension.pybindings.portable_lib import ExecuTorchModule +from transformers import AutoTokenizer +from transformers.testing_utils import ( + slow, +) + +from optimum.executorchruntime import ExecuTorchModelForCausalLM + + +class ExecuTorchModelIntegrationTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @slow + @pytest.mark.run_slow + def test_gemma_text_generation_with_xnnpack(self): + # TODO: Switch to use google/gemma-2b once https://github.com/huggingface/optimum/issues/2127 is fixed + # model_id = "google/gemma-2b" + model_id = "weqweasdas/RM-Gemma-2B" + model = ExecuTorchModelForCausalLM.from_pretrained( + model_name_or_path=model_id, + export=True, + task="text-generation", + recipe="xnnpack", + ) + self.assertIsInstance(model, ExecuTorchModelForCausalLM) + self.assertIsInstance(model.model, ExecuTorchModule) + + EXPECTED_GENERATED_TEXT = "Hello I am doing a project for my school and I need to write a report on the history of the United States." 
+ tokenizer = AutoTokenizer.from_pretrained(model_id) + generated_text = model.text_generation( + tokenizer=tokenizer, + prompt="Hello I am doing a project for my school", + max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)), + ) + self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT) diff --git a/tests/executorch/runtime/test_modeling_gemma2.py b/tests/executorch/runtime/test_modeling_gemma2.py new file mode 100644 index 00000000000..22fe4ab60d7 --- /dev/null +++ b/tests/executorch/runtime/test_modeling_gemma2.py @@ -0,0 +1,56 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import pytest +from executorch.extension.pybindings.portable_lib import ExecuTorchModule +from transformers import AutoTokenizer +from transformers.testing_utils import ( + slow, +) + +from optimum.executorchruntime import ExecuTorchModelForCausalLM + + +class ExecuTorchModelIntegrationTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @slow + @pytest.mark.run_slow + def test_gemma2_text_generation_with_xnnpack(self): + # TODO: Switch to use google/gemma-2-2b once https://github.com/huggingface/optimum/issues/2127 is fixed + # model_id = "google/gemma-2-2b" + model_id = "unsloth/gemma-2-2b-it" + model = ExecuTorchModelForCausalLM.from_pretrained( + model_name_or_path=model_id, + export=True, + task="text-generation", + recipe="xnnpack", + ) + self.assertIsInstance(model, ExecuTorchModelForCausalLM) + self.assertIsInstance(model.model, ExecuTorchModule) + + EXPECTED_GENERATED_TEXT = ( + "Hello I am doing a project for my school and I need to make sure it is a great to be creative and I can!" + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + generated_text = model.text_generation( + tokenizer=tokenizer, + prompt="Hello I am doing a project for my school", + max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)), + ) + self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT) diff --git a/tests/executorch/runtime/test_modeling_llama.py b/tests/executorch/runtime/test_modeling_llama.py new file mode 100644 index 00000000000..fb08a5615a5 --- /dev/null +++ b/tests/executorch/runtime/test_modeling_llama.py @@ -0,0 +1,83 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import pytest +from executorch.extension.pybindings.portable_lib import ExecuTorchModule +from transformers import AutoTokenizer +from transformers.testing_utils import ( + slow, +) + +from optimum.executorchruntime import ExecuTorchModelForCausalLM + + +class ExecuTorchModelIntegrationTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @slow + @pytest.mark.run_slow + def test_llama3_2_1b_text_generation_with_xnnpack(self): + # TODO: Switch to use meta-llama/Llama-3.2-1B once https://github.com/huggingface/optimum/issues/2127 is fixed + # model_id = "meta-llama/Llama-3.2-1B" + model_id = "NousResearch/Llama-3.2-1B" + model = ExecuTorchModelForCausalLM.from_pretrained( + model_name_or_path=model_id, + export=True, + task="text-generation", + recipe="xnnpack", + ) + self.assertIsInstance(model, ExecuTorchModelForCausalLM) + self.assertIsInstance(model.model, ExecuTorchModule) + + EXPECTED_GENERATED_TEXT = "Simply put, the theory of relativity states that the laws of physics are the same in all inertial frames of reference." 
+ tokenizer = AutoTokenizer.from_pretrained(model_id) + generated_text = model.text_generation( + tokenizer=tokenizer, + prompt="Simply put, the theory of relativity states that", + max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)), + ) + self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT) + + @slow + @pytest.mark.run_slow + @pytest.mark.skip(reason="OOMs with macos-15 CI instances on GH.") + def test_llama3_2_3b_text_generation_with_xnnpack(self): + # TODO: Switch to use meta-llama/Llama-3.2-3B once https://github.com/huggingface/optimum/issues/2127 is fixed + # model_id = "meta-llama/Llama-3.2-3B" + model_id = "NousResearch/Hermes-3-Llama-3.2-3B" + model = ExecuTorchModelForCausalLM.from_pretrained( + model_name_or_path=model_id, + export=True, + task="text-generation", + recipe="xnnpack", + ) + self.assertIsInstance(model, ExecuTorchModelForCausalLM) + self.assertIsInstance(model.model, ExecuTorchModule) + + EXPECTED_GENERATED_TEXT = ( + "Simply put, the theory of relativity states that time is relative and can be affected " + "by an object's speed. This theory was developed by Albert Einstein in the early 20th " + "century. The theory has two parts" + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + generated_text = model.text_generation( + tokenizer=tokenizer, + prompt="Simply put, the theory of relativity states that", + max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)), + ) + self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT) diff --git a/tests/executorch/runtime/test_modeling_olmo.py b/tests/executorch/runtime/test_modeling_olmo.py new file mode 100644 index 00000000000..aa57496f291 --- /dev/null +++ b/tests/executorch/runtime/test_modeling_olmo.py @@ -0,0 +1,54 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import pytest +from executorch.extension.pybindings.portable_lib import ExecuTorchModule +from transformers import AutoTokenizer +from transformers.testing_utils import ( + slow, +) + +from optimum.executorchruntime import ExecuTorchModelForCausalLM + + +class ExecuTorchModelIntegrationTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @slow + @pytest.mark.run_slow + def test_olmo_text_generation_with_xnnpack(self): + model_id = "allenai/OLMo-1B-hf" + model = ExecuTorchModelForCausalLM.from_pretrained( + model_name_or_path=model_id, + export=True, + task="text-generation", + recipe="xnnpack", + ) + self.assertIsInstance(model, ExecuTorchModelForCausalLM) + self.assertIsInstance(model.model, ExecuTorchModule) + + EXPECTED_GENERATED_TEXT = ( + "Simply put, the theory of relativity states that the speed of light is the same in all directions." + ) + tokenizer = AutoTokenizer.from_pretrained(model_id) + generated_text = model.text_generation( + tokenizer=tokenizer, + prompt="Simply put, the theory of relativity states that", + max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)), + ) + self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT) diff --git a/tests/executorch/runtime/test_modeling_qwen2.py b/tests/executorch/runtime/test_modeling_qwen2.py new file mode 100644 index 00000000000..ef624a784ea --- /dev/null +++ b/tests/executorch/runtime/test_modeling_qwen2.py @@ -0,0 +1,52 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import pytest +from executorch.extension.pybindings.portable_lib import ExecuTorchModule +from transformers import AutoTokenizer +from transformers.testing_utils import ( + slow, +) + +from optimum.executorchruntime import ExecuTorchModelForCausalLM + + +class ExecuTorchModelIntegrationTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + @slow + @pytest.mark.run_slow + def test_qwen2_5_text_generation_with_xnnpack(self): + model_id = "Qwen/Qwen2.5-0.5B" + model = ExecuTorchModelForCausalLM.from_pretrained( + model_name_or_path=model_id, + export=True, + task="text-generation", + recipe="xnnpack", + ) + self.assertIsInstance(model, ExecuTorchModelForCausalLM) + self.assertIsInstance(model.model, ExecuTorchModule) + + EXPECTED_GENERATED_TEXT = "My favourite condiment is iced tea. I love it with my breakfast, my lunch" + tokenizer = AutoTokenizer.from_pretrained(model_id) + generated_text = model.text_generation( + tokenizer=tokenizer, + prompt="My favourite condiment is ", + max_seq_len=len(tokenizer.encode(EXPECTED_GENERATED_TEXT)), + ) + self.assertEqual(generated_text, EXPECTED_GENERATED_TEXT)