diff --git a/.github/workflows/test_onnxruntime_slow.yml b/.github/workflows/test_onnxruntime_slow.yml new file mode 100644 index 00000000000..20371f79150 --- /dev/null +++ b/.github/workflows/test_onnxruntime_slow.yml @@ -0,0 +1,33 @@ +name: ONNX Runtime slow / Python - Test + +on: + workflow_dispatch: + schedule: + - cron: 0 7 * * * # every day at 7am + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + strategy: + fail-fast: false + matrix: + python-version: [3.8, 3.9] + os: [ubuntu-20.04] + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for export + run: | + pip install .[tests,onnxruntime] + - name: Test with unittest + working-directory: tests + run: | + RUN_SLOW=1 pytest onnxruntime -s -m "run_slow" --durations=0 diff --git a/optimum/commands/export/onnx.py b/optimum/commands/export/onnx.py index d496f6f0392..85661ccf6cf 100644 --- a/optimum/commands/export/onnx.py +++ b/optimum/commands/export/onnx.py @@ -136,14 +136,6 @@ def parse_args_onnx(parser): default=None, help=("The library of the model." " If not provided, will attempt to infer the local checkpoint's library"), ) - optional_group.add_argument( - "--no-position-ids", - action="store_true", - help=( - "Disable the use of position_ids for text-generation models that require it for batched generation. This argument is introduced for backward compatibility and will be removed in a future release of Optimum." - ), - ) - input_group = parser.add_argument_group( "Input shapes (if necessary, this allows to override the shapes of the input given to the ONNX exporter, that requires an example input)." ) @@ -217,6 +209,14 @@ def parse_args_onnx(parser): default=DEFAULT_DUMMY_SHAPES["nb_points_per_image"], help="For Segment Anything. It corresponds to the number of points per segmentation mask.", ) + optional_group.add_argument( + "--legacy", + action="store_true", + help=( + "Export decoder-only models in three files (without + with past and the resulting merged model). " + "Also disable the use of position_ids for text-generation models that require it for batched generation. This argument is introduced for backward compatibility and will be removed in a future release of Optimum." 
+ ), + ) # deprecated argument parser.add_argument("--for-ort", action="store_true", help=argparse.SUPPRESS) @@ -255,6 +255,6 @@ def run(self): use_subprocess=True, _variant=self.args.variant, library_name=self.args.library_name, - no_position_ids=self.args.no_position_ids, + legacy=self.args.legacy, **input_shapes, ) diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index 16a18afc552..1b601cdfb8d 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -68,7 +68,7 @@ def _get_submodels_and_onnx_configs( float_dtype: str = "fp32", fn_get_submodels: Optional[Callable] = None, preprocessors: Optional[List[Any]] = None, - no_position_ids: bool = False, + legacy: bool = False, ): is_stable_diffusion = "stable-diffusion" in task if not custom_architecture: @@ -82,8 +82,8 @@ def _get_submodels_and_onnx_configs( model=model, exporter="onnx", task=task ) onnx_config_kwargs = {} - if task.startswith("text-generation") and no_position_ids: - onnx_config_kwargs["no_position_ids"] = no_position_ids + if task.startswith("text-generation") and legacy: + onnx_config_kwargs["no_position_ids"] = legacy onnx_config = onnx_config_constructor( model.config, @@ -106,7 +106,7 @@ def _get_submodels_and_onnx_configs( ): models_and_onnx_configs = get_encoder_decoder_models_for_export(model, onnx_config) elif task.startswith("text-generation") and not monolith: - models_and_onnx_configs = get_decoder_models_for_export(model, onnx_config) + models_and_onnx_configs = get_decoder_models_for_export(model, onnx_config, legacy=legacy) elif model.config.model_type == "sam": models_and_onnx_configs = get_sam_models_for_export(model, onnx_config) else: @@ -184,7 +184,7 @@ def main_export( use_subprocess: bool = False, _variant: str = "default", library_name: Optional[str] = None, - no_position_ids: bool = False, + legacy: bool = False, **kwargs_shapes, ): """ @@ -264,8 +264,8 @@ def main_export( library_name (`Optional[str]`, defaults to `None`): The library of the model (`"transformers"` or `"diffusers"` or `"timm"`). If not provided, will attempt to automatically detect the library name for the checkpoint. - no_position_ids (`bool`, defaults to `False`): - Disable the use of position_ids for text-generation models that require it for batched generation. This argument is introduced for backward compatibility and will be removed in a future release of Optimum. + legacy (`bool`, defaults to `False`): + Disable the use of position_ids for text-generation models that require it for batched generation. Also export decoder-only models in three files (without + with past and the merged model). This argument is introduced for backward compatibility and will be removed in a future release of Optimum. **kwargs_shapes (`Dict`): Shapes to use during inference. This argument allows to override the default shapes used during the ONNX export. @@ -353,9 +353,9 @@ def main_export( is_stable_diffusion = "stable-diffusion" in task model_type = "stable-diffusion" if is_stable_diffusion else model.config.model_type.replace("_", "-") - if no_position_ids and model_type in MODEL_TYPES_REQUIRING_POSITION_IDS and task.startswith("text-generation"): + if legacy and model_type in MODEL_TYPES_REQUIRING_POSITION_IDS and task.startswith("text-generation"): logger.warning( - f"no_position_ids=True was specified in the ONNX export, although the model {model_name_or_path} (model type {model_type}) requires position_ids for batched inference. 
Passing `no_position_ids=True` is strongly discouraged, and this option will be removed in a future release. Reference: https://github.com/huggingface/optimum/pull/1381" + f"legacy=True was specified in the ONNX export, although the model {model_name_or_path} (model type {model_type}) requires position_ids for batched inference. Passing `legacy=True` is strongly discouraged, and this option will be removed in a future release. Reference: https://github.com/huggingface/optimum/pull/1381" ) if not is_stable_diffusion: @@ -424,7 +424,7 @@ def main_export( fn_get_submodels=fn_get_submodels, preprocessors=preprocessors, _variant=_variant, - no_position_ids=no_position_ids, + legacy=legacy, ) if not is_stable_diffusion: @@ -610,6 +610,7 @@ def main(): pad_token_id=args.pad_token_id, for_ort=args.for_ort, library_name=args.library_name, + legacy=args.legacy, **input_shapes, ) diff --git a/optimum/exporters/onnx/base.py b/optimum/exporters/onnx/base.py index 1e2ae99955c..a65374346ac 100644 --- a/optimum/exporters/onnx/base.py +++ b/optimum/exporters/onnx/base.py @@ -585,7 +585,7 @@ def outputs(self) -> Dict[str, Dict[int, str]]: elif self.task == "feature-extraction": common_outputs = OrderedDict({"last_hidden_state": {0: "batch_size"}}) else: - common_outputs = OrderedDict({"logits": {0: "batch_size"}}) + common_outputs = OrderedDict({"logits": {0: "batch_size", 1: "sequence_length"}}) if self.use_past: # When exporting decoder models with use_cache=True, both the decoder without past and with past have the KV cache as an output. self.add_past_key_values(common_outputs, direction="outputs") diff --git a/optimum/exporters/onnx/config.py b/optimum/exporters/onnx/config.py index 9259ad853da..3aca641513c 100644 --- a/optimum/exporters/onnx/config.py +++ b/optimum/exporters/onnx/config.py @@ -92,7 +92,7 @@ def __init__( @property def inputs(self) -> Dict[str, Dict[int, str]]: if self.use_past_in_inputs: - common_inputs = {"input_ids": {0: "batch_size"}} + common_inputs = {"input_ids": {0: "batch_size", 1: "sequence_length"}} self.add_past_key_values(common_inputs, direction="inputs") common_inputs["attention_mask"] = {0: "batch_size", 1: "past_sequence_length + 1"} else: @@ -164,10 +164,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: # generating wrong position_ids in the model itself: # https://github.com/huggingface/transformers/blob/v4.33.1/src/transformers/models/gpt2/modeling_gpt2.py#L802 if not self.no_position_ids and self.task == "text-generation": - if self.use_past_in_inputs: - common_inputs["position_ids"] = {0: "batch_size"} - else: - common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"} + common_inputs["position_ids"] = {0: "batch_size", 1: "sequence_length"} return common_inputs diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 73308e24a5d..a83c8a91fa5 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -56,7 +56,15 @@ TextSeq2SeqOnnxConfig, VisionOnnxConfig, ) -from .model_patcher import SAMModelPatcher, WavLMModelPatcher +from .model_patcher import ( + BartModelPatcher, + BloomModelPatcher, + LlamaModelPatcher, + MistralModelPatcher, + OPTModelPatcher, + SAMModelPatcher, + WavLMModelPatcher, +) if TYPE_CHECKING: @@ -216,6 +224,11 @@ class OPTOnnxConfig(TextDecoderOnnxConfig): DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: 
Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return OPTModelPatcher(self, model, model_kwargs=model_kwargs) + class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) @@ -223,6 +236,11 @@ class LlamaOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return LlamaModelPatcher(self, model, model_kwargs=model_kwargs) + class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): # The ONNX export of this architecture needs the Trilu operator support, available since opset 14 @@ -233,6 +251,11 @@ class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return MistralModelPatcher(self, model, model_kwargs=model_kwargs) + class MPTOnnxConfig(TextDecoderOnnxConfig): # MPT does not require position_ids input. @@ -241,6 +264,11 @@ class MPTOnnxConfig(TextDecoderOnnxConfig): num_attention_heads="n_heads", hidden_size="d_model", num_layers="n_layers" ) + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return BloomModelPatcher(self, model, model_kwargs=model_kwargs) + class BloomOnnxConfig(TextDecoderOnnxConfig): # Bloom does not require position_ids input. 
@@ -274,6 +302,11 @@ def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], dire 1: decoder_sequence_name, } + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return BloomModelPatcher(self, model, model_kwargs=model_kwargs) + class GPTBigCodeOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = ( @@ -413,7 +446,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return int_tensor -class BartOnnxConfig(TextSeq2SeqOnnxConfig): +class M2M100OnnxConfig(TextSeq2SeqOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( encoder_num_layers="encoder_layers", decoder_num_layers="decoder_layers", @@ -537,11 +570,14 @@ def flatten_past_key_values(self, flattened_output, name, idx, t): ) -class MBartOnnxConfig(BartOnnxConfig): - pass +class BartOnnxConfig(M2M100OnnxConfig): + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return BartModelPatcher(self, model, model_kwargs=model_kwargs) -class M2M100OnnxConfig(BartOnnxConfig): +class MBartOnnxConfig(BartOnnxConfig): pass diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index e6b50b6dc08..aa14526bd8c 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -19,6 +19,12 @@ from transformers.utils import is_torch_available +from ...utils.modeling_utils import ( + _prepare_attn_mask, + _prepare_decoder_attention_mask, + _prepare_decoder_sliding_window_attention_mask, +) + if is_torch_available(): import torch @@ -342,3 +348,103 @@ def patched_forward( return {"iou_scores": iou_predictions, "pred_masks": low_res_masks} self.patched_forward = patched_forward + + +class CausalAttentionMaskModelPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + self.patch = self.real_config.task == "text-generation" and self.real_config.use_past + + def __enter__(self): + super().__enter__() + if self.patch: + setattr(self._model_to_patch, self._orig_func_name, self._patch_func.__get__(self._model_to_patch)) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + if self.patch: + setattr(self._model_to_patch, self._orig_func_name, self._orig_func.__get__(self._model_to_patch)) + + +class BloomModelPatcher(CausalAttentionMaskModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + if self.patch: + self._model_to_patch = model.transformer + self._patch_func = _prepare_attn_mask + self._orig_func_name = "_prepare_attn_mask" + self._orig_func = self._model_to_patch._prepare_attn_mask + + +class OPTModelPatcher(CausalAttentionMaskModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + if self.patch: + self._model_to_patch = model.model.decoder + self._patch_func = _prepare_decoder_attention_mask + 
self._orig_func_name = "_prepare_decoder_attention_mask" + self._orig_func = self._model_to_patch._prepare_decoder_attention_mask + + +class LlamaModelPatcher(CausalAttentionMaskModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + if self.patch: + self._model_to_patch = model.model + self._patch_func = _prepare_decoder_attention_mask + self._orig_func_name = "_prepare_decoder_attention_mask" + self._orig_func = self._model_to_patch._prepare_decoder_attention_mask + + +class MistralModelPatcher(CausalAttentionMaskModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + if self.patch: + self._model_to_patch = model.model + self._patch_func = _prepare_decoder_sliding_window_attention_mask + self._orig_func_name = "_prepare_decoder_attention_mask" + self._orig_func = self._model_to_patch._prepare_decoder_attention_mask + + +class BartModelPatcher(CausalAttentionMaskModelPatcher, Seq2SeqModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + if self.patch: + self._model_to_patch = model.model.decoder + self._patch_func = _prepare_decoder_attention_mask + self._orig_func_name = "_prepare_decoder_attention_mask" + self._orig_func = self._model_to_patch._prepare_decoder_attention_mask diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 6e90fc617fb..2dda5594a66 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -29,6 +29,7 @@ logging, ) from ...utils.import_utils import _diffusers_version +from ...utils.modeling_utils import _prepare_attn_mask, _prepare_decoder_attention_mask # noqa: F401 from ..tasks import TasksManager from .constants import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ONNX_ENCODER_NAME @@ -159,15 +160,16 @@ def _get_submodels_for_export_stable_diffusion( def _get_submodels_for_export_decoder( - model: Union["PreTrainedModel", "TFPreTrainedModel"], use_past: bool + model: Union["PreTrainedModel", "TFPreTrainedModel"], + use_past: bool, + legacy: bool = False, ) -> Dict[str, Union["PreTrainedModel", "TFPreTrainedModel"]]: """ Returns the decoder part of the model. """ - models_for_export = {} + models_for_export = {ONNX_DECODER_NAME if legacy else "model": model} - models_for_export[ONNX_DECODER_NAME] = model - if use_past: + if legacy and use_past: models_for_export[ONNX_DECODER_WITH_PAST_NAME] = model return models_for_export @@ -227,6 +229,7 @@ def get_encoder_decoder_models_for_export( def get_decoder_models_for_export( model: Union["PreTrainedModel", "TFPreTrainedModel"], config: "OnnxConfig", + legacy: bool = False, ) -> Dict[str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel"], "OnnxConfig"]]: """ Returns two versions of the decoder that can be used together to perform fast generation: @@ -246,31 +249,42 @@ def get_decoder_models_for_export( `Dict[str, Tuple[Union[PreTrainedModel, TFPreTrainedModel], OnnxConfig]]: A Dict containing the model and onnx configs for the encoder and decoder parts of the model. 
""" - models_for_export = _get_submodels_for_export_decoder(model, use_past=config.use_past) + + models_for_export = _get_submodels_for_export_decoder(model, use_past=config.use_past, legacy=legacy) onnx_kwargs = {"task": config.task, "float_dtype": config.float_dtype, "int_dtype": config.int_dtype} if model.config.model_type.replace("_", "-") in MODEL_TYPES_REQUIRING_POSITION_IDS: onnx_kwargs["no_position_ids"] = config.no_position_ids - onnx_config = config.__class__( - model.config, - use_past=config.use_past, - use_past_in_inputs=False, - **onnx_kwargs, - ) - models_for_export[ONNX_DECODER_NAME] = (models_for_export[ONNX_DECODER_NAME], onnx_config) - - if config.use_past: - onnx_config_with_past = config.__class__( + if legacy: + onnx_config = config.__class__( model.config, - use_past=True, - use_past_in_inputs=True, + use_past=config.use_past, + use_past_in_inputs=False, **onnx_kwargs, ) - models_for_export[ONNX_DECODER_WITH_PAST_NAME] = ( - models_for_export[ONNX_DECODER_WITH_PAST_NAME], - onnx_config_with_past, + models_for_export[ONNX_DECODER_NAME] = (models_for_export[ONNX_DECODER_NAME], onnx_config) + + if config.use_past: + onnx_config_with_past = config.__class__( + model.config, + use_past=True, + use_past_in_inputs=True, + **onnx_kwargs, + ) + models_for_export[ONNX_DECODER_WITH_PAST_NAME] = ( + models_for_export[ONNX_DECODER_WITH_PAST_NAME], + onnx_config_with_past, + ) + + else: + onnx_config = config.__class__( + model.config, + use_past=config.use_past, + use_past_in_inputs=config.use_past, + **onnx_kwargs, ) + models_for_export["model"] = (models_for_export["model"], onnx_config) return models_for_export diff --git a/optimum/onnxruntime/modeling_decoder.py b/optimum/onnxruntime/modeling_decoder.py index 7a5c6364fe2..2707c6eeab2 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -14,42 +14,34 @@ """Classes handling causal-lm related architectures in ONNX Runtime.""" import logging -import shutil from pathlib import Path from tempfile import TemporaryDirectory -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +import numpy as np +import onnx import torch -from huggingface_hub import hf_hub_download -from huggingface_hub.utils import EntryNotFoundError +from onnx.tools import update_model_dims from transformers import AutoModelForCausalLM, GenerationConfig from transformers.file_utils import add_end_docstrings, add_start_docstrings_to_model_forward -from transformers.modeling_outputs import CausalLMOutputWithCrossAttentions +from transformers.modeling_outputs import CausalLMOutputWithPast import onnxruntime from ..exporters.onnx import MODEL_TYPES_REQUIRING_POSITION_IDS, main_export -from ..onnx.utils import _get_external_data_paths -from ..utils import check_if_transformers_greater -from ..utils.file_utils import validate_file_exists -from ..utils.save_utils import maybe_load_preprocessors, maybe_save_preprocessors -from .base import ORTDecoder +from ..onnx.utils import check_model_uses_external_data +from ..utils import NormalizedConfigManager, check_if_transformers_greater +from ..utils.modeling_utils import MODEL_TO_PATCH_FOR_PAST +from ..utils.save_utils import maybe_save_preprocessors from .constants import DECODER_MERGED_ONNX_FILE_PATTERN, DECODER_ONNX_FILE_PATTERN, DECODER_WITH_PAST_ONNX_FILE_PATTERN from .modeling_ort import ONNX_MODEL_END_DOCSTRING, ORTModel from .models.bloom import 
bloom_convert_to_bloom_cache, bloom_convert_to_standard_cache -from .utils import ( - ONNX_DECODER_NAME, - ONNX_DECODER_WITH_PAST_NAME, - get_provider_for_device, - parse_device, - validate_provider_availability, -) +from .utils import MULTI_QUERY_ATTN_MODELS, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ONNX_WEIGHTS_NAME if TYPE_CHECKING: from transformers import PretrainedConfig - if check_if_transformers_greater("4.25.0"): from transformers.generation import GenerationMixin else: @@ -119,220 +111,293 @@ """ -class ORTModelDecoder(ORTModel): +@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) +class ORTModelForCausalLM(ORTModel, GenerationMixin): """ - Base class for implementing models with a causal language modeling head using ONNX Runtime inference. + ONNX model with a causal language modeling head for ONNX Runtime inference. This class officially supports bloom, codegen, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gptj, llama. """ + auto_model_class = AutoModelForCausalLM + main_input_name = "input_ids" + def __init__( self, - decoder_session: onnxruntime.InferenceSession, + model: onnxruntime.InferenceSession, config: "PretrainedConfig", - onnx_paths: List[str], - decoder_with_past_session: Optional[onnxruntime.InferenceSession] = None, - use_cache: bool = True, use_io_binding: Optional[bool] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, preprocessors: Optional[List] = None, generation_config: Optional[GenerationConfig] = None, + use_cache: Optional[bool] = None, **kwargs, ): - """ - Args: - decoder_session (`onnxruntime.InferenceSession`): - The ONNX Runtime inference session associated to the decoder. - config ([`~transformers.PretrainedConfig`]): - An instance of the configuration associated to the model. Initializing with a config file does - not load the weights associated with the model, only the configuration. - decoder_with_past_session (`Optional[onnxruntime.InferenceSession]`, defaults to `None`): - The ONNX Runtime inference session associated to the decoder with past key values. This argument should not - be set if use_merged=True is used. - onnx_paths (`List[str]`): - Path to ONNX files associated with the model. - use_cache (`bool`, defaults to `True`): - Whether or not past key/values cache should be used. Defaults to `True`. - use_io_binding (`Optional[bool]`, defaults to `None`): - Whether to use IOBinding during inference to avoid memory copy between the host and devices. Defaults to - `True` if the execution provider is CPUExecutionProvider or CUDAExecutionProvider, otherwise defaults to `False`. - model_save_dir (`Optional[Union[str, Path, TemporaryDirectory]]`, defaults to `""`): - The directory under which the model exported to ONNX was saved. - preprocessors (`Optional[List]`, defaults to `None`): - The list of the preprocessors (tokenizer, processor, feature_extractor) to save alongside the ORTModel. - generation_config (`Optional[GenerationConfig]`, defaults to `None`): - The generation configuration used by default when calling `generate()`. - Refer to https://huggingface.co/docs/transformers/main/en/main_classes/text_generation#transformers.GenerationMixin.generate. 
- """ if use_io_binding is None: - if decoder_session.get_providers()[0] in ["CPUExecutionProvider", "CUDAExecutionProvider"]: - use_io_binding = True - else: - use_io_binding = False + use_io_binding = model.get_providers()[0] in ["CPUExecutionProvider", "CUDAExecutionProvider"] - self.shared_attributes_init( - decoder_session, - use_io_binding=use_io_binding, - model_save_dir=model_save_dir, - ) - self.config = config + super().__init__(model, config, use_io_binding, model_save_dir, preprocessors, **kwargs) - # TODO: remove at version 2.0 - def show_deprecated_argument(arg_name): - if kwargs.pop(arg_name, None) is not None: - logger.warning( - f"The {arg_name} argument to create an {self.__class__.__name__} is deprecated, and not used " - "anymore." - ) + self.num_pkv = 2 + self.normalized_config = NormalizedConfigManager.get_normalized_config_class(config.model_type)(config) + self.key_value_input_names = [key for key in self.inputs_names if (".key" in key) or (".value" in key)] + self.key_value_output_names = [key for key in self.output_names if (".key" in key) or (".value" in key)] + self.use_cache = len(self.key_value_input_names) > 0 - show_deprecated_argument("last_decoder_model_name") - show_deprecated_argument("last_decoder_with_past_model_name") - if kwargs: - raise ValueError( - f"{self.__class__.__name__} received {', '.join(kwargs.keys())}, but do not accept those arguments." - ) + if generation_config is None: + generation_config = GenerationConfig.from_model_config(config) + self.generation_config = generation_config + self.onnx_paths = [self.model_path] + self.use_merged = "use_cache_branch" in self.inputs_names - if use_cache is True: - # Auto-detect whether the provided session is a merged non-past / with-past or not - # TODO: make __init__ private and pass `use_merged` as an argument - use_merged = "use_cache_branch" in [inp.name for inp in decoder_session.get_inputs()] + self.use_fp16 = False + for inp in model.get_inputs(): + if inp.name == "past_key_values" and inp.type == "tensor(float16)": + self.use_fp16 = True + break - if use_merged is True and decoder_with_past_session is not None: - raise ValueError( - "Detected a merged decoder, but decoder_with_past_session was provided." - "Please only set decoder_session, or provide a non-merged decoder_session." - ) - if use_cache is True and use_merged is False and decoder_with_past_session is None: - raise ValueError( - "The parameter use_cache was set as True, but neither decoder_with_past_session was passed" - " nor a use_cache branch can be found in the decoder_session." - " Please pass a decoder_with_past_session or set use_cache=False." - ) - else: - use_merged = False + # Reference: https://github.com/huggingface/optimum/pull/1381 + model_type = config.model_type.replace("_", "-") + if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS and "position_ids" not in self.inputs_names: + logger.warning( + f"ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture {model_type}. " + "We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support." + ) - if decoder_with_past_session is not None: - raise ValueError( - "The parameter decoder_with_past_session was passed, although use_cache is False." - "Please pass use_cache=True for decoder_with_past_session to be used." 
- ) + if use_cache ^ self.use_cache: + raise ValueError( + f"`use_cache` was set to `{use_cache}` but the loaded model only supports `use_cache={self.use_cache}`. " + f"Please load your current model with `use_cache={self.use_cache}` or export the original model " + f"once again with `use_cache={use_cache}` when calling the `from_pretrained` method. " + "To export your model, simply set `export=True`." + ) - if use_cache is False and use_io_binding is True: + if use_io_binding and not use_cache: raise ValueError( - "When using CUDAExecutionProvider, the parameters combination use_cache=False, use_io_binding=True" - " is not supported. Please either pass use_cache=True, use_io_binding=True (default)," - " or use_cache=False, use_io_binding=False." + "The parameters combination use_cache=False, use_io_binding=True is not supported. " + "Please either pass use_cache=True, use_io_binding=True (default), or use_cache=False, use_io_binding=False." ) - self.onnx_paths = onnx_paths - self.use_cache = use_cache - self.use_merged = use_merged - self.decoder = ORTDecoder(decoder_session, self) - self.decoder_model_path = Path(decoder_session._model_path) - self.decoder_model_name = self.decoder_model_path.name + @add_start_docstrings_to_model_forward( + CAUSALLM_ONNX_MODEL_DOCSTRING.format("batch_size, sequence_length") + + TEXT_GENERATION_EXAMPLE.format( + processor_class=_TOKENIZER_FOR_DOC, + model_class="ORTModelForCausalLM", + checkpoint="optimum/gpt2", + ) + ) + def forward( + self, + input_ids: torch.LongTensor, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache_branch: bool = None, + **kwargs, + ) -> CausalLMOutputWithPast: + # adding use_cache_branch in the signature here is just a hack for IO Binding + use_torch = isinstance(input_ids, torch.Tensor) + self.raise_on_numpy_input_io_binding(use_torch) + + inputs = {} + known_output_shapes = {} + use_cache_branch = None + loss = None + if self.use_cache: + if past_key_values is not None: + input_ids = input_ids[:, -1:] + # Flatten the past_key_values (no need to flatten for models using multi-query attn) + if self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + past_key_values = tuple( + past_key_value for pkv_per_layer in past_key_values for past_key_value in pkv_per_layer + ) - # Reference: https://github.com/huggingface/optimum/pull/1381 - model_type = config.model_type.replace("_", "-") - if model_type in MODEL_TYPES_REQUIRING_POSITION_IDS and "position_ids" not in self.decoder.input_names: - logger.warning( - f"ORTModelForCausalLM loaded a legacy ONNX model with no position_ids input, although this input is required for batched generation for the architecture {model_type}. We strongly encourage to re-export the model with optimum>=1.14 for position_ids and batched inference support." 
+ # Create dummy past_key_values for decoder first generation step if none given + use_cache_branch, past_key_values, known_output_shapes = self.prepare_past_key_values( + input_ids, past_key_values, use_torch ) - self.decoder_with_past = None - self.decoder_with_past_model_path = None - self.decoder_with_past_model_name = None - if self.use_cache is True and self.use_merged is False: - self.decoder_with_past = ORTDecoder(decoder_with_past_session, self) - self.decoder_with_past_model_path = Path(decoder_with_past_session._model_path) - self.decoder_with_past_model_name = self.decoder_with_past_model_path.name + if self.use_io_binding: + # TODO: fix transformers generate to have contiguous input_ids here already + # For an unknown reason, calling `contiguous()` here is necessary to not have errors + # on CPU EP with batch size > 1, despite it being also called in _prepare_io_binding. + # I suspect the reason is the contiguous python list that messes something up? + model_inputs = [input_ids.contiguous()] - if generation_config is None: - generation_config = GenerationConfig.from_model_config(config) - self.generation_config = generation_config + if "attention_mask" in self.inputs_names: + model_inputs.append(attention_mask) - @staticmethod - def _generate_regular_names_for_filename(filename: str): - name, extension = filename.rsplit(".", maxsplit=1) - return [ - filename, - f"{name}_quantized.{extension}", - f"{name}_optimized.{extension}", - f"{name}_merged.{extension}", - ] + if "position_ids" in self.inputs_names: + if position_ids is None: + raise ValueError("position_ids was not passed but is a required input for this ONNX model.") + model_inputs.append(position_ids.contiguous()) - @staticmethod - def load_model( - decoder_path: Union[str, Path], - decoder_with_past_path: Optional[Union[str, Path]] = None, - provider: str = "CPUExecutionProvider", - session_options: Optional[onnxruntime.SessionOptions] = None, - provider_options: Optional[Dict] = None, - ): - """ - Creates an instance of [`~optimum.onnxruntime.ORTModelDecoder`]. - Three inference sessions will be created for respectively the decoder and decoder with past key values - models. The default provider is `CPUExecutionProvider` to match the default behaviour in PyTorch/TensorFlow/JAX. - - Args: - decoder_path (`str` or `Path`): - The path of the decoder ONNX model. - decoder_with_past_path (`str` or `Path`, *optional*): - The path of the decoder with past key values ONNX model. - provider(`str`, *optional*, defaults to `"CPUExecutionProvider"`): - The ONNX Runtime provider to use for loading the model. - session_options (`Optional[onnxruntime.SessionOptions]`, *optional*),: - ONNX Runtime session options to use for loading the model. - provider_options (`Optional[Dict]`, *optional*): - Provider option dictionary corresponding to the provider used. See available options - for each provider: https://onnxruntime.ai/docs/api/c/group___global.html. 
- """ - decoder_session = ORTModel.load_model(decoder_path, provider, session_options, provider_options) - - decoder_with_past_session = None - # If a decoder_with_past_path is provided, an inference session for the decoder with past key/values as inputs - # will be enabled - if decoder_with_past_path is not None: - decoder_with_past_session = ORTModel.load_model( - decoder_with_past_path, provider, session_options, provider_options + if past_key_values is not None: + model_inputs += past_key_values + + if use_cache_branch is not None: + model_inputs.append(use_cache_branch) + + if "labels" in self.inputs_names: + model_inputs.append(labels) + known_output_shapes.update({"loss": []}) + + io_binding, output_shapes, output_buffers = self._prepare_io_binding( + self.model, + *model_inputs, + known_output_shapes=known_output_shapes, + ordered_input_names=self._ordered_input_names, ) - return decoder_session, decoder_with_past_session + if self.device.type == "cpu": + self.model.run_with_iobinding(io_binding) + else: + io_binding.synchronize_inputs() + self.model.run_with_iobinding(io_binding) + io_binding.synchronize_outputs() + + if self.use_cache: + # Tuple of length equal to : number of layer * number of past_key_value per decoder layer(2) + past_key_values = () + for name in self.key_value_output_names: + past_key_values += (output_buffers[name].view(output_shapes[name]),) + + logits = output_buffers["logits"].view(output_shapes["logits"]) + + if "loss" in self.output_names: + loss = output_buffers["loss"].view(output_shapes["loss"]) + + else: + inputs["input_ids"] = input_ids.cpu().detach().numpy() if use_torch else input_ids + + if "attention_mask" in self.inputs_names: + inputs["attention_mask"] = attention_mask.cpu().detach().numpy() if use_torch else attention_mask + + if "labels" in self.inputs_names: + inputs["labels"] = labels.cpu().detach().numpy() if use_torch else labels - def _save_pretrained(self, save_directory: Union[str, Path]): - """ - Saves the model decoder and decoder with past key values as well as its configuration file to a - directory, so that it can be re-loaded using the - [`~optimum.onnxruntime.modeling_causal.ORTModelDecoder.from_pretrained`] class method. + if "position_ids" in self.inputs_names: + if position_ids is None: + raise ValueError("position_ids was not passed but is a required input for this ONNX model.") + inputs["position_ids"] = position_ids.cpu().detach().numpy() if use_torch else position_ids - Args: - save_directory (`str` or `Path`): - The directory where to save the model files. 
- """ - save_directory = Path(save_directory) - src_paths = [Path(path) for path in self.onnx_paths] - dst_paths = [save_directory / path.name for path in src_paths] + # Add the past_key_values to the decoder inputs + if past_key_values is not None: + for input_name, past_key_value in zip(self.key_value_input_names, past_key_values): + inputs[input_name] = past_key_value.cpu().detach().numpy() if use_torch else past_key_value - # add external data paths in case of large models - src_paths, dst_paths = _get_external_data_paths(src_paths, dst_paths) + if use_cache_branch is not None: + inputs["use_cache_branch"] = use_cache_branch.cpu().detach().numpy() if use_torch else use_cache_branch - for src_path, dst_path in zip(src_paths, dst_paths): - shutil.copyfile(src_path, dst_path) + outputs = self.model.run(None, inputs) - self.generation_config.save_pretrained(save_directory) + if self.use_cache: + # Tuple of length equal to : number of layer * number of past_key_value per decoder layer (2 for the self-attention) + past_key_values = tuple( + torch.from_numpy(outputs[self.output_names[key]]).to(self.device) + for key in self.key_value_output_names + ) + + logits = torch.from_numpy(outputs[self.output_names["logits"]]).to(self.device) + if "loss" in self.output_names: + loss = torch.from_numpy(outputs[self.output_names["loss"]]).to(self.device) + + if self.use_cache and self.config.model_type not in MULTI_QUERY_ATTN_MODELS: + # Tuple of tuple of length `n_layers`, with each tuple of length equal to the number of self-attention and + # per decoder layer + past_key_values = tuple( + past_key_values[i : i + self.num_pkv] for i in range(0, len(past_key_values), self.num_pkv) + ) + + return CausalLMOutputWithPast(loss=loss, logits=logits, past_key_values=past_key_values) + + def prepare_past_key_values( + self, + input_ids: Union[None, torch.LongTensor, np.ndarray], + past_key_values: Union[None, Tuple[torch.FloatTensor], Tuple[np.ndarray]], + use_torch: bool, + ): + sequence_length = input_ids.shape[1] + + constructor = torch if use_torch else np + if self.use_merged: + # Uses without/with branch of a merged decoder depending on whether real past key values are passed + use_cache_branch = constructor.full((1,), past_key_values is not None) + else: + # Uses separate decoders + use_cache_branch = None + + if use_torch and use_cache_branch is not None: + use_cache_branch = use_cache_branch.to(self.device) + + # Generate dummy past for the first forward if uses a merged decoder + if past_key_values is None: + batch_size = input_ids.shape[0] + if self.config.model_type in {"mistral", "llama"}: + num_attention_heads = self.normalized_config.num_key_value_heads + else: + num_attention_heads = self.normalized_config.num_attention_heads + embed_size_per_head = self.normalized_config.hidden_size // self.normalized_config.num_attention_heads + dtype = constructor.float16 if self.use_fp16 else constructor.float32 + # TODO: find a way to better handle this controlflow + # "1" is the dummy sequence length + if self.config.model_type == "bloom": + shape_value = (batch_size * num_attention_heads, 0, embed_size_per_head) + shape_key = (batch_size * num_attention_heads, embed_size_per_head, 0) + key = constructor.zeros(shape_key, dtype=dtype) + value = constructor.zeros(shape_value, dtype=dtype) + + if use_torch: + key = key.to(self.device) + value = value.to(self.device) + + past_key_values = tuple( + key_or_value for _ in range(len(self.key_value_input_names) // 2) for key_or_value in [key, value] + ) + elif 
self.config.model_type in MULTI_QUERY_ATTN_MODELS: + shape_key_and_value = (batch_size, 0, embed_size_per_head * 2) + key_and_value = constructor.zeros(shape_key_and_value, dtype=dtype) + + if use_torch: + key_and_value = key_and_value.to(self.device) + + past_key_values = tuple(key_and_value for _ in range(len(self.key_value_input_names))) + else: + shape = (batch_size, num_attention_heads, 0, embed_size_per_head) + key_or_value = constructor.zeros(shape, dtype=dtype) + + if use_torch: + key_or_value = key_or_value.to(self.device) + + past_key_values = tuple(key_or_value for _ in range(len(self.key_value_input_names))) + + pkv_output_shape = {} + for name, value in zip(self.key_value_output_names, past_key_values): + shape = [*value.shape] + index = ( + 1 + if self.config.model_type in MULTI_QUERY_ATTN_MODELS + or (self.config.model_type == "bloom" and "value" in name) + else 2 + ) + + shape[index] += sequence_length + pkv_output_shape[name] = shape + + return use_cache_branch, past_key_values, pkv_output_shape @classmethod def _from_pretrained( cls, model_id: Union[str, Path], config: "PretrainedConfig", - init_cls: Type["ORTModelDecoder"], use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, cache_dir: Optional[str] = None, - decoder_file_name: str = ONNX_DECODER_NAME, - decoder_with_past_file_name: str = ONNX_DECODER_WITH_PAST_NAME, + file_name: Optional[str] = None, subfolder: str = "", - local_files_only: bool = False, use_cache: bool = True, + local_files_only: bool = False, use_merged: Optional[bool] = None, provider: str = "CPUExecutionProvider", session_options: Optional[onnxruntime.SessionOptions] = None, @@ -340,7 +405,7 @@ def _from_pretrained( use_io_binding: Optional[bool] = None, model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, **kwargs, - ): + ) -> "ORTModelForCausalLM": model_path = Path(model_id) # We do not implement the logic for use_cache=False, use_merged=True @@ -352,187 +417,137 @@ def _from_pretrained( ) use_merged = False - decoder_merged_path = None - # We use `is not False` here to include two cases: use_merged = None (in which case we auto-detect it), - # and use_merged = True (explicitely specified by the user) - if use_merged is not False: - try: - decoder_merged_path = ORTModelDecoder.infer_onnx_filename( + decoder_name = "decoder_file_name" if use_cache else "decoder_with_past_file_name" + decoder_file_name = kwargs.pop(decoder_name, None) + + if decoder_file_name is not None: + logger.warning(f"The `{decoder_name}` argument is deprecated, please use `file_name` instead.") + file_name = file_name or decoder_file_name + + if file_name is None: + decoder_path = None + # We use `is not False` here to include two cases: use_merged = None (in which case we auto-detect it), + # and use_merged = True (explicitely specified by the user) + if use_merged is not False: + try: + decoder_path = ORTModelForCausalLM.infer_onnx_filename( + model_id, + [DECODER_MERGED_ONNX_FILE_PATTERN], + argument_name=None, + subfolder=subfolder, + use_auth_token=use_auth_token, + revision=revision, + ) + use_merged = True + file_name = decoder_path.name + except FileNotFoundError as e: + if use_merged is True: + raise FileNotFoundError( + "The parameter `use_merged=True` was passed to ORTModelForCausalLM.from_pretrained()" + " but no ONNX file for a merged decoder could be found in" + f" {str(Path(model_id, subfolder))}, with the error: {e}" + ) + use_merged = False + + if use_merged is False: + pattern = 
DECODER_WITH_PAST_ONNX_FILE_PATTERN if use_cache else DECODER_ONNX_FILE_PATTERN + # exclude decoder file for first iteration + decoder_path = ORTModelForCausalLM.infer_onnx_filename( model_id, - [DECODER_MERGED_ONNX_FILE_PATTERN], + [r"^((?!decoder).)*.onnx", pattern], argument_name=None, subfolder=subfolder, use_auth_token=use_auth_token, revision=revision, ) - use_merged = True - decoder_path = decoder_merged_path - except FileNotFoundError as e: - if use_merged is True: - raise FileNotFoundError( - "The parameter `use_merged=True` was passed to ORTModelForCausalLM.from_pretrained()" - " but no ONNX file for a merged decoder could be found in" - f" {str(Path(model_id, subfolder))}, with the error: {e}" - ) - use_merged = False + file_name = decoder_path.name - decoder_without_past_path = None - decoder_with_past_path = None - if use_merged is False: - if not validate_file_exists(model_id, decoder_file_name, subfolder=subfolder, revision=revision): - decoder_without_past_path = ORTModelDecoder.infer_onnx_filename( - model_id, - [DECODER_ONNX_FILE_PATTERN], - "decoder_file_name", - subfolder=subfolder, - use_auth_token=use_auth_token, - revision=revision, + if file_name == ONNX_DECODER_WITH_PAST_NAME and config.model_type in MODEL_TO_PATCH_FOR_PAST: + raise ValueError( + f"{ONNX_DECODER_WITH_PAST_NAME} not supported for the following architecture : {', '.join(MODEL_TO_PATCH_FOR_PAST)}. Please re-export your model or set use_cache=False." ) - else: - decoder_without_past_path = model_path / subfolder / decoder_file_name - decoder_path = decoder_without_past_path + regular_file_names = [] + for name in [ONNX_WEIGHTS_NAME, ONNX_DECODER_WITH_PAST_NAME if use_cache else ONNX_DECODER_NAME]: + regular_file_names += ORTModelForCausalLM._generate_regular_names_for_filename(name) - decoder_regular_onnx_filenames = ORTModelDecoder._generate_regular_names_for_filename(ONNX_DECODER_NAME) - if decoder_path.name not in decoder_regular_onnx_filenames: + if file_name not in regular_file_names: logger.warning( - f"The ONNX file {decoder_path.name} is not a regular name used in optimum.onnxruntime that are {decoder_regular_onnx_filenames}, the " + f"The ONNX file {file_name} is not a regular name used in optimum.onnxruntime that are {regular_file_names}, the " f"{cls.__name__} might not behave as expected." 
) - # If the decoder without / with past has been merged, we do not need to look for any additional file - if use_cache is True: - if not validate_file_exists( - model_id, decoder_with_past_file_name, subfolder=subfolder, revision=revision - ): - try: - decoder_with_past_path = ORTModelDecoder.infer_onnx_filename( - model_id, - [DECODER_WITH_PAST_ONNX_FILE_PATTERN], - "decoder_with_past_file_name", - subfolder=subfolder, - use_auth_token=use_auth_token, - revision=revision, - ) - except FileNotFoundError as e: - raise FileNotFoundError( - "The parameter `use_cache=True` was passed to ORTModelForCausalLM.from_pretrained()" - " but no ONNX file using past key values could be found in" - f" {str(Path(model_id, subfolder))}, with the error: {e}" - ) - else: - decoder_with_past_path = model_path / subfolder / decoder_with_past_file_name - - decoder_with_past_regular_onnx_filenames = ORTModelDecoder._generate_regular_names_for_filename( - ONNX_DECODER_WITH_PAST_NAME - ) - - if decoder_with_past_path.name not in decoder_with_past_regular_onnx_filenames: - logger.warning( - f"The ONNX file {decoder_with_past_path.name} is not a regular name used in optimum.onnxruntime that are {decoder_with_past_regular_onnx_filenames}, " - f"the {cls.__name__} might not behave as expected." - ) - - preprocessors = None - if model_path.is_dir(): - new_model_save_dir = model_path - preprocessors = maybe_load_preprocessors(model_id) + if config.model_type == "bloom": + init_cls = ORTBloomForCausalLM + elif config.model_type == "mpt": + init_cls = ORTMPTForCausalLM + elif config.model_type == "opt": + init_cls = ORTOPTForCausalLM else: - attribute_name_to_filename = { - "last_decoder_model_name": decoder_path.name if use_merged is False else None, - "last_decoder_with_past_model_name": decoder_with_past_path.name - if (use_cache is True and use_merged is False) - else None, - "last_decoder_merged_name": decoder_merged_path.name if use_merged is True else None, - } - paths = {} - for attr_name, filename in attribute_name_to_filename.items(): - if filename is None: - continue - model_cache_path = hf_hub_download( - repo_id=model_id, - subfolder=subfolder, - filename=filename, - use_auth_token=use_auth_token, - revision=revision, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - ) + init_cls = ORTModelForCausalLM - # try download external data - try: - hf_hub_download( - repo_id=model_id, - subfolder=subfolder, - filename=filename + "_data", - use_auth_token=use_auth_token, - revision=revision, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - ) - except EntryNotFoundError: - # model doesn't use external data - pass + model_cache_path, preprocessors = cls._cached_file( + model_path=model_path, + use_auth_token=use_auth_token, + revision=revision, + force_download=force_download, + cache_dir=cache_dir, + file_name=file_name, + subfolder=subfolder, + local_files_only=local_files_only, + ) + new_model_save_dir = model_cache_path.parent + + # model_save_dir can be provided in kwargs as a TemporaryDirectory instance, in which case we want to keep it + # instead of the path only. 
+ if model_save_dir is None: + model_save_dir = new_model_save_dir - paths[attr_name] = Path(model_cache_path).name - new_model_save_dir = Path(model_cache_path).parent - preprocessors = maybe_load_preprocessors(model_id, subfolder=subfolder) + # Since v1.7.0 decoder with past models have fixed sequence length of 1 + # To keep these models compatible we set this dimension to dynamic + onnx_model = onnx.load(str(model_cache_path), load_external_data=False) + model_uses_external_data = check_model_uses_external_data(onnx_model) - if use_merged is True: - decoder_path = new_model_save_dir / paths["last_decoder_merged_name"] - decoder_merged_path = new_model_save_dir / paths["last_decoder_merged_name"] - else: - decoder_path = new_model_save_dir / paths["last_decoder_model_name"] - decoder_without_past_path = new_model_save_dir / paths["last_decoder_model_name"] + if model_uses_external_data: + onnx_model = onnx.load(str(model_cache_path), load_external_data=True) - if use_cache is True: - decoder_with_past_path = new_model_save_dir / paths["last_decoder_with_past_model_name"] + input_dims = { + node.name: [dim.dim_value or dim.dim_param for dim in node.type.tensor_type.shape.dim] + for node in onnx_model.graph.input + } + if input_dims["input_ids"][1] == 1: + input_dims["input_ids"][1] = "sequence_length" + output_dims = { + node.name: [dim.dim_value or dim.dim_param for dim in node.type.tensor_type.shape.dim] + for node in onnx_model.graph.output + } + output_dims["logits"][1] = "sequence_length" + onnx_model = update_model_dims.update_inputs_outputs_dims(onnx_model, input_dims, output_dims) + + onnx.save( + onnx_model, + str(model_cache_path), + save_as_external_data=model_uses_external_data, + all_tensors_to_one_file=True, + location=model_cache_path.name + "_data", + size_threshold=0, + ) + del onnx_model - ort_inference_sessions = cls.load_model( - decoder_path=decoder_path, - decoder_with_past_path=None if use_merged is True or use_cache is False else decoder_with_past_path, + model = ORTModel.load_model( + model_cache_path, provider=provider, session_options=session_options, provider_options=provider_options, ) - if model_save_dir is None: - model_save_dir = new_model_save_dir - - generation_config = None - try: - generation_config = GenerationConfig.from_pretrained( - model_id, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - use_auth_token=use_auth_token, - revision=revision, - subfolder=subfolder, - ) - except OSError: - logger.info("Generation config file not found, using a generation config created from the model config.") - - onnx_paths = [] - if use_merged is False: - onnx_paths.append(decoder_without_past_path) - if use_cache is True: - onnx_paths.append(decoder_with_past_path) - else: - onnx_paths.append(decoder_merged_path) - return init_cls( - ort_inference_sessions[0], - config, - decoder_with_past_session=ort_inference_sessions[1], - use_cache=use_cache, + model=model, + config=config, use_io_binding=use_io_binding, model_save_dir=model_save_dir, preprocessors=preprocessors, - generation_config=generation_config, - onnx_paths=onnx_paths, + use_cache=use_cache, ) @classmethod @@ -554,19 +569,18 @@ def _from_transformers( provider_options: Optional[Dict[str, Any]] = None, use_io_binding: Optional[bool] = None, task: Optional[str] = None, - ) -> "ORTModelDecoder": + ) -> "ORTModelForCausalLM": + file_name = ONNX_WEIGHTS_NAME + + if use_merged: + logger.warning("The `use_merged` argument is deprecated when the model is exported, 
and not used anymore.") + use_merged = False + if task is None: task = cls._auto_model_to_task(cls.auto_model_class) - if use_cache is True: - task = task + "-with-past" - - if use_cache is False and use_merged is True: - raise ValueError( - "The incompatible arguments use_cache=False, use_merged=True were passed to ORTModelForCausalLM.from_pretrained()." - " Please pass either use_cache=False, use_merged=False to disable past key value caching, or use_cache=True, use_merged=False" - " to disable the merging of the decoder not using / using past key and value." - ) + if use_cache: + task += "-with-past" save_dir = TemporaryDirectory() save_dir_path = Path(save_dir.name) @@ -576,7 +590,8 @@ def _from_transformers( output=save_dir_path, task=task, do_validation=False, - no_post_process=not use_merged, + no_post_process=False, + legacy=False, subfolder=subfolder, revision=revision, cache_dir=cache_dir, @@ -599,88 +614,7 @@ def _from_transformers( provider_options=provider_options, use_io_binding=use_io_binding, model_save_dir=save_dir, - ) - - def to(self, device: Union[torch.device, str, int]): - """ - Changes the ONNX Runtime provider according to the device. - - Args: - device (`Union[torch.device, str, int]`): - Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, a positive will run - the model on the associated CUDA device id. You can pass native `torch.device` or a `str` too. - - Returns: - `ORTModel`: the model placed on the requested device. - """ - device, provider_options = parse_device(device) - - if device.type == "cuda" and self.providers[0] == "TensorrtExecutionProvider": - return self - - provider = get_provider_for_device(device) - validate_provider_availability(provider) # raise error if the provider is not available - self.device = device - self.decoder.session.set_providers([provider], provider_options=[provider_options]) - if self.decoder_with_past is not None: - self.decoder_with_past.session.set_providers([provider], provider_options=[provider_options]) - self.providers = self.decoder.session.get_providers() - - return self - - -@add_end_docstrings(ONNX_MODEL_END_DOCSTRING) -class ORTModelForCausalLM(ORTModelDecoder, GenerationMixin): - """ - ONNX model with a causal language modeling head for ONNX Runtime inference. This class officially supports bloom, codegen, gpt2, gpt_bigcode, gpt_neo, gpt_neox, gptj, llama. 
- """ - - auto_model_class = AutoModelForCausalLM - main_input_name = "input_ids" - - @add_start_docstrings_to_model_forward( - CAUSALLM_ONNX_MODEL_DOCSTRING.format("batch_size, sequence_length") - + TEXT_GENERATION_EXAMPLE.format( - processor_class=_TOKENIZER_FOR_DOC, - model_class="ORTModelForCausalLM", - checkpoint="optimum/gpt2", - ) - ) - def forward( - self, - input_ids: torch.LongTensor = None, - attention_mask: Optional[torch.FloatTensor] = None, - past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, - position_ids: Optional[torch.LongTensor] = None, - labels: Optional[torch.LongTensor] = None, - **kwargs, - ) -> CausalLMOutputWithCrossAttentions: - if past_key_values is None or self.use_cache is False: - outputs = self.decoder( - input_ids=input_ids, - attention_mask=attention_mask, - past_key_values=past_key_values, - position_ids=position_ids, - labels=labels, - ) - elif self.use_merged is True: - outputs = self.decoder( - input_ids=input_ids[:, -1:], - past_key_values=past_key_values, - attention_mask=attention_mask, - position_ids=position_ids, - ) - else: - outputs = self.decoder_with_past( - input_ids=input_ids[:, -1:], - past_key_values=past_key_values, - attention_mask=attention_mask, - labels=labels, - position_ids=position_ids, - ) - - return CausalLMOutputWithCrossAttentions( - loss=outputs.get("loss", None), logits=outputs.logits, past_key_values=outputs.past_key_values + file_name=file_name, ) # Adapted from transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel.prepare_inputs_for_generation @@ -718,24 +652,6 @@ def can_generate(self): """Returns True to validate the check that the model using `GenerationMixin.generate()` can indeed generate.""" return True - @classmethod - def _from_pretrained( - cls, - model_id: Union[str, Path], - config: "PretrainedConfig", - **kwargs, - ): - if config.model_type == "bloom": - init_cls = ORTBloomForCausalLM - elif config.model_type == "mpt": - init_cls = ORTMPTForCausalLM - elif config.model_type == "opt": - init_cls = ORTOPTForCausalLM - else: - init_cls = ORTModelForCausalLM - - return super()._from_pretrained(model_id, config, init_cls=init_cls, **kwargs) - class ORTBloomForCausalLM(ORTModelForCausalLM): # Adapted from transformers.models.bloom.modeling_bloom.BloomForCausalLM.prepare_inputs_for_generation diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 46963745da4..b58a37eb43a 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -486,55 +486,30 @@ def _from_pretrained( "not behave as expected." 
) - preprocessors = None - if model_path.is_dir(): - model = ORTModel.load_model( - model_path / file_name, - provider=provider, - session_options=session_options, - provider_options=provider_options, - ) - new_model_save_dir = model_path - preprocessors = maybe_load_preprocessors(model_id) - else: - model_cache_path = hf_hub_download( - repo_id=model_id, - filename=file_name, - subfolder=subfolder, - use_auth_token=use_auth_token, - revision=revision, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - ) - - # try download external data - try: - hf_hub_download( - repo_id=model_id, - subfolder=subfolder, - filename=file_name + "_data", - use_auth_token=use_auth_token, - revision=revision, - cache_dir=cache_dir, - force_download=force_download, - local_files_only=local_files_only, - ) - except EntryNotFoundError: - # model doesn't use external data - pass - - model = ORTModel.load_model( - model_cache_path, provider=provider, session_options=session_options, provider_options=provider_options - ) - new_model_save_dir = Path(model_cache_path).parent - preprocessors = maybe_load_preprocessors(model_id, subfolder=subfolder) + model_cache_path, preprocessors = cls._cached_file( + model_path=model_path, + use_auth_token=use_auth_token, + revision=revision, + force_download=force_download, + cache_dir=cache_dir, + file_name=file_name, + subfolder=subfolder, + local_files_only=local_files_only, + ) + new_model_save_dir = model_cache_path.parent # model_save_dir can be provided in kwargs as a TemporaryDirectory instance, in which case we want to keep it # instead of the path only. if model_save_dir is None: model_save_dir = new_model_save_dir + model = ORTModel.load_model( + model_cache_path, + provider=provider, + session_options=session_options, + provider_options=provider_options, + ) + return cls( model=model, config=config, @@ -753,13 +728,20 @@ def _prepare_io_binding( name = ordered_input_names[idx] tensor = tensor.contiguous() input_name_to_shape[name] = tensor.shape + + data_ptr = tensor.data_ptr() + if "past" in name and data_ptr == 0: + # During first generation, sequence_length can be 0 when use_cache=True, which results in data_ptr to also be 0. + # To keep compatibility with IO binding, we pass the data pointer of input_ids instead. This will have no impact because past_key_values will not be used during the first generation. + data_ptr = model_inputs[0].data_ptr() + io_binding.bind_input( name, tensor.device.type, IOBindingHelper.get_device_index(self.device), name_to_np_type[name], tuple(tensor.shape), - tensor.data_ptr(), + data_ptr, ) dimensions = {} for input_ in model.get_inputs(): @@ -821,6 +803,55 @@ def raise_on_numpy_input_io_binding(self, use_torch: bool): " with model.use_io_binding = False, or pass torch.Tensor inputs instead." ) + @staticmethod + def _cached_file( + model_path: Union[Path, str], + use_auth_token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + force_download: bool = False, + cache_dir: Optional[str] = None, + file_name: Optional[str] = None, + subfolder: str = "", + local_files_only: bool = False, + ): + model_path = Path(model_path) + + # locates a file in a local folder and repo, downloads and cache it if necessary. 
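Editor's aside, a hedged usage sketch of the helper being added here (the checkpoint, file name and provider defaults are illustrative assumptions, not part of the patch): the helper returns the resolved ONNX path together with any preprocessors found next to the checkpoint, and _from_pretrained then builds the inference session from that path.

import onnxruntime

from optimum.onnxruntime import ORTModel

# Resolve a file either from a local export folder or from a Hub repo id.
model_cache_path, preprocessors = ORTModel._cached_file(
    model_path="optimum/gpt2",        # local folder or Hub repo id (illustrative)
    file_name="decoder_model.onnx",   # assumed file name for this example
)
session = ORTModel.load_model(model_cache_path)  # CPU execution provider by default
assert isinstance(session, onnxruntime.InferenceSession)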
+ if model_path.is_dir(): + model_cache_path = model_path / file_name + preprocessors = maybe_load_preprocessors(model_path.as_posix()) + else: + model_cache_path = hf_hub_download( + repo_id=model_path.as_posix(), + filename=file_name, + subfolder=subfolder, + use_auth_token=use_auth_token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + # try download external data + try: + hf_hub_download( + repo_id=model_path.as_posix(), + subfolder=subfolder, + filename=file_name + "_data", + use_auth_token=use_auth_token, + revision=revision, + cache_dir=cache_dir, + force_download=force_download, + local_files_only=local_files_only, + ) + except EntryNotFoundError: + # model doesn't use external data + pass + + model_cache_path = Path(model_cache_path) + preprocessors = maybe_load_preprocessors(model_path.as_posix(), subfolder=subfolder) + + return model_cache_path, preprocessors + FEATURE_EXTRACTION_EXAMPLE = r""" Example of feature extraction: diff --git a/optimum/onnxruntime/optimization.py b/optimum/onnxruntime/optimization.py index 2db9f753c34..9e62a3f324c 100644 --- a/optimum/onnxruntime/optimization.py +++ b/optimum/onnxruntime/optimization.py @@ -97,17 +97,11 @@ def from_pretrained( # Add the decoder with past key/values if present if model_or_path.use_cache: onnx_model_path.append(model_or_path.decoder_with_past_model_path) - elif isinstance(model_or_path, ORTModelForCausalLM): - if model_or_path.use_merged is True: - raise NotImplementedError( - "ORTOptimizer does not support ORTModelForCausalLM models that use a single ONNX for both the without/with past cases." - " Please pass an ORTModelForCausalLM that uses a separate ONNX for each without/with past cases. This can be done" - " by using `ORTModelForCausalLM.from_pretrained(..., export=True, use_merged=False)`, or by" - " using the option `--no-post-process` in the optimum-cli ONNX export tool." - ) - onnx_model_path.append(model_or_path.decoder_model_path) - if model_or_path.use_cache: - onnx_model_path.append(model_or_path.decoder_with_past_model_path) + elif isinstance(model_or_path, ORTModelForCausalLM) and model_or_path.use_merged: + raise NotImplementedError( + "ORTOptimizer does not support ORTModelForCausalLM models when without/with past models are merged. " + "Please re-export your model. This can be done by using the optimum-cli ONNX export tool or `ORTModelForCausalLM.from_pretrained(..., export=True, use_merged=False)`." + ) else: onnx_model_path.append(model_or_path.model_path) config = model_or_path.config diff --git a/optimum/onnxruntime/quantization.py b/optimum/onnxruntime/quantization.py index 1c13bfb465f..d56e301c3cf 100644 --- a/optimum/onnxruntime/quantization.py +++ b/optimum/onnxruntime/quantization.py @@ -33,7 +33,6 @@ from ..utils.save_utils import maybe_save_preprocessors from . 
import ORTQuantizableOperator from .configuration import CalibrationConfig, ORTConfig, QuantizationConfig -from .modeling_decoder import ORTModelForCausalLM from .modeling_ort import ORTModel from .modeling_seq2seq import ORTModelForConditionalGeneration from .preprocessors import QuantizationPreprocessor @@ -136,13 +135,6 @@ def from_pretrained( path = None if isinstance(model_or_path, ORTModelForConditionalGeneration): raise NotImplementedError(ort_quantizer_error_message) - elif isinstance(model_or_path, ORTModelForCausalLM): - if model_or_path.use_cache is False: - path = Path(model_or_path.decoder_model_path) - elif model_or_path.use_cache is True and model_or_path.use_merged is False: - raise NotImplementedError(ort_quantizer_error_message) - else: - path = Path(model_or_path.decoder_model_path) elif isinstance(model_or_path, Path) and file_name is None: onnx_files = list(model_or_path.glob("*.onnx")) if len(onnx_files) == 0: diff --git a/optimum/utils/modeling_utils.py b/optimum/utils/modeling_utils.py index 89f2f5598a6..67e12861eb5 100644 --- a/optimum/utils/modeling_utils.py +++ b/optimum/utils/modeling_utils.py @@ -13,6 +13,22 @@ # limitations under the License. import functools +from typing import Tuple + +import torch + + +MODEL_TO_PATCH_FOR_PAST = { + "bart", + "blenderbot", + "blenderbot-small", + "bloom", + "llama", + "mistral", + "mpt", + "opt", + "pegasus", +} def recurse_getattr(obj, attr: str): @@ -39,3 +55,126 @@ def recurse_setattr(module, name, value): else: name, rest = name.split(".", 1) recurse_setattr(getattr(module, name), rest, value) + + +# Modified from transformers.models.bloom.modeling_bloom._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, + device: torch.device, + past_key_values_length: int, + dtype: torch.dtype = torch.bool, +) -> torch.BoolTensor: + """ + Make causal mask used for bi-directional self-attention. + """ + batch_size, target_length = input_ids_shape + mask = torch.zeros((target_length, target_length + past_key_values_length), dtype=dtype, device=device) + seq_ids = torch.arange(target_length, device=device) + + mask[:, past_key_values_length:] = ( + (seq_ids[:, None] < seq_ids[None, :]) * torch.finfo(dtype).min + if torch.is_floating_point(mask) + else seq_ids[:, None] < seq_ids[None, :] + ) + + return mask[None, None, :, :].expand(batch_size, 1, target_length, target_length + past_key_values_length) + + +# NOTE: For MODEL_TO_PATCH_FOR_PAST architectures, when exporting the model with an input of sequence length of 1, the attention masks will be generated incorrectly for other sequence length +# https://github.com/huggingface/transformers/blob/0ee45906845c8d58b9bd2df5acd90e09b00047ff/src/transformers/models/bloom/modeling_bloom.py#L654 +# The method taking care of the decoder mask generation of the models from these architectures must be patched during export for sequence length of 1. 
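To make the NOTE above concrete, a minimal hedged sketch (assuming this patch is installed) of the boolean mask that _make_causal_mask builds for the case the patched methods below have to handle, namely a single new token attending to a non-empty past:

import torch

from optimum.utils.modeling_utils import _make_causal_mask  # added by this patch

mask = _make_causal_mask(
    input_ids_shape=torch.Size([1, 1]),  # batch_size=1, target_length=1 (one new token)
    device=torch.device("cpu"),
    past_key_values_length=3,            # three cached key/value positions
)
print(mask.shape)  # torch.Size([1, 1, 1, 4]): the new token sees itself plus the 3 cached positions
print(mask.any())  # tensor(False): with the default dtype=torch.bool nothing is masked at this step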
+ + +# Modified from transformers.models.bloom.modeling_bloom._prepare_attn_mask +def _prepare_attn_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int, int], + past_key_values_length: int, +) -> torch.BoolTensor: + from transformers.models.bloom.modeling_bloom import _expand_mask + + # create causal mask + # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] + combined_attention_mask = None + device = attention_mask.device + _, src_length = input_shape + + combined_attention_mask = _make_causal_mask( + input_shape, device=device, past_key_values_length=past_key_values_length + ) + # [batch_size, seq_length] -> [batch_size, 1, tgt_length, src_length] + expanded_attn_mask = _expand_mask(attention_mask, tgt_length=src_length) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask | combined_attention_mask + ) + + return combined_attention_mask + + +# Modified from transformers.models.llama.modeling_llama._prepare_decoder_attention_mask +def _prepare_decoder_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int, int], + inputs_embeds: torch.Tensor, + past_key_values_length: int, +): + from transformers.models.llama.modeling_llama import _expand_mask + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + + combined_attention_mask = _make_causal_mask( + input_shape, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + dtype=inputs_embeds.dtype, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + +# Modified from transformers.models.mistral.modeling_mistral._prepare_decoder_sliding_window_attention_mask +def _prepare_decoder_sliding_window_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int, int], + inputs_embeds: torch.Tensor, + past_key_values_length: int, + sliding_window: int, +): + from transformers.models.mistral.modeling_mistral import _expand_mask, _make_sliding_window_causal_mask + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + + combined_attention_mask = _make_sliding_window_causal_mask( + input_shape, + device=inputs_embeds.device, + dtype=inputs_embeds.dtype, + past_key_values_length=past_key_values_length, + sliding_window=sliding_window, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask diff --git a/setup.py b/setup.py index 7a7f4546844..f654e3a71bc 100644 --- a/setup.py +++ b/setup.py @@ -60,7 +60,15 @@ ], "exporters": ["onnx", "onnxruntime", "timm"], "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"], - "exporters-tf": ["tensorflow>=2.4,<=2.12.1", "tf2onnx", "onnx", "onnxruntime", "timm", "h5py", "numpy<1.24.0"], + "exporters-tf": [ + "tensorflow>=2.4,<=2.12.1", + "tf2onnx", + "onnx", + 
"onnxruntime", + "timm", + "h5py", + "numpy<1.24.0", + ], "diffusers": ["diffusers"], "intel": "optimum-intel>=1.11.0", "openvino": "optimum-intel[openvino]>=1.11.0", diff --git a/tests/exporters/onnx/test_exporters_onnx_cli.py b/tests/exporters/onnx/test_exporters_onnx_cli.py index b1cdedbea84..efdbaba4235 100644 --- a/tests/exporters/onnx/test_exporters_onnx_cli.py +++ b/tests/exporters/onnx/test_exporters_onnx_cli.py @@ -19,6 +19,7 @@ from tempfile import TemporaryDirectory from typing import Dict, Optional +import onnx import pytest from parameterized import parameterized from transformers import AutoModelForSequenceClassification, AutoTokenizer, is_torch_available @@ -26,7 +27,12 @@ from optimum.exporters.error_utils import MinimumVersionError from optimum.exporters.onnx.__main__ import main_export -from optimum.onnxruntime import ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME, ONNX_ENCODER_NAME +from optimum.onnxruntime import ( + ONNX_DECODER_MERGED_NAME, + ONNX_DECODER_NAME, + ONNX_DECODER_WITH_PAST_NAME, + ONNX_ENCODER_NAME, +) from optimum.utils.testing_utils import require_diffusers, require_timm @@ -413,6 +419,21 @@ def test_stable_diffusion(self): check=True, ) + def test_legacy(self): + with TemporaryDirectory() as tmpdirname: + subprocess.run( + f"python3 -m optimum.exporters.onnx --model hf-internal-testing/tiny-random-gpt2 --task text-generation-with-past --legacy {tmpdirname}", + shell=True, + capture_output=True, + ) + folder_contents = os.listdir(tmpdirname) + self.assertIn(ONNX_DECODER_NAME, folder_contents) + self.assertIn(ONNX_DECODER_WITH_PAST_NAME, folder_contents) + self.assertIn(ONNX_DECODER_MERGED_NAME, folder_contents) + + model = onnx.load(Path(tmpdirname) / ONNX_DECODER_MERGED_NAME) + self.assertNotIn("position_ids", {node.name for node in model.graph.input}) + @parameterized.expand(_get_models_to_test(PYTORCH_EXPORT_MODELS_TINY)) @require_vision @require_torch_gpu diff --git a/tests/exporters/onnx/test_onnx_export.py b/tests/exporters/onnx/test_onnx_export.py index 10eaeddd13c..11e6a53da36 100644 --- a/tests/exporters/onnx/test_onnx_export.py +++ b/tests/exporters/onnx/test_onnx_export.py @@ -14,6 +14,7 @@ # limitations under the License. 
import gc import os +from functools import partial from pathlib import Path from tempfile import TemporaryDirectory from typing import Dict @@ -529,8 +530,8 @@ def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], dire inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch_size", 2: decoder_sequence_name} -def fn_get_submodels_custom(model): - return {"decoder_model": model, "decoder_with_past_model": model} +def fn_get_submodels_custom(model, legacy=False): + return {"decoder_model": model, "decoder_with_past_model": model} if legacy else {"model": model} class OnnxCustomExport(TestCase): @@ -572,7 +573,6 @@ def test_custom_export_official_model(self): def test_custom_export_trust_remote(self, fn_get_submodels): model_id = "fxmarty/tiny-mpt-random-remote-code" config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) - onnx_config = CustomMPTOnnxConfig( config=config, task="text-generation", @@ -581,22 +581,29 @@ def test_custom_export_trust_remote(self, fn_get_submodels): ) onnx_config_with_past = CustomMPTOnnxConfig(config, task="text-generation", use_past=True) - custom_onnx_configs = { - "decoder_model": onnx_config, - "decoder_with_past_model": onnx_config_with_past, - } + for legacy in (True, False): + if legacy: + custom_onnx_configs = { + "decoder_model": onnx_config, + "decoder_with_past_model": onnx_config_with_past, + } + else: + custom_onnx_configs = { + "model": onnx_config_with_past, + } - with TemporaryDirectory() as tmpdirname: - main_export( - model_id, - output=tmpdirname, - task="text-generation-with-past", - trust_remote_code=True, - custom_onnx_configs=custom_onnx_configs, - no_post_process=True, - fn_get_submodels=fn_get_submodels, - opset=14, - ) + with TemporaryDirectory() as tmpdirname: + main_export( + model_id, + output=tmpdirname, + task="text-generation-with-past", + trust_remote_code=True, + custom_onnx_configs=custom_onnx_configs, + no_post_process=True, + fn_get_submodels=partial(fn_get_submodels, legacy=legacy) if fn_get_submodels else None, + legacy=legacy, + opset=14, + ) def test_custom_export_trust_remote_error(self): model_id = "mohitsha/tiny-ernie-random-remote-code" diff --git a/tests/onnx/test_onnx_graph_transformations.py b/tests/onnx/test_onnx_graph_transformations.py index bed539eaccb..c06ac5af971 100644 --- a/tests/onnx/test_onnx_graph_transformations.py +++ b/tests/onnx/test_onnx_graph_transformations.py @@ -85,6 +85,7 @@ def test_merge_decoders(self, *args): tmpdir, task=task, no_post_process=True, + legacy=True, ) decoder = onnx.load(os.path.join(tmpdir, "decoder_model.onnx")) diff --git a/tests/onnxruntime/nightly_test_trainer.py b/tests/onnxruntime/nightly_test_trainer.py index 38bdfd07973..2eb3ca433f7 100644 --- a/tests/onnxruntime/nightly_test_trainer.py +++ b/tests/onnxruntime/nightly_test_trainer.py @@ -40,11 +40,7 @@ default_data_collator, is_torch_available, ) -from transformers.testing_utils import ( - require_deepspeed, - require_torch, - slow, -) +from transformers.testing_utils import require_deepspeed, require_torch, slow from transformers.training_args import OptimizerNames diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index e6868c2fa7d..b7695cbd651 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -59,7 +59,7 @@ ) from transformers.modeling_utils import no_init_weights from transformers.onnx.utils import get_preprocessor -from transformers.testing_utils import get_gpu_count, require_torch_gpu +from 
transformers.testing_utils import get_gpu_count, require_torch_gpu, slow from utils_onnxruntime_tests import MODEL_NAMES, SEED, ORTModelTestMixin from optimum.exporters import TasksManager @@ -138,12 +138,12 @@ def __init__(self, *args, **kwargs): def test_load_model_from_local_path(self): model = ORTModel.from_pretrained(self.LOCAL_MODEL_PATH) - self.assertIsInstance(model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) def test_load_model_from_hub(self): model = ORTModel.from_pretrained(self.ONNX_MODEL_ID) - self.assertIsInstance(model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) def test_load_model_from_hub_subfolder(self): @@ -151,11 +151,11 @@ def test_load_model_from_hub_subfolder(self): model = ORTModelForSequenceClassification.from_pretrained( "fxmarty/tiny-bert-sst2-distilled-subfolder", subfolder="my_subfolder", export=True ) - self.assertIsInstance(model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) model = ORTModel.from_pretrained("fxmarty/tiny-bert-sst2-distilled-onnx-subfolder", subfolder="my_subfolder") - self.assertIsInstance(model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) def test_load_seq2seq_model_from_hub_subfolder(self): @@ -178,7 +178,7 @@ def test_load_model_from_cache(self): model = ORTModel.from_pretrained(self.TINY_ONNX_MODEL_ID, local_files_only=True) - self.assertIsInstance(model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) def test_load_model_from_empty_cache(self): @@ -768,7 +768,7 @@ def test_stable_diffusion_model_on_gpu_str(self): @require_hf_token def test_load_model_from_hub_private(self): model = ORTModel.from_pretrained(self.ONNX_MODEL_ID, use_auth_token=os.environ.get("HF_AUTH_TOKEN", None)) - self.assertIsInstance(model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(model.model, onnxruntime.InferenceSession) self.assertIsInstance(model.config, PretrainedConfig) def test_save_model(self): @@ -832,11 +832,12 @@ def test_save_load_ort_model_with_external_data(self): os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") @parameterized.expand([(False,), (True,)]) + @pytest.mark.run_slow + @slow def test_save_load_decoder_model_with_external_data(self, use_cache: bool): with tempfile.TemporaryDirectory() as tmpdirname: - os.environ["FORCE_ONNX_EXTERNAL_DATA"] = "1" # force exporting small model with external data model = ORTModelForCausalLM.from_pretrained( - MODEL_NAMES["gpt2"], + "gpt2-large", use_cache=use_cache, export=True, use_merged=False, @@ -846,18 +847,14 @@ def test_save_load_decoder_model_with_external_data(self, use_cache: bool): # verify external data is exported folder_contents = os.listdir(tmpdirname) - self.assertTrue(ONNX_DECODER_NAME in folder_contents) - self.assertTrue(ONNX_DECODER_NAME + "_data" in folder_contents) - - if use_cache: - 
self.assertTrue(ONNX_DECODER_WITH_PAST_NAME in folder_contents) - self.assertTrue(ONNX_DECODER_WITH_PAST_NAME + "_data" in folder_contents) + self.assertTrue(ONNX_WEIGHTS_NAME in folder_contents) + self.assertTrue(ONNX_WEIGHTS_NAME + "_data" in folder_contents) + self.assertFalse(use_cache ^ model.use_cache) # verify loading from local folder works model = ORTModelForCausalLM.from_pretrained( tmpdirname, use_cache=use_cache, export=False, use_io_binding=False ) - os.environ.pop("FORCE_ONNX_EXTERNAL_DATA") @parameterized.expand([(False,), (True,)]) def test_save_load_seq2seq_model_with_external_data(self, use_cache: bool): @@ -1103,7 +1100,7 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForQuestionAnswering.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -1270,7 +1267,7 @@ def test_compare_to_transformers(self, model_arch): model_id = self.ARCH_MODEL_MAP[model_arch] if model_arch in self.ARCH_MODEL_MAP else MODEL_NAMES[model_arch] onnx_model = ORTModelForMaskedLM.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -1432,7 +1429,7 @@ def test_compare_to_transformers(self, model_arch): model_id = self.ARCH_MODEL_MAP[model_arch] if model_arch in self.ARCH_MODEL_MAP else MODEL_NAMES[model_arch] onnx_model = ORTModelForSequenceClassification.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -1605,7 +1602,7 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForTokenClassification.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -1728,7 +1725,7 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForFeatureExtraction.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -1873,7 +1870,7 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForMultipleChoice.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -1962,7 +1959,6 @@ class 
ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): FULL_GRID = { "model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [False, True], - "use_merged": [False, True], } ORTMODEL_CLASS = ORTModelForCausalLM @@ -1971,27 +1967,37 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): GENERATION_LENGTH = 100 SPEEDUP_CACHE = 1.1 - def test_inference_old_onnx_model(self): - model = ORTModelForCausalLM.from_pretrained("optimum/gpt2") - - tokenizer = get_preprocessor("optimum/gpt2") + @parameterized.expand([(False,), (True,)]) + def test_inference_old_onnx_model(self, use_cache): + model_id = "optimum/gpt2" + model = AutoModelForCausalLM.from_pretrained("gpt2") + tokenizer = get_preprocessor(model_id) text = "This is a sample output" tokens = tokenizer(text, return_tensors="pt") + onnx_model = ORTModelForCausalLM.from_pretrained(model_id, use_cache=use_cache, use_io_binding=use_cache) - model.generate(**tokens) + self.assertEqual(onnx_model.use_cache, use_cache) + self.assertEqual(onnx_model.model_path.name, ONNX_DECODER_WITH_PAST_NAME if use_cache else ONNX_DECODER_NAME) + outputs_onnx = onnx_model.generate( + **tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30 + ) + outputs = model.generate(**tokens, num_beams=1, do_sample=False, min_new_tokens=30, max_new_tokens=30) + self.assertTrue(torch.allclose(outputs_onnx, outputs)) def test_load_model_from_hub_onnx(self): model = ORTModelForCausalLM.from_pretrained("fxmarty/onnx-tiny-random-gpt2-without-merge") self.assertFalse(model.use_merged) self.assertTrue(model.use_cache) - self.assertTrue(model.decoder_with_past is not None) + self.assertIsInstance(model.model, onnxruntime.InferenceSession) + self.assertEqual(model.onnx_paths[0].name, ONNX_DECODER_WITH_PAST_NAME) model = ORTModelForCausalLM.from_pretrained("fxmarty/onnx-tiny-random-gpt2-with-merge") self.assertTrue(model.use_merged) self.assertTrue(model.use_cache) - self.assertTrue(model.decoder_with_past is None) + self.assertIsInstance(model.model, onnxruntime.InferenceSession) + self.assertEqual(model.onnx_paths[0].name, ONNX_DECODER_MERGED_NAME) def test_load_vanilla_transformers_which_is_not_supported(self): with self.assertRaises(Exception) as context: @@ -1999,24 +2005,6 @@ def test_load_vanilla_transformers_which_is_not_supported(self): self.assertIn("Unrecognized configuration class", str(context.exception)) - @parameterized.expand(SUPPORTED_ARCHITECTURES) - def test_merge_from_transformers_and_save(self, model_arch): - if "text-generation-with-past" not in TasksManager.get_supported_tasks_for_model_type( - model_arch.replace("_", "-"), exporter="onnx" - ): - self.skipTest("Unsupported -with-past export case") - - model_id = MODEL_NAMES[model_arch] - model = ORTModelForCausalLM.from_pretrained(model_id, export=True, use_merged=True) - with tempfile.TemporaryDirectory() as tmpdir: - model.save_pretrained(tmpdir) - save_path = os.path.join(tmpdir, ONNX_DECODER_MERGED_NAME) - self.assertTrue(has_onnx_input(save_path, "use_cache_branch")) - - folder_contents = os.listdir(tmpdir) - self.assertTrue(ONNX_DECODER_NAME not in folder_contents) - self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents) - @parameterized.expand(SUPPORTED_ARCHITECTURES) def test_merge_from_onnx_and_save(self, model_arch): model_id = MODEL_NAMES[model_arch] @@ -2026,26 +2014,23 @@ def test_merge_from_onnx_and_save(self, model_arch): self.skipTest("Unsupported export case") with tempfile.TemporaryDirectory() as tmpdir: - main_export(model_id, tmpdir, task=task) + 
main_export(model_id, tmpdir, task=task, legacy=True) model = ORTModelForCausalLM.from_pretrained(tmpdir) self.assertTrue(model.use_merged) - self.assertTrue(model.decoder_with_past is None) - + self.assertIsInstance(model.model, onnxruntime.InferenceSession) model.save_pretrained(tmpdir + "_save") save_path = os.path.join(tmpdir + "_save", ONNX_DECODER_MERGED_NAME) self.assertTrue(has_onnx_input(save_path, "use_cache_branch")) folder_contents = os.listdir(tmpdir + "_save") - self.assertTrue(ONNX_DECODER_NAME not in folder_contents) - self.assertTrue(ONNX_DECODER_WITH_PAST_NAME not in folder_contents) + self.assertNotIn(ONNX_DECODER_NAME, folder_contents) + self.assertNotIn(ONNX_DECODER_WITH_PAST_NAME, folder_contents) + self.assertNotIn(ONNX_WEIGHTS_NAME, folder_contents) @parameterized.expand(grid_parameters(FULL_GRID)) - def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if use_cache is False and use_merged is True: - self.skipTest("use_cache=False, use_merged=True are uncompatible") - + def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cache: bool): use_io_binding = None if use_cache is False: use_io_binding = False @@ -2054,7 +2039,6 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach "test_name": test_name, "model_arch": model_arch, "use_cache": use_cache, - "use_merged": use_merged, } self._setup(model_args) @@ -2064,21 +2048,11 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach use_cache=use_cache, use_io_binding=use_io_binding, ) - if use_merged is False: - model_path = Path(self.onnx_model_dirs[test_name], ONNX_DECODER_NAME) - self.assertFalse(has_onnx_input(model_path, "use_cache_branch")) - self.assertEqual(onnx_model.use_merged, False) - else: - model_path = Path(self.onnx_model_dirs[test_name], ONNX_DECODER_MERGED_NAME) - self.assertTrue(has_onnx_input(model_path, "use_cache_branch")) - self.assertEqual(onnx_model.use_merged, True) - - self.assertIsInstance(onnx_model.decoder, ORTDecoder) - if onnx_model.use_cache is True and onnx_model.use_merged is False: - self.assertIsInstance(onnx_model.decoder_with_past, ORTDecoder) - if onnx_model.use_cache is True and onnx_model.use_merged is True: - self.assertTrue(onnx_model.decoder_with_past is None) + model_path = Path(self.onnx_model_dirs[test_name], ONNX_WEIGHTS_NAME) + self.assertFalse(has_onnx_input(model_path, "use_cache_branch")) + self.assertFalse(onnx_model.use_merged) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -2122,10 +2096,7 @@ def test_compare_to_transformers(self, test_name: str, model_arch: str, use_cach gc.collect() @parameterized.expand(grid_parameters(FULL_GRID)) - def test_pipeline_ort_model(self, test_name: str, model_arch: str, use_cache: bool, use_merged: bool): - if use_cache is False and use_merged is True: - self.skipTest("use_cache=False, use_merged=True are uncompatible") - + def test_pipeline_ort_model(self, test_name: str, model_arch: str, use_cache: bool): use_io_binding = None if use_cache is False: use_io_binding = False @@ -2134,7 +2105,6 @@ def test_pipeline_ort_model(self, test_name: str, model_arch: str, use_cache: bo "test_name": test_name, "model_arch": model_arch, "use_cache": use_cache, - "use_merged": use_merged, } self._setup(model_args) @@ -2284,18 +2254,10 @@ def test_compare_with_and_without_past_key_values(self, model_arch): 
@parameterized.expand(grid_parameters({"model_arch": SUPPORTED_ARCHITECTURES, "use_cache": [True]})) def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, model_arch: str, use_cache: bool): - model_args = { - "test_name": test_name + "_True", - "model_arch": model_arch, - "use_cache": use_cache, - "use_merged": True, - } - self._setup(model_args) model_args = { "test_name": test_name + "_False", "model_arch": model_arch, "use_cache": use_cache, - "use_merged": False, } self._setup(model_args) @@ -2303,20 +2265,29 @@ def test_compare_merged_and_not_merged_models_outputs(self, test_name: str, mode tokenizer = get_preprocessor(model_id) text = "My Name is Philipp and i live" tokens = tokenizer(text, return_tensors="pt", return_token_type_ids=False if model_arch == "llama" else None) - model_not_merged_dir = self.onnx_model_dirs[test_name + "_False"] - model_merged_dir = self.onnx_model_dirs[test_name + "_True"] - model_not_merged = ORTModelForCausalLM.from_pretrained(model_not_merged_dir) - not_merged_onnx_path = Path(model_not_merged_dir, ONNX_DECODER_NAME) + not_merged_onnx_path = Path(model_not_merged_dir, ONNX_WEIGHTS_NAME) self.assertFalse(has_onnx_input(not_merged_onnx_path, "use_cache_branch")) - self.assertEqual(model_not_merged.use_merged, False) + self.assertFalse(model_not_merged.use_merged) + + model_merged_dir = Path(model_not_merged_dir) / "merged" + task = model_not_merged.export_feature + if use_cache: + task += "-with-past" + + main_export( + model_id, + output=model_merged_dir, + task=task, + no_post_process=False, + legacy=True, + ) model_merged = ORTModelForCausalLM.from_pretrained(model_merged_dir) merged_onnx_path = Path(model_merged_dir, ONNX_DECODER_MERGED_NAME) self.assertTrue(has_onnx_input(merged_onnx_path, "use_cache_branch")) - self.assertEqual(model_merged.decoder_with_past, None) - self.assertEqual(model_merged.use_merged, True) + self.assertTrue(model_merged.use_merged) outputs_model_not_merged = model_not_merged.generate(**tokens) outputs_model_merged = model_merged.generate(**tokens) @@ -2435,7 +2406,7 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] if model_arch in MODEL_NAMES else self.ARCH_MODEL_MAP[model_arch] onnx_model = ORTModelForImageClassification.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -2575,7 +2546,7 @@ def test_compare_to_transformers(self, model_arch): model_id = MODEL_NAMES[model_arch] onnx_model = ORTModelForSemanticSegmentation.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -2730,7 +2701,7 @@ def test_compare_to_transformers(self, model_arch): model_id = self.ARCH_MODEL_MAP[model_arch] if model_arch in self.ARCH_MODEL_MAP else MODEL_NAMES[model_arch] onnx_model = ORTModelForAudioClassification.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) 
self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -2882,7 +2853,7 @@ def test_compare_to_transformers(self, model_arch): model_id = self.ARCH_MODEL_MAP[model_arch] if model_arch in self.ARCH_MODEL_MAP else MODEL_NAMES[model_arch] onnx_model = ORTModelForCTC.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -2941,7 +2912,7 @@ def test_compare_to_transformers(self, model_arch): model_id = self.ARCH_MODEL_MAP[model_arch] if model_arch in self.ARCH_MODEL_MAP else MODEL_NAMES[model_arch] onnx_model = ORTModelForAudioXVector.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) @@ -3033,7 +3004,7 @@ def test_compare_to_transformers(self, model_arch): model_id = self.ARCH_MODEL_MAP[model_arch] if model_arch in self.ARCH_MODEL_MAP else MODEL_NAMES[model_arch] onnx_model = ORTModelForAudioFrameClassification.from_pretrained(self.onnx_model_dirs[model_arch]) - self.assertIsInstance(onnx_model.model, onnxruntime.capi.onnxruntime_inference_collection.InferenceSession) + self.assertIsInstance(onnx_model.model, onnxruntime.InferenceSession) self.assertIsInstance(onnx_model.config, PretrainedConfig) set_seed(SEED) diff --git a/tests/onnxruntime/test_optimization.py b/tests/onnxruntime/test_optimization.py index 7e3e670edd0..cd2127ac3be 100644 --- a/tests/onnxruntime/test_optimization.py +++ b/tests/onnxruntime/test_optimization.py @@ -518,15 +518,7 @@ def _test_optimization_levels( ort_model = ORTModelForCausalLM.from_pretrained( self.onnx_model_dirs[export_name], use_cache=use_cache, provider=provider, use_io_binding=use_io_binding ) - - if use_merged: - with self.assertRaises(NotImplementedError) as cm: - optimizer = ORTOptimizer.from_pretrained(ort_model) - - self.assertTrue("ORTModelForCausalLM models that use a single ONNX" in str(cm.exception)) - self.skipTest("Unsupported optimization case") - else: - optimizer = ORTOptimizer.from_pretrained(ort_model) + optimizer = ORTOptimizer.from_pretrained(ort_model) if provider == "CUDAExecutionProvider": for_gpu = True @@ -541,7 +533,6 @@ def _test_optimization_levels( with tempfile.TemporaryDirectory(suffix="_optimized") as tmp_dir: optimizer.optimize(save_dir=tmp_dir, optimization_config=optimization_config) - optimized_model = ORTModelForCausalLM.from_pretrained( tmp_dir, use_cache=use_cache, provider=provider, use_io_binding=use_io_binding ) @@ -594,3 +585,15 @@ def test_optimization_levels_gpu( provider="CUDAExecutionProvider", use_io_binding=use_io_binding, ) + + def test_merged_optimization(self): + ort_model = ORTModelForCausalLM.from_pretrained("fxmarty/onnx-tiny-random-gpt2-with-merge") + self.assertTrue(ort_model.use_cache) + + with self.assertRaises(NotImplementedError) as cm: + ORTOptimizer.from_pretrained(ort_model) + + self.assertTrue( + "ORTOptimizer does not support ORTModelForCausalLM models when without/with past models are merged" + in str(cm.exception) + ) diff --git a/tests/onnxruntime/test_quantization.py b/tests/onnxruntime/test_quantization.py index aff1b51b534..4062c556ea9 100644 --- 
a/tests/onnxruntime/test_quantization.py +++ b/tests/onnxruntime/test_quantization.py @@ -35,6 +35,7 @@ ORTQuantizer, QuantizationConfig, ) +from optimum.utils.testing_utils import grid_parameters class ORTQuantizerTest(unittest.TestCase): @@ -78,6 +79,10 @@ class ORTDynamicQuantizationTest(unittest.TestCase): (ORTModelForSequenceClassification, "hf-internal-testing/tiny-random-bart", 32), ) + SUPPORTED_DECODER_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( + (ORTModelForCausalLM, "hf-internal-testing/tiny-random-gpt2", 22), + ) + @parameterized.expand(SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS) def test_dynamic_quantization(self, model_cls, model_name, expected_quantized_matmuls): qconfig = QuantizationConfig( @@ -96,11 +101,7 @@ def test_dynamic_quantization(self, model_cls, model_name, expected_quantized_ma model.save_pretrained(tmp_dir) quantizer = ORTQuantizer.from_pretrained(model) - quantizer.quantize( - save_dir=output_dir, - quantization_config=qconfig, - ) - + quantizer.quantize(save_dir=output_dir, quantization_config=qconfig) expected_ort_config = ORTConfig(quantization=qconfig) ort_config = ORTConfig.from_pretrained(tmp_dir) # Verify the ORTConfig was correctly created and saved @@ -119,19 +120,12 @@ def test_dynamic_quantization_subgraphs(self): qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=True) tmp_dir = tempfile.mkdtemp() output_dir = Path(tmp_dir) - model = ORTModelForCausalLM.from_pretrained( - "hf-internal-testing/tiny-random-gpt2", export=True, use_merged=True - ) - + model = ORTModelForCausalLM.from_pretrained("fxmarty/onnx-tiny-random-gpt2-with-merge", use_merged=True) self.assertTrue(model.use_merged) model.save_pretrained(tmp_dir) quantizer = ORTQuantizer.from_pretrained(model) - quantizer.quantize( - save_dir=output_dir, - quantization_config=qconfig, - ) - + quantizer.quantize(save_dir=output_dir, quantization_config=qconfig) expected_ort_config = ORTConfig(quantization=qconfig) ort_config = ORTConfig.from_pretrained(tmp_dir) # Verify the ORTConfig was correctly created and saved @@ -146,6 +140,34 @@ def test_dynamic_quantization_subgraphs(self): self.assertTrue(num_quantized_matmul > 0) gc.collect() + @parameterized.expand( + grid_parameters( + {"model_arch": SUPPORTED_DECODER_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS, "use_cache": [True, False]} + ) + ) + def test_decoder_quantization_with_and_without_cache(self, test_name, model_info, use_cache): + model_cls, model_name, expected_quantized_matmuls = model_info + qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=True) + model = model_cls.from_pretrained(model_name, export=True, use_cache=use_cache, use_io_binding=use_cache) + + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + output_dir = Path(tmp_dir) + quantizer = ORTQuantizer.from_pretrained(model) + quantizer.quantize(save_dir=output_dir, quantization_config=qconfig) + expected_ort_config = ORTConfig(quantization=qconfig) + ort_config = ORTConfig.from_pretrained(tmp_dir) + + # Verify the ORTConfig was correctly created and saved + self.assertEqual(ort_config.to_dict(), expected_ort_config.to_dict()) + quantized_model = onnx_load(output_dir.joinpath("model_quantized.onnx")) + num_quantized_matmul = 0 + for initializer in quantized_model.graph.initializer: + if "weight" in initializer.name and "quantized" in initializer.name: + num_quantized_matmul += 1 + self.assertEqual(expected_quantized_matmuls, num_quantized_matmul) + gc.collect() + class 
ORTStaticQuantizationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES_WITH_EXPECTED_QUANTIZED_MATMULS = ( @@ -184,10 +206,7 @@ def preprocess_function(examples, tokenizer): dataset_split="train", ) calibration_config = AutoCalibrationConfig.minmax(calibration_dataset) - ranges = quantizer.fit( - dataset=calibration_dataset, - calibration_config=calibration_config, - ) + ranges = quantizer.fit(dataset=calibration_dataset, calibration_config=calibration_config) quantizer.quantize( save_dir=output_dir, calibration_tensors_range=ranges, diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 09603c5e1e8..949cfa242e3 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -40,7 +40,7 @@ "clip": "hf-internal-testing/tiny-random-CLIPModel", "convbert": "hf-internal-testing/tiny-random-ConvBertModel", "convnext": "hf-internal-testing/tiny-random-convnext", - "codegen": "hf-internal-testing/tiny-random-CodeGenModel", + "codegen": "hf-internal-testing/tiny-random-CodeGenForCausalLM", "data2vec_text": "hf-internal-testing/tiny-random-Data2VecTextModel", "data2vec_vision": "hf-internal-testing/tiny-random-Data2VecVisionModel", "data2vec_audio": "hf-internal-testing/tiny-random-Data2VecAudioModel", @@ -62,7 +62,7 @@ "gpt_bigcode": "hf-internal-testing/tiny-random-GPTBigCodeModel", "gpt_neo": "hf-internal-testing/tiny-random-GPTNeoModel", "gpt_neox": "hf-internal-testing/tiny-random-GPTNeoXForCausalLM", - "gptj": "hf-internal-testing/tiny-random-GPTJModel", + "gptj": "hf-internal-testing/tiny-random-GPTJForCausalLM", "groupvit": "hf-internal-testing/tiny-random-groupvit", "hubert": "hf-internal-testing/tiny-random-HubertModel", "ibert": "hf-internal-testing/tiny-random-IBertModel",