From 5584eb86731d88477c22cdac85df723b918403be Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:51:49 +0200 Subject: [PATCH 01/35] Fix infer task for stable diffusion (#1793) * fix * apply suggestions --- optimum/exporters/tasks.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 7ccb0d9c7b5..e6e7920c60d 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -1560,7 +1560,14 @@ def _infer_task_from_model_name_or_path( library_name = TasksManager.infer_library_from_model(model_name_or_path, subfolder, revision) if library_name == "diffusers": - class_name = model_info.config["diffusers"]["class_name"] + if model_info.config["diffusers"].get("class_name", None): + class_name = model_info.config["diffusers"]["class_name"] + elif model_info.config["diffusers"].get("_class_name", None): + class_name = model_info.config["diffusers"]["_class_name"] + else: + raise ValueError( + f"Could not automatically infer the class name for {model_name_or_path}. Please open an issue at https://github.com/huggingface/optimum/issues." + ) inferred_task_name = "stable-diffusion-xl" if "StableDiffusionXL" in class_name else "stable-diffusion" elif library_name == "timm": inferred_task_name = "image-classification" From 35a81dceba60e51d38d2d45db5a3941e9ae690d8 Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:53:05 +0200 Subject: [PATCH 02/35] Add Nvidia and Neuron to README (#1791) improve --- README.md | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 4907ca62839..df6d22e62f8 100644 --- a/README.md +++ b/README.md @@ -14,14 +14,16 @@ python -m pip install optimum If you'd like to use the accelerator-specific features of 🤗 Optimum, you can install the required dependencies according to the table below: -| Accelerator | Installation | -|:-----------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------| -| [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade-strategy eager optimum[onnxruntime]` | -| [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade-strategy eager optimum[neural-compressor]`| -| [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade-strategy eager optimum[openvino,nncf]` | -| [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade-strategy eager optimum[amd]` | -| [Habana Gaudi Processor (HPU)](https://huggingface.co/docs/optimum/habana/index) | `pip install --upgrade-strategy eager optimum[habana]` | -| [FuriosaAI](https://huggingface.co/docs/optimum/furiosa/index) | `pip install --upgrade-strategy eager optimum[furiosa]` | +| Accelerator | Installation | +|:-----------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------| +| [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade-strategy eager optimum[onnxruntime]` | +| [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade-strategy eager 
optimum[neural-compressor]` | +| [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade-strategy eager optimum[openvino,nncf]` | +| [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` | +| [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade-strategy eager optimum[amd]` | +| [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade-strategy eager optimum[neuronx]` | +| [Habana Gaudi Processor (HPU)](https://huggingface.co/docs/optimum/habana/index) | `pip install --upgrade-strategy eager optimum[habana]` | +| [FuriosaAI](https://huggingface.co/docs/optimum/furiosa/index) | `pip install --upgrade-strategy eager optimum[furiosa]` | The `--upgrade-strategy eager` option is needed to ensure the different packages are upgraded to the latest possible version. @@ -45,6 +47,8 @@ python -m pip install optimum[onnxruntime]@git+https://github.com/huggingface/op - TensorFlow Lite - [OpenVINO](https://huggingface.co/docs/optimum/intel/inference) - Habana first-gen Gaudi / Gaudi2, more details [here](https://huggingface.co/docs/optimum/main/en/habana/usage_guides/accelerate_inference) +- AWS Inferentia 2 / Inferentia 1, more details [here](https://huggingface.co/docs/optimum-neuron/en/guides/models) +- NVIDIA TensorRT-LLM , more details [here](https://huggingface.co/blog/optimum-nvidia) The [export](https://huggingface.co/docs/optimum/exporters/overview) and optimizations can be done both programmatically and with a command line. @@ -190,6 +194,7 @@ optimum-cli export tflite \ We support many providers: - Habana's Gaudi processors +- AWS Trainium instances, check [here](https://huggingface.co/docs/optimum-neuron/en/guides/distributed_training) - ONNX Runtime (optimized for GPUs) ### Habana From dac864530ef170bd20877724317f1508ea7a0f3b Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Fri, 5 Apr 2024 01:58:30 -0700 Subject: [PATCH 03/35] adds debug options to dump onnx graphs (#1789) add debug options --- optimum/onnxruntime/trainer.py | 9 ++++++++- optimum/onnxruntime/training_args.py | 30 ++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/optimum/onnxruntime/trainer.py b/optimum/onnxruntime/trainer.py index 2b7fb654303..9bc2bb5134d 100644 --- a/optimum/onnxruntime/trainer.py +++ b/optimum/onnxruntime/trainer.py @@ -459,7 +459,14 @@ def _inner_training_loop( # Wrap the model with `ORTModule` logger.info("Wrap ORTModule for ONNX Runtime training.") - model = ORTModule(self.model) + if self.args.save_onnx: + from torch_ort import DebugOptions + + model = ORTModule( + self.model, DebugOptions(save_onnx=self.args.save_onnx, onnx_prefix=self.args.onnx_prefix) + ) + else: + model = ORTModule(self.model) self.model_wrapped = model self.model = model diff --git a/optimum/onnxruntime/training_args.py b/optimum/onnxruntime/training_args.py index b05da6a5ede..6aec362c07c 100644 --- a/optimum/onnxruntime/training_args.py +++ b/optimum/onnxruntime/training_args.py @@ -79,6 +79,29 @@ class ORTTrainingArguments(TrainingArguments): }, ) + save_onnx: Optional[bool] = field( + default=False, + metadata={ + "help": "Configure ORTModule to save onnx models. Defaults to False. \ + The output directory of the onnx models by default is set to args.output_dir. 
\ + To change the output directory, the environment variable ORTMODULE_SAVE_ONNX_PATH can be \ + set to the destination directory path." + }, + ) + + onnx_prefix: Optional[str] = field( + default=None, + metadata={"help": "Prefix for the saved ORTModule file names. Must be provided if save_onnx is True."}, + ) + + onnx_log_level: Optional[str] = field( + default="WARNING", + metadata={ + "help": "Configure ORTModule log level. Defaults to WARNING. \ + onnx_log_level can also be set to one of VERBOSE, INFO, WARNING, ERROR, FATAL." + }, + ) + # This method will not need to be overriden after the deprecation of `--adafactor` in version 5 of 🤗 Transformers. def __post_init__(self): # expand paths, if not os.makedirs("~/bar") will make directory @@ -244,6 +267,13 @@ def __post_init__(self): if version.parse(version.parse(torch.__version__).base_version) == version.parse("2.0.0") and self.fp16: raise ValueError("--optim adamw_torch_fused with --fp16 requires PyTorch>2.0") + if self.save_onnx: + if not self.onnx_prefix: + raise ValueError("onnx_prefix must be provided if save_onnx is True") + if not os.getenv("ORTMODULE_SAVE_ONNX_PATH", None): + os.environ["ORTMODULE_SAVE_ONNX_PATH"] = self.output_dir + os.environ["ORTMODULE_LOG_LEVEL"] = self.onnx_log_level + if ( is_torch_available() and (self.device.type != "cuda") From 2c06aa270a83b0a571dd286c94805c074a87a479 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:32:33 +0200 Subject: [PATCH 04/35] Improve PR template (#1799) indicate who can review --- .github/PULL_REQUEST_TEMPLATE.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 1bb8ea7670f..f24e554c89f 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -20,3 +20,13 @@ Fixes # (issue) - [ ] Did you make sure to update the documentation with your changes? - [ ] Did you write any new necessary tests? +## Who can review? + + From 5d194ddd58cb19f2abea6fd4f227c2e1ae3b4894 Mon Sep 17 00:00:00 2001 From: Funtowicz Morgan Date: Mon, 8 Apr 2024 16:10:39 +0200 Subject: [PATCH 05/35] Add Google TPU to the mix (#1797) Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> --- .../workflows/build_main_documentation.yml | 17 ++++++++++- .github/workflows/build_pr_documentation.yml | 17 ++++++++++- docs/combine_docs.py | 28 +++++++++++++++++++ docs/source/_redirects.yml | 3 ++ docs/source/index.mdx | 6 +++- 5 files changed, 68 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index 3f8c230a98b..f25ee611f6f 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -49,6 +49,11 @@ jobs: repository: 'huggingface/optimum-amd' path: optimum-amd + - uses: actions/checkout@v2 + with: + repository: 'huggingface/optimum-tpu' + path: optimum-tpu + - name: Free disk space run: | df -h @@ -150,6 +155,16 @@ jobs: mv furiosa-doc-build ../optimum cd .. + - name: Make TPU documentation + run: | + sudo docker system prune -a -f + cd optimum-tpu + pip install -U pip + pip install . + doc-builder build optimum.tpu docs/source/ --build_dir tpu-doc-build --version pr_$PR_NUMBER --version_tag_suffix "" --html --clean + mv tpu-doc-build ../optimum + cd .. 
+ - name: Make AMD documentation run: | sudo docker system prune -a -f @@ -171,7 +186,7 @@ jobs: - name: Combine subpackage documentation run: | cd optimum - sudo python docs/combine_docs.py --subpackages nvidia amd intel neuron habana furiosa --version ${{ env.VERSION }} + sudo python docs/combine_docs.py --subpackages nvidia amd intel neuron tpu habana furiosa --version ${{ env.VERSION }} cd .. - name: Push to repositories diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index 01d4c4e7a41..c1fc4d859ce 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -53,6 +53,11 @@ jobs: repository: 'huggingface/optimum-amd' path: optimum-amd + - uses: actions/checkout@v2 + with: + repository: 'huggingface/optimum-tpu' + path: optimum-tpu + - name: Setup environment run: | pip uninstall -y doc-builder @@ -91,6 +96,16 @@ jobs: sudo mv amd-doc-build ../optimum cd .. + - name: Make TPU documentation + run: | + sudo docker system prune -a -f + cd optimum-tpu + pip install -U pip + pip install . + doc-builder build optimum.tpu docs/source/ --build_dir tpu-doc-build --version pr_$PR_NUMBER --version_tag_suffix "" --html --clean + mv tpu-doc-build ../optimum + cd .. + - name: Make Optimum documentation run: | sudo docker system prune -a -f @@ -101,7 +116,7 @@ jobs: - name: Combine subpackage documentation run: | cd optimum - sudo python docs/combine_docs.py --subpackages nvidia amd intel neuron habana furiosa --version pr_$PR_NUMBER + sudo python docs/combine_docs.py --subpackages nvidia amd intel neuron tpu habana furiosa --version pr_$PR_NUMBER sudo mv optimum-doc-build ../ cd .. diff --git a/docs/combine_docs.py b/docs/combine_docs.py index 17d99a3a021..e01c6c83e86 100755 --- a/docs/combine_docs.py +++ b/docs/combine_docs.py @@ -108,6 +108,31 @@ def add_neuron_doc(base_toc: List): ) +def add_tpu_doc(base_toc: List): + """ + Extends the table of content with a section about Optimum TPU. + + Args: + base_toc (List): table of content for the doc of Optimum. 
+ """ + # Update optimum table of contents + base_toc.insert( + SUBPACKAGE_TOC_INSERT_INDEX, + { + "sections": [ + { + # Ideally this should directly point at https://huggingface.co/docs/optimum-tpu/index + # Current hacky solution is to have a redirection in _redirects.yml + "local": "docs/optimum-tpu/index", + "title": "🤗 Optimum-TPU", + } + ], + "title": "Google TPUs", + "isExpanded": False, + }, + ) + + def main(): args = parser.parse_args() optimum_path = Path("optimum-doc-build") @@ -121,6 +146,9 @@ def main(): if subpackage == "neuron": # Neuron has its own doc so it is managed differently add_neuron_doc(base_toc) + elif subpackage == "tpu": + # Optimum TPU has its own doc so it is managed differently + add_tpu_doc(base_toc) elif subpackage == "nvidia": # At the moment, Optimum Nvidia's doc is the README of the GitHub repo # It is linked to in optimum/docs/source/nvidia_overview.mdx diff --git a/docs/source/_redirects.yml b/docs/source/_redirects.yml index e3bfc0f93ba..9ad8becb84e 100644 --- a/docs/source/_redirects.yml +++ b/docs/source/_redirects.yml @@ -28,3 +28,6 @@ intel_trainer: intel/reference_inc # Optimum Neuron docs/optimum-neuron/index: /docs/optimum-neuron/index + +# Optimum TPU +docs/optimum-tpu/index: /docs/optimum-tpu/index diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 4e61e960d58..7eb79c33ed2 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -25,7 +25,7 @@ As such, Optimum enables developers to efficiently use any of these platforms wi The packages below enable you to get the best of the 🤗 Hugging Face ecosystem on various types of devices.
-
+
NVIDIA
Accelerate inference with NVIDIA TensorRT-LLM on the NVIDIA platform
@@ -42,6 +42,10 @@ The packages below enable you to get the best of the 🤗 Hugging Face ecosystem
AWS Trainium/Inferentia
Accelerate your training and inference workflows with AWS Trainium and AWS Inferentia
+Google TPUs
+Accelerate your training and inference workflows with Google TPUs
Habana
Maximize training throughput and efficiency with Habana's Gaudi processor

From 69f604c83cf628b43076c19ec139456eb624f901 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 8 Apr 2024 18:34:26 +0200 Subject: [PATCH 06/35] Add redirection for Optimum TPU (#1801) --- docs/source/_redirects.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/_redirects.yml b/docs/source/_redirects.yml index 9ad8becb84e..4022ba1618d 100644 --- a/docs/source/_redirects.yml +++ b/docs/source/_redirects.yml @@ -31,3 +31,4 @@ docs/optimum-neuron/index: /docs/optimum-neuron/index # Optimum TPU docs/optimum-tpu/index: /docs/optimum-tpu/index +tpu/index: /docs/optimum-tpu/index From 2ac676d0bbe97d3736a5809c6034428378699c4a Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Tue, 9 Apr 2024 01:08:09 +0200 Subject: [PATCH 07/35] Improve the installation of optimum-neuron through optimum extras (#1778) * pin for neuron * remove accelerate --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 81b3d74c9cc..7b555bbd887 100644 --- a/setup.py +++ b/setup.py @@ -81,8 +81,8 @@ "neural-compressor": "optimum-intel[neural-compressor]>=1.15.0", "graphcore": "optimum-graphcore", "habana": ["optimum-habana", "transformers >= 4.37.0, < 4.38.0"], - "neuron": "optimum-neuron[neuron]", - "neuronx": "optimum-neuron[neuronx]", + "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers == 4.36.2"], + "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers == 4.36.2"], "furiosa": "optimum-furiosa", "amd": "optimum-amd", "dev": TESTS_REQUIRE + QUALITY_REQUIRE, From 69af5dbab133f2e0ae892721759825d06f6cb3b7 Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Tue, 9 Apr 2024 13:34:46 +0200 Subject: [PATCH 08/35] Add Nvidia and Neuron to the installation doc (#1803) update doc --- docs/source/installation.mdx | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index cf84d429ea6..09b8632c72d 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -20,12 +20,14 @@ python -m pip install optimum If you'd like to use the accelerator-specific features of 🤗 Optimum, you can install the required dependencies according to the table below: -| Accelerator | Installation | -|:-----------------------------------------------------------------------------------------------------------------------|:--------------------------------------------------| +| Accelerator | Installation | +|:-----------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------| | [ONNX runtime](https://onnxruntime.ai/docs/) | `pip install --upgrade-strategy eager install optimum[onnxruntime]`| | [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager optimum[neural-compressor]` | | [Intel OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager optimum[openvino,nncf]` | -| [AMD Instinct GPUs and Ryzen AI NPU](https://www.amd.com/en/graphics/instinct-server-accelerators) | `pip install --upgrade-strategy eager optimum[amd]` | +| [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` | +| [AMD Instinct 
GPUs and Ryzen AI NPU](https://www.amd.com/en/graphics/instinct-server-accelerators) | `pip install --upgrade-strategy eager optimum[amd]` | +| [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade-strategy eager optimum[neuronx]` | | [Habana Gaudi Processor (HPU)](https://habana.ai/training/) | `pip install --upgrade-strategy eager optimum[habana]` | | [FuriosaAI](https://www.furiosa.ai/) | `pip install --upgrade-strategy eager optimum[furiosa]` | From 5ea14c17bc5ee4d0953df8962e43678ea680b1b3 Mon Sep 17 00:00:00 2001 From: pogzyb Date: Wed, 10 Apr 2024 03:02:01 -0400 Subject: [PATCH 09/35] Add support for markuplm ONNX export (#1784) * Add xpath dummy generator * Add markuplm onnx config * Update docs * Add model to tests * Get pad ids from normalized config * Use hf-internal model * Add markuplm to tiny exports * Apply formatting --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 20 ++++++++++ optimum/exporters/tasks.py | 7 ++++ optimum/utils/__init__.py | 1 + optimum/utils/input_generators.py | 52 +++++++++++++++++++++++++ optimum/utils/normalized_config.py | 1 + tests/exporters/exporters_utils.py | 2 + 7 files changed, 84 insertions(+) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index b3dc7e053cf..c52bed2be77 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -62,6 +62,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Llama - M2-M100 - Marian +- MarkupLM - MBart - Mistral - MobileBert diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index cd9d54eeca9..c708fd017ee 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -36,6 +36,7 @@ DummyVisionEmbeddingsGenerator, DummyVisionEncoderDecoderPastKeyValuesGenerator, DummyVisionInputGenerator, + DummyXPathSeqInputGenerator, FalconDummyPastKeyValuesGenerator, GemmaDummyPastKeyValuesGenerator, GPTBigCodeDummyPastKeyValuesGenerator, @@ -182,6 +183,25 @@ def inputs(self) -> Dict[str, Dict[int, str]]: return common_inputs +class MarkupLMOnnxConfig(BertOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTextInputGenerator, + DummyXPathSeqInputGenerator, + ) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + dynamic_axis = {0: "batch_size", 1: "sequence_length"} + xpath_dynamic_axis = {0: "batch_size", 1: "sequence_length", 2: "max_depth"} + return { + "input_ids": dynamic_axis, + "attention_mask": dynamic_axis, + "token_type_ids": dynamic_axis, + "xpath_subs_seq": xpath_dynamic_axis, + "xpath_tags_seq": xpath_dynamic_axis, + } + + class DebertaV2OnnxConfig(DebertaOnnxConfig): pass diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index e6e7920c60d..0c55ac3e63e 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -732,6 +732,13 @@ class TasksManager: "text-generation-with-past", onnx="MarianOnnxConfig", ), + "markuplm": supported_tasks_mapping( + "feature-extraction", + "text-classification", + "token-classification", + "question-answering", + onnx="MarkupLMOnnxConfig", + ), "mbart": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index b4e42121797..99ce8693d42 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -62,6 +62,7 @@ DummyVisionEmbeddingsGenerator, 
DummyVisionEncoderDecoderPastKeyValuesGenerator, DummyVisionInputGenerator, + DummyXPathSeqInputGenerator, FalconDummyPastKeyValuesGenerator, GemmaDummyPastKeyValuesGenerator, GPTBigCodeDummyPastKeyValuesGenerator, diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index 7f6df3e723a..2d80bbeb374 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -437,6 +437,58 @@ def generate( return self.random_int_tensor(shape, max_value, min_value=min_value, framework=framework, dtype=int_dtype) +class DummyXPathSeqInputGenerator(DummyTextInputGenerator): + """ + Generates dummy xpath sequences. + """ + + SUPPORTED_INPUT_NAMES = ( + "xpath_tags_seq", + "xpath_subs_seq", + ) + + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + num_choices: int = DEFAULT_DUMMY_SHAPES["num_choices"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + random_num_choices_range: Optional[Tuple[int, int]] = None, + padding_side: str = "right", + **kwargs, + ): + super().__init__( + task, + normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + num_choices=num_choices, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + random_num_choices_range=random_num_choices_range, + padding_side=padding_side, + **kwargs, + ) + self.max_depth = normalized_config.max_depth + self.tag_pad_id = normalized_config.tag_pad_id + self.subs_pad_id = normalized_config.subs_pad_id + + def generate( + self, + input_name: str, + framework: str = "pt", + int_dtype: str = "int64", + float_dtype: str = "fp32", + ): + min_value = 0 + max_value = self.tag_pad_id if input_name == "xpath_tags_seq" else self.subs_pad_id + shape = [self.batch_size, self.sequence_length, self.max_depth] + return self.random_int_tensor(shape, max_value, min_value=min_value, framework=framework, dtype=int_dtype) + + class DummyDecoderTextInputGenerator(DummyTextInputGenerator): """ Generates dummy decoder text inputs. 
diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index fe4ec1d1ed7..8a5ef377854 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ -239,6 +239,7 @@ class NormalizedConfigManager: "llama": NormalizedTextConfigWithGQA, "longt5": T5LikeNormalizedTextConfig, "marian": BartLikeNormalizedTextConfig, + "markuplm": NormalizedTextConfig, "mbart": BartLikeNormalizedTextConfig, "mistral": NormalizedTextConfigWithGQA, "mixtral": NormalizedTextConfigWithGQA, diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index afb7acb0b2e..4d987ed982f 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -111,6 +111,7 @@ # "longformer": "allenai/longformer-base-4096", "m2m-100": "hf-internal-testing/tiny-random-m2m_100", "marian": "sshleifer/tiny-marian-en-de", # hf-internal-testing ones are broken + "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "hf-internal-testing/tiny-random-mbart", "mistral": "echarlaix/tiny-random-mistral", "mobilebert": "hf-internal-testing/tiny-random-MobileBertModel", @@ -237,6 +238,7 @@ # "longformer": "allenai/longformer-base-4096", "m2m-100": "hf-internal-testing/tiny-random-m2m_100", # Not using facebook/m2m100_418M because it takes too much time for testing. "marian": "Helsinki-NLP/opus-mt-en-de", + "markuplm": "hf-internal-testing/tiny-random-MarkupLMModel", "mbart": "sshleifer/tiny-mbart", "mobilebert": "google/mobilebert-uncased", # "mobilenet_v1": "google/mobilenet_v1_0.75_192", From 2f75b0da85b95030b82b281205aa4284df2f998a Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Wed, 10 Apr 2024 10:54:34 +0200 Subject: [PATCH 10/35] Musicgen ONNX export (text-conditional only) (#1779) * WIP but need to work on encodec first * musicgen onnx export * better logs * add tests * rename audio_encoder_decode.onnx to encodec_decode.onnx * fix num heads in pkv * nits * add build_delay_pattern_mask * fix wrong hidden_size for cross attention pkv * fix tests * update doc --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/config.py | 7 + optimum/exporters/onnx/constants.py | 1 + optimum/exporters/onnx/convert.py | 5 +- optimum/exporters/onnx/model_configs.py | 307 +++++++++++++++++++++++- optimum/exporters/onnx/model_patcher.py | 137 ++++++++++- optimum/exporters/tasks.py | 7 +- optimum/exporters/utils.py | 46 ++++ optimum/onnx/transformations_utils.py | 2 +- optimum/utils/__init__.py | 3 + optimum/utils/input_generators.py | 132 +++++++++- tests/exporters/exporters_utils.py | 2 + 12 files changed, 637 insertions(+), 13 deletions(-) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index c52bed2be77..22471c297a5 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -71,6 +71,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - MobileNet v2 - MPNet - MT5 +- Musicgen (text-conditional only) - Nystromformer - OWL-ViT - Pegasus diff --git a/optimum/exporters/onnx/config.py b/optimum/exporters/onnx/config.py index c5052379480..0faf5048f60 100644 --- a/optimum/exporters/onnx/config.py +++ b/optimum/exporters/onnx/config.py @@ -383,6 +383,13 @@ def __init__( ) self._normalized_config.DECODER_NORMALIZED_CONFIG_CLASS = self._decoder_onnx_config._normalized_config + self._normalized_config.DECODER_NORMALIZED_CONFIG_CLASS = 
self._decoder_onnx_config._normalized_config + self._normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.encoder_num_attention_heads = ( + self._decoder_onnx_config._normalized_config.num_attention_heads + ) + self._normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.decoder_num_attention_heads = ( + self._decoder_onnx_config._normalized_config.num_attention_heads + ) if isinstance(self._decoder_onnx_config, OnnxSeq2SeqConfigWithPast): self._past_key_values_generator = ( diff --git a/optimum/exporters/onnx/constants.py b/optimum/exporters/onnx/constants.py index abdd78bbfac..0a6f9f9b363 100644 --- a/optimum/exporters/onnx/constants.py +++ b/optimum/exporters/onnx/constants.py @@ -36,5 +36,6 @@ SDPA_ARCHS_ONNX_EXPORT_NOT_SUPPORTED = [ "bart", + "musicgen", "whisper", ] diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 1ad0f896814..053a7a5aebe 100644 --- a/optimum/exporters/onnx/convert.py +++ b/optimum/exporters/onnx/convert.py @@ -258,7 +258,7 @@ def _run_validation( model_kwargs = model_kwargs if model_kwargs is not None else {} - logger.info(f"Validating ONNX model {onnx_model.as_posix()}...") + logger.info(f"\nValidating ONNX model {onnx_model.as_posix()}...") if atol is None: atol = config.ATOL_FOR_VALIDATION @@ -764,6 +764,9 @@ def export_models( output_path = output_dir / output_name output_path.parent.mkdir(parents=True, exist_ok=True) + logger.info( + f"\n***** Exporting submodel {i + 1}/{len(models_and_onnx_configs)}: {submodel.__class__.__name__} *****" + ) outputs.append( export( model=submodel, diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index c708fd017ee..72d047efa01 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -14,17 +14,22 @@ # limitations under the License. 
"""Model specific ONNX configurations.""" import random -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, Union from packaging import version from transformers.utils import is_tf_available +from ...onnx import merge_decoders from ...utils import ( DEFAULT_DUMMY_SHAPES, BloomDummyPastKeyValuesGenerator, DummyAudioInputGenerator, + DummyCodegenDecoderTextInputGenerator, DummyDecoderTextInputGenerator, + DummyEncodecInputGenerator, DummyInputGenerator, + DummyIntGenerator, DummyPastKeyValuesGenerator, DummyPix2StructInputGenerator, DummyPointsGenerator, @@ -47,6 +52,7 @@ NormalizedTextAndVisionConfig, NormalizedTextConfig, NormalizedVisionConfig, + is_diffusers_available, logging, ) from ...utils.normalized_config import NormalizedConfigManager @@ -62,8 +68,10 @@ TextSeq2SeqOnnxConfig, VisionOnnxConfig, ) +from .constants import ONNX_DECODER_MERGED_NAME, ONNX_DECODER_NAME, ONNX_DECODER_WITH_PAST_NAME from .model_patcher import ( FalconModelPatcher, + MusicgenModelPatcher, SAMModelPatcher, SentenceTransformersCLIPPatcher, SentenceTransformersTransformerPatcher, @@ -82,6 +90,9 @@ if is_tf_available(): from transformers.modeling_tf_utils import TFPreTrainedModel + if is_diffusers_available(): + from diffusers import ModelMixin + logger = logging.get_logger(__name__) @@ -1392,10 +1403,302 @@ def outputs(self) -> Dict[str, Dict[int, str]]: return common_outputs +class MusicgenOnnxConfig(OnnxSeq2SeqConfigWithPast): + # NOTE: Several warnings during the export are not to worry about: + # * for i, indices in enumerate(codes): --> can be unrolled, fixed length (num_quantizers). + # * max_pad = max(padding_left, padding_right) --> does not impact later controlflows. + # if length <= max_pad: --> appears to be always False for Musicgen. + + # opset>=13 needed to avoid a bug in T5 encoder SelfAttention. + # opset>=14 needed for torch.tril export. + DEFAULT_ONNX_OPSET = 14 + + VARIANTS = { + "text-conditional-with-past": "Exports Musicgen to ONNX to generate audio samples conditioned on a text prompt (Reference: https://huggingface.co/docs/transformers/model_doc/musicgen#text-conditional-generation). This uses the decoder KV cache. The following subcomponents are exported:\n\t\t* text_encoder.onnx: corresponds to the text encoder part in https://github.com/huggingface/transformers/blob/v4.39.1/src/transformers/models/musicgen/modeling_musicgen.py#L1457.\n\t\t* encodec_decode.onnx: corresponds to the Encodec audio encoder part in https://github.com/huggingface/transformers/blob/v4.39.1/src/transformers/models/musicgen/modeling_musicgen.py#L2472-L2480.\n\t\t* decoder_model.onnx: The Musicgen decoder, without past key values input, and computing cross attention. Not required at inference (use decoder_model_merged.onnx instead).\n\t\t* decoder_with_past_model.onnx: The Musicgen decoder, with past_key_values input (KV cache filled), not computing cross attention. Not required at inference (use decoder_model_merged.onnx instead).\n\t\t* decoder_model_merged.onnx: The two previous models fused in one, to avoid duplicating weights. A boolean input `use_cache_branch` allows to select the branch to use. 
In the first forward pass where the KV cache is empty, dummy past key values inputs need to be passed and are ignored with use_cache_branch=False.\n\t\t* build_delay_pattern_mask.onnx: A model taking as input `input_ids`, `pad_token_id`, `max_length`, and building a delayed pattern mask to the input_ids. Implements https://github.com/huggingface/transformers/blob/v4.39.3/src/transformers/models/musicgen/modeling_musicgen.py#L1054.", + } + # TODO: support audio-prompted generation (- audio_encoder_encode.onnx: corresponds to the audio encoder part in https://github.com/huggingface/transformers/blob/f01e1609bf4dba146d1347c1368c8c49df8636f6/src/transformers/models/musicgen/modeling_musicgen.py#L2087.\n\t) + # With that, we have full Encodec support. + DEFAULT_VARIANT = "text-conditional-with-past" + + NORMALIZED_CONFIG_CLASS = NormalizedEncoderDecoderConfig + + DUMMY_INPUT_GENERATOR_CLASSES = ( + DummyTextInputGenerator, + DummyCodegenDecoderTextInputGenerator, + DummySeq2SeqPastKeyValuesGenerator, + DummyEncodecInputGenerator, + DummyIntGenerator, + ) + DUMMY_PKV_GENERATOR_CLASS = DummySeq2SeqPastKeyValuesGenerator + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + use_past: bool = False, + use_past_in_inputs: bool = False, + behavior: ConfigBehavior = ConfigBehavior.ENCODER, + preprocessors: Optional[List[Any]] = None, + model_part: Optional[Literal["text_encoder", "encodec_decode", "decoder", "build_delay_pattern_mask"]] = None, + legacy: bool = False, + variant: str = "text-conditional-with-past", + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + use_past=use_past, + use_past_in_inputs=use_past_in_inputs, + behavior=behavior, + preprocessors=preprocessors, + legacy=legacy, + ) + if legacy: + raise ValueError("Musicgen does not support legacy=True.") + + if ( + model_part in ["text_encoder", "encodec_decode", "build_delay_pattern_mask"] + and behavior != ConfigBehavior.ENCODER + ): + raise ValueError( + f"model_part is {model_part} and behavior is {behavior}. This is not supported, please open an issue at https://github.com/huggingface/optimum/issues." + ) + + if model_part == "decoder" and behavior != ConfigBehavior.DECODER: + raise ValueError( + f"model_part is {model_part} and behavior is {behavior}. This is not supported, please open an issue at https://github.com/huggingface/optimum/issues." + ) + + if behavior == ConfigBehavior.MONOLITH: + raise ValueError( + "Musicgen does not support behavior=ConfigBehavior.MONOLITH. Please open an issue at https://github.com/huggingface/optimum/issues." + ) + + if config.audio_encoder.model_type != "encodec": + raise ValueError( + f"Optimum ONNX export for Musicgen supports only Encodec as the audio encoder, got: {config.audio_encoder.model_type}. Please open an issue at https://github.com/huggingface/optimum/issues." + ) + + # Handling it would require to trace the audio_encoder.decode with torch.jit.script as we than have an unrollable loop. + if config.audio_encoder.chunk_length_s is not None: + raise ValueError( + f"Musicgen ONNX export currently does not support audio_encoder.chunk_length_s not None (got {config.audio_encoder.chunk_length_s}). Please open an issue at https://github.com/huggingface/optimum/issues." + ) + + self.model_part = model_part + if self.model_part == "decoder": + self.use_past = True # without past is not supported, hard-code it here. 
+ + self._normalized_config.ENCODER_NORMALIZED_CONFIG_CLASS = NormalizedTextConfig(self._config.text_encoder) + self._normalized_config.DECODER_NORMALIZED_CONFIG_CLASS = NormalizedConfig(self._config.decoder) + self._normalized_config.decoder_num_layers = self._config.decoder.num_hidden_layers + self._normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.num_layers = self._config.decoder.num_hidden_layers + self._normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.encoder_num_attention_heads = ( + self._config.decoder.num_attention_heads + ) + self._normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.decoder_num_attention_heads = ( + self._config.decoder.num_attention_heads + ) + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + # Batched inference is not supported in Transformers. + if self.model_part == "text_encoder": + common_inputs = { + "input_ids": {0: "batch_size", 1: "encoder_sequence_length"}, + "attention_mask": {0: "batch_size", 1: "encoder_sequence_length"}, + } + elif self.model_part == "encodec_decode": + # 0: always 1 for chunk_length_s=None, 2: num_quantizers fixed. + common_inputs = {"audio_codes": {1: "batch_size", 3: "chunk_length"}} + elif self.model_part == "build_delay_pattern_mask": + common_inputs = { + "input_ids": {0: "batch_size_x_num_codebooks"}, + "pad_token_id": {}, + "max_length": {}, + } + elif self._behavior is ConfigBehavior.DECODER: + # Naming it total_batch_size as in case we use guidance_scale, the dimension 0 may be larger than simply the batch_size. + # Reference: https://github.com/huggingface/transformers/blob/31c575bcf13c2b85b65d652dd1b5b401f99be999/src/transformers/models/musicgen/modeling_musicgen.py#L1932-L1935 + common_inputs = { + "decoder_input_ids": {0: "total_batch_size_x_num_codebooks"}, + "encoder_outputs": {0: "total_batch_size", 1: "encoder_sequence_length"}, + # MusicgenForConditionalGeneration maps attention_mask to encoder_attention_mask. + "attention_mask": { + 0: "batch_size", + 1: "encoder_sequence_length", + }, + } + if self.use_past_in_inputs: + # TODO: validate the axis name for attention_mask + # common_inputs["attention_mask"][1] = "past_encoder_sequence_length + sequence_length" + self.add_past_key_values(common_inputs, direction="inputs") + else: + common_inputs["decoder_input_ids"] = { + 0: "total_batch_size_x_num_codebooks", + 1: "decoder_sequence_length", + } + else: + raise ValueError( + "This should not happen. Please open an issue at https://github.com/huggingface/optimum/issues." + ) + + return common_inputs + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + common_outputs = {} + + if self.model_part == "text_encoder": + common_outputs = super().outputs + elif self.model_part == "encodec_decode": + common_outputs["audio_values"] = {0: "batch_size", 2: "audio_length"} + elif self.model_part == "build_delay_pattern_mask": + common_outputs["input_ids_edited"] = {0: "total_batch_size_x_num_codebooks"} + common_outputs["delay_pattern_mask"] = {0: "total_batch_size_x_num_codebooks", 1: "max_length"} + elif self._behavior is ConfigBehavior.DECODER: + common_outputs = super().outputs + + # MusicgenForConditionalGeneration output is named logits, not last_hidden_state. + # Rename last_hidden_state -> logits while keeping the order. + common_outputs = { + "logits" if name == "last_hidden_state" else name: value for name, value in common_outputs.items() + } + else: + raise ValueError( + "This should not happen. Please open an issue at https://github.com/huggingface/optimum/issues." 
+ ) + + return common_outputs + + def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + if direction == "inputs": + decoder_sequence_name = "past_decoder_sequence_length" + name = "past_key_values" + else: + decoder_sequence_name = "past_decoder_sequence_length + 1" + name = "present" + + for i in range(self._normalized_config.decoder_num_layers): + inputs_or_outputs[f"{name}.{i}.decoder.key"] = {0: "total_batch_size", 2: decoder_sequence_name} + inputs_or_outputs[f"{name}.{i}.decoder.value"] = {0: "total_batch_size", 2: decoder_sequence_name} + + if ( + self.is_merged is True + or (self._behavior is ConfigBehavior.DECODER and not self.use_past_in_inputs) + or direction == "inputs" + ): + # TODO: we only need to call it encoder_sequence_length_out in the merge case - but at torch.onnx.export() + # time we have currently no case to check whether we will merge at a later step or not (self.is_merged is + # not yet set at this time) + inputs_or_outputs[f"{name}.{i}.encoder.key"] = { + 0: "total_batch_size", + 2: "encoder_sequence_length_out", + } + inputs_or_outputs[f"{name}.{i}.encoder.value"] = { + 0: "total_batch_size", + 2: "encoder_sequence_length_out", + } + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return MusicgenModelPatcher(self, model, model_kwargs=model_kwargs) + + @property + def torch_to_onnx_input_map(self) -> Dict[str, str]: + if self._behavior is ConfigBehavior.DECODER: + return { + "decoder_input_ids": "input_ids", + "encoder_outputs": "encoder_hidden_states", + "attention_mask": "encoder_attention_mask", + } + return {} + + def post_process_exported_models( + self, + path: Path, + models_and_onnx_configs: Dict[ + str, Tuple[Union["PreTrainedModel", "TFPreTrainedModel", "ModelMixin"], "OnnxConfig"] + ], + onnx_files_subpaths: List[str], + ): + # Attempt to merge only if the decoder was exported without/with past, and ignore seq2seq models exported with text-generation task + if "with-past" in self.variant: + decoder_path = Path(path, onnx_files_subpaths[2]) + decoder_with_past_path = Path(path, onnx_files_subpaths[3]) + decoder_merged_path = Path(path, ONNX_DECODER_MERGED_NAME + ".onnx") + try: + # The decoder with past does not output the cross attention past key values as they are constant, + # hence the need for strict=False + merge_decoders( + decoder=decoder_path, + decoder_with_past=decoder_with_past_path, + save_path=decoder_merged_path, + strict=False, + ) + except Exception as e: + raise Exception(f"Unable to merge decoders. 
Detailed error: {e}") + + # In order to do the validation of the two branches on the same file + text_encoder_path = onnx_files_subpaths[0] + encodec_decode_path = onnx_files_subpaths[1] + build_delay_pattern_mask_path = onnx_files_subpaths[4] + + onnx_files_subpaths_new = [ + text_encoder_path, + encodec_decode_path, + decoder_merged_path.name, + decoder_merged_path.name, + build_delay_pattern_mask_path, + ] + + # We validate the two branches of the decoder model then + models_and_onnx_configs[ONNX_DECODER_NAME][1].is_merged = True + models_and_onnx_configs[ONNX_DECODER_NAME][1].use_cache_branch = False + + # Past key values won't be generated by default, but added in the input + models_and_onnx_configs[ONNX_DECODER_NAME][1].use_past_in_inputs = True + + models_and_onnx_configs[ONNX_DECODER_WITH_PAST_NAME][1].use_cache_branch = True + models_and_onnx_configs[ONNX_DECODER_WITH_PAST_NAME][1].is_merged = True + else: + onnx_files_subpaths_new = onnx_files_subpaths + + return models_and_onnx_configs, onnx_files_subpaths_new + + def overwrite_shape_and_generate_input( + self, dummy_input_gen: "DummyInputGenerator", input_name: str, framework: str, input_shapes: Dict + ): + if self.model_part == "build_delay_pattern_mask" and input_name == "input_ids": + original_batch_size = dummy_input_gen.batch_size + dummy_input_gen.batch_size = ( + original_batch_size * dummy_input_gen.normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.num_codebooks + ) + + dummy_input = dummy_input_gen.generate( + input_name, framework=framework, int_dtype=self.int_dtype, float_dtype=self.float_dtype + ) + + dummy_input_gen.batch_size = original_batch_size + + else: + dummy_input = super().overwrite_shape_and_generate_input( + dummy_input_gen, input_name, framework, input_shapes + ) + + return dummy_input + + class SpeechT5OnnxConfig(OnnxSeq2SeqConfigWithPast): # TODO: Transformers batched generation for Speecht5 is BROKEN (https://github.com/huggingface/transformers/pull/25943), # so we won't support for now. - NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args(decoder_num_layers="decoder_layers") NORMALIZED_CONFIG_CLASS = NormalizedSeq2SeqConfig.with_args( hidden_size="hidden_size", num_attention_heads="encoder_attention_heads", # TODO: bugged in case encoder and decoder have different number of heads diff --git a/optimum/exporters/onnx/model_patcher.py b/optimum/exporters/onnx/model_patcher.py index 523d1ae0ed1..0a105343546 100644 --- a/optimum/exporters/onnx/model_patcher.py +++ b/optimum/exporters/onnx/model_patcher.py @@ -254,7 +254,6 @@ def patched_forward(*args, **kwargs): elif self.real_config._behavior == "decoder" and self.real_config.use_past_in_inputs: # The filtering happens here. The decoder with use_past_in_inputs=True corresponds to the autoregressive one. filterd_outputs[name] = tuple([v[:2] for v in value]) - return filterd_outputs self.patched_forward = patched_forward @@ -796,3 +795,139 @@ def patched_forward(input_ids, attention_mask, pixel_values): return {"text_embeds": text_embeds, "image_embeds": image_embeds} self.patched_forward = patched_forward + + +# Triu with possible dynamic `diagonal` argument. Not possible with torch.triu unfortunately. 
+def triu_onnx(x, diagonal=0): + l, w = x.shape + arange_rows = torch.arange(l, device=x.device) + + arange_cols = torch.arange(w, device=x.device) + mask = arange_cols.expand(l, w) + + arange_rows = arange_rows[:, None] + diagonal + mask = mask >= arange_rows + return x.masked_fill(mask == 0, 0) + + +def patched_build_delay_pattern_mask(self, input_ids: torch.Tensor, pad_token_id: int, max_length: int = None): + # (bsz * num_codebooks, seq_len) -> (bsz, num_codebooks, seq_len) + input_ids = input_ids.reshape(-1, self.num_codebooks, input_ids.shape[-1]) + bsz, num_codebooks, seq_len = input_ids.shape + + max_length = max_length if max_length is not None else self.generation_config.max_length + input_ids_shifted = torch.ones((bsz, num_codebooks, max_length), dtype=torch.long, device=input_ids.device) * -1 + + channel_codebooks = num_codebooks // 2 if self.config.audio_channels == 2 else num_codebooks + # we only apply the mask if we have a large enough seq len - otherwise we return as is + if max_length < 2 * channel_codebooks - 1: + raise NotImplementedError("Not supported in ONNX export. Please open an issue in Optimum repository.") + + # fill the shifted ids with the prompt entries, offset by the codebook idx + for codebook in range(channel_codebooks): + if self.config.audio_channels == 1: + # mono channel - loop over the codebooks one-by-one + input_ids_shifted[:, codebook, codebook : seq_len + codebook] = input_ids[:, codebook] + else: + # left/right channels are interleaved in the generated codebooks, so handle one then the other + input_ids_shifted[:, 2 * codebook, codebook : seq_len + codebook] = input_ids[:, 2 * codebook] + input_ids_shifted[:, 2 * codebook + 1, codebook : seq_len + codebook] = input_ids[:, 2 * codebook + 1] + + # construct a pattern mask that indicates the positions of padding tokens for each codebook + # first fill the upper triangular part (the EOS padding) + # NOTE: We could use torch.bool here, but PyTorch the complains with `The exported ONNX model failed ONNX shape inference.` + # Using int8 leads to `Could not find an implementation for Where` + delay_pattern = triu_onnx( + torch.ones((channel_codebooks, max_length), dtype=torch.int32), diagonal=max_length - channel_codebooks + 1 + ) + + # NOTE: We could use torch.bool here, but PyTorch the complains with `The exported ONNX model failed ONNX shape inference.` + # Using int32 leads to `Could not find an implementation for Trilu`, hence int64 here + + # then fill the lower triangular part (the BOS padding) + delay_pattern = delay_pattern + torch.tril(torch.ones((channel_codebooks, max_length), dtype=torch.int64)) + delay_pattern = delay_pattern.to(torch.bool) + + if self.config.audio_channels == 2: + # for left/right channel we need to duplicate every row of the pattern mask in an interleaved fashion + delay_pattern = delay_pattern.repeat_interleave(2, dim=0) + + mask = ~delay_pattern.to(input_ids.device) + input_ids = mask * input_ids_shifted + ~mask * pad_token_id + + # find the first position to start generating - this is the first place we have the -1 token + # and will always be in the first codebook (since it has no codebook offset) + first_codebook_ids = input_ids[:, 0, :] + start_ids = (first_codebook_ids == -1).nonzero()[:, 1] + + # TODO: Is this OK? 
+ first_start_id = start_ids.min() + + # (bsz * num_codebooks, seq_len) -> (bsz, num_codebooks, seq_len) + pattern_mask = input_ids.reshape(bsz * num_codebooks, -1) + input_ids_edited = input_ids[..., :first_start_id].reshape(bsz * num_codebooks, -1) + return {"input_ids_edited": input_ids_edited, "delay_pattern_mask": pattern_mask} + + +class MusicgenModelPatcher(Seq2SeqModelPatcher): + def __enter__(self): + self.patch_ops() + if self.real_config.model_part == "build_delay_pattern_mask": + # For build_delay_pattern_mask, we need to override the signature too. + self._model.forward = types.MethodType(patched_build_delay_pattern_mask, self._model) + else: + setattr(self._model, self.orig_forward_name, self.patched_forward) + + def __exit__(self, exc_type, exc_value, traceback): + self.restore_ops() + if self.real_config.model_part == "build_delay_pattern_mask": + self._model.forward = self.original_decoder_forward + else: + setattr(self._model, self.orig_forward_name, self.orig_forward) + + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + super().__init__(config, model, model_kwargs) + + if config.model_part == "build_delay_pattern_mask": + self.original_decoder_forward = self.orig_forward + elif config.model_part == "encodec_decode": + # EncodecModel.forward -> EncodecModel.decode + @functools.wraps(self.orig_forward) + def patched_forward( + input_values: Optional["torch.Tensor"] = None, + padding_mask: Optional["torch.Tensor"] = None, + audio_codes: Optional["torch.Tensor"] = None, + bandwidth: Optional[float] = None, + audio_scales: Optional["torch.Tensor"] = None, + return_dict: Optional[bool] = None, + ): + chunk_length = self.real_config._config.audio_encoder.chunk_length + if chunk_length is None: + if audio_scales is not None: + audio_scales = audio_scales[0] + + if len(audio_codes) != 1: + raise ValueError(f"Expected one frame, got {len(audio_codes)}") + audio_values = self._model._decode_frame(audio_codes[0], audio_scales) + else: + raise ValueError("Not supported, a meaningful error should have been raised ahead.") + decoded_frames = [] + + for frame, scale in zip(audio_codes, audio_scales): + frames = self._model._decode_frame(frame, scale) + decoded_frames.append(frames) + + audio_values = self._model._linear_overlap_add(decoded_frames, self.config.chunk_stride or 1) + + # truncate based on padding mask + if padding_mask is not None and padding_mask.shape[-1] < audio_values.shape[-1]: + audio_values = audio_values[..., : padding_mask.shape[-1]] + + return {"audio_values": audio_values} + + self.patched_forward = patched_forward diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 0c55ac3e63e..d18a9ebb1ca 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -177,7 +177,7 @@ class TasksManager: "object-detection": "AutoModelForObjectDetection", "question-answering": "AutoModelForQuestionAnswering", "semantic-segmentation": "AutoModelForSemanticSegmentation", - "text-to-audio": "AutoModelForTextToSpectrogram", + "text-to-audio": ("AutoModelForTextToSpectrogram", "AutoModelForTextToWaveform"), "text-generation": "AutoModelForCausalLM", "text2text-generation": "AutoModelForSeq2SeqLM", "text-classification": "AutoModelForSequenceClassification", @@ -334,6 +334,7 @@ class TasksManager: # TODO: some models here support text-generation export but are not supported in ORTModelForCausalLM # Set of model topologies we support associated 
to the tasks supported by each topology and the factory + # TODO: remove `-with-past` tasks and rather rely on `variant`. _SUPPORTED_MODEL_TYPE = { "audio-spectrogram-transformer": supported_tasks_mapping( "feature-extraction", @@ -813,6 +814,10 @@ class TasksManager: "text2text-generation-with-past", onnx="MT5OnnxConfig", ), + "musicgen": supported_tasks_mapping( + "text-to-audio", # "variant" handles the "-with-past". We should generalize that. + onnx="MusicgenOnnxConfig", + ), "m2m-100": supported_tasks_mapping( "feature-extraction", "feature-extraction-with-past", diff --git a/optimum/exporters/utils.py b/optimum/exporters/utils.py index abec09ff5e8..74d2d983850 100644 --- a/optimum/exporters/utils.py +++ b/optimum/exporters/utils.py @@ -345,6 +345,50 @@ def get_stable_diffusion_models_for_export( return models_for_export +def get_musicgen_models_for_export(model: Union["PreTrainedModel", "TFPreTrainedModel"], config: "ExportConfig"): + models_for_export = { + "text_encoder": model.text_encoder, + "encodec_decode": model.audio_encoder, + # For the decoder, we do not pass model.decoder because we may need to export model.enc_to_dec_proj + DECODER_NAME: model, + DECODER_WITH_PAST_NAME: model, + "build_delay_pattern_mask": model.decoder, + } + + text_encoder_config = config.__class__( + model.config, task=config.task, legacy=False, model_part="text_encoder", variant=config.variant + ) + models_for_export["text_encoder"] = (models_for_export["text_encoder"], text_encoder_config) + + audio_encoder_config = config.__class__( + model.config, task=config.task, legacy=False, model_part="encodec_decode", variant=config.variant + ) + models_for_export["encodec_decode"] = (models_for_export["encodec_decode"], audio_encoder_config) + + use_past = "with-past" in config.variant + decoder_export_config = config.with_behavior("decoder", use_past=use_past, use_past_in_inputs=False) + decoder_export_config.model_part = "decoder" + models_for_export[DECODER_NAME] = (models_for_export[DECODER_NAME], decoder_export_config) + + if "with-past" in config.variant: + decoder_export_config_with_past = config.with_behavior("decoder", use_past=True, use_past_in_inputs=True) + decoder_export_config_with_past.model_part = "decoder" + models_for_export[DECODER_WITH_PAST_NAME] = ( + models_for_export[DECODER_WITH_PAST_NAME], + decoder_export_config_with_past, + ) + + build_delay_pattern_mask_config = config.__class__( + model.config, task=config.task, legacy=False, model_part="build_delay_pattern_mask", variant=config.variant + ) + models_for_export["build_delay_pattern_mask"] = ( + models_for_export["build_delay_pattern_mask"], + build_delay_pattern_mask_config, + ) + + return models_for_export + + def _get_submodels_for_export_sam(model, variant): models_for_export = {} @@ -513,6 +557,8 @@ def _get_submodels_and_export_configs( models_and_export_configs = get_sam_models_for_export(model, export_config) elif model.config.model_type == "speecht5": models_and_export_configs = get_speecht5_models_for_export(model, export_config, model_kwargs) + elif model.config.model_type == "musicgen": + models_and_export_configs = get_musicgen_models_for_export(model, export_config) else: models_and_export_configs = {"model": (model, export_config)} diff --git a/optimum/onnx/transformations_utils.py b/optimum/onnx/transformations_utils.py index 05931753bfd..1f0765112e8 100644 --- a/optimum/onnx/transformations_utils.py +++ b/optimum/onnx/transformations_utils.py @@ -160,7 +160,7 @@ def _unify_onnx_outputs(model1: ModelProto, model2: 
ModelProto, strict: bool): else: logger.info( f"The two models proto have different outputs ({len(model1_outputs)} and {len(model2_outputs)} outputs)." - " Constant outputs will be added to unify the two models outputs." + " Constant outputs will be added to unify the two models outputs. This is expected for encoder-decoder models where cached cross-attention key/values are constant outputs, omitted in the model with KV cache." ) if model2_outputs.issubset(model1_outputs) is False: diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 99ce8693d42..07be3f7e1a6 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -48,8 +48,11 @@ BloomDummyPastKeyValuesGenerator, DummyAudioInputGenerator, DummyBboxInputGenerator, + DummyCodegenDecoderTextInputGenerator, DummyDecoderTextInputGenerator, + DummyEncodecInputGenerator, DummyInputGenerator, + DummyIntGenerator, DummyLabelsGenerator, DummyPastKeyValuesGenerator, DummyPix2StructInputGenerator, diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py index 2d80bbeb374..ec27fe8db4b 100644 --- a/optimum/utils/input_generators.py +++ b/optimum/utils/input_generators.py @@ -401,7 +401,12 @@ def __init__( **kwargs, ): self.task = task - self.vocab_size = normalized_config.vocab_size + + if isinstance(normalized_config, NormalizedEncoderDecoderConfig): + self.vocab_size = normalized_config.vocab_size + else: + self.vocab_size = normalized_config.vocab_size + if random_batch_size_range: low, high = random_batch_size_range self.batch_size = random.randint(low, high) @@ -418,6 +423,7 @@ def __init__( else: self.num_choices = num_choices self.padding_side = padding_side + self.normalized_config = normalized_config def generate( self, @@ -609,7 +615,7 @@ class DummySeq2SeqPastKeyValuesGenerator(DummyInputGenerator): def __init__( self, task: str, - normalized_config: NormalizedSeq2SeqConfig, + normalized_config: Union[NormalizedSeq2SeqConfig, NormalizedEncoderDecoderConfig], batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], encoder_sequence_length: Optional[int] = None, @@ -632,18 +638,37 @@ def __init__( self.sequence_length if encoder_sequence_length is None else encoder_sequence_length ) + if isinstance(normalized_config, NormalizedEncoderDecoderConfig): + # encoder_num_attention_heads / decoder_num_attention_heads are bad names, they rather refer to cross / self attention num heads. + self.encoder_num_attention_heads = ( + self.normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.encoder_num_attention_heads + ) + self.decoder_num_attention_heads = ( + self.normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.decoder_num_attention_heads + ) + # Same, `encoder_hidden_size` and `decoder_hidden_size` are bad names. 
+ self.encoder_hidden_size = self.normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.hidden_size + self.decoder_hidden_size = self.normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.hidden_size + self.decoder_num_layers = self.normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.num_layers + else: + self.encoder_num_attention_heads = self.normalized_config.encoder_num_attention_heads + self.decoder_num_attention_heads = self.normalized_config.decoder_num_attention_heads + self.encoder_hidden_size = self.normalized_config.hidden_size + self.decoder_hidden_size = self.normalized_config.hidden_size + self.decoder_num_layers = self.normalized_config.decoder_num_layers + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): encoder_shape = ( self.batch_size, - self.normalized_config.encoder_num_attention_heads, + self.encoder_num_attention_heads, self.encoder_sequence_length, - self.normalized_config.hidden_size // self.normalized_config.encoder_num_attention_heads, + self.encoder_hidden_size // self.encoder_num_attention_heads, ) decoder_shape = ( self.batch_size, - self.normalized_config.decoder_num_attention_heads, + self.decoder_num_attention_heads, self.sequence_length, - self.normalized_config.hidden_size // self.normalized_config.decoder_num_attention_heads, + self.decoder_hidden_size // self.decoder_num_attention_heads, ) return [ ( @@ -652,7 +677,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int self.random_float_tensor(encoder_shape, framework=framework, dtype=float_dtype), self.random_float_tensor(encoder_shape, framework=framework, dtype=float_dtype), ) - for _ in range(self.normalized_config.decoder_num_layers) + for _ in range(self.decoder_num_layers) ] @@ -1277,3 +1302,96 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int ) for _ in range(self.num_layers) ] + + +class DummyCodegenDecoderTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator): + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + num_choices: int = DEFAULT_DUMMY_SHAPES["num_choices"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + random_num_choices_range: Optional[Tuple[int, int]] = None, + **kwargs, + ): + super().__init__( + task, + normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + num_choices=num_choices, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + random_num_choices_range=random_num_choices_range, + ) + self.num_codebooks = normalized_config.DECODER_NORMALIZED_CONFIG_CLASS.num_codebooks + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name in ["decoder_input_ids"]: + min_value = 0 + max_value = 2 if input_name != "input_ids" else self.vocab_size + shape = [self.batch_size * self.num_codebooks, self.sequence_length] + return self.random_int_tensor(shape, max_value, min_value=min_value, framework=framework, dtype=int_dtype) + + return super().generate(input_name, framework=framework, int_dtype=int_dtype, float_dtype=float_dtype) + + +class DummyEncodecInputGenerator(DummyInputGenerator): + SUPPORTED_INPUT_NAMES = ("audio_codes",) + + def __init__( + self, + task: str, + 
normalized_config: NormalizedConfig, + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + **kwargs, + ): + self.task = task + self.batch_size = batch_size + + self.num_codebooks = normalized_config.decoder.num_codebooks + self.sequence_length = sequence_length + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + if input_name == "audio_codes": + # Kind of a hack to use `self.sequence_length` here, for Musicgen pad tokens are filtered out, see + # https://github.com/huggingface/transformers/blob/31c575bcf13c2b85b65d652dd1b5b401f99be999/src/transformers/models/musicgen/modeling_musicgen.py#L2458 + shape = [1, self.batch_size, self.num_codebooks, self.sequence_length] + else: + raise ValueError(f"Unsupported input {input_name} for DummyEncodecInputGenerator") + + return self.random_int_tensor( + shape=shape, + min_value=0, + max_value=50, + framework=framework, + dtype=int_dtype, + ) + + +class DummyIntGenerator(DummyInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "pad_token_id", + "max_length", + ) + + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + **kwargs, + ): + pass + + def generate( + self, + input_name: str, + framework: str = "pt", + int_dtype: str = "int64", + float_dtype: str = "fp32", + ): + return self.random_int_tensor(shape=(1,), min_value=20, max_value=22, framework=framework, dtype=int_dtype) diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index 4d987ed982f..bc1d8a4a289 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -121,6 +121,7 @@ "mpnet": "hf-internal-testing/tiny-random-MPNetModel", "mpt": "hf-internal-testing/tiny-random-MptForCausalLM", "mt5": "lewtun/tiny-random-mt5", + "musicgen": "hf-internal-testing/tiny-random-MusicgenForConditionalGeneration", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", "opt": "hf-internal-testing/tiny-random-OPTModel", "owlv2": "hf-internal-testing/tiny-random-Owlv2Model", @@ -246,6 +247,7 @@ "mobilevit": "apple/mobilevit-small", "mpt": "mosaicml/mpt-7b", "mt5": "lewtun/tiny-random-mt5", # Not using google/mt5-small because it takes too much time for testing. 
+ "musicgen": "facebook/musicgen-small", "nystromformer": "hf-internal-testing/tiny-random-NystromformerModel", "owlv2": "google/owlv2-base-patch16", "owlvit": "google/owlvit-base-patch32", From fca7e99b4bee11734855ea326e78bf7efbfee68f Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 10 Apr 2024 14:32:45 +0200 Subject: [PATCH 11/35] Update installation instructions (#1806) * Update installation instructions * update --- README.md | 27 ++++++++++++++------------- docs/source/installation.mdx | 24 ++++++++++++------------ 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/README.md b/README.md index df6d22e62f8..c892a142994 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,16 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can | Accelerator | Installation | |:-----------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------| -| [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade-strategy eager optimum[onnxruntime]` | -| [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade-strategy eager optimum[neural-compressor]` | -| [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade-strategy eager optimum[openvino,nncf]` | -| [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` | -| [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade-strategy eager optimum[amd]` | -| [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade-strategy eager optimum[neuronx]` | -| [Habana Gaudi Processor (HPU)](https://huggingface.co/docs/optimum/habana/index) | `pip install --upgrade-strategy eager optimum[habana]` | -| [FuriosaAI](https://huggingface.co/docs/optimum/furiosa/index) | `pip install --upgrade-strategy eager optimum[furiosa]` | +| [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade --upgrade-strategy eager optimum[onnxruntime]` | +| [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[neural-compressor]`| +| [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[openvino]` | +| [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` | +| [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade --upgrade-strategy eager optimum[amd]` | +| [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade --upgrade-strategy eager optimum[neuronx]` | +| [Habana Gaudi Processor (HPU)](https://huggingface.co/docs/optimum/habana/index) | `pip install --upgrade --upgrade-strategy eager optimum[habana]` | +| [FuriosaAI](https://huggingface.co/docs/optimum/furiosa/index) | `pip install --upgrade --upgrade-strategy eager optimum[furiosa]` | -The `--upgrade-strategy eager` option is needed to ensure the different packages are upgraded to the latest possible version. 
+The `--upgrade --upgrade-strategy eager` option is needed to ensure the different packages are upgraded to the latest possible version. To install from source: @@ -70,7 +70,7 @@ The [export](https://huggingface.co/docs/optimum/exporters/overview) and optimiz Before you begin, make sure you have all the necessary libraries installed : ```bash -pip install --upgrade-strategy eager optimum[openvino,nncf] +pip install --upgrade --upgrade-strategy eager optimum[openvino] ``` It is possible to export 🤗 Transformers and Diffusers models to the OpenVINO format easily: @@ -79,7 +79,8 @@ It is possible to export 🤗 Transformers and Diffusers models to the OpenVINO optimum-cli export openvino --model distilbert-base-uncased-finetuned-sst-2-english distilbert_sst2_ov ``` -If you add `--int8`, the weights will be quantized to INT8. Static quantization can also be applied on the activations using [NNCF](https://github.com/openvinotoolkit/nncf), more information can be found in the [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov). +If you add `--weight-format int8`, the weights will be quantized to `int8`, check out our [documentation](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#weight-only-quantization) for more detail on weight only quantization. To apply quantization on both weights and activations, you can find more information [here](https://huggingface.co/docs/optimum/main/en/intel/optimization_ov#static-quantization). + To load a model and run inference with OpenVINO Runtime, you can just replace your `AutoModelForXxx` class with the corresponding `OVModelForXxx` class. To load a PyTorch checkpoint and convert it to the OpenVINO format on-the-fly, you can set `export=True` when loading your model. @@ -104,7 +105,7 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op Before you begin, make sure you have all the necessary libraries installed : ```bash -pip install --upgrade-strategy eager optimum[neural-compressor] +pip install --upgrade --upgrade-strategy eager optimum[neural-compressor] ``` Dynamic quantization can be applied on your model: @@ -202,7 +203,7 @@ We support many providers: Before you begin, make sure you have all the necessary libraries installed : ```bash -pip install --upgrade-strategy eager optimum[habana] +pip install --upgrade --upgrade-strategy eager optimum[habana] ``` ```diff diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx index 09b8632c72d..c08b3f92e5c 100644 --- a/docs/source/installation.mdx +++ b/docs/source/installation.mdx @@ -20,18 +20,18 @@ python -m pip install optimum If you'd like to use the accelerator-specific features of 🤗 Optimum, you can install the required dependencies according to the table below: -| Accelerator | Installation | -|:-----------------------------------------------------------------------------------------------------------------------|:-------------------------------------------------------------------| -| [ONNX runtime](https://onnxruntime.ai/docs/) | `pip install --upgrade-strategy eager install optimum[onnxruntime]`| -| [Intel Neural Compressor (INC)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html) | `pip install --upgrade-strategy eager optimum[neural-compressor]` | -| [Intel OpenVINO](https://docs.openvino.ai/latest/index.html) | `pip install --upgrade-strategy eager optimum[openvino,nncf]` | -| [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker 
run -it --gpus all --ipc host huggingface/optimum-nvidia` | -| [AMD Instinct GPUs and Ryzen AI NPU](https://www.amd.com/en/graphics/instinct-server-accelerators) | `pip install --upgrade-strategy eager optimum[amd]` | -| [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade-strategy eager optimum[neuronx]` | -| [Habana Gaudi Processor (HPU)](https://habana.ai/training/) | `pip install --upgrade-strategy eager optimum[habana]` | -| [FuriosaAI](https://www.furiosa.ai/) | `pip install --upgrade-strategy eager optimum[furiosa]` | - -The `--upgrade-strategy eager` option is needed to ensure the different packages are upgraded to the latest possible version. +| Accelerator | Installation | +|:-----------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------| +| [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade --upgrade-strategy eager optimum[onnxruntime]` | +| [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[neural-compressor]` | +| [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[openvino]` | +| [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` | +| [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade --upgrade-strategy eager optimum[amd]` | +| [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade --upgrade-strategy eager optimum[neuronx]` | +| [Habana Gaudi Processor (HPU)](https://huggingface.co/docs/optimum/habana/index) | `pip install --upgrade --upgrade-strategy eager optimum[habana]` | +| [FuriosaAI](https://huggingface.co/docs/optimum/furiosa/index) | `pip install --upgrade --upgrade-strategy eager optimum[furiosa]` | + +The `--upgrade --upgrade-strategy eager` option is needed to ensure the different packages are upgraded to the latest possible version. 
If you'd like to play with the examples or need the bleeding edge of the code and can't wait for a new release, you can install the base library from source as follows: From 8c017dabececbd76afadef3e7039342d95a897f0 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Wed, 10 Apr 2024 15:49:27 +0200 Subject: [PATCH 12/35] Fix offline compatibility (#1805) * fix offline compatibility * fix working dir --- .github/workflows/test_offline.yml | 43 +++++++++++++++++++++++ optimum/commands/export/onnx.py | 6 +++- optimum/exporters/onnx/__main__.py | 3 +- optimum/exporters/tasks.py | 35 +++++++++++------- optimum/modeling_base.py | 11 +++--- optimum/onnxruntime/modeling_decoder.py | 5 +-- optimum/onnxruntime/modeling_diffusion.py | 5 +-- optimum/onnxruntime/modeling_ort.py | 22 +++++++----- optimum/onnxruntime/modeling_seq2seq.py | 5 +-- 9 files changed, 102 insertions(+), 33 deletions(-) create mode 100644 .github/workflows/test_offline.yml diff --git a/.github/workflows/test_offline.yml b/.github/workflows/test_offline.yml new file mode 100644 index 00000000000..ca90730b6bc --- /dev/null +++ b/.github/workflows/test_offline.yml @@ -0,0 +1,43 @@ +name: Offline usage / Python - Test + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + strategy: + fail-fast: false + matrix: + python-version: [3.9] + os: [ubuntu-20.04] + + runs-on: ${{ matrix.os }} + steps: + - uses: actions/checkout@v2 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies for pytorch export + run: | + pip install .[tests,exporters,onnxruntime] + - name: Test with unittest + run: | + HF_HOME=/tmp/ huggingface-cli download hf-internal-testing/tiny-random-gpt2 + + HF_HOME=/tmp/ HF_HUB_OFFLINE=1 optimum-cli export onnx --model hf-internal-testing/tiny-random-gpt2 gpt2_onnx --task text-generation + + huggingface-cli download hf-internal-testing/tiny-random-gpt2 + + HF_HUB_OFFLINE=1 optimum-cli export onnx --model hf-internal-testing/tiny-random-gpt2 gpt2_onnx --task text-generation + + pytest tests/onnxruntime/test_modeling.py -k "test_load_model_from_hub and not from_hub_onnx" -s -vvvvv + + HF_HUB_OFFLINE=1 pytest tests/onnxruntime/test_modeling.py -k "test_load_model_from_hub and not from_hub_onnx" -s -vvvvv \ No newline at end of file diff --git a/optimum/commands/export/onnx.py b/optimum/commands/export/onnx.py index b2772c85e75..bddd60da998 100644 --- a/optimum/commands/export/onnx.py +++ b/optimum/commands/export/onnx.py @@ -18,6 +18,8 @@ from pathlib import Path from typing import TYPE_CHECKING +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE + from ...exporters import TasksManager from ...utils import DEFAULT_DUMMY_SHAPES from ..base import BaseOptimumCLICommand @@ -122,7 +124,9 @@ def parse_args_onnx(parser): default=None, help="If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol for the model will be used.", ) - optional_group.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.") + optional_group.add_argument( + "--cache_dir", type=str, default=HUGGINGFACE_HUB_CACHE, help="Path indicating where to store cache." 
+ ) optional_group.add_argument( "--trust-remote-code", action="store_true", diff --git a/optimum/exporters/onnx/__main__.py b/optimum/exporters/onnx/__main__.py index 33a5220e3be..585a779c2e5 100644 --- a/optimum/exporters/onnx/__main__.py +++ b/optimum/exporters/onnx/__main__.py @@ -17,6 +17,7 @@ import argparse from pathlib import Path +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from packaging import version from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoConfig, AutoTokenizer @@ -57,7 +58,7 @@ def main_export( no_post_process: bool = False, framework: Optional[str] = None, atol: Optional[float] = None, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, trust_remote_code: bool = False, pad_token_id: Optional[int] = None, subfolder: str = "", diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index d18a9ebb1ca..ca71dca92a9 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -23,6 +23,7 @@ from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Set, Tuple, Type, Union import huggingface_hub +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from packaging import version from requests.exceptions import ConnectionError as RequestsConnectionError from transformers import AutoConfig, PretrainedConfig, is_tf_available, is_torch_available @@ -1377,8 +1378,9 @@ def get_model_class_for_task( def get_model_files( model_name_or_path: Union[str, Path], subfolder: str = "", - cache_dir: str = huggingface_hub.constants.HUGGINGFACE_HUB_CACHE, + cache_dir: str = HUGGINGFACE_HUB_CACHE, use_auth_token: Optional[str] = None, + revision: Optional[str] = None, ): request_exception = None full_model_path = Path(model_name_or_path) / subfolder @@ -1393,21 +1395,25 @@ def get_model_files( if not isinstance(model_name_or_path, str): model_name_or_path = str(model_name_or_path) all_files = huggingface_hub.list_repo_files( - model_name_or_path, repo_type="model", token=use_auth_token + model_name_or_path, + repo_type="model", + token=use_auth_token, + revision=revision, ) if subfolder != "": all_files = [file[len(subfolder) + 1 :] for file in all_files if file.startswith(subfolder)] - except RequestsConnectionError as e: # Hub not accessible + except (RequestsConnectionError, huggingface_hub.utils._http.OfflineModeIsEnabled) as e: request_exception = e object_id = model_name_or_path.replace("/", "--") full_model_path = Path(cache_dir, f"models--{object_id}") if full_model_path.is_dir(): # explore the cache first # Resolve refs (for instance to convert main to the associated commit sha) - revision_file = Path(full_model_path, "refs", "main") - revision = "" - if revision_file.is_file(): - with open(revision_file) as f: - revision = f.read() + if revision is None: + revision_file = Path(full_model_path, "refs", "main") + revision = "" + if revision_file.is_file(): + with open(revision_file) as f: + revision = f.read() cached_path = Path(full_model_path, "snapshots", revision, subfolder) all_files = [ os.path.relpath(os.path.join(dirpath, file), cached_path) @@ -1422,7 +1428,7 @@ def determine_framework( model_name_or_path: Union[str, Path], subfolder: str = "", framework: Optional[str] = None, - cache_dir: str = huggingface_hub.constants.HUGGINGFACE_HUB_CACHE, + cache_dir: str = HUGGINGFACE_HUB_CACHE, ) -> str: """ Determines the framework to use for the export. 
@@ -1568,7 +1574,12 @@ def _infer_task_from_model_name_or_path( raise RuntimeError( "Cannot infer the task from a model repo with a subfolder yet, please specify the task manually." ) - model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) + try: + model_info = huggingface_hub.model_info(model_name_or_path, revision=revision) + except (RequestsConnectionError, huggingface_hub.utils._http.OfflineModeIsEnabled): + raise RuntimeError( + f"Hugging Face Hub is not reachable and we cannot infer the task from a cached model. Make sure you are not offline, or otherwise please specify the `task` (or `--task` in command-line) argument ({', '.join(TasksManager.get_all_tasks())})." + ) library_name = TasksManager.infer_library_from_model(model_name_or_path, subfolder, revision) if library_name == "diffusers": @@ -1680,7 +1691,7 @@ def infer_library_from_model( model_name_or_path: Union[str, Path], subfolder: str = "", revision: Optional[str] = None, - cache_dir: str = huggingface_hub.constants.HUGGINGFACE_HUB_CACHE, + cache_dir: str = HUGGINGFACE_HUB_CACHE, library_name: Optional[str] = None, use_auth_token: Optional[str] = None, ): @@ -1827,7 +1838,7 @@ def get_model_from_task( subfolder: str = "", revision: Optional[str] = None, framework: Optional[str] = None, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, torch_dtype: Optional["torch.dtype"] = None, device: Optional[Union["torch.device", str]] = None, library_name: str = None, diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index e7254276c24..9523f5c5042 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -22,6 +22,7 @@ from typing import TYPE_CHECKING, Optional, Union from huggingface_hub import HfApi, HfFolder +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from transformers import AutoConfig, PretrainedConfig, add_start_docstrings from .exporters import TasksManager @@ -220,7 +221,7 @@ def _load_config( cls, config_name_or_path: Union[str, os.PathLike], revision: Optional[str] = None, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, use_auth_token: Optional[Union[bool, str]] = False, force_download: bool = False, subfolder: str = "", @@ -262,7 +263,7 @@ def _from_pretrained( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, **kwargs, @@ -278,7 +279,7 @@ def _from_transformers( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, trust_remote_code: bool = False, @@ -298,7 +299,7 @@ def _export( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, trust_remote_code: bool = False, @@ -317,7 +318,7 @@ def from_pretrained( export: bool = False, force_download: bool = False, use_auth_token: Optional[str] = None, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", config: Optional[PretrainedConfig] = None, local_files_only: bool = False, diff --git a/optimum/onnxruntime/modeling_decoder.py 
b/optimum/onnxruntime/modeling_decoder.py index 366146e2a4a..455236126b6 100644 --- a/optimum/onnxruntime/modeling_decoder.py +++ b/optimum/onnxruntime/modeling_decoder.py @@ -21,6 +21,7 @@ import numpy as np import onnx import torch +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from onnx.tools import update_model_dims from transformers import AutoModelForCausalLM, GenerationConfig from transformers.file_utils import add_end_docstrings, add_start_docstrings_to_model_forward @@ -407,7 +408,7 @@ def _from_pretrained( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, file_name: Optional[str] = None, subfolder: str = "", use_cache: bool = True, @@ -577,7 +578,7 @@ def _from_transformers( use_auth_token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, trust_remote_code: bool = False, diff --git a/optimum/onnxruntime/modeling_diffusion.py b/optimum/onnxruntime/modeling_diffusion.py index ce51d6a8fcb..63360ce80a8 100644 --- a/optimum/onnxruntime/modeling_diffusion.py +++ b/optimum/onnxruntime/modeling_diffusion.py @@ -33,6 +33,7 @@ from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME from diffusers.utils import CONFIG_NAME, is_invisible_watermark_available from huggingface_hub import snapshot_download +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from transformers import CLIPFeatureExtractor, CLIPTokenizer from transformers.file_utils import add_end_docstrings @@ -272,7 +273,7 @@ def _from_pretrained( config: Dict[str, Any], use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, vae_decoder_file_name: str = ONNX_WEIGHTS_NAME, text_encoder_file_name: str = ONNX_WEIGHTS_NAME, unet_file_name: str = ONNX_WEIGHTS_NAME, @@ -377,7 +378,7 @@ def _from_transformers( use_auth_token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, trust_remote_code: bool = False, diff --git a/optimum/onnxruntime/modeling_ort.py b/optimum/onnxruntime/modeling_ort.py index 82da2aab656..eb38a7fef12 100644 --- a/optimum/onnxruntime/modeling_ort.py +++ b/optimum/onnxruntime/modeling_ort.py @@ -22,7 +22,8 @@ import numpy as np import torch -from huggingface_hub import HfApi, HfFolder, hf_hub_download +from huggingface_hub import HfFolder, hf_hub_download +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from huggingface_hub.utils import EntryNotFoundError from transformers import ( AutoConfig, @@ -449,7 +450,7 @@ def _from_pretrained( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, file_name: Optional[str] = None, subfolder: str = "", local_files_only: bool = False, @@ -471,7 +472,12 @@ def _from_pretrained( token = HfFolder().get_token() else: token = use_auth_token - repo_files = map(Path, HfApi().list_repo_files(model_id, revision=revision, token=token)) + + repo_files, _ = TasksManager.get_model_files( + model_id, revision=revision, cache_dir=cache_dir, 
use_auth_token=token + ) + repo_files = map(Path, repo_files) + pattern = "*.onnx" if subfolder == "" else f"{subfolder}/*.onnx" onnx_files = [p for p in repo_files if p.match(pattern)] @@ -531,7 +537,7 @@ def _from_transformers( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, trust_remote_code: bool = False, @@ -567,7 +573,7 @@ def _export( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, trust_remote_code: bool = False, @@ -619,7 +625,7 @@ def from_pretrained( export: bool = False, force_download: bool = False, use_auth_token: Optional[str] = None, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", config: Optional["PretrainedConfig"] = None, local_files_only: bool = False, @@ -852,7 +858,7 @@ def _cached_file( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, file_name: Optional[str] = None, subfolder: str = "", local_files_only: bool = False, @@ -1017,7 +1023,7 @@ def _export( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, trust_remote_code: bool = False, diff --git a/optimum/onnxruntime/modeling_seq2seq.py b/optimum/onnxruntime/modeling_seq2seq.py index 69bb718f361..2da4b4c8c45 100644 --- a/optimum/onnxruntime/modeling_seq2seq.py +++ b/optimum/onnxruntime/modeling_seq2seq.py @@ -28,6 +28,7 @@ import numpy as np import torch from huggingface_hub import hf_hub_download +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from transformers import ( AutoModelForSeq2SeqLM, AutoModelForSpeechSeq2Seq, @@ -778,7 +779,7 @@ def _from_pretrained( use_auth_token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, force_download: bool = False, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, encoder_file_name: str = ONNX_ENCODER_NAME, decoder_file_name: str = ONNX_DECODER_NAME, decoder_with_past_file_name: str = ONNX_DECODER_WITH_PAST_NAME, @@ -1023,7 +1024,7 @@ def _from_transformers( use_auth_token: Optional[Union[bool, str]] = None, revision: str = "main", force_download: bool = True, - cache_dir: Optional[str] = None, + cache_dir: str = HUGGINGFACE_HUB_CACHE, subfolder: str = "", local_files_only: bool = False, trust_remote_code: bool = False, From 4936662f6a22d459d84b04649f4dfbe1e638aae2 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Wed, 10 Apr 2024 18:12:01 +0200 Subject: [PATCH 13/35] Remove unnecessary constants for > 2GB ONNX models (#1808) * remove some more unnecessary constants for > 2GB ONNX models * remove typo --- optimum/exporters/onnx/convert.py | 7 ++++++- optimum/onnx/utils.py | 13 +++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py index 053a7a5aebe..f1122e43626 100644 --- a/optimum/exporters/onnx/convert.py +++ 
b/optimum/exporters/onnx/convert.py @@ -29,7 +29,7 @@ from transformers.modeling_utils import get_parameter_dtype from transformers.utils import is_tf_available, is_torch_available -from ...onnx.utils import _get_onnx_external_data_tensors, check_model_uses_external_data +from ...onnx.utils import _get_onnx_external_constants, _get_onnx_external_data_tensors, check_model_uses_external_data from ...utils import ( DEFAULT_DUMMY_SHAPES, ONNX_WEIGHTS_NAME, @@ -592,6 +592,7 @@ def remap(value): if model_uses_external_data or FORCE_ONNX_EXTERNAL_DATA: tensors_paths = _get_onnx_external_data_tensors(onnx_model) + constant_paths = _get_onnx_external_constants(onnx_model) logger.info("Saving external data to one file...") # try free model memory @@ -618,6 +619,10 @@ def remap(value): for tensor in tensors_paths: os.remove(output.parent / tensor) + for tensor in constant_paths: + if os.path.isfile(output.parent / tensor): + os.remove(output.parent / tensor) + return input_names, output_names diff --git a/optimum/onnx/utils.py b/optimum/onnx/utils.py index 3eca9a86102..b52c4f4cdac 100644 --- a/optimum/onnx/utils.py +++ b/optimum/onnx/utils.py @@ -19,6 +19,19 @@ from onnx.external_data_helper import ExternalDataInfo, _get_initializer_tensors +def _get_onnx_external_constants(model: onnx.ModelProto) -> List[str]: + external_constants = [] + + for node in model.graph.node: + if node.op_type == "Constant": + for attribute in node.attribute: + external_datas = attribute.t.external_data + for external_data in external_datas: + external_constants.append(external_data.value) + + return external_constants + + def _get_onnx_external_data_tensors(model: onnx.ModelProto) -> List[str]: """ Gets the paths of the external data tensors in the model. From 0b52e3a39f652f50894e3a918b931f6ae4557e0d Mon Sep 17 00:00:00 2001 From: Naor Matania Date: Mon, 15 Apr 2024 12:01:17 +0300 Subject: [PATCH 14/35] Add onnx export function for pix2struct model (#1815) Add onnx export for pix2struct --- optimum/onnxruntime/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/optimum/onnxruntime/utils.py b/optimum/onnxruntime/utils.py index 6da38c7ea7a..0e1da447a64 100644 --- a/optimum/onnxruntime/utils.py +++ b/optimum/onnxruntime/utils.py @@ -132,6 +132,7 @@ class ORTConfigManager: "vit": "vit", "whisper": "bart", "xlm-roberta": "bert", + "pix2struct": "vit", } @classmethod From 56aabbebd0ce532f82f566a2a946769cee3bb36b Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Tue, 16 Apr 2024 16:21:13 +0200 Subject: [PATCH 15/35] Update dev version to 1.20.0.dev0 (#1817) update version --- optimum/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/version.py b/optimum/version.py index ab0bfa597b8..b71e4d4a8c3 100644 --- a/optimum/version.py +++ b/optimum/version.py @@ -12,4 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "1.19.0.dev0" +__version__ = "1.20.0.dev0" From f2a2a253d84b107e3194525d97230c6209974839 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 24 Apr 2024 10:51:07 +0200 Subject: [PATCH 16/35] Bump transformers version (#1824) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7b555bbd887..7fac0a1c9d5 100644 --- a/setup.py +++ b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.26.0,<4.40.0", + "transformers[sentencepiece]>=4.26.0,<4.41.0", "torch>=1.11", "packaging", "numpy", From 5b90bd7c4f56f20f99bc32d2434b2c916bf13921 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 24 Apr 2024 11:34:18 +0200 Subject: [PATCH 17/35] Remove call to `apt update` before `apt purge` in the main doc build workflow (#1830) --- .github/workflows/build_main_documentation.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index f25ee611f6f..e2a90843aa3 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -57,7 +57,6 @@ jobs: - name: Free disk space run: | df -h - sudo apt-get update sudo apt-get purge -y '^apache.*' sudo apt-get purge -y '^imagemagick.*' sudo apt-get purge -y '^dotnet.*' From 8180375d293a8c7f18d7a2b80c7521e09c5f78a9 Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Wed, 24 Apr 2024 11:39:17 +0200 Subject: [PATCH 18/35] Update github workflows (#1829) --- .github/workflows/dev_test_bettertransformer.yml | 4 ++-- .github/workflows/dev_test_dummy_inputs.yml | 4 ++-- .github/workflows/dev_test_fx.yml | 4 ++-- .github/workflows/dev_test_onnx.yml | 4 ++-- .github/workflows/dev_test_onnxruntime.yml | 4 ++-- .github/workflows/dev_test_optimum_common.yml | 4 ++-- .github/workflows/test_bettertransformer.yml | 2 +- .github/workflows/test_cli.yml | 2 +- .github/workflows/test_dummy_inputs.yml | 2 +- .github/workflows/test_fx.yml | 2 +- .github/workflows/test_onnx.yml | 2 +- .github/workflows/test_onnxruntime.yml | 2 +- .github/workflows/test_optimum_common.yml | 2 +- 13 files changed, 19 insertions(+), 19 deletions(-) diff --git a/.github/workflows/dev_test_bettertransformer.yml b/.github/workflows/dev_test_bettertransformer.yml index 28add750ca0..e4c999ca6da 100644 --- a/.github/workflows/dev_test_bettertransformer.yml +++ b/.github/workflows/dev_test_bettertransformer.yml @@ -16,7 +16,7 @@ jobs: - 3.8 os: - ubuntu-20.04 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -35,4 +35,4 @@ jobs: - name: Test with unittest working-directory: tests run: | - python -m unittest discover -s bettertransformer -p test_*.py \ No newline at end of file + python -m unittest discover -s bettertransformer -p test_*.py diff --git a/.github/workflows/dev_test_dummy_inputs.yml b/.github/workflows/dev_test_dummy_inputs.yml index f0afb16c57b..49baa49c418 100644 --- a/.github/workflows/dev_test_dummy_inputs.yml +++ b/.github/workflows/dev_test_dummy_inputs.yml @@ -17,7 +17,7 @@ jobs: - 3.9 os: - ubuntu-20.04 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -35,4 +35,4 @@ jobs: - name: Test with unittest working-directory: tests run: | - python -m unittest discover -s utils -p test_*.py \ No newline at end of file + python -m 
unittest discover -s utils -p test_*.py diff --git a/.github/workflows/dev_test_fx.yml b/.github/workflows/dev_test_fx.yml index 7f007dfaafc..0b8633282f7 100644 --- a/.github/workflows/dev_test_fx.yml +++ b/.github/workflows/dev_test_fx.yml @@ -17,7 +17,7 @@ jobs: - 3.9 os: - ubuntu-20.04 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -35,4 +35,4 @@ jobs: - name: Test with unittest working-directory: tests run: | - python -m pytest fx/optimization/test_transformations.py --exitfirst \ No newline at end of file + python -m pytest fx/optimization/test_transformations.py --exitfirst diff --git a/.github/workflows/dev_test_onnx.yml b/.github/workflows/dev_test_onnx.yml index 46b548b7ede..48052cfded3 100644 --- a/.github/workflows/dev_test_onnx.yml +++ b/.github/workflows/dev_test_onnx.yml @@ -17,7 +17,7 @@ jobs: - 3.9 os: - ubuntu-20.04 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -34,4 +34,4 @@ jobs: - name: Test with unittest working-directory: tests run: | - python -m unittest discover -s onnx -p test_*.py \ No newline at end of file + python -m unittest discover -s onnx -p test_*.py diff --git a/.github/workflows/dev_test_onnxruntime.yml b/.github/workflows/dev_test_onnxruntime.yml index 9d097ad7255..857028ab2db 100644 --- a/.github/workflows/dev_test_onnxruntime.yml +++ b/.github/workflows/dev_test_onnxruntime.yml @@ -18,7 +18,7 @@ jobs: os: - ubuntu-20.04 - windows-2019 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -36,4 +36,4 @@ jobs: working-directory: tests run: | python -m pytest -n auto -m "not run_in_series" onnxruntime - python -m pytest -m "run_in_series" onnxruntime \ No newline at end of file + python -m pytest -m "run_in_series" onnxruntime diff --git a/.github/workflows/dev_test_optimum_common.yml b/.github/workflows/dev_test_optimum_common.yml index bec011b5246..807ed0b1dab 100644 --- a/.github/workflows/dev_test_optimum_common.yml +++ b/.github/workflows/dev_test_optimum_common.yml @@ -19,7 +19,7 @@ jobs: os: - ubuntu-20.04 - windows-2019 - - macos-latest + - macos-13 runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v2 @@ -42,4 +42,4 @@ jobs: as the staging tests cannot run in parallel. 
export HUGGINGFACE_CO_STAGING=${{ matrix.python-version == 3.8 && matrix.os == ubuntu-20.04 }} - python -m unittest discover -s tests -p test_*.py \ No newline at end of file + python -m unittest discover -s tests -p test_*.py diff --git a/.github/workflows/test_bettertransformer.yml b/.github/workflows/test_bettertransformer.yml index dc57605ba7c..6607466dc22 100644 --- a/.github/workflows/test_bettertransformer.yml +++ b/.github/workflows/test_bettertransformer.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-latest] + os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml index 665982fc4a6..7eae0186076 100644 --- a/.github/workflows/test_cli.yml +++ b/.github/workflows/test_cli.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-latest] + os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_dummy_inputs.yml b/.github/workflows/test_dummy_inputs.yml index a0c7e448ca7..60ca033843c 100644 --- a/.github/workflows/test_dummy_inputs.yml +++ b/.github/workflows/test_dummy_inputs.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-latest] + os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml index edede4ad68b..2535f1b154d 100644 --- a/.github/workflows/test_fx.yml +++ b/.github/workflows/test_fx.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-latest] + os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_onnx.yml b/.github/workflows/test_onnx.yml index 7cac9e4e829..5a21f12d015 100644 --- a/.github/workflows/test_onnx.yml +++ b/.github/workflows/test_onnx.yml @@ -16,7 +16,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, macos-latest] + os: [ubuntu-20.04, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_onnxruntime.yml b/.github/workflows/test_onnxruntime.yml index 30d45d4d5ea..f173cc6c6bd 100644 --- a/.github/workflows/test_onnxruntime.yml +++ b/.github/workflows/test_onnxruntime.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, windows-2019, macos-latest] + os: [ubuntu-20.04, windows-2019, macos-13] runs-on: ${{ matrix.os }} steps: diff --git a/.github/workflows/test_optimum_common.yml b/.github/workflows/test_optimum_common.yml index bd616e19585..ded149c9b69 100644 --- a/.github/workflows/test_optimum_common.yml +++ b/.github/workflows/test_optimum_common.yml @@ -18,7 +18,7 @@ jobs: fail-fast: false matrix: python-version: [3.8, 3.9] - os: [ubuntu-20.04, windows-2019, macos-latest] + os: [ubuntu-20.04, windows-2019, macos-13] runs-on: ${{ matrix.os }} steps: From a7296b45de0ea4b7e115543eb04043c729ea0ef3 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 24 Apr 2024 12:08:19 +0200 Subject: [PATCH 19/35] Remove bad PPA in main doc build workflow (#1831) --- .github/workflows/build_main_documentation.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index e2a90843aa3..f1f4a23262a 100644 --- a/.github/workflows/build_main_documentation.yml +++ 
b/.github/workflows/build_main_documentation.yml @@ -137,6 +137,8 @@ jobs: run: | cd optimum-furiosa pip install . + sudo apt install software-properties-common + sudo add-apt-repository --remove https://packages.microsoft.com/ubuntu/22.04/prod sudo apt update sudo apt install -y ca-certificates apt-transport-https gnupg sudo apt-key adv --keyserver keyserver.ubuntu.com --recv-key 5F03AFA423A751913F249259814F888B20B09A7E From 3b5c486821628d1f1e4c34a59d097e5a5f4e1803 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Thu, 25 Apr 2024 12:13:10 +0200 Subject: [PATCH 20/35] Fix TPU doc build (#1834) --- .github/workflows/build_main_documentation.yml | 2 +- .github/workflows/build_pr_documentation.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_main_documentation.yml b/.github/workflows/build_main_documentation.yml index f1f4a23262a..20face917ab 100644 --- a/.github/workflows/build_main_documentation.yml +++ b/.github/workflows/build_main_documentation.yml @@ -161,7 +161,7 @@ jobs: sudo docker system prune -a -f cd optimum-tpu pip install -U pip - pip install . + pip install . -f https://storage.googleapis.com/libtpu-releases/index.html doc-builder build optimum.tpu docs/source/ --build_dir tpu-doc-build --version pr_$PR_NUMBER --version_tag_suffix "" --html --clean mv tpu-doc-build ../optimum cd .. diff --git a/.github/workflows/build_pr_documentation.yml b/.github/workflows/build_pr_documentation.yml index c1fc4d859ce..e5f2dcb0d18 100644 --- a/.github/workflows/build_pr_documentation.yml +++ b/.github/workflows/build_pr_documentation.yml @@ -101,7 +101,7 @@ jobs: sudo docker system prune -a -f cd optimum-tpu pip install -U pip - pip install . + pip install . -f https://storage.googleapis.com/libtpu-releases/index.html doc-builder build optimum.tpu docs/source/ --build_dir tpu-doc-build --version pr_$PR_NUMBER --version_tag_suffix "" --html --clean mv tpu-doc-build ../optimum cd .. 
From c55f8824f58db1a2f1cfc7879451b4743b8f206b Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 25 Apr 2024 15:15:34 +0200 Subject: [PATCH 21/35] Fix infer library for sentence transformers models (#1832) --- optimum/exporters/tasks.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index ca71dca92a9..4ec641d6c14 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -1671,7 +1671,10 @@ def _infer_library_from_model( if library_name is not None: return library_name - if ( + # SentenceTransformer models have no config attributes + if hasattr(model, "_model_config"): + library_name = "sentence_transformers" + elif ( hasattr(model, "pretrained_cfg") or hasattr(model.config, "pretrained_cfg") or hasattr(model.config, "architecture") @@ -1679,8 +1682,6 @@ def _infer_library_from_model( library_name = "timm" elif hasattr(model.config, "_diffusers_version") or getattr(model, "config_name", "") == "model_index.json": library_name = "diffusers" - elif hasattr(model, "_model_config"): - library_name = "sentence_transformers" else: library_name = "transformers" return library_name @@ -1905,7 +1906,6 @@ def get_model_from_task( model_class = TasksManager.get_model_class_for_task( task, framework, model_type=model_type, model_class_name=model_class_name, library=library_name ) - if library_name == "timm": model = model_class(f"hf_hub:{model_name_or_path}", pretrained=True, exportable=True) model = model.to(torch_dtype).to(device) From e3fd2776a318a3a7b9d33315cc42c04c181f6d2f Mon Sep 17 00:00:00 2001 From: B-201 <116639249+B-201@users.noreply.github.com> Date: Mon, 29 Apr 2024 19:23:04 +0800 Subject: [PATCH 22/35] Fix bug causing random initialization of bias when using GPTQ quantization with models without bias (#1827) * Fix gptq quantization for models without bias * Fix gptq quantization for models without bias --- optimum/gptq/quantizer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/optimum/gptq/quantizer.py b/optimum/gptq/quantizer.py index 289e3256825..2c2c9d7e71a 100644 --- a/optimum/gptq/quantizer.py +++ b/optimum/gptq/quantizer.py @@ -278,19 +278,20 @@ def _replace_by_quant_layers(self, module: nn.Module, names: List[str], name: st elif isinstance(layer, Conv1D): in_features = layer.weight.shape[0] out_features = layer.weight.shape[1] + bias = layer.bias is not None if not (self.desc_act) or self.group_size == -1: new_layer = QuantLinear( self.bits, self.group_size, in_features, out_features, - True, + bias, use_cuda_fp16=self.use_cuda_fp16, weight_dtype=layer.weight.dtype, ) else: new_layer = QuantLinear( - self.bits, self.group_size, in_features, out_features, True, weight_dtype=layer.weight.dtype + self.bits, self.group_size, in_features, out_features, bias, weight_dtype=layer.weight.dtype ) new_layer.device = device setattr(module, attr, new_layer.to(device)) From 189dd25aa4e247538102c3f48fedc3957304267a Mon Sep 17 00:00:00 2001 From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com> Date: Thu, 2 May 2024 18:50:11 +0200 Subject: [PATCH 23/35] Add onnx export for VITS architecture (#1607) * add onnx export for VITS architecture * fix style * set task --- optimum/exporters/onnx/model_configs.py | 19 +++++++++++++++++++ optimum/exporters/tasks.py | 4 ++++ tests/exporters/exporters_utils.py | 1 + 3 files changed, 24 insertions(+) diff --git a/optimum/exporters/onnx/model_configs.py 
b/optimum/exporters/onnx/model_configs.py index 72d047efa01..496957b2b5d 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -1842,6 +1842,25 @@ def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], dire inputs_or_outputs[f"{name}.{i}.encoder.value"] = {2: "encoder_sequence_length_out"} +class VitsOnnxConfig(TextEncoderOnnxConfig): + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + ATOL_FOR_VALIDATION = 1e-4 + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + return { + "input_ids": {0: "text_batch_size", 1: "sequence_length"}, + "attention_mask": {0: "text_batch_size", 1: "sequence_length"}, + } + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + return { + "waveform": {0: "text_batch_size", 1: "n_samples"}, + "spectrogram": {0: "text_batch_size", 2: "num_bins"}, + } + + class Speech2TextDummyAudioInputGenerator(DummyAudioInputGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): shape = [self.batch_size, self.sequence_length, self.normalized_config.input_features_per_channel] diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index 4ec641d6c14..efa782353b4 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -1044,6 +1044,10 @@ class TasksManager: "vit": supported_tasks_mapping( "feature-extraction", "image-classification", "masked-im", onnx="ViTOnnxConfig" ), + "vits": supported_tasks_mapping( + "text-to-audio", + onnx="VitsOnnxConfig", + ), "wavlm": supported_tasks_mapping( "feature-extraction", "automatic-speech-recognition", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index bc1d8a4a289..ab0b8488fb8 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -149,6 +149,7 @@ "t5": "hf-internal-testing/tiny-random-t5", "table-transformer": "hf-internal-testing/tiny-random-TableTransformerModel", "vit": "hf-internal-testing/tiny-random-vit", + "vits": "echarlaix/tiny-random-vits", "yolos": "hf-internal-testing/tiny-random-YolosModel", "whisper": "openai/whisper-tiny.en", # hf-internal-testing ones are broken "hubert": "hf-internal-testing/tiny-random-HubertModel", From db6db6fc6a0690bce501569ab384f1bf10a2c7da Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Thu, 9 May 2024 02:12:55 -0700 Subject: [PATCH 24/35] Add Phi-3 mini to Optimum (#1841) * Load config from folder * Add Phi-3 to normalized config --- optimum/modeling_base.py | 2 +- optimum/utils/normalized_config.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/modeling_base.py b/optimum/modeling_base.py index 9523f5c5042..9663a311692 100644 --- a/optimum/modeling_base.py +++ b/optimum/modeling_base.py @@ -358,7 +358,7 @@ def from_pretrained( if os.path.isdir(os.path.join(model_id, subfolder)) and cls.config_name == CONFIG_NAME: if CONFIG_NAME in os.listdir(os.path.join(model_id, subfolder)): config = AutoConfig.from_pretrained( - os.path.join(model_id, subfolder, CONFIG_NAME), trust_remote_code=trust_remote_code + os.path.join(model_id, subfolder), trust_remote_code=trust_remote_code ) elif CONFIG_NAME in os.listdir(model_id): config = AutoConfig.from_pretrained( diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py index 8a5ef377854..a894001d359 100644 --- a/optimum/utils/normalized_config.py +++ b/optimum/utils/normalized_config.py @@ 
-252,6 +252,7 @@ class NormalizedConfigManager: "pegasus": BartLikeNormalizedTextConfig, "pix2struct": Pix2StructNormalizedTextConfig, "phi": NormalizedTextConfig, + "phi3": NormalizedTextConfigWithGQA, "poolformer": NormalizedVisionConfig, "regnet": NormalizedVisionConfig, "resnet": NormalizedVisionConfig, From b3ecb6c405b7fd5425d79483fd7dc88c0609be8e Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Thu, 9 May 2024 12:58:50 +0200 Subject: [PATCH 25/35] Update the Transformers dependency in the Habana extra (#1851) --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7fac0a1c9d5..4e154819bc2 100644 --- a/setup.py +++ b/setup.py @@ -80,7 +80,7 @@ "nncf": "optimum-intel[nncf]>=1.15.0", "neural-compressor": "optimum-intel[neural-compressor]>=1.15.0", "graphcore": "optimum-graphcore", - "habana": ["optimum-habana", "transformers >= 4.37.0, < 4.38.0"], + "habana": ["optimum-habana", "transformers >= 4.38.0, < 4.39.0"], "neuron": ["optimum-neuron[neuron]>=0.0.20", "transformers == 4.36.2"], "neuronx": ["optimum-neuron[neuronx]>=0.0.20", "transformers == 4.36.2"], "furiosa": "optimum-furiosa", From 02c6ed5f413384d543bcf83a3a9094be2c0429a5 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Wed, 15 May 2024 19:06:40 +0400 Subject: [PATCH 26/35] Make stable diffusion unet and vae number of channels static (#1840) --- optimum/exporters/onnx/model_configs.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index 496957b2b5d..d4c4ac934b9 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -981,7 +981,7 @@ class UNetOnnxConfig(VisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = { - "sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, + "sample": {0: "batch_size", 2: "height", 3: "width"}, "timestep": {0: "steps"}, "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, } @@ -998,7 +998,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: @property def outputs(self) -> Dict[str, Dict[int, str]]: return { - "out_sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, + "out_sample": {0: "batch_size", 2: "height", 3: "width"}, } @property @@ -1045,13 +1045,13 @@ class VaeEncoderOnnxConfig(VisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: return { - "sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, + "sample": {0: "batch_size", 2: "height", 3: "width"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { - "latent_sample": {0: "batch_size", 1: "num_channels_latent", 2: "height_latent", 3: "width_latent"}, + "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, } @@ -1069,13 +1069,13 @@ class VaeDecoderOnnxConfig(VisionOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: return { - "latent_sample": {0: "batch_size", 1: "num_channels_latent", 2: "height_latent", 3: "width_latent"}, + "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { - "sample": {0: "batch_size", 1: "num_channels", 2: "height", 3: "width"}, + "sample": {0: "batch_size", 2: "height", 3: "width"}, } From e0f58121140ce4baa01919ad70a6c13e936f7605 Mon Sep 17 00:00:00 2001 From: Jingya HUANG 
<44135271+JingyaHuang@users.noreply.github.com>
Date: Tue, 21 May 2024 19:13:36 +0200
Subject: [PATCH 27/35] Expand support (#1864)

add
---
 optimum/utils/normalized_config.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py
index a894001d359..682f70e3ca3 100644
--- a/optimum/utils/normalized_config.py
+++ b/optimum/utils/normalized_config.py
@@ -253,6 +253,7 @@ class NormalizedConfigManager:
         "pix2struct": Pix2StructNormalizedTextConfig,
         "phi": NormalizedTextConfig,
         "phi3": NormalizedTextConfigWithGQA,
+        "phi3small": NormalizedTextConfigWithGQA,
         "poolformer": NormalizedVisionConfig,
         "regnet": NormalizedVisionConfig,
         "resnet": NormalizedVisionConfig,

From cc9889b78ae00b474478c3933f730b56e68d7dbd Mon Sep 17 00:00:00 2001
From: Ella Charlaix <80481427+echarlaix@users.noreply.github.com>
Date: Thu, 23 May 2024 15:58:55 +0200
Subject: [PATCH 28/35] Fix compatibility with transformers v4.41.0 for ONNX (#1860)

* bump transformers

* update default onnx opset

* style

* save export for model with invalid generation config

* set minimum onnx opset

* update setup
---
 optimum/exporters/onnx/convert.py       |  7 ++-
 optimum/exporters/onnx/model_configs.py | 66 +++++++++++++++----------
 2 files changed, 45 insertions(+), 28 deletions(-)

diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py
index f1122e43626..4d5a2afc374 100644
--- a/optimum/exporters/onnx/convert.py
+++ b/optimum/exporters/onnx/convert.py
@@ -1123,7 +1123,12 @@ def onnx_export_from_model(
         model.config.save_pretrained(output)
         generation_config = getattr(model, "generation_config", None)
         if generation_config is not None:
-            generation_config.save_pretrained(output)
+            # since v4.41.0 an exception will be raised when saving a generation config considered invalid
+            # https://github.com/huggingface/transformers/blob/v4.41.0/src/transformers/generation/configuration_utils.py#L697
+            try:
+                generation_config.save_pretrained(output)
+            except Exception as exception:
+                logger.warning(f"The generation config is invalid and will not be saved : {exception}")

     model_name_or_path = model.config._name_or_path
     maybe_save_preprocessors(model_name_or_path, output)

diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py
index d4c4ac934b9..eb0ac9a4988 100644
--- a/optimum/exporters/onnx/model_configs.py
+++ b/optimum/exporters/onnx/model_configs.py
@@ -99,6 +99,7 @@ class BertOnnxConfig(TextEncoderOnnxConfig):
     NORMALIZED_CONFIG_CLASS = NormalizedTextConfig
     ATOL_FOR_VALIDATION = 1e-4
+    DEFAULT_ONNX_OPSET = 14  # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
@property def inputs(self) -> Dict[str, Dict[int, str]]: @@ -114,42 +115,44 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class AlbertOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class ConvBertOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class ElectraOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class RoFormerOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class SqueezeBertOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class MobileBertOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class NystromformerOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class XLMOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class SplinterOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class DistilBertOnnxConfig(BertOnnxConfig): + DEFAULT_ONNX_OPSET = 11 + @property def inputs(self) -> Dict[str, Dict[int, str]]: if self.task == "multiple-choice": @@ -172,7 +175,7 @@ class CamembertOnnxConfig(DistilBertOnnxConfig): class FlaubertOnnxConfig(BertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class IBertOnnxConfig(DistilBertOnnxConfig): @@ -195,6 +198,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: class MarkupLMOnnxConfig(BertOnnxConfig): + DEFAULT_ONNX_OPSET = 11 DUMMY_INPUT_GENERATOR_CLASSES = ( DummyTextInputGenerator, DummyXPathSeqInputGenerator, @@ -706,6 +710,7 @@ class MarianOnnxConfig(BartOnnxConfig): class ViTOnnxConfig(VisionOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig MIN_TORCH_VERSION = version.parse("1.11") + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. @property def inputs(self) -> Dict[str, Dict[int, str]]: @@ -725,36 +730,38 @@ class CvTOnnxConfig(ViTOnnxConfig): class LevitOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class DeiTOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class BeitOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class ConvNextOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class ConvNextV2OnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class MobileViTOnnxConfig(ViTOnnxConfig): ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 11 class RegNetOnnxConfig(ViTOnnxConfig): # This config has the same inputs as ViTOnnxConfig - pass + DEFAULT_ONNX_OPSET = 11 class ResNetOnnxConfig(ViTOnnxConfig): ATOL_FOR_VALIDATION = 1e-3 + DEFAULT_ONNX_OPSET = 11 class DetrOnnxConfig(ViTOnnxConfig): @@ -776,11 +783,11 @@ class TableTransformerOnnxConfig(DetrOnnxConfig): class YolosOnnxConfig(ViTOnnxConfig): - DEFAULT_ONNX_OPSET = 12 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
class SwinOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class Swin2srOnnxConfig(SwinOnnxConfig): @@ -788,16 +795,17 @@ class Swin2srOnnxConfig(SwinOnnxConfig): class DptOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class GlpnOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class PoolFormerOnnxConfig(ViTOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig ATOL_FOR_VALIDATION = 2e-3 + DEFAULT_ONNX_OPSET = 11 class SegformerOnnxConfig(YolosOnnxConfig): @@ -806,6 +814,7 @@ class SegformerOnnxConfig(YolosOnnxConfig): class MobileNetV1OnnxConfig(ViTOnnxConfig): ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 11 @property def inputs(self) -> Dict[str, Dict[int, str]]: @@ -817,7 +826,7 @@ class MobileNetV2OnnxConfig(MobileNetV1OnnxConfig): class DonutSwinOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class TimmDefaultOnnxConfig(ViTOnnxConfig): @@ -1191,12 +1200,13 @@ class Data2VecTextOnnxConfig(DistilBertOnnxConfig): class Data2VecVisionOnnxConfig(ViTOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class Data2VecAudioOnnxConfig(AudioOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedConfig ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class PerceiverDummyInputGenerator(DummyVisionInputGenerator): @@ -1292,18 +1302,19 @@ def generate_dummy_inputs(self, framework: str = "pt", **kwargs): class HubertOnnxConfig(AudioOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedConfig + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class Wav2Vec2OnnxConfig(HubertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class Wav2Vec2ConformerOnnxConfig(HubertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 11 class SEWOnnxConfig(HubertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class SEWDOnnxConfig(HubertOnnxConfig): @@ -1311,11 +1322,11 @@ class SEWDOnnxConfig(HubertOnnxConfig): class UniSpeechOnnxConfig(HubertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class UniSpeechSATOnnxConfig(HubertOnnxConfig): - pass + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. class WavLMOnnxConfig(HubertOnnxConfig): @@ -1344,6 +1355,7 @@ class ASTOnnxConfig(OnnxConfig): ) DUMMY_INPUT_GENERATOR_CLASSES = (ASTDummyAudioInputGenerator,) ATOL_FOR_VALIDATION = 1e-4 + DEFAULT_ONNX_OPSET = 14 # now uses F.scaled_dot_product_attention by default for torch>=2.1.1. 
@property def inputs(self) -> Dict[str, Dict[int, str]]: From 7184ef4e720369ed75dcfa1404195fffb7b71aec Mon Sep 17 00:00:00 2001 From: Jingya HUANG <44135271+JingyaHuang@users.noreply.github.com> Date: Fri, 24 May 2024 15:53:06 +0200 Subject: [PATCH 29/35] Add phi3 support in ONNX exporter (#1870) * add phi3 support * add test * add phi3 to modeeling test * replace ckpt to avoid remote code * fix test * bump trfrs to fix test --------- Co-authored-by: Jingya --- docs/source/exporters/onnx/overview.mdx | 1 + optimum/exporters/onnx/model_configs.py | 9 +++++++++ optimum/exporters/onnx/utils.py | 1 + optimum/exporters/tasks.py | 8 ++++++++ optimum/utils/__init__.py | 1 + setup.py | 2 +- tests/exporters/exporters_utils.py | 1 + tests/onnxruntime/test_modeling.py | 1 + tests/onnxruntime/utils_onnxruntime_tests.py | 1 + 9 files changed, 24 insertions(+), 1 deletion(-) diff --git a/docs/source/exporters/onnx/overview.mdx b/docs/source/exporters/onnx/overview.mdx index 22471c297a5..747e1396fb4 100644 --- a/docs/source/exporters/onnx/overview.mdx +++ b/docs/source/exporters/onnx/overview.mdx @@ -77,6 +77,7 @@ Supported architectures from [🤗 Transformers](https://huggingface.co/docs/tra - Pegasus - Perceiver - Phi +- Phi3 - Pix2Struct - PoolFormer - Qwen2(Qwen1.5) diff --git a/optimum/exporters/onnx/model_configs.py b/optimum/exporters/onnx/model_configs.py index eb0ac9a4988..e23716d4b74 100644 --- a/optimum/exporters/onnx/model_configs.py +++ b/optimum/exporters/onnx/model_configs.py @@ -51,6 +51,7 @@ NormalizedSeq2SeqConfig, NormalizedTextAndVisionConfig, NormalizedTextConfig, + NormalizedTextConfigWithGQA, NormalizedVisionConfig, is_diffusers_available, logging, @@ -291,6 +292,14 @@ class PhiOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig +class Phi3OnnxConfig(PhiOnnxConfig): + DUMMY_INPUT_GENERATOR_CLASSES = ( + MistralDummyPastKeyValuesGenerator, + ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES + DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfigWithGQA + + class MistralOnnxConfig(TextDecoderWithPositionIdsOnnxConfig): # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35 MIN_TRANSFORMERS_VERSION = version.parse("4.34.99") diff --git a/optimum/exporters/onnx/utils.py b/optimum/exporters/onnx/utils.py index 747cc687996..8ecba9231f6 100644 --- a/optimum/exporters/onnx/utils.py +++ b/optimum/exporters/onnx/utils.py @@ -84,6 +84,7 @@ "llama", "mistral", "phi", + "phi3", "qwen2", } diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py index efa782353b4..608b3df0d7c 100644 --- a/optimum/exporters/tasks.py +++ b/optimum/exporters/tasks.py @@ -893,6 +893,14 @@ class TasksManager: "text-classification", onnx="PhiOnnxConfig", ), + "phi3": supported_tasks_mapping( + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + onnx="Phi3OnnxConfig", + ), "pix2struct": supported_tasks_mapping( "image-to-text", "image-to-text-with-past", diff --git a/optimum/utils/__init__.py b/optimum/utils/__init__.py index 07be3f7e1a6..5d5044e63e1 100644 --- a/optimum/utils/__init__.py +++ b/optimum/utils/__init__.py @@ -80,5 +80,6 @@ NormalizedSeq2SeqConfig, NormalizedTextAndVisionConfig, NormalizedTextConfig, + NormalizedTextConfigWithGQA, NormalizedVisionConfig, ) diff --git a/setup.py b/setup.py index 4e154819bc2..407f6a2a3fb 100644 --- a/setup.py +++ 
b/setup.py @@ -15,7 +15,7 @@ REQUIRED_PKGS = [ "coloredlogs", "sympy", - "transformers[sentencepiece]>=4.26.0,<4.41.0", + "transformers[sentencepiece]>=4.26.0,<4.42.0", "torch>=1.11", "packaging", "numpy", diff --git a/tests/exporters/exporters_utils.py b/tests/exporters/exporters_utils.py index ab0b8488fb8..0c52754ff60 100644 --- a/tests/exporters/exporters_utils.py +++ b/tests/exporters/exporters_utils.py @@ -132,6 +132,7 @@ "hf-internal-testing/tiny-random-vision_perceiver_conv": ["image-classification"], }, "phi": "echarlaix/tiny-random-PhiForCausalLM", + "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", # "rembert": "google/rembert", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", diff --git a/tests/onnxruntime/test_modeling.py b/tests/onnxruntime/test_modeling.py index dd2bc858c41..182e64beb90 100644 --- a/tests/onnxruntime/test_modeling.py +++ b/tests/onnxruntime/test_modeling.py @@ -2258,6 +2258,7 @@ class ORTModelForCausalLMIntegrationTest(ORTModelTestMixin): "llama", "mistral", "mpt", + "phi3", "qwen2", ] diff --git a/tests/onnxruntime/utils_onnxruntime_tests.py b/tests/onnxruntime/utils_onnxruntime_tests.py index 9de5e495e3b..65298265780 100644 --- a/tests/onnxruntime/utils_onnxruntime_tests.py +++ b/tests/onnxruntime/utils_onnxruntime_tests.py @@ -128,6 +128,7 @@ "pegasus": "hf-internal-testing/tiny-random-PegasusModel", "perceiver_text": "hf-internal-testing/tiny-random-language_perceiver", "perceiver_vision": "hf-internal-testing/tiny-random-vision_perceiver_conv", + "phi3": "Xenova/tiny-random-Phi3ForCausalLM", "pix2struct": "fxmarty/pix2struct-tiny-random", "poolformer": "hf-internal-testing/tiny-random-PoolFormerModel", "qwen2": "fxmarty/tiny-dummy-qwen2", From d2fade2bf7fd95fef0addbcea62e6c597930df37 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 27 May 2024 16:38:34 +0200 Subject: [PATCH 30/35] Fix FX CI (#1866) * use gen_constructor_wrapper * use original wrapper generator and eager attn * force fx tracing --- .github/workflows/test_fx.yml | 33 ++++++++++--------- optimum/fx/optimization/transformations.py | 24 +++++++++++--- tests/fx/optimization/test_transformations.py | 3 +- 3 files changed, 40 insertions(+), 20 deletions(-) diff --git a/.github/workflows/test_fx.yml b/.github/workflows/test_fx.yml index 2535f1b154d..f0366cf0d1e 100644 --- a/.github/workflows/test_fx.yml +++ b/.github/workflows/test_fx.yml @@ -2,9 +2,9 @@ name: FX / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -20,16 +20,19 @@ jobs: runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install .[tests] - pip install git+https://github.com/huggingface/transformers.git - - name: Test with unittest - working-directory: tests - run: | - python -m pytest fx/optimization/test_transformations.py --exitfirst + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install .[tests] + + - name: Test with pytest + working-directory: tests + run: | + python -m 
pytest -s -v -x fx/optimization diff --git a/optimum/fx/optimization/transformations.py b/optimum/fx/optimization/transformations.py index 2013a063434..348a862db81 100644 --- a/optimum/fx/optimization/transformations.py +++ b/optimum/fx/optimization/transformations.py @@ -19,15 +19,31 @@ import operator import warnings from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, List +from typing import List import torch +from torch.fx import GraphModule, Node from transformers.file_utils import add_end_docstrings -from transformers.utils.fx import _gen_constructor_wrapper -if TYPE_CHECKING: - from torch.fx import GraphModule, Node +try: + from transformers.utils.fx import _gen_constructor_wrapper +except ImportError: + from transformers.utils.fx import gen_constructor_wrapper + + def _gen_constructor_wrapper(*args, **kwargs): + wrapper, target = gen_constructor_wrapper(*args, **kwargs) + + def wrapper_with_forced_tracing(*_args, **_kwargs): + import torch.fx._symbolic_trace + + orginal_flag = torch.fx._symbolic_trace._is_fx_tracing_flag + torch.fx._symbolic_trace._is_fx_tracing_flag = True + out = wrapper(*_args, **_kwargs) + torch.fx._symbolic_trace._is_fx_tracing_flag = orginal_flag + return out + + return wrapper_with_forced_tracing, target _ATTRIBUTES_DOCSTRING = r""" diff --git a/tests/fx/optimization/test_transformations.py b/tests/fx/optimization/test_transformations.py index e6a77a13ffc..3aaa7fe6c69 100644 --- a/tests/fx/optimization/test_transformations.py +++ b/tests/fx/optimization/test_transformations.py @@ -86,7 +86,8 @@ def transform(self, graph_module): def get_bert_model(): - model = BertModel.from_pretrained(_MODEL_NAME) + # sdpa attn became default + model = BertModel.from_pretrained(_MODEL_NAME, attn_implementation="eager") model.eval() traced = symbolic_trace(model, input_names=["input_ids", "attention_mask", "token_type_ids"]) return model, traced From ff0a0b3ad13572df76bf13fead7cdfeafbd74c0f Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 27 May 2024 16:41:48 +0200 Subject: [PATCH 31/35] Fix Utils CI (#1867) * use smallest split * fix typing * use pytest * utils ci * test disk freeing action * test without freeing disk * disable caching * reduce num_samples * avoid downloading torch cuda binaries * add verbosity --- .github/workflows/test_dummy_inputs.yml | 37 ----------------- .github/workflows/test_utils.yml | 40 +++++++++++++++++++ .../preprocessing/task_processors_manager.py | 6 +-- tests/utils/test_dummpy_input_generators.py | 8 ++-- tests/utils/test_task_processors.py | 17 +++++++- 5 files changed, 62 insertions(+), 46 deletions(-) delete mode 100644 .github/workflows/test_dummy_inputs.yml create mode 100644 .github/workflows/test_utils.yml diff --git a/.github/workflows/test_dummy_inputs.yml b/.github/workflows/test_dummy_inputs.yml deleted file mode 100644 index 60ca033843c..00000000000 --- a/.github/workflows/test_dummy_inputs.yml +++ /dev/null @@ -1,37 +0,0 @@ -# This workflow will install Python dependencies, run tests and lint with a variety of Python versions -# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions -name: Dummy inputs / Python - Test - -on: - push: - branches: [ main ] - pull_request: - branches: [ main ] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -jobs: - build: - strategy: - fail-fast: false - matrix: - python-version: 
[3.8, 3.9] - os: [ubuntu-20.04, macos-13] - - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install .[tests] - - name: Test with unittest - working-directory: tests - run: | - python -m unittest discover -s utils -p 'test_*.py' diff --git a/.github/workflows/test_utils.yml b/.github/workflows/test_utils.yml new file mode 100644 index 00000000000..1ef33ced086 --- /dev/null +++ b/.github/workflows/test_utils.yml @@ -0,0 +1,40 @@ +name: Utils / Python - Test + +on: + push: + branches: [main] + pull_request: + branches: [main] + +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +jobs: + build: + strategy: + fail-fast: false + matrix: + os: [ubuntu-20.04, macos-13] + python-version: [3.8, 3.9] + + runs-on: ${{ matrix.os }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install .[tests] + + - name: Test with pytest + working-directory: tests + run: | + python -m pytest -s -vvvv utils diff --git a/optimum/utils/preprocessing/task_processors_manager.py b/optimum/utils/preprocessing/task_processors_manager.py index 2720ed41fbb..0426d1a2b43 100644 --- a/optimum/utils/preprocessing/task_processors_manager.py +++ b/optimum/utils/preprocessing/task_processors_manager.py @@ -23,7 +23,7 @@ if TYPE_CHECKING: - from .base import DatasetProcessing + from .base import TaskProcessor class TaskProcessorsManager: @@ -35,7 +35,7 @@ class TaskProcessorsManager: } @classmethod - def get_task_processor_class_for_task(cls, task: str) -> Type: + def get_task_processor_class_for_task(cls, task: str) -> Type["TaskProcessor"]: if task not in cls._TASK_TO_DATASET_PROCESSING_CLASS: supported_tasks = ", ".join(cls._TASK_TO_DATASET_PROCESSING_CLASS.keys()) raise KeyError( @@ -45,5 +45,5 @@ def get_task_processor_class_for_task(cls, task: str) -> Type: return cls._TASK_TO_DATASET_PROCESSING_CLASS[task] @classmethod - def for_task(cls, task: str, *dataset_processing_args, **dataset_processing_kwargs: Any) -> "DatasetProcessing": + def for_task(cls, task: str, *dataset_processing_args, **dataset_processing_kwargs: Any) -> "TaskProcessor": return cls.get_task_processor_class_for_task(task)(*dataset_processing_args, **dataset_processing_kwargs) diff --git a/tests/utils/test_dummpy_input_generators.py b/tests/utils/test_dummpy_input_generators.py index 9dd83714c5f..ff9558f1477 100644 --- a/tests/utils/test_dummpy_input_generators.py +++ b/tests/utils/test_dummpy_input_generators.py @@ -31,13 +31,13 @@ from optimum.utils.input_generators import DummyInputGenerator -TEXT_ENCODER_MODELS = {"distilbert": "distilbert-base-cased"} +TEXT_ENCODER_MODELS = {"distilbert": "hf-internal-testing/tiny-random-DistilBertModel"} VISION_MODELS = {"resnet": "hf-internal-testing/tiny-random-resnet"} -SEQ2SEQ_MODELS = {"t5": "t5-small"} +SEQ2SEQ_MODELS = {"t5": "hf-internal-testing/tiny-random-T5Model"} -AUDIO_MODELS = {"whisper": "openai/whisper-tiny.en"} +AUDIO_MODELS = {"whisper": 
"hf-internal-testing/tiny-random-WhisperModel"} DUMMY_SHAPES = { "batch_size": [2, 4], @@ -60,7 +60,7 @@ class GenerateDummy(TestCase): "np": tuple, } if is_tf_available(): - import tensorflow as tf + import tensorflow as tf # type: ignore[import] _FRAMEWORK_TO_SHAPE_CLS["tf"] = tf.TensorShape diff --git a/tests/utils/test_task_processors.py b/tests/utils/test_task_processors.py index f8a0a6d5a92..af89aec2b90 100644 --- a/tests/utils/test_task_processors.py +++ b/tests/utils/test_task_processors.py @@ -55,6 +55,9 @@ }, } +LOAD_SMALLEST_SPLIT = True +NUM_SAMPLES = 10 + # Taken from https://pynative.com/python-generate-random-string/ def get_random_string(length: int) -> str: @@ -148,7 +151,11 @@ def _test_load_dataset( ) dataset_with_all_columns = None if default_dataset: - dataset = task_processor.load_default_dataset(only_keep_necessary_columns=only_keep_necessary_columns) + dataset = task_processor.load_default_dataset( + only_keep_necessary_columns=only_keep_necessary_columns, + load_smallest_split=LOAD_SMALLEST_SPLIT, + num_samples=NUM_SAMPLES, + ) if only_keep_necessary_columns: dataset_with_all_columns = task_processor.load_default_dataset() else: @@ -157,11 +164,17 @@ def _test_load_dataset( path, data_keys=data_keys, only_keep_necessary_columns=only_keep_necessary_columns, + load_smallest_split=LOAD_SMALLEST_SPLIT, + num_samples=NUM_SAMPLES, **load_dataset_kwargs, ) if only_keep_necessary_columns: dataset_with_all_columns = task_processor.load_dataset( - path, data_keys=data_keys, **load_dataset_kwargs + path, + data_keys=data_keys, + load_smallest_split=LOAD_SMALLEST_SPLIT, + num_samples=NUM_SAMPLES, + **load_dataset_kwargs, ) # We only check if the column names of the dataset with the not necessary columns removed are a strict subset From e81bd73a778b4833b8b2781c16b4427b7aa1111c Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Mon, 27 May 2024 16:46:15 +0200 Subject: [PATCH 32/35] Fix BT CI (#1872) * fix bt test failures due to default sdpa attention * exclude macos13+py3.8 * update tr * check transformers version --- .github/workflows/test_bettertransformer.yml | 54 +++++++++++--------- optimum/pipelines/pipelines_base.py | 9 +++- tests/bettertransformer/test_encoder.py | 2 +- tests/bettertransformer/testing_utils.py | 2 +- 4 files changed, 39 insertions(+), 28 deletions(-) diff --git a/.github/workflows/test_bettertransformer.yml b/.github/workflows/test_bettertransformer.yml index 6607466dc22..080d8272dfc 100644 --- a/.github/workflows/test_bettertransformer.yml +++ b/.github/workflows/test_bettertransformer.yml @@ -2,9 +2,9 @@ name: BetterTransformer / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -17,29 +17,35 @@ jobs: matrix: python-version: [3.8, 3.9] os: [ubuntu-20.04, macos-13] + exclude: [{ python-version: 3.8, os: macos-13 }] runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install .[tests] - pip install --no-cache-dir --upgrade torch torchvision torchaudio - pip install accelerate - - name: Test on pytorch stable - working-directory: tests - run: | - pytest bettertransformer/test_*.py -s -vvvvv - - name: Install dependencies 2 - run: | - pip uninstall -y torch 
torchvision torchaudio - pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu - - name: Test on pytorch nightly - working-directory: tests - run: | - pytest bettertransformer/test_*.py -s -vvvvv + - name: Checkout code + uses: actions/checkout@v4 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v3 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install .[tests] + pip install --no-cache-dir --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install accelerate + + - name: Test with stable pytorch + working-directory: tests + run: | + pytest bettertransformer -s -vvvvv + + - name: Install dependencies 2 + run: | + pip uninstall -y torch torchvision torchaudio + pip install --no-cache-dir --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu + + - name: Test with nightly pytorch + working-directory: tests + run: | + pytest bettertransformer -s -vvvvv diff --git a/optimum/pipelines/pipelines_base.py b/optimum/pipelines/pipelines_base.py index e2046882bd6..cc36e94ef5d 100644 --- a/optimum/pipelines/pipelines_base.py +++ b/optimum/pipelines/pipelines_base.py @@ -45,7 +45,7 @@ from transformers.pipelines import infer_framework_load_model from ..bettertransformer import BetterTransformer -from ..utils import is_onnxruntime_available +from ..utils import check_if_transformers_greater, is_onnxruntime_available from ..utils.file_utils import find_files_matching_pattern @@ -179,7 +179,12 @@ def load_bettertransformer( **kwargs, ): if model_kwargs is None: - model_kwargs = {} + # the argument was first introduced in 4.36.0 but most models didn't have an sdpa implementation then + # see https://github.com/huggingface/transformers/blob/v4.36.0/src/transformers/modeling_utils.py#L1258 + if check_if_transformers_greater("4.36.0"): + model_kwargs = {"attn_implementation": "eager"} + else: + model_kwargs = {} if model is None: model_id = SUPPORTED_TASKS[targeted_task]["default"] diff --git a/tests/bettertransformer/test_encoder.py b/tests/bettertransformer/test_encoder.py index cbf4bcbae90..74aacaed58c 100644 --- a/tests/bettertransformer/test_encoder.py +++ b/tests/bettertransformer/test_encoder.py @@ -114,7 +114,7 @@ def test_inference_speed(self): """ model_name = "bert-base-uncased" - hf_model = AutoModel.from_pretrained(model_name).eval() + hf_model = AutoModel.from_pretrained(model_name, attn_implementation="eager").eval() bt_model = BetterTransformer.transform(hf_model, keep_original_model=True) BATCH_SIZE = 8 diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py index eb4f0ab9a4d..6e7ff71ddd9 100644 --- a/tests/bettertransformer/testing_utils.py +++ b/tests/bettertransformer/testing_utils.py @@ -235,7 +235,7 @@ def _test_logits(self, model_id: str, model_type: str, **preprocessor_kwargs): inputs = self.prepare_inputs_for_class(model_id=model_id, model_type=model_type, **preprocessor_kwargs) torch.manual_seed(0) - hf_random_model = AutoModel.from_pretrained(model_id).eval() + hf_random_model = AutoModel.from_pretrained(model_id, attn_implementation="eager").eval() random_config = hf_random_model.config hf_random_model = hf_random_model.eval() From 6d56c5fadd94e388efe2674820f167ab7c004a6f Mon Sep 17 00:00:00 2001 From: Sarthak Gupta <81774392+mr-sarthakgupta@users.noreply.github.com> Date: Tue, 28 May 2024 13:49:57 +0530 Subject: 
[PATCH 33/35] Fix ORTConfig loading (#1879) --- optimum/commands/onnxruntime/quantize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/commands/onnxruntime/quantize.py b/optimum/commands/onnxruntime/quantize.py index 0ce7e6c3dce..2613cb33ba6 100644 --- a/optimum/commands/onnxruntime/quantize.py +++ b/optimum/commands/onnxruntime/quantize.py @@ -96,7 +96,7 @@ def run(self): "TensorRT quantization relies on static quantization that requires calibration, which is currently not supported through optimum-cli. Please adapt Optimum static quantization examples to run static quantization for TensorRT: https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/quantization" ) else: - qconfig = ORTConfig.from_pretained(self.args.config).quantization + qconfig = ORTConfig.from_pretrained(self.args.config).quantization for q in quantizers: q.quantize(save_dir=save_dir, quantization_config=qconfig) From f3008651c6f674d4b89de66a2d21fc5e7cafaf84 Mon Sep 17 00:00:00 2001 From: Mohit Sharma Date: Tue, 28 May 2024 14:02:19 +0530 Subject: [PATCH 34/35] Update ORT doc for ROCM 6.0 (#1862) * Update ORT doc for ROCM 6.0 * Update amdgpu.mdx --- docs/source/onnxruntime/usage_guides/amdgpu.mdx | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/onnxruntime/usage_guides/amdgpu.mdx b/docs/source/onnxruntime/usage_guides/amdgpu.mdx index acd8d732ac3..575f7700ce9 100644 --- a/docs/source/onnxruntime/usage_guides/amdgpu.mdx +++ b/docs/source/onnxruntime/usage_guides/amdgpu.mdx @@ -7,11 +7,11 @@ Our testing involved AMD Instinct GPUs, and for specific GPU compatibility, plea This guide will show you how to run inference on the `ROCMExecutionProvider` execution provider that ONNX Runtime supports for AMD GPUs. ## Installation -The following setup installs the ONNX Runtime support with ROCM Execution Provider with ROCm 5.7. +The following setup installs the ONNX Runtime support with ROCM Execution Provider with ROCm 6.0. #### 1 ROCm Installation -Refer to the [ROCm installation guide](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) to install ROCm 5.7. +Refer to the [ROCm installation guide](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) to install ROCm 6.0. #### 2 Installing `onnxruntime-rocm` @@ -26,11 +26,11 @@ docker build -f Dockerfile -t ort/rocm . **Local Installation Steps:** ##### 2.1 PyTorch with ROCm Support -Optimum ONNX Runtime integration relies on some functionalities of Transformers that require PyTorch. For now, we recommend to use Pytorch compiled against RoCm 5.7, that can be installed following [PyTorch installation guide](https://pytorch.org/get-started/locally/): +Optimum ONNX Runtime integration relies on some functionalities of Transformers that require PyTorch. For now, we recommend to use Pytorch compiled against RoCm 6.0, that can be installed following [PyTorch installation guide](https://pytorch.org/get-started/locally/): ```bash -pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7 -# Use 'rocm/pytorch:latest' as the preferred base image when using Docker for PyTorch installation. +pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.0 +# Use 'rocm/pytorch:rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2' as the preferred base image when using Docker for PyTorch installation. 
``` ##### 2.2 ONNX Runtime with ROCm Execution Provider @@ -42,13 +42,13 @@ pip install cmake onnx curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh # Install ONNXRuntime from source -git clone --recursive https://github.com/ROCmSoftwarePlatform/onnxruntime.git +git clone --single-branch --branch main --recursive https://github.com/Microsoft/onnxruntime onnxruntime cd onnxruntime -git checkout rocm5.7_internal_testing_eigen-3.4.zip_hash -./build.sh --config Release --build_wheel --update --build --parallel --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --rocm_home=/opt/rocm +./build.sh --config Release --build_wheel --allow_running_as_root --update --build --parallel --cmake_extra_defines CMAKE_HIP_ARCHITECTURES=gfx90a,gfx942 ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) --use_rocm --rocm_home=/opt/rocm pip install build/Linux/Release/dist/* ``` +Note: The instructions build ORT for `MI210/MI250/MI300` gpus. To support other architectures, please update the `CMAKE_HIP_ARCHITECTURES` in the build command. To avoid conflicts between `onnxruntime` and `onnxruntime-rocm`, make sure the package `onnxruntime` is not installed by running `pip uninstall onnxruntime` prior to installing `onnxruntime-rocm`. From cbbda3e43284c49a02732375cfcabc61e4923046 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Tue, 28 May 2024 18:03:08 +0200 Subject: [PATCH 35/35] Fix ort config instantiation (from_pretrained) and saving (save_pretrained) (#1865) * fix ort config instatiation (from_dict) and saving (to_dict) * added tests for quantization with ort config * style * handle empty quant dictionary --- .github/workflows/test_cli.yml | 33 ++++++++++--------- optimum/onnxruntime/configuration.py | 49 ++++++++++++++++++++++++++-- tests/cli/test_cli.py | 31 +++++++++--------- 3 files changed, 80 insertions(+), 33 deletions(-) diff --git a/.github/workflows/test_cli.yml b/.github/workflows/test_cli.yml index 7eae0186076..ecb19d23aa3 100644 --- a/.github/workflows/test_cli.yml +++ b/.github/workflows/test_cli.yml @@ -4,9 +4,9 @@ name: Optimum CLI / Python - Test on: push: - branches: [ main ] + branches: [main] pull_request: - branches: [ main ] + branches: [main] concurrency: group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} @@ -22,17 +22,20 @@ jobs: runs-on: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Setup Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install .[tests,exporters,exporters-tf] - - name: Test with unittest - working-directory: tests - run: | - python -m unittest discover -s cli -p 'test_*.py' + - name: Checkout code + uses: actions/checkout@v4 + - name: Setup Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Install dependencies + run: | + pip install --upgrade pip + pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + pip install .[tests,exporters,exporters-tf] + + - name: Test with pytest + run: | + pytest tests/cli -s -vvvv --durations=0 diff --git a/optimum/onnxruntime/configuration.py b/optimum/onnxruntime/configuration.py index c11cf58b8b0..2e3d9f32d6a 100644 --- a/optimum/onnxruntime/configuration.py +++ b/optimum/onnxruntime/configuration.py @@ -18,7 +18,7 @@ from 
dataclasses import asdict, dataclass, field from enum import Enum from pathlib import Path -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union from datasets import Dataset from packaging.version import Version, parse @@ -298,6 +298,15 @@ def __post_init__(self): ) self.operators_to_quantize = operators_to_quantize + if isinstance(self.format, str): + self.format = QuantFormat[self.format] + if isinstance(self.mode, str): + self.mode = QuantizationMode[self.mode] + if isinstance(self.activations_dtype, str): + self.activations_dtype = QuantType[self.activations_dtype] + if isinstance(self.weights_dtype, str): + self.weights_dtype = QuantType[self.weights_dtype] + @staticmethod def quantization_type_str(activations_dtype: QuantType, weights_dtype: QuantType) -> str: return ( @@ -984,8 +993,28 @@ def __init__( self.opset = opset self.use_external_data_format = use_external_data_format self.one_external_file = one_external_file - self.optimization = self.dataclass_to_dict(optimization) - self.quantization = self.dataclass_to_dict(quantization) + + if isinstance(optimization, dict) and optimization: + self.optimization = OptimizationConfig(**optimization) + elif isinstance(optimization, OptimizationConfig): + self.optimization = optimization + elif not optimization: + self.optimization = None + else: + raise ValueError( + f"Optional argument `optimization` must be a dictionary or an instance of OptimizationConfig, got {type(optimization)}" + ) + if isinstance(quantization, dict) and quantization: + self.quantization = QuantizationConfig(**quantization) + elif isinstance(quantization, QuantizationConfig): + self.quantization = quantization + elif not quantization: + self.quantization = None + else: + raise ValueError( + f"Optional argument `quantization` must be a dictionary or an instance of QuantizationConfig, got {type(quantization)}" + ) + self.optimum_version = kwargs.pop("optimum_version", None) @staticmethod @@ -1002,3 +1031,17 @@ def dataclass_to_dict(config) -> dict: v = [elem.name if isinstance(elem, Enum) else elem for elem in v] new_config[k] = v return new_config + + def to_dict(self) -> Dict[str, Any]: + dict_config = { + "opset": self.opset, + "use_external_data_format": self.use_external_data_format, + "one_external_file": self.one_external_file, + "optimization": self.dataclass_to_dict(self.optimization), + "quantization": self.dataclass_to_dict(self.quantization), + } + + if self.optimum_version: + dict_config["optimum_version"] = self.optimum_version + + return dict_config diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py index 2e64dc9cdfb..ca4ebf8bd23 100644 --- a/tests/cli/test_cli.py +++ b/tests/cli/test_cli.py @@ -21,10 +21,8 @@ import unittest from pathlib import Path -from onnxruntime import __version__ as ort_version -from packaging.version import Version, parse - import optimum.commands +from optimum.onnxruntime.configuration import AutoQuantizationConfig, ORTConfig CLI_WIH_CUSTOM_COMMAND_PATH = Path(__file__).parent / "cli_with_custom_command.py" @@ -83,30 +81,33 @@ def test_optimize_commands(self): def test_quantize_commands(self): with tempfile.TemporaryDirectory() as tempdir: + ort_config = ORTConfig(quantization=AutoQuantizationConfig.avx2(is_static=False)) + ort_config.save_pretrained(tempdir) + # First export a tiny encoder, decoder only and encoder-decoder export_commands = [ - f"optimum-cli export onnx --model hf-internal-testing/tiny-random-BertModel {tempdir}/encoder", + f"optimum-cli 
export onnx --model hf-internal-testing/tiny-random-bert {tempdir}/encoder", f"optimum-cli export onnx --model hf-internal-testing/tiny-random-gpt2 {tempdir}/decoder", - # f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder", + f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder", ] quantize_commands = [ f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder --avx2 -o {tempdir}/quantized_encoder", f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/decoder --avx2 -o {tempdir}/quantized_decoder", - # f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder-decoder --avx2 -o {tempdir}/quantized_encoder_decoder", + f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder-decoder --avx2 -o {tempdir}/quantized_encoder_decoder", ] - if parse(ort_version) != Version("1.16.0") and parse(ort_version) != Version("1.17.0"): - # Failing on onnxruntime==1.17.0, will be fixed on 1.17.1: https://github.com/microsoft/onnxruntime/pull/19421 - export_commands.append( - f"optimum-cli export onnx --model hf-internal-testing/tiny-random-t5 {tempdir}/encoder-decoder" - ) - quantize_commands.append( - f"optimum-cli onnxruntime quantize --onnx_model {tempdir}/encoder-decoder --avx2 -o {tempdir}/quantized_encoder_decoder" - ) + quantize_with_config_commands = [ + f"optimum-cli onnxruntime quantize --onnx_model hf-internal-testing/tiny-random-bert --c {tempdir}/ort_config.json -o {tempdir}/quantized_encoder_with_config", + f"optimum-cli onnxruntime quantize --onnx_model hf-internal-testing/tiny-random-gpt2 --c {tempdir}/ort_config.json -o {tempdir}/quantized_decoder_with_config", + f"optimum-cli onnxruntime quantize --onnx_model hf-internal-testing/tiny-random-t5 --c {tempdir}/ort_config.json -o {tempdir}/quantized_encoder_decoder_with_config", + ] - for export, quantize in zip(export_commands, quantize_commands): + for export, quantize, quantize_with_config in zip( + export_commands, quantize_commands, quantize_with_config_commands + ): subprocess.run(export, shell=True, check=True) subprocess.run(quantize, shell=True, check=True) + subprocess.run(quantize_with_config, shell=True, check=True) def _run_command_and_check_content(self, command: str, content: str) -> bool: proc = subprocess.Popen(command.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE)