-Optimize your model to speedup inference with OpenVINO and Neural Compressor
+Optimize your model to speed up inference with OpenVINO, Neural Compressor and IPEX
Accelerate your training and inference workflows with AWS Trainium and AWS Inferentia
Accelerate your training and inference workflows with Google TPUs
-> [!TIP]
-> Some packages provide hardware-agnostic features (e.g. INC interface in Optimum Intel).
-
-
## Open-source integrations

🤗 Optimum also supports a variety of open-source frameworks to make model optimization very easy.
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index c08b3f92e5c..27733574c80 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -25,6 +25,7 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can
| [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade --upgrade-strategy eager optimum[onnxruntime]` |
| [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[neural-compressor]` |
| [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[openvino]` |
+| [IPEX](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[ipex]` |
| [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` |
| [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade --upgrade-strategy eager optimum[amd]` |
| [AWS Trainium & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade --upgrade-strategy eager optimum[neuronx]` |
diff --git a/docs/source/onnxruntime/package_reference/modeling_ort.mdx b/docs/source/onnxruntime/package_reference/modeling_ort.mdx
index 65b2b60195a..2c93ab3ac0d 100644
--- a/docs/source/onnxruntime/package_reference/modeling_ort.mdx
+++ b/docs/source/onnxruntime/package_reference/modeling_ort.mdx
@@ -119,6 +119,11 @@ The following ORT classes are available for the following custom tasks.
## Stable Diffusion

+#### ORTDiffusionPipeline
+
+[[autodoc]] onnxruntime.ORTDiffusionPipeline
+    - __call__
+
#### ORTStableDiffusionPipeline

[[autodoc]] onnxruntime.ORTStableDiffusionPipeline
diff --git a/docs/source/onnxruntime/usage_guides/models.mdx b/docs/source/onnxruntime/usage_guides/models.mdx
index 131822e9568..27ac446096b 100644
--- a/docs/source/onnxruntime/usage_guides/models.mdx
+++ b/docs/source/onnxruntime/usage_guides/models.mdx
@@ -4,263 +4,128 @@ Optimum is a utility package for building and running inference with accelerated
Optimum can be used to load optimized models from the [Hugging Face Hub](hf.co/models) and create pipelines
to run accelerated inference without rewriting your APIs.
-## Switching from Transformers to Optimum
-The `optimum.onnxruntime.ORTModelForXXX` model classes are API compatible with Hugging Face Transformers models. This
-means you can just replace your `AutoModelForXXX` class with the corresponding `ORTModelForXXX` class in `optimum.onnxruntime`.
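After installing one of the extras from the installation table above (here assuming `optimum[onnxruntime]`), a quick sanity check is to confirm the import works and to list the execution providers ONNX Runtime reports. This is a minimal sketch; the provider list in the comment is only an example output.

```python
# Assumes `pip install --upgrade --upgrade-strategy eager optimum[onnxruntime]` was run.
import onnxruntime
from optimum.onnxruntime import ORTModelForSequenceClassification  # noqa: F401  (import check only)

# Lists the execution providers available to ONNX Runtime on this machine.
print(onnxruntime.get_available_providers())
# e.g. ['CPUExecutionProvider'] on a CPU-only install
```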
+## Loading
-You do not need to adapt your code to get it to work with `ORTModelForXXX` classes:
+### Transformers models
-```diff
-from transformers import AutoTokenizer, pipeline
--from transformers import AutoModelForQuestionAnswering
-+from optimum.onnxruntime import ORTModelForQuestionAnswering
-
--model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2") # PyTorch checkpoint
-+model = ORTModelForQuestionAnswering.from_pretrained("optimum/roberta-base-squad2") # ONNX checkpoint
-tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
-
-onnx_qa = pipeline("question-answering",model=model,tokenizer=tokenizer)
-
-question = "What's my name?"
-context = "My name is Philipp and I live in Nuremberg."
-pred = onnx_qa(question, context)
-```
-
-### Loading a vanilla Transformers model
-
-Because the model you want to work with might not be already converted to ONNX, [`~optimum.onnxruntime.ORTModel`]
-includes a method to convert vanilla Transformers models to ONNX ones. Simply pass `export=True` to the
-[`~optimum.onnxruntime.ORTModel.from_pretrained`] method, and your model will be loaded and converted to ONNX on-the-fly:
-
-```python
->>> from optimum.onnxruntime import ORTModelForSequenceClassification
-
->>> # Load the model from the hub and export it to the ONNX format
->>> model = ORTModelForSequenceClassification.from_pretrained(
-...     "distilbert-base-uncased-finetuned-sst-2-english", export=True
-... )
-```
-
-### Pushing ONNX models to the Hugging Face Hub
-
-It is also possible, just as with regular [`~transformers.PreTrainedModel`]s, to push your `ORTModelForXXX` to the
-[Hugging Face Model Hub](https://hf.co/models):
-
-```python
->>> from optimum.onnxruntime import ORTModelForSequenceClassification
+Once your model has been [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `AutoModelForXxx` with the corresponding `ORTModelForXxx` class.
->>> # Load the model from the hub and export it to the ONNX format
->>> model = ORTModelForSequenceClassification.from_pretrained(
-...     "distilbert-base-uncased-finetuned-sst-2-english", export=True
-... )
+```diff
+ from transformers import AutoTokenizer, pipeline
+- from transformers import AutoModelForCausalLM
++ from optimum.onnxruntime import ORTModelForCausalLM
->>> # Save the converted model
->>> model.save_pretrained("a_local_path_for_convert_onnx_model")
+- model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B") # PyTorch checkpoint
++ model = ORTModelForCausalLM.from_pretrained("onnx-community/Llama-3.2-1B", subfolder="onnx") # ONNX checkpoint
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
-# Push the onnx model to HF Hub
->>> model.push_to_hub(  # doctest: +SKIP
-...     "a_local_path_for_convert_onnx_model", repository_id="my-onnx-repo", use_auth_token=True
-... )
+ pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
+ result = pipe("He never went out without a book under his arm")
```
-## Sequence-to-sequence models
+More information on all the supported `ORTModelForXxx` classes can be found in our [documentation](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort).
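The on-the-fly export path shown in the removed guide above still applies when a checkpoint has no ONNX weights on the Hub: passing `export=True` converts the PyTorch checkpoint at load time. A minimal sketch reusing the distilbert checkpoint from that removed example; the output directory name is arbitrary.

```python
# export=True loads the PyTorch checkpoint and converts it to ONNX on the fly.
from optimum.onnxruntime import ORTModelForSequenceClassification

model = ORTModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english", export=True
)

# Save the exported ONNX model so it can be reloaded later without re-exporting.
model.save_pretrained("onnx_distilbert_sst2")
```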
-Sequence-to-sequence (Seq2Seq) models can also be used when running inference with ONNX Runtime. When Seq2Seq models
-are exported to the ONNX format, they are decomposed into three parts that are later combined during inference:
-- The encoder part of the model
-- The decoder part of the model + the language modeling head
-- The same decoder part of the model + language modeling head but taking and using pre-computed key / values as inputs and
-outputs. This makes inference faster.
-Here is an example of how you can load a T5 model to the ONNX format and run inference for a translation task:
+### Diffusers models
+Once your model has been [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `DiffusionPipeline` with the corresponding `ORTDiffusionPipeline` class.
-```python
->>> from transformers import AutoTokenizer, pipeline
->>> from optimum.onnxruntime import ORTModelForSeq2SeqLM
-
-# Load the model from the hub and export it to the ONNX format
->>> model_name = "t5-small"
->>> model = ORTModelForSeq2SeqLM.from_pretrained(model_name, export=True)
->>> tokenizer = AutoTokenizer.from_pretrained(model_name)
-
-# Create a pipeline
->>> onnx_translation = pipeline("translation_en_to_fr", model=model, tokenizer=tokenizer)
->>> text = "He never went out without a book under his arm, and he often came back with two."
->>> result = onnx_translation(text)
->>> # [{'translation_text': "Il n'est jamais sorti sans un livre sous son bras, et il est souvent revenu avec deux."}]
-```
-## Stable Diffusion
-
-Stable Diffusion models can also be used when running inference with ONNX Runtime. When Stable Diffusion models
-are exported to the ONNX format, they are split into four components that are later combined during inference:
-- The text encoder
-- The U-NET
-- The VAE encoder
-- The VAE decoder
-
-Make sure you have 🤗 Diffusers installed.
-
-To install `diffusers`:
-```bash
-pip install diffusers
+```diff
+- from diffusers import DiffusionPipeline
++ from optimum.onnxruntime import ORTDiffusionPipeline
+
+ model_id = "runwayml/stable-diffusion-v1-5"
+- pipeline = DiffusionPipeline.from_pretrained(model_id)
++ pipeline = ORTDiffusionPipeline.from_pretrained(model_id, revision="onnx")
+ prompt = "sailing ship in storm by Leonardo da Vinci"
+ image = pipeline(prompt).images[0]
```
-### Text-to-Image
-Here is an example of how you can load an ONNX Stable Diffusion model and run inference using ONNX Runtime:
+### Sentence Transformers models
-```python
-from optimum.onnxruntime import ORTStableDiffusionPipeline
-
-model_id = "runwayml/stable-diffusion-v1-5"
-pipeline = ORTStableDiffusionPipeline.from_pretrained(model_id, revision="onnx")
-prompt = "sailing ship in storm by Leonardo da Vinci"
-image = pipeline(prompt).images[0]
-```
-
-To load your PyTorch model and convert it to ONNX on-the-fly, you can set `export=True`.
+Once your model has been [exported to the ONNX format](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model), you can load it by replacing `AutoModel` with the corresponding `ORTModelForFeatureExtraction` class.
-```python
-pipeline = ORTStableDiffusionPipeline.from_pretrained(model_id, export=True)
-
-# Don't forget to save the ONNX model
-save_directory = "a_local_path"
-pipeline.save_pretrained(save_directory)
+```diff
+ from transformers import AutoTokenizer
+- from transformers import AutoModel
++ from optimum.onnxruntime import ORTModelForFeatureExtraction
+
+ tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
+- model = AutoModel.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
++ model = ORTModelForFeatureExtraction.from_pretrained("optimum/all-MiniLM-L6-v2")
+ inputs = tokenizer("This is an example sentence", return_tensors="pt")
+ outputs = model(**inputs)
```
-
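`ORTModelForFeatureExtraction` in the block above returns token-level embeddings; turning them into a single sentence embedding, as Sentence Transformers does, still requires a pooling step. Below is a minimal sketch assuming the mean pooling plus L2 normalization commonly used with `all-MiniLM-L6-v2`; the pooling recipe is an assumption on my part, not something defined by the classes in this diff.

```python
# ORTModelForFeatureExtraction returns per-token embeddings of shape (batch, seq_len, hidden).
# Mean pooling over the attention mask + L2 normalization gives sentence embeddings.
import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForFeatureExtraction

tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")
model = ORTModelForFeatureExtraction.from_pretrained("optimum/all-MiniLM-L6-v2")

inputs = tokenizer(["This is an example sentence"], return_tensors="pt")
outputs = model(**inputs)

token_embeddings = outputs.last_hidden_state            # (batch, seq_len, hidden)
mask = inputs["attention_mask"].unsqueeze(-1).float()   # (batch, seq_len, 1)

# Average only over non-padding tokens, then normalize to unit length.
sentence_embeddings = (token_embeddings * mask).sum(dim=1) / mask.sum(dim=1)
sentence_embeddings = torch.nn.functional.normalize(sentence_embeddings, p=2, dim=1)
print(sentence_embeddings.shape)  # expected: torch.Size([1, 384]) for all-MiniLM-L6-v2
```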