From 8e5533452c3c183bd76d6f3848fd67d473742318 Mon Sep 17 00:00:00 2001
From: Kaz Nishimura
Date: Sun, 18 Feb 2024 21:07:05 +0900
Subject: [PATCH 1/6] Enable DirectML acceleration and improve device handling

This commit introduces two improvements:

1. DirectML acceleration:
   - Added support for running optimum commands on DirectML hardware
     (Windows only) using the --device dml flag.
   - Automatically sets the device to torch_directml.device() when the
     flag is specified.

2. Improved device handling:
   - Ensures the model is initialized directly on the device only when
     applicable.
---
 optimum/commands/optimum_cli.py | 4 ++++
 optimum/exporters/tasks.py      | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/optimum/commands/optimum_cli.py b/optimum/commands/optimum_cli.py
index 4bae9bb5f82..6bbfa9cee55 100644
--- a/optimum/commands/optimum_cli.py
+++ b/optimum/commands/optimum_cli.py
@@ -158,6 +158,10 @@ def main():
         parser.print_help()
         exit(1)
 
+    if args.device == "dml":
+        import torch_directml
+        args.device = torch_directml.device()
+
     # Run
     service = args.func(args)
     service.run()
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index e8e8af2bce9..ae1b3fb20b2 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -1860,7 +1860,7 @@ def get_model_from_task(
             device = torch.device("cpu")
 
         # TODO : fix EulerDiscreteScheduler loading to enable for SD models
-        if version.parse(torch.__version__) >= version.parse("2.0") and library_name != "diffusers":
+        if version.parse(torch.__version__) >= version.parse("2.0") and library_name != "diffusers" and not device.type:
             with device:
                 # Initialize directly in the requested device, to save allocation time. Especially useful for large
                 # models to initialize on cuda device.

From 26d2ed9a5aeee700b72f99c2c507fa48f0886971 Mon Sep 17 00:00:00 2001
From: Kaz Nishimura
Date: Sun, 18 Feb 2024 22:47:45 +0900
Subject: [PATCH 2/6] Refine device handling for PyTorch 2.0+ and device type
 check

This commit refines the device handling in optimum/exporters/tasks.py
with the following improvements:

- More precise device check: Instead of checking for not device.type,
  the condition is updated to device.type != "privateuseone". This
  ensures the initialization happens on the requested device only if it
  is not a private-use device (e.g., DirectML).
- Improved clarity: The code comments are updated to better explain the
  purpose of the device initialization and its benefits for large
  models.
---
 optimum/exporters/tasks.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index ae1b3fb20b2..85932eda509 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -1860,7 +1860,7 @@ def get_model_from_task(
             device = torch.device("cpu")
 
         # TODO : fix EulerDiscreteScheduler loading to enable for SD models
-        if version.parse(torch.__version__) >= version.parse("2.0") and library_name != "diffusers" and not device.type:
+        if version.parse(torch.__version__) >= version.parse("2.0") and library_name != "diffusers" and device.type != "privateuseone":
             with device:
                 # Initialize directly in the requested device, to save allocation time. Especially useful for large
                 # models to initialize on cuda device.
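Taken together, these first two patches converge on one pattern: on PyTorch 2.0+, a `torch.device` can serve as a context manager so that parameters are allocated directly on the target device, while devices of type "privateuseone" (how DirectML devices report themselves) are excluded from that path and fall back to plain construction. A minimal runnable sketch of the pattern, outside the Optimum codebase (`init_on_device` and `make_model` are illustrative names, not Optimum APIs):

```python
import torch
from packaging import version


def init_on_device(make_model, device: torch.device) -> torch.nn.Module:
    """Instantiate a model directly on `device` when the backend allows it."""
    # torch.device works as a context manager from PyTorch 2.0 onward:
    # tensors created inside the block are allocated on that device, which
    # avoids a CPU allocation followed by a copy for large models.
    # The patches exclude "privateuseone" devices (e.g., DirectML) from
    # this path and construct the model normally instead.
    if version.parse(torch.__version__) >= version.parse("2.0") and device.type != "privateuseone":
        with device:
            return make_model()
    return make_model()


# A small module initialized straight on the chosen device.
model = init_on_device(lambda: torch.nn.Linear(16, 4), torch.device("cpu"))
print(next(model.parameters()).device)  # -> cpu
```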
From 4502bc4110ab993e354a5808e3434b32f2161b48 Mon Sep 17 00:00:00 2001
From: Kaz Nishimura
Date: Sun, 25 Feb 2024 10:17:28 +0900
Subject: [PATCH 3/6] Support privateuseone device for PyTorch model export

- Extends device compatibility to "privateuseone" in export_pytorch for
  exporting models usable on specific hardware.

This commit allows exporting PyTorch models compatible with the
"privateuseone" device, potentially enabling inference on specialized
hardware platforms.
---
 optimum/exporters/onnx/convert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py
index 21fc9279428..75e7b3d5278 100644
--- a/optimum/exporters/onnx/convert.py
+++ b/optimum/exporters/onnx/convert.py
@@ -554,7 +554,7 @@ def remap(value):
 
             return value
 
-        if device.type == "cuda" and torch.cuda.is_available():
+        if device.type == "cuda" and torch.cuda.is_available() or device.type == "privateuseone":
             model.to(device)
             dummy_inputs = tree_map(remap, dummy_inputs)

From 99c92b62c82f27caf61804cb827896ffe983bc69 Mon Sep 17 00:00:00 2001
From: Kaz Nishimura
Date: Sun, 25 Feb 2024 15:31:31 +0900
Subject: [PATCH 4/6] Enable DML device support for PyTorch models in Optimum

This commit adds support for running PyTorch models on the DML device
within the Optimum framework.

- Dynamic DML device handling: Introduces dynamic import of
  torch_directml for improved maintainability.
- Consistent device selection: Ensures consistent device selection
  across optimum/exporters/onnx/convert.py, optimum/exporters/tasks.py,
  and optimum/onnxruntime/io_binding/io_binding_helper.py.

This change allows users to leverage DML capabilities for efficient
PyTorch model inference with Optimum.
---
 optimum/exporters/onnx/convert.py                   | 7 ++++++-
 optimum/exporters/tasks.py                          | 6 +++++-
 optimum/onnxruntime/io_binding/io_binding_helper.py | 9 +++++++--
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/optimum/exporters/onnx/convert.py b/optimum/exporters/onnx/convert.py
index 75e7b3d5278..5801955a75e 100644
--- a/optimum/exporters/onnx/convert.py
+++ b/optimum/exporters/onnx/convert.py
@@ -16,6 +16,7 @@
 
 import copy
 import gc
+import importlib
 import multiprocessing as mp
 import os
 import traceback
@@ -546,7 +547,11 @@ def export_pytorch(
     # Check that inputs match, and order them properly
     dummy_inputs = config.generate_dummy_inputs(framework="pt", **input_shapes)
 
-    device = torch.device(device)
+    if device == "dml" and importlib.util.find_spec("torch_directml"):
+        torch_directml = importlib.import_module("torch_directml")
+        device = torch_directml.device()
+    else:
+        device = torch.device(device)
 
     def remap(value):
         if isinstance(value, torch.Tensor):
diff --git a/optimum/exporters/tasks.py b/optimum/exporters/tasks.py
index 85932eda509..cd346ef9e4a 100644
--- a/optimum/exporters/tasks.py
+++ b/optimum/exporters/tasks.py
@@ -1855,7 +1855,11 @@ def get_model_from_task(
             kwargs["torch_dtype"] = torch_dtype
 
         if isinstance(device, str):
-            device = torch.device(device)
+            if device == "dml" and importlib.util.find_spec("torch_directml"):
+                torch_directml = importlib.import_module("torch_directml")
+                device = torch_directml.device()
+            else:
+                device = torch.device(device)
         elif device is None:
             device = torch.device("cpu")
diff --git a/optimum/onnxruntime/io_binding/io_binding_helper.py b/optimum/onnxruntime/io_binding/io_binding_helper.py
index 31da5379184..6f226e9a329 100644
--- a/optimum/onnxruntime/io_binding/io_binding_helper.py
+++ b/optimum/onnxruntime/io_binding/io_binding_helper.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import importlib
 import logging
 import traceback
 from typing import TYPE_CHECKING
@@ -145,8 +146,12 @@ def to_pytorch_via_dlpack(ort_value: OrtValue) -> torch.Tensor:
     @staticmethod
     def get_device_index(device):
         if isinstance(device, str):
-            # could be 'cuda:0', 'cuda:1', or 'cpu'. with cpu, set index=0
-            device = torch.device(device)
+            if device == "dml" and importlib.util.find_spec("torch_directml"):
+                torch_directml = importlib.import_module("torch_directml")
+                device = torch_directml.device()
+            else:
+                # could be 'cuda:0', 'cuda:1', or 'cpu'. with cpu, set index=0
+                device = torch.device(device)
         elif isinstance(device, int):
             return device
         return 0 if device.index is None else device.index
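Two details in these patches are easy to miss. In patch 3, Python's precedence rules parse `A and B or C` as `(A and B) or C`, so a "privateuseone" device is always moved to, regardless of CUDA availability. In patch 4, the importlib-based lookup keeps torch-directml an optional dependency rather than a hard import. A minimal, self-contained sketch of that device-resolution logic, gathered into a single hypothetical helper (`resolve_device` is an illustrative name, not an Optimum API):

```python
import importlib
import importlib.util

import torch


def resolve_device(device):
    """Resolve a device string the way the patched modules do."""
    if isinstance(device, str):
        # find_spec() only checks whether the package is importable, so
        # users without torch-directml installed never trigger (or crash
        # on) the import; import_module() then loads it lazily on demand.
        if device == "dml" and importlib.util.find_spec("torch_directml"):
            torch_directml = importlib.import_module("torch_directml")
            # torch_directml.device() returns a torch.device whose type
            # is "privateuseone", the value the earlier patches test for.
            return torch_directml.device()
        return torch.device(device)
    return device


print(resolve_device("cpu"))      # -> device(type='cpu')
print(resolve_device("cuda:1"))   # -> device(type='cuda', index=1)
```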
From 08253df79e430908d6b1e9da01fe401fd81e6e86 Mon Sep 17 00:00:00 2001
From: Kaz Nishimura
Date: Sun, 25 Feb 2024 15:34:41 +0900
Subject: [PATCH 5/6] Remove redundant DML device handling in optimum_cli.py

This commit removes unnecessary code for handling the DML device in
optimum/commands/optimum_cli.py.

- Redundant import: The code previously imported torch_directml
  conditionally, which is no longer needed because DML device support
  is handled in other parts of the codebase.

This change simplifies the code and avoids potential conflicts.
---
 optimum/commands/optimum_cli.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/optimum/commands/optimum_cli.py b/optimum/commands/optimum_cli.py
index 6bbfa9cee55..4bae9bb5f82 100644
--- a/optimum/commands/optimum_cli.py
+++ b/optimum/commands/optimum_cli.py
@@ -158,10 +158,6 @@ def main():
         parser.print_help()
         exit(1)
 
-    if args.device == "dml":
-        import torch_directml
-        args.device = torch_directml.device()
-
     # Run
     service = args.func(args)
     service.run()

From 107879ef4655bc9faa4694ff5895e0d7c51d6846 Mon Sep 17 00:00:00 2001
From: Kaz Nishimura
Date: Wed, 28 Feb 2024 22:39:05 +0900
Subject: [PATCH 6/6] Add DML-specific dependencies to `setup.py`

This commit updates `setup.py` to include the following changes:

- Introduces a new optional-dependency section "onnxruntime-directml"
  that mirrors the existing "onnxruntime" section, with
  `onnxruntime-directml` in place of `onnxruntime`.
- Introduces a new section "exporters-directml" with dependencies
  required for exporting models for DML inference. This section mirrors
  the existing "exporters" and "exporters-gpu" sections, adding
  `onnxruntime-directml` as a dependency.

This update ensures users have the necessary libraries for working with
DML devices when installing Optimum with DML support.
---
 setup.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/setup.py b/setup.py
index 25a09de985d..d403d62e51b 100644
--- a/setup.py
+++ b/setup.py
@@ -61,8 +61,17 @@
         "protobuf>=3.20.1",
         "accelerate",  # ORTTrainer requires it.
     ],
+    "onnxruntime-directml": [
+        "onnx",
+        "onnxruntime-directml>=1.11.0",
+        "datasets>=1.2.1",
+        "evaluate",
+        "protobuf>=3.20.1",
+        "accelerate",  # ORTTrainer requires it.
+    ],
     "exporters": ["onnx", "onnxruntime", "timm"],
     "exporters-gpu": ["onnx", "onnxruntime-gpu", "timm"],
+    "exporters-directml": ["onnx", "onnxruntime-directml", "timm"],
     "exporters-tf": [
         "tensorflow>=2.4,<=2.12.1",
         "tf2onnx",