From 32a640810d16a5e6d6c1e2dff5ea82a355fc5b6f Mon Sep 17 00:00:00 2001
From: "Zhong, Zhicong"
Date: Thu, 19 Sep 2024 08:20:36 +0000
Subject: [PATCH 1/3] add an option in insertGpuAllocs to skip the func args copy

---
 include/imex/Transforms/Passes.h   |  3 +++
 include/imex/Transforms/Passes.td  |  6 +++++-
 lib/Transforms/InsertGPUAllocs.cpp | 30 +++++++++++++++++++++---------
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/include/imex/Transforms/Passes.h b/include/imex/Transforms/Passes.h
index a697e1b49..e82ac8f90 100644
--- a/include/imex/Transforms/Passes.h
+++ b/include/imex/Transforms/Passes.h
@@ -19,12 +19,15 @@
 #include <memory>
 
 namespace imex {
+struct InsertGPUAllocsOptions;
 //===----------------------------------------------------------------------===//
 // Passes
 //===----------------------------------------------------------------------===//
 std::unique_ptr<mlir::Pass> createSerializeSPIRVPass();
 std::unique_ptr<mlir::Pass>
 createInsertGPUAllocsPass(const char *clientAPI = "vulkan");
+std::unique_ptr<mlir::Pass>
+createInsertGPUAllocsPass(const InsertGPUAllocsOptions &);
 std::unique_ptr<mlir::Pass> createSetSPIRVCapabilitiesPass();
 std::unique_ptr<mlir::Pass>
 createSetSPIRVAbiAttributePass(const char *clientAPI = "vulkan");
diff --git a/include/imex/Transforms/Passes.td b/include/imex/Transforms/Passes.td
index 5717c996d..4de20b13b 100644
--- a/include/imex/Transforms/Passes.td
+++ b/include/imex/Transforms/Passes.td
@@ -41,7 +41,11 @@ def InsertGPUAllocs : Pass<"insert-gpu-allocs", "::mlir::func::FuncOp"> {
     Option<"clientAPI", "client-api", "std::string", /*default=*/"\"opencl\"",
            "The client API to use for inserting gpu allocs">,
     Option<"inRegions", "in-regions", "bool", "false",
-           "Add gpu allocs only for memref.AllocOps within GPU regions">
+           "Add gpu allocs only for memref.AllocOps within GPU regions">,
+    Option<"isUsmArgs", "is-usm-args", "bool", "false",
+           "Whether to use USM (unified shared memory) func args, where host "
+           "and device can access the same buffer so there is no need to "
+           "add an explicit memcpy">
   ];
 }
 
diff --git a/lib/Transforms/InsertGPUAllocs.cpp b/lib/Transforms/InsertGPUAllocs.cpp
index d9e350e2c..178a8ac72 100644
--- a/lib/Transforms/InsertGPUAllocs.cpp
+++ b/lib/Transforms/InsertGPUAllocs.cpp
@@ -47,6 +47,12 @@ class InsertGPUAllocsPass final
   explicit InsertGPUAllocsPass() : m_clientAPI("vulkan") {}
   explicit InsertGPUAllocsPass(const mlir::StringRef &clientAPI)
       : m_clientAPI(clientAPI) {}
+  explicit InsertGPUAllocsPass(const imex::InsertGPUAllocsOptions &options)
+      : InsertGPUAllocsBase(options) {
+    if (clientAPI == "opencl") {
+      m_clientAPI = "opencl";
+    }
+  }
 
   mlir::LogicalResult
   initializeOptions(mlir::StringRef options,
@@ -540,15 +546,17 @@ class InsertGPUAllocsPass final
     // This is the case where the inputs are passed as arguments to the
     // function. This code will add the IR for memory allocation on the device
     // with gpu.alloc and insert a memref.copy from host to device
-    for (const auto &it : gpuBufferParams) {
-      auto param = block.getArgument(it.first);
-      if (isGpuAddrSpace(param))
-        continue;
-      auto access = getAccessType(param);
-      access.hostRead = true;
-      access.hostWrite = true;
-      builder.setInsertionPointToStart(&block);
-      add_gpu_alloc(builder, param, access, term);
+    if (!isUsmArgs.getValue()) {
+      for (const auto &it : gpuBufferParams) {
+        auto param = block.getArgument(it.first);
+        if (isGpuAddrSpace(param))
+          continue;
+        auto access = getAccessType(param);
+        access.hostRead = true;
+        access.hostWrite = true;
+        builder.setInsertionPointToStart(&block);
+        add_gpu_alloc(builder, param, access, term);
+      }
     }
 
     // CallOp Case: This is the case where the memref producer is coming
@@ -580,4 +588,8 @@ namespace imex {
 std::unique_ptr<mlir::Pass> createInsertGPUAllocsPass(const char *clientAPI) {
   return std::make_unique<InsertGPUAllocsPass>(clientAPI);
 }
+std::unique_ptr<mlir::Pass>
+createInsertGPUAllocsPass(const InsertGPUAllocsOptions &option) {
+  return std::make_unique<InsertGPUAllocsPass>(option);
+}
 } // namespace imex
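Usage sketch (illustrative, not taken from the diffs above): with the new overload, a downstream pipeline could enable the USM-args behaviour from C++ roughly as below. It assumes the tablegen-generated InsertGPUAllocsOptions struct is visible through Passes.h and exposes one field per declared option (clientAPI, inRegions, isUsmArgs); buildGpuAllocPipeline is just an illustrative helper name.

#include "imex/Transforms/Passes.h"

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Pass/PassManager.h"

// Nest the pass under func.func (its declared anchor) and ask it to treat
// function arguments as USM buffers, so no gpu.alloc/memref.copy pair is
// inserted for them; only non-argument allocations are rewritten.
static void buildGpuAllocPipeline(mlir::OpPassManager &pm) {
  imex::InsertGPUAllocsOptions options;
  options.clientAPI = "opencl"; // remaining allocs become gpu.alloc
  options.isUsmArgs = true;     // skip the host<->device copy for func args
  pm.addNestedPass<mlir::func::FuncOp>(
      imex::createInsertGPUAllocsPass(options));
}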
From aecf9ab61c4ecebd9f874f6bb952ed9aabe1f842 Mon Sep 17 00:00:00 2001
From: "Zhong, Zhicong"
Date: Sat, 12 Oct 2024 02:51:16 +0000
Subject: [PATCH 2/3] add test

---
 .../skip-gpu-alloc-for-usm-args.mlir          | 54 +++++++++++++++++++
 1 file changed, 54 insertions(+)
 create mode 100644 test/Transforms/InsertGpuAllocs/skip-gpu-alloc-for-usm-args.mlir

diff --git a/test/Transforms/InsertGpuAllocs/skip-gpu-alloc-for-usm-args.mlir b/test/Transforms/InsertGpuAllocs/skip-gpu-alloc-for-usm-args.mlir
new file mode 100644
index 000000000..cd64312ed
--- /dev/null
+++ b/test/Transforms/InsertGpuAllocs/skip-gpu-alloc-for-usm-args.mlir
@@ -0,0 +1,54 @@
+// RUN: imex-opt --insert-gpu-allocs='client-api=opencl is-usm-args=1' %s | FileCheck %s --check-prefix=OPENCL
+// RUN: imex-opt --insert-gpu-allocs='client-api=vulkan is-usm-args=1' %s | FileCheck %s --check-prefix=VULKAN
+
+// OPENCL-LABEL: func.func @addt
+// OPENCL-SAME: %[[arg0:.+]]: memref<2x5xf32>, %[[arg1:.+]]: memref<2x5xf32>, %[[out_buff:.+]]: memref<2x5xf32>
+// VULKAN-LABEL: func.func @addt
+// VULKAN-SAME: %[[arg0:.+]]: memref<2x5xf32>, %[[arg1:.+]]: memref<2x5xf32>, %[[out_buff:.+]]: memref<2x5xf32>
+func.func @addt(%arg0: memref<2x5xf32>, %arg1: memref<2x5xf32>, %out_buff: memref<2x5xf32>) -> memref<2x5xf32> {
+  %c0 = arith.constant 0 : index
+  %c2 = arith.constant 2 : index
+  %c1 = arith.constant 1 : index
+  %c5 = arith.constant 5 : index
+  // OPENCL-NOT: %[[MEMREF0:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
+  // OPENCL-NOT: %[[MEMREF1:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
+  // OPENCL-NOT: memref.copy
+  // OPENCL-NOT: %[[MEMREF2:.*]] = gpu.alloc host_shared () : memref<2x5xf32>
+  // OPENCL-NOT: memref.copy
+
+  // VULKAN-NOT: %[[MEMREF0:.*]] = memref.alloc() : memref<2x5xf32>
+  // VULKAN-NOT: %[[MEMREF1:.*]] = memref.alloc() : memref<2x5xf32>
+  // VULKAN-NOT: memref.copy
+  // VULKAN-NOT: %[[MEMREF2:.*]] = memref.alloc() : memref<2x5xf32>
+  // VULKAN-NOT: memref.copy
+
+  %tmp_buff = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
+  // OPENCL-NOT: %[[MEMREF3:.*]] = memref.alloc().*
+  // OPENCL: %[[MEMREF3:.*]] = gpu.alloc () : memref<2x5xf32>
+  // VULKAN: %[[MEMREF3:.*]] = memref.alloc() {alignment = 128 : i64} : memref<2x5xf32>
+
+  %c1_0 = arith.constant 1 : index
+  %1 = affine.apply affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>(%c2)[%c0, %c1]
+  %2 = affine.apply affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>(%c5)[%c0, %c1]
+  gpu.launch blocks(%arg2, %arg3, %arg4) in (%arg8 = %1, %arg9 = %2, %arg10 = %c1_0) threads(%arg5, %arg6, %arg7) in (%arg11 = %c1_0, %arg12 = %c1_0, %arg13 = %c1_0) {
+    %3 = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%arg2)[%c1, %c0]
+    %4 = affine.apply affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>(%arg3)[%c1, %c0]
+    %5 = memref.load %arg0[%3, %4] : memref<2x5xf32>
+    %6 = memref.load %arg1[%3, %4] : memref<2x5xf32>
+    %7 = arith.addf %5, %6 : f32
+    memref.store %7, %tmp_buff[%3, %4] : memref<2x5xf32>
+
+    %8 = memref.load %tmp_buff[%3, %4] : memref<2x5xf32>
+    %9 = arith.addf %8, %5 : f32
+    memref.store %9, %out_buff[%3, %4] : memref<2x5xf32>
+
+    gpu.terminator
+  } {SCFToGPU_visited}
+
+  // OPENCL-NOT: memref.dealloc %[[MEMREF3]] : memref<2x5xf32>
+  // OPENCL: gpu.dealloc %[[MEMREF3]] : memref<2x5xf32>
+  // VULKAN: memref.dealloc %[[MEMREF3]] : memref<2x5xf32>
+  memref.dealloc %tmp_buff : memref<2x5xf32>
+
+  return %out_buff : memref<2x5xf32>
+}
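The RUN lines in the new test drive the option through imex-opt; the same configuration can also be assembled from the textual pipeline form. A minimal sketch, assuming the pass is linked in and registered and the pass manager is anchored at the module level (the exact nesting can differ per tool setup); buildUsmArgsPipeline is an illustrative name:

#include "mlir/Pass/PassManager.h"
#include "mlir/Pass/PassRegistry.h"

#include "llvm/Support/raw_ostream.h"

// Textual equivalent of: --insert-gpu-allocs='client-api=opencl is-usm-args=1'
mlir::LogicalResult buildUsmArgsPipeline(mlir::PassManager &pm) {
  return mlir::parsePassPipeline(
      "func.func(insert-gpu-allocs{client-api=opencl is-usm-args=1})", pm,
      llvm::errs());
}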
From 4df3eb8935c666c4736fb6e8994898146b9a8f14 Mon Sep 17 00:00:00 2001
From: "Zhong, Zhicong"
Date: Tue, 15 Oct 2024 02:58:28 +0000
Subject: [PATCH 3/3] add bf16 test

---
 .../Transforms/BF16ToGPU/EltwiseAdd.bf16.mlir | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/test/Transforms/BF16ToGPU/EltwiseAdd.bf16.mlir b/test/Transforms/BF16ToGPU/EltwiseAdd.bf16.mlir
index 2a81c56aa..765b3f0d8 100644
--- a/test/Transforms/BF16ToGPU/EltwiseAdd.bf16.mlir
+++ b/test/Transforms/BF16ToGPU/EltwiseAdd.bf16.mlir
@@ -65,3 +65,43 @@ module @eltwise_add attributes {gpu.container_module} {
   }
   func.func private @printMemrefBF16(memref<*xbf16>)
 }
+
+
+module @eltwise_add_usm attributes {gpu.container_module} {
+  memref.global "private" constant @__constant_10x20xbf16 : memref<10x20xbf16> = dense<5.000000e-01>
+  func.func @test(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>) -> memref<10x20xbf16> {
+    %c20 = arith.constant 20 : index
+    %c10 = arith.constant 10 : index
+    %c1 = arith.constant 1 : index
+    %memref_1 = gpu.alloc host_shared () : memref<10x20xbf16>
+    gpu.launch_func @test_kernel::@test_kernel blocks in (%c10, %c20, %c1) threads in (%c1, %c1, %c1) args(%arg0 : memref<10x20xbf16>, %arg1 : memref<10x20xbf16>, %memref_1 : memref<10x20xbf16>)
+    %alloc = memref.alloc() : memref<10x20xbf16>
+    memref.copy %memref_1, %alloc : memref<10x20xbf16> to memref<10x20xbf16>
+    gpu.dealloc %memref_1 : memref<10x20xbf16>
+    return %alloc : memref<10x20xbf16>
+  }
+  gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce, api=OpenCL, #spirv.resource_limits<>>} {
+    gpu.func @test_kernel(%arg0: memref<10x20xbf16>, %arg1: memref<10x20xbf16>, %arg2: memref<10x20xbf16>) kernel attributes {VectorComputeFunctionINTEL, gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 10, 20, 1>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
+      %block_id_x = gpu.block_id x
+      %block_id_y = gpu.block_id y
+      %cst = arith.constant 0.5 : bf16
+      %0 = memref.load %arg0[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      %1 = memref.load %arg1[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      %2 = arith.addf %0, %1 : bf16
+      %3 = arith.addf %2, %cst : bf16
+      memref.store %3, %arg2[%block_id_x, %block_id_y] : memref<10x20xbf16>
+      gpu.return
+    }
+  }
+  func.func @main() {
+    %0 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
+    %1 = memref.get_global @__constant_10x20xbf16 : memref<10x20xbf16>
+    %2 = call @test(%0, %1) : (memref<10x20xbf16>, memref<10x20xbf16>) -> memref<10x20xbf16>
+    %cast = memref.cast %2 : memref<10x20xbf16> to memref<*xbf16>
+    // CHECK: Unranked Memref base@ = {{(0x)?[-9a-f]*}}
+    // CHECK-COUNT-200: 1.5
+    call @printMemrefBF16(%cast) : (memref<*xbf16>) -> ()
+    return
+  }
+  func.func private @printMemrefBF16(memref<*xbf16>) attributes {llvm.emit_c_interface}
+}