Merge pull request #40 from ROCm/rocm-jaxlib-v0.4.31-qa-cleanup
hsharsha authored Nov 7, 2024
2 parents 8eb4ca9 + 23df4ae commit e02959b
Showing 26 changed files with 285 additions and 43 deletions.
2 changes: 1 addition & 1 deletion build_tools/rocm/run_xla.sh
@@ -41,7 +41,7 @@ if [[ -n $1 ]]; then
     ROCM_INSTALL_DIR=$1
 else
     if [[ -z "${ROCM_PATH}" ]]; then
-        ROCM_INSTALL_DIR=/opt/rocm-6.0.2
+        ROCM_INSTALL_DIR=/opt/rocm-6.2.0
     else
         ROCM_INSTALL_DIR=$ROCM_PATH
     fi
35 changes: 35 additions & 0 deletions third_party/llvm/rocdl_shuffle_down.patch
@@ -0,0 +1,35 @@
From a46b9e979ffa523bfed61487a2404e1f48140288 Mon Sep 17 00:00:00 2001
From: Dragan Mladjenovic <[email protected]>
Date: Fri, 29 Mar 2024 12:27:36 +0000
Subject: [PATCH] Support gpu::ShuffleMode::DOWN lowering

---
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e2cb3687d872..9317e30290c6 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -140,7 +140,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

auto int32Type = IntegerType::get(rewriter.getContext(), 32);
Value width = adaptor.getWidth();
Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
@@ -151,6 +151,10 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
// TODO: Use ds_swizzle for XOR when step/offsets are constants for better
// perf.
switch (op.getMode()) {
+ case gpu::ShuffleMode::DOWN:
+ dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
+ adaptor.getOffset());
+ break;
case gpu::ShuffleMode::XOR:
dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
adaptor.getOffset());
--
2.25.1
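
For context, the `DOWN` case added by this patch computes each lane's read source as `srcLaneId + offset`; the `negwidth`/`add` values visible in the context above feed the existing bounds check that keeps the shuffle inside its `width`-sized lane group. Below is a minimal standalone sketch of that lane arithmetic, assuming a power-of-two `width` and the select-based validity check used by the surrounding lowering (a hypothetical helper for illustration, not part of the patch):

#include <cstdint>

// Sketch of the lane math the DOWN lowering emits. Assumes width is a
// power of two, which the (srcLane + width) & -width grouping trick needs.
int32_t ShuffleDownDstLane(int32_t srcLane, int32_t offset, int32_t width) {
  int32_t dstLane = srcLane + offset;             // the line this patch adds
  int32_t groupEnd = (srcLane + width) & -width;  // first lane past the group
  bool isActiveSrcLane = dstLane < groupEnd;      // still inside the group?
  return isActiveSrcLane ? dstLane : srcLane;     // inactive lanes keep their own value
}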

1 change: 1 addition & 0 deletions third_party/llvm/workspace.bzl
@@ -22,6 +22,7 @@ def repo(name):
             "//third_party/llvm:mathextras.patch",
             "//third_party/llvm:toolchains.patch",
             "//third_party/llvm:zstd.patch",
+            "//third_party/llvm:rocdl_shuffle_down.patch",
         ],
         link_files = {"//third_party/llvm:run_lit.sh": "mlir/run_lit.sh"},
     )
35 changes: 35 additions & 0 deletions third_party/tsl/third_party/llvm/rocdl_shuffle_down.patch
@@ -0,0 +1,35 @@
From a46b9e979ffa523bfed61487a2404e1f48140288 Mon Sep 17 00:00:00 2001
From: Dragan Mladjenovic <[email protected]>
Date: Fri, 29 Mar 2024 12:27:36 +0000
Subject: [PATCH] Support gpu::ShuffleMode::DOWN lowering

---
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e2cb3687d872..9317e30290c6 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -140,7 +140,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

auto int32Type = IntegerType::get(rewriter.getContext(), 32);
Value width = adaptor.getWidth();
Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
@@ -151,6 +151,10 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
// TODO: Use ds_swizzle for XOR when step/offsets are constants for better
// perf.
switch (op.getMode()) {
+ case gpu::ShuffleMode::DOWN:
+ dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
+ adaptor.getOffset());
+ break;
case gpu::ShuffleMode::XOR:
dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
adaptor.getOffset());
--
2.25.1

1 change: 1 addition & 0 deletions third_party/tsl/third_party/llvm/workspace.bzl
@@ -22,6 +22,7 @@ def repo(name):
             "//third_party/llvm:mathextras.patch",
             "//third_party/llvm:toolchains.patch",
             "//third_party/llvm:zstd.patch",
+            "//third_party/llvm:rocdl_shuffle_down.patch",
         ],
         link_files = {"//third_party/llvm:run_lit.sh": "mlir/run_lit.sh"},
     )
6 changes: 6 additions & 0 deletions xla/pjrt/c/pjrt_c_api_gpu_test.cc
@@ -276,6 +276,12 @@ TEST(PjrtCApiGpuAllocatorTest, ValidOptionsParsing) {
   std::vector<std::string> allocator_options = {"default", "platform", "bfc",
                                                 "cuda_async"};
   for (const std::string& allocator_option : allocator_options) {
+#ifdef TENSORFLOW_USE_ROCM
+    if (allocator_option == "cuda_async") {
+      VLOG(1) << "cuda_async allocator not available on ROCm!";
+      continue;
+    }
+#endif
     absl::flat_hash_map<std::string, xla::PjRtValueType> options = {
         {"allocator", allocator_option},
         {"visible_devices", xla::PjRtValueType(std::vector<int64_t>{0, 1})},
1 change: 1 addition & 0 deletions xla/service/gpu/BUILD
@@ -266,6 +266,7 @@ cc_library(
     testonly = 1,
     srcs = ["gpu_device_info_for_tests.cc"],
     hdrs = ["gpu_device_info_for_tests.h"],
+    local_defines = if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
     compatible_with = get_compatible_with_portable(),
     deps = [
         "//xla/stream_executor:device_description",
2 changes: 2 additions & 0 deletions xla/service/gpu/determinism_test.cc
@@ -39,6 +39,7 @@ class DeterminismTest : public GpuCodegenTest {
  public:
   DeterminismTest() : debug_options_(HloTestBase::GetDebugOptionsForTest()) {
     debug_options_.set_xla_gpu_exclude_nondeterministic_ops(true);
+    debug_options_.set_xla_gpu_deterministic_ops(true);
     // Randomize timer durations to better test autotuning does not introduce
     // nondeterminism.
     se::gpu::GpuTimer::ReturnRandomDurationsForTesting();
@@ -97,6 +98,7 @@ ENTRY e {
   if (!rocm.has_hipblaslt()) {
     GTEST_SKIP() << "No hipblas-lt support on this architecture!";
   }
+  debug_options_.set_xla_gpu_enable_triton_gemm(false);
 #endif  // TENSORFLOW_USE_ROCM
 
   debug_options_.set_xla_gpu_triton_fusion_level(0);
44 changes: 32 additions & 12 deletions xla/service/gpu/execution_stream_assignment.cc
@@ -34,7 +34,8 @@ limitations under the License.
 
 namespace xla::gpu {
 
-ExecutionStreamAssignment::ExecutionStreamAssignment(const HloModule* module) {
+ExecutionStreamAssignment::ExecutionStreamAssignment(
+    const HloModule* module, ExecutionStreamAssignmentOptions options) {
   std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
 
   // We'll walk the `CallGraph` starting from the entrypoint. The instructions
@@ -69,6 +70,22 @@ ExecutionStreamAssignment::ExecutionStreamAssignment(const HloModule* module) {
     }
   };
 
+  // Assigns source and destination streams to an instruction and records it in
+  // async_instructions_.
+  auto assign_async_execution_streams =
+      [&](HloInstruction* instruction, ExecutionStreamId source_stream_id) {
+        AsyncExecutionStreamIds streams;
+        streams.source_stream_id = source_stream_id;
+        streams.destination_stream_id = next_stream_id;
+
+        CHECK(async_instructions_.try_emplace(instruction, streams).second);
+
+        next_stream_id++;
+        if (next_stream_id.value() > options.number_of_execution_streams) {
+          next_stream_id = ExecutionStreamId(1);
+        }
+      };
+
   while (!queue.empty()) {
     Pending pending = queue.front();
     queue.pop_front();
@@ -77,8 +94,14 @@ ExecutionStreamAssignment::ExecutionStreamAssignment(const HloModule* module) {
     // instructions. Asynchronous instructions will be handled afterwards.
     for (HloInstruction* instruction : pending.node->instructions()) {
       if (instruction->IsAsynchronous()) continue;
-      CHECK(sync_instructions_.try_emplace(instruction, pending.stream_id)
-                .second);
+      if (instruction->opcode() == HloOpcode::kCopyStart) {
+        // CopyStart is morally an async instruction, so we treat it
+        // as one here.
+        assign_async_execution_streams(instruction, pending.stream_id);
+      } else {
+        CHECK(sync_instructions_.try_emplace(instruction, pending.stream_id)
+                  .second);
+      }
     }
 
     // Next, we'll process all callsites in the current computation.
@@ -88,14 +111,9 @@ ExecutionStreamAssignment::ExecutionStreamAssignment(const HloModule* module) {
       // Asynchronous calls will result in a new `ExecutionStreamId` being
       // dispensed for the called computations.
       CHECK_EQ(callsite.instruction()->opcode(), HloOpcode::kAsyncStart);
-      const ExecutionStreamId async_stream_id = next_stream_id++;
-      enqueue_called_computations(callsite, async_stream_id);
-
-      AsyncExecutionStreamIds streams;
-      streams.source_stream_id = pending.stream_id;
-      streams.destination_stream_id = async_stream_id;
-      CHECK(async_instructions_.try_emplace(callsite.instruction(), streams)
-                .second);
+      enqueue_called_computations(callsite, next_stream_id);
+      assign_async_execution_streams(callsite.instruction(),
+                                     pending.stream_id);
     } else {
       // Synchronous calls will result in the called computations being
       // invoked using the same `ExecutionStreamId`.
@@ -146,7 +164,9 @@ ExecutionStreamAssignment::GetSyncExecutionStreamId(
 
 absl::StatusOr<ExecutionStreamAssignment::AsyncExecutionStreamIds>
 ExecutionStreamAssignment::GetAsyncExecutionStreamIds(
-    const HloAsyncInstruction* instruction) const {
+    const HloInstruction* instruction) const {
+  CHECK(instruction->IsAsynchronous() ||
+        instruction->opcode() == HloOpcode::kCopyStart);
   auto streams = async_instructions_.find(instruction);
   if (streams == async_instructions_.end()) {
     return StreamNotFoundError(instruction);
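
The round-robin introduced above hands out destination ids from `next_stream_id` and wraps back to `ExecutionStreamId(1)` once the id would exceed `options.number_of_execution_streams`; stream 0 is left for the synchronous entry computation. A minimal sketch of that counter behavior, with a plain `int` standing in for `ExecutionStreamId` (illustration only, not XLA code):

#include <cstdio>

int main() {
  const int number_of_execution_streams = 2;  // mirrors the new option
  int next_stream_id = 1;                     // stream 0 stays synchronous
  for (int i = 0; i < 5; ++i) {
    // Each async-start (and now copy-start) takes the current id...
    std::printf("async start %d -> destination stream %d\n", i, next_stream_id);
    // ...then the counter advances and wraps, as in
    // assign_async_execution_streams.
    if (++next_stream_id > number_of_execution_streams) next_stream_id = 1;
  }
  return 0;  // destinations printed: 1, 2, 1, 2, 1
}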
11 changes: 9 additions & 2 deletions xla/service/gpu/execution_stream_assignment.h
@@ -26,6 +26,12 @@ limitations under the License.
 
 namespace xla::gpu {
 
+struct ExecutionStreamAssignmentOptions {
+  // The `ExecutionStreamAssignment` will round-robin across this many
+  // `ExecutionStreams`.
+  int number_of_execution_streams = 4;
+};
+
 // `ExecutionStreamAssignments` represent a mapping from `HloInstructions` to
 // `ExecutionStreamIds`. Asynchronous calls (`async-start`, `async-update`, and
 // `async-done`) result in the target computations being assigned new
@@ -37,7 +43,8 @@ class ExecutionStreamAssignment {
   // pass the module through the `FlattenCallGraph` pass.
   //
   // The ExecutionStreamAssignment does not take ownership of the `HloModule`.
-  explicit ExecutionStreamAssignment(const HloModule* module);
+  explicit ExecutionStreamAssignment(
+      const HloModule* module, ExecutionStreamAssignmentOptions options = {});
 
   // Returns the `ExecutionStreamId` for the given instruction, which *must* be
   // synchronous. Returns an error if the instruction is either not reachable
@@ -58,7 +65,7 @@ class ExecutionStreamAssignment {
     ExecutionStreamId destination_stream_id;
   };
   absl::StatusOr<AsyncExecutionStreamIds> GetAsyncExecutionStreamIds(
-      const HloAsyncInstruction* instruction) const;
+      const HloInstruction* instruction) const;
 
  private:
   // Maps from `HloInstructions` to `ExecutionStreamIds` for synchronous and
51 changes: 48 additions & 3 deletions xla/service/gpu/execution_stream_assignment_test.cc
@@ -69,6 +69,10 @@ TEST_F(ExecutionStreamAssignmentTest, AsyncFusion) {
     p0 = f32[2,2] parameter(0)
     ROOT add = f32[2,2] add(p0, p0)
   }
+  leaf3 {
+    p0 = f32[2,2] parameter(0)
+    ROOT add = f32[2,2] add(p0, p0)
+  }
 
   // Entry computation that calls each of the leaves asynchronously.
   ENTRY entry {
@@ -77,21 +81,30 @@ TEST_F(ExecutionStreamAssignmentTest, AsyncFusion) {
       kind=kLoop, calls=leaf1
     start2 = ((f32[2,2]), f32[2,2], s32[]) fusion-start(p0),
       kind=kLoop, calls=leaf2
+    start3 = ((f32[2,2]), f32[2,2], s32[]) fusion-start(p0),
+      kind=kLoop, calls=leaf3
     update1 = ((f32[2,2]), f32[2,2], s32[]) fusion-update(start1)
     update2 = ((f32[2,2]), f32[2,2], s32[]) fusion-update(start2)
+    update3 = ((f32[2,2]), f32[2,2], s32[]) fusion-update(start3)
     done1 = f32[2,2] fusion-done(update1)
     done2 = f32[2,2] fusion-done(update2)
-    ROOT done = f32[2,2] add(done1, done2)
+    done3 = f32[2,2] fusion-done(update3)
+    ROOT done = f32[2,2] custom-call(done1, done2, done3),
+      custom_call_target="target"
   }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(kModuleStr));
 
-  ExecutionStreamAssignment assignment(module.get());
+  ExecutionStreamAssignment assignment(
+      module.get(),
+      ExecutionStreamAssignmentOptions{/*number_of_execution_streams=*/2});
 
   // The outermost computation should run on `ExecutionStreamId(0)`. The two
   // asynchronous branches should be launched on `ExecutionStreamId(1)` and
-  // `ExecutionStreamId(2)`, respectively.
+  // `ExecutionStreamId(2)`, respectively. The third asynchronous branch should
+  // reuse `ExecutionStreamId(1)` because we set `number_of_execution_streams`
+  // to `2`.
   ExpectExecutionStreamForSyncInstructions(
       assignment, FindComputation(module.get(), "entry"), ExecutionStreamId(0));
   for (std::string_view instruction : {"start1", "update1", "done1"}) {
@@ -108,6 +121,13 @@ TEST_F(ExecutionStreamAssignmentTest, AsyncFusion) {
                     /*source_stream_id=*/ExecutionStreamId(0),
                     /*destination_stream_id=*/ExecutionStreamId(2)}));
   }
+  for (std::string_view instruction : {"start3", "update3", "done3"}) {
+    EXPECT_THAT(assignment.GetAsyncExecutionStreamIds(Cast<HloAsyncInstruction>(
+                    FindInstruction(module.get(), instruction))),
+                IsOkAndHolds(AsyncExecutionStreamIds{
+                    /*source_stream_id=*/ExecutionStreamId(0),
+                    /*destination_stream_id=*/ExecutionStreamId(1)}));
+  }
 
   // Leaf computations should run on the respective asynchronous
   // `ExecutionStreamIds`.
@@ -123,6 +143,31 @@ TEST_F(ExecutionStreamAssignmentTest, AsyncFusion) {
                                            ExecutionStreamId(2));
 }
 
+TEST_F(ExecutionStreamAssignmentTest, CopyStartStreamIdTest) {
+  const char* const hlo_copy_start_string = R"(
+  HloModule Module
+  ENTRY CopyStartAndCopyDone {
+    p0 = f32[2,3]{1,0:S(1)} parameter(0)
+    copy-start = (f32[2,3]{1,0:S(2)}, f32[2,3]{1,0:S(1)}, u32[]) copy-start(p0)
+    ROOT copy-done = f32[2,3]{1,0:S(2)} copy-done(copy-start)
+  }
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_copy_start_string));
+
+  ExecutionStreamAssignment assignment(module.get());
+
+  for (std::string_view instruction : {"copy-start"}) {
+    EXPECT_THAT(
+        assignment.GetAsyncExecutionStreamIds(Cast<HloCopyStartInstruction>(
+            FindInstruction(module.get(), instruction))),
+        IsOkAndHolds(AsyncExecutionStreamIds{
+            /*source_stream_id=*/ExecutionStreamId(0),
+            /*destination_stream_id=*/ExecutionStreamId(1)}));
+  }
+}
+
 TEST_F(ExecutionStreamAssignmentTest, FusionComputations) {
   const char* kModuleStr = R"(
   HloModule m
4 changes: 4 additions & 0 deletions xla/service/gpu/fusions/mlir/BUILD
@@ -210,6 +210,8 @@ cc_library(
         "@llvm-project//mlir:NVVMToLLVMIRTranslation",
         "@llvm-project//mlir:Pass",
        "@llvm-project//mlir:ReconcileUnrealizedCasts",
+        "@llvm-project//mlir:ROCDLDialect",
+        "@llvm-project//mlir:ROCDLToLLVMIRTranslation",
         "@llvm-project//mlir:SCFDialect",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
@@ -255,6 +257,8 @@ xla_cc_test(
         "@llvm-project//mlir:NVVMDialect",
         "@llvm-project//mlir:NVVMToLLVMIRTranslation",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:ROCDLDialect",
+        "@llvm-project//mlir:ROCDLToLLVMIRTranslation",
         "@llvm-project//mlir:SCFDialect",
         "@llvm-project//mlir:TensorDialect",
         "@tsl//tsl/platform:statusor",
(Diff truncated: the remaining 14 of the 26 changed files did not load on this page.)