Merge pull request #40 from ROCm/rocm-jaxlib-v0.4.31-qa-cleanup
hsharsha authored Nov 7, 2024
2 parents 8eb4ca9 + 23df4ae commit e02959b
Showing 26 changed files with 285 additions and 43 deletions.
2 changes: 1 addition & 1 deletion build_tools/rocm/run_xla.sh
@@ -41,7 +41,7 @@ if [[ -n $1 ]]; then
     ROCM_INSTALL_DIR=$1
 else
     if [[ -z "${ROCM_PATH}" ]]; then
-        ROCM_INSTALL_DIR=/opt/rocm-6.0.2
+        ROCM_INSTALL_DIR=/opt/rocm-6.2.0
     else
         ROCM_INSTALL_DIR=$ROCM_PATH
     fi
35 changes: 35 additions & 0 deletions third_party/llvm/rocdl_shuffle_down.patch
@@ -0,0 +1,35 @@
From a46b9e979ffa523bfed61487a2404e1f48140288 Mon Sep 17 00:00:00 2001
From: Dragan Mladjenovic <[email protected]>
Date: Fri, 29 Mar 2024 12:27:36 +0000
Subject: [PATCH] Support gpu::ShuffleMode::DOWN lowering

---
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e2cb3687d872..9317e30290c6 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -140,7 +140,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

auto int32Type = IntegerType::get(rewriter.getContext(), 32);
Value width = adaptor.getWidth();
Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
@@ -151,6 +151,10 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
// TODO: Use ds_swizzle for XOR when step/offsets are constants for better
// perf.
switch (op.getMode()) {
+ case gpu::ShuffleMode::DOWN:
+ dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
+ adaptor.getOffset());
+ break;
case gpu::ShuffleMode::XOR:
dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
adaptor.getOffset());
--
2.25.1
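
For context, the `DOWN` case added by this patch computes each lane's read source as `srcLaneId + offset`; the `negwidth`/`add` values visible in the context above feed the existing bounds check that keeps the shuffle inside its `width`-sized lane group. Below is a minimal standalone sketch of that lane arithmetic, assuming a power-of-two `width` and the select-based validity check used by the surrounding lowering (a hypothetical helper for illustration, not part of the patch):

#include <cstdint>

// Sketch of the lane math the DOWN lowering emits. Assumes width is a
// power of two, which the (srcLane + width) & -width grouping trick needs.
int32_t ShuffleDownDstLane(int32_t srcLane, int32_t offset, int32_t width) {
  int32_t dstLane = srcLane + offset;             // the line this patch adds
  int32_t groupEnd = (srcLane + width) & -width;  // first lane past the group
  bool isActiveSrcLane = dstLane < groupEnd;      // still inside the group?
  return isActiveSrcLane ? dstLane : srcLane;     // inactive lanes keep their own value
}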

1 change: 1 addition & 0 deletions third_party/llvm/workspace.bzl
@@ -22,6 +22,7 @@ def repo(name):
             "//third_party/llvm:mathextras.patch",
             "//third_party/llvm:toolchains.patch",
             "//third_party/llvm:zstd.patch",
+            "//third_party/llvm:rocdl_shuffle_down.patch",
         ],
         link_files = {"//third_party/llvm:run_lit.sh": "mlir/run_lit.sh"},
     )
35 changes: 35 additions & 0 deletions third_party/tsl/third_party/llvm/rocdl_shuffle_down.patch
@@ -0,0 +1,35 @@
From a46b9e979ffa523bfed61487a2404e1f48140288 Mon Sep 17 00:00:00 2001
From: Dragan Mladjenovic <[email protected]>
Date: Fri, 29 Mar 2024 12:27:36 +0000
Subject: [PATCH] Support gpu::ShuffleMode::DOWN lowering

---
mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
index e2cb3687d872..9317e30290c6 100644
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -140,7 +140,7 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth);

auto int32Type = IntegerType::get(rewriter.getContext(), 32);
Value width = adaptor.getWidth();
Value zero = rewriter.create<LLVM::ConstantOp>(loc, int32Type, 0);
Value negwidth = rewriter.create<LLVM::SubOp>(loc, int32Type, zero, width);
Value add = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId, width);
@@ -151,6 +151,10 @@ struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern<gpu::ShuffleOp> {
// TODO: Use ds_swizzle for XOR when step/offsets are constants for better
// perf.
switch (op.getMode()) {
+ case gpu::ShuffleMode::DOWN:
+ dstLane = rewriter.create<LLVM::AddOp>(loc, int32Type, srcLaneId,
+ adaptor.getOffset());
+ break;
case gpu::ShuffleMode::XOR:
dstLane = rewriter.create<LLVM::XOrOp>(loc, int32Type, srcLaneId,
adaptor.getOffset());
--
2.25.1

1 change: 1 addition & 0 deletions third_party/tsl/third_party/llvm/workspace.bzl
@@ -22,6 +22,7 @@ def repo(name):
             "//third_party/llvm:mathextras.patch",
             "//third_party/llvm:toolchains.patch",
             "//third_party/llvm:zstd.patch",
+            "//third_party/llvm:rocdl_shuffle_down.patch",
         ],
         link_files = {"//third_party/llvm:run_lit.sh": "mlir/run_lit.sh"},
     )
6 changes: 6 additions & 0 deletions xla/pjrt/c/pjrt_c_api_gpu_test.cc
@@ -276,6 +276,12 @@ TEST(PjrtCApiGpuAllocatorTest, ValidOptionsParsing) {
   std::vector<std::string> allocator_options = {"default", "platform", "bfc",
                                                 "cuda_async"};
   for (const std::string& allocator_option : allocator_options) {
+#ifdef TENSORFLOW_USE_ROCM
+    if (allocator_option == "cuda_async") {
+      VLOG(1) << "cuda_async allocator not available on ROCm!";
+      continue;
+    }
+#endif
     absl::flat_hash_map<std::string, xla::PjRtValueType> options = {
         {"allocator", allocator_option},
         {"visible_devices", xla::PjRtValueType(std::vector<int64_t>{0, 1})},
1 change: 1 addition & 0 deletions xla/service/gpu/BUILD
@@ -266,6 +266,7 @@ cc_library(
     testonly = 1,
     srcs = ["gpu_device_info_for_tests.cc"],
     hdrs = ["gpu_device_info_for_tests.h"],
+    local_defines = if_rocm_is_configured(["TENSORFLOW_USE_ROCM=1"]),
     compatible_with = get_compatible_with_portable(),
     deps = [
         "//xla/stream_executor:device_description",
2 changes: 2 additions & 0 deletions xla/service/gpu/determinism_test.cc
@@ -39,6 +39,7 @@ class DeterminismTest : public GpuCodegenTest {
  public:
   DeterminismTest() : debug_options_(HloTestBase::GetDebugOptionsForTest()) {
     debug_options_.set_xla_gpu_exclude_nondeterministic_ops(true);
+    debug_options_.set_xla_gpu_deterministic_ops(true);
     // Randomize timer durations to better test autotuning does not introduce
     // nondeterminism.
     se::gpu::GpuTimer::ReturnRandomDurationsForTesting();
@@ -97,6 +98,7 @@ ENTRY e {
   if (!rocm.has_hipblaslt()) {
     GTEST_SKIP() << "No hipblas-lt support on this architecture!";
   }
+  debug_options_.set_xla_gpu_enable_triton_gemm(false);
 #endif  // TENSORFLOW_USE_ROCM
 
   debug_options_.set_xla_gpu_triton_fusion_level(0);
44 changes: 32 additions & 12 deletions xla/service/gpu/execution_stream_assignment.cc
@@ -34,7 +34,8 @@ limitations under the License.
 
 namespace xla::gpu {
 
-ExecutionStreamAssignment::ExecutionStreamAssignment(const HloModule* module) {
+ExecutionStreamAssignment::ExecutionStreamAssignment(
+    const HloModule* module, ExecutionStreamAssignmentOptions options) {
   std::unique_ptr<CallGraph> call_graph = CallGraph::Build(module);
 
   // We'll walk the `CallGraph` starting from the entrypoint. The instructions
@@ -69,6 +70,22 @@ ExecutionStreamAssignment::ExecutionStreamAssignment(const HloModule* module) {
     }
   };
 
+  // Assigns source and destination streams to an instruction and records it in
+  // async_instructions_.
+  auto assign_async_execution_streams =
+      [&](HloInstruction* instruction, ExecutionStreamId source_stream_id) {
+        AsyncExecutionStreamIds streams;
+        streams.source_stream_id = source_stream_id;
+        streams.destination_stream_id = next_stream_id;
+
+        CHECK(async_instructions_.try_emplace(instruction, streams).second);
+
+        next_stream_id++;
+        if (next_stream_id.value() > options.number_of_execution_streams) {
+          next_stream_id = ExecutionStreamId(1);
+        }
+      };
+
   while (!queue.empty()) {
     Pending pending = queue.front();
     queue.pop_front();
@@ -77,8 +94,14 @@ ExecutionStreamAssignment::ExecutionStreamAssignment(const HloModule* module) {
     // instructions. Asynchronous instructions will be handled afterwards.
     for (HloInstruction* instruction : pending.node->instructions()) {
       if (instruction->IsAsynchronous()) continue;
-      CHECK(sync_instructions_.try_emplace(instruction, pending.stream_id)
-                .second);
+      if (instruction->opcode() == HloOpcode::kCopyStart) {
+        // CopyStart is morally an async instruction, so we treat it
+        // as one here.
+        assign_async_execution_streams(instruction, pending.stream_id);
+      } else {
+        CHECK(sync_instructions_.try_emplace(instruction, pending.stream_id)
+                  .second);
+      }
     }
 
     // Next, we'll process all callsites in the current computation.
@@ -88,14 +111,9 @@ ExecutionStreamAssignment::ExecutionStreamAssignment(const HloModule* module) {
       // Asynchronous calls will result in a new `ExecutionStreamId` being
       // dispensed for the called computations.
       CHECK_EQ(callsite.instruction()->opcode(), HloOpcode::kAsyncStart);
-      const ExecutionStreamId async_stream_id = next_stream_id++;
-      enqueue_called_computations(callsite, async_stream_id);
-
-      AsyncExecutionStreamIds streams;
-      streams.source_stream_id = pending.stream_id;
-      streams.destination_stream_id = async_stream_id;
-      CHECK(async_instructions_.try_emplace(callsite.instruction(), streams)
-                .second);
+      enqueue_called_computations(callsite, next_stream_id);
+      assign_async_execution_streams(callsite.instruction(),
+                                     pending.stream_id);
     } else {
       // Synchronous calls will result in the called computations being
       // invoked using the same `ExecutionStreamId`.
@@ -146,7 +164,9 @@ ExecutionStreamAssignment::GetSyncExecutionStreamId(
 
 absl::StatusOr<ExecutionStreamAssignment::AsyncExecutionStreamIds>
 ExecutionStreamAssignment::GetAsyncExecutionStreamIds(
-    const HloAsyncInstruction* instruction) const {
+    const HloInstruction* instruction) const {
+  CHECK(instruction->IsAsynchronous() ||
+        instruction->opcode() == HloOpcode::kCopyStart);
   auto streams = async_instructions_.find(instruction);
   if (streams == async_instructions_.end()) {
     return StreamNotFoundError(instruction);
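
The round-robin introduced above hands out destination ids from `next_stream_id` and wraps back to `ExecutionStreamId(1)` once the id would exceed `options.number_of_execution_streams`; stream 0 is left for the synchronous entry computation. A minimal sketch of that counter behavior, with a plain `int` standing in for `ExecutionStreamId` (illustration only, not XLA code):

#include <cstdio>

int main() {
  const int number_of_execution_streams = 2;  // mirrors the new option
  int next_stream_id = 1;                     // stream 0 stays synchronous
  for (int i = 0; i < 5; ++i) {
    // Each async-start (and now copy-start) takes the current id...
    std::printf("async start %d -> destination stream %d\n", i, next_stream_id);
    // ...then the counter advances and wraps, as in
    // assign_async_execution_streams.
    if (++next_stream_id > number_of_execution_streams) next_stream_id = 1;
  }
  return 0;  // destinations printed: 1, 2, 1, 2, 1
}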
11 changes: 9 additions & 2 deletions xla/service/gpu/execution_stream_assignment.h
@@ -26,6 +26,12 @@ limitations under the License.
 
 namespace xla::gpu {
 
+struct ExecutionStreamAssignmentOptions {
+  // The `ExecutionStreamAssignment` will round-robin across this many
+  // `ExecutionStreams`.
+  int number_of_execution_streams = 4;
+};
+
 // `ExecutionStreamAssignments` represent a mapping from `HloInstructions` to
 // `ExecutionStreamIds`. Asynchronous calls (`async-start`, `async-update`, and
 // `async-done`) result in the target computations being assigned new
@@ -37,7 +43,8 @@ class ExecutionStreamAssignment {
   // pass the module through the `FlattenCallGraph` pass.
   //
   // The ExecutionStreamAssignment does not take ownership of the `HloModule`.
-  explicit ExecutionStreamAssignment(const HloModule* module);
+  explicit ExecutionStreamAssignment(
+      const HloModule* module, ExecutionStreamAssignmentOptions options = {});
 
   // Returns the `ExecutionStreamId` for the given instruction, which *must* be
   // synchronous. Returns an error if the instruction is either not reachable
@@ -58,7 +65,7 @@ class ExecutionStreamAssignment {
     ExecutionStreamId destination_stream_id;
   };
   absl::StatusOr<AsyncExecutionStreamIds> GetAsyncExecutionStreamIds(
-      const HloAsyncInstruction* instruction) const;
+      const HloInstruction* instruction) const;
 
  private:
   // Maps from `HloInstructions` to `ExecutionStreamIds` for synchronous and
51 changes: 48 additions & 3 deletions xla/service/gpu/execution_stream_assignment_test.cc
@@ -69,6 +69,10 @@ TEST_F(ExecutionStreamAssignmentTest, AsyncFusion) {
     p0 = f32[2,2] parameter(0)
     ROOT add = f32[2,2] add(p0, p0)
   }
+  leaf3 {
+    p0 = f32[2,2] parameter(0)
+    ROOT add = f32[2,2] add(p0, p0)
+  }
 
   // Entry computation that calls each of the leaves asynchronously.
   ENTRY entry {
@@ -77,21 +81,30 @@ TEST_F(ExecutionStreamAssignmentTest, AsyncFusion) {
       kind=kLoop, calls=leaf1
     start2 = ((f32[2,2]), f32[2,2], s32[]) fusion-start(p0),
       kind=kLoop, calls=leaf2
+    start3 = ((f32[2,2]), f32[2,2], s32[]) fusion-start(p0),
+      kind=kLoop, calls=leaf3
     update1 = ((f32[2,2]), f32[2,2], s32[]) fusion-update(start1)
     update2 = ((f32[2,2]), f32[2,2], s32[]) fusion-update(start2)
+    update3 = ((f32[2,2]), f32[2,2], s32[]) fusion-update(start3)
     done1 = f32[2,2] fusion-done(update1)
     done2 = f32[2,2] fusion-done(update2)
-    ROOT done = f32[2,2] add(done1, done2)
+    done3 = f32[2,2] fusion-done(update3)
+    ROOT done = f32[2,2] custom-call(done1, done2, done3),
+      custom_call_target="target"
   }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
                           ParseAndReturnVerifiedModule(kModuleStr));
 
-  ExecutionStreamAssignment assignment(module.get());
+  ExecutionStreamAssignment assignment(
+      module.get(),
+      ExecutionStreamAssignmentOptions{/*number_of_execution_streams=*/2});
 
   // The outermost computation should run on `ExecutionStreamId(0)`. The two
   // asynchronous branches should be launched on `ExecutionStreamId(1)` and
-  // `ExecutionStreamId(2)`, respectively.
+  // `ExecutionStreamId(2)`, respectively. The third asynchronous branch should
+  // reuse `ExecutionStreamId(1)` because we set `number_of_execution_streams`
+  // to `2`.
   ExpectExecutionStreamForSyncInstructions(
       assignment, FindComputation(module.get(), "entry"), ExecutionStreamId(0));
   for (std::string_view instruction : {"start1", "update1", "done1"}) {
@@ -108,6 +121,13 @@ TEST_F(ExecutionStreamAssignmentTest, AsyncFusion) {
                     /*source_stream_id=*/ExecutionStreamId(0),
                     /*destination_stream_id=*/ExecutionStreamId(2)}));
   }
+  for (std::string_view instruction : {"start3", "update3", "done3"}) {
+    EXPECT_THAT(assignment.GetAsyncExecutionStreamIds(Cast<HloAsyncInstruction>(
+                    FindInstruction(module.get(), instruction))),
+                IsOkAndHolds(AsyncExecutionStreamIds{
+                    /*source_stream_id=*/ExecutionStreamId(0),
+                    /*destination_stream_id=*/ExecutionStreamId(1)}));
+  }
 
   // Leaf computations should run on the respective asynchronous
   // `ExecutionStreamIds`.
@@ -123,6 +143,31 @@ TEST_F(ExecutionStreamAssignmentTest, AsyncFusion) {
                                            ExecutionStreamId(2));
 }
 
+TEST_F(ExecutionStreamAssignmentTest, CopyStartStreamIdTest) {
+  const char* const hlo_copy_start_string = R"(
+  HloModule Module
+  ENTRY CopyStartAndCopyDone {
+    p0 = f32[2,3]{1,0:S(1)} parameter(0)
+    copy-start = (f32[2,3]{1,0:S(2)}, f32[2,3]{1,0:S(1)}, u32[]) copy-start(p0)
+    ROOT copy-done = f32[2,3]{1,0:S(2)} copy-done(copy-start)
+  }
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(hlo_copy_start_string));
+
+  ExecutionStreamAssignment assignment(module.get());
+
+  for (std::string_view instruction : {"copy-start"}) {
+    EXPECT_THAT(
+        assignment.GetAsyncExecutionStreamIds(Cast<HloCopyStartInstruction>(
+            FindInstruction(module.get(), instruction))),
+        IsOkAndHolds(AsyncExecutionStreamIds{
+            /*source_stream_id=*/ExecutionStreamId(0),
+            /*destination_stream_id=*/ExecutionStreamId(1)}));
+  }
+}
+
 TEST_F(ExecutionStreamAssignmentTest, FusionComputations) {
   const char* kModuleStr = R"(
   HloModule m
4 changes: 4 additions & 0 deletions xla/service/gpu/fusions/mlir/BUILD
@@ -210,6 +210,8 @@ cc_library(
         "@llvm-project//mlir:NVVMToLLVMIRTranslation",
         "@llvm-project//mlir:Pass",
        "@llvm-project//mlir:ReconcileUnrealizedCasts",
+        "@llvm-project//mlir:ROCDLDialect",
+        "@llvm-project//mlir:ROCDLToLLVMIRTranslation",
         "@llvm-project//mlir:SCFDialect",
         "@llvm-project//mlir:Support",
         "@llvm-project//mlir:TensorDialect",
@@ -255,6 +257,8 @@ xla_cc_test(
         "@llvm-project//mlir:NVVMDialect",
         "@llvm-project//mlir:NVVMToLLVMIRTranslation",
         "@llvm-project//mlir:Pass",
+        "@llvm-project//mlir:ROCDLDialect",
+        "@llvm-project//mlir:ROCDLToLLVMIRTranslation",
         "@llvm-project//mlir:SCFDialect",
         "@llvm-project//mlir:TensorDialect",
         "@tsl//tsl/platform:statusor",
(Diff truncated: the remaining 14 of the 26 changed files did not load on this page.)