From d2feee819a5ee81a5dfa2f82b2cd58631efcff6d Mon Sep 17 00:00:00 2001
From: Tom Benson <30674819+benson31@users.noreply.github.com>
Date: Fri, 5 Jun 2020 15:37:07 -0700
Subject: [PATCH] Support HIP/ROCm backends for GPUs (#101)

* Add hydrogen error handling mechanisms
* new cuda management infrastructure
* everything in rocm compiles i think. linker issues pending
* remove override decoration from Element/BlockMatrix functions
* patch for finding rocblas; not sure if this is strictly necessary any more
* forward kernel arguments by reference
* a few tweaks to the CMakeLists
* Make sure ROCm and CUDA aren't enabled at the same time.
* correct a discrepancy in hipMemcpy2DAsync semantics
* clean up HAVE_CUDA macro usage; streamline copy syntax
* use nonblocking stream; clean up the mempool
* straggler HAVE_CUDA use in include tree
* preprocessor macro cleanup in blaslike tests
* Remove debugging print statements
* add short-circuit returns to copy/fill routines when size is zero
* some cleanup
* fix some new rocm issues
* update aluminum version number
* update version number
* remove some unneeded CMake
* revert changes related to the hip override bug
* add support for hipCUB and generalize cublas tensor option
* fix annoying clang warnings (that GCC _should_ throw, too, but it doesn't)
* address some review comments
* fix use of streams that should have been SyncInfos
* Clean up device library functions
* cleanup timer nonsense in Gemm test
* fix some hipCUB linkage
* Apply suggestions from code review
  Co-authored-by: Tim Moon
* Apply suggestions from code review
  Co-authored-by: Tim Moon
* remove unneeded metafunction. DiHydrogen has a cleaner implementation anyway.
---
 CMakeLists.txt                                | 133 ++++--
 cmake/configure_files/HydrogenConfig.cmake.in |  27 +-
 .../HydrogenConfigVersion.cmake.in            |   6 +-
 cmake/configure_files/hydrogen_config.h.in    |  15 +-
 cmake/modules/FindROCBLAS.cmake               |  46 ++
 include/El/blas_like/level1/AllReduce.hpp     |   4 +-
 include/El/blas_like/level1/Axpy.hpp          |   8 +-
 include/El/blas_like/level1/Axpy/util.hpp     |   8 +-
 include/El/blas_like/level1/AxpyContract.hpp  |   4 +-
 include/El/blas_like/level1/Broadcast.hpp     |   8 +-
 include/El/blas_like/level1/Contract.hpp      |   4 +-
 include/El/blas_like/level1/Copy.hpp          |   6 +
 .../El/blas_like/level1/Copy/ColAllGather.hpp |   4 +-
 .../El/blas_like/level1/Copy/ColFilter.hpp    |   4 +-
 include/El/blas_like/level1/Copy/Exchange.hpp |   4 +-
 .../level1/Copy/PartialColFilter.hpp          |   4 +-
 .../level1/Copy/PartialRowAllGather.hpp       |   4 +-
 .../level1/Copy/PartialRowFilter.hpp          |   4 +-
 .../El/blas_like/level1/Copy/RowAllGather.hpp |   4 +-
 .../El/blas_like/level1/Copy/RowFilter.hpp    |   4 +-
 .../blas_like/level1/Copy/TransposeDist.hpp   |   4 +-
 .../blas_like/level1/Copy/internal_decl.hpp   |   4 +-
 include/El/blas_like/level1/Copy/util.hpp     |  64 ++-
 .../blas_like/level1/CopyAsyncDistMatrix.hpp  |   4 +-
 .../El/blas_like/level1/CopyAsyncLocal.hpp    |   8 +-
 include/El/blas_like/level1/CopyLocal.hpp     |  45 +-
 include/El/blas_like/level1/DiagonalScale.hpp |   8 +-
 include/El/blas_like/level1/Dot.hpp           |   4 +-
 include/El/blas_like/level1/EntrywiseFill.hpp |   8 +-
 include/El/blas_like/level1/Fill.hpp          |   8 +-
 include/El/blas_like/level1/Hadamard.hpp      |  10 +-
 .../El/blas_like/level1/IndexDependentMap.hpp |   4 +-
 include/El/blas_like/level1/Recv.hpp          |   4 +-
 include/El/blas_like/level1/Round.hpp         |   4 +-
 include/El/blas_like/level1/Scale.hpp         |   8 +-
 include/El/blas_like/level1/Send.hpp          |   4 +-
 include/El/blas_like/level1/SendRecv.hpp      |   8 +-
 include/El/blas_like/level1/Transpose.hpp     |  12 +-
include/El/blas_like/level1/TransposeAxpy.hpp | 8 +- include/El/blas_like/level1/Zero.hpp | 8 +- include/El/blas_like/level1/decl.hpp | 20 +- include/El/core.hpp | 11 +- include/El/core/AbstractMatrix/decl.hpp | 4 +- include/El/core/Element/impl.hpp | 2 +- include/El/core/Matrix/decl.hpp | 33 +- include/El/core/Matrix/impl.hpp | 2 +- include/El/core/Matrix/impl_cpu.hpp | 27 +- include/El/core/Matrix/impl_gpu.hpp | 53 +-- include/El/core/Memory/decl.hpp | 6 +- include/El/core/Memory/impl.hpp | 82 +++- include/El/core/MemoryPool.hpp | 53 ++- include/El/core/ProxyDevice.hpp | 4 +- include/El/core/View/impl.hpp | 16 +- include/El/core/imports/aluminum.hpp | 24 +- include/El/core/imports/mpi.hpp | 30 ++ include/El/core/imports/mpi/aluminum_comm.hpp | 10 +- include/El/core/imports/mpi/meta.hpp | 4 +- include/El/macros/DeviceGuardAndPayload.h | 4 +- include/hydrogen/Device.hpp | 18 + include/hydrogen/Error.hpp | 97 ++++ include/hydrogen/MultiSync.hpp | 80 ++++ include/hydrogen/SyncInfo.hpp | 196 +------- include/hydrogen/SyncInfoAllDecl.hpp | 12 + include/hydrogen/SyncInfoBase.hpp | 129 ++++++ include/hydrogen/SynchronizeAPI.hpp | 41 ++ include/hydrogen/blas/BLAS_Common.hpp | 15 + include/hydrogen/blas/GPU_BLAS_decl.hpp | 90 ++++ include/hydrogen/blas/GPU_BLAS_impl.hpp | 20 +- include/hydrogen/blas/gpu/Axpy.hpp | 56 +-- include/hydrogen/blas/gpu/Copy.hpp | 69 +-- include/hydrogen/blas/gpu/Fill.hpp | 18 +- include/hydrogen/blas/gpu/Hadamard.hpp | 10 +- include/hydrogen/blas/gpu/Scale.hpp | 22 +- include/hydrogen/blas/gpu/Transpose.hpp | 14 +- include/hydrogen/device/GPU.hpp | 163 +++++++ include/hydrogen/device/gpu/BasicCopy.hpp | 12 + .../hydrogen/device/gpu/{cuda => }/CUB.hpp | 13 +- include/hydrogen/device/gpu/CUDA.hpp | 365 +-------------- include/hydrogen/device/gpu/GPUError.hpp | 18 + include/hydrogen/device/gpu/ROCm.hpp | 8 + include/hydrogen/device/gpu/SyncInfo.hpp | 12 + include/hydrogen/device/gpu/cuda/CUDACopy.hpp | 112 +++++ .../hydrogen/device/gpu/cuda/CUDAError.hpp | 52 +++ .../device/gpu/cuda/CUDALaunchKernel.hpp | 31 ++ .../device/gpu/cuda/CUDAManagement.hpp | 22 + include/hydrogen/device/gpu/cuda/SyncInfo.hpp | 84 ++++ include/hydrogen/device/gpu/cuda/cuBLAS.hpp | 426 +----------------- .../hydrogen/device/gpu/cuda/cuBLASError.hpp | 48 ++ .../device/gpu/cuda/cuBLASManagement.hpp | 82 ++++ .../hydrogen/device/gpu/cuda/cuBLASMeta.hpp | 133 ++++++ .../hydrogen/device/gpu/cuda/cuBLASUtil.hpp | 71 +++ .../hydrogen/device/gpu/cuda/cuBLAS_API.hpp | 186 ++++++++ include/hydrogen/device/gpu/rocm/ROCmCopy.hpp | 132 ++++++ .../hydrogen/device/gpu/rocm/ROCmError.hpp | 51 +++ .../device/gpu/rocm/ROCmLaunchKernel.hpp | 30 ++ .../device/gpu/rocm/ROCmManagement.hpp | 22 + include/hydrogen/device/gpu/rocm/SyncInfo.hpp | 83 ++++ include/hydrogen/device/gpu/rocm/rocBLAS.hpp | 12 + .../hydrogen/device/gpu/rocm/rocBLASError.hpp | 48 ++ .../device/gpu/rocm/rocBLASManagement.hpp | 74 +++ .../hydrogen/device/gpu/rocm/rocBLASMeta.hpp | 121 +++++ .../hydrogen/device/gpu/rocm/rocBLASUtil.hpp | 71 +++ .../hydrogen/device/gpu/rocm/rocBLAS_API.hpp | 131 ++++++ include/hydrogen/meta/MetaUtilities.hpp | 5 +- include/hydrogen/utils/SimpleBuffer.hpp | 28 +- src/CMakeLists.txt | 4 +- src/blas_like/level2/Gemv.cpp | 8 +- src/blas_like/level2/Gemv/Normal.hpp | 4 +- src/blas_like/level2/Gemv/Transpose.hpp | 4 +- src/blas_like/level3/Gemm.cpp | 14 +- src/blas_like/level3/Gemm/TN.hpp | 16 +- src/blas_like/level3/Gemm/TT.hpp | 16 +- src/blas_like/level3/SyncInfoPool.hpp | 25 +- 
src/blas_like/level3/sync_info_pool_test.cpp | 16 +- src/core/DistMatrix/AbstractDistMatrix.cpp | 8 +- src/core/DistMatrix/ElementMatrix.cpp | 12 +- .../DistMatrix/ElementMatrix/CIRC_CIRC.cpp | 4 +- src/core/DistMatrix/ElementMatrix/MC_MR.cpp | 4 +- src/core/DistMatrix/ElementMatrix/MC_STAR.cpp | 4 +- src/core/DistMatrix/ElementMatrix/MD_STAR.cpp | 4 +- src/core/DistMatrix/ElementMatrix/MR_MC.cpp | 4 +- src/core/DistMatrix/ElementMatrix/MR_STAR.cpp | 4 +- src/core/DistMatrix/ElementMatrix/STAR_MC.cpp | 4 +- src/core/DistMatrix/ElementMatrix/STAR_MD.cpp | 4 +- src/core/DistMatrix/ElementMatrix/STAR_MR.cpp | 4 +- .../DistMatrix/ElementMatrix/STAR_STAR.cpp | 4 +- src/core/DistMatrix/ElementMatrix/STAR_VC.cpp | 4 +- src/core/DistMatrix/ElementMatrix/STAR_VR.cpp | 4 +- src/core/DistMatrix/ElementMatrix/VC_STAR.cpp | 4 +- src/core/DistMatrix/ElementMatrix/VR_STAR.cpp | 4 +- src/core/DistMatrix/ElementMatrix/setup.hpp | 4 +- src/core/MemoryPool.cpp | 8 +- src/core/environment.cpp | 19 +- src/core/imports/CMakeLists.txt | 4 - src/core/imports/cub.cpp | 21 +- src/core/imports/mpi.cpp | 4 +- src/core/imports/mpi/AllGather.hpp | 4 +- src/core/imports/mpi/AllReduce.hpp | 4 +- src/core/imports/mpi/AllToAll.hpp | 4 +- src/core/imports/mpi/Broadcast.hpp | 4 +- src/core/imports/mpi/Gather.hpp | 4 +- src/core/imports/mpi/Reduce.hpp | 4 +- src/core/imports/mpi/ReduceScatter.hpp | 15 +- src/core/imports/mpi/Scatter.hpp | 2 +- src/core/imports/mpi/SendRecv.hpp | 2 +- src/core/imports/mpi_utils.hpp | 45 +- src/core/mpi_register.cpp | 1 + src/hydrogen/CMakeLists.txt | 8 +- src/hydrogen/Error.cpp | 15 + src/hydrogen/blas/CMakeLists.txt | 2 +- src/hydrogen/blas/gpu/Axpy.cu | 62 +-- src/hydrogen/blas/gpu/CMakeLists.txt | 4 +- src/hydrogen/blas/gpu/Copy.cu | 53 ++- src/hydrogen/blas/gpu/Fill.cu | 72 +-- src/hydrogen/blas/gpu/Hadamard.cu | 61 +-- src/hydrogen/blas/gpu/Scale.cu | 35 +- src/hydrogen/blas/gpu/Transpose.cu | 38 +- src/hydrogen/device/CMakeLists.txt | 19 + src/hydrogen/device/CUDA.cpp | 163 +++++++ src/hydrogen/device/GPU.cpp | 105 +++++ src/hydrogen/device/ROCm.cpp | 101 +++++ src/hydrogen/device/cuBLAS.cpp | 151 +++++++ .../device/cuBLAS_API.cpp} | 199 ++++++-- src/hydrogen/device/rocBLAS.cpp | 143 ++++++ src/hydrogen/device/rocBLAS_API.cpp | 285 ++++++++++++ src/io/Display.cpp | 4 +- src/io/Print.cpp | 4 +- src/io/Read.cpp | 4 +- src/io/Write.cpp | 4 +- src/lapack_like/props/Norm/Frobenius.cpp | 4 +- src/matrices/random/independent/Gaussian.cpp | 4 +- src/matrices/random/independent/Uniform.cpp | 8 +- tests/blas_like/Axpy.cpp | 4 +- tests/blas_like/BasicGemm.cpp | 4 +- tests/blas_like/Gemm.cpp | 129 ++---- tests/blas_like/GemmHelpers/SyncTimer.hpp | 67 ++- tests/core/DistMatrix.cpp | 96 ++-- unit_test/CMakeLists.txt | 5 + unit_test/gpu_test.cpp | 19 + 179 files changed, 4733 insertions(+), 1917 deletions(-) create mode 100644 cmake/modules/FindROCBLAS.cmake create mode 100644 include/hydrogen/Error.hpp create mode 100644 include/hydrogen/MultiSync.hpp create mode 100644 include/hydrogen/SyncInfoAllDecl.hpp create mode 100644 include/hydrogen/SyncInfoBase.hpp create mode 100644 include/hydrogen/SynchronizeAPI.hpp create mode 100644 include/hydrogen/device/GPU.hpp create mode 100644 include/hydrogen/device/gpu/BasicCopy.hpp rename include/hydrogen/device/gpu/{cuda => }/CUB.hpp (78%) create mode 100644 include/hydrogen/device/gpu/GPUError.hpp create mode 100644 include/hydrogen/device/gpu/ROCm.hpp create mode 100644 include/hydrogen/device/gpu/SyncInfo.hpp create mode 100644 
include/hydrogen/device/gpu/cuda/CUDACopy.hpp create mode 100644 include/hydrogen/device/gpu/cuda/CUDAError.hpp create mode 100644 include/hydrogen/device/gpu/cuda/CUDALaunchKernel.hpp create mode 100644 include/hydrogen/device/gpu/cuda/CUDAManagement.hpp create mode 100644 include/hydrogen/device/gpu/cuda/SyncInfo.hpp create mode 100644 include/hydrogen/device/gpu/cuda/cuBLASError.hpp create mode 100644 include/hydrogen/device/gpu/cuda/cuBLASManagement.hpp create mode 100644 include/hydrogen/device/gpu/cuda/cuBLASMeta.hpp create mode 100644 include/hydrogen/device/gpu/cuda/cuBLASUtil.hpp create mode 100644 include/hydrogen/device/gpu/cuda/cuBLAS_API.hpp create mode 100644 include/hydrogen/device/gpu/rocm/ROCmCopy.hpp create mode 100644 include/hydrogen/device/gpu/rocm/ROCmError.hpp create mode 100644 include/hydrogen/device/gpu/rocm/ROCmLaunchKernel.hpp create mode 100644 include/hydrogen/device/gpu/rocm/ROCmManagement.hpp create mode 100644 include/hydrogen/device/gpu/rocm/SyncInfo.hpp create mode 100644 include/hydrogen/device/gpu/rocm/rocBLAS.hpp create mode 100644 include/hydrogen/device/gpu/rocm/rocBLASError.hpp create mode 100644 include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp create mode 100644 include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp create mode 100644 include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp create mode 100644 include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp create mode 100644 src/hydrogen/Error.cpp create mode 100644 src/hydrogen/device/CMakeLists.txt create mode 100644 src/hydrogen/device/CUDA.cpp create mode 100644 src/hydrogen/device/GPU.cpp create mode 100644 src/hydrogen/device/ROCm.cpp create mode 100644 src/hydrogen/device/cuBLAS.cpp rename src/{core/imports/cublas.cpp => hydrogen/device/cuBLAS_API.cpp} (58%) create mode 100644 src/hydrogen/device/rocBLAS.cpp create mode 100644 src/hydrogen/device/rocBLAS_API.cpp create mode 100644 unit_test/gpu_test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e6490fe367..036cbe87da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,8 +53,8 @@ endif (__GIT_EXECUTABLE) # This must be set because version tags set(HYDROGEN_VERSION_MAJOR 1) -set(HYDROGEN_VERSION_MINOR 3) -set(HYDROGEN_VERSION_PATCH 4) +set(HYDROGEN_VERSION_MINOR 4) +set(HYDROGEN_VERSION_PATCH 0) set(HYDROGEN_VERSION_MAJOR_MINOR "${HYDROGEN_VERSION_MAJOR}.${HYDROGEN_VERSION_MINOR}") set(HYDROGEN_VERSION @@ -154,13 +154,17 @@ option(Hydrogen_ENABLE_CUDA "Search for CUDA support and enable related features if found." OFF) -if (Hydrogen_ENABLE_CUDA) +option(Hydrogen_ENABLE_ROCM + "Search for ROCm/HIP support and enable related features if found." + OFF) + +if (Hydrogen_ENABLE_CUDA OR Hydrogen_ENABLE_ROCM) option(Hydrogen_ENABLE_CUB "Search for CUB support and enable related features if found." ON) - option(Hydrogen_ENABLE_CUBLAS_TENSOR_MATH - "Use the cuBLAS tensor operation math." + option(Hydrogen_ENABLE_GPU_TENSOR_MATH + "Use the GPU tensor operations when available." OFF) option(Hydrogen_ENABLE_GPU_FP16 @@ -168,6 +172,14 @@ if (Hydrogen_ENABLE_CUDA) ON) endif () +if (Hydrogen_ENABLE_ROCM AND Hydrogen_ENABLE_CUDA) + message(FATAL_ERROR + "ROCm and CUDA code paths are mutually exclusive. " + "Please enable the one that corresponds to your hardware. 
" + "If you have mixed hardware, please contact the Hydrogen developers " + "as this would be of great interest.") +endif () + # # MEMORY-RELATED OPTIONS # @@ -334,8 +346,8 @@ if (Hydrogen_ENABLE_CUDA) find_package(CUDA REQUIRED) # Enable all the macros find_package(NVML REQUIRED) - if (Hydrogen_ENABLE_CUBLAS_TENSOR_MATH) - set(HYDROGEN_CUBLAS_USE_TENSOR_OP_MATH TRUE) + if (Hydrogen_ENABLE_GPU_TENSOR_MATH) + set(HYDROGEN_GPU_USE_TENSOR_OP_MATH TRUE) endif () if (Hydrogen_ENABLE_GPU_FP16) @@ -387,38 +399,64 @@ if (Hydrogen_ENABLE_CUDA) set(HYDROGEN_HAVE_CUDA FALSE) endif () - endif (Hydrogen_ENABLE_CUDA) -set(HYDROGEN_HAVE_GPU ${HYDROGEN_HAVE_CUDA}) +if (Hydrogen_ENABLE_ROCM) + set(CMAKE_MODULE_PATH "/opt/rocm/hip/cmake" ${CMAKE_MODULE_PATH}) + find_package(HIP REQUIRED) + + if (Hydrogen_ENABLE_CUB) + set(CMAKE_PREFIX_PATH "/opt/rocm/hip" ${CMAKE_PREFIX_PATH}) + set(HIP_FOUND FALSE) + find_package(HIP CONFIG REQUIRED) + find_package(rocPRIM REQUIRED) + find_package(hipCUB REQUIRED) + set(HYDROGEN_HAVE_CUB TRUE) + else () + set(HYDROGEN_HAVE_CUB FALSE) + endif () + + if (HIP_FOUND) + set(CMAKE_CXX_EXTENSIONS FALSE) + find_package(ROCBLAS REQUIRED) + set(HYDROGEN_HAVE_ROCM TRUE) + message(STATUS "Found ROCm/HIP toolchain. Using HIP/ROCm.") + else () + message(FATAL_ERROR "ROCm requested but not found.") + endif () +endif (Hydrogen_ENABLE_ROCM) + +if (HYDROGEN_HAVE_CUDA OR HYDROGEN_HAVE_ROCM) + set(HYDROGEN_HAVE_GPU TRUE) +endif () if (Hydrogen_ENABLE_ALUMINUM) - find_package(Aluminum 0.3.0 NO_MODULE + find_package(Aluminum 0.4.0 NO_MODULE HINTS ${Aluminum_DIR} ${ALUMINUM_DIR} ${AL_DIR} $ENV{Aluminum_DIR} $ENV{ALUMINUM_DIR} $ENV{AL_DIR} PATH_SUFFIXES lib64/cmake/aluminum lib/cmake/aluminum NO_DEFAULT_PATH) if (NOT Aluminum_FOUND) - find_package(Aluminum 0.3.0 NO_MODULE) + find_package(Aluminum 0.4.0 NO_MODULE) endif () if (Aluminum_FOUND) set(HYDROGEN_HAVE_ALUMINUM TRUE) message(STATUS "Found Aluminum: ${Aluminum_DIR}") - if (HYDROGEN_HAVE_CUDA AND AL_HAS_NCCL) + if (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) set(HYDROGEN_HAVE_NCCL2 TRUE) message(STATUS "Aluminum detected with NCCL2 backend support.") else () set(HYDROGEN_HAVE_NCCL2 FALSE) - endif (HYDROGEN_HAVE_CUDA AND AL_HAS_NCCL) + endif (HYDROGEN_HAVE_GPU AND AL_HAS_NCCL) - if (HYDROGEN_HAVE_CUDA AND AL_HAS_MPI_CUDA) + if (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) set(HYDROGEN_HAVE_AL_MPI_CUDA TRUE) message(STATUS "Aluminum detected with MPI-CUDA backend support.") else () set(HYDROGEN_HAVE_AL_MPI_CUDA FALSE) - endif (HYDROGEN_HAVE_CUDA AND AL_HAS_MPI_CUDA) + endif (HYDROGEN_HAVE_GPU AND AL_HAS_MPI_CUDA) else () set(HYDROGEN_HAVE_ALUMINUM FALSE) set(HYDROGEN_HAVE_NCCL2 FALSE) @@ -497,7 +535,12 @@ configure_file("${PROJECT_SOURCE_DIR}/cmake/configure_files/hydrogen_config.h.in configure_file("${PROJECT_SOURCE_DIR}/doxy/Doxyfile.in" "${PROJECT_BINARY_DIR}/doxy/Doxyfile") -add_library(Hydrogen_CXX "${HYDROGEN_SOURCES}" "${HYDROGEN_HEADERS}") +if (HYDROGEN_HAVE_ROCM) + hip_add_library(Hydrogen_CXX "${HYDROGEN_SOURCES}" "${HYDROGEN_HEADERS}") +else () + add_library(Hydrogen_CXX "${HYDROGEN_SOURCES}" "${HYDROGEN_HEADERS}") +endif () + target_include_directories(Hydrogen_CXX PUBLIC $ $ @@ -509,40 +552,60 @@ target_include_directories(Hydrogen_CXX PUBLIC # be forced to build with that (even though they maybe should)... 
target_compile_options(Hydrogen_CXX PRIVATE ${EXTRA_CXX_FLAGS}) -target_link_libraries(Hydrogen_CXX PUBLIC ${Aluminum_LIBRARIES}) -target_link_libraries(Hydrogen_CXX PUBLIC ${HALF_LIBRARIES}) - -if (TARGET OpenMP::OpenMP_CXX) - target_link_libraries(Hydrogen_CXX PUBLIC OpenMP::OpenMP_CXX) -endif () -target_link_libraries(Hydrogen_CXX PUBLIC MPI::MPI_CXX) -target_link_libraries(Hydrogen_CXX PUBLIC LAPACK::lapack) -target_link_libraries(Hydrogen_CXX PUBLIC EP::extended_precision) - -target_link_libraries(Hydrogen_CXX PUBLIC ${VTUNE_LIBRARIES}) -target_link_libraries(Hydrogen_CXX PUBLIC ${NVTX_LIBRARIES}) -if (HYDROGEN_HAVE_CUDA) - target_link_libraries(Hydrogen_CXX PUBLIC cuda::toolkit) -endif () +target_link_libraries( + Hydrogen_CXX PUBLIC + ${Aluminum_LIBRARIES} + ${HALF_LIBRARIES} + ${VTUNE_LIBRARIES} + ${NVTX_LIBRARIES} + ${ROCBLAS_LIBRARIES} + $ + $ + $ + $ + $ + $ + $ + ) # Add the CXX library to "Hydrogen" set(HYDROGEN_LIBRARIES Hydrogen_CXX) if (HYDROGEN_HAVE_CUDA) - add_library(Hydrogen_CUDA "${HYDROGEN_CUDA_SOURCES}") + add_library(Hydrogen_CUDA "${HYDROGEN_GPU_SOURCES}") target_include_directories(Hydrogen_CUDA PUBLIC $ $ $) - target_link_libraries(Hydrogen_CUDA PUBLIC ${HALF_LIBRARIES}) - target_link_libraries(Hydrogen_CUDA PUBLIC ${NVTX_LIBRARIES}) - target_link_libraries(Hydrogen_CUDA PUBLIC cuda::toolkit) + target_link_libraries( + Hydrogen_CUDA PUBLIC + ${HALF_LIBRARIES} + ${NVTX_LIBRARIES} + $ + ) target_link_libraries(Hydrogen_CXX PUBLIC Hydrogen_CUDA) list(APPEND HYDROGEN_LIBRARIES Hydrogen_CUDA) endif () +if (HYDROGEN_HAVE_ROCM) + hip_add_library(Hydrogen_ROCM STATIC "${HYDROGEN_GPU_SOURCES}") + target_include_directories(Hydrogen_ROCM PUBLIC + $ + $ + $ + ) + + target_link_libraries(Hydrogen_ROCM PUBLIC + ${HALF_LIBRARIES} + ${ROCBLAS_LIBRARIES} + ) + + #set_target_properties(Hydrogen_ROCM PROPERTIES LINKER_LANGUAGE CXX) + list(APPEND HYDROGEN_LIBRARIES Hydrogen_ROCM) +endif () + # Setup the tests if (Hydrogen_ENABLE_TESTING OR Hydrogen_ENABLE_UNIT_TESTS) include(CTest) diff --git a/cmake/configure_files/HydrogenConfig.cmake.in b/cmake/configure_files/HydrogenConfig.cmake.in index 48656ae6e3..48de9a6b72 100644 --- a/cmake/configure_files/HydrogenConfig.cmake.in +++ b/cmake/configure_files/HydrogenConfig.cmake.in @@ -18,11 +18,14 @@ set(HYDROGEN_MPI_CXX_COMPILER "@MPI_CXX_COMPILER@") set(MPI_CXX_COMPILER "${HYDROGEN_MPI_CXX_COMPILER}" CACHE FILEPATH "The MPI CXX compiler.") -set(_OpenMP_DIR "@OpenMP_DIR@") -if (NOT OpenMP_DIR) - set(OpenMP_DIR "${_OpenMP_DIR}") -endif () -include (FindAndVerifyOpenMP) +set(_HYDROGEN_HAVE_OPENMP @EL_HAVE_OPENMP@) +if (_HYDROGEN_HAVE_OPENMP) + set(_OpenMP_DIR "@OpenMP_DIR@") + if (NOT OpenMP_DIR) + set(OpenMP_DIR "${_OpenMP_DIR}") + endif () + include (FindAndVerifyOpenMP) +endif (_HYDROGEN_HAVE_OPENMP) # FIXME: I should do verification to make sure all found features are # the same. 
include (FindAndVerifyMPI) @@ -33,14 +36,14 @@ set(_HYDROGEN_HAVE_NCCL2 @HYDROGEN_HAVE_NCCL2@) set(_HYDROGEN_HAVE_AL_MPI_CUDA @HYDROGEN_HAVE_AL_MPI_CUDA@) if (_HYDROGEN_HAVE_ALUMINUM) if (NOT Aluminum_FOUND) - find_package(Aluminum 0.3.0 NO_MODULE QUIET + find_package(Aluminum 0.4.0 NO_MODULE QUIET HINTS ${Aluminum_DIR} ${ALUMINUM_DIR} ${AL_DIR} $ENV{Aluminum_DIR} $ENV{ALUMINUM_DIR} $ENV{AL_DIR} PATH_SUFFIXES lib64/cmake/aluminum lib/cmake/aluminum NO_DEFAULT_PATH) if (NOT Aluminum_FOUND) set(Aluminum_DIR "@Aluminum_DIR@") - find_package(Aluminum 0.3.0 NO_MODULE REQUIRED) + find_package(Aluminum 0.4.0 NO_MODULE REQUIRED) endif () endif () @@ -56,6 +59,16 @@ if (_HYDROGEN_HAVE_ALUMINUM) endif () endif (_HYDROGEN_HAVE_ALUMINUM) +# ROCm +set(_HYDROGEN_HAVE_ROCM @HYDROGEN_HAVE_ROCM@) +if (_HYDROGEN_HAVE_ROCM) + find_package(HIP REQUIRED) + find_package(ROCBLAS REQUIRED) + + # query this beforehand, to set to what it was? + set(CMAKE_CXX_EXTENSIONS FALSE) +endif (_HYDROGEN_HAVE_ROCM) + # CUDA! set(_HYDROGEN_HAVE_CUDA @HYDROGEN_HAVE_CUDA@) set(_HYDROGEN_HAVE_CUB @HYDROGEN_HAVE_CUB@) diff --git a/cmake/configure_files/HydrogenConfigVersion.cmake.in b/cmake/configure_files/HydrogenConfigVersion.cmake.in index 30b7328ec2..4e7195d237 100644 --- a/cmake/configure_files/HydrogenConfigVersion.cmake.in +++ b/cmake/configure_files/HydrogenConfigVersion.cmake.in @@ -7,13 +7,15 @@ # [0.87 1.0.0) # [1.0.0 1.1.0) # [1.1.0 1.2.0) -# [1.2.0 ???) +# [1.2.0 1.3.0) +# [1.3.0 1.4.0) +# [1.4.0 ???) # # IMPORTANT: IF YOU MAKE A BREAKING CHANGE TO HYDROGEN, THE UPDATE # MUST BE GIVEN A NEW VERSION NUMBER, WHICH THEN MUST BE APPENDED TO # THIS LIST. -set(_version_compat_ranges 0.0.0 0.87.0 1.0.0 1.1.0 1.2.0) +set(_version_compat_ranges 0.0.0 0.87.0 1.0.0 1.1.0 1.2.0 1.3.0 1.4.0) # This is the version that has been installed. 
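# Illustrative example, given the ranges listed above: with 1.4.0 installed,
# find_package(Hydrogen 1.4.0) is accepted, but find_package(Hydrogen 1.3.0)
# is rejected, because 1.3.0 and 1.4.0 lie in different compatibility ranges.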
set(PACKAGE_VERSION "@HYDROGEN_VERSION@") diff --git a/cmake/configure_files/hydrogen_config.h.in b/cmake/configure_files/hydrogen_config.h.in index 5f43053d70..e7c4b0b8bc 100644 --- a/cmake/configure_files/hydrogen_config.h.in +++ b/cmake/configure_files/hydrogen_config.h.in @@ -33,16 +33,18 @@ #cmakedefine HYDROGEN_HAVE_MKL #cmakedefine HYDROGEN_HAVE_MKL_GEMMT +#cmakedefine HYDROGEN_HAVE_GPU + // CUDA stuff #cmakedefine HYDROGEN_HAVE_CUDA -#cmakedefine HYDROGEN_HAVE_CUB -#cmakedefine HYDROGEN_CUBLAS_USE_TENSOR_OP_MATH + +// ROCm stuff +#cmakedefine HYDROGEN_HAVE_ROCM // General GPU stuff -#ifdef HYDROGEN_HAVE_CUDA -#define HYDROGEN_HAVE_GPU +#cmakedefine HYDROGEN_HAVE_CUB +#cmakedefine HYDROGEN_GPU_USE_TENSOR_OP_MATH #cmakedefine HYDROGEN_GPU_USE_FP16 -#endif // HYDROGEN_HAVE_CUDA // Aluminum stuff #cmakedefine HYDROGEN_HAVE_ALUMINUM @@ -62,4 +64,7 @@ #cmakedefine HYDROGEN_DO_BOUNDS_CHECKING +#define H_RESTRICT __restrict__ +#define H_PRETTY_FUNCTION __PRETTY_FUNCTION__ + #endif /* HYDROGEN_CONFIG_H */ diff --git a/cmake/modules/FindROCBLAS.cmake b/cmake/modules/FindROCBLAS.cmake new file mode 100644 index 0000000000..a4e939347d --- /dev/null +++ b/cmake/modules/FindROCBLAS.cmake @@ -0,0 +1,46 @@ +# Find rocBLAS library and supporting header +# +# rocBLAS_DIR or ROCBLAS_DIR[in]: The prefix for rocBLAS +# +# ROCBLAS_INCLUDE_PATH[out,cache]: The include path for rocBLAS +# ROCBLAS_LIBRARY[out,cache]: The rocBLAS library +# +# ROCBLAS_LIBRARIES[out]: The thing to link to for rocBLAS +# ROCBLAS_FOUND[out]: Variable indicating whether rocBLAS has been found +# +# rocm::rocblas: Imported library for rocBLAS +# + +find_path(ROCBLAS_INCLUDE_PATH rocblas.h + HINTS ${rocBLAS_DIR} $ENV{rocBLAS_DIR} ${ROCBLAS_DIR} $ENV{ROCBLAS_DIR} + PATH_SUFFIXES include + NO_DEFAULT_PATH + DOC "The rocBLAS include path.") +find_path(ROCBLAS_INCLUDE_PATH rocblas.h) + +find_library(ROCBLAS_LIBRARY rocblas + HINTS ${rocBLAS_DIR} $ENV{rocBLAS_DIR} ${ROCBLAS_DIR} $ENV{ROCBLAS_DIR} + PATH_SUFFIXES lib64 lib + NO_DEFAULT_PATH + DOC "The rocBLAS library.") +find_library(ROCBLAS_LIBRARY rocblas) + +# Standard handling of the package arguments +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(Rocblas + REQUIRED_VARS ROCBLAS_LIBRARY ROCBLAS_INCLUDE_PATH) + +if (NOT TARGET rocblas::rocblas) + add_library(rocblas::rocblas INTERFACE IMPORTED) +endif () + +if (ROCBLAS_INCLUDE_PATH AND ROCBLAS_LIBRARY) + set_target_properties(rocblas::rocblas PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES + "${ROCBLAS_INCLUDE_PATH};/opt/rocm/hsa/include;/opt/rocm/hip/include" + INTERFACE_LINK_LIBRARIES "${ROCBLAS_LIBRARY}") +endif () + +set(ROCBLAS_LIBRARIES rocblas::rocblas) +mark_as_advanced(ROCBLAS_INCLUDE_PATH) +mark_as_advanced(ROCBLAS_LIBRARY) diff --git a/include/El/blas_like/level1/AllReduce.hpp b/include/El/blas_like/level1/AllReduce.hpp index ddace1cca3..4b08c58e9a 100644 --- a/include/El/blas_like/level1/AllReduce.hpp +++ b/include/El/blas_like/level1/AllReduce.hpp @@ -61,11 +61,11 @@ void AllReduce(AbstractMatrix& A, mpi::Comm const& comm, mpi::Op op) case Device::CPU: AllReduce(static_cast&>(A), comm, op); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: AllReduce(static_cast&>(A), comm, op); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("AllReduce: Bad device!"); } diff --git a/include/El/blas_like/level1/Axpy.hpp b/include/El/blas_like/level1/Axpy.hpp index efe1d039bf..3278b61a04 100644 --- a/include/El/blas_like/level1/Axpy.hpp +++ 
b/include/El/blas_like/level1/Axpy.hpp
@@ -26,13 +26,13 @@ void Axpy(S alphaS, AbstractMatrix<T> const& X, AbstractMatrix<T>& Y)
              static_cast<Matrix<T,Device::CPU> const&>(X),
              static_cast<Matrix<T,Device::CPU>&>(Y));
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Axpy(alphaS,
              static_cast<Matrix<T,Device::GPU> const&>(X),
              static_cast<Matrix<T,Device::GPU>&>(Y));
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Axpy: Bad device.");
     }
@@ -95,7 +95,7 @@ void Axpy(S alphaS, const Matrix<T>& X, Matrix<T>& Y)
     }
 }
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template>, typename=void>
@@ -146,7 +146,7 @@ void Axpy(S alphaS, Matrix<T,Device::GPU> const& X, Matrix<T,Device::GPU>& Y)
         mX, nX, alpha, XBuf, ldX, YBuf, ldY, syncInfoY);
     }
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
 template <typename T, typename S>
 void Axpy(S alphaS, const ElementalMatrix<T>& X, ElementalMatrix<T>& Y)
diff --git a/include/El/blas_like/level1/Axpy/util.hpp b/include/El/blas_like/level1/Axpy/util.hpp
index 190550a0f5..c4ebedabfc 100644
--- a/include/El/blas_like/level1/Axpy/util.hpp
+++ b/include/El/blas_like/level1/Axpy/util.hpp
@@ -9,7 +9,7 @@
 #ifndef EL_BLAS_AXPY_UTIL_HPP
 #define EL_BLAS_AXPY_UTIL_HPP
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 #include <hydrogen/blas/gpu/Axpy.hpp>
 #endif
@@ -35,7 +35,7 @@ void InterleaveMatrixUpdate(
         &B[rowStrideB*j], colStrideB);
 }
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
 template <typename T>
 void InterleaveMatrixUpdate(
     T alpha, Int height, Int width,
@@ -47,9 +47,9 @@ void InterleaveMatrixUpdate(
     hydrogen::Axpy_GPU_impl(height, width, alpha,
                             A, colStrideA, rowStrideA,
                             B, colStrideB, rowStrideB,
-                            syncInfo.stream_);
+                            syncInfo);
 }
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
 template <typename T>
 void UpdateWithLocalData(
diff --git a/include/El/blas_like/level1/AxpyContract.hpp b/include/El/blas_like/level1/AxpyContract.hpp
index bb47cf1075..a6ca6d7453 100644
--- a/include/El/blas_like/level1/AxpyContract.hpp
+++ b/include/El/blas_like/level1/AxpyContract.hpp
@@ -533,11 +533,11 @@ void AxpyContract
     case Device::CPU:
         AxpyContract_impl(alpha,A,B);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         AxpyContract_impl(alpha,A,B);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("AxpyContract: Bad device type.");
     }
diff --git a/include/El/blas_like/level1/Broadcast.hpp b/include/El/blas_like/level1/Broadcast.hpp
index eb3409ebb7..caff9d99ba 100644
--- a/include/El/blas_like/level1/Broadcast.hpp
+++ b/include/El/blas_like/level1/Broadcast.hpp
@@ -60,12 +60,12 @@ void Broadcast( AbstractMatrix<T>& A, mpi::Comm const& comm, int rank )
         Broadcast_impl(static_cast<Matrix<T,Device::CPU>&>(A),
                        std::move(comm), rank);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Broadcast_impl(static_cast<Matrix<T,Device::GPU>&>(A),
                        std::move(comm), rank);
         break;
-#endif // HYROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Unsupported device type.");
     }
@@ -125,11 +125,11 @@ void Broadcast( AbstractDistMatrix<T>& A, mpi::Comm const& comm, int rank )
     case Device::CPU:
         Broadcast_impl(A, std::move(comm), rank);
         break;
-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     case Device::GPU:
         Broadcast_impl(A, std::move(comm), rank);
         break;
-#endif // HYDROGEN_HAVE_CUDA
+#endif // HYDROGEN_HAVE_GPU
     default:
         LogicError("Broadcast: Bad device.");
     }
diff --git a/include/El/blas_like/level1/Contract.hpp b/include/El/blas_like/level1/Contract.hpp
index 263565f54c..f51cd7724a 100644
--- a/include/El/blas_like/level1/Contract.hpp
+++ b/include/El/blas_like/level1/Contract.hpp
@@ -87,11 +87,11 @@ void Contract
     case Device::CPU:
         ContractDispatch(A,B);
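// Note (illustration, not part of the patch): every level-1 routine here
// uses this same dispatch shape -- switch on the runtime Device of the
// AbstractMatrix, then static_cast to the concrete Matrix<T,Device::X>
// overload -- so guarding the GPU case with the backend-neutral
// HYDROGEN_HAVE_GPU macro covers both the CUDA and ROCm builds at once.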
break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: ContractDispatch(A,B); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Contract: Bad device type."); } diff --git a/include/El/blas_like/level1/Copy.hpp b/include/El/blas_like/level1/Copy.hpp index 7d835c4c4f..77677b960a 100644 --- a/include/El/blas_like/level1/Copy.hpp +++ b/include/El/blas_like/level1/Copy.hpp @@ -13,11 +13,17 @@ #include #endif +#include + #include #include #include #include +#ifdef HYDROGEN_HAVE_GPU +#include +#endif + #include // Introduce some metaprogramming notions. diff --git a/include/El/blas_like/level1/Copy/ColAllGather.hpp b/include/El/blas_like/level1/Copy/ColAllGather.hpp index a6529d9f35..2f7c8ff8a8 100644 --- a/include/El/blas_like/level1/Copy/ColAllGather.hpp +++ b/include/El/blas_like/level1/Copy/ColAllGather.hpp @@ -194,11 +194,11 @@ void ColAllGather case Device::CPU: ColAllGather_impl(A,B); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: ColAllGather_impl(A,B); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("ColAllGather: Bad device."); } diff --git a/include/El/blas_like/level1/Copy/ColFilter.hpp b/include/El/blas_like/level1/Copy/ColFilter.hpp index b40b54aef3..46619a9873 100644 --- a/include/El/blas_like/level1/Copy/ColFilter.hpp +++ b/include/El/blas_like/level1/Copy/ColFilter.hpp @@ -102,11 +102,11 @@ void ColFilter case Device::CPU: ColFilter_impl(A,B); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: ColFilter_impl(A,B); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("ColFilter: Bad device."); } diff --git a/include/El/blas_like/level1/Copy/Exchange.hpp b/include/El/blas_like/level1/Copy/Exchange.hpp index 28e9098bc0..13f168c710 100644 --- a/include/El/blas_like/level1/Copy/Exchange.hpp +++ b/include/El/blas_like/level1/Copy/Exchange.hpp @@ -139,11 +139,11 @@ void Exchange case Device::CPU: Exchange_impl(A,B,sendRank,recvRank,comm); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Exchange_impl(A,B,sendRank,recvRank,comm); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Exchange: Bad device."); } diff --git a/include/El/blas_like/level1/Copy/PartialColFilter.hpp b/include/El/blas_like/level1/Copy/PartialColFilter.hpp index 75dd08eff0..47c37b1f2f 100644 --- a/include/El/blas_like/level1/Copy/PartialColFilter.hpp +++ b/include/El/blas_like/level1/Copy/PartialColFilter.hpp @@ -112,11 +112,11 @@ void PartialColFilter case Device::CPU: PartialColFilter_impl(A,B); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: PartialColFilter_impl(A,B); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("PartialColFilter: Bad device."); } diff --git a/include/El/blas_like/level1/Copy/PartialRowAllGather.hpp b/include/El/blas_like/level1/Copy/PartialRowAllGather.hpp index 9857c03dbb..45b69e5b5e 100644 --- a/include/El/blas_like/level1/Copy/PartialRowAllGather.hpp +++ b/include/El/blas_like/level1/Copy/PartialRowAllGather.hpp @@ -135,11 +135,11 @@ void PartialRowAllGather case Device::CPU: PartialRowAllGather_impl(A,B); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: PartialRowAllGather_impl(A,B); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("PartialRowAllGather: Bad device."); } diff --git 
a/include/El/blas_like/level1/Copy/PartialRowFilter.hpp b/include/El/blas_like/level1/Copy/PartialRowFilter.hpp index 56aa379b9f..c9b4805ed5 100644 --- a/include/El/blas_like/level1/Copy/PartialRowFilter.hpp +++ b/include/El/blas_like/level1/Copy/PartialRowFilter.hpp @@ -113,11 +113,11 @@ void PartialRowFilter case Device::CPU: PartialRowFilter_impl(A,B); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: PartialRowFilter_impl(A,B); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("PartialRowFilter: Bad device."); } diff --git a/include/El/blas_like/level1/Copy/RowAllGather.hpp b/include/El/blas_like/level1/Copy/RowAllGather.hpp index 269950ce9a..7193be2269 100644 --- a/include/El/blas_like/level1/Copy/RowAllGather.hpp +++ b/include/El/blas_like/level1/Copy/RowAllGather.hpp @@ -164,11 +164,11 @@ void RowAllGather case Device::CPU: RowAllGather_impl(A,B); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: RowAllGather_impl(A,B); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("RowAllGather: Bad device."); } diff --git a/include/El/blas_like/level1/Copy/RowFilter.hpp b/include/El/blas_like/level1/Copy/RowFilter.hpp index dbc59635ee..5fc097eb24 100644 --- a/include/El/blas_like/level1/Copy/RowFilter.hpp +++ b/include/El/blas_like/level1/Copy/RowFilter.hpp @@ -98,11 +98,11 @@ void RowFilter case Device::CPU: RowFilter_impl(A,B); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: RowFilter_impl(A,B); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("RowFilter: Bad device."); } diff --git a/include/El/blas_like/level1/Copy/TransposeDist.hpp b/include/El/blas_like/level1/Copy/TransposeDist.hpp index bbfe13dce1..4f85db86ef 100644 --- a/include/El/blas_like/level1/Copy/TransposeDist.hpp +++ b/include/El/blas_like/level1/Copy/TransposeDist.hpp @@ -215,7 +215,7 @@ void TransposeDist(DistMatrix const& A, } } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // FIXME (trb): This should work just fine, but it might not have // optimal performance for row/column vectors (A.Height() or A.Width() @@ -275,7 +275,7 @@ void TransposeDist(DistMatrix const& A, } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template void TransposeDist(DistMatrix const& A, diff --git a/include/El/blas_like/level1/Copy/internal_decl.hpp b/include/El/blas_like/level1/Copy/internal_decl.hpp index ce077bf1bc..c7aa1016a6 100644 --- a/include/El/blas_like/level1/Copy/internal_decl.hpp +++ b/include/El/blas_like/level1/Copy/internal_decl.hpp @@ -64,12 +64,12 @@ template const& A, DistMatrix& B ); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template>> void TransposeDist( DistMatrix const& A, DistMatrix& B ); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template>,typename=void> diff --git a/include/El/blas_like/level1/Copy/util.hpp b/include/El/blas_like/level1/Copy/util.hpp index bff09d31d3..c063b29e40 100644 --- a/include/El/blas_like/level1/Copy/util.hpp +++ b/include/El/blas_like/level1/Copy/util.hpp @@ -9,7 +9,7 @@ #ifndef EL_BLAS_COPY_UTIL_HPP #define EL_BLAS_COPY_UTIL_HPP -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #include #endif @@ -229,7 +229,7 @@ void PartialRowStridedUnpack( } } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template >> void DeviceStridedMemCopy( @@ -250,19 +250,17 @@ void InterleaveMatrix( { if (colStrideA == 1 && colStrideB == 1) { - H_CHECK_CUDA( - 
cudaMemcpy2DAsync(B, rowStrideB*sizeof(T), - A, rowStrideA*sizeof(T), - height*sizeof(T), width, - cudaMemcpyDeviceToDevice, - syncInfo.stream_)); + gpu::Copy2DIntraDevice(A, rowStrideA, + B, rowStrideB, + height, width, + syncInfo); } else { hydrogen::Copy_GPU_impl(height, width, A, colStrideA, rowStrideA, B, colStrideB, rowStrideB, - syncInfo.stream_); + syncInfo); } } @@ -278,12 +276,12 @@ void RowStridedPack( { const Int rowShift = Shift_(k, rowAlign, rowStride); const Int localWidth = Length_(width, rowShift, rowStride); - H_CHECK_CUDA( - cudaMemcpy2DAsync(BPortions + k*portionSize, height*sizeof(T), - A+rowShift*ALDim, rowStride*ALDim*sizeof(T), - height*sizeof(T), localWidth, - cudaMemcpyDeviceToDevice, - syncInfo.stream_)); + + gpu::Copy2DIntraDevice( + A+rowShift*ALDim, rowStride*ALDim, + BPortions + k*portionSize, height, + height, localWidth, + syncInfo); } } @@ -299,12 +297,11 @@ void RowStridedUnpack( { const Int rowShift = Shift_(k, rowAlign, rowStride); const Int localWidth = Length_(width, rowShift, rowStride); - H_CHECK_CUDA( - cudaMemcpy2DAsync(B+rowShift*BLDim, rowStride*BLDim*sizeof(T), - APortions+k*portionSize, height*sizeof(T), - height*sizeof(T), localWidth, - cudaMemcpyDeviceToDevice, - syncInfo.stream_)); + gpu::Copy2DIntraDevice( + APortions+k*portionSize, height, + B+rowShift*BLDim, rowStride*BLDim, + height, localWidth, + syncInfo); } } @@ -324,12 +321,12 @@ void PartialRowStridedPack( rowAlign, rowStride); const Int rowOffset = (rowShift-rowShiftA) / rowStridePart; const Int localWidth = Length_(width, rowShift, rowStride); - H_CHECK_CUDA(cudaMemcpy2DAsync( - BPortions + k*portionSize, height*sizeof(T), - A + rowOffset*ALDim, rowStrideUnion*ALDim*sizeof(T), - height*sizeof(T), localWidth, - cudaMemcpyDeviceToDevice, - syncInfo.stream_)); + + gpu::Copy2DIntraDevice( + A + rowOffset*ALDim, rowStrideUnion*ALDim, + BPortions + k*portionSize, height, + height, localWidth, + syncInfo); } } @@ -349,16 +346,15 @@ void PartialRowStridedUnpack( rowAlign, rowStride); const Int rowOffset = (rowShift-rowShiftB) / rowStridePart; const Int localWidth = Length_(width, rowShift, rowStride); - H_CHECK_CUDA(cudaMemcpy2DAsync( - B + rowOffset*BLDim, rowStrideUnion*BLDim*sizeof(T), - APortions + k*portionSize, height*sizeof(T), - height*sizeof(T), localWidth, - cudaMemcpyDeviceToDevice, - syncInfo.stream_)); + gpu::Copy2DIntraDevice( + APortions + k*portionSize, height, + B + rowOffset*BLDim, rowStrideUnion*BLDim, + height, localWidth, + syncInfo); } } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template void ColStridedPack( diff --git a/include/El/blas_like/level1/CopyAsyncDistMatrix.hpp b/include/El/blas_like/level1/CopyAsyncDistMatrix.hpp index bcc23dec7b..22adba7eca 100644 --- a/include/El/blas_like/level1/CopyAsyncDistMatrix.hpp +++ b/include/El/blas_like/level1/CopyAsyncDistMatrix.hpp @@ -47,13 +47,13 @@ void CopyAsync(ElementalMatrix const& A, DistMatrix& B) static_cast const&>(A), B); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: CopyAsync( static_cast const&>(A), B); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("CopyAsync: Unknown device type."); } diff --git a/include/El/blas_like/level1/CopyAsyncLocal.hpp b/include/El/blas_like/level1/CopyAsyncLocal.hpp index 10e3add816..b19246c01a 100644 --- a/include/El/blas_like/level1/CopyAsyncLocal.hpp +++ b/include/El/blas_like/level1/CopyAsyncLocal.hpp @@ -53,8 +53,8 @@ void CopyAsyncImpl(Matrix const& A, const T* EL_RESTRICT ABuf = 
A.LockedBuffer(); T* EL_RESTRICT BBuf = B.Buffer(); - InterDeviceCopy::MemCopy2DAsync( - BBuf, ldB, ABuf, ldA, height, width, B.Stream()); + details::InterdeviceCopy::Copy2DAsync( + ABuf, ldA, BBuf, ldB, height, width, SyncInfoFromMatrix(B)); } template const& A, const T* EL_RESTRICT ABuf = A.LockedBuffer(); T* EL_RESTRICT BBuf = B.Buffer(); - InterDeviceCopy::MemCopy2DAsync( - BBuf, ldB, ABuf, ldA, height, width, A.Stream()); + details::InterdeviceCopy::Copy2DAsync( + ABuf, ldA, BBuf, ldB, height, width, SyncInfoFromMatrix(A)); } #endif // HYDROGEN_HAVE_GPU diff --git a/include/El/blas_like/level1/CopyLocal.hpp b/include/El/blas_like/level1/CopyLocal.hpp index e10d38e598..2c3dbe8e8d 100644 --- a/include/El/blas_like/level1/CopyLocal.hpp +++ b/include/El/blas_like/level1/CopyLocal.hpp @@ -129,8 +129,7 @@ void CopyImpl(Matrix const& A, Matrix& B) syncInfoB); } -#ifdef HYDROGEN_HAVE_CUDA -// If using CUDA, prefer the cudaMemcpy2D implementation. This is +// If using GPU, prefer the (cuda|hip)Memcpy2D implementation. This is // ASYNCHRONOUS with respect to the host. // (Case 1, GPU) // @@ -153,14 +152,11 @@ void CopyImpl(Matrix const& A, Matrix& B) auto syncHelper = MakeMultiSync(syncInfoB, syncInfoA); // Launch the copy - H_CHECK_CUDA( - cudaMemcpy2DAsync(BBuf, ldB*sizeof(T), - ABuf, ldA*sizeof(T), - height*sizeof(T), width, - cudaMemcpyDeviceToDevice, - syncInfoB.stream_)); + gpu::Copy2DIntraDevice(ABuf, ldA, + BBuf, ldB, + height, width, + syncInfoB); } -#endif // HYDROGEN_HAVE_CUDA namespace details { @@ -178,6 +174,29 @@ struct InterdeviceSync SyncInfo gpu_sync_; }; + +template +struct InterdeviceCopy; + +template <> +struct InterdeviceCopy +{ + template + static void Copy2DAsync(Args&&... args) + { + gpu::Copy2DToDevice(std::forward(args)...); + } +}; + +template <> +struct InterdeviceCopy +{ + template + static void Copy2DAsync(Args&&... args) + { + gpu::Copy2DToHost(std::forward(args)...); + } +}; } // These inter-device copy functions are SYNCHRONOUS with respect to @@ -200,8 +219,8 @@ void CopyImpl(Matrix const& A, Matrix& B) details::InterdeviceSync isync(SyncInfoFromMatrix(A), SyncInfoFromMatrix(B)); - InterDeviceCopy::MemCopy2DAsync( - BBuf, ldB, ABuf, ldA, height, width, isync.gpu_sync_.stream_); + details::InterdeviceCopy::Copy2DAsync( + ABuf, ldA, BBuf, ldB, height, width, isync.gpu_sync_); Synchronize(isync.gpu_sync_); // Is this necessary?? } @@ -225,8 +244,8 @@ void CopyImpl(Matrix const& A, details::InterdeviceSync isync(SyncInfoFromMatrix(A), SyncInfoFromMatrix(B)); - InterDeviceCopy::MemCopy2DAsync( - BBuf, ldB, ABuf, ldA, height, width, isync.gpu_sync_.stream_); + details::InterdeviceCopy::Copy2DAsync( + ABuf, ldA, BBuf, ldB, height, width, isync.gpu_sync_); Synchronize(isync.gpu_sync_); // Is this necessary?? 
} diff --git a/include/El/blas_like/level1/DiagonalScale.hpp b/include/El/blas_like/level1/DiagonalScale.hpp index cc709d9fde..9e2ab95960 100644 --- a/include/El/blas_like/level1/DiagonalScale.hpp +++ b/include/El/blas_like/level1/DiagonalScale.hpp @@ -13,7 +13,7 @@ namespace El { -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void DiagonalScale( LeftOrRight side, Orientation orientation, @@ -47,7 +47,7 @@ void DiagonalScale( LogicError("DiagonalScale: Bad device type."); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template void DiagonalScale( @@ -104,13 +104,13 @@ void DiagonalScale(LeftOrRight side, static_cast const&>(d), static_cast&>(A)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: DiagonalScale(side, orientation, static_cast const&>(d), static_cast&>(A)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("DiagonalScale: Bad device."); } diff --git a/include/El/blas_like/level1/Dot.hpp b/include/El/blas_like/level1/Dot.hpp index a832e52a0c..a61cf1a8ee 100644 --- a/include/El/blas_like/level1/Dot.hpp +++ b/include/El/blas_like/level1/Dot.hpp @@ -30,12 +30,12 @@ T Dot( const AbstractMatrix& A, const AbstractMatrix& B ) sum = Dot(static_cast&>(A), static_cast&>(B)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: sum = Dot(static_cast&>(A), static_cast&>(B)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unsupported device type."); } diff --git a/include/El/blas_like/level1/EntrywiseFill.hpp b/include/El/blas_like/level1/EntrywiseFill.hpp index 2c3944b211..cf0769ac1b 100644 --- a/include/El/blas_like/level1/EntrywiseFill.hpp +++ b/include/El/blas_like/level1/EntrywiseFill.hpp @@ -23,7 +23,7 @@ void EntrywiseFill( Matrix& A, function func ) } // FIXME: Make proper kernel -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void EntrywiseFill(Matrix &A, function func) { @@ -32,7 +32,7 @@ void EntrywiseFill(Matrix &A, function func) EntrywiseFill(CPU_Mat, std::move(func)); A = CPU_Mat; } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template void EntrywiseFill( AbstractDistMatrix& A, function func ) @@ -50,12 +50,12 @@ void EntrywiseFill( AbstractDistMatrix& A, function func ) EL_EXTERN template void EntrywiseFill \ ( AbstractDistMatrix& A, function func ); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU EL_EXTERN template void EntrywiseFill( Matrix&, function); EL_EXTERN template void EntrywiseFill( Matrix&, function); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/include/El/blas_like/level1/Fill.hpp b/include/El/blas_like/level1/Fill.hpp index 5ddbb7894a..0e0e3d0847 100644 --- a/include/El/blas_like/level1/Fill.hpp +++ b/include/El/blas_like/level1/Fill.hpp @@ -9,7 +9,7 @@ #ifndef EL_BLAS_FILL_HPP #define EL_BLAS_FILL_HPP -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #include #endif @@ -49,14 +49,14 @@ void Fill( AbstractMatrix& A, T alpha ) } } break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: hydrogen::Fill_GPU_impl( m, n, alpha, ABuf, ALDim, SyncInfoFromMatrix( - static_cast&>(A)).stream_); + static_cast&>(A))); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Bad device type in Fill"); } diff --git a/include/El/blas_like/level1/Hadamard.hpp b/include/El/blas_like/level1/Hadamard.hpp index f670e81e38..7e641b555c 100644 --- 
a/include/El/blas_like/level1/Hadamard.hpp +++ b/include/El/blas_like/level1/Hadamard.hpp @@ -9,9 +9,9 @@ #ifndef EL_BLAS_HADAMARD_HPP #define EL_BLAS_HADAMARD_HPP -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #include -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU // C(i,j) := A(i,j) B(i,j) @@ -78,7 +78,7 @@ void Hadamard(AbstractMatrix const& A, AbstractMatrix const& B, } } break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { auto si_A = SyncInfoFromMatrix( @@ -93,10 +93,10 @@ void Hadamard(AbstractMatrix const& A, AbstractMatrix const& B, hydrogen::Hadamard_GPU_impl(height, width, ABuf, 1, ALDim, BBuf, 1, BLDim, CBuf, 1, CLDim, - si_C.stream_); + si_C); } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Bad device type for Hadamard."); } diff --git a/include/El/blas_like/level1/IndexDependentMap.hpp b/include/El/blas_like/level1/IndexDependentMap.hpp index 168bdb565b..37195a648c 100644 --- a/include/El/blas_like/level1/IndexDependentMap.hpp +++ b/include/El/blas_like/level1/IndexDependentMap.hpp @@ -52,12 +52,12 @@ void IndexDependentMap( AbstractMatrix& A, function func case Device::CPU: IndexDependentMap(static_cast&>(A), func); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: LogicError("IndexDependentMap: Unsupported device type."); // IndexDependentMap(static_cast&>(A), func); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("IndexDependentMap: Unsupported device type."); } diff --git a/include/El/blas_like/level1/Recv.hpp b/include/El/blas_like/level1/Recv.hpp index 32d1805ee3..f62a9a3249 100644 --- a/include/El/blas_like/level1/Recv.hpp +++ b/include/El/blas_like/level1/Recv.hpp @@ -49,11 +49,11 @@ void Recv(AbstractMatrix& A, mpi::Comm const& comm, int source) case Device::CPU: Recv(static_cast&>(A), std::move(comm), source); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Recv(static_cast&>(A), std::move(comm), source); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Recv: Bad device."); } diff --git a/include/El/blas_like/level1/Round.hpp b/include/El/blas_like/level1/Round.hpp index 80967a6167..99c332d403 100644 --- a/include/El/blas_like/level1/Round.hpp +++ b/include/El/blas_like/level1/Round.hpp @@ -20,11 +20,11 @@ void Round(AbstractMatrix& A) case Device::CPU: Round(static_cast&>(A)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Round(static_cast&>(A)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Invalid device type."); } diff --git a/include/El/blas_like/level1/Scale.hpp b/include/El/blas_like/level1/Scale.hpp index 5162d00516..c0c4d0888b 100644 --- a/include/El/blas_like/level1/Scale.hpp +++ b/include/El/blas_like/level1/Scale.hpp @@ -13,7 +13,7 @@ namespace El { -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template >> void Scale(T const& alpha, Matrix& A) { @@ -36,7 +36,7 @@ void Scale(T const&, Matrix&) { LogicError("Scale: Bad device/type combo!"); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template >> @@ -102,11 +102,11 @@ void Scale( S alphaS, AbstractMatrix& A ) case Device::CPU: Scale(alpha, static_cast&>(A)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Scale(alpha, static_cast&>(A)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Bad device type in Scale"); } diff --git 
a/include/El/blas_like/level1/Send.hpp b/include/El/blas_like/level1/Send.hpp index 58ba29fb00..447dbe7fc0 100644 --- a/include/El/blas_like/level1/Send.hpp +++ b/include/El/blas_like/level1/Send.hpp @@ -49,12 +49,12 @@ void Send(AbstractMatrix const& A, mpi::Comm const& comm, int destination) Send(static_cast const&>(A), std::move(comm), destination); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Send(static_cast const&>(A), std::move(comm), destination); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Send: Bad Device."); } diff --git a/include/El/blas_like/level1/SendRecv.hpp b/include/El/blas_like/level1/SendRecv.hpp index 305c023c88..40b5f48c43 100644 --- a/include/El/blas_like/level1/SendRecv.hpp +++ b/include/El/blas_like/level1/SendRecv.hpp @@ -27,14 +27,14 @@ void SendRecv( static_cast&>(B), comm, sendRank, recvRank); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SendRecv( static_cast const&>(A), static_cast&>(B), comm, sendRank, recvRank); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SendRecv: Unsupported device."); } @@ -108,14 +108,14 @@ void SendRecv ( const Matrix& A, Matrix& B, mpi::Comm const& comm, \ int sendRank, int recvRank ); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU EL_EXTERN template void SendRecv( Matrix const&, Matrix&, mpi::Comm const&, int, int); EL_EXTERN template void SendRecv( Matrix const&, Matrix&, mpi::Comm const&, int, int); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/include/El/blas_like/level1/Transpose.hpp b/include/El/blas_like/level1/Transpose.hpp index 1f2113d78d..b6b230f70e 100644 --- a/include/El/blas_like/level1/Transpose.hpp +++ b/include/El/blas_like/level1/Transpose.hpp @@ -84,13 +84,13 @@ void Transpose(AbstractMatrix const& A, AbstractMatrix& B, static_cast const&>(A), static_cast&>(B), conjugate); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Transpose( static_cast const&>(A), static_cast&>(B), conjugate); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Bad device for transform."); } @@ -158,7 +158,7 @@ void Transpose( const Matrix& A, Matrix& B, bool conjugate ) } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void Transpose(Matrix const& A, Matrix& B, bool conjugate ) @@ -186,7 +186,7 @@ void Transpose(Matrix const& A, { LogicError("Bad device type!"); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template void Transpose @@ -416,7 +416,7 @@ void Adjoint EL_EXTERN template void Transpose( \ Matrix const& A, Matrix& B, bool conjugate); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU EL_EXTERN template void Transpose( Matrix const& A, Matrix& B, bool conjugate); @@ -431,7 +431,7 @@ EL_EXTERN template void Transpose( Matrix& B, bool conjugate); #endif // HYDROGEN_GPU_USE_FP16 -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/include/El/blas_like/level1/TransposeAxpy.hpp b/include/El/blas_like/level1/TransposeAxpy.hpp index 71d074153f..21fb366479 100644 --- a/include/El/blas_like/level1/TransposeAxpy.hpp +++ b/include/El/blas_like/level1/TransposeAxpy.hpp @@ -27,7 +27,7 @@ void TransposeAxpy( static_cast&>(Y), conjugate); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: TransposeAxpy(alphaS, 
static_cast const&>(X), @@ -35,7 +35,7 @@ void TransposeAxpy( conjugate); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Bad device for TransposeAxpy"); } @@ -106,7 +106,7 @@ void TransposeAxpy } } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template >> void TransposeAxpy(S alphaS, @@ -175,7 +175,7 @@ void TransposeAxpy (S alphaS, { LogicError("TransposeAxpy: Bad type/device combo."); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template void TransposeAxpy diff --git a/include/El/blas_like/level1/Zero.hpp b/include/El/blas_like/level1/Zero.hpp index dbeae4fe6e..4cef287ff9 100644 --- a/include/El/blas_like/level1/Zero.hpp +++ b/include/El/blas_like/level1/Zero.hpp @@ -13,7 +13,7 @@ #include #endif -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #include #endif @@ -101,14 +101,14 @@ void Zero( AbstractMatrix& A ) } } break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: hydrogen::Fill_GPU_impl( height, width, TypeTraits::Zero(), ABuf, ALDim, SyncInfoFromMatrix( - static_cast&>(A)).stream_); + static_cast&>(A))); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Bad device type in Zero"); } diff --git a/include/El/blas_like/level1/decl.hpp b/include/El/blas_like/level1/decl.hpp index e41a5de239..dcb9faf4ba 100644 --- a/include/El/blas_like/level1/decl.hpp +++ b/include/El/blas_like/level1/decl.hpp @@ -103,14 +103,14 @@ void InterleaveMatrixUpdate( Ring* B, Int colStrideB, Int rowStrideB, SyncInfo); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void InterleaveMatrixUpdate( Ring alpha, Int localHeight, Int localWidth, Ring const* A, Int colStrideA, Int rowStrideA, Ring* B, Int colStrideB, Int rowStrideB, SyncInfo); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template void UpdateWithLocalData( @@ -414,7 +414,7 @@ void PartialRowStridedUnpack( T* B, Int BLDim, SyncInfo ); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template >> void InterleaveMatrix( Int height, Int width, @@ -458,7 +458,7 @@ void PartialRowStridedUnpack( T* B, Int BLDim, SyncInfo ); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template >, typename=void> @@ -622,7 +622,7 @@ namespace El // DiagonalScale // ============= -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template>> void DiagonalScale ( LeftOrRight side, Orientation orientation, @@ -632,7 +632,7 @@ template>, void DiagonalScale ( LeftOrRight side, Orientation orientation, Matrix const& d, Matrix& A ); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template void DiagonalScale( @@ -741,10 +741,10 @@ template void EntrywiseFill( Matrix& A, function func ); template void EntrywiseFill( AbstractDistMatrix& A, function func ); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void EntrywiseFill( Matrix& A, function func ); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU // EntrywiseMap // ============ @@ -1597,7 +1597,7 @@ void Transpose ( const Matrix& A, Matrix& B, bool conjugate=false ); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template>> void Transpose ( Matrix const& A, @@ -1610,7 +1610,7 @@ void Transpose ( Matrix const& A, Matrix& B, bool conjugate=false ); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template void Transpose diff --git a/include/El/core.hpp b/include/El/core.hpp index 746cc0756e..173e0e6247 100644 --- a/include/El/core.hpp +++ b/include/El/core.hpp @@ -57,13 +57,16 @@ #include #endif // HYDROGEN_HAVE_GPU 
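// Note (illustration, not part of the patch): exactly one backend header
// tree is pulled in below -- gpu/CUDA.hpp under HYDROGEN_HAVE_CUDA, or
// gpu/ROCm.hpp under HYDROGEN_HAVE_ROCM -- mirroring the CMake check that
// makes the two options mutually exclusive. CUB.hpp moves up one level
// because hipCUB provides a matching interface for ROCm builds.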
-#ifdef HYDROGEN_HAVE_CUDA +#if defined(HYDROGEN_HAVE_CUDA) #include +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif + #ifdef HYDROGEN_HAVE_CUB -#include +#include #endif // HYDROGEN_HAVE_CUB -#include -#endif // HYDROGEN_HAVE_CUDA // Inject Hydrogen-specific symbols into El namespace El diff --git a/include/El/core/AbstractMatrix/decl.hpp b/include/El/core/AbstractMatrix/decl.hpp index 26d78c9207..b039f59c84 100644 --- a/include/El/core/AbstractMatrix/decl.hpp +++ b/include/El/core/AbstractMatrix/decl.hpp @@ -276,7 +276,7 @@ class AbstractMatrix return static_cast&>(*this); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU H_DEPRECATED("Extremely dangerous. Will be removed soon.") operator Matrix& () { @@ -297,7 +297,7 @@ class AbstractMatrix } return static_cast&>(*this); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU // Single-entry manipulation // ========================= diff --git a/include/El/core/Element/impl.hpp b/include/El/core/Element/impl.hpp index 4680bd2955..9110cdf5a8 100644 --- a/include/El/core/Element/impl.hpp +++ b/include/El/core/Element/impl.hpp @@ -135,7 +135,7 @@ inline void UpdateRealPart( { alpha = float(alpha)+float(beta); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_GPU_USE_FP16 template void UpdateRealPart( Complex& alpha, const Real& beta ) diff --git a/include/El/core/Matrix/decl.hpp b/include/El/core/Matrix/decl.hpp index 4c486692b0..c412f1ee67 100644 --- a/include/El/core/Matrix/decl.hpp +++ b/include/El/core/Matrix/decl.hpp @@ -9,7 +9,14 @@ #ifndef EL_MATRIX_DECL_HPP #define EL_MATRIX_DECL_HPP +#include + #include + +#ifdef HYDROGEN_HAVE_GPU +#include +#endif // HYDROGEN_HAVE_GPU + #include #include @@ -95,13 +102,13 @@ class Matrix : public AbstractMatrix */ Matrix& operator=(Matrix&& A); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU /** @brief Create a copy of a matrix from a GPU matrix */ Matrix(Matrix const& A); /** @brief Assign by copying data from a GPU */ Matrix& operator=(Matrix const& A); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU ///@} /** @name Abstract Copies. 
*/ @@ -286,7 +293,7 @@ SyncInfo SyncInfoFromMatrix(Matrix const& mat) return SyncInfo{}; } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // GPU version template class Matrix : public AbstractMatrix @@ -467,11 +474,8 @@ class Matrix : public AbstractMatrix /** @name Synchronization semantics */ ///@{ - cudaStream_t Stream() const EL_NO_EXCEPT; - cudaEvent_t Event() const EL_NO_EXCEPT; - - void SetStream(cudaStream_t stream) EL_NO_EXCEPT; - void SetEvent(cudaEvent_t event) EL_NO_EXCEPT; + SyncInfo GetSyncInfo() const EL_NO_EXCEPT; + void SetSyncInfo(SyncInfo const&) EL_NO_EXCEPT; void UpdateMemSyncInfo() EL_NO_EXCEPT { @@ -511,28 +515,23 @@ class Matrix : public AbstractMatrix T* data_=nullptr; - cudaStream_t stream_ = GPUManager::Stream(); - cudaEvent_t event_ = GPUManager::Event(); + SyncInfo sync_info_ = gpu::DefaultSyncInfo(); };// class Matrix template SyncInfo SyncInfoFromMatrix(Matrix const& mat) { - return SyncInfo{mat.Stream(), mat.Event()}; + return mat.GetSyncInfo(); } template void SetSyncInfo( Matrix& mat, SyncInfo const& syncInfo) { - if (syncInfo.stream_ != nullptr) - mat.SetStream(syncInfo.stream_); - if (syncInfo.event_ != nullptr) - mat.SetEvent(syncInfo.event_); - mat.UpdateMemSyncInfo(); + mat.SetSyncInfo(syncInfo); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU } // namespace El diff --git a/include/El/core/Matrix/impl.hpp b/include/El/core/Matrix/impl.hpp index e8ed4dd6e1..df6089ca18 100644 --- a/include/El/core/Matrix/impl.hpp +++ b/include/El/core/Matrix/impl.hpp @@ -11,7 +11,7 @@ #include "impl_cpu.hpp" -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #include "impl_gpu.hpp" #endif diff --git a/include/El/core/Matrix/impl_cpu.hpp b/include/El/core/Matrix/impl_cpu.hpp index 157c951695..b6dab13f24 100644 --- a/include/El/core/Matrix/impl_cpu.hpp +++ b/include/El/core/Matrix/impl_cpu.hpp @@ -9,6 +9,12 @@ #ifndef EL_MATRIX_IMPL_CPU_HPP_ #define EL_MATRIX_IMPL_CPU_HPP_ +#include + +#ifdef HYDROGEN_HAVE_GPU +#include +#endif // HYDROGEN_HAVE_GPU + #include namespace El @@ -58,21 +64,24 @@ Matrix::Matrix(Matrix const& A) ::El::Copy(A, *this); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template Matrix::Matrix(Matrix const& A) : Matrix{A.Height(), A.Width(), A.LDim()} { EL_DEBUG_CSE; - auto stream = GPUManager::Stream(); - H_CHECK_CUDA(cudaMemcpy2DAsync(data_, this->LDim()*sizeof(T), - A.LockedBuffer(), A.LDim()*sizeof(T), - A.Height()*sizeof(T), A.Width(), - cudaMemcpyDeviceToHost, - stream)); - H_CHECK_CUDA(cudaStreamSynchronize(stream)); + auto syncinfo = SyncInfoFromMatrix(A); + gpu::Copy2DToHost( + A.LockedBuffer(), A.LDim(), + data_, this->LDim(), + A.Height(), A.Width(), + syncinfo); + + // Cannot exit until this method has finished or matrix data might + // be invalid. 
+ Synchronize(syncinfo); } -#endif +#endif // HYDROGEN_HAVE_GPU template Matrix::Matrix(Matrix&& A) EL_NO_EXCEPT diff --git a/include/El/core/Matrix/impl_gpu.hpp b/include/El/core/Matrix/impl_gpu.hpp index cc11bcbdfb..aa0450c9eb 100644 --- a/include/El/core/Matrix/impl_gpu.hpp +++ b/include/El/core/Matrix/impl_gpu.hpp @@ -61,13 +61,15 @@ Matrix::Matrix(Matrix const& A) : Matrix{A.Height(), A.Width(), A.LDim()} { EL_DEBUG_CSE; - auto stream = this->Stream(); - H_CHECK_CUDA(cudaMemcpy2DAsync(data_, this->LDim()*sizeof(T), - A.LockedBuffer(), A.LDim()*sizeof(T), - A.Height()*sizeof(T), A.Width(), - cudaMemcpyHostToDevice, - stream)); - H_CHECK_CUDA(cudaStreamSynchronize(stream)); + auto syncinfo = SyncInfoFromMatrix(*this); + + gpu::Copy2DToDevice( + A.LockedBuffer(), A.LDim(), + data_, this->LDim(), + A.Height(), A.Width(), + syncinfo); + + Synchronize(syncinfo); } template @@ -274,12 +276,10 @@ T Matrix::Get(Int i, Int j) const #endif if (i == END) i = this->Height() - 1; if (j == END) j = this->Width() - 1; - auto stream = this->Stream(); + auto syncinfo = SyncInfoFromMatrix(*this); T val; - H_CHECK_CUDA(cudaMemcpyAsync( &val, &data_[i+j*this->LDim()], - sizeof(T), cudaMemcpyDeviceToHost, - stream )); - H_CHECK_CUDA(cudaStreamSynchronize(stream)); + gpu::Copy1DToHost(&data_[i+j*this->LDim()], &val, 1, syncinfo); + Synchronize(syncinfo); return val; } @@ -319,10 +319,10 @@ void Matrix::Set(Int i, Int j, T const& alpha) #endif if (i == END) i = this->Height() - 1; if (j == END) j = this->Width() - 1; - H_CHECK_CUDA(cudaMemcpyAsync(&data_[i+j*this->LDim()], &alpha, - sizeof(T), cudaMemcpyHostToDevice, - stream_ )); - H_CHECK_CUDA(cudaStreamSynchronize(stream_)); + + auto syncinfo = SyncInfoFromMatrix(*this); + gpu::Copy1DToDevice(&alpha, &data_[i+j*this->LDim()], 1, syncinfo); + Synchronize(syncinfo); } template @@ -500,27 +500,16 @@ T& Matrix::operator()(Int i, Int j) } template -cudaStream_t Matrix::Stream() const EL_NO_EXCEPT -{ - return stream_; -} - -template -cudaEvent_t Matrix::Event() const EL_NO_EXCEPT -{ - return event_; -} - -template -void Matrix::SetStream(cudaStream_t stream) EL_NO_EXCEPT +SyncInfo Matrix::GetSyncInfo() const EL_NO_EXCEPT { - stream_ = stream; + return sync_info_; } template -void Matrix::SetEvent(cudaEvent_t event) EL_NO_EXCEPT +void Matrix::SetSyncInfo( + SyncInfo const& si) EL_NO_EXCEPT { - event_ = event; + sync_info_.Merge(si); } #ifdef EL_INSTANTIATE_CORE diff --git a/include/El/core/Memory/decl.hpp b/include/El/core/Memory/decl.hpp index 9a1353c6a5..bd9b03737d 100644 --- a/include/El/core/Memory/decl.hpp +++ b/include/El/core/Memory/decl.hpp @@ -27,7 +27,7 @@ constexpr unsigned DefaultMemoryMode() return 0; } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template <> constexpr unsigned DefaultMemoryMode() { @@ -35,9 +35,9 @@ constexpr unsigned DefaultMemoryMode() return 1; #else return 0; -#endif -} #endif // HYDROGEN_HAVE_CUB +} +#endif // HYDROGEN_HAVE_GPU template class Memory diff --git a/include/El/core/Memory/impl.hpp b/include/El/core/Memory/impl.hpp index 104041a943..b74cf71c2e 100644 --- a/include/El/core/Memory/impl.hpp +++ b/include/El/core/Memory/impl.hpp @@ -14,13 +14,16 @@ #include -#ifdef HYDROGEN_HAVE_CUDA +#if defined(HYDROGEN_HAVE_CUDA) #include #include -#endif // HYDROGEN_HAVE_CUDA +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif // defined(HYDROGEN_HAVE_CUDA) #ifdef HYDROGEN_HAVE_CUB -#include +#include #endif #include "decl.hpp" @@ -39,25 +42,29 @@ G* New(size_t size, unsigned int mode, SyncInfo const&) case 0: 
ptr = static_cast(HostMemoryPool().Allocate(size * sizeof(G))); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case 1: ptr = static_cast(PinnedHostMemoryPool().Allocate(size * sizeof(G))); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU case 2: ptr = new G[size]; break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case 3: { // Pinned memory +#ifdef HYDROGEN_HAVE_CUDA auto error = cudaMallocHost(&ptr, size * sizeof(G)); if (error != cudaSuccess) { RuntimeError("Failed to allocate pinned memory with message: ", "\"", cudaGetErrorString(error), "\""); } +#elif defined(HYDROGEN_HAVE_ROCM) + H_CHECK_HIP(hipHostMalloc(&ptr, size * sizeof(G))); +#endif } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: RuntimeError("Invalid CPU memory allocation mode"); } return ptr; @@ -68,23 +75,27 @@ void Delete( G*& ptr, unsigned int mode, SyncInfo const& ) { switch (mode) { case 0: HostMemoryPool().Free(ptr); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case 1: PinnedHostMemoryPool().Free(ptr); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU case 2: delete[] ptr; break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case 3: { // Pinned memory +#if defined(HYDROGEN_HAVE_CUDA) auto error = cudaFreeHost(ptr); if (error != cudaSuccess) { RuntimeError("Failed to free pinned memory with message: ", "\"", cudaGetErrorString(error), "\""); } +#elif defined(HYDROGEN_HAVE_ROCM) + H_CHECK_HIP(hipHostFree(ptr)); +#endif } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: RuntimeError("Invalid CPU memory deallocation mode"); } ptr = nullptr; @@ -97,35 +108,51 @@ void MemZero( G* buffer, size_t numEntries, unsigned int mode, MemZero(buffer, numEntries); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template G* New( size_t size, unsigned int mode, SyncInfo const& syncInfo_ ) { // Allocate memory G* ptr = nullptr; +#if defined(HYDROGEN_HAVE_CUDA) cudaError_t status = cudaSuccess; + cudaError_t const success = cudaSuccess; +#elif defined(HYDROGEN_HAVE_ROCM) + hipError_t status = hipSuccess; + hipError_t const success = hipSuccess; +#endif switch (mode) { +#if defined(HYDROGEN_HAVE_CUDA) case 0: status = cudaMalloc(&ptr, size * sizeof(G)); break; +#elif defined(HYDROGEN_HAVE_ROCM) + case 0: status = hipMalloc(&ptr, size * sizeof(G)); break; +#endif #ifdef HYDROGEN_HAVE_CUB case 1: status = hydrogen::cub::MemoryPool().DeviceAllocate( reinterpret_cast(&ptr), size * sizeof(G), - syncInfo_.stream_); + syncInfo_.Stream()); break; #endif // HYDROGEN_HAVE_CUB default: RuntimeError("Invalid GPU memory allocation mode"); } // Check for errors - if (status != cudaSuccess) + if (status != success) { size_t freeMemory = 0; size_t totalMemory = 0; +#if defined(HYDROGEN_HAVE_CUDA) cudaMemGetInfo(&freeMemory, &totalMemory); + std::string error_string = cudaGetErrorString(status); +#elif defined(HYDROGEN_HAVE_ROCM) + hipMemGetInfo(&freeMemory, &totalMemory); + std::string error_string = hipGetErrorString(status); +#endif RuntimeError("Failed to allocate GPU memory with message: ", - "\"", cudaGetErrorString(status), "\" ", + "\"", error_string, "\" ", "(",size*sizeof(G)," bytes requested, ", freeMemory," bytes available, ", totalMemory," bytes total)"); @@ -138,11 +165,20 @@ template void Delete( G*& ptr, unsigned int mode, SyncInfo const& ) { switch (mode) { +#if defined(HYDROGEN_HAVE_CUDA) case 0: H_CHECK_CUDA(cudaFree(ptr)); break; +#elif defined(HYDROGEN_HAVE_ROCM) + case 0: H_CHECK_HIP(hipFree(ptr)); 
break; +#endif #ifdef HYDROGEN_HAVE_CUB case 1: +#if defined HYDROGEN_HAVE_CUDA H_CHECK_CUDA( hydrogen::cub::MemoryPool().DeviceFree(ptr)); +#elif defined HYDROGEN_HAVE_ROCM + H_CHECK_HIP( + hydrogen::cub::MemoryPool().DeviceFree(ptr)); +#endif break; #endif // HYDROGEN_HAVE_CUB default: RuntimeError("Invalid GPU memory deallocation mode"); @@ -154,12 +190,18 @@ template void MemZero( G* buffer, size_t numEntries, unsigned int mode, SyncInfo const& syncInfo_ ) { +#if defined(HYDROGEN_HAVE_CUDA) H_CHECK_CUDA( cudaMemsetAsync(buffer, 0x0, numEntries * sizeof(G), - syncInfo_.stream_)); + syncInfo_.Stream())); +#elif defined(HYDROGEN_HAVE_ROCM) + H_CHECK_HIP( + hipMemsetAsync(buffer, 0x0, numEntries * sizeof(G), + syncInfo_.Stream())); +#endif } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU } // namespace @@ -262,7 +304,7 @@ void Memory::Empty() template void Memory::ResetSyncInfo(SyncInfo const& syncInfo) { -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // FIXME: This treats this case as an error. Alternatively, this // could reallocate memory. See SetMode() below. if ((size_ > 0) && (D == Device::GPU) && (mode_ == 1)) @@ -270,7 +312,7 @@ void Memory::ResetSyncInfo(SyncInfo const& syncInfo) LogicError("Cannot assign new SyncInfo object to " "already-allocated CUB memory."); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU syncInfo_ = syncInfo; } @@ -316,7 +358,7 @@ unsigned int Memory::Mode() const EL_EXTERN template class Memory; // GPU instantiations -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU EL_EXTERN template class Memory; EL_EXTERN template class Memory; #endif diff --git a/include/El/core/MemoryPool.hpp b/include/El/core/MemoryPool.hpp index 098e4f1aca..4dc6c7c91c 100644 --- a/include/El/core/MemoryPool.hpp +++ b/include/El/core/MemoryPool.hpp @@ -1,18 +1,21 @@ #ifndef HYDROGEN_MEMORYPOOL_HPP_ #define HYDROGEN_MEMORYPOOL_HPP_ -#include +#include "El/hydrogen_config.h" +#if defined(HYDROGEN_HAVE_CUDA) +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif + +#include #include -#include -#include +#include #include +#include +#include #include -#include - -#include "El/hydrogen_config.h" -#ifdef HYDROGEN_HAVE_CUDA -#include -#endif // HYDROGEN_HAVE_CUDA +#include namespace El { @@ -26,7 +29,7 @@ void ThrowRuntimeError(Args&&... args) (void) dummy; throw std::runtime_error(oss.str()); } -} +} // namespace details /** Simple caching memory pool. * This maintains a set of bins that contain allocations of a fixed size. @@ -200,6 +203,32 @@ inline void MemoryPool::do_free(void* ptr) "\"", cudaGetErrorString(error), "\""); } } +#elif defined(HYDROGEN_HAVE_ROCM) +template <> +inline void* MemoryPool::do_allocation(size_t bytes) +{ + void* ptr; + auto error = hipHostMalloc(&ptr, bytes); + if (error != hipSuccess) + { + details::ThrowRuntimeError( + "Failed to allocate HIP pinned memory with message: ", + "\"", hipGetErrorString(error), "\""); + } + return ptr; +} + +template<> +inline void MemoryPool::do_free(void* ptr) +{ + auto error = hipHostFree(ptr); + if (error != hipSuccess) + { + details::ThrowRuntimeError( + "Failed to free HIP pinned memory with message: ", + "\"", hipGetErrorString(error), "\""); + } +} #endif // HYDROGEN_HAVE_CUDA template <> @@ -218,12 +247,12 @@ inline void MemoryPool::do_free(void* ptr) return std::free(ptr); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU /** Get singleton instance of CUDA pinned host memory pool. 
*/ MemoryPool& PinnedHostMemoryPool(); /** Destroy singleton instance of CUDA pinned host memory pool. */ void DestroyPinnedHostMemoryPool(); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU /** Get singleton instance of host memory pool. */ MemoryPool& HostMemoryPool(); /** Destroy singleton instance of host memory pool. */ diff --git a/include/El/core/ProxyDevice.hpp b/include/El/core/ProxyDevice.hpp index 92e34b50e8..88efe62af3 100644 --- a/include/El/core/ProxyDevice.hpp +++ b/include/El/core/ProxyDevice.hpp @@ -24,12 +24,12 @@ class AbstractMatrixReadDeviceProxy proxy_ = new proxy_type{ static_cast const&>(A)}; break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: proxy_ = new proxy_type{ static_cast const&>(A)}; break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("AbstractMatrixReadDeviceProxy: Bad device."); } diff --git a/include/El/core/View/impl.hpp b/include/El/core/View/impl.hpp index cf96fe2384..c3885a5f17 100644 --- a/include/El/core/View/impl.hpp +++ b/include/El/core/View/impl.hpp @@ -66,12 +66,12 @@ void View(AbstractMatrix& A, AbstractMatrix& B) View(static_cast&>(A), static_cast&>(B)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: View(static_cast&>(A), static_cast&>(B)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unsupported device type."); } @@ -88,12 +88,12 @@ void LockedView(AbstractMatrix& A, const AbstractMatrix& B) LockedView(static_cast&>(A), static_cast&>(B)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: LockedView(static_cast&>(A), static_cast&>(B)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unsupported device type."); } @@ -441,12 +441,12 @@ void View(AbstractMatrix& A, AbstractMatrix& B, View(static_cast&>(A), static_cast&>(B), I, J); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: View(static_cast&>(A), static_cast&>(B), I, J); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unsupported device type."); } @@ -464,12 +464,12 @@ void LockedView(AbstractMatrix& A, AbstractMatrix const& B, LockedView(static_cast&>(A), static_cast&>(B), I, J); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: LockedView(static_cast&>(A), static_cast&>(B), I, J); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unsupported device type."); } diff --git a/include/El/core/imports/aluminum.hpp b/include/El/core/imports/aluminum.hpp index 3705916c98..fbb4674dae 100644 --- a/include/El/core/imports/aluminum.hpp +++ b/include/El/core/imports/aluminum.hpp @@ -135,7 +135,7 @@ struct BackendsForDeviceT };// struct BackendsForDeviceT // Prefer the NCCL2 backend -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template <> struct BackendsForDeviceT { @@ -151,18 +151,18 @@ struct BackendsForDeviceT #endif // HYDROGEN_HAVE_AL_MPI_CUDA >; };// struct BackendsForDeviceT -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU // Helper using statement template using BackendsForDevice = typename BackendsForDeviceT::type; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU using AllAluminumBackends = Join, BackendsForDevice>; #else using AllAluminumBackends = BackendsForDevice; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template struct DeviceForBackendT; @@ -173,7 +173,7 @@ struct DeviceForBackendT constexpr static Device value 
= Device::CPU; }; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #ifdef HYDROGEN_HAVE_NCCL2 template <> struct DeviceForBackendT @@ -188,7 +188,7 @@ struct DeviceForBackendT constexpr static Device value = Device::GPU; }; #endif // HYDROGEN_HAVE_AL_MPI_CUDA -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template constexpr Device DeviceForBackend() @@ -262,16 +262,13 @@ template <> struct SyncInfoManager { SyncInfoManager(std::string const& backend_name) + : si_{CreateNewSyncInfo()} { - H_CHECK_CUDA( - cudaEventCreateWithFlags(&si_.event_, cudaEventDisableTiming)); - H_CHECK_CUDA( - cudaStreamCreateWithFlags(&si_.stream_, cudaStreamNonBlocking)); #ifdef HYDROGEN_HAVE_NVPROF // Name the stream for debugging purposes std::string const stream_name = "H: Comm (" + backend_name + ")"; - nvtxNameCudaStreamA(si_.stream_, stream_name.c_str()); + nvtxNameCudaStreamA(si_.Stream(), stream_name.c_str()); #else (void) backend_name; #endif // HYDROGEN_HAVE_NVPROF @@ -280,10 +277,7 @@ struct SyncInfoManager { try { - H_CHECK_CUDA( - cudaEventDestroy(si_.event_)); - H_CHECK_CUDA( - cudaStreamDestroy(si_.stream_)); + DestroySyncInfo(si_); } catch (std::exception const& e) { diff --git a/include/El/core/imports/mpi.hpp b/include/El/core/imports/mpi.hpp index 9f815aee64..d51c4d1384 100644 --- a/include/El/core/imports/mpi.hpp +++ b/include/El/core/imports/mpi.hpp @@ -143,6 +143,36 @@ struct Types static void Destroy(); }; +// Silence clang warnings. These are ETI'd in src/core/mpi_register.hpp. +#if !defined H_INSTANTIATING_MPI_TYPES_STRUCT +extern template struct Types; +extern template struct Types; +extern template struct Types; +extern template struct Types; +#ifdef EL_USE_64BIT_INTS +extern template struct Types; // Avoid conflict with Int +#endif +extern template struct Types; +extern template struct Types; +#ifndef EL_USE_64BIT_INTS +extern template struct Types; // Avoid conflict with Int +#endif + +#define PROTO(T) \ + extern template struct Types; \ + extern template struct Types>; \ + extern template struct Types>; + +#define EL_ENABLE_DOUBLEDOUBLE +#define EL_ENABLE_QUADDOUBLE +#define EL_ENABLE_QUAD +#define EL_ENABLE_BIGINT +#define EL_ENABLE_BIGFLOAT +#define EL_ENABLE_HALF +#include +#undef PROTO +#endif // !defined H_INSTANTIATING_MPI_TYPES_STRUCT + template struct MPIBaseHelper { typedef T value; }; template diff --git a/include/El/core/imports/mpi/aluminum_comm.hpp b/include/El/core/imports/mpi/aluminum_comm.hpp index 4e892df2d3..04a0b71bc2 100644 --- a/include/El/core/imports/mpi/aluminum_comm.hpp +++ b/include/El/core/imports/mpi/aluminum_comm.hpp @@ -71,11 +71,11 @@ inline bool SyncInfoEquiv(SyncInfo const&, return true; } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU inline bool SyncInfoEquiv(SyncInfo const& a, SyncInfo const& b) EL_NO_EXCEPT { - return a.stream_ == b.stream_; + return a.Stream() == b.Stream(); } #endif @@ -221,14 +221,14 @@ class AluminumComm return std::make_shared(comm); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template std::shared_ptr MakeWithSyncInfo( MPI_Comm comm, SyncInfo const& syncinfo) const { - return std::make_shared(comm, syncinfo.stream_); + return std::make_shared(comm, syncinfo.Stream()); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU }; // class AluminumComm }// namespace mpi diff --git a/include/El/core/imports/mpi/meta.hpp b/include/El/core/imports/mpi/meta.hpp index baf34d5e8c..1686e76d46 100644 --- a/include/El/core/imports/mpi/meta.hpp +++ b/include/El/core/imports/mpi/meta.hpp @@ -30,7 +30,7 @@ 
namespace mpi template struct IsMpiDeviceValidType : IsDeviceValidType {}; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Signed integer types template <> struct IsMpiDeviceValidType : std::true_type {}; @@ -57,7 +57,7 @@ struct IsMpiDeviceValidType : std::true_type {}; template <> struct IsMpiDeviceValidType : std::true_type {}; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #ifdef HYDROGEN_HAVE_ALUMINUM namespace internal diff --git a/include/El/macros/DeviceGuardAndPayload.h b/include/El/macros/DeviceGuardAndPayload.h index 2c6645361c..2b7c08bb61 100644 --- a/include/El/macros/DeviceGuardAndPayload.h +++ b/include/El/macros/DeviceGuardAndPayload.h @@ -30,7 +30,7 @@ ELSEIF_GUARD_AND_PAYLOAD(STAR,VR ,ELEMENT,Device::CPU) ELSEIF_GUARD_AND_PAYLOAD(VC, STAR,ELEMENT,Device::CPU) ELSEIF_GUARD_AND_PAYLOAD(VR, STAR,ELEMENT,Device::CPU) -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU ELSEIF_GUARD_AND_PAYLOAD(CIRC,CIRC,ELEMENT,Device::GPU) ELSEIF_GUARD_AND_PAYLOAD(MC, MR ,ELEMENT,Device::GPU) ELSEIF_GUARD_AND_PAYLOAD(MC, STAR,ELEMENT,Device::GPU) @@ -45,7 +45,7 @@ ELSEIF_GUARD_AND_PAYLOAD(STAR,VC ,ELEMENT,Device::GPU) ELSEIF_GUARD_AND_PAYLOAD(STAR,VR ,ELEMENT,Device::GPU) ELSEIF_GUARD_AND_PAYLOAD(VC, STAR,ELEMENT,Device::GPU) ELSEIF_GUARD_AND_PAYLOAD(VR, STAR,ELEMENT,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU ELSEIF_GUARD_AND_PAYLOAD(CIRC,CIRC,BLOCK,Device::CPU) ELSEIF_GUARD_AND_PAYLOAD(MC, MR ,BLOCK,Device::CPU) diff --git a/include/hydrogen/Device.hpp b/include/hydrogen/Device.hpp index 69ac5d2eb5..ecc789851f 100644 --- a/include/hydrogen/Device.hpp +++ b/include/hydrogen/Device.hpp @@ -149,5 +149,23 @@ using SameDevice = EnumSame; */ template struct InterDeviceCopy; +// These should replace the InterDeviceCopy struct. +#if 0 +template +void MemCopy1DAsync( + T const* __restrict__ const src, + T * __restrict__ const dest, + size_t const size, + SyncInfo const& srcSyncInfo, + SyncInfo const& destSyncInfo); + +template +void MemCopy2DAsync( + T const* __restrict__ const src, size_t const src_ldim, + T * __restrict__ const dest, size_t const dest_ldim, + size_t const height, size_t const width, + SyncInfo const& srcSyncInfo, + SyncInfo const& destSyncInfo); +#endif // 0 }// namespace hydrogen #endif // EL_CORE_DEVICE_HPP_ diff --git a/include/hydrogen/Error.hpp b/include/hydrogen/Error.hpp new file mode 100644 index 0000000000..85ccd8e770 --- /dev/null +++ b/include/hydrogen/Error.hpp @@ -0,0 +1,97 @@ +#ifndef HYDROGEN_ERROR_HPP_ +#define HYDROGEN_ERROR_HPP_ + +#include +#include +#include +#include + +// "Basic exceptions" are those that are constructible with their +// "what string", similar to std::runtime_error and std::logic_error. 
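+//
+// For illustration only: a hypothetical basic exception and an
+// assertion built from the macros defined below. MyError is not a
+// real Hydrogen type; it merely shows the intended pattern.
+//
+//   H_ADD_BASIC_EXCEPTION_CLASS(MyError, std::runtime_error);
+//
+//   void CheckInvariant(bool ok)
+//   {
+//       H_ASSERT(ok, MyError, "invariant violated");
+//   }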
+ +#define H_THROW_BASIC_ASSERT_EXCEPTION(cond,excptn, msg) \ + do \ + { \ + std::ostringstream tbe_oss__; \ + tbe_oss__ << "Assertion\n\n" \ + << " " #cond << "\n\n" \ + << "in function\n\n" \ + << " " << H_PRETTY_FUNCTION << "\n\n" \ + << "failed!\n\n" \ + << "{\n" \ + << " File: " << __FILE__ << "\n" \ + << " Line: " << __LINE__ << "\n" \ + << " Mesg: " << msg << "\n" \ + << "}\n"; \ + ::hydrogen::break_on_me(); \ + throw excptn(tbe_oss__.str()); \ + } while (false) + +#define H_REPORT_DTOR_EXCEPTION_AND_TERMINATE(excptn) \ + do \ + { \ + std::ostringstream dtor_excpt_oss; \ + dtor_excpt_oss << "An exception was detected in a destructor!\n\n" \ + << "File: " << __FILE__ << "\n" \ + << "Line: " << __LINE__ << "\n" \ + << "Function: " << H_PRETTY_FUNCTION << "\n" \ + << "Exception:\n\n" << excptn.what() << "\n\n" \ + << "Now calling std::terminate(). Good bye.\n"; \ + std::cerr << dtor_excpt_oss.str() << std::endl; \ + ::hydrogen::break_on_me(); \ + } while (false) + +// +// ASSERTIONS +// + +#define H_ASSERT(cond, excptn, msg) \ + if (!(cond)) \ + H_THROW_BASIC_ASSERT_EXCEPTION(cond, excptn, msg) + +#define H_ASSERT_FALSE(cond, excptn, msg) \ + if (cond) \ + H_THROW_BASIC_ASSERT_EXCEPTION(!(cond), excptn, msg) + + +// +// Exception classes +// + +// Really, "basic exceptions" are just those that have no data and +// forward all their arguments to their parent. +#define H_ADD_BASIC_EXCEPTION_CLASS(name, parent) \ + struct name : parent \ + { \ + template \ + name(Ts&&... args) \ + : parent(std::forward(args)...) \ + {} \ + } + +namespace hydrogen +{ + +/** @class RuntimeError + * @brief The base exception for runtime errors thrown by Hydrogen. + * + * Runtime errors are those that are due to factors external to the + * program. + */ +//H_ADD_BASIC_EXCEPTION_CLASS(RuntimeError, std::runtime_error); + +/** @class LogicError + * @brief The base exception for logic errors thrown by Hydrogen. + * + * Logic errors are those due to factors internal to the program and + * are more likely to be preventable than RuntimeErrors. + */ +//H_ADD_BASIC_EXCEPTION_CLASS(LogicError, std::logic_error); + +/** @brief A no-op that can be set as a predictable breakpoint in a + * debugger. + */ +void break_on_me(); + +}// namespace hydrogen +#endif /* HYDROGEN_ERROR_HPP_ */ diff --git a/include/hydrogen/MultiSync.hpp b/include/hydrogen/MultiSync.hpp new file mode 100644 index 0000000000..ab8900d044 --- /dev/null +++ b/include/hydrogen/MultiSync.hpp @@ -0,0 +1,80 @@ +#ifndef HYDROGEN_MULTISYNC_HPP_ +#define HYDROGEN_MULTISYNC_HPP_ + +#include "Device.hpp" +#include "SyncInfoBase.hpp" +#include "SynchronizeAPI.hpp" +#include "meta/IndexSequence.hpp" + +#include + +namespace hydrogen +{ + +/** \class MultiSync + * \brief RAII class to wrap a bunch of SyncInfo objects. + * + * Provides basic synchronization for the common case in which an + * operation may act upon objects that exist on multiple distinct + * synchronous processing elements (e.g., cudaStreams) but actual + * computation can only occur on one of them. + * + * Constructing an object of this class will cause the master + * processing element to wait on the others, asynchronously with + * respect to the CPU, if possible. Symmetrically, destruction of + * this object will cause the other processing elements to wait on + * the master processing element, asynchronously with respect to the + * CPU, if possible. + * + * The master processing element is assumed to be the first SyncInfo + * passed into the constructor. 
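+ *
+ * A minimal usage sketch (the matrices A and B are hypothetical):
+ *
+ * @code
+ * // On construction, B's stream waits for A's pending work. Kernels
+ * // are then launched on B's stream via the implicit conversion
+ * // below. On scope exit, A's stream waits for B's stream.
+ * auto multisync = MakeMultiSync(SyncInfoFromMatrix(B),
+ *                                SyncInfoFromMatrix(A));
+ * @endcode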
+ */ +template +class MultiSync +{ + using sync_tuple_type = std::tuple...>; + using sync_master_type = + typename std::tuple_element<0, sync_tuple_type>::type; +public: + MultiSync(SyncInfo const&... syncInfos) + : syncInfos_{syncInfos...} + { + MasterWaitOnAll(syncInfos...); + } + + ~MultiSync() + { + DTorImpl_(MakeIndexSequence()); + } + + /** @brief Implicitly convert to the master. + * + * This is to be able to pass a multisync in place of a SyncInfo + * object. It is common to create a MultiSync and then pass its + * master to a bunch of other calls. This simplifies things by + * not needing to store an external reference to the master + * SyncInfo. + */ + operator sync_master_type const& () const noexcept + { + return std::get<0>(syncInfos_); + } + +private: + template + void DTorImpl_(IndexSequence) + { + AllWaitOnMaster(std::get(syncInfos_)...); + } + + sync_tuple_type syncInfos_; +};// class MultiSync + +template +auto MakeMultiSync(SyncInfo const&... syncInfos) -> MultiSync +{ + return MultiSync(syncInfos...); +} + +}// namespace hydrogen +#endif // HYDROGEN_MULTISYNC_HPP_ diff --git a/include/hydrogen/SyncInfo.hpp b/include/hydrogen/SyncInfo.hpp index 9cf228fe6b..4982e72df5 100644 --- a/include/hydrogen/SyncInfo.hpp +++ b/include/hydrogen/SyncInfo.hpp @@ -1,192 +1,8 @@ -#ifndef EL_CORE_SYNCINFO_HPP_ -#define EL_CORE_SYNCINFO_HPP_ +#ifndef HYDROGEN_SYNCINFO_HPP_ +#define HYDROGEN_SYNCINFO_HPP_ -#include -#include +#include "SyncInfoAllDecl.hpp" +#include "SynchronizeAPI.hpp" +#include "MultiSync.hpp" -#ifdef HYDROGEN_HAVE_CUDA -#include -#endif // HYDROGEN_HAVE_CUDA - -namespace hydrogen -{ - -/** \class SyncInfo - * \brief Manage device-specific synchronization information. - * - * Device-specific synchronization information. For CPUs, this is - * empty since all CPU operations are synchronous with respect to the - * host. For GPUs, this will be a stream and an associated event. - * - * The use-case for this is to cope with the matrix-free part of the - * interface. Many of the copy routines have the paradigm that they - * take Matrixs as arguments and then the host will organize and - * dispatch subkernels that operate on data buffers, i.e., T[] - * data. In the GPU case, for example, this provides a lightweight - * way to pass the CUDA stream through the T* interface without an - * entire matrix (which, semantically, may not make sense). - * - * This also might be useful for interacting with - * Aluminum/MPI/NCCL/whatever. It essentially enables tagged - * dispatch, where the tags possibly contain some extra - * device-specific helpers. - */ -template struct SyncInfo -{ - SyncInfo() {} -};// struct SyncInfo - -// Adding synchronization points is generally a no-op -template -void AddSynchronizationPoint(SyncInfo... 
As) -{} - -template -void Synchronize(SyncInfo const&) -{} - -#ifdef HYDROGEN_HAVE_CUDA - -template <> -struct SyncInfo -{ - SyncInfo() : SyncInfo{GPUManager::Stream(), GPUManager::Event()} {} - - SyncInfo(cudaStream_t stream, cudaEvent_t event) - : stream_{stream}, event_{event} {} - - cudaStream_t stream_; - cudaEvent_t event_; -};// struct SyncInfo - - -inline void AddSynchronizationPoint(SyncInfo const& syncInfo) -{ - H_CHECK_CUDA(cudaEventRecord(syncInfo.event_, syncInfo.stream_)); -} - -inline void AddSynchronizationPoint( - SyncInfo const& A, SyncInfo const& B) -{ - throw std::logic_error("I don't know what should happen here."); -} - -inline void AddSynchronizationPoint( - SyncInfo const& A, SyncInfo const& B) -{ - throw std::logic_error("I don't know what should happen here."); -} - -// This captures the work done on A and forces B to wait for completion -inline void AddSynchronizationPoint( - SyncInfo const& A, SyncInfo const& B) -{ - if (A.stream_ != B.stream_) - { - AddSynchronizationPoint(A); - H_CHECK_CUDA(cudaStreamWaitEvent(B.stream_, A.event_, 0)); - } -} - -// This captures the work done on A and forces B and C to wait for completion -inline void AddSynchronizationPoint( - SyncInfo const& A, - SyncInfo const& B, SyncInfo const& C) -{ - bool const ABdiff = (A.stream_ != B.stream_); - bool const ACdiff = (A.stream_ != C.stream_); - - if (ABdiff || ACdiff) - AddSynchronizationPoint(A); - - if (ABdiff) - H_CHECK_CUDA(cudaStreamWaitEvent(B.stream_, A.event_, 0)); - - if (ACdiff) - H_CHECK_CUDA(cudaStreamWaitEvent(C.stream_, A.event_, 0)); -} - -inline void Synchronize(SyncInfo const& syncInfo) -{ - H_CHECK_CUDA(cudaStreamSynchronize(syncInfo.stream_)); -} - -#endif // HYDROGEN_HAVE_CUDA - -template -void AllWaitOnMaster( - SyncInfo const& Master, SyncInfo const&... Others) -{ - AddSynchronizationPoint(Master, Others...); -} - -template -void MasterWaitOnAll(SyncInfo const& Master) -{} - -template -void MasterWaitOnAll( - SyncInfo const& Master, SyncInfo const& Other, - SyncInfo const&... others) -{ - AddSynchronizationPoint(Other, Master); - MasterWaitOnAll(Master, others...); -} - -/** \class MultiSync - * \brief RAII class to wrap a bunch of SyncInfo objects. - * - * Provides basic synchronization for the common case in which an - * operation may act upon objects that exist on multiple distinct - * synchronous processing elements (e.g., cudaStreams) but actual - * computation can only occur on one of them. - * - * Constructing an object of this class will cause the master - * processing element to wait on the others, asynchronously with - * respect to the CPU, if possible. Symmetrically, destruction of - * this object will cause the other processing elements to wait on - * the master processing element, asynchronously with respect to the - * CPU, if possible. - * - * The master processing element is assumed to be the first SyncInfo - * passed into the constructor. - */ -template -class MultiSync -{ -public: - MultiSync(SyncInfo const&... syncInfos) - : syncInfos_{syncInfos...} - { - SyncMasterToAll_(MakeIndexSequence()); - } - - ~MultiSync() - { - SyncAllToMaster_(MakeIndexSequence()); - } -private: - - template - void SyncMasterToAll_(IndexSequence) - { - MasterWaitOnAll(std::get(syncInfos_)...); - } - - template - void SyncAllToMaster_(IndexSequence) - { - AllWaitOnMaster(std::get(syncInfos_)...); - } - - std::tuple...> syncInfos_; -};// class MultiSync - -template -auto MakeMultiSync(SyncInfo const&... 
syncInfos) -> MultiSync
-{
- return MultiSync(syncInfos...);
-}
-
-}// namespace hydrogen
-#endif // EL_CORE_SYNCINFO_HPP_
+#endif // HYDROGEN_SYNCINFO_HPP_
diff --git a/include/hydrogen/SyncInfoAllDecl.hpp b/include/hydrogen/SyncInfoAllDecl.hpp
new file mode 100644
index 0000000000..023edf897a
--- /dev/null
+++ b/include/hydrogen/SyncInfoAllDecl.hpp
@@ -0,0 +1,12 @@
+#ifndef HYDROGEN_SYNCINFOALLDECL_HPP_
+#define HYDROGEN_SYNCINFOALLDECL_HPP_
+
+#include
+
+#include "SyncInfoBase.hpp"
+
+#ifdef HYDROGEN_HAVE_GPU
+#include "device/gpu/SyncInfo.hpp"
+#endif // HYDROGEN_HAVE_GPU
+
+#endif // HYDROGEN_SYNCINFOALLDECL_HPP_
diff --git a/include/hydrogen/SyncInfoBase.hpp b/include/hydrogen/SyncInfoBase.hpp
new file mode 100644
index 0000000000..72b0ca54ac
--- /dev/null
+++ b/include/hydrogen/SyncInfoBase.hpp
@@ -0,0 +1,129 @@
+#ifndef HYDROGEN_SYNCINFOBASE_HPP_
+#define HYDROGEN_SYNCINFOBASE_HPP_
+
+#include
+
+#include "Device.hpp"
+
+namespace hydrogen
+{
+
+/** \class SyncInfo
+ * \brief Manage device-specific synchronization information.
+ *
+ * Device-specific synchronization information. For CPUs, this is
+ * empty since all CPU operations are synchronous with respect to the
+ * host. For GPUs, this will be a stream and an associated event.
+ *
+ * The use-case for this is to cope with the matrix-free part of the
+ * interface. Many of the copy routines have the paradigm that they
+ * take Matrix objects as arguments and then the host will organize and
+ * dispatch subkernels that operate on data buffers, i.e., T[]
+ * data. In the GPU case, for example, this provides a lightweight
+ * way to pass the CUDA stream through the T* interface without an
+ * entire matrix (which, semantically, may not make sense).
+ *
+ * This also might be useful for interacting with
+ * Aluminum/MPI/NCCL/whatever. It essentially enables tagged
+ * dispatch, where the tags possibly contain some extra
+ * device-specific helpers.
+ */
+template
+class SyncInfo;
+
+template <>
+class SyncInfo
+{
+public:
+ SyncInfo() noexcept = default;
+ ~SyncInfo() noexcept = default;
+};// struct SyncInfo
+
+template
+bool operator==(SyncInfo const&, SyncInfo const&)
+{
+ return true;
+}
+
+template
+bool operator!=(SyncInfo const&, SyncInfo const&)
+{
+ return false;
+}
+
+template
+bool operator==(SyncInfo const&, SyncInfo const&)
+{
+ return false;
+}
+
+template
+bool operator!=(SyncInfo const&, SyncInfo const&)
+{
+ return true;
+}
+
+/** @brief Get a new instance of a certain SyncInfo class.
+ *
+ * For CPU, this will be empty, as usual. For GPU, this will have a
+ * *new* stream and event.
+ */
+template
+SyncInfo CreateNewSyncInfo();
+
+/** @brief Create a new CPU SyncInfo object. */
+template <>
+inline SyncInfo CreateNewSyncInfo()
+{
+ return SyncInfo{};
+}
+
+/** @brief Destroy any internal state in the SyncInfo object.
+ *
+ * For CPU, this will do nothing. For GPU, this will destroy the
+ * stream and event.
+ */
+template
+void DestroySyncInfo(SyncInfo&);
+
+/** @brief Destroy the CPU SyncInfo. */
+inline void DestroySyncInfo(SyncInfo&) noexcept {}
+
+/** @brief Synchronize the SyncInfo with the main (CPU) thread. */
+template
+void Synchronize(SyncInfo const&);
+
+inline void Synchronize(SyncInfo const&) {}
+
+/** @brief Add information to the SyncInfo object identifying this
+ * execution point.
+ */
+template
+void AddSynchronizationPoint(
+ SyncInfo const& master,
+ SyncInfo const&...
others); + +inline void AddSynchronizationPoint(SyncInfo const&) +{} + +inline void AddSynchronizationPoint(SyncInfo const&, + SyncInfo const&) +{} + +inline void AddSynchronizationPoint(SyncInfo const&, + SyncInfo const&, + SyncInfo const&) +{} + +namespace details +{ +template +void AddSyncPoint(SyncInfo const&, SyncInfo const&); + +inline void AddSyncPoint(SyncInfo const&, + SyncInfo const&) noexcept +{} + +}// namespace details +}// namespace hydrogen +#endif // HYDROGEN_SYNCINFOBASE_HPP_ diff --git a/include/hydrogen/SynchronizeAPI.hpp b/include/hydrogen/SynchronizeAPI.hpp new file mode 100644 index 0000000000..339bcf355e --- /dev/null +++ b/include/hydrogen/SynchronizeAPI.hpp @@ -0,0 +1,41 @@ +#ifndef HYDROGEN_SYNCHRONIZEAPI_HPP_ +#define HYDROGEN_SYNCHRONIZEAPI_HPP_ + +#include "SyncInfo.hpp" + +namespace hydrogen +{ + +// This synchronizes the additional SyncInfos to the "master". That +// is, the execution streams described by the "others" will wait +// for the "master" stream. +template +void AddSynchronizationPoint( + SyncInfo const& master, + SyncInfo const&... others) +{ + AddSynchronizationPoint(master); + + int dummy[] = { (details::AddSyncPoint(master, others), 0)... }; + (void) dummy; +} + +template +void AllWaitOnMaster( + SyncInfo const& master, SyncInfo const&... others) +{ + AddSynchronizationPoint(master, others...); +} + +template +void MasterWaitOnAll( + SyncInfo const& master, + SyncInfo const&... others) +{ + int dummy[] = { + (AddSynchronizationPoint(others, master), 0)...}; + (void) dummy; +} + +}// namespace hydrogen +#endif // HYDROGEN_SYNCHRONIZEAPI_HPP_ diff --git a/include/hydrogen/blas/BLAS_Common.hpp b/include/hydrogen/blas/BLAS_Common.hpp index 18b93a2a31..80ab5978a9 100644 --- a/include/hydrogen/blas/BLAS_Common.hpp +++ b/include/hydrogen/blas/BLAS_Common.hpp @@ -16,9 +16,12 @@ enum class BLAS_Op AXPY, COPY, DGMM, + DOT, GEAM, GEMM, + GEMMSTRIDEDBATCHED, GEMV, + NRM2, SCAL, /** @brief Axpy for 2D data with leading dimension */ AXPY2D, @@ -74,5 +77,17 @@ enum class SideMode RIGHT, }; +/** @brief Describes where pointers point. */ +enum class PointerMode +{ + HOST, + DEVICE, +};// enum class PointerMode + +namespace gpu_blas +{ +/** @brief Set the pointer mode of the underlying library. */ +void SetPointerMode(PointerMode mode); +} }// namespace hydrogen #endif // HYDROGEN_BLAS_COMMON_HPP_ diff --git a/include/hydrogen/blas/GPU_BLAS_decl.hpp b/include/hydrogen/blas/GPU_BLAS_decl.hpp index 2eef6440c4..6406110c73 100644 --- a/include/hydrogen/blas/GPU_BLAS_decl.hpp +++ b/include/hydrogen/blas/GPU_BLAS_decl.hpp @@ -256,6 +256,49 @@ void Copy(SizeT num_rows, SizeT num_cols, T* B, SizeT row_stride_B, SizeT ldb, SyncInfo const& syncinfo); +/** @brief A dot-product operation for 1-D memory. + * + * @tparam T (Inferred) The type of data. + * @tparam SizeT (Inferred) The type used to express size information. + * + * @param num_entries The number of entries in X and Y. + * @param X The first vector (device memory). + * @param stride_X The stride of X. + * @param Y The second vector (device memory). + * @param stride_Y The stride of Y. + * @param result The result of the dot product (host or device memory). + * @param[in] syncinfo The synchronization information for this + * operation. + * + * @ingroup device_blas + */ +template +void Dot(SizeT num_entries, + T const* X, SizeT stride_X, + T const* Y, SizeT stride_Y, + T* result, + SyncInfo const& syncinfo); + +/** @brief Computes the 2-norm of 1-D memory. + * + * @tparam T (Inferred) The type of data. 
+ * @tparam SizeT (Inferred) The type used to express size information.
+ *
+ * @param num_entries The number of entries in X.
+ * @param X The vector (device memory).
+ * @param stride_X The stride of X.
+ * @param result The result of the 2-norm (host or device memory).
+ * @param[in] syncinfo The synchronization information for this
+ * operation.
+ *
+ * @ingroup device_blas
+ */
+template
+void Nrm2(SizeT num_entries,
+ T const* X, SizeT stride_X,
+ T* result,
+ SyncInfo const& syncinfo);
+
/** @brief 1-D Scale operation in GPU memory.
*
* This is in-place scaling:
@@ -391,6 +434,53 @@ void Gemm(
 T* C, SizeT ldc,
 SyncInfo const& syncinfo);
+/** @brief Batched, strided matrix-matrix product in GPU memory.
+ *
+ * @todo Write documentation.
+ *
+ * @tparam T (Inferred) The type of the data. Should be a field.
+ * @tparam SizeT (Inferred) The type used to express size information.
+ * @tparam StrideT (Inferred) The type used to express stride information.
+ *
+ * @param[in] transpA The operation flag for `A` indicating `NORMAL`,
+ * `TRANSPOSE`, or `CONJ_TRANSPOSE`.
+ * @param[in] transpB The operation flag for `B` indicating `NORMAL`,
+ * `TRANSPOSE`, or `CONJ_TRANSPOSE`.
+ * @param[in] m The number of rows in `op(A)` and C.
+ * @param[in] n The number of columns in `op(B)` and C.
+ * @param[in] k The number of columns in `op(A)` and rows in `op(B)`.
+ * @param[in] alpha The scaling term on the multiplicative term.
+ * @param[in] A A matrix in column-major format.
+ * @param[in] lda The leading dimension of A.
+ * @param[in] strideA The stride between consecutive A matrices.
+ * @param[in] B A matrix in column-major format.
+ * @param[in] ldb The leading dimension of B.
+ * @param[in] strideB The stride between consecutive B matrices.
+ * @param[in] beta The scaling applied to the input value of the
+ * target matrix.
+ * @param[in,out] C The target matrix. Initial values are scaled by
+ * beta and updated with the result of the product.
+ * @param[in] ldc The leading dimension of C.
+ * @param[in] strideC The stride between consecutive C matrices.
+ * @param[in] batchCount The number of GEMMs in the batch.
+ * @param[in] syncinfo The synchronization information for this
+ * operation.
+ *
+ * @ingroup device_blas
+ */
+template
+void GemmStridedBatched(
+ TransposeMode transpA, TransposeMode transpB,
+ SizeT m, SizeT n, SizeT k,
+ T const& alpha,
+ T const* A, SizeT lda, StrideT strideA,
+ T const* B, SizeT ldb, StrideT strideB,
+ T const& beta,
+ T* C, SizeT ldc, StrideT strideC,
+ SizeT batchCount,
+ SyncInfo const& syncinfo);
+
+
///@}
/** @name BLAS-like Extension Routines */
///@{
diff --git a/include/hydrogen/blas/GPU_BLAS_impl.hpp b/include/hydrogen/blas/GPU_BLAS_impl.hpp
index 82e7caf2a5..14cfbd0656 100644
--- a/include/hydrogen/blas/GPU_BLAS_impl.hpp
+++ b/include/hydrogen/blas/GPU_BLAS_impl.hpp
@@ -40,7 +40,7 @@ namespace gpu_blas_impl = hydrogen::cublas;
// needs.
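
Since the batched interface is new in this patch, a hedged call sketch may help; the device pointers, scalars, and syncinfo below are hypothetical, and the call assumes the same namespace that provides the Gemm declaration above:

    // 64 independent 32x32 GEMMs packed contiguously in device memory:
    // C_i = alpha*A_i*B_i + beta*C_i for i = 0, ..., 63.
    GemmStridedBatched(TransposeMode::NORMAL, TransposeMode::NORMAL,
                       32, 32, 32,
                       alpha,
                       dA, 32, 32*32,  // consecutive A_i sit 32*32 entries apart
                       dB, 32, 32*32,
                       beta,
                       dC, 32, 32*32,
                       64, syncinfo);
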
#define GPU_BLAS_USE_ROCBLAS -#include +#include namespace gpu_blas_impl = hydrogen::rocblas; @@ -340,7 +340,7 @@ void Axpy2DImpl(SizeT nrows, SizeT ncols, { Axpy_GPU_impl(nrows, ncols, alpha, A, SizeT(1), lda, - B, SizeT(1), ldb, si.stream_); + B, SizeT(1), ldb, si); } template const& si) { Axpy_GPU_impl( - transpA, nrows, ncols, alpha, A, lda, B, ldb, si.stream_); + transpA, nrows, ncols, alpha, A, lda, B, ldb, si); } template const& si) { - Copy_GPU_impl(size, X, incx, Y, incy, si.stream_); + Copy_GPU_impl(size, X, incx, Y, incy, si); } template const& si) { - Scale_GPU_impl(size, alpha, X, incx, si.stream_); + Scale_GPU_impl(size, alpha, X, incx, si); } template const& si) { - Scale_GPU_impl(nrows, ncols, alpha, A, lda, si.stream_); + Scale_GPU_impl(nrows, ncols, alpha, A, lda, si); } // diff --git a/include/hydrogen/blas/gpu/Axpy.hpp b/include/hydrogen/blas/gpu/Axpy.hpp index 001d7eabbd..34e4364534 100644 --- a/include/hydrogen/blas/gpu/Axpy.hpp +++ b/include/hydrogen/blas/gpu/Axpy.hpp @@ -5,7 +5,11 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif #include @@ -20,27 +24,27 @@ namespace hydrogen * @tparam T (Inferred) The type of data. Must be the same for source * and destination matrices. * - * @param num_rows The number of rows in the matrix - * @param num_cols The number of columns in the matrix - * @param alpha The scaling factor - * @param src The source matrix, in column-major ordering. Must not - * overlap with the destination matrix. - * @param src_row_stride The number of `T`s between rows in a column - * of the source matrix. For "traditional" packed matrices, this - * will be "1". - * @param src_col_stride The number of `T`s between columns in a row - * of the source matrix. For "traditional" packed matrices, this - * will be the leading dimension. - * @param dest The destination matrix, in column-major ordering. Must not - * overlap with the source matrix. - * @param dest_row_stride The number of `T`s between rows in a column - * of the destination matrix. For "traditional" packed matrices, - * this will be "1". - * @param dest_col_stride The number of `T`s between columns in a row - * of the destination matrix. For "traditional" packed matrices, + * @param[in] num_rows The number of rows in the matrix. + * @param[in] num_cols The number of columns in the matrix. + * @param[in] alpha The scaling factor. + * @param[in] src The source matrix, in column-major ordering. Must + * not overlap with the destination matrix. + * @param[in] src_row_stride The number of `T`s between rows in a + * column of the source matrix. For "traditional" packed + * matrices, this will be "1". + * @param[in] src_col_stride The number of `T`s between columns in a + * row of the source matrix. For "traditional" packed matrices, * this will be the leading dimension. - * @param stream The CUDA stream on which the kernel should be - * launched. + * @param[out] dest The destination matrix, in column-major + * ordering. Must not overlap with the source matrix. + * @param[in] dest_row_stride The number of `T`s between rows in a + * column of the destination matrix. For "traditional" packed + * matrices, this will be "1". + * @param[in] dest_col_stride The number of `T`s between columns in a + * row of the destination matrix. For "traditional" packed + * matrices, this will be the leading dimension. + * @param[in] sync_info The sync info wrapping the stream on which + * the kernel should be launched. 
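+ *
+ * A hypothetical call for packed column-major matrices (row stride
+ * 1, column stride equal to the leading dimension):
+ *
+ * @code
+ * Axpy_GPU_impl(m, n, alpha,
+ *               src, SizeT(1), src_ldim,
+ *               dest, SizeT(1), dest_ldim,
+ *               syncinfo);
+ * @endcode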
*/
template >>
@@ -48,7 +52,7 @@ void Axpy_GPU_impl(
 SizeT num_rows, SizeT num_cols,
 T alpha,
 T const* src, SizeT src_row_stride, SizeT src_col_stride,
 T* dest, SizeT dest_row_stride, SizeT dest_col_stride,
- cudaStream_t stream);
+ SyncInfo const& sync_info);
template >,
@@ -56,7 +60,7 @@ template const&)
{
 throw std::logic_error("Axpy: Type not valid on GPU.");
}
@@ -80,8 +84,8 @@ void Axpy_GPU_impl(
 * @param[in,out] B The destination matrix, in column-major
 * ordering. Must not overlap with the source matrix.
 * @param[in] ldb The leading dimension of B.
- * @param[in] stream The CUDA stream on which the kernel should be
- * launched.
+ * @param[in] sync_info The sync info wrapping the stream on which
+ * the kernel should be launched.
 */
template >>
@@ -89,7 +93,7 @@ void Axpy_GPU_impl(
 TransposeMode transpA,
 SizeT num_rows, SizeT num_cols,
 T alpha,
 T const* A, SizeT lda,
 T* B, SizeT ldb,
- cudaStream_t stream);
+ SyncInfo const& sync_info);
}// namespace hydrogen
#endif // HYDROGEN_BLAS_GPU_AXPY_HPP_
diff --git a/include/hydrogen/blas/gpu/Copy.hpp b/include/hydrogen/blas/gpu/Copy.hpp
index a3de39990f..638cc4d7b0 100644
--- a/include/hydrogen/blas/gpu/Copy.hpp
+++ b/include/hydrogen/blas/gpu/Copy.hpp
@@ -4,7 +4,11 @@
#include
#include
-#include
+#ifdef HYDROGEN_HAVE_CUDA
+#include
+#elif defined(HYDROGEN_HAVE_ROCM)
+#include
+#endif
#include
@@ -20,16 +24,17 @@ namespace hydrogen
 * @tparam T (Inferred) The type of data. Must be the same for source
 * and destination matrices.
 *
- * @param num_entries The number of entries in the array
- * @param src The source array. Must not overlap with the destination
- * array.
- * @param src_stride The number of `T`s between entries in the source array.
- * @param dest The destination array. Must not overlap with the
+ * @param[in] num_entries The number of entries in the array.
+ * @param[in] src The source array. Must not overlap with the
+ * destination array.
+ * @param[in] src_stride The number of `T`s between entries in the
+ * source array.
+ * @param[out] dest The destination array. Must not overlap with the
 * source array.
- * @param dest_stride The number of `T`s between entires in the
+ * @param[in] dest_stride The number of `T`s between entries in the
 * destination array.
- * @param stream The CUDA stream on which the kernel should be
- * launched.
+ * @param[in] sync_info The sync info wrapping the stream on which
+ * the kernel should be launched.
 *
 * @throws std::logic_error If the type is not supported on GPU or if
 * the arrays overlap.
@@ -41,7 +46,7 @@ void Copy_GPU_impl(
 SizeT num_entries,
 SrcT const* src, SizeT src_stride,
 DestT* dest, SizeT dest_stride,
- cudaStream_t stream);
+ SyncInfo const& sync_info);
template >,
@@ -51,7 +56,7 @@ void Copy_GPU_impl(
 SizeT const&,
 SrcT const* const&, SizeT const&,
 DestT* const&, SizeT const&,
- cudaStream_t const&)
+ SyncInfo const&)
{
 throw std::logic_error("Type not valid on GPU");
}
@@ -64,26 +69,26 @@ void Copy_GPU_impl(
 * @tparam T (Inferred) The type of data. Must be the same for source
 * and destination matrices.
 *
- * @param num_rows The number of rows in the matrix
- * @param num_cols The number of columns in the matrix
- * @param src The source matrix, in column-major ordering. Must not
- * overlap with the destination matrix.
- * @param src_row_stride The number of `T`s between rows in a column
- * of the source matrix. For "traditional" packed matrices, this
- * will be "1".
- * @param src_col_stride The number of `T`s between columns in a row
- * of the source matrix.
For "traditional" packed matrices, this - * will be the leading dimension. - * @param dest The destination matrix, in column-major ordering. Must not - * overlap with the source matrix. - * @param dest_row_stride The number of `T`s between rows in a column - * of the destination matrix. For "traditional" packed matrices, - * this will be "1". - * @param dest_col_stride The number of `T`s between columns in a row - * of the destination matrix. For "traditional" packed matrices, + * @param[in] num_rows The number of rows in the matrix. + * @param[in] num_cols The number of columns in the matrix. + * @param[in] src The source matrix, in column-major ordering. Must + * not overlap with the destination matrix. + * @param[in] src_row_stride The number of `T`s between rows in a + * column of the source matrix. For "traditional" packed + * matrices, this will be "1". + * @param[in] src_col_stride The number of `T`s between columns in a + * row of the source matrix. For "traditional" packed matrices, * this will be the leading dimension. - * @param stream The CUDA stream on which the kernel should be - * launched. + * @param[out] dest The destination matrix, in column-major + * ordering. Must not overlap with the source matrix. + * @param[in] dest_row_stride The number of `T`s between rows in a + * column of the destination matrix. For "traditional" packed + * matrices, this will be "1". + * @param[in] dest_col_stride The number of `T`s between columns in a + * row of the destination matrix. For "traditional" packed + * matrices, this will be the leading dimension. + * @param[in] sync_info The sync info wrapping the stream on which + * the kernel should be launched. * * @todo See if we can statically assert that the operator= between * SrcT and DestT will succeed on the device. 
@@ -95,7 +100,7 @@ void Copy_GPU_impl( SizeT num_rows, SizeT num_cols, SrcT const* src, SizeT src_row_stride, SizeT src_col_stride, DestT* dest, SizeT dest_row_stride, SizeT dest_col_stride, - cudaStream_t stream); + SyncInfo const& sync_info); template >, @@ -104,7 +109,7 @@ template const&) { throw std::logic_error("Copy: Type not valid on GPU."); } diff --git a/include/hydrogen/blas/gpu/Fill.hpp b/include/hydrogen/blas/gpu/Fill.hpp index 350dfa364b..9fc5af5828 100644 --- a/include/hydrogen/blas/gpu/Fill.hpp +++ b/include/hydrogen/blas/gpu/Fill.hpp @@ -8,7 +8,11 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif #include @@ -18,17 +22,25 @@ namespace hydrogen template >> void Fill_GPU_impl(size_t height, size_t width, T const& alpha, T* buffer, size_t ldim, - cudaStream_t stream); + SyncInfo const& sync_info); template >, typename=void> void Fill_GPU_impl(size_t const&, size_t const&, T const&, T* const&, size_t const&, - cudaStream_t const&) + SyncInfo const&) { throw std::logic_error("Fill: Type not valid on GPU."); } +template >> +void Fill_GPU_1D_impl(T* buffer, size_t const& size, + T const& alpha, + SyncInfo const& sync_info) +{ + Fill_GPU_impl(size, 1, alpha, buffer, size, sync_info); +} + }// namespace hydrogen #endif // HYDROGEN_BLAS_GPU_FILL_HPP_ diff --git a/include/hydrogen/blas/gpu/Hadamard.hpp b/include/hydrogen/blas/gpu/Hadamard.hpp index 3639c1f010..8079ec544d 100644 --- a/include/hydrogen/blas/gpu/Hadamard.hpp +++ b/include/hydrogen/blas/gpu/Hadamard.hpp @@ -8,7 +8,11 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif #include @@ -21,7 +25,7 @@ void Hadamard_GPU_impl( T const* A, size_t row_stride_A, size_t lda, T const* B, size_t row_stride_B, size_t ldb, T* C, size_t row_stride_C, size_t ldc, - cudaStream_t stream); + SyncInfo const& sync_info); template >, @@ -31,7 +35,7 @@ void Hadamard_GPU_impl( T const* const&, size_t const&, size_t const&, T const* const&, size_t const&, size_t const&, T* const&, size_t const&, size_t const&, - cudaStream_t const&) + SyncInfo const&) { throw std::logic_error("Hadamard: Type not valid on GPU."); } diff --git a/include/hydrogen/blas/gpu/Scale.hpp b/include/hydrogen/blas/gpu/Scale.hpp index 9b1277c47b..e7d444f663 100644 --- a/include/hydrogen/blas/gpu/Scale.hpp +++ b/include/hydrogen/blas/gpu/Scale.hpp @@ -4,7 +4,11 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#endif #include @@ -25,8 +29,8 @@ namespace hydrogen * @param[in,out] buffer The array. * @param[in] stride The number of `T`s between entries in the input * array. - * @param[in] stream The CUDA stream on which the kernel should be - * launched. + * @param[in] sync_info The sync info wrapping the stream on which + * the kernel should be launched. * * @throws std::logic_error If the type is not supported on GPU. */ @@ -36,7 +40,7 @@ void Scale_GPU_impl( SizeT num_entries, T const& alpha, T* buffer, SizeT stride, - cudaStream_t stream); + SyncInfo const& sync_info); template >, @@ -45,7 +49,7 @@ void Scale_GPU_impl( SizeT const&, T const&, T const* const&, SizeT const&, - cudaStream_t const&) + SyncInfo const&) { throw std::logic_error("Scale: Type not valid on GPU"); } @@ -63,8 +67,8 @@ void Scale_GPU_impl( * @param[in] alpha The scaling parameter. * @param[in,out] buffer The matrix, in column-major ordering. * @param[in] ldim The leading dimension of the data in buffer. 
- * @param[in] stream The CUDA stream on which the kernel should be
- * launched.
+ * @param[in] sync_info The sync info wrapping the stream on which
+ * the kernel should be launched.
 *
 * @todo See if we can statically assert that the operator*= will
 * succeed on the device.
@@ -75,14 +79,14 @@ void Scale_GPU_impl(
 SizeT num_rows, SizeT num_cols,
 T const& alpha,
 T* buffer, SizeT ldim,
- cudaStream_t stream);
+ SyncInfo const& sync_info);
template >,
 typename=void>
void Scale_GPU_impl(SizeT const&, SizeT const&,
 T const&,
 T const* const&, SizeT const&,
- cudaStream_t const&)
+ SyncInfo const&)
{
 throw std::logic_error("Scale: Type not valid on GPU.");
}
diff --git a/include/hydrogen/blas/gpu/Transpose.hpp b/include/hydrogen/blas/gpu/Transpose.hpp
index 618cbd0539..d8f6e14958 100644
--- a/include/hydrogen/blas/gpu/Transpose.hpp
+++ b/include/hydrogen/blas/gpu/Transpose.hpp
@@ -4,7 +4,11 @@
#include
#include
-#include
+#if defined(HYDROGEN_HAVE_CUDA)
+#include
+#elif defined(HYDROGEN_HAVE_ROCM)
+#include
+#endif
#include
@@ -28,8 +32,8 @@ namespace hydrogen
 * @param[out] dest The destination matrix, in column-major ordering. Must not
 * overlap with the source matrix. Contents will be overwritten.
 * @param[in] ldb The leading dimension of B.
- * @param stream The CUDA stream on which the kernel should be
- * launched.
+ * @param[in] sync_info The sync info wrapping the stream on which
+ * the kernel should be launched.
 */
template >>
@@ -37,7 +41,7 @@ void Transpose_GPU_impl(
 SizeT num_rows, SizeT num_cols,
 T const* A, SizeT lda,
 T* B, SizeT ldb,
- cudaStream_t stream);
+ SyncInfo const& sync_info);
template >,
@@ -46,7 +50,7 @@ void Transpose_GPU_impl(
 SizeT const&, SizeT const&,
 T const* const&, SizeT const&,
 T* const&, SizeT const&,
- cudaStream_t const&)
+ SyncInfo const&)
{
 throw std::logic_error("Copy: Type not valid on GPU.");
}
diff --git a/include/hydrogen/device/GPU.hpp b/include/hydrogen/device/GPU.hpp
new file mode 100644
index 0000000000..ed23297ead
--- /dev/null
+++ b/include/hydrogen/device/GPU.hpp
@@ -0,0 +1,163 @@
+#ifndef HYDROGEN_DEVICE_GPU_HPP_
+#define HYDROGEN_DEVICE_GPU_HPP_
+
+/** @defgroup gpu_mgmt GPU device interaction and management
+ *
+ * These functions provide a runtime-agnostic API for basic
+ * interaction with GPUs. The exposed functionality is deliberately
+ * quite basic and represents the functions needed for Hydrogen.
+ */
+
+#include
+
+#include
+#include
+
+#include
+
+namespace hydrogen
+{
+
+/** @namespace gpu
+ * @brief Interface functions for interacting with the GPU.
+ *
+ * This is basically a "backended" system where the backends are
+ * mutually exclusive and therefore largely hidden from view. At time
+ * of writing, the backends are CUDA and ROCm/HIP. Which backend is
+ * used is determined at configure time based on user-input configure
+ * options and/or system interrogation.
+ *
+ * @note Since HIP is a compatibility layer, it should theoretically
+ * be possible to just universally use the HIP backend. However, we
+ * wish to allow the two backends to evolve independently, and in the
+ * current implementation, CUDA-specific optimizations will be lost
+ * if compiling under HIP (as they will likely be protected by
+ * "HYDROGEN_HAVE_CUDA", which will not be defined in this case).
+ */
+namespace gpu
+{
+
+/** @name Environment management */
+///@{
+
+/** @brief Initialize the GPU driver and runtime.
+ * + * This incorporates anything that needs to be done before kernels + * can be dispatched to the GPU. In CUDA terms, this establishes a + * CUDA context. + * + * @ingroup gpu_mgmt + */ +void Initialize(); + +/** @brief Cleanup and shutdown any GPU driver/runtime state. + * + * This performs any tasks that are required to close the GPU + * environment and leave a clean state. + * + * @ingroup gpu_mgmt + */ +void Finalize(); + +/** @brief Query if the GPU environment is initialized. + * @ingroup gpu_mgmt + */ +bool IsInitialized() noexcept; + +/** @brief Query if the GPU environment is finalized. + * + * Finalized means "not initialized", so an environment that has + * never been initialized is, in this sense, "finalized". + * + * @ingroup gpu_mgmt + */ +inline bool IsFinalized() noexcept { return !IsInitialized(); } + +///@} +/** @name Device management */ +///@{ + +/** @brief Get the number of GPUs visible to this process. + * @throws GPUError If the runtime detects any errors. + * @ingroup gpu_mgmt + */ +size_t DeviceCount(); + +/** @brief Get the ID of the currently selected GPU. + * @throws GPUError If the runtime detects any errors. + * @ingroup gpu_mgmt + */ +int CurrentDevice(); + +/** @brief Get the ID of the default GPU. + * @throws GPUError If the runtime detects any errors. + * @ingroup gpu_mgmt + */ +int DefaultDevice(); + +/** @brief Get the device ID we should be using. + * @details This uses environment variables set by most MPI libraries + * and/or launchers (slurm,lsf) to determine a device ID. Devices + * are assigned round-robin based on local rank. + * @param[in] device_count Number of visible devices. + * @ingroup gpu_mgmt + */ +int ComputeDeviceId(unsigned int device_count) noexcept; + +/** @brief Select the given device. + * + * @param[in] device_id The ID of the device to select. Must be less + * than the number of available GPUs. + * + * @throws GPUError If the runtime detects any errors. + * @ingroup gpu_mgmt + */ +void SetDevice(int device_id); + +/** @brief Block the host until all device execution has completed. + * @throws GPUError If the runtime detects any errors. + * @ingroup gpu_mgmt + */ +void SynchronizeDevice(); + +///@} +/** @name Execution control */ +///@{ + +/** @brief Get the default SyncInfo object for this session. + * + * Note that Hydrogen will use this SyncInfo by default. On CUDA + * platforms, for example, it will be different from the "default + * CUDA stream". + * + * This SyncInfo object will persist for as long as + * IsInitialized(). Note that if the GPU environment is finalized and + * reinitialized, this SyncInfo object in the new environment may + * differ from the previous environment. + * + * @throws GPUError If the runtime detects any errors. + * + * @ingroup gpu_mgmt + */ +SyncInfo const& DefaultSyncInfo() noexcept; + +///@} + +}// namespace gpu + +/** @name SyncInfo management */ +///@{ + +/** @brief Create a new CPU SyncInfo object. */ +template <> +SyncInfo CreateNewSyncInfo(); + +/** @brief Destroy the GPU SyncInfo. 
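+ *
+ * Together with CreateNewSyncInfo() above, this allows explicit
+ * lifetime management of sync objects. A hedged usage sketch, assuming
+ * the matching GPU specialization of CreateNewSyncInfo() declared in
+ * SyncInfoAllDecl.hpp (hypothetical calling code, not part of this
+ * header):
+ * @code
+ * auto si = hydrogen::CreateNewSyncInfo<hydrogen::Device::GPU>();
+ * // ... enqueue kernels and copies on si.Stream() ...
+ * hydrogen::Synchronize(si);     // block the host until si's work completes
+ * hydrogen::DestroySyncInfo(si); // release the stream/event
+ * @endcode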
*/ +void DestroySyncInfo(SyncInfo&); + +///@} + +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_HPP_ diff --git a/include/hydrogen/device/gpu/BasicCopy.hpp b/include/hydrogen/device/gpu/BasicCopy.hpp new file mode 100644 index 0000000000..59e37a5f2f --- /dev/null +++ b/include/hydrogen/device/gpu/BasicCopy.hpp @@ -0,0 +1,12 @@ +#ifndef HYDROGEN_DEVICE_GPU_BASICCOPY_HPP +#define HYDROGEN_DEVICE_GPU_BASICCOPY_HPP + +#include + +#if defined(HYDROGEN_HAVE_CUDA) +#include "cuda/CUDACopy.hpp" +#elif defined(HYDROGEN_HAVE_ROCM) +#include "rocm/ROCmCopy.hpp" +#endif + +#endif // HYDROGEN_DEVICE_GPU_BASICCOPY_HPP diff --git a/include/hydrogen/device/gpu/cuda/CUB.hpp b/include/hydrogen/device/gpu/CUB.hpp similarity index 78% rename from include/hydrogen/device/gpu/cuda/CUB.hpp rename to include/hydrogen/device/gpu/CUB.hpp index 0f94a4bfa1..75f10ad167 100644 --- a/include/hydrogen/device/gpu/cuda/CUB.hpp +++ b/include/hydrogen/device/gpu/CUB.hpp @@ -1,13 +1,24 @@ #ifndef HYDROGEN_IMPORTS_CUB_HPP_ #define HYDROGEN_IMPORTS_CUB_HPP_ +#include "El/hydrogen_config.h" + +#ifdef HYDROGEN_HAVE_CUDA #include #include +#elif defined HYDROGEN_HAVE_ROCM +#include +#endif // HYDROGEN_HAVE_CUB namespace hydrogen { namespace cub { +#ifdef HYDROGEN_HAVE_CUDA +namespace cub_impl = ::cub; +#elif defined HYDROGEN_HAVE_ROCM +namespace cub_impl = ::hipcub; +#endif // HYDROGEN_HAVE_CUDA /** @brief Get singleton instance of CUB memory pool. * @@ -27,7 +38,7 @@ namespace cub * redirect output on a per-rank basis, either through the * features exposed by their MPI launcher or by some other means. */ - ::cub::CachingDeviceAllocator& MemoryPool(); + cub_impl::CachingDeviceAllocator& MemoryPool(); /** Destroy singleton instance of CUB memory pool. */ void DestroyMemoryPool(); diff --git a/include/hydrogen/device/gpu/CUDA.hpp b/include/hydrogen/device/gpu/CUDA.hpp index af2fe77dd6..fe6acd216e 100644 --- a/include/hydrogen/device/gpu/CUDA.hpp +++ b/include/hydrogen/device/gpu/CUDA.hpp @@ -1,361 +1,8 @@ -#ifndef HYDROGEN_IMPORTS_CUDA_HPP_ -#define HYDROGEN_IMPORTS_CUDA_HPP_ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_HPP_ -#include +#include "cuda/CUDAError.hpp" +#include "cuda/CUDALaunchKernel.hpp" +#include "cuda/CUDAManagement.hpp" -#include -#include - -#include -#include -#include - -#include -#include -#include -#include - -namespace hydrogen -{ - -/** @class CudaError - * - * Exception class for CUDA errors. - * - * \todo Clean up the error-handling macros - */ -struct CudaError : std::runtime_error -{ - std::string build_error_string_( - cudaError_t cuda_error, char const* file, int line, bool async = false) - { - std::ostringstream oss; - oss << ( async ? "Asynchronous CUDA error" : "CUDA error" ) - << " (error code=" << cuda_error << ") (" << file << ":" << line << "): " - << cudaGetErrorString(cuda_error); - return oss.str(); - } - CudaError(cudaError_t cuda_error, char const* file, int line, bool async = false) - : std::runtime_error{build_error_string_(cuda_error,file,line,async)} - {} -}; // struct CudaError - -#define H_CUDA_SYNC(async) \ - do \ - { \ - /* Synchronize GPU and check for errors. 
*/ \ - cudaError_t status_CUDA_SYNC = cudaDeviceSynchronize(); \ - if (status_CUDA_SYNC == cudaSuccess) \ - status_CUDA_SYNC = cudaGetLastError(); \ - if (status_CUDA_SYNC != cudaSuccess) { \ - cudaDeviceReset(); \ - throw hydrogen::CudaError(status_CUDA_SYNC,__FILE__,__LINE__,async); \ - } \ - } \ - while( 0 ) -#define H_FORCE_CHECK_CUDA(cuda_call) \ - do \ - { \ - /* Call CUDA API routine, synchronizing before and after to */ \ - /* check for errors. */ \ - H_CUDA_SYNC(true); \ - cudaError_t status_CHECK_CUDA = cuda_call ; \ - if( status_CHECK_CUDA != cudaSuccess ) { \ - cudaDeviceReset(); \ - throw hydrogen::CudaError(status_CHECK_CUDA,__FILE__,__LINE__,false); \ - } \ - H_CUDA_SYNC(false); \ - } while (0) -#define H_FORCE_CHECK_CUDA_NOSYNC(cuda_call) \ - do \ - { \ - /* Call CUDA API routine, and check for errors without */ \ - /* synchronizing. */ \ - cudaError_t status_CHECK_CUDA = cuda_call ; \ - if( status_CHECK_CUDA != cudaSuccess ) { \ - cudaDeviceReset(); \ - throw hydrogen::CudaError(status_CHECK_CUDA,__FILE__,__LINE__,false); \ - } \ - } while (0) -#define H_LAUNCH_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) \ - do \ - { \ - /* Dg is a dim3 specifying grid dimensions. */ \ - /* Db is a dim3 specifying block dimensions. */ \ - /* Ns is a size_t specifying dynamic memory. */ \ - /* S is a cudaStream_t specifying stream. */ \ - kernel <<< Dg, Db, Ns, S >>> args ; \ - } \ - while (0) -#define H_FORCE_CHECK_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) \ - do \ - { \ - /* Launch CUDA kernel, synchronizing before */ \ - /* and after to check for errors. */ \ - H_CUDA_SYNC(true); \ - H_LAUNCH_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args); \ - H_CUDA_SYNC(false); \ - } \ - while (0) - -#ifdef HYDROGEN_RELEASE_BUILD -#define H_CHECK_CUDA( cuda_call ) H_FORCE_CHECK_CUDA_NOSYNC(cuda_call) -#define H_CHECK_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) \ - H_LAUNCH_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) -#else -#define H_CHECK_CUDA( cuda_call ) H_FORCE_CHECK_CUDA( cuda_call ) -#define H_CHECK_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) \ - H_FORCE_CHECK_CUDA_KERNEL(kernel, Dg, Db, Ns, S, args) -#endif // HYDROGEN_RELEASE_BUILD - -// Function to determine if a pointer is GPU memory -inline bool IsGPUMemory(const void* ptr) -{ - cudaPointerAttributes attrs; - auto err = cudaPointerGetAttributes(&attrs, ptr); - if (err == cudaErrorInvalidValue) - { - if ((err = cudaGetLastError()) == cudaErrorInvalidValue) - return false; - else - H_FORCE_CHECK_CUDA(err); - } - else - { -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - if ((err = cudaGetLastError()) == cudaSuccess) - return (attrs.memoryType == cudaMemoryTypeDevice); - else - H_FORCE_CHECK_CUDA(err); -#pragma GCC diagnostic pop - } - return false;// silence compiler warning -} - -/** Initialize CUDA environment. - * We assume that all MPI ranks within a compute node have access to - * exactly one unique GPU or to the same (possibly empty) list of - * GPUs. GPU assignments can be controled with the - * CUDA_VISIBLE_DEVICES environment variable. - */ -void InitializeCUDA(int,char*[]); -/** Finalize CUDA environment. */ -void FinalizeCUDA(); - -/** Singleton class to manage CUDA objects. - * This class also manages cuBLAS objects. Note that the CUDA device - * is set whenever the singleton instance is requested, i.e. in most - * of the static functions. 
- */ -class GPUManager -{ -public: - - GPUManager( const GPUManager& ) = delete; - GPUManager& operator=( const GPUManager& ) = delete; - ~GPUManager(); - - /** Create new singleton instance of CUDA manager. */ - static void Create( int device = 0 ); - /** Initilize CUBLAS. */ - static void InitializeCUBLAS(); - /** Destroy singleton instance of CUDA manager. */ - static void Destroy(); - /** Get singleton instance of CUDA manager. */ - static GPUManager* Instance(); - /** Get number of visible CUDA devices. */ - static unsigned int NumDevices(); - /** Get currently active CUDA device. */ - static int Device(); - /** Set active CUDA device. */ - static void SetDevice( int device ); - /** Get CUDA stream. */ - static cudaStream_t Stream(); - /** Get CUDA event. */ - static cudaEvent_t Event(); - /** Synchronize CUDA stream. */ - static void SynchronizeStream(); - /** Synchronize CUDA device. - * If checkError is true, an exception will be thrown if an error - * from an asynchronous CUDA kernel is detected. - */ - static void SynchronizeDevice( bool checkError = false ); - /** Get cuBLAS handle. */ - static cublasHandle_t cuBLASHandle(); - -private: - - /** Singleton instance. */ - static std::unique_ptr instance_; - - /** Number of visible CUDA devices. */ - unsigned int numDevices_; - /** Currently active CUDA device. */ - int device_; - /** CUDA stream. */ - cudaStream_t stream_; - /** CUDA event. */ - cudaEvent_t event_; - /** cuBLAS handle */ - cublasHandle_t cublasHandle_; - - GPUManager( int device = 0 ); - -}; // class GPUManager - -template -constexpr cudaMemcpyKind CUDAMemcpyKind(); - -template <> -constexpr cudaMemcpyKind CUDAMemcpyKind() -{ - return cudaMemcpyHostToDevice; -} - -template <> -constexpr cudaMemcpyKind CUDAMemcpyKind() -{ - return cudaMemcpyDeviceToHost; -} - -template <> -constexpr cudaMemcpyKind CUDAMemcpyKind() -{ - return cudaMemcpyDeviceToDevice; -} - -template <> -struct InterDeviceCopy -{ - template - static void MemCopy1DAsync( - T * __restrict__ const dest, - T const* __restrict__ const src, - size_t const size, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpyAsync( - dest, src, size*sizeof(T), - CUDAMemcpyKind(), - stream)); - } - -#if defined(HYDROGEN_HAVE_HALF) && defined(HYDROGEN_GPU_USE_FP16) - // These two types are bitwise-compatible across the two devices. 
- static void MemCopy1DAsync(gpu_half_type * __restrict__ const dest, - cpu_half_type const* __restrict__ const src, - size_t const size, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpyAsync( - dest, src, size*sizeof(gpu_half_type), - CUDAMemcpyKind(), - stream)); - } - - static void MemCopy1DAsync( - cpu_half_type * __restrict__ const dest, - gpu_half_type const* __restrict__ const src, - size_t const size, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpyAsync( - dest, src, size*sizeof(gpu_half_type), - CUDAMemcpyKind(), - stream)); - } -#endif // defined(HYDROGEN_HAVE_HALF) && defined(HYDROGEN_GPU_USE_FP16) - - template - static void MemCopy2DAsync( - T * __restrict__ const dest, size_t const dest_ldim, - T const* __restrict__ const src, - size_t const src_ldim, - size_t const height, size_t const width, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpy2DAsync( - dest, dest_ldim*sizeof(T), - src, src_ldim*sizeof(T), - height*sizeof(T), width, - CUDAMemcpyKind(), - stream)); - } - -#if defined(HYDROGEN_HAVE_HALF) && defined(HYDROGEN_GPU_USE_FP16) - // These two types are bitwise-compatible across the two devices. - static void MemCopy2DAsync( - gpu_half_type * __restrict__ const dest, - size_t const dest_ldim, - cpu_half_type const* __restrict__ const src, - size_t const src_ldim, - size_t const height, size_t const width, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpy2DAsync( - dest, dest_ldim*sizeof(gpu_half_type), - src, src_ldim*sizeof(cpu_half_type), - height*sizeof(gpu_half_type), width, - CUDAMemcpyKind(), - stream)); - } - static void MemCopy2DAsync( - cpu_half_type * __restrict__ const dest, - size_t const dest_ldim, - gpu_half_type const* __restrict__ const src, - size_t const src_ldim, - size_t const height, size_t const width, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA(cudaMemcpy2DAsync( - dest, dest_ldim*sizeof(cpu_half_type), - src, src_ldim*sizeof(gpu_half_type), - height*sizeof(gpu_half_type), width, - CUDAMemcpyKind(), - stream)); - } -#endif // defined(HYDROGEN_HAVE_HALF) && defined(HYDROGEN_GPU_USE_FP16) -};// InterDevice - -template <> -struct InterDeviceCopy -{ - template - static void MemCopy1DAsync( - T * __restrict__ const dest, - T const* __restrict__ const src, size_t const size, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpyAsync( - dest, src, size*sizeof(T), - CUDAMemcpyKind(), - stream)); - } - - template - static void MemCopy2DAsync( - T * __restrict__ const dest, size_t const dest_ldim, - T const* __restrict__ const src, size_t const src_ldim, - size_t const height, size_t const width, - cudaStream_t stream = GPUManager::Stream()) - { - H_CHECK_CUDA( - cudaMemcpy2DAsync( - dest, dest_ldim*sizeof(T), - src, src_ldim*sizeof(T), - height*sizeof(T), width, - CUDAMemcpyKind(), - stream)); - } -};// InterDevice - -} // namespace hydrogen - -#endif // HYDROGEN_IMPORTS_CUDA_HPP_ +#endif // HYDROGEN_DEVICE_GPU_CUDA_HPP_ diff --git a/include/hydrogen/device/gpu/GPUError.hpp b/include/hydrogen/device/gpu/GPUError.hpp new file mode 100644 index 0000000000..a2a1e6ca16 --- /dev/null +++ b/include/hydrogen/device/gpu/GPUError.hpp @@ -0,0 +1,18 @@ +#ifndef HYDROGEN_DEVICE_GPUERROR_HPP_ +#define HYDROGEN_DEVICE_GPUERROR_HPP_ + +#include + +#include + +namespace hydrogen +{ + +/** @name ErrorHandling */ +///@{ + +H_ADD_BASIC_EXCEPTION_CLASS(GPUError, std::runtime_error); + +///@} +}// namespace +#endif 
// HYDROGEN_DEVICE_GPUERROR_HPP_ diff --git a/include/hydrogen/device/gpu/ROCm.hpp b/include/hydrogen/device/gpu/ROCm.hpp new file mode 100644 index 0000000000..086dc8227f --- /dev/null +++ b/include/hydrogen/device/gpu/ROCm.hpp @@ -0,0 +1,8 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCM_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCM_HPP_ + +#include "rocm/ROCmError.hpp" +#include "rocm/ROCmLaunchKernel.hpp" +#include "rocm/ROCmManagement.hpp" + +#endif // HYDROGEN_DEVICE_GPU_ROCM_HPP_ diff --git a/include/hydrogen/device/gpu/SyncInfo.hpp b/include/hydrogen/device/gpu/SyncInfo.hpp new file mode 100644 index 0000000000..b685cd2b4e --- /dev/null +++ b/include/hydrogen/device/gpu/SyncInfo.hpp @@ -0,0 +1,12 @@ +#ifndef HYDROGEN_DEVICE_GPU_SYNCINFO_HPP_ +#define HYDROGEN_DEVICE_GPU_SYNCINFO_HPP_ + +#include + +#if defined HYDROGEN_HAVE_CUDA +#include "cuda/SyncInfo.hpp" +#elif defined HYDROGEN_HAVE_ROCM +#include "rocm/SyncInfo.hpp" +#endif + +#endif // HYDROGEN_DEVICE_GPU_SYNCINFO_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/CUDACopy.hpp b/include/hydrogen/device/gpu/cuda/CUDACopy.hpp new file mode 100644 index 0000000000..bcf97eb06d --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/CUDACopy.hpp @@ -0,0 +1,112 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUDACOPY_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUDACOPY_HPP_ + +#include + +#include +#include + +#include +#include + +#include "CUDAError.hpp" + +namespace hydrogen +{ +namespace gpu +{ + +/** @todo Flesh out documentation + * @todo these are actually only valid for "packed" types + */ + +// These functions are synchronous with respect to their SyncInfo +// objects (that is, they require explicit synchronization to the +// host). + +template +void Fill1DBuffer(T* buffer, size_t num_elements, T value, + SyncInfo const& si) +{ + Fill_GPU_1D_impl(buffer, num_elements, value, si); +} + +template +void Copy1DIntraDevice(T const* H_RESTRICT src, T* H_RESTRICT dest, + size_t num_elements, + SyncInfo const& si) +{ + H_CHECK_CUDA( + cudaMemcpyAsync( + dest, src, num_elements*sizeof(T), + cudaMemcpyDeviceToDevice, si.Stream())); +} + +template +void Copy1DToHost(T const* H_RESTRICT src, T* H_RESTRICT dest, + size_t num_elements, + SyncInfo const& src_si) +{ + H_CHECK_CUDA( + cudaMemcpyAsync( + dest, src, num_elements*sizeof(T), + cudaMemcpyDeviceToHost, src_si.Stream())); +} + +template +void Copy1DToDevice(T const* H_RESTRICT src, T* H_RESTRICT dest, + size_t num_elements, + SyncInfo const& dest_si) +{ + H_CHECK_CUDA( + cudaMemcpyAsync( + dest, src, num_elements*sizeof(T), + cudaMemcpyHostToDevice, dest_si.Stream())); +} + + +template +void Copy2DIntraDevice(T const* src, size_t src_ldim, + T* dest, size_t dest_ldim, + size_t height, size_t width, + SyncInfo const& si) +{ + H_CHECK_CUDA( + cudaMemcpy2DAsync( + dest, dest_ldim*sizeof(T), + src, src_ldim*sizeof(T), + height*sizeof(T), width, + cudaMemcpyDeviceToDevice, si.Stream())); +} + +template +void Copy2DToHost(T const* src, size_t src_ldim, + T* dest, size_t dest_ldim, + size_t height, size_t width, + SyncInfo const& src_si) +{ + H_CHECK_CUDA( + cudaMemcpy2DAsync( + dest, dest_ldim*sizeof(T), + src, src_ldim*sizeof(T), + height*sizeof(T), width, + cudaMemcpyDeviceToHost, src_si.Stream())); +} + +template +void Copy2DToDevice(T const* src, size_t src_ldim, + T* dest, size_t dest_ldim, + size_t height, size_t width, + SyncInfo const& dest_si) +{ + H_CHECK_CUDA( + cudaMemcpy2DAsync( + dest, dest_ldim*sizeof(T), + src, src_ldim*sizeof(T), + height*sizeof(T), width, + cudaMemcpyHostToDevice, dest_si.Stream())); +} + 
+}// namespace gpu +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUDACOPY_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/CUDAError.hpp b/include/hydrogen/device/gpu/cuda/CUDAError.hpp new file mode 100644 index 0000000000..e2794e9caa --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/CUDAError.hpp @@ -0,0 +1,52 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDAERROR_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDAERROR_HPP_ + +#include + +#include + +#include + +#ifdef HYDROGEN_GPU_CALLS_ARE_SYNCHRONOUS +#define H_SYNC_CUDA() cudaDeviceSynchronize() +#else +#define H_SYNC_CUDA() +#endif + +// Error handling macro +#define H_CHECK_CUDA(cmd) \ + do \ + { \ + H_SYNC_CUDA(); \ + auto h_check_cuda_error_code__ = cmd; \ + H_ASSERT(h_check_cuda_error_code__ == cudaSuccess, \ + ::hydrogen::CUDAError, \ + (cudaDeviceReset(), \ + ::hydrogen::cuda::BuildCUDAErrorMessage( \ + #cmd, h_check_cuda_error_code__))); \ + H_SYNC_CUDA(); \ + } while (false) + +namespace hydrogen +{ + +/** @class CUDAError + * @brief Exception class representing an error detected by the CUDA + * runtime. + */ +H_ADD_BASIC_EXCEPTION_CLASS(CUDAError, GPUError); + +namespace cuda +{ + +/** @brief Write an error message describing the error detected in CUDA. + * @param[in] cmd The expression that raised the error. + * @param[in] error_code The error code reported by CUDA. + * @returns A string describing the error. + */ +std::string BuildCUDAErrorMessage( + std::string const& cmd, cudaError_t error_code); + +}// namespace cuda +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDAERROR_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/CUDALaunchKernel.hpp b/include/hydrogen/device/gpu/cuda/CUDALaunchKernel.hpp new file mode 100644 index 0000000000..331dc26a58 --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/CUDALaunchKernel.hpp @@ -0,0 +1,31 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDALAUNCHKERNEL_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDALAUNCHKERNEL_HPP_ + +#include + +#include +#include + +#include "CUDAError.hpp" + +namespace hydrogen +{ +namespace gpu +{ + +template +void LaunchKernel( + F kernel, dim3 const& gridDim, dim3 const& blkDim, + size_t sharedMem, SyncInfo const& si, + Args... kernel_args) +{ + void* args[] = { const_cast(reinterpret_cast(&kernel_args))... 
}; + H_CHECK_CUDA( + cudaLaunchKernel( + (void const*) kernel, + gridDim, blkDim, args, sharedMem, si.Stream())); +} + +}// namespace gpu +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDALAUNCHKERNEL_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/CUDAManagement.hpp b/include/hydrogen/device/gpu/cuda/CUDAManagement.hpp new file mode 100644 index 0000000000..e501ef79d7 --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/CUDAManagement.hpp @@ -0,0 +1,22 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDAMANAGEMENT_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDAMANAGEMENT_HPP_ + +#include + +namespace hydrogen +{ + +using gpuEvent_t = cudaEvent_t; +using gpuStream_t = cudaStream_t; + +namespace cuda +{ +cudaEvent_t GetDefaultEvent() noexcept; +cudaStream_t GetDefaultStream() noexcept; +cudaEvent_t GetNewEvent(); +cudaStream_t GetNewStream(); +void FreeEvent(cudaEvent_t& event); +void FreeStream(cudaStream_t& stream); +}// namespace cuda +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDAMANAGEMENT_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/SyncInfo.hpp b/include/hydrogen/device/gpu/cuda/SyncInfo.hpp new file mode 100644 index 0000000000..72eabbf012 --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/SyncInfo.hpp @@ -0,0 +1,84 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_SYNCINFO_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_SYNCINFO_HPP_ + +#include + +#include +#include + +#include "CUDAError.hpp" +#include "CUDAManagement.hpp" + +namespace hydrogen +{ + +template <> +class SyncInfo +{ +public: + SyncInfo() + : SyncInfo{cuda::GetDefaultStream(), cuda::GetDefaultEvent()} + {} + + SyncInfo(cudaStream_t stream, cudaEvent_t event) + : stream_{stream}, event_{event} + {} + + void Merge(SyncInfo const& si) noexcept + { + if (si.stream_) + stream_ = si.stream_; + if (si.event_) + event_ = si.event_; + } + + cudaStream_t Stream() const noexcept { return stream_; } + cudaEvent_t Event() const noexcept { return event_; } +private: + friend void DestroySyncInfo(SyncInfo&); + cudaStream_t stream_; + cudaEvent_t event_; +};// struct SyncInfo + +inline void AddSynchronizationPoint(SyncInfo const& syncInfo) +{ + H_CHECK_CUDA(cudaEventRecord(syncInfo.Event(), syncInfo.Stream())); +} + + +namespace details +{ +inline void AddSyncPoint( + SyncInfo const& master, + SyncInfo const& dependent) +{ +} + +inline void AddSyncPoint( + SyncInfo const& master, + SyncInfo const& dependent) +{ + // The CPU must wait for the GPU to catch up. + Synchronize(master); // wait for "master" +} + +// This captures the work done on A and forces "others" to wait for +// completion. 
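+//
+// A hedged sketch of the intended pattern ("master" and "other" are
+// the parameter names used below; master's event must have been
+// recorded first, e.g. via AddSynchronizationPoint above):
+//
+//   AddSynchronizationPoint(master);      // cudaEventRecord on master's stream
+//   details::AddSyncPoint(master, other); // other's stream waits on that event
+//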
+template +inline +void AddSyncPoint( + SyncInfo const& master, SyncInfo const& other) +{ + if (master.Stream() != other.Stream()) + H_CHECK_CUDA( + cudaStreamWaitEvent(other.Stream(), master.Event(), 0)); +} +}// namespace details + +inline void Synchronize(SyncInfo const& syncInfo) +{ + H_CHECK_CUDA(cudaStreamSynchronize(syncInfo.Stream())); +} + +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_SYNCINFO_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/cuBLAS.hpp b/include/hydrogen/device/gpu/cuda/cuBLAS.hpp index ead7b70f00..936f91b412 100644 --- a/include/hydrogen/device/gpu/cuda/cuBLAS.hpp +++ b/include/hydrogen/device/gpu/cuda/cuBLAS.hpp @@ -1,420 +1,12 @@ -#ifndef HYDROGEN_IMPORTS_CUBLAS_HPP_ -#define HYDROGEN_IMPORTS_CUBLAS_HPP_ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_HPP_ -#include +#include "cuBLASError.hpp" +#include "cuBLASManagement.hpp" +#include "cuBLASMeta.hpp" +#include "cuBLASUtil.hpp" -#include -#include -#include -#include -#include -#include +// The API wrapper declarations +#include "cuBLAS_API.hpp" -namespace hydrogen -{ - -#define ADD_ENUM_TO_STRING_CASE(enum_value) \ - case enum_value: \ - return #enum_value - -/** \class cuBLASError - * \brief Exception class for cuBLAS errors. - */ -struct cuBLASError : std::runtime_error -{ - static std::string get_error_string_(cublasStatus_t status) - { - switch (status) - { - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_SUCCESS); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_NOT_INITIALIZED); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_ALLOC_FAILED); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_INVALID_VALUE); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_ARCH_MISMATCH); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_MAPPING_ERROR); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_EXECUTION_FAILED); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_INTERNAL_ERROR); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_NOT_SUPPORTED); - ADD_ENUM_TO_STRING_CASE(CUBLAS_STATUS_LICENSE_ERROR); - default: - return "unknown cuBLAS error"; - } - } - - std::string build_error_string_( - cublasStatus_t status, char const* file, int line) - { - std::ostringstream oss; - oss << "cuBLAS error (" << file << ":" << line << "): " - << get_error_string_(status); - return oss.str(); - } - - cuBLASError(cublasStatus_t status, char const* file, int line) - : std::runtime_error{build_error_string_(status,file,line)} - {} -};// struct cublasError - -#undef ADD_ENUM_TO_STRING_CASE - -#define H_FORCE_CHECK_CUBLAS(cublas_call) \ - do \ - { \ - /* Check for earlier asynchronous errors. */ \ - H_FORCE_CHECK_CUDA(cudaSuccess); \ - { \ - /* Make cuBLAS call and check for errors. */ \ - const cublasStatus_t status_CHECK_CUBLAS = (cublas_call); \ - if (status_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) \ - { \ - cudaDeviceReset(); \ - throw cuBLASError(status_CHECK_CUBLAS,__FILE__,__LINE__); \ - } \ - } \ - { \ - /* Check for CUDA errors. */ \ - cudaError_t status_CHECK_CUBLAS = cudaDeviceSynchronize(); \ - if (status_CHECK_CUBLAS == cudaSuccess) \ - status_CHECK_CUBLAS = cudaGetLastError(); \ - if (status_CHECK_CUBLAS != cudaSuccess) \ - { \ - cudaDeviceReset(); \ - throw CudaError( \ - status_CHECK_CUBLAS,__FILE__,__LINE__,false); \ - } \ - } \ - } while (0) - -#define H_FORCE_CHECK_CUBLAS_NOSYNC(cublas_call) \ - do \ - { \ - /* Make cuBLAS call and check for errors without */ \ - /* synchronizing. 
*/ \ - const cublasStatus_t status_CHECK_CUBLAS = (cublas_call); \ - if (status_CHECK_CUBLAS != CUBLAS_STATUS_SUCCESS) \ - { \ - cudaDeviceReset(); \ - throw cuBLASError(status_CHECK_CUBLAS,__FILE__,__LINE__); \ - } \ - } while (0) - -#ifdef HYDROGEN_RELEASE_BUILD -#define H_CHECK_CUBLAS(cublas_call) \ - H_FORCE_CHECK_CUBLAS_NOSYNC(cublas_call) -#else -#define H_CHECK_CUBLAS(cublas_call) \ - H_FORCE_CHECK_CUBLAS(cublas_call) -#endif // #ifdef HYDROGEN_RELEASE_BUILD - -namespace cublas -{ - -/** @name cuBLAS utility functions. */ -///@{ - -/** @brief Initialize CUBLAS. - * - * This must be called after `MPI_Init` is called with - * MVAPICH2-GDR. Effectively, this creates the global cuBLAS library - * handle. - */ -void Initialize(); - -/** @class NativeType - * @brief Metafunction mapping type names to CUDA/cuBLAS equivalents. - * - * The mapping should provide bitwise equivalence. - * - * @note This belongs at this level because rocBLAS defines types (or - * names of types) that are local to the BLAS - * implementation. Additionally, it's feasible to conceive of - * custom types on the GPU that would, likewise, need to be - * mapped to the types that cuBLAS knows about. - * - * @todo Add static assertions to ensure only valid types get mapped. - */ -template -struct NativeTypeT; - -// Built-in types are their own native types -template <> struct NativeTypeT { using type = float; }; -template <> struct NativeTypeT { using type = double; }; -template <> -struct NativeTypeT { using type = cuComplex; }; -template <> -struct NativeTypeT { using type = cuDoubleComplex; }; - -// Complex and Double-Complex types require conversion -template <> -struct NativeTypeT> { using type = cuComplex; }; -template <> -struct NativeTypeT> { using type = cuDoubleComplex; }; - -// Half precision requires conversion as well -#ifdef HYDROGEN_GPU_USE_FP16 -template <> struct NativeTypeT<__half> { using type = __half; }; -#ifdef HYDROGEN_HAVE_HALF -template <> struct NativeTypeT { using type = __half; }; -#endif // HYDROGEN_HAVE_HALF -#endif // HYDROGEN_GPU_USE_FP16 - -/** @brief Convenience wrapper for NativeTypeT */ -template -using NativeType = typename NativeTypeT::type; - -#ifndef DOXYGEN_SHOULD_SKIP_THIS -namespace meta_details -{ -template -auto Try_HasNativeType(int) -> SubstitutionSuccess>; -template -auto Try_HasNativeType(...) -> std::false_type; -}// namespace meta_details -#endif // DOXYGEN_SHOULD_SKIP_THIS - -/** @struct HasNativeType - * @brief Predicate that determines if a type is mappable to a - * library-native type. - */ -template -struct HasNativeType : decltype(meta_details::Try_HasNativeType(0)) {}; - -/** @class IsSupportedType_Base - * @brief Predicate indicating that a type is supported within cuBLAS - * for the given operation. - * - * This is used to map internal cuBLAS types to the operations that - * are supported. For example, `float` is always supported but - * `__half` only has support in a few functions. - */ -template -struct IsSupportedType_Base : std::false_type {}; - -template -struct IsSupportedType_Base : std::true_type {}; -template -struct IsSupportedType_Base : std::true_type {}; -template -struct IsSupportedType_Base : std::true_type {}; -template -struct IsSupportedType_Base : std::true_type {}; - -// No need to further test CUDA because this file isn't included if -// either we don't have GPUs at all or we don't have CUDA support. 
-#ifdef HYDROGEN_GPU_USE_FP16 -template <> -struct IsSupportedType_Base<__half, BLAS_Op::AXPY> : std::true_type {}; -template <> -struct IsSupportedType_Base<__half, BLAS_Op::GEMM> : std::true_type {}; -template <> -struct IsSupportedType_Base<__half, BLAS_Op::SCAL> : std::true_type {}; -#endif // HYDROGEN_GPU_USE_FP16 - -/** @class IsSupportedType - * @brief Predicate indicating that the given type is compatible with - * cuBLAS. - * - * This is true when either the type is a compatible cuBLAS type - * (e.g., float) or when it is binarily equivalent to one (e.g., - * std::complex).. - */ -template ::value> -struct IsSupportedType - : IsSupportedType_Base, op> -{}; - -template -struct IsSupportedType : std::false_type {}; - -/** @brief cuBLAS uses ints to represent sizes. */ -using SizeT = int; - -/** @brief Convert a value to the size type expected by the cuBLAS - * library. - * - * If `HYDROGEN_DO_BOUNDS_CHECKING` is defined, this will do a - * "safe cast" (it will verify that `val` is in the dynamic range of - * `int`. Otherwise it will do a regular static_cast. - */ -template -#ifdef HYDROGEN_DO_BOUNDS_CHECKING -SizeT ToSizeT(T const& val) -{ - return narrow_cast(val); -} -#else -SizeT ToSizeT(T const& val) noexcept -{ - return static_cast(val); -} -#endif // HYDROGEN_DO_BOUNDS_CHECKING - -/** @brief Overload to prevent extra work in the case of dynamic range checking. */ -inline SizeT ToSizeT(SizeT const& val) noexcept -{ - return val; -} - -/** @brief Convert an TransposeMode to the cuBLAS operation type. */ -inline cublasOperation_t -ToNativeTransposeMode(TransposeMode const& orient) noexcept -{ - switch (orient) - { - case TransposeMode::TRANSPOSE: - return CUBLAS_OP_T; - case TransposeMode::CONJ_TRANSPOSE: - return CUBLAS_OP_C; - default: // TransposeMode::NORMAL - return CUBLAS_OP_N; - } -} - -/** @brief Convert a SideMode to the cuBLAS side mode type. */ -inline cublasSideMode_t -ToNativeSideMode(SideMode const& side) noexcept -{ - if (side == SideMode::LEFT) - return CUBLAS_SIDE_LEFT; - - return CUBLAS_SIDE_RIGHT; -} - -/** @brief Get the cuBLAS library handle. */ -cublasHandle_t GetLibraryHandle() noexcept; - -/** @class SyncManager - * @brief Manage stream synchronization within cuBLAS. 
- */ -class SyncManager -{ -public: - SyncManager(cublasHandle_t handle, SyncInfo const& si); - ~SyncManager(); -private: - cudaStream_t orig_stream_; -};// class SyncManager - -///@} -/** @name BLAS-1 Routines */ -///@{ - -#define ADD_AXPY_DECL(ScalarType) \ - void Axpy(cublasHandle_t handle, \ - int n, ScalarType const& alpha, \ - ScalarType const* X, int incx, \ - ScalarType* Y, int incy) - -#define ADD_COPY_DECL(ScalarType) \ - void Copy(cublasHandle_t handle, \ - int n, ScalarType const* X, int incx, \ - ScalarType* Y, int incy) - -#define ADD_SCALE_DECL(ScalarType) \ - void Scale(cublasHandle_t handle, \ - int n, ScalarType const& alpha, \ - ScalarType* X, int incx) - -#ifdef HYDROGEN_GPU_USE_FP16 -ADD_AXPY_DECL(__half); -#endif // HYDROGEN_GPU_USE_FP16 -ADD_AXPY_DECL(float); -ADD_AXPY_DECL(double); -ADD_AXPY_DECL(cuComplex); -ADD_AXPY_DECL(cuDoubleComplex); - -ADD_COPY_DECL(float); -ADD_COPY_DECL(double); -ADD_COPY_DECL(cuComplex); -ADD_COPY_DECL(cuDoubleComplex); - -#ifdef HYDROGEN_GPU_USE_FP16 -ADD_SCALE_DECL(__half); -#endif // HYDROGEN_GPU_USE_FP16 -ADD_SCALE_DECL(float); -ADD_SCALE_DECL(double); -ADD_SCALE_DECL(cuComplex); -ADD_SCALE_DECL(cuDoubleComplex); - -///@} -/** @name BLAS-2 Routines */ -///@{ - -#define ADD_GEMV_DECL(ScalarType) \ - void Gemv( \ - cublasHandle_t handle, \ - cublasOperation_t transpA, int m, int n, \ - ScalarType const& alpha, \ - ScalarType const* A, int lda, \ - ScalarType const* x, int incx, \ - ScalarType const& beta, \ - ScalarType* y, int incy) - -ADD_GEMV_DECL(float); -ADD_GEMV_DECL(double); -ADD_GEMV_DECL(cuComplex); -ADD_GEMV_DECL(cuDoubleComplex); - -///@} -/** @name BLAS-3 Routines */ -///@{ - -#define ADD_GEMM_DECL(ScalarType) \ - void Gemm( \ - cublasHandle_t handle, \ - cublasOperation_t transpA, \ - cublasOperation_t transpB, \ - int m, int n, int k, \ - ScalarType const& alpha, \ - ScalarType const* A, int lda, \ - ScalarType const* B, int ldb, \ - ScalarType const& beta, \ - ScalarType* C, int ldc) - -#ifdef HYDROGEN_GPU_USE_FP16 -ADD_GEMM_DECL(__half); -#endif // HYDROGEN_GPU_USE_FP16 -ADD_GEMM_DECL(float); -ADD_GEMM_DECL(double); -ADD_GEMM_DECL(cuComplex); -ADD_GEMM_DECL(cuDoubleComplex); - -///@} -/** @name BLAS-like Extension Routines */ -///@{ - -// We use this for Axpy2D, Copy2D, and Transpose -#define ADD_GEAM_DECL(ScalarType) \ - void Geam(cublasHandle_t handle, \ - cublasOperation_t transpA, \ - cublasOperation_t transpB, \ - int m, int n, \ - ScalarType const& alpha, \ - ScalarType const* A, int lda, \ - ScalarType const& beta, \ - ScalarType const* B, int ldb, \ - ScalarType* C, int ldc) - -#define ADD_DGMM_DECL(ScalarType) \ - void Dgmm(cublasHandle_t handle, \ - cublasSideMode_t side, \ - int m, int n, \ - ScalarType const* A, int lda, \ - ScalarType const* X, int incx, \ - ScalarType* C, int ldc) - -ADD_GEAM_DECL(float); -ADD_GEAM_DECL(double); -ADD_GEAM_DECL(cuComplex); -ADD_GEAM_DECL(cuDoubleComplex); - -ADD_DGMM_DECL(float); -ADD_DGMM_DECL(double); -ADD_DGMM_DECL(cuComplex); -ADD_DGMM_DECL(cuDoubleComplex); - -///@} - -}// namespace cublas -}// namespace hydrogen -#endif // HYDROGEN_IMPORTS_CUBLAS_HPP_ +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/cuBLASError.hpp b/include/hydrogen/device/gpu/cuda/cuBLASError.hpp new file mode 100644 index 0000000000..84c83c4d9b --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/cuBLASError.hpp @@ -0,0 +1,48 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLASERROR_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUBLASERROR_HPP_ + +#include + +#include 
+#include + +#include + +// Helper error-checking macro. +#define H_CHECK_CUBLAS(cmd) \ + do \ + { \ + H_SYNC_CUDA(); \ + auto h_check_cublas_err_code__ = cmd; \ + H_ASSERT(h_check_cublas_err_code__ == CUBLAS_STATUS_SUCCESS, \ + cuBLASError, \ + (cudaDeviceReset(), \ + cublas::BuildcuBLASErrorMessage( \ + #cmd, \ + h_check_cublas_err_code__))); \ + H_SYNC_CUDA(); \ + } while (false) + +namespace hydrogen +{ + +/** @class cuBLASError + * @brief Exception representing errors detected by cuBLAS library. + */ +H_ADD_BASIC_EXCEPTION_CLASS(cuBLASError,GPUError); + +namespace cublas +{ + +/** @brief Write an error message describing the error detected in CUDA. + * @param[in] cmd The expression that raised the error. + * @param[in] error_code The error code reported by CUDA. + * @returns A string describing the error. + */ +std::string BuildcuBLASErrorMessage( + std::string const& cmd, cublasStatus_t error_code); + +}// namespace cublas + +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLASERROR_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/cuBLASManagement.hpp b/include/hydrogen/device/gpu/cuda/cuBLASManagement.hpp new file mode 100644 index 0000000000..c9ce8b0caa --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/cuBLASManagement.hpp @@ -0,0 +1,82 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLASMANAGEMENT_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUBLASMANAGEMENT_HPP_ + +#include + +#include "cuBLASError.hpp" + +#include +#include + +#include + +namespace hydrogen +{ + +namespace cublas +{ + +/** @name cuBLAS management functions. */ +///@{ + +/** @brief Initialize cuBLAS. + * + * Creates the default library instance for cuBLAS. + * + * @note This must be called after `MPI_Init` is called with + * MVAPICH2-GDR. cuBLAS initialization allocates some device memory + * chunks, which MVAPICH-GDR attempts to intercept but fails if + * MPI_Init is not called yet. So, the correct ordering of + * initialization seems to be first CUDA, then MPI, and then any + * libraries that depend on CUDA or MPI. + * + * \param[in] handle The handle to use for cuBLAS. If null, a new + * handle will be created. If not null, it is + * assumed that the handle has been created with a + * user-side call to cublasCreate(). + */ +void Initialize(cublasHandle_t handle=nullptr); + +/** @brief Finalize the cuBLAS library. + * + * Destroys the default library handle. + * + * \throws cuBLASError If the cuBLAS library detects any errors. + */ +void Finalize(); + +/** @brief Replace the default cuBLAS library handle. + * + * This will destroy the current default cuBLAS library handle and + * assume control of the input handle. The cuBLAS library must be + * initialized in order to call this function. + * + * \param[in] handle The new library handle. Hydrogen will take + * ownership of the new handle and destroy it in + * Finalize(). + * + * \throws std::logic_error If the input handle is null or the + * library isn't initialized. + */ +void ReplaceLibraryHandle(cublasHandle_t handle); + +/** @brief Get the cuBLAS library handle. */ +cublasHandle_t GetLibraryHandle() noexcept; + +/** @class SyncManager + * @brief Manage stream synchronization within cuBLAS. 
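+ *
+ * A hedged usage sketch (RAII behavior inferred from the
+ * constructor/destructor pair declared below; "si" is a
+ * SyncInfo<Device::GPU>):
+ * @code
+ * {
+ *     cublas::SyncManager mgr(cublas::GetLibraryHandle(), si);
+ *     // cuBLAS calls made here are ordered on si.Stream()
+ * } // destructor restores the handle's original stream
+ * @endcode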
+ */ +class SyncManager +{ +public: + SyncManager(cublasHandle_t handle, SyncInfo const& si); + ~SyncManager(); +private: + cudaStream_t orig_stream_; +};// class SyncManager + +///@} + +}// namespace cublas +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLASMANAGEMENT_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/cuBLASMeta.hpp b/include/hydrogen/device/gpu/cuda/cuBLASMeta.hpp new file mode 100644 index 0000000000..2c30235c14 --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/cuBLASMeta.hpp @@ -0,0 +1,133 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLASMETA_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUBLASMETA_HPP_ + +#include + +#include +#include +#include + +#include + +namespace hydrogen +{ +namespace cublas +{ + +/** @class NativeTypeT + * @brief Metafunction mapping type names to CUDA/cuBLAS equivalents. + * + * The mapping should provide bitwise equivalence. + * + * @note This belongs at this level because rocBLAS defines types (or + * names of types) that are local to the BLAS + * implementation. Additionally, it's feasible to conceive of + * custom types on the GPU that would, likewise, need to be + * mapped to the types that cuBLAS knows about. + * + * @todo Add static assertions to ensure only valid types get mapped. + */ +template +struct NativeTypeT; + +// Built-in types are their own native types +template <> struct NativeTypeT { using type = float; }; +template <> struct NativeTypeT { using type = double; }; +template <> +struct NativeTypeT { using type = cuComplex; }; +template <> +struct NativeTypeT { using type = cuDoubleComplex; }; + +// Complex and Double-Complex types require conversion +template <> +struct NativeTypeT> { using type = cuComplex; }; +template <> +struct NativeTypeT> { using type = cuDoubleComplex; }; + +// Half precision requires conversion as well +#ifdef HYDROGEN_GPU_USE_FP16 +template <> struct NativeTypeT<__half> { using type = __half; }; +#ifdef HYDROGEN_HAVE_HALF +template <> struct NativeTypeT { using type = __half; }; +#endif // HYDROGEN_HAVE_HALF +#endif // HYDROGEN_GPU_USE_FP16 + +/** @brief Convenience wrapper for NativeTypeT */ +template +using NativeType = typename NativeTypeT::type; + +#ifndef DOXYGEN_SHOULD_SKIP_THIS +namespace meta_details +{ +template +auto Try_HasNativeType(int) -> SubstitutionSuccess>; +template +auto Try_HasNativeType(...) -> std::false_type; +}// namespace meta_details +#endif // DOXYGEN_SHOULD_SKIP_THIS + +/** @struct HasNativeType + * @brief Predicate that determines if a type is mappable to a + * library-native type. + */ +template +struct HasNativeType : decltype(meta_details::Try_HasNativeType(0)) {}; + +/** @class IsSupportedType_Base + * @brief Predicate indicating that a type is supported within cuBLAS + * for the given operation. + * + * This is used to map internal cuBLAS types to the operations that + * are supported. For example, `float` is always supported but + * `__half` only has support in a few functions. + */ +template +struct IsSupportedType_Base : std::false_type {}; + +template +struct IsSupportedType_Base : std::true_type {}; +template +struct IsSupportedType_Base : std::true_type {}; +template +struct IsSupportedType_Base : std::true_type {}; +template +struct IsSupportedType_Base : std::true_type {}; + +// No need to further test CUDA because this file isn't included if +// either we don't have GPUs at all or we don't have CUDA support. 
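+//
+// As an illustration, a caller can guard a dispatch at compile time
+// with the IsSupportedType predicate defined at the end of this file
+// (hypothetical usage, not part of this header):
+//
+//   static_assert(IsSupportedType<T, BLAS_Op::GEMM>::value,
+//                 "T is not usable with cuBLAS Gemm.");
+//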
+#ifdef HYDROGEN_GPU_USE_FP16
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::AXPY> : std::true_type {};
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::DOT> : std::true_type {};
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::GEMM> : std::true_type {};
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::GEMMSTRIDEDBATCHED>
+    : std::true_type
+{};
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::NRM2> : std::true_type {};
+template <>
+struct IsSupportedType_Base<__half, BLAS_Op::SCAL> : std::true_type {};
+#endif // HYDROGEN_GPU_USE_FP16
+
+/** @class IsSupportedType
+ * @brief Predicate indicating that the given type is compatible with
+ * cuBLAS.
+ *
+ * This is true when either the type is a compatible cuBLAS type
+ * (e.g., float) or when it is bitwise equivalent to one (e.g.,
+ * std::complex).
+ */
+template ::value>
+struct IsSupportedType
+    : IsSupportedType_Base, op>
+{};
+
+template
+struct IsSupportedType : std::false_type {};
+
+}// namespace cublas
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLASMETA_HPP_
diff --git a/include/hydrogen/device/gpu/cuda/cuBLASUtil.hpp b/include/hydrogen/device/gpu/cuda/cuBLASUtil.hpp
new file mode 100644
index 0000000000..b7fe1db486
--- /dev/null
+++ b/include/hydrogen/device/gpu/cuda/cuBLASUtil.hpp
@@ -0,0 +1,71 @@
+#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLASUTIL_HPP_
+#define HYDROGEN_DEVICE_GPU_CUDA_CUBLASUTIL_HPP_
+
+#include
+
+#include
+
+namespace hydrogen
+{
+namespace cublas
+{
+
+/** @brief cuBLAS uses ints to represent sizes. */
+using SizeT = int;
+
+/** @brief Convert a value to the size type expected by the cuBLAS
+ * library.
+ *
+ * If `HYDROGEN_DO_BOUNDS_CHECKING` is defined, this will do a
+ * "safe cast" (it will verify that `val` is in the dynamic range of
+ * `int`). Otherwise it will do a regular static_cast.
+ */
+template
+#ifdef HYDROGEN_DO_BOUNDS_CHECKING
+SizeT ToSizeT(T const& val)
+{
+    return narrow_cast(val);
+}
+#else
+SizeT ToSizeT(T const& val) noexcept
+{
+    return static_cast(val);
+}
+#endif // HYDROGEN_DO_BOUNDS_CHECKING
+
+/** @brief Overload to prevent extra work in the case of dynamic range
+ * checking.
+ */
+inline SizeT ToSizeT(SizeT const& val) noexcept
+{
+    return val;
+}
+
+/** @brief Convert a TransposeMode to the cuBLAS operation type. */
+inline cublasOperation_t
+ToNativeTransposeMode(TransposeMode const& orient) noexcept
+{
+    switch (orient)
+    {
+    case TransposeMode::TRANSPOSE:
+        return CUBLAS_OP_T;
+    case TransposeMode::CONJ_TRANSPOSE:
+        return CUBLAS_OP_C;
+    default: // TransposeMode::NORMAL
+        return CUBLAS_OP_N;
+    }
+}
+
+/** @brief Convert a SideMode to the cuBLAS side mode type.
*/ +inline cublasSideMode_t +ToNativeSideMode(SideMode const& side) noexcept +{ + if (side == SideMode::LEFT) + return CUBLAS_SIDE_LEFT; + + return CUBLAS_SIDE_RIGHT; +} + +}// namespace cublas +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLASUTIL_HPP_ diff --git a/include/hydrogen/device/gpu/cuda/cuBLAS_API.hpp b/include/hydrogen/device/gpu/cuda/cuBLAS_API.hpp new file mode 100644 index 0000000000..a9c0e886eb --- /dev/null +++ b/include/hydrogen/device/gpu/cuda/cuBLAS_API.hpp @@ -0,0 +1,186 @@ +#ifndef HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_API_HPP_ +#define HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_API_HPP_ + +#include + +#include + +namespace hydrogen +{ +namespace cublas +{ + +/** @name BLAS-1 Routines */ +///@{ + +#define ADD_AXPY_DECL(ScalarType) \ + void Axpy(cublasHandle_t handle, \ + int n, ScalarType const& alpha, \ + ScalarType const* X, int incx, \ + ScalarType* Y, int incy) + +#define ADD_COPY_DECL(ScalarType) \ + void Copy(cublasHandle_t handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType* Y, int incy) + +#define ADD_DOT_DECL(ScalarType) \ + void Dot(cublasHandle_t handle, \ + int n, \ + ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + ScalarType& output) + +#define ADD_NRM2_DECL(ScalarType) \ + void Nrm2(cublasHandle_t handle, \ + int n, \ + ScalarType const* X, int incx, \ + ScalarType& output) + +#define ADD_SCALE_DECL(ScalarType) \ + void Scale(cublasHandle_t handle, \ + int n, ScalarType const& alpha, \ + ScalarType* X, int incx) + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_AXPY_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_AXPY_DECL(float); +ADD_AXPY_DECL(double); +ADD_AXPY_DECL(cuComplex); +ADD_AXPY_DECL(cuDoubleComplex); + +ADD_COPY_DECL(float); +ADD_COPY_DECL(double); +ADD_COPY_DECL(cuComplex); +ADD_COPY_DECL(cuDoubleComplex); + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_DOT_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_DOT_DECL(float); +ADD_DOT_DECL(double); +ADD_DOT_DECL(cuComplex); +ADD_DOT_DECL(cuDoubleComplex); + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_NRM2_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_NRM2_DECL(float); +ADD_NRM2_DECL(double); +ADD_NRM2_DECL(cuComplex); +ADD_NRM2_DECL(cuDoubleComplex); + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_SCALE_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_SCALE_DECL(float); +ADD_SCALE_DECL(double); +ADD_SCALE_DECL(cuComplex); +ADD_SCALE_DECL(cuDoubleComplex); + +///@} +/** @name BLAS-2 Routines */ +///@{ + +#define ADD_GEMV_DECL(ScalarType) \ + void Gemv( \ + cublasHandle_t handle, \ + cublasOperation_t transpA, int m, int n, \ + ScalarType const& alpha, \ + ScalarType const* A, int lda, \ + ScalarType const* x, int incx, \ + ScalarType const& beta, \ + ScalarType* y, int incy) + +ADD_GEMV_DECL(float); +ADD_GEMV_DECL(double); +ADD_GEMV_DECL(cuComplex); +ADD_GEMV_DECL(cuDoubleComplex); + +///@} +/** @name BLAS-3 Routines */ +///@{ + +#define ADD_GEMM_DECL(ScalarType) \ + void Gemm( \ + cublasHandle_t handle, \ + cublasOperation_t transpA, \ + cublasOperation_t transpB, \ + int m, int n, int k, \ + ScalarType const& alpha, \ + ScalarType const* A, int lda, \ + ScalarType const* B, int ldb, \ + ScalarType const& beta, \ + ScalarType* C, int ldc) + +#define ADD_GEMM_STRIDED_BATCHED_DECL(ScalarType) \ + void GemmStridedBatched( \ + cublasHandle_t handle, \ + cublasOperation_t transpA, \ + cublasOperation_t transpB, \ + int m, int n, int k, \ + ScalarType const* alpha, \ + ScalarType const* A, int lda, \ + long long int strideA, \ + ScalarType const* B, int ldb, \ + long long int strideB, \ + 
ScalarType const* beta, \ + ScalarType* C, int ldc, \ + long long int strideC, \ + int batchCount) + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_GEMM_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_GEMM_DECL(float); +ADD_GEMM_DECL(double); +ADD_GEMM_DECL(cuComplex); +ADD_GEMM_DECL(cuDoubleComplex); + +#ifdef HYDROGEN_GPU_USE_FP16 +ADD_GEMM_STRIDED_BATCHED_DECL(__half); +#endif // HYDROGEN_GPU_USE_FP16 +ADD_GEMM_STRIDED_BATCHED_DECL(float); +ADD_GEMM_STRIDED_BATCHED_DECL(double); +ADD_GEMM_STRIDED_BATCHED_DECL(cuComplex); +ADD_GEMM_STRIDED_BATCHED_DECL(cuDoubleComplex); + +///@} +/** @name BLAS-like Extension Routines */ +///@{ + +// We use this for Axpy2D, Copy2D, and Transpose +#define ADD_GEAM_DECL(ScalarType) \ + void Geam(cublasHandle_t handle, \ + cublasOperation_t transpA, \ + cublasOperation_t transpB, \ + int m, int n, \ + ScalarType const& alpha, \ + ScalarType const* A, int lda, \ + ScalarType const& beta, \ + ScalarType const* B, int ldb, \ + ScalarType* C, int ldc) + +#define ADD_DGMM_DECL(ScalarType) \ + void Dgmm(cublasHandle_t handle, \ + cublasSideMode_t side, \ + int m, int n, \ + ScalarType const* A, int lda, \ + ScalarType const* X, int incx, \ + ScalarType* C, int ldc) + +ADD_GEAM_DECL(float); +ADD_GEAM_DECL(double); +ADD_GEAM_DECL(cuComplex); +ADD_GEAM_DECL(cuDoubleComplex); + +ADD_DGMM_DECL(float); +ADD_DGMM_DECL(double); +ADD_DGMM_DECL(cuComplex); +ADD_DGMM_DECL(cuDoubleComplex); + +///@} +}// namespace cublas +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_CUDA_CUBLAS_API_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/ROCmCopy.hpp b/include/hydrogen/device/gpu/rocm/ROCmCopy.hpp new file mode 100644 index 0000000000..61cd72131b --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/ROCmCopy.hpp @@ -0,0 +1,132 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCMCOPY_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCM_ROCMCOPY_HPP_ + +#include "ROCmError.hpp" + +#include +#include + +#include +#include + +#include + +namespace hydrogen +{ +namespace gpu +{ + +/** @todo Flesh out documentation + * @todo these are actually only valid for "packed" types + */ + +// These functions are synchronous with respect to their SyncInfo +// objects (that is, they require explicit synchronization to the +// host). 
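+//
+// Concretely (hedged sketch): a device-to-host copy is only complete
+// from the host's perspective after the source SyncInfo has been
+// synchronized:
+//
+//   gpu::Copy1DToHost(dev_buf, host_buf, n, src_si);
+//   Synchronize(src_si); // host_buf is now safe to read
+//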
+
+template
+void Fill1DBuffer(T* buffer, size_t num_elements, T value,
+                  SyncInfo const& si)
+{
+    if (num_elements == 0UL)
+        return;
+
+    Fill_GPU_1D_impl(buffer, num_elements, value, si);
+}
+
+template
+void Copy1DIntraDevice(T const* H_RESTRICT src, T* H_RESTRICT dest,
+                       size_t num_elements,
+                       SyncInfo const& si)
+{
+    if (num_elements == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpyAsync(
+            dest, src, num_elements*sizeof(T),
+            hipMemcpyDeviceToDevice, si.Stream()));
+}
+
+template
+void Copy1DToHost(T const* H_RESTRICT src, T* H_RESTRICT dest,
+                  size_t num_elements,
+                  SyncInfo const& src_si)
+{
+    if (num_elements == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpyAsync(
+            dest, src, num_elements*sizeof(T),
+            hipMemcpyDeviceToHost, src_si.Stream()));
+}
+
+template
+void Copy1DToDevice(T const* H_RESTRICT src, T* H_RESTRICT dest,
+                    size_t num_elements,
+                    SyncInfo const& dest_si)
+{
+    if (num_elements == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpyAsync(
+            dest, src, num_elements*sizeof(T),
+            hipMemcpyHostToDevice, dest_si.Stream()));
+}
+
+template
+void Copy2DIntraDevice(T const* src, size_t src_ldim,
+                       T* dest, size_t dest_ldim,
+                       size_t height, size_t width,
+                       SyncInfo const& si)
+{
+    if (height == 0UL || width == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpy2DAsync(
+            dest, dest_ldim*sizeof(T),
+            src, src_ldim*sizeof(T),
+            height*sizeof(T), width,
+            hipMemcpyDeviceToDevice, si.Stream()));
+}
+
+template
+void Copy2DToHost(T const* src, size_t src_ldim,
+                  T* dest, size_t dest_ldim,
+                  size_t height, size_t width,
+                  SyncInfo const& src_si)
+{
+    if (height == 0UL || width == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpy2DAsync(
+            dest, dest_ldim*sizeof(T),
+            src, src_ldim*sizeof(T),
+            height*sizeof(T), width,
+            hipMemcpyDeviceToHost, src_si.Stream()));
+}
+
+template
+void Copy2DToDevice(T const* src, size_t src_ldim,
+                    T* dest, size_t dest_ldim,
+                    size_t height, size_t width,
+                    SyncInfo const& dest_si)
+{
+    if (height == 0UL || width == 0UL)
+        return;
+
+    H_CHECK_HIP(
+        hipMemcpy2DAsync(
+            dest, dest_ldim*sizeof(T),
+            src, src_ldim*sizeof(T),
+            height*sizeof(T), width,
+            hipMemcpyHostToDevice, dest_si.Stream()));
+}
+
+}// namespace gpu
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCMCOPY_HPP_
diff --git a/include/hydrogen/device/gpu/rocm/ROCmError.hpp b/include/hydrogen/device/gpu/rocm/ROCmError.hpp
new file mode 100644
index 0000000000..58c89849df
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/ROCmError.hpp
@@ -0,0 +1,51 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCMERROR_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCMERROR_HPP_
+
+#include
+
+#include
+
+#include
+
+#ifdef HYDROGEN_GPU_CALLS_ARE_SYNCHRONOUS
+#define H_SYNC_HIP() hipDeviceSynchronize()
+#else
+#define H_SYNC_HIP()
+#endif
+
+// Error handling macro
+#define H_CHECK_HIP(cmd)                                        \
+    do                                                          \
+    {                                                           \
+        H_SYNC_HIP();                                           \
+        auto h_check_hip_error_code__ = cmd;                    \
+        H_ASSERT(h_check_hip_error_code__ == hipSuccess,        \
+                 ::hydrogen::HIPError,                          \
+                 (hipDeviceReset(),                             \
+                  ::hydrogen::rocm::BuildHipErrorMessage(       \
+                      #cmd, h_check_hip_error_code__)));        \
+        H_SYNC_HIP();                                           \
+    } while (false)
+
+namespace hydrogen
+{
+
+/** @class HIPError
+ * @brief Exception class describing an error in the HIP environment.
+ */
+H_ADD_BASIC_EXCEPTION_CLASS(HIPError, GPUError);
+
+namespace rocm
+{
+
+/** @brief Write an error message describing the error detected in HIP.
+ * @param[in] cmd The expression that raised the error.
+ * @param[in] error_code The error code reported by HIP.
+ * @returns A string describing the error.
+ */ +std::string BuildHipErrorMessage( + std::string const& cmd, hipError_t error_code); + +}// namespace rocm +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_ROCMERROR_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/ROCmLaunchKernel.hpp b/include/hydrogen/device/gpu/rocm/ROCmLaunchKernel.hpp new file mode 100644 index 0000000000..19c2cfcd70 --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/ROCmLaunchKernel.hpp @@ -0,0 +1,30 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCMLAUNCHKERNEL_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCMLAUNCHKERNEL_HPP_ + +#include +#include + +namespace hydrogen +{ +namespace gpu +{ + +template +void LaunchKernel( + F kernel, dim3 const& gridDim, dim3 const& blkDim, + size_t sharedMem, SyncInfo const& si, + Args&&... kernel_args) +{ + H_CHECK_HIP(hipGetLastError()); + // Note that this is (currently) implemented as a macro; not clear + // if std::forward-ing the arguments is appropriate... + hipLaunchKernelGGL( + kernel, gridDim, blkDim, + sharedMem, si.Stream(), + std::forward(kernel_args)...); + H_CHECK_HIP(hipGetLastError()); +} + +}// namespace gpu +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_ROCMLAUNCHKERNEL_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/ROCmManagement.hpp b/include/hydrogen/device/gpu/rocm/ROCmManagement.hpp new file mode 100644 index 0000000000..58f7ab3d10 --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/ROCmManagement.hpp @@ -0,0 +1,22 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCMMANAGEMENT_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCMMANAGEMENT_HPP_ + +#include + +namespace hydrogen +{ + +using gpuEvent_t = hipEvent_t; +using gpuStream_t = hipStream_t; + +namespace rocm +{ +hipEvent_t GetDefaultEvent() noexcept; +hipStream_t GetDefaultStream() noexcept; +hipEvent_t GetNewEvent(); +hipStream_t GetNewStream(); +void FreeEvent(hipEvent_t& event); +void FreeStream(hipStream_t& stream); +}// namespace rocm +}// namespace hydrogen +#endif // HYDROGEN_DEVICE_GPU_ROCMMANAGEMENT_HPP_ diff --git a/include/hydrogen/device/gpu/rocm/SyncInfo.hpp b/include/hydrogen/device/gpu/rocm/SyncInfo.hpp new file mode 100644 index 0000000000..ee7d372b4c --- /dev/null +++ b/include/hydrogen/device/gpu/rocm/SyncInfo.hpp @@ -0,0 +1,83 @@ +#ifndef HYDROGEN_DEVICE_GPU_ROCM_SYNCINFO_HPP_ +#define HYDROGEN_DEVICE_GPU_ROCM_SYNCINFO_HPP_ + +#include + +#include +#include + +#include "ROCmError.hpp" +#include "ROCmManagement.hpp" + +namespace hydrogen +{ + +template <> +class SyncInfo +{ +public: + SyncInfo() + : SyncInfo{rocm::GetDefaultStream(), rocm::GetDefaultEvent()} + {} + + SyncInfo(hipStream_t stream, hipEvent_t event) + : stream_{stream}, event_{event} + {} + + void Merge(SyncInfo const& si) noexcept + { + if (si.stream_) + stream_ = si.stream_; + if (si.event_) + event_ = si.event_; + } + + hipStream_t Stream() const noexcept { return stream_; } + hipEvent_t Event() const noexcept { return event_; } +private: + friend void DestroySyncInfo(SyncInfo&); + hipStream_t stream_; + hipEvent_t event_; +};// struct SyncInfo + +inline void AddSynchronizationPoint(SyncInfo const& syncInfo) +{ + H_CHECK_HIP(hipEventRecord(syncInfo.Event(), syncInfo.Stream())); +} + +namespace details +{ +inline void AddSyncPoint( + SyncInfo const& master, + SyncInfo const& dependent) +{ +} + +inline void AddSyncPoint( + SyncInfo const& master, + SyncInfo const& dependent) +{ + // The CPU must wait for the GPU to catch up. + Synchronize(master); // wait for "master" +} + +// This captures the work done on A and forces "others" to wait for +// completion. 
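+//
+// For example (hedged sketch): to order two GPU streams without
+// blocking the host, record on "master" and have "other" wait:
+//
+//   AddSynchronizationPoint(master);      // hipEventRecord
+//   details::AddSyncPoint(master, other); // hipStreamWaitEvent
+//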
diff --git a/include/hydrogen/device/gpu/rocm/ROCmLaunchKernel.hpp b/include/hydrogen/device/gpu/rocm/ROCmLaunchKernel.hpp
new file mode 100644
index 0000000000..19c2cfcd70
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/ROCmLaunchKernel.hpp
@@ -0,0 +1,30 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCMLAUNCHKERNEL_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCMLAUNCHKERNEL_HPP_
+
+#include
+#include
+
+namespace hydrogen
+{
+namespace gpu
+{
+
+template <typename F, typename... Args>
+void LaunchKernel(
+    F kernel, dim3 const& gridDim, dim3 const& blkDim,
+    size_t sharedMem, SyncInfo<Device::GPU> const& si,
+    Args&&... kernel_args)
+{
+    H_CHECK_HIP(hipGetLastError());
+    // Note that this is (currently) implemented as a macro; not clear
+    // if std::forward-ing the arguments is appropriate...
+    hipLaunchKernelGGL(
+        kernel, gridDim, blkDim,
+        sharedMem, si.Stream(),
+        std::forward<Args>(kernel_args)...);
+    H_CHECK_HIP(hipGetLastError());
+}
+
+}// namespace gpu
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCMLAUNCHKERNEL_HPP_
diff --git a/include/hydrogen/device/gpu/rocm/ROCmManagement.hpp b/include/hydrogen/device/gpu/rocm/ROCmManagement.hpp
new file mode 100644
index 0000000000..58f7ab3d10
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/ROCmManagement.hpp
@@ -0,0 +1,22 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCMMANAGEMENT_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCMMANAGEMENT_HPP_
+
+#include
+
+namespace hydrogen
+{
+
+using gpuEvent_t = hipEvent_t;
+using gpuStream_t = hipStream_t;
+
+namespace rocm
+{
+hipEvent_t GetDefaultEvent() noexcept;
+hipStream_t GetDefaultStream() noexcept;
+hipEvent_t GetNewEvent();
+hipStream_t GetNewStream();
+void FreeEvent(hipEvent_t& event);
+void FreeStream(hipStream_t& stream);
+}// namespace rocm
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCMMANAGEMENT_HPP_
diff --git a/include/hydrogen/device/gpu/rocm/SyncInfo.hpp b/include/hydrogen/device/gpu/rocm/SyncInfo.hpp
new file mode 100644
index 0000000000..ee7d372b4c
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/SyncInfo.hpp
@@ -0,0 +1,83 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCM_SYNCINFO_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCM_SYNCINFO_HPP_
+
+#include
+
+#include
+#include
+
+#include "ROCmError.hpp"
+#include "ROCmManagement.hpp"
+
+namespace hydrogen
+{
+
+template <>
+class SyncInfo<Device::GPU>
+{
+public:
+    SyncInfo()
+        : SyncInfo{rocm::GetDefaultStream(), rocm::GetDefaultEvent()}
+    {}
+
+    SyncInfo(hipStream_t stream, hipEvent_t event)
+        : stream_{stream}, event_{event}
+    {}
+
+    void Merge(SyncInfo<Device::GPU> const& si) noexcept
+    {
+        if (si.stream_)
+            stream_ = si.stream_;
+        if (si.event_)
+            event_ = si.event_;
+    }
+
+    hipStream_t Stream() const noexcept { return stream_; }
+    hipEvent_t Event() const noexcept { return event_; }
+private:
+    friend void DestroySyncInfo(SyncInfo<Device::GPU>&);
+    hipStream_t stream_;
+    hipEvent_t event_;
+};// struct SyncInfo<Device::GPU>
+
+inline void AddSynchronizationPoint(SyncInfo<Device::GPU> const& syncInfo)
+{
+    H_CHECK_HIP(hipEventRecord(syncInfo.Event(), syncInfo.Stream()));
+}
+
+namespace details
+{
+inline void AddSyncPoint(
+    SyncInfo<Device::CPU> const& master,
+    SyncInfo<Device::GPU> const& dependent)
+{
+}
+
+inline void AddSyncPoint(
+    SyncInfo<Device::GPU> const& master,
+    SyncInfo<Device::CPU> const& dependent)
+{
+    // The CPU must wait for the GPU to catch up.
+    Synchronize(master); // wait for "master"
+}
+
+// This captures the work done on A and forces "others" to wait for
+// completion.
+template <typename... Ts>
+inline
+void AddSyncPoint(
+    SyncInfo<Device::GPU> const& master, SyncInfo<Device::GPU> const& other)
+{
+    if (master.Stream() != other.Stream())
+        H_CHECK_HIP(
+            hipStreamWaitEvent(other.Stream(), master.Event(), 0));
+}
+}// namespace details
+
+inline void Synchronize(SyncInfo<Device::GPU> const& syncInfo)
+{
+    H_CHECK_HIP(hipStreamSynchronize(syncInfo.Stream()));
+}
+
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCM_SYNCINFO_HPP_
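For reference, the producer/consumer ordering these functions implement reduces to the usual record-then-wait event idiom. A sketch, using only the functions defined in this header (both SyncInfos are assumed to wrap distinct, valid streams):

    // Make work later enqueued on "consumer" wait for everything
    // currently enqueued on "producer", without blocking the host.
    void OrderStreams(
        hydrogen::SyncInfo<hydrogen::Device::GPU> const& producer,
        hydrogen::SyncInfo<hydrogen::Device::GPU> const& consumer)
    {
        // Record producer's event at its current stream position...
        hydrogen::AddSynchronizationPoint(producer);
        // ...and have consumer's stream wait on that event (a no-op when
        // the streams are identical, mirroring details::AddSyncPoint).
        if (producer.Stream() != consumer.Stream())
            H_CHECK_HIP(
                hipStreamWaitEvent(consumer.Stream(), producer.Event(), 0));
    }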
diff --git a/include/hydrogen/device/gpu/rocm/rocBLAS.hpp b/include/hydrogen/device/gpu/rocm/rocBLAS.hpp
new file mode 100644
index 0000000000..15a10dc547
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/rocBLAS.hpp
@@ -0,0 +1,12 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_HPP_
+
+#include "rocBLASError.hpp"
+#include "rocBLASManagement.hpp"
+#include "rocBLASMeta.hpp"
+#include "rocBLASUtil.hpp"
+
+// The API wrapper declarations
+#include "rocBLAS_API.hpp"
+
+#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_HPP_
diff --git a/include/hydrogen/device/gpu/rocm/rocBLASError.hpp b/include/hydrogen/device/gpu/rocm/rocBLASError.hpp
new file mode 100644
index 0000000000..d5958c716c
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/rocBLASError.hpp
@@ -0,0 +1,48 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLASERROR_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLASERROR_HPP_
+
+#include
+
+#include
+#include
+
+#include
+
+// Helper error-checking macro.
+#define H_CHECK_ROCBLAS(cmd)                                            \
+    do                                                                  \
+    {                                                                   \
+        H_SYNC_HIP();                                                   \
+        auto h_check_rocblas_err_code__ = cmd;                          \
+        H_ASSERT(h_check_rocblas_err_code__ == rocblas_status_success,  \
+                 rocBLASError,                                          \
+                 (hipDeviceReset(),                                     \
+                  rocblas::BuildrocBLASErrorMessage(                    \
+                      #cmd,                                             \
+                      h_check_rocblas_err_code__)));                    \
+        H_SYNC_HIP();                                                   \
+    } while (false)
+
+namespace hydrogen
+{
+
+/** @class rocBLASError
+ *  @brief Exception representing errors detected by the rocBLAS library.
+ */
+H_ADD_BASIC_EXCEPTION_CLASS(rocBLASError, GPUError);
+
+namespace rocblas
+{
+
+/** @brief Write an error message describing the error detected in rocBLAS.
+ *  @param[in] cmd The expression that raised the error.
+ *  @param[in] error_code The error code reported by rocBLAS.
+ *  @returns A string describing the error.
+ */
+std::string BuildrocBLASErrorMessage(
+    std::string const& cmd, rocblas_status error_code);
+
+}// namespace rocblas
+
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCBLASERROR_HPP_
diff --git a/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp b/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp
new file mode 100644
index 0000000000..137c58dbca
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/rocBLASManagement.hpp
@@ -0,0 +1,74 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMANAGEMENT_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMANAGEMENT_HPP_
+
+#include "rocBLASError.hpp"
+
+#include
+#include
+
+#include
+#include
+
+namespace hydrogen
+{
+
+namespace rocblas
+{
+
+/** @name rocBLAS management functions. */
+///@{
+
+/** @brief Initialize rocBLAS.
+ *
+ *  Creates the default library instance for rocBLAS.
+ *
+ *  \param[in] handle The handle to use for rocBLAS. If null, a new
+ *                    handle will be created. If not null, it is
+ *                    assumed that the handle has been created with a
+ *                    user-side call to rocblas_create_handle().
+ */
+void Initialize(rocblas_handle handle=nullptr);
+
+/** @brief Finalize the rocBLAS library.
+ *
+ *  Destroys the default library handle.
+ *
+ *  \throws rocBLASError If the rocBLAS library detects any errors.
+ */
+void Finalize();
+
+/** @brief Replace the default rocBLAS library handle.
+ *
+ *  This will destroy the current default rocBLAS library handle and
+ *  assume control of the input handle. The rocBLAS library must be
+ *  initialized in order to call this function.
+ *
+ *  \param[in] handle The new library handle. Hydrogen will take
+ *                    ownership of the new handle and destroy it in
+ *                    Finalize().
+ *
+ *  \throws std::logic_error If the input handle is null or the
+ *                           library isn't initialized.
+ */
+void ReplaceLibraryHandle(rocblas_handle handle);
+
+/** @brief Get the rocBLAS library handle. */
+rocblas_handle GetLibraryHandle() noexcept;
+
+/** @class SyncManager
+ *  @brief Manage stream synchronization within rocBLAS.
+ */
+class SyncManager
+{
+public:
+    SyncManager(rocblas_handle handle, SyncInfo<Device::GPU> const& si);
+    ~SyncManager();
+private:
+    hipStream_t orig_stream_;
+};// class SyncManager
+
+///@}
+
+}// namespace rocblas
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMANAGEMENT_HPP_
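To make the intended call pattern concrete, here is a hedged sketch of a stream-aware rocBLAS call. Scale is the wrapper declared in rocBLAS_API.hpp below; the function and argument names are illustrative, not part of the patch:

    // Scale x[0..n) by alpha on the caller's stream. SyncManager points
    // the shared rocblas_handle at si.Stream() for the duration of the
    // call and restores the previous stream when it goes out of scope.
    void ScaleOnStream(float alpha, float* x, int n,
                       hydrogen::SyncInfo<hydrogen::Device::GPU> const& si)
    {
        auto handle = hydrogen::rocblas::GetLibraryHandle();
        hydrogen::rocblas::SyncManager mgr(handle, si);
        hydrogen::rocblas::Scale(handle, n, alpha, x, 1);
    }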
diff --git a/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp b/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp
new file mode 100644
index 0000000000..d130850d83
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/rocBLASMeta.hpp
@@ -0,0 +1,121 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMETA_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMETA_HPP_
+
+#include
+
+#include
+#include
+#include
+
+#include
+
+namespace hydrogen
+{
+namespace rocblas
+{
+
+/** @class NativeTypeT
+ *  @brief Metafunction mapping type names to HIP/rocBLAS equivalents.
+ *
+ *  The mapping should provide bitwise equivalence.
+ *
+ *  @note This belongs at this level because rocBLAS defines types (or
+ *        names of types) that are local to the BLAS
+ *        implementation. Additionally, it's feasible to conceive of
+ *        custom types on the GPU that would, likewise, need to be
+ *        mapped to the types that rocBLAS knows about.
+ *
+ *  @todo Add static assertions to ensure only valid types get mapped.
+ */
+template <typename T>
+struct NativeTypeT
+{
+    using type = T;
+};
+
+// Complex and Double-Complex types require conversion
+template <>
+struct NativeTypeT<std::complex<float>> { using type = rocblas_float_complex; };
+template <>
+struct NativeTypeT<std::complex<double>> { using type = rocblas_double_complex; };
+
+// Half precision requires conversion as well
+#ifdef HYDROGEN_GPU_USE_FP16
+template <> struct NativeTypeT<gpu_half_type> { using type = rocblas_half; };
+#ifdef HYDROGEN_HAVE_HALF
+template <> struct NativeTypeT<cpu_half_type> { using type = rocblas_half; };
+template <>
+struct NativeTypeT<std::complex<cpu_half_type>>
+{
+    using type = rocblas_half_complex;
+};
+#endif // HYDROGEN_HAVE_HALF
+#endif // HYDROGEN_GPU_USE_FP16
+
+/** @brief Convenience wrapper for NativeTypeT */
+template <typename T>
+using NativeType = typename NativeTypeT<T>::type;
+
+#ifndef DOXYGEN_SHOULD_SKIP_THIS
+namespace meta_details
+{
+template <typename T>
+auto Try_HasNativeType(int) -> SubstitutionSuccess<NativeType<T>>;
+template <typename T>
+auto Try_HasNativeType(...) -> std::false_type;
+}// namespace meta_details
+#endif // DOXYGEN_SHOULD_SKIP_THIS
+
+/** @struct HasNativeType
+ *  @brief Predicate that determines if a type is mappable to a
+ *         library-native type.
+ */
+template <typename T>
+struct HasNativeType : decltype(meta_details::Try_HasNativeType<T>(0)) {};
+
+/** @class IsSupportedType_Base
+ *  @brief Predicate indicating that a type is supported within rocBLAS
+ *         for the given operation.
+ *
+ *  This is used to map internal rocBLAS types to the operations that
+ *  are supported. For example, `float` is always supported but
+ *  `rocblas_half` only has support in a few functions.
+ */
+template <typename T, BLAS_Op op>
+struct IsSupportedType_Base : std::false_type {};
+
+template <BLAS_Op op>
+struct IsSupportedType_Base<float, op> : std::true_type {};
+template <BLAS_Op op>
+struct IsSupportedType_Base<double, op> : std::true_type {};
+template <>
+struct IsSupportedType_Base<float, BLAS_Op::DOT> : std::false_type {};
+template <>
+struct IsSupportedType_Base<double, BLAS_Op::DOT> : std::false_type {};
+
+#ifdef HYDROGEN_GPU_USE_FP16
+template <>
+struct IsSupportedType_Base<rocblas_half, BLAS_Op::AXPY> : std::true_type {};
+template <>
+struct IsSupportedType_Base<rocblas_half, BLAS_Op::GEMM> : std::true_type {};
+#endif // HYDROGEN_GPU_USE_FP16
+
+/** @class IsSupportedType
+ *  @brief Predicate indicating that the given type is compatible with
+ *         rocBLAS.
+ *
+ *  This is true when either the type is a compatible rocBLAS type
+ *  (e.g., float) or when it is binarily equivalent to one (e.g.,
+ *  std::complex<float>).
+ */
+template <typename T, BLAS_Op op, bool=HasNativeType<T>::value>
+struct IsSupportedType
+    : IsSupportedType_Base<NativeType<T>, op>
+{};
+
+template <typename T, BLAS_Op op>
+struct IsSupportedType<T, op, false> : std::false_type {};
+
+}// namespace rocblas
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCBLASMETA_HPP_
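A short sketch of how these predicates are meant to be consumed by a generic front end. BLAS_Op::AXPY is assumed to be one of the operation tags from BLAS_Common.hpp, and the dispatch function itself is invented for illustration:

    // Hypothetical dispatch: compile only for types with a
    // rocBLAS-supported native equivalent for AXPY.
    template <typename T>
    void AxpyIfSupported(int n, T const& alpha, T const* x, T* y)
    {
        using hydrogen::rocblas::IsSupportedType;
        using hydrogen::rocblas::NativeType;
        static_assert(
            IsSupportedType<T, hydrogen::BLAS_Op::AXPY>::value,
            "T has no rocBLAS-supported native equivalent for AXPY");

        // Reinterpret to the bitwise-equivalent native type and call the
        // wrapper declared in rocBLAS_API.hpp below.
        hydrogen::rocblas::Axpy(
            hydrogen::rocblas::GetLibraryHandle(), n,
            reinterpret_cast<NativeType<T> const&>(alpha),
            reinterpret_cast<NativeType<T> const*>(x), 1,
            reinterpret_cast<NativeType<T>*>(y), 1);
    }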
diff --git a/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp b/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp
new file mode 100644
index 0000000000..ee47d51aec
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/rocBLASUtil.hpp
@@ -0,0 +1,71 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLASUTIL_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLASUTIL_HPP_
+
+#include
+
+#include
+
+namespace hydrogen
+{
+namespace rocblas
+{
+
+/** @brief rocBLAS uses its own int typedef to represent sizes. */
+using SizeT = rocblas_int;
+
+/** @brief Convert a value to the size type expected by the rocBLAS
+ *         library.
+ *
+ *  If `HYDROGEN_DO_BOUNDS_CHECKING` is defined, this will do a
+ *  "safe cast" (it will verify that `val` is in the dynamic range of
+ *  `int`). Otherwise it will do a regular static_cast.
+ */
+template <typename T>
+#ifdef HYDROGEN_DO_BOUNDS_CHECKING
+SizeT ToSizeT(T const& val)
+{
+    return narrow_cast<SizeT>(val);
+}
+#else
+SizeT ToSizeT(T const& val) noexcept
+{
+    return static_cast<SizeT>(val);
+}
+#endif // HYDROGEN_DO_BOUNDS_CHECKING
+
+/** @brief Overload to prevent extra work in the case of dynamic range
+ *         checking.
+ */
+inline SizeT ToSizeT(SizeT const& val) noexcept
+{
+    return val;
+}
+
+/** @brief Convert a TransposeMode to the rocBLAS operation type. */
+inline rocblas_operation
+ToNativeTransposeMode(TransposeMode const& orient) noexcept
+{
+    switch (orient)
+    {
+    case TransposeMode::TRANSPOSE:
+        return rocblas_operation_transpose;
+    case TransposeMode::CONJ_TRANSPOSE:
+        return rocblas_operation_conjugate_transpose;
+    default: // TransposeMode::NORMAL
+        return rocblas_operation_none;
+    }
+}
+
+/** @brief Convert a SideMode to the rocBLAS side mode type. */
+inline rocblas_side
+ToNativeSideMode(SideMode const& side) noexcept
+{
+    if (side == SideMode::LEFT)
+        return rocblas_side_left;
+
+    return rocblas_side_right;
+}
+
+}// namespace rocblas
+}// namespace hydrogen
+#endif // HYDROGEN_DEVICE_GPU_ROCM_ROCBLASUTIL_HPP_
diff --git a/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp b/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp
new file mode 100644
index 0000000000..0ffe469ba4
--- /dev/null
+++ b/include/hydrogen/device/gpu/rocm/rocBLAS_API.hpp
@@ -0,0 +1,131 @@
+#ifndef HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_API_HPP_
+#define HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_API_HPP_
+
+#include
+
+#include
+
+namespace hydrogen
+{
+namespace rocblas
+{
+
+/** @name BLAS-1 Routines */
+///@{
+
+#define ADD_AXPY_DECL(ScalarType)                       \
+    void Axpy(rocblas_handle handle,                    \
+              int n, ScalarType const& alpha,           \
+              ScalarType const* X, int incx,            \
+              ScalarType* Y, int incy)
+
+#define ADD_COPY_DECL(ScalarType)                       \
+    void Copy(rocblas_handle handle,                    \
+              int n, ScalarType const* X, int incx,     \
+              ScalarType* Y, int incy)
+
+#define ADD_DOT_DECL(ScalarType)                \
+    void Dot(rocblas_handle handle,             \
+             int n,                             \
+             ScalarType const* X, int incx,     \
+             ScalarType const* Y, int incy,     \
+             ScalarType* output)
+
+#define ADD_NRM2_DECL(ScalarType)               \
+    void Nrm2(rocblas_handle handle,            \
+              int n,                            \
+              ScalarType const* X, int incx,    \
+              ScalarType* output)
+
+#define ADD_SCALE_DECL(ScalarType)                      \
+    void Scale(rocblas_handle handle,                   \
+               int n, ScalarType const& alpha,          \
+               ScalarType* X, int incx)
+
+#ifdef HYDROGEN_GPU_USE_FP16
+ADD_AXPY_DECL(rocblas_half);
+#endif // HYDROGEN_GPU_USE_FP16
+ADD_AXPY_DECL(float);
+ADD_AXPY_DECL(double);
+
+ADD_COPY_DECL(float);
+ADD_COPY_DECL(double);
+
+#ifdef HYDROGEN_GPU_USE_FP16
+ADD_SCALE_DECL(rocblas_half);
+#endif // HYDROGEN_GPU_USE_FP16
+ADD_SCALE_DECL(float);
+ADD_SCALE_DECL(double);
+
+///@}
+/** @name BLAS-2 Routines */
+///@{
+
+#define ADD_GEMV_DECL(ScalarType)                       \
+    void Gemv(                                          \
+        rocblas_handle handle,                          \
+        rocblas_operation transpA, int m, int n,        \
+        ScalarType const& alpha,                        \
+        ScalarType const* A, int lda,                   \
+        ScalarType const* x, int incx,                  \
+        ScalarType const& beta,                         \
+        ScalarType* y, int incy)
+
+ADD_GEMV_DECL(float);
+ADD_GEMV_DECL(double);
+
+///@}
+/** @name BLAS-3 Routines */
+///@{
+
+#define ADD_GEMM_DECL(ScalarType)               \
+    void Gemm(                                  \
+        rocblas_handle handle,                  \
+        rocblas_operation transpA,              \
+        rocblas_operation transpB,              \
+        int m, int n, int k,                    \
+        ScalarType const& alpha,                \
+        ScalarType const* A, int lda,           \
+        ScalarType const* B, int ldb,           \
+        ScalarType const& beta,                 \
+        ScalarType* C, int ldc)
+
+#ifdef HYDROGEN_GPU_USE_FP16
+ADD_GEMM_DECL(rocblas_half);
+#endif // HYDROGEN_GPU_USE_FP16
+ADD_GEMM_DECL(float);
+ADD_GEMM_DECL(double);
+
+///@}
+/** @name BLAS-like Extension Routines */
+///@{
+
+// We use this for Axpy2D, Copy2D, and Transpose
+#define ADD_GEAM_DECL(ScalarType)               \
+    void Geam(rocblas_handle handle,            \
+              rocblas_operation transpA,        \
+              rocblas_operation transpB,        \
+              int m, int n,                     \
+              ScalarType const& alpha,          \
+              ScalarType const* A, int lda,     \
+              ScalarType const& beta,           \
+              ScalarType const* B, int ldb,     \
+              ScalarType* C, int ldc)
+
+ADD_GEAM_DECL(float);
+ADD_GEAM_DECL(double);
+
+#define ADD_DGMM_DECL(ScalarType)               \
+    void Dgmm(rocblas_handle handle,            \
+              rocblas_side side,                \
+              int m, int n,                     \
+              ScalarType const* A, int lda,     \
+              ScalarType const* X, int incx,    \
+              ScalarType* C, int ldc)
+ADD_DGMM_DECL(float);
+ADD_DGMM_DECL(double);
+
+///@}
+}// namespace rocblas
+}// namespace hydrogen
+#endif //
HYDROGEN_DEVICE_GPU_ROCM_ROCBLAS_API_HPP_ diff --git a/include/hydrogen/meta/MetaUtilities.hpp b/include/hydrogen/meta/MetaUtilities.hpp index 89a8de91c9..86d3d3402d 100644 --- a/include/hydrogen/meta/MetaUtilities.hpp +++ b/include/hydrogen/meta/MetaUtilities.hpp @@ -112,6 +112,10 @@ using MakePointer = typename std::add_pointer::type; template using MakePointerToConst = MakePointer>; +/** @brief Convenience type predicate to check if two types are the same. */ +template +using IsSame = std::is_same; + // Wrapper around std::conditional template using Select = typename std::conditional::type; @@ -127,6 +131,5 @@ template struct EnumSame : std::true_type {}; ///@} - }// namespace hydrogen #endif // HYDROGEN_META_METAUTILITIES_HPP_ diff --git a/include/hydrogen/utils/SimpleBuffer.hpp b/include/hydrogen/utils/SimpleBuffer.hpp index 697476bb07..c210094b42 100644 --- a/include/hydrogen/utils/SimpleBuffer.hpp +++ b/include/hydrogen/utils/SimpleBuffer.hpp @@ -5,9 +5,10 @@ #include #include -#ifdef HYDROGEN_HAVE_CUDA -#include -#endif // HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU +#include +#include +#endif // HYDROGEN_HAVE_GPU #include @@ -66,28 +67,15 @@ void setBufferToValue(T* buffer, size_t size, T const& value, std::fill_n(buffer, size, value); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void setBufferToValue(T* buffer, size_t size, T const& value, - SyncInfo syncInfo = SyncInfo{}) + SyncInfo const& syncInfo) { - if( value == TypeTraits::Zero() ) - { - H_CHECK_CUDA(cudaMemsetAsync(buffer, 0x0, size*sizeof(T), - syncInfo.stream_)); - } - else - { - std::vector tmp(size, value); - H_CHECK_CUDA( - cudaMemcpyAsync( - buffer, tmp.data(), size*sizeof(T), - CUDAMemcpyKind(), - syncInfo.stream_)); - } + gpu::Fill1DBuffer(buffer, size, value, syncInfo); AddSynchronizationPoint(syncInfo); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU }// namespace details diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7412589367..03cadd3e4f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -14,8 +14,8 @@ add_subdirectory(hydrogen) # Propagate the files up the tree set(HYDROGEN_SOURCES "${SOURCES}" "${THIS_DIR_SOURCES}" PARENT_SCOPE) -if (HYDROGEN_HAVE_CUDA) - set(HYDROGEN_CUDA_SOURCES "${CUDA_SOURCES}" PARENT_SCOPE) +if (HYDROGEN_HAVE_GPU) + set(HYDROGEN_GPU_SOURCES "${GPU_SOURCES}" PARENT_SCOPE) endif () set(HYDROGEN_CATCH2_TEST_FILES "${CATCH2_TESTS}" "${THIS_DIR_CATCH2_TESTS}" PARENT_SCOPE) diff --git a/src/blas_like/level2/Gemv.cpp b/src/blas_like/level2/Gemv.cpp index 5556885af2..47c37bd0db 100644 --- a/src/blas_like/level2/Gemv.cpp +++ b/src/blas_like/level2/Gemv.cpp @@ -33,7 +33,7 @@ void Gemv(Orientation orientA, beta, static_cast&>(C)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Gemv(orientA, alpha, static_cast const&>(A), @@ -41,7 +41,7 @@ void Gemv(Orientation orientA, beta, static_cast&>(C)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Bad device type."); } @@ -309,7 +309,7 @@ void Gemv Gemv(orientation, alpha, A, x, T(0), y); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void Gemv(Orientation orientA, float alpha, Matrix const& A, @@ -335,7 +335,7 @@ template void Gemv(Orientation, gpu_half_type, gpu_half_type, AbstractMatrix&); #endif // HYDROGEN_GPU_USE_FP16 -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define PROTO(T) \ template void Gemv \ diff --git a/src/blas_like/level2/Gemv/Normal.hpp b/src/blas_like/level2/Gemv/Normal.hpp index 
c111a7d67e..8d315e3623 100644 --- a/src/blas_like/level2/Gemv/Normal.hpp +++ b/src/blas_like/level2/Gemv/Normal.hpp @@ -131,11 +131,11 @@ void Normal case Device::CPU: Normal_impl(alpha, APre, x, beta, yPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Normal_impl(alpha, APre, x, beta, yPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Gemv::Normal: Bad device."); } diff --git a/src/blas_like/level2/Gemv/Transpose.hpp b/src/blas_like/level2/Gemv/Transpose.hpp index cf67564de4..4d365428f2 100644 --- a/src/blas_like/level2/Gemv/Transpose.hpp +++ b/src/blas_like/level2/Gemv/Transpose.hpp @@ -148,11 +148,11 @@ void Transpose case Device::CPU: Transpose_impl(orientation, alpha, APre, x, beta, yPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Transpose_impl(orientation, alpha, APre, x, beta, yPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Gemv::Transpose: Bad device."); } diff --git a/src/blas_like/level3/Gemm.cpp b/src/blas_like/level3/Gemm.cpp index 3198cd1b15..16d338f2e9 100644 --- a/src/blas_like/level3/Gemm.cpp +++ b/src/blas_like/level3/Gemm.cpp @@ -72,7 +72,7 @@ InitializeComms(El::Grid const& g, InitGrid(g, syncInfo); InitGrid(g, syncInfo); } - H_CHECK_CUDA(cudaDeviceSynchronize()); + hydrogen::gpu::SynchronizeDevice(); initialized_grids_.push_front(&g); } return pool; @@ -114,7 +114,7 @@ void Gemm(Orientation orientA, Orientation orientB, beta, static_cast&>(C)); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: Gemm(orientA, orientB, alpha, static_cast const&>(A), @@ -122,7 +122,7 @@ void Gemm(Orientation orientA, Orientation orientB, beta, static_cast&>(C)); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Bad device type."); } @@ -159,7 +159,7 @@ void Gemm_impl( beta, C.Buffer(), C.LDim()); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void Gemm_impl( Orientation orientA, Orientation orientB, @@ -185,7 +185,7 @@ void Gemm_impl( beta, C.Buffer(), C.LDim(), master_sync); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU }// namespace @@ -439,7 +439,7 @@ void LocalGemm LocalGemm(orientA, orientB, alpha, A, B, TypeTraits::Zero(), C); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void Gemm(Orientation orientA, Orientation orientB, float alpha, Matrix const& A, @@ -460,7 +460,7 @@ template void Gemm(Orientation orientA, Orientation orientB, gpu_half_type beta, Matrix& C); #endif // HYDROGEN_GPU_USE_FP16 -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define ABSTRACT_PROTO(T) \ template void Gemm( \ diff --git a/src/blas_like/level3/Gemm/TN.hpp b/src/blas_like/level3/Gemm/TN.hpp index 4d2525f6a4..50486f448c 100644 --- a/src/blas_like/level3/Gemm/TN.hpp +++ b/src/blas_like/level3/Gemm/TN.hpp @@ -87,11 +87,11 @@ void SUMMA_TNA case Device::CPU: SUMMA_TNA_impl(orientA, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TNA_impl(orientA, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TNA: Bad device."); } @@ -202,11 +202,11 @@ void SUMMA_TNB case Device::CPU: SUMMA_TNB_impl(orientA, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TNB_impl(orientA, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // 
HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TNA: Bad device."); } @@ -317,11 +317,11 @@ void SUMMA_TNC case Device::CPU: SUMMA_TNC_impl(orientA, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TNC_impl(orientA, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TNA: Bad device."); } @@ -444,12 +444,12 @@ void SUMMA_TNDot SUMMA_TNDot_impl( orientA, alpha, APre, BPre, CPre, blockSize); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TNDot_impl( orientA, alpha, APre, BPre, CPre, blockSize); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TNA: Bad device."); } diff --git a/src/blas_like/level3/Gemm/TT.hpp b/src/blas_like/level3/Gemm/TT.hpp index b2078ce03d..1955b0111a 100644 --- a/src/blas_like/level3/Gemm/TT.hpp +++ b/src/blas_like/level3/Gemm/TT.hpp @@ -89,12 +89,12 @@ void SUMMA_TTA SUMMA_TTA_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TTA_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TTA: Bad device."); } @@ -179,12 +179,12 @@ void SUMMA_TTB SUMMA_TTB_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TTB_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TTB: Bad device."); } @@ -267,12 +267,12 @@ void SUMMA_TTC SUMMA_TTC_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TTC_impl( orientA, orientB, alpha, APre, BPre, CPre); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TTC: Bad device."); } @@ -364,12 +364,12 @@ void SUMMA_TTDot SUMMA_TTDot_impl( orientA, orientB, alpha, APre, BPre, CPre, blockSize); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: SUMMA_TTDot_impl( orientA, orientB, alpha, APre, BPre, CPre, blockSize); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("SUMMA_TTA: Bad device."); } diff --git a/src/blas_like/level3/SyncInfoPool.hpp b/src/blas_like/level3/SyncInfoPool.hpp index 30b9226db7..f54cedfb96 100644 --- a/src/blas_like/level3/SyncInfoPool.hpp +++ b/src/blas_like/level3/SyncInfoPool.hpp @@ -145,17 +145,21 @@ void swap(SyncInfoPool& a, SyncInfoPool& b) noexcept template <> SyncInfoPool::~SyncInfoPool() { +#ifdef HYDROGEN_HAVE_CUDA + using GPUErrorType = CUDAError; +#elif defined(HYDROGEN_HAVE_ROCM) + using GPUErrorType = HIPError; +#endif try { for (auto& si : pool_) { - H_CHECK_CUDA(cudaEventDestroy(si.event_)); - H_CHECK_CUDA(cudaStreamDestroy(si.stream_)); + DestroySyncInfo(si); } } - catch (CudaError const& e) + catch (GPUErrorType const& e) { - std::cerr << "Warning: CUDA error detected:\n\ne.what(): " + std::cerr << "Warning: GPU runtime error detected:\n\ne.what(): " << e.what() << std::endl; } @@ -176,21 +180,14 @@ void SyncInfoPool::EnsureSize(size_t pool_size) size_t const new_elements = pool_size - start_size; for (auto ii = 0UL; ii < new_elements; ++ii) { - cudaStream_t stream; - cudaEvent_t event; - H_CHECK_CUDA( - cudaStreamCreateWithFlags( - &stream, cudaStreamNonBlocking)); - H_CHECK_CUDA( - 
cudaEventCreateWithFlags( - &event, cudaEventDisableTiming)); + auto si = CreateNewSyncInfo(); #ifdef HYDROGEN_HAVE_NVPROF // Name the stream for debugging purposes std::string const stream_name = "H: SP (" + std::to_string(start_size + ii) + ")"; - nvtxNameCudaStreamA(stream, stream_name.c_str()); + nvtxNameCudaStreamA(si.Stream(), stream_name.c_str()); #endif // HYDROGEN_HAVE_NVPROF - pool_.emplace_back(stream, event); + pool_.emplace_back(std::move(si)); } // Handle iterators: diff --git a/src/blas_like/level3/sync_info_pool_test.cpp b/src/blas_like/level3/sync_info_pool_test.cpp index 44f56e2221..48c69ff210 100644 --- a/src/blas_like/level3/sync_info_pool_test.cpp +++ b/src/blas_like/level3/sync_info_pool_test.cpp @@ -65,8 +65,8 @@ TEST_CASE( for (auto const& si : tmp) { auto const& pool_si = pool.Next(); - CHECK(si.stream_ == pool_si.stream_); - CHECK(si.event_ == pool_si.event_); + CHECK(si.Stream() == pool_si.Stream()); + CHECK(si.Event() == pool_si.Event()); } SECTION("Moving the pool preserves iterators") @@ -79,8 +79,8 @@ TEST_CASE( auto const& pool_si = pool_mv.Next(); auto const& si = tmp[1]; - CHECK(si.stream_ == pool_si.stream_); - CHECK(si.event_ == pool_si.event_); + CHECK(si.Stream() == pool_si.Stream()); + CHECK(si.Event() == pool_si.Event()); } SECTION("Growing the pool preserves iterators") @@ -95,8 +95,8 @@ TEST_CASE( auto const& pool_si = pool.Next(); auto const& si = tmp[1]; - CHECK(si.stream_ == pool_si.stream_); - CHECK(si.event_ == pool_si.event_); + CHECK(si.Stream() == pool_si.Stream()); + CHECK(si.Event() == pool_si.Event()); } } SECTION("Resetting the pool returns to the same point") @@ -104,8 +104,8 @@ TEST_CASE( auto const& first = pool.Next(); pool.Reset(); auto const& after_reset = pool.Next(); - CHECK(first.event_ == after_reset.event_); - CHECK(first.stream_ == after_reset.stream_); + CHECK(first.Event() == after_reset.Event()); + CHECK(first.Stream() == after_reset.Stream()); } } diff --git a/src/core/DistMatrix/AbstractDistMatrix.cpp b/src/core/DistMatrix/AbstractDistMatrix.cpp index 91e3a2ca3c..d1f492a687 100644 --- a/src/core/DistMatrix/AbstractDistMatrix.cpp +++ b/src/core/DistMatrix/AbstractDistMatrix.cpp @@ -124,7 +124,7 @@ AbstractDistMatrix::MakeSizeConsistent(bool includingViewers) if (this->GetLocalDevice() == Device::CPU) mpi::Broadcast(message, msgSize, Root(), CrossComm(), SyncInfo{}); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU else if (this->GetLocalDevice() == Device::GPU) { auto syncInfo = SyncInfoFromMatrix( @@ -133,7 +133,7 @@ AbstractDistMatrix::MakeSizeConsistent(bool includingViewers) mpi::Broadcast(message, msgSize, Root(), CrossComm(), syncInfo); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU else LogicError("AbstractMatrix: Bad Device!"); } @@ -143,7 +143,7 @@ AbstractDistMatrix::MakeSizeConsistent(bool includingViewers) if (this->GetLocalDevice() == Device::CPU) mpi::Broadcast(message, msgSize, vcRoot, grid.ViewingComm(), SyncInfo{}); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU else if (this->GetLocalDevice() == Device::GPU) { auto syncInfo = SyncInfoFromMatrix( @@ -152,7 +152,7 @@ AbstractDistMatrix::MakeSizeConsistent(bool includingViewers) mpi::Broadcast(message, msgSize, vcRoot, grid.ViewingComm(), syncInfo); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU else LogicError("AbstractMatrix: Bad Device!"); diff --git a/src/core/DistMatrix/ElementMatrix.cpp b/src/core/DistMatrix/ElementMatrix.cpp index 8523ce825d..eb6e0f421d 100644 --- a/src/core/DistMatrix/ElementMatrix.cpp +++ 
b/src/core/DistMatrix/ElementMatrix.cpp @@ -114,7 +114,7 @@ ElementalMatrix::MakeConsistent(bool includingViewers) if (this->GetLocalDevice() == Device::CPU) mpi::Broadcast(message, msgLength, this->Root(), this->CrossComm(), SyncInfo{}); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU else if (this->GetLocalDevice() == Device::GPU) { auto syncInfo = SyncInfoFromMatrix( @@ -123,7 +123,7 @@ ElementalMatrix::MakeConsistent(bool includingViewers) mpi::Broadcast(message, msgLength, this->Root(), this->CrossComm(), syncInfo); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU else LogicError("ElementalMatrix: Bad Device!"); } @@ -133,7 +133,7 @@ ElementalMatrix::MakeConsistent(bool includingViewers) if (this->GetLocalDevice() == Device::CPU) mpi::Broadcast(message, msgLength, vcRoot, grid.ViewingComm(), SyncInfo{}); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU else if (this->GetLocalDevice() == Device::GPU) { auto syncInfo = SyncInfoFromMatrix( @@ -142,7 +142,7 @@ ElementalMatrix::MakeConsistent(bool includingViewers) mpi::Broadcast(message, msgLength, vcRoot, grid.ViewingComm(), syncInfo); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU else LogicError("ElementalMatrix: Bad Device!"); } @@ -395,7 +395,7 @@ ElementalMatrix::Attach( static_cast&>(this->Matrix()). Attach_(localHeight, localWidth, buffer, ldim); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case El::Device::GPU: static_cast&>(this->Matrix()). Attach_(localHeight, localWidth, buffer, ldim); @@ -459,7 +459,7 @@ ElementalMatrix::LockedAttach( static_cast&>(this->Matrix()). LockedAttach_(localHeight, localWidth, buffer, ldim); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case El::Device::GPU: static_cast&>(this->Matrix()). LockedAttach_(localHeight, localWidth, buffer, ldim); diff --git a/src/core/DistMatrix/ElementMatrix/CIRC_CIRC.cpp b/src/core/DistMatrix/ElementMatrix/CIRC_CIRC.cpp index dbbd10aa61..2e036a11d5 100644 --- a/src/core/DistMatrix/ElementMatrix/CIRC_CIRC.cpp +++ b/src/core/DistMatrix/ElementMatrix/CIRC_CIRC.cpp @@ -144,7 +144,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Inter-device copy ctors #ifdef HYDROGEN_GPU_USE_FP16 template @@ -247,7 +247,7 @@ template DistMatrix::DistMatrix( template DistMatrix& DistMatrix::operator=( const DistMatrix&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/MC_MR.cpp b/src/core/DistMatrix/ElementMatrix/MC_MR.cpp index 6e1b8cb7b2..0b737081fa 100644 --- a/src/core/DistMatrix/ElementMatrix/MC_MR.cpp +++ b/src/core/DistMatrix/ElementMatrix/MC_MR.cpp @@ -278,7 +278,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Inter-device copy ctors #ifdef HYDROGEN_GPU_USE_FP16 template DistMatrix::DistMatrix( @@ -375,7 +375,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/MC_STAR.cpp b/src/core/DistMatrix/ElementMatrix/MC_STAR.cpp index 51568a0be8..332cd7bb12 100644 --- a/src/core/DistMatrix/ElementMatrix/MC_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/MC_STAR.cpp @@ -301,7 +301,7 @@ int 
DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -389,7 +389,7 @@ template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/MD_STAR.cpp b/src/core/DistMatrix/ElementMatrix/MD_STAR.cpp index 1d0b923125..a1acaf2b4e 100644 --- a/src/core/DistMatrix/ElementMatrix/MD_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/MD_STAR.cpp @@ -271,7 +271,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -352,7 +352,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/MR_MC.cpp b/src/core/DistMatrix/ElementMatrix/MR_MC.cpp index e21d04b5de..8b0e4a0435 100644 --- a/src/core/DistMatrix/ElementMatrix/MR_MC.cpp +++ b/src/core/DistMatrix/ElementMatrix/MR_MC.cpp @@ -272,7 +272,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Inter-device copy ctors template DistMatrix::DistMatrix( const DistMatrix&); @@ -365,7 +365,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/MR_STAR.cpp b/src/core/DistMatrix/ElementMatrix/MR_STAR.cpp index 8076016559..7299173c83 100644 --- a/src/core/DistMatrix/ElementMatrix/MR_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/MR_STAR.cpp @@ -300,7 +300,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Inter-device copy ctors #ifdef HYDROGEN_GPU_USE_FP16 template DistMatrix::DistMatrix( @@ -397,7 +397,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_MC.cpp b/src/core/DistMatrix/ElementMatrix/STAR_MC.cpp index 41d21af8ac..d4a2d2b3bb 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_MC.cpp +++ b/src/core/DistMatrix/ElementMatrix/STAR_MC.cpp @@ -295,7 +295,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -376,7 +376,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_MD.cpp b/src/core/DistMatrix/ElementMatrix/STAR_MD.cpp index e3e634dcd8..de90940033 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_MD.cpp +++ 
b/src/core/DistMatrix/ElementMatrix/STAR_MD.cpp @@ -271,7 +271,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -356,7 +356,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_MR.cpp b/src/core/DistMatrix/ElementMatrix/STAR_MR.cpp index fa739c9d83..59af515b4e 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_MR.cpp +++ b/src/core/DistMatrix/ElementMatrix/STAR_MR.cpp @@ -306,7 +306,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -391,7 +391,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_STAR.cpp b/src/core/DistMatrix/ElementMatrix/STAR_STAR.cpp index 2a0bbe62c4..5ffbf20245 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/STAR_STAR.cpp @@ -256,7 +256,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR); \ BOTH(T,VR, STAR); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // Inter-device copy ctors #ifdef HYDROGEN_GPU_USE_FP16 template DistMatrix::DistMatrix( @@ -353,7 +353,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_VC.cpp b/src/core/DistMatrix/ElementMatrix/STAR_VC.cpp index a3619202d4..217314fcac 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_VC.cpp +++ b/src/core/DistMatrix/ElementMatrix/STAR_VC.cpp @@ -281,7 +281,7 @@ int DM::PartialUnionColRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -362,7 +362,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/STAR_VR.cpp b/src/core/DistMatrix/ElementMatrix/STAR_VR.cpp index 1b9e8afab7..a72b2943ce 100644 --- a/src/core/DistMatrix/ElementMatrix/STAR_VR.cpp +++ b/src/core/DistMatrix/ElementMatrix/STAR_VR.cpp @@ -272,7 +272,7 @@ int DM::PartialUnionColRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -353,7 +353,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/VC_STAR.cpp b/src/core/DistMatrix/ElementMatrix/VC_STAR.cpp index 
c336aa90f7..ae34eec855 100644 --- a/src/core/DistMatrix/ElementMatrix/VC_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/VC_STAR.cpp @@ -281,7 +281,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT OTHER(T,VC, STAR,Device::CPU); \ BOTH(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -362,7 +362,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/VR_STAR.cpp b/src/core/DistMatrix/ElementMatrix/VR_STAR.cpp index 721b88e80e..09b9a6cd16 100644 --- a/src/core/DistMatrix/ElementMatrix/VR_STAR.cpp +++ b/src/core/DistMatrix/ElementMatrix/VR_STAR.cpp @@ -277,7 +277,7 @@ int DM::PartialUnionRowRank() const EL_NO_EXCEPT BOTH(T,VC, STAR,Device::CPU); \ OTHER(T,VR, STAR,Device::CPU); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define INSTGPU(T,U,V) \ template DistMatrix::DistMatrix \ (DistMatrix const&); \ @@ -358,7 +358,7 @@ DistMatrix::operator=( template DistMatrix& DistMatrix::operator=( DistMatrix const&); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/src/core/DistMatrix/ElementMatrix/setup.hpp b/src/core/DistMatrix/ElementMatrix/setup.hpp index 65723b2f8f..eb7d7467c0 100644 --- a/src/core/DistMatrix/ElementMatrix/setup.hpp +++ b/src/core/DistMatrix/ElementMatrix/setup.hpp @@ -234,10 +234,10 @@ DM::ConstructWithNewDevice(Device D2) const { case Device::CPU: return ConstructWithNewDevice_impl_(); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: return ConstructWithNewDevice_impl_(); -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Unkown device type."); } diff --git a/src/core/MemoryPool.cpp b/src/core/MemoryPool.cpp index b7af3b4e4b..1b5058f651 100644 --- a/src/core/MemoryPool.cpp +++ b/src/core/MemoryPool.cpp @@ -7,13 +7,13 @@ namespace El namespace { -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU std::unique_ptr> pinnedHostMemoryPool_; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU std::unique_ptr> hostMemoryPool_; } // namespace -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU MemoryPool& PinnedHostMemoryPool() { @@ -25,7 +25,7 @@ MemoryPool& PinnedHostMemoryPool() void DestroyPinnedHostMemoryPool() { pinnedHostMemoryPool_.reset(); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MemoryPool& HostMemoryPool() { diff --git a/src/core/environment.cpp b/src/core/environment.cpp index 46a4b9b820..fc0db9f499 100644 --- a/src/core/environment.cpp +++ b/src/core/environment.cpp @@ -9,6 +9,12 @@ */ #include +#include + +#ifdef HYDROGEN_HAVE_GPU +#include +#endif // HYDROGEN_HAVE_GPU + #include #include @@ -216,9 +222,9 @@ void Initialize( int& argc, char**& argv ) ::args = new Args( argc, argv, mpi::COMM_WORLD, std::cerr ); -#ifdef HYDROGEN_HAVE_CUDA - InitializeCUDA(argc, argv); -#endif // HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU + gpu::Initialize(); +#endif // HYDROGEN_HAVE_GPU ::numElemInits = 1; if( !mpi::Initialized() ) @@ -296,6 +302,9 @@ void Initialize( int& argc, char**& argv ) #ifdef HYDROGEN_HAVE_CUDA cublas::Initialize(); #endif +#ifdef HYDROGEN_HAVE_ROCM + hydrogen::rocblas::Initialize(); +#endif #ifdef EL_HAVE_QT5 InitializeQt5( argc, argv ); @@ -358,8 +367,8 @@ void Finalize() 
FinalizeRandom(); } -#ifdef HYDROGEN_HAVE_CUDA - FinalizeCUDA(); +#ifdef HYDROGEN_HAVE_GPU + gpu::Finalize(); #endif EL_DEBUG_ONLY( CloseLog() ) diff --git a/src/core/imports/CMakeLists.txt b/src/core/imports/CMakeLists.txt index 021e2093e0..1a688f563e 100644 --- a/src/core/imports/CMakeLists.txt +++ b/src/core/imports/CMakeLists.txt @@ -12,10 +12,6 @@ set_full_path(THIS_DIR_SOURCES scalapack.cpp ) -if (HYDROGEN_HAVE_CUDA) - set_full_path(CUDA_SOURCES cuda.cpp cublas.cpp) - list(APPEND THIS_DIR_SOURCES ${CUDA_SOURCES}) -endif() if (HYDROGEN_HAVE_CUB) set_full_path(CUB_SOURCES cub.cpp) list(APPEND THIS_DIR_SOURCES ${CUB_SOURCES}) diff --git a/src/core/imports/cub.cpp b/src/core/imports/cub.cpp index 16a25730d1..61fb2ca6bc 100644 --- a/src/core/imports/cub.cpp +++ b/src/core/imports/cub.cpp @@ -1,7 +1,11 @@ -#include +#include "hydrogen/device/gpu/CUB.hpp" #include +namespace hydrogen +{ +namespace cub +{ namespace { @@ -27,7 +31,7 @@ unsigned int get_min_bin() noexcept unsigned int get_max_bin() noexcept { return get_env_uint("H_CUB_MAX_BIN", - ::cub::CachingDeviceAllocator::INVALID_BIN); + cub_impl::CachingDeviceAllocator::INVALID_BIN); } size_t get_max_cached_size() noexcept @@ -35,7 +39,7 @@ size_t get_max_cached_size() noexcept char const* env = std::getenv("H_CUB_MAX_CACHED_SIZE"); return (env ? static_cast(std::stoul(env)) - : ::cub::CachingDeviceAllocator::INVALID_SIZE); + : cub_impl::CachingDeviceAllocator::INVALID_SIZE); } bool get_debug() noexcept @@ -47,19 +51,14 @@ bool get_debug() noexcept } /** Singleton instance of CUB memory pool. */ -std::unique_ptr<::cub::CachingDeviceAllocator> memoryPool_; +std::unique_ptr memoryPool_; } // namespace -namespace hydrogen -{ -namespace cub -{ - -::cub::CachingDeviceAllocator& MemoryPool() +cub_impl::CachingDeviceAllocator& MemoryPool() { if (!memoryPool_) memoryPool_.reset( - new ::cub::CachingDeviceAllocator( + new cub_impl::CachingDeviceAllocator( get_bin_growth(), get_min_bin(), get_max_bin(), diff --git a/src/core/imports/mpi.cpp b/src/core/imports/mpi.cpp index 661c7a0dad..9d724f516e 100644 --- a/src/core/imports/mpi.cpp +++ b/src/core/imports/mpi.cpp @@ -2484,7 +2484,7 @@ EL_NO_RELEASE_EXCEPT EL_NO_RELEASE_EXCEPT; \ MPI_PROTO_COMMON_DEV(Complex,D) -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define MPI_PROTO(T) \ MPI_PROTO_DEVICELESS(T) \ MPI_PROTO_DEV(T, Device::CPU) \ @@ -2506,7 +2506,7 @@ MPI_PROTO(Entry) #define MPI_PROTO_COMPLEX(T) \ MPI_PROTO_DEVICELESS_COMPLEX(T) \ MPI_PROTO_COMPLEX_DEV(T, Device::CPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_PROTO(byte) MPI_PROTO(int) diff --git a/src/core/imports/mpi/AllGather.hpp b/src/core/imports/mpi/AllGather.hpp index bfff12bb6a..c7c7d3d2e4 100644 --- a/src/core/imports/mpi/AllGather.hpp +++ b/src/core/imports/mpi/AllGather.hpp @@ -132,14 +132,14 @@ void AllGather( template void AllGather(const T* sbuf, int sc, T* rbuf, int rc, Comm const& comm, \ SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_ALLGATHER_PROTO(T) \ MPI_ALLGATHER_PROTO_DEV(T,Device::CPU) #else #define MPI_ALLGATHER_PROTO(T) \ MPI_ALLGATHER_PROTO_DEV(T,Device::CPU); \ MPI_ALLGATHER_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_ALLGATHER_PROTO(byte); MPI_ALLGATHER_PROTO(int); diff --git a/src/core/imports/mpi/AllReduce.hpp b/src/core/imports/mpi/AllReduce.hpp index 2ad7b151db..f916d97518 100644 --- a/src/core/imports/mpi/AllReduce.hpp +++ b/src/core/imports/mpi/AllReduce.hpp @@ -254,14 +254,14 @@ void AllReduce(T* 
buf, int count, Comm const& comm, SyncInfo const& syncInfo) template T AllReduce(T, Comm const&, SyncInfo const&); \ template void AllReduce(T*, int, Comm const&, SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_ALLREDUCE_PROTO(T) \ MPI_ALLREDUCE_PROTO_DEV(T,Device::CPU) #else #define MPI_ALLREDUCE_PROTO(T) \ MPI_ALLREDUCE_PROTO_DEV(T,Device::CPU); \ MPI_ALLREDUCE_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_ALLREDUCE_PROTO(byte); MPI_ALLREDUCE_PROTO(int); diff --git a/src/core/imports/mpi/AllToAll.hpp b/src/core/imports/mpi/AllToAll.hpp index 7b2d08a6a0..a0cff86729 100644 --- a/src/core/imports/mpi/AllToAll.hpp +++ b/src/core/imports/mpi/AllToAll.hpp @@ -110,14 +110,14 @@ void AllToAll(T const*, int, T*, int, Comm const&, SyncInfo const&) #define MPI_ALLTOALL_PROTO_DEV(T,D) \ template void AllToAll(T const*, int, T*, int, Comm const&, SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_ALLTOALL_PROTO(T) \ MPI_ALLTOALL_PROTO_DEV(T,Device::CPU) #else #define MPI_ALLTOALL_PROTO(T) \ MPI_ALLTOALL_PROTO_DEV(T,Device::CPU); \ MPI_ALLTOALL_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_ALLTOALL_PROTO(byte); MPI_ALLTOALL_PROTO(int); diff --git a/src/core/imports/mpi/Broadcast.hpp b/src/core/imports/mpi/Broadcast.hpp index 92b5747381..04a3fb859b 100644 --- a/src/core/imports/mpi/Broadcast.hpp +++ b/src/core/imports/mpi/Broadcast.hpp @@ -111,14 +111,14 @@ void Broadcast( T& b, int root, Comm const& comm, SyncInfo const& syncInfo ) template void Broadcast(T*, int, int, Comm const&, SyncInfo const&); \ template void Broadcast(T&, int, Comm const&, SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_BROADCAST_PROTO(T) \ MPI_BROADCAST_PROTO_DEV(T,Device::CPU) #else #define MPI_BROADCAST_PROTO(T) \ MPI_BROADCAST_PROTO_DEV(T,Device::CPU); \ MPI_BROADCAST_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_BROADCAST_PROTO(byte); MPI_BROADCAST_PROTO(int); diff --git a/src/core/imports/mpi/Gather.hpp b/src/core/imports/mpi/Gather.hpp index f0d3d3a7e7..135f7d4225 100644 --- a/src/core/imports/mpi/Gather.hpp +++ b/src/core/imports/mpi/Gather.hpp @@ -126,7 +126,7 @@ void Gather(const T*, int, T*, int, int, Comm const&, SyncInfo const&) template void Gather(const Complex* sbuf, int sc, Complex* rbuf, \ int rc, int root, Comm const& comm, SyncInfo const&); -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_COLLECTIVE_PROTO(T) \ MPI_COLLECTIVE_PROTO_DEV(T,Device::CPU) #define MPI_COLLECTIVE_COMPLEX_PROTO(T) \ @@ -138,7 +138,7 @@ void Gather(const T*, int, T*, int, int, Comm const&, SyncInfo const&) #define MPI_COLLECTIVE_COMPLEX_PROTO(T) \ MPI_COLLECTIVE_COMPLEX_PROTO_DEV(T,Device::CPU) \ MPI_COLLECTIVE_COMPLEX_PROTO_DEV(T,Device::GPU) -#endif +#endif // HYDROGEN_HAVE_GPU MPI_COLLECTIVE_PROTO(byte) MPI_COLLECTIVE_PROTO(int) diff --git a/src/core/imports/mpi/Reduce.hpp b/src/core/imports/mpi/Reduce.hpp index e00e62bccf..4d20b53cf9 100644 --- a/src/core/imports/mpi/Reduce.hpp +++ b/src/core/imports/mpi/Reduce.hpp @@ -317,14 +317,14 @@ void Reduce(T* buf, int count, int root, Comm const& comm, template T Reduce(T, int, Comm const&, SyncInfo const&); \ template void Reduce(T*, int, int, Comm const&, SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_REDUCE_PROTO(T) \ MPI_REDUCE_PROTO_DEV(T,Device::CPU) #else #define MPI_REDUCE_PROTO(T) \ 
MPI_REDUCE_PROTO_DEV(T,Device::CPU); \ MPI_REDUCE_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_REDUCE_PROTO(byte); MPI_REDUCE_PROTO(int); diff --git a/src/core/imports/mpi/ReduceScatter.hpp b/src/core/imports/mpi/ReduceScatter.hpp index 9d339b6dc8..d5f6bb9f43 100644 --- a/src/core/imports/mpi/ReduceScatter.hpp +++ b/src/core/imports/mpi/ReduceScatter.hpp @@ -1,4 +1,7 @@ // ReduceScatter +#ifdef HYDROGEN_HAVE_GPU +#include "hydrogen/device/gpu/BasicCopy.hpp" +#endif // HYDROGEN_HAVE_GPU namespace El { @@ -15,18 +18,16 @@ void LocalCopy(T const* EL_RESTRICT src, return std::copy_n(src, size, dest); } -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void LocalCopy(T const* EL_RESTRICT src, T* EL_RESTRICT dest, size_t size, SyncInfo const& si) { - H_CHECK_CUDA(cudaMemcpyAsync(dest, src, sizeof(T)*size, - cudaMemcpyDeviceToDevice, - si.stream_)); + gpu::Copy1DIntraDevice(src, dest, size, si); } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU } // IsValidAluminumDeviceType should mean both that the device/type @@ -290,14 +291,14 @@ void ReduceScatter(T* buf, int rc, Comm const& comm, SyncInfo const& syncInfo template T ReduceScatter(T, Comm const&, SyncInfo const&); \ template void ReduceScatter(T*, int, Comm const&, SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_REDUCESCATTER_PROTO(T) \ MPI_REDUCESCATTER_PROTO_DEV(T,Device::CPU) #else #define MPI_REDUCESCATTER_PROTO(T) \ MPI_REDUCESCATTER_PROTO_DEV(T,Device::CPU); \ MPI_REDUCESCATTER_PROTO_DEV(T,Device::GPU) -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU MPI_REDUCESCATTER_PROTO(byte); MPI_REDUCESCATTER_PROTO(int); diff --git a/src/core/imports/mpi/Scatter.hpp b/src/core/imports/mpi/Scatter.hpp index d3c21f4555..6e316185b7 100644 --- a/src/core/imports/mpi/Scatter.hpp +++ b/src/core/imports/mpi/Scatter.hpp @@ -117,7 +117,7 @@ void Scatter(const T*, int, T*, int, int, Comm const&, SyncInfo const&) template void Scatter(const T* sbuf, int sc, T* rbuf, int rc, int root, \ Comm const& comm, SyncInfo const&) -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU #define MPI_COLLECTIVE_PROTO(T) \ MPI_COLLECTIVE_PROTO_DEV(T,Device::CPU); \ MPI_COLLECTIVE_PROTO_DEV(T,Device::GPU) diff --git a/src/core/imports/mpi/SendRecv.hpp b/src/core/imports/mpi/SendRecv.hpp index 8e9e12dd5c..75326bf87a 100644 --- a/src/core/imports/mpi/SendRecv.hpp +++ b/src/core/imports/mpi/SendRecv.hpp @@ -79,7 +79,7 @@ T SendRecv( T sb, int to, int from, Comm const& comm, SyncInfo const& syncInf T* buf, int count, int to, int from, Comm const& comm, \ SyncInfo const&) -#ifndef HYDROGEN_HAVE_CUDA +#ifndef HYDROGEN_HAVE_GPU #define MPI_COLLECTIVE_PROTO(T) \ MPI_COLLECTIVE_PROTO_DEV(T,Device::CPU) #else diff --git a/src/core/imports/mpi_utils.hpp b/src/core/imports/mpi_utils.hpp index b26fd0fc7d..d5a84f78f7 100644 --- a/src/core/imports/mpi_utils.hpp +++ b/src/core/imports/mpi_utils.hpp @@ -17,7 +17,14 @@ #ifndef EL_IMPORTS_MPIUTILS_HPP #define EL_IMPORTS_MPIUTILS_HPP -namespace { +#include + +#ifdef HYDROGEN_HAVE_GPU +#include +#endif + +namespace +{ template MPI_Op NativeOp( const El::mpi::Op& op ) @@ -83,7 +90,7 @@ class PassthroughMemoryWrapper template class ManagedHostMemoryWrapper; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template class ManagedHostMemoryWrapper { @@ -101,23 +108,32 @@ class ManagedHostMemoryWrapper final_xfer_size_{final_xfer_size} { if ((host_data_.size() > 0) && (initial_xfer_size > 0)) - InterDeviceCopy::MemCopy1DAsync( - 
host_data_.data()+initial_xfer_offset, + { + gpu::Copy1DToHost( device_data_+initial_xfer_offset, - initial_xfer_size, syncInfo_.stream_); + host_data_.data()+initial_xfer_offset, + initial_xfer_size, syncInfo_); + } } ~ManagedHostMemoryWrapper() { // Transfer stuff back to device - if ((host_data_.size() > 0) && (final_xfer_size_ > 0)) + try { - InterDeviceCopy::MemCopy1DAsync( - device_data_+final_xfer_offset_, - host_data_.data()+final_xfer_offset_, - final_xfer_size_, syncInfo_.stream_); + if ((host_data_.size() > 0) && (final_xfer_size_ > 0)) + { + gpu::Copy1DToDevice( + host_data_.data()+final_xfer_offset_, + device_data_+final_xfer_offset_, + final_xfer_size_, syncInfo_); + } Synchronize(syncInfo_); } + catch (std::exception const& e) + { + H_REPORT_DTOR_EXCEPTION_AND_TERMINATE(e); + } } // Enable move construction/assignment @@ -135,7 +151,7 @@ class ManagedHostMemoryWrapper size_t final_xfer_offset_; size_t final_xfer_size_; };// class ManagedHostMemoryWrapper -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template auto MakeHostBuffer(T* buf, size_t const& size, @@ -158,7 +174,7 @@ MakeManagedHostBuffer(T* buf, size_t const&, size_t const&, size_t const&, template struct type_check; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU // This can't (shouldn't) just be std::vector because we want // pinned memory for GPUs. And I don't want to write a new Allocator // for std::vector that uses pinned memory through CUDA. We can access @@ -170,11 +186,10 @@ auto MakeHostBuffer(T const* buf, size_t const& size, { simple_buffer locbuf( size, SyncInfo{}, /*mode=*/ 1); - InterDeviceCopy::MemCopy1DAsync( - locbuf.data(), buf, size, syncInfo.stream_); + gpu::Copy1DToHost(buf, locbuf.data(), size, syncInfo); return locbuf; } -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU template auto MakeManagedHostBuffer( diff --git a/src/core/mpi_register.cpp b/src/core/mpi_register.cpp index 0712caf66d..f9ffa357c6 100644 --- a/src/core/mpi_register.cpp +++ b/src/core/mpi_register.cpp @@ -6,6 +6,7 @@ which can be found in the LICENSE file in the root directory, or at http://opensource.org/licenses/BSD-2-Clause */ +#define H_INSTANTIATING_MPI_TYPES_STRUCT #include using std::function; diff --git a/src/hydrogen/CMakeLists.txt b/src/hydrogen/CMakeLists.txt index da07066b1e..a171ea49ea 100644 --- a/src/hydrogen/CMakeLists.txt +++ b/src/hydrogen/CMakeLists.txt @@ -1,6 +1,12 @@ add_subdirectory(blas) +add_subdirectory(device) set(SOURCES "${SOURCES}" PARENT_SCOPE) if (HYDROGEN_HAVE_GPU) - set(CUDA_SOURCES "${CUDA_SOURCES}" PARENT_SCOPE) + set(GPU_SOURCES "${GPU_SOURCES}" PARENT_SCOPE) endif () + +set_full_path(THIS_DIR_CXX_SOURCES + Error.cpp) + +set(SOURCES "${SOURCES}" "${THIS_DIR_CXX_SOURCES}" PARENT_SCOPE) diff --git a/src/hydrogen/Error.cpp b/src/hydrogen/Error.cpp new file mode 100644 index 0000000000..44df791f3b --- /dev/null +++ b/src/hydrogen/Error.cpp @@ -0,0 +1,15 @@ +#include + +namespace hydrogen +{ +namespace +{ +volatile size_t break_on_me_called_ = 0UL; +} + +void break_on_me() +{ + break_on_me_called_ += 1UL; +} + +}// namespace hydrogen diff --git a/src/hydrogen/blas/CMakeLists.txt b/src/hydrogen/blas/CMakeLists.txt index 5487e6c7d6..3c47c8bd00 100644 --- a/src/hydrogen/blas/CMakeLists.txt +++ b/src/hydrogen/blas/CMakeLists.txt @@ -1,5 +1,5 @@ if (HYDROGEN_HAVE_GPU) add_subdirectory(gpu) - set(CUDA_SOURCES "${CUDA_SOURCES}" PARENT_SCOPE) + set(GPU_SOURCES "${GPU_SOURCES}" PARENT_SCOPE) endif () diff --git a/src/hydrogen/blas/gpu/Axpy.cu b/src/hydrogen/blas/gpu/Axpy.cu 
index 1773463df3..e238c3af8c 100644 --- a/src/hydrogen/blas/gpu/Axpy.cu +++ b/src/hydrogen/blas/gpu/Axpy.cu @@ -2,11 +2,16 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include #include #include namespace cg = cooperative_groups; +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace { @@ -20,7 +25,9 @@ __global__ void axpy_2d_transpose_tiled_kernel( // All the fun of a transpose meets the awesomeness of Axpy. :D // // remember: B is m x n, A is n x m +#ifdef HYDROGEN_HAVE_CUDA cg::thread_block cta = cg::this_thread_block(); +#endif __shared__ T tile[TILE_SIZE][TILE_SIZE+1]; auto const row_start_A = blockIdx.y * TILE_SIZE + threadIdx.x; @@ -49,7 +56,11 @@ __global__ void axpy_2d_transpose_tiled_kernel( } } +#ifdef HYDROGEN_HAVE_CUDA cg::sync(cta); +#else + __syncthreads(); +#endif // If I am a valid row in B, I need to store things if (row_start_B < m) @@ -109,7 +120,7 @@ void Axpy_GPU_impl( T alpha, T const* X, SizeT colStrideX, SizeT rowStrideX, T* Y, SizeT colStrideY, SizeT rowStrideY, - cudaStream_t stream) + SyncInfo const& sync_info) { if (height == TypeTraits::Zero() || width == TypeTraits::Zero()) @@ -127,14 +138,13 @@ void Axpy_GPU_impl( dim3 blks((height + TILE_SIZE - 1) / TILE_SIZE, (width + TILE_SIZE - 1) / TILE_SIZE, 1); dim3 thds(TILE_SIZE, BLK_COLS, 1); - void* args[] = {&height, &width, &alpha, - &X, &colStrideX, &rowStrideX, - &Y, &colStrideY, &rowStrideY}; - - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&axpy_2d_tiled_kernel, - blks, thds, args, 0, stream)); + + gpu::LaunchKernel( + axpy_2d_tiled_kernel, + blks, thds, 0, sync_info, + height, width, alpha, + X, colStrideX, rowStrideX, + Y, colStrideY, rowStrideY); } template @@ -144,7 +154,7 @@ void Axpy_GPU_impl( T alpha, T const* A, SizeT lda, T* B, SizeT ldb, - cudaStream_t stream) + SyncInfo const& sync_info) { // Short-circuit if (height <= TypeTraits::Zero() @@ -157,7 +167,7 @@ void Axpy_GPU_impl( return Axpy_GPU_impl( height, width, alpha, A, TypeTraits::One(), lda, - B, TypeTraits::One(), ldb, stream); + B, TypeTraits::One(), ldb, sync_info); constexpr int TILE_SIZE = 32; constexpr int BLK_COLS = 8; @@ -165,24 +175,22 @@ void Axpy_GPU_impl( dim3 blks((height + TILE_SIZE - 1) / TILE_SIZE, (width + TILE_SIZE - 1) / TILE_SIZE, 1); dim3 thds(TILE_SIZE, BLK_COLS, 1); - void* args[] = {&height, &width, &alpha, &A, &lda, &B, &ldb}; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&axpy_2d_transpose_tiled_kernel - , - blks, thds, args, 0, stream)); + gpu::LaunchKernel( + axpy_2d_transpose_tiled_kernel, + blks, thds, 0, sync_info, + height, width, alpha, A, lda, B, ldb); } -#define ETI(ScalarT, SizeT) \ - template void Axpy_GPU_impl( \ - SizeT, SizeT, ScalarT, \ - ScalarT const*, SizeT, SizeT, \ - ScalarT*, SizeT, SizeT, cudaStream_t); \ - template void Axpy_GPU_impl( \ - TransposeMode, SizeT, SizeT, ScalarT, \ - ScalarT const*, SizeT, \ - ScalarT*, SizeT, cudaStream_t) +#define ETI(ScalarT, SizeT) \ + template void Axpy_GPU_impl( \ + SizeT, SizeT, ScalarT, \ + ScalarT const*, SizeT, SizeT, \ + ScalarT*, SizeT, SizeT, SyncInfo const&); \ + template void Axpy_GPU_impl( \ + TransposeMode, SizeT, SizeT, ScalarT, \ + ScalarT const*, SizeT, \ + ScalarT*, SizeT, SyncInfo const&) #ifdef HYDROGEN_GPU_USE_FP16 diff --git a/src/hydrogen/blas/gpu/CMakeLists.txt b/src/hydrogen/blas/gpu/CMakeLists.txt index 6b3fa46edb..a39e62bd4c 100644 --- a/src/hydrogen/blas/gpu/CMakeLists.txt +++ b/src/hydrogen/blas/gpu/CMakeLists.txt @@ -1,4 +1,4 @@ -set_full_path(THIS_DIR_CUDA_SOURCES 
+set_full_path(THIS_DIR_GPU_SOURCES Axpy.cu Copy.cu Fill.cu @@ -8,4 +8,4 @@ set_full_path(THIS_DIR_CUDA_SOURCES ) # Propagate the files up the tree -set(CUDA_SOURCES "${CUDA_SOURCES}" "${THIS_DIR_CUDA_SOURCES}" PARENT_SCOPE) +set(GPU_SOURCES "${GPU_SOURCES}" "${THIS_DIR_GPU_SOURCES}" PARENT_SCOPE) diff --git a/src/hydrogen/blas/gpu/Copy.cu b/src/hydrogen/blas/gpu/Copy.cu index ee03c7e401..aa5abbca22 100644 --- a/src/hydrogen/blas/gpu/Copy.cu +++ b/src/hydrogen/blas/gpu/Copy.cu @@ -2,9 +2,14 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include #include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace { @@ -80,7 +85,7 @@ void Copy_GPU_impl( SizeT num_entries, SrcT const* src, SizeT src_stride, DestT * dest, SizeT dest_stride, - cudaStream_t stream) + SyncInfo const& sync_info) { if (num_entries <= TypeTraits::Zero()) return; @@ -97,13 +102,12 @@ void Copy_GPU_impl( constexpr size_t threads_per_block = 128; auto blocks = (num_entries + threads_per_block - 1)/ threads_per_block; - void* args[] = { &num_entries, &src, &src_stride, &dest, &dest_stride }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)©_1d_kernel, - blocks, threads_per_block, - args, 0, stream)); + gpu::LaunchKernel( + copy_1d_kernel, + blocks, threads_per_block, + 0, sync_info, + num_entries, src, src_stride, dest, dest_stride); } template @@ -111,7 +115,7 @@ void Copy_GPU_impl( SizeT num_rows, SizeT num_cols, SrcT const* src, SizeT src_row_stride, SizeT src_col_stride, DestT* dest, SizeT dest_row_stride, SizeT dest_col_stride, - cudaStream_t stream) + SyncInfo const& sync_info) { if (num_rows == 0 || num_cols == 0) return; @@ -132,24 +136,23 @@ void Copy_GPU_impl( dim3 blks((num_rows + TILE_SIZE - 1)/TILE_SIZE, (num_cols + TILE_SIZE - 1)/TILE_SIZE, 1); dim3 thds(TILE_SIZE, BLK_COLS, 1); - void* args[] = { &num_rows, &num_cols, - &src, &src_row_stride, &src_col_stride, - &dest, &dest_row_stride, &dest_col_stride }; - - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)©_2d_kernel, - blks, thds, args, 0, stream)); + + gpu::LaunchKernel( + copy_2d_kernel, + blks, thds, 0, sync_info, + num_rows, num_cols, + src, src_row_stride, src_col_stride, + dest, dest_row_stride, dest_col_stride); } -#define ETI(SourceType, DestType, SizeType) \ - template void Copy_GPU_impl( \ - SizeType, SourceType const*, SizeType, \ - DestType*, SizeType, cudaStream_t); \ - template void Copy_GPU_impl( \ - SizeType, SizeType, \ - SourceType const*, SizeType, SizeType, \ - DestType*, SizeType, SizeType, cudaStream_t) +#define ETI(SourceType, DestType, SizeType) \ + template void Copy_GPU_impl( \ + SizeType, SourceType const*, SizeType, \ + DestType*, SizeType, SyncInfo const&); \ + template void Copy_GPU_impl( \ + SizeType, SizeType, \ + SourceType const*, SizeType, SizeType, \ + DestType*, SizeType, SizeType, SyncInfo const&) ETI(float, float, int); ETI(float, float, long); diff --git a/src/hydrogen/blas/gpu/Fill.cu b/src/hydrogen/blas/gpu/Fill.cu index 456d8608bd..85c1590d1f 100644 --- a/src/hydrogen/blas/gpu/Fill.cu +++ b/src/hydrogen/blas/gpu/Fill.cu @@ -2,9 +2,14 @@ #include #include -#include +#ifdef HYDROGEN_HAVE_CUDA +#include #include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace hydrogen { @@ -36,25 +41,13 @@ __global__ void Fill2D_kernel(size_t height, size_t width, T value, } } -template -bool CompareEqual(T const& a, T const& b) -{ - return a == b; -} - -#ifdef HYDROGEN_GPU_USE_FP16 -inline bool CompareEqual(gpu_half_type const& a, gpu_half_type const& b) -{ - return float(a) 
== float(b); -} -#endif // HYDROGEN_GPU_USE_FP16 - }// namespace template void Fill_GPU_impl( size_t height, size_t width, T const& value, - T* buffer, size_t ldim, cudaStream_t stream) + T* buffer, size_t ldim, + SyncInfo const& sync_info) { if (height <= 0 || width <= 0) return; @@ -62,49 +55,28 @@ void Fill_GPU_impl( size_t size = height * width; constexpr size_t blockDim = 256; const size_t gridDim = (size + blockDim - 1) / blockDim; - if (CompareEqual(value, TypeTraits::Zero())) + + if (width == 1 || ldim == height) { - if (width == 1 || ldim == height) - { - H_CHECK_CUDA(cudaMemsetAsync(buffer, 0x0, size*sizeof(T), - stream)); - } - else - { - H_CHECK_CUDA( - cudaMemset2DAsync( - buffer, ldim*sizeof(T), 0x0, - height*sizeof(T), width, - stream)); - } + gpu::LaunchKernel( + Fill1D_kernel, + gridDim, blockDim, 0, sync_info, + size, value, buffer); } else { - T arg_value = value; - if (width == 1 || ldim == height) - { - void* args[] = {&size, &arg_value, &buffer}; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&Fill1D_kernel, - gridDim, blockDim, args, 0, stream)); - - } - else - { - void* args[] = {&height, &width, &arg_value, &buffer, &ldim}; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&Fill2D_kernel, - gridDim, blockDim, args, 0, stream)); - } + gpu::LaunchKernel( + Fill2D_kernel, + gridDim, blockDim, 0, sync_info, + height, width, value, buffer, ldim); } } -#define ETI(T) \ - template void Fill_GPU_impl( \ - size_t, size_t, T const&, T*, size_t, cudaStream_t) +#define ETI(T) \ + template void Fill_GPU_impl( \ + size_t, size_t, T const&, T*, size_t, \ + SyncInfo const&) #ifdef HYDROGEN_GPU_USE_FP16 ETI(gpu_half_type); diff --git a/src/hydrogen/blas/gpu/Hadamard.cu b/src/hydrogen/blas/gpu/Hadamard.cu index 1f838efa45..923830b499 100644 --- a/src/hydrogen/blas/gpu/Hadamard.cu +++ b/src/hydrogen/blas/gpu/Hadamard.cu @@ -1,8 +1,13 @@ #include #include +#ifdef HYDROGEN_HAVE_CUDA #include #include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace { @@ -61,7 +66,7 @@ void Hadamard_GPU_impl( T const* X, size_t colStrideX, size_t rowStrideX, T const* Y, size_t colStrideY, size_t rowStrideY, T* Z, size_t colStrideZ, size_t rowStrideZ, - cudaStream_t stream) + SyncInfo const& sync_info) { if (height <= 0 || width <= 0) { return; } size_t size = height * width; @@ -73,49 +78,45 @@ void Hadamard_GPU_impl( { if (X == Z) { - void* args[] = { &size, &Y, &Z }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&MultAssign_kernel, - gridDim, blockDim, args, 0, stream)); + gpu::LaunchKernel( + MultAssign_kernel, + gridDim, blockDim, 0, sync_info, + size, Y, Z); } else if (Y == Z) { - void* args[] = { &size, &X, &Z }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&MultAssign_kernel, - gridDim, blockDim, args, 0, stream)); + gpu::LaunchKernel( + MultAssign_kernel, + gridDim, blockDim, 0, sync_info, + size, X, Z); } else { - void* args[] = { &size, &X, &Y, &Z }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&Hadamard1D_kernel, - gridDim, blockDim, args, 0, stream)); + gpu::LaunchKernel( + Hadamard1D_kernel, + gridDim, blockDim, 0, sync_info, + size, X, Y, Z); } } else { - void* args[] = { &height, &width, - &X, &colStrideX, &rowStrideX, - &Y, &colStrideY, &rowStrideY, - &Z, &colStrideZ, &rowStrideZ }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&Hadamard2D_kernel, - gridDim, blockDim, args, 0, stream)); + gpu::LaunchKernel( + Hadamard2D_kernel, + gridDim, blockDim, 0, sync_info, + height, width, + X, colStrideX, rowStrideX, + Y, colStrideY, rowStrideY, + 
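+            // (arguments are forwarded directly to the kernel here;
+            //  gpu::LaunchKernel replaces the old void* args[] packing
+            //  that cudaLaunchKernel required)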
Z, colStrideZ, rowStrideZ); } } -#define ETI(T) \ - template void Hadamard_GPU_impl( \ - size_t, size_t, \ - T const*, size_t, size_t, \ - T const*, size_t, size_t, \ - T*, size_t, size_t, cudaStream_t) +#define ETI(T) \ + template void Hadamard_GPU_impl( \ + size_t, size_t, \ + T const*, size_t, size_t, \ + T const*, size_t, size_t, \ + T*, size_t, size_t, SyncInfo const&) #ifdef HYDROGEN_GPU_USE_FP16 ETI(gpu_half_type); diff --git a/src/hydrogen/blas/gpu/Scale.cu b/src/hydrogen/blas/gpu/Scale.cu index f55bcde28f..afc3abb3a5 100644 --- a/src/hydrogen/blas/gpu/Scale.cu +++ b/src/hydrogen/blas/gpu/Scale.cu @@ -2,9 +2,13 @@ #include #include +#ifdef HYDROGEN_HAVE_CUDA #include - #include +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace { @@ -41,26 +45,24 @@ template void Scale_GPU_impl( SizeT num_entries, T const& alpha, T* A, SizeT lda, - cudaStream_t stream) + SyncInfo const& sync_info) { if (!num_entries) return; constexpr size_t threads_per_block = 128; auto blocks = (num_entries + threads_per_block - 1)/ threads_per_block; - T arg_alpha = alpha; - void* args[] = { &num_entries, &arg_alpha, &A, &lda}; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&scale_1d_kernel_naive, - blocks, threads_per_block, args, 0, stream)); + gpu::LaunchKernel( + scale_1d_kernel_naive, + blocks, threads_per_block, 0, sync_info, + num_entries, alpha, A, lda); } template void Scale_GPU_impl( SizeT num_rows, SizeT num_cols, T const& alpha, T* A, SizeT lda, - cudaStream_t stream) + SyncInfo const& sync_info) { if (num_rows == TypeTraits::Zero() || num_cols == TypeTraits::Zero()) @@ -76,24 +78,21 @@ void Scale_GPU_impl( 1); dim3 thds(TILE_DIM, BLK_COLS, 1); - T arg_alpha = alpha; - void* args[] = { &num_rows, &num_cols, - &arg_alpha, &A, &lda}; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)&scale_2d_kernel_naive, - blks, thds, args, 0, stream)); + gpu::LaunchKernel( + scale_2d_kernel_naive, + blks, thds, 0, sync_info, + num_rows, num_cols, alpha, A, lda); } #define ETI(DataType, SizeType) \ template void Scale_GPU_impl( \ SizeType, \ DataType const&, DataType*, SizeType, \ - cudaStream_t); \ + SyncInfo const&); \ template void Scale_GPU_impl( \ SizeType, SizeType, \ DataType const&, DataType*, SizeType, \ - cudaStream_t) + SyncInfo const&) ETI(float, int); ETI(float, long); diff --git a/src/hydrogen/blas/gpu/Transpose.cu b/src/hydrogen/blas/gpu/Transpose.cu index e48bf5497d..66607d028e 100644 --- a/src/hydrogen/blas/gpu/Transpose.cu +++ b/src/hydrogen/blas/gpu/Transpose.cu @@ -2,12 +2,15 @@ #include #include +#ifdef HYDROGEN_HAVE_CUDA #include - #include - #include namespace cg = cooperative_groups; +#elif defined(HYDROGEN_HAVE_ROCM) +#include +#include +#endif namespace { @@ -19,7 +22,9 @@ __global__ void transpose_kernel( T const* __restrict__ A, SizeT const lda, T* __restrict__ B, SizeT const ldb) { +#ifdef HYDROGEN_HAVE_CUDA cg::thread_block cta = cg::this_thread_block(); +#endif __shared__ T tile[TILE_DIM][TILE_DIM+1]; SizeT row_idx_A = blockIdx.x * TILE_DIM + threadIdx.x; @@ -48,7 +53,11 @@ __global__ void transpose_kernel( tile[threadIdx.y+ii][threadIdx.x] = A[idx_in + ii*lda]; } +#ifdef HYDROGEN_HAVE_CUDA cg::sync(cta); +#else + __syncthreads(); +#endif #pragma unroll for (int ii = 0; ii < TILE_DIM; ii += BLK_COLS) @@ -70,7 +79,11 @@ __global__ void transpose_kernel( } // Same warp-sync stuff -- I assume this still needs to happen. +#ifdef HYDROGEN_HAVE_CUDA cg::sync(cta); +#else + __syncthreads(); +#endif // Don't write rows of the new matrix that don't exist. 
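    // (After the barrier above, every thread sees a consistent view of the
    //  shared-memory tile before the guarded stores below; on ROCm,
    //  __syncthreads() provides the same block-wide guarantee that
    //  cg::sync(cta) does on CUDA.)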
if (row_idx < n) @@ -90,7 +103,7 @@ namespace hydrogen template void Transpose_GPU_impl( SizeT m, SizeT n, T const* A, SizeT lda, T* B, SizeT ldb, - cudaStream_t stream) + SyncInfo const& sync_info) { if (m == TypeTraits::Zero() || n == TypeTraits::Zero()) return; @@ -102,19 +115,18 @@ void Transpose_GPU_impl( (n + TILE_DIM - 1) / TILE_DIM, 1); dim3 thds(TILE_DIM, BLK_COLS, 1); - void* args[] = { &m, &n, &A, &lda, &B, &ldb }; - H_CHECK_CUDA( - cudaLaunchKernel( - (void const*)transpose_kernel, - blks, thds, args, 0, stream)); + gpu::LaunchKernel( + transpose_kernel, + blks, thds, 0, sync_info, + m, n, A, lda, B, ldb); } -#define ETI(DataType, SizeType) \ - template void Transpose_GPU_impl( \ - SizeType, SizeType, \ - DataType const*, SizeType, \ - DataType*, SizeType, cudaStream_t) +#define ETI(DataType, SizeType) \ + template void Transpose_GPU_impl( \ + SizeType, SizeType, \ + DataType const*, SizeType, \ + DataType*, SizeType, SyncInfo const&) ETI(float, int); ETI(float, long); diff --git a/src/hydrogen/device/CMakeLists.txt b/src/hydrogen/device/CMakeLists.txt new file mode 100644 index 0000000000..e205b84614 --- /dev/null +++ b/src/hydrogen/device/CMakeLists.txt @@ -0,0 +1,19 @@ +if (HYDROGEN_HAVE_GPU) + if (HYDROGEN_HAVE_CUDA) + set_full_path(THIS_DIR_CXX_SOURCES + CUDA.cpp + cuBLAS.cpp + cuBLAS_API.cpp + GPU.cpp) + endif () + if (HYDROGEN_HAVE_ROCM) + set_full_path(THIS_DIR_CXX_SOURCES + GPU.cpp + ROCm.cpp + rocBLAS.cpp + rocBLAS_API.cpp + ) + endif () + + set(SOURCES "${SOURCES}" "${THIS_DIR_CXX_SOURCES}" PARENT_SCOPE) +endif () diff --git a/src/hydrogen/device/CUDA.cpp b/src/hydrogen/device/CUDA.cpp new file mode 100644 index 0000000000..952eb58141 --- /dev/null +++ b/src/hydrogen/device/CUDA.cpp @@ -0,0 +1,163 @@ +#include "El/hydrogen_config.h" + +#include "hydrogen/device/GPU.hpp" +#include "hydrogen/device/gpu/CUDA.hpp" + +#include "hydrogen/Device.hpp" +#include "hydrogen/Error.hpp" +#include "hydrogen/SyncInfo.hpp" + +#include "El/core/MemoryPool.hpp" + +#include + +#include +#include + +#define H_CHECK_NVML(cmd) \ + { \ + auto h_check_nvml_error_code = cmd; \ + H_ASSERT(h_check_nvml_error_code == NVML_SUCCESS, \ + NVMLError, \ + BuildNVMLErrorMessage(#cmd, \ + h_check_nvml_error_code)); \ + } + +namespace hydrogen +{ +namespace gpu +{ +namespace +{ + +/** @class NVMLError + * @brief Exception class for errors detected in NVML + */ +H_ADD_BASIC_EXCEPTION_CLASS(NVMLError, GPUError);// struct NVMLError + +/** @brief Write an error message describing what went wrong in NVML + * @param[in] cmd The expression that raised the error. + * @param[in] error_code The error code reported by NVML. + * @returns A string describing the error. 
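+ *
+ *  For example, a failure in H_CHECK_NVML(nvmlDeviceGetCount(&count))
+ *  (see PreCUDAInitDeviceCount below) is reported through this function
+ *  via the NVMLError exception.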
+ */ +std::string BuildNVMLErrorMessage( + std::string const& cmd, nvmlReturn_t error_code) +{ + std::ostringstream oss; + oss << "NVML error detected in command: \"" << cmd << "\"\n\n" + << " Error Code: " << error_code << "\n" + << " Error Mesg: " << nvmlErrorString(error_code) << "\n"; + return oss.str(); +} + +unsigned int PreCUDAInitDeviceCount() +{ + unsigned int count; + H_CHECK_NVML(nvmlInit()); + H_CHECK_NVML(nvmlDeviceGetCount(&count)); + H_CHECK_NVML(nvmlShutdown()); + return count; +} + +}// namespace hydrogen::gpu:: + +// +// GPU.hpp functions +// + +int DefaultDevice() +{ + static int device_id = + ComputeDeviceId(PreCUDAInitDeviceCount()); + return device_id; +} + +size_t DeviceCount() +{ + int count; + H_CHECK_CUDA(cudaGetDeviceCount(&count)); + return static_cast(count); +} + +int CurrentDevice() +{ + int device; + H_CHECK_CUDA(cudaGetDevice(&device)); + return device; +} + +void SetDevice(int device_id) +{ + H_CHECK_CUDA(cudaSetDevice(device_id)); + H_CHECK_CUDA(cudaGetLastError()); +} + +void SynchronizeDevice() +{ + H_CHECK_CUDA(cudaDeviceSynchronize()); +} + +}// namespace gpu + +namespace cuda +{ + +std::string BuildCUDAErrorMessage( + std::string const& cmd, cudaError_t error_code) +{ + std::ostringstream oss; + oss << "CUDA error detected in command: \"" << cmd << "\"\n\n" + << " Error Code: " << error_code << "\n" + << " Error Name: " << cudaGetErrorName(error_code) << "\n" + << " Error Mesg: " << cudaGetErrorString(error_code); + return oss.str(); +} + +cudaEvent_t GetDefaultEvent() noexcept +{ + return gpu::DefaultSyncInfo().Event(); +} + +cudaStream_t GetDefaultStream() noexcept +{ + return gpu::DefaultSyncInfo().Stream(); +} + +cudaStream_t GetNewStream() +{ + cudaStream_t stream; + H_CHECK_CUDA( + cudaStreamCreateWithFlags( + &stream, cudaStreamNonBlocking)); + return stream; +} + +cudaEvent_t GetNewEvent() +{ + cudaEvent_t event; + H_CHECK_CUDA( + cudaEventCreateWithFlags( + &event, cudaEventDisableTiming)); + return event; +} + +void FreeStream(cudaStream_t& stream) +{ + if (stream) + { + H_CHECK_CUDA(cudaStreamDestroy(stream)); + stream = nullptr; + } +} + +void FreeEvent(cudaEvent_t& event) +{ + if (event) + { + H_CHECK_CUDA(cudaEventDestroy(event)); + event = nullptr; + } +} + +}// namespace cuda +}// namespace hydrogen diff --git a/src/hydrogen/device/GPU.cpp b/src/hydrogen/device/GPU.cpp new file mode 100644 index 0000000000..300e39eada --- /dev/null +++ b/src/hydrogen/device/GPU.cpp @@ -0,0 +1,105 @@ +#include "El/hydrogen_config.h" +#include "El/core/MemoryPool.hpp" +#include "hydrogen/device/GPU.hpp" + +#if defined HYDROGEN_HAVE_CUDA +#include "hydrogen/device/gpu/CUDA.hpp" +namespace impl = ::hydrogen::cuda; +#elif defined HYDROGEN_HAVE_ROCM +#include "hydrogen/device/gpu/ROCm.hpp" +namespace impl = ::hydrogen::rocm; +#endif // HYDROGEN_HAVE_CUDA + +#if defined HYDROGEN_HAVE_CUB +#include "hydrogen/device/gpu/CUB.hpp" +#endif + +namespace hydrogen +{ +namespace gpu +{ +namespace +{ + +// Global variables +bool gpu_initialized_ = false; +SyncInfo default_syncinfo_; + +}// namespace + +int ComputeDeviceId(unsigned int device_count) noexcept +{ + if (device_count == 0U) + return -1; + if (device_count == 1U) + return 0; + + // Get local rank (rank within compute node) + // + // TODO: Update to not rely on env vars + // TODO: Use HWLOC or something to pick "closest GPU" + int local_rank = 0; + char* env = nullptr; + if (!env) { env = std::getenv("SLURM_LOCALID"); } + if (!env) { env = std::getenv("MV2_COMM_WORLD_LOCAL_RANK"); } + if (!env) { env = 
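+    // (tried in order: SLURM, then MVAPICH2, then the Open MPI launcher)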
std::getenv("OMPI_COMM_WORLD_LOCAL_RANK"); } + if (env) { local_rank = std::atoi(env); } + + // Try assigning GPUs to local ranks in round-robin fashion + return local_rank % device_count; +} + +void Initialize() +{ + if (gpu_initialized_) + return; // Or should this throw?? + + // This should fail if device < 0. + SetDevice(DefaultDevice()); + + // Setup a default stream and event. + default_syncinfo_ = CreateNewSyncInfo(); + + // Set the global flag + gpu_initialized_ = true; +} + +void Finalize() +{ + // FIXME: This stuff should move. +#ifdef HYDROGEN_HAVE_CUB + cub::DestroyMemoryPool(); +#endif // HYDROGEN_HAVE_CUB + El::DestroyPinnedHostMemoryPool(); + DestroySyncInfo(default_syncinfo_); + gpu_initialized_ = false; +} + +bool IsInitialized() noexcept +{ + return gpu_initialized_; +} + +SyncInfo const& DefaultSyncInfo() noexcept +{ + return default_syncinfo_; +} + +}// namespace gpu + +template <> +SyncInfo CreateNewSyncInfo() +{ + return SyncInfo{ + impl::GetNewStream(), impl::GetNewEvent()}; +} + +void DestroySyncInfo(SyncInfo& si) +{ + impl::FreeStream(si.stream_); + impl::FreeEvent(si.event_); + si.stream_ = nullptr; + si.event_ = nullptr; +} + +}// namespace hydrogen diff --git a/src/hydrogen/device/ROCm.cpp b/src/hydrogen/device/ROCm.cpp new file mode 100644 index 0000000000..db1592184c --- /dev/null +++ b/src/hydrogen/device/ROCm.cpp @@ -0,0 +1,101 @@ +#include + +#include +#include + +#include + +namespace hydrogen +{ +namespace gpu +{ + +size_t DeviceCount() +{ + int count; + H_CHECK_HIP(hipGetDeviceCount(&count)); + return count; +} + +int DefaultDevice() +{ + static int device_id = ComputeDeviceId(DeviceCount()); + return device_id; +} + +int CurrentDevice() +{ + int device_id; + H_CHECK_HIP(hipGetDevice(&device_id)); + return device_id; +} + +void SetDevice(int device_id) +{ + H_CHECK_HIP(hipSetDevice(device_id)); +} + +void SynchronizeDevice() +{ + H_CHECK_HIP(hipDeviceSynchronize()); +} + +}// namespace gpu + +namespace rocm +{ + +std::string BuildHipErrorMessage(std::string const& cmd, hipError_t error_code) +{ + std::ostringstream oss; + oss << "ROCm error detected in command: \"" << cmd << "\"\n\n" + << " Error Code: " << error_code << "\n" + << " Error Name: " << hipGetErrorName(error_code) << "\n" + << " Error Mesg: " << hipGetErrorString(error_code); + return oss.str(); +} + +hipEvent_t GetDefaultEvent() noexcept +{ + return gpu::DefaultSyncInfo().Event(); +} + +hipStream_t GetDefaultStream() noexcept +{ + return gpu::DefaultSyncInfo().Stream(); +} + +hipStream_t GetNewStream() +{ + hipStream_t stream; + H_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + return stream; +} + +hipEvent_t GetNewEvent() +{ + hipEvent_t event; + H_CHECK_HIP(hipEventCreateWithFlags(&event, hipEventDisableTiming)); + return event; +} + +void FreeStream(hipStream_t& stream) +{ + if (stream) + { + H_CHECK_HIP(hipStreamDestroy(stream)); + stream = nullptr; + } +} + +void FreeEvent(hipEvent_t& event) +{ + if (event) + { + H_CHECK_HIP(hipEventDestroy(event)); + event = nullptr; + } +} + +}// namespace rocm +}// namespace hydrogen diff --git a/src/hydrogen/device/cuBLAS.cpp b/src/hydrogen/device/cuBLAS.cpp new file mode 100644 index 0000000000..71133a58af --- /dev/null +++ b/src/hydrogen/device/cuBLAS.cpp @@ -0,0 +1,151 @@ +#include + +// Helper macro for converting enums to strings. 
+#define H_ADD_CUBLAS_ENUM_TO_STRING_CASE(enum_value) \ + case enum_value: \ + return #enum_value + + +namespace +{ + +std::string GetcuBLASErrorString(cublasStatus_t status) +{ + switch (status) + { + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_SUCCESS); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_NOT_INITIALIZED); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_ALLOC_FAILED); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_INVALID_VALUE); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_ARCH_MISMATCH); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_MAPPING_ERROR); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_EXECUTION_FAILED); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_INTERNAL_ERROR); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_NOT_SUPPORTED); + H_ADD_CUBLAS_ENUM_TO_STRING_CASE(CUBLAS_STATUS_LICENSE_ERROR); + default: + return "Unknown cuBLAS error."; + } +} + +} + +namespace hydrogen +{ +namespace cublas +{ +namespace // +{ +bool cublas_is_initialized_ = false; +cublasHandle_t default_cublas_handle_; +}// namespace + +cublasHandle_t GetLibraryHandle() noexcept +{ + return default_cublas_handle_; +} + +bool IsInitialized() noexcept +{ + return cublas_is_initialized_; +} + +void Initialize(cublasHandle_t handle) +{ + if (!IsInitialized()) + { + if (!handle) + H_CHECK_CUBLAS(cublasCreate(&default_cublas_handle_)); + else + default_cublas_handle_ = handle; + + H_CHECK_CUBLAS( + cublasSetStream( + GetLibraryHandle(), cuda::GetDefaultStream())); + H_CHECK_CUBLAS( + cublasSetPointerMode( + GetLibraryHandle(), CUBLAS_POINTER_MODE_HOST)); +#ifdef HYDROGEN_GPU_USE_TENSOR_OP_MATH + H_CHECK_CUBLAS( + cublasSetMathMode(GetLibraryHandle(),CUBLAS_TENSOR_OP_MATH)); +#else + H_CHECK_CUBLAS( + cublasSetMathMode(GetLibraryHandle(),CUBLAS_DEFAULT_MATH)); +#endif // HYDROGEN_GPU_USE_TENSOR_OP_MATH + + cublas_is_initialized_ = true; + } +} + +void Finalize() +{ + if (default_cublas_handle_) + H_CHECK_CUBLAS(cublasDestroy(default_cublas_handle_)); + default_cublas_handle_ = nullptr; + cublas_is_initialized_ = false; +} + +void ReplaceLibraryHandle(cublasHandle_t handle) +{ + H_ASSERT_FALSE(handle == nullptr, + std::logic_error, + "hydrogen::cublas::ReplaceLibraryHandle(): " + "Detected a null cuBLAS handle."); + + H_ASSERT(IsInitialized(), + std::logic_error, + "hydrogen::cublas::ReplaceLibraryHandle(): " + "cuBLAS must be initialized before calling this function."); + + if (default_cublas_handle_) + H_CHECK_CUBLAS(cublasDestroy(default_cublas_handle_)); + default_cublas_handle_ = handle; +} + +SyncManager::SyncManager(cublasHandle_t handle, + SyncInfo const& si) +{ + H_CHECK_CUBLAS( + cublasGetStream(handle, &orig_stream_)); + H_CHECK_CUBLAS( + cublasSetStream(handle, si.Stream())); +} + +SyncManager::~SyncManager() +{ + try + { + H_CHECK_CUBLAS( + cublasSetStream( + GetLibraryHandle(), orig_stream_)); + } + catch (std::exception const& e) + { + H_REPORT_DTOR_EXCEPTION_AND_TERMINATE(e); + } +} + +std::string BuildcuBLASErrorMessage( + std::string const& cmd, cublasStatus_t error_code) +{ + std::ostringstream oss; + oss << "cuBLAS error detected in command: \"" << cmd << "\"\n\n" + << " Error Code: " << error_code << "\n" + << " Error Name: " << GetcuBLASErrorString(error_code); + return oss.str(); +} + +}// namespace cublas + +namespace gpu_blas +{ +void SetPointerMode(PointerMode mode) +{ + H_CHECK_CUBLAS( + cublasSetPointerMode(cublas::GetLibraryHandle(), + (mode == PointerMode::HOST + ? 
CUBLAS_POINTER_MODE_HOST + : CUBLAS_POINTER_MODE_DEVICE))); +} +}// namespace gpu_blas +}// namespace hydrogen diff --git a/src/core/imports/cublas.cpp b/src/hydrogen/device/cuBLAS_API.cpp similarity index 58% rename from src/core/imports/cublas.cpp rename to src/hydrogen/device/cuBLAS_API.cpp index 47f7af8cd9..0f5d457b72 100644 --- a/src/core/imports/cublas.cpp +++ b/src/hydrogen/device/cuBLAS_API.cpp @@ -3,9 +3,11 @@ #include #include + #ifdef HYDROGEN_GPU_USE_FP16 #include #endif // HYDROGEN_GPU_USE_FP16 + #include namespace hydrogen @@ -13,34 +15,6 @@ namespace hydrogen namespace cublas { -cublasHandle_t GetLibraryHandle() noexcept -{ - return GPUManager::cuBLASHandle(); -} - -void Initialize() -{ - GPUManager::InitializeCUBLAS(); -#ifdef HYDROGEN_CUBLAS_USE_TENSOR_OP_MATH - H_CHECK_CUBLAS( - cublasSetMathMode(GetLibraryHandle(), CUBLAS_TENSOR_OP_MATH)); -#endif // HYDROGEN_CUBLAS_USE_TENSOR_OP_MATH -} - -SyncManager::SyncManager(cublasHandle_t handle, - SyncInfo const& si) -{ - H_CHECK_CUBLAS( - cublasGetStream(handle, &orig_stream_)); - H_CHECK_CUBLAS( - cublasSetStream(handle, si.stream_)); -} - -SyncManager::~SyncManager() -{ - cublasSetStream(GPUManager::cuBLASHandle(), orig_stream_); -} - // // BLAS 1 // @@ -61,6 +35,38 @@ void Axpy(cublasHandle_t handle, CUDA_R_32F)); } +void Dot(cublasHandle_t handle, + int n, + __half const* X, int incx, + __half const* Y, int incy, + __half& output) +{ + H_CHECK_CUBLAS( + cublasDotEx( + handle, + n, + X, /*xtype=*/CUDA_R_16F, incx, + Y, /*ytype=*/CUDA_R_16F, incy, + &output, + /*resulttype=*/CUDA_R_16F, + /*executiontype=*/CUDA_R_32F)); +} + +void Nrm2(cublasHandle_t handle, + int n, + __half const* X, int incx, + __half& output) +{ + H_CHECK_CUBLAS( + cublasNrm2Ex( + handle, + n, + X, /*xtype=*/CUDA_R_16F, incx, + &output, + /*resulttype=*/CUDA_R_16F, + /*executiontype=*/CUDA_R_32F)); +} + void Scale(cublasHandle_t handle, int n, __half const& alpha, __half* X, int incx) @@ -94,6 +100,72 @@ void Scale(cublasHandle_t handle, n, X, incx, Y, incy)); \ } +#define ADD_DOT_IMPL(ScalarType, TypeChar) \ + void Dot(cublasHandle_t handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + ScalarType* output) \ + { \ + H_CHECK_CUBLAS( \ + cublas ## TypeChar ## dot( \ + handle, \ + n, X, incx, Y, incy, output)); \ + } + +template +struct RealTypeT +{ + using type = T; +}; + +template <> +struct RealTypeT +{ + using type = float; +}; + +template <> +struct RealTypeT +{ + using type = double; +}; + +template +using RealType = typename RealTypeT::type; + +#define ADD_COMPLEX_DOT_IMPL(ScalarType, TypeChar) \ + void Dotu(cublasHandle_t handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + ScalarType* output) \ + { \ + H_CHECK_CUBLAS( \ + cublas ## TypeChar ## dotu( \ + handle, \ + n, X, incx, Y, incy, output)); \ + } \ + void Dotc(cublasHandle_t handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + ScalarType* output) \ + { \ + H_CHECK_CUBLAS( \ + cublas ## TypeChar ## dotc( \ + handle, \ + n, X, incx, Y, incy, output)); \ + } + +#define ADD_NRM2_IMPL(ScalarType, TypeChar) \ + void Nrm2(cublasHandle_t handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + RealType* output) \ + { \ + H_CHECK_CUBLAS( \ + cublas ## TypeChar ## nrm2( \ + handle, n, X, incx, output)); \ + } + #define ADD_SCALE_IMPL(ScalarType, TypeChar) \ void Scale(cublasHandle_t handle, \ int n, ScalarType const& alpha, \ @@ -140,14 +212,41 @@ void 
Scale(cublasHandle_t handle, ScalarType const& beta, \ ScalarType* C, int ldc) \ { \ - H_CHECK_CUBLAS( \ + H_CHECK_CUBLAS( \ cublas ## TypeChar ## gemm( \ - handle, \ - transpA, transpB, \ + handle, \ + transpA, transpB, \ m, n, k, &alpha, A, lda, B, ldb, \ &beta, C, ldc)); \ } +#define ADD_GEMM_STRIDED_BATCHED_IMPL(ScalarType, TypeChar) \ + void GemmStridedBatched( \ + cublasHandle_t handle, \ + cublasOperation_t transpA, \ + cublasOperation_t transpB, \ + int m, int n, int k, \ + ScalarType const* alpha, \ + ScalarType const* A, int lda, \ + long long int strideA, \ + ScalarType const* B, int ldb, \ + long long int strideB, \ + ScalarType const* beta, \ + ScalarType* C, int ldc, \ + long long int strideC, \ + int batchCount) \ + { \ + H_CHECK_CUBLAS( \ + cublas ## TypeChar ## gemmStridedBatched( \ + handle, transpA, transpB, \ + m, n, k, \ + alpha, \ + A, lda, strideA, B, ldb, strideB, \ + beta, \ + C, ldc, strideC, \ + batchCount)); \ + } + // // BLAS-like Extension // @@ -163,7 +262,7 @@ void Scale(cublasHandle_t handle, ScalarType const* B, int ldb, \ ScalarType* C, int ldc) \ { \ - H_CHECK_CUBLAS( \ + H_CHECK_CUBLAS( \ cublas ## TypeChar ## geam( \ handle, \ transpA, transpB, \ @@ -182,7 +281,7 @@ void Scale(cublasHandle_t handle, ScalarType const* X, int incx, \ ScalarType* C, int ldc) \ { \ - H_CHECK_CUBLAS( \ + H_CHECK_CUBLAS( \ cublas ## TypeChar ## dgmm( \ handle, \ side, m, n, A, lda, X, incx, C, ldc)); \ @@ -199,6 +298,16 @@ ADD_COPY_IMPL(double, D) ADD_COPY_IMPL(cuComplex, C) ADD_COPY_IMPL(cuDoubleComplex, Z) +ADD_DOT_IMPL(float, S) +ADD_DOT_IMPL(double, D) +ADD_COMPLEX_DOT_IMPL(cuComplex, C) +ADD_COMPLEX_DOT_IMPL(cuDoubleComplex, Z) + +ADD_NRM2_IMPL(float, S) +ADD_NRM2_IMPL(double, D) +ADD_NRM2_IMPL(cuComplex, Sc) +ADD_NRM2_IMPL(cuDoubleComplex, Dz) + ADD_SCALE_IMPL(float, S) ADD_SCALE_IMPL(double, D) ADD_SCALE_IMPL(cuComplex, C) @@ -217,6 +326,12 @@ ADD_GEMM_IMPL(double, D) ADD_GEMM_IMPL(cuComplex, C) ADD_GEMM_IMPL(cuDoubleComplex, Z) +ADD_GEMM_STRIDED_BATCHED_IMPL(__half, H) +ADD_GEMM_STRIDED_BATCHED_IMPL(float, S) +ADD_GEMM_STRIDED_BATCHED_IMPL(double, D) +ADD_GEMM_STRIDED_BATCHED_IMPL(cuComplex, C) +ADD_GEMM_STRIDED_BATCHED_IMPL(cuDoubleComplex, Z) + // BLAS-like extension ADD_GEAM_IMPL(float, S) ADD_GEAM_IMPL(double, D) @@ -241,38 +356,53 @@ ADD_DGMM_IMPL(cuDoubleComplex, Z) ASSERT_SUPPORT(float, BLAS_Op::AXPY); ASSERT_SUPPORT(float, BLAS_Op::COPY); ASSERT_SUPPORT(float, BLAS_Op::DGMM); +ASSERT_SUPPORT(float, BLAS_Op::DOT); ASSERT_SUPPORT(float, BLAS_Op::GEAM); ASSERT_SUPPORT(float, BLAS_Op::GEMM); +ASSERT_SUPPORT(float, BLAS_Op::GEMMSTRIDEDBATCHED); ASSERT_SUPPORT(float, BLAS_Op::GEMV); +ASSERT_SUPPORT(float, BLAS_Op::NRM2); ASSERT_SUPPORT(float, BLAS_Op::SCAL); ASSERT_SUPPORT(double, BLAS_Op::AXPY); ASSERT_SUPPORT(double, BLAS_Op::COPY); ASSERT_SUPPORT(double, BLAS_Op::DGMM); +ASSERT_SUPPORT(double, BLAS_Op::DOT); ASSERT_SUPPORT(double, BLAS_Op::GEAM); ASSERT_SUPPORT(double, BLAS_Op::GEMM); +ASSERT_SUPPORT(double, BLAS_Op::GEMMSTRIDEDBATCHED); ASSERT_SUPPORT(double, BLAS_Op::GEMV); +ASSERT_SUPPORT(double, BLAS_Op::NRM2); ASSERT_SUPPORT(double, BLAS_Op::SCAL); ASSERT_SUPPORT(std::complex, BLAS_Op::AXPY); ASSERT_SUPPORT(std::complex, BLAS_Op::COPY); ASSERT_SUPPORT(std::complex, BLAS_Op::DGMM); +ASSERT_SUPPORT(std::complex, BLAS_Op::DOT); ASSERT_SUPPORT(std::complex, BLAS_Op::GEAM); ASSERT_SUPPORT(std::complex, BLAS_Op::GEMM); +ASSERT_SUPPORT(std::complex, BLAS_Op::GEMMSTRIDEDBATCHED); ASSERT_SUPPORT(std::complex, BLAS_Op::GEMV); +ASSERT_SUPPORT(std::complex, 
BLAS_Op::NRM2); ASSERT_SUPPORT(std::complex, BLAS_Op::SCAL); ASSERT_SUPPORT(std::complex, BLAS_Op::AXPY); ASSERT_SUPPORT(std::complex, BLAS_Op::COPY); ASSERT_SUPPORT(std::complex, BLAS_Op::DGMM); +ASSERT_SUPPORT(std::complex, BLAS_Op::DOT); ASSERT_SUPPORT(std::complex, BLAS_Op::GEAM); ASSERT_SUPPORT(std::complex, BLAS_Op::GEMM); +ASSERT_SUPPORT(std::complex, BLAS_Op::GEMMSTRIDEDBATCHED); ASSERT_SUPPORT(std::complex, BLAS_Op::GEMV); +ASSERT_SUPPORT(std::complex, BLAS_Op::NRM2); ASSERT_SUPPORT(std::complex, BLAS_Op::SCAL); #ifdef HYDROGEN_GPU_USE_FP16 ASSERT_SUPPORT(__half, BLAS_Op::AXPY); +ASSERT_SUPPORT(__half, BLAS_Op::DOT); ASSERT_SUPPORT(__half, BLAS_Op::GEMM); +ASSERT_SUPPORT(__half, BLAS_Op::GEMMSTRIDEDBATCHED); +ASSERT_SUPPORT(__half, BLAS_Op::NRM2); ASSERT_SUPPORT(__half, BLAS_Op::SCAL); ASSERT_NO_SUPPORT(__half, BLAS_Op::COPY); ASSERT_NO_SUPPORT(__half, BLAS_Op::DGMM); @@ -294,9 +424,12 @@ ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::GEMV); ASSERT_NO_SUPPORT(int, BLAS_Op::AXPY); ASSERT_NO_SUPPORT(int, BLAS_Op::COPY); ASSERT_NO_SUPPORT(int, BLAS_Op::DGMM); +ASSERT_NO_SUPPORT(int, BLAS_Op::DOT); ASSERT_NO_SUPPORT(int, BLAS_Op::GEAM); ASSERT_NO_SUPPORT(int, BLAS_Op::GEMM); +ASSERT_NO_SUPPORT(int, BLAS_Op::GEMMSTRIDEDBATCHED); ASSERT_NO_SUPPORT(int, BLAS_Op::GEMV); +ASSERT_NO_SUPPORT(int, BLAS_Op::NRM2); } // namespace cublas diff --git a/src/hydrogen/device/rocBLAS.cpp b/src/hydrogen/device/rocBLAS.cpp new file mode 100644 index 0000000000..2904ea1b48 --- /dev/null +++ b/src/hydrogen/device/rocBLAS.cpp @@ -0,0 +1,143 @@ +#include + +// Helper macro for converting enums to strings. +#define H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(enum_value) \ + case enum_value: \ + return #enum_value + + +namespace +{ + +std::string GetrocBLASErrorString(rocblas_status status) +{ + switch (status) + { + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_success); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_invalid_handle); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_not_implemented); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_invalid_pointer); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_invalid_size); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_memory_error); + H_ADD_ROCBLAS_ENUM_TO_STRING_CASE(rocblas_status_internal_error); + default: + return "Unknown rocBLAS error."; + } +} + +} + +namespace hydrogen +{ + +namespace rocblas +{ +namespace // +{ +bool rocblas_is_initialized_ = false; +rocblas_handle default_rocblas_handle_; +}// namespace + +rocblas_handle GetLibraryHandle() noexcept +{ + return default_rocblas_handle_; +} + +bool IsInitialized() noexcept +{ + return rocblas_is_initialized_; +} + +void Initialize(rocblas_handle handle) +{ + if (!IsInitialized()) + { + if (!handle) + H_CHECK_ROCBLAS(rocblas_create_handle(&default_rocblas_handle_)); + else + default_rocblas_handle_ = handle; + + H_CHECK_ROCBLAS( + rocblas_set_stream( + GetLibraryHandle(), rocm::GetDefaultStream())); + H_CHECK_ROCBLAS( + rocblas_set_pointer_mode( + GetLibraryHandle(), rocblas_pointer_mode_host)); + + rocblas_is_initialized_ = true; + } +} + +void Finalize() +{ + if (default_rocblas_handle_) + H_CHECK_ROCBLAS(rocblas_destroy_handle(default_rocblas_handle_)); + default_rocblas_handle_ = nullptr; + rocblas_is_initialized_ = false; +} + +void ReplaceLibraryHandle(rocblas_handle handle) +{ + H_ASSERT_FALSE(handle == nullptr, + std::logic_error, + "hydrogen::rocblas::ReplaceLibraryHandle(): " + "Detected a null rocBLAS handle."); + + H_ASSERT(IsInitialized(), + std::logic_error, + 
"hydrogen::rocblas::ReplaceLibraryHandle(): " + "rocBLAS must be initialized before calling this function."); + + if (default_rocblas_handle_) + H_CHECK_ROCBLAS(rocblas_destroy_handle(default_rocblas_handle_)); + default_rocblas_handle_ = handle; +} + +SyncManager::SyncManager(rocblas_handle handle, + SyncInfo const& si) +{ + H_CHECK_ROCBLAS( + rocblas_get_stream(handle, &orig_stream_)); + H_CHECK_ROCBLAS( + rocblas_set_stream(handle, si.Stream())); +} + +SyncManager::~SyncManager() +{ + try + { + H_CHECK_ROCBLAS( + rocblas_set_stream( + GetLibraryHandle(), orig_stream_)); + } + catch (std::exception const& e) + { + H_REPORT_DTOR_EXCEPTION_AND_TERMINATE(e); + } +} + +std::string BuildrocBLASErrorMessage( + std::string const& cmd, rocblas_status error_code) +{ + std::ostringstream oss; + oss << "rocBLAS error detected in command: \"" << cmd << "\"\n\n" + << " Error Code: " << error_code << "\n" + << " Error Name: " << GetrocBLASErrorString(error_code); + return oss.str(); +} + +}// namespace rocblas + +namespace gpu_blas +{ +void SetPointerMode(PointerMode mode) +{ + H_CHECK_ROCBLAS( + rocblas_set_pointer_mode(rocblas::GetLibraryHandle(), + (mode == PointerMode::HOST + ? rocblas_pointer_mode_host + : rocblas_pointer_mode_device))); +} +}// namespace gpu_blas + +}// namespace hydrogen diff --git a/src/hydrogen/device/rocBLAS_API.cpp b/src/hydrogen/device/rocBLAS_API.cpp new file mode 100644 index 0000000000..3f2f4e6ce0 --- /dev/null +++ b/src/hydrogen/device/rocBLAS_API.cpp @@ -0,0 +1,285 @@ +#include + +#include +#include + +#include + +namespace hydrogen +{ +namespace rocblas +{ + +// +// BLAS 1 +// + +#define ADD_AXPY_IMPL(ScalarType, TypeChar) \ + void Axpy(rocblas_handle handle, \ + int n, ScalarType const& alpha, \ + ScalarType const* X, int incx, \ + ScalarType* Y, int incy) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## axpy( \ + handle, \ + n, &alpha, X, incx, Y, incy)); \ + } + +#define ADD_COPY_IMPL(ScalarType, TypeChar) \ + void Copy(rocblas_handle handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType* Y, int incy) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## copy( \ + handle, \ + n, X, incx, Y, incy)); \ + } + +#define ADD_DOT_IMPL(ScalarType, TypeChar) \ + void Dot(rocblas_handle handle, \ + int n, \ + ScalarType const* X, int incx, \ + ScalarType const* Y, int incy, \ + ScalarType* result) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## dot( \ + handle, \ + n, X, incx, Y, incy, result)); \ + } + +#define ADD_NRM2_IMPL(ScalarType, TypeChar) \ + void Nrm2(rocblas_handle handle, \ + int n, ScalarType const* X, int incx, \ + ScalarType* result) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## nrm2( \ + handle, \ + n, X, incx, result)); \ + } + +#define ADD_SCALE_IMPL(ScalarType, TypeChar) \ + void Scale(rocblas_handle handle, \ + int n, ScalarType const& alpha, \ + ScalarType* X, int incx) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## scal( \ + handle, n, &alpha, X, incx)); \ + } + +// +// BLAS 2 +// +#define ADD_GEMV_IMPL(ScalarType, TypeChar) \ + void Gemv( \ + rocblas_handle handle, \ + rocblas_operation transpA, int m, int n, \ + ScalarType const& alpha, \ + ScalarType const* A, int lda, \ + ScalarType const* B, int ldb, \ + ScalarType const& beta, \ + ScalarType* C, int ldc) \ + { \ + H_CHECK_ROCBLAS(rocblas_ ## TypeChar ## gemv( \ + handle, \ + transpA, \ + m, n, \ + &alpha, A, lda, B, ldb, \ + &beta, C, ldc)); \ + } + +// +// BLAS 3 +// +#define ADD_GEMM_IMPL(ScalarType, TypeChar) \ + void Gemm( \ + rocblas_handle handle, \ 
+ rocblas_operation transpA, \ + rocblas_operation transpB, \ + rocblas_int m, rocblas_int n, rocblas_int k, \ + ScalarType const& alpha, \ + ScalarType const* A, rocblas_int lda, \ + ScalarType const* B, rocblas_int ldb, \ + ScalarType const& beta, \ + ScalarType* C, rocblas_int ldc) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## gemm( \ + handle, \ + transpA, transpB, \ + m, n, k, &alpha, A, lda, B, ldb, \ + &beta, C, ldc)); \ + } + +#define ADD_GEMM_STRIDED_BATCHED_IMPL(ScalarType, TypeChar) \ + void GemmStridedBatched( \ + rocblas_handle handle, \ + rocblas_operation transpA, \ + rocblas_operation transpB, \ + rocblas_int m, rocblas_int n, rocblas_int k, \ + ScalarType const& alpha, \ + ScalarType const* A, rocblas_int lda, rocblas_stride strideA, \ + ScalarType const* B, rocblas_int ldb, rocblas_stride strideB, \ + ScalarType const& beta, \ + ScalarType* C, rocblas_int ldc, rocblas_stride strideC, \ + rocblas_int batchCount) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## gemm_strided_batched( \ + handle, \ + transpA, transpB, \ + m, n, k, &alpha, \ + A, lda, strideA, \ + B, ldb, strideB, \ + &beta, C, ldc, strideC, batchCount)); \ + } + +// +// BLAS-like Extension +// +#define ADD_GEAM_IMPL(ScalarType, TypeChar) \ + void Geam( \ + rocblas_handle handle, \ + rocblas_operation transpA, \ + rocblas_operation transpB, \ + int m, int n, \ + ScalarType const& alpha, \ + ScalarType const* A, int lda, \ + ScalarType const& beta, \ + ScalarType const* B, int ldb, \ + ScalarType* C, int ldc) \ + { \ + H_CHECK_ROCBLAS( \ + rocblas_ ## TypeChar ## geam( \ + handle, \ + transpA, transpB, \ + m, n, \ + &alpha, A, lda, \ + &beta, B, ldb, \ + C, ldc)); \ + } + +#define ADD_DGMM_IMPL(ScalarType, TypeChar) \ + void Dgmm( \ + rocblas_handle handle, \ + rocblas_side side, \ + int m, int n, \ + ScalarType const* A, int lda, \ + ScalarType const* X, int incx, \ + ScalarType* C, int ldc) \ + { \ + H_CHECK_ROCBLAS(rocblas_status_not_implemented); \ + } + +// BLAS 1 +ADD_AXPY_IMPL(rocblas_half, h) +ADD_AXPY_IMPL(float, s) +ADD_AXPY_IMPL(double, d) + +ADD_COPY_IMPL(float, s) +ADD_COPY_IMPL(double, d) + +//ADD_DOT_IMPL(rocblas_half, h) +ADD_DOT_IMPL(float, s) +ADD_DOT_IMPL(double, d) + +//ADD_NRM2_IMPL(rocblas_half, h) +ADD_NRM2_IMPL(float, s) +ADD_NRM2_IMPL(double, d) + +ADD_SCALE_IMPL(float, s) +ADD_SCALE_IMPL(double, d) + +// BLAS 2 +ADD_GEMV_IMPL(float, s) +ADD_GEMV_IMPL(double, d) + +// BLAS 3 +ADD_GEMM_IMPL(rocblas_half, h) +ADD_GEMM_IMPL(float, s) +ADD_GEMM_IMPL(double, d) + +ADD_GEMM_STRIDED_BATCHED_IMPL(rocblas_half, h) +ADD_GEMM_STRIDED_BATCHED_IMPL(float, s) +ADD_GEMM_STRIDED_BATCHED_IMPL(double, d) + +// BLAS-like extension +ADD_GEAM_IMPL(float, s) +ADD_GEAM_IMPL(double, d) + +ADD_DGMM_IMPL(float, s) +ADD_DGMM_IMPL(double, d) + +// +// "STATIC" UNIT TEST +// + +#define ASSERT_SUPPORT(type, op) \ + static_assert(IsSupportedType::value, "") + +#define ASSERT_NO_SUPPORT(type, op) \ + static_assert(!IsSupportedType::value, "") + +ASSERT_SUPPORT(float, BLAS_Op::AXPY); +ASSERT_SUPPORT(float, BLAS_Op::COPY); +ASSERT_SUPPORT(float, BLAS_Op::GEAM); +ASSERT_SUPPORT(float, BLAS_Op::GEMM); +ASSERT_SUPPORT(float, BLAS_Op::GEMV); +ASSERT_SUPPORT(float, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(float, BLAS_Op::DGMM); +ASSERT_SUPPORT(float, BLAS_Op::DOT); +ASSERT_SUPPORT(float, BLAS_Op::NRM2); +ASSERT_SUPPORT(float, BLAS_Op::GEMMSTRIDEDBATCHED); + +ASSERT_SUPPORT(double, BLAS_Op::AXPY); +ASSERT_SUPPORT(double, BLAS_Op::COPY); +ASSERT_SUPPORT(double, BLAS_Op::GEAM); +ASSERT_SUPPORT(double, BLAS_Op::GEMM); 
+ASSERT_SUPPORT(double, BLAS_Op::GEMV); +ASSERT_SUPPORT(double, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(double, BLAS_Op::DGMM); +ASSERT_SUPPORT(double, BLAS_Op::DOT); +ASSERT_SUPPORT(double, BLAS_Op::NRM2); +ASSERT_SUPPORT(double, BLAS_Op::GEMMSTRIDEDBATCHED); + +#ifdef HYDROGEN_GPU_USE_FP16 +ASSERT_SUPPORT(rocblas_half, BLAS_Op::AXPY); +ASSERT_SUPPORT(rocblas_half, BLAS_Op::GEMM); +ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::COPY); +ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::DGMM); +ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::GEAM); +ASSERT_NO_SUPPORT(rocblas_half, BLAS_Op::GEMV); +ASSERT_SUPPORT(rocblas_half, BLAS_Op::DOT); +ASSERT_SUPPORT(rocblas_half, BLAS_Op::NRM2); +ASSERT_SUPPORT(rocblas_half, BLAS_Op::GEMMSTRIDEDBATCHED); + +#ifdef HYDROGEN_HAVE_HALF +ASSERT_SUPPORT(cpu_half_type, BLAS_Op::AXPY); +ASSERT_SUPPORT(cpu_half_type, BLAS_Op::GEMM); +ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::COPY); +ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::DGMM); +ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::GEAM); +ASSERT_NO_SUPPORT(cpu_half_type, BLAS_Op::GEMV); +#endif // HYDROGEN_HAVE_HALF +#endif // HYDROGEN_GPU_USE_FP16 + +// One type that should be entirely unsupported, just for sanity. +ASSERT_NO_SUPPORT(int, BLAS_Op::AXPY); +ASSERT_NO_SUPPORT(int, BLAS_Op::COPY); +ASSERT_NO_SUPPORT(int, BLAS_Op::DGMM); +ASSERT_NO_SUPPORT(int, BLAS_Op::GEAM); +ASSERT_NO_SUPPORT(int, BLAS_Op::GEMM); +ASSERT_NO_SUPPORT(int, BLAS_Op::GEMV); +ASSERT_NO_SUPPORT(int, BLAS_Op::SCAL); +ASSERT_NO_SUPPORT(int, BLAS_Op::DOT); +ASSERT_NO_SUPPORT(int, BLAS_Op::NRM2); +ASSERT_NO_SUPPORT(int, BLAS_Op::GEMMSTRIDEDBATCHED); + +} // namespace rocblas +} // namespace hydrogen diff --git a/src/io/Display.cpp b/src/io/Display.cpp index 775f3a6b3a..7957c12c28 100644 --- a/src/io/Display.cpp +++ b/src/io/Display.cpp @@ -32,7 +32,7 @@ void Display(AbstractMatrix const& A, std::string title) case Device::CPU: Display(static_cast const&>(A), title); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { // Copy to the CPU @@ -41,7 +41,7 @@ void Display(AbstractMatrix const& A, std::string title) Display(A_CPU, title); } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Display: Bad Device type."); } diff --git a/src/io/Print.cpp b/src/io/Print.cpp index 60e8d7a8de..c07e0dac2f 100644 --- a/src/io/Print.cpp +++ b/src/io/Print.cpp @@ -29,7 +29,7 @@ void Print(AbstractMatrix const& A, string title, ostream& os) case Device::CPU: Print(static_cast const&>(A), title, os); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { // Copy to host @@ -38,7 +38,7 @@ void Print(AbstractMatrix const& A, string title, ostream& os) Print(A_CPU, title, os); } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Print: Bad device."); } diff --git a/src/io/Read.cpp b/src/io/Read.cpp index 8762f6d1b9..3484e0a8d1 100644 --- a/src/io/Read.cpp +++ b/src/io/Read.cpp @@ -25,7 +25,7 @@ void Read(AbstractMatrix& A, case Device::CPU: Read(static_cast&>(A), filename, format); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { Matrix A_CPU; @@ -33,7 +33,7 @@ void Read(AbstractMatrix& A, static_cast&>(A) = A_CPU; } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Read: Bad device type."); } diff --git a/src/io/Write.cpp b/src/io/Write.cpp index 7399e25746..415132a3fe 100644 --- a/src/io/Write.cpp +++ 
b/src/io/Write.cpp @@ -27,7 +27,7 @@ void Write(AbstractMatrix const& A, string basename, Write(static_cast const&>(A), basename, format, title); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { // Copy to the CPU @@ -35,7 +35,7 @@ void Write(AbstractMatrix const& A, string basename, Write(A_CPU, basename, format, title); } break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("Write: Bad Device type."); } diff --git a/src/lapack_like/props/Norm/Frobenius.cpp b/src/lapack_like/props/Norm/Frobenius.cpp index dc84bd68a7..fd187b2260 100644 --- a/src/lapack_like/props/Norm/Frobenius.cpp +++ b/src/lapack_like/props/Norm/Frobenius.cpp @@ -17,13 +17,13 @@ Base FrobeniusNorm(AbstractMatrix const& A) { case Device::CPU: return FrobeniusNorm(static_cast const&>(A)); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: { AbstractMatrixReadDeviceProxy ALocProxy{A}; return FrobeniusNorm(ALocProxy.GetLocked()); } -#endif //HYDROGEN_HAVE_CUDA +#endif //HYDROGEN_HAVE_GPU default: LogicError("FrobeniusNorm: Bad Device."); } diff --git a/src/matrices/random/independent/Gaussian.cpp b/src/matrices/random/independent/Gaussian.cpp index 6584744108..c64cd14a37 100644 --- a/src/matrices/random/independent/Gaussian.cpp +++ b/src/matrices/random/independent/Gaussian.cpp @@ -37,11 +37,11 @@ void MakeGaussian(AbstractMatrix& A, F mean, Base stddev) case Device::CPU: MakeGaussian(static_cast&>(A), mean, stddev); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: MakeGaussian(static_cast&>(A), mean, stddev); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("MakeGaussian: Bad device."); } diff --git a/src/matrices/random/independent/Uniform.cpp b/src/matrices/random/independent/Uniform.cpp index 45ec20d8fe..a0d0937334 100644 --- a/src/matrices/random/independent/Uniform.cpp +++ b/src/matrices/random/independent/Uniform.cpp @@ -23,11 +23,11 @@ void MakeUniform( AbstractMatrix& A, T center, Base radius ) case Device::CPU: MakeUniform(static_cast&>(A), center, radius); break; -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU case Device::GPU: MakeUniform(static_cast&>(A), center, radius); break; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU default: LogicError("MakeUniform: Bad device."); } @@ -82,7 +82,7 @@ void Uniform( AbstractDistMatrix& A, Int m, Int n, T center, Base radius ) template void MakeUniform( \ Matrix& A, T center, Base radius ); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU template void MakeUniform(Matrix&, float, Base); template void MakeUniform(Matrix&, double, Base); @@ -91,7 +91,7 @@ ABSTRACT_PROTO(gpu_half_type); template void MakeUniform(Matrix&, gpu_half_type, Base); #endif -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU #define EL_ENABLE_DOUBLEDOUBLE #define EL_ENABLE_QUADDOUBLE diff --git a/tests/blas_like/Axpy.cpp b/tests/blas_like/Axpy.cpp index d24e977565..22b5ecf91b 100644 --- a/tests/blas_like/Axpy.cpp +++ b/tests/blas_like/Axpy.cpp @@ -181,7 +181,7 @@ main(int argc, char* argv[]) // Message OutputFromRoot(g.Comm(),"Testing Axpy"); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU if (testGPU) { TestAxpy( @@ -191,7 +191,7 @@ main(int argc, char* argv[]) } #else (void)testGPU; -#endif // HYDROGEN_HAVE_CUDA +#endif // HYDROGEN_HAVE_GPU // Run tests if (testCPU) { diff --git a/tests/blas_like/BasicGemm.cpp b/tests/blas_like/BasicGemm.cpp index cb58cde052..a1db01d210 100644 --- a/tests/blas_like/BasicGemm.cpp +++ 
b/tests/blas_like/BasicGemm.cpp @@ -275,7 +275,7 @@ int main(int argc, char *argv[]) El::Output("grid is ",grid.Height()," x ",grid.Width()); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU if (testGPU) { TestGemm @@ -285,7 +285,7 @@ int main(int argc, char *argv[]) } #else (void) testGPU; -#endif // HYDROGEN_ENABLE_CUDA +#endif // HYDROGEN_HAVE_GPU TestGemm (m, n, k, grid, testSequential, instrument); diff --git a/tests/blas_like/Gemm.cpp b/tests/blas_like/Gemm.cpp index 9324637183..bf18231e9e 100644 --- a/tests/blas_like/Gemm.cpp +++ b/tests/blas_like/Gemm.cpp @@ -7,6 +7,9 @@ http://opensource.org/licenses/BSD-2-Clause */ #include + +#include "GemmHelpers/SyncTimer.hpp" + using namespace El; template @@ -45,35 +48,6 @@ void TestAssociativity EFrobNorm, "/", YFrobNorm, "=", EFrobNorm/YFrobNorm); } -#ifdef HYDROGEN_HAVE_CUDA -#define START_CUDA_TIMER \ - if (D == Device::GPU) \ - cudaEventRecord(start, GPUManager::Stream()); - -#define STOP_CUDA_TIMER \ - if (D == Device::GPU) \ - { \ - cudaEventRecord(stop, GPUManager::Stream()); \ - cudaEventSynchronize(stop); \ - cudaEventElapsedTime(&cudaTime, start, stop); \ - } - -#define SUMMARIZE_CUDA_TIMER \ - if (D == Device::GPU) \ - { \ - runTime = cudaTime * 1e-3; \ - realGFlops = 2.*double(m)*double(n)*double(k)/(1.e9*runTime); \ - gFlops = (IsComplex::value ? 4*realGFlops : realGFlops); \ - OutputFromRoot(g.Comm(),"Finished in ",runTime, \ - " seconds (",gFlops," GFlop/s)"); \ - } - -#else -#define START_CUDA_TIMER do {} while (false) -#define STOP_CUDA_TIMER do {} while (false) -#define SUMMARIZE_CUDA_TIMER do {} while (false) -#endif - template void TestGemm (Orientation orientA, @@ -106,9 +80,9 @@ void TestGemm Gaussian(B, n, k); Gaussian(COrig, m, n); -#ifdef HYDROGEN_HAVE_CUDA - H_CHECK_CUDA(cudaDeviceSynchronize()); -#endif // HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU + El::gpu::SynchronizeDevice(); +#endif // HYDROGEN_HAVE_GPU if (print) { @@ -117,14 +91,11 @@ void TestGemm Print(COrig, "COrig"); } - Timer timer; -#ifdef HYDROGEN_HAVE_CUDA - cudaEvent_t start, stop; - cudaEventCreate(&start); - cudaEventCreate(&stop); + helpers::SyncTimer timer(SyncInfoFromMatrix(C.LockedMatrix())); float cudaTime; // Warmup run -- doesn't matter in CPU land +#ifdef HYDROGEN_HAVE_GPU if (D == Device::GPU) { C = COrig; @@ -139,27 +110,26 @@ void TestGemm C = COrig; OutputFromRoot(g.Comm(),"Stationary A algorithm:"); PushIndent(); + timer.Reset(); mpi::Barrier(g.Comm()); timer.Start(); - START_CUDA_TIMER; - Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_A_MS); - STOP_CUDA_TIMER; - + Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_A); mpi::Barrier(g.Comm()); - runTime = timer.Stop(); + timer.Stop(); + runTime = timer.GetTime(); realGFlops = 2.*double(m)*double(n)*double(k)/(1.e9*runTime); gFlops = (IsComplex::value ? 
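        // (a complex multiply-add costs roughly 4x its real counterpart)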
4*realGFlops : realGFlops); - if (D == Device::CPU) - OutputFromRoot - (g.Comm(),"Finished in ",runTime," seconds (",gFlops," GFlop/s)"); - SUMMARIZE_CUDA_TIMER; + OutputFromRoot( + g.Comm(),"Finished in ",runTime," seconds (",gFlops," GFlop/s)"); flush(std::cout); if (print) Print(C, BuildString("C := ",alpha," A B + ",beta," C")); if (correctness) - TestAssociativity(orientA, orientB, alpha, A, B, beta, COrig, C, print); + TestAssociativity(orientA, orientB, + alpha, A, B, beta, COrig, C, + print); PopIndent(); flush(std::cout); @@ -171,28 +141,25 @@ void TestGemm C = COrig; OutputFromRoot(g.Comm(),"Stationary B Algorithm:"); PushIndent(); + timer.Reset(); mpi::Barrier(g.Comm()); timer.Start(); - Synchronize(SyncInfoFromMatrix(C.Matrix())); - START_CUDA_TIMER; - Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_B_MS); - Synchronize(SyncInfoFromMatrix(C.Matrix())); - STOP_CUDA_TIMER; - + Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_B); mpi::Barrier(g.Comm()); - runTime = timer.Stop(); + timer.Stop(); + runTime = timer.GetTime(); realGFlops = 2.*double(m)*double(n)*double(k)/(1.e9*runTime); gFlops = (IsComplex::value ? 4*realGFlops : realGFlops); - if (D == Device::CPU) - OutputFromRoot - (g.Comm(),"Finished in ",runTime," seconds (",gFlops," GFlop/s)"); - SUMMARIZE_CUDA_TIMER; + OutputFromRoot( + g.Comm(),"Finished in ",runTime, " seconds (",gFlops," GFlop/s)"); if (print) Print(C, BuildString("C := ",alpha," A B + ",beta," C")); if (correctness) - TestAssociativity(orientA, orientB, alpha, A, B, beta, COrig, C, print); + TestAssociativity(orientA, orientB, + alpha, A, B, beta, COrig, C, + print); PopIndent(); flush(std::cout); @@ -204,20 +171,19 @@ void TestGemm C = COrig; OutputFromRoot(g.Comm(),"Stationary C Algorithm:"); PushIndent(); + timer.Reset(); mpi::Barrier(g.Comm()); timer.Start(); - START_CUDA_TIMER; - Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_C_MS); - STOP_CUDA_TIMER; - + Gemm(orientA, orientB, alpha, A, B, beta, C, GEMM_SUMMA_C); mpi::Barrier(g.Comm()); - runTime = timer.Stop(); + timer.Stop(); + runTime = timer.GetTime(); realGFlops = 2.*double(m)*double(n)*double(k)/(1.e9*runTime); gFlops = (IsComplex::value ? 4*realGFlops : realGFlops); - if (D == Device::CPU) - OutputFromRoot - (g.Comm(),"Finished in ",runTime," seconds (",gFlops," GFlop/s)"); - SUMMARIZE_CUDA_TIMER; + + OutputFromRoot( + g.Comm(),"Finished in ",runTime," seconds (",gFlops," GFlop/s)"); + if (print) Print(C, BuildString("C := ",alpha," A B + ",beta," C")); if (correctness) @@ -236,37 +202,32 @@ void TestGemm OutputFromRoot(g.Comm(),"Dot Product Algorithm:"); PushIndent(); C = COrig; + timer.Reset(); mpi::Barrier(g.Comm()); timer.Start(); - START_CUDA_TIMER; Gemm(NORMAL, NORMAL, alpha, A, B, beta, C, GEMM_SUMMA_DOT); - STOP_CUDA_TIMER; - mpi::Barrier(g.Comm()); - runTime = timer.Stop(); + timer.Stop(); + runTime = timer.GetTime(); realGFlops = 2.*double(m)*double(n)*double(k)/(1.e9*runTime); gFlops = (IsComplex::value ? 
4*realGFlops : realGFlops); - if (D == Device::CPU) - OutputFromRoot - (g.Comm(),"Finished in ",runTime," seconds (",gFlops, - " GFlop/s)"); - SUMMARIZE_CUDA_TIMER; + OutputFromRoot( + g.Comm(),"Finished in ",runTime," seconds (",gFlops, + " GFlop/s)"); if (print) Print(C, BuildString("C := ",alpha," A B + ",beta," C")); if (correctness) TestAssociativity (orientA, orientB, alpha, A, B, beta, COrig, C, print); + PopIndent(); flush(std::cout); } } PopIndent(); -#ifdef HYDROGEN_HAVE_CUDA - cudaEventDestroy(start); - cudaEventDestroy(stop); -#endif + flush(std::cout); } int @@ -310,10 +271,10 @@ main(int argc, char* argv[]) ComplainIfDebug(); OutputFromRoot(g.Comm(),"Will test Gemm",transA,transB); -#ifdef HYDROGEN_HAVE_CUDA +#ifdef HYDROGEN_HAVE_GPU if (testGPU) { -#ifdef HYDROGEN_GPU_USE_FP16 +#if defined HYDROGEN_HAVE_HALF && defined HYDROGEN_GPU_USE_FP16 TestGemm (orientA, orientB, m, n, k, @@ -323,7 +284,7 @@ main(int argc, char* argv[]) colAlignA, rowAlignA, colAlignB, rowAlignB, colAlignC, rowAlignC); -#endif // HYDROGEN_GPU_USE_FP16 +#endif // defined HYDROGEN_HAVE_HALF && defined HYDROGEN_GPU_USE_FP16 TestGemm (orientA, orientB, m, n, k, diff --git a/tests/blas_like/GemmHelpers/SyncTimer.hpp b/tests/blas_like/GemmHelpers/SyncTimer.hpp index 8dd7f83fed..b2fbc88bf4 100644 --- a/tests/blas_like/GemmHelpers/SyncTimer.hpp +++ b/tests/blas_like/GemmHelpers/SyncTimer.hpp @@ -68,7 +68,7 @@ class SyncTimer if (started_ || stopped_) throw std::runtime_error("Start(): Bad timer state."); - H_CHECK_CUDA(cudaEventRecord(start_, si_.stream_)); + H_CHECK_CUDA(cudaEventRecord(start_, si_.Stream())); started_ = true; } @@ -77,7 +77,7 @@ class SyncTimer if (stopped_ || !started_) throw std::runtime_error("Stop(): Bad timer state."); - H_CHECK_CUDA(cudaEventRecord(stop_, si_.stream_)); + H_CHECK_CUDA(cudaEventRecord(stop_, si_.Stream())); stopped_ = true; } @@ -103,6 +103,69 @@ class SyncTimer cudaEvent_t start_, stop_; bool started_, stopped_; };// class SyncTimer + +#elif defined(HYDROGEN_HAVE_ROCM) + +template <> +class SyncTimer +{ +public: + SyncTimer(El::SyncInfo const& si) + : si_ {si}, + started_ {false}, + stopped_ {false} + { + H_CHECK_HIP(hipEventCreate(&start_)); + H_CHECK_HIP(hipEventCreate(&stop_)); + } + + ~SyncTimer() + { + hipEventDestroy(start_); + hipEventDestroy(stop_); + } + + void Start() + { + if (started_ || stopped_) + throw std::runtime_error("Start(): Bad timer state."); + + H_CHECK_HIP(hipEventRecord(start_, si_.Stream())); + started_ = true; + } + + void Stop() + { + if (stopped_ || !started_) + throw std::runtime_error("Stop(): Bad timer state."); + + H_CHECK_HIP(hipEventRecord(stop_, si_.Stream())); + stopped_ = true; + } + + /** @brief Get elapsed time in seconds. 
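+     *
+     *  Blocks on the stop event (hipEventSynchronize), so the result
+     *  covers all stream work recorded between Start() and Stop().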
diff --git a/tests/core/DistMatrix.cpp b/tests/core/DistMatrix.cpp
index ad66fed2b4..23aaa628db 100644
--- a/tests/core/DistMatrix.cpp
+++ b/tests/core/DistMatrix.cpp
@@ -20,6 +20,7 @@ Check(DistMatrix<T,AColDist,ARowDist,ELEMENT,D>& A,
     const Int height = B.Height();
     const Int width = B.Width();

+    SyncInfo<Device::CPU> cpu_si;
     OutputFromRoot
         (g.Comm(),
@@ -29,8 +30,8 @@ Check(DistMatrix<T,AColDist,ARowDist,ELEMENT,D>& A,
          ",",DeviceName<D>(),"]");
     Int colAlign = SampleUniform<Int>(0,A.ColStride());
     Int rowAlign = SampleUniform<Int>(0,A.RowStride());
-    mpi::Broadcast(colAlign, 0, g.Comm());
-    mpi::Broadcast(rowAlign, 0, g.Comm());
+    mpi::Broadcast(colAlign, 0, g.Comm(), cpu_si);
+    mpi::Broadcast(rowAlign, 0, g.Comm(), cpu_si);
     A.Align(colAlign, rowAlign);
     A = B;
     if (A.Height() != B.Height() || A.Width() != B.Width())
@@ -54,7 +55,7 @@ Check(DistMatrix<T,AColDist,ARowDist,ELEMENT,D>& A,
     }

     Int summedErrorFlag;
-    mpi::AllReduce(&myErrorFlag, &summedErrorFlag, 1, mpi::SUM, g.Comm());
+    mpi::AllReduce(&myErrorFlag, &summedErrorFlag, 1, mpi::SUM, g.Comm(), cpu_si);

     if (summedErrorFlag == 0)
     {
@@ -66,7 +67,7 @@ Check(DistMatrix<T,AColDist,ARowDist,ELEMENT,D>& A,
     }
     else
     {
-        OutputFromRoot(g.Comm(),"FAILED");
+        OutputFromRoot(g.Comm(),"FAILED (", summedErrorFlag," ranks failed)");
         if (print)
             Print(A, "A");
         if (print)
@@ -177,10 +178,12 @@ template <typename T>
 void CheckAll(Int m, Int n, const Grid& grid, bool print)
 {
     DistMatrix<T> A(grid);
+    SyncInfo<Device::CPU> cpu_si;
+
     Int colAlign = SampleUniform<Int>(0,A.ColStride());
     Int rowAlign = SampleUniform<Int>(0,A.RowStride());
-    mpi::Broadcast(colAlign, 0, grid.Comm());
-    mpi::Broadcast(rowAlign, 0, grid.Comm());
+    mpi::Broadcast(colAlign, 0, grid.Comm(), cpu_si);
+    mpi::Broadcast(rowAlign, 0, grid.Comm(), cpu_si);
     A.Align(colAlign, rowAlign);

     const T center = 0;
@@ -189,7 +192,7 @@ void CheckAll(Int m, Int n, const Grid& grid, bool print)

     CheckAll_device<T,Device::CPU>(A, print);

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     CheckAll_device<T,Device::GPU>(A, print);
 #endif
 }
@@ -240,7 +243,7 @@ DistMatrixTest(Int m, Int n, const Grid& grid, bool print)

     DistMatrixTest_device<T,Device::CPU>(m,n,grid,print);

-#ifdef HYDROGEN_HAVE_CUDA
+#ifdef HYDROGEN_HAVE_GPU
     DistMatrixTest_device<T,Device::GPU>(m,n,grid,print);
 #endif
 }
@@ -251,58 +254,63 @@ main(int argc, char* argv[])
     Environment env(argc, argv);
     mpi::Comm comm = mpi::NewWorldComm();

-    try
+    int gridHeight = Input("--gridHeight","height of process grid",0);
+    const bool colMajor = Input("--colMajor","column-major ordering?",true);
+    const Int m = Input("--height","height of matrix",50);
+    const Int n = Input("--width","width of matrix",50);
+    const bool print = Input("--print","print wrong matrices?",false);
+    const bool debug = Input("--debug","wait for debugger?",false);
+    ProcessInput();
+    PrintInputReport();
+
+    if (gridHeight == 0)
+        gridHeight = Grid::DefaultHeight(mpi::Size(comm));
+    const GridOrder order = colMajor ? COLUMN_MAJOR : ROW_MAJOR;
+    const Grid grid(std::move(comm), gridHeight, order);
+
+    if (debug)
     {
-        int gridHeight = Input("--gridHeight","height of process grid",0);
-        const bool colMajor = Input("--colMajor","column-major ordering?",true);
-        const Int m = Input("--height","height of matrix",50);
-        const Int n = Input("--width","width of matrix",50);
-        const bool print = Input("--print","print wrong matrices?",false);
-        ProcessInput();
-        PrintInputReport();
-
-        if (gridHeight == 0)
-            gridHeight = Grid::DefaultHeight(mpi::Size(comm));
-        const GridOrder order = colMajor ? COLUMN_MAJOR : ROW_MAJOR;
-        const Grid grid(std::move(comm), gridHeight, order);
+        volatile int x = 1;
+        while (x) {
+            hydrogen::break_on_me();
+        };
+    }

-        DistMatrixTest<Int>(m, n, grid, print);
+    DistMatrixTest<Int>(m, n, grid, print);

-        DistMatrixTest<float>(m, n, grid, print);
-        DistMatrixTest<Complex<float>>(m, n, grid, print);
+    DistMatrixTest<float>(m, n, grid, print);
+    DistMatrixTest<Complex<float>>(m, n, grid, print);

-        DistMatrixTest<double>(m, n, grid, print);
-        DistMatrixTest<Complex<double>>(m, n, grid, print);
+    DistMatrixTest<double>(m, n, grid, print);
+    DistMatrixTest<Complex<double>>(m, n, grid, print);

 #ifdef EL_HAVE_QD
-        DistMatrixTest<DoubleDouble>(m, n, grid, print);
-        DistMatrixTest<QuadDouble>(m, n, grid, print);
+    DistMatrixTest<DoubleDouble>(m, n, grid, print);
+    DistMatrixTest<QuadDouble>(m, n, grid, print);
 #endif

 #ifdef EL_HAVE_QUAD
-        DistMatrixTest<Quad>(m, n, grid, print);
-        DistMatrixTest<Complex<Quad>>(m, n, grid, print);
+    DistMatrixTest<Quad>(m, n, grid, print);
+    DistMatrixTest<Complex<Quad>>(m, n, grid, print);
 #endif

 #ifdef HYDROGEN_HAVE_HALF
-        DistMatrixTest<cpu_half_type>(m, n, grid, print);
+    DistMatrixTest<cpu_half_type>(m, n, grid, print);
 #endif

 #ifdef EL_HAVE_MPC
-        DistMatrixTest<BigInt>(m, n, grid, print);
-        OutputFromRoot(g.Comm(),"Setting BigInt precision to 512 bits");
-        mpfr::SetMinIntBits(512);
-        DistMatrixTest<BigInt>(m, n, grid, print);
-
-        DistMatrixTest<BigFloat>(m, n, grid, print);
-        DistMatrixTest<Complex<BigFloat>>(m, n, grid, print);
-        OutputFromRoot(g.Comm(),"Setting BigFloat precision to 512 bits");
-        mpfr::SetPrecision(512);
-        DistMatrixTest<BigFloat>(m, n, grid, print);
-        DistMatrixTest<Complex<BigFloat>>(m, n, grid, print);
+    DistMatrixTest<BigInt>(m, n, grid, print);
+    OutputFromRoot(g.Comm(),"Setting BigInt precision to 512 bits");
+    mpfr::SetMinIntBits(512);
+    DistMatrixTest<BigInt>(m, n, grid, print);
+
+    DistMatrixTest<BigFloat>(m, n, grid, print);
+    DistMatrixTest<Complex<BigFloat>>(m, n, grid, print);
+    OutputFromRoot(g.Comm(),"Setting BigFloat precision to 512 bits");
+    mpfr::SetPrecision(512);
+    DistMatrixTest<BigFloat>(m, n, grid, print);
+    DistMatrixTest<Complex<BigFloat>>(m, n, grid, print);
 #endif
-    }
-    catch(std::exception& e) { ReportException(e); }

     return 0;
 }
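The DistMatrix.cpp changes reflect the SyncInfo-aware mpi wrappers introduced
by this patch: collectives now take an explicit SyncInfo describing where the
buffer lives, so these host-side broadcasts and reductions pass a
default-constructed CPU SyncInfo. A sketch of the call shape (assuming an
initialized El environment; `comm` stands for any El::mpi::Comm):

    // Trivial sync object: the buffers below live in host memory.
    El::SyncInfo<El::Device::CPU> cpu_si;

    El::Int value = 42;
    // Rank 0's value is broadcast to all ranks in comm.
    El::mpi::Broadcast(value, 0, comm, cpu_si);

    El::Int local = 1, sum = 0;
    El::mpi::AllReduce(&local, &sum, 1, El::mpi::SUM, comm, cpu_si);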
diff --git a/unit_test/CMakeLists.txt b/unit_test/CMakeLists.txt
index 0318babe07..6df45d5c20 100644
--- a/unit_test/CMakeLists.txt
+++ b/unit_test/CMakeLists.txt
@@ -12,6 +12,11 @@ if (HYDROGEN_HAVE_GPU)
   endif ()
 endif (HYDROGEN_HAVE_GPU)

+if (HYDROGEN_HAVE_GPU)
+  list(APPEND HYDROGEN_CATCH2_TEST_FILES
+    gpu_test.cpp)
+endif ()
+
 # Add the sequential test main() function
 add_executable(seq-catch-tests
   SequentialCatchMain.cpp "${HYDROGEN_CATCH2_TEST_FILES}")
diff --git a/unit_test/gpu_test.cpp b/unit_test/gpu_test.cpp
new file mode 100644
index 0000000000..f6429d0186
--- /dev/null
+++ b/unit_test/gpu_test.cpp
@@ -0,0 +1,19 @@
+#include <catch2/catch.hpp>
+
+#include <hydrogen/device/GPU.hpp>
+
+TEST_CASE("Testing core GPU functionality", "[seq][gpu][init]")
+{
+    REQUIRE_FALSE(hydrogen::gpu::IsInitialized());
+    REQUIRE(hydrogen::gpu::IsFinalized());
+
+    REQUIRE_NOTHROW(hydrogen::gpu::Initialize());
+
+    REQUIRE(hydrogen::gpu::IsInitialized());
+    REQUIRE_FALSE(hydrogen::gpu::IsFinalized());
+
+    REQUIRE_NOTHROW(hydrogen::gpu::Finalize());
+
+    REQUIRE_FALSE(hydrogen::gpu::IsInitialized());
+    REQUIRE(hydrogen::gpu::IsFinalized());
+}
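The new unit test pins down the intended GPU runtime lifecycle: exactly one
Initialize()/Finalize() bracket per process, with IsInitialized() and
IsFinalized() reporting the state on either side. A sketch of how an
application might use the same hydrogen::gpu API (a GPU build of Hydrogen is
assumed; what Initialize() does internally, e.g. device selection, is not
spelled out by the test):

    #include <hydrogen/device/GPU.hpp>

    int main()
    {
        hydrogen::gpu::Initialize();   // per the test: IsInitialized() is now true
        // ... run Hydrogen GPU work here ...
        hydrogen::gpu::Finalize();     // per the test: IsFinalized() is now true
        return 0;
    }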