From 3589078bad0f1f7b6fd6e623d29db006d60fd102 Mon Sep 17 00:00:00 2001
From: Trevor Vincent
Date: Wed, 21 Jul 2021 23:08:53 -0400
Subject: [PATCH] Add final touches (#50)

* Add final touches

* Fix typos

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update include/jet/CudaTensor.hpp

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update include/jet/TaskBasedContractor.hpp

Co-authored-by: Mikhail Andrenkov

* Update include/jet/CudaTensor.hpp

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Add PR corrections

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Fix two minor issues

Co-authored-by: Trevor Vincent
Co-authored-by: Mikhail Andrenkov
---
 examples/heterogeneous_contraction.cu     | 261 ++++++++++++++++++++++
 include/jet/CudaTensor.hpp                | 149 +++++++-----
 include/jet/GpuContractionTaskCreator.hpp | 150 -------------
 include/jet/TaskBasedContractor.hpp       |  11 +
 4 files changed, 366 insertions(+), 205 deletions(-)
 create mode 100644 examples/heterogeneous_contraction.cu
 delete mode 100644 include/jet/GpuContractionTaskCreator.hpp

diff --git a/examples/heterogeneous_contraction.cu b/examples/heterogeneous_contraction.cu
new file mode 100644
index 00000000..66b69dc9
--- /dev/null
+++ b/examples/heterogeneous_contraction.cu
@@ -0,0 +1,261 @@
+/**
+ * @file heterogeneous_contraction.cu
+ *
+ * @brief Contracts three tensor network files on two GPUs and one CPU
+ * simultaneously.
+ */
+
+#include <fstream>
+
+#include "CudaTensor.hpp"
+#include "PathInfo.hpp"
+#include "TaskBasedContractor.hpp"
+#include "Tensor.hpp"
+#include "TensorNetwork.hpp"
+#include "TensorNetworkIO.hpp"
+
+#include <taskflow/cudaflow.hpp>
+#include <taskflow/taskflow.hpp>
+
+using namespace Jet;
+
+template <class T, int device> struct CudaflowContractionTask {
+
+    std::vector<std::unique_ptr<CudaTensor<T, device>>> tensors;
+    std::vector<typename CudaTensor<T, device>::CudaContractionPlan> plans;
+    std::vector<tf::cudaTask> kernel_tasks;
+    std::vector<T> result;
+};
+
+template <class T, int device>
+void AddCudaContractionToTaskflow(
+    const TensorNetwork<CudaTensor<T, device>> &tn,
+    const PathInfo &path_info, tf::Taskflow &taskflow,
+    CudaflowContractionTask<T, device> &gpu_task)
+{
+    auto &tensors = gpu_task.tensors;
+    auto &plans = gpu_task.plans;
+    auto &result = gpu_task.result;
+    auto &kernel_tasks = gpu_task.kernel_tasks;
+
+    const auto &path_node_info = path_info.GetSteps();
+    const auto &path = path_info.GetPath();
+    const auto &nodes = tn.GetNodes();
+    size_t num_leafs = nodes.size();
+    tensors.resize(path_node_info.size());
+    plans.resize(path.size());
+
+    for (size_t i = 0; i < path.size(); i++) {
+
+        const PathStepInfo &pnia = path_node_info[path[i].first];
+        const PathStepInfo &pnib = path_node_info[path[i].second];
+        const PathStepInfo &pnic = path_node_info[num_leafs + i];
+
+        if (pnia.id >= num_leafs) {
+            tensors[path[i].first] =
+                std::make_unique<CudaTensor<T, device>>(
+                    CudaTensor<T, device>(pnia.tensor_indices,
+                                          pnia.shape));
+        }
+        else {
+            tensors[path[i].first] =
+                std::make_unique<CudaTensor<T, device>>(
+                    CudaTensor<T, device>(
+                        tn.GetNodes()[pnia.id].tensor));
+        }
+
+        if (pnib.id >= num_leafs) {
+            tensors[path[i].second] =
+                std::make_unique<CudaTensor<T, device>>(
+                    CudaTensor<T, device>(pnib.tensor_indices,
+                                          pnib.shape));
+        }
+        else {
+            tensors[path[i].second] =
+                std::make_unique<CudaTensor<T, device>>(
+                    CudaTensor<T, device>(
+                        tn.GetNodes()[pnib.id].tensor));
+        }
+
+        tensors[num_leafs + i] =
+            std::make_unique<CudaTensor<T, device>>(
+                CudaTensor<T, device>(pnic.tensor_indices, pnic.shape));
+
+        CudaTensor<T, device>::GetCudaContractionPlan(
+            plans[i], *tensors[path[i].first], *tensors[path[i].second],
+            *tensors[num_leafs + i]);
+    }
+
+    tf::Task task = taskflow.emplace_on(
+        [&, path, path_node_info, num_leafs](tf::cudaFlowCapturer &capturer) {
+            for (size_t i = 0; i < path.size(); i++) {
+
+                const PathStepInfo &pnia = path_node_info[path[i].first];
+                const PathStepInfo &pnib = path_node_info[path[i].second];
+                const PathStepInfo &pnic = path_node_info[num_leafs + i];
+
+                auto tensor_a = tensors[path[i].first]->GetData();
+                auto tensor_b = tensors[path[i].second]->GetData();
+                auto tensor_c = tensors[num_leafs + i]->GetData();
+
+                auto &c_plan = plans[i];
+                tf::cudaTask kernel =
+                    capturer.on([&, c_plan, tensor_a, tensor_b,
+                                 tensor_c](cudaStream_t stream) {
+                        cuComplex alpha;
+                        alpha.x = 1.;
+                        alpha.y = 0.;
+
+                        cuComplex beta;
+                        beta.x = 0.;
+                        beta.y = 0.;
+
+                        cutensorContraction(&c_plan.handle, &c_plan.plan,
+                                            &alpha, tensor_a, tensor_b, &beta,
+                                            tensor_c, tensor_c, c_plan.work,
+                                            c_plan.work_size, stream);
+                    });
+
+                kernel_tasks.push_back(kernel);
+
+                if (pnia.id >= num_leafs) {
+                    kernel_tasks[pnia.id - num_leafs].precede(kernel);
+                }
+
+                if (pnib.id >= num_leafs) {
+                    kernel_tasks[pnib.id - num_leafs].precede(kernel);
+                }
+
+                // Copy data from gpu_data to host_data.
+                if (i == path.size() - 1) {
+                    result.resize(tensors[pnic.id]->GetSize());
+                    tf::cudaTask d2h = capturer.memcpy(
+                        result.data(), tensors[pnic.id]->GetData(),
+                        tensors[pnic.id]->GetSize() * sizeof(cuComplex));
+
+                    kernel.precede(d2h);
+                }
+            }
+        },
+        device);
+}
+
+int main(int argc, char *argv[])
+{
+
+    if (argc != 4) {
+        std::cout << "heterogeneous_contraction.cu <tensor network file 1> "
+                     "<tensor network file 2> <tensor network file 3>"
+                  << std::endl;
+        std::cout << "Contracts three circuits on two GPUs and one CPU"
+                  << std::endl;
+        return 1;
+    }
+
+    std::string file_name_0 = argv[1];
+    std::string file_name_1 = argv[2];
+    std::string file_name_2 = argv[3];
+
+    /*
+     * Load the first tensor network file onto GPU 0.
+     */
+
+    TensorNetworkFile<CudaTensor<cuComplex, 0>> tensor_file_0;
+    try {
+        std::ifstream tn_data(file_name_0);
+        std::string circuit_str{std::istreambuf_iterator<char>(tn_data),
+                                std::istreambuf_iterator<char>()};
+        // Load data into TensorNetwork and PathInfo objects.
+        TensorNetworkSerializer<CudaTensor<cuComplex, 0>> serializer;
+        tensor_file_0 = serializer(circuit_str, true);
+    }
+    catch (...) {
+        std::cerr << "Please specify a valid first JSON file to contract"
+                  << std::endl;
+        exit(1);
+    }
+
+    TensorNetwork<CudaTensor<cuComplex, 0>> tn_0 = tensor_file_0.tensors;
+    PathInfo path_0 = tensor_file_0.path.value();
+
+    /*
+     * Load the second tensor network file onto GPU 1.
+     */
+
+    TensorNetworkFile<CudaTensor<cuComplex, 1>> tensor_file_1;
+    try {
+        std::ifstream tn_data(file_name_1);
+        std::string circuit_str{std::istreambuf_iterator<char>(tn_data),
+                                std::istreambuf_iterator<char>()};
+        // Load data into TensorNetwork and PathInfo objects.
+        TensorNetworkSerializer<CudaTensor<cuComplex, 1>> serializer;
+        tensor_file_1 = serializer(circuit_str, true);
+    }
+    catch (...) {
+        std::cerr << "Please specify a valid second JSON file to contract"
+                  << std::endl;
+        exit(1);
+    }
+
+    TensorNetwork<CudaTensor<cuComplex, 1>> tn_1 = tensor_file_1.tensors;
+    PathInfo path_1 = tensor_file_1.path.value();
+
+    /*
+     * Load the third tensor network file onto the CPU.
+     */
+
+    TensorNetworkFile<Tensor<std::complex<float>>> tensor_file_2;
+    try {
+        std::ifstream tn_data(file_name_2);
+        std::string circuit_str{std::istreambuf_iterator<char>(tn_data),
+                                std::istreambuf_iterator<char>()};
+        // Load data into TensorNetwork and PathInfo objects.
+        TensorNetworkSerializer<Tensor<std::complex<float>>> serializer;
+        tensor_file_2 = serializer(circuit_str, true);
+    }
+    catch (...) {
+        std::cerr << "Please specify a valid third JSON file to contract"
+                  << std::endl;
+        exit(1);
+    }
+    TensorNetwork<Tensor<std::complex<float>>> tn_2 =
+        tensor_file_2.tensors;
+    PathInfo path_2 = tensor_file_2.path.value();
+
+    tf::Taskflow taskflow;
+
+    /* Set up the GPU 0 contraction task. */
+    CudaflowContractionTask<cuComplex, 0> gpu_task_0;
+    AddCudaContractionToTaskflow(tn_0, path_0, taskflow,
+                                 gpu_task_0);
+
+    /* Set up the GPU 1 contraction task. */
+    CudaflowContractionTask<cuComplex, 1> gpu_task_1;
+    AddCudaContractionToTaskflow(tn_1, path_1, taskflow,
+                                 gpu_task_1);
+
+    /* Set up the CPU contraction task. */
+    TaskBasedContractor<Tensor<std::complex<float>>> contractor;
+    contractor.AddContractionTasks(tn_2, path_2);
+
+    // Add the GPU task graph to the CPU task graph.
+    contractor.AddTaskflow(taskflow);
+
+    /* Contract on all devices. */
+    contractor.Contract().wait();
+
+    /* Display the results. */
+    auto result0 = gpu_task_0.result;
+    std::cout << "GPU 0 result = " << result0[0].x << " " << result0[0].y
+              << std::endl;
+
+    auto result1 = gpu_task_1.result;
+    std::cout << "GPU 1 result = " << result1[0].x << " " << result1[0].y
+              << std::endl;
+
+    auto result2 = contractor.GetResults()[0];
+    std::cout << "CPU result = " << result2 << std::endl;
+
+    return 0;
+}
diff --git a/include/jet/CudaTensor.hpp b/include/jet/CudaTensor.hpp
index 081964d3..c3f9ec1e 100644
--- a/include/jet/CudaTensor.hpp
+++ b/include/jet/CudaTensor.hpp
@@ -19,13 +19,16 @@
 #include 
 #include 
 
+#include <taskflow/cudaflow.hpp>
+#include <taskflow/taskflow.hpp>
+
 namespace {
 using namespace Jet::CudaTensorHelpers;
 }
 
 namespace Jet {
 
-template <class T = cuComplex> class CudaTensor {
+template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
 
     static_assert(CudaTensorHelpers::is_supported_data_type<T>,
                   "CudaTensor supports cuComplex (float2) and cuDoubleComplex "
@@ -35,12 +38,12 @@ template <class T = cuComplex> class CudaTensor {
     using scalar_type_t = T;
     using scalar_type_t_precision = decltype(std::declval<T>().x);
 
-    template <class U = T>
-    static CudaTensor<U> AddTensors(const CudaTensor<U> &A,
-                                    const CudaTensor<U> &B)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D> AddTensors(const CudaTensor<U, D> &A,
+                                       const CudaTensor<U, D> &B)
     {
-
-        static const CudaTensor<U> zero;
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        static const CudaTensor<U, D> zero;
 
         // The zero tensor is used in reductions where the shape of an
         // accumulator is not known beforehand.
@@ -58,7 +61,7 @@ template <class T = cuComplex> class CudaTensor {
             disjoint_indices.empty(),
             "Tensor addition with disjoint indices is not supported.");
 
-        CudaTensor<U> C(A);
+        CudaTensor<U, D> C(A);
 
         // Align the underlying data vectors of `A` and `B`.
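         // (Here "aligning" means transposing `B` into `A`'s index ordering
         // when the two memory layouts differ; the cuTENSOR handle declared
         // below is used for those calls.)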
         cutensorHandle_t handle;
@@ -136,14 +139,16 @@ template <class T = cuComplex> class CudaTensor {
         return C;
     }
 
-    CudaTensor AddTensor(const CudaTensor &other) const
+    CudaTensor<T, CUDA_DEVICE>
+    AddTensor(const CudaTensor<T, CUDA_DEVICE> &other) const
     {
-        return AddTensors(*this, other);
+        return AddTensors<T, CUDA_DEVICE>(*this, other);
     }
 
     void InitIndicesAndShape(const std::vector<std::string> &indices,
                              const std::vector<size_t> &shape)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         Clear_();
         shape_ = shape;
         indices_ = indices;
@@ -160,6 +165,7 @@ template <class T = cuComplex> class CudaTensor {
 
     CudaTensor() : data_{nullptr}
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         T h_dat({.x = 0.0, .y = 0.0});
         JET_CUDA_IS_SUCCESS(
             cudaMalloc(reinterpret_cast<void **>(&data_), sizeof(T)));
@@ -178,6 +184,7 @@ template <class T = cuComplex> class CudaTensor {
                const std::vector<size_t> &shape, const std::vector<T> data)
         : CudaTensor(indices, shape)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(data_, data.data(),
                                        sizeof(T) * data.size(),
                                        cudaMemcpyHostToDevice));
@@ -187,6 +194,7 @@ template <class T = cuComplex> class CudaTensor {
                const std::vector<size_t> &shape, const T *data)
         : CudaTensor(indices, shape)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(
             data_, data, sizeof(T) * Jet::Utilities::ShapeToSize(shape),
             cudaMemcpyHostToDevice));
@@ -202,12 +210,17 @@ template <class T = cuComplex> class CudaTensor {
         InitIndicesAndShape(indices, shape);
     }
 
-    ~CudaTensor() { JET_CUDA_IS_SUCCESS(cudaFree(data_)); }
+    ~CudaTensor()
+    {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        JET_CUDA_IS_SUCCESS(cudaFree(data_));
+    }
 
-    template <class U = T>
-    static CudaTensor<U> ContractTensors(const CudaTensor<U> &a_tensor,
-                                         const CudaTensor<U> &b_tensor)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D> ContractTensors(const CudaTensor<U, D> &a_tensor,
+                                            const CudaTensor<U, D> &b_tensor)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         using namespace Utilities;
 
         auto &&left_indices =
@@ -234,7 +247,7 @@ template <class T = cuComplex> class CudaTensor {
             c_shape[i + left_indices.size()] =
                 b_tensor.GetIndexToDimension().at(right_indices[i]);
 
-        CudaTensor<U> c_tensor(c_indices, c_shape);
+        CudaTensor<U, D> c_tensor(c_indices, c_shape);
 
         CudaContractionPlan cplan;
 
@@ -244,9 +257,10 @@ template <class T = cuComplex> class CudaTensor {
         return c_tensor;
     }
 
-    CudaTensor ContractTensors(const CudaTensor &other) const
+    CudaTensor<T, CUDA_DEVICE>
+    ContractTensors(const CudaTensor<T, CUDA_DEVICE> &other) const
     {
-        return ContractTensors(*this, other);
+        return ContractTensors<T, CUDA_DEVICE>(*this, other);
     }
 
     const std::vector<std::string> &GetIndices() const { return indices_; }
@@ -282,10 +296,15 @@ template <class T = cuComplex> class CudaTensor {
         other.data_ = nullptr;
     }
 
-    CudaTensor(CudaTensor &&other) : data_{nullptr} { Move_(std::move(other)); }
+    CudaTensor(CudaTensor &&other) : data_{nullptr}
+    {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        Move_(std::move(other));
+    }
 
     CudaTensor(const CudaTensor &other) : data_{nullptr}
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         InitIndicesAndShape(other.GetIndices(), other.GetShape());
 
         JET_CUDA_IS_SUCCESS(cudaMemcpy(data_, other.GetData(),
@@ -296,6 +315,7 @@ template <class T = cuComplex> class CudaTensor {
     template <class CPUData>
     CudaTensor(const Tensor<CPUData> &other) : data_{nullptr}
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         static_assert(sizeof(CPUData) == sizeof(T),
                       "Size of CPU and GPU data types do not match.");
 
@@ -307,6 +327,7 @@ template <class T = cuComplex> class CudaTensor {
     template <class CPUData>
     CudaTensor &operator=(const Tensor<CPUData> &other)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         static_assert(sizeof(CPUData) == sizeof(T),
                       "Size of CPU and GPU data types do not match.");
 
@@ -319,6 +340,7 @@ template <class T = cuComplex> class CudaTensor {
 
     CudaTensor &operator=(const CudaTensor &other)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         if (this != &other) // not a self-assignment
         {
             InitIndicesAndShape(other.GetIndices(),
                                 other.GetShape());
@@ -338,18 +360,21 @@ template <class T = cuComplex> class CudaTensor {
 
     inline void CopyHostDataToGpu(T *host_tensor)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(
             data_, host_tensor, sizeof(T) * GetSize(), cudaMemcpyHostToDevice));
     }
 
     inline void CopyGpuDataToHost(T *host_tensor) const
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(
             host_tensor, data_, sizeof(T) * GetSize(), cudaMemcpyDeviceToHost));
     }
 
     inline void CopyGpuDataToGpu(T *host_tensor)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(host_tensor, data_,
                                        sizeof(T) * GetSize(),
                                        cudaMemcpyDeviceToDevice));
@@ -357,6 +382,7 @@ template <class T = cuComplex> class CudaTensor {
 
     inline void AsyncCopyHostDataToGpu(T *host_tensor, cudaStream_t stream = 0)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpyAsync(data_, host_tensor,
                                             sizeof(T) * GetSize(),
                                             cudaMemcpyHostToDevice, stream));
@@ -364,6 +390,7 @@ template <class T = cuComplex> class CudaTensor {
 
     inline void AsyncCopyGpuDataToHost(T *host_tensor, cudaStream_t stream = 0)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpyAsync(host_tensor, data_,
                                             sizeof(T) * GetSize(),
                                             cudaMemcpyDeviceToHost, stream));
@@ -376,6 +403,7 @@ template <class T = cuComplex> class CudaTensor {
 
     explicit operator Tensor<std::complex<scalar_type_t_precision>>() const
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         std::vector<std::complex<scalar_type_t_precision>> host_data(
             GetSize(), {0.0, 0.0});
 
@@ -393,6 +421,7 @@ template <class T = cuComplex> class CudaTensor {
      */
     void FillRandom(size_t seed)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         static curandGenerator_t rng;
         JET_CURAND_IS_SUCCESS(
             curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT));
@@ -408,6 +437,7 @@ template <class T = cuComplex> class CudaTensor {
      */
     void FillRandom()
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         static curandGenerator_t rng;
         JET_CURAND_IS_SUCCESS(
             curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT));
@@ -441,20 +471,23 @@ template <class T = cuComplex> class CudaTensor {
     }
 
     struct CudaContractionPlan {
         cutensorHandle_t handle;
         cutensorContractionPlan_t plan;
         size_t work_size;
         void *work;
 
-        ~CudaContractionPlan() { JET_CUDA_IS_SUCCESS(cudaFree(work)); }
+        ~CudaContractionPlan()
+        {
+            tf::cudaScopedDevice ctx(CUDA_DEVICE);
+            JET_CUDA_IS_SUCCESS(cudaFree(work));
+        }
     };
 
-    template <class U = T>
+    template <class U = T, int D = CUDA_DEVICE>
     static void GetCudaContractionPlan(CudaContractionPlan &cplan,
-                                       const CudaTensor<U> &a_tensor,
-                                       const CudaTensor<U> &b_tensor,
-                                       const CudaTensor<U> &c_tensor)
+                                       const CudaTensor<U, D> &a_tensor,
+                                       const CudaTensor<U, D> &b_tensor,
+                                       const CudaTensor<U, D> &c_tensor)
     {
         using namespace Jet::Utilities;
 
@@ -613,10 +646,10 @@ template <class T = cuComplex> class CudaTensor {
         cplan.work_size = work_size;
     }
 
-    template <class U = T>
-    static void ContractTensorsWithoutAllocation(const CudaTensor<U> &a,
-                                                 const CudaTensor<U> &b,
-                                                 CudaTensor<U> &c,
+    template <class U = T, int D = CUDA_DEVICE>
+    static void ContractTensorsWithoutAllocation(const CudaTensor<U, D> &a,
+                                                 const CudaTensor<U, D> &b,
+                                                 CudaTensor<U, D> &c,
                                                  CudaContractionPlan &c_plan,
                                                  cudaStream_t stream = 0)
     {
@@ -638,15 +671,16 @@ template <class T = cuComplex> class CudaTensor {
         JET_CUTENSOR_IS_SUCCESS(cutensor_err);
     }
 
-    template <class U = T>
-    static CudaTensor<U> Reshape(const CudaTensor<U> &old_tensor,
-                                 const std::vector<size_t> &new_shape)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D> Reshape(const CudaTensor<U, D> &old_tensor,
+                                    const std::vector<size_t> &new_shape)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_ABORT_IF_NOT(old_tensor.GetSize() ==
                              Jet::Utilities::ShapeToSize(new_shape),
                          "Size is inconsistent between tensors.");
 
-        CudaTensor<U> reshaped_tensor(new_shape);
+        CudaTensor<U, D> reshaped_tensor(new_shape);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(reshaped_tensor.data_, old_tensor.data_,
                                        sizeof(U) * old_tensor.GetSize(),
                                        cudaMemcpyDeviceToDevice));
@@ -667,10 +701,12 @@ template <class T = cuComplex> class CudaTensor {
      * @param new_indices New `%Tensor` index label ordering.
      * @return Transposed `%CudaTensor` object.
      */
-    template <class U = T>
-    static CudaTensor<U> Transpose(const CudaTensor<U> &tensor,
-                                   const std::vector<std::string> &new_indices)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D>
+    Transpose(const CudaTensor<U, D> &tensor,
+              const std::vector<std::string> &new_indices)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         using namespace Jet::Utilities;
 
         if (tensor.GetIndices() == new_indices)
@@ -689,7 +725,7 @@ template <class T = cuComplex> class CudaTensor {
 
         const std::vector<std::string> &old_indices = tensor.GetIndices();
 
         // 1. Allocate permuted tensor memory
-        CudaTensor<U> permuted_tensor(new_indices, output_shape);
+        CudaTensor<U, D> permuted_tensor(new_indices, output_shape);
 
         // 2. Initialise CuTensor runtime & setup necessary options
         cutensorHandle_t handle;
@@ -790,9 +826,9 @@ template <class T = cuComplex> class CudaTensor {
      * @param new_ordering New `%Tensor` index permutation.
      * @return Transposed `%Tensor` object.
      */
-    template <class U = T>
-    static CudaTensor<U> Transpose(const CudaTensor<U> &A,
-                                   const std::vector<size_t> &new_ordering)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D> Transpose(const CudaTensor<U, D> &A,
+                                      const std::vector<size_t> &new_ordering)
     {
         const size_t num_indices = A.GetIndices().size();
         JET_ABORT_IF_NOT(
@@ -806,33 +842,35 @@ template <class T = cuComplex> class CudaTensor {
             new_indices[i] = old_indices[new_ordering[i]];
         }
 
-        return Transpose(A, new_indices);
+        return Transpose<U, D>(A, new_indices);
     }
 
     /**
      * @brief Transposes the indices of the `%Tensor` object to a new ordering.
      *
     * @see Transpose(const Tensor&, const std::vector<size_t>&)
      */
-    CudaTensor Transpose(const std::vector<size_t> &new_ordering) const
+    CudaTensor<T, CUDA_DEVICE>
+    Transpose(const std::vector<size_t> &new_ordering) const
     {
-        return Transpose(*this, new_ordering);
+        return Transpose<T, CUDA_DEVICE>(*this, new_ordering);
    }
 
     /**
      * @brief Transposes the indices of the `%Tensor` object to a new ordering.
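      * The new index labels must be a permutation of the tensor's existing
      * labels.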
      *
      * @see Transpose(const Tensor&, const std::vector<std::string>&)
      */
-    CudaTensor Transpose(const std::vector<std::string> &new_indices) const
+    CudaTensor<T, CUDA_DEVICE>
+    Transpose(const std::vector<std::string> &new_indices) const
     {
-        return Transpose(*this, new_indices);
+        return Transpose<T, CUDA_DEVICE>(*this, new_indices);
     }
 
-    template <class U = T>
-    static CudaTensor<U> SliceIndex(const CudaTensor<U> &tens,
-                                    const std::string &index_str,
-                                    size_t index_value)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D> SliceIndex(const CudaTensor<U, D> &tens,
+                                       const std::string &index_str,
+                                       size_t index_value)
     {
-
+        tf::cudaScopedDevice ctx(D);
         std::vector<std::string> new_indices = tens.GetIndices();
         std::vector<std::string> old_indices = tens.GetIndices();
 
@@ -847,12 +885,12 @@ template <class T = cuComplex> class CudaTensor {
             std::swap(output_shape[offset], output_shape.back());
         }
 
-        CudaTensor<U> permuted_tensor = Transpose(tens, new_indices);
+        CudaTensor<U, D> permuted_tensor = Transpose<U, D>(tens, new_indices);
 
-        CudaTensor<U> sliced_tensor({permuted_tensor.GetIndices().begin(),
-                                     permuted_tensor.GetIndices().end() - 1},
-                                    {permuted_tensor.GetShape().begin(),
-                                     permuted_tensor.GetShape().end() - 1});
+        CudaTensor<U, D> sliced_tensor({permuted_tensor.GetIndices().begin(),
+                                        permuted_tensor.GetIndices().end() - 1},
+                                       {permuted_tensor.GetShape().begin(),
+                                        permuted_tensor.GetShape().end() - 1});
 
         const size_t ptr_offset =
             Jet::Utilities::ShapeToSize(sliced_tensor.GetShape());
@@ -869,7 +907,8 @@ template <class T = cuComplex> class CudaTensor {
         return sliced_tensor.Transpose(old_indices);
     }
 
-    CudaTensor SliceIndex(const std::string &index_str, size_t index_value)
+    CudaTensor<T, CUDA_DEVICE> SliceIndex(const std::string &index_str,
+                                          size_t index_value)
     {
         return SliceIndex(*this, index_str, index_value);
     }
@@ -899,7 +938,7 @@ template <class T = cuComplex> class CudaTensor {
     std::unordered_map<std::string, size_t> index_to_dimension_;
     std::unordered_map<std::string, size_t> index_to_axes_;
 
-    bool operator==(const CudaTensor &other) const noexcept
+    bool operator==(const CudaTensor<T, CUDA_DEVICE> &other) const noexcept
     {
         return shape_ == other.GetShape() && indices_ == other.GetIndices() &&
                index_to_dimension_ == other.GetIndexToDimension() &&
diff --git a/include/jet/GpuContractionTaskCreator.hpp b/include/jet/GpuContractionTaskCreator.hpp
deleted file mode 100644
index a9b81174..00000000
--- a/include/jet/GpuContractionTaskCreator.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-#include "CudaTensor.hpp"
-#include "PathInfo.hpp"
-#include "TensorNetwork.hpp"
-#include 
-#include 
-
-#include 
-
-#include 
-
-// module load cuda && module load gcc && nvcc -o Test_GpuTask
-// Test_GpuContractionTaskCreator.cpp -I./taskflow -I./cutt/include -L./cutt/lib
-// -lcutt -lcuda -std=c++14 --extended-lambda -lcublas
-namespace Jet {
-
-std::vector<int> ConvertSizeVecToIntVec(const std::vector<size_t> size_vec)
-{
-    std::vector<int> int_vec(size_vec.size());
-    for (int i = 0; i < int_vec.size(); i++) {
-        int_vec[i] = size_vec[i];
-    }
-    return int_vec;
-}
-
-template <typename CpuTensor>
-void CopyCpuTensorToGpuTensor(const CpuTensor &cpu_tensor,
-                              CudaTensor<cuComplex> &gpu_tensor)
-{
-    size_t gpu_size = gpu_tensor.GetSize();
-    cuComplex *cpu_tensor_data = new cuComplex[gpu_size];
-
-    for (size_t i = 0; i < gpu_size; i++) {
-        cpu_tensor_data[i].x = cpu_tensor[i].real();
-        cpu_tensor_data[i].y = cpu_tensor[i].imag();
-    }
-
-    gpu_tensor.CopyHostDataToGpu(cpu_tensor_data);
-    delete[] cpu_tensor_data;
-}
-
-template <typename CpuTensor>
-void CopyGpuTensorToCpuTensor(CudaTensor<cuComplex> &gpu_tensor,
-                              CpuTensor &cpu_tensor)
-{
-    size_t gpu_size = gpu_tensor.GetSize();
-    cuComplex *cpu_tensor_data = new cuComplex[gpu_size];
-
-    gpu_tensor.CopyGpuDataToHost(cpu_tensor_data);
-
-    for (int i = 0; i < gpu_size; i++) {
-        cpu_tensor[i] =
-            std::complex<float>(cpu_tensor_data[i].x, cpu_tensor_data[i].y);
-    }
-
-    delete[] cpu_tensor_data;
-}
-
-template <typename CpuTensor> class GpuContractionTaskCreator {
-
-  private:
-    std::vector<std::unique_ptr<CudaTensor<cuComplex>>> tensors_;
-    std::vector plans_;
-    std::vector<std::pair<size_t, size_t>> path_;
-    tf::Task task_;
-    size_t num_leafs_;
-
-  public:
-    GpuContractionTaskCreator(TensorNetwork<CpuTensor> &tn, PathInfo &path_info)
-    {
-        const std::vector<PathStepInfo> &path_node_info = path_info.GetSteps();
-        const std::vector<std::pair<size_t, size_t>> &path =
-            path_info.GetPath();
-        path_ = path;
-        const auto &nodes = tn.GetNodes();
-        size_t num_leafs = nodes.size();
-        bool store_transpose = true;
-        tensors_.resize(path_node_info.size());
-        plans_.resize(path.size());
-        num_leafs_ = num_leafs;
-
-        for (int i = 0; i < path.size(); i++) {
-
-            const PathStepInfo &pnia = path_node_info[path[i].first];
-            const PathStepInfo &pnib = path_node_info[path[i].second];
-            const PathStepInfo &pnic = path_node_info[num_leafs + i];
-
-            std::cout << "i = " << i << std::endl;
-
-            std::cout << "problem after 1" << std::endl;
-            std::cout << "pnia.shape = " << pnia.shape << std::endl;
-            std::cout << "pnia.tensor_indices = " << pnia.tensor_indices
-                      << std::endl;
-            std::cout << "tensors_.size() = " << tensors_.size() << std::endl;
-            std::cout << "path[i].first = " << path[i].first << std::endl;
-            tensors_[path[i].first] = std::make_unique<CudaTensor<cuComplex>>(
-                CudaTensor<cuComplex>());
-            tensors_[path[i].first].get()->SetIndicesShapeAndMemory(
-                pnia.tensor_indices, pnia.shape, store_transpose, -1);
-            std::cout << "problem after 2" << std::endl;
-
-            tensors_[path[i].second] = std::make_unique<CudaTensor<cuComplex>>(
-                CudaTensor<cuComplex>());
-            tensors_[path[i].second].get()->SetIndicesShapeAndMemory(
-                pnib.tensor_indices, pnib.shape, store_transpose, -1);
-
-            std::cout << "problem after 3" << std::endl;
-            tensors_[num_leafs + i] = std::make_unique<CudaTensor<cuComplex>>(
-                CudaTensor<cuComplex>());
-            tensors_[num_leafs + i].get()->SetIndicesShapeAndMemory(
-                pnic.tensor_indices, (pnic.shape), store_transpose, -1);
-
-            std::cout << "problem after 4" << std::endl;
-
-            if (pnia.id < num_leafs) {
-                CopyCpuTensorToGpuTensor(nodes[pnia.id].tensor,
-                                         *tensors_[path[i].first]);
-            }
-            std::cout << "problem after 5" << std::endl;
-
-            if (pnib.id < num_leafs) {
-                CopyCpuTensorToGpuTensor(nodes[pnib.id].tensor,
-                                         *tensors_[path[i].second]);
-            }
-            std::cout << "problem after 6" << std::endl;
-
-            plans_[i] = GetContractionPlan(*tensors_[path[i].first],
-                                           *tensors_[path[i].second]);
-        }
-    }
-
-    void AddContractionTask(tf::Taskflow &task_flow)
-    {
-        task_ = task_flow.emplace([this]() {
-            for (int i = 0; i < path_.size(); i++) {
-                Contract(*tensors_[path_[i].first], *tensors_[path_[i].second],
-                         *tensors_[num_leafs_ + i], plans_[i]);
-            }
-        });
-    }
-
-    CpuTensor GetResult()
-    {
-        CudaTensor<cuComplex> &gpu_tensor =
-            *tensors_[num_leafs_ + path_.size() - 1];
-        CpuTensor cpu_tensor(gpu_tensor.GetIndices, gpu_tensor.GetShape());
-        CopyGpuTensorToCpuTensor(gpu_tensor, cpu_tensor);
-    }
-};
-
-}; // namespace Jet
diff --git a/include/jet/TaskBasedContractor.hpp b/include/jet/TaskBasedContractor.hpp
index 28d2aba4..3eee3a41 100644
--- a/include/jet/TaskBasedContractor.hpp
+++ b/include/jet/TaskBasedContractor.hpp
@@ -122,6 +122,17 @@ template <class Tensor> class TaskBasedContractor {
      */
     const TaskFlow &GetTaskflow() const noexcept { return taskflow_; }
 
+    /**
+     * @brief Adds another Taskflow graph to this `%TaskBasedContractor`.
+     *
+     * @warning This function does not update the values returned by GetFlops()
+     * or GetMemory().
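+     *
+     * @note The given taskflow is composed into the contraction graph as a
+     * module task (via `tf::Taskflow::composed_of()`), so it must remain
+     * alive until `Contract()` has finished executing.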
+     */
+    void AddTaskflow(tf::Taskflow &taskflow) noexcept
+    {
+        taskflow_.composed_of(taskflow);
+    }
+
     /**
      * @brief Returns the number of floating-point operations needed to perform
      * all the contraction tasks (assuming the tensor elements are