From 3589078bad0f1f7b6fd6e623d29db006d60fd102 Mon Sep 17 00:00:00 2001
From: Trevor Vincent
Date: Wed, 21 Jul 2021 23:08:53 -0400
Subject: [PATCH] Add final touches (#50)

* Add final touches

* Fix typos

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update include/jet/CudaTensor.hpp

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update include/jet/TaskBasedContractor.hpp

Co-authored-by: Mikhail Andrenkov

* Update include/jet/CudaTensor.hpp

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Add PR corrections

* Update examples/heterogeneous_contraction.cu

Co-authored-by: Mikhail Andrenkov

* Fix two minor issues

Co-authored-by: Trevor Vincent
Co-authored-by: Mikhail Andrenkov
---
 examples/heterogeneous_contraction.cu     | 261 ++++++++++++++++++++++
 include/jet/CudaTensor.hpp                | 149 +++++++-----
 include/jet/GpuContractionTaskCreator.hpp | 150 -------------
 include/jet/TaskBasedContractor.hpp       |  11 +
 4 files changed, 366 insertions(+), 205 deletions(-)
 create mode 100644 examples/heterogeneous_contraction.cu
 delete mode 100644 include/jet/GpuContractionTaskCreator.hpp

diff --git a/examples/heterogeneous_contraction.cu b/examples/heterogeneous_contraction.cu
new file mode 100644
index 00000000..66b69dc9
--- /dev/null
+++ b/examples/heterogeneous_contraction.cu
@@ -0,0 +1,261 @@
+/**
+ * @file heterogeneous_contraction.cu
+ *
+ * @brief Contracts three tensor network files on two GPUs and one CPU
+ * simultaneously.
+ */
+
+#include <fstream>
+
+#include "CudaTensor.hpp"
+#include "PathInfo.hpp"
+#include "TaskBasedContractor.hpp"
+#include "Tensor.hpp"
+#include "TensorNetwork.hpp"
+#include "TensorNetworkIO.hpp"
+
+#include <taskflow/cudaflow.hpp>
+#include <taskflow/taskflow.hpp>
+
+using namespace Jet;
+
+template <class T, int device> struct CudaflowContractionTask {
+
+    std::vector<std::unique_ptr<CudaTensor<T, device>>> tensors;
+    std::vector<typename CudaTensor<T, device>::CudaContractionPlan> plans;
+    std::vector<tf::cudaTask> kernel_tasks;
+    std::vector<T> result;
+};
+
+template <class T, int device>
+void AddCudaContractionToTaskflow(
+    const TensorNetwork<CudaTensor<T, device>> &tn,
+    const PathInfo &path_info, tf::Taskflow &taskflow,
+    CudaflowContractionTask<T, device> &gpu_task)
+{
+    auto &tensors = gpu_task.tensors;
+    auto &plans = gpu_task.plans;
+    auto &result = gpu_task.result;
+    auto &kernel_tasks = gpu_task.kernel_tasks;
+
+    const auto &path_node_info = path_info.GetSteps();
+    const auto &path = path_info.GetPath();
+    const auto &nodes = tn.GetNodes();
+    size_t num_leafs = nodes.size();
+    tensors.resize(path_node_info.size());
+    plans.resize(path.size());
+
+    for (size_t i = 0; i < path.size(); i++) {
+
+        const PathStepInfo &pnia = path_node_info[path[i].first];
+        const PathStepInfo &pnib = path_node_info[path[i].second];
+        const PathStepInfo &pnic = path_node_info[num_leafs + i];
+
+        if (pnia.id >= num_leafs) {
+            tensors[path[i].first] =
+                std::make_unique<CudaTensor<T, device>>(
+                    CudaTensor<T, device>(pnia.tensor_indices,
+                                          pnia.shape));
+        }
+        else {
+            tensors[path[i].first] =
+                std::make_unique<CudaTensor<T, device>>(
+                    CudaTensor<T, device>(
+                        tn.GetNodes()[pnia.id].tensor));
+        }
+
+        if (pnib.id >= num_leafs) {
+            tensors[path[i].second] =
+                std::make_unique<CudaTensor<T, device>>(
+                    CudaTensor<T, device>(pnib.tensor_indices,
+                                          pnib.shape));
+        }
+        else {
+            tensors[path[i].second] =
+                std::make_unique<CudaTensor<T, device>>(
+                    CudaTensor<T, device>(
+                        tn.GetNodes()[pnib.id].tensor));
+        }
+
+        tensors[num_leafs + i] =
+            std::make_unique<CudaTensor<T, device>>(
+                CudaTensor<T, device>(pnic.tensor_indices, pnic.shape));
+
+        CudaTensor<T, device>::GetCudaContractionPlan(
+            plans[i], *tensors[path[i].first], *tensors[path[i].second],
+            *tensors[num_leafs + i]);
+    }
+
+    tf::Task task = taskflow.emplace_on(
+        [&, path, path_node_info, num_leafs](tf::cudaFlowCapturer &capturer) {
+            for (size_t i = 0; i < path.size(); i++) {
+
+                const PathStepInfo &pnia = path_node_info[path[i].first];
+                const PathStepInfo &pnib = path_node_info[path[i].second];
+                const PathStepInfo &pnic = path_node_info[num_leafs + i];
+
+                auto tensor_a = tensors[path[i].first]->GetData();
+                auto tensor_b = tensors[path[i].second]->GetData();
+                auto tensor_c = tensors[num_leafs + i]->GetData();
+
+                auto &c_plan = plans[i];
+                tf::cudaTask kernel =
+                    capturer.on([&, c_plan, tensor_a, tensor_b,
+                                 tensor_c](cudaStream_t stream) {
+                        cuComplex alpha;
+                        alpha.x = 1.;
+                        alpha.y = 0.;
+
+                        cuComplex beta;
+                        beta.x = 0.;
+                        beta.y = 0.;
+
+                        cutensorContraction(&c_plan.handle, &c_plan.plan,
+                                            &alpha, tensor_a, tensor_b, &beta,
+                                            tensor_c, tensor_c, c_plan.work,
+                                            c_plan.work_size, stream);
+                    });
+
+                kernel_tasks.push_back(kernel);
+
+                if (pnia.id >= num_leafs) {
+                    kernel_tasks[pnia.id - num_leafs].precede(kernel);
+                }
+
+                if (pnib.id >= num_leafs) {
+                    kernel_tasks[pnib.id - num_leafs].precede(kernel);
+                }
+
+                // Copy data from gpu_data to host_data.
+                if (i == path.size() - 1) {
+                    result.resize(tensors[pnic.id]->GetSize());
+                    tf::cudaTask d2h = capturer.memcpy(
+                        result.data(), tensors[pnic.id]->GetData(),
+                        tensors[pnic.id]->GetSize() * sizeof(cuComplex));
+
+                    kernel.precede(d2h);
+                }
+            }
+        },
+        device);
+}
+
+int main(int argc, char *argv[])
+{
+
+    if (argc != 4) {
+        std::cout << "heterogeneous_contraction.cu <tensor network file 1> "
+                     "<tensor network file 2> <tensor network file 3>"
+                  << std::endl;
+        std::cout << "Contracts three circuits on two GPUs and one CPU"
+                  << std::endl;
+        return 1;
+    }
+
+    std::string file_name_0 = argv[1];
+    std::string file_name_1 = argv[2];
+    std::string file_name_2 = argv[3];
+
+    /*
+     * Load the first tensor network file onto GPU 0.
+     */
+
+    TensorNetworkFile<CudaTensor<cuComplex, 0>> tensor_file_0;
+    try {
+        std::ifstream tn_data(file_name_0);
+        std::string circuit_str{std::istreambuf_iterator<char>(tn_data),
+                                std::istreambuf_iterator<char>()};
+        // Load data into TensorNetwork and PathInfo objects.
+        TensorNetworkSerializer<CudaTensor<cuComplex, 0>> serializer;
+        tensor_file_0 = serializer(circuit_str, true);
+    }
+    catch (...) {
+        std::cerr << "Please specify a valid first JSON file to contract"
+                  << std::endl;
+        exit(1);
+    }
+
+    TensorNetwork<CudaTensor<cuComplex, 0>> tn_0 = tensor_file_0.tensors;
+    PathInfo path_0 = tensor_file_0.path.value();
+
+    /*
+     * Load the second tensor network file onto GPU 1.
+     */
+
+    TensorNetworkFile<CudaTensor<cuComplex, 1>> tensor_file_1;
+    try {
+        std::ifstream tn_data(file_name_1);
+        std::string circuit_str{std::istreambuf_iterator<char>(tn_data),
+                                std::istreambuf_iterator<char>()};
+        // Load data into TensorNetwork and PathInfo objects.
+        TensorNetworkSerializer<CudaTensor<cuComplex, 1>> serializer;
+        tensor_file_1 = serializer(circuit_str, true);
+    }
+    catch (...) {
+        std::cerr << "Please specify a valid second JSON file to contract"
+                  << std::endl;
+        exit(1);
+    }
+
+    TensorNetwork<CudaTensor<cuComplex, 1>> tn_1 = tensor_file_1.tensors;
+    PathInfo path_1 = tensor_file_1.path.value();
+
+    /*
+     * Load the third tensor network file onto the CPU.
+     */
+
+    TensorNetworkFile<Tensor<std::complex<float>>> tensor_file_2;
+    try {
+        std::ifstream tn_data(file_name_2);
+        std::string circuit_str{std::istreambuf_iterator<char>(tn_data),
+                                std::istreambuf_iterator<char>()};
+        // Load data into TensorNetwork and PathInfo objects.
+        TensorNetworkSerializer<Tensor<std::complex<float>>> serializer;
+        tensor_file_2 = serializer(circuit_str, true);
+    }
+    catch (...) {
+        std::cerr << "Please specify a valid third JSON file to contract"
+                  << std::endl;
+        exit(1);
+    }
+    TensorNetwork<Tensor<std::complex<float>>> tn_2 =
+        tensor_file_2.tensors;
+    PathInfo path_2 = tensor_file_2.path.value();
+
+    tf::Taskflow taskflow;
+
+    /* Set up the GPU 0 contraction task. */
+    CudaflowContractionTask<cuComplex, 0> gpu_task_0;
+    AddCudaContractionToTaskflow(tn_0, path_0, taskflow,
+                                 gpu_task_0);
+
+    /* Set up the GPU 1 contraction task. */
+    CudaflowContractionTask<cuComplex, 1> gpu_task_1;
+    AddCudaContractionToTaskflow(tn_1, path_1, taskflow,
+                                 gpu_task_1);
+
+    /* Set up the CPU contraction task. */
+    TaskBasedContractor<Tensor<std::complex<float>>> contractor;
+    contractor.AddContractionTasks(tn_2, path_2);
+
+    // Add the GPU task graph to the CPU task graph.
+    contractor.AddTaskflow(taskflow);
+
+    /* Contract on all devices. */
+    contractor.Contract().wait();
+
+    /* Display the results. */
+    auto result0 = gpu_task_0.result;
+    std::cout << "GPU 0 result = " << result0[0].x << " " << result0[0].y
+              << std::endl;
+
+    auto result1 = gpu_task_1.result;
+    std::cout << "GPU 1 result = " << result1[0].x << " " << result1[0].y
+              << std::endl;
+
+    auto result2 = contractor.GetResults()[0];
+    std::cout << "CPU result = " << result2 << std::endl;
+
+    return 0;
+}
diff --git a/include/jet/CudaTensor.hpp b/include/jet/CudaTensor.hpp
index 081964d3..c3f9ec1e 100644
--- a/include/jet/CudaTensor.hpp
+++ b/include/jet/CudaTensor.hpp
@@ -19,13 +19,16 @@
 #include 
 #include 
 
+#include <taskflow/cudaflow.hpp>
+#include <taskflow/taskflow.hpp>
+
 namespace {
 using namespace Jet::CudaTensorHelpers;
 }
 
 namespace Jet {
 
-template <class T = cuComplex> class CudaTensor {
+template <class T = cuComplex, int CUDA_DEVICE = 0> class CudaTensor {
 
     static_assert(CudaTensorHelpers::is_supported_data_type<T>,
                   "CudaTensor supports cuComplex (float2) and cuDoubleComplex "
@@ -35,12 +38,12 @@ template <class T = cuComplex> class CudaTensor {
     using scalar_type_t = T;
     using scalar_type_t_precision = decltype(std::declval<T>().x);
 
-    template <class U = T>
-    static CudaTensor<U> AddTensors(const CudaTensor<U> &A,
-                                    const CudaTensor<U> &B)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D> AddTensors(const CudaTensor<U, D> &A,
+                                       const CudaTensor<U, D> &B)
     {
-
-        static const CudaTensor<U> zero;
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        static const CudaTensor<U, D> zero;
 
         // The zero tensor is used in reductions where the shape of an
         // accumulator is not known beforehand.
@@ -58,7 +61,7 @@ template <class T = cuComplex> class CudaTensor {
             disjoint_indices.empty(),
             "Tensor addition with disjoint indices is not supported.");
 
-        CudaTensor<U> C(A);
+        CudaTensor<U, D> C(A);
 
         // Align the underlying data vectors of `A` and `B`.
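         // (Here "aligning" means transposing `B` into `A`'s index ordering
         // when the two memory layouts differ; the cuTENSOR handle declared
         // below is used for those calls.)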
         cutensorHandle_t handle;
@@ -136,14 +139,16 @@ template <class T = cuComplex> class CudaTensor {
         return C;
     }
 
-    CudaTensor AddTensor(const CudaTensor &other) const
+    CudaTensor<T, CUDA_DEVICE>
+    AddTensor(const CudaTensor<T, CUDA_DEVICE> &other) const
     {
-        return AddTensors(*this, other);
+        return AddTensors<T, CUDA_DEVICE>(*this, other);
     }
 
     void InitIndicesAndShape(const std::vector<std::string> &indices,
                              const std::vector<size_t> &shape)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         Clear_();
         shape_ = shape;
         indices_ = indices;
@@ -160,6 +165,7 @@ template <class T = cuComplex> class CudaTensor {
 
     CudaTensor() : data_{nullptr}
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         T h_dat({.x = 0.0, .y = 0.0});
         JET_CUDA_IS_SUCCESS(
             cudaMalloc(reinterpret_cast<void **>(&data_), sizeof(T)));
@@ -178,6 +184,7 @@ template <class T = cuComplex> class CudaTensor {
                const std::vector<size_t> &shape, const std::vector<T> data)
         : CudaTensor(indices, shape)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(data_, data.data(),
                                        sizeof(T) * data.size(),
                                        cudaMemcpyHostToDevice));
@@ -187,6 +194,7 @@ template <class T = cuComplex> class CudaTensor {
                const std::vector<size_t> &shape, const T *data)
         : CudaTensor(indices, shape)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(
             data_, data, sizeof(T) * Jet::Utilities::ShapeToSize(shape),
             cudaMemcpyHostToDevice));
@@ -202,12 +210,17 @@ template <class T = cuComplex> class CudaTensor {
         InitIndicesAndShape(indices, shape);
     }
 
-    ~CudaTensor() { JET_CUDA_IS_SUCCESS(cudaFree(data_)); }
+    ~CudaTensor()
+    {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        JET_CUDA_IS_SUCCESS(cudaFree(data_));
+    }
 
-    template <class U = T>
-    static CudaTensor<U> ContractTensors(const CudaTensor<U> &a_tensor,
-                                         const CudaTensor<U> &b_tensor)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D> ContractTensors(const CudaTensor<U, D> &a_tensor,
+                                            const CudaTensor<U, D> &b_tensor)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         using namespace Utilities;
 
         auto &&left_indices =
@@ -234,7 +247,7 @@ template <class T = cuComplex> class CudaTensor {
             c_shape[i + left_indices.size()] =
                 b_tensor.GetIndexToDimension().at(right_indices[i]);
 
-        CudaTensor<U> c_tensor(c_indices, c_shape);
+        CudaTensor<U, D> c_tensor(c_indices, c_shape);
 
         CudaContractionPlan cplan;
 
@@ -244,9 +257,10 @@ template <class T = cuComplex> class CudaTensor {
         return c_tensor;
     }
 
-    CudaTensor ContractTensors(const CudaTensor &other) const
+    CudaTensor<T, CUDA_DEVICE>
+    ContractTensors(const CudaTensor<T, CUDA_DEVICE> &other) const
     {
-        return ContractTensors(*this, other);
+        return ContractTensors<T, CUDA_DEVICE>(*this, other);
     }
 
     const std::vector<std::string> &GetIndices() const { return indices_; }
@@ -282,10 +296,15 @@ template <class T = cuComplex> class CudaTensor {
         other.data_ = nullptr;
     }
 
-    CudaTensor(CudaTensor &&other) : data_{nullptr} { Move_(std::move(other)); }
+    CudaTensor(CudaTensor &&other) : data_{nullptr}
+    {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
+        Move_(std::move(other));
+    }
 
     CudaTensor(const CudaTensor &other) : data_{nullptr}
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         InitIndicesAndShape(other.GetIndices(), other.GetShape());
 
         JET_CUDA_IS_SUCCESS(cudaMemcpy(data_, other.GetData(),
@@ -296,6 +315,7 @@ template <class T = cuComplex> class CudaTensor {
     template <class CPUData>
     CudaTensor(const Tensor<CPUData> &other) : data_{nullptr}
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         static_assert(sizeof(CPUData) == sizeof(T),
                       "Size of CPU and GPU data types do not match.");
 
@@ -307,6 +327,7 @@ template <class T = cuComplex> class CudaTensor {
     template <class CPUData>
     CudaTensor &operator=(const Tensor<CPUData> &other)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         static_assert(sizeof(CPUData) == sizeof(T),
                       "Size of CPU and GPU data types do not match.");
 
@@ -319,6 +340,7 @@ template <class T = cuComplex> class CudaTensor {
 
     CudaTensor &operator=(const CudaTensor &other)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         if (this != &other) // not a self-assignment
         {
             InitIndicesAndShape(other.GetIndices(),
                                 other.GetShape());
@@ -338,18 +360,21 @@ template <class T = cuComplex> class CudaTensor {
 
     inline void CopyHostDataToGpu(T *host_tensor)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(
             data_, host_tensor, sizeof(T) * GetSize(), cudaMemcpyHostToDevice));
     }
 
     inline void CopyGpuDataToHost(T *host_tensor) const
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(
             host_tensor, data_, sizeof(T) * GetSize(), cudaMemcpyDeviceToHost));
     }
 
     inline void CopyGpuDataToGpu(T *host_tensor)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(host_tensor, data_,
                                        sizeof(T) * GetSize(),
                                        cudaMemcpyDeviceToDevice));
@@ -357,6 +382,7 @@ template <class T = cuComplex> class CudaTensor {
 
     inline void AsyncCopyHostDataToGpu(T *host_tensor, cudaStream_t stream = 0)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpyAsync(data_, host_tensor,
                                             sizeof(T) * GetSize(),
                                             cudaMemcpyHostToDevice, stream));
@@ -364,6 +390,7 @@ template <class T = cuComplex> class CudaTensor {
 
     inline void AsyncCopyGpuDataToHost(T *host_tensor, cudaStream_t stream = 0)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_CUDA_IS_SUCCESS(cudaMemcpyAsync(host_tensor, data_,
                                             sizeof(T) * GetSize(),
                                             cudaMemcpyDeviceToHost, stream));
@@ -376,6 +403,7 @@ template <class T = cuComplex> class CudaTensor {
 
     explicit operator Tensor<std::complex<scalar_type_t_precision>>() const
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         std::vector<std::complex<scalar_type_t_precision>> host_data(
             GetSize(), {0.0, 0.0});
 
@@ -393,6 +421,7 @@ template <class T = cuComplex> class CudaTensor {
      */
     void FillRandom(size_t seed)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         static curandGenerator_t rng;
         JET_CURAND_IS_SUCCESS(
             curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT));
@@ -408,6 +437,7 @@ template <class T = cuComplex> class CudaTensor {
      */
     void FillRandom()
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         static curandGenerator_t rng;
         JET_CURAND_IS_SUCCESS(
             curandCreateGenerator(&rng, CURAND_RNG_PSEUDO_DEFAULT));
@@ -441,20 +471,23 @@ template <class T = cuComplex> class CudaTensor {
     }
 
     struct CudaContractionPlan {
         cutensorHandle_t handle;
         cutensorContractionPlan_t plan;
         size_t work_size;
         void *work;
 
-        ~CudaContractionPlan() { JET_CUDA_IS_SUCCESS(cudaFree(work)); }
+        ~CudaContractionPlan()
+        {
+            tf::cudaScopedDevice ctx(CUDA_DEVICE);
+            JET_CUDA_IS_SUCCESS(cudaFree(work));
+        }
     };
 
-    template <class U = T>
+    template <class U = T, int D = CUDA_DEVICE>
     static void GetCudaContractionPlan(CudaContractionPlan &cplan,
-                                       const CudaTensor<U> &a_tensor,
-                                       const CudaTensor<U> &b_tensor,
-                                       const CudaTensor<U> &c_tensor)
+                                       const CudaTensor<U, D> &a_tensor,
+                                       const CudaTensor<U, D> &b_tensor,
+                                       const CudaTensor<U, D> &c_tensor)
     {
         using namespace Jet::Utilities;
 
@@ -613,10 +646,10 @@ template <class T = cuComplex> class CudaTensor {
         cplan.work_size = work_size;
     }
 
-    template <class U = T>
-    static void ContractTensorsWithoutAllocation(const CudaTensor<U> &a,
-                                                 const CudaTensor<U> &b,
-                                                 CudaTensor<U> &c,
+    template <class U = T, int D = CUDA_DEVICE>
+    static void ContractTensorsWithoutAllocation(const CudaTensor<U, D> &a,
+                                                 const CudaTensor<U, D> &b,
+                                                 CudaTensor<U, D> &c,
                                                  CudaContractionPlan &c_plan,
                                                  cudaStream_t stream = 0)
     {
@@ -638,15 +671,16 @@ template <class T = cuComplex> class CudaTensor {
         JET_CUTENSOR_IS_SUCCESS(cutensor_err);
     }
 
-    template <class U = T>
-    static CudaTensor<U> Reshape(const CudaTensor<U> &old_tensor,
-                                 const std::vector<size_t> &new_shape)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D> Reshape(const CudaTensor<U, D> &old_tensor,
+                                    const std::vector<size_t> &new_shape)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         JET_ABORT_IF_NOT(old_tensor.GetSize() ==
                              Jet::Utilities::ShapeToSize(new_shape),
                          "Size is inconsistent between tensors.");
 
-        CudaTensor<U> reshaped_tensor(new_shape);
+        CudaTensor<U, D> reshaped_tensor(new_shape);
         JET_CUDA_IS_SUCCESS(cudaMemcpy(reshaped_tensor.data_, old_tensor.data_,
                                        sizeof(U) * old_tensor.GetSize(),
                                        cudaMemcpyDeviceToDevice));
@@ -667,10 +701,12 @@ template <class T = cuComplex> class CudaTensor {
      * @param new_indices New `%Tensor` index label ordering.
      * @return Transposed `%CudaTensor` object.
      */
-    template <class U = T>
-    static CudaTensor<U> Transpose(const CudaTensor<U> &tensor,
-                                   const std::vector<std::string> &new_indices)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D>
+    Transpose(const CudaTensor<U, D> &tensor,
+              const std::vector<std::string> &new_indices)
     {
+        tf::cudaScopedDevice ctx(CUDA_DEVICE);
         using namespace Jet::Utilities;
 
         if (tensor.GetIndices() == new_indices)
@@ -689,7 +725,7 @@ template <class T = cuComplex> class CudaTensor {
 
         const std::vector<std::string> &old_indices = tensor.GetIndices();
 
         // 1. Allocate permuted tensor memory
-        CudaTensor<U> permuted_tensor(new_indices, output_shape);
+        CudaTensor<U, D> permuted_tensor(new_indices, output_shape);
 
         // 2. Initialise CuTensor runtime & setup necessary options
         cutensorHandle_t handle;
@@ -790,9 +826,9 @@ template <class T = cuComplex> class CudaTensor {
      * @param new_ordering New `%Tensor` index permutation.
      * @return Transposed `%Tensor` object.
      */
-    template <class U = T>
-    static CudaTensor<U> Transpose(const CudaTensor<U> &A,
-                                   const std::vector<size_t> &new_ordering)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D> Transpose(const CudaTensor<U, D> &A,
+                                      const std::vector<size_t> &new_ordering)
     {
         const size_t num_indices = A.GetIndices().size();
         JET_ABORT_IF_NOT(
@@ -806,33 +842,35 @@ template <class T = cuComplex> class CudaTensor {
             new_indices[i] = old_indices[new_ordering[i]];
         }
 
-        return Transpose(A, new_indices);
+        return Transpose<U, D>(A, new_indices);
     }
 
     /**
      * @brief Transposes the indices of the `%Tensor` object to a new ordering.
      *
     * @see Transpose(const Tensor&, const std::vector<size_t>&)
      */
-    CudaTensor Transpose(const std::vector<size_t> &new_ordering) const
+    CudaTensor<T, CUDA_DEVICE>
+    Transpose(const std::vector<size_t> &new_ordering) const
     {
-        return Transpose(*this, new_ordering);
+        return Transpose<T, CUDA_DEVICE>(*this, new_ordering);
    }
 
     /**
      * @brief Transposes the indices of the `%Tensor` object to a new ordering.
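      * The new index labels must be a permutation of the tensor's existing
      * labels.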
      *
      * @see Transpose(const Tensor&, const std::vector<std::string>&)
      */
-    CudaTensor Transpose(const std::vector<std::string> &new_indices) const
+    CudaTensor<T, CUDA_DEVICE>
+    Transpose(const std::vector<std::string> &new_indices) const
     {
-        return Transpose(*this, new_indices);
+        return Transpose<T, CUDA_DEVICE>(*this, new_indices);
     }
 
-    template <class U = T>
-    static CudaTensor<U> SliceIndex(const CudaTensor<U> &tens,
-                                    const std::string &index_str,
-                                    size_t index_value)
+    template <class U = T, int D = CUDA_DEVICE>
+    static CudaTensor<U, D> SliceIndex(const CudaTensor<U, D> &tens,
+                                       const std::string &index_str,
+                                       size_t index_value)
     {
-
+        tf::cudaScopedDevice ctx(D);
         std::vector<std::string> new_indices = tens.GetIndices();
         std::vector<std::string> old_indices = tens.GetIndices();
 
@@ -847,12 +885,12 @@ template <class T = cuComplex> class CudaTensor {
             std::swap(output_shape[offset], output_shape.back());
         }
 
-        CudaTensor<U> permuted_tensor = Transpose(tens, new_indices);
+        CudaTensor<U, D> permuted_tensor = Transpose<U, D>(tens, new_indices);
 
-        CudaTensor<U> sliced_tensor({permuted_tensor.GetIndices().begin(),
-                                     permuted_tensor.GetIndices().end() - 1},
-                                    {permuted_tensor.GetShape().begin(),
-                                     permuted_tensor.GetShape().end() - 1});
+        CudaTensor<U, D> sliced_tensor({permuted_tensor.GetIndices().begin(),
+                                        permuted_tensor.GetIndices().end() - 1},
+                                       {permuted_tensor.GetShape().begin(),
+                                        permuted_tensor.GetShape().end() - 1});
 
         const size_t ptr_offset =
             Jet::Utilities::ShapeToSize(sliced_tensor.GetShape());
@@ -869,7 +907,8 @@ template <class T = cuComplex> class CudaTensor {
         return sliced_tensor.Transpose(old_indices);
     }
 
-    CudaTensor SliceIndex(const std::string &index_str, size_t index_value)
+    CudaTensor<T, CUDA_DEVICE> SliceIndex(const std::string &index_str,
+                                          size_t index_value)
     {
         return SliceIndex(*this, index_str, index_value);
     }
@@ -899,7 +938,7 @@ template <class T = cuComplex> class CudaTensor {
     std::unordered_map<std::string, size_t> index_to_dimension_;
     std::unordered_map<std::string, size_t> index_to_axes_;
 
-    bool operator==(const CudaTensor &other) const noexcept
+    bool operator==(const CudaTensor<T, CUDA_DEVICE> &other) const noexcept
     {
         return shape_ == other.GetShape() && indices_ == other.GetIndices() &&
                index_to_dimension_ == other.GetIndexToDimension() &&
diff --git a/include/jet/GpuContractionTaskCreator.hpp b/include/jet/GpuContractionTaskCreator.hpp
deleted file mode 100644
index a9b81174..00000000
--- a/include/jet/GpuContractionTaskCreator.hpp
+++ /dev/null
@@ -1,150 +0,0 @@
-#include "CudaTensor.hpp"
-#include "PathInfo.hpp"
-#include "TensorNetwork.hpp"
-#include 
-#include 
-
-#include 
-
-#include 
-
-// module load cuda && module load gcc && nvcc -o Test_GpuTask
-// Test_GpuContractionTaskCreator.cpp -I./taskflow -I./cutt/include -L./cutt/lib
-// -lcutt -lcuda -std=c++14 --extended-lambda -lcublas
-namespace Jet {
-
-std::vector<int> ConvertSizeVecToIntVec(const std::vector<size_t> size_vec)
-{
-    std::vector<int> int_vec(size_vec.size());
-    for (int i = 0; i < int_vec.size(); i++) {
-        int_vec[i] = size_vec[i];
-    }
-    return int_vec;
-}
-
-template <typename CpuTensor>
-void CopyCpuTensorToGpuTensor(const CpuTensor &cpu_tensor,
-                              CudaTensor<cuComplex> &gpu_tensor)
-{
-    size_t gpu_size = gpu_tensor.GetSize();
-    cuComplex *cpu_tensor_data = new cuComplex[gpu_size];
-
-    for (size_t i = 0; i < gpu_size; i++) {
-        cpu_tensor_data[i].x = cpu_tensor[i].real();
-        cpu_tensor_data[i].y = cpu_tensor[i].imag();
-    }
-
-    gpu_tensor.CopyHostDataToGpu(cpu_tensor_data);
-    delete[] cpu_tensor_data;
-}
-
-template <typename CpuTensor>
-void CopyGpuTensorToCpuTensor(CudaTensor<cuComplex> &gpu_tensor,
-                              CpuTensor &cpu_tensor)
-{
-    size_t gpu_size = gpu_tensor.GetSize();
-    cuComplex *cpu_tensor_data = new cuComplex[gpu_size];
-
-    gpu_tensor.CopyGpuDataToHost(cpu_tensor_data);
-
-    for (int i = 0; i < gpu_size; i++) {
-        cpu_tensor[i] =
-            std::complex<float>(cpu_tensor_data[i].x, cpu_tensor_data[i].y);
-    }
-
-    delete[] cpu_tensor_data;
-}
-
-template <typename CpuTensor> class GpuContractionTaskCreator {
-
-  private:
-    std::vector<std::unique_ptr<CudaTensor<cuComplex>>> tensors_;
-    std::vector plans_;
-    std::vector<std::pair<size_t, size_t>> path_;
-    tf::Task task_;
-    size_t num_leafs_;
-
-  public:
-    GpuContractionTaskCreator(TensorNetwork<CpuTensor> &tn, PathInfo &path_info)
-    {
-        const std::vector<PathStepInfo> &path_node_info = path_info.GetSteps();
-        const std::vector<std::pair<size_t, size_t>> &path =
-            path_info.GetPath();
-        path_ = path;
-        const auto &nodes = tn.GetNodes();
-        size_t num_leafs = nodes.size();
-        bool store_transpose = true;
-        tensors_.resize(path_node_info.size());
-        plans_.resize(path.size());
-        num_leafs_ = num_leafs;
-
-        for (int i = 0; i < path.size(); i++) {
-
-            const PathStepInfo &pnia = path_node_info[path[i].first];
-            const PathStepInfo &pnib = path_node_info[path[i].second];
-            const PathStepInfo &pnic = path_node_info[num_leafs + i];
-
-            std::cout << "i = " << i << std::endl;
-
-            std::cout << "problem after 1" << std::endl;
-            std::cout << "pnia.shape = " << pnia.shape << std::endl;
-            std::cout << "pnia.tensor_indices = " << pnia.tensor_indices
-                      << std::endl;
-            std::cout << "tensors_.size() = " << tensors_.size() << std::endl;
-            std::cout << "path[i].first = " << path[i].first << std::endl;
-            tensors_[path[i].first] = std::make_unique<CudaTensor<cuComplex>>(
-                CudaTensor<cuComplex>());
-            tensors_[path[i].first].get()->SetIndicesShapeAndMemory(
-                pnia.tensor_indices, pnia.shape, store_transpose, -1);
-            std::cout << "problem after 2" << std::endl;
-
-            tensors_[path[i].second] = std::make_unique<CudaTensor<cuComplex>>(
-                CudaTensor<cuComplex>());
-            tensors_[path[i].second].get()->SetIndicesShapeAndMemory(
-                pnib.tensor_indices, pnib.shape, store_transpose, -1);
-
-            std::cout << "problem after 3" << std::endl;
-            tensors_[num_leafs + i] = std::make_unique<CudaTensor<cuComplex>>(
-                CudaTensor<cuComplex>());
-            tensors_[num_leafs + i].get()->SetIndicesShapeAndMemory(
-                pnic.tensor_indices, (pnic.shape), store_transpose, -1);
-
-            std::cout << "problem after 4" << std::endl;
-
-            if (pnia.id < num_leafs) {
-                CopyCpuTensorToGpuTensor(nodes[pnia.id].tensor,
-                                         *tensors_[path[i].first]);
-            }
-            std::cout << "problem after 5" << std::endl;
-
-            if (pnib.id < num_leafs) {
-                CopyCpuTensorToGpuTensor(nodes[pnib.id].tensor,
-                                         *tensors_[path[i].second]);
-            }
-            std::cout << "problem after 6" << std::endl;
-
-            plans_[i] = GetContractionPlan(*tensors_[path[i].first],
-                                           *tensors_[path[i].second]);
-        }
-    }
-
-    void AddContractionTask(tf::Taskflow &task_flow)
-    {
-        task_ = task_flow.emplace([this]() {
-            for (int i = 0; i < path_.size(); i++) {
-                Contract(*tensors_[path_[i].first], *tensors_[path_[i].second],
-                         *tensors_[num_leafs_ + i], plans_[i]);
-            }
-        });
-    }
-
-    CpuTensor GetResult()
-    {
-        CudaTensor<cuComplex> &gpu_tensor =
-            *tensors_[num_leafs_ + path_.size() - 1];
-        CpuTensor cpu_tensor(gpu_tensor.GetIndices, gpu_tensor.GetShape());
-        CopyGpuTensorToCpuTensor(gpu_tensor, cpu_tensor);
-    }
-};
-
-}; // namespace Jet
diff --git a/include/jet/TaskBasedContractor.hpp b/include/jet/TaskBasedContractor.hpp
index 28d2aba4..3eee3a41 100644
--- a/include/jet/TaskBasedContractor.hpp
+++ b/include/jet/TaskBasedContractor.hpp
@@ -122,6 +122,17 @@ template <class Tensor> class TaskBasedContractor {
      */
     const TaskFlow &GetTaskflow() const noexcept { return taskflow_; }
 
+    /**
+     * @brief Adds another Taskflow graph to this `%TaskBasedContractor`.
+     *
+     * @warning This function does not update the values returned by GetFlops()
+     * or GetMemory().
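+     *
+     * @note The given taskflow is composed into the contraction graph as a
+     * module task (via `tf::Taskflow::composed_of()`), so it must remain
+     * alive until `Contract()` has finished executing.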
+     */
+    void AddTaskflow(tf::Taskflow &taskflow) noexcept
+    {
+        taskflow_.composed_of(taskflow);
+    }
+
     /**
      * @brief Returns the number of floating-point operations needed to perform
      * all the contraction tasks (assuming the tensor elements are