Skip to content

Commit

Permalink
Perf: optimize psir_dot function in gint_rho_gpu.cu (#4326)
Browse files Browse the repository at this point in the history
* remove new in gtask_rho.cpp

* rename some header files

* optimize psir_dot in gint_rho_gpu.cu

* fix a memory leak

* remove redundant clear()

* fix a memory leak

* modify cuda_tools.cu

* modify some header files

* modify synchronization operation

* modify psir_dot

* modify psir_dot
  • Loading branch information
dzzz2001 authored Jun 7, 2024
1 parent 51bdd59 commit 7f8adc8
Show file tree
Hide file tree
Showing 19 changed files with 97 additions and 163 deletions.
6 changes: 3 additions & 3 deletions source/module_hamilt_lcao/module_gint/gint.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
#include "gint.h"

#if ((defined __CUDA))
#include "gint_force.h"
#include "gint_rho.h"
#include "gint_vl.h"
#include "gint_force_gpu.h"
#include "gint_rho_gpu.h"
#include "gint_vl_gpu.h"
#endif

#include "module_base/memory.h"
Expand Down
5 changes: 3 additions & 2 deletions source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
#include <fstream>
#include <sstream>

#include "gint_force.h"
#include "gint_force_gpu.h"
#include "kernels/cuda/cuda_tools.cuh"
#include "kernels/cuda/gint_force.cuh"
#include "module_base/ylm.h"
#include "module_hamilt_lcao/module_gint/gint_tools.h"
#include "gint_tools.h"

namespace GintKernel
{

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#ifndef GINT_FORCE_H
#define GINT_FORCE_H
#ifndef GINT_FORCE_GPU_H
#define GINT_FORCE_GPU_H

#include "module_hamilt_lcao/module_gint/gint.h"
#include "module_hamilt_lcao/module_gint/grid_technique.h"
Expand Down
63 changes: 17 additions & 46 deletions source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#include "kernels/cuda/cuda_tools.cuh"
#include "module_base/ylm.h"
#include "module_hamilt_lcao/module_gint/gint_rho.h"
#include "module_hamilt_lcao/module_gint/gint_tools.h"
#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh"
#include "gint_rho_gpu.h"
#include "gint_tools.h"
#include "kernels/cuda/gint_rho.cuh"
#include "omp.h"

#include <omp.h>
Expand Down Expand Up @@ -68,7 +68,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
{
checkCuda(cudaStreamSynchronize(gridt.streams[i]));
}

// calculate the rho for every nbz bigcells

#pragma omp parallel for num_threads(gridt.nstreams) collapse(2)
Expand All @@ -78,6 +77,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
{
// get stream id
int stream_num = omp_get_thread_num();
checkCuda(cudaStreamSynchronize(gridt.streams[stream_num]));

// psi_input contains data used to generate the psi values.
// The suffix "_g" indicates that the data is stored in the GPU,
Expand Down Expand Up @@ -154,16 +154,9 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
double* rho_g = gridt.rho_g;

// variables for dot product psir * psir_dm
int dot_count = 0;
int* vec_len = &gridt.vec_len[gridt.num_mcell * stream_num];
double** vec_l = &gridt.vec_l[gridt.num_mcell * stream_num];
double** vec_r = &gridt.vec_r[gridt.num_mcell * stream_num];
double** dot_product
= &gridt.dot_product[gridt.num_mcell * stream_num];

int* vec_len_g = &gridt.vec_len_g[gridt.num_mcell * stream_num];
double** vec_l_g = &gridt.vec_l_g[gridt.num_mcell * stream_num];
double** vec_r_g = &gridt.vec_r_g[gridt.num_mcell * stream_num];
double** dot_product_g
= &gridt.dot_product_g[gridt.num_mcell * stream_num];

Expand All @@ -172,7 +165,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
int atom_pair_num = 0;
const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp;
std::vector<bool> gpu_matrix_cal_flag(max_size * gridt.nbzp,false);
checkCuda(cudaStreamSynchronize(gridt.streams[stream_num]));

// generate GPU tasks, including the calculation of psir, matrix
// multiplication, and dot product
Expand Down Expand Up @@ -211,11 +203,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
max_n,
atom_pair_num,
rho_g,
vec_l,
vec_r,
dot_product,
vec_len,
dot_count);
dot_product);

// Copying data from host to device
checkCuda(cudaMemcpyAsync(input_double_g,
Expand Down Expand Up @@ -286,21 +274,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
cudaMemcpyHostToDevice,
gridt.streams[stream_num]));

checkCuda(cudaMemcpyAsync(vec_len_g,
vec_len,
gridt.num_mcell * sizeof(int),
cudaMemcpyHostToDevice,
gridt.streams[stream_num]));
checkCuda(cudaMemcpyAsync(vec_l_g,
vec_l,
gridt.num_mcell * sizeof(double*),
cudaMemcpyHostToDevice,
gridt.streams[stream_num]));
checkCuda(cudaMemcpyAsync(vec_r_g,
vec_r,
gridt.num_mcell * sizeof(double*),
cudaMemcpyHostToDevice,
gridt.streams[stream_num]));
checkCuda(cudaMemcpyAsync(dot_product_g,
dot_product,
gridt.num_mcell * sizeof(double*),
Expand Down Expand Up @@ -352,20 +325,20 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
atom_pair_num,
gridt.streams[stream_num],
ap_alpha_g);
checkCudaLastError();

// Launching kernel to calculate dot product psir * psir_dm
dim3 grid_dot(64);
dim3 block_dot(64);
int incx = 1;
int incy = 1;
psir_dot<<<grid_dot, block_dot, 0, gridt.streams[stream_num]>>>(
vec_len_g,
vec_l_g,
incx,
vec_r_g,
incy,
dot_product_g,
dot_count);
const int block_size = 128;
dim3 block_dot(block_size);
dim3 grid_dot(gridt.nbzp, gridt.bxyz);
psir_dot<<<grid_dot, block_dot, sizeof(double) * block_size, gridt.streams[stream_num]>>>(
gridt.nbzp,
gridt.bxyz,
max_size * ucell.nwmax,
psir_ylm_left_g,
psir_r_g,
dot_product_g);
checkCudaLastError();
}
}

Expand All @@ -374,13 +347,11 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
{
checkCuda(cudaStreamSynchronize(gridt.streams[i]));
}

// Copy rho from device to host
checkCuda(cudaMemcpy(rho,
gridt.rho_g,
nczp * gridt.ncx * gridt.ncy * sizeof(double),
cudaMemcpyDeviceToHost));

// free the memory
checkCuda(cudaFree(dm_matrix_g));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
#include "module_hamilt_lcao/module_gint/gint.h"
#include "module_hamilt_lcao/module_gint/grid_technique.h"

cudaError_t checkCuda(cudaError_t result);
namespace GintKernel
{

Expand Down Expand Up @@ -90,11 +89,7 @@ void gtask_rho(const Grid_Technique& gridt,
* @param max_n Maximum value of n.
* @param atom_pair_num Total count of atom pairs, which is also the number of matrix multiplication operations.
* @param rho_g Rho.
* @param vec_l Pointers to psir_ylm for vector dot product.
* @param vec_r Pointers to psir_dm for vector dot product.
* @param dot_product Pointers to the results of dot products.
* @param vec_len Vector lengths for each dot product.
* @param dot_count Total count of dot products.
*/
void alloc_mult_dot_rho(const Grid_Technique& gridt,
const UnitCell& ucell,
Expand All @@ -120,11 +115,7 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt,
int& max_n,
int& atom_pair_num,
double* rho_g,
double** vec_l,
double** vec_r,
double** dot_product,
int* vec_len,
int& dot_count);
double** dot_product);

} // namespace GintKernel
#endif
6 changes: 3 additions & 3 deletions source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,9 @@

#include "kernels/cuda/cuda_tools.cuh"
#include "module_base/ylm.h"
#include "module_hamilt_lcao/module_gint/gint_tools.h"
#include "module_hamilt_lcao/module_gint/gint_vl.h"
#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cuh"
#include "gint_tools.h"
#include "gint_vl_gpu.h"
#include "kernels/cuda/gint_vl.cuh"

namespace GintKernel
{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
#ifndef GINT_VL_H
#define GINT_VL_H
#ifndef GINT_VL_GPU_H
#define GINT_VL_GPU_H
#include <cublas_v2.h>
#include <cuda.h> // for CUDA_VERSION
#include <cuda_runtime.h>

#include "module_hamilt_lcao/module_gint/gint.h"
#include "module_hamilt_lcao/module_gint/grid_technique.h"

cudaError_t checkCuda(cudaError_t result);

namespace GintKernel
{

Expand Down
24 changes: 5 additions & 19 deletions source/module_hamilt_lcao/module_gint/grid_technique.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ void Grid_Technique::set_pbc_grid(const int& ncx_in,
#if ((defined __CUDA) /* || (defined __ROCM) */)
if(GlobalV::device_flag == "gpu")
{
this->init_gpu_gint_variables(ucell,orb,num_stream);
this->init_gpu_gint_variables(ucell, orb, num_stream);
}
#endif

Expand Down Expand Up @@ -923,22 +923,11 @@ void Grid_Technique::init_gpu_gint_variables(const UnitCell& ucell,const LCAO_Or
num_mcell = nbzp * bxyz;
checkCudaErrors(cudaMalloc((void**)&rho_g, this->ncxyz * sizeof(double)));
checkCudaErrors(cudaMemset(rho_g, 0, this->ncxyz * sizeof(double)));
checkCudaErrors(
cudaMallocHost((void**)&vec_l, num_mcell * nstreams * sizeof(double*)));
checkCudaErrors(
cudaMalloc((void**)&vec_l_g, num_mcell * nstreams * sizeof(double*)));
checkCudaErrors(
cudaMallocHost((void**)&vec_r, num_mcell * nstreams * sizeof(double*)));
checkCudaErrors(
cudaMalloc((void**)&vec_r_g, num_mcell * nstreams * sizeof(double*)));
checkCudaErrors(cudaMallocHost((void**)&dot_product,
num_mcell * nstreams * sizeof(double*)));
checkCudaErrors(cudaMalloc((void**)&dot_product_g,
num_mcell * nstreams * sizeof(double*)));
checkCudaErrors(
cudaMallocHost((void**)&vec_len, num_mcell * nstreams * sizeof(int)));
checkCudaErrors(
cudaMalloc((void**)&vec_len_g, num_mcell * nstreams * sizeof(int)));


for (int i = 0; i < nstreams; ++i)
{
Expand All @@ -961,7 +950,10 @@ void Grid_Technique::free_gpu_gint_variables(int nat)
return;
}
for (int i = 0; i < nstreams; ++i)
{
checkCudaErrors(cudaStreamDestroy(streams[i]));
}
delete[] streams;

checkCudaErrors(cudaFree(ylmcoef_g));
checkCudaErrors(cudaFree(atom_nwl_g));
Expand Down Expand Up @@ -1020,14 +1012,8 @@ void Grid_Technique::free_gpu_gint_variables(int nat)
checkCudaErrors(cudaFree(dm_global_g));
checkCudaErrors(cudaFree(ap_output_gbl_g));

checkCudaErrors(cudaFreeHost(vec_len));
checkCudaErrors(cudaFreeHost(vec_l));
checkCudaErrors(cudaFreeHost(vec_r));
checkCudaErrors(cudaFreeHost(dot_product));

checkCudaErrors(cudaFree(vec_len_g));
checkCudaErrors(cudaFree(vec_l_g));
checkCudaErrors(cudaFree(vec_r_g));
checkCudaErrors(cudaFree(dot_product_g));
checkCudaErrors(cudaFree(rho_g));

Expand Down
6 changes: 0 additions & 6 deletions source/module_hamilt_lcao/module_gint/grid_technique.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,12 +218,6 @@ class Grid_Technique : public Grid_MeshBall
// additional variables for rho calculating
int num_mcell;
double* rho_g;
int* vec_len;
int* vec_len_g;
double** vec_l;
double** vec_l_g;
double** vec_r;
double** vec_r_g;
double** dot_product;
double** dot_product_g;

Expand Down
3 changes: 1 addition & 2 deletions source/module_hamilt_lcao/module_gint/gtask_force.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include <omp.h>

#include "gint_force.h"
#include "gint_force_gpu.h"
#include "module_base/ylm.h"
#include "module_hamilt_lcao/module_gint/gint_tools.h"
namespace GintKernel
Expand Down Expand Up @@ -213,7 +213,6 @@ void alloc_mult_force(const Grid_Technique& gridt,
}
}
atom_pair_num = tid;
gpu_mat_cal_flag.clear();
}


Expand Down
18 changes: 5 additions & 13 deletions source/module_hamilt_lcao/module_gint/gtask_rho.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include "gint_rho.h"
#include "gint_rho_gpu.h"
#include "module_base/ylm.h"
#include "module_hamilt_lcao/module_gint/gint_tools.h"
#include "omp.h"
Expand Down Expand Up @@ -118,13 +118,10 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt,
int& max_n,
int& atom_pair_num,
double* rho_g,
double** vec_l,
double** vec_r,
double** dot_product,
int* vec_len,
int& dot_count)
double** dot_product)
{
int tid = 0;
int dot_count = 0;
max_m = 0;
max_n = 0;
const int nwmax=ucell.nwmax;
Expand Down Expand Up @@ -203,18 +200,13 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt,
gridt.ncy * nczp);
for (int i = 0; i < gridt.bxyz; i++)
{
vec_l[dot_count]
= psir_ylm_g + (bcell_start_psir + i * max_size * nwmax);
vec_r[dot_count]
= psir_dm_g + (bcell_start_psir + i * max_size * nwmax);
dot_product[dot_count] = rho_g + vindex[i];
vec_len[dot_count] = nwmax * max_size;
dot_count++;
}

delete[] vindex;
}
atom_pair_num = tid;

gpu_mat_cal_flag.clear();
}

} // namespace GintKernel
4 changes: 1 addition & 3 deletions source/module_hamilt_lcao/module_gint/gtask_vl.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include <omp.h>

#include "gint_vl.h"
#include "gint_vl_gpu.h"
#include "module_base/ylm.h"
#include "module_hamilt_lcao/module_gint/gint_tools.h"
namespace GintKernel
Expand Down Expand Up @@ -197,8 +197,6 @@ void alloc_mult_vlocal(const Grid_Technique& gridt,
}
}
}

gpu_matrix_calc_flag.clear();
}

} // namespace GintKernel
Loading

0 comments on commit 7f8adc8

Please sign in to comment.