Perf: optimize psir_dot function in gint_rho_gpu.cu (#4326)

* remove new in gtask_rho.cpp * rename some header files * optimize psir_dot in gint_rho_gpu.cu * fix a memory leak * remove redundant clear() * fix a memory leak * modify cuda_tools.cu * modify some header files * modify synchronization operation * modify psir_dot * modify psir_dot
deepmodeling · Jun 7, 2024 · 7f8adc8 · 7f8adc8
1 parent 51bdd59
commit 7f8adc8
Show file tree

Hide file tree

Showing 19 changed files with 97 additions and 163 deletions.
diff --git a/source/module_hamilt_lcao/module_gint/gint.cpp b/source/module_hamilt_lcao/module_gint/gint.cpp
@@ -1,9 +1,9 @@
 #include "gint.h"
 
 #if ((defined __CUDA))
-#include "gint_force.h"
-#include "gint_rho.h"
-#include "gint_vl.h"
+#include "gint_force_gpu.h"
+#include "gint_rho_gpu.h"
+#include "gint_vl_gpu.h"
 #endif
 
 #include "module_base/memory.h"

diff --git a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
@@ -3,11 +3,12 @@
 #include <fstream>
 #include <sstream>
 
-#include "gint_force.h"
+#include "gint_force_gpu.h"
 #include "kernels/cuda/cuda_tools.cuh"
 #include "kernels/cuda/gint_force.cuh"
 #include "module_base/ylm.h"
-#include "module_hamilt_lcao/module_gint/gint_tools.h"
+#include "gint_tools.h"
+
 namespace GintKernel
 {
 

diff --git a/...dule_hamilt_lcao/module_gint/gint_force.h → ..._hamilt_lcao/module_gint/gint_force_gpu.h b/...dule_hamilt_lcao/module_gint/gint_force.h → ..._hamilt_lcao/module_gint/gint_force_gpu.h
@@ -1,5 +1,5 @@
-#ifndef GINT_FORCE_H
-#define GINT_FORCE_H
+#ifndef GINT_FORCE_GPU_H
+#define GINT_FORCE_GPU_H
 
 #include "module_hamilt_lcao/module_gint/gint.h"
 #include "module_hamilt_lcao/module_gint/grid_technique.h"

diff --git a/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_rho_gpu.cu
@@ -1,8 +1,8 @@
 #include "kernels/cuda/cuda_tools.cuh"
 #include "module_base/ylm.h"
-#include "module_hamilt_lcao/module_gint/gint_rho.h"
-#include "module_hamilt_lcao/module_gint/gint_tools.h"
-#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_rho.cuh"
+#include "gint_rho_gpu.h"
+#include "gint_tools.h"
+#include "kernels/cuda/gint_rho.cuh"
 #include "omp.h"
 
 #include <omp.h>
@@ -68,7 +68,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
     {
         checkCuda(cudaStreamSynchronize(gridt.streams[i]));
     }
-
     // calculate the rho for every nbz bigcells
 
 #pragma omp parallel for num_threads(gridt.nstreams) collapse(2)
@@ -78,6 +77,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
         {
             // get stream id
             int stream_num = omp_get_thread_num();
+            checkCuda(cudaStreamSynchronize(gridt.streams[stream_num]));
 
             // psi_input contains data used to generate the psi values.
             // The suffix "_g" indicates that the data is stored in the GPU,
@@ -154,16 +154,9 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
             double* rho_g = gridt.rho_g;
 
             // variables for dot product psir * psir_dm
-            int dot_count = 0;
-            int* vec_len = &gridt.vec_len[gridt.num_mcell * stream_num];
-            double** vec_l = &gridt.vec_l[gridt.num_mcell * stream_num];
-            double** vec_r = &gridt.vec_r[gridt.num_mcell * stream_num];
             double** dot_product
                 = &gridt.dot_product[gridt.num_mcell * stream_num];
 
-            int* vec_len_g = &gridt.vec_len_g[gridt.num_mcell * stream_num];
-            double** vec_l_g = &gridt.vec_l_g[gridt.num_mcell * stream_num];
-            double** vec_r_g = &gridt.vec_r_g[gridt.num_mcell * stream_num];
             double** dot_product_g
                 = &gridt.dot_product_g[gridt.num_mcell * stream_num];
 
@@ -172,7 +165,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
             int atom_pair_num = 0;
             const int grid_index_ij = i * gridt.nby * gridt.nbzp + j * gridt.nbzp;
             std::vector<bool> gpu_matrix_cal_flag(max_size * gridt.nbzp,false);
-            checkCuda(cudaStreamSynchronize(gridt.streams[stream_num]));
 
             // generate GPU tasks, including the calculation of psir, matrix
             // multiplication, and dot product
@@ -211,11 +203,7 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
                             max_n,
                             atom_pair_num,
                             rho_g,
-                            vec_l,
-                            vec_r,
-                            dot_product,
-                            vec_len,
-                            dot_count);
+                            dot_product);
 
             // Copying data from host to device
             checkCuda(cudaMemcpyAsync(input_double_g,
@@ -286,21 +274,6 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
                                       cudaMemcpyHostToDevice,
                                       gridt.streams[stream_num]));
 
-            checkCuda(cudaMemcpyAsync(vec_len_g,
-                                      vec_len,
-                                      gridt.num_mcell * sizeof(int),
-                                      cudaMemcpyHostToDevice,
-                                      gridt.streams[stream_num]));
-            checkCuda(cudaMemcpyAsync(vec_l_g,
-                                      vec_l,
-                                      gridt.num_mcell * sizeof(double*),
-                                      cudaMemcpyHostToDevice,
-                                      gridt.streams[stream_num]));
-            checkCuda(cudaMemcpyAsync(vec_r_g,
-                                      vec_r,
-                                      gridt.num_mcell * sizeof(double*),
-                                      cudaMemcpyHostToDevice,
-                                      gridt.streams[stream_num]));
             checkCuda(cudaMemcpyAsync(dot_product_g,
                                       dot_product,
                                       gridt.num_mcell * sizeof(double*),
@@ -352,20 +325,20 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
                                      atom_pair_num,
                                      gridt.streams[stream_num],
                                      ap_alpha_g);
+            checkCudaLastError();
 
             // Launching kernel to calculate dot product psir * psir_dm
-            dim3 grid_dot(64);
-            dim3 block_dot(64);
-            int incx = 1;
-            int incy = 1;
-            psir_dot<<<grid_dot, block_dot, 0, gridt.streams[stream_num]>>>(
-                vec_len_g,
-                vec_l_g,
-                incx,
-                vec_r_g,
-                incy,
-                dot_product_g,
-                dot_count);
+            const int block_size = 128;
+            dim3 block_dot(block_size);
+            dim3 grid_dot(gridt.nbzp, gridt.bxyz);
+            psir_dot<<<grid_dot, block_dot, sizeof(double) * block_size, gridt.streams[stream_num]>>>(
+                gridt.nbzp,
+                gridt.bxyz,
+                max_size * ucell.nwmax,
+                psir_ylm_left_g,
+                psir_r_g,
+                dot_product_g);
+            checkCudaLastError();
         }
     }
 
@@ -374,13 +347,11 @@ void gint_gamma_rho_gpu(const hamilt::HContainer<double>* dm,
     {
         checkCuda(cudaStreamSynchronize(gridt.streams[i]));
     }
-
     // Copy rho from device to host
     checkCuda(cudaMemcpy(rho,
                          gridt.rho_g,
                          nczp * gridt.ncx * gridt.ncy * sizeof(double),
                          cudaMemcpyDeviceToHost));
-
     // free the memory
     checkCuda(cudaFree(dm_matrix_g));
 }

diff --git a/...module_hamilt_lcao/module_gint/gint_rho.h → ...le_hamilt_lcao/module_gint/gint_rho_gpu.h b/...module_hamilt_lcao/module_gint/gint_rho.h → ...le_hamilt_lcao/module_gint/gint_rho_gpu.h
@@ -7,7 +7,6 @@
 #include "module_hamilt_lcao/module_gint/gint.h"
 #include "module_hamilt_lcao/module_gint/grid_technique.h"
 
-cudaError_t checkCuda(cudaError_t result);
 namespace GintKernel
 {
 
@@ -90,11 +89,7 @@ void gtask_rho(const Grid_Technique& gridt,
  * @param max_n Maximum value of n.
  * @param atom_pair_num Total count of atom pairs, which is also the number of matrix multiplication operations.
  * @param rho_g Rho.
- * @param vec_l Pointers to psir_ylm for vector dot product.
- * @param vec_r Pointers to psir_dm for vector dot product.
  * @param dot_product Pointers to the results of dot products.
- * @param vec_len Vector lengths for each dot product.
- * @param dot_count Total count of dot products.
  */
 void alloc_mult_dot_rho(const Grid_Technique& gridt,
                         const UnitCell& ucell,
@@ -120,11 +115,7 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt,
                         int& max_n,
                         int& atom_pair_num,
                         double* rho_g,
-                        double** vec_l,
-                        double** vec_r,
-                        double** dot_product,
-                        int* vec_len,
-                        int& dot_count);
+                        double** dot_product);
 
 } // namespace GintKernel
 #endif
diff --git a/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_vl_gpu.cu
@@ -2,9 +2,9 @@
 
 #include "kernels/cuda/cuda_tools.cuh"
 #include "module_base/ylm.h"
-#include "module_hamilt_lcao/module_gint/gint_tools.h"
-#include "module_hamilt_lcao/module_gint/gint_vl.h"
-#include "module_hamilt_lcao/module_gint/kernels/cuda/gint_vl.cuh"
+#include "gint_tools.h"
+#include "gint_vl_gpu.h"
+#include "kernels/cuda/gint_vl.cuh"
 
 namespace GintKernel
 {

diff --git a/.../module_hamilt_lcao/module_gint/gint_vl.h → ...ule_hamilt_lcao/module_gint/gint_vl_gpu.h b/.../module_hamilt_lcao/module_gint/gint_vl.h → ...ule_hamilt_lcao/module_gint/gint_vl_gpu.h
@@ -1,14 +1,12 @@
-#ifndef GINT_VL_H
-#define GINT_VL_H
+#ifndef GINT_VL_GPU_H
+#define GINT_VL_GPU_H
 #include <cublas_v2.h>
 #include <cuda.h> // for CUDA_VERSION
 #include <cuda_runtime.h>
 
 #include "module_hamilt_lcao/module_gint/gint.h"
 #include "module_hamilt_lcao/module_gint/grid_technique.h"
 
-cudaError_t checkCuda(cudaError_t result);
-
 namespace GintKernel
 {
 

diff --git a/source/module_hamilt_lcao/module_gint/grid_technique.cpp b/source/module_hamilt_lcao/module_gint/grid_technique.cpp
@@ -185,7 +185,7 @@ void Grid_Technique::set_pbc_grid(const int& ncx_in,
 #if ((defined __CUDA) /* || (defined __ROCM) */)
     if(GlobalV::device_flag == "gpu")
     {
-        this->init_gpu_gint_variables(ucell,orb,num_stream);
+        this->init_gpu_gint_variables(ucell, orb, num_stream);
     }
 #endif
 
@@ -923,22 +923,11 @@ void Grid_Technique::init_gpu_gint_variables(const UnitCell& ucell,const LCAO_Or
     num_mcell = nbzp * bxyz;
     checkCudaErrors(cudaMalloc((void**)&rho_g, this->ncxyz * sizeof(double)));
     checkCudaErrors(cudaMemset(rho_g, 0, this->ncxyz * sizeof(double)));
-    checkCudaErrors(
-        cudaMallocHost((void**)&vec_l, num_mcell * nstreams * sizeof(double*)));
-    checkCudaErrors(
-        cudaMalloc((void**)&vec_l_g, num_mcell * nstreams * sizeof(double*)));
-    checkCudaErrors(
-        cudaMallocHost((void**)&vec_r, num_mcell * nstreams * sizeof(double*)));
-    checkCudaErrors(
-        cudaMalloc((void**)&vec_r_g, num_mcell * nstreams * sizeof(double*)));
     checkCudaErrors(cudaMallocHost((void**)&dot_product,
                                    num_mcell * nstreams * sizeof(double*)));
     checkCudaErrors(cudaMalloc((void**)&dot_product_g,
                                num_mcell * nstreams * sizeof(double*)));
-    checkCudaErrors(
-        cudaMallocHost((void**)&vec_len, num_mcell * nstreams * sizeof(int)));
-    checkCudaErrors(
-        cudaMalloc((void**)&vec_len_g, num_mcell * nstreams * sizeof(int)));
+
 
     for (int i = 0; i < nstreams; ++i)
     {
@@ -961,7 +950,10 @@ void Grid_Technique::free_gpu_gint_variables(int nat)
         return;
     }
     for (int i = 0; i < nstreams; ++i)
+    {
         checkCudaErrors(cudaStreamDestroy(streams[i]));
+    }
+    delete[] streams;
 
     checkCudaErrors(cudaFree(ylmcoef_g));
     checkCudaErrors(cudaFree(atom_nwl_g));
@@ -1020,14 +1012,8 @@ void Grid_Technique::free_gpu_gint_variables(int nat)
     checkCudaErrors(cudaFree(dm_global_g));
     checkCudaErrors(cudaFree(ap_output_gbl_g));
 
-    checkCudaErrors(cudaFreeHost(vec_len));
-    checkCudaErrors(cudaFreeHost(vec_l));
-    checkCudaErrors(cudaFreeHost(vec_r));
     checkCudaErrors(cudaFreeHost(dot_product));
 
-    checkCudaErrors(cudaFree(vec_len_g));
-    checkCudaErrors(cudaFree(vec_l_g));
-    checkCudaErrors(cudaFree(vec_r_g));
     checkCudaErrors(cudaFree(dot_product_g));
     checkCudaErrors(cudaFree(rho_g));
 

diff --git a/source/module_hamilt_lcao/module_gint/grid_technique.h b/source/module_hamilt_lcao/module_gint/grid_technique.h
@@ -218,12 +218,6 @@ class Grid_Technique : public Grid_MeshBall
     // additional variables for rho calculating
     int num_mcell;
     double* rho_g;
-    int* vec_len;
-    int* vec_len_g;
-    double** vec_l;
-    double** vec_l_g;
-    double** vec_r;
-    double** vec_r_g;
     double** dot_product;
     double** dot_product_g;
 

diff --git a/source/module_hamilt_lcao/module_gint/gtask_force.cpp b/source/module_hamilt_lcao/module_gint/gtask_force.cpp
@@ -1,6 +1,6 @@
 #include <omp.h>
 
-#include "gint_force.h"
+#include "gint_force_gpu.h"
 #include "module_base/ylm.h"
 #include "module_hamilt_lcao/module_gint/gint_tools.h"
 namespace GintKernel
@@ -213,7 +213,6 @@ void alloc_mult_force(const Grid_Technique& gridt,
         }
     }
     atom_pair_num = tid;
-    gpu_mat_cal_flag.clear();
 }
 
 

diff --git a/source/module_hamilt_lcao/module_gint/gtask_rho.cpp b/source/module_hamilt_lcao/module_gint/gtask_rho.cpp
@@ -1,4 +1,4 @@
-#include "gint_rho.h"
+#include "gint_rho_gpu.h"
 #include "module_base/ylm.h"
 #include "module_hamilt_lcao/module_gint/gint_tools.h"
 #include "omp.h"
@@ -118,13 +118,10 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt,
                         int& max_n,
                         int& atom_pair_num,
                         double* rho_g,
-                        double** vec_l,
-                        double** vec_r,
-                        double** dot_product,
-                        int* vec_len,
-                        int& dot_count)
+                        double** dot_product)
 {
     int tid = 0;
+    int dot_count = 0;
     max_m = 0;
     max_n = 0;
     const int nwmax=ucell.nwmax;
@@ -203,18 +200,13 @@ void alloc_mult_dot_rho(const Grid_Technique& gridt,
                                              gridt.ncy * nczp);
         for (int i = 0; i < gridt.bxyz; i++)
         {
-            vec_l[dot_count]
-                = psir_ylm_g + (bcell_start_psir + i * max_size * nwmax);
-            vec_r[dot_count]
-                = psir_dm_g + (bcell_start_psir + i * max_size * nwmax);
             dot_product[dot_count] = rho_g + vindex[i];
-            vec_len[dot_count] = nwmax * max_size;
             dot_count++;
         }
+
+        delete[] vindex;
     }
     atom_pair_num = tid;
-
-    gpu_mat_cal_flag.clear();
 }
 
 } // namespace GintKernel
diff --git a/source/module_hamilt_lcao/module_gint/gtask_vl.cpp b/source/module_hamilt_lcao/module_gint/gtask_vl.cpp
@@ -1,6 +1,6 @@
 #include <omp.h>
 
-#include "gint_vl.h"
+#include "gint_vl_gpu.h"
 #include "module_base/ylm.h"
 #include "module_hamilt_lcao/module_gint/gint_tools.h"
 namespace GintKernel
@@ -197,8 +197,6 @@ void alloc_mult_vlocal(const Grid_Technique& gridt,
             }
         }
     }
-
-    gpu_matrix_calc_flag.clear();
 }
 
 } // namespace GintKernel