TABLE OF CONTENTS
- ABINIT/alloc_hamilt_gpu
- ABINIT/alloc_nonlop_gpu_data
- ABINIT/dealloc_hamilt_gpu
- ABINIT/dealloc_nonlop_gpu_data
- ABINIT/m_alloc_hamilt_gpu
ABINIT/alloc_hamilt_gpu [ Functions ]
NAME
alloc_hamilt_gpu
FUNCTION
allocate several memory pieces on a GPU device for the application of Hamiltonian using a GPU
INPUTS
atindx1(natom)=index table for atoms, inverse of atindx (see gstate.f) dtset <type(dataset_type)>=all input variables for this dataset gprimd(3,3)=dimensional reciprocal space primitive translations mpi_enreg=information about MPI parallelization nattyp(ntypat)= # atoms of each type. option=0: allocate data for local operator (FFT) 1: allocate data for nonlocal operator 2: allocate both psps <type(pseudopotential_type)>=variables related to pseudopotentials gpu_option= GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)
OUTPUT
(no output - only allocation on GPU)
SOURCE
96 subroutine alloc_hamilt_gpu(atindx1,dtset,gprimd,mpi_enreg,nattyp,npwarr,option,psps,gpu_option) 97 98 !Arguments ------------------------------------ 99 !scalars 100 integer,intent(in) :: option,gpu_option 101 type(dataset_type),intent(in) :: dtset 102 type(MPI_type),intent(in) :: mpi_enreg 103 type(pseudopotential_type),intent(in) :: psps 104 !arrays 105 integer,intent(in) :: atindx1(dtset%natom),nattyp(dtset%ntypat),npwarr(dtset%nkpt) 106 real(dp),intent(in) :: gprimd(3,3) 107 108 !Local variables------------------------------- 109 !scalars 110 #if defined HAVE_GPU 111 integer :: dimekb1_max,dimekb2_max,dimffnl_max,ierr,ikpt,npw_max_loc,npw_max_nonloc 112 integer :: cplex 113 integer ::npwarr_tmp(dtset%nkpt) 114 115 integer(kind=c_int32_t) :: proj_dim(3) 116 #endif 117 118 ! ************************************************************************* 119 120 if (gpu_option==ABI_GPU_DISABLED) return 121 122 #if defined(HAVE_GPU) 123 !=== Local Hamiltonian === 124 if (option==0.or.option==2) then 125 ! Compute max of total planes waves 126 npw_max_loc=0 127 if(mpi_enreg%paral_kgb==1) then 128 npwarr_tmp=npwarr 129 call xmpi_sum(npwarr_tmp,mpi_enreg%comm_bandfft,ierr) 130 npw_max_loc =maxval(npwarr_tmp) 131 else 132 npw_max_loc=dtset%mpw 133 end if 134 ! Initialize gpu data needed in fourwf 135 ! ndat=bandpp when paral_kgb=1 136 ! no matter paral_kgb=0 or 1, we gathet all bands into a single call gpu_fourwf 137 if(gpu_option == ABI_GPU_LEGACY) then 138 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING) 139 call alloc_gpu_fourwf(dtset%ngfft,dtset%bandpp,npw_max_loc,npw_max_loc) 140 #endif 141 else if (gpu_option == ABI_GPU_KOKKOS) then 142 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING) 143 call alloc_gpu_fourwf_managed(dtset%ngfft,dtset%bandpp,npw_max_loc,npw_max_loc) 144 #endif 145 else if (gpu_option == ABI_GPU_OPENMP) then 146 call alloc_ompgpu_fourwf(dtset%ngfft,dtset%bandpp) 147 end if 148 149 end if 150 !=== Nonlocal Hamiltonian === 151 if (option==1.or.option==2) then 152 ! Compute max of total planes waves 153 npw_max_nonloc=0 154 if(mpi_enreg%paral_kgb==1) then 155 npwarr_tmp=npwarr 156 call xmpi_sum(npwarr_tmp,mpi_enreg%comm_band,ierr) 157 npw_max_nonloc =maxval(npwarr_tmp) 158 else 159 npw_max_nonloc=dtset%mpw 160 end if 161 ! Initialize all gpu data needed in nonlop 162 dimffnl_max=4 163 ! if (abs(dtset%berryopt) == 5) dimffnl_max=4 164 dimekb1_max=psps%dimekb 165 if (dtset%usepaw==0) dimekb2_max=psps%ntypat 166 if (dtset%usepaw==1) dimekb2_max=dtset%natom 167 168 if (gpu_option == ABI_GPU_KOKKOS .or. gpu_option == ABI_GPU_LEGACY) then 169 170 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING) 171 ! TODO (PK) : disable this allocation when Kokkos is available 172 ! to save memory on GPU side 173 call alloc_nonlop_gpu(npw_max_nonloc, & 174 & npw_max_nonloc,& 175 & dtset%nspinor,& 176 & dtset%natom,& 177 & dtset%ntypat,& 178 & psps%lmnmax,& 179 & psps%indlmn,& 180 & nattyp,& 181 & atindx1,& 182 & gprimd,& 183 & dimffnl_max,& 184 & dimffnl_max,& 185 & dimekb1_max,& 186 & dimekb2_max) 187 188 if (dtset%use_gemm_nonlop == 1) then 189 call alloc_nonlop_gpu_data(dtset, & 190 & psps, & 191 & npw_max_nonloc,& 192 & npw_max_nonloc,& 193 & atindx1,& 194 & nattyp,& 195 & gpu_option) 196 end if 197 #endif 198 199 end if 200 201 end if ! option 1 or 2 202 203 call xmpi_barrier(mpi_enreg%comm_cell) 204 205 #else 206 207 ABI_UNUSED(npwarr) 208 ABI_UNUSED_A(psps) 209 if (.false.) then 210 write(std_out,*) atindx1(1),dtset%natom,gprimd(1,1),mpi_enreg%me,nattyp(1),option 211 end if 212 213 #endif 214 215 end subroutine alloc_hamilt_gpu
ABINIT/alloc_nonlop_gpu_data [ Functions ]
NAME
alloc_hamilt_gpu
FUNCTION
allocate several memory pieces on a GPU device for the application of Hamiltonian using a GPU
INPUTS
atindx1(natom)=index table for atoms, inverse of atindx (see gstate.f) dtset <type(dataset_type)>=all input variables for this dataset nattyp(ntypat)= # atoms of each type. npwin is the number of plane waves for vectin npwout is the number of plane waves for vectout psps <type(pseudopotential_type)>=variables related to pseudopotentials gpu_option= GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)
OUTPUT
(no output - only allocation on GPU, member of gemm_nonlop_gpu_data)
SOURCE
298 subroutine alloc_nonlop_gpu_data(dtset,& 299 & psps,& 300 & npwin,& 301 & npwout,& 302 & atindx1,& 303 & nattyp,& 304 & gpu_option) 305 306 !Arguments ------------------------------------ 307 !scalars 308 type(dataset_type), intent(in) :: dtset 309 type(pseudopotential_type),intent(in) :: psps 310 integer, intent(in) :: npwin 311 integer, intent(in) :: npwout 312 integer, intent(in) :: atindx1(dtset%natom) 313 !arrays 314 integer, intent(in) :: nattyp(dtset%ntypat) 315 !integer, intent(in) :: npwarr(dtset%nkpt) 316 317 ! other 318 integer, intent(in) :: gpu_option 319 320 !Local variables------------------------------- 321 !scalars 322 #if defined HAVE_GPU_CUDA 323 integer :: cplex 324 integer :: nprojs 325 integer :: itypat 326 327 real(dp) :: allocated_size_bytes 328 character(len=500) :: message 329 #endif 330 331 ! ************************************************************************* 332 333 #if defined HAVE_GPU_CUDA 334 allocated_size_bytes = 0. 335 336 cplex=2; !if (istwf_k>1) cplex=1 337 338 ! compute nprojs 339 nprojs = 0 340 do itypat = 1,dtset%ntypat 341 nprojs = nprojs + count(psps%indlmn(3,:,itypat)>0) * nattyp(itypat) 342 end do 343 344 345 !! allocate memory on device 346 347 if (gemm_nonlop_gpu_data % allocated .eqv. .false.) then 348 ! These will store the non-local factors for vectin, svectout and vectout respectively 349 ABI_MALLOC_CUDA(gemm_nonlop_gpu_data% projections_gpu, INT(cplex, c_size_t) * nprojs * dtset%nspinor*dtset%bandpp * dp) 350 ABI_MALLOC_CUDA(gemm_nonlop_gpu_data% s_projections_gpu, INT(cplex, c_size_t) * nprojs * dtset%nspinor*dtset%bandpp * dp) 351 ABI_MALLOC_CUDA(gemm_nonlop_gpu_data%vnl_projections_gpu, INT(2 , c_size_t) * nprojs * dtset%nspinor*dtset%bandpp * dp) 352 353 allocated_size_bytes = allocated_size_bytes + (2*cplex+2)*nprojs * dtset%nspinor*dtset%bandpp * dp 354 355 ABI_MALLOC_CUDA(gemm_nonlop_gpu_data% vectin_gpu, INT(2, c_size_t) * npwin * dtset%nspinor*dtset%bandpp * dp) 356 ABI_MALLOC_CUDA(gemm_nonlop_gpu_data% vectout_gpu, INT(2, c_size_t) * npwout * dtset%nspinor*dtset%bandpp * dp) 357 ABI_MALLOC_CUDA(gemm_nonlop_gpu_data% svectout_gpu, INT(2, c_size_t) * npwout * dtset%nspinor*dtset%bandpp * dp) 358 359 allocated_size_bytes = allocated_size_bytes + & 360 & 2 * (npwin+2*npwout) * dtset%nspinor * dtset%bandpp * dp 361 362 gemm_nonlop_gpu_data % allocated = .true. 363 364 write(message,*) ' alloc_nonlop_gpu_data allocated ', allocated_size_bytes*1e-6, ' MBytes on device memory' 365 call wrtout(std_out,message,'COLL') 366 367 end if 368 369 #else 370 371 if (.false.) then 372 write(std_out,*) psps%indlmn(1,1,1),dtset%natom,npwin,npwout,atindx1(1),nattyp(1),gpu_option 373 end if 374 375 #endif 376 377 end subroutine alloc_nonlop_gpu_data
ABINIT/dealloc_hamilt_gpu [ Functions ]
NAME
dealloc_hamilt_gpu
FUNCTION
deallocate several memory pieces on a GPU device used for the application of Hamiltonian using a GPU
INPUTS
option=0: deallocate data for local operator (FFT) 1: deallocate data for nonlocal operator 2: deallocate both gpu_option= GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)
OUTPUT
(no output - only deallocation on GPU)
SOURCE
237 subroutine dealloc_hamilt_gpu(option,gpu_option) 238 239 !Arguments ------------------------------------ 240 !scalars 241 integer,intent(in) :: option,gpu_option 242 !arrays 243 244 !Local variables------------------------------- 245 246 ! ************************************************************************* 247 248 if (gpu_option==ABI_GPU_DISABLED) return 249 250 if (gpu_option == ABI_GPU_KOKKOS .or. gpu_option == ABI_GPU_LEGACY) then 251 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING) 252 if (option==0.or.option==2) then 253 if (gpu_option == ABI_GPU_LEGACY) then 254 call free_gpu_fourwf() 255 else if(gpu_option == ABI_GPU_KOKKOS) then 256 call free_gpu_fourwf_managed() 257 end if 258 end if 259 260 if (option==1.or.option==2) then 261 call free_nonlop_gpu() 262 call dealloc_nonlop_gpu_data() 263 end if ! option 1 or 2 264 #endif 265 else if(gpu_option == ABI_GPU_OPENMP) then 266 call free_ompgpu_fourwf() 267 end if 268 269 if (.false.) then 270 write(std_out,*) option 271 end if 272 273 end subroutine dealloc_hamilt_gpu
ABINIT/dealloc_nonlop_gpu_data [ Functions ]
NAME
dealloc_nonlop_gpu_data
FUNCTION
deallocate several memory pieces on a GPU device used for the application of nonlop operators using Kokkos implementation
OUTPUT
(no output - only deallocation on GPU)
SOURCE
394 subroutine dealloc_nonlop_gpu_data() 395 396 #if defined HAVE_GPU_CUDA 397 if (gemm_nonlop_gpu_data % allocated) then 398 ABI_FREE_CUDA(gemm_nonlop_gpu_data% projections_gpu) 399 ABI_FREE_CUDA(gemm_nonlop_gpu_data% s_projections_gpu) 400 ABI_FREE_CUDA(gemm_nonlop_gpu_data%vnl_projections_gpu) 401 402 ABI_FREE_CUDA(gemm_nonlop_gpu_data% vectin_gpu) 403 ABI_FREE_CUDA(gemm_nonlop_gpu_data% vectout_gpu) 404 ABI_FREE_CUDA(gemm_nonlop_gpu_data%svectout_gpu) 405 406 gemm_nonlop_gpu_data % allocated = .false. 407 end if 408 #endif 409 410 end subroutine dealloc_nonlop_gpu_data
ABINIT/m_alloc_hamilt_gpu [ Modules ]
NAME
m_alloc_hamilt_gpu
FUNCTION
COPYRIGHT
Copyright (C) 2000-2024 ABINIT group (MT, FDahm) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
15 #if defined HAVE_CONFIG_H 16 #include "config.h" 17 #endif 18 19 #include "abi_common.h" 20 21 module m_alloc_hamilt_gpu 22 23 use defs_basis 24 use m_abicore 25 use m_xmpi 26 use m_dtset 27 use m_ompgpu_fourwf 28 #if defined HAVE_GPU 29 use m_gpu_toolbox 30 #endif 31 32 #ifdef HAVE_FC_ISO_C_BINDING 33 use, intrinsic :: iso_c_binding 34 #endif 35 36 use defs_datatypes, only : pseudopotential_type 37 use defs_abitypes, only : MPI_type 38 39 implicit none 40 41 private