TABLE OF CONTENTS
- ABINIT/cwavef_double_rfft_trick_pack
- ABINIT/cwavef_double_rfft_trick_unpack
- ABINIT/getghc
- ABINIT/getghc_mGGA
- ABINIT/getghc_nucdip
- ABINIT/getgsc
- ABINIT/m_getghc
- ABINIT/multithreaded_getghc
ABINIT/cwavef_double_rfft_trick_pack [ Functions ]
NAME
cwavef_double_rfft_trick_pack
FUNCTION
We have C(G)=C(-G) and D(G)=D(-G) and only G components are in memory (istwfk=2) The Fourier transform is: C(r) = sum_G e^(iGr) C(G) = sum_(G_z>=0,G/=0) 2 Re[e^(iGr) C(G)] + C(0) so C(r) is a real function (same for D) Here we construct: E( G) = C(G) + i D(G) E(-G) = C(G)^* + i D(G)^* (G/=0) so: E(r) = C(r) + i D(r) In short, one can do only one FFT on E (with istwfk=1) and obtains the FFT of C and D (istwfk=2)
INPUTS
cwavef(2,npw_k*my_nspinor*(2*ndat))=planewave coefficients of wavefunctioni (istwfk=2) mpi_enreg=information about MPI parallelization ndat=number of FFTs to perform in parall npw_k=number of planewaves in basis for given k point (istwfk=2).
OUTPUT
cwavef_fft(2,npw_fft*my_nspinor*ndat)=planewave coefficients of wavefunction (istwfk=1)
SIDE EFFECTS
NOTES
PARENTS
CHILDREN
SOURCE
1083 subroutine cwavef_double_rfft_trick_pack(cwavef,cwavef_fft,mpi_enreg,ndat,npw_k) 1084 1085 !Arguments ------------------------------------ 1086 !scalars 1087 integer,intent(in) :: ndat,npw_k 1088 type(MPI_type),intent(in) :: mpi_enreg 1089 !arrays 1090 real(dp),intent(in) :: cwavef(:,:) 1091 real(dp),intent(out) :: cwavef_fft(:,:) 1092 !Local variables------------------------------- 1093 !scalars 1094 integer :: idat,i0,ib1,ib2,npw_fft 1095 1096 npw_fft=2*npw_k 1097 i0=1 1098 if (mpi_enreg%me_g0_fft==1) then! Do not include G=(0,0,0) twice 1099 npw_fft=npw_fft-1 1100 i0=2 1101 end if 1102 1103 if (mpi_enreg%nproc_fft==1) then 1104 if (size(cwavef,1)/=2.or.size(cwavef,2)/=npw_k*2*ndat) then 1105 ABI_BUG('wrong size for cwavef') 1106 end if 1107 else 1108 if (size(cwavef,1)/=2.or.size(cwavef,2)/=npw_k*ndat) then 1109 ABI_BUG('wrong size for cwavef') 1110 end if 1111 end if 1112 if (size(cwavef_fft,1)/=2.or.size(cwavef_fft,2)/=npw_fft*ndat) then 1113 ABI_BUG('wrong size for cwavef_fft') 1114 end if 1115 1116 if (mpi_enreg%nproc_fft==1) then 1117 do idat=1,ndat 1118 ib1=(idat-1)*npw_fft ! band shift for cwavef_fft 1119 ib2=(idat-1)*2*npw_k ! band shift for cwavef 1120 ! E(G) = C(G) + i D(G) 1121 cwavef_fft(1,1+ib1:npw_k+ib1) = cwavef(1,1+ib2 : npw_k+ib2) & 1122 -cwavef(2,1+npw_k+ib2:2*npw_k+ib2) 1123 cwavef_fft(2,1+ib1:npw_k+ib1) = cwavef(2,1+ib2 : npw_k+ib2) & 1124 +cwavef(1,1+npw_k+ib2:2*npw_k+ib2) 1125 ! E(-G) = C(G)^* + i D(G)^* (G/=0) 1126 cwavef_fft(1,1+npw_k+ib1:npw_fft+ib1) = cwavef(1,i0 +ib2: npw_k+ib2) & 1127 +cwavef(2,i0+npw_k+ib2:2*npw_k+ib2) 1128 cwavef_fft(2,1+npw_k+ib1:npw_fft+ib1) =-cwavef(2,i0 +ib2: npw_k+ib2) & 1129 +cwavef(1,i0+npw_k+ib2:2*npw_k+ib2) 1130 end do 1131 else 1132 do idat=1,ndat 1133 ib1=(idat-1)*npw_fft ! band shift for cwavef_fft 1134 ib2=(idat-1)*npw_k ! band shift for cwavef 1135 ! E(G) = C(G) 1136 cwavef_fft(:,1+ib1:npw_k+ib1) = cwavef(:,1+ib2 : npw_k+ib2) 1137 ! E(-G) = C(G)^* 1138 cwavef_fft(1,1+npw_k+ib1:npw_fft+ib1) = cwavef(1,i0+ib2:npw_k+ib2) 1139 cwavef_fft(2,1+npw_k+ib1:npw_fft+ib1) = -cwavef(2,i0+ib2:npw_k+ib2) 1140 end do 1141 end if 1142 1143 end subroutine cwavef_double_rfft_trick_pack
ABINIT/cwavef_double_rfft_trick_unpack [ Functions ]
NAME
cwavef_double_rfft_trick_unpack
FUNCTION
From the "cwavef_double_rfft_trick_pack" routine we have: E( G) = C(G) + i D(G) E(-G) = C(G)^* + i D(G)^* (G/=0) Here we compute: C(G) = ( E(G) + E(-G)^* ) / 2 (G/=0) D(G) = ( iE(-G)^* - iE(G)) / 2 (G/=0) and: C(0) = Re(E(0)) D(0) = Im(E(0))
INPUTS
cwavef_fft(2,npw_fft*my_nspinor*ndat_)=planewave coefficients of wavefunction (istwfk=1) mpi_enreg=information about MPI parallelization ndat=number of FFTs to perform in parall npw_k=number of planewaves in basis for given k point (istwfk=2).
OUTPUT
cwavef(2,npw_fft*my_nspinor*(2*ndat))=planewave coefficients of wavefunction (istwfk=2)
SIDE EFFECTS
NOTES
PARENTS
CHILDREN
SOURCE
1182 subroutine cwavef_double_rfft_trick_unpack(cwavef,cwavef_fft,mpi_enreg,ndat,npw_k) 1183 1184 !Arguments ------------------------------------ 1185 !scalars 1186 integer,intent(in) :: ndat,npw_k 1187 type(MPI_type),intent(in) :: mpi_enreg 1188 !arrays 1189 real(dp),intent(out) :: cwavef(:,:) 1190 real(dp),intent(in) :: cwavef_fft(:,:) 1191 !Local variables------------------------------- 1192 !scalars 1193 integer :: idat,i0,ib1,ib2,npw_fft 1194 1195 npw_fft=2*npw_k 1196 i0=1 1197 if (mpi_enreg%me_g0_fft==1) then! Do not include G=(0,0,0) twice 1198 npw_fft=npw_fft-1 1199 i0=2 1200 end if 1201 1202 if (mpi_enreg%nproc_fft==1) then 1203 if (size(cwavef,1)/=2.or.size(cwavef,2)/=npw_k*2*ndat) then 1204 ABI_BUG('wrong size for cwavef') 1205 end if 1206 else 1207 if (size(cwavef,1)/=2.or.size(cwavef,2)/=npw_k*ndat) then 1208 ABI_BUG('wrong size for cwavef') 1209 end if 1210 end if 1211 if (size(cwavef_fft,1)/=2.or.size(cwavef_fft,2)/=npw_fft*ndat) then 1212 ABI_BUG('wrong size for cwavef_fft') 1213 end if 1214 1215 if (mpi_enreg%nproc_fft==1) then 1216 1217 do idat=1,ndat 1218 ib1=(idat-1)*npw_fft ! band shift for cwavef_fft 1219 ib2=(idat-1)*2*npw_k ! band shift for cwavef 1220 ! C(G) = ( E(G) + E(-G)^* ) / 2 (factor 1/2 will be applied later) 1221 cwavef(1,i0+ib2:npw_k+ib2) = cwavef_fft(1,i0 +ib1:npw_k +ib1) & !+Re(E( G)) 1222 +cwavef_fft(1,1 +npw_k+ib1:npw_fft+ib1) !+Re(E(-G)) 1223 cwavef(2,i0+ib2:npw_k+ib2) = cwavef_fft(2,i0 +ib1:npw_k +ib1) & !+Im(E( G)) 1224 -cwavef_fft(2,1 +npw_k+ib1:npw_fft+ib1) !-Im(E(-G)) 1225 ! D(G) = ( iE(-G)^* - iE(G) ) / 2 (factor 1/2 will be applied later) 1226 cwavef(1,i0+npw_k+ib2:2*npw_k+ib2) = cwavef_fft(2,i0 +ib1:npw_k +ib1) & !+Im(E( G)) 1227 +cwavef_fft(2,1 +npw_k+ib1:npw_fft+ib1) !+Im(E(-G)) 1228 cwavef(2,i0+npw_k+ib2:2*npw_k+ib2) =-cwavef_fft(1,i0 +ib1:npw_k +ib1) & !-Re(E( G)) 1229 +cwavef_fft(1,1 +npw_k+ib1:npw_fft+ib1) !+Re(E(-G)) 1230 if (mpi_enreg%me_g0_fft==1) then 1231 ! Compute C(G=0) and D(G=0) and multiply by 2 as we apply 1/2 to the whole array shortly afterwards 1232 ! C(G=0) = Re(E(G=0)) 1233 cwavef(1,1+ib2) = two*cwavef_fft(1,1+ib1) 1234 cwavef(2,1+ib2) = zero 1235 ! D(G=0) = Im(E(G=0)) 1236 cwavef(1,1+npw_k+ib2) = two*cwavef_fft(2,1+ib1) 1237 cwavef(2,1+npw_k+ib2) = zero 1238 end if 1239 1240 end do 1241 1242 cwavef(:,:) = half*cwavef(:,:) 1243 1244 else 1245 1246 do idat=1,ndat 1247 ib1=(idat-1)*npw_fft ! band shift for cwavef_fft 1248 ib2=(idat-1)*npw_k ! band shift for cwavef 1249 ! C(G) = E(G) 1250 cwavef(:,1+ib2:npw_k+ib2) = cwavef_fft(:,1+ib1:npw_k+ib1) 1251 if (i0==2) then 1252 cwavef(2,1+ib2) = zero ! This is important, otherwise it is numerically unstable... 1253 end if 1254 end do 1255 1256 end if 1257 1258 end subroutine cwavef_double_rfft_trick_unpack
ABINIT/getghc [ Functions ]
NAME
getghc
FUNCTION
Compute <G|H|C> for input vector |C> expressed in reciprocal space; Result is put in array ghc. <G|Vnonlocal + VfockACE|C> is also returned in gvnlxc if either NLoc NCPP or FockACE. If required, <G|S|C> is returned in gsc (S=overlap - PAW only) Note that left and right k points can be different, i.e. ghc=<k^prime+G|H|C_k>.
INPUTS
cpopt=flag defining the status of cwaveprj%cp(:)=<Proj_i|Cnk> scalars (PAW only) (same meaning as in nonlop.F90 routine) if cpopt=-1, <p_lmn|in> (and derivatives) are computed here (and not saved) if cpopt= 0, <p_lmn|in> are computed here and saved if cpopt= 1, <p_lmn|in> and first derivatives are computed here and saved if cpopt= 2 <p_lmn|in> are already in memory; if cpopt= 3 <p_lmn|in> are already in memory; first derivatives are computed here and saved if cpopt= 4 <p_lmn|in> and first derivatives are already in memory; cwavef(2,npw*my_nspinor*ndat)=planewave coefficients of wavefunction. cwavef_r(2,n4,n5,n6,nspinor) = wave function in real space gs_ham <type(gs_hamiltonian_type)>=all data for the Hamiltonian to be applied lambda=factor to be used when computing <G|H-lambda.S|C> - only for sij_opt=-1 Typically lambda is the eigenvalue (or its guess) mpi_enreg=information about MPI parallelization ndat=number of FFT to do in parallel prtvol=control print volume and debugging output sij_opt= -PAW ONLY- if 0, only matrix elements <G|H|C> have to be computed (S=overlap) if 1, matrix elements <G|S|C> have to be computed in gsc in addition to ghc if -1, matrix elements <G|H-lambda.S|C> have to be computed in ghc (gsc not used) tim_getghc=timing code of the calling subroutine(can be set to 0 if not attributed) type_calc= option governing which part of Hamitonian is to be applied: 1: local part only 2: non-local+Fock+kinetic only (added to the existing Hamiltonian) 3: local + kinetic only (added to the existing Hamiltonian) ===== Optional inputs ===== [kg_fft_k(3,:)]=optional, (k+G) vector coordinates to be used for the FFT tranformation instead of the one contained in gs_ham datastructure. Typically used for real WF (in parallel) which are FFT-transformed 2 by 2. [kg_fft_kp(3,:)]=optional, (k^prime+G) vector coordinates to be used for the FFT tranformation [select_k]=optional, option governing the choice of k points to be used. gs_ham datastructure contains quantities needed to apply Hamiltonian in reciprocal space between 2 kpoints, k and k^prime (equal in most cases); if select_k=1, <k^prime|H|k> is applied [default] if select_k=2, <k|H|k^prime> is applied if select_k=3, <k|H|k> is applied if select_k=4, <k^prime|H|k^prime> is applied
OUTPUT
ghc(2,npw*my_nspinor*ndat)=matrix elements <G|H|C> (if sij_opt>=0) or <G|H-lambda.S|C> (if sij_opt=-1) gvnlxc(2,npw*my_nspinor*ndat)=matrix elements <G|Vnonlocal+VFockACE|C> (if sij_opt>=0) or <G|Vnonlocal+VFockACE-lambda.S|C> (if sij_opt=-1) include Vnonlocal if NCPP and non-local Fock if associated(gs_ham%fockcommon) if (sij_opt=1) gsc(2,npw*my_nspinor*ndat)=matrix elements <G|S|C> (S=overlap).
SIDE EFFECTS
cwaveprj(natom,my_nspinor*(1+cpopt)*ndat)= wave function projected on nl projectors (PAW only)
SOURCE
145 subroutine getghc(cpopt,cwavef,cwaveprj,ghc,gsc,gs_ham,gvnlxc,lambda,mpi_enreg,ndat,& 146 prtvol,sij_opt,tim_getghc,type_calc,& 147 kg_fft_k,kg_fft_kp,select_k,cwavef_r) ! optional arguments 148 149 !Arguments ------------------------------------ 150 !scalars 151 integer,intent(in) :: cpopt,ndat, prtvol 152 integer,intent(in) :: sij_opt,tim_getghc,type_calc 153 integer,intent(in),optional :: select_k 154 real(dp),intent(in) :: lambda 155 type(MPI_type),intent(in) :: mpi_enreg 156 type(gs_hamiltonian_type),intent(inout),target :: gs_ham 157 !arrays 158 integer,intent(in),optional,target :: kg_fft_k(:,:),kg_fft_kp(:,:) 159 real(dp),intent(out),target :: gsc(:,:) 160 real(dp),intent(inout), target :: cwavef(:,:) 161 real(dp),optional,intent(inout) :: cwavef_r(:,:,:,:,:) 162 real(dp),intent(out), target :: ghc(:,:) 163 real(dp),intent(out),target :: gvnlxc(:,:) 164 type(pawcprj_type),intent(inout),target :: cwaveprj(:,:) 165 !MG: Passing these arrays assumed-shape has the drawback that client code is 166 !forced to use vec(2, npw*ndat) instead of the more user-friendly vec(2,npw,ndat) 167 168 !Local variables------------------------------- 169 !scalars 170 integer,parameter :: level=114,re=1,im=2,tim_fourwf=1 171 integer :: choice,cplex,cpopt_here,i1,i2,i3,idat,idir,ierr 172 integer :: ig,igspinor,istwf_k_,ii,iispinor,ikpt_this_proc,ipw,ispinor,my_nspinor 173 integer :: n4,n5,n6,ndat_,nnlout,npw_fft,npw_k1,npw_k2,nspinortot,option_fft 174 integer :: paw_opt,select_k_,shift1,shift2,signs,tim_nonlop 175 logical(kind=c_bool) :: k1_eq_k2 176 logical :: double_rfft_trick,have_to_reequilibrate,has_fock,local_gvnlxc 177 logical :: nspinor1TreatedByThisProc,nspinor2TreatedByThisProc,use_cwavef_r 178 real(dp) :: ghcim,ghcre,weight 179 character(len=500) :: msg 180 181 !arrays 182 integer, pointer :: gbound_k1(:,:) 183 integer, pointer :: gbound_k2(:,:) 184 integer, pointer :: kg_k1(:,:) 185 integer, pointer :: kg_k2(:,:) 186 integer, ABI_CONTIGUOUS pointer :: indices_pw_fft(:), kg_k_fft(:,:) 187 integer, ABI_CONTIGUOUS pointer :: recvcount_fft(:), recvdisp_fft(:) 188 integer, ABI_CONTIGUOUS pointer :: sendcount_fft(:), senddisp_fft(:) 189 integer, allocatable:: dimcprj(:) 190 real(dp) :: enlout(ndat), lambda_ndat(ndat), tsec(2) 191 real(dp), target :: nonlop_dum(1,1) 192 real(dp), allocatable :: buff_wf(:,:) 193 real(dp), allocatable :: cwavef1(:,:) 194 real(dp), allocatable :: cwavef2(:,:) 195 real(dp), allocatable :: cwavef_fft(:,:) 196 real(dp), allocatable :: cwavef_fft_tr(:,:) 197 198 real(dp), allocatable :: ghc1(:,:) 199 real(dp), allocatable :: ghc2(:,:) 200 real(dp), allocatable :: ghc3(:,:) 201 real(dp), allocatable :: ghc4(:,:) 202 real(dp), allocatable :: ghc_mGGA(:,:) 203 real(dp), allocatable :: ghc_vectornd(:,:) 204 205 #if defined HAVE_GPU && defined HAVE_YAKL 206 real(c_double), ABI_CONTIGUOUS pointer :: gvnlc(:,:) 207 #else 208 real(dp), allocatable :: gvnlc(:,:) 209 #endif 210 211 real(dp), allocatable :: vlocal_tmp(:,:,:) 212 real(dp), allocatable :: work(:,:,:,:) 213 214 #if defined HAVE_GPU && defined HAVE_YAKL 215 real(c_double), ABI_CONTIGUOUS pointer :: gvnlxc_(:,:) 216 #else 217 real(dp), pointer :: gvnlxc_(:,:) 218 #endif 219 220 real(dp), pointer :: kinpw_k1(:) 221 real(dp), pointer :: kinpw_k2(:) 222 real(dp), pointer :: kpt_k1(:) 223 real(dp), pointer :: kpt_k2(:) 224 real(dp), pointer :: gsc_ptr(:,:) 225 type(fock_common_type),pointer :: fock 226 type(pawcprj_type),pointer :: cwaveprj_fock(:,:),cwaveprj_idat(:,:),cwaveprj_nonlop(:,:) 227 228 real(c_double), parameter :: hugevalue = huge(zero)*1.d-11 229 230 ! ********************************************************************* 231 232 DBG_ENTER("COLL") 233 234 if(gs_ham%gpu_option==ABI_GPU_OPENMP) then 235 call getghc_ompgpu(cpopt,cwavef,cwaveprj,ghc,gsc,gs_ham,gvnlxc,lambda,mpi_enreg,ndat,& 236 prtvol,sij_opt,tim_getghc,type_calc,& 237 kg_fft_k,kg_fft_kp,select_k,cwavef_r) 238 return 239 end if 240 241 !Keep track of total time spent in getghc: 242 call timab(350+tim_getghc,1,tsec) 243 ABI_NVTX_START_RANGE(NVTX_GETGHC) 244 245 !Structured debugging if prtvol==-level 246 if(prtvol==-level)then 247 write(msg,'(80a,a,a)') ('=',ii=1,80),ch10,' getghc : enter, debugging ' 248 call wrtout(std_out,msg,'PERS') 249 end if 250 251 !Select k-dependent objects according to select_k input parameter 252 select_k_=1;if (present(select_k)) select_k_=select_k 253 if (select_k_==KPRIME_H_K) then 254 ! <k^prime|H|k> 255 npw_k1 = gs_ham%npw_fft_k ; npw_k2 = gs_ham%npw_fft_kp 256 kpt_k1 => gs_ham%kpt_k ; kpt_k2 => gs_ham%kpt_kp 257 kg_k1 => gs_ham%kg_k ; kg_k2 => gs_ham%kg_kp 258 gbound_k1 => gs_ham%gbound_k ; gbound_k2 => gs_ham%gbound_kp 259 kinpw_k1 => gs_ham%kinpw_k ; kinpw_k2 => gs_ham%kinpw_kp 260 else if (select_k_==K_H_KPRIME) then 261 ! <k|H|k^prime> 262 npw_k1 = gs_ham%npw_fft_kp; npw_k2 = gs_ham%npw_fft_k 263 kpt_k1 => gs_ham%kpt_kp ; kpt_k2 => gs_ham%kpt_k 264 kg_k1 => gs_ham%kg_kp ; kg_k2 => gs_ham%kg_k 265 gbound_k1 => gs_ham%gbound_kp ; gbound_k2 => gs_ham%gbound_k 266 kinpw_k1 => gs_ham%kinpw_kp ; kinpw_k2 => gs_ham%kinpw_k 267 else if (select_k_==K_H_K) then 268 ! <k|H|k> 269 npw_k1 = gs_ham%npw_fft_k ; npw_k2 = gs_ham%npw_fft_k 270 kpt_k1 => gs_ham%kpt_k ; kpt_k2 => gs_ham%kpt_k 271 kg_k1 => gs_ham%kg_k ; kg_k2 => gs_ham%kg_k 272 gbound_k1 => gs_ham%gbound_k ; gbound_k2 => gs_ham%gbound_k 273 kinpw_k1 => gs_ham%kinpw_k ; kinpw_k2 => gs_ham%kinpw_k 274 else if (select_k_==KPRIME_H_KPRIME) then 275 ! <k^prime|H|k^prime> 276 npw_k1 = gs_ham%npw_fft_kp; npw_k2 = gs_ham%npw_fft_kp 277 kpt_k1 => gs_ham%kpt_kp ; kpt_k2 => gs_ham%kpt_kp 278 kg_k1 => gs_ham%kg_kp ; kg_k2 => gs_ham%kg_kp 279 gbound_k1 => gs_ham%gbound_kp ; gbound_k2 => gs_ham%gbound_kp 280 kinpw_k1 => gs_ham%kinpw_kp ; kinpw_k2 => gs_ham%kinpw_kp 281 end if 282 k1_eq_k2=(all(abs(kpt_k1(:)-kpt_k2(:))<tol8)) 283 284 !Check sizes 285 my_nspinor=max(1,gs_ham%nspinor/mpi_enreg%nproc_spinor) 286 if (size(cwavef)<2*npw_k1*my_nspinor*ndat) then 287 ABI_BUG('wrong size for cwavef!') 288 end if 289 if (size(ghc)<2*npw_k2*my_nspinor*ndat) then 290 ABI_BUG('wrong size for ghc!') 291 end if 292 if (sij_opt==1) then 293 if (size(gsc)<2*npw_k2*my_nspinor*ndat) then 294 ABI_BUG('wrong size for gsc!') 295 end if 296 end if 297 if (gs_ham%usepaw==1.and.cpopt>=0) then 298 if (size(cwaveprj)<gs_ham%natom*my_nspinor*ndat) then 299 ABI_BUG('wrong size for cwaveprj!') 300 end if 301 end if 302 if (any(type_calc == [0, 2, 3])) then 303 local_gvnlxc = size(gvnlxc)==0 304 if (local_gvnlxc) then 305 if(gs_ham%gpu_option==ABI_GPU_KOKKOS) then 306 #if defined HAVE_GPU && defined HAVE_YAKL 307 ABI_MALLOC_MANAGED(gvnlxc_, (/2,npw_k2*my_nspinor*ndat/)) 308 #endif 309 else 310 ABI_MALLOC(gvnlxc_,(2,npw_k2*my_nspinor*ndat)) 311 end if 312 else 313 gvnlxc_ => gvnlxc 314 end if 315 if (size(gvnlxc_)<2*npw_k2*my_nspinor*ndat) then 316 ABI_BUG('wrong size for gvnlxc!') 317 end if 318 end if 319 use_cwavef_r=present(cwavef_r) 320 n4=gs_ham%n4 ; n5=gs_ham%n5 ; n6=gs_ham%n6 321 nspinortot=gs_ham%nspinor 322 if (use_cwavef_r) then 323 if (size(cwavef_r,1)/=2) then 324 ABI_BUG('wrong size for cwavef_r (dimension 1)') 325 end if 326 if (size(cwavef_r,2)/=n4) then 327 ABI_BUG('wrong size for cwavef_r (dimension 2)') 328 end if 329 if (size(cwavef_r,3)/=n5) then 330 ABI_BUG('wrong size for cwavef_r (dimension 3)') 331 end if 332 if (size(cwavef_r,4)/=n6) then 333 ABI_BUG('wrong size for cwavef_r (dimension 4)') 334 end if 335 if (size(cwavef_r,5)/=nspinortot) then 336 ABI_BUG('wrong size for cwavef_r (dimension 5)') 337 end if 338 end if 339 340 !Eventually overwrite plane waves data for FFT 341 if (present(kg_fft_k)) then 342 kg_k1 => kg_fft_k ; kg_k2 => kg_fft_k 343 npw_k1=size(kg_k1,2) ; npw_k2=size(kg_k2,2) 344 end if 345 if (present(kg_fft_kp)) then 346 kg_k2 => kg_fft_kp ; npw_k2=size(kg_k2,2) 347 end if 348 349 !paral_kgb constraint 350 if (mpi_enreg%paral_kgb==1.and.(.not.k1_eq_k2)) then 351 ABI_BUG('paral_kgb=1 not allowed for k/=k_^prime!') 352 end if 353 354 !Do we add Fock exchange term ? 355 has_fock = associated(gs_ham%fockcommon) 356 if (has_fock) fock => gs_ham%fockcommon 357 358 !Parallelization over spinors management 359 if (mpi_enreg%paral_spinor==0) then 360 shift1=npw_k1;shift2=npw_k2 361 nspinor1TreatedByThisProc=.true. 362 nspinor2TreatedByThisProc=(nspinortot==2) 363 else 364 shift1=0;shift2=0 365 nspinor1TreatedByThisProc=(mpi_enreg%me_spinor==0) 366 nspinor2TreatedByThisProc=(mpi_enreg%me_spinor==1) 367 end if 368 369 !============================================================ 370 ! Application of the local potential 371 !============================================================ 372 373 ABI_NVTX_START_RANGE(NVTX_GETGHC_LOCPOT) 374 375 if (any(type_calc == [0, 1, 3])) then 376 377 ! Need a Vlocal 378 if (.not.associated(gs_ham%vlocal)) then 379 ABI_BUG("We need vlocal in gs_ham!") 380 end if 381 382 ! fourwf can only process with one value of istwf_k 383 if (.not.k1_eq_k2) then 384 ABI_BUG('vlocal (fourwf) cannot be computed with k/=k^prime!') 385 end if 386 387 ! Eventually adjust load balancing for FFT (by changing FFT distrib) 388 have_to_reequilibrate=.false. 389 if (mpi_enreg%paral_kgb==1) then 390 ikpt_this_proc=bandfft_kpt_get_ikpt() 391 have_to_reequilibrate=bandfft_kpt(ikpt_this_proc)%have_to_reequilibrate 392 end if 393 ndat_ = ndat 394 istwf_k_ = gs_ham%istwf_k 395 double_rfft_trick = istwf_k_==2.and.modulo(ndat,2)==0.and.k1_eq_k2 396 double_rfft_trick = double_rfft_trick.and.(.not.have_to_reequilibrate).and.mpi_enreg%paral_kgb==1 397 ! Note that the trick can be activated only if nspinortot=1 (if =2 then istwf_k=1), so gs_ham%nvloc=1 too 398 ! For npfft>1, we don't use the trick but MPI-FFT does not handle istwf_k==2, 399 ! so we have to use istwf_k=1 locally and build cwavef_fft in a similar way 400 if (double_rfft_trick) then 401 istwf_k_ = 1 402 if (mpi_enreg%nproc_fft==1) then 403 ndat_ = ndat / 2 404 else 405 ndat_ = ndat 406 end if 407 kg_k_fft => bandfft_kpt(ikpt_this_proc)%kg_k_gather_sym(:,:) 408 npw_fft=2*npw_k1 409 if (mpi_enreg%me_g0_fft==1) npw_fft=npw_fft-1 ! Do not include G=(0,0,0) twice 410 ABI_MALLOC(cwavef_fft,(2,npw_fft*ndat_)) 411 call cwavef_double_rfft_trick_pack(cwavef,cwavef_fft,mpi_enreg,ndat_,npw_k1) 412 end if 413 414 if (have_to_reequilibrate) then 415 npw_fft = bandfft_kpt(ikpt_this_proc)%npw_fft 416 sendcount_fft => bandfft_kpt(ikpt_this_proc)%sendcount_fft(:) 417 recvcount_fft => bandfft_kpt(ikpt_this_proc)%recvcount_fft(:) 418 senddisp_fft => bandfft_kpt(ikpt_this_proc)%senddisp_fft(:) 419 recvdisp_fft => bandfft_kpt(ikpt_this_proc)%recvdisp_fft(:) 420 indices_pw_fft => bandfft_kpt(ikpt_this_proc)%indices_pw_fft(:) 421 kg_k_fft => bandfft_kpt(ikpt_this_proc)%kg_k_fft(:,:) 422 ABI_MALLOC(buff_wf,(2,npw_k1*ndat_) ) 423 ABI_MALLOC(cwavef_fft,(2,npw_fft*ndat_) ) 424 if(ndat_>1) then 425 ABI_MALLOC(cwavef_fft_tr, (2,npw_fft*ndat_)) 426 end if 427 do idat=1, ndat_ 428 do ipw = 1 ,npw_k1 429 buff_wf(1:2, idat + ndat_*(indices_pw_fft(ipw)-1) ) = cwavef(1:2,ipw + npw_k1*(idat-1)) 430 end do 431 end do 432 if(ndat_ > 1) then 433 call xmpi_alltoallv(buff_wf,2*ndat_*sendcount_fft,2*ndat_*senddisp_fft, & 434 & cwavef_fft_tr,2*ndat_*recvcount_fft, 2*ndat_*recvdisp_fft, mpi_enreg%comm_fft,ierr) 435 ! We need to transpose data 436 do idat=1,ndat_ 437 do ipw = 1 ,npw_fft 438 cwavef_fft(1:2, ipw + npw_fft*(idat-1)) = cwavef_fft_tr(1:2, idat + ndat_*(ipw-1)) 439 end do 440 end do 441 else 442 call xmpi_alltoallv(buff_wf,2*sendcount_fft,2*senddisp_fft, & 443 & cwavef_fft,2*recvcount_fft, 2*recvdisp_fft, mpi_enreg%comm_fft,ierr) 444 end if 445 end if 446 447 ! Apply the local potential to the wavefunction 448 ! Start from wavefunction in reciprocal space cwavef 449 ! End with function ghc in reciprocal space also. 450 ABI_MALLOC(work,(2,gs_ham%n4,gs_ham%n5,gs_ham%n6*ndat_)) 451 weight=one 452 if (.not.use_cwavef_r) then 453 option_fft=2 454 if (nspinortot==2) then 455 ABI_MALLOC(cwavef1,(2,npw_k1*ndat_)) 456 ABI_MALLOC(cwavef2,(2,npw_k1*ndat_)) 457 do idat=1,ndat_ 458 do ipw=1,npw_k1 459 cwavef1(1:2,ipw+(idat-1)*npw_k1)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k1) 460 cwavef2(1:2,ipw+(idat-1)*npw_k1)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k1+shift1) 461 end do 462 end do 463 ! call cg_zcopy(npw_k1*ndat_,cwavef(1,1),cwavef1) 464 ! call cg_zcopy(npw_k1*ndat_,cwavef(1,1+shift1),cwavef2) 465 end if 466 else 467 option_fft=3 468 if (nspinortot==2) then 469 ABI_MALLOC(cwavef1,(0,0)) 470 ABI_MALLOC(cwavef2,(0,0)) 471 end if 472 end if 473 474 if (gs_ham%nvloc==1) then 475 ! Treat scalar local potentials 476 477 if (nspinortot==1) then 478 479 if (use_cwavef_r) then 480 do i3=1,n6 481 do i2=1,n5 482 do i1=1,n4 483 work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,1) 484 work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,1) 485 end do 486 end do 487 end do 488 end if 489 if (have_to_reequilibrate.or.double_rfft_trick) then 490 call fourwf(1,gs_ham%vlocal,cwavef_fft,cwavef_fft,work,gbound_k1,gbound_k2,& 491 & istwf_k_,kg_k_fft,kg_k_fft,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,& 492 & npw_fft,npw_fft,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,& 493 & weight,weight,gpu_option=gs_ham%gpu_option) 494 else 495 call fourwf(1,gs_ham%vlocal,cwavef,ghc,work,gbound_k1,gbound_k2,& 496 & istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,& 497 & npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,& 498 & weight,weight,gpu_option=gs_ham%gpu_option) 499 end if 500 501 else 502 ! nspinortot==2 503 504 if (nspinor1TreatedByThisProc) then 505 if (use_cwavef_r) then 506 do i3=1,n6 507 do i2=1,n5 508 do i1=1,n4 509 work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,1) 510 work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,1) 511 end do 512 end do 513 end do 514 end if 515 ABI_MALLOC(ghc1,(2,npw_k2*ndat_)) 516 call fourwf(1,gs_ham%vlocal,cwavef1,ghc1,work,gbound_k1,gbound_k2,& 517 & istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,& 518 & npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,& 519 & weight,weight,gpu_option=gs_ham%gpu_option) 520 do idat=1,ndat_ 521 do ipw =1, npw_k2 522 ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2)=ghc1(1:2,ipw+(idat-1)*npw_k2) 523 end do 524 end do 525 ABI_FREE(ghc1) 526 end if ! spin 1 treated by this proc 527 528 if (nspinor2TreatedByThisProc) then 529 if (use_cwavef_r) then 530 do i3=1,n6 531 do i2=1,n5 532 do i1=1,n4 533 work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,2) 534 work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,2) 535 end do 536 end do 537 end do 538 end if 539 ABI_MALLOC(ghc2,(2,npw_k2*ndat_)) 540 call fourwf(1,gs_ham%vlocal,cwavef2,ghc2,work,gbound_k1,gbound_k2,& 541 & istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,& 542 & npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,& 543 & gpu_option=gs_ham%gpu_option) 544 do idat=1,ndat_ 545 do ipw=1,npw_k2 546 ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2+shift2)=ghc2(1:2,ipw+(idat-1)*npw_k2) 547 end do 548 end do 549 ABI_FREE(ghc2) 550 end if ! spin 2 treated by this proc 551 552 end if ! nspinortot 553 554 else if (gs_ham%nvloc==4) then 555 ! Treat non-collinear local potentials 556 557 ABI_MALLOC(ghc1,(2,npw_k2*ndat_)) 558 ABI_MALLOC(ghc2,(2,npw_k2*ndat_)) 559 ABI_MALLOC(ghc3,(2,npw_k2*ndat_)) 560 ABI_MALLOC(ghc4,(2,npw_k2*ndat_)) 561 ghc1(:,:)=zero; ghc2(:,:)=zero; ghc3(:,:)=zero ; ghc4(:,:)=zero 562 if (use_cwavef_r) then 563 ABI_MALLOC(vlocal_tmp,(0,0,0)) 564 else 565 ABI_MALLOC(vlocal_tmp,(gs_ham%n4,gs_ham%n5,gs_ham%n6)) 566 end if 567 ! ghc1=v11*phi1 568 if (nspinor1TreatedByThisProc) then 569 if (use_cwavef_r) then 570 do i3=1,n6 571 do i2=1,n5 572 do i1=1,n4 573 work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,1) 574 work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,1) 575 end do 576 end do 577 end do 578 else 579 ! LB,07/22: 580 ! Weird segmentation fault encountered here if called with multithreaded_getghc for big systems. 581 ! Using an explicit loop instead of fortran syntax seems to solve the problem, I don't understand why... 582 !vlocal_tmp(:,:,:)=gs_ham%vlocal(:,:,:,1) 583 do i3=1,n6 584 do i2=1,n5 585 do i1=1,n4 586 vlocal_tmp(i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1) 587 end do 588 end do 589 end do 590 end if 591 call fourwf(1,vlocal_tmp,cwavef1,ghc1,work,gbound_k1,gbound_k2,& 592 & istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,& 593 & npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,& 594 & gpu_option=gs_ham%gpu_option) 595 end if 596 ! ghc2=v22*phi2 597 if (nspinor2TreatedByThisProc) then 598 if (use_cwavef_r) then 599 do i3=1,n6 600 do i2=1,n5 601 do i1=1,n4 602 work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,2)*cwavef_r(1,i1,i2,i3,2) 603 work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,2)*cwavef_r(2,i1,i2,i3,2) 604 end do 605 end do 606 end do 607 else 608 ! LB,07/22: 609 ! Weird segmentation fault encountered here if called with multithreaded_getghc for big systems. 610 ! Using an explicit loop instead of fortran syntax seems to solve the problem, I don't understand why... 611 !vlocal_tmp(:,:,:)=gs_ham%vlocal(:,:,:,2) 612 do i3=1,n6 613 do i2=1,n5 614 do i1=1,n4 615 vlocal_tmp(i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,2) 616 end do 617 end do 618 end do 619 end if 620 call fourwf(1,vlocal_tmp,cwavef2,ghc2,work,gbound_k1,gbound_k2,& 621 & istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,& 622 & npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,& 623 & gpu_option=gs_ham%gpu_option) 624 end if 625 ABI_FREE(vlocal_tmp) 626 cplex=2 627 if (use_cwavef_r) then 628 ABI_MALLOC(vlocal_tmp,(0,0,0)) 629 else 630 ABI_MALLOC(vlocal_tmp,(cplex*gs_ham%n4,gs_ham%n5,gs_ham%n6)) 631 end if 632 ! ghc3=(re(v12)-im(v12))*phi1 633 if (nspinor1TreatedByThisProc) then 634 if (use_cwavef_r) then 635 do i3=1,n6 636 do i2=1,n5 637 do i1=1,n4 638 work(1,i1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(1,i1,i2,i3,1)+gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(2,i1,i2,i3,1) 639 work(2,i1,i2,i3)=-gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(1,i1,i2,i3,1)+gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(2,i1,i2,i3,1) 640 end do 641 end do 642 end do 643 else 644 do i3=1,gs_ham%n6 645 do i2=1,gs_ham%n5 646 do i1=1,gs_ham%n4 647 vlocal_tmp(2*i1-1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3) 648 vlocal_tmp(2*i1 ,i2,i3)=-gs_ham%vlocal(i1,i2,i3,4) 649 end do 650 end do 651 end do 652 end if 653 call fourwf(cplex,vlocal_tmp,cwavef1,ghc3,work,gbound_k1,gbound_k2,& 654 & istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,& 655 & npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,& 656 & gpu_option=gs_ham%gpu_option) 657 end if 658 ! ghc4=(re(v12)+im(v12))*phi2 659 if (nspinor2TreatedByThisProc) then 660 if (use_cwavef_r) then 661 do i3=1,n6 662 do i2=1,n5 663 do i1=1,n4 664 work(1,i1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(1,i1,i2,i3,2)-gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(2,i1,i2,i3,2) 665 work(2,i1,i2,i3)= gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(1,i1,i2,i3,2)+gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(2,i1,i2,i3,2) 666 end do 667 end do 668 end do 669 else 670 do i3=1,gs_ham%n6 671 do i2=1,gs_ham%n5 672 do i1=1,gs_ham%n4 673 vlocal_tmp(2*i1-1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3) 674 vlocal_tmp(2*i1 ,i2,i3)= gs_ham%vlocal(i1,i2,i3,4) 675 end do 676 end do 677 end do 678 end if 679 call fourwf(cplex,vlocal_tmp,cwavef2,ghc4,work,gbound_k1,gbound_k2,& 680 & istwf_k_,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,& 681 & npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,& 682 & gpu_option=gs_ham%gpu_option) 683 end if 684 ABI_FREE(vlocal_tmp) 685 ! Build ghc from pieces 686 ! (v11,v22,Re(v12)+iIm(v12);Re(v12)-iIm(v12))(psi1;psi2): matrix product 687 if (mpi_enreg%paral_spinor==0) then 688 do idat=1,ndat_ 689 do ipw=1,npw_k2 690 ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2) =ghc1(1:2,ipw+(idat-1)*npw_k2)+ghc4(1:2,ipw+(idat-1)*npw_k2) 691 ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2+shift2)=ghc3(1:2,ipw+(idat-1)*npw_k2)+ghc2(1:2,ipw+(idat-1)*npw_k2) 692 end do 693 end do 694 else 695 call xmpi_sum(ghc4,mpi_enreg%comm_spinor,ierr) 696 call xmpi_sum(ghc3,mpi_enreg%comm_spinor,ierr) 697 if (nspinor1TreatedByThisProc) then 698 do idat=1,ndat_ 699 do ipw=1,npw_k2 700 ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2)=ghc1(1:2,ipw+(idat-1)*npw_k2)+ghc4(1:2,ipw+(idat-1)*npw_k2) 701 end do 702 end do 703 else if (nspinor2TreatedByThisProc) then 704 do idat=1,ndat_ 705 do ipw=1,npw_k2 706 ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2+shift2)=ghc3(1:2,ipw+(idat-1)*npw_k2)+ghc2(1:2,ipw+(idat-1)*npw_k2) 707 end do 708 end do 709 end if 710 end if 711 ABI_FREE(ghc1) 712 ABI_FREE(ghc2) 713 ABI_FREE(ghc3) 714 ABI_FREE(ghc4) 715 end if ! nvloc 716 717 if (nspinortot==2) then 718 ABI_FREE(cwavef1) 719 ABI_FREE(cwavef2) 720 end if 721 722 ABI_FREE(work) 723 724 if (double_rfft_trick) then 725 call cwavef_double_rfft_trick_unpack(ghc,cwavef_fft,mpi_enreg,ndat_,npw_k1) 726 ABI_FREE(cwavef_fft) 727 end if 728 729 ! Retrieve eventually original FFT distrib 730 if(have_to_reequilibrate) then 731 if(ndat_ > 1 ) then 732 do idat=1,ndat_ 733 do ipw = 1 ,npw_fft 734 cwavef_fft_tr(1:2, idat + ndat_*(ipw-1)) = cwavef_fft(1:2, ipw + npw_fft*(idat-1)) 735 end do 736 end do 737 call xmpi_alltoallv(cwavef_fft_tr,2*ndat_*recvcount_fft, 2*ndat_*recvdisp_fft, & 738 & buff_wf,2*ndat_*sendcount_fft,2*ndat_*senddisp_fft, mpi_enreg%comm_fft,ierr) 739 else 740 call xmpi_alltoallv(cwavef_fft,2*recvcount_fft, 2*recvdisp_fft, & 741 & buff_wf,2*sendcount_fft,2*senddisp_fft, mpi_enreg%comm_fft,ierr) 742 end if 743 do idat=1,ndat_ 744 do ipw = 1 ,npw_k2 745 ghc(1:2,ipw + npw_k2*(idat-1)) = buff_wf(1:2, idat + ndat_*(indices_pw_fft(ipw)-1)) 746 end do 747 end do 748 ABI_FREE(buff_wf) 749 ABI_FREE(cwavef_fft) 750 if(ndat_ > 1) then 751 ABI_FREE(cwavef_fft_tr) 752 end if 753 end if 754 755 ! Add metaGGA contribution 756 if (associated(gs_ham%vxctaulocal)) then 757 if (.not.k1_eq_k2) then 758 ABI_BUG('metaGGA not allowed for k/=k_^prime!') 759 end if 760 if (size(gs_ham%vxctaulocal)/=gs_ham%n4*gs_ham%n5*gs_ham%n6*gs_ham%nvloc*4) then 761 ABI_BUG('wrong sizes for vxctaulocal!') 762 end if 763 ABI_MALLOC(ghc_mGGA,(2,npw_k2*my_nspinor*ndat_)) 764 call getghc_mGGA(cwavef,ghc_mGGA,gbound_k1,gs_ham%gprimd,istwf_k_,kg_k1,kpt_k1,& 765 & gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,npw_k1,gs_ham%nvloc,& 766 & gs_ham%n4,gs_ham%n5,gs_ham%n6,my_nspinor,gs_ham%vxctaulocal,gs_ham%gpu_option) 767 ghc(1:2,1:npw_k2*my_nspinor*ndat_)=ghc(1:2,1:npw_k2*my_nspinor*ndat_)+ghc_mGGA(1:2,1:npw_k2*my_nspinor*ndat_) 768 ABI_FREE(ghc_mGGA) 769 end if 770 771 ! Add nuclear dipole moment contribution 772 if (associated(gs_ham%vectornd)) then 773 if (.not.k1_eq_k2) then 774 ABI_BUG('nuclear dipole vector potential not allowed for k/=k_^prime!') 775 end if 776 if (size(gs_ham%vectornd)/=gs_ham%n4*gs_ham%n5*gs_ham%n6*gs_ham%nvloc*3) then 777 ABI_BUG('wrong sizes for vectornd in getghc!') 778 end if 779 ABI_MALLOC(ghc_vectornd,(2,npw_k2*my_nspinor*ndat_)) 780 ghc_vectornd=zero 781 call getghc_nucdip(cwavef,ghc_vectornd,gbound_k1,istwf_k_,kg_k1,kpt_k1,& 782 & gs_ham%mgfft,mpi_enreg,ndat_,gs_ham%ngfft,npw_k1,gs_ham%nvloc,& 783 & gs_ham%n4,gs_ham%n5,gs_ham%n6,my_nspinor,gs_ham%vectornd,gs_ham%gpu_option) 784 ghc(1:2,1:npw_k2*my_nspinor*ndat_)=ghc(1:2,1:npw_k2*my_nspinor*ndat_)+ghc_vectornd(1:2,1:npw_k2*my_nspinor*ndat_) 785 ABI_FREE(ghc_vectornd) 786 end if 787 788 end if ! type_calc 789 790 ABI_NVTX_END_RANGE() 791 792 793 if (any(type_calc == [0, 2, 3])) then 794 795 !============================================================ 796 ! Application of the non-local potential and the Fock potential 797 !============================================================ 798 799 ABI_NVTX_START_RANGE(NVTX_GETGHC_NLOCPOT) 800 801 if (type_calc==0 .or. type_calc==2) then 802 signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; tim_nonlop=1 803 cpopt_here=-1;if (gs_ham%usepaw==1) cpopt_here=cpopt 804 if (has_fock) then 805 if (gs_ham%usepaw==1) then 806 cpopt_here=max(cpopt,0) 807 if (cpopt<2) then 808 ABI_MALLOC(cwaveprj_fock,(gs_ham%natom,my_nspinor*ndat)) 809 ABI_MALLOC(dimcprj,(gs_ham%natom)) 810 call pawcprj_getdim(dimcprj,gs_ham%natom,gs_ham%nattyp,gs_ham%ntypat,& 811 & gs_ham%typat,fock%pawtab,'O') 812 call pawcprj_alloc(cwaveprj_fock,0,dimcprj) 813 ABI_FREE(dimcprj) 814 else 815 cwaveprj_fock=>cwaveprj 816 end if 817 cwaveprj_nonlop=>cwaveprj_fock 818 else 819 cwaveprj_nonlop=>cwaveprj 820 cwaveprj_fock=>cwaveprj 821 end if 822 else 823 cwaveprj_nonlop=>cwaveprj 824 end if 825 paw_opt=gs_ham%usepaw ; if (sij_opt/=0) paw_opt=sij_opt+3 826 lambda_ndat = lambda 827 828 if (gs_ham%usepaw==0) gsc_ptr => nonlop_dum 829 if (gs_ham%usepaw==1) gsc_ptr => gsc 830 call nonlop(choice,cpopt_here,cwaveprj_nonlop,enlout,gs_ham,idir,lambda_ndat,mpi_enreg,ndat,& 831 & nnlout,paw_opt,signs,gsc_ptr,tim_nonlop,cwavef,gvnlxc_,select_k=select_k_) 832 833 if (gs_ham%usepaw==1 .and. has_fock)then 834 if (fock_get_getghc_call(fock)==1) then 835 if(gs_ham%gpu_option==ABI_GPU_KOKKOS) then 836 #if defined HAVE_GPU && defined HAVE_YAKL 837 ABI_MALLOC_MANAGED(gvnlc, (/2,npw_k2*my_nspinor*ndat/)) 838 #endif 839 else 840 ABI_MALLOC(gvnlc, (2,npw_k2*my_nspinor*ndat)) 841 end if 842 gvnlc=gvnlxc_ 843 endif 844 endif 845 846 ! Calculation of the Fock exact exchange contribution from the Fock or ACE operator 847 if (has_fock) then 848 if (fock_get_getghc_call(fock)==1) then 849 if (gs_ham%usepaw==0) cwaveprj_idat => cwaveprj 850 if (fock%use_ACE==0) then 851 call timab(360,1,tsec) 852 do idat=1,ndat 853 if (gs_ham%usepaw==1) cwaveprj_idat => cwaveprj_fock(:,(idat-1)*my_nspinor+1:idat*my_nspinor) 854 call fock_getghc(cwavef(:,1+(idat-1)*npw_k1*my_nspinor:idat*npw_k1*my_nspinor),cwaveprj_idat,& 855 & gvnlxc_(:,1+(idat-1)*npw_k2*my_nspinor:idat*npw_k2*my_nspinor),gs_ham,mpi_enreg) 856 end do ! idat 857 call timab(360,2,tsec) 858 else 859 do idat=1,ndat 860 call fock_ACE_getghc(cwavef(:,1+(idat-1)*npw_k1*my_nspinor:idat*npw_k1*my_nspinor),& 861 & gvnlxc_(:,1+(idat-1)*npw_k2*my_nspinor:idat*npw_k2*my_nspinor),gs_ham,mpi_enreg) 862 end do ! idat 863 end if 864 end if 865 end if 866 867 else if (type_calc == 3) then 868 ! for kinetic and local only, nonlocal and vfock should be zero 869 gvnlxc_(:,:) = zero 870 end if ! if(type_calc... 871 872 ABI_NVTX_END_RANGE() 873 874 !============================================================ 875 ! Assemble kinetic, local, nonlocal and Fock contributions 876 !============================================================ 877 878 ABI_NVTX_START_RANGE(NVTX_GETGHC_KIN) 879 880 #ifdef FC_NVHPC 881 !FIXME This Kokkos kernel seems to cause issues under NVHPC so it is disabled 882 if (.false.) then 883 #else 884 if (gs_ham%gpu_option == ABI_GPU_KOKKOS) then 885 #endif 886 887 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) 888 call assemble_energy_contribution_kokkos(c_loc(ghc), & 889 & c_loc(gsc), c_loc(kinpw_k2), c_loc(cwavef), c_loc(gvnlxc_), & 890 & ndat, my_nspinor, npw_k2, sij_opt, k1_eq_k2, hugevalue) 891 ! sync device so that data can be reused safely on host 892 ! will probably be moved elsewhere once all the scf loop runs on device 893 call gpu_device_synchronize() 894 #endif 895 896 else 897 898 #ifdef FC_NVHPC 899 #if defined(HAVE_GPU_CUDA) && defined(HAVE_KOKKOS) 900 !Related to FIXME above 901 if (gs_ham%gpu_option == ABI_GPU_KOKKOS) call gpu_device_synchronize() 902 #endif 903 #endif 904 905 ! Assemble modified kinetic, local and nonlocal contributions 906 ! to <G|H|C(n,k)>. Take also into account build-in debugging. 907 if(prtvol/=-level)then 908 do idat=1,ndat 909 if (k1_eq_k2) then 910 !$OMP PARALLEL DO PRIVATE(igspinor) COLLAPSE(2) IF(gemm_nonlop_use_gemm) 911 do ispinor=1,my_nspinor 912 do ig=1,npw_k2 913 igspinor=ig+npw_k2*(ispinor-1)+npw_k2*my_nspinor*(idat-1) 914 if(kinpw_k2(ig)<huge(zero)*1.d-11)then 915 ghc(re,igspinor) = ghc(re,igspinor) + kinpw_k2(ig)*cwavef(re,igspinor) + gvnlxc_(re,igspinor) 916 ghc(im,igspinor) = ghc(im,igspinor) + kinpw_k2(ig)*cwavef(im,igspinor) + gvnlxc_(im,igspinor) 917 else 918 ghc(re,igspinor)=zero 919 ghc(im,igspinor)=zero 920 if (sij_opt==1) then 921 gsc(re,igspinor)=zero 922 gsc(im,igspinor)=zero 923 end if 924 end if 925 end do ! ig 926 end do ! ispinor 927 928 else 929 !$OMP PARALLEL DO PRIVATE(igspinor) COLLAPSE(2) IF(gemm_nonlop_use_gemm) 930 do ispinor=1,my_nspinor 931 do ig=1,npw_k2 932 igspinor=ig+npw_k2*(ispinor-1)+npw_k2*my_nspinor*(idat-1) 933 if(kinpw_k2(ig)<huge(zero)*1.d-11)then 934 ghc(re,igspinor)= ghc(re,igspinor) + gvnlxc_(re,igspinor) 935 ghc(im,igspinor)= ghc(im,igspinor) + gvnlxc_(im,igspinor) 936 else 937 ghc(re,igspinor)=zero 938 ghc(im,igspinor)=zero 939 if (sij_opt==1) then 940 gsc(re,igspinor)=zero 941 gsc(im,igspinor)=zero 942 end if 943 end if 944 end do ! ig 945 end do ! ispinor 946 end if 947 end do ! idat 948 else 949 ! Here, debugging section 950 call wrtout(std_out,' getghc : components of ghc ','PERS') 951 write(msg,'(a)')& 952 & 'icp ig ispinor igspinor re/im ghc kinpw cwavef glocc gvnlxc gsc' 953 call wrtout(std_out,msg,'PERS') 954 do idat=1,ndat 955 do ispinor=1,my_nspinor 956 do ig=1,npw_k2 957 igspinor=ig+npw_k2*(ispinor-1)+npw_k2*my_nspinor*(idat-1) 958 if(kinpw_k2(ig)<huge(zero)*1.d-11)then 959 if (k1_eq_k2) then 960 ghcre=kinpw_k2(ig)*cwavef(re,igspinor)+ghc(re,igspinor)+gvnlxc_(re,igspinor) 961 ghcim=kinpw_k2(ig)*cwavef(im,igspinor)+ghc(im,igspinor)+gvnlxc_(im,igspinor) 962 else 963 ghcre=ghc(re,igspinor)+gvnlxc_(re,igspinor) 964 ghcim=ghc(im,igspinor)+gvnlxc_(im,igspinor) 965 end if 966 else 967 ghcre=zero 968 ghcim=zero 969 if (sij_opt==1) then 970 gsc(re,igspinor)=zero 971 gsc(im,igspinor)=zero 972 end if 973 end if 974 iispinor=ispinor;if (mpi_enreg%paral_spinor==1) iispinor=mpi_enreg%me_spinor+1 975 if (sij_opt == 1) then 976 write(msg,'(a,3(1x,i5),6(1x,es13.6))') ' 1 ', ig, iispinor, igspinor,ghcre,& 977 & kinpw_k2(ig),cwavef(re,igspinor),ghc(re,igspinor),gvnlxc_(re,igspinor), gsc(re,igspinor) 978 call wrtout(std_out,msg,'PERS') 979 write(msg,'(a,3(1x,i5),6(1x,es13.6))') ' 2 ', ig, iispinor, igspinor,ghcim,& 980 & kinpw_k2(ig),cwavef(im,igspinor),ghc(im,igspinor),gvnlxc_(im,igspinor), gsc(im,igspinor) 981 call wrtout(std_out,msg,'PERS') 982 else 983 write(msg,'(a,3(1x,i5),6(1x,es13.6))') ' 1 ', ig, iispinor, igspinor,ghcre,& 984 & kinpw_k2(ig),cwavef(re,igspinor),ghc(re,igspinor),gvnlxc_(re,igspinor) 985 call wrtout(std_out,msg,'PERS') 986 write(msg,'(a,3(1x,i5),6(1x,es13.6))') ' 2 ', ig, iispinor, igspinor,ghcim,& 987 & kinpw_k2(ig),cwavef(im,igspinor),ghc(im,igspinor),gvnlxc_(im,igspinor) 988 call wrtout(std_out,msg,'PERS') 989 end if 990 ghc(re,igspinor)=ghcre 991 ghc(im,igspinor)=ghcim 992 end do ! ig 993 end do ! ispinor 994 end do ! idat 995 end if 996 end if ! gs_ham%gpu_option 997 998 ABI_NVTX_END_RANGE() 999 1000 ! Special case of PAW + Fock : only return Fock operator contribution in gvnlxc_ 1001 if (gs_ham%usepaw==1 .and. has_fock) then 1002 gvnlxc_=gvnlxc_-gvnlc 1003 if(gs_ham%gpu_option==ABI_GPU_KOKKOS) then 1004 #if defined HAVE_GPU && defined HAVE_YAKL 1005 ABI_FREE_MANAGED(gvnlc) 1006 #endif 1007 else 1008 ABI_FREE(gvnlc) 1009 end if 1010 endif 1011 1012 if (local_gvnlxc) then 1013 if(gs_ham%gpu_option==ABI_GPU_KOKKOS) then 1014 #if defined HAVE_GPU && defined HAVE_YAKL 1015 ABI_FREE_MANAGED(gvnlxc_) 1016 #endif 1017 else 1018 ABI_FREE(gvnlxc_) 1019 end if 1020 end if 1021 1022 ! Structured debugging : if prtvol=-level, stop here. 1023 if(prtvol==-level)then 1024 write(msg,'(a,i0,a)')' getghc : exit prtvol=-',level,', debugging mode => stop ' 1025 ABI_ERROR(msg) 1026 end if 1027 1028 if (type_calc==0.or.type_calc==2) then 1029 if (has_fock.and.gs_ham%usepaw==1.and.cpopt<2) then 1030 call pawcprj_free(cwaveprj_fock) 1031 ABI_FREE(cwaveprj_fock) 1032 end if 1033 end if 1034 1035 end if ! type_calc 1036 1037 call timab(350+tim_getghc,2,tsec) 1038 1039 DBG_EXIT("COLL") 1040 ABI_NVTX_END_RANGE() 1041 1042 end subroutine getghc
ABINIT/getghc_mGGA [ Functions ]
NAME
getghc_mGGA
FUNCTION
Compute metaGGA contribution to <G|H|C> for input vector |C> expressed in reciprocal space.
INPUTS
cwavef(2,npw_k*my_nspinor*ndat)=planewave coefficients of wavefunction. gbound_k(2*mgfft+4)=sphere boundary info istwf_k=input parameter that describes the storage of wfs kg_k(3,npw_k)=G vec coordinates wrt recip lattice transl. kpt(3)=current k point mgfft=maximum single fft dimension mpi_enreg=information about MPI parallelization my_nspinor=number of spinorial components of the wavefunctions (on current proc) ndat=number of FFTs to perform in parall ngfft(18)=contain all needed information about 3D FFT npw_k=number of planewaves in basis for given k point. nvloc=number of spin components of vxctaulocal n4,n5,n6=for dimensionning of vxctaulocal gpu_option= GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU) vxctaulocal(n4,n5,n6,nvloc,4)= local potential corresponding to the derivative of XC energy with respect to kinetic energy density, in real space, on the augmented fft grid. This array contains also the gradient of vxctaulocal (gvxctaulocal) in vxctaulocal(:,:,:,:,2:4).
OUTPUT
ghc_mGGA(2,npw_k*my_nspinor*ndat)=metaGGA contribution to <G|H|C>
SIDE EFFECTS
SOURCE
1536 subroutine getghc_mGGA(cwavef,ghc_mGGA,gbound_k,gprimd,istwf_k,kg_k,kpt,mgfft,mpi_enreg,& 1537 & ndat,ngfft,npw_k,nvloc,n4,n5,n6,my_nspinor,vxctaulocal,gpu_option) 1538 1539 !Arguments ------------------------------------ 1540 !scalars 1541 integer,intent(in) :: istwf_k,mgfft,my_nspinor,ndat,npw_k,nvloc,n4,n5,n6,gpu_option 1542 type(MPI_type),intent(in) :: mpi_enreg 1543 !arrays 1544 integer,intent(in) :: gbound_k(2*mgfft+4),kg_k(3,npw_k),ngfft(18) 1545 real(dp),intent(in) :: gprimd(3,3),kpt(3) 1546 real(dp),intent(inout) :: cwavef(2,npw_k*my_nspinor*ndat) 1547 real(dp),intent(inout) :: ghc_mGGA(2,npw_k*my_nspinor*ndat) 1548 real(dp),intent(inout) :: vxctaulocal(n4,n5,n6,nvloc,4) 1549 1550 !Local variables------------------------------- 1551 !scalars 1552 integer,parameter :: tim_fourwf=1 1553 integer :: idat,idir,ipw,nspinortot,shift 1554 logical :: nspinor1TreatedByThisProc,nspinor2TreatedByThisProc 1555 real(dp) :: gp2pi1,gp2pi2,gp2pi3,kpt_cart,kg_k_cart,weight=one 1556 !arrays 1557 real(dp),allocatable :: cwavef1(:,:),cwavef2(:,:) 1558 real(dp),allocatable :: gcwavef(:,:,:),gcwavef1(:,:,:),gcwavef2(:,:,:) 1559 real(dp),allocatable :: ghc1(:,:),ghc2(:,:) 1560 real(dp),allocatable :: lcwavef(:,:),lcwavef1(:,:),lcwavef2(:,:) 1561 real(dp),allocatable :: work(:,:,:,:) 1562 1563 ! ********************************************************************* 1564 1565 ghc_mGGA(:,:)=zero 1566 if (nvloc/=1) return 1567 1568 nspinortot=min(2,(1+mpi_enreg%paral_spinor)*my_nspinor) 1569 if (mpi_enreg%paral_spinor==0) then 1570 shift=npw_k 1571 nspinor1TreatedByThisProc=.true. 1572 nspinor2TreatedByThisProc=(nspinortot==2) 1573 else 1574 shift=0 1575 nspinor1TreatedByThisProc=(mpi_enreg%me_spinor==0) 1576 nspinor2TreatedByThisProc=(mpi_enreg%me_spinor==1) 1577 end if 1578 1579 ABI_MALLOC(work,(2,n4,n5,n6*ndat)) 1580 1581 if (nspinortot==1) then 1582 1583 ABI_MALLOC(ghc1,(2,npw_k*ndat)) 1584 1585 ! Do it in 3 STEPs: 1586 ! STEP1: Compute grad of cwavef and Laplacian of cwavef 1587 ABI_MALLOC(gcwavef,(2,npw_k*ndat,3)) 1588 ABI_MALLOC(lcwavef,(2,npw_k*ndat)) 1589 !!$OMP PARALLEL DO 1590 do idat=1,ndat 1591 do ipw=1,npw_k 1592 gcwavef(:,ipw+(idat-1)*npw_k,1:3)=zero 1593 lcwavef(:,ipw+(idat-1)*npw_k) =zero 1594 end do 1595 end do 1596 do idir=1,3 1597 gp2pi1=gprimd(idir,1)*two_pi 1598 gp2pi2=gprimd(idir,2)*two_pi 1599 gp2pi3=gprimd(idir,3)*two_pi 1600 kpt_cart=gp2pi1*kpt(1)+gp2pi2*kpt(2)+gp2pi3*kpt(3) 1601 ! Multiplication by 2pi i (G+k)_idir for gradient 1602 ! Multiplication by -(2pi (G+k)_idir )**2 for Laplacian 1603 do idat=1,ndat 1604 do ipw=1,npw_k 1605 kg_k_cart=gp2pi1*kg_k(1,ipw)+gp2pi2*kg_k(2,ipw)+gp2pi3*kg_k(3,ipw)+kpt_cart 1606 gcwavef(1,ipw+(idat-1)*npw_k,idir)= cwavef(2,ipw+(idat-1)*npw_k)*kg_k_cart 1607 gcwavef(2,ipw+(idat-1)*npw_k,idir)=-cwavef(1,ipw+(idat-1)*npw_k)*kg_k_cart 1608 lcwavef(1,ipw+(idat-1)*npw_k)=lcwavef(1,ipw+(idat-1)*npw_k)-cwavef(1,ipw+(idat-1)*npw_k)*kg_k_cart**2 1609 lcwavef(2,ipw+(idat-1)*npw_k)=lcwavef(2,ipw+(idat-1)*npw_k)-cwavef(2,ipw+(idat-1)*npw_k)*kg_k_cart**2 1610 end do 1611 end do 1612 end do ! idir 1613 ! STEP2: Compute (vxctaulocal)*(Laplacian of cwavef) and add it to ghc 1614 call fourwf(1,vxctaulocal(:,:,:,:,1),lcwavef,ghc1,work,gbound_k,gbound_k,& 1615 & istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,& 1616 & tim_fourwf,weight,weight,gpu_option=gpu_option) 1617 !!$OMP PARALLEL DO 1618 do idat=1,ndat 1619 do ipw=1,npw_k 1620 ghc_mGGA(:,ipw+(idat-1)*npw_k)=ghc_mGGA(:,ipw+(idat-1)*npw_k)-half*ghc1(:,ipw+(idat-1)*npw_k) 1621 end do 1622 end do 1623 ABI_FREE(lcwavef) 1624 ! STEP3: Compute sum of (grad components of vxctaulocal)*(grad components of cwavef) 1625 do idir=1,3 1626 call fourwf(1,vxctaulocal(:,:,:,:,1+idir),gcwavef(:,:,idir),ghc1,work,gbound_k,gbound_k,& 1627 istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,& 1628 & tim_fourwf,weight,weight,gpu_option=gpu_option) 1629 !!$OMP PARALLEL DO 1630 do idat=1,ndat 1631 do ipw=1,npw_k 1632 ghc_mGGA(:,ipw+(idat-1)*npw_k)=ghc_mGGA(:,ipw+(idat-1)*npw_k)-half*ghc1(:,ipw+(idat-1)*npw_k) 1633 end do 1634 end do 1635 end do ! idir 1636 ABI_FREE(gcwavef) 1637 ABI_FREE(ghc1) 1638 1639 else ! nspinortot==2 1640 1641 ABI_MALLOC(cwavef1,(2,npw_k*ndat)) 1642 ABI_MALLOC(cwavef2,(2,npw_k*ndat)) 1643 do idat=1,ndat 1644 do ipw=1,npw_k 1645 cwavef1(1:2,ipw+(idat-1)*npw_k)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k) 1646 cwavef2(1:2,ipw+(idat-1)*npw_k)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k+shift) 1647 end do 1648 end do 1649 ! call cg_zcopy(npw*ndat,cwavef(1,1),cwavef1) 1650 ! call cg_zcopy(npw*ndat,cwavef(1,1+shift),cwavef2) 1651 1652 1653 if (nspinor1TreatedByThisProc) then 1654 1655 ABI_MALLOC(ghc1,(2,npw_k*ndat)) 1656 1657 ! Do it in 3 STEPs: 1658 ! STEP1: Compute grad of cwavef and Laplacian of cwavef 1659 ABI_MALLOC(gcwavef1,(2,npw_k*ndat,3)) 1660 ABI_MALLOC(lcwavef1,(2,npw_k*ndat)) 1661 !!$OMP PARALLEL DO 1662 do idat=1,ndat 1663 do ipw=1,npw_k 1664 gcwavef1(:,ipw+(idat-1)*npw_k,1:3)=zero 1665 lcwavef1(:,ipw+(idat-1)*npw_k)=zero 1666 end do 1667 end do 1668 do idir=1,3 1669 gp2pi1=gprimd(idir,1)*two_pi 1670 gp2pi2=gprimd(idir,2)*two_pi 1671 gp2pi3=gprimd(idir,3)*two_pi 1672 kpt_cart=gp2pi1*kpt(1)+gp2pi2*kpt(2)+gp2pi3*kpt(3) 1673 ! Multiplication by 2pi i (G+k)_idir for gradient 1674 ! Multiplication by -(2pi (G+k)_idir )**2 for Laplacian 1675 do idat=1,ndat 1676 do ipw=1,npw_k 1677 kg_k_cart=gp2pi1*kg_k(1,ipw)+gp2pi2*kg_k(2,ipw)+gp2pi3*kg_k(3,ipw)+kpt_cart 1678 gcwavef1(1,ipw+(idat-1)*npw_k,idir)= cwavef1(2,ipw+(idat-1)*npw_k)*kg_k_cart 1679 gcwavef1(2,ipw+(idat-1)*npw_k,idir)=-cwavef1(1,ipw+(idat-1)*npw_k)*kg_k_cart 1680 lcwavef1(1,ipw+(idat-1)*npw_k)=lcwavef1(1,ipw+(idat-1)*npw_k)-cwavef1(1,ipw+(idat-1)*npw_k)*kg_k_cart**2 1681 lcwavef1(2,ipw+(idat-1)*npw_k)=lcwavef1(2,ipw+(idat-1)*npw_k)-cwavef1(2,ipw+(idat-1)*npw_k)*kg_k_cart**2 1682 end do 1683 end do 1684 end do ! idir 1685 ! STEP2: Compute (vxctaulocal)*(Laplacian of cwavef) and add it to ghc 1686 call fourwf(1,vxctaulocal(:,:,:,:,1),lcwavef1,ghc1,work,gbound_k,gbound_k,& 1687 & istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,& 1688 & tim_fourwf,weight,weight,gpu_option=gpu_option) 1689 !!$OMP PARALLEL DO 1690 do idat=1,ndat 1691 do ipw=1,npw_k 1692 ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)=ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)-half*ghc1(:,ipw+(idat-1)*npw_k) 1693 end do 1694 end do 1695 ABI_FREE(lcwavef1) 1696 ! STEP3: Compute (grad components of vxctaulocal)*(grad components of cwavef) 1697 do idir=1,3 1698 call fourwf(1,vxctaulocal(:,:,:,:,1+idir),gcwavef1(:,:,idir),ghc1,work,gbound_k,gbound_k,& 1699 istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,& 1700 & tim_fourwf,weight,weight,gpu_option=gpu_option) 1701 !!$OMP PARALLEL DO 1702 do idat=1,ndat 1703 do ipw=1,npw_k 1704 ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k) = ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)-half*ghc1(:,ipw+(idat-1)*npw_k) 1705 end do 1706 end do 1707 end do ! idir 1708 ABI_FREE(gcwavef1) 1709 ABI_FREE(ghc1) 1710 1711 end if ! spin 1 treated by this proc 1712 1713 if (nspinor2TreatedByThisProc) then 1714 1715 ABI_MALLOC(ghc2,(2,npw_k*ndat)) 1716 1717 ! Do it in 3 STEPs: 1718 ! STEP1: Compute grad of cwavef and Laplacian of cwavef 1719 ABI_MALLOC(gcwavef2,(2,npw_k*ndat,3)) 1720 ABI_MALLOC(lcwavef2,(2,npw_k*ndat)) 1721 !!$OMP PARALLEL DO 1722 do idat=1,ndat 1723 do ipw=1,npw_k 1724 gcwavef2(:,ipw+(idat-1)*npw_k,1:3)=zero 1725 lcwavef2(:,ipw+(idat-1)*npw_k) =zero 1726 end do 1727 end do 1728 do idir=1,3 1729 gp2pi1=gprimd(idir,1)*two_pi 1730 gp2pi2=gprimd(idir,2)*two_pi 1731 gp2pi3=gprimd(idir,3)*two_pi 1732 kpt_cart=gp2pi1*kpt(1)+gp2pi2*kpt(2)+gp2pi3*kpt(3) 1733 ! Multiplication by 2pi i (G+k)_idir for gradient 1734 ! Multiplication by -(2pi (G+k)_idir )**2 for Laplacian 1735 do idat=1,ndat 1736 do ipw=1,npw_k 1737 kg_k_cart=gp2pi1*kg_k(1,ipw)+gp2pi2*kg_k(2,ipw)+gp2pi3*kg_k(3,ipw)+kpt_cart 1738 gcwavef2(1,ipw+(idat-1)*npw_k,idir)= cwavef2(2,ipw+(idat-1)*npw_k)*kg_k_cart 1739 gcwavef2(2,ipw+(idat-1)*npw_k,idir)=-cwavef2(1,ipw+(idat-1)*npw_k)*kg_k_cart 1740 lcwavef2(1,ipw+(idat-1)*npw_k)=lcwavef2(1,ipw+(idat-1)*npw_k)-cwavef2(1,ipw+(idat-1)*npw_k)*kg_k_cart**2 1741 lcwavef2(2,ipw+(idat-1)*npw_k)=lcwavef2(2,ipw+(idat-1)*npw_k)-cwavef2(2,ipw+(idat-1)*npw_k)*kg_k_cart**2 1742 end do 1743 end do 1744 end do ! idir 1745 ! STEP2: Compute (vxctaulocal)*(Laplacian of cwavef) and add it to ghc 1746 call fourwf(1,vxctaulocal(:,:,:,:,1),lcwavef2,ghc2,work,gbound_k,gbound_k,& 1747 & istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,& 1748 & tim_fourwf,weight,weight,gpu_option=gpu_option) 1749 !!$OMP PARALLEL DO 1750 do idat=1,ndat 1751 do ipw=1,npw_k 1752 ! original code 1753 ! ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)=ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)-half*ghc2(:,ipw+(idat-1)*npw_k) 1754 ! but this stores the spinor2 result in the spinor1 location. Should be stored with shift 1755 ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k+shift)=ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k+shift)& 1756 & -half*ghc2(:,ipw+(idat-1)*npw_k) 1757 end do 1758 end do 1759 ABI_FREE(lcwavef2) 1760 ! STEP3: Compute sum of (grad components of vxctaulocal)*(grad components of cwavef) 1761 do idir=1,3 1762 call fourwf(1,vxctaulocal(:,:,:,:,1+idir),gcwavef2(:,:,idir),ghc2,work,gbound_k,gbound_k,& 1763 istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,& 1764 & tim_fourwf,weight,weight,gpu_option=gpu_option) 1765 !!$OMP PARALLEL DO 1766 do idat=1,ndat 1767 do ipw=1,npw_k 1768 ! original code 1769 ! ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)=ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k)-half*ghc2(:,ipw+(idat-1)*npw_k) 1770 ! but this stores the spinor2 result in the spinor1 location. Should be stored with shift 1771 ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k+shift)=ghc_mGGA(:,ipw+(idat-1)*my_nspinor*npw_k+shift)& 1772 & -half*ghc2(:,ipw+(idat-1)*npw_k) 1773 end do 1774 end do 1775 end do ! idir 1776 1777 ABI_FREE(gcwavef2) 1778 ABI_FREE(ghc2) 1779 1780 end if ! spin 2 treated by this proc 1781 1782 ABI_FREE(cwavef1) 1783 ABI_FREE(cwavef2) 1784 1785 end if ! nspinortot 1786 1787 ABI_FREE(work) 1788 1789 end subroutine getghc_mGGA
ABINIT/getghc_nucdip [ Functions ]
NAME
getghc_nucdip
FUNCTION
Compute magnetic nuclear dipole moment contribution to <G|H|C> for input vector |C> expressed in reciprocal space.
INPUTS
cwavef(2,npw_k*my_nspinor*ndat)=planewave coefficients of wavefunction. gbound_k(2*mgfft+4)=sphere boundary info gprimd(3,3)=dimensional reciprocal space primitive translations (b^-1) istwf_k=input parameter that describes the storage of wfs kg_k(3,npw_k)=G vec coordinates wrt recip lattice transl. kpt(3)=current k point mgfft=maximum single fft dimension mpi_enreg=information about MPI parallelization my_nspinor=number of spinorial components of the wavefunctions (on current proc) ndat=number of FFTs to perform in parall ngfft(18)=contain all needed information about 3D FFT npw_k=number of planewaves in basis for given k point. nvloc=number of spin components of vxctaulocal n4,n5,n6=for dimensionning of vxctaulocal gpu_option= GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU) vectornd(n4,n5,n6,nvloc,3)= local potential corresponding to the vector potential of the array of nuclear magnetic dipoles, in real space, on the augmented fft grid.
OUTPUT
ghc_vectornd(2,npw_k*my_nspinor*ndat)=A.p contribution to <G|H|C> for array of nuclear dipoles
SIDE EFFECTS
NOTES
this code is a copied, simplied version of getghc_mGGA (see below) and should eventually be integrated into that code, to simplify maintenance
SOURCE
1300 subroutine getghc_nucdip(cwavef,ghc_vectornd,gbound_k,istwf_k,kg_k,kpt,mgfft,mpi_enreg,& 1301 & ndat,ngfft,npw_k,nvloc,n4,n5,n6,my_nspinor,vectornd,gpu_option) 1302 1303 !Arguments ------------------------------------ 1304 !scalars 1305 integer,intent(in) :: istwf_k,mgfft,my_nspinor,ndat,npw_k,nvloc,n4,n5,n6,gpu_option 1306 type(MPI_type),intent(in) :: mpi_enreg 1307 !arrays 1308 integer,intent(in) :: gbound_k(2*mgfft+4),kg_k(3,npw_k),ngfft(18) 1309 real(dp),intent(in) :: kpt(3) 1310 real(dp),intent(inout) :: cwavef(2,npw_k*my_nspinor*ndat) 1311 real(dp),intent(inout) :: ghc_vectornd(2,npw_k*my_nspinor*ndat) 1312 real(dp),intent(inout) :: vectornd(n4,n5,n6,nvloc,3) 1313 1314 !Local variables------------------------------- 1315 !scalars 1316 integer,parameter :: tim_fourwf=1 1317 integer :: idat,idir,ipw,nspinortot,shift 1318 logical :: nspinor1TreatedByThisProc,nspinor2TreatedByThisProc 1319 real(dp) :: scale_conversion,weight=one 1320 !arrays 1321 real(dp),allocatable :: cwavef1(:,:),cwavef2(:,:) 1322 real(dp),allocatable :: gcwavef(:,:,:),gcwavef1(:,:,:),gcwavef2(:,:,:) 1323 real(dp),allocatable :: ghc1(:,:),ghc2(:,:),kgkpk(:,:) 1324 real(dp),allocatable :: work(:,:,:,:) 1325 1326 ! ********************************************************************* 1327 1328 ghc_vectornd(:,:)=zero 1329 if (nvloc/=1) return 1330 1331 nspinortot=min(2,(1+mpi_enreg%paral_spinor)*my_nspinor) 1332 if (mpi_enreg%paral_spinor==0) then 1333 shift=npw_k 1334 nspinor1TreatedByThisProc=.true. 1335 nspinor2TreatedByThisProc=(nspinortot==2) 1336 else 1337 shift=0 1338 nspinor1TreatedByThisProc=(mpi_enreg%me_spinor==0) 1339 nspinor2TreatedByThisProc=(mpi_enreg%me_spinor==1) 1340 end if 1341 1342 ABI_MALLOC(work,(2,n4,n5,n6*ndat)) 1343 1344 ! scale conversion from SI to atomic units, 1345 ! here \alpha^2 where \alpha is the fine structure constant 1346 scale_conversion = FineStructureConstant2 1347 1348 if (nspinortot==1) then 1349 1350 ABI_MALLOC(ghc1,(2,npw_k*ndat)) 1351 1352 ! Do it in 2 STEPs: 1353 ! STEP1: Compute grad of cwavef 1354 ABI_MALLOC(gcwavef,(2,npw_k*ndat,3)) 1355 1356 gcwavef = zero 1357 1358 ! compute k + G. Note these are in reduced coords 1359 ABI_MALLOC(kgkpk,(npw_k,3)) 1360 do ipw = 1, npw_k 1361 kgkpk(ipw,:) = kpt(:) + kg_k(:,ipw) 1362 end do 1363 1364 ! make 2\pi(k+G)c(G)|G> by element-wise multiplication 1365 do idir = 1, 3 1366 do idat = 1, ndat 1367 gcwavef(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = & 1368 & cwavef(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir) 1369 gcwavef(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = & 1370 & cwavef(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir) 1371 end do 1372 end do 1373 ABI_FREE(kgkpk) 1374 gcwavef = gcwavef*two_pi 1375 1376 ! STEP2: Compute sum of (grad components of vectornd)*(grad components of cwavef) 1377 do idir=1,3 1378 call fourwf(1,vectornd(:,:,:,:,idir),gcwavef(:,:,idir),ghc1,work,gbound_k,gbound_k,& 1379 istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,& 1380 & tim_fourwf,weight,weight,gpu_option=gpu_option) 1381 !!$OMP PARALLEL DO 1382 ! DAXPY is a BLAS routine for y -> A*x + y, here x = ghc1, A = scale_conversion, and y = ghc_vectornd 1383 ! should be faster than explicit loop over ipw as npw_k gets large 1384 do idat=1,ndat 1385 call DAXPY(npw_k,scale_conversion,ghc1(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,& 1386 & ghc_vectornd(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1) 1387 call DAXPY(npw_k,scale_conversion,ghc1(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,& 1388 & ghc_vectornd(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1) 1389 end do 1390 end do ! idir 1391 ABI_FREE(gcwavef) 1392 ABI_FREE(ghc1) 1393 1394 else ! nspinortot==2 1395 1396 ABI_MALLOC(cwavef1,(2,npw_k*ndat)) 1397 ABI_MALLOC(cwavef2,(2,npw_k*ndat)) 1398 do idat=1,ndat 1399 do ipw=1,npw_k 1400 cwavef1(1:2,ipw+(idat-1)*npw_k)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k) 1401 cwavef2(1:2,ipw+(idat-1)*npw_k)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k+shift) 1402 end do 1403 end do 1404 1405 ! compute k + G. Note these are in reduced coords 1406 ABI_MALLOC(kgkpk,(npw_k,3)) 1407 do ipw = 1, npw_k 1408 kgkpk(ipw,:) = kpt(:) + kg_k(:,ipw) 1409 end do 1410 1411 if (nspinor1TreatedByThisProc) then 1412 1413 ABI_MALLOC(ghc1,(2,npw_k*ndat)) 1414 1415 ! Do it in 2 STEPs: 1416 ! STEP1: Compute grad of cwavef 1417 ABI_MALLOC(gcwavef1,(2,npw_k*ndat,3)) 1418 1419 gcwavef1 = zero 1420 ! make 2\pi(k+G)c(G)|G> by element-wise multiplication 1421 do idir = 1, 3 1422 do idat = 1, ndat 1423 gcwavef1(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = & 1424 & cwavef1(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir) 1425 gcwavef1(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = & 1426 & cwavef1(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir) 1427 end do 1428 end do 1429 gcwavef1 = gcwavef1*two_pi 1430 1431 ! STEP2: Compute sum of (grad components of vectornd)*(grad components of cwavef) 1432 do idir=1,3 1433 call fourwf(1,vectornd(:,:,:,:,idir),gcwavef1(:,:,idir),ghc1,work,gbound_k,gbound_k,& 1434 istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,& 1435 & tim_fourwf,weight,weight,gpu_option=gpu_option) 1436 !!$OMP PARALLEL DO 1437 ! DAXPY is a BLAS routine for y -> A*x + y, here x = ghc1, A = scale_conversion, and y = ghc_vectornd 1438 ! should be faster than explicit loop over ipw as npw_k gets large 1439 do idat=1,ndat 1440 call DAXPY(npw_k,scale_conversion,ghc1(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,& 1441 & ghc_vectornd(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1) 1442 call DAXPY(npw_k,scale_conversion,ghc1(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,& 1443 & ghc_vectornd(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1) 1444 end do 1445 end do ! idir 1446 ABI_FREE(gcwavef1) 1447 ABI_FREE(ghc1) 1448 1449 end if ! end spinor 1 1450 1451 if (nspinor2TreatedByThisProc) then 1452 1453 ABI_MALLOC(ghc2,(2,npw_k*ndat)) 1454 1455 ! Do it in 2 STEPs: 1456 ! STEP1: Compute grad of cwavef 1457 ABI_MALLOC(gcwavef2,(2,npw_k*ndat,3)) 1458 1459 gcwavef2 = zero 1460 ! make 2\pi(k+G)c(G)|G> by element-wise multiplication 1461 do idir = 1, 3 1462 do idat = 1, ndat 1463 gcwavef2(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = & 1464 & cwavef2(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir) 1465 gcwavef2(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k,idir) = & 1466 & cwavef2(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k)*kgkpk(1:npw_k,idir) 1467 end do 1468 end do 1469 gcwavef2 = gcwavef2*two_pi 1470 1471 ! STEP2: Compute sum of (grad components of vectornd)*(grad components of cwavef) 1472 do idir=1,3 1473 call fourwf(1,vectornd(:,:,:,:,idir),gcwavef2(:,:,idir),ghc2,work,gbound_k,gbound_k,& 1474 istwf_k,kg_k,kg_k,mgfft,mpi_enreg,ndat,ngfft,npw_k,npw_k,n4,n5,n6,2,& 1475 & tim_fourwf,weight,weight,gpu_option=gpu_option) 1476 !!$OMP PARALLEL DO 1477 ! DAXPY is a BLAS routine for y -> A*x + y, here x = ghc1, A = scale_conversion, and y = ghc_vectornd 1478 ! should be faster than explicit loop over ipw as npw_k gets large 1479 do idat=1,ndat 1480 call DAXPY(npw_k,scale_conversion,ghc2(1,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,& 1481 & ghc_vectornd(1,1+(idat-1)*npw_k+shift:npw_k+(idat-1)*npw_k+shift),1) 1482 call DAXPY(npw_k,scale_conversion,ghc2(2,1+(idat-1)*npw_k:npw_k+(idat-1)*npw_k),1,& 1483 & ghc_vectornd(2,1+(idat-1)*npw_k+shift:npw_k+(idat-1)*npw_k+shift),1) 1484 end do 1485 end do ! idir 1486 ABI_FREE(gcwavef2) 1487 ABI_FREE(ghc2) 1488 1489 end if ! end spinor 2 1490 1491 ABI_FREE(cwavef1) 1492 ABI_FREE(cwavef2) 1493 ABI_FREE(kgkpk) 1494 1495 end if ! nspinortot 1496 1497 ABI_FREE(work) 1498 1499 end subroutine getghc_nucdip
ABINIT/getgsc [ Functions ]
NAME
getgsc
FUNCTION
Compute <G|S|C> for all input vectors |Cnk> at a given k-point, OR for one input vector |Cnk>. |Cnk> are expressed in reciprocal space. S is the overlap operator between |Cnk> (used for PAW).
INPUTS
cg(2,mcg)=planewave coefficients of wavefunctions cprj(natom,mcprj)= wave functions projected with non-local projectors: cprj=<p_i|Cnk> gs_ham <type(gs_hamiltonian_type)>=all data for the Hamiltonian at k+q ibg=shift to be applied on the location of data in the array cprj (beginning of current k-point) icg=shift to be applied on the location of data in the array cg (beginning of current k-point) igsc=shift to be applied on the location of data in the array gsc (beginning of current k-point) ikpt,isppol=indexes of current (spin.kpoint) mcg=second dimension of the cg array mcprj=second dimension of the cprj array mgsc=second dimension of the gsc array mpi_enreg=information about MPI parallelization natom=number of atoms in unit cell. nband= if positive: number of bands at this k point for that spin polarization if negative: abs(nband) is the index of the only band to be computed npw_k=number of planewaves in basis for given k point. nspinor=number of spinorial components of the wavefunctions [select_k]=optional, option governing the choice of k points to be used. gs_ham datastructure contains quantities needed to apply overlap operator in reciprocal space between 2 kpoints, k and k^prime (equal in most cases); if select_k=1, <k^prime|S|k> is applied [default] if select_k=2, <k|S|k^prime> is applied if select_k=3, <k|S|k> is applied if select_k=4, <k^prime|S|k^prime> is applied
OUTPUT
gsc(2,mgsc)= <g|S|Cnk> or <g|S^(1)|Cnk> (S=overlap)
SOURCE
1832 subroutine getgsc(cg,cprj,gs_ham,gsc,ibg,icg,igsc,ikpt,isppol,& 1833 & mcg,mcprj,mgsc,mpi_enreg,natom,nband,npw_k,nspinor,select_k) 1834 1835 !Arguments ------------------------------------ 1836 !scalars 1837 integer,intent(in) :: ibg,icg,igsc,ikpt,isppol,mcg,mcprj 1838 integer,intent(in) :: mgsc,natom,nband,npw_k,nspinor 1839 !TODO : may be needed to distribute cprj over band procs 1840 ! integer,intent(in) :: mband_mem 1841 integer,intent(in),optional :: select_k 1842 type(MPI_type),intent(in) :: mpi_enreg 1843 type(gs_hamiltonian_type),intent(inout),target :: gs_ham 1844 !arrays 1845 real(dp),intent(in) :: cg(2,mcg) 1846 real(dp),intent(out) :: gsc(2,mgsc) 1847 type(pawcprj_type),intent(in) :: cprj(natom,mcprj) 1848 1849 !Local variables------------------------------- 1850 !scalars 1851 integer :: choice,cpopt,dimenl1,dimenl2,iband,iband1,iband2,index_cg,index_cprj 1852 integer :: index_gsc,me,my_nspinor,paw_opt,select_k_,signs,tim_nonlop,useylm 1853 !character(len=500) :: msg 1854 !arrays 1855 real(dp) :: enlout_dum(1),tsec(2) 1856 real(dp),allocatable :: cwavef(:,:),scwavef(:,:) 1857 type(pawcprj_type),allocatable :: cwaveprj(:,:) 1858 1859 ! ********************************************************************* 1860 1861 DBG_ENTER("COLL") 1862 1863 !Compatibility tests 1864 my_nspinor=max(1,nspinor/mpi_enreg%nproc_spinor) 1865 if(gs_ham%usepaw==0) then 1866 ABI_BUG('Only compatible with PAW (usepaw=1) !') 1867 end if 1868 if(nband<0.and.(mcg<npw_k*my_nspinor.or.mgsc<npw_k*my_nspinor.or.mcprj<my_nspinor)) then 1869 ABI_BUG('Invalid value for mcg, mgsc or mcprj !') 1870 end if 1871 1872 !Keep track of total time spent in getgsc: 1873 call timab(565,1,tsec) 1874 1875 gsc = zero 1876 1877 !Prepare some data 1878 ABI_MALLOC(cwavef,(2,npw_k*my_nspinor)) 1879 ABI_MALLOC(scwavef,(2,npw_k*my_nspinor)) 1880 if (gs_ham%usecprj==1) then 1881 ABI_MALLOC(cwaveprj,(natom,my_nspinor)) 1882 call pawcprj_alloc(cwaveprj,0,gs_ham%dimcprj) 1883 else 1884 ABI_MALLOC(cwaveprj,(0,0)) 1885 end if 1886 dimenl1=gs_ham%dimekb1;dimenl2=natom;tim_nonlop=0 1887 choice=1;signs=2;cpopt=-1+3*gs_ham%usecprj;paw_opt=3;useylm=1 1888 select_k_=1;if (present(select_k)) select_k_=select_k 1889 me=mpi_enreg%me_kpt 1890 1891 !Loop over bands 1892 index_cprj=ibg;index_cg=icg;index_gsc=igsc 1893 if (nband>0) then 1894 iband1=1;iband2=nband 1895 !only do 1 band in case nband < 0 (the |nband|th one) 1896 else if (nband<0) then 1897 iband1=abs(nband);iband2=iband1 1898 index_cprj=index_cprj+(iband1-1)*my_nspinor 1899 index_cg =index_cg +(iband1-1)*npw_k*my_nspinor 1900 index_gsc =index_gsc +(iband1-1)*npw_k*my_nspinor 1901 end if 1902 1903 do iband=iband1,iband2 1904 1905 if (mpi_enreg%proc_distrb(ikpt,iband,isppol)/=me.and.nband>0) then 1906 ! No longer needed 28/03/2020 to parallelize memory 1907 ! gsc(:,1+index_gsc:npw_k*my_nspinor+index_gsc)=zero 1908 ! index_gsc=index_gsc+npw_k*my_nspinor 1909 !index_cprj=index_cprj+my_nspinor 1910 !index_cg=index_cg+npw_k*my_nspinor 1911 1912 cycle 1913 end if 1914 1915 ! Retrieve WF at (n,k) 1916 cwavef(:,1:npw_k*my_nspinor)=cg(:,1+index_cg:npw_k*my_nspinor+index_cg) 1917 if (gs_ham%usecprj==1) then 1918 call pawcprj_copy(cprj(:,1+index_cprj:my_nspinor+index_cprj),cwaveprj) 1919 end if 1920 1921 ! Compute <g|S|Cnk> 1922 call nonlop(choice,cpopt,cwaveprj,enlout_dum,gs_ham,0,(/zero/),mpi_enreg,1,1,paw_opt,& 1923 & signs,scwavef,tim_nonlop,cwavef,cwavef,select_k=select_k_) 1924 1925 gsc(:,1+index_gsc:npw_k*my_nspinor+index_gsc)=scwavef(:,1:npw_k*my_nspinor) 1926 1927 ! End of loop over bands 1928 index_cprj=index_cprj+my_nspinor 1929 index_cg=index_cg+npw_k*my_nspinor 1930 index_gsc=index_gsc+npw_k*my_nspinor 1931 end do 1932 1933 !Memory deallocation 1934 ABI_FREE(cwavef) 1935 ABI_FREE(scwavef) 1936 if (gs_ham%usecprj==1) then 1937 call pawcprj_free(cwaveprj) 1938 end if 1939 ABI_FREE(cwaveprj) 1940 1941 call timab(565,2,tsec) 1942 1943 DBG_EXIT("COLL") 1944 1945 end subroutine getgsc
ABINIT/m_getghc [ Modules ]
NAME
m_getghc
FUNCTION
Compute <G|H|C> for input vector |C> expressed in reciprocal space;
COPYRIGHT
Copyright (C) 1998-2024 ABINIT group (DCA, XG, GMR, LSI, MT, JB, JWZ) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
16 #if defined HAVE_CONFIG_H 17 #include "config.h" 18 #endif 19 20 #include "abi_common.h" 21 22 ! nvtx related macro definition 23 #include "nvtx_macros.h" 24 25 module m_getghc 26 27 use, intrinsic :: iso_c_binding 28 29 use defs_basis 30 use m_errors 31 use m_abicore 32 use m_xmpi 33 34 use defs_abitypes, only : mpi_type 35 use m_time, only : timab 36 use m_pawcprj, only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_getdim, pawcprj_copy 37 use m_bandfft_kpt, only : bandfft_kpt, bandfft_kpt_get_ikpt 38 use m_hamiltonian, only : gs_hamiltonian_type, KPRIME_H_K, K_H_KPRIME, K_H_K, KPRIME_H_KPRIME 39 use m_fock, only : fock_common_type, fock_get_getghc_call 40 use m_fock_getghc, only : fock_getghc, fock_ACE_getghc 41 use m_nonlop, only : nonlop 42 use m_gemm_nonlop, only : gemm_nonlop_use_gemm 43 use m_fft, only : fourwf 44 use m_getghc_ompgpu, only : getghc_ompgpu 45 46 #ifdef HAVE_FFTW3_THREADS 47 use m_fftw3, only : fftw3_spawn_threads_here, fftw3_use_lib_threads 48 #endif 49 50 #if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) 51 use m_nvtx_data 52 #endif 53 54 #if defined HAVE_GPU_CUDA 55 use m_gpu_toolbox 56 #endif 57 58 #if defined HAVE_YAKL 59 use gator_mod 60 #endif 61 62 #ifdef HAVE_KOKKOS 63 use m_manage_kokkos, only : assemble_energy_contribution_kokkos 64 #endif 65 66 implicit none 67 68 private
ABINIT/multithreaded_getghc [ Functions ]
NAME
multithreaded_getghc
FUNCTION
INPUTS
cpopt=flag defining the status of cwaveprj%cp(:)=<Proj_i|Cnk> scalars (PAW only) (same meaning as in nonlop.F90 routine) if cpopt=-1, <p_lmn|in> (and derivatives) are computed here (and not saved) if cpopt= 0, <p_lmn|in> are computed here and saved if cpopt= 1, <p_lmn|in> and first derivatives are computed here and saved if cpopt= 2 <p_lmn|in> are already in memory; if cpopt= 3 <p_lmn|in> are already in memory; first derivatives are computed here and saved if cpopt= 4 <p_lmn|in> and first derivatives are already in memory; cwavef(2,npw*my_nspinor*ndat)=planewave coefficients of wavefunction. gs_ham <type(gs_hamiltonian_type)>=all data for the Hamiltonian to be applied lambda=factor to be used when computing <G|H-lambda.S|C> - only for sij_opt=-1 Typically lambda is the eigenvalue (or its guess) mpi_enreg=information about MPI parallelization ndat=number of FFT to do in parallel prtvol=control print volume and debugging output sij_opt= -PAW ONLY- if 0, only matrix elements <G|H|C> have to be computed (S=overlap) if 1, matrix elements <G|S|C> have to be computed in gsc in addition to ghc if -1, matrix elements <G|H-lambda.S|C> have to be computed in ghc (gsc not used) tim_getghc=timing code of the calling subroutine(can be set to 0 if not attributed) type_calc= option governing which part of Hamitonian is to be applied: 0: whole Hamiltonian 1: local part only 2: non-local+kinetic only (added to the existing Hamiltonian) 3: local + kinetic only (added to the existing Hamiltonian) ===== Optional inputs ===== [kg_fft_k(3,:)]=optional, (k+G) vector coordinates to be used for the FFT tranformation instead of the one contained in gs_ham datastructure. Typically used for real WF (in parallel) which are FFT-transformed 2 by 2. [kg_fft_kp(3,:)]=optional, (k^prime+G) vector coordinates to be used for the FFT tranformation [select_k]=optional, option governing the choice of k points to be used. gs_ham datastructure contains quantities needed to apply Hamiltonian in reciprocal space between 2 kpoints, k and k^prime (equal in most cases); if select_k=1, <k^prime|H|k> is applied [default] if select_k=2, <k|H|k^prime> is applied if select_k=3, <k|H|k> is applied if select_k=4, <k^prime|H|k^prime> is applied
OUTPUT
ghc(2,npw*my_nspinor*ndat)=matrix elements <G|H|C> (if sij_opt>=0) or <G|H-lambda.S|C> (if sij_opt=-1) gvnlxc(2,npw*my_nspinor*ndat)=matrix elements <G|Vnonlocal|C> (if sij_opt>=0) or <G|Vnonlocal-lambda.S|C> (if sij_opt=-1) if (sij_opt=1) gsc(2,npw*my_nspinor*ndat)=matrix elements <G|S|C> (S=overlap).
SIDE EFFECTS
cwaveprj(natom,my_nspinor*(1+cpopt)*ndat)= wave function projected on nl projectors (PAW only)
SOURCE
2006 subroutine multithreaded_getghc(cpopt,cwavef,cwaveprj,ghc,gsc,gs_ham,gvnlxc,lambda,mpi_enreg,ndat,& 2007 & prtvol,sij_opt,tim_getghc,type_calc,& 2008 & kg_fft_k,kg_fft_kp,select_k) ! optional arguments 2009 2010 #ifdef HAVE_OPENMP 2011 use omp_lib 2012 #endif 2013 2014 !Arguments ------------------------------------ 2015 !scalars 2016 integer,intent(in) :: cpopt,ndat, prtvol 2017 integer,intent(in) :: sij_opt,tim_getghc,type_calc 2018 integer,intent(in),optional :: select_k 2019 real(dp),intent(in) :: lambda 2020 type(MPI_type),intent(in) :: mpi_enreg 2021 type(gs_hamiltonian_type),intent(inout),target :: gs_ham 2022 !arrays 2023 integer,intent(in),optional,target :: kg_fft_k(:,:),kg_fft_kp(:,:) 2024 real(dp),intent(out),target :: gsc(:,:) 2025 real(dp),intent(inout) :: cwavef(:,:) 2026 real(dp),intent(out) :: ghc(:,:),gvnlxc(:,:) 2027 type(pawcprj_type),intent(inout),target :: cwaveprj(:,:) 2028 2029 !Local variables------------------------------- 2030 !scalars 2031 integer :: firstelt, firstprj, lastelt, lastprj,usegvnlxc 2032 integer :: nthreads 2033 integer :: ithread 2034 integer :: chunk 2035 integer :: residuchunk 2036 integer :: firstband 2037 integer :: lastband 2038 integer :: spacedim, spacedim_prj 2039 #ifdef HAVE_OPENMP 2040 logical :: is_nested,fftw3_use_lib_threads_sav 2041 #endif 2042 integer :: select_k_default 2043 2044 ! ************************************************************************* 2045 2046 select_k_default = 1; if ( present(select_k) ) select_k_default = select_k 2047 2048 spacedim = size(cwavef ,dim=2)/ndat 2049 spacedim_prj = size(cwaveprj,dim=2)/ndat 2050 2051 2052 ! Disabling multithreading for GPU variants (getghc_ompgpu is not thread-safe for now) 2053 !$omp parallel default (none) & 2054 !$omp& private(ithread,nthreads,chunk,firstband,lastband,residuchunk,firstelt,lastelt), & 2055 !$omp& private(firstprj,lastprj,is_nested,usegvnlxc,fftw3_use_lib_threads_sav), & 2056 !$omp& shared(cwavef,ghc,gsc, gvnlxc,spacedim,spacedim_prj,ndat,kg_fft_k,kg_fft_kp,gs_ham,cwaveprj,mpi_enreg), & 2057 !$omp& shared(gemm_nonlop_use_gemm), & 2058 !$omp& firstprivate(cpopt,lambda,prtvol,sij_opt,tim_getghc,type_calc,select_k_default) & 2059 !$omp& IF(gs_ham%gpu_option==ABI_GPU_DISABLED .and. .not. gemm_nonlop_use_gemm) 2060 ithread = 0 2061 nthreads = 1 2062 if(gs_ham%gpu_option==ABI_GPU_DISABLED .and. .not. gemm_nonlop_use_gemm) then 2063 #ifdef HAVE_OPENMP 2064 ithread = omp_get_thread_num() 2065 nthreads = omp_get_num_threads() 2066 ! is_nested = omp_get_nested() 2067 is_nested = .false. 2068 ! call omp_set_nested(.false.) 2069 !Ensure that libs are used without threads (mkl, openblas, fftw3, ...) 2070 #ifdef HAVE_LINALG_MKL_THREADS 2071 call mkl_set_num_threads(1) 2072 #endif 2073 #ifdef HAVE_LINALG_OPENBLAS_THREADS 2074 call openblas_set_num_threads(1) 2075 #endif 2076 #ifdef HAVE_FFTW3_THREADS 2077 fftw3_use_lib_threads_sav=(.not.fftw3_spawn_threads_here(nthreads,nthreads)) 2078 call fftw3_use_lib_threads(.false.) 2079 #endif 2080 #endif 2081 end if 2082 chunk = ndat/nthreads ! Divide by 2 to construct chunk of even number of bands 2083 residuchunk = ndat - nthreads*chunk 2084 if ( ithread < nthreads-residuchunk ) then 2085 firstband = ithread*chunk+1 2086 lastband = (ithread+1)*chunk 2087 else 2088 firstband = (nthreads-residuchunk)*chunk + ( ithread -(nthreads-residuchunk) )*(chunk+1) +1 2089 lastband = firstband+chunk 2090 end if 2091 usegvnlxc=1 2092 if (size(gvnlxc)<=1) usegvnlxc=0 2093 2094 if ( lastband /= 0 ) then 2095 firstelt = (firstband-1)*spacedim+1 2096 firstprj = (firstband-1)*spacedim_prj+1 2097 lastelt = lastband*spacedim 2098 lastprj = lastband*spacedim_prj 2099 ! Don't know how to manage optional arguments .... :( 2100 if ( present(kg_fft_k) ) then 2101 if (present(kg_fft_kp)) then 2102 call getghc(cpopt,cwavef(:,firstelt:lastelt),cwaveprj(:,firstprj:lastprj),& 2103 & ghc(:,firstelt:lastelt),gsc(:,firstelt:lastelt*gs_ham%usepaw),& 2104 & gs_ham,gvnlxc(:,firstelt:lastelt*usegvnlxc),lambda, mpi_enreg,lastband-firstband+1,& 2105 & prtvol,sij_opt,tim_getghc,type_calc,& 2106 & select_k=select_k_default,kg_fft_k=kg_fft_k,kg_fft_kp=kg_fft_kp) 2107 else 2108 call getghc(cpopt,cwavef(:,firstelt:lastelt),cwaveprj(:,firstprj:lastprj),& 2109 & ghc(:,firstelt:lastelt),gsc(:,firstelt:lastelt*gs_ham%usepaw),& 2110 & gs_ham,gvnlxc(:,firstelt:lastelt*usegvnlxc),lambda, mpi_enreg,lastband-firstband+1,& 2111 & prtvol,sij_opt,tim_getghc,type_calc,& 2112 & select_k=select_k_default,kg_fft_k=kg_fft_k) 2113 end if 2114 else 2115 if (present(kg_fft_kp)) then 2116 call getghc(cpopt,cwavef(:,firstelt:lastelt),cwaveprj(:,firstprj:lastprj),& 2117 & ghc(:,firstelt:lastelt),gsc(:,firstelt:lastelt*gs_ham%usepaw),& 2118 & gs_ham,gvnlxc(:,firstelt:lastelt*usegvnlxc),lambda, mpi_enreg,lastband-firstband+1,& 2119 & prtvol,sij_opt,tim_getghc,type_calc,& 2120 & select_k=select_k_default,kg_fft_kp=kg_fft_kp) 2121 else 2122 call getghc(cpopt,cwavef(:,firstelt:lastelt),cwaveprj(:,firstprj:lastprj),& 2123 & ghc(:,firstelt:lastelt),gsc(:,firstelt:lastelt*gs_ham%usepaw),& 2124 & gs_ham,gvnlxc(:,firstelt:lastelt*usegvnlxc),lambda, mpi_enreg,lastband-firstband+1,& 2125 & prtvol,sij_opt,tim_getghc,type_calc,& 2126 & select_K=select_k_default) 2127 end if 2128 end if 2129 end if 2130 if(gs_ham%gpu_option==ABI_GPU_DISABLED .and. .not. gemm_nonlop_use_gemm) then 2131 #ifdef HAVE_OPENMP 2132 ! call omp_set_nested(is_nested) 2133 !Restore libs behavior (mkl, openblas, fftw3, ...) 2134 #ifdef HAVE_LINALG_MKL_THREADS 2135 call mkl_set_num_threads(nthreads) 2136 #endif 2137 #ifdef HAVE_LINALG_OPENBLAS_THREADS 2138 call openblas_set_num_threads(nthreads) 2139 #endif 2140 #ifdef HAVE_FFTW3_THREADS 2141 call fftw3_use_lib_threads(fftw3_use_lib_threads_sav) 2142 #endif 2143 #endif 2144 end if 2145 !$omp end parallel 2146 2147 end subroutine multithreaded_getghc