TABLE OF CONTENTS
ABINIT/m_lobpcgwf [ Functions ]
NAME
m_lobpcgwf
FUNCTION
This routine updates the whole wave functions at a given k-point, using the lobpcg method for a given spin-polarization, from a fixed hamiltonian but might also simply compute eigenvectors and eigenvalues at this k point. it will also update the matrix elements of the hamiltonian.
COPYRIGHT
Copyright (C) 1998-2024 ABINIT group (JB) this file is distributed under the terms of the gnu general public license, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt . for the initials of contributors, see ~abinit/doc/developers/contributors.txt .
SOURCE
21 #if defined HAVE_CONFIG_H 22 #include "config.h" 23 #endif 24 25 #include "abi_common.h" 26 27 ! nvtx related macro definition 28 #include "nvtx_macros.h" 29 30 module m_lobpcgwf 31 32 use defs_basis 33 use m_abicore 34 use m_lobpcg 35 use m_xmpi 36 use m_errors 37 use m_time 38 use m_xomp 39 use m_fstrings 40 use m_xg 41 use m_xgTransposer 42 use m_lobpcg2 43 use m_dtset 44 45 use defs_abitypes, only : mpi_type 46 use m_hamiltonian, only : gs_hamiltonian_type 47 use m_pawcprj, only : pawcprj_type 48 use m_nonlop, only : nonlop 49 use m_prep_kgb, only : prep_getghc, prep_nonlop 50 use m_getghc, only : multithreaded_getghc 51 use m_cgtools, only : dotprod_g 52 53 #if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS) 54 use m_nvtx_data 55 #endif 56 57 use, intrinsic :: iso_c_binding 58 59 private 60 61 integer, parameter :: l_tim_getghc=5 62 double precision, parameter :: inv_sqrt2 = 1/sqrt2 63 64 ! For use in getghc_gsc1 65 integer,save :: l_cpopt 66 integer,save :: l_icplx 67 integer,save :: l_istwf 68 integer,save :: l_npw 69 integer,save :: l_nspinor 70 logical,save :: l_paw 71 integer, save :: l_paral_kgb 72 integer,save :: l_prtvol 73 integer,save :: l_sij_opt 74 real(dp), allocatable,save :: l_pcon(:) 75 type(mpi_type),pointer,save :: l_mpi_enreg 76 type(gs_hamiltonian_type),pointer,save :: l_gs_hamk 77 78 public :: lobpcgwf2 79 80 contains 81 82 subroutine lobpcgwf2(cg,dtset,eig,occ,enl_out,gs_hamk,isppol,ikpt,inonsc,istep,kinpw,mpi_enreg,& 83 & nband,npw,nspinor,prtvol,resid,nbdbuf) 84 85 implicit none 86 87 !Arguments ------------------------------------ 88 integer,intent(in) :: nband,npw,prtvol,nspinor 89 integer,intent(in) :: isppol,ikpt,inonsc,istep,nbdbuf 90 type(gs_hamiltonian_type),target,intent(inout) :: gs_hamk 91 type(dataset_type) ,intent(in ) :: dtset 92 type(mpi_type) ,target,intent(in) :: mpi_enreg 93 real(dp) ,target,intent(inout) :: cg(2,nspinor*nband*npw)!,gsc(2,nspinor*nband*npw) 94 real(dp) ,intent(in ) :: kinpw(npw) 95 real(dp) ,target,intent( out) :: resid(nband) 96 real(dp) ,intent( out) :: enl_out(nband) 97 real(dp) ,target,intent( out) :: eig(nband) 98 real(dp) ,target,intent(in ) :: occ(nband) 99 100 !Local variables------------------------------- 101 102 type(xgBlock_t) :: xgx0 103 type(xgBlock_t) :: xgeigen 104 type(xgBlock_t) :: xgresidu 105 type(xgBlock_t) :: xgocc 106 type(lobpcg_t) :: lobpcg 107 108 integer :: space, blockdim, nline 109 integer :: ipw 110 111 ! integer :: nthreads 112 ! 113 ! double precision :: lobpcgMem(2) 114 ! double precision :: localmem 115 116 integer, parameter :: tim_lobpcgwf2 = 1640 117 double precision :: cputime, walltime 118 double precision :: tsec(2) 119 120 type(c_ptr) :: cptr 121 real(dp), pointer :: eig_ptr(:,:) => NULL() 122 real(dp), pointer :: resid_ptr(:,:) => NULL() 123 real(dp), pointer :: occ_ptr(:,:) => NULL() 124 125 ! Important things for NC 126 integer,parameter :: choice=1, paw_opt=0, signs=1 127 type(pawcprj_type) :: cprj_dum(1,1) 128 integer :: iband, shift 129 real(dp) :: gsc_dummy(0,0) 130 real(dp), allocatable :: l_gvnlxc(:,:) 131 132 ! ********************************************************************* 133 134 135 !########################################################################### 136 !################ INITIALISATION ########################################## 137 !########################################################################### 138 139 call timab(tim_lobpcgwf2,1,tsec) 140 cputime = abi_cpu_time() 141 walltime = abi_wtime() 142 143 ! Set module variables 144 l_paw = (gs_hamk%usepaw==1) 145 l_cpopt=-1;l_sij_opt=0;if (l_paw) l_sij_opt=1 146 l_istwf=gs_hamk%istwf_k 147 l_npw = npw 148 l_nspinor = nspinor 149 l_prtvol = prtvol 150 l_mpi_enreg => mpi_enreg 151 l_gs_hamk => gs_hamk 152 l_paral_kgb = dtset%paral_kgb 153 154 !Variables 155 nline=dtset%nline 156 blockdim=nband/dtset%nblock_lobpcg !=l_mpi_enreg%nproc_band*l_mpi_enreg%bandpp 157 158 !Depends on istwfk 159 if ( l_istwf == 2 ) then ! Real only 160 ! SPACE_CR mean that we have complex numbers but no re*im terms only re*re 161 ! and im*im so that a vector of complex is consider as a long vector of real 162 ! therefore the number of data is (2*npw*nspinor)*nband 163 ! This space is completely equivalent to SPACE_R but will correctly set and 164 ! get the array data into the xgBlock 165 space = SPACE_CR 166 l_icplx = 2 167 168 else ! complex 169 space = SPACE_C 170 l_icplx = 1 171 end if 172 173 ! Memory info 174 !if ( prtvol >= 3 ) then 175 ! lobpcgMem = lobpcg_memInfo(nband,l_icplx*l_npw*l_nspinor,blockdim,space) 176 ! localMem = (l_npw+2*l_npw*l_nspinor*blockdim+2*nband)*kind(1.d0) 177 ! write(std_out,'(1x,A,F10.6,1x,A)') "Each MPI process calling lobpcg should need around ", & 178 ! (localMem+sum(lobpcgMem))/1e9, & 179 ! "GB of peak memory as follows :" 180 ! write(std_out,'(4x,A,F10.6,1x,A)') "Permanent memory in lobpcgwf : ", & 181 ! (localMem)/1e9, "GB" 182 ! write(std_out,'(4x,A,F10.6,1x,A)') "Permanent memory in m_lobpcg : ", & 183 ! (lobpcgMem(1))/1e9, "GB" 184 ! write(std_out,'(4x,A,F10.6,1x,A)') "Temporary memory in m_lobpcg : ", & 185 ! (lobpcgMem(2))/1e9, "GB" 186 !end if 187 188 !For preconditionning 189 ABI_MALLOC(l_pcon,(1:l_icplx*npw)) 190 !$omp parallel do schedule(static), shared(l_pcon,kinpw) 191 do ipw=1-1,l_icplx*npw-1 192 if(kinpw(ipw/l_icplx+1)>huge(zero)*1.d-11) then 193 l_pcon(ipw+1)=0.d0 194 else 195 l_pcon(ipw+1) = (27+kinpw(ipw/l_icplx+1)*(18+kinpw(ipw/l_icplx+1)*(12+8*kinpw(ipw/l_icplx+1)))) & 196 & / (27+kinpw(ipw/l_icplx+1)*(18+kinpw(ipw/l_icplx+1)*(12+8*kinpw(ipw/l_icplx+1))) + 16*kinpw(ipw/l_icplx+1)**4) 197 end if 198 end do 199 200 #ifdef HAVE_OPENMP_OFFLOAD 201 if(gs_hamk%gpu_option==ABI_GPU_OPENMP) then 202 !$OMP TARGET ENTER DATA MAP(to:cg,eig,resid) 203 end if 204 #endif 205 206 ! Local variables for lobpcg 207 !call xg_init(xgx0,space,icplx*npw*nspinor,nband) 208 call xgBlock_map(xgx0,cg,space,l_icplx*l_npw*l_nspinor,nband,l_mpi_enreg%comm_bandspinorfft,gpu_option=dtset%gpu_option) 209 if ( l_istwf == 2 ) then ! Real only 210 ! Scale cg 211 call xgBlock_scale(xgx0,sqrt2,1) 212 ! This is possible since the memory in cg and xgx0 is the same 213 ! Don't know yet how to deal with this with xgBlock 214 if(l_mpi_enreg%me_g0 == 1) then 215 if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then 216 #ifdef HAVE_OPENMP_OFFLOAD 217 !$OMP TARGET MAP(to:cg) 218 cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * inv_sqrt2 219 !$OMP END TARGET 220 #endif 221 else 222 cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * inv_sqrt2 223 end if 224 end if 225 end if 226 227 !call xg_init(xgeigen,SPACE_R,nband,1,l_mpi_enreg%comm_bandspinorfft) 228 ! Trick the with C to change rank of arrays (:) to (:,:) 229 cptr = c_loc(eig) 230 call c_f_pointer(cptr,eig_ptr,(/ nband,1 /)) 231 call xgBlock_map(xgeigen,eig_ptr,SPACE_R,nband,1,gpu_option=dtset%gpu_option) 232 233 !call xg_init(xgresidu,SPACE_R,nband,1,l_mpi_enreg%comm_bandspinorfft) 234 ! Trick the with C to change rank of arrays (:) to (:,:) 235 cptr = c_loc(resid) 236 call c_f_pointer(cptr,resid_ptr,(/ nband,1 /)) 237 call xgBlock_map(xgresidu,resid_ptr,SPACE_R,nband,1,gpu_option=dtset%gpu_option) 238 239 !call xg_init(xgeigen,SPACE_R,nband,1,l_mpi_enreg%comm_bandspinorfft) 240 ! Trick the with C to change rank of arrays (:) to (:,:) 241 cptr = c_loc(occ) 242 call c_f_pointer(cptr,occ_ptr,(/ nband,1 /)) 243 call xgBlock_map(xgocc,occ_ptr,SPACE_R,nband,1,gpu_option=dtset%gpu_option) 244 245 !ABI_MALLOC(l_gvnlxc,(2,l_npw*l_nspinor*blockdim)) 246 247 call lobpcg_init(lobpcg,nband,l_icplx*l_npw*l_nspinor,blockdim,dtset%tolwfr,nline,& 248 space,l_mpi_enreg%comm_bandspinorfft,l_paral_kgb,l_mpi_enreg%comm_spinorfft,l_mpi_enreg%comm_band,& 249 gs_hamk%gpu_option) 250 251 !########################################################################### 252 !################ RUUUUUUUN ########################################## 253 !########################################################################### 254 255 ! Run lobpcg 256 call lobpcg_run(lobpcg,xgx0,getghc_gsc1,precond,xgeigen,xgocc,xgresidu,prtvol,nspinor,isppol,ikpt,inonsc,istep,nbdbuf) 257 258 ! Free preconditionning since not needed anymore 259 ABI_FREE(l_pcon) 260 261 ! Scale back 262 if(l_istwf == 2) then 263 call xgBlock_scale(xgx0,inv_sqrt2,1) 264 if(l_mpi_enreg%me_g0 == 1) then 265 if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then 266 #ifdef HAVE_OPENMP_OFFLOAD 267 !$OMP TARGET MAP(to:cg) 268 cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * sqrt2 269 !$OMP END TARGET 270 #endif 271 else 272 cg(:, 1:npw*nspinor*nband:npw) = cg(:, 1:npw*nspinor*nband:npw) * sqrt2 273 end if 274 end if 275 end if 276 277 ! Compute enlout (nonlocal energy for each band if necessary) This is the best 278 ! quick and dirty trick to compute this part in NC. gvnlxc cannot be part of 279 ! lobpcg algorithm 280 if ( .not. l_paw ) then 281 !Check l_gvnlxc size 282 !if ( size(l_gvnlxc) < 2*nband*l_npw*l_nspinor ) then 283 !if ( size(l_gvnlxc) /= 0 ) then 284 ! ABI_FREE(l_gvnlxc) 285 !ABI_MALLOC(l_gvnlxc,(2,nband*l_npw*l_nspinor)) 286 #ifdef FC_CRAY 287 ABI_MALLOC(l_gvnlxc,(1,1)) 288 #else 289 ABI_MALLOC(l_gvnlxc,(0,0)) 290 #endif 291 292 !Call nonlop 293 if (l_paral_kgb==0) then 294 295 call nonlop(choice,l_cpopt,cprj_dum,enl_out,l_gs_hamk,0,eig,mpi_enreg,nband,1,paw_opt,& 296 & signs,gsc_dummy,l_tim_getghc,cg,l_gvnlxc) 297 298 else 299 do iband=1,nband/blockdim 300 shift = (iband-1)*blockdim*l_npw*l_nspinor 301 call prep_nonlop(choice,l_cpopt,cprj_dum, & 302 & enl_out((iband-1)*blockdim+1:iband*blockdim),l_gs_hamk,0,& 303 & eig((iband-1)*blockdim+1:iband*blockdim),blockdim,mpi_enreg,1,paw_opt,signs,& 304 & gsc_dummy,l_tim_getghc, & 305 & cg(:,shift+1:shift+blockdim*l_npw*l_nspinor),& 306 !& l_gvnlxc(:,shift+1:shift+blockdim*l_npw*l_nspinor),& 307 & l_gvnlxc(:,:),& 308 & already_transposed=.false.) 309 end do 310 end if 311 !Compute enlout 312 ! do iband=1,nband 313 ! shift = npw*nspinor*(iband-1) 314 ! call dotprod_g(enl_out(iband),dprod_i,l_gs_hamk%istwf_k,npw*nspinor,1,cg(:, shift+1:shift+npw*nspinor),& 315 ! & l_gvnlxc(:, shift+1:shift+npw*nspinor),mpi_enreg%me_g0,mpi_enreg%comm_bandspinorfft) 316 ! end do 317 ABI_FREE(l_gvnlxc) 318 end if 319 320 ! Free lobpcg 321 call lobpcg_free(lobpcg) 322 323 #ifdef HAVE_OPENMP_OFFLOAD 324 if(gs_hamk%gpu_option==ABI_GPU_OPENMP) then 325 !$OMP TARGET EXIT DATA MAP(from:cg,eig,resid) 326 end if 327 #endif 328 !########################################################################### 329 !################ SORRY IT'S ALREADY FINISHED : ) ###################### 330 !########################################################################### 331 332 333 call timab(tim_lobpcgwf2,2,tsec) 334 ! cputime = abi_cpu_time() - cputime 335 ! walltime = abi_wtime() - walltime 336 ! nthreads = xomp_get_num_threads(open_parallel = .true.) 337 ! if ( cputime/walltime/dble(nthreads) < 0.75 .and. (int(cputime/walltime)+1) /= nthreads) then 338 ! if ( prtvol >= 3 ) then 339 ! write(std_out,'(a)',advance='no') sjoin(" Lobpcg took", sec2str(cputime), "of cpu time") 340 ! write(std_out,*) sjoin("for a wall time of", sec2str(walltime)) 341 ! write(std_out,'(a,f6.2)') " -> Ratio of ", cputime/walltime 342 ! end if 343 ! ABI_COMMENT(sjoin("You should set the number of threads to something close to",itoa(int(cputime/walltime)+1))) 344 ! end if 345 346 DBG_EXIT("COLL") 347 348 end subroutine lobpcgwf2 349 350 subroutine getghc_gsc1(X,AX,BX,transposer) 351 352 implicit none 353 354 !Arguments ------------------------------------ 355 type(xgBlock_t), intent(inout) :: X 356 type(xgBlock_t), intent(inout) :: AX 357 type(xgBlock_t), intent(inout) :: BX 358 type(xgTransposer_t), intent(inout) :: transposer 359 integer :: blockdim 360 integer :: spacedim 361 type(pawcprj_type) :: cprj_dum(l_gs_hamk%natom,1) 362 363 !Local variables------------------------------- 364 !scalars 365 integer :: cpuRow 366 real(dp) :: eval 367 !arrays 368 real(dp), pointer :: cg(:,:) 369 real(dp), pointer :: ghc(:,:) 370 real(dp), pointer :: gsc(:,:) 371 real(dp) :: l_gvnlxc(1,1) 372 373 ! ********************************************************************* 374 375 call xgBlock_getSize(X,spacedim,blockdim) 376 call xgBlock_check(X,AX) 377 call xgBlock_check(X,BX) 378 379 spacedim = spacedim/l_icplx 380 381 call xgBlock_reverseMap(X,cg,l_icplx,spacedim*blockdim) 382 call xgBlock_reverseMap(AX,ghc,l_icplx,spacedim*blockdim) 383 call xgBlock_reverseMap(BX,gsc,l_icplx,spacedim*blockdim) 384 385 !Scale back cg 386 if (l_paral_kgb == 1) cpuRow = xgTransposer_getRank(transposer, 2) 387 if(l_istwf == 2) then 388 call xgBlock_scale(X,inv_sqrt2,1) 389 if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then 390 #ifdef HAVE_OPENMP_OFFLOAD 391 if (l_paral_kgb == 0) then 392 if(l_mpi_enreg%me_g0 == 1) then 393 !$OMP TARGET MAP(cg) 394 cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * sqrt2 395 !$OMP END TARGET 396 end if 397 else 398 if (cpuRow == 0) then 399 !$OMP TARGET MAP(cg) 400 cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * sqrt2 401 !$OMP END TARGET 402 end if 403 end if 404 #endif 405 else 406 if (l_paral_kgb == 0) then 407 if(l_mpi_enreg%me_g0 == 1) cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * sqrt2 408 else 409 if (cpuRow == 0) cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * sqrt2 410 end if 411 end if 412 end if 413 414 !if ( size(l_gvnlxc) < 2*blockdim*spacedim ) then 415 ! ABI_FREE(l_gvnlxc) 416 ! ABI_MALLOC(l_gvnlxc,(2,blockdim*spacedim)) 417 !end if 418 419 call multithreaded_getghc(l_cpopt,cg,cprj_dum,ghc,gsc,& 420 l_gs_hamk,l_gvnlxc,eval,l_mpi_enreg,blockdim,l_prtvol,l_sij_opt,l_tim_getghc,0) 421 422 423 #if defined(HAVE_GPU_CUDA) && defined(HAVE_YAKL) 424 call gpu_device_synchronize() 425 #endif 426 427 !Scale cg, ghc, gsc 428 if ( l_istwf == 2 ) then 429 call xgBlock_scale(X ,sqrt2,1) 430 call xgBlock_scale(AX,sqrt2,1) 431 432 if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then 433 #ifdef HAVE_OPENMP_OFFLOAD 434 if (l_paral_kgb == 0) then 435 if(l_mpi_enreg%me_g0 == 1) then 436 !$OMP TARGET MAP(cg) 437 cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2 438 !$OMP END TARGET 439 !$OMP TARGET MAP(ghc) 440 ghc(:, 1:spacedim*blockdim:l_npw) = ghc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2 441 !$OMP END TARGET 442 endif 443 else 444 if (cpuRow == 0) then 445 !$OMP TARGET MAP(cg) 446 cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2 447 !$OMP END TARGET 448 !$OMP TARGET MAP(ghc) 449 ghc(:, 1:spacedim*blockdim:spacedim) = ghc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2 450 !$OMP END TARGET 451 end if 452 end if 453 #endif 454 else 455 if (l_paral_kgb == 0) then 456 if(l_mpi_enreg%me_g0 == 1) then 457 cg(:, 1:spacedim*blockdim:l_npw) = cg(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2 458 ghc(:, 1:spacedim*blockdim:l_npw) = ghc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2 459 endif 460 else 461 if (cpuRow == 0) then 462 cg(:, 1:spacedim*blockdim:spacedim) = cg(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2 463 ghc(:, 1:spacedim*blockdim:spacedim) = ghc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2 464 end if 465 end if 466 end if 467 if(l_paw) then 468 call xgBlock_scale(BX,sqrt2,1) 469 if(l_gs_hamk%gpu_option==ABI_GPU_OPENMP) then 470 #ifdef HAVE_OPENMP_OFFLOAD 471 if (l_paral_kgb == 0) then 472 if(l_mpi_enreg%me_g0 == 1) then 473 !$OMP TARGET MAP(gsc) 474 gsc(:, 1:spacedim*blockdim:l_npw) = gsc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2 475 !$OMP END TARGET 476 end if 477 else 478 if (cpuRow == 0) then 479 !$OMP TARGET MAP(gsc) 480 gsc(:, 1:spacedim*blockdim:spacedim) = gsc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2 481 !$OMP END TARGET 482 end if 483 end if 484 #endif 485 else 486 if (l_paral_kgb == 0) then 487 if(l_mpi_enreg%me_g0 == 1) gsc(:, 1:spacedim*blockdim:l_npw) = gsc(:, 1:spacedim*blockdim:l_npw) * inv_sqrt2 488 else 489 if (cpuRow == 0) gsc(:, 1:spacedim*blockdim:spacedim) = gsc(:, 1:spacedim*blockdim:spacedim) * inv_sqrt2 490 end if 491 end if 492 end if ! l_paw 493 end if ! l_istwf==2 494 495 if ( .not. l_paw ) call xgBlock_copy(X,BX) 496 497 end subroutine getghc_gsc1 498 499 subroutine precond(W) 500 use m_xg, only : xg_t, xgBlock_colwiseMul 501 type(xgBlock_t), intent(inout) :: W 502 integer :: ispinor 503 !integer :: cplx 504 505 ! precondition resid_vec 506 do ispinor = 1,l_nspinor 507 !do cplx = 1, l_icplx 508 call xgBlock_colwiseMul(W,l_pcon,l_icplx*l_npw*(ispinor-1)) 509 !end do 510 end do 511 512 end subroutine precond 513 514 end module m_lobpcgwf