TABLE OF CONTENTS


ABINIT/alloc_getghc_ompgpu_buffers [ Functions ]

[ Top ] [ Functions ]

NAME

 alloc_getghc_ompgpu_buffers

FUNCTION

 Allocate getghc_ompgpu work buffer


ABINIT/free_getghc_ompgpu_buffers [ Functions ]

[ Top ] [ Functions ]

NAME

 free_getghc_ompgpu_buffers

FUNCTION

 Free getghc_ompgpu work buffer


ABINIT/getghc_ompgpu [ Functions ]

[ Top ] [ Functions ]

NAME

 getghc_ompgpu

FUNCTION

 Compute <G|H|C> for input vector |C> expressed in reciprocal space;
 Result is put in array ghc.
 <G|Vnonlocal + VfockACE|C> is also returned in gvnlxc if either NLoc NCPP or FockACE.
 If required, <G|S|C> is returned in gsc (S=overlap - PAW only)
 Note that left and right k points can be different, i.e. ghc=<k^prime+G|H|C_k>.

INPUTS

 cpopt=flag defining the status of cwaveprj%cp(:)=<Proj_i|Cnk> scalars (PAW only)
       (same meaning as in nonlop.F90 routine)
       if cpopt=-1, <p_lmn|in> (and derivatives) are computed here (and not saved)
       if cpopt= 0, <p_lmn|in> are computed here and saved
       if cpopt= 1, <p_lmn|in> and first derivatives are computed here and saved
       if cpopt= 2  <p_lmn|in> are already in memory;
       if cpopt= 3  <p_lmn|in> are already in memory; first derivatives are computed here and saved
       if cpopt= 4  <p_lmn|in> and first derivatives are already in memory;
 cwavef(2,npw*my_nspinor*ndat)=planewave coefficients of wavefunction.
 cwavef_r(2,n4,n5,n6,nspinor) = wave function in real space
 gs_ham <type(gs_hamiltonian_type)>=all data for the Hamiltonian to be applied
 lambda=factor to be used when computing <G|H-lambda.S|C> - only for sij_opt=-1
        Typically lambda is the eigenvalue (or its guess)
 mpi_enreg=information about MPI parallelization
 ndat=number of FFT to do in parallel
 prtvol=control print volume and debugging output
 sij_opt= -PAW ONLY-  if  0, only matrix elements <G|H|C> have to be computed
    (S=overlap)       if  1, matrix elements <G|S|C> have to be computed in gsc in addition to ghc
                      if -1, matrix elements <G|H-lambda.S|C> have to be computed in ghc (gsc not used)
 tim_getghc=timing code of the calling subroutine(can be set to 0 if not attributed)
 type_calc= option governing which part of Hamitonian is to be applied:
            1: local part only
            2: non-local+Fock+kinetic only (added to the existing Hamiltonian)
            3: local + kinetic only (added to the existing Hamiltonian)
 ===== Optional inputs =====
   [kg_fft_k(3,:)]=optional, (k+G) vector coordinates to be used for the FFT tranformation
                   instead of the one contained in gs_ham datastructure.
                   Typically used for real WF (in parallel) which are FFT-transformed 2 by 2.
   [kg_fft_kp(3,:)]=optional, (k^prime+G) vector coordinates to be used for the FFT tranformation
   [select_k]=optional, option governing the choice of k points to be used.
             gs_ham datastructure contains quantities needed to apply Hamiltonian
             in reciprocal space between 2 kpoints, k and k^prime (equal in most cases);
             if select_k=1, <k^prime|H|k>       is applied [default]
             if select_k=2, <k|H|k^prime>       is applied
             if select_k=3, <k|H|k>             is applied
             if select_k=4, <k^prime|H|k^prime> is applied

OUTPUT

  ghc(2,npw*my_nspinor*ndat)=matrix elements <G|H|C> (if sij_opt>=0)
                                          or <G|H-lambda.S|C> (if sij_opt=-1)
  gvnlxc(2,npw*my_nspinor*ndat)=matrix elements <G|Vnonlocal+VFockACE|C> (if sij_opt>=0)
                                            or <G|Vnonlocal+VFockACE-lambda.S|C> (if sij_opt=-1)
      include Vnonlocal if NCPP and non-local Fock if associated(gs_ham%fockcommon)
  if (sij_opt=1)
    gsc(2,npw*my_nspinor*ndat)=matrix elements <G|S|C> (S=overlap).

SIDE EFFECTS

  cwaveprj(natom,my_nspinor*(1+cpopt)*ndat)= wave function projected on nl projectors (PAW only)

PARENTS

      m_cgwf,m_cgwf_cprj,m_chebfi,m_dfpt_cgwf,m_dft_energy,m_getghc
      m_gwls_hamiltonian,m_ksdiago,m_lobpcgwf_old,m_orbmag,m_rf2,m_rmm_diis

CHILDREN

      getghc,mkl_set_num_threads,omp_set_nested

SOURCE

249 subroutine getghc_ompgpu(cpopt,cwavef,cwaveprj,ghc,gsc,gs_ham,gvnlxc,lambda,mpi_enreg,ndat,&
250                   prtvol,sij_opt,tim_getghc,type_calc,&
251                   kg_fft_k,kg_fft_kp,select_k,cwavef_r) ! optional arguments
252 
253 !Arguments ------------------------------------
254 !scalars
255  integer,intent(in) :: cpopt,ndat, prtvol
256  integer,intent(in) :: sij_opt,tim_getghc,type_calc
257  integer,intent(in),optional :: select_k
258  real(dp),intent(in) :: lambda
259  type(MPI_type),intent(in) :: mpi_enreg
260  type(gs_hamiltonian_type),intent(inout),target :: gs_ham
261 !arrays
262  integer,intent(in),optional,target :: kg_fft_k(:,:),kg_fft_kp(:,:)
263  real(dp),intent(out),target :: gsc(:,:)
264  real(dp),intent(inout) :: cwavef(:,:)
265  real(dp),optional,intent(inout) :: cwavef_r(:,:,:,:,:)
266  real(dp),intent(out) :: ghc(:,:)
267  real(dp),intent(out),target :: gvnlxc(:,:)
268  type(pawcprj_type),intent(inout),target :: cwaveprj(:,:)
269  !MG: Passing these arrays assumed-shape has the drawback that client code is
270  !forced to use vec(2, npw*ndat) instead of the more user-friendly vec(2,npw,ndat)
271 
272 !Tested usecases :
273 ! - Nvidia GPUs : FC_NVHPC + CUDA
274 ! - AMD GPUs    : FC_LLVM + HIP
275 ! An eventual Intel implementation would use the OneAPI LLVM compiler.
276 ! Homemade CUDA/HIP interfaces would allow the use of GCC.
277 ! But it is likely that OpenMP performance won't be optimal outside GPU vendors compilers.
278 #ifdef HAVE_OPENMP_OFFLOAD
279 
280 !Local variables-------------------------------
281 !scalars
282  integer,parameter :: level=114,re=1,im=2,tim_fourwf=1
283  integer :: choice,cplex,cpopt_here,i1,i2,i3,idat,idir,ierr
284  integer :: ig,igspinor,ii,iispinor,ikpt_this_proc,ipw,ispinor,my_nspinor
285  integer :: n4,n5,n6,nnlout,npw_fft,npw_k1,npw_k2,nspinortot,option_fft
286  integer :: paw_opt,select_k_,shift1,shift2,signs,tim_nonlop
287  logical :: k1_eq_k2,has_fock,local_gvnlxc
288  logical :: nspinor1TreatedByThisProc,nspinor2TreatedByThisProc,use_cwavef_r
289  real(dp) :: ghcim,ghcre,weight
290  character(len=500) :: msg
291 !arrays
292  integer, pointer :: gbound_k1(:,:),gbound_k2(:,:),kg_k1(:,:),kg_k2(:,:)
293  integer, ABI_CONTIGUOUS pointer :: indices_pw_fft(:),kg_k_fft(:,:)
294  integer, ABI_CONTIGUOUS pointer :: recvcount_fft(:),recvdisp_fft(:)
295  integer, ABI_CONTIGUOUS pointer ::  sendcount_fft(:),senddisp_fft(:)
296  integer, allocatable:: dimcprj(:)
297  real(dp) :: enlout(ndat),lambda_ndat(ndat),tsec(2)
298  real(dp),target :: nonlop_dum(1,1)
299  real(dp),allocatable :: buff_wf(:,:),cwavef1(:,:),cwavef2(:,:),cwavef_fft(:,:),cwavef_fft_tr(:,:)
300  real(dp),allocatable :: ghc1(:,:),ghc2(:,:),ghc3(:,:),ghc4(:,:),ghc_mGGA(:,:),ghc_vectornd(:,:)
301  real(dp),allocatable :: gvnlc(:,:),vlocal_tmp(:,:,:)
302  real(dp),pointer :: gvnlxc_(:,:),kinpw_k1(:),kinpw_k2(:),kpt_k1(:),kpt_k2(:)
303  real(dp),pointer :: gsc_ptr(:,:)
304  type(fock_common_type),pointer :: fock
305  type(pawcprj_type),pointer :: cwaveprj_fock(:,:),cwaveprj_idat(:,:),cwaveprj_nonlop(:,:)
306  logical :: transfer_omp_args
307  integer :: tmp2i(2)
308 
309 ! *********************************************************************
310 
311  DBG_ENTER("COLL")
312 
313 !Keep track of total time spent in getghc:
314  call timab(350+tim_getghc,1,tsec)
315  ABI_NVTX_START_RANGE(NVTX_GETGHC)
316 
317 !Structured debugging if prtvol==-level
318  if(prtvol==-level)then
319    write(msg,'(80a,a,a)') ('=',ii=1,80),ch10,' getghc : enter, debugging '
320    call wrtout(std_out,msg,'PERS')
321  end if
322 
323 !Select k-dependent objects according to select_k input parameter
324  select_k_=1;if (present(select_k)) select_k_=select_k
325  if (select_k_==KPRIME_H_K) then
326 !  <k^prime|H|k>
327    npw_k1    =  gs_ham%npw_fft_k ; npw_k2    =  gs_ham%npw_fft_kp
328    kpt_k1    => gs_ham%kpt_k     ; kpt_k2    => gs_ham%kpt_kp
329    kg_k1     => gs_ham%kg_k      ; kg_k2     => gs_ham%kg_kp
330    gbound_k1 => gs_ham%gbound_k  ; gbound_k2 => gs_ham%gbound_kp
331    kinpw_k1  => gs_ham%kinpw_k   ; kinpw_k2  => gs_ham%kinpw_kp
332  else if (select_k_==K_H_KPRIME) then
333 !  <k|H|k^prime>
334    npw_k1    =  gs_ham%npw_fft_kp; npw_k2    =  gs_ham%npw_fft_k
335    kpt_k1    => gs_ham%kpt_kp    ; kpt_k2    => gs_ham%kpt_k
336    kg_k1     => gs_ham%kg_kp     ; kg_k2     => gs_ham%kg_k
337    gbound_k1 => gs_ham%gbound_kp ; gbound_k2 => gs_ham%gbound_k
338    kinpw_k1  => gs_ham%kinpw_kp  ; kinpw_k2  => gs_ham%kinpw_k
339  else if (select_k_==K_H_K) then
340 !  <k|H|k>
341    npw_k1    =  gs_ham%npw_fft_k ; npw_k2    =  gs_ham%npw_fft_k
342    kpt_k1    => gs_ham%kpt_k     ; kpt_k2    => gs_ham%kpt_k
343    kg_k1     => gs_ham%kg_k      ; kg_k2     => gs_ham%kg_k
344    gbound_k1 => gs_ham%gbound_k  ; gbound_k2 => gs_ham%gbound_k
345    kinpw_k1  => gs_ham%kinpw_k   ; kinpw_k2  => gs_ham%kinpw_k
346  else if (select_k_==KPRIME_H_KPRIME) then
347 !  <k^prime|H|k^prime>
348    npw_k1    =  gs_ham%npw_fft_kp; npw_k2    =  gs_ham%npw_fft_kp
349    kpt_k1    => gs_ham%kpt_kp    ; kpt_k2    => gs_ham%kpt_kp
350    kg_k1     => gs_ham%kg_kp     ; kg_k2     => gs_ham%kg_kp
351    gbound_k1 => gs_ham%gbound_kp ; gbound_k2 => gs_ham%gbound_kp
352    kinpw_k1  => gs_ham%kinpw_kp  ; kinpw_k2  => gs_ham%kinpw_kp
353  end if
354  k1_eq_k2=(all(abs(kpt_k1(:)-kpt_k2(:))<tol8))
355 
356 !Check sizes
357  my_nspinor=max(1,gs_ham%nspinor/mpi_enreg%nproc_spinor)
358  if (size(cwavef)<2*npw_k1*my_nspinor*ndat) then
359    ABI_BUG('wrong size for cwavef!')
360  end if
361  if (size(ghc)<2*npw_k2*my_nspinor*ndat) then
362    ABI_BUG('wrong size for ghc!')
363  end if
364  if (sij_opt==1) then
365    if (size(gsc)<2*npw_k2*my_nspinor*ndat) then
366      ABI_BUG('wrong size for gsc!')
367    end if
368  end if
369  if (gs_ham%usepaw==1.and.cpopt>=0) then
370    if (size(cwaveprj)<gs_ham%natom*my_nspinor*ndat) then
371      ABI_BUG('wrong size for cwaveprj!')
372    end if
373  end if
374  if (any(type_calc == [0, 2, 3])) then
375    local_gvnlxc = size(gvnlxc)==0
376    if (local_gvnlxc) then
377      ABI_MALLOC(gvnlxc_,(2,npw_k2*my_nspinor*ndat))
378    else
379      gvnlxc_ => gvnlxc
380    end if
381    if (size(gvnlxc_)<2*npw_k2*my_nspinor*ndat) then
382      ABI_BUG('wrong size for gvnlxc!')
383    end if
384  end if
385  use_cwavef_r=present(cwavef_r)
386  n4=gs_ham%n4 ; n5=gs_ham%n5 ; n6=gs_ham%n6
387  nspinortot=gs_ham%nspinor
388  if (use_cwavef_r) then
389    if (size(cwavef_r,1)/=2) then
390      ABI_BUG('wrong size for cwavef_r (dimension 1)')
391    end if
392    if (size(cwavef_r,2)/=n4) then
393      ABI_BUG('wrong size for cwavef_r (dimension 2)')
394    end if
395    if (size(cwavef_r,3)/=n5) then
396      ABI_BUG('wrong size for cwavef_r (dimension 3)')
397    end if
398    if (size(cwavef_r,4)/=n6) then
399      ABI_BUG('wrong size for cwavef_r (dimension 4)')
400    end if
401    if (size(cwavef_r,5)/=nspinortot) then
402      ABI_BUG('wrong size for cwavef_r (dimension 5)')
403    end if
404  end if
405 
406 !Eventually overwrite plane waves data for FFT
407  if (present(kg_fft_k)) then
408    kg_k1 => kg_fft_k ; kg_k2 => kg_fft_k
409    npw_k1=size(kg_k1,2) ; npw_k2=size(kg_k2,2)
410  end if
411  if (present(kg_fft_kp)) then
412    kg_k2 => kg_fft_kp ; npw_k2=size(kg_k2,2)
413  end if
414 
415 !paral_kgb constraint
416  if (mpi_enreg%paral_kgb==1.and.(.not.k1_eq_k2)) then
417    ABI_BUG('paral_kgb=1 not allowed for k/=k_^prime!')
418  end if
419 
420 has_fock=.false.
421 !Do we add Fock exchange term ?
422  if (associated(gs_ham%fockcommon)) then
423    ABI_BUG("Fock exchange term calculation not supported in GPU mode")
424  end if
425 
426  if (gs_ham%gpu_option/=ABI_GPU_OPENMP) then
427    ABI_BUG('Unexpected value for gs_ham%gpu_option (debugging) ! ')
428  end if
429 
430 !Parallelization over spinors management
431  if (mpi_enreg%paral_spinor==0) then
432    shift1=npw_k1;shift2=npw_k2
433    nspinor1TreatedByThisProc=.true.
434    nspinor2TreatedByThisProc=(nspinortot==2)
435  else
436    shift1=0;shift2=0
437    nspinor1TreatedByThisProc=(mpi_enreg%me_spinor==0)
438    nspinor2TreatedByThisProc=(mpi_enreg%me_spinor==1)
439  end if
440 
441  paw_opt=gs_ham%usepaw ; if (sij_opt/=0) paw_opt=sij_opt+3
442  if (gs_ham%usepaw==0) gsc_ptr => nonlop_dum
443  if (gs_ham%usepaw==1) gsc_ptr => gsc
444 
445  !$OMP TARGET ENTER DATA MAP(alloc:gvnlxc_)
446 
447  transfer_omp_args =  .not. ( xomp_target_is_present(c_loc(ghc)) &
448    .and. xomp_target_is_present(c_loc(gsc_ptr)) &
449    .and. xomp_target_is_present(c_loc(cwavef)) )
450 
451  if (type_calc == 2) then
452    !$OMP TARGET ENTER DATA MAP(to:ghc) IF(transfer_omp_args)
453    !$OMP TARGET ENTER DATA MAP(to:gsc_ptr) IF(transfer_omp_args)
454  else
455    !$OMP TARGET ENTER DATA MAP(alloc:ghc) IF(transfer_omp_args)
456    !$OMP TARGET ENTER DATA MAP(alloc:gsc_ptr) IF(transfer_omp_args)
457  end if
458  !$OMP TARGET ENTER DATA MAP(to:cwavef) IF(transfer_omp_args)
459 
460 !============================================================
461 ! Application of the local potential
462 !============================================================
463  ABI_NVTX_START_RANGE(NVTX_GETGHC_LOCPOT)
464 
465  if (any(type_calc == [0, 1, 3])) then
466 
467 !  Need a Vlocal
468    if (.not.associated(gs_ham%vlocal)) then
469      ABI_BUG("We need vlocal in gs_ham!")
470    end if
471 
472 !  fourwf can only process with one value of istwf_k
473    if (.not.k1_eq_k2) then
474      ABI_BUG('vlocal (fourwf) cannot be computed with k/=k^prime!')
475    end if
476 
477 !  Apply the local potential to the wavefunction
478 !  Start from wavefunction in reciprocal space cwavef
479    if(buf_initialized==0 .or. mod__n4/=gs_ham%n4 .or. mod__n5/=gs_ham%n5 &
480   &   .or. mod__n6/=gs_ham%n6 .or. mod__nspinor/=my_nspinor .or. mod__ndat/=ndat .or. npw_k1/=mod__npw) then
481       call alloc_getghc_ompgpu_buffers(npw_k1, my_nspinor, ndat, gs_ham%n4, gs_ham%n5, gs_ham%n6)
482    end if
483 !  End with function ghc in reciprocal space also.
484 #ifndef HAVE_GPU_HIP
485    !Work buffer allocated at each call to save memory (but too costful on HIP)
486    !$OMP TARGET ENTER DATA MAP(alloc:work)
487 #endif
488    weight=one
489    if (.not.use_cwavef_r) then
490      option_fft=2
491      if (nspinortot==2) then
492        ABI_MALLOC(cwavef1,(2,npw_k1*ndat))
493        ABI_MALLOC(cwavef2,(2,npw_k1*ndat))
494        do idat=1,ndat
495          do ipw=1,npw_k1
496            cwavef1(1:2,ipw+(idat-1)*npw_k1)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k1)
497            cwavef2(1:2,ipw+(idat-1)*npw_k1)=cwavef(1:2,ipw+(idat-1)*my_nspinor*npw_k1+shift1)
498          end do
499        end do
500 !      call cg_zcopy(npw_k1*ndat,cwavef(1,1),cwavef1)
501 !      call cg_zcopy(npw_k1*ndat,cwavef(1,1+shift1),cwavef2)
502      end if
503    else
504      option_fft=3
505      if (nspinortot==2) then
506        ABI_MALLOC(cwavef1,(0,0))
507        ABI_MALLOC(cwavef2,(0,0))
508      end if
509    end if
510 
511    if (gs_ham%nvloc==1) then
512 !  Treat scalar local potentials
513 
514      if (nspinortot==1) then
515 
516        if (use_cwavef_r) then
517          do i3=1,n6
518            do i2=1,n5
519              do i1=1,n4
520                work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,1)
521                work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,1)
522              end do
523            end do
524          end do
525        end if
526        !$OMP TASKWAIT
527        call fourwf(1,gs_ham%vlocal,cwavef,ghc,work,gbound_k1,gbound_k2,&
528 &       gs_ham%istwf_k,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat,gs_ham%ngfft,&
529 &       npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,&
530 &       weight,weight,gpu_option=gs_ham%gpu_option)
531 
532      else
533        ! nspinortot==2
534 
535        if (nspinor1TreatedByThisProc) then
536          if (use_cwavef_r) then
537            do i3=1,n6
538              do i2=1,n5
539                do i1=1,n4
540                  work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,1)
541                  work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,1)
542                end do
543              end do
544            end do
545          end if
546          ABI_MALLOC(ghc1,(2,npw_k2*ndat))
547          call fourwf(1,gs_ham%vlocal,cwavef1,ghc1,work,gbound_k1,gbound_k2,&
548 &         gs_ham%istwf_k,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat,gs_ham%ngfft,&
549 &         npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,&
550 &         weight,weight,gpu_option=gs_ham%gpu_option)
551          do idat=1,ndat
552            do ipw =1, npw_k2
553              ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2)=ghc1(1:2,ipw+(idat-1)*npw_k2)
554            end do
555          end do
556          ABI_FREE(ghc1)
557        end if ! spin 1 treated by this proc
558 
559        if (nspinor2TreatedByThisProc) then
560          if (use_cwavef_r) then
561            do i3=1,n6
562              do i2=1,n5
563                do i1=1,n4
564                  work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,2)
565                  work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,2)
566                end do
567              end do
568            end do
569          end if
570          ABI_MALLOC(ghc2,(2,npw_k2*ndat))
571          call fourwf(1,gs_ham%vlocal,cwavef2,ghc2,work,gbound_k1,gbound_k2,&
572 &         gs_ham%istwf_k,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat,gs_ham%ngfft,&
573 &         npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,&
574 &         gpu_option=gs_ham%gpu_option)
575          do idat=1,ndat
576            do ipw=1,npw_k2
577              ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2+shift2)=ghc2(1:2,ipw+(idat-1)*npw_k2)
578            end do
579          end do
580          ABI_FREE(ghc2)
581        end if ! spin 2 treated by this proc
582 
583      end if ! nspinortot
584 
585    else if (gs_ham%nvloc==4) then
586 !    Treat non-collinear local potentials
587 
588      ABI_MALLOC(ghc1,(2,npw_k2*ndat))
589      ABI_MALLOC(ghc2,(2,npw_k2*ndat))
590      ABI_MALLOC(ghc3,(2,npw_k2*ndat))
591      ABI_MALLOC(ghc4,(2,npw_k2*ndat))
592      ghc1(:,:)=zero; ghc2(:,:)=zero; ghc3(:,:)=zero ;  ghc4(:,:)=zero
593      if (use_cwavef_r) then
594        ABI_MALLOC(vlocal_tmp,(0,0,0))
595      else
596        ABI_MALLOC(vlocal_tmp,(gs_ham%n4,gs_ham%n5,gs_ham%n6))
597      end if
598 !    ghc1=v11*phi1
599      if (nspinor1TreatedByThisProc) then
600        if (use_cwavef_r) then
601          do i3=1,n6
602            do i2=1,n5
603              do i1=1,n4
604                work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(1,i1,i2,i3,1)
605                work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)*cwavef_r(2,i1,i2,i3,1)
606              end do
607            end do
608          end do
609        else
610          ! LB,07/22:
611          ! Weird segmentation fault encountered here if called with multithreaded_getghc for big systems.
612          ! Using an explicit loop instead of fortran syntax seems to solve the problem, I don't understand why...
613          !vlocal_tmp(:,:,:)=gs_ham%vlocal(:,:,:,1)
614          do i3=1,n6
615            do i2=1,n5
616              do i1=1,n4
617                vlocal_tmp(i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,1)
618              end do
619            end do
620          end do
621        end if
622        call fourwf(1,vlocal_tmp,cwavef1,ghc1,work,gbound_k1,gbound_k2,&
623 &       gs_ham%istwf_k,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat,gs_ham%ngfft,&
624 &       npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,&
625 &       gpu_option=gs_ham%gpu_option)
626      end if
627 !    ghc2=v22*phi2
628      if (nspinor2TreatedByThisProc) then
629        if (use_cwavef_r) then
630          do i3=1,n6
631            do i2=1,n5
632              do i1=1,n4
633                work(1,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,2)*cwavef_r(1,i1,i2,i3,2)
634                work(2,i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,2)*cwavef_r(2,i1,i2,i3,2)
635              end do
636            end do
637          end do
638        else
639          ! LB,07/22:
640          ! Weird segmentation fault encountered here if called with multithreaded_getghc for big systems.
641          ! Using an explicit loop instead of fortran syntax seems to solve the problem, I don't understand why...
642          !vlocal_tmp(:,:,:)=gs_ham%vlocal(:,:,:,2)
643          do i3=1,n6
644            do i2=1,n5
645              do i1=1,n4
646                vlocal_tmp(i1,i2,i3) = gs_ham%vlocal(i1,i2,i3,2)
647              end do
648            end do
649          end do
650        end if
651        call fourwf(1,vlocal_tmp,cwavef2,ghc2,work,gbound_k1,gbound_k2,&
652 &       gs_ham%istwf_k,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat,gs_ham%ngfft,&
653 &       npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,&
654 &       gpu_option=gs_ham%gpu_option)
655      end if
656      ABI_FREE(vlocal_tmp)
657      cplex=2
658      if (use_cwavef_r) then
659        ABI_MALLOC(vlocal_tmp,(0,0,0))
660      else
661        ABI_MALLOC(vlocal_tmp,(cplex*gs_ham%n4,gs_ham%n5,gs_ham%n6))
662      end if
663 !    ghc3=(re(v12)-im(v12))*phi1
664      if (nspinor1TreatedByThisProc) then
665        if (use_cwavef_r) then
666          do i3=1,n6
667            do i2=1,n5
668              do i1=1,n4
669                work(1,i1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(1,i1,i2,i3,1)+gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(2,i1,i2,i3,1)
670                work(2,i1,i2,i3)=-gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(1,i1,i2,i3,1)+gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(2,i1,i2,i3,1)
671              end do
672            end do
673          end do
674        else
675          do i3=1,gs_ham%n6
676            do i2=1,gs_ham%n5
677              do i1=1,gs_ham%n4
678                vlocal_tmp(2*i1-1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3)
679                vlocal_tmp(2*i1  ,i2,i3)=-gs_ham%vlocal(i1,i2,i3,4)
680              end do
681            end do
682          end do
683        end if
684        call fourwf(cplex,vlocal_tmp,cwavef1,ghc3,work,gbound_k1,gbound_k2,&
685 &       gs_ham%istwf_k,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat,gs_ham%ngfft,&
686 &       npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,&
687 &       gpu_option=gs_ham%gpu_option)
688      end if
689 !    ghc4=(re(v12)+im(v12))*phi2
690      if (nspinor2TreatedByThisProc) then
691        if (use_cwavef_r) then
692          do i3=1,n6
693            do i2=1,n5
694              do i1=1,n4
695                work(1,i1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(1,i1,i2,i3,2)-gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(2,i1,i2,i3,2)
696                work(2,i1,i2,i3)= gs_ham%vlocal(i1,i2,i3,4)*cwavef_r(1,i1,i2,i3,2)+gs_ham%vlocal(i1,i2,i3,3)*cwavef_r(2,i1,i2,i3,2)
697              end do
698            end do
699          end do
700        else
701          do i3=1,gs_ham%n6
702            do i2=1,gs_ham%n5
703              do i1=1,gs_ham%n4
704                vlocal_tmp(2*i1-1,i2,i3)= gs_ham%vlocal(i1,i2,i3,3)
705                vlocal_tmp(2*i1  ,i2,i3)= gs_ham%vlocal(i1,i2,i3,4)
706              end do
707            end do
708          end do
709        end if
710        call fourwf(cplex,vlocal_tmp,cwavef2,ghc4,work,gbound_k1,gbound_k2,&
711 &       gs_ham%istwf_k,kg_k1,kg_k2,gs_ham%mgfft,mpi_enreg,ndat,gs_ham%ngfft,&
712 &       npw_k1,npw_k2,gs_ham%n4,gs_ham%n5,gs_ham%n6,option_fft,tim_fourwf,weight,weight,&
713 &       gpu_option=gs_ham%gpu_option)
714      end if
715      ABI_FREE(vlocal_tmp)
716 !    Build ghc from pieces
717 !    (v11,v22,Re(v12)+iIm(v12);Re(v12)-iIm(v12))(psi1;psi2): matrix product
718      if (mpi_enreg%paral_spinor==0) then
719        do idat=1,ndat
720          do ipw=1,npw_k2
721            ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2)       =ghc1(1:2,ipw+(idat-1)*npw_k2)+ghc4(1:2,ipw+(idat-1)*npw_k2)
722            ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2+shift2)=ghc3(1:2,ipw+(idat-1)*npw_k2)+ghc2(1:2,ipw+(idat-1)*npw_k2)
723          end do
724        end do
725      else
726        call xmpi_sum(ghc4,mpi_enreg%comm_spinor,ierr)
727        call xmpi_sum(ghc3,mpi_enreg%comm_spinor,ierr)
728        if (nspinor1TreatedByThisProc) then
729          do idat=1,ndat
730            do ipw=1,npw_k2
731              ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2)=ghc1(1:2,ipw+(idat-1)*npw_k2)+ghc4(1:2,ipw+(idat-1)*npw_k2)
732            end do
733          end do
734        else if (nspinor2TreatedByThisProc) then
735          do idat=1,ndat
736            do ipw=1,npw_k2
737              ghc(1:2,ipw+(idat-1)*my_nspinor*npw_k2+shift2)=ghc3(1:2,ipw+(idat-1)*npw_k2)+ghc2(1:2,ipw+(idat-1)*npw_k2)
738            end do
739          end do
740        end if
741      end if
742      ABI_FREE(ghc1)
743      ABI_FREE(ghc2)
744      ABI_FREE(ghc3)
745      ABI_FREE(ghc4)
746    end if ! nvloc
747 
748    if (nspinortot==2)  then
749      ABI_FREE(cwavef1)
750      ABI_FREE(cwavef2)
751    end if
752 
753    if (type_calc==1) then
754      if(transfer_omp_args) then
755        !$OMP TARGET UPDATE FROM(ghc) nowait
756      end if
757    end if
758 
759 #ifndef HAVE_GPU_HIP
760    !$OMP TARGET EXIT DATA MAP(delete:work)
761 #endif
762 
763  end if ! type_calc
764  ABI_NVTX_END_RANGE()
765 
766 
767  if (any(type_calc == [0, 2, 3])) then
768 
769 !============================================================
770 ! Application of the non-local potential and the Fock potential
771 !============================================================
772 
773    ABI_NVTX_START_RANGE(NVTX_GETGHC_NLOCPOT)
774    if (type_calc==0 .or. type_calc==2) then
775      signs=2 ; choice=1 ; nnlout=1 ; idir=0 ; tim_nonlop=1
776      cpopt_here=-1;if (gs_ham%usepaw==1) cpopt_here=cpopt
777      cwaveprj_nonlop=>cwaveprj
778      lambda_ndat = lambda
779 
780      call nonlop(choice,cpopt_here,cwaveprj_nonlop,enlout,gs_ham,idir,lambda_ndat,mpi_enreg,ndat,&
781 &     nnlout,paw_opt,signs,gsc_ptr,tim_nonlop,cwavef,gvnlxc_,select_k=select_k_)
782 
783    else if (type_calc == 3) then
784      ! for kinetic and local only, nonlocal and vfock should be zero
785      gvnlxc_(:,:) = zero
786    end if ! if(type_calc...
787    ABI_NVTX_END_RANGE()
788 
789 !============================================================
790 ! Assemble kinetic, local, nonlocal and Fock contributions
791 !============================================================
792 
793    ABI_NVTX_START_RANGE(NVTX_GETGHC_KIN)
794 !  Assemble modified kinetic, local and nonlocal contributions
795    !  to <G|H|C(n,k)>. Take also into account build-in debugging.
796    if(prtvol/=-level)then
797      if (k1_eq_k2) then
798        !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) &
799        !$OMP& PRIVATE(idat,ispinor,ig) &
800        !$OMP& MAP(to:ghc,kinpw_k2,gvnlxc_,gsc,cwavef) MAP(tofrom:kinpw_k2)
801        do idat=1,ndat
802          do ispinor=1,my_nspinor
803            do ig=1,npw_k2
804              igspinor=ig+npw_k2*(ispinor-1)+npw_k2*my_nspinor*(idat-1)
805              if(kinpw_k2(ig)<huge(zero)*1.d-11)then
806                ghc(re,igspinor)= ghc(re,igspinor) + kinpw_k2(ig)*cwavef(re,igspinor) + gvnlxc_(re,igspinor)
807                ghc(im,igspinor)= ghc(im,igspinor) + kinpw_k2(ig)*cwavef(im,igspinor) + gvnlxc_(im,igspinor)
808              else
809                ghc(re,igspinor)=zero
810                ghc(im,igspinor)=zero
811                if (sij_opt==1) then
812                  gsc(re,igspinor)=zero
813                  gsc(im,igspinor)=zero
814                end if
815              end if
816            end do ! ig
817          end do ! ispinor
818        end do ! idat
819      else
820        !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) &
821        !$OMP& PRIVATE(idat,ispinor,ig) &
822        !$OMP& MAP(to:ghc,gvnlxc_,gsc) MAP(tofrom:kinpw_k2)
823        do idat=1,ndat
824          do ispinor=1,my_nspinor
825            do ig=1,npw_k2
826              igspinor=ig+npw_k2*(ispinor-1)+npw_k2*my_nspinor*(idat-1)
827              if(kinpw_k2(ig)<huge(zero)*1.d-11)then
828                ghc(re,igspinor)= ghc(re,igspinor) + gvnlxc_(re,igspinor)
829                ghc(im,igspinor)= ghc(im,igspinor) + gvnlxc_(im,igspinor)
830              else
831                ghc(re,igspinor)=zero
832                ghc(im,igspinor)=zero
833                if (sij_opt==1) then
834                  gsc(re,igspinor)=zero
835                  gsc(im,igspinor)=zero
836                end if
837              end if
838            end do ! ig
839          end do ! ispinor
840        end do ! idat
841      end if
842    else
843      !$OMP TARGET UPDATE FROM(ghc)
844      !$OMP TARGET UPDATE FROM(gsc_ptr)
845 !    Here, debugging section
846      call wrtout(std_out,' getghc : components of ghc ','PERS')
847      write(msg,'(a)')&
848 &     'icp ig ispinor igspinor re/im     ghc        kinpw         cwavef      glocc        gvnlxc  gsc'
849      call wrtout(std_out,msg,'PERS')
850      do idat=1,ndat
851        do ispinor=1,my_nspinor
852          do ig=1,npw_k2
853            igspinor=ig+npw_k2*(ispinor-1)+npw_k2*my_nspinor*(idat-1)
854            if(kinpw_k2(ig)<huge(zero)*1.d-11)then
855              if (k1_eq_k2) then
856                ghcre=kinpw_k2(ig)*cwavef(re,igspinor)+ghc(re,igspinor)+gvnlxc_(re,igspinor)
857                ghcim=kinpw_k2(ig)*cwavef(im,igspinor)+ghc(im,igspinor)+gvnlxc_(im,igspinor)
858              else
859                ghcre=ghc(re,igspinor)+gvnlxc_(re,igspinor)
860                ghcim=ghc(im,igspinor)+gvnlxc_(im,igspinor)
861              end if
862            else
863              ghcre=zero
864              ghcim=zero
865              if (sij_opt==1) then
866                gsc(re,igspinor)=zero
867                gsc(im,igspinor)=zero
868              end if
869            end if
870            iispinor=ispinor;if (mpi_enreg%paral_spinor==1) iispinor=mpi_enreg%me_spinor+1
871            if (sij_opt == 1) then
872              write(msg,'(a,3(1x,i5),6(1x,es13.6))') '  1 ', ig, iispinor, igspinor,ghcre,&
873 &             kinpw_k2(ig),cwavef(re,igspinor),ghc(re,igspinor),gvnlxc_(re,igspinor), gsc(re,igspinor)
874              call wrtout(std_out,msg,'PERS')
875              write(msg,'(a,3(1x,i5),6(1x,es13.6))') '  2 ', ig, iispinor, igspinor,ghcim,&
876 &             kinpw_k2(ig),cwavef(im,igspinor),ghc(im,igspinor),gvnlxc_(im,igspinor), gsc(im,igspinor)
877              call wrtout(std_out,msg,'PERS')
878            else
879              write(msg,'(a,3(1x,i5),6(1x,es13.6))') '  1 ', ig, iispinor, igspinor,ghcre,&
880 &             kinpw_k2(ig),cwavef(re,igspinor),ghc(re,igspinor),gvnlxc_(re,igspinor)
881              call wrtout(std_out,msg,'PERS')
882              write(msg,'(a,3(1x,i5),6(1x,es13.6))') '  2 ', ig, iispinor, igspinor,ghcim,&
883 &             kinpw_k2(ig),cwavef(im,igspinor),ghc(im,igspinor),gvnlxc_(im,igspinor)
884              call wrtout(std_out,msg,'PERS')
885            end if
886            ghc(re,igspinor)=ghcre
887            ghc(im,igspinor)=ghcim
888          end do ! ig
889        end do ! ispinor
890      end do ! idat
891    end if
892    ABI_NVTX_END_RANGE()
893 
894 !  Special case of PAW + Fock : only return Fock operator contribution in gvnlxc_
895    if (gs_ham%usepaw==1 .and. has_fock) then
896      gvnlxc_=gvnlxc_-gvnlc
897      ABI_FREE(gvnlc)
898    endif
899 
900    if(transfer_omp_args) then
901      !$OMP TARGET UPDATE FROM(ghc) nowait
902      !$OMP TARGET UPDATE FROM(gsc_ptr) nowait
903      !$OMP TARGET UPDATE FROM(cwavef) nowait
904    end if
905    if (.not. local_gvnlxc) then
906      !$OMP TARGET UPDATE FROM(gvnlxc_) nowait
907    end if
908    !$OMP TASKWAIT
909 
910 !  Structured debugging : if prtvol=-level, stop here.
911    if(prtvol==-level)then
912      write(msg,'(a,i0,a)')' getghc : exit prtvol=-',level,', debugging mode => stop '
913      ABI_ERROR(msg)
914    end if
915 
916  end if ! type_calc
917 
918  !$OMP TARGET EXIT DATA MAP(delete:cwavef,gsc_ptr,ghc) IF(transfer_omp_args)
919  !$OMP TARGET EXIT DATA MAP(delete:gvnlxc_)
920  if (local_gvnlxc .and. any(type_calc == [0, 2, 3])) then
921    ABI_FREE(gvnlxc_)
922  end if
923 
924  call timab(350+tim_getghc,2,tsec)
925 
926  DBG_EXIT("COLL")
927 
928  ABI_NVTX_END_RANGE()
929 #else
930 
931  ABI_UNUSED((/cpopt,ndat,prtvol,sij_opt,tim_getghc/))
932  ABI_UNUSED((/kg_fft_k,kg_fft_kp,type_calc,select_k/))
933  ABI_UNUSED((/gsc,cwavef,cwavef_r,ghc,gvnlxc,lambda/))
934  ABI_UNUSED_A(mpi_enreg)
935  ABI_UNUSED_A(cwaveprj)
936  ABI_UNUSED_A(gs_ham)
937  ABI_BUG("Unhandled configuration for OpenMP GPU immplementation")
938 
939 #endif
940 end subroutine getghc_ompgpu

ABINIT/getghc_ompgpu_work_mem [ Functions ]

[ Top ] [ Functions ]

NAME

 getghc_ompgpu_work_mem

FUNCTION

 Returns work memory requirement for getghc_ompgpu

INPUTS

 gs_ham <type(gs_hamiltonian_type)>=contains dimensions of FFT domain
 ndat=size of batch for fourwf and nonlop processing

OUTPUT

 req_mem=amount in bytes of required memory for getghc_ompgpu

ABINIT/m_getghc_ompgpu [ Modules ]

[ Top ] [ Modules ]

NAME

  m_getghc_ompgpu

FUNCTION

 Compute <G|H|C> for input vector |C> expressed in reciprocal space - OpenMP GPU version;

COPYRIGHT

  Copyright (C) 1998-2024 ABINIT group (DCA, XG, GMR, LSI, MT, JB, JWZ)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

16 #if defined HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include "abi_common.h"
21 
22 ! nvtx related macro definition
23 #include "nvtx_macros.h"
24 
25 module m_getghc_ompgpu
26 
27  use defs_basis
28  use m_errors
29  use m_abicore
30  use m_xmpi
31  use m_xomp
32  use, intrinsic :: iso_c_binding
33 
34  use defs_abitypes, only : mpi_type
35  use m_time,        only : timab
36  use m_pawcprj,     only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_getdim, pawcprj_copy
37  use m_hamiltonian, only : gs_hamiltonian_type, KPRIME_H_K, K_H_KPRIME, K_H_K, KPRIME_H_KPRIME
38  use m_fock,        only : fock_common_type, fock_get_getghc_call
39  use m_nonlop,      only : nonlop
40  use m_fft,         only : fourwf
41 
42  !FIXME Keep those in these modules or moves them together ?
43  use m_ompgpu_fourwf,      only : ompgpu_fourwf_work_mem
44  use m_gemm_nonlop_ompgpu, only : gemm_nonlop_ompgpu_work_mem
45 
46 #ifdef HAVE_FC_ISO_C_BINDING
47  use iso_c_binding
48 #endif
49 
50 #if defined(HAVE_GPU) && defined(HAVE_GPU_MARKERS)
51  use m_nvtx_data
52 #endif
53 
54  implicit none
55 
56  private