ABINIT/m_invovl [ Modules ]

NAME

  m_invovl

FUNCTION

  Provides functions to invert the overlap matrix S. Used by the Chebyshev solver in PAW.
  See the paper by A. Levitt and M. Torrent for details.
  S = 1 + projs * s_projs * projs'
  S^-1 = 1 + projs * inv_s_projs * projs', with
  inv_s_projs = - (s_projs^-1 + projs'*projs)^-1
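
  This is the Woodbury matrix identity. As a sketch of the algebra (a reasoning aid,
  not part of the code), apply
    (A + U*C*V)^-1 = A^-1 - A^-1 * U * (C^-1 + V*A^-1*U)^-1 * V * A^-1
  with A = 1, U = projs, C = s_projs, V = projs':
    S^-1 = (1 + projs * s_projs * projs')^-1
         = 1 - projs * (s_projs^-1 + projs'*projs)^-1 * projs'
  which is the expression above, with inv_s_projs = - (s_projs^-1 + projs'*projs)^-1.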

COPYRIGHT

 Copyright (C) 2013-2022 ABINIT group (AL)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

20 #if defined HAVE_CONFIG_H
21 #include "config.h"
22 #endif
23 
24 #include "abi_common.h"
25 
26 ! nvtx related macro definition
27 #include "nvtx_macros.h"
28 
29 MODULE m_invovl
30 
31  use defs_basis
32  use m_errors
33  use m_xmpi
34  use m_abicore
35 
36  use defs_abitypes, only : mpi_type
37  use m_time,        only : timab
38  use m_hamiltonian, only : gs_hamiltonian_type
39  use m_bandfft_kpt, only : bandfft_kpt_get_ikpt
40  use m_pawcprj,     only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_axpby
41  use m_nonlop,      only : nonlop
42  use m_prep_kgb,    only : prep_nonlop
43 
44 #ifdef HAVE_FC_ISO_C_BINDING
45  use, intrinsic :: iso_c_binding, only : c_ptr, c_int32_t, c_int64_t, c_float, c_double, c_size_t, c_loc
46 #endif
47 
48 #if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_NVTX_V3)
49  use m_nvtx_data
50 #endif
51 
52  implicit none
53 
54  private
55 
56 !public procedures.
57  public :: init_invovl
58  public :: make_invovl
59  public :: apply_invovl
60  public :: destroy_invovl

m_invovl/apply_block [ Functions ]

NAME

 apply_block

FUNCTION

 Helper function: applies the block-diagonal matrix mat(lmnmax, lmnmax, ntypat) to the
 packed projection vectors x, giving y = mat*x (one dense block per atom type)

INPUTS

 ham= hamiltonian data, used here for lmnmax, ntypat, nattyp and indlmn
 cplx= 1 if real, 2 if complex
 mat(cplx, lmnmax, lmnmax, ntypat)= block-diagonal matrix, one block per atom type
 nprojs= total number of projectors
 ndat= number of vectors to process
 x(cplx, nprojs, ndat)= input vectors
 y(cplx, nprojs, ndat)= output vectors, y = mat*x
 block_sliced= 1 to treat one idat slice at a time, 0 to treat all ndat at once
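
 The packed index space of x and y groups projectors by atom type: each of the
 ham%nattyp(itypat) atoms of type itypat occupies nlmn consecutive entries, in the
 same order as the loops below. A minimal standalone sketch of this indexing, with
 hypothetical sizes (not part of the module):

   program packing_demo
     implicit none
     integer, parameter :: ntypat = 2
     integer :: nattyp(ntypat) = [2, 1]  ! number of atoms of each type
     integer :: nlmn(ntypat) = [8, 13]   ! number of projectors per atom of each type
     integer :: itypat, ia, shift

     shift = 0
     do itypat = 1, ntypat
       do ia = 1, nattyp(itypat)
         ! the block for this atom spans entries shift+1 .. shift+nlmn(itypat)
         write(*, '(4(a,i0))') 'type ', itypat, ' atom ', ia, &
   &         ': projs ', shift+1, '..', shift+nlmn(itypat)
         shift = shift + nlmn(itypat)
       end do
     end do
   end program packing_demo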

SOURCE

 950 subroutine apply_block(ham, cplx, mat, nprojs, ndat, x, y, block_sliced)
 951 
 952   use m_abi_linalg
 953   implicit none
 954 
 955   integer,intent(in) :: ndat, nprojs, cplx
 956   real(dp), intent(inout) :: x(cplx, nprojs, ndat), y(cplx, nprojs, ndat)
 957   type(gs_hamiltonian_type),intent(in) :: ham
 958   real(dp), intent(in) :: mat(cplx, ham%lmnmax, ham%lmnmax, ham%ntypat)
 959   integer, intent(in) :: block_sliced
 960 
 961   integer :: nlmn, shift, itypat, idat
 962 
 963 ! *************************************************************************
 964 
 965   ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER_APPLY_BLOCK)
 966   if (block_sliced == 1) then
 967 
 968      do idat = 1, ndat
 969         shift = 1
 970         do itypat=1, ham%ntypat
 971            nlmn = count(ham%indlmn(3,:,itypat)>0)
 972            !! apply mat to all atoms at once
 973            ! perform natom multiplications of size nlmn
 974            ! compute y = mat*x
 975            if(cplx == 2) then
 976               call ZHEMM('L','U', nlmn, ham%nattyp(itypat), cone, &
 977                    &  mat(:, :, :, itypat), ham%lmnmax, &
 978                    &  x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn, czero, &
 979                    &  y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn)
 980            else
 981               call DSYMM('L','U', nlmn, ham%nattyp(itypat), one, &
 982                    &  mat(:, :, :, itypat), ham%lmnmax, &
 983                    &  x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn, zero, &
 984                    &  y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn)
 985            end if
 986            shift = shift + nlmn*ham%nattyp(itypat)
 987         end do
 988      end do
 989 
 990   else ! block_sliced = 0
 991 
 992      shift = 1
 993      do itypat=1, ham%ntypat
 994         nlmn = count(ham%indlmn(3,:,itypat)>0)
 995         !! apply mat to all atoms at once, all idat at once
 996         ! perform natom multiplications of size nlmn
 997         ! be careful: the matrices extracted from x and y are not memory contiguous
 998         ! ==> so in the GPU version we will need to adapt leading dimension
 999         if(cplx == 2) then
1000            call ZHEMM('L','U', nlmn, ham%nattyp(itypat)*ndat, cone, &
1001                 &  mat(:, :, :, itypat), ham%lmnmax, &
1002                 &  x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn, czero, &
1003                 &  y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn)
1004         else
1005            call DSYMM('L','U', nlmn, ham%nattyp(itypat)*ndat, one, &
1006                 &  mat(:, :, :, itypat), ham%lmnmax, &
1007                 &  x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn, zero, &
1008                 &  y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn)
1009         end if
1010         shift = shift + nlmn*ham%nattyp(itypat)
1011      end do
1012 
1013   end if
1014  ABI_NVTX_END_RANGE()
1015 
1016 end subroutine apply_block

m_invovl/apply_invovl [ Functions ]

NAME

 apply_invovl

FUNCTION

 Applies the inverse of the overlap matrix to cwavef

INPUTS

 ham= hamiltonian data
 cwavef(2, npw*nspinor*ndat)= input wavefunctions
 sm1cwavef(2, npw*nspinor*ndat)= output: S^-1 applied to cwavef
 cwaveprj(ham%natom, nspinor*ndat)= output: projections <p_i|S^-1 cwavef>
 npw= number of plane waves
 ndat= number of bands treated at once
 mpi_enreg= MPI communication data
 nspinor= number of spinorial components
 block_sliced= passed through to apply_block
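
 A call-site sketch (assumed context: gs_ham, mpi_enreg, cwavef, npw, ndat and nspinor
 are already set up by the caller, and make_invovl has been called for the current
 k-point; the names used here are illustrative):

   real(dp), allocatable :: sm1cwavef(:,:)
   type(pawcprj_type), allocatable :: cwaveprj(:,:)

   ABI_MALLOC(sm1cwavef, (2, npw*nspinor*ndat))
   ABI_MALLOC(cwaveprj, (gs_ham%natom, nspinor*ndat))
   call pawcprj_alloc(cwaveprj, 0, gs_ham%dimcprj)

   ! sm1cwavef <- S^-1 cwavef ; cwaveprj <- <p_i|S^-1 cwavef>
   call apply_invovl(gs_ham, cwavef, sm1cwavef, cwaveprj, npw, ndat, &
  &                  mpi_enreg, nspinor, block_sliced=1)

   call pawcprj_free(cwaveprj)
   ABI_FREE(cwaveprj)
   ABI_FREE(sm1cwavef)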

SOURCE

629 subroutine apply_invovl(ham, cwavef, sm1cwavef, cwaveprj, npw, ndat, mpi_enreg, nspinor, block_sliced)
630 
631 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA)
632   use, intrinsic :: iso_c_binding
633 #endif
634 
635   implicit none
636 
637   ! args
638   type(gs_hamiltonian_type), intent(in), target :: ham
639   integer, intent(in) :: npw, ndat
640   integer, intent(in) :: nspinor
641   integer, intent(in) :: block_sliced
642   real(dp), intent(inout) :: cwavef(2, npw*nspinor*ndat) ! TODO should be in, fix nonlop
643   type(mpi_type) :: mpi_enreg
644   real(dp), intent(inout) :: sm1cwavef(2, npw*nspinor*ndat)
645   type(pawcprj_type), intent(inout) :: cwaveprj(ham%natom,nspinor*ndat)
646 
647   real(dp),allocatable, target :: proj(:,:,:),sm1proj(:,:,:),PtPsm1proj(:,:,:)
648 
649   ! used to pass proj dimensions to cuda
650   integer(kind=c_int32_t) :: proj_dim(3)
651   integer(kind=c_int32_t) :: nattyp_dim
652   integer(kind=c_int32_t) :: indlmn_dim(3)
653 
654   integer :: idat, iatom, nlmn, shift
655   real(dp) :: tsec(2)
656 
657   integer :: choice, cpopt, paw_opt , cplx
658   character :: blas_transpose
659   type(pawcprj_type) :: cwaveprj_in(ham%natom,nspinor*ndat)
660 
661   integer :: ikpt_this_proc
662   integer, parameter :: tim_nonlop = 13
663   ! dummies
664   real(dp) :: enlout(ndat), lambda_block(1), gvnlxc(1,1)
665   integer, parameter :: nnlout = 0, idir = 0, signs = 2
666 
667   type(invovl_kpt_type), pointer :: invovl
668 
669   integer, parameter :: &
670     & timer_apply_inv_ovl_opernla = 1630, &
671     & timer_apply_inv_ovl_opernlb = 1631, &
672     & timer_apply_inv_ovl_inv_s   = 1632
673 
674   ! *************************************************************************
675 
676   ikpt_this_proc=bandfft_kpt_get_ikpt()
677   invovl => invovl_kpt(ikpt_this_proc)
678 
679   if(ham%istwf_k == 1) then
680     cplx = 2
681     blas_transpose = 'c'
682   else
683     cplx = 1
684     blas_transpose = 't'
685   end if
686 
687   ABI_MALLOC(proj,       (cplx,invovl%nprojs,nspinor*ndat))
688   ABI_MALLOC(sm1proj,    (cplx,invovl%nprojs,nspinor*ndat))
689   ABI_MALLOC(PtPsm1proj, (cplx,invovl%nprojs,nspinor*ndat))
690   proj = zero
691   sm1proj = zero
692   PtPsm1proj = zero
693 
694   proj_dim = (/ size(proj,1), size(proj,2), size(proj,3) /)
695 
696   nattyp_dim = size(ham%nattyp)
697 
698   indlmn_dim = (/ size(ham%indlmn,1), size(ham%indlmn,2), size(ham%indlmn,3) /)
699 
700 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING)
701 
702   !! memory allocation of data used in solve_inner_gpu
703   !! note : this is actually done only once
704   if (ham%use_gpu_cuda==1) then
705 
706     ! make sure to use sizes from apply_invovl
707     call f_gpu_apply_invovl_inner_alloc(proj_dim, ham%ntypat, 0)
708 
709     ! TODO find a better place to put that initialization
710     call f_gpu_init_invovl_data(indlmn_dim, c_loc(ham%indlmn(1,1,1)))
711 
712   end if
713 
714 #endif
715 
716 
717   call timab(timer_apply_inv_ovl_opernla, 1, tsec)
718 
719   call pawcprj_alloc(cwaveprj_in,0,ham%dimcprj)
720 
721   ! get the cprj
722   ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP1)
723   choice = 0 ! only compute cprj, nothing else
724   cpopt = 0 ! compute and save cprj
725   paw_opt = 3 ! S nonlocal operator
726   if (mpi_enreg%paral_kgb==1) then
727     call prep_nonlop(choice,cpopt,cwaveprj_in,enlout,ham,idir,lambda_block,ndat,mpi_enreg,&
728       &                   nnlout,paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,already_transposed=.true.)
729   else
730     call nonlop(choice,cpopt,cwaveprj_in,enlout,ham,idir,lambda_block,mpi_enreg,ndat,nnlout,&
731       &              paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc)
732   end if
733   ABI_NVTX_END_RANGE()
734 
735   call timab(timer_apply_inv_ovl_opernla, 2, tsec)
736   call timab(timer_apply_inv_ovl_inv_s, 1, tsec)
737 
738   ! copy cwaveprj_in to proj(:,:)
739   do idat=1, ndat*nspinor
740     shift = 0
741     do iatom = 1, ham%natom
742       nlmn = cwaveprj_in(iatom, idat)%nlmn
743       proj(1:cplx, shift+1:shift+nlmn, idat) = cwaveprj_in(iatom, idat)%cp(1:cplx, 1:nlmn)
744       shift = shift + nlmn
745     end do
746   end do
747 
748   !multiply by S^-1
749   ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER)
750   ! TODO : when solve_inner_gpu is ready, update the following to activate GPU computation
751   if (ham%use_gpu_cuda == 1) then
752 
753 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA)
754 
755     if (mpi_enreg%nproc_fft /= 1) then
756       ABI_ERROR("[66_wfs/m_invovl.F90:apply_invovl] nproc_fft must be 1, when GPU/CUDA is activated")
757     end if
758 
759     call f_solve_inner_gpu(proj_dim, c_loc(proj(1,1,1)), &
760       & c_loc(sm1proj(1,1,1)), c_loc(PtPsm1proj(1,1,1)), &
761       & nattyp_dim, c_loc(ham%nattyp(1)), ham%ntypat, &
762       & ham%lmnmax, cplx, block_sliced)
763 
764 #endif
765 
766   else
767 
768     call solve_inner(invovl, ham, cplx, mpi_enreg, proj, ndat*nspinor, sm1proj, PtPsm1proj, block_sliced)
769     sm1proj = - sm1proj
770     PtPsm1proj = - PtPsm1proj
771   end if
772 
773   ABI_NVTX_END_RANGE()
774 
775   ! copy sm1proj to cwaveprj(:,:)
776   do idat=1, ndat*nspinor
777     shift = 0
778     do iatom = 1, ham%natom
779       nlmn = cwaveprj(iatom, idat)%nlmn
780       cwaveprj(iatom, idat)%cp(1:cplx, 1:nlmn) = sm1proj(1:cplx, shift+1:shift+nlmn, idat)
781       shift = shift + nlmn
782     end do
783   end do
784   call timab(timer_apply_inv_ovl_inv_s, 2, tsec)
785   call timab(timer_apply_inv_ovl_opernlb, 1, tsec)
786 
787   ! get the corresponding wf
788   ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP2)
789   cpopt = 2 ! reuse cprj
790   choice = 7 ! get wf from cprj, without the application of S
791   paw_opt = 3
792   if (mpi_enreg%paral_kgb==1) then
793     call prep_nonlop(choice,cpopt,cwaveprj,enlout,ham,idir,lambda_block,ndat,mpi_enreg,nnlout,&
794       &                   paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,already_transposed=.true.)
795   else
796     call nonlop(choice,cpopt,cwaveprj,enlout,ham,idir,lambda_block,mpi_enreg,ndat,nnlout,paw_opt,&
797       &              signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc)
798   end if
799   ABI_NVTX_END_RANGE()
800 
801   call timab(timer_apply_inv_ovl_opernlb, 2, tsec)
802 
803   ! copy PtPSm1proj to cwaveprj(:,:)
804   do idat=1, ndat*nspinor
805     shift = 0
806     do iatom = 1, ham%natom
807       nlmn = cwaveprj(iatom, idat)%nlmn
808       cwaveprj(iatom, idat)%cp(1:cplx, 1:nlmn) = PtPsm1proj(1:cplx, shift+1:shift+nlmn, idat)
809       shift = shift + nlmn
810     end do
811   end do
812 
813   sm1cwavef = cwavef + sm1cwavef
814   call pawcprj_axpby(one, one, cwaveprj_in, cwaveprj)
815   call pawcprj_free(cwaveprj_in)
816 
817   ABI_FREE(proj)
818   ABI_FREE(sm1proj)
819   ABI_FREE(PtPsm1proj)
820 
821 end subroutine apply_invovl

m_invovl/destroy_invovl [ Functions ]

NAME

 destroy_invovl

FUNCTION

 Destruction of the invovl_kpt array

INPUTS

 nkpt= number of k-points
 use_gpu_cuda= 1 if GPU/CUDA support is enabled, 0 otherwise

SOURCE

313  subroutine destroy_invovl(nkpt, use_gpu_cuda)
314 
315    integer, intent(in) :: nkpt
316    integer, intent(in) :: use_gpu_cuda
317    integer :: ikpt
318 
319 ! *************************************************************************
320 
321   ! TODO add cycling if kpt parallelism
322   do ikpt=1,nkpt
323     if(invovl_kpt(ikpt)%nprojs == -1) then
324       ! write(0, *) 'ERROR invovl_kpt is unallocated'
325       cycle
326     end if
327 
328     ABI_FREE(invovl_kpt(ikpt)%gram_projs)
329     ABI_FREE(invovl_kpt(ikpt)%inv_sij)
330     ABI_FREE(invovl_kpt(ikpt)%inv_s_approx)
331     invovl_kpt(ikpt)%nprojs = -1
332   end do
333 
334   ABI_FREE(invovl_kpt)
335 
336   ABI_UNUSED(use_gpu_cuda)
337 
338 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING)
339   if (use_gpu_cuda == 1) then
340     call f_gpu_apply_invovl_inner_dealloc()
341     call f_gpu_apply_invovl_matrix_dealloc()
342   end if
343 #endif
344 
345  end subroutine destroy_invovl

m_invovl/init_invovl [ Functions ]

NAME

 init_invovl

FUNCTION

 Initialization of the invovl_kpt array

INPUTS

 nkpt= number of k-points

SOURCE

286  subroutine init_invovl(nkpt)
287 
288   integer, intent(in) :: nkpt
289   integer :: ikpt
290 
291 ! *************************************************************************
292 
293   ABI_MALLOC(invovl_kpt, (nkpt))
294   ! TODO add cycling if kpt parallelism
295   do ikpt=1,nkpt
296     invovl_kpt(ikpt)%nprojs = -1
297   end do
298 
299  end subroutine init_invovl

m_invovl/invovl_kpt_type [ Types ]

NAME

 invovl_kpt_type

FUNCTION

 Contains information needed to invert the overlap matrix S
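
 In the notation of the module header: gram_projs holds projs'*projs, inv_sij holds
 s_projs^-1 (one block per atom type), and inv_s_approx holds a one-atom-block
 approximation of (s_projs^-1 + projs'*projs)^-1, used by solve_inner as a
 preconditioner.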

SOURCE

 73 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA)
 74 
 75  type, public :: invovl_kpt_type
 76 
 77    integer(kind=c_int32_t) :: nprojs
 78    !  total number of projectors
 79    !    nlmn for a specific atom = count(indlmn(3,:,itypat)>0)
 80    !  A value of -1 means that the following arrays are not allocated
 81 
 82    real(kind=c_double), allocatable :: gram_projs(:,:,:)
 83    ! gram_projs(cplx, nprojs, nprojs)
 84    ! projs' * projs
 85 
 86    real(kind=c_double), allocatable :: inv_sij(:,:,:,:)
 87    ! inv_sij(cplx, lmnmax, lmnmax, ntypat)
 88    ! inverse of ham%sij
 89 
 90    real(kind=c_double), allocatable :: inv_s_approx(:,:,:,:)
 91    ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat)
 92    ! preconditioner
 93 
 94  end type invovl_kpt_type
 95 
 96  !> companion type to invovl_kpt_type to pass data to gpu/cuda
 97  type, bind(c), public :: invovl_kpt_gpu_type
 98 
 99    integer(kind=c_int32_t) :: nprojs
100    !  total number of projectors
101    !    nlmn for a specific atom = count(indlmn(3,:,itypat)>0)
102    !  A value of -1 means that the following arrays are not allocated
103 
104    type(c_ptr) :: gram_projs
105    ! gram_projs(cplx, nprojs, nprojs)
106    ! projs' * projs
107 
108    integer(kind=c_int32_t) :: gram_projs_dim(3)
109 
110    type(c_ptr) :: inv_sij
111    ! inv_sij(cplx, lmnmax, lmnmax, ntypat)
112    ! inverse of ham%sij
113 
114    integer(kind=c_int32_t) :: inv_sij_dim(4)
115 
116    type(c_ptr) :: inv_s_approx
117    ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat)
118    ! preconditioner
119 
120    integer(kind=c_int32_t) :: inv_s_approx_dim(4)
121 
122  end type invovl_kpt_gpu_type
123 
124 #else
125 
126 type, public :: invovl_kpt_type
127 
128    integer :: nprojs
129    !  total number of projectors
130    !    nlmn for a specific atom = count(indlmn(3,:,itypat)>0)
131    !  A value of -1 means that the following arrays are not allocated
132 
133    real(dp), allocatable :: gram_projs(:,:,:)
134    ! gram_projs(cplx, nprojs, nprojs)
135    ! projs' * projs
136 
137    real(dp), allocatable :: inv_sij(:,:,:,:)
138    ! inv_sij(cplx, lmnmax, lmnmax, ntypat)
139    ! inverse of ham%sij
140 
141    real(dp), allocatable :: inv_s_approx(:,:,:,:)
142    ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat)
143    ! preconditioner
144 
145 end type invovl_kpt_type
146 
147 #endif

m_invovl/make_invovl [ Functions ]

NAME

 make_invovl

FUNCTION

 Builds the invovl structure

INPUTS

 ham= hamiltonian data for the current k-point
 dimffnl= second dimension of ffnl
 ffnl(npw_k, dimffnl, lmnmax, ntypat)= nonlocal form factors
 ph3d(2, npw_k, matblk)= 3D structure factors, for each atom and plane wave
 mpi_enreg= MPI communication data
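
 A sketch of the expected call sequence (variable names other than the module's own
 procedures are illustrative; ffnl and ph3d must already be computed for the current
 k-point):

   call init_invovl(nkpt)                      ! once: allocates invovl_kpt(:)
   ! for each k-point, once ffnl and ph3d are available:
   call make_invovl(gs_ham, dimffnl, ffnl, ph3d, mpi_enreg)
   ! ... the eigensolver calls apply_invovl as needed ...
   call destroy_invovl(nkpt, use_gpu_cuda)     ! once, at the end of the run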

SOURCE

359 subroutine make_invovl(ham, dimffnl, ffnl, ph3d, mpi_enreg)
360 
361  use m_abi_linalg
362  implicit none
363 
364  type(gs_hamiltonian_type),intent(in), target :: ham
365  integer, intent(in) :: dimffnl
366  real(dp),intent(in) :: ffnl(ham%npw_k,dimffnl,ham%lmnmax,ham%ntypat)
367  real(dp),intent(in) :: ph3d(2,ham%npw_k,ham%matblk)
368  type(mpi_type) :: mpi_enreg
369 
370  real(dp) :: atom_projs(2, ham%npw_k, ham%lmnmax)
371  real(dp) :: temp(ham%npw_k)
372  complex(dpc), allocatable :: work(:)
373  real(dp), allocatable :: projs(:,:,:)
374  real(dp), allocatable :: gram_proj(:,:,:)
375  integer, allocatable :: ipiv(:)
376 
377  integer :: itypat, ilmn, nlmn, jlmn, ia, iaph3d, shift
378  integer :: il, ilm, jlm, ipw, info, ierr, cplx
379  integer :: ikpt_this_proc
380  logical :: parity
381  real(dp) :: tsec(2)
382  character(len=500) :: message
383  character :: blas_transpose
384 
385  type(invovl_kpt_type), pointer :: invovl
386 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA)
387  type(invovl_kpt_gpu_type) :: invovl_gpu
388 #endif
389  integer :: array_nprojs_pp(mpi_enreg%nproc_fft)
390  integer :: iproc, slice_size
391  real(dp), allocatable :: gramwork(:,:,:)
392 
393  integer, parameter :: timer_mkinvovl = 1620, timer_mkinvovl_build_d = 1621, timer_mkinvovl_build_ptp = 1622
394 
395 ! *************************************************************************
396 
397  !! S = 1 + PDP', so
398  !! S^-1 = 1 + P inv_s_projs P', with
399  !! inv_s_projs = - (D^-1 + P'*P)^-1
400 
401  if(ham%istwf_k == 1) then
402    cplx = 2
403    blas_transpose = 'c'
404  else
405    cplx = 1
406    blas_transpose = 't'
407  end if
408 
409  ikpt_this_proc=bandfft_kpt_get_ikpt()
410  invovl => invovl_kpt(ikpt_this_proc)
411 
412  if(invovl%nprojs /= -1) then
413    ! We have been here before, cleanup before remaking
414    ABI_FREE(invovl%gram_projs)
415    ABI_FREE(invovl%inv_sij)
416    ABI_FREE(invovl%inv_s_approx)
417    invovl%nprojs = -1
418  end if
419 
420  iaph3d = 1
421 
422  call timab(timer_mkinvovl,1,tsec)
423  call timab(timer_mkinvovl_build_d,1,tsec)
424 
425  ! build nprojs
426  invovl%nprojs = 0
427  do itypat=1,ham%ntypat
428    invovl%nprojs = invovl%nprojs + count(ham%indlmn(3,:,itypat)>0)*ham%nattyp(itypat)
429  end do
430 
431  ABI_MALLOC(projs, (2, ham%npw_k, invovl%nprojs))
432  ABI_MALLOC(invovl%inv_sij, (cplx, ham%lmnmax, ham%lmnmax, ham%ntypat))
433  ABI_MALLOC(invovl%inv_s_approx, (cplx, ham%lmnmax, ham%lmnmax, ham%ntypat))
434  ! workspace for inversion
435  ABI_MALLOC(ipiv, (ham%lmnmax))
436  ABI_MALLOC(work, (64*ham%lmnmax))
437 
438  projs = zero
439  invovl%inv_sij = zero
440  invovl%inv_s_approx = zero
441 
442  shift = 0
443  do itypat = 1, ham%ntypat
444    nlmn = count(ham%indlmn(3,:,itypat)>0)
445    !! unpack ham%sij into inv_sij
446    do jlmn = 1, nlmn
447      invovl%inv_sij(1, jlmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + jlmn, itypat)
448      jlm=ham%indlmn(4,jlmn, itypat)
449      do ilmn = 1, jlmn-1
450        ilm=ham%indlmn(4,ilmn, itypat)
451        if (ilm == jlm) then ! sparsity check
452          invovl%inv_sij(1, ilmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + ilmn, itypat)
453          invovl%inv_sij(1, jlmn, ilmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + ilmn, itypat)
454        end if
455      end do
456    end do
457 
458    ! Invert sij
459    if(cplx == 2) then
460      call ZHETRF('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info)
461      call ZHETRI('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, info)
462    else
463      call DSYTRF('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info)
464      call DSYTRI('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, info)
465    end if
466    ! complete the matrix
467    do ilm=1, nlmn
468      do jlm=1, ilm-1
469        invovl%inv_sij(1,ilm,jlm,itypat) =  invovl%inv_sij(1,jlm,ilm,itypat)
470      end do
471    end do
472 
473    !! loop on atoms to build atom_projs and fill projs, s_projs
474    do ia = 1, ham%nattyp(itypat)
475 
476      !! build atom_projs, from opernlb
477      !! P = 4pi/sqrt(ucvol)* conj(diag(ph3d)) * ffnl * diag(parity), with parity = (-i)^l
478      atom_projs(:,:,:) = 0
479 
480      ! start from 4pi/sqrt(ucvol)*ffnl
481      ! atom_projs(1, :, 1:nlmn) = four_pi/sqrt(ham%ucvol) * ffnl(:, 1, 1:nlmn)
482      ! TODO vectorize (DCOPY with stride)
483      do ipw=1, ham%npw_k
484        atom_projs(1,ipw, 1:nlmn) = four_pi/sqrt(ham%ucvol) * ffnl(ipw, 1, 1:nlmn, itypat)
485      end do
486 
487      ! multiply by (-i)^l
488      do ilmn=1,nlmn
489        il=mod(ham%indlmn(1,ilmn, itypat),4);
490        parity=(mod(il,2)==0)
491        if (il>1) then
492          ! multiply by -1
493          atom_projs(:,:,ilmn) = -atom_projs(:,:,ilmn)
494        end if
495        if(.not. parity) then
496          ! multiply by -i
497          temp = atom_projs(2,:,ilmn)
498          atom_projs(2,:,ilmn) = -atom_projs(1,:,ilmn)
499          atom_projs(1,:,ilmn) =  temp
500        end if
501      end do
502 
503      ! multiply by conj(ph3d)
504      do ilmn=1,nlmn
505        temp = atom_projs(1, :, ilmn)
506        atom_projs(1, :, ilmn) = atom_projs(1, :, ilmn) * ph3d(1, :, iaph3d) + atom_projs(2, :, ilmn) * ph3d(2, :, iaph3d)
507        atom_projs(2, :, ilmn) = atom_projs(2, :, ilmn) * ph3d(1, :, iaph3d) - temp                   * ph3d(2, :, iaph3d)
508      end do
509 
510      ! me_g0 trick (istwf_k=2: avoid double counting the G=0 component)
511      if(ham%istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then
512        atom_projs(1,1,:) = atom_projs(1,1,:) / sqrt2
513        atom_projs(2,1,:) = zero
514      end if
515      if(ham%istwf_k == 2) then
516        atom_projs(:,:,:) = atom_projs(:,:,:)*sqrt2
517      end if
518 
519 
520      !! atom_projs and typat_s_projs are built, copy them to projs and inv_s_projs
521      projs(:, :, shift+1:shift+nlmn) = atom_projs(:, :, 1:nlmn)
522      shift = shift + nlmn
523 
524      ! build inv_s_approx = (D^-1+PtP)^-1 restricted to a single atom block
525      ! can be optimized (real, build directly from ffnl)
526      if(ia == 1) then
527        ! D^-1
528        invovl%inv_s_approx(1, :, :, itypat) = invovl%inv_sij(1, :, :, itypat)
529        ! + PtP
530        ABI_MALLOC(gram_proj, (cplx, nlmn, nlmn))
531        call abi_xgemm(blas_transpose,'N', nlmn, nlmn, (3-cplx)*ham%npw_k, cone, atom_projs(:,:,1), (3-cplx)*ham%npw_k, &
532 &                     atom_projs(:,:,1), (3-cplx)*ham%npw_k, czero, gram_proj(:,:,1), nlmn,x_cplx=cplx)
533        call xmpi_sum(gram_proj,mpi_enreg%comm_bandspinorfft,ierr)
534        invovl%inv_s_approx(:,1:nlmn,1:nlmn,itypat) = invovl%inv_s_approx(:,1:nlmn,1:nlmn,itypat) + gram_proj(:,:,:)
535        ABI_FREE(gram_proj)
536        ! ^-1
537        if(cplx == 2) then
538          call ZHETRF('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info)
539          call ZHETRI('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, info)
540        else
541          call DSYTRF('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info)
542          call DSYTRI('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, info)
543        end if
544        ! complete lower triangle of matrix
545        do ilm=1, nlmn
546          do jlm=1, ilm-1
547            invovl%inv_s_approx(1, ilm, jlm, itypat) =  invovl%inv_s_approx(1, jlm, ilm, itypat)
548            if(cplx == 2) then
549              invovl%inv_s_approx(2, ilm, jlm, itypat) =  -invovl%inv_s_approx(2, jlm, ilm, itypat)
550            end if
551          end do
552        end do
553      end if
554 
555      iaph3d = iaph3d + 1
556    end do
557  end do
558  ABI_FREE(ipiv)
559  ABI_FREE(work)
560 
561  call timab(timer_mkinvovl_build_d, 2, tsec)
562  call timab(timer_mkinvovl_build_ptp, 1, tsec)
563 
564  ! Compute P'P one column slice at a time (might be too big to fit in one proc)
565  if(mpi_enreg%paral_kgb == 1) then
566    ! Split the work evenly among the fft processors
567    array_nprojs_pp(:) = invovl%nprojs / mpi_enreg%nproc_fft
568    ! not enough work: there are MOD(nprojs,mpi_enreg%nproc_fft) tasks left,
569    ! assign them to the first processors
570    array_nprojs_pp(1:MOD(invovl%nprojs,mpi_enreg%nproc_fft)) = array_nprojs_pp(1:MOD(invovl%nprojs,mpi_enreg%nproc_fft)) + 1
571  else
572    array_nprojs_pp = invovl%nprojs
573  end if
574  ABI_MALLOC(invovl%gram_projs, (cplx,invovl%nprojs,array_nprojs_pp(mpi_enreg%me_fft+1)))
575  shift = 0
576  do iproc = 1, mpi_enreg%nproc_fft
577    ! compute local contribution to slice iproc of gram_projs
578    slice_size = array_nprojs_pp(iproc)
579    ABI_MALLOC(gramwork, (cplx,invovl%nprojs,slice_size))
580    call abi_xgemm(blas_transpose,'N', invovl%nprojs, slice_size, (3-cplx)*ham%npw_k, cone, projs(:,:,1), (3-cplx)*ham%npw_k, &
581 &                      projs(:, :, shift+1), (3-cplx)*ham%npw_k, czero, gramwork(:,:,1), invovl%nprojs,x_cplx=cplx)
582    shift = shift + slice_size
583    ! reduce on proc i
584    call xmpi_sum_master(gramwork, iproc-1, mpi_enreg%comm_fft, ierr)
585    if(iproc == mpi_enreg%me_fft+1) then
586      invovl%gram_projs = gramwork
587    end if
588    ABI_FREE(gramwork)
589  end do
590  call xmpi_sum(invovl%gram_projs,mpi_enreg%comm_band,ierr)
591 
592  call timab(timer_mkinvovl_build_ptp, 2, tsec)
593  call timab(timer_mkinvovl,2,tsec)
594 
595  ABI_FREE(projs)
596 
597 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA)
598 
599  ! upload inverse overlap matrices (sij and s_approx) to GPU memory
600  if (ham%use_gpu_cuda==1) then
601    ! allocate memory for sij and s_approx on GPU
602    call f_gpu_apply_invovl_matrix_alloc(cplx, invovl%nprojs, ham%lmnmax, ham%ntypat, 0)
603 
604    invovl_gpu = make_invovl_kpt_gpu(invovl)
605    call f_upload_inverse_overlap(invovl_gpu, cplx, invovl%nprojs, ham%lmnmax, ham%ntypat)
606    write(message,*) 'Invovl uploaded to GPU memory'
607    call wrtout(std_out,message,'COLL')
608  end if
609 
610 #endif
611 
612  write(message,*) 'Invovl built'
613  call wrtout(std_out,message,'COLL')
614 
615 end subroutine make_invovl

m_invovl/make_invovl_kpt_gpu [ Functions ]

NAME

 make_invovl_kpt_gpu

FUNCTION

 Creates an invovl_kpt_gpu_type from its CPU counterpart, for CUDA interoperability

SOURCE

240   function make_invovl_kpt_gpu(invovl) result(invovl_gpu)
241     implicit none
242     type(invovl_kpt_type), intent(inout),target :: invovl
243     type(invovl_kpt_gpu_type)                   :: invovl_gpu
244 
245     invovl_gpu%nprojs = invovl%nprojs
246 
247     invovl_gpu%gram_projs = c_loc(invovl%gram_projs(1,1,1))
248     invovl_gpu%gram_projs_dim = (/ &
249       & size(invovl%gram_projs,1), &
250       & size(invovl%gram_projs,2), &
251       & size(invovl%gram_projs,3)  &
252       & /)
253 
254     invovl_gpu%inv_sij = c_loc(invovl%inv_sij(1,1,1,1))
255     invovl_gpu%inv_sij_dim = (/ &
256       & size(invovl%inv_sij,1), &
257       & size(invovl%inv_sij,2), &
258       & size(invovl%inv_sij,3), &
259       & size(invovl%inv_sij,4)  &
260       & /)
261 
262     invovl_gpu%inv_s_approx = c_loc(invovl%inv_s_approx(1,1,1,1))
263     invovl_gpu%inv_s_approx_dim = (/ &
264       & size(invovl%inv_s_approx,1), &
265       & size(invovl%inv_s_approx,2), &
266       & size(invovl%inv_s_approx,3), &
267       & size(invovl%inv_s_approx,4)  &
268       & /)
269 
270   end function make_invovl_kpt_gpu

m_invovl/solve_inner [ Functions ]

NAME

 solve_inner

FUNCTION

 Helper function: iteratively solves the inner system

INPUTS

 invovl= inverse overlap data for the current k-point
 ham= hamiltonian data
 cplx= 1 if real, 2 if complex
 mpi_enreg= MPI communication data
 proj(cplx, nprojs, ndat)= right-hand sides, projs'*cwavef
 ndat= number of right-hand sides
 sm1proj(cplx, nprojs, ndat)= output: solution of (s_projs^-1 + projs'*projs)*sm1proj = proj
   (the caller negates it to obtain inv_s_projs*proj)
 PtPsm1proj(cplx, nprojs, ndat)= output: projs'*projs*sm1proj
 block_sliced= passed through to apply_block
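
 The inner system is A*x = b, with A = s_projs^-1 + projs'*projs and b = proj (see the
 module header). It is solved by preconditioned iterative refinement, with the
 one-atom-block matrix M = inv_s_approx as preconditioner:

   x_0     = M*b
   x_(k+1) = x_k + M*(b - A*x_k)

 The loop stops when the relative residual drops below the precision parameter, after
 30 iterations, or, once the residual stalls near machine accuracy, after a number of
 extra steps extrapolated from the observed convergence rate.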

SOURCE

834 subroutine solve_inner(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj, PtPsm1proj, block_sliced)
835 
836  use m_abi_linalg
837  implicit none
838 
839  integer,intent(in) :: ndat,cplx
840  type(invovl_kpt_type), intent(in) :: invovl
841  real(dp), intent(inout) :: proj(cplx, invovl%nprojs,ndat)
842  real(dp), intent(inout) :: sm1proj(cplx, invovl%nprojs, ndat)
843  real(dp), intent(inout) :: PtPsm1proj(cplx, invovl%nprojs, ndat)
844  real(dp), allocatable :: temp_proj(:,:,:)
845  type(mpi_type), intent(in) :: mpi_enreg
846  type(gs_hamiltonian_type),intent(in) :: ham
847  integer, intent(in) :: block_sliced
848 
849  integer :: array_nlmntot_pp(mpi_enreg%nproc_fft)
850  integer :: nlmntot_this_proc, ibeg, iend, ierr, i, nprojs
851  real(dp) :: resid(cplx, invovl%nprojs,ndat), precondresid(cplx, invovl%nprojs,ndat)
852  real(dp) :: normprojs(ndat), errs(ndat), maxerr, previous_maxerr
853  character(len=500) :: message
854 
855  real(dp), parameter :: precision = 1e-16 ! maximum relative error. TODO: use tolwfr ?
856  real(dp) :: convergence_rate
857  integer :: additional_steps_to_take
858 
859 ! *************************************************************************
860 
861  nprojs = invovl%nprojs
862  normprojs = SUM(SUM(proj**2, 1),1)
863  call xmpi_sum(normprojs, mpi_enreg%comm_fft, ierr)
864 
865  ! Compute work distribution: split nprojs evenly among the fft processors
866  if(mpi_enreg%paral_kgb == 1) then
867    array_nlmntot_pp(:) = nprojs / mpi_enreg%nproc_fft
868    ! not enough work: there are MOD(nprojs,mpi_enreg%nproc_fft) tasks left,
869    ! assign them to the first processors
870    array_nlmntot_pp(1:MOD(nprojs,mpi_enreg%nproc_fft)) = array_nlmntot_pp(1:MOD(nprojs,mpi_enreg%nproc_fft)) + 1
871    ibeg = SUM(array_nlmntot_pp(1:mpi_enreg%me_fft)) + 1
872    iend = ibeg + array_nlmntot_pp(1+mpi_enreg%me_fft) - 1
873    nlmntot_this_proc = iend - ibeg + 1
874  else
875    ibeg = 1
876    iend = nprojs
877    nlmntot_this_proc = nprojs
878  end if
879 
880  ABI_MALLOC(temp_proj, (cplx, nlmntot_this_proc, ndat))
881 
882  ! first guess for sm1proj
883  call apply_block(ham, cplx, invovl%inv_s_approx, nprojs, ndat, proj, sm1proj, block_sliced)
884 
885  ! Iterative refinement
886  ! TODO use a more efficient iterative algorithm than iterative refinement, use locking
887  additional_steps_to_take = -1
888  do i=1, 30
889    ! compute resid = proj - (D^-1 + PtP)sm1proj
890    call apply_block(ham, cplx, invovl%inv_sij, nprojs, ndat, sm1proj, resid, block_sliced)
891    temp_proj = sm1proj(:,ibeg:iend,:)
892 
893    ! compute matrix multiplication : PtPsm1proj(:,:,1) = invovl%gram * temp_proj(:,:,1)
894    ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER_GEMM)
895    call abi_xgemm('N', 'N', nprojs, ndat, nlmntot_this_proc, cone, &
896      & invovl%gram_projs(:,:,1), nprojs, &
897      & temp_proj(:,:,1), nlmntot_this_proc, czero, &
898      & PtPsm1proj(:,:,1), nprojs, &
899      & x_cplx=cplx)
900    call xmpi_sum(PtPsm1proj, mpi_enreg%comm_fft, ierr)
901    resid = proj - resid - PtPsm1proj
902    ! exit check
903    errs = SUM(SUM(resid**2, 1),1)
904    call xmpi_sum(errs, mpi_enreg%comm_fft, ierr)
905    ABI_NVTX_END_RANGE()
906 
907    maxerr = sqrt(MAXVAL(errs/normprojs))
908    if(maxerr < precision .or. additional_steps_to_take == 1) then
909      exit
910      ! We might stall and never get to the specified precision because of machine errors.
911      ! If we got to 1e-10, extrapolate convergence rate and determine the number of additional
912      ! steps to take to reach precision
913    else if(maxerr < 1e-10 .and. additional_steps_to_take == -1) then
914      convergence_rate = -LOG(1e-10) / i
915      additional_steps_to_take = CEILING(-LOG(precision/1e-10)/convergence_rate) + 1
916    else if(additional_steps_to_take > 0) then
917      if(previous_maxerr<maxerr)exit
918      additional_steps_to_take = additional_steps_to_take - 1
919    end if
920    previous_maxerr=maxerr
921 
922    ! add the preconditioned residual
923    call apply_block(ham, cplx, invovl%inv_s_approx, nprojs, ndat, resid, precondresid, block_sliced)
924    sm1proj = sm1proj + precondresid
925  end do
926 
927  if(maxerr >= precision .and. maxerr >= 1e-10) then
928    write(message, *) 'In invovl, max error was', maxerr, ' after 30 iterations'
929    ABI_WARNING(message)
930  else
931    ! write(message,'(a,i2,a,es13.5)') 'Iterative solver in invovl finished in ', i, ' iterations, error', maxerr
932    ! call wrtout(std_out,message,'COLL')
933  end if
934 
935  ABI_FREE(temp_proj)
936 
937 end subroutine solve_inner