TABLE OF CONTENTS


ABINIT/m_gemm_nonlop [ Modules ]

[ Top ] [ Modules ]

NAME

 m_gemm_nonlop

FUNCTION

  This module provides functions to compute the nonlocal operator by means of the BLAS GEMM
  routine. By treating ndat simultaneous wavefunctions, it is able to exploit BLAS3 routines,
  which leads to excellent CPU efficiency and OpenMP scalability.

COPYRIGHT

 Copyright (C) 2014-2024 ABINIT group (AL)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

18 ! TODO list :
19 ! Don't allocate the full nkpt structures, only those that are treated by this proc: use same init as in m_bandfft_kpt
20 ! support more options (forces & stresses mostly)
21 ! Support RF/other computations (only GS right now)
22 ! handle the case where nloalg(2) < 0, ie no precomputation of ph3d
23 ! more systematic checking of the workflow (right now, only works if init/make/gemm/destroy, no multiple makes, etc)
24 ! Avoid allocating the complex matrix when istwfk > 1
25 ! Merge with chebfi's invovl
26 
27 
28 #if defined HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include "abi_common.h"
33 
34 module m_gemm_nonlop
35 
36  use defs_basis
37  use m_errors
38  use m_abicore
39  use m_xmpi
40  use m_fstrings,    only : itoa, ftoa, sjoin
41  use m_abi_linalg
42 
43  use defs_abitypes, only : MPI_type
44  use m_opernlc_ylm, only : opernlc_ylm
45  use m_pawcprj, only : pawcprj_type
46  use m_geometry, only : strconv
47  use m_kg, only : mkkpg
48 
49  implicit none
50 
51  private
52 
53  ! Use these routines in order: first call init, then call make_gemm_nonlop for each k point,
54  ! then call gemm_nonlop to do the actual computation, and call destroy when done. See gstate and vtorho.
55  public :: init_gemm_nonlop
56  public :: destroy_gemm_nonlop
57  public :: make_gemm_nonlop
58  public :: gemm_nonlop

m_gemm_nonlop/destroy_gemm_nonlop [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 destroy_gemm_nonlop

FUNCTION

 Destruction of the gemm_nonlop_kpt array

INPUTS

 nkpt= number of k-points

SOURCE

166  subroutine destroy_gemm_nonlop(nkpt)
167 
168   integer,intent(in) :: nkpt
169   integer :: ikpt
170 
171 ! *************************************************************************
172 
173 ! TODO add cycling if kpt parallelism
174   do ikpt = 1,nkpt
175     call free_gemm_nonlop_ikpt(ikpt)
176   end do
177 
178  ABI_FREE(gemm_nonlop_kpt)
179 
180  end subroutine destroy_gemm_nonlop

m_gemm_nonlop/free_gemm_nonlop_ikpt [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 free_destroy_gemm_nonlop_ikpt

FUNCTION

 Release memory for one kpt value of the gemm_nonlop_kpt array

INPUTS

 ikpt= index of gemm_nonlop_kptto be released

SOURCE

196  subroutine free_gemm_nonlop_ikpt(ikpt)
197 
198   integer,intent(in) :: ikpt
199 
200 ! *************************************************************************
201 
202  if(gemm_nonlop_kpt(ikpt)%nprojs /= -1) then
203    if (allocated(gemm_nonlop_kpt(ikpt)%projs)) then
204      ABI_FREE(gemm_nonlop_kpt(ikpt)%projs)
205    end if
206    if (allocated(gemm_nonlop_kpt(ikpt)%projs_r)) then
207      ABI_FREE(gemm_nonlop_kpt(ikpt)%projs_r)
208    end if
209    if (allocated(gemm_nonlop_kpt(ikpt)%projs_i)) then
210    ABI_FREE(gemm_nonlop_kpt(ikpt)%projs_i)
211    end if
212    gemm_nonlop_kpt(ikpt)%nprojs = -1
213    if(gemm_nonlop_kpt(ikpt)%ngrads /= -1) then
214      if (allocated(gemm_nonlop_kpt(ikpt)%dprojs)) then
215        ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs)
216      end if
217      if (allocated(gemm_nonlop_kpt(ikpt)%dprojs_r)) then
218        ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs_r)
219      end if
220      if (allocated(gemm_nonlop_kpt(ikpt)%dprojs_i)) then
221        ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs_i)
222      end if
223      gemm_nonlop_kpt(ikpt)%ngrads = -1
224    end if
225  end if
226 
227  end subroutine free_gemm_nonlop_ikpt

m_gemm_nonlop/gemm_nonlop [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 gemm_nonlop

FUNCTION

 Replacement of nonlop. same prototype as nonlop although not all options are implemented.

INPUTS

 [gpu_option] = GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)

SOURCE

 810  subroutine gemm_nonlop(atindx1,choice,cpopt,cprjin,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,&
 811 &                 enl,enlout,ffnlin,ffnlout,gmet,gprimd,idir,indlmn,istwf_k,&
 812 &                 kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,lmnmax,matblk,mgfft,&
 813 &                 mpi_enreg,mpsang,mpssoang,natom,nattyp,ndat,ngfft,nkpgin,nkpgout,nloalg,&
 814 &                 nnlout,npwin,npwout,nspinor,nspinortot,ntypat,only_SO,paw_opt,phkxredin,&
 815 &                 phkxredout,ph1d,ph3din,ph3dout,signs,sij,svectout,&
 816 &                 tim_nonlop,ucvol,useylm,vectin,vectout,&
 817 &                 vectproj,gpu_option)
 818 
 819   !Arguments ------------------------------------
 820   !scalars
 821   integer,intent(in) :: choice,cpopt,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,idir
 822   integer,intent(in) :: istwf_k,lmnmax,matblk,mgfft,mpsang,mpssoang,natom,ndat,nkpgin
 823   integer,intent(in) :: nkpgout,nnlout,npwin,npwout,nspinor,nspinortot,ntypat,only_SO
 824   integer,intent(in) :: paw_opt,signs,tim_nonlop,useylm
 825   integer,optional,intent(in) :: gpu_option
 826   real(dp),intent(in) :: lambda(ndat),ucvol
 827   type(MPI_type),intent(in) :: mpi_enreg
 828   !arrays
 829   integer,intent(in) :: atindx1(natom),indlmn(6,lmnmax,ntypat),kgin(3,npwin)
 830   integer,intent(in) :: kgout(3,npwout),nattyp(ntypat),ngfft(18),nloalg(3)
 831   real(dp),intent(in) :: enl(dimenl1,dimenl2,nspinortot**2,dimekbq)
 832   real(dp),intent(in) :: ffnlin(npwin,dimffnlin,lmnmax,ntypat)
 833   real(dp),intent(in) :: ffnlout(npwout,dimffnlout,lmnmax,ntypat),gmet(3,3)
 834   real(dp),intent(in) :: gprimd(3,3),kpgin(npwin,nkpgin*useylm)
 835   real(dp),intent(in) :: kpgout(npwout,nkpgout*useylm),kptin(3),kptout(3)
 836   real(dp),intent(in) :: phkxredin(2,natom),phkxredout(2,natom),ph1d(2,*)
 837   real(dp),intent(in) :: sij(dimenl1,ntypat*((paw_opt+1)/3))
 838   real(dp),intent(inout) :: ph3din(2,npwin,matblk),ph3dout(2,npwout,matblk)
 839   real(dp),intent(inout) :: vectin(2,npwin*nspinor*ndat)
 840   real(dp),intent(inout) :: enlout(nnlout*ndat)
 841   real(dp),intent(out) :: svectout(2,npwout*nspinor*(paw_opt/3)*ndat)
 842   real(dp),intent(inout) :: vectout(2,npwout*nspinor*ndat) !vz_i
 843   real(dp),intent(inout),optional, ABI_CONTIGUOUS target :: vectproj(:,:,:)
 844   type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat)
 845 
 846   ! locals
 847   integer :: ii, ia, idat, igrad, nprojs, ngrads, shift, iatom, nlmn, ierr, ibeg, iend, ikpt
 848   integer :: cplex, cplex_enl, cplex_fac, proj_shift, grad_shift
 849   integer :: enlout_shift, force_shift, nnlout_test
 850   integer :: iatm, ndgxdt, ndgxdtfac, nd2gxdt, nd2gxdtfac, optder, itypat, ilmn
 851   integer :: cplex_dgxdt(1), cplex_d2gxdt(1)
 852   logical :: local_vectproj
 853   real(dp) :: esum
 854   real(dp) :: work(6)
 855   real(dp) :: dgxdt_dum_in(1,1,1,1,1), dgxdt_dum_out(1,1,1,1,1),dgxdt_dum_out2(1,1,1,1,1)
 856   real(dp) :: d2gxdt_dum_in(1,1,1,1,1), d2gxdt_dum_out(1,1,1,1,1),d2gxdt_dum_out2(1,1,1,1,1)
 857   real(dp), allocatable :: enlk(:),sij_typ(:)
 858   real(dp),  ABI_CONTIGUOUS pointer :: projections(:,:,:)
 859   real(dp), allocatable :: s_projections(:,:,:), vnl_projections(:,:,:)
 860   real(dp), allocatable :: dprojections(:,:,:), temp_realvec(:)
 861   integer :: nprojs_my_blk
 862   integer :: rank, nprocs
 863   logical :: is_last
 864   real(dp), allocatable :: projs_local(:,:,:), dprojs_local(:,:,:)
 865   real(dp), allocatable :: projs_recv(:,:,:), dprojs_recv(:,:,:)
 866 
 867 ! *************************************************************************
 868 
 869   ! We keep the same interface as nonlop, but we don't use many of those
 870   ABI_UNUSED((/ffnlin,ffnlout,gmet,kpgin,kpgout/))
 871   ABI_UNUSED((/ph1d(1,1),ph3din,ph3dout/))
 872   ABI_UNUSED((/phkxredin,phkxredout,ucvol/))
 873   ABI_UNUSED((/mgfft,mpsang,mpssoang/))
 874   ABI_UNUSED((/kptin,kptout/))
 875   ABI_UNUSED((/idir,nloalg,ngfft,kgin,kgout,ngfft,only_SO,tim_nonlop,gpu_option/))
 876 
 877   ! Check supported options
 878   if (.not.gemm_nonlop_use_gemm) then
 879     ABI_BUG('computation not prepared for gemm_nonlop use!')
 880   end if
 881   if ( (choice>1.and.choice/=7.and.signs==2) .or. &
 882 &      (choice>3.and.choice/=7.and.choice/=23.and.signs==1) .or. &
 883 &      (useylm/=1) ) then
 884     ABI_BUG('gemm_nonlop option not supported!')
 885   end if
 886   if (signs==1) then
 887     nnlout_test=0
 888     if (choice==1) nnlout_test=1
 889     if (choice==2) nnlout_test=3*natom
 890     if (choice==3) nnlout_test=6
 891     if (choice==23) nnlout_test=6+3*natom
 892     if (nnlout<nnlout_test) then
 893       ABI_BUG('wrong nnlout size!')
 894     end if
 895   end if
 896 
 897   ikpt=gemm_nonlop_ikpt_this_proc_being_treated
 898   cplex=2;if (istwf_k>1) cplex=1
 899   cplex_enl=1;if (paw_opt>0) cplex_enl=2*dimenl1/(lmnmax*(lmnmax+1)) ! is enl complex?
 900   cplex_fac=max(cplex,dimekbq)
 901   if ((nspinortot==2.or.cplex_enl==2).and.paw_opt>0.and.choice/=7) cplex_fac=2 ! is vnl_projections complex?
 902 
 903   nprojs = gemm_nonlop_kpt(ikpt)%nprojs
 904   ngrads = gemm_nonlop_kpt(ikpt)%ngrads
 905   nprojs_my_blk = gemm_nonlop_kpt(ikpt)%nprojs_blk
 906 
 907   if(gemm_nonlop_is_distributed) then
 908     rank = xmpi_comm_rank(gemm_nonlop_block_comm); nprocs = xmpi_comm_size(gemm_nonlop_block_comm)
 909     is_last = (rank==nprocs-1)
 910     if(is_last) nprojs_my_blk = gemm_nonlop_kpt(ikpt)%nprojs_last_blk
 911   end if
 912 
 913   if(choice==1) ngrads=0
 914   ABI_CHECK(ngrads>=3.or.choice/=2 ,"Bad allocation in gemm_nonlop (2)!")
 915   ABI_CHECK(ngrads>=6.or.choice/=3 ,"Bad allocation in gemm_nonlop (3)!")
 916   ABI_CHECK(ngrads>=9.or.choice/=23,"Bad allocation in gemm_nonlop (23)!")
 917 
 918   ! If vectproj is provided, use it for further calculations, use allocated array otherwise
 919   local_vectproj=.false.
 920   if(PRESENT(vectproj)) then
 921     if(size(vectproj)>1) local_vectproj=.true.
 922   end if
 923   if (local_vectproj) then
 924     projections => vectproj
 925   end if
 926 
 927   ! These will store the non-local factors for vectin, svectout and vectout respectively
 928   if(.not. local_vectproj) then
 929     ABI_MALLOC(projections,(cplex, nprojs,nspinor*ndat))
 930   end if
 931   ABI_MALLOC(s_projections,(cplex, nprojs,nspinor*ndat))
 932   ABI_MALLOC(vnl_projections,(cplex_fac, nprojs,nspinor*ndat))
 933 
 934   if(gemm_nonlop_is_distributed) then
 935     ABI_MALLOC(projs_recv,   (cplex, npwin, gemm_nonlop_kpt(ikpt)%nprojs_last_blk))
 936     ABI_MALLOC(projs_local,   (cplex, npwin, gemm_nonlop_kpt(ikpt)%nprojs_last_blk))
 937     if (signs==1.and.ngrads>0) then
 938       ABI_MALLOC(dprojs_recv,   (cplex, npwin, ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk))
 939       ABI_MALLOC(dprojs_local,   (cplex, npwin, ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk))
 940     end if
 941   end if
 942 
 943   if(.not. local_vectproj) projections = zero
 944   s_projections = zero
 945   vnl_projections = zero
 946   if (signs==1.and.ngrads>0) then
 947     ABI_MALLOC(dprojections,(cplex, ngrads*nprojs,nspinor*ndat))
 948     dprojections = zero
 949   end if
 950 
 951   if(nprojs == 0) then
 952     ! TODO check if this is correct
 953     if(signs == 1) then
 954       enlout=zero
 955       return
 956     end if
 957     if(signs == 2) then
 958       vectout = zero
 959       if(paw_opt>0) svectout = vectin
 960       return
 961     end if
 962   end if
 963 
 964   ! determine precisely when temp_realvec needs to be allocated
 965   ! to factorize allocate (resp. deallocate) at the begining (resp. at the end) of subroutine
 966   ! to avoid multiple allocate/deallocate that can be costly
 967   if (cplex /= 2) then
 968     if ( (cpopt < 2) .or. &
 969       &  (paw_opt == 3 .or. paw_opt == 4) .or. &
 970       &  (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4)) then
 971        ABI_MALLOC(temp_realvec,(MAX(npwout,npwin)*nspinor*ndat))
 972     end if
 973   end if
 974 
 975 
 976   if(cpopt >= 2) then
 977     if(.not. local_vectproj) then
 978       !This use-case is extremely painful for GEMM nonlop performance.
 979       !vectproj paramter should always be provided to avoid it
 980 
 981       ! retrieve from cprjin
 982       do idat=1, ndat*nspinor
 983         shift = 0
 984         do iatom = 1, natom
 985           nlmn = cprjin(iatom, idat)%nlmn
 986           projections(1:cplex, shift+1:shift+nlmn, idat) = cprjin(iatom, idat)%cp(1:cplex, 1:nlmn)
 987           shift = shift + nlmn
 988         end do
 989       end do
 990     end if
 991     if(cpopt==4.and.allocated(dprojections)) then
 992       ABI_CHECK(cprjin(1,1)%ncpgr>=ngrads,"cprjin%ncpgr not correct! (1)")
 993       do idat=1, ndat*nspinor
 994         shift = 0
 995         do iatom = 1, natom
 996           nlmn  = cprjin(iatom, idat)%nlmn
 997           do igrad=1,ngrads
 998             dprojections(1:cplex, shift+1:shift+nlmn, idat) = &
 999 &                   cprjin(iatom, idat)%dcp(1:cplex,igrad,1:nlmn)
1000             shift = shift + nlmn
1001           end do
1002         end do
1003       end do
1004     end if
1005   else ! cpopt < 2
1006     ! opernla
1007     if(cplex == 2) then
1008       if(.not. gemm_nonlop_is_distributed) then
1009         call abi_zgemm_2r('C', 'N', nprojs, ndat*nspinor, npwin, cone, &
1010         &    gemm_nonlop_kpt(ikpt)%projs, npwin,&
1011         &    vectin, npwin, czero, projections, nprojs)
1012 
1013         if(signs==1.and.ngrads>0) then
1014           call abi_zgemm_2r('C', 'N', ngrads*nprojs, ndat*nspinor, npwin, cone, &
1015                    gemm_nonlop_kpt(ikpt)%dprojs, npwin,&
1016                    vectin, npwin, czero, dprojections, ngrads*nprojs)
1017         end if
1018       else
1019         call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,&
1020         &                                         gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1021         &                                         gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1022         &                                         nprojs_my_blk,cplex,czero,&
1023         &                                         projs_local,projs_recv,&
1024         &                                         gemm_nonlop_kpt(ikpt)%projs,&
1025         &                                         vectin,projections)
1026         if(signs==1.and.ngrads>0) then
1027           call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,&
1028           &                                         ngrads*gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1029           &                                         ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1030           &                                         ngrads*nprojs_my_blk,cplex,czero,&
1031           &                                         dprojs_local,dprojs_recv,&
1032           &                                         gemm_nonlop_kpt(ikpt)%dprojs,&
1033           &                                         vectin,dprojections)
1034         end if
1035       end if
1036     else
1037 
1038       if (.not. allocated(temp_realvec)) then
1039         ABI_ERROR("Please provide memory allocation for temp_realvec array")
1040       end if
1041 
1042       ! only compute real part of projections = P^* psi => projections_r = P_r^T psi_r + P_i^T psi_i
1043       temp_realvec(1:npwin*nspinor*ndat) = vectin(1,1:npwin*nspinor*ndat)
1044       if(istwf_k == 2 .and. mpi_enreg%me_g0_fft == 1) then
1045         do idat=1, ndat*nspinor
1046           temp_realvec(1+(idat-1)*npwin) = temp_realvec(1+(idat-1)*npwin)/2
1047         end do
1048       end if
1049       if(.not. gemm_nonlop_is_distributed) then
1050         call DGEMM('T', 'N', nprojs, ndat*nspinor, npwin, one, &
1051         &          gemm_nonlop_kpt(ikpt)%projs_r, npwin, &
1052         &          temp_realvec, npwin, zero, projections, nprojs)
1053         if(signs==1.and.ngrads>0) then
1054           call DGEMM('T', 'N', ngrads*nprojs, ndat*nspinor, npwin, one, &
1055           &          gemm_nonlop_kpt(ikpt)%dprojs_r, npwin, &
1056           &          temp_realvec, npwin, zero, dprojections, ngrads*nprojs)
1057         end if
1058       else
1059         call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,&
1060         &                                         gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1061         &                                         gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1062         &                                         nprojs_my_blk,cplex,czero,&
1063         &                                         projs_local,projs_recv,&
1064         &                                         gemm_nonlop_kpt(ikpt)%projs_r,&
1065         &                                         temp_realvec,projections)
1066         if(signs==1.and.ngrads>0) then
1067           call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,&
1068           &                                         ngrads*gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1069           &                                         ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1070           &                                         ngrads*nprojs_my_blk,cplex,czero,&
1071           &                                         dprojs_local,dprojs_recv,&
1072           &                                         gemm_nonlop_kpt(ikpt)%dprojs_r,&
1073           &                                         temp_realvec,dprojections)
1074         end if
1075       end if
1076       temp_realvec(1:npwin*nspinor*ndat) = vectin(2,1:npwin*nspinor*ndat)
1077       if(istwf_k == 2 .and. mpi_enreg%me_g0_fft == 1) then
1078         do idat=1, ndat*nspinor
1079           temp_realvec(1+(idat-1)*npwin) = zero
1080         end do
1081       end if
1082       if(.not. gemm_nonlop_is_distributed) then
1083         call DGEMM('T', 'N', nprojs, ndat*nspinor, npwin, one, &
1084         &          gemm_nonlop_kpt(ikpt)%projs_i, npwin, &
1085         &          temp_realvec, npwin, one , projections, nprojs)
1086         projections = projections * 2
1087         if(signs==1.and.ngrads>0) then
1088           call DGEMM('T', 'N', ngrads*nprojs, ndat*nspinor, npwin, one, &
1089           &          gemm_nonlop_kpt(ikpt)%dprojs_i, npwin, &
1090           &          temp_realvec, npwin, one , dprojections, ngrads*nprojs)
1091           dprojections = dprojections * 2
1092         end if
1093       else
1094         call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,&
1095         &                                         gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1096         &                                         gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1097         &                                         nprojs_my_blk,cplex,cone,&
1098         &                                         projs_local,projs_recv,&
1099         &                                         gemm_nonlop_kpt(ikpt)%projs_i,&
1100         &                                         temp_realvec,projections)
1101         projections = projections * 2
1102         if(signs==1.and.ngrads>0) then
1103           call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,&
1104           &                                         ngrads*gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1105           &                                         ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1106           &                                         ngrads*nprojs_my_blk,cplex,cone,&
1107           &                                         dprojs_local,dprojs_recv,&
1108           &                                         gemm_nonlop_kpt(ikpt)%dprojs_i,&
1109           &                                         temp_realvec,dprojections)
1110           dprojections = dprojections * 2
1111         end if
1112       end if
1113     end if ! cplex == 2
1114     call xmpi_sum(projections,mpi_enreg%comm_fft,ierr)
1115     if (choice>1) then
1116       call xmpi_sum(dprojections,mpi_enreg%comm_fft,ierr)
1117     end if
1118 
1119     if(cpopt >= 0) then
1120       if(.not. local_vectproj) then
1121         !This use-case is extremely painful for GEMM nonlop performance.
1122         !vectproj paramter should always be provided to avoid it
1123 
1124         ! store in cprjin
1125         do idat=1, ndat*nspinor
1126           shift = 0
1127           do iatom = 1, natom
1128             nlmn = cprjin(iatom, idat)%nlmn
1129             cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) = projections(1:cplex, shift+1:shift+nlmn, idat)
1130             shift = shift + nlmn
1131           end do
1132         end do
1133       end if
1134       if(cpopt==3) then
1135         ABI_CHECK(cprjin(1,1)%ncpgr>=ngrads,"cprjin%ncpgr not correct! (2)")
1136         shift = 0
1137         do iatom = 1, natom
1138           nlmn = cprjin(iatom, idat)%nlmn
1139           do igrad=1,ngrads
1140             cprjin(iatom, idat)%dcp(1:cplex,igrad,1:nlmn) = &
1141 &              dprojections(1:cplex, shift+1:shift+nlmn, idat)
1142             shift = shift + nlmn
1143           end do
1144         end do
1145       end if
1146     end if ! cpopt >= 0
1147   end if ! cpopt >= 2
1148 
1149   if(choice > 0) then
1150 
1151     if(choice /= 7) then
1152       ! opernlc
1153       iatm = 0
1154       ndgxdt = 0
1155       ndgxdtfac = 0
1156       nd2gxdt = 0
1157       nd2gxdtfac = 0
1158       optder = 0
1159 
1160       shift = 0
1161       ABI_MALLOC(sij_typ,(((paw_opt+1)/3)*lmnmax*(lmnmax+1)/2))
1162       do itypat=1, ntypat
1163         nlmn=count(indlmn(3,:,itypat)>0)
1164         if (paw_opt>=2) then
1165           if (cplex_enl==1) then
1166             do ilmn=1,nlmn*(nlmn+1)/2
1167               sij_typ(ilmn)=sij(ilmn,itypat)
1168             end do
1169           else
1170             do ilmn=1,nlmn*(nlmn+1)/2
1171               sij_typ(ilmn)=sij(2*ilmn-1,itypat)
1172             end do
1173           end if
1174         end if
1175 
1176         ibeg = shift+1
1177         iend = shift+nattyp(itypat)*nlmn
1178 
1179         do idat = 1,ndat
1180           call opernlc_ylm(atindx1,cplex,cplex_dgxdt,cplex_d2gxdt,cplex_enl,cplex_fac,dgxdt_dum_in,dgxdt_dum_out,dgxdt_dum_out2,&
1181 &         d2gxdt_dum_in,d2gxdt_dum_out,d2gxdt_dum_out2,dimenl1,dimenl2,dimekbq,enl,&
1182 &         projections(:, ibeg:iend, 1+nspinor*(idat-1):nspinor*idat),&
1183 &         vnl_projections(:, ibeg:iend,1+nspinor*(idat-1):nspinor*idat),&
1184 &         s_projections(:, ibeg:iend,1+nspinor*(idat-1):nspinor*idat),&
1185 &         iatm,indlmn(:,:,itypat),itypat,lambda(idat),mpi_enreg,natom,ndgxdt,ndgxdtfac,nd2gxdt,nd2gxdtfac,&
1186 &         nattyp(itypat),nlmn,nspinor,nspinortot,optder,paw_opt,sij_typ)
1187         end do
1188 
1189         shift = shift + nattyp(itypat)*nlmn
1190         iatm = iatm+nattyp(itypat)
1191       end do
1192       ABI_FREE(sij_typ)
1193     else
1194       s_projections = projections
1195     end if ! choice /= 7
1196 
1197     ! opernlb (only choice=1)
1198     if(signs==2) then
1199       if(paw_opt == 3 .or. paw_opt == 4) then
1200 
1201         ! Get svectout from s_projections
1202         if(cplex == 2) then
1203 
1204           if(.not. gemm_nonlop_is_distributed) then
1205             call abi_zgemm_2r('N', 'N', npwout, ndat*nspinor, nprojs, cone, &
1206             &    gemm_nonlop_kpt(ikpt)%projs, npwout, &
1207             &    s_projections, nprojs, czero, svectout, npwout)
1208           else
1209             call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,&
1210             &                                         gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1211             &                                         gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1212             &                                         nprojs_my_blk,cplex,&
1213             &                                         projs_local,projs_recv,&
1214             &                                         gemm_nonlop_kpt(ikpt)%projs,&
1215             &                                         s_projections,svectout)
1216           end if
1217         else
1218 
1219           if (.not. allocated(temp_realvec)) then
1220             ABI_ERROR("Please provide memory allocation for temp_realvec array")
1221           end if
1222 
1223           if(.not. gemm_nonlop_is_distributed) then
1224             call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs, one, &
1225             &          gemm_nonlop_kpt(ikpt)%projs_r, npwout, &
1226             &          s_projections, nprojs, zero, temp_realvec, npwout)
1227             svectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
1228             call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs, one, &
1229             &          gemm_nonlop_kpt(ikpt)%projs_i, npwout,&
1230             &          s_projections, nprojs, zero, temp_realvec, npwout)
1231             svectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
1232           else
1233             call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,&
1234             &                                         gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1235             &                                         gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1236             &                                         nprojs_my_blk,cplex,&
1237             &                                         projs_local,projs_recv,&
1238             &                                         gemm_nonlop_kpt(ikpt)%projs_r,&
1239             &                                         s_projections,temp_realvec)
1240             svectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
1241             call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,&
1242             &                                         gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1243             &                                         gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1244             &                                         nprojs_my_blk,cplex,&
1245             &                                         projs_local,projs_recv,&
1246             &                                         gemm_nonlop_kpt(ikpt)%projs_i,&
1247             &                                         s_projections,temp_realvec)
1248             svectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
1249           end if
1250 
1251         end if ! cplex = 2
1252         if(choice /= 7) svectout = svectout + vectin ! TODO understand this
1253 
1254       end if  ! (paw_opt == 3 .or. paw_opt == 4)
1255 
1256       if(paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4) then
1257         ! Get vectout from vnl_projections
1258         if(cplex_fac == 2) then
1259 
1260           if(.not. gemm_nonlop_is_distributed) then
1261             call abi_zgemm_2r('N', 'N', npwout, ndat*nspinor, nprojs, cone, &
1262             &    gemm_nonlop_kpt(ikpt)%projs, npwout, &
1263             &    vnl_projections, nprojs, czero, vectout, npwout)
1264           else
1265             call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,&
1266             &                                         gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1267             &                                         gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1268             &                                         nprojs_my_blk,cplex,&
1269             &                                         projs_local,projs_recv,&
1270             &                                         gemm_nonlop_kpt(ikpt)%projs,&
1271             &                                         vnl_projections,vectout)
1272           end if
1273         else
1274 
1275           if (.not. allocated(temp_realvec)) then
1276             ABI_ERROR("Please provide memory allocation for temp_realvec array")
1277           end if
1278 
1279           if(.not. gemm_nonlop_is_distributed) then
1280             call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs, one, &
1281             &          gemm_nonlop_kpt(ikpt)%projs_r, npwout, &
1282             &          vnl_projections, nprojs, zero, temp_realvec, npwout)
1283             vectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
1284             call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs, one, &
1285             &          gemm_nonlop_kpt(ikpt)%projs_i, npwout, &
1286             &          vnl_projections, nprojs, zero, temp_realvec, npwout)
1287             vectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
1288           else
1289             call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,&
1290             &                                         gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1291             &                                         gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1292             &                                         nprojs_my_blk,cplex,&
1293             &                                         projs_local,projs_recv,&
1294             &                                         gemm_nonlop_kpt(ikpt)%projs_r,&
1295             &                                         vnl_projections,temp_realvec)
1296             vectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
1297             call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,&
1298             &                                         gemm_nonlop_kpt(ikpt)%nprojs_blk,&
1299             &                                         gemm_nonlop_kpt(ikpt)%nprojs_last_blk,&
1300             &                                         nprojs_my_blk,cplex,&
1301             &                                         projs_local,projs_recv,&
1302             &                                         gemm_nonlop_kpt(ikpt)%projs_i,&
1303             &                                         vnl_projections,temp_realvec)
1304             vectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat)
1305           end if
1306 
1307         end if ! cplex_fac == 2
1308 
1309       end if  ! (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4)
1310     end if ! opernlb
1311 
1312     ! opernld
1313     if(signs==1) then
1314       enlout=zero
1315       if(choice==1.or.choice==3.or.choice==23) then
1316         ABI_MALLOC(enlk,(ndat))
1317         enlk=zero
1318         do idat=1,ndat*nspinor
1319           proj_shift=0
1320           do itypat=1, ntypat
1321             nlmn=count(indlmn(3,:,itypat)>0)
1322             do ia=1,nattyp(itypat)
1323               !Following loops are a [D][Z]DOT
1324               esum=zero
1325               do ilmn=1,nlmn
1326                 do ii=1,cplex
1327                   esum=esum +vnl_projections(ii,proj_shift+ilmn,idat) &
1328 &                           *projections    (ii,proj_shift+ilmn,idat)
1329                 end do
1330               end do
1331               proj_shift=proj_shift+nlmn
1332               enlk(idat) = enlk(idat) + esum
1333             end do
1334           end do
1335         end do
1336         if (choice==1) enlout(1:ndat)=enlk(1:ndat)
1337       end if ! choice=1/3/23
1338       if(choice==2.or.choice==3.or.choice==23) then
1339         do idat=1,ndat*nspinor
1340           proj_shift=0 ; grad_shift=0
1341           enlout_shift=(idat-1)*nnlout
1342           force_shift=merge(6,0,choice==23)
1343           do itypat=1, ntypat
1344             nlmn=count(indlmn(3,:,itypat)>0)
1345             do ia=1,nattyp(itypat)
1346               if (choice==3.or.choice==23) then
1347                 do igrad=1,6
1348                   !Following loops are a [D][Z]DOT
1349                   esum=zero
1350                   do ilmn=1,nlmn
1351                     do ii=1,cplex
1352                       esum=esum +vnl_projections(ii,proj_shift+ilmn,idat) &
1353 &                               *dprojections   (ii,grad_shift+ilmn,idat)
1354                     end do
1355                   end do
1356                   grad_shift=grad_shift+nlmn
1357                   enlout(enlout_shift+igrad)=enlout(enlout_shift+igrad) + two*esum
1358                 end do
1359               end if
1360               if (choice==2.or.choice==23) then
1361                 do igrad=1,3
1362                   !Following loops are a [D][Z]DOT
1363                   esum=zero
1364                   do ilmn=1,nlmn
1365                     do ii=1,cplex
1366                       esum=esum +vnl_projections(ii,proj_shift+ilmn,idat) &
1367 &                               *dprojections   (ii,grad_shift+ilmn,idat)
1368                     end do
1369                   end do
1370                   grad_shift=grad_shift+nlmn
1371                   enlout(enlout_shift+force_shift+igrad)= &
1372 &                               enlout(enlout_shift+force_shift+igrad) + two*esum
1373                 end do
1374                 force_shift=force_shift+3
1375               end if
1376               proj_shift=proj_shift+nlmn
1377             end do
1378           end do
1379         end do
1380       end if ! choice=2, 3 or 23
1381 
1382     end if !opernld
1383 
1384   end if ! choice>0
1385 
1386 ! Reduction in case of parallelism
1387   if (signs==1.and.mpi_enreg%paral_spinor==1) then
1388     if (size(enlout)>0) then
1389       call xmpi_sum(enlout,mpi_enreg%comm_spinor,ierr)
1390     end if
1391     if (choice==3.or.choice==23) then
1392       call xmpi_sum(enlk,mpi_enreg%comm_spinor,ierr)
1393     end if
1394   end if
1395 
1396 ! Derivatives wrt strain
1397 !  - Convert from reduced to cartesian coordinates
1398 !  - Substract volume contribution
1399  if ((choice==3.or.choice==23).and.signs==1.and.paw_opt<=3) then
1400    do idat=1,ndat
1401      enlout_shift=(idat-1)*nnlout
1402      call strconv(enlout(enlout_shift+1:enlout_shift+6),gprimd,work)
1403      enlout(enlout_shift+1:enlout_shift+3)=(work(1:3)-enlk(idat))
1404      enlout(enlout_shift+4:enlout_shift+6)= work(4:6)
1405    end do
1406  end if
1407 
1408 ! Release memory
1409   if(.not. local_vectproj) then
1410     ABI_FREE(projections)
1411   end if
1412   ABI_FREE(s_projections)
1413   ABI_FREE(vnl_projections)
1414   if(gemm_nonlop_is_distributed) then
1415     ABI_FREE(projs_local)
1416     ABI_FREE(projs_recv)
1417     if (signs==1.and.ngrads>0) then
1418       ABI_FREE(dprojs_local)
1419       ABI_FREE(dprojs_recv)
1420     end if
1421   end if
1422   if (allocated(dprojections)) then
1423     ABI_FREE(dprojections)
1424   end if
1425   if (allocated(enlk)) then
1426     ABI_FREE(enlk)
1427   end if
1428   if (allocated(temp_realvec)) then
1429     ABI_FREE(temp_realvec)
1430   end if
1431 
1432  end subroutine gemm_nonlop
1433 !***
1434 
1435 end module m_gemm_nonlop

m_gemm_nonlop/gemm_nonlop_distributed_gemm_opernla [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 gemm_nonlop_distributed_gemm_opernla

FUNCTION

 Distributed version of "opernla" GEMM called in gemm_nonlop.

INPUTS

SOURCE

664  subroutine gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,&
665  &                                               nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex,beta,&
666  &                                               projs_local,projs_recv,projs,vectin,projections)
667    integer,  intent(in)     :: rank,nprocs,npwin,ndat,nspinor
668    integer,  intent(in)     :: nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex
669    real(dp), intent(in)     :: projs(:,:,:),vectin(*)
670    complex(dpc), intent(in) :: beta
671    real(dp), intent(inout)  :: projs_local(:,:,:),projs_recv(:,:,:)
672    real(dp), intent(out)    :: projections(:,:,:)
673 
674    !Local variables
675    integer :: iblock,ibeg,iend,req(2),ierr,nprojs_cur_blk,rank_prev,rank_next
676 
677 ! *************************************************************************
678 
679    rank_next=modulo(rank + 1,nprocs)
680    rank_prev=rank - 1
681    if(rank_prev == -1) rank_prev = nprocs - 1
682    do iblock=1,nprocs
683 
684      if(rank+iblock == nprocs) then
685        nprojs_cur_blk = nprojs_last_blk
686      else
687        nprojs_cur_blk = nprojs_blk
688      end if
689 
690      if(iblock == 1) then
691        call DCOPY(cplex*npwin*nprojs_my_blk , projs,      1, projs_local, 1)
692      else
693        call DCOPY(cplex*npwin*nprojs_cur_blk, projs_recv, 1, projs_local, 1)
694      end if
695 
696      if(iblock < nprocs) then
697        call xmpi_isend(projs_local,rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr)
698        call xmpi_irecv(projs_recv,rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr)
699      end if
700 
701      ibeg = 1 + modulo(rank+iblock-1,nprocs)*nprojs_blk
702      iend = ibeg+nprojs_cur_blk-1
703      if(cplex==2) then
704        call abi_zgemm_2r('C', 'N', nprojs_cur_blk, ndat*nspinor, npwin, cone, &
705        &                projs_local, npwin,&
706        &                vectin, npwin, beta, projections(:,ibeg:iend,:), nprojs_cur_blk)
707      else
708        call DGEMM('T', 'N', nprojs_cur_blk, ndat*nspinor, npwin, one, &
709        &          projs_local, npwin, &
710        &          vectin, npwin, real(beta), projections(:,ibeg:iend,:), nprojs_cur_blk)
711      end if
712 
713      if(iblock < nprocs) then
714        call xmpi_waitall(req,ierr)
715      end if
716 
717    end do
718  end subroutine gemm_nonlop_distributed_gemm_opernla

m_gemm_nonlop/gemm_nonlop_distributed_gemm_opernlb [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 gemm_nonlop_distributed_gemm_opernlb

FUNCTION

 Distributed version of "opernlb" GEMM called in gemm_nonlop.

INPUTS

SOURCE

733  subroutine gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,&
734  &                                               nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex,&
735  &                                               projs_local,projs_recv,projs,projections,vectout)
736    integer,  intent(in)     :: rank,nprocs,npwout,ndat,nspinor
737    integer,  intent(in)     :: nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex
738    real(dp), intent(in)     :: projs(:,:,:),projections(:,:,:)
739    real(dp), intent(inout)  :: projs_local(:,:,:),projs_recv(:,:,:)
740    real(dp), intent(out)    :: vectout(*)
741 
742    !Local variables
743    integer :: iblock,ibeg,iend,req(2),ierr,nprojs_cur_blk,rank_prev,rank_next
744    complex(dpc) :: beta
745 
746 ! *************************************************************************
747 
748    rank_next=modulo(rank + 1,nprocs)
749    rank_prev=rank - 1
750    if(rank_prev == -1) rank_prev = nprocs - 1
751 
752    beta = czero
753 
754    do iblock=1,nprocs
755 
756      if(rank+iblock == nprocs) then
757        nprojs_cur_blk = nprojs_last_blk
758      else
759        nprojs_cur_blk = nprojs_blk
760      end if
761 
762      if(iblock == 1) then
763        call DCOPY(cplex*npwout*nprojs_my_blk, projs, 1, projs_local, 1)
764      else
765        call DCOPY(cplex*npwout*nprojs_cur_blk, projs_recv, 1, projs_local, 1)
766      end if
767 
768      if(iblock < nprocs) then
769        call xmpi_isend(projs_local,rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr)
770        call xmpi_irecv(projs_recv,rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr)
771      end if
772 
773      ibeg = 1 + modulo(rank+iblock-1,nprocs)*nprojs_blk
774      iend = ibeg+nprojs_cur_blk-1
775 
776      if(cplex==2) then
777        call abi_zgemm_2r('N', 'N', npwout, ndat*nspinor, nprojs_cur_blk, cone, &
778        &                 projs_local, npwout, &
779        &                 projections(:,ibeg:iend,:), nprojs_cur_blk, beta, vectout, npwout)
780      else
781        call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs_cur_blk, one, &
782        &          projs_local, npwout, &
783        &          projections(:,ibeg:iend,:), nprojs_cur_blk, real(beta), vectout, npwout)
784      end if
785 
786      beta = cone
787 
788      if(iblock < nprocs) then
789        call xmpi_waitall(req,ierr)
790      end if
791 
792    end do
793  end subroutine gemm_nonlop_distributed_gemm_opernlb

m_gemm_nonlop/gemm_nonlop_type [ Types ]

[ Top ] [ m_gemm_nonlop ] [ Types ]

NAME

 gemm_nonlop_type

FUNCTION

 Contains information needed to apply the nonlocal operator

SOURCE

73  type,public :: gemm_nonlop_type
74 
75    integer :: nprojs
76    integer :: ngrads
77 
78    integer :: nprojs_blk
79    integer :: nprojs_last_blk
80 
81    real(dp), allocatable :: projs(:, :, :)
82    ! (2, npw, nprojs)
83 
84    real(dp), allocatable :: projs_r(:, :, :)
85    ! (1, npw, nprojs)
86 
87    real(dp), allocatable :: projs_i(:, :, :)
88    ! (1, npw, nprojs)
89 
90    real(dp), allocatable :: dprojs(:, :, :)
91    ! (2, npw, nprojs*ngrads)
92    real(dp), allocatable :: dprojs_r(:, :, :)
93    ! (1, npw, nprojs*ngrads)
94    real(dp), allocatable :: dprojs_i(:, :, :)
95    ! (1, npw, nprojs*ngrads)
96 
97  end type gemm_nonlop_type

m_gemm_nonlop/init_gemm_nonlop [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 init_gemm_nonlop

FUNCTION

 Initalization of the gemm_nonlop_kpt array

INPUTS

 nkpt= number of k-points

SOURCE

136  subroutine init_gemm_nonlop(nkpt)
137 
138   integer,intent(in) :: nkpt
139   integer :: ikpt
140 
141 ! *************************************************************************
142 
143   ! TODO only allocate the number of kpt treated by this proc
144   ABI_MALLOC(gemm_nonlop_kpt, (nkpt))
145   do ikpt=1,nkpt
146     gemm_nonlop_kpt(ikpt)%nprojs = -1
147     gemm_nonlop_kpt(ikpt)%ngrads = -1
148   end do
149 
150  end subroutine init_gemm_nonlop

m_gemm_nonlop/make_gemm_nonlop [ Functions ]

[ Top ] [ m_gemm_nonlop ] [ Functions ]

NAME

 make_gemm_nonlop

FUNCTION

 Build the gemm_nonlop array

INPUTS

SOURCE

242  subroutine make_gemm_nonlop(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k, &
243 &                            ph3d_k,kpt_k,kg_k,kpg_k, &
244 &                            compute_grad_strain,compute_grad_atom) ! Optional parameters
245 
246   integer, intent(in) :: ikpt
247   integer, intent(in) :: npw, lmnmax, ntypat
248   integer, intent(in) :: indlmn(:,:,:), kg_k(:,:)
249   integer, intent(in) :: nattyp(ntypat)
250   integer, intent(in) :: istwf_k
251   logical, intent(in), optional :: compute_grad_strain,compute_grad_atom
252   real(dp), intent(in) :: ucvol
253   real(dp), intent(in) :: ffnl_k(:,:,:,:)
254   real(dp), intent(in) :: ph3d_k(:,:,:)
255   real(dp), intent(in) :: kpt_k(:)
256   real(dp), intent(in), target :: kpg_k(:,:)
257 
258   integer :: nprojs,ndprojs,ngrads
259 
260   integer,parameter :: alpha(6)=(/1,2,3,3,3,2/),beta(6)=(/1,2,3,2,1,1/)
261   integer :: itypat, ilmn, nlmn, ia, iaph3d, igrad, shift, shift_grad
262   integer :: il, ipw, idir, idir1, idir2, nkpg_local
263   logical :: parity,my_compute_grad_strain,my_compute_grad_atom
264   real(dp),allocatable :: atom_projs(:,:,:), atom_dprojs(:,:,:,:), temp(:)
265   real(dp),pointer :: kpg(:,:)
266   integer :: rank, nprocs, ierr
267   integer :: nprojs_blk, nprojs_last_blk, nprojs_my_blk
268   integer :: lmn_beg,ibeg,iend,shift_do,nlmn_o,lmn_grad_beg
269   logical :: is_last_rank
270   real(dp),allocatable :: dprojs_tmp(:,:,:),dprojs_r_tmp(:,:,:),dprojs_i_tmp(:,:,:)
271 
272 ! *************************************************************************
273 
274   my_compute_grad_strain=.false. ; if (present(compute_grad_strain)) my_compute_grad_strain=compute_grad_strain
275   my_compute_grad_atom=.false. ; if (present(compute_grad_atom)) my_compute_grad_atom=compute_grad_atom
276   ABI_CHECK(size(ph3d_k)>0,'nloalg(2)<0 not compatible with use_gemm_nonlop=1!')
277 !  ABI_CHECK((.not.my_compute_grad_strain).or.size(kpg_k)>0,"kpg_k should be allocated to compute gradients!")
278 !  ABI_CHECK((.not.my_compute_grad_atom).or.size(kpg_k)>0,"kpg_k should be allocated to compute gradients!")
279 
280   iaph3d = 1
281 
282   ABI_MALLOC(atom_projs, (2, npw, lmnmax))
283   if (my_compute_grad_strain) then
284     ndprojs = 3
285     ABI_MALLOC(atom_dprojs, (2, npw, ndprojs, lmnmax))
286   else
287     ndprojs = 0
288   end if
289 
290   ABI_MALLOC(temp, (npw))
291 
292   call free_gemm_nonlop_ikpt(ikpt)
293 
294   ! build nprojs, ngrads
295   nprojs = 0 ; ngrads = 0
296   do itypat=1,ntypat
297     nprojs = nprojs + count(indlmn(3,:,itypat)>0)*nattyp(itypat)
298   end do
299   if (my_compute_grad_strain) ngrads=ngrads+6
300   if (my_compute_grad_atom) ngrads=ngrads+3
301   if (nprojs>0) gemm_nonlop_kpt(ikpt)%nprojs = nprojs
302   if (ngrads>0) gemm_nonlop_kpt(ikpt)%ngrads = ngrads
303 
304   if(gemm_nonlop_is_distributed) then
305     rank = xmpi_comm_rank(xmpi_world); nprocs = xmpi_comm_size(xmpi_world)
306     is_last_rank = (rank==nprocs-1)
307     if(gemm_nonlop_block_comm/=xmpi_comm_null) call xmpi_comm_free(gemm_nonlop_block_comm)
308     if(gemm_nonlop_nblocks==1) gemm_nonlop_nblocks=nprocs
309     write(std_out,*) "Splitting on ", gemm_nonlop_nblocks
310     call xmpi_comm_split(xmpi_world, rank/(nprocs/gemm_nonlop_nblocks), rank, gemm_nonlop_block_comm, ierr)
311     if(ierr/=0) ABI_BUG("Bug split!")
312   end if
313   rank = xmpi_comm_rank(gemm_nonlop_block_comm); nprocs = xmpi_comm_size(gemm_nonlop_block_comm)
314   is_last_rank = (rank==nprocs-1)
315 
316   if(gemm_nonlop_is_distributed) then
317     nprojs_blk = nprojs / nprocs
318     nprojs_last_blk = nprojs_blk + modulo(nprojs,nprojs_blk)
319     gemm_nonlop_kpt(ikpt)%nprojs_blk = nprojs_blk
320     gemm_nonlop_kpt(ikpt)%nprojs_last_blk = nprojs_last_blk
321     if(is_last_rank) then
322       nprojs_my_blk = nprojs_last_blk
323     else
324       nprojs_my_blk = nprojs_blk
325     end if
326   else
327     nprojs_last_blk = nprojs
328     nprojs_my_blk = nprojs
329     nprojs_blk = nprojs
330   end if
331 
332   if(gemm_nonlop_is_distributed .and. ngrads>0) then
333     !Temporary buffer to help distribute dprojs correctly (and easily)
334     if(istwf_k <= 1) then
335       ABI_MALLOC(dprojs_tmp, (2, npw, nprojs_my_blk*ngrads*2))
336     else
337       ABI_MALLOC(dprojs_r_tmp, (1, npw, nprojs_my_blk*ngrads*2))
338       ABI_MALLOC(dprojs_i_tmp, (1, npw, nprojs_my_blk*ngrads*2))
339     end if
340   end if
341 
342   if(istwf_k <= 1) then
343     ABI_MALLOC(gemm_nonlop_kpt(ikpt)%projs, (2, npw, nprojs_last_blk))
344     gemm_nonlop_kpt(ikpt)%projs = zero
345     if(ngrads>0) then
346       ABI_MALLOC(gemm_nonlop_kpt(ikpt)%dprojs, (2, npw, nprojs_last_blk*ngrads))
347       gemm_nonlop_kpt(ikpt)%dprojs = zero
348     end if
349   else
350     ABI_MALLOC(gemm_nonlop_kpt(ikpt)%projs_r, (1, npw, nprojs_last_blk))
351     ABI_MALLOC(gemm_nonlop_kpt(ikpt)%projs_i, (1, npw, nprojs_last_blk))
352     gemm_nonlop_kpt(ikpt)%projs_r = zero
353     gemm_nonlop_kpt(ikpt)%projs_i = zero
354     if(ngrads>0) then
355       ABI_MALLOC(gemm_nonlop_kpt(ikpt)%dprojs_r, (1, npw, nprojs_last_blk*ngrads))
356       ABI_MALLOC(gemm_nonlop_kpt(ikpt)%dprojs_i, (1, npw, nprojs_last_blk*ngrads))
357       gemm_nonlop_kpt(ikpt)%dprojs_r = zero
358       gemm_nonlop_kpt(ikpt)%dprojs_i = zero
359     end if
360   end if
361 
362   ! Compute (k+G) vectors if needed
363   nkpg_local=0
364   if ((my_compute_grad_strain.or.my_compute_grad_atom).and.size(kpg_k)==0) then
365     nkpg_local=3
366     ABI_MALLOC(kpg,(npw,nkpg_local))
367     call mkkpg(kg_k,kpg,kpt_k,nkpg_local,npw)
368   else
369     kpg => kpg_k
370   end if
371 
372   if(gemm_nonlop_is_distributed) then
373     ibeg = rank*nprojs_blk+1
374     iend = ibeg+nprojs_my_blk
375     shift_do = 0
376     lmn_grad_beg = -1
377   end if
378 
379   shift = 0 ; shift_grad = 0
380   lmn_beg = 1
381 
382   do itypat = 1, ntypat
383     nlmn = count(indlmn(3,:,itypat)>0)
384     nlmn_o = nlmn
385 
386     do ia = 1, nattyp(itypat)
387 
388       ! In distributed mode, loops are skipped until reach the section
389       ! of "ilmn" to be stored by local rank
390       if(gemm_nonlop_is_distributed) then
391         if(shift_do+nlmn < ibeg) then
392           shift_do = shift_do + nlmn
393           iaph3d = iaph3d + 1
394           cycle
395         end if
396 
397         lmn_beg = max(1,ibeg-shift_do)
398         if(lmn_grad_beg==-1) lmn_grad_beg = (lmn_beg-1)*ngrads
399         if(shift_do+nlmn > iend) nlmn = iend - shift_do - 1
400       end if
401 
402       !! build atom_projs, from opernlb
403       !! P = 4pi/sqrt(ucvol)* conj(diag(ph3d)) * ffnl * diag(parity), with parity = (-i)^l
404 
405       ! start from 4pi/sqrt(ucvol)*ffnl
406       ! atom_projs(1, :, lmn_beg:nlmn) = four_pi/sqrt(ham%ucvol) * ham%ffnl_k(:, 1, lmn_beg:nlmn)
407       ! TODO vectorize (DCOPY with stride)
408       atom_projs(:,:,:) = zero
409       do ipw=1, npw
410         atom_projs(1,ipw, 1:nlmn_o) = four_pi/sqrt(ucvol) * ffnl_k(ipw, 1, 1:nlmn_o, itypat)
411       end do
412       if (ndprojs>0) atom_dprojs(:,:,:,:) = zero
413       if (my_compute_grad_strain) then
414         do ipw=1, npw
415           atom_dprojs(1,ipw, 1:3, 1:nlmn_o) = four_pi/sqrt(ucvol) * ffnl_k(ipw, 2:4, 1:nlmn_o, itypat)
416         end do
417       end if
418 
419       ! multiply by (-i)^l
420       do ilmn=1,nlmn_o
421         il=mod(indlmn(1,ilmn, itypat),4);
422         parity=(mod(il,2)==0)
423         if (il>1) then
424           ! multiply by -1
425           atom_projs(:,:,ilmn) = -atom_projs(:,:,ilmn)
426           if (ndprojs>0) atom_dprojs(:,:,:,ilmn) = -atom_dprojs(:,:,:,ilmn)
427         end if
428         if(.not. parity) then
429           ! multiply by -i
430           temp = atom_projs(2,:,ilmn)
431           atom_projs(2,:,ilmn) = -atom_projs(1,:,ilmn)
432           atom_projs(1,:,ilmn) =  temp
433           if (ndprojs>0) then
434             do idir=1,ndprojs
435               temp = atom_dprojs(2,:,idir,ilmn)
436               atom_dprojs(2,:,idir,ilmn) = -atom_dprojs(1,:,idir,ilmn)
437               atom_dprojs(1,:,idir,ilmn) =  temp
438             end do
439           end if
440         end if
441       end do
442 
443       ! multiply by conj(ph3d)
444       do ilmn=1,nlmn_o
445         temp = atom_projs(1, :, ilmn)
446         atom_projs(1, :, ilmn) = atom_projs(1, :, ilmn) * ph3d_k(1, :, iaph3d) &
447 &                              + atom_projs(2, :, ilmn) * ph3d_k(2, :, iaph3d)
448         atom_projs(2, :, ilmn) = atom_projs(2, :, ilmn) * ph3d_k(1, :, iaph3d) &
449 &                              - temp                   * ph3d_k(2, :, iaph3d)
450       end do
451       if (ndprojs>0) then
452         do ilmn=1,nlmn_o
453           do idir=1,ndprojs
454             temp = atom_dprojs(1, :, idir,ilmn)
455             atom_dprojs(1, :, idir,ilmn) = atom_dprojs(1, :, idir,ilmn) * ph3d_k(1, :, iaph3d) &
456 &                                        + atom_dprojs(2, :, idir,ilmn) * ph3d_k(2, :, iaph3d)
457             atom_dprojs(2, :, idir,ilmn) = atom_dprojs(2, :, idir,ilmn) * ph3d_k(1, :, iaph3d) &
458 &                                        - temp                         * ph3d_k(2, :, iaph3d)
459           end do
460         end do
461       end if
462 
463       !! atom_projs is built, copy to projs
464 
465       if(istwf_k <= 1) then
466         gemm_nonlop_kpt(ikpt)%projs(1:2, :, shift+1:shift+(nlmn-(lmn_beg-1))) = atom_projs(:, :, lmn_beg:nlmn)
467       else ! istwf_k>1
468         gemm_nonlop_kpt(ikpt)%projs_r(1, :, shift+1:shift+(nlmn-(lmn_beg-1))) = atom_projs(1, :, lmn_beg:nlmn)
469         gemm_nonlop_kpt(ikpt)%projs_i(1, :, shift+1:shift+(nlmn-(lmn_beg-1))) = atom_projs(2, :, lmn_beg:nlmn)
470       end if
471 
472       !! Handling dprojs
473 
474       ! In distributed case, we compute values for all nlmn*ngrad in dprojs_tmp.
475       ! dprojs will be filled by "cutting" relevant values out
476       if(ngrads>0) then
477         if(gemm_nonlop_is_distributed) then
478           if(istwf_k <= 1) then
479             igrad=0
480             if(my_compute_grad_strain) then
481               do idir=1,6
482                 idir1=alpha(idir);idir2=beta(idir)
483                 do ilmn=1,nlmn_o
484                   do ipw=1,npw
485                     dprojs_tmp(1:2, ipw, shift_grad+nlmn_o*igrad+ilmn) = &
486   &                  -half*(atom_dprojs(1:2, ipw, idir1, ilmn)*kpg(ipw,idir2) &
487   &                        +atom_dprojs(1:2, ipw, idir2, ilmn)*kpg(ipw,idir1))
488                   end do
489                 end do
490                 igrad=igrad+1
491               end do
492             end if
493             if(my_compute_grad_atom) then
494               do idir=1,3
495                 do ilmn=1,nlmn_o
496                   do ipw=1,npw
497                     dprojs_tmp(1, ipw, shift_grad+nlmn_o*igrad+ilmn) = &
498   &                  +atom_projs(2, ipw, ilmn)*kpg(ipw,idir)*two_pi
499                     dprojs_tmp(2, ipw, shift_grad+nlmn_o*igrad+ilmn) = &
500   &                  -atom_projs(1, ipw, ilmn)*kpg(ipw,idir)*two_pi
501                   end do
502                 end do
503                 igrad=igrad+1
504               end do
505             end if
506 
507           else ! istwf_k>1
508             igrad=0
509             if(my_compute_grad_strain) then
510               do idir=1,6
511                 idir1=alpha(idir);idir2=beta(idir)
512                 do ilmn=1,nlmn_o
513                   do ipw=1,npw
514                     dprojs_r_tmp(1, ipw, shift_grad+nlmn_o*igrad+ilmn) = &
515   &                  -half*(atom_dprojs(1, ipw, idir1, ilmn)*kpg(ipw,idir2) &
516   &                        +atom_dprojs(1, ipw, idir2, ilmn)*kpg(ipw,idir1))
517 
518                     dprojs_i_tmp(1, ipw, shift_grad+nlmn_o*igrad+ilmn) = &
519   &                  -half*(atom_dprojs(2, ipw, idir1, ilmn)*kpg(ipw,idir2) &
520   &                        +atom_dprojs(2, ipw, idir2, ilmn)*kpg(ipw,idir1))
521                   end do
522                 end do
523                 igrad=igrad+1
524               end do
525             end if
526             if(my_compute_grad_atom) then
527               do idir=1,3
528                 do ilmn=1,nlmn
529                   do ipw=1,npw
530                     dprojs_r_tmp(1, ipw, shift_grad+nlmn_o*igrad+ilmn) = &
531   &                  +atom_projs(2, ipw, ilmn)*kpg(ipw,idir)*two_pi
532                     dprojs_i_tmp(1, ipw, shift_grad+nlmn_o*igrad+ilmn) = &
533   &                  -atom_projs(1, ipw, ilmn)*kpg(ipw,idir)*two_pi
534                   end do
535                 end do
536                 igrad=igrad+1
537               end do
538             end if
539           end if ! istwf_k
540         else
541           if(istwf_k <= 1) then
542             igrad=0
543             if(my_compute_grad_strain) then
544               do idir=1,6
545                 idir1=alpha(idir);idir2=beta(idir)
546                 do ilmn=lmn_beg,nlmn
547                   do ipw=1,npw
548                     gemm_nonlop_kpt(ikpt)%dprojs(1:2, ipw, shift_grad+nlmn*igrad+ilmn) = &
549   &                  -half*(atom_dprojs(1:2, ipw, idir1, ilmn)*kpg(ipw,idir2) &
550   &                        +atom_dprojs(1:2, ipw, idir2, ilmn)*kpg(ipw,idir1))
551                   end do
552                 end do
553                 igrad=igrad+1
554               end do
555             end if
556             if(my_compute_grad_atom) then
557               do idir=1,3
558                 do ilmn=lmn_beg,nlmn
559                   do ipw=1,npw
560                     gemm_nonlop_kpt(ikpt)%dprojs(1, ipw, shift_grad+nlmn*igrad+ilmn) = &
561   &                  +atom_projs(2, ipw, ilmn)*kpg(ipw,idir)*two_pi
562                     gemm_nonlop_kpt(ikpt)%dprojs(2, ipw, shift_grad+nlmn*igrad+ilmn) = &
563   &                  -atom_projs(1, ipw, ilmn)*kpg(ipw,idir)*two_pi
564                   end do
565                 end do
566                 igrad=igrad+1
567               end do
568             end if
569           else ! istwf_k>1
570             igrad=0
571             if(my_compute_grad_strain) then
572               do idir=1,6
573                 idir1=alpha(idir);idir2=beta(idir)
574                 do ilmn=lmn_beg,nlmn
575                   do ipw=1,npw
576                     gemm_nonlop_kpt(ikpt)%dprojs_r(1, ipw, shift_grad+nlmn*igrad+ilmn) = &
577   &                  -half*(atom_dprojs(1, ipw, idir1, ilmn)*kpg(ipw,idir2) &
578   &                        +atom_dprojs(1, ipw, idir2, ilmn)*kpg(ipw,idir1))
579 
580                     gemm_nonlop_kpt(ikpt)%dprojs_i(1, ipw, shift_grad+nlmn*igrad+ilmn) = &
581   &                  -half*(atom_dprojs(2, ipw, idir1, ilmn)*kpg(ipw,idir2) &
582   &                        +atom_dprojs(2, ipw, idir2, ilmn)*kpg(ipw,idir1))
583                   end do
584                 end do
585                 igrad=igrad+1
586               end do
587             end if
588             if(my_compute_grad_atom) then
589               do idir=1,3
590                 do ilmn=lmn_beg,nlmn
591                   do ipw=1,npw
592                     gemm_nonlop_kpt(ikpt)%dprojs_r(1, ipw, shift_grad+nlmn*igrad+ilmn) = &
593   &                  +atom_projs(2, ipw, ilmn)*kpg(ipw,idir)*two_pi
594                     gemm_nonlop_kpt(ikpt)%dprojs_i(1, ipw, shift_grad+nlmn*igrad+ilmn) = &
595   &                  -atom_projs(1, ipw, ilmn)*kpg(ipw,idir)*two_pi
596                   end do
597                 end do
598                 igrad=igrad+1
599               end do
600             end if
601           end if ! istwf_k
602         end if ! gemm_nonlop_is_distributed
603       end if ! ngrads
604 
605       iaph3d = iaph3d + 1
606       shift_grad = shift_grad + ngrads*nlmn_o
607 
608       if(gemm_nonlop_is_distributed) then
609         shift = shift + nlmn - (lmn_beg-1)
610         shift_do = shift_do + nlmn
611         if(shift_do >= iend - 1) exit
612       else
613         shift = shift + nlmn
614       end if
615     end do
616     if(gemm_nonlop_is_distributed .and. shift_do >= iend - 1) exit
617   end do
618 
619   ! Filling dprojs by extracting values from dprojs_tmp
620   if(gemm_nonlop_is_distributed .and. ngrads>0) then
621     shift_grad = lmn_grad_beg
622     if(istwf_k <= 1) then
623       gemm_nonlop_kpt(ikpt)%dprojs(1:2, 1:npw, 1:ngrads*nprojs_my_blk) = &
624       &      dprojs_tmp(1:2, 1:npw, shift_grad+1:shift_grad+ngrads*nprojs_my_blk)
625     else
626       gemm_nonlop_kpt(ikpt)%dprojs_r(1, 1:npw, 1:ngrads*nprojs_my_blk) = &
627       &      dprojs_r_tmp(1, 1:npw, shift_grad+1:shift_grad+ngrads*nprojs_my_blk)
628       gemm_nonlop_kpt(ikpt)%dprojs_i(1, 1:npw, 1:ngrads*nprojs_my_blk) = &
629       &      dprojs_i_tmp(1, 1:npw, shift_grad+1:shift_grad+ngrads*nprojs_my_blk)
630     end if
631   end if
632 
633   if(gemm_nonlop_is_distributed .and. ngrads>0) then
634     if(istwf_k <= 1) then
635       ABI_FREE(dprojs_tmp)
636     else
637       ABI_FREE(dprojs_r_tmp)
638       ABI_FREE(dprojs_i_tmp)
639     end if
640   end if
641   ABI_FREE(atom_projs)
642   ABI_FREE(temp)
643   if (allocated(atom_dprojs)) then
644     ABI_FREE(atom_dprojs)
645   end if
646   if (nkpg_local>0) then
647     ABI_FREE(kpg)
648   end if
649  end subroutine make_gemm_nonlop