TABLE OF CONTENTS
- ABINIT/m_gemm_nonlop
- m_gemm_nonlop/destroy_gemm_nonlop
- m_gemm_nonlop/free_gemm_nonlop_ikpt
- m_gemm_nonlop/gemm_nonlop
- m_gemm_nonlop/gemm_nonlop_distributed_gemm_opernla
- m_gemm_nonlop/gemm_nonlop_distributed_gemm_opernlb
- m_gemm_nonlop/gemm_nonlop_type
- m_gemm_nonlop/init_gemm_nonlop
- m_gemm_nonlop/make_gemm_nonlop
ABINIT/m_gemm_nonlop [ Modules ]
NAME
m_gemm_nonlop
FUNCTION
This module provides functions to compute the nonlocal operator by means of the BLAS GEMM routine. By treating ndat simultaneous wavefunctions, it is able to exploit BLAS3 routines, which leads to excellent CPU efficiency and OpenMP scalability.
COPYRIGHT
Copyright (C) 2014-2024 ABINIT group (AL) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
18 ! TODO list : 19 ! Don't allocate the full nkpt structures, only those that are treated by this proc: use same init as in m_bandfft_kpt 20 ! support more options (forces & stresses mostly) 21 ! Support RF/other computations (only GS right now) 22 ! handle the case where nloalg(2) < 0, ie no precomputation of ph3d 23 ! more systematic checking of the workflow (right now, only works if init/make/gemm/destroy, no multiple makes, etc) 24 ! Avoid allocating the complex matrix when istwfk > 1 25 ! Merge with chebfi's invovl 26 27 28 #if defined HAVE_CONFIG_H 29 #include "config.h" 30 #endif 31 32 #include "abi_common.h" 33 34 module m_gemm_nonlop 35 36 use defs_basis 37 use m_errors 38 use m_abicore 39 use m_xmpi 40 use m_fstrings, only : itoa, ftoa, sjoin 41 use m_abi_linalg 42 43 use defs_abitypes, only : MPI_type 44 use m_opernlc_ylm, only : opernlc_ylm 45 use m_pawcprj, only : pawcprj_type 46 use m_geometry, only : strconv 47 use m_kg, only : mkkpg 48 49 implicit none 50 51 private 52 53 ! Use these routines in order: first call init, then call make_gemm_nonlop for each k point, 54 ! then call gemm_nonlop to do the actual computation, and call destroy when done. See gstate and vtorho. 55 public :: init_gemm_nonlop 56 public :: destroy_gemm_nonlop 57 public :: make_gemm_nonlop 58 public :: gemm_nonlop
m_gemm_nonlop/destroy_gemm_nonlop [ Functions ]
[ Top ] [ m_gemm_nonlop ] [ Functions ]
NAME
destroy_gemm_nonlop
FUNCTION
Destruction of the gemm_nonlop_kpt array
INPUTS
nkpt= number of k-points
SOURCE
166 subroutine destroy_gemm_nonlop(nkpt) 167 168 integer,intent(in) :: nkpt 169 integer :: ikpt 170 171 ! ************************************************************************* 172 173 ! TODO add cycling if kpt parallelism 174 do ikpt = 1,nkpt 175 call free_gemm_nonlop_ikpt(ikpt) 176 end do 177 178 ABI_FREE(gemm_nonlop_kpt) 179 180 end subroutine destroy_gemm_nonlop
m_gemm_nonlop/free_gemm_nonlop_ikpt [ Functions ]
[ Top ] [ m_gemm_nonlop ] [ Functions ]
NAME
free_destroy_gemm_nonlop_ikpt
FUNCTION
Release memory for one kpt value of the gemm_nonlop_kpt array
INPUTS
ikpt= index of gemm_nonlop_kptto be released
SOURCE
196 subroutine free_gemm_nonlop_ikpt(ikpt) 197 198 integer,intent(in) :: ikpt 199 200 ! ************************************************************************* 201 202 if(gemm_nonlop_kpt(ikpt)%nprojs /= -1) then 203 if (allocated(gemm_nonlop_kpt(ikpt)%projs)) then 204 ABI_FREE(gemm_nonlop_kpt(ikpt)%projs) 205 end if 206 if (allocated(gemm_nonlop_kpt(ikpt)%projs_r)) then 207 ABI_FREE(gemm_nonlop_kpt(ikpt)%projs_r) 208 end if 209 if (allocated(gemm_nonlop_kpt(ikpt)%projs_i)) then 210 ABI_FREE(gemm_nonlop_kpt(ikpt)%projs_i) 211 end if 212 gemm_nonlop_kpt(ikpt)%nprojs = -1 213 if(gemm_nonlop_kpt(ikpt)%ngrads /= -1) then 214 if (allocated(gemm_nonlop_kpt(ikpt)%dprojs)) then 215 ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs) 216 end if 217 if (allocated(gemm_nonlop_kpt(ikpt)%dprojs_r)) then 218 ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs_r) 219 end if 220 if (allocated(gemm_nonlop_kpt(ikpt)%dprojs_i)) then 221 ABI_FREE(gemm_nonlop_kpt(ikpt)%dprojs_i) 222 end if 223 gemm_nonlop_kpt(ikpt)%ngrads = -1 224 end if 225 end if 226 227 end subroutine free_gemm_nonlop_ikpt
m_gemm_nonlop/gemm_nonlop [ Functions ]
[ Top ] [ m_gemm_nonlop ] [ Functions ]
NAME
gemm_nonlop
FUNCTION
Replacement of nonlop. same prototype as nonlop although not all options are implemented.
INPUTS
[gpu_option] = GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)
SOURCE
810 subroutine gemm_nonlop(atindx1,choice,cpopt,cprjin,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,& 811 & enl,enlout,ffnlin,ffnlout,gmet,gprimd,idir,indlmn,istwf_k,& 812 & kgin,kgout,kpgin,kpgout,kptin,kptout,lambda,lmnmax,matblk,mgfft,& 813 & mpi_enreg,mpsang,mpssoang,natom,nattyp,ndat,ngfft,nkpgin,nkpgout,nloalg,& 814 & nnlout,npwin,npwout,nspinor,nspinortot,ntypat,only_SO,paw_opt,phkxredin,& 815 & phkxredout,ph1d,ph3din,ph3dout,signs,sij,svectout,& 816 & tim_nonlop,ucvol,useylm,vectin,vectout,& 817 & vectproj,gpu_option) 818 819 !Arguments ------------------------------------ 820 !scalars 821 integer,intent(in) :: choice,cpopt,dimenl1,dimenl2,dimekbq,dimffnlin,dimffnlout,idir 822 integer,intent(in) :: istwf_k,lmnmax,matblk,mgfft,mpsang,mpssoang,natom,ndat,nkpgin 823 integer,intent(in) :: nkpgout,nnlout,npwin,npwout,nspinor,nspinortot,ntypat,only_SO 824 integer,intent(in) :: paw_opt,signs,tim_nonlop,useylm 825 integer,optional,intent(in) :: gpu_option 826 real(dp),intent(in) :: lambda(ndat),ucvol 827 type(MPI_type),intent(in) :: mpi_enreg 828 !arrays 829 integer,intent(in) :: atindx1(natom),indlmn(6,lmnmax,ntypat),kgin(3,npwin) 830 integer,intent(in) :: kgout(3,npwout),nattyp(ntypat),ngfft(18),nloalg(3) 831 real(dp),intent(in) :: enl(dimenl1,dimenl2,nspinortot**2,dimekbq) 832 real(dp),intent(in) :: ffnlin(npwin,dimffnlin,lmnmax,ntypat) 833 real(dp),intent(in) :: ffnlout(npwout,dimffnlout,lmnmax,ntypat),gmet(3,3) 834 real(dp),intent(in) :: gprimd(3,3),kpgin(npwin,nkpgin*useylm) 835 real(dp),intent(in) :: kpgout(npwout,nkpgout*useylm),kptin(3),kptout(3) 836 real(dp),intent(in) :: phkxredin(2,natom),phkxredout(2,natom),ph1d(2,*) 837 real(dp),intent(in) :: sij(dimenl1,ntypat*((paw_opt+1)/3)) 838 real(dp),intent(inout) :: ph3din(2,npwin,matblk),ph3dout(2,npwout,matblk) 839 real(dp),intent(inout) :: vectin(2,npwin*nspinor*ndat) 840 real(dp),intent(inout) :: enlout(nnlout*ndat) 841 real(dp),intent(out) :: svectout(2,npwout*nspinor*(paw_opt/3)*ndat) 842 real(dp),intent(inout) :: vectout(2,npwout*nspinor*ndat) !vz_i 843 real(dp),intent(inout),optional, ABI_CONTIGUOUS target :: vectproj(:,:,:) 844 type(pawcprj_type),intent(inout) :: cprjin(natom,nspinor*((cpopt+5)/5)*ndat) 845 846 ! locals 847 integer :: ii, ia, idat, igrad, nprojs, ngrads, shift, iatom, nlmn, ierr, ibeg, iend, ikpt 848 integer :: cplex, cplex_enl, cplex_fac, proj_shift, grad_shift 849 integer :: enlout_shift, force_shift, nnlout_test 850 integer :: iatm, ndgxdt, ndgxdtfac, nd2gxdt, nd2gxdtfac, optder, itypat, ilmn 851 integer :: cplex_dgxdt(1), cplex_d2gxdt(1) 852 logical :: local_vectproj 853 real(dp) :: esum 854 real(dp) :: work(6) 855 real(dp) :: dgxdt_dum_in(1,1,1,1,1), dgxdt_dum_out(1,1,1,1,1),dgxdt_dum_out2(1,1,1,1,1) 856 real(dp) :: d2gxdt_dum_in(1,1,1,1,1), d2gxdt_dum_out(1,1,1,1,1),d2gxdt_dum_out2(1,1,1,1,1) 857 real(dp), allocatable :: enlk(:),sij_typ(:) 858 real(dp), ABI_CONTIGUOUS pointer :: projections(:,:,:) 859 real(dp), allocatable :: s_projections(:,:,:), vnl_projections(:,:,:) 860 real(dp), allocatable :: dprojections(:,:,:), temp_realvec(:) 861 integer :: nprojs_my_blk 862 integer :: rank, nprocs 863 logical :: is_last 864 real(dp), allocatable :: projs_local(:,:,:), dprojs_local(:,:,:) 865 real(dp), allocatable :: projs_recv(:,:,:), dprojs_recv(:,:,:) 866 867 ! ************************************************************************* 868 869 ! We keep the same interface as nonlop, but we don't use many of those 870 ABI_UNUSED((/ffnlin,ffnlout,gmet,kpgin,kpgout/)) 871 ABI_UNUSED((/ph1d(1,1),ph3din,ph3dout/)) 872 ABI_UNUSED((/phkxredin,phkxredout,ucvol/)) 873 ABI_UNUSED((/mgfft,mpsang,mpssoang/)) 874 ABI_UNUSED((/kptin,kptout/)) 875 ABI_UNUSED((/idir,nloalg,ngfft,kgin,kgout,ngfft,only_SO,tim_nonlop,gpu_option/)) 876 877 ! Check supported options 878 if (.not.gemm_nonlop_use_gemm) then 879 ABI_BUG('computation not prepared for gemm_nonlop use!') 880 end if 881 if ( (choice>1.and.choice/=7.and.signs==2) .or. & 882 & (choice>3.and.choice/=7.and.choice/=23.and.signs==1) .or. & 883 & (useylm/=1) ) then 884 ABI_BUG('gemm_nonlop option not supported!') 885 end if 886 if (signs==1) then 887 nnlout_test=0 888 if (choice==1) nnlout_test=1 889 if (choice==2) nnlout_test=3*natom 890 if (choice==3) nnlout_test=6 891 if (choice==23) nnlout_test=6+3*natom 892 if (nnlout<nnlout_test) then 893 ABI_BUG('wrong nnlout size!') 894 end if 895 end if 896 897 ikpt=gemm_nonlop_ikpt_this_proc_being_treated 898 cplex=2;if (istwf_k>1) cplex=1 899 cplex_enl=1;if (paw_opt>0) cplex_enl=2*dimenl1/(lmnmax*(lmnmax+1)) ! is enl complex? 900 cplex_fac=max(cplex,dimekbq) 901 if ((nspinortot==2.or.cplex_enl==2).and.paw_opt>0.and.choice/=7) cplex_fac=2 ! is vnl_projections complex? 902 903 nprojs = gemm_nonlop_kpt(ikpt)%nprojs 904 ngrads = gemm_nonlop_kpt(ikpt)%ngrads 905 nprojs_my_blk = gemm_nonlop_kpt(ikpt)%nprojs_blk 906 907 if(gemm_nonlop_is_distributed) then 908 rank = xmpi_comm_rank(gemm_nonlop_block_comm); nprocs = xmpi_comm_size(gemm_nonlop_block_comm) 909 is_last = (rank==nprocs-1) 910 if(is_last) nprojs_my_blk = gemm_nonlop_kpt(ikpt)%nprojs_last_blk 911 end if 912 913 if(choice==1) ngrads=0 914 ABI_CHECK(ngrads>=3.or.choice/=2 ,"Bad allocation in gemm_nonlop (2)!") 915 ABI_CHECK(ngrads>=6.or.choice/=3 ,"Bad allocation in gemm_nonlop (3)!") 916 ABI_CHECK(ngrads>=9.or.choice/=23,"Bad allocation in gemm_nonlop (23)!") 917 918 ! If vectproj is provided, use it for further calculations, use allocated array otherwise 919 local_vectproj=.false. 920 if(PRESENT(vectproj)) then 921 if(size(vectproj)>1) local_vectproj=.true. 922 end if 923 if (local_vectproj) then 924 projections => vectproj 925 end if 926 927 ! These will store the non-local factors for vectin, svectout and vectout respectively 928 if(.not. local_vectproj) then 929 ABI_MALLOC(projections,(cplex, nprojs,nspinor*ndat)) 930 end if 931 ABI_MALLOC(s_projections,(cplex, nprojs,nspinor*ndat)) 932 ABI_MALLOC(vnl_projections,(cplex_fac, nprojs,nspinor*ndat)) 933 934 if(gemm_nonlop_is_distributed) then 935 ABI_MALLOC(projs_recv, (cplex, npwin, gemm_nonlop_kpt(ikpt)%nprojs_last_blk)) 936 ABI_MALLOC(projs_local, (cplex, npwin, gemm_nonlop_kpt(ikpt)%nprojs_last_blk)) 937 if (signs==1.and.ngrads>0) then 938 ABI_MALLOC(dprojs_recv, (cplex, npwin, ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk)) 939 ABI_MALLOC(dprojs_local, (cplex, npwin, ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk)) 940 end if 941 end if 942 943 if(.not. local_vectproj) projections = zero 944 s_projections = zero 945 vnl_projections = zero 946 if (signs==1.and.ngrads>0) then 947 ABI_MALLOC(dprojections,(cplex, ngrads*nprojs,nspinor*ndat)) 948 dprojections = zero 949 end if 950 951 if(nprojs == 0) then 952 ! TODO check if this is correct 953 if(signs == 1) then 954 enlout=zero 955 return 956 end if 957 if(signs == 2) then 958 vectout = zero 959 if(paw_opt>0) svectout = vectin 960 return 961 end if 962 end if 963 964 ! determine precisely when temp_realvec needs to be allocated 965 ! to factorize allocate (resp. deallocate) at the begining (resp. at the end) of subroutine 966 ! to avoid multiple allocate/deallocate that can be costly 967 if (cplex /= 2) then 968 if ( (cpopt < 2) .or. & 969 & (paw_opt == 3 .or. paw_opt == 4) .or. & 970 & (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4)) then 971 ABI_MALLOC(temp_realvec,(MAX(npwout,npwin)*nspinor*ndat)) 972 end if 973 end if 974 975 976 if(cpopt >= 2) then 977 if(.not. local_vectproj) then 978 !This use-case is extremely painful for GEMM nonlop performance. 979 !vectproj paramter should always be provided to avoid it 980 981 ! retrieve from cprjin 982 do idat=1, ndat*nspinor 983 shift = 0 984 do iatom = 1, natom 985 nlmn = cprjin(iatom, idat)%nlmn 986 projections(1:cplex, shift+1:shift+nlmn, idat) = cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) 987 shift = shift + nlmn 988 end do 989 end do 990 end if 991 if(cpopt==4.and.allocated(dprojections)) then 992 ABI_CHECK(cprjin(1,1)%ncpgr>=ngrads,"cprjin%ncpgr not correct! (1)") 993 do idat=1, ndat*nspinor 994 shift = 0 995 do iatom = 1, natom 996 nlmn = cprjin(iatom, idat)%nlmn 997 do igrad=1,ngrads 998 dprojections(1:cplex, shift+1:shift+nlmn, idat) = & 999 & cprjin(iatom, idat)%dcp(1:cplex,igrad,1:nlmn) 1000 shift = shift + nlmn 1001 end do 1002 end do 1003 end do 1004 end if 1005 else ! cpopt < 2 1006 ! opernla 1007 if(cplex == 2) then 1008 if(.not. gemm_nonlop_is_distributed) then 1009 call abi_zgemm_2r('C', 'N', nprojs, ndat*nspinor, npwin, cone, & 1010 & gemm_nonlop_kpt(ikpt)%projs, npwin,& 1011 & vectin, npwin, czero, projections, nprojs) 1012 1013 if(signs==1.and.ngrads>0) then 1014 call abi_zgemm_2r('C', 'N', ngrads*nprojs, ndat*nspinor, npwin, cone, & 1015 gemm_nonlop_kpt(ikpt)%dprojs, npwin,& 1016 vectin, npwin, czero, dprojections, ngrads*nprojs) 1017 end if 1018 else 1019 call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 1020 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1021 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1022 & nprojs_my_blk,cplex,czero,& 1023 & projs_local,projs_recv,& 1024 & gemm_nonlop_kpt(ikpt)%projs,& 1025 & vectin,projections) 1026 if(signs==1.and.ngrads>0) then 1027 call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 1028 & ngrads*gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1029 & ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1030 & ngrads*nprojs_my_blk,cplex,czero,& 1031 & dprojs_local,dprojs_recv,& 1032 & gemm_nonlop_kpt(ikpt)%dprojs,& 1033 & vectin,dprojections) 1034 end if 1035 end if 1036 else 1037 1038 if (.not. allocated(temp_realvec)) then 1039 ABI_ERROR("Please provide memory allocation for temp_realvec array") 1040 end if 1041 1042 ! only compute real part of projections = P^* psi => projections_r = P_r^T psi_r + P_i^T psi_i 1043 temp_realvec(1:npwin*nspinor*ndat) = vectin(1,1:npwin*nspinor*ndat) 1044 if(istwf_k == 2 .and. mpi_enreg%me_g0_fft == 1) then 1045 do idat=1, ndat*nspinor 1046 temp_realvec(1+(idat-1)*npwin) = temp_realvec(1+(idat-1)*npwin)/2 1047 end do 1048 end if 1049 if(.not. gemm_nonlop_is_distributed) then 1050 call DGEMM('T', 'N', nprojs, ndat*nspinor, npwin, one, & 1051 & gemm_nonlop_kpt(ikpt)%projs_r, npwin, & 1052 & temp_realvec, npwin, zero, projections, nprojs) 1053 if(signs==1.and.ngrads>0) then 1054 call DGEMM('T', 'N', ngrads*nprojs, ndat*nspinor, npwin, one, & 1055 & gemm_nonlop_kpt(ikpt)%dprojs_r, npwin, & 1056 & temp_realvec, npwin, zero, dprojections, ngrads*nprojs) 1057 end if 1058 else 1059 call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 1060 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1061 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1062 & nprojs_my_blk,cplex,czero,& 1063 & projs_local,projs_recv,& 1064 & gemm_nonlop_kpt(ikpt)%projs_r,& 1065 & temp_realvec,projections) 1066 if(signs==1.and.ngrads>0) then 1067 call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 1068 & ngrads*gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1069 & ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1070 & ngrads*nprojs_my_blk,cplex,czero,& 1071 & dprojs_local,dprojs_recv,& 1072 & gemm_nonlop_kpt(ikpt)%dprojs_r,& 1073 & temp_realvec,dprojections) 1074 end if 1075 end if 1076 temp_realvec(1:npwin*nspinor*ndat) = vectin(2,1:npwin*nspinor*ndat) 1077 if(istwf_k == 2 .and. mpi_enreg%me_g0_fft == 1) then 1078 do idat=1, ndat*nspinor 1079 temp_realvec(1+(idat-1)*npwin) = zero 1080 end do 1081 end if 1082 if(.not. gemm_nonlop_is_distributed) then 1083 call DGEMM('T', 'N', nprojs, ndat*nspinor, npwin, one, & 1084 & gemm_nonlop_kpt(ikpt)%projs_i, npwin, & 1085 & temp_realvec, npwin, one , projections, nprojs) 1086 projections = projections * 2 1087 if(signs==1.and.ngrads>0) then 1088 call DGEMM('T', 'N', ngrads*nprojs, ndat*nspinor, npwin, one, & 1089 & gemm_nonlop_kpt(ikpt)%dprojs_i, npwin, & 1090 & temp_realvec, npwin, one , dprojections, ngrads*nprojs) 1091 dprojections = dprojections * 2 1092 end if 1093 else 1094 call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 1095 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1096 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1097 & nprojs_my_blk,cplex,cone,& 1098 & projs_local,projs_recv,& 1099 & gemm_nonlop_kpt(ikpt)%projs_i,& 1100 & temp_realvec,projections) 1101 projections = projections * 2 1102 if(signs==1.and.ngrads>0) then 1103 call gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 1104 & ngrads*gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1105 & ngrads*gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1106 & ngrads*nprojs_my_blk,cplex,cone,& 1107 & dprojs_local,dprojs_recv,& 1108 & gemm_nonlop_kpt(ikpt)%dprojs_i,& 1109 & temp_realvec,dprojections) 1110 dprojections = dprojections * 2 1111 end if 1112 end if 1113 end if ! cplex == 2 1114 call xmpi_sum(projections,mpi_enreg%comm_fft,ierr) 1115 if (choice>1) then 1116 call xmpi_sum(dprojections,mpi_enreg%comm_fft,ierr) 1117 end if 1118 1119 if(cpopt >= 0) then 1120 if(.not. local_vectproj) then 1121 !This use-case is extremely painful for GEMM nonlop performance. 1122 !vectproj paramter should always be provided to avoid it 1123 1124 ! store in cprjin 1125 do idat=1, ndat*nspinor 1126 shift = 0 1127 do iatom = 1, natom 1128 nlmn = cprjin(iatom, idat)%nlmn 1129 cprjin(iatom, idat)%cp(1:cplex, 1:nlmn) = projections(1:cplex, shift+1:shift+nlmn, idat) 1130 shift = shift + nlmn 1131 end do 1132 end do 1133 end if 1134 if(cpopt==3) then 1135 ABI_CHECK(cprjin(1,1)%ncpgr>=ngrads,"cprjin%ncpgr not correct! (2)") 1136 shift = 0 1137 do iatom = 1, natom 1138 nlmn = cprjin(iatom, idat)%nlmn 1139 do igrad=1,ngrads 1140 cprjin(iatom, idat)%dcp(1:cplex,igrad,1:nlmn) = & 1141 & dprojections(1:cplex, shift+1:shift+nlmn, idat) 1142 shift = shift + nlmn 1143 end do 1144 end do 1145 end if 1146 end if ! cpopt >= 0 1147 end if ! cpopt >= 2 1148 1149 if(choice > 0) then 1150 1151 if(choice /= 7) then 1152 ! opernlc 1153 iatm = 0 1154 ndgxdt = 0 1155 ndgxdtfac = 0 1156 nd2gxdt = 0 1157 nd2gxdtfac = 0 1158 optder = 0 1159 1160 shift = 0 1161 ABI_MALLOC(sij_typ,(((paw_opt+1)/3)*lmnmax*(lmnmax+1)/2)) 1162 do itypat=1, ntypat 1163 nlmn=count(indlmn(3,:,itypat)>0) 1164 if (paw_opt>=2) then 1165 if (cplex_enl==1) then 1166 do ilmn=1,nlmn*(nlmn+1)/2 1167 sij_typ(ilmn)=sij(ilmn,itypat) 1168 end do 1169 else 1170 do ilmn=1,nlmn*(nlmn+1)/2 1171 sij_typ(ilmn)=sij(2*ilmn-1,itypat) 1172 end do 1173 end if 1174 end if 1175 1176 ibeg = shift+1 1177 iend = shift+nattyp(itypat)*nlmn 1178 1179 do idat = 1,ndat 1180 call opernlc_ylm(atindx1,cplex,cplex_dgxdt,cplex_d2gxdt,cplex_enl,cplex_fac,dgxdt_dum_in,dgxdt_dum_out,dgxdt_dum_out2,& 1181 & d2gxdt_dum_in,d2gxdt_dum_out,d2gxdt_dum_out2,dimenl1,dimenl2,dimekbq,enl,& 1182 & projections(:, ibeg:iend, 1+nspinor*(idat-1):nspinor*idat),& 1183 & vnl_projections(:, ibeg:iend,1+nspinor*(idat-1):nspinor*idat),& 1184 & s_projections(:, ibeg:iend,1+nspinor*(idat-1):nspinor*idat),& 1185 & iatm,indlmn(:,:,itypat),itypat,lambda(idat),mpi_enreg,natom,ndgxdt,ndgxdtfac,nd2gxdt,nd2gxdtfac,& 1186 & nattyp(itypat),nlmn,nspinor,nspinortot,optder,paw_opt,sij_typ) 1187 end do 1188 1189 shift = shift + nattyp(itypat)*nlmn 1190 iatm = iatm+nattyp(itypat) 1191 end do 1192 ABI_FREE(sij_typ) 1193 else 1194 s_projections = projections 1195 end if ! choice /= 7 1196 1197 ! opernlb (only choice=1) 1198 if(signs==2) then 1199 if(paw_opt == 3 .or. paw_opt == 4) then 1200 1201 ! Get svectout from s_projections 1202 if(cplex == 2) then 1203 1204 if(.not. gemm_nonlop_is_distributed) then 1205 call abi_zgemm_2r('N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1206 & gemm_nonlop_kpt(ikpt)%projs, npwout, & 1207 & s_projections, nprojs, czero, svectout, npwout) 1208 else 1209 call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 1210 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1211 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1212 & nprojs_my_blk,cplex,& 1213 & projs_local,projs_recv,& 1214 & gemm_nonlop_kpt(ikpt)%projs,& 1215 & s_projections,svectout) 1216 end if 1217 else 1218 1219 if (.not. allocated(temp_realvec)) then 1220 ABI_ERROR("Please provide memory allocation for temp_realvec array") 1221 end if 1222 1223 if(.not. gemm_nonlop_is_distributed) then 1224 call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs, one, & 1225 & gemm_nonlop_kpt(ikpt)%projs_r, npwout, & 1226 & s_projections, nprojs, zero, temp_realvec, npwout) 1227 svectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 1228 call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs, one, & 1229 & gemm_nonlop_kpt(ikpt)%projs_i, npwout,& 1230 & s_projections, nprojs, zero, temp_realvec, npwout) 1231 svectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 1232 else 1233 call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 1234 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1235 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1236 & nprojs_my_blk,cplex,& 1237 & projs_local,projs_recv,& 1238 & gemm_nonlop_kpt(ikpt)%projs_r,& 1239 & s_projections,temp_realvec) 1240 svectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 1241 call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 1242 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1243 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1244 & nprojs_my_blk,cplex,& 1245 & projs_local,projs_recv,& 1246 & gemm_nonlop_kpt(ikpt)%projs_i,& 1247 & s_projections,temp_realvec) 1248 svectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 1249 end if 1250 1251 end if ! cplex = 2 1252 if(choice /= 7) svectout = svectout + vectin ! TODO understand this 1253 1254 end if ! (paw_opt == 3 .or. paw_opt == 4) 1255 1256 if(paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4) then 1257 ! Get vectout from vnl_projections 1258 if(cplex_fac == 2) then 1259 1260 if(.not. gemm_nonlop_is_distributed) then 1261 call abi_zgemm_2r('N', 'N', npwout, ndat*nspinor, nprojs, cone, & 1262 & gemm_nonlop_kpt(ikpt)%projs, npwout, & 1263 & vnl_projections, nprojs, czero, vectout, npwout) 1264 else 1265 call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 1266 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1267 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1268 & nprojs_my_blk,cplex,& 1269 & projs_local,projs_recv,& 1270 & gemm_nonlop_kpt(ikpt)%projs,& 1271 & vnl_projections,vectout) 1272 end if 1273 else 1274 1275 if (.not. allocated(temp_realvec)) then 1276 ABI_ERROR("Please provide memory allocation for temp_realvec array") 1277 end if 1278 1279 if(.not. gemm_nonlop_is_distributed) then 1280 call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs, one, & 1281 & gemm_nonlop_kpt(ikpt)%projs_r, npwout, & 1282 & vnl_projections, nprojs, zero, temp_realvec, npwout) 1283 vectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 1284 call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs, one, & 1285 & gemm_nonlop_kpt(ikpt)%projs_i, npwout, & 1286 & vnl_projections, nprojs, zero, temp_realvec, npwout) 1287 vectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 1288 else 1289 call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 1290 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1291 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1292 & nprojs_my_blk,cplex,& 1293 & projs_local,projs_recv,& 1294 & gemm_nonlop_kpt(ikpt)%projs_r,& 1295 & vnl_projections,temp_realvec) 1296 vectout(1,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 1297 call gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 1298 & gemm_nonlop_kpt(ikpt)%nprojs_blk,& 1299 & gemm_nonlop_kpt(ikpt)%nprojs_last_blk,& 1300 & nprojs_my_blk,cplex,& 1301 & projs_local,projs_recv,& 1302 & gemm_nonlop_kpt(ikpt)%projs_i,& 1303 & vnl_projections,temp_realvec) 1304 vectout(2,1:npwout*nspinor*ndat) = temp_realvec(1:npwout*nspinor*ndat) 1305 end if 1306 1307 end if ! cplex_fac == 2 1308 1309 end if ! (paw_opt == 0 .or. paw_opt == 1 .or. paw_opt == 4) 1310 end if ! opernlb 1311 1312 ! opernld 1313 if(signs==1) then 1314 enlout=zero 1315 if(choice==1.or.choice==3.or.choice==23) then 1316 ABI_MALLOC(enlk,(ndat)) 1317 enlk=zero 1318 do idat=1,ndat*nspinor 1319 proj_shift=0 1320 do itypat=1, ntypat 1321 nlmn=count(indlmn(3,:,itypat)>0) 1322 do ia=1,nattyp(itypat) 1323 !Following loops are a [D][Z]DOT 1324 esum=zero 1325 do ilmn=1,nlmn 1326 do ii=1,cplex 1327 esum=esum +vnl_projections(ii,proj_shift+ilmn,idat) & 1328 & *projections (ii,proj_shift+ilmn,idat) 1329 end do 1330 end do 1331 proj_shift=proj_shift+nlmn 1332 enlk(idat) = enlk(idat) + esum 1333 end do 1334 end do 1335 end do 1336 if (choice==1) enlout(1:ndat)=enlk(1:ndat) 1337 end if ! choice=1/3/23 1338 if(choice==2.or.choice==3.or.choice==23) then 1339 do idat=1,ndat*nspinor 1340 proj_shift=0 ; grad_shift=0 1341 enlout_shift=(idat-1)*nnlout 1342 force_shift=merge(6,0,choice==23) 1343 do itypat=1, ntypat 1344 nlmn=count(indlmn(3,:,itypat)>0) 1345 do ia=1,nattyp(itypat) 1346 if (choice==3.or.choice==23) then 1347 do igrad=1,6 1348 !Following loops are a [D][Z]DOT 1349 esum=zero 1350 do ilmn=1,nlmn 1351 do ii=1,cplex 1352 esum=esum +vnl_projections(ii,proj_shift+ilmn,idat) & 1353 & *dprojections (ii,grad_shift+ilmn,idat) 1354 end do 1355 end do 1356 grad_shift=grad_shift+nlmn 1357 enlout(enlout_shift+igrad)=enlout(enlout_shift+igrad) + two*esum 1358 end do 1359 end if 1360 if (choice==2.or.choice==23) then 1361 do igrad=1,3 1362 !Following loops are a [D][Z]DOT 1363 esum=zero 1364 do ilmn=1,nlmn 1365 do ii=1,cplex 1366 esum=esum +vnl_projections(ii,proj_shift+ilmn,idat) & 1367 & *dprojections (ii,grad_shift+ilmn,idat) 1368 end do 1369 end do 1370 grad_shift=grad_shift+nlmn 1371 enlout(enlout_shift+force_shift+igrad)= & 1372 & enlout(enlout_shift+force_shift+igrad) + two*esum 1373 end do 1374 force_shift=force_shift+3 1375 end if 1376 proj_shift=proj_shift+nlmn 1377 end do 1378 end do 1379 end do 1380 end if ! choice=2, 3 or 23 1381 1382 end if !opernld 1383 1384 end if ! choice>0 1385 1386 ! Reduction in case of parallelism 1387 if (signs==1.and.mpi_enreg%paral_spinor==1) then 1388 if (size(enlout)>0) then 1389 call xmpi_sum(enlout,mpi_enreg%comm_spinor,ierr) 1390 end if 1391 if (choice==3.or.choice==23) then 1392 call xmpi_sum(enlk,mpi_enreg%comm_spinor,ierr) 1393 end if 1394 end if 1395 1396 ! Derivatives wrt strain 1397 ! - Convert from reduced to cartesian coordinates 1398 ! - Substract volume contribution 1399 if ((choice==3.or.choice==23).and.signs==1.and.paw_opt<=3) then 1400 do idat=1,ndat 1401 enlout_shift=(idat-1)*nnlout 1402 call strconv(enlout(enlout_shift+1:enlout_shift+6),gprimd,work) 1403 enlout(enlout_shift+1:enlout_shift+3)=(work(1:3)-enlk(idat)) 1404 enlout(enlout_shift+4:enlout_shift+6)= work(4:6) 1405 end do 1406 end if 1407 1408 ! Release memory 1409 if(.not. local_vectproj) then 1410 ABI_FREE(projections) 1411 end if 1412 ABI_FREE(s_projections) 1413 ABI_FREE(vnl_projections) 1414 if(gemm_nonlop_is_distributed) then 1415 ABI_FREE(projs_local) 1416 ABI_FREE(projs_recv) 1417 if (signs==1.and.ngrads>0) then 1418 ABI_FREE(dprojs_local) 1419 ABI_FREE(dprojs_recv) 1420 end if 1421 end if 1422 if (allocated(dprojections)) then 1423 ABI_FREE(dprojections) 1424 end if 1425 if (allocated(enlk)) then 1426 ABI_FREE(enlk) 1427 end if 1428 if (allocated(temp_realvec)) then 1429 ABI_FREE(temp_realvec) 1430 end if 1431 1432 end subroutine gemm_nonlop 1433 !*** 1434 1435 end module m_gemm_nonlop
m_gemm_nonlop/gemm_nonlop_distributed_gemm_opernla [ Functions ]
[ Top ] [ m_gemm_nonlop ] [ Functions ]
NAME
gemm_nonlop_distributed_gemm_opernla
FUNCTION
Distributed version of "opernla" GEMM called in gemm_nonlop.
INPUTS
SOURCE
664 subroutine gemm_nonlop_distributed_gemm_opernla(rank,nprocs,npwin,ndat,nspinor,& 665 & nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex,beta,& 666 & projs_local,projs_recv,projs,vectin,projections) 667 integer, intent(in) :: rank,nprocs,npwin,ndat,nspinor 668 integer, intent(in) :: nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex 669 real(dp), intent(in) :: projs(:,:,:),vectin(*) 670 complex(dpc), intent(in) :: beta 671 real(dp), intent(inout) :: projs_local(:,:,:),projs_recv(:,:,:) 672 real(dp), intent(out) :: projections(:,:,:) 673 674 !Local variables 675 integer :: iblock,ibeg,iend,req(2),ierr,nprojs_cur_blk,rank_prev,rank_next 676 677 ! ************************************************************************* 678 679 rank_next=modulo(rank + 1,nprocs) 680 rank_prev=rank - 1 681 if(rank_prev == -1) rank_prev = nprocs - 1 682 do iblock=1,nprocs 683 684 if(rank+iblock == nprocs) then 685 nprojs_cur_blk = nprojs_last_blk 686 else 687 nprojs_cur_blk = nprojs_blk 688 end if 689 690 if(iblock == 1) then 691 call DCOPY(cplex*npwin*nprojs_my_blk , projs, 1, projs_local, 1) 692 else 693 call DCOPY(cplex*npwin*nprojs_cur_blk, projs_recv, 1, projs_local, 1) 694 end if 695 696 if(iblock < nprocs) then 697 call xmpi_isend(projs_local,rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr) 698 call xmpi_irecv(projs_recv,rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr) 699 end if 700 701 ibeg = 1 + modulo(rank+iblock-1,nprocs)*nprojs_blk 702 iend = ibeg+nprojs_cur_blk-1 703 if(cplex==2) then 704 call abi_zgemm_2r('C', 'N', nprojs_cur_blk, ndat*nspinor, npwin, cone, & 705 & projs_local, npwin,& 706 & vectin, npwin, beta, projections(:,ibeg:iend,:), nprojs_cur_blk) 707 else 708 call DGEMM('T', 'N', nprojs_cur_blk, ndat*nspinor, npwin, one, & 709 & projs_local, npwin, & 710 & vectin, npwin, real(beta), projections(:,ibeg:iend,:), nprojs_cur_blk) 711 end if 712 713 if(iblock < nprocs) then 714 call xmpi_waitall(req,ierr) 715 end if 716 717 end do 718 end subroutine gemm_nonlop_distributed_gemm_opernla
m_gemm_nonlop/gemm_nonlop_distributed_gemm_opernlb [ Functions ]
[ Top ] [ m_gemm_nonlop ] [ Functions ]
NAME
gemm_nonlop_distributed_gemm_opernlb
FUNCTION
Distributed version of "opernlb" GEMM called in gemm_nonlop.
INPUTS
SOURCE
733 subroutine gemm_nonlop_distributed_gemm_opernlb(rank,nprocs,npwout,ndat,nspinor,& 734 & nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex,& 735 & projs_local,projs_recv,projs,projections,vectout) 736 integer, intent(in) :: rank,nprocs,npwout,ndat,nspinor 737 integer, intent(in) :: nprojs_blk,nprojs_last_blk,nprojs_my_blk,cplex 738 real(dp), intent(in) :: projs(:,:,:),projections(:,:,:) 739 real(dp), intent(inout) :: projs_local(:,:,:),projs_recv(:,:,:) 740 real(dp), intent(out) :: vectout(*) 741 742 !Local variables 743 integer :: iblock,ibeg,iend,req(2),ierr,nprojs_cur_blk,rank_prev,rank_next 744 complex(dpc) :: beta 745 746 ! ************************************************************************* 747 748 rank_next=modulo(rank + 1,nprocs) 749 rank_prev=rank - 1 750 if(rank_prev == -1) rank_prev = nprocs - 1 751 752 beta = czero 753 754 do iblock=1,nprocs 755 756 if(rank+iblock == nprocs) then 757 nprojs_cur_blk = nprojs_last_blk 758 else 759 nprojs_cur_blk = nprojs_blk 760 end if 761 762 if(iblock == 1) then 763 call DCOPY(cplex*npwout*nprojs_my_blk, projs, 1, projs_local, 1) 764 else 765 call DCOPY(cplex*npwout*nprojs_cur_blk, projs_recv, 1, projs_local, 1) 766 end if 767 768 if(iblock < nprocs) then 769 call xmpi_isend(projs_local,rank_prev,iblock,gemm_nonlop_block_comm,req(1),ierr) 770 call xmpi_irecv(projs_recv,rank_next,iblock,gemm_nonlop_block_comm,req(2),ierr) 771 end if 772 773 ibeg = 1 + modulo(rank+iblock-1,nprocs)*nprojs_blk 774 iend = ibeg+nprojs_cur_blk-1 775 776 if(cplex==2) then 777 call abi_zgemm_2r('N', 'N', npwout, ndat*nspinor, nprojs_cur_blk, cone, & 778 & projs_local, npwout, & 779 & projections(:,ibeg:iend,:), nprojs_cur_blk, beta, vectout, npwout) 780 else 781 call DGEMM('N', 'N', npwout, ndat*nspinor, nprojs_cur_blk, one, & 782 & projs_local, npwout, & 783 & projections(:,ibeg:iend,:), nprojs_cur_blk, real(beta), vectout, npwout) 784 end if 785 786 beta = cone 787 788 if(iblock < nprocs) then 789 call xmpi_waitall(req,ierr) 790 end if 791 792 end do 793 end subroutine gemm_nonlop_distributed_gemm_opernlb
m_gemm_nonlop/gemm_nonlop_type [ Types ]
[ Top ] [ m_gemm_nonlop ] [ Types ]
NAME
gemm_nonlop_type
FUNCTION
Contains information needed to apply the nonlocal operator
SOURCE
73 type,public :: gemm_nonlop_type 74 75 integer :: nprojs 76 integer :: ngrads 77 78 integer :: nprojs_blk 79 integer :: nprojs_last_blk 80 81 real(dp), allocatable :: projs(:, :, :) 82 ! (2, npw, nprojs) 83 84 real(dp), allocatable :: projs_r(:, :, :) 85 ! (1, npw, nprojs) 86 87 real(dp), allocatable :: projs_i(:, :, :) 88 ! (1, npw, nprojs) 89 90 real(dp), allocatable :: dprojs(:, :, :) 91 ! (2, npw, nprojs*ngrads) 92 real(dp), allocatable :: dprojs_r(:, :, :) 93 ! (1, npw, nprojs*ngrads) 94 real(dp), allocatable :: dprojs_i(:, :, :) 95 ! (1, npw, nprojs*ngrads) 96 97 end type gemm_nonlop_type
m_gemm_nonlop/init_gemm_nonlop [ Functions ]
[ Top ] [ m_gemm_nonlop ] [ Functions ]
NAME
init_gemm_nonlop
FUNCTION
Initalization of the gemm_nonlop_kpt array
INPUTS
nkpt= number of k-points
SOURCE
136 subroutine init_gemm_nonlop(nkpt) 137 138 integer,intent(in) :: nkpt 139 integer :: ikpt 140 141 ! ************************************************************************* 142 143 ! TODO only allocate the number of kpt treated by this proc 144 ABI_MALLOC(gemm_nonlop_kpt, (nkpt)) 145 do ikpt=1,nkpt 146 gemm_nonlop_kpt(ikpt)%nprojs = -1 147 gemm_nonlop_kpt(ikpt)%ngrads = -1 148 end do 149 150 end subroutine init_gemm_nonlop
m_gemm_nonlop/make_gemm_nonlop [ Functions ]
[ Top ] [ m_gemm_nonlop ] [ Functions ]
NAME
make_gemm_nonlop
FUNCTION
Build the gemm_nonlop array
INPUTS
SOURCE
242 subroutine make_gemm_nonlop(ikpt,npw,lmnmax,ntypat,indlmn,nattyp,istwf_k,ucvol,ffnl_k, & 243 & ph3d_k,kpt_k,kg_k,kpg_k, & 244 & compute_grad_strain,compute_grad_atom) ! Optional parameters 245 246 integer, intent(in) :: ikpt 247 integer, intent(in) :: npw, lmnmax, ntypat 248 integer, intent(in) :: indlmn(:,:,:), kg_k(:,:) 249 integer, intent(in) :: nattyp(ntypat) 250 integer, intent(in) :: istwf_k 251 logical, intent(in), optional :: compute_grad_strain,compute_grad_atom 252 real(dp), intent(in) :: ucvol 253 real(dp), intent(in) :: ffnl_k(:,:,:,:) 254 real(dp), intent(in) :: ph3d_k(:,:,:) 255 real(dp), intent(in) :: kpt_k(:) 256 real(dp), intent(in), target :: kpg_k(:,:) 257 258 integer :: nprojs,ndprojs,ngrads 259 260 integer,parameter :: alpha(6)=(/1,2,3,3,3,2/),beta(6)=(/1,2,3,2,1,1/) 261 integer :: itypat, ilmn, nlmn, ia, iaph3d, igrad, shift, shift_grad 262 integer :: il, ipw, idir, idir1, idir2, nkpg_local 263 logical :: parity,my_compute_grad_strain,my_compute_grad_atom 264 real(dp),allocatable :: atom_projs(:,:,:), atom_dprojs(:,:,:,:), temp(:) 265 real(dp),pointer :: kpg(:,:) 266 integer :: rank, nprocs, ierr 267 integer :: nprojs_blk, nprojs_last_blk, nprojs_my_blk 268 integer :: lmn_beg,ibeg,iend,shift_do,nlmn_o,lmn_grad_beg 269 logical :: is_last_rank 270 real(dp),allocatable :: dprojs_tmp(:,:,:),dprojs_r_tmp(:,:,:),dprojs_i_tmp(:,:,:) 271 272 ! ************************************************************************* 273 274 my_compute_grad_strain=.false. ; if (present(compute_grad_strain)) my_compute_grad_strain=compute_grad_strain 275 my_compute_grad_atom=.false. ; if (present(compute_grad_atom)) my_compute_grad_atom=compute_grad_atom 276 ABI_CHECK(size(ph3d_k)>0,'nloalg(2)<0 not compatible with use_gemm_nonlop=1!') 277 ! ABI_CHECK((.not.my_compute_grad_strain).or.size(kpg_k)>0,"kpg_k should be allocated to compute gradients!") 278 ! ABI_CHECK((.not.my_compute_grad_atom).or.size(kpg_k)>0,"kpg_k should be allocated to compute gradients!") 279 280 iaph3d = 1 281 282 ABI_MALLOC(atom_projs, (2, npw, lmnmax)) 283 if (my_compute_grad_strain) then 284 ndprojs = 3 285 ABI_MALLOC(atom_dprojs, (2, npw, ndprojs, lmnmax)) 286 else 287 ndprojs = 0 288 end if 289 290 ABI_MALLOC(temp, (npw)) 291 292 call free_gemm_nonlop_ikpt(ikpt) 293 294 ! build nprojs, ngrads 295 nprojs = 0 ; ngrads = 0 296 do itypat=1,ntypat 297 nprojs = nprojs + count(indlmn(3,:,itypat)>0)*nattyp(itypat) 298 end do 299 if (my_compute_grad_strain) ngrads=ngrads+6 300 if (my_compute_grad_atom) ngrads=ngrads+3 301 if (nprojs>0) gemm_nonlop_kpt(ikpt)%nprojs = nprojs 302 if (ngrads>0) gemm_nonlop_kpt(ikpt)%ngrads = ngrads 303 304 if(gemm_nonlop_is_distributed) then 305 rank = xmpi_comm_rank(xmpi_world); nprocs = xmpi_comm_size(xmpi_world) 306 is_last_rank = (rank==nprocs-1) 307 if(gemm_nonlop_block_comm/=xmpi_comm_null) call xmpi_comm_free(gemm_nonlop_block_comm) 308 if(gemm_nonlop_nblocks==1) gemm_nonlop_nblocks=nprocs 309 write(std_out,*) "Splitting on ", gemm_nonlop_nblocks 310 call xmpi_comm_split(xmpi_world, rank/(nprocs/gemm_nonlop_nblocks), rank, gemm_nonlop_block_comm, ierr) 311 if(ierr/=0) ABI_BUG("Bug split!") 312 end if 313 rank = xmpi_comm_rank(gemm_nonlop_block_comm); nprocs = xmpi_comm_size(gemm_nonlop_block_comm) 314 is_last_rank = (rank==nprocs-1) 315 316 if(gemm_nonlop_is_distributed) then 317 nprojs_blk = nprojs / nprocs 318 nprojs_last_blk = nprojs_blk + modulo(nprojs,nprojs_blk) 319 gemm_nonlop_kpt(ikpt)%nprojs_blk = nprojs_blk 320 gemm_nonlop_kpt(ikpt)%nprojs_last_blk = nprojs_last_blk 321 if(is_last_rank) then 322 nprojs_my_blk = nprojs_last_blk 323 else 324 nprojs_my_blk = nprojs_blk 325 end if 326 else 327 nprojs_last_blk = nprojs 328 nprojs_my_blk = nprojs 329 nprojs_blk = nprojs 330 end if 331 332 if(gemm_nonlop_is_distributed .and. ngrads>0) then 333 !Temporary buffer to help distribute dprojs correctly (and easily) 334 if(istwf_k <= 1) then 335 ABI_MALLOC(dprojs_tmp, (2, npw, nprojs_my_blk*ngrads*2)) 336 else 337 ABI_MALLOC(dprojs_r_tmp, (1, npw, nprojs_my_blk*ngrads*2)) 338 ABI_MALLOC(dprojs_i_tmp, (1, npw, nprojs_my_blk*ngrads*2)) 339 end if 340 end if 341 342 if(istwf_k <= 1) then 343 ABI_MALLOC(gemm_nonlop_kpt(ikpt)%projs, (2, npw, nprojs_last_blk)) 344 gemm_nonlop_kpt(ikpt)%projs = zero 345 if(ngrads>0) then 346 ABI_MALLOC(gemm_nonlop_kpt(ikpt)%dprojs, (2, npw, nprojs_last_blk*ngrads)) 347 gemm_nonlop_kpt(ikpt)%dprojs = zero 348 end if 349 else 350 ABI_MALLOC(gemm_nonlop_kpt(ikpt)%projs_r, (1, npw, nprojs_last_blk)) 351 ABI_MALLOC(gemm_nonlop_kpt(ikpt)%projs_i, (1, npw, nprojs_last_blk)) 352 gemm_nonlop_kpt(ikpt)%projs_r = zero 353 gemm_nonlop_kpt(ikpt)%projs_i = zero 354 if(ngrads>0) then 355 ABI_MALLOC(gemm_nonlop_kpt(ikpt)%dprojs_r, (1, npw, nprojs_last_blk*ngrads)) 356 ABI_MALLOC(gemm_nonlop_kpt(ikpt)%dprojs_i, (1, npw, nprojs_last_blk*ngrads)) 357 gemm_nonlop_kpt(ikpt)%dprojs_r = zero 358 gemm_nonlop_kpt(ikpt)%dprojs_i = zero 359 end if 360 end if 361 362 ! Compute (k+G) vectors if needed 363 nkpg_local=0 364 if ((my_compute_grad_strain.or.my_compute_grad_atom).and.size(kpg_k)==0) then 365 nkpg_local=3 366 ABI_MALLOC(kpg,(npw,nkpg_local)) 367 call mkkpg(kg_k,kpg,kpt_k,nkpg_local,npw) 368 else 369 kpg => kpg_k 370 end if 371 372 if(gemm_nonlop_is_distributed) then 373 ibeg = rank*nprojs_blk+1 374 iend = ibeg+nprojs_my_blk 375 shift_do = 0 376 lmn_grad_beg = -1 377 end if 378 379 shift = 0 ; shift_grad = 0 380 lmn_beg = 1 381 382 do itypat = 1, ntypat 383 nlmn = count(indlmn(3,:,itypat)>0) 384 nlmn_o = nlmn 385 386 do ia = 1, nattyp(itypat) 387 388 ! In distributed mode, loops are skipped until reach the section 389 ! of "ilmn" to be stored by local rank 390 if(gemm_nonlop_is_distributed) then 391 if(shift_do+nlmn < ibeg) then 392 shift_do = shift_do + nlmn 393 iaph3d = iaph3d + 1 394 cycle 395 end if 396 397 lmn_beg = max(1,ibeg-shift_do) 398 if(lmn_grad_beg==-1) lmn_grad_beg = (lmn_beg-1)*ngrads 399 if(shift_do+nlmn > iend) nlmn = iend - shift_do - 1 400 end if 401 402 !! build atom_projs, from opernlb 403 !! P = 4pi/sqrt(ucvol)* conj(diag(ph3d)) * ffnl * diag(parity), with parity = (-i)^l 404 405 ! start from 4pi/sqrt(ucvol)*ffnl 406 ! atom_projs(1, :, lmn_beg:nlmn) = four_pi/sqrt(ham%ucvol) * ham%ffnl_k(:, 1, lmn_beg:nlmn) 407 ! TODO vectorize (DCOPY with stride) 408 atom_projs(:,:,:) = zero 409 do ipw=1, npw 410 atom_projs(1,ipw, 1:nlmn_o) = four_pi/sqrt(ucvol) * ffnl_k(ipw, 1, 1:nlmn_o, itypat) 411 end do 412 if (ndprojs>0) atom_dprojs(:,:,:,:) = zero 413 if (my_compute_grad_strain) then 414 do ipw=1, npw 415 atom_dprojs(1,ipw, 1:3, 1:nlmn_o) = four_pi/sqrt(ucvol) * ffnl_k(ipw, 2:4, 1:nlmn_o, itypat) 416 end do 417 end if 418 419 ! multiply by (-i)^l 420 do ilmn=1,nlmn_o 421 il=mod(indlmn(1,ilmn, itypat),4); 422 parity=(mod(il,2)==0) 423 if (il>1) then 424 ! multiply by -1 425 atom_projs(:,:,ilmn) = -atom_projs(:,:,ilmn) 426 if (ndprojs>0) atom_dprojs(:,:,:,ilmn) = -atom_dprojs(:,:,:,ilmn) 427 end if 428 if(.not. parity) then 429 ! multiply by -i 430 temp = atom_projs(2,:,ilmn) 431 atom_projs(2,:,ilmn) = -atom_projs(1,:,ilmn) 432 atom_projs(1,:,ilmn) = temp 433 if (ndprojs>0) then 434 do idir=1,ndprojs 435 temp = atom_dprojs(2,:,idir,ilmn) 436 atom_dprojs(2,:,idir,ilmn) = -atom_dprojs(1,:,idir,ilmn) 437 atom_dprojs(1,:,idir,ilmn) = temp 438 end do 439 end if 440 end if 441 end do 442 443 ! multiply by conj(ph3d) 444 do ilmn=1,nlmn_o 445 temp = atom_projs(1, :, ilmn) 446 atom_projs(1, :, ilmn) = atom_projs(1, :, ilmn) * ph3d_k(1, :, iaph3d) & 447 & + atom_projs(2, :, ilmn) * ph3d_k(2, :, iaph3d) 448 atom_projs(2, :, ilmn) = atom_projs(2, :, ilmn) * ph3d_k(1, :, iaph3d) & 449 & - temp * ph3d_k(2, :, iaph3d) 450 end do 451 if (ndprojs>0) then 452 do ilmn=1,nlmn_o 453 do idir=1,ndprojs 454 temp = atom_dprojs(1, :, idir,ilmn) 455 atom_dprojs(1, :, idir,ilmn) = atom_dprojs(1, :, idir,ilmn) * ph3d_k(1, :, iaph3d) & 456 & + atom_dprojs(2, :, idir,ilmn) * ph3d_k(2, :, iaph3d) 457 atom_dprojs(2, :, idir,ilmn) = atom_dprojs(2, :, idir,ilmn) * ph3d_k(1, :, iaph3d) & 458 & - temp * ph3d_k(2, :, iaph3d) 459 end do 460 end do 461 end if 462 463 !! atom_projs is built, copy to projs 464 465 if(istwf_k <= 1) then 466 gemm_nonlop_kpt(ikpt)%projs(1:2, :, shift+1:shift+(nlmn-(lmn_beg-1))) = atom_projs(:, :, lmn_beg:nlmn) 467 else ! istwf_k>1 468 gemm_nonlop_kpt(ikpt)%projs_r(1, :, shift+1:shift+(nlmn-(lmn_beg-1))) = atom_projs(1, :, lmn_beg:nlmn) 469 gemm_nonlop_kpt(ikpt)%projs_i(1, :, shift+1:shift+(nlmn-(lmn_beg-1))) = atom_projs(2, :, lmn_beg:nlmn) 470 end if 471 472 !! Handling dprojs 473 474 ! In distributed case, we compute values for all nlmn*ngrad in dprojs_tmp. 475 ! dprojs will be filled by "cutting" relevant values out 476 if(ngrads>0) then 477 if(gemm_nonlop_is_distributed) then 478 if(istwf_k <= 1) then 479 igrad=0 480 if(my_compute_grad_strain) then 481 do idir=1,6 482 idir1=alpha(idir);idir2=beta(idir) 483 do ilmn=1,nlmn_o 484 do ipw=1,npw 485 dprojs_tmp(1:2, ipw, shift_grad+nlmn_o*igrad+ilmn) = & 486 & -half*(atom_dprojs(1:2, ipw, idir1, ilmn)*kpg(ipw,idir2) & 487 & +atom_dprojs(1:2, ipw, idir2, ilmn)*kpg(ipw,idir1)) 488 end do 489 end do 490 igrad=igrad+1 491 end do 492 end if 493 if(my_compute_grad_atom) then 494 do idir=1,3 495 do ilmn=1,nlmn_o 496 do ipw=1,npw 497 dprojs_tmp(1, ipw, shift_grad+nlmn_o*igrad+ilmn) = & 498 & +atom_projs(2, ipw, ilmn)*kpg(ipw,idir)*two_pi 499 dprojs_tmp(2, ipw, shift_grad+nlmn_o*igrad+ilmn) = & 500 & -atom_projs(1, ipw, ilmn)*kpg(ipw,idir)*two_pi 501 end do 502 end do 503 igrad=igrad+1 504 end do 505 end if 506 507 else ! istwf_k>1 508 igrad=0 509 if(my_compute_grad_strain) then 510 do idir=1,6 511 idir1=alpha(idir);idir2=beta(idir) 512 do ilmn=1,nlmn_o 513 do ipw=1,npw 514 dprojs_r_tmp(1, ipw, shift_grad+nlmn_o*igrad+ilmn) = & 515 & -half*(atom_dprojs(1, ipw, idir1, ilmn)*kpg(ipw,idir2) & 516 & +atom_dprojs(1, ipw, idir2, ilmn)*kpg(ipw,idir1)) 517 518 dprojs_i_tmp(1, ipw, shift_grad+nlmn_o*igrad+ilmn) = & 519 & -half*(atom_dprojs(2, ipw, idir1, ilmn)*kpg(ipw,idir2) & 520 & +atom_dprojs(2, ipw, idir2, ilmn)*kpg(ipw,idir1)) 521 end do 522 end do 523 igrad=igrad+1 524 end do 525 end if 526 if(my_compute_grad_atom) then 527 do idir=1,3 528 do ilmn=1,nlmn 529 do ipw=1,npw 530 dprojs_r_tmp(1, ipw, shift_grad+nlmn_o*igrad+ilmn) = & 531 & +atom_projs(2, ipw, ilmn)*kpg(ipw,idir)*two_pi 532 dprojs_i_tmp(1, ipw, shift_grad+nlmn_o*igrad+ilmn) = & 533 & -atom_projs(1, ipw, ilmn)*kpg(ipw,idir)*two_pi 534 end do 535 end do 536 igrad=igrad+1 537 end do 538 end if 539 end if ! istwf_k 540 else 541 if(istwf_k <= 1) then 542 igrad=0 543 if(my_compute_grad_strain) then 544 do idir=1,6 545 idir1=alpha(idir);idir2=beta(idir) 546 do ilmn=lmn_beg,nlmn 547 do ipw=1,npw 548 gemm_nonlop_kpt(ikpt)%dprojs(1:2, ipw, shift_grad+nlmn*igrad+ilmn) = & 549 & -half*(atom_dprojs(1:2, ipw, idir1, ilmn)*kpg(ipw,idir2) & 550 & +atom_dprojs(1:2, ipw, idir2, ilmn)*kpg(ipw,idir1)) 551 end do 552 end do 553 igrad=igrad+1 554 end do 555 end if 556 if(my_compute_grad_atom) then 557 do idir=1,3 558 do ilmn=lmn_beg,nlmn 559 do ipw=1,npw 560 gemm_nonlop_kpt(ikpt)%dprojs(1, ipw, shift_grad+nlmn*igrad+ilmn) = & 561 & +atom_projs(2, ipw, ilmn)*kpg(ipw,idir)*two_pi 562 gemm_nonlop_kpt(ikpt)%dprojs(2, ipw, shift_grad+nlmn*igrad+ilmn) = & 563 & -atom_projs(1, ipw, ilmn)*kpg(ipw,idir)*two_pi 564 end do 565 end do 566 igrad=igrad+1 567 end do 568 end if 569 else ! istwf_k>1 570 igrad=0 571 if(my_compute_grad_strain) then 572 do idir=1,6 573 idir1=alpha(idir);idir2=beta(idir) 574 do ilmn=lmn_beg,nlmn 575 do ipw=1,npw 576 gemm_nonlop_kpt(ikpt)%dprojs_r(1, ipw, shift_grad+nlmn*igrad+ilmn) = & 577 & -half*(atom_dprojs(1, ipw, idir1, ilmn)*kpg(ipw,idir2) & 578 & +atom_dprojs(1, ipw, idir2, ilmn)*kpg(ipw,idir1)) 579 580 gemm_nonlop_kpt(ikpt)%dprojs_i(1, ipw, shift_grad+nlmn*igrad+ilmn) = & 581 & -half*(atom_dprojs(2, ipw, idir1, ilmn)*kpg(ipw,idir2) & 582 & +atom_dprojs(2, ipw, idir2, ilmn)*kpg(ipw,idir1)) 583 end do 584 end do 585 igrad=igrad+1 586 end do 587 end if 588 if(my_compute_grad_atom) then 589 do idir=1,3 590 do ilmn=lmn_beg,nlmn 591 do ipw=1,npw 592 gemm_nonlop_kpt(ikpt)%dprojs_r(1, ipw, shift_grad+nlmn*igrad+ilmn) = & 593 & +atom_projs(2, ipw, ilmn)*kpg(ipw,idir)*two_pi 594 gemm_nonlop_kpt(ikpt)%dprojs_i(1, ipw, shift_grad+nlmn*igrad+ilmn) = & 595 & -atom_projs(1, ipw, ilmn)*kpg(ipw,idir)*two_pi 596 end do 597 end do 598 igrad=igrad+1 599 end do 600 end if 601 end if ! istwf_k 602 end if ! gemm_nonlop_is_distributed 603 end if ! ngrads 604 605 iaph3d = iaph3d + 1 606 shift_grad = shift_grad + ngrads*nlmn_o 607 608 if(gemm_nonlop_is_distributed) then 609 shift = shift + nlmn - (lmn_beg-1) 610 shift_do = shift_do + nlmn 611 if(shift_do >= iend - 1) exit 612 else 613 shift = shift + nlmn 614 end if 615 end do 616 if(gemm_nonlop_is_distributed .and. shift_do >= iend - 1) exit 617 end do 618 619 ! Filling dprojs by extracting values from dprojs_tmp 620 if(gemm_nonlop_is_distributed .and. ngrads>0) then 621 shift_grad = lmn_grad_beg 622 if(istwf_k <= 1) then 623 gemm_nonlop_kpt(ikpt)%dprojs(1:2, 1:npw, 1:ngrads*nprojs_my_blk) = & 624 & dprojs_tmp(1:2, 1:npw, shift_grad+1:shift_grad+ngrads*nprojs_my_blk) 625 else 626 gemm_nonlop_kpt(ikpt)%dprojs_r(1, 1:npw, 1:ngrads*nprojs_my_blk) = & 627 & dprojs_r_tmp(1, 1:npw, shift_grad+1:shift_grad+ngrads*nprojs_my_blk) 628 gemm_nonlop_kpt(ikpt)%dprojs_i(1, 1:npw, 1:ngrads*nprojs_my_blk) = & 629 & dprojs_i_tmp(1, 1:npw, shift_grad+1:shift_grad+ngrads*nprojs_my_blk) 630 end if 631 end if 632 633 if(gemm_nonlop_is_distributed .and. ngrads>0) then 634 if(istwf_k <= 1) then 635 ABI_FREE(dprojs_tmp) 636 else 637 ABI_FREE(dprojs_r_tmp) 638 ABI_FREE(dprojs_i_tmp) 639 end if 640 end if 641 ABI_FREE(atom_projs) 642 ABI_FREE(temp) 643 if (allocated(atom_dprojs)) then 644 ABI_FREE(atom_dprojs) 645 end if 646 if (nkpg_local>0) then 647 ABI_FREE(kpg) 648 end if 649 end subroutine make_gemm_nonlop