TABLE OF CONTENTS
ABINIT/compute_kgb_indicator [ Functions ]
NAME
compute_kgb_indicator
FUNCTION
Only for "KGB" parallelism (LOBPCG algorithm for Ground-state): Give an indicator of performance for a given distribution of processors (npband, npfft and bandpp). Determine best choice of parameters for Scalapack and/or Magma Linear Algebra routines.
INPUTS
bandpp=internal lobpcg optimization variable glb_comm=communicator for global MPI communications mband=maximum number of bands. mband=maximum number of plane waves npband=number of processor 'band' npfft = number of processor 'fft' uselinalggpu=indicate if we also test the gpu linear algebra
OUTPUT
acc_kgb = indicator of performance npslk = number of process to used in communicators
SIDE EFFECTS
This routine can be used to find an indicator in order to refine automatic process distribution. This indicator is returned in acc_kgb This routine can be used to find the optimal values of np_slk parameter (ScaLapack) and wheter or not we should use Magma for Linear Algebra in lobpcgwf
SOURCE
2001 subroutine compute_kgb_indicator(acc_kgb,bandpp,glb_comm,mband,mpw,npband,npfft,npslk,uselinalggpu) 2002 2003 use m_abi_linalg 2004 2005 !Arguments ------------------------------------ 2006 !scalars 2007 integer,intent(in) :: bandpp,glb_comm,mband,mpw,npband,npfft 2008 integer,intent(inout) :: npslk,uselinalggpu 2009 real(dp),intent(inout) :: acc_kgb 2010 2011 !Local variables------------------------------- 2012 !scalars 2013 integer,parameter :: max_number_of_npslk=10,max_number_of_iter=10 2014 integer :: blocksize,bigorder,ierr,ii,islk,islk1,iter,jj,keep_gpu 2015 integer :: kgb_comm,my_rank,np_slk,np_slk_max,np_slk_best,nranks 2016 integer :: use_lapack_gpu,use_slk,vectsize,wfoptalg 2017 real(dp) :: min_eigen,min_ortho,time_xeigen,time_xortho 2018 character(len=500) :: msg 2019 !arrays 2020 integer,allocatable :: ranks(:),val_npslk(:) 2021 real(dp),allocatable :: eigen(:),grama(:,:),gramb(:,:) 2022 complex(dpc),allocatable :: blockvectorbx(:,:),blockvectorx(:,:),sqgram(:,:) 2023 2024 !****************************************************************** 2025 2026 DBG_ENTER("COLL") 2027 2028 #ifdef DEBUG_MODE 2029 write(msg,'(a,3i3)') 'compute_kgb_indicator : (bpp,npb,npf) = ', bandpp, npband, npfft 2030 call wrtout(std_out,msg,'PERS') 2031 #endif 2032 2033 !Create local communicator for test 2034 if (xmpi_paral==1) then 2035 nranks=npfft*npband 2036 ABI_MALLOC(ranks,(nranks)) 2037 ranks=(/((my_rank-1),my_rank=1,nranks)/) 2038 kgb_comm=xmpi_subcomm(glb_comm,nranks,ranks,my_rank_in_group=my_rank) 2039 ABI_FREE(ranks) 2040 else 2041 kgb_comm=xmpi_comm_self 2042 my_rank=0 2043 end if 2044 2045 !Only for process in the new subgroup 2046 if (my_rank/=xmpi_undefined) then 2047 2048 ! We enforce vectsize >=blocksize (This is not true in lobpcg but 2049 ! these are rare cases and this simplify the matrix constructions below...) 2050 blocksize=npband*bandpp 2051 vectsize=max(1+mpw/(npband*npfft),blocksize) 2052 bigorder=3*blocksize 2053 2054 ABI_MALLOC(blockvectorx,(vectsize,blocksize)) 2055 ABI_MALLOC(blockvectorbx,(vectsize,blocksize)) 2056 ABI_MALLOC(sqgram,(blocksize,blocksize)) 2057 ABI_MALLOC(grama,(2*bigorder,bigorder)) 2058 ABI_MALLOC(gramb,(2*bigorder,bigorder)) 2059 ABI_MALLOC(eigen,(bigorder)) 2060 ABI_MALLOC(val_npslk,(max_number_of_npslk)) ! not too much values tested 2061 2062 min_eigen=greatest_real 2063 min_ortho=greatest_real 2064 np_slk_best=-1 ; np_slk_max=0 2065 #ifdef HAVE_LINALG_SCALAPACK 2066 np_slk_max=min(max_number_of_npslk,npband*npfft) 2067 #endif 2068 2069 ! Preselect a range of available np_slk values 2070 val_npslk(1:)=0 ; val_npslk(2)=1 2071 do islk=3,np_slk_max 2072 np_slk=val_npslk(islk-1)*2 2073 do while ((modulo(npband*npfft,np_slk)>0).and.(np_slk<(npband*npfft))) 2074 np_slk=np_slk+1 2075 end do 2076 if(np_slk>(npband*npfft).or.np_slk>mband) exit 2077 val_npslk(islk)=np_slk 2078 end do 2079 np_slk_max=islk-1 2080 2081 ! Loop over np_slk values 2082 islk1=1 2083 #ifdef HAVE_LINALG_MAGMA 2084 islk1=1-uselinalggpu 2085 #endif 2086 do islk=islk1,np_slk_max 2087 2088 time_xortho=zero ; time_xeigen=zero 2089 2090 use_slk=0 2091 if (islk==0) then 2092 ! This is the test for the GPU 2093 use_lapack_gpu=1 ; np_slk=0 2094 else 2095 use_lapack_gpu=0 ; np_slk=val_npslk(islk) 2096 if (np_slk>0) use_slk=1 2097 end if 2098 2099 ! Initialize linalg parameters for this np_slk value 2100 ! For the first np_slk value, everything is initialized 2101 ! For the following np_slk values, only Scalapack parameters are updated 2102 wfoptalg=14 ! Simulate use of LOBPCG 2103 call abi_linalg_init(bigorder,RUNL_GSTATE,wfoptalg,1,& 2104 & use_lapack_gpu,use_slk,np_slk,kgb_comm) 2105 2106 ! We could do mband/blocksize iter as in lobpcg but it's too long 2107 do iter=1,max_number_of_iter 2108 2109 ! Build matrixes 2110 do ii=1,vectsize 2111 do jj=1,blocksize 2112 if (ii>jj) then 2113 blockvectorx(ii,jj) =czero 2114 blockvectorbx(ii,jj)=czero 2115 else 2116 blockvectorx(ii,jj) =cone 2117 blockvectorbx(ii,jj)=cone 2118 end if 2119 end do 2120 end do 2121 grama=zero;gramb=zero 2122 do jj=1,bigorder 2123 do ii=jj,bigorder 2124 if (ii==jj) then 2125 grama(2*ii-1,jj)=one 2126 gramb(2*ii-1,jj)=one 2127 else 2128 grama(2*ii-1:2*ii,jj)=one 2129 grama(2*jj-1,ii)= one 2130 grama(2*jj ,ii)=-one 2131 end if 2132 end do 2133 end do 2134 2135 ! Call to abi_xorthonormalize 2136 time_xortho=time_xortho-abi_wtime() 2137 call abi_xorthonormalize(blockvectorx,blockvectorbx,blocksize,kgb_comm,sqgram,vectsize) 2138 time_xortho = time_xortho + abi_wtime() 2139 2140 ! Call to abi_xhegv 2141 time_xeigen=time_xeigen-abi_wtime() 2142 call abi_xhegv(1,'v','u',bigorder,grama,bigorder,gramb,bigorder,eigen,& 2143 & x_cplx=2,use_slk=use_slk,use_gpu=use_lapack_gpu) 2144 time_xeigen=time_xeigen+abi_wtime() 2145 2146 end do ! iter 2147 2148 ! Finalize linalg parameters for this np_slk value 2149 ! For the last np_slk value, everything is finalized 2150 ! For the previous np_slk values, only Scalapack parameters are updated 2151 call abi_linalg_finalize() 2152 2153 time_xortho= time_xortho*mband/blocksize 2154 time_xeigen= time_xeigen*mband/blocksize 2155 if (time_xortho<min_ortho) min_ortho=time_xortho 2156 if (time_xeigen<min_eigen) then 2157 min_eigen=time_xeigen 2158 np_slk_best=np_slk 2159 keep_gpu=use_lapack_gpu 2160 end if 2161 2162 end do ! np_slk 2163 2164 #ifdef DEBUG_MODE 2165 write(msg,'(2(a,es15.3),a,i3)') ' In the best case, xortho took ',min_ortho,& 2166 ' and xeigen took ',min_eigen,' for np_slk=',np_slk_best 2167 call wrtout(std_out,msg,'PERS') 2168 #endif 2169 2170 ! Final values to be sent to others process 2171 acc_kgb=min_ortho+four*min_eigen 2172 npslk=max(np_slk_best,1) 2173 uselinalggpu=keep_gpu 2174 2175 ABI_FREE(blockvectorx) 2176 ABI_FREE(blockvectorbx) 2177 ABI_FREE(sqgram) 2178 ABI_FREE(grama) 2179 ABI_FREE(gramb) 2180 ABI_FREE(eigen) 2181 ABI_FREE(val_npslk) 2182 2183 end if ! my_rank in group 2184 2185 !Free local MPI communicator 2186 call xmpi_comm_free(kgb_comm) 2187 2188 !Broadcast of results to be sure every process has them 2189 call xmpi_bcast(acc_kgb,0,glb_comm,ierr) 2190 call xmpi_bcast(npslk,0,glb_comm,ierr) 2191 call xmpi_bcast(uselinalggpu,0,glb_comm,ierr) 2192 2193 #ifndef DEBUG_MODE 2194 ABI_UNUSED(msg) 2195 #endif 2196 2197 DBG_EXIT("COLL") 2198 2199 end subroutine compute_kgb_indicator
ABINIT/finddistrproc [ Functions ]
NAME
finddistrproc
FUNCTION
Given a total number of processors, find a suitable distribution that fill all the different levels of parallelization (npimage, nppert, np_spkpt, npspinor, npband, npfft, bandpp) Also determine parameters of parallel Linear Algebra routines (use_slk, np_slk, gpu_linalg_limit)
INPUTS
dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables, for all datasets; at this stage only datasets with index lower than idtset are already initalized filnam(5)=character strings giving file names idtset=number of the current dataset mpi_enreg=information about MPI parallelization mband=maximum number of bands. ndtset_alloc=number of datasets, corrected for allocation of at least one data set tread(11)=flags indicating wether parallel input parameters were read from input file tread(1) : paral_kgb tread(6) : npfft tread(2) : npimage tread(7) : npband tread(3) : nppert tread(8) : bandpp tread(4) : np_spkpt tread(9) : use_slk tread(5) : nspinor tread(10): np_slk tread(11) : gpu_linalg_limit
SIDE EFFECTS
iexit= if incremented, an exit is required dtset%paral_kgb= flag for band-fft parallelism dtset%npimage = number of processors for parallelisation over image dtset%nppert = number of processors for parallelisation over perturbations dtset%npspinor = number of processors for parallelisation on spinor components dtset%np_spkpt = number of processors for parallelisation on spin / k points dtset%npfft = number of processors for parallelisation on fft grid dtset%npband = number of processors for parallelisation on bands dtset%nphf = number of processors for parallelisation on occupied states for fock exchange dtset%bandpp = internal parameter for lobpcg parallelisation algorithm dtset%use_slk = flag for ScalaPAck use dtset%np_slk = number of processors used in ScaLapack routines dtset%gpu_linalg_limit=threshold activating Linear Algebra on GPU
SOURCE
1056 subroutine finddistrproc(dtsets,filnam,idtset,iexit,mband,mpi_enreg,ndtset_alloc,tread) 1057 1058 !Arguments ------------------------------------ 1059 !scalars 1060 integer,intent(in) :: idtset,mband,ndtset_alloc 1061 integer,intent(inout) :: iexit 1062 type(dataset_type),intent(inout),target :: dtsets(0:ndtset_alloc) 1063 type(MPI_type),intent(inout) :: mpi_enreg 1064 !arrays 1065 integer,intent(in) :: tread(11) 1066 character(len=fnlen),intent(in) :: filnam(5) 1067 1068 !Local variables------------------------------- 1069 !scalars 1070 !128 should be a reasonable maximum for npfft (scaling is very poor for npfft>20) 1071 integer,parameter :: ALGO_NOT_SET=-1, ALGO_DEFAULT_PAR=2 1072 integer,parameter :: ALGO_CG=0, ALGO_LOBPCG_OLD=1, ALGO_LOBPCG_NEW=2, ALGO_CHEBFI=3, ALGO_CHEBFI_NEW=4 1073 integer,parameter :: NPFMAX=128,BLOCKSIZE_MAX=3000,MAXBAND_PRINT=10 1074 integer,parameter :: MAXCOUNT=250,MAXPRINT=10,MAXBENCH=25,MAXABIPY=25,NPF_CUTOFF=20 1075 real(dp),parameter :: relative_nband_range=0.025 1076 integer :: wf_algo,wf_algo_global,bpp,bpp_max,bpp_min,optdriver,autoparal,nblocks,blocksize 1077 integer :: npi_max,npi_min,npc,npc_max,npc_min 1078 integer :: np_sk,np_sk_max,np_sk_min,npp_max,npp_min 1079 integer :: nps,nps_max,nps_min,npf,npf_max,npf_min 1080 integer :: npb,npb_max,npb_min,max_ncpus,ount,paral_kgb 1081 integer :: work_size,nks_per_proc,tot_ncpus 1082 integer :: ib1,ib2,ibest,icount,ii,imin,jj,kk,mcount,mcount_eff,mpw 1083 integer :: n2,n3,ncell_eff,ncount,nimage_eff,nkpt_eff,npert_eff 1084 integer :: nproc,nproc1,nprocmin,np_slk,nthreads,use_linalg_gpu,omp_ncpus 1085 logical :: dtset_found,file_found,first_bpp,iam_master 1086 logical :: with_image,with_pert,with_kpt,with_spinor,with_fft,with_band,with_bandpp,with_thread 1087 real(dp):: acc_c,acc_k,acc_kgb,acc_kgb_0,acc_s,ecut_eff,eff,ucvol,weight0 1088 character(len=9) :: suffix 1089 character(len=20) :: strg 1090 character(len=500) :: msg,msgttl 1091 character(len=fnlen) :: filden 1092 type(hdr_type) :: hdr0 1093 !arrays 1094 integer :: idum(1),idum3(3),ngmax(3),ngmin(3) 1095 integer,allocatable :: nband_best(:),isort(:),jdtset_(:) 1096 integer,allocatable :: my_algo(:),my_distp(:,:),nproc_best(:) 1097 integer,pointer :: nkpt_rbz(:) 1098 real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3),rprimd(3,3) 1099 real(dp),allocatable :: weight(:) 1100 real(dp),pointer :: nband_rbz(:,:) 1101 type(dataset_type),pointer :: dtset 1102 1103 !****************************************************************** 1104 1105 DBG_ENTER("COLL") 1106 1107 !Select current dataset 1108 dtset => dtsets(idtset) 1109 1110 !Is automatic parallelization activated? 1111 autoparal = dtset%autoparal 1112 if (autoparal==0) return 1113 1114 !Is it available 1115 if ((dtset%usefock==1).AND.(dtset%nphf/=1)) then 1116 ABI_ERROR("autoparal>0 not available for Hartree-Fock or hybrid XC calculations!") 1117 end if 1118 if ((autoparal>1).and.dtset%wfoptalg/=4.and.dtset%wfoptalg/=14) then 1119 ABI_ERROR("autoparal>1 only available for the old LOBPCG algorithm (wfoptalg=4/14)!") 1120 end if 1121 1122 ! Unit number used for outputting the autoparal sections 1123 ount = ab_out 1124 1125 ! From the documentation: 1126 ! 1127 ! If autoparal > 1 and max_ncpus is greater than 0, ABINIT analyzes the 1128 ! efficiency of the process distribution for each possible number of processors 1129 ! from 2 to max_ncpus. After having printed out the efficiency, the code stops. 1130 1131 ! Handy local variables 1132 iam_master = (mpi_enreg%me==0) 1133 optdriver = dtset%optdriver 1134 max_ncpus = dtset%max_ncpus ; if (dtset%paral_kgb<0) max_ncpus=abs(dtset%paral_kgb) 1135 nthreads=xomp_get_max_threads() 1136 nproc=mpi_enreg%nproc 1137 if (max_ncpus>0) nproc = dtset%max_ncpus/nthreads 1138 if (xmpi_paral==0.and.max_ncpus<=0) nproc=1 1139 1140 nprocmin=2 1141 if (xmpi_paral==1.and.max_ncpus<=0) nprocmin=max(2,nproc-100) 1142 if (max_ncpus>0.and.autoparal/=0) nprocmin=1 1143 1144 wf_algo_global=ALGO_NOT_SET 1145 if (dtset%wfoptalg==0.and.tread(1)==1) wf_algo_global=ALGO_CG 1146 if (dtset%wfoptalg==4.or.dtset%wfoptalg==14) wf_algo_global=ALGO_LOBPCG_OLD 1147 if (dtset%wfoptalg==114) wf_algo_global=ALGO_LOBPCG_NEW 1148 if (dtset%wfoptalg==1) wf_algo_global=ALGO_CHEBFI 1149 if (dtset%wfoptalg==111) wf_algo_global=ALGO_CHEBFI_NEW 1150 1151 ! Some peculiar cases (with direct exit) 1152 ! MG: What is the meaning of max_ncpus < 0. This is not documented! 1153 if (max_ncpus<=0) then 1154 if (nproc==1.and.max_ncpus<=0) then 1155 if (tread(1)==0.or.xmpi_paral==0) dtset%paral_kgb= 0 1156 if (tread(2)==0.or.xmpi_paral==0) dtset%npimage = 1 1157 if (tread(3)==0.or.xmpi_paral==0) dtset%nppert = 1 1158 if (tread(4)==0.or.xmpi_paral==0) dtset%npspinor = 1 1159 if (tread(5)==0.or.xmpi_paral==0) dtset%np_spkpt = 1 1160 if (tread(6)==0.or.xmpi_paral==0) dtset%npfft = 1 1161 if (tread(7)==0.or.xmpi_paral==0) dtset%npband = 1 1162 if (tread(8)==0.or.xmpi_paral==0) dtset%bandpp = 1 1163 if (tread(9)==0.or.xmpi_paral==0) dtset%use_slk = 0 1164 if (tread(10)==0.or.xmpi_paral==0) dtset%np_slk = 1000000 1165 return 1166 end if 1167 if ((optdriver/=RUNL_GSTATE.and. optdriver/=RUNL_RESPFN.and. optdriver/=RUNL_GWLS).or. & 1168 (optdriver==RUNL_GSTATE.and.dtset%usewvl==1)) then 1169 dtset%paral_kgb= 0 1170 dtset%npimage = max(1,dtset%npimage) 1171 dtset%nppert = max(1,dtset%nppert) 1172 dtset%npspinor = max(1,dtset%npspinor) 1173 dtset%np_spkpt = max(1,dtset%np_spkpt) 1174 dtset%npfft = max(1,dtset%npfft) 1175 dtset%npband = max(1,dtset%npband) 1176 dtset%bandpp = max(1,dtset%bandpp) 1177 return 1178 end if 1179 end if 1180 1181 ! Need the metric tensor 1182 call mkrdim(dtset%acell_orig(1:3,1),dtset%rprim_orig(1:3,1:3,1),rprimd) 1183 call metric(gmet,gprimd,-1,rmet,rprimd,ucvol) 1184 1185 ! Determine some quantities related to plane waves 1186 ! - Crude estimation of the number of PW 1187 ! - Number of G vectors in each direction 1188 mpw=0;ngmin=0;ngmax=0 1189 if (optdriver==RUNL_GSTATE) then 1190 ecut_eff = dtset%ecut*dtset%dilatmx**2 1191 mpw = nint(ucvol*((two*ecut_eff)**1.5_dp)/(six*pi**2)) ! Crude estimation 1192 if (all(dtset%istwfk(1:dtset%nkpt)>1)) mpw=mpw/2+1 1193 call kpgcount(ecut_eff,dtset%exchn2n3d,gmet,dtset%istwfk,dtset%kpt,ngmax,ngmin,dtset%nkpt) 1194 write(msg,'(a,i0)') ' getmpw sequential formula gave: ',mpw 1195 call wrtout(std_out,msg) 1196 end if 1197 1198 ! Parallelization over images 1199 npi_min=1;npi_max=1;nimage_eff=1 1200 if (optdriver==RUNL_GSTATE) then 1201 nimage_eff=dtset%ndynimage 1202 if (dtset%ntimimage<=1) nimage_eff=dtset%nimage 1203 npi_min=max(1,dtset%npimage) 1204 npi_max=min(nproc,nimage_eff) 1205 if (tread(2)==1) npi_max=dtset%npimage 1206 end if 1207 1208 !Parallelization over k-points and spin components (GS) 1209 np_sk_min=1;np_sk_max=1;nkpt_eff=0 1210 if (optdriver==RUNL_GSTATE) then 1211 nkpt_eff=dtset%nkpt*dtset%nsppol 1212 np_sk_min=max(1,dtset%np_spkpt) 1213 np_sk_max=min(nproc,nkpt_eff) 1214 if (tread(4)==1) np_sk_max=dtset%np_spkpt 1215 end if 1216 1217 !Parallelization over perturbations, k-points and spin components (DFPT) 1218 npp_min=1;npp_max=1;npert_eff=1 1219 if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then 1220 if (dtset%paral_rf==1) then 1221 call dtset%get_npert_rbz(nband_rbz, nkpt_rbz, npert_eff) 1222 do jj=1,npert_eff 1223 ii=dtset%nsppol*nkpt_rbz(jj)*maxval(nband_rbz(:,jj)) 1224 nkpt_eff=max(nkpt_eff,ii) 1225 end do 1226 npp_min=max(1,dtset%nppert) 1227 npp_max=min(nproc,npert_eff) 1228 if (tread(3)==1) then 1229 npp_max=dtset%nppert 1230 if (npp_max>npert_eff) then 1231 npp_min=npert_eff;npp_max=npert_eff 1232 ABI_WARNING('nppert is bigger than npert; we set nppert=npert') 1233 end if 1234 end if 1235 np_sk_min=1 1236 np_sk_max=min(nproc,nkpt_eff) 1237 ABI_FREE(nkpt_rbz) 1238 ABI_FREE(nband_rbz) 1239 else 1240 nkpt_eff=nproc 1241 np_sk_min=nproc-5 1242 np_sk_max=nproc 1243 end if 1244 end if 1245 1246 !Parallelization over spinorial components 1247 nps_min=1;nps_max=1 1248 if (optdriver==RUNL_GSTATE) then 1249 nps_min=max(1,dtset%npspinor) 1250 nps_max=min(nproc,dtset%nspinor) 1251 if (tread(5)==1) nps_max=dtset%npspinor 1252 end if 1253 1254 !KGB Parallelization 1255 1256 npf_min=1;npf_max=1 1257 npb_min=1;npb_max=1 1258 bpp_min=1;bpp_max=1 1259 n2=0;n3=0 1260 if (optdriver==RUNL_GSTATE) then 1261 1262 ! >> FFT level 1263 npf_min=max(1,dtset%npfft) 1264 npf_min=min(npf_min,ngmin(2)) 1265 npf_max=min(nproc,NPFMAX) 1266 if (tread(6)==1) then 1267 npf_max=dtset%npfft 1268 if (npf_max>ngmin(2)) then 1269 write(msg,'(3a)') & 1270 "Value of npfft given in input file is too high for the FFT grid!",ch10,& 1271 "Action: decrease npfft or increase FFT grid (ecut, ngfft, ...)." 1272 ABI_ERROR(msg) 1273 end if 1274 end if 1275 npf_max=min(npf_max,ngmin(2)) 1276 ! Deactivate MPI FFT parallelism for GPU 1277 if (dtset%use_gpu_cuda==1) then 1278 npf_min=1;npf_max=1 1279 end if 1280 !Deactivate MPI FFT parallelism for GPU 1281 if (tread(1)==1.and.dtset%paral_kgb==0) then 1282 npf_min=1;npf_max=1 1283 end if 1284 !Deactivate MPI FFT parallelism for multi-threaded LOBPCG / CHEBFI 1285 if ((wf_algo_global==ALGO_LOBPCG_NEW.or.wf_algo_global==ALGO_CHEBFI.or.wf_algo_global==ALGO_CHEBFI_NEW).and.nthreads>1) then 1286 npf_min=1;npf_max=1 1287 end if 1288 1289 ! Number of FFT procs has to be a multiple of FFT grid sizes 1290 ! In case of a restart from a density file, it has to be 1291 ! compatible with the FFT grid used for the density 1292 n2=dtset%ngfft(2) ; n3=dtset%ngfft(3) 1293 if (n2==0.and.n3==0.and.(dtset%getden/=0.or.dtset%irdden/=0.or.dtset%iscf<0)) then 1294 dtset_found=.false.;file_found=.false. 1295 !1-Try to find ngfft from previous dataset 1296 if (dtset%getden/=0) then 1297 do ii=1,ndtset_alloc 1298 jj=dtset%getden;if (jj<0) jj=dtset%jdtset+jj 1299 if (dtsets(ii)%jdtset==jj) then 1300 dtset_found=.true. 1301 n2=dtsets(ii)%ngfftdg(2);n3=dtsets(ii)%ngfftdg(3) 1302 end if 1303 end do 1304 end if 1305 !2-If not found, try to extract ngfft from density file 1306 if (.not.dtset_found) then 1307 !Retrieve file name 1308 suffix='_DEN';if (dtset%nimage>1) suffix='_IMG1_DEN' 1309 ABI_MALLOC(jdtset_,(0:ndtset_alloc)) 1310 jdtset_=0;if(ndtset_alloc/=0) jdtset_(0:ndtset_alloc)=dtsets(0:ndtset_alloc)%jdtset 1311 call mkfilename(filnam,filden,dtset%getden,idtset,dtset%irdden,jdtset_,ndtset_alloc,suffix,'den',ii) 1312 ABI_FREE(jdtset_) 1313 !Retrieve ngfft from file header 1314 idum3=0 1315 if (mpi_enreg%me==0) then 1316 inquire(file=trim(filden),exist=file_found) 1317 if (file_found) then 1318 call hdr_read_from_fname(hdr0,filden,ii,xmpi_comm_self) 1319 idum3(1:2)=hdr0%ngfft(2:3);if (file_found) idum3(3)=1 1320 call hdr0%free() 1321 ABI_WARNING("Cannot find filden"//filden) 1322 end if 1323 end if 1324 call xmpi_bcast(idum3,0,mpi_enreg%comm_world,ii) 1325 n2=idum3(1);n3=idum3(2);file_found=(idum3(3)/=0) 1326 end if 1327 end if 1328 1329 ! >> Band level 1330 npb_min=max(1,dtset%npband) 1331 npb_max=min(nproc,mband) 1332 if (tread(7)==1) npb_max=dtset%npband 1333 if (tread(1)==1.and.dtset%paral_kgb==0) then 1334 npb_min=1;npb_max=1 1335 end if 1336 1337 ! >> banddp level 1338 bpp_min=max(1,dtset%bandpp) 1339 bpp_max=mband 1340 if (wf_algo_global==ALGO_LOBPCG_OLD) bpp_max=max(4,nint(mband/10.)) ! reasonable bandpp max 1341 if (tread(8)==1) bpp_max=dtset%bandpp 1342 if (wf_algo_global==ALGO_CHEBFI) bpp_min=1 ! bandpp not used with ChebFi 1343 if (wf_algo_global==ALGO_CHEBFI) bpp_max=1 1344 if (wf_algo_global==ALGO_CHEBFI_NEW) bpp_min=1 ! bandpp not used with ChebFi 1345 if (wf_algo_global==ALGO_CHEBFI_NEW) bpp_max=1 ! bandpp not used with ChebFi 1346 1347 end if ! RUNL_GSTATE 1348 1349 !Disable KGB parallelisation in some cases: 1350 ! - no GS 1351 ! - paral_kgb=0 present in input file 1352 ! - nstep=0 1353 ! - Hartree-Fock or hybrid calculation (for now on) 1354 if ( (optdriver/=RUNL_GSTATE).or.(dtset%paral_kgb==0.and.tread(1)==1).or. & 1355 (dtset%nstep==0).or.(dtset%usefock==1)) then 1356 nps_min=1; nps_max=1 1357 npf_min=1; npf_max=1 1358 npb_min=1; npb_max=1 1359 bpp_min=1; bpp_max=1 1360 end if 1361 1362 ! Which levels of parallelism do we have? 1363 with_image =(npi_min/=1.or.npi_max/=1) 1364 with_pert =(npp_min/=1.or.npp_max/=1) 1365 with_kpt =(np_sk_min/=1.or.np_sk_max/=1) 1366 with_spinor=(nps_min/=1.or.nps_max/=1) 1367 with_fft =(npf_min/=1.or.npf_max/=1) 1368 with_band =(npb_min/=1.or.npb_max/=1) 1369 with_bandpp=(bpp_min/=1.or.bpp_max/=1) 1370 with_thread=(nthreads>1) 1371 1372 !Allocate lists 1373 ABI_MALLOC(my_distp,(10,MAXCOUNT)) 1374 ABI_MALLOC(weight,(MAXCOUNT)) 1375 ABI_MALLOC(my_algo,(MAXCOUNT)) 1376 my_distp(1:7,:)=0;weight(:)=zero 1377 my_distp(8,:)=dtset%use_slk 1378 my_distp(9,:)=dtset%np_slk 1379 my_distp(10,:)=dtset%gpu_linalg_limit 1380 my_algo(:)=wf_algo_global 1381 icount=0;imin=1 1382 1383 !Cells= images or perturbations 1384 npc_min=1;npc_max=1;ncell_eff=1 1385 if (optdriver==RUNL_GSTATE) then 1386 ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max 1387 end if 1388 if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then 1389 ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max 1390 end if 1391 1392 !Loop over all possibilities 1393 !Computation of weight~"estimated acceleration" 1394 !================================================================ 1395 1396 !Cells= images or perturbations 1397 npc_min=1;npc_max=1;ncell_eff=1 1398 if (optdriver==RUNL_GSTATE) then 1399 ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max 1400 end if 1401 if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then 1402 ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max 1403 end if 1404 1405 !>>>>> CELLS 1406 do npc=npc_min,npc_max 1407 acc_c=one;if (npc>1) acc_c=0.99_dp*speedup_fdp(ncell_eff,npc) 1408 1409 ! >>>>> K-POINTS 1410 do np_sk=np_sk_min,np_sk_max 1411 ! -> for DFPT runs, impose that nsppol divides np_sk 1412 if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE]) .and. modulo(np_sk,dtset%nsppol)>0.and.np_sk>1) cycle 1413 acc_k=one;if (np_sk>1) acc_k=0.96_dp*speedup_fdp(nkpt_eff,np_sk) 1414 1415 ! >>>>> SPINORS 1416 do nps=nps_min,nps_max 1417 acc_s=one;if (nps>1) acc_s=0.85_dp*speedup_fdp(dtset%nspinor,nps) 1418 1419 ! >>>>> FFT 1420 do npf=npf_min,npf_max 1421 ! -> npf should divide ngfft if set (if unset, ngfft=0 so the modulo test is ok) 1422 if((modulo(n2,npf)>0).or.(modulo(n3,npf)>0)) cycle 1423 ! -> npf should be only divisible by 2, 3 or 5 1424 ii=npf 1425 do while (modulo(ii,2)==0) 1426 ii=ii/2 1427 end do 1428 do while (modulo(ii,3)==0) 1429 ii=ii/3 1430 end do 1431 do while (modulo(ii,5)==0) 1432 ii=ii/5 1433 end do 1434 if(ii/=1) cycle 1435 1436 ! Change algo if npfft>1 1437 wf_algo=wf_algo_global 1438 if (optdriver==RUNL_GSTATE.and.npf>1.and. wf_algo_global==ALGO_NOT_SET) wf_algo=ALGO_DEFAULT_PAR 1439 1440 ! FFT parallelism not compatible with multithreading 1441 if (wf_algo==ALGO_LOBPCG_NEW.or.wf_algo==ALGO_CHEBFI.or.wf_algo==ALGO_CHEBFI_NEW) then 1442 if (nthreads>1.and.npf>1) cycle 1443 end if 1444 1445 ! >>>>> BANDS 1446 do npb=npb_min,npb_max 1447 nproc1=npc*np_sk*nps*npf*npb 1448 if (nproc1<nprocmin) cycle 1449 if (nproc1>nproc) cycle 1450 if (modulo(mband,npb)>0) cycle 1451 1452 ! Change algo if npband>1 1453 if (optdriver==RUNL_GSTATE.and.npb>1.and. wf_algo_global==ALGO_NOT_SET) wf_algo=ALGO_DEFAULT_PAR 1454 1455 ! Base speedup 1456 acc_kgb_0=one;if (npb*npf*nthreads>1) acc_kgb_0=0.7_dp*speedup_fdp(mpw,(npb*npf*nthreads)) 1457 1458 if (npb*npf>4.and.wf_algo==ALGO_LOBPCG_OLD) then 1459 ! Promote npb=npf 1460 acc_kgb_0=acc_kgb_0*min((one*npf)/(one*npb),(one*npb)/(one*npf)) 1461 ! Promote npf<=20 1462 if (npf>20)then 1463 acc_kgb_0=acc_kgb_0* & 1464 & 0.2_dp+(one-0.2_dp)*(sin((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))) & 1465 & /((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))))**2 1466 end if 1467 end if 1468 1469 first_bpp=.true. 1470 do bpp=bpp_min,bpp_max 1471 1472 if (wf_algo==ALGO_LOBPCG_NEW) then 1473 blocksize=npb*bpp;nblocks=mband/blocksize 1474 if (modulo(bpp,nthreads)>0) cycle 1475 if ((bpp>1).and.(modulo(bpp,2)>0)) cycle 1476 if (modulo(mband,npb*bpp)>0) cycle 1477 else if (wf_algo==ALGO_LOBPCG_OLD) then 1478 blocksize=npb*bpp;nblocks=mband/blocksize 1479 if (modulo(mband/npb,bpp)>0) cycle 1480 if ((bpp>1).and.(modulo(bpp,2)>0)) cycle 1481 if (one*npb*bpp >max(1.,mband/3.).and.(mband>30)) cycle 1482 if (npb*npf<=4.and.(.not.first_bpp)) cycle 1483 else if (wf_algo==ALGO_CHEBFI .or. wf_algo==ALGO_CHEBFI_NEW) then 1484 !Nothing 1485 else 1486 if (bpp/=1.or.npb/=1) cycle 1487 end if 1488 1489 first_bpp=.false. 1490 1491 acc_kgb=acc_kgb_0 1492 ! OLD LOBPCG: promote bpp*npb>mband/3 1493 if (wf_algo==ALGO_LOBPCG_OLD) then 1494 if (npb*npf>4.and.mband>30) acc_kgb=acc_kgb*(one-(three*bpp*npb)/(one*mband)) 1495 end if 1496 ! NEW LOBPCG: promote minimal number of blocks 1497 ! promote block size <= BLOCKSIZE_MAX 1498 if (wf_algo==ALGO_LOBPCG_NEW) then 1499 acc_kgb=acc_kgb*(one-0.9_dp*dble(nblocks-1)/dble(mband-1)) 1500 if (blocksize>BLOCKSIZE_MAX) acc_kgb=acc_kgb*max(0.1_dp,one-dble(blocksize)/dble(10*BLOCKSIZE_MAX)) 1501 if (nthreads==1) then 1502 ! Promote npband vs bandpp & npfft 1503 if (blocksize>1) acc_kgb=acc_kgb*(0.1_dp*bpp+0.9_dp-blocksize)/(one-blocksize) 1504 if (npb*npf>4.and.mband>100) acc_kgb=acc_kgb*(one-0.8_dp*((three*bpp*npb)/(one*mband)-one)**2) 1505 tot_ncpus=max(npb,npf);if (tot_ncpus==2) tot_ncpus=0 1506 acc_kgb=acc_kgb*(one-0.8_dp*((dble(npb)/dble(npf))-2_dp)**2/(tot_ncpus-2_dp)**2) 1507 eff=max(npf,20);acc_kgb=acc_kgb*(one-0.8_dp*min(one,(eff-20)**2)) 1508 end if 1509 end if 1510 1511 ! CHEBFI: promote npfft=npband and nband>=npfft 1512 if (wf_algo==ALGO_CHEBFI .or. wf_algo==ALGO_CHEBFI_NEW) then 1513 if (npf>1) then 1514 if (npb>npf) then 1515 acc_kgb=acc_kgb*(one-0.8_dp*0.25_dp*((dble(npb)/dble(npf))-one)**2/(nproc1-one)**2) 1516 else 1517 acc_kgb=acc_kgb*(one-0.8_dp*nproc1**2*((dble(npb)/dble(npf))-one)**2/(nproc1-one)**2) 1518 end if 1519 end if 1520 end if 1521 1522 ! Resulting "weight" 1523 ! weight0=acc_c*acc_k*acc_s*acc_kgb 1524 weight0=nproc1*(acc_c+acc_k+acc_s+acc_kgb)/(npc+np_sk+nps+(npf*npb)) 1525 1526 ! Store data 1527 icount=icount+1 1528 if (icount<=MAXCOUNT) then 1529 my_algo(icount)=merge(ALGO_CG,wf_algo,wf_algo==ALGO_NOT_SET) 1530 my_distp(1:7,icount)=(/npc,np_sk,nps,npf,npb,bpp,nproc1/) 1531 weight(icount)=weight0 1532 if (weight0<weight(imin)) imin=icount 1533 else 1534 if (weight0>weight(imin)) then 1535 my_algo(imin)=merge(ALGO_CG,wf_algo,wf_algo==ALGO_NOT_SET) 1536 my_distp(1:7,imin)=(/npc,np_sk,nps,npf,npb,bpp,nproc1/) 1537 weight(imin)=weight0 1538 idum=minloc(weight);imin=idum(1) 1539 end if 1540 end if 1541 1542 end do ! bpp 1543 end do ! npb 1544 end do ! npf 1545 end do ! nps 1546 end do ! np_sk 1547 end do ! npc 1548 1549 !Compute number of selected distributions 1550 mcount_eff=icount 1551 mcount=min(mcount_eff,MAXCOUNT) 1552 1553 !Stop if no solution found 1554 if (mcount==0) then 1555 ! Override here the 0 default value changed in indefo1 1556 dtset%npimage = max(1,dtset%npimage) 1557 dtset%nppert = max(1,dtset%nppert) 1558 dtset%np_spkpt = max(1,dtset%np_spkpt) 1559 dtset%npspinor = max(1,dtset%npspinor) 1560 dtset%npfft = max(1,dtset%npfft) 1561 dtset%npband = max(1,dtset%npband) 1562 dtset%bandpp = max(1,dtset%bandpp) 1563 write(msg,'(a,i0,2a,i0,a)') & 1564 & 'Your input dataset does not let Abinit find an appropriate process distribution with nCPUs=',nproc*nthreads,ch10, & 1565 & 'Try to comment all the np* vars and set max_ncpus=',nthreads*nproc,' to have advices on process distribution.' 1566 ABI_WARNING(msg) 1567 if (max_ncpus>0) call wrtout(ab_out,msg, do_flush=.True.) 1568 iexit=iexit+1 1569 end if 1570 1571 !Sort data by increasing weight 1572 if (mcount>0) then 1573 ABI_MALLOC(isort,(mcount)) 1574 isort=(/(ii,ii=1,mcount)/) 1575 call sort_dp(mcount,weight,isort,tol6) 1576 ncount=min(mcount,MAXPRINT) 1577 end if 1578 1579 !Deduce a global value for paral_kgb 1580 paral_kgb=dtset%paral_kgb 1581 if (tread(1)==0) then 1582 if (any(my_algo(:)/=ALGO_CG)) paral_kgb=1 1583 end if 1584 1585 ! ====================================== 1586 ! Print output for abipy in Yaml format 1587 ! ====================================== 1588 1589 ! Please DO NOT CHANGE this part without contacting gmatteo first 1590 ! since ANY CHANGE can easily break the interface with AbiPy. 1591 if (iam_master .and. max_ncpus > 0.and. (mcount>0 .or. wf_algo_global == ALGO_CG)) then 1592 write(ount,'(2a)')ch10,"--- !Autoparal" 1593 if (optdriver==RUNL_GSTATE .and. paral_kgb == 0) then 1594 write(ount,"(a)")"# Autoparal section for GS run (band-by-band CG method)" 1595 else if (optdriver==RUNL_GSTATE) then 1596 write(ount,'(a)')'# Autoparal section for GS calculations with paral_kgb 1' 1597 else if (optdriver==RUNL_RESPFN) then 1598 write(ount,'(a)')'# Autoparal section for DFPT calculations' 1599 else if (optdriver==RUNL_LONGWAVE) then 1600 write(ount,'(a)')'# Autoparal section for LONGWAVE calculations' 1601 else 1602 ABI_ERROR(sjoin('Unsupported optdriver:', itoa(optdriver))) 1603 end if 1604 write(ount,"(a)") "info:" 1605 write(ount,"(a,i0)")" autoparal: ",autoparal 1606 write(ount,"(a,i0)")" paral_kgb: ",paral_kgb 1607 write(ount,"(a,i0)")" max_ncpus: ",max_ncpus 1608 write(ount,"(a,i0)")" nspinor: ",dtset%nspinor 1609 write(ount,"(a,i0)")" nsppol: ",dtset%nsppol 1610 write(ount,"(a,i0)")" nkpt: ",dtset%nkpt 1611 write(ount,"(a,i0)")" mband: ",mband 1612 write(ount,"(a)")"configurations:" 1613 1614 if (optdriver==RUNL_GSTATE.and.paral_kgb==0) then 1615 work_size = dtset%nkpt * dtset%nsppol 1616 do ii=1,max_ncpus 1617 if (ii > work_size) cycle 1618 do omp_ncpus=1,nthreads 1619 nks_per_proc = work_size / ii 1620 nks_per_proc = nks_per_proc + MOD(work_size, ii) 1621 eff = (one * work_size) / (ii * nks_per_proc) 1622 write(ount,"(a,i0)")" - tot_ncpus: ",ii * omp_ncpus 1623 write(ount,"(a,i0)")" mpi_ncpus: ",ii 1624 write(ount,"(a,i0)")" omp_ncpus: ",omp_ncpus 1625 write(ount,"(a,f12.9)")" efficiency: ",eff 1626 !write(ount,"(a,f12.2)")" mem_per_cpu: ",mempercpu_mb 1627 end do 1628 end do 1629 1630 else if (optdriver==RUNL_GSTATE) then 1631 omp_ncpus=nthreads 1632 do jj=mcount,mcount-min(ncount,MAXABIPY)+1,-1 1633 ii=isort(jj) 1634 tot_ncpus = my_distp(7,ii) 1635 eff = weight(jj) / tot_ncpus 1636 write(ount,'(a,i0)')' - tot_ncpus: ',tot_ncpus 1637 write(ount,'(a,i0)')' mpi_ncpus: ',tot_ncpus 1638 write(ount,"(a,i0)")" omp_ncpus: ",omp_ncpus 1639 write(ount,'(a,f12.9)')' efficiency: ',eff 1640 !write(ount,'(a,f12.2)')' mem_per_cpu: ',mempercpu_mb 1641 write(ount,'(a)' )' vars: {' 1642 write(ount,'(a,i0,a)')' npimage: ',my_distp(1,ii),',' 1643 ! Keep on using legacy npkpt instead of np_spkpt to maintain compatibility with AbiPy 1644 write(ount,'(a,i0,a)')' npkpt: ',my_distp(2,ii),',' 1645 !write(ount,'(a,i0,a)')' np_spkpt: ',my_distp(2,ii),',' 1646 write(ount,'(a,i0,a)')' npspinor: ',my_distp(3,ii),',' 1647 write(ount,'(a,i0,a)')' npfft: ', my_distp(4,ii),',' 1648 write(ount,'(a,i0,a)')' npband: ',my_distp(5,ii),',' 1649 write(ount,'(a,i0,a)')' bandpp: ',my_distp(6,ii),',' 1650 write(ount,'(a)') ' }' 1651 end do 1652 1653 else if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then 1654 do jj=mcount,mcount-min(ncount,MAXABIPY)+1,-1 1655 ii=isort(jj) 1656 tot_ncpus = my_distp(7,ii) 1657 eff = weight(jj) / tot_ncpus 1658 write(ount,'(a,i0)')' - tot_ncpus: ',tot_ncpus 1659 write(ount,'(a,i0)')' mpi_ncpus: ',tot_ncpus 1660 !write(ount,'(a,i0)')' omp_ncpus: ',omp_ncpus !OMP not supported (yet) 1661 write(ount,'(a,f12.9)')' efficiency: ',eff 1662 !write(ount,'(a,f12.2)')' mem_per_cpu: ',mempercpu_mb 1663 write(ount,'(a)' )' vars: {' 1664 write(ount,'(a,i0,a)')' nppert: ', my_distp(1,ii),',' 1665 ! Keep on using legacy npkpt instead of np_spkpt to maintain compatibility with AbiPy 1666 write(ount,'(a,i0,a)')' npkpt: ', my_distp(2,ii),',' 1667 !write(ount,'(a,i0,a)')' np_spkpt: ', my_distp(2,ii),',' 1668 write(ount,'(a)') ' }' 1669 end do 1670 end if 1671 write(ount,'(a)')"..." 1672 end if 1673 1674 !Print out tab with selected choices 1675 if (mcount>0.and.iam_master) then 1676 if (nthreads==1) then 1677 write(msg,'(a,1x,100("="),2a,i0,2a)') ch10,ch10,& 1678 & ' Searching for all possible proc distributions for this input with #CPUs<=',nthreads*nproc,':',ch10 1679 else 1680 write(msg,'(a,1x,100("="),2a,i0,a,i0,2a)') ch10,ch10,& 1681 & ' Searching for all possible proc distributions for this input with #CPUs<=',nthreads*nproc,& 1682 & ' and ',nthreads,' openMP threads:',ch10 1683 end if 1684 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1685 !Titles of columns 1686 msgttl='~' 1687 if (with_image) msgttl=trim(msgttl)//'~~~~~~~~~~~' 1688 if (with_pert) msgttl=trim(msgttl)//'~~~~~~~~~~~' 1689 msgttl=trim(msgttl)//'~~~~~~~~~~~~~' ! kpt 1690 if (with_spinor) msgttl=trim(msgttl)//'~~~~~~~~~~' 1691 if (with_fft) msgttl=trim(msgttl)//'~~~~~~~~~~~~~' 1692 if (with_band) msgttl=trim(msgttl)//'~~~~~~~~~~~~~' 1693 if (with_bandpp) msgttl=trim(msgttl)//'~~~~~~~~~~~~~' 1694 if (with_thread) msgttl=trim(msgttl)//'~~~~~~~~~~' 1695 msgttl=trim(msgttl)//'~~~~~~~~~~~~~' ! nproc 1696 if (with_thread) msgttl=trim(msgttl)//'~~~~~~~~~~~~~' 1697 msgttl=trim(msgttl)//'~~~~~~~~~~~' ! CPUs 1698 msgttl=' '//trim(msgttl) 1699 call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl) 1700 msg='|' 1701 if (with_image) msg=trim(msg)//' npimage|' 1702 if (with_pert) msg=trim(msg)//' nppert|' 1703 msg=trim(msg)//' np_spkpt|' 1704 if (with_spinor) msg=trim(msg)//' npspinor|' 1705 if (with_fft) msg=trim(msg)//' npfft|' 1706 if (with_band) msg=trim(msg)//' npband|' 1707 if (with_bandpp) msg=trim(msg)//' bandpp|' 1708 if (with_thread) msg=trim(msg)//' #Threads|' 1709 msg=trim(msg)//' #MPI(proc)|' 1710 if (with_thread) msg=trim(msg)//' #CPUs|' 1711 msg=trim(msg)//' WEIGHT|' 1712 msg=' '//trim(msg) 1713 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1714 msg='|' 1715 write(strg,'(i4,a,i4,a)') npi_min,'<<',npi_max,'|';if (with_image) msg=trim(msg)//trim(strg) 1716 write(strg,'(i4,a,i4,a)') npp_min,'<<',npp_max,'|';if (with_pert) msg=trim(msg)//trim(strg) 1717 write(strg,'(i5,a,i5,a)') np_sk_min,'<<',np_sk_max,'|'; msg=trim(msg)//trim(strg) 1718 write(strg,'(i5,a,i2,a)') nps_min,'<<',nps_max,'|';if (with_spinor) msg=trim(msg)//trim(strg) 1719 write(strg,'(i5,a,i5,a)') npf_min,'<<',npf_max,'|';if (with_fft) msg=trim(msg)//trim(strg) 1720 write(strg,'(i5,a,i5,a)') npb_min,'<<',npb_max,'|';if (with_band) msg=trim(msg)//trim(strg) 1721 write(strg,'(i5,a,i5,a)') bpp_min,'<<',bpp_max,'|';if (with_bandpp) msg=trim(msg)//trim(strg) 1722 write(strg,'(i9,a)' ) nthreads ,'|';if (with_thread) msg=trim(msg)//trim(strg) 1723 write(strg,'(i5,a,i5,a)') 1 ,'<<',nproc ,'|'; msg=trim(msg)//trim(strg) 1724 write(strg,'(i4,a,i6,a)') nthreads,'<<',nthreads*nproc,'|';if (with_thread) msg=trim(msg)//trim(strg) 1725 write(strg,'(a,i6,a)') ' <=',nthreads*nproc,'|'; msg=trim(msg)//trim(strg) 1726 msg=' '//trim(msg) 1727 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1728 call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl) 1729 !Loop over selected choices 1730 do jj=mcount,mcount-ncount+1,-1 1731 ii=isort(jj) 1732 msg='|' 1733 write(strg,'(i10,a)') my_distp(1,ii),'|';if (with_image) msg=trim(msg)//trim(strg) 1734 write(strg,'(i10,a)') my_distp(1,ii),'|';if (with_pert) msg=trim(msg)//trim(strg) 1735 write(strg,'(i12,a)') my_distp(2,ii),'|'; msg=trim(msg)//trim(strg) 1736 write(strg,'(i9,a)') my_distp(3,ii),'|';if (with_spinor) msg=trim(msg)//trim(strg) 1737 write(strg,'(i12,a)') my_distp(4,ii),'|';if (with_fft) msg=trim(msg)//trim(strg) 1738 write(strg,'(i12,a)') my_distp(5,ii),'|';if (with_band) msg=trim(msg)//trim(strg) 1739 write(strg,'(i12,a)') my_distp(6,ii),'|';if (with_bandpp) msg=trim(msg)//trim(strg) 1740 write(strg,'(i9,a)') nthreads ,'|';if (with_thread) msg=trim(msg)//trim(strg) 1741 write(strg,'(i12,a)') my_distp(7,ii),'|'; msg=trim(msg)//trim(strg) 1742 write(strg,'(i12,a)') nthreads*my_distp(7,ii),'|';if (with_thread) msg=trim(msg)//trim(strg) 1743 write(strg,'(f10.3,a)') weight(jj) ,'|'; msg=trim(msg)//trim(strg) 1744 msg=' '//trim(msg) 1745 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1746 end do 1747 !End of tab 1748 call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl) 1749 write(msg,'(a,i6,a,i6,a)')' Only the best possible choices for nproc are printed...' 1750 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1751 end if ! mcount>0 1752 1753 !Determine an optimal number of bands 1754 if (optdriver==RUNL_GSTATE.and. & 1755 & (any(my_algo(1:mcount)==ALGO_LOBPCG_OLD.or. & 1756 & my_algo(1:mcount)==ALGO_LOBPCG_NEW.or. & 1757 & my_algo(1:mcount)==ALGO_CHEBFI.or. & 1758 & my_algo(1:mcount)==ALGO_CHEBFI_NEW))) then 1759 if (mcount>0) then 1760 icount=isort(mcount) 1761 npc=my_distp(1,icount);np_sk=my_distp(2,icount) 1762 nps=my_distp(3,icount);npf=my_distp(4,icount) 1763 else 1764 npc=1;if (with_image ) npc=npi_min 1765 np_sk=1;if (with_kpt ) np_sk=np_sk_min 1766 nps=1;if (with_spinor) nps=nps_min 1767 npf=1;if (with_fft ) npf=npf_min 1768 end if 1769 nproc1=npc*np_sk*nps*npf 1770 msg=ch10//' >>> Possible (best) choices for the number of bands (nband) are:' 1771 if (with_image.or.with_kpt.or.with_spinor.or.with_fft) msg=trim(msg)//ch10//' with:' 1772 write(strg,'(a,i0)') ' npimage=' ,npc;if (with_image) msg=trim(msg)//trim(strg) 1773 write(strg,'(a,i0)') ' np_spkpt=' ,np_sk;if (with_kpt) msg=trim(msg)//trim(strg) 1774 write(strg,'(a,i0)') ' npspinor=',nps;if (with_spinor) msg=trim(msg)//trim(strg) 1775 write(strg,'(a,i0)') ' npfft=' ,npf;if (with_fft) msg=trim(msg)//trim(strg) 1776 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1777 ib1=mband-int(mband*relative_nband_range);if (my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)==ALGO_CHEBFI_NEW) ib1=mband 1778 ib2=mband+int(mband*relative_nband_range) 1779 ABI_MALLOC(nproc_best,(1+ib2-ib1)) 1780 ABI_MALLOC(nband_best,(1+ib2-ib1)) 1781 nproc_best(:)=1 1782 nband_best=(/(ii,ii=ib1,ib2)/) 1783 bpp=merge(1,nthreads,my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)==ALGO_CHEBFI_NEW) 1784 do ii=ib1,ib2 1785 do jj=1,nproc/nproc1 1786 ibest=1 1787 do kk=1,jj 1788 if (mod(jj,kk)/=0) cycle 1789 if (mod(ii,kk*bpp)==0) ibest=max(ibest,kk) 1790 end do 1791 nproc_best(1+ii-ib1)=max(nproc_best(1+ii-ib1),ibest) 1792 end do 1793 end do 1794 call sort_int(1+ib2-ib1,nproc_best,nband_best) 1795 kk=-1 1796 do ii=1+ib2-ib1,max(ib2-ib1-MAXBAND_PRINT,1),-1 1797 write(msg,'(3(a,i6),a,i3,a,i5,a)') ' nband=',nband_best(ii),' using ',nproc1*nproc_best(ii)*nthreads,& 1798 & ' CPUs =',nproc1*nproc_best(ii),' MPI x',nthreads,' threads (npband=',nproc_best(ii),')' 1799 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1800 if (nband_best(ii)==mband) kk=nproc_best(ii) 1801 end do 1802 if (kk==maxval(nproc_best(:))) then 1803 if (my_algo(icount)/=ALGO_CHEBFI .or. my_algo(icount)/=ALGO_CHEBFI_NEW) then 1804 write(msg,'(a,i6,a)') ' >>> The present nband value (',mband,') seems to be the best choice!' 1805 end if 1806 if (my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)/=ALGO_CHEBFI_NEW) then 1807 write(msg,'(a,i6,a)') ' >>> The present nband value (',mband,') seems to be a good choice!' 1808 end if 1809 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1810 end if 1811 ABI_FREE(nproc_best) 1812 ABI_FREE(nband_best) 1813 end if 1814 1815 if (optdriver==RUNL_GSTATE.and.(any(my_algo(1:mcount)==ALGO_CHEBFI .or. my_algo(1:mcount)==ALGO_CHEBFI_NEW))) then 1816 write(msg,'(5a)') & 1817 & ' >>> Note that with the "Chebyshev Filtering" algorithm, it is often',ch10,& 1818 & ' better to increase the number of bands (10% more or a few tens more).',ch10,& 1819 & ' Advice: increase nband and put nbdbuf input variable to (nband_new-nband_old).' 1820 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1821 end if 1822 1823 !Refinement of the process distribution by mean of a LinAlg routines benchmarking 1824 if (mcount>0.and.optdriver==RUNL_GSTATE.and.autoparal/=1) then 1825 icount=isort(mcount) 1826 if (autoparal/=3) then 1827 if (autoparal==2) then 1828 write(msg,'(5a,9(a10,a1))') ch10, & 1829 & ' Values below have been tested with respect to Linear Algebra performance;',ch10,& 1830 & ' Weights below are corrected according:',ch10,& 1831 & 'npimage','|','np_spkpt' ,'|','npspinor' ,'|','npfft' ,'|','npband','|',' bandpp ' ,'|',& 1832 & 'nproc' ,'|','weight','|','new weight','|' 1833 else 1834 write(msg,'(5a,11(a10,a1))') ch10, & 1835 & ' Values below have been tested with respect to Linear Algebra performance;',ch10,& 1836 & ' Weights below are corrected according:',ch10,& 1837 & 'npimage','|','np_spkpt' ,'|','npspinor' ,'|','npfft' ,'|','npband','|',' bandpp ' ,'|',& 1838 & 'nproc' ,'|','weight','|','new weight','|','best npslk','|','linalggpu' ,'|' 1839 end if 1840 call wrtout(std_out,msg);if (max_ncpus > 0) call wrtout(ab_out,msg) 1841 end if 1842 acc_k=zero 1843 ncount=min(MAXBENCH,mcount);if (autoparal==3) ncount=1 1844 do jj=mcount,mcount-ncount+1,-1 1845 ii=isort(jj) 1846 npf=my_distp(4,ii);npb=my_distp(5,ii);bpp=my_distp(6,ii) 1847 if ((npb*npf*bpp>1).and.(npf*npb<=mpi_enreg%nproc)) then 1848 use_linalg_gpu=dtset%use_gpu_cuda 1849 call compute_kgb_indicator(acc_kgb,bpp,xmpi_world,mband,mpw,npb,npf,np_slk,use_linalg_gpu) 1850 if (autoparal/=2) then 1851 my_distp(9,ii)=np_slk 1852 if (np_slk>0) my_distp(8,ii)=1 1853 ! * gpu_linalg_limit: 1854 ! No use of GPU: htgspw_01.outuge value ~2 *vectsize*blocksize**2 tested 1855 ! Use of GPU: tiny value ~0.5*vectsize*blocksize**2 tested 1856 my_distp(10,ii)=2*dtset%mpw*(npb*bpp)**2/npf 1857 if (use_linalg_gpu==1) my_distp(10,ii)=my_distp(10,ii)/4 1858 end if 1859 if (abs(acc_k)<=tol12) acc_k=acc_kgb ! Ref value : the first one computed 1860 ! * Weight (corrected by 10% of the computed ratio) 1861 weight0=weight(jj)*(one + 0.1_dp*acc_k/acc_kgb) 1862 if (autoparal==2) then 1863 write(msg, '(7(i10,a1),f9.2,a2,f9.5,a2)') & 1864 & my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',& 1865 & my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |' 1866 else if (autoparal==3) then 1867 write(msg,'(a,5(a,i3))') ch10,' For npband=',npb,', npfft=',npf,' and bandpp=',bpp, & 1868 & ', compute_kgb_indicator recommends you to set np_slk=',my_distp(9,ii),& 1869 & ' and use_linalg_gpu=',use_linalg_gpu 1870 else 1871 write(msg, '(7(i10,a1),f9.2,a2,f9.5,a2,2(i10,a1))') & 1872 & my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',& 1873 & my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |',& 1874 & my_distp(9,ii),'|',use_linalg_gpu,'|' 1875 end if 1876 call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg) 1877 ! We store the best value in weight(mcount) and keep icount 1878 if (weight0 > weight(mcount)) then 1879 icount=ii;weight(mcount)=weight0 1880 end if 1881 end if 1882 end do 1883 end if 1884 1885 !Store new process distribution 1886 if (mcount>0.and.max_ncpus<=0) then 1887 icount=isort(mcount) 1888 nproc1=my_distp(7,icount) 1889 ! Work load distribution 1890 if (optdriver==RUNL_GSTATE) then 1891 dtset%npimage= my_distp(1,icount) 1892 dtset%nppert = 1 1893 dtset%np_spkpt = my_distp(2,icount) 1894 end if 1895 if (optdriver==RUNL_RESPFN) then 1896 dtset%npimage= 1 1897 dtset%nppert = my_distp(1,icount) 1898 dtset%np_spkpt = 1 1899 end if 1900 dtset%npspinor = my_distp(3,icount) 1901 dtset%npfft = my_distp(4,icount) 1902 dtset%npband = my_distp(5,icount) 1903 dtset%bandpp = my_distp(6,icount) 1904 if (tread(1)==0) dtset%paral_kgb= merge(0,1,my_algo(icount)==ALGO_CG) 1905 ! The following lines are mandatory : the DFT+DMFT must use ALL the 1906 ! available procs specified by the user. So nproc1=nproc. 1907 ! Works only if paral_kgb is not activated?? 1908 if (dtset%usedmft/=0.and.optdriver==RUNL_GSTATE) then 1909 if (dtset%paral_kgb==0) then 1910 dtset%npspinor = 1 ; dtset%npfft = 1 1911 dtset%npband = 1 ; dtset%bandpp = 1 1912 dtset%npimage = 1 1913 end if 1914 nproc1 = nproc 1915 end if 1916 if (dtset%npband*dtset%npfft*dtset%bandpp>1) dtset%paral_kgb=1 1917 ! LinAlg parameters: we change values only if they are not present in input file 1918 if (dtset%paral_kgb==1) then 1919 if (tread(9)==0) dtset%use_slk=my_distp(8,icount) 1920 if (tread(10)==0) dtset%np_slk=my_distp(9,icount) 1921 if (tread(11)==0) dtset%gpu_linalg_limit=my_distp(10,icount) 1922 end if 1923 ! New definition of "world" MPI communicator 1924 if (optdriver==RUNL_RESPFN.and.dtset%paral_rf==1) then 1925 nproc1=max(nproc1,dtset%nsppol*dtset%nkpt) ! Take into account the code in respfn.F90 1926 nproc1=min(nproc1,nproc) 1927 nproc1=(nproc1/dtset%nppert)*dtset%nppert 1928 end if 1929 call initmpi_world(mpi_enreg,nproc1) 1930 end if 1931 1932 !Final advice in case max_ncpus > 0 1933 if (max_ncpus>0.and.mcount>0) then 1934 write(msg,'(6a)') ch10,& 1935 ' Launch a parallel version of ABINIT with a distribution of processors among the above list,',ch10,& 1936 ' and the associated input variables (np_spkpt, npband, npfft, bandpp, etc.).',ch10,& 1937 ' The higher weight should be better.' 1938 call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg) 1939 end if 1940 1941 if (mcount>0) then 1942 ABI_FREE(isort) 1943 end if 1944 ABI_FREE(my_distp) 1945 ABI_FREE(my_algo) 1946 ABI_FREE(weight) 1947 1948 !Final line 1949 write(msg,'(a,100("="),2a)') " ",ch10,ch10 1950 call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg) 1951 1952 !max_ncpus requires a stop 1953 if (max_ncpus>0) then 1954 iexit = iexit + 1 ! will stop in the parent. 1955 end if 1956 1957 DBG_EXIT("COLL") 1958 1959 contains 1960 1961 real(dp) pure function speedup_fdp(nn, mm) 1962 ! Expected linear speedup for a nn-sized problem and mm processes 1963 integer,intent(in) :: nn, mm 1964 speedup_fdp = (one*nn) / (one* ((nn / mm) + merge(0, 1, mod(nn, mm) == 0))) 1965 end function speedup_fdp 1966 1967 end subroutine finddistrproc
ABINIT/m_mpi_setup [ Modules ]
NAME
m_mpi_setup
FUNCTION
Initialize MPI parameters and datastructures for parallel execution
COPYRIGHT
Copyright (C) 1999-2022 ABINIT group (FJ, MT, FD) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
16 #if defined HAVE_CONFIG_H 17 #include "config.h" 18 #endif 19 20 #include "abi_common.h" 21 22 module m_mpi_setup 23 24 use defs_basis 25 use m_distribfft 26 use m_xmpi 27 use m_xomp 28 use m_hdr 29 use m_sort 30 use m_errors 31 use m_abicore 32 33 use defs_abitypes, only : MPI_type 34 use m_fstrings, only : sjoin, itoa 35 use m_time, only : abi_wtime 36 use m_parser, only : intagm 37 use m_geometry, only : mkrdim, metric 38 use m_fftcore, only : fftalg_for_npfft, getng, kpgcount 39 use m_mpinfo, only : init_mpi_enreg, mpi_distrib_is_ok, initmpi_atom, proc_distrb_cycle, & 40 initmpi_grid, initmpi_pert, initmpi_img, distrb2, distrb2_hf, initmpi_world 41 use m_libpaw_tools, only : libpaw_write_comm_set 42 use m_dtset, only : dataset_type 43 use m_kg, only : getmpw 44 use m_dtfil, only : mkfilename 45 46 implicit none 47 48 private
ABINIT/mpi_setup [ Functions ]
NAME
mpi_setup
FUNCTION
Big loop on the datasets: - compute mgfft,mpw,nfft,... for this data set; - fill mpi_enreg *** At the output of this routine, all the dtsets input variables are known *** The content of dtsets should not be modified anymore afterwards.
INPUTS
filnam(5)=character strings giving file names ndtset= number of datasets to be read; if 0, no multi-dataset mode ndtset_alloc=number of datasets, corrected for allocation of at least one data set.
OUTPUT
dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables, some of which are initialized here, while other were already initialized previously.
SIDE EFFECTS
mpi_enregs=information about MPI parallelization
SOURCE
84 subroutine mpi_setup(dtsets,filnam,lenstr,mpi_enregs,ndtset,ndtset_alloc,string) 85 86 !Arguments ------------------------------------ 87 !scalars 88 integer,intent(in) :: lenstr,ndtset,ndtset_alloc 89 type(MPI_type),intent(inout) :: mpi_enregs(0:ndtset_alloc) 90 character(len=*),intent(in) :: string 91 !arrays 92 character(len=fnlen),intent(in) :: filnam(5) 93 type(dataset_type),intent(inout) :: dtsets(0:ndtset_alloc) 94 95 !Local variables ------------------------------- 96 !scalars 97 integer :: blocksize,exchn2n3d,iband,idtset,iexit,ii,iikpt,iikpt_modulo, prtvol 98 integer :: isppol,jdtset,marr,mband_lower,mband_upper 99 integer :: me_fft,mgfft,mgfftdg,mkmem,mpw,mpw_k,optdriver 100 integer :: mband_mem 101 integer :: nfft,nfftdg,nkpt,nkpt_me,npert,nproc,nproc_fft,nqpt 102 integer :: nspink,nsppol,nsym,paral_fft,response,tnband,tread0,usepaw,vectsize 103 integer :: fftalg,fftalga,fftalgc 104 #ifdef HAVE_LINALG_ELPA 105 integer :: icol,irow,np 106 #endif 107 logical :: fftalg_read,ortalg_read,paral_kgb_read,wfoptalg_read,do_check 108 real(dp) :: dilatmx,ecut,ecut_eff,ecutdg_eff,ucvol 109 character(len=500) :: msg 110 !arrays 111 integer :: ngfft(18),ngfftdg(18),ngfftc(3),tread(12) 112 integer,allocatable :: intarr(:),istwfk(:),symrel(:,:,:) 113 integer,allocatable :: mybands(:) 114 integer,pointer :: nkpt_rbz(:) 115 real(dp),parameter :: k0(3)=(/zero,zero,zero/) 116 real(dp) :: gmet(3,3),gprimd(3,3),kpt(3),qphon(3),rmet(3,3),rprimd(3,3) 117 real(dp),allocatable :: dprarr(:),kpt_with_shift(:,:) 118 real(dp),pointer :: nband_rbz(:,:) 119 character(len=6) :: nm_mkmem(3) 120 121 !************************************************************************* 122 123 DBG_ENTER("COLL") 124 125 iexit=0;mpw_k=0 126 127 call init_mpi_enreg(mpi_enregs(0)) 128 call initmpi_img(dtsets(0),mpi_enregs(0),-1) 129 130 do idtset=1,ndtset_alloc 131 call init_mpi_enreg(mpi_enregs(idtset)) 132 133 ! Handy read-only variables. 134 optdriver = dtsets(idtset)%optdriver 135 prtvol = dtsets(idtset)%prtvol 136 137 ! Read parallel input parameters 138 marr=max(5,dtsets(idtset)%npsp,dtsets(idtset)%nimage) 139 ABI_MALLOC(intarr,(marr)) 140 ABI_MALLOC(dprarr,(marr)) 141 nkpt =dtsets(idtset)%nkpt 142 nsppol=dtsets(idtset)%nsppol 143 jdtset=dtsets(idtset)%jdtset ; if(ndtset==0)jdtset=0 144 usepaw=dtsets(idtset)%usepaw 145 mband_upper=maxval(dtsets(idtset)%nband(1:nkpt*nsppol)) 146 mband_lower=minval(dtsets(idtset)%nband(1:nkpt*nsppol)) 147 148 ! Compute metric for this dataset 149 call mkrdim(dtsets(idtset)%acell_orig(1:3,1),dtsets(idtset)%rprim_orig(1:3,1:3,1),rprimd) 150 call metric(gmet,gprimd,-1,rmet,rprimd,ucvol) 151 152 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'max_ncpus',tread0,'INT') 153 if (tread0==1) dtsets(idtset)%max_ncpus=intarr(1) 154 155 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_atom',tread0,'INT') 156 if(tread0==1) dtsets(idtset)%paral_atom=intarr(1) 157 158 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_rf',tread0,'INT') 159 if (tread0==1.and.any(optdriver==[RUNL_RESPFN, RUNL_NONLINEAR])) dtsets(idtset)%paral_rf=intarr(1) 160 161 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npimage',tread(2),'INT') 162 if(tread(2)==1) dtsets(idtset)%npimage=intarr(1) 163 164 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nppert',tread(3),'INT') 165 if (tread(3)==1.and.optdriver==RUNL_RESPFN) dtsets(idtset)%nppert=intarr(1) 166 167 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'np_spkpt',tread(4),'INT') 168 if(tread(4)==1)then 169 dtsets(idtset)%np_spkpt=intarr(1) 170 else 171 ! npkpt is obsolete, but still read 172 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npkpt',tread(4),'INT') 173 if(tread(4)==1)then 174 dtsets(idtset)%np_spkpt=intarr(1) 175 endif 176 endif 177 178 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npspinor',tread(5),'INT') 179 if(tread(5)==1) dtsets(idtset)%npspinor=intarr(1) 180 181 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npfft',tread(6),'INT') 182 if(tread(6)==1) dtsets(idtset)%npfft=intarr(1) 183 184 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npband',tread(7),'INT') 185 if(tread(7)==1) dtsets(idtset)%npband=intarr(1) 186 187 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'bandpp',tread(8),'INT') 188 if(tread(8)==1) dtsets(idtset)%bandpp=intarr(1) 189 190 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'use_slk',tread(9),'INT') 191 if(tread(9)==1) dtsets(idtset)%use_slk=intarr(1) 192 193 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'np_slk',tread(10),'INT') 194 if(tread(10)==1) dtsets(idtset)%np_slk=intarr(1) 195 196 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'slk_rankpp',tread(12),'INT') 197 if(tread(12)==1) dtsets(idtset)%slk_rankpp=intarr(1) 198 199 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'pw_unbal_thresh',tread0,'DPR') 200 if(tread0==1) dtsets(idtset)%pw_unbal_thresh=dprarr(1) 201 mpi_enregs(idtset)%pw_unbal_thresh=dtsets(idtset)%pw_unbal_thresh 202 203 call intagm(dprarr,intarr,jdtset,marr,5,string(1:lenstr),'gpu_devices',tread0,'INT') 204 if(tread0==1) dtsets(idtset)%gpu_devices(1:5)=intarr(1:5) 205 206 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_linalg_limit',tread(11),'INT') 207 if(tread(11)==1) dtsets(idtset)%gpu_linalg_limit=intarr(1) 208 209 210 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nphf',tread0,'INT') 211 if(tread0==1) dtsets(idtset)%nphf=intarr(1) 212 213 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'autoparal',tread0,'INT') 214 if(tread0==1) dtsets(idtset)%autoparal=intarr(1) 215 216 ! Read paral_kgb and disable it if not supported in optdriver 217 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_kgb',tread(1),'INT') 218 paral_kgb_read=(tread(1)==1) 219 if (paral_kgb_read) dtsets(idtset)%paral_kgb=intarr(1) 220 if (xmpi_paral==0.and.dtsets(idtset)%paral_kgb==1) then 221 dtsets(idtset)%paral_kgb=0 222 write(msg, '(5a)' ) & 223 'When ABINIT is compiled without MPI flag,',ch10,& 224 'setting paral_kgb/=0 is useless. paral_kgb has been reset to 0.',ch10,& 225 'Action: modify compilation option or paral_kgb in the input file.' 226 ABI_WARNING(msg) 227 end if 228 if (ALL(optdriver /= [RUNL_GSTATE, RUNL_GWLS, RUNL_RTTDDFT]) .and. dtsets(idtset)%paral_kgb/=0) then 229 dtsets(idtset)%paral_kgb=0 230 write(msg, '(a,i0,a)') & 231 "paral_kgb != 0 is not available in optdriver ",optdriver,". Setting paral_kgb to 0" 232 ABI_COMMENT(msg) 233 end if 234 235 wfoptalg_read=.false. 236 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'wfoptalg',tread0,'INT') 237 if(tread0==1) then 238 dtsets(idtset)%wfoptalg=intarr(1) 239 wfoptalg_read=.true. 240 else 241 if (dtsets(idtset)%usepaw==0) dtsets(idtset)%wfoptalg=0 242 if (dtsets(idtset)%usepaw/=0) dtsets(idtset)%wfoptalg=10 243 if ((optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS).and.dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%wfoptalg=114 244 end if 245 246 ! Dump the list of irreducible perturbations and exit. 247 if (dtsets(idtset)%paral_rf==-1.and.optdriver/=RUNL_NONLINEAR) then 248 call dtsets(idtset)%get_npert_rbz(nband_rbz, nkpt_rbz, npert) 249 ABI_FREE(nband_rbz) 250 ABI_FREE(nkpt_rbz) 251 iexit = iexit + 1 252 end if 253 254 ! From total number of procs, compute all possible distributions 255 ! Ignore exit flag if GW/EPH calculations because autoparal section is performed in screening/sigma/bethe_salpeter/eph 256 if (any(optdriver == [RUNL_SCREENING, RUNL_SIGMA, RUNL_BSE, RUNL_EPH, RUNL_GWR, RUNL_NONLINEAR])) then 257 iexit = 0 258 else 259 call finddistrproc(dtsets,filnam,idtset,iexit,mband_upper,mpi_enregs(idtset),ndtset_alloc,tread) 260 end if 261 262 call initmpi_img(dtsets(idtset),mpi_enregs(idtset),-1) 263 nproc=mpi_enregs(idtset)%nproc_cell 264 265 ! Set paral_kgb to 1 when band-fft parallelism is activated 266 if (ANY(optdriver == [RUNL_GSTATE, RUNL_GWLS, RUNL_RTTDDFT])) then 267 if (mpi_enregs(idtset)%nproc_cell>1) then 268 if (dtsets(idtset)%npband>1.or.dtsets(idtset)%npfft>1) then 269 if (.not.paral_kgb_read) dtsets(idtset)%paral_kgb=1 270 end if 271 end if 272 end if 273 274 if ((optdriver/=RUNL_GSTATE.and.optdriver/=RUNL_GWLS.and.optdriver/=RUNL_RTTDDFT).and. & 275 & (dtsets(idtset)%np_spkpt/=1 .or.dtsets(idtset)%npband/=1.or.dtsets(idtset)%npfft/=1.or. & 276 & dtsets(idtset)%npspinor/=1.or.dtsets(idtset)%bandpp/=1)) then 277 !& .or.(dtsets(idtset)%iscf<0)) then 278 dtsets(idtset)%np_spkpt=1 ; dtsets(idtset)%npspinor=1 ; dtsets(idtset)%npfft=1 279 dtsets(idtset)%npband=1; dtsets(idtset)%bandpp=1 ; dtsets(idtset)%nphf=1 280 dtsets(idtset)%paral_kgb=0 281 ABI_COMMENT('For non ground state calculations, set bandpp, npfft, npband, npspinor, np_spkpt and nphf to 1') 282 end if 283 284 ! Take into account a possible change of paral_kgb (change of the default algorithm) 285 if (.not.wfoptalg_read) then 286 if (dtsets(idtset)%usepaw==0) dtsets(idtset)%wfoptalg=0 287 if (dtsets(idtset)%usepaw/=0) dtsets(idtset)%wfoptalg=10 288 if ((optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS).and.dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%wfoptalg=114 289 if (mod(dtsets(idtset)%wfoptalg,10)==4) then 290 do iikpt=1,dtsets(idtset)%nkpt 291 if (any(abs(dtsets(idtset)%kpt(:,iikpt))>tol8)) dtsets(idtset)%istwfk(iikpt)=1 292 end do 293 end if 294 end if 295 296 dtsets(idtset)%densfor_pred=2 297 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'densfor_pred',tread0,'INT') 298 if(tread0==1) then 299 dtsets(idtset)%densfor_pred=intarr(1) 300 else 301 if (dtsets(idtset)%paral_kgb==1) dtsets(idtset)%densfor_pred=6 302 end if 303 if((dtsets(idtset)%iscf==5.or.dtsets(idtset)%iscf==6) & 304 .and. dtsets(idtset)%ionmov==4 .and. dtsets(idtset)%densfor_pred/=3 )then 305 dtsets(idtset)%densfor_pred=3 306 write(msg, '(a,a,a)' )& 307 'When ionmov==4 and iscf==5 or 6, densfor_pred must be 3.',ch10,& 308 'Set densfor_pred to 3.' 309 ABI_COMMENT(msg) 310 end if 311 312 #ifdef HAVE_LOTF 313 ! LOTF need densfor_pred=2 314 if(dtsets(idtset)%ionmov==23) dtsets(idtset)%densfor_pred=2 315 #endif 316 317 if (usepaw==0) then 318 dtsets(idtset)%ortalg=2 319 else 320 dtsets(idtset)%ortalg=-2 321 end if 322 ortalg_read=.false. 323 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'ortalg',tread0,'INT') 324 if(tread0==1) then 325 dtsets(idtset)%ortalg=intarr(1) 326 ortalg_read=.true. 327 else if (dtsets(idtset)%wfoptalg>=10 .and. dtsets(idtset)%ortalg>0) then 328 dtsets(idtset)%ortalg=-dtsets(idtset)%ortalg 329 end if 330 331 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'iomode',tread0,'INT') 332 if(tread0==1) then 333 dtsets(idtset)%iomode=intarr(1) 334 else 335 if ((xmpi_mpiio==1).and.(dtsets(idtset)%paral_kgb==1)) dtsets(idtset)%iomode=IO_MODE_MPI 336 #ifdef HAVE_NETCDF_DEFAULT 337 dtsets(idtset)%iomode=IO_MODE_ETSF 338 #endif 339 end if 340 341 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'pawmixdg',tread0,'INT') 342 if(tread0==1) then 343 dtsets(idtset)%pawmixdg=intarr(1) 344 else if (dtsets(idtset)%npfft>1.and.usepaw==1) then 345 dtsets(idtset)%pawmixdg=1 346 end if 347 348 ! Cycle if the processor is not used 349 if (mpi_enregs(idtset)%me<0.or.iexit>0) then 350 ABI_FREE(intarr) 351 ABI_FREE(dprarr) 352 cycle 353 end if 354 355 ! Set cprj_in_memory to zero if paral_kgb has been activated by autoparal 356 if (dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%cprj_in_memory = 0 357 358 response=0 359 if (dtsets(idtset)%rfddk/=0 .or. dtsets(idtset)%rf2_dkdk/=0 .or. dtsets(idtset)%rf2_dkde/=0 .or. & 360 & dtsets(idtset)%rfelfd/=0 .or. dtsets(idtset)%rfphon/=0 .or. dtsets(idtset)%rfstrs/=0 .or. & 361 & dtsets(idtset)%rfuser/=0 .or. dtsets(idtset)%rfmagn/=0) response=1 362 363 ! If no MPI, set all npxxx variables to 1 364 if (nproc==1) then 365 dtsets(idtset)%np_spkpt = 1 ; dtsets(idtset)%npband = 1 366 dtsets(idtset)%npfft = 1 ; dtsets(idtset)%npspinor = 1 367 dtsets(idtset)%nphf = 1 368 end if 369 370 ! --IF CUDA AND RECURSION:ONLY BAND PARALLELISATION 371 if(dtsets(idtset)%tfkinfunc==2 .and. nproc/=1)then 372 dtsets(idtset)%npband = dtsets(idtset)%npband*dtsets(idtset)%np_spkpt*dtsets(idtset)%npspinor*dtsets(idtset)%npfft 373 dtsets(idtset)%np_spkpt = 1 374 dtsets(idtset)%npfft = 1 375 dtsets(idtset)%npspinor = 1 376 write(msg, '(5a,i6,a)' )& 377 'If HAVE_GPU_CUDA and recursion are used ',ch10,& 378 'only the band parallelisation is active, we set:',ch10,& 379 'npfft= 1, np_spkpt= 1, npband=',dtsets(idtset)%npband,' .' 380 ABI_WARNING(msg) 381 end if 382 383 if (dtsets(idtset)%npspinor>=2.and.dtsets(idtset)%nspinor==1) then 384 dtsets(idtset)%npspinor=1 385 dtsets(idtset)%npfft=2*dtsets(idtset)%npfft 386 write(msg,'(3a)')& 387 'npspinor is bigger than nspinor !',ch10,& 388 'We set npspinor to 1 ; we set npfft to 2*npfft' 389 ABI_WARNING(msg) 390 end if 391 392 ! Some checks on parallelization data 393 if(dtsets(idtset)%paral_kgb < 0 ) then 394 cycle 395 else if(dtsets(idtset)%paral_kgb/=0.and.(dtsets(idtset)%bandpp/=1.or.dtsets(idtset)%npband/=1.or.& 396 dtsets(idtset)%npfft/=1.or.dtsets(idtset)%np_spkpt/=1.or.dtsets(idtset)%npspinor/=1))then 397 if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor > nproc )then 398 write(msg,'(7a)')& 399 'The product of np_spkpt, npfft, npband and npspinor is bigger than the number of processors.',ch10,& 400 'The user-defined values of np_spkpt, npfft, npband or npspinor will be modified,',ch10,& 401 'in order to bring this product below nproc .',ch10,& 402 'At present, only a very simple algorithm is used ...' 403 ABI_WARNING(msg) 404 405 if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npband*dtsets(idtset)%npspinor <= nproc) then 406 dtsets(idtset)%npfft=1 407 ABI_WARNING('Set npfft to 1') 408 else if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npspinor <= nproc)then 409 dtsets(idtset)%npfft=1 410 dtsets(idtset)%npband=1 411 ABI_WARNING('Set npfft and npband to 1') 412 else if(dtsets(idtset)%np_spkpt <= nproc)then 413 dtsets(idtset)%npfft=1 414 dtsets(idtset)%npband=1 415 dtsets(idtset)%npspinor=1 416 ABI_WARNING('Set npfft ,npband and npspinor to 1') 417 else 418 dtsets(idtset)%npfft=1 419 dtsets(idtset)%npband=1 420 dtsets(idtset)%np_spkpt=1 421 dtsets(idtset)%npspinor=1 422 ABI_WARNING('Set npfft, npband, nspinor and np_spkpt to 1') 423 end if 424 else if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor < nproc)then 425 write(msg,'(a,5i6,4a)')& 426 'np_spkpt,npband,npspinor,npfft,nproc=',& 427 & dtsets(idtset)%np_spkpt,dtsets(idtset)%npfft,dtsets(idtset)%npband,dtsets(idtset)%npspinor,nproc,ch10,& 428 'The number of processors must not be greater than npfft*npband*np_spkpt*npspinor ',ch10,& 429 'when npfft or np_spkpt or npband or npspinor are chosen manually in the input file.' 430 ABI_ERROR(msg) 431 end if 432 end if 433 434 ! LOBPCG and ChebFi need paral_kgb=1 in parallel 435 if ((dtsets(idtset)%npband*dtsets(idtset)%npfft>1).and. & 436 & (mod(dtsets(idtset)%wfoptalg,10)==1.or.mod(dtsets(idtset)%wfoptalg,10)==4)) then 437 dtsets(idtset)%paral_kgb=1 438 end if 439 440 ! Check size of Scalapack communicator 441 #ifdef HAVE_LINALG_ELPA 442 if(dtsets(idtset)%paral_kgb>0.and.dtsets(idtset)%np_slk>0) then 443 np=min(dtsets(idtset)%np_slk,dtsets(idtset)%npband*dtsets(idtset)%npfft*dtsets(idtset)%npspinor) 444 irow=int(sqrt(float(np))) 445 do while(mod(np,irow)/=0) 446 irow=irow-1 447 end do 448 icol=nproc/irow 449 if (icol>mband_lower) then 450 do while(icol>mband_lower) 451 icol=icol-1 452 do while(mod(np,icol)/=0) 453 icol=icol-1 454 end do 455 end do 456 dtsets(idtset)%np_slk=icol 457 write(msg,'(5a,i6,a)')& 458 'The number of band*fft*spinor processors was not consistent with',ch10,& 459 'the size of communicator used for ELPA library (np_slk).',ch10,& 460 'np_slk value has been adjusted to ',dtsets(idtset)%np_slk,'.' 461 ABI_COMMENT(msg) 462 end if 463 end if 464 #endif 465 466 !Additional check in case of a parallelized Hartree-Fock calculation 467 ! %usefock == option to perform Fock exchange calculation 468 ! %nphf == number of processors for Fock exchange calculation 469 if ((dtsets(idtset)%usefock==1).and.(dtsets(idtset)%nphf/=1)) then 470 471 if ((dtsets(idtset)%nphf<0).or.(dtsets(idtset)%nphf==0)) then 472 ABI_ERROR('The value of variable nphf should be a non negative integer.') 473 end if 474 if (dtsets(idtset)%paral_kgb/=0) then 475 ABI_ERROR('Option paral_kgb should be turned off (value 0) for a parallelized Hartree-Fock calculation.') 476 end if 477 if (response/=0) then 478 ABI_ERROR('A response function calculation is not yet possible with a parallelized Hartree-Fock calculation.') 479 end if 480 if (dtsets(idtset)%npspinor>1) then 481 ABI_ERROR('The parallelism on spinors is not supported by a parallelized Hartree-Fock calculation.') 482 end if 483 if (dtsets(idtset)%np_spkpt*dtsets(idtset)%nphf > nproc )then 484 write(msg,'(a,3(a,i0))') ch10,& 485 'The product of variables np_spkpt and nphf is bigger than the number of processors: np_spkpt= ',& 486 dtsets(idtset)%np_spkpt,' nphf= ',dtsets(idtset)%nphf ,' and nproc= ', nproc 487 ABI_ERROR(msg) 488 end if 489 end if ! Fock 490 491 !When using chebfi, the number of blocks is equal to the number of processors 492 if((dtsets(idtset)%wfoptalg == 1) .or. (dtsets(idtset)%wfoptalg == 111)) then 493 !Nband might have different values for different kpoint, but not bandpp. 494 !In this case, we just use the largest nband, and the input will probably fail 495 !at the bandpp check later on 496 dtsets(idtset)%bandpp = mband_upper / dtsets(idtset)%npband 497 end if 498 499 !Check parallelization in case of RTTDDFT 500 !In particular ensure that bandpp = nband / npband 501 if (optdriver == RUNL_RTTDDFT) then 502 dtsets(idtset)%bandpp = mband_upper / dtsets(idtset)%npband 503 if ( tread(8) == 1 ) then 504 write(msg, '(a,a)') 'Setting bandpp is useless in RT-TDDFT because it is automatically set to nband/npband.', ch10 505 ABI_WARNING(msg) 506 end if 507 if (dtsets(idtset)%npfft/=1) then 508 dtsets(idtset)%npfft=1 509 write(msg, '(a,a)') 'RT-TDDFT is not compatible with FFT-parallelization. Remove npfft or set it to 1.', ch10 510 ABI_ERROR(msg) 511 end if 512 if (dtsets(idtset)%npspinor/=1) then 513 dtsets(idtset)%npspinor=1 514 write(msg, '(a,a)') 'RT-TDDFT is not compatible with spinor parallelization. Remove npspinor or set it to 1.', ch10 515 ABI_ERROR(msg) 516 end if 517 if (dtsets(idtset)%nphf/=1) then 518 dtsets(idtset)%nphf=1 519 write(msg, '(a,a)') 'RT-TDDFT is not compatible with HF parallelization. Remove nphf or set it to 1.', ch10 520 ABI_ERROR(msg) 521 end if 522 end if 523 524 ! Set mpi_enreg 525 mpi_enregs(idtset)%paral_kgb=dtsets(idtset)%paral_kgb 526 if(dtsets(idtset)%paral_kgb/=0)then 527 mpi_enregs(idtset)%nproc_spkpt=dtsets(idtset)%np_spkpt 528 mpi_enregs(idtset)%nproc_fft=dtsets(idtset)%npfft 529 mpi_enregs(idtset)%nproc_band=dtsets(idtset)%npband 530 mpi_enregs(idtset)%nproc_spinor=min(dtsets(idtset)%npspinor,dtsets(idtset)%nspinor) 531 mpi_enregs(idtset)%bandpp=dtsets(idtset)%bandpp 532 ! Additional setting in case of hybrid functional calculation => not yet tested (CMartins) 533 ! if (dtsets(idtset)%usefock==1) then 534 ! mpi_enregs(idtset)%nproc_hf = dtsets(idtset)%nphf 535 ! if (dtsets(idtset)%nphf>1) mpi_enregs(idtset)%paral_hf=1 536 ! end if 537 else 538 mpi_enregs(idtset)%bandpp = dtsets(idtset)%bandpp 539 ! Additional setting in case of a Fock exchange of PBE0 calculation 540 if (dtsets(idtset)%usefock==1) then 541 if (dtsets(idtset)%nphf>1) mpi_enregs(idtset)%paral_hf=1 542 mpi_enregs(idtset)%nproc_hf = dtsets(idtset)%nphf 543 if (dtsets(idtset)%np_spkpt/=1) then 544 mpi_enregs(idtset)%nproc_spkpt = dtsets(idtset)%np_spkpt 545 else 546 mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell/mpi_enregs(idtset)%nproc_hf 547 end if 548 else 549 mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell 550 end if 551 end if 552 553 if(dtsets(idtset)%paral_kgb>=0) then 554 555 ! Compute processor distribution over perturbations 556 mpi_enregs(idtset)%paral_pert=dtsets(idtset)%paral_rf 557 if (mpi_enregs(idtset)%paral_pert==1) then 558 dtsets(idtset)%nppert=max(1,dtsets(idtset)%nppert) 559 if(dtsets(idtset)%nppert>mpi_enregs(idtset)%nproc) then 560 ABI_ERROR('The number of processors must not be smaller than nppert !') 561 end if 562 call initmpi_pert(dtsets(idtset),mpi_enregs(idtset)) 563 mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell 564 nproc=mpi_enregs(idtset)%nproc_cell 565 end if 566 ! Cycle if the processor is not used 567 if (mpi_enregs(idtset)%me<0) then 568 ABI_FREE(intarr) 569 ABI_FREE(dprarr) 570 cycle 571 end if 572 573 ! Compute processor distribution over kpt (and eventually band-fft) 574 call initmpi_grid(mpi_enregs(idtset)) 575 if(dtsets(idtset)%usewvl==1) mpi_enregs(idtset)%comm_fft=mpi_enregs(idtset)%comm_cell 576 577 ! Initialize tabs used for k/spin parallelism (with sequential-type values) 578 ABI_MALLOC(mpi_enregs(idtset)%proc_distrb,(nkpt,mband_upper,nsppol)) 579 ABI_MALLOC(mpi_enregs(idtset)%my_kpttab,(nkpt)) 580 mpi_enregs(idtset)%proc_distrb(:,:,:)=0 581 mpi_enregs(idtset)%my_kpttab(:)=(/(ii,ii=1,nkpt)/) 582 mpi_enregs(idtset)%my_isppoltab(:)=1;if (dtsets(idtset)%nsppol==1) mpi_enregs(idtset)%my_isppoltab(2)=0 583 584 ! HF or hybrid calculation : initialization of the array distrb_hf 585 if (dtsets(idtset)%usefock==1) then 586 ABI_MALLOC(mpi_enregs(idtset)%distrb_hf,(dtsets(idtset)%nkpthf,dtsets(idtset)%nbandhf,1)) 587 ! The dimension of distrb_hf are given by %nkpthf and %nbandhf. 588 ! We assume that there will be no dependence in spinpol for all the occupied states. 589 mpi_enregs(idtset)%distrb_hf=0 590 end if 591 592 ! Define k-points distribution (determine who I am) 593 ! Note that nkpt_me may differ from processor to processor 594 ! This fact will NOT be taken into account when 595 ! the memory needs will be evaluated in the subroutine memory. 596 ! Also, the reduction of k points due to symmetry in RF calculations 597 ! is NOT taken into account. This should be changed later ... 598 nkpt_me=nkpt 599 mband_mem=0 600 if(xmpi_paral==1 .and. dtsets(idtset)%usewvl == 0) then 601 nkpt_me=0 602 if(response==0 .or. (response==1 .and. dtsets(idtset)%efmas==1))then 603 mpi_enregs(idtset)%paralbd=0 604 call distrb2(mband_upper,mband_mem,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset)) 605 do iikpt=1,nkpt 606 if(.not.(proc_distrb_cycle(mpi_enregs(idtset)%proc_distrb,iikpt,1,1,-1,mpi_enregs(idtset)%me_kpt)))& 607 & nkpt_me=nkpt_me+1 608 end do ! ikpt=1,nkpt 609 ! HF or hybrid calculation : define the occupied states distribution (in array distrb_hf) 610 if (dtsets(idtset)%usefock==1) then 611 call distrb2_hf(dtsets(idtset)%nbandhf,dtsets(idtset)%nkpthf,nproc,nsppol,mpi_enregs(idtset)) 612 end if 613 else ! response==1 614 ! TODO: check or remove the following comment which seems outdated 615 ! Wrongly assumes that the number of elements of the 616 ! k-point sets of the two spin polarizations is the maximal 617 ! value of one of these k-point sets ... 618 ! This is to be corrected when RF is implemented 619 ! for spin-polarized case. 620 ! ENDTODO 621 mpi_enregs(idtset)%paralbd=1 622 ! nproc=mpi_enregs(idtset)%nproc_cell*mpi_enregs(idtset)%nproc_pert 623 call distrb2(mband_upper,mband_mem,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset)) 624 do isppol=1,nsppol 625 nspink=0 626 do iikpt=1,nkpt 627 do iband=1,dtsets(idtset)%nband(iikpt+(isppol-1)*nkpt) 628 if(mpi_enregs(idtset)%proc_distrb(iikpt,iband,isppol)==mpi_enregs(idtset)%me_cell)then 629 nspink=nspink+1 630 exit 631 end if 632 end do ! iband 633 end do ! iikpt 634 if(nspink>nkpt_me)nkpt_me=nspink 635 end do ! isppol 636 ! Is nband present in input file or automatically estimated ? 637 tnband=0 638 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nband',tnband,'INT') 639 ! If the number of bands was estimated, there might be a side effect 640 ! when the definitive number of bands is known. k points 641 ! might be attributed to different processors than the present 642 ! proc_distrb describes. At most, the number of k points could increase by 1 ... 643 if(tnband==0)nkpt_me=nkpt_me+1 644 ! In any case, the maximal number of k points is nkpt 645 if(nkpt_me>nkpt)nkpt_me=nkpt 646 647 ! mband_mem 648 ABI_MALLOC (mybands, (mband_upper)) 649 mband_mem = 0 650 do isppol=1,nsppol 651 do iikpt=1,nkpt 652 mybands = 0 653 do iband=1,dtsets(idtset)%nband(iikpt+(isppol-1)*nkpt) 654 if(mpi_enregs(idtset)%proc_distrb(iikpt,iband,isppol)==mpi_enregs(idtset)%me_band)then 655 mybands(iband)=1 656 end if 657 end do ! iband 658 mband_mem = max(mband_mem, sum(mybands)) 659 end do ! iikpt 660 end do ! isppol 661 ABI_FREE (mybands) 662 end if ! response case 663 end if 664 end if 665 if (mband_mem == 0) mband_mem = mband_upper 666 dtsets(idtset)%mband_mem = mband_mem 667 668 ! Take care of mkmems. Use the generic name -mkmem- for mkmem as well as mkqmem 669 ! and mk1mem. 670 nm_mkmem(1)='mkmem ' 671 nm_mkmem(2)='mkqmem' 672 nm_mkmem(3)='mk1mem' 673 674 do ii=1,3 675 676 ! Read in mkmem here if it is in the input file 677 ! TODO: mkmem is not supported any longer. These variables can be removed. 678 if(ii==1)then 679 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mkmem',tread0,'INT') 680 else if(ii==2)then 681 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mkqmem',tread0,'INT') 682 else if(ii==3)then 683 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mk1mem',tread0,'INT') 684 end if 685 686 687 ! Note that mkmem is used as a dummy variable, representing mkmem as well 688 ! as mkqmem, and mk1mem. 689 if(tread0==1) then 690 mkmem=intarr(1) 691 if (mkmem<0) then 692 ! mkmem is unreasonable; must be zero or positive 693 write(msg, '(4a,i0,4a)')& 694 nm_mkmem(ii),' must be positive or null but ',nm_mkmem(ii),' =',mkmem,ch10,& 695 'Use default ',nm_mkmem(ii),' = nkpt .' 696 ABI_WARNING(msg) 697 mkmem=nkpt 698 end if 699 700 else 701 702 ! mkmem was not set in the input file so default to incore solution 703 !write(msg,'(6a)') & 704 !'mpi_setup: ',nm_mkmem(ii),' undefined in the input file.','Use default ',nm_mkmem(ii),' = nkpt' 705 !call wrtout(std_out, msg) 706 mkmem=nkpt 707 end if 708 709 ! Check whether nkpt distributed on the processors <= mkmem; 710 ! if so then may run entirely in core, 711 ! avoiding i/o to disk for wavefunctions and kg data. 712 ! mkmem/=0 to avoid i/o; mkmem==0 to use disk i/o for nkpt>=1. 713 if (nkpt_me<=mkmem .and. mkmem/=0 ) then 714 write(msg, '(a,i0,a,a,a,i0,a)' ) & 715 ' mpi_setup: With nkpt_me=',nkpt_me,' and ',nm_mkmem(ii),' = ',mkmem,', ground state wf handled in core.' 716 if (prtvol > 0) call wrtout(std_out,msg) 717 if(nkpt_me<mkmem .and. nkpt_me/=0)then 718 write(msg,'(3a)')' Resetting ',nm_mkmem(ii),' to nkpt_me to save memory space.' 719 mkmem=nkpt_me 720 if (prtvol > 0) call wrtout(std_out,msg) 721 end if 722 else if(mkmem/=0)then 723 write(msg, '(a,i0,3a,i0,5a)' ) & 724 ' mpi_setup: With nkpt_me=',nkpt_me,'and ',nm_mkmem(ii),' = ',mkmem,& 725 ' ground state wf require disk i/o.',ch10,& 726 ' Resetting ',nm_mkmem(ii),' to zero to save memory space.' 727 mkmem=0 728 if (prtvol > 0) call wrtout(std_out,msg) 729 end if 730 if(dtsets(idtset)%usewvl == 0 .or. dtsets(idtset)%usepaw==1)then 731 if(ii==1)dtsets(idtset)%mkmem=mkmem 732 end if 733 if(ii==2)dtsets(idtset)%mkqmem=mkmem 734 if(ii==3)dtsets(idtset)%mk1mem=mkmem 735 736 if(dtsets(idtset)%usewvl == 1 .and. dtsets(idtset)%usepaw==1 )then 737 if(dtsets(idtset)%mkmem .ne. dtsets(idtset)%nkpt) then 738 ABI_ERROR("mkmem is not allowed for WVL+PAW") 739 end if 740 end if 741 742 end do ! End the loop on the three possiblities mkmem, mkqmem, mk1mem. 743 744 if(dtsets(idtset)%paral_kgb==1) mpi_enregs(idtset)%paralbd=0 745 746 ! Check if some MPI processes are empty (MBPT codes uses a complete different MPI algorithm) 747 do_check = all(optdriver /= [RUNL_SCREENING, RUNL_SIGMA, RUNL_BSE, RUNL_EPH, RUNL_GWR]) 748 if (dtsets(idtset)%usewvl == 0 .and. do_check) then 749 if (.not.mpi_distrib_is_ok(mpi_enregs(idtset),mband_upper,& 750 dtsets(idtset)%nkpt,dtsets(idtset)%mkmem,nsppol,msg=msg)) then 751 write(msg,'(5a)') trim(msg),ch10,& 752 'YOU ARE STRONGLY ADVISED TO ACTIVATE AUTOMATIC PARALLELIZATION!',ch10,& 753 'USE "AUTOPARAL=1" IN THE INPUT FILE.' 754 ABI_WARNING(msg) 755 end if 756 end if 757 758 ! call mpi_setup1(dtsets(idtset),jdtset,lenstr,mband_upper,mpi_enregs(idtset),string) 759 ! Printing of processor distribution 760 ! MPIWF : here, set up the complete ngfft, containing the information 761 ! for the parallelisation of the FFT 762 call abi_io_redirect(new_io_comm=mpi_enregs(idtset)%comm_world) 763 call libpaw_write_comm_set(mpi_enregs(idtset)%comm_world) 764 765 ! Default values for sequential case 766 paral_fft=0; nproc_fft=1; me_fft=0 767 768 if(dtsets(idtset)%usewvl == 0)then 769 if(optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS) then 770 paral_fft=1 ! parallelisation over FFT 771 if (mpi_enregs(idtset)%nproc_cell>0) then 772 if(mpi_enregs(idtset)%paral_kgb == 1) then 773 774 if((dtsets(idtset)%use_gpu_cuda==1).and.(mpi_enregs(idtset)%nproc_fft/=1))then 775 write(msg,'(3a,i0)') & 776 'When use_gpu_cuda is on, the number of FFT processors, npfft, must be 1',ch10,& 777 'However, npfft=',mpi_enregs(idtset)%nproc_fft 778 ABI_ERROR(msg) 779 end if 780 781 if(modulo(dtsets(idtset)%ngfft(2),mpi_enregs(idtset)%nproc_fft)/=0)then 782 write(msg,'(3a,i0,a,i0)') & 783 'The number of FFT processors, npfft, should be a multiple of ngfft(2).',ch10,& 784 'However, npfft=',mpi_enregs(idtset)%nproc_fft,' and ngfft(2)=',dtsets(idtset)%ngfft(2) 785 ABI_BUG(msg) 786 end if 787 788 do iikpt=1,nkpt*nsppol 789 iikpt_modulo = modulo(iikpt,nkpt)+1 790 if ((dtsets(idtset)%istwfk(iikpt_modulo)==2)) then !.and.(dtsets(idtset)%ngfft(7)==401)) then 791 if ((mpi_enregs(idtset)%bandpp==0).or. & 792 ((mpi_enregs(idtset)%bandpp/=1).and.(modulo(mpi_enregs(idtset)%bandpp,2)/=0))) then 793 write(msg,'(3a,i0)') & 794 'The number bandpp should be 1 or a multiple of 2',ch10,& 795 'However, bandpp=',mpi_enregs(idtset)%bandpp 796 ABI_BUG(msg) 797 end if 798 if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then 799 write(msg,'(3a,i0,a,i0)') & 800 'The number of bands for the k-point, nband_k, should be a multiple of nproc_band*bandpp.',ch10,& 801 'However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', & 802 mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp 803 ABI_BUG(msg) 804 end if 805 else if ((dtsets(idtset)%istwfk(iikpt_modulo)==2) .and. (dtsets(idtset)%ngfft(7)==400)) then 806 ABI_BUG('The fftalg=400 with istwfk=2 is not valid') 807 else 808 if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then 809 write(msg,'(3a,i0,a,i0)') & 810 'The number of band for the k-point, nband_k, should be a multiple of nproc_band*bandpp.',ch10,& 811 'However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', & 812 mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp 813 ABI_BUG(msg) 814 end if 815 if ((mpi_enregs(idtset)%bandpp==0)) then 816 write(msg,'(a,i0,2a,i0,2a,i0)')& 817 'The number bandpp should not be 0 with fftalg=',dtsets(idtset)%ngfft(7),ch10,& 818 'and istwfk=',dtsets(idtset)%istwfk(iikpt_modulo),ch10,& 819 'However, bandpp=',mpi_enregs(idtset)%bandpp 820 ABI_BUG(msg) 821 end if 822 end if 823 end do 824 825 if (xmpi_paral==1) then 826 if(modulo(nkpt*nsppol,mpi_enregs(idtset)%nproc_spkpt)/=0)then 827 write(msg,'(3a,i0,a,i0)') & 828 'The number of KPT processors, np_spkpt, should be a multiple of nkpt*nsppol.',ch10,& 829 'However, np_spkpt=',mpi_enregs(idtset)%nproc_spkpt,' and nkpt*nsppol=',nkpt*nsppol 830 ABI_WARNING(msg) 831 end if 832 end if 833 else 834 do iikpt=1,nkpt*nsppol 835 iikpt_modulo = modulo(iikpt,nkpt)+1 836 if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then 837 write(msg,'(3a,i0,a,i0)') & 838 'The number of band for the k-point, nband_k, should be a multiple of npband*bandpp.',ch10,& 839 'However, nband_k=',dtsets(idtset)%nband(iikpt),' and npband*bandpp=', & 840 mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp 841 ABI_BUG(msg) 842 end if 843 end do 844 end if 845 end if 846 nproc_fft=mpi_enregs(idtset)%nproc_fft 847 me_fft=mpi_enregs(idtset)%me_fft 848 end if 849 end if 850 851 ! Compute mgfft,mpw,nfft for this data set (it is dependent of mpi_enreg) 852 ABI_MALLOC(istwfk,(nkpt)) 853 ABI_MALLOC(kpt_with_shift,(3,nkpt)) 854 855 ! Set the default value of fftalg for given npfft but allow the user to override it. 856 ! Warning: If you need to change npfft, **DO IT** before this point so that here we get the correct fftalg 857 dtsets(idtset)%ngfft(7) = fftalg_for_npfft(dtsets(idtset)%npfft) 858 dtsets(idtset)%ngfftdg(7) = fftalg_for_npfft(dtsets(idtset)%npfft) 859 860 fftalg_read=.false. 861 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'fftalg',tread0,'INT') 862 863 if (tread0==1) then 864 dtsets(idtset)%ngfft(7)=intarr(1) 865 if (usepaw==1) dtsets(idtset)%ngfftdg(7)=intarr(1) 866 fftalg_read=.true. 867 end if 868 869 ecut =dtsets(idtset)%ecut 870 dilatmx =dtsets(idtset)%dilatmx 871 ngfft(:) =dtsets(idtset)%ngfft(:) 872 istwfk(:)=dtsets(idtset)%istwfk(1:nkpt) 873 nsym =dtsets(idtset)%nsym 874 875 nqpt=dtsets(idtset)%nqpt 876 qphon(:)=zero;if(nqpt/=0) qphon(:)=dtsets(idtset)%qptn(:) 877 878 ABI_MALLOC(symrel,(3,3,nsym)) 879 symrel(:,:,1:nsym)=dtsets(idtset)%symrel(:,:,1:nsym) 880 ecut_eff=ecut*dilatmx**2 881 882 if (usepaw==1) call wrtout(std_out,'getng is called for the coarse grid:') 883 kpt=k0; if (response==1.and.usepaw==1) kpt=qphon ! this is temporary 884 885 call getng(dtsets(idtset)%boxcutmin,dtsets(idtset)%chksymtnons,ecut_eff,gmet,kpt,me_fft,mgfft,nfft,& 886 & ngfft,nproc_fft,nsym,paral_fft,symrel,dtsets(idtset)%tnons,& 887 & use_gpu_cuda=dtsets(idtset)%use_gpu_cuda) 888 889 dtsets(idtset)%ngfft(:)=ngfft(:) 890 dtsets(idtset)%mgfft=mgfft 891 dtsets(idtset)%nfft=nfft 892 kpt_with_shift(:,:)=dtsets(idtset)%kpt(:,1:nkpt)/dtsets(idtset)%kptnrm 893 894 exchn2n3d=dtsets(idtset)%exchn2n3d 895 nproc_fft=ngfft(10) ; me_fft=ngfft(11) 896 fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10) 897 898 ! Initialize tables for MPI-FFT. 899 call init_distribfft(mpi_enregs(idtset)%distribfft,'c',mpi_enregs(idtset)%nproc_fft,ngfft(2),ngfft(3)) 900 901 if(response/=0)then 902 ! This value of mpw is used in the first part of respfn.f 903 call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw_k,nkpt) 904 end if 905 if(nqpt/=0)then 906 kpt_with_shift(1,:)=kpt_with_shift(1,:)+qphon(1) 907 kpt_with_shift(2,:)=kpt_with_shift(2,:)+qphon(2) 908 kpt_with_shift(3,:)=kpt_with_shift(3,:)+qphon(3) 909 end if 910 if (dtsets(idtset)%usewvl == 0) then 911 call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw,nkpt) 912 ! Allocate tables for parallel IO of the wavefunctions. 913 if( xmpi_mpiio==1 .and. mpi_enregs(idtset)%paral_kgb == 1 .and. & 914 & any(dtsets(idtset)%iomode == [IO_MODE_MPI, IO_MODE_ETSF])) then 915 ABI_MALLOC(mpi_enregs(idtset)%my_kgtab,(mpw,dtsets(idtset)%mkmem)) 916 end if 917 else 918 mpw = 0 919 end if 920 921 ! The dimensioning, in the RF case, should be done only with mpw, 922 ! but mpw is used in the first part of respfn.f, and should at least 923 ! be equal to mpw_k . The chosen way to code is not optimal, only convenient : 924 ! it leads to a small waste of memory. 925 if(response/=0 .and. mpw_k>mpw)mpw=mpw_k 926 dtsets(idtset)%ngfft(:)=ngfft(:) 927 928 ! Initialize ngfftc to the initial guess for the coarse mesh 929 ngfftc(:) = 2 930 931 ! In case of PAW, compute fine FFT parameters 932 if (usepaw==1) then 933 ecutdg_eff=dtsets(idtset)%pawecutdg*dtsets(idtset)%dilatmx**2 934 ngfftdg(:)=dtsets(idtset)%ngfftdg(:) 935 call wrtout(std_out,'getng is called for the fine grid:') 936 ! Start with the coarse mesh as an initial guess for the fine mesh 937 ! This ensures that the fine mesh will not be any coarser than the coarse mesh in each dimension 938 ngfftc(:) = ngfft(1:3) 939 kpt=k0; if (response==1.and.usepaw==1) kpt=qphon ! this is temporary 940 941 call getng(dtsets(idtset)%bxctmindg,dtsets(idtset)%chksymtnons,& 942 & ecutdg_eff,gmet,kpt,me_fft,mgfftdg,& 943 & nfftdg,ngfftdg,nproc_fft,nsym,paral_fft,symrel,dtsets(idtset)%tnons,ngfftc,& 944 & use_gpu_cuda=dtsets(idtset)%use_gpu_cuda) 945 946 dtsets(idtset)%ngfftdg(:)=ngfftdg(:) 947 dtsets(idtset)%mgfftdg=mgfftdg 948 dtsets(idtset)%nfftdg=nfftdg 949 ! Compute fft distribution for fine grid 950 fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10) 951 call init_distribfft(mpi_enregs(idtset)%distribfft,'f', mpi_enregs(idtset)%nproc_fft,ngfftdg(2),ngfftdg(3)) 952 end if 953 954 dtsets(idtset)%mpw=mpw 955 ABI_FREE(symrel) 956 ABI_FREE(istwfk) 957 ABI_FREE(kpt_with_shift) 958 ABI_FREE(intarr) 959 ABI_FREE(dprarr) 960 961 ! Initialize data for the parallelization over atomic sites (PAW) 962 if (dtsets(idtset)%natom==1) dtsets(idtset)%paral_atom=0 963 if (dtsets(idtset)%usepaw==0) dtsets(idtset)%paral_atom=0 964 if (dtsets(idtset)%usewvl/=0) dtsets(idtset)%paral_atom=0 965 if (dtsets(idtset)%usedmft==1) dtsets(idtset)%paral_atom=0 966 if (optdriver/=RUNL_GSTATE.and.optdriver/=RUNL_RESPFN.and.optdriver/=RUNL_GWLS) dtsets(idtset)%paral_atom=0 967 if (dtsets(idtset)%macro_uj/=0) dtsets(idtset)%paral_atom=0 968 969 call initmpi_atom(dtsets(idtset),mpi_enregs(idtset)) 970 971 ! In case of the use of a GPU (Cuda), some defaults can change 972 ! according to a threshold on matrix sizes 973 if (dtsets(idtset)%use_gpu_cuda==1.or.dtsets(idtset)%use_gpu_cuda==-1) then 974 if (optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS) then 975 vectsize=dtsets(idtset)%mpw*dtsets(idtset)%nspinor/dtsets(idtset)%npspinor 976 if (all(dtsets(idtset)%istwfk(:)==2)) vectsize=2*vectsize 977 blocksize=dtsets(idtset)%npband*dtsets(idtset)%bandpp 978 if (dtsets(idtset)%paral_kgb==0) blocksize=dtsets(idtset)%npfft 979 if ((vectsize*blocksize**2)>=dtsets(idtset)%gpu_linalg_limit) then 980 if (.not.wfoptalg_read) then 981 dtsets(idtset)%wfoptalg=14 982 if (.not.fftalg_read) then 983 dtsets(idtset)%ngfft(7) = fftalg_for_npfft(dtsets(idtset)%npfft) 984 if (usepaw==1) dtsets(idtset)%ngfftdg(7) = fftalg_for_npfft(dtsets(idtset)%npfft) 985 end if 986 if (.not.ortalg_read) dtsets(idtset)%ortalg=-abs(dtsets(idtset)%ortalg) 987 end if 988 end if 989 end if 990 end if 991 992 ! initialize data for the parallelization for WVL: 993 if(dtsets(idtset)%usewvl==1) then 994 mpi_enregs(idtset)%comm_wvl=mpi_enregs(idtset)%comm_cell 995 mpi_enregs(idtset)%nproc_wvl=xmpi_comm_size(mpi_enregs(idtset)%comm_wvl) 996 mpi_enregs(idtset)%me_wvl=xmpi_comm_rank(mpi_enregs(idtset)%comm_wvl) 997 end if 998 999 end do 1000 1001 !This is not a very clean exit in case of paral_kgb<0 1002 if (iexit/=0)then 1003 ABI_STOP("Stopping now!") 1004 end if 1005 1006 DBG_EXIT("COLL") 1007 1008 end subroutine mpi_setup