TABLE OF CONTENTS
ABINIT/compute_kgb_indicator [ Functions ]
NAME
compute_kgb_indicator
FUNCTION
Only for "KGB" parallelism (LOBPCG algorithm for Ground-state): Give an indicator of performance for a given distribution of processors (npband, npfft and bandpp). Determine best choice of parameters for Scalapack and/or Magma Linear Algebra routines.
INPUTS
bandpp=internal lobpcg optimization variable glb_comm=communicator for global MPI communications mband=maximum number of bands. mband=maximum number of plane waves npband=number of processor 'band' npfft = number of processor 'fft' use_linalg_gpu=indicate if we also test the gpu linear algebra (compatible only with the legacy 2013 GPU code)
OUTPUT
acc_kgb = indicator of performance npslk = number of process to used in communicators
SIDE EFFECTS
This routine can be used to find an indicator in order to refine automatic process distribution. This indicator is returned in acc_kgb This routine can be used to find the optimal values of np_slk parameter (ScaLapack) and wheter or not we should use Magma for Linear Algebra in lobpcgwf
SOURCE
2076 subroutine compute_kgb_indicator(acc_kgb,bandpp,glb_comm,mband,mpw,npband,npfft,npslk,use_linalg_gpu) 2077 2078 use m_abi_linalg 2079 2080 !Arguments ------------------------------------ 2081 !scalars 2082 integer,intent(in) :: bandpp,glb_comm,mband,mpw,npband,npfft 2083 integer,intent(inout) :: npslk,use_linalg_gpu 2084 real(dp),intent(inout) :: acc_kgb 2085 2086 !Local variables------------------------------- 2087 !scalars 2088 integer,parameter :: max_number_of_npslk=10,max_number_of_iter=10 2089 integer :: blocksize,bigorder,ierr,ii,islk,islk1,iter,jj,keep_gpu 2090 integer :: kgb_comm,my_rank,np_slk,np_slk_max,np_slk_best,nranks 2091 integer :: use_lapack_gpu,use_slk,vectsize,wfoptalg 2092 real(dp) :: min_eigen,min_ortho,time_xeigen,time_xortho 2093 character(len=500) :: msg 2094 !arrays 2095 integer,allocatable :: ranks(:),val_npslk(:) 2096 real(dp),allocatable :: eigen(:),grama(:,:),gramb(:,:) 2097 complex(dpc),allocatable :: blockvectorbx(:,:),blockvectorx(:,:),sqgram(:,:) 2098 2099 !****************************************************************** 2100 2101 DBG_ENTER("COLL") 2102 2103 #ifdef DEBUG_MODE 2104 write(msg,'(a,3i3)') 'compute_kgb_indicator : (bpp,npb,npf) = ', bandpp, npband, npfft 2105 call wrtout(std_out,msg,'PERS') 2106 #endif 2107 2108 !Create local communicator for test 2109 if (xmpi_paral==1) then 2110 nranks=npfft*npband 2111 ABI_MALLOC(ranks,(nranks)) 2112 ranks=(/((my_rank-1),my_rank=1,nranks)/) 2113 kgb_comm=xmpi_subcomm(glb_comm,nranks,ranks,my_rank_in_group=my_rank) 2114 ABI_FREE(ranks) 2115 else 2116 kgb_comm=xmpi_comm_self 2117 my_rank=0 2118 end if 2119 2120 !Only for process in the new subgroup 2121 if (my_rank/=xmpi_undefined) then 2122 2123 ! We enforce vectsize >=blocksize (This is not true in lobpcg but 2124 ! these are rare cases and this simplify the matrix constructions below...) 2125 blocksize=npband*bandpp 2126 vectsize=max(1+mpw/(npband*npfft),blocksize) 2127 bigorder=3*blocksize 2128 2129 ABI_MALLOC(blockvectorx,(vectsize,blocksize)) 2130 ABI_MALLOC(blockvectorbx,(vectsize,blocksize)) 2131 ABI_MALLOC(sqgram,(blocksize,blocksize)) 2132 ABI_MALLOC(grama,(2*bigorder,bigorder)) 2133 ABI_MALLOC(gramb,(2*bigorder,bigorder)) 2134 ABI_MALLOC(eigen,(bigorder)) 2135 ABI_MALLOC(val_npslk,(max_number_of_npslk)) ! not too much values tested 2136 2137 min_eigen=greatest_real 2138 min_ortho=greatest_real 2139 np_slk_best=-1 ; np_slk_max=0 2140 #ifdef HAVE_LINALG_SCALAPACK 2141 np_slk_max=min(max_number_of_npslk,npband*npfft) 2142 #endif 2143 2144 ! Preselect a range of available np_slk values 2145 val_npslk(1:)=0 ; val_npslk(2)=1 2146 do islk=3,np_slk_max 2147 np_slk=val_npslk(islk-1)*2 2148 do while ((modulo(npband*npfft,np_slk)>0).and.(np_slk<(npband*npfft))) 2149 np_slk=np_slk+1 2150 end do 2151 if(np_slk>(npband*npfft).or.np_slk>mband) exit 2152 val_npslk(islk)=np_slk 2153 end do 2154 np_slk_max=islk-1 2155 2156 ! Loop over np_slk values 2157 islk1=1 2158 #ifdef HAVE_LINALG_MAGMA 2159 if (use_linalg_gpu==ABI_GPU_LEGACY) islk1=0 2160 #endif 2161 do islk=islk1,np_slk_max 2162 2163 time_xortho=zero ; time_xeigen=zero 2164 2165 use_slk=0 2166 if (islk==0) then 2167 ! This is the test for the GPU 2168 use_lapack_gpu=1 ; np_slk=0 2169 else 2170 use_lapack_gpu=0 ; np_slk=val_npslk(islk) 2171 if (np_slk>0) use_slk=1 2172 end if 2173 2174 ! Initialize linalg parameters for this np_slk value 2175 ! For the first np_slk value, everything is initialized 2176 ! For the following np_slk values, only Scalapack parameters are updated 2177 wfoptalg=14 ! Simulate use of LOBPCG 2178 call abi_linalg_init(bigorder,RUNL_GSTATE,wfoptalg,1,& 2179 & use_lapack_gpu,use_slk,np_slk,kgb_comm) 2180 2181 ! We could do mband/blocksize iter as in lobpcg but it's too long 2182 do iter=1,max_number_of_iter 2183 2184 ! Build matrixes 2185 do ii=1,vectsize 2186 do jj=1,blocksize 2187 if (ii>jj) then 2188 blockvectorx(ii,jj) =czero 2189 blockvectorbx(ii,jj)=czero 2190 else 2191 blockvectorx(ii,jj) =cone 2192 blockvectorbx(ii,jj)=cone 2193 end if 2194 end do 2195 end do 2196 grama=zero;gramb=zero 2197 do jj=1,bigorder 2198 do ii=jj,bigorder 2199 if (ii==jj) then 2200 grama(2*ii-1,jj)=one 2201 gramb(2*ii-1,jj)=one 2202 else 2203 grama(2*ii-1:2*ii,jj)=one 2204 grama(2*jj-1,ii)= one 2205 grama(2*jj ,ii)=-one 2206 end if 2207 end do 2208 end do 2209 2210 ! Call to abi_xorthonormalize 2211 time_xortho=time_xortho-abi_wtime() 2212 call abi_xorthonormalize(blockvectorx,blockvectorbx,blocksize,kgb_comm,sqgram,vectsize) 2213 time_xortho = time_xortho + abi_wtime() 2214 2215 ! Call to abi_xhegv 2216 time_xeigen=time_xeigen-abi_wtime() 2217 call abi_xhegv(1,'v','u',bigorder,grama,bigorder,gramb,bigorder,eigen,& 2218 & x_cplx=2,use_slk=use_slk,use_gpu_magma=use_lapack_gpu) 2219 time_xeigen=time_xeigen+abi_wtime() 2220 2221 end do ! iter 2222 2223 ! Finalize linalg parameters for this np_slk value 2224 ! For the last np_slk value, everything is finalized 2225 ! For the previous np_slk values, only Scalapack parameters are updated 2226 call abi_linalg_finalize(use_lapack_gpu) 2227 2228 time_xortho= time_xortho*mband/blocksize 2229 time_xeigen= time_xeigen*mband/blocksize 2230 if (time_xortho<min_ortho) min_ortho=time_xortho 2231 if (time_xeigen<min_eigen) then 2232 min_eigen=time_xeigen 2233 np_slk_best=np_slk 2234 keep_gpu=use_lapack_gpu 2235 end if 2236 2237 end do ! np_slk 2238 2239 #ifdef DEBUG_MODE 2240 write(msg,'(2(a,es15.3),a,i3)') ' In the best case, xortho took ',min_ortho,& 2241 ' and xeigen took ',min_eigen,' for np_slk=',np_slk_best 2242 call wrtout(std_out,msg,'PERS') 2243 #endif 2244 2245 ! Final values to be sent to others process 2246 acc_kgb=min_ortho+four*min_eigen 2247 npslk=max(np_slk_best,1) 2248 use_linalg_gpu=keep_gpu 2249 2250 ABI_FREE(blockvectorx) 2251 ABI_FREE(blockvectorbx) 2252 ABI_FREE(sqgram) 2253 ABI_FREE(grama) 2254 ABI_FREE(gramb) 2255 ABI_FREE(eigen) 2256 ABI_FREE(val_npslk) 2257 2258 end if ! my_rank in group 2259 2260 !Free local MPI communicator 2261 call xmpi_comm_free(kgb_comm) 2262 2263 !Broadcast of results to be sure every process has them 2264 call xmpi_bcast(acc_kgb,0,glb_comm,ierr) 2265 call xmpi_bcast(npslk,0,glb_comm,ierr) 2266 call xmpi_bcast(use_linalg_gpu,0,glb_comm,ierr) 2267 2268 #ifndef DEBUG_MODE 2269 ABI_UNUSED(msg) 2270 #endif 2271 2272 DBG_EXIT("COLL") 2273 2274 end subroutine compute_kgb_indicator
ABINIT/finddistrproc [ Functions ]
NAME
finddistrproc
FUNCTION
Given a total number of processors, find a suitable distribution that fill all the different levels of parallelization (npimage, nppert, np_spkpt, npspinor, npband, npfft, bandpp) Also determine parameters of parallel Linear Algebra routines (use_slk, np_slk, gpu_linalg_limit)
INPUTS
dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables, for all datasets; at this stage only datasets with index lower than idtset are already initalized filnam(5)=character strings giving file names idtset=number of the current dataset mpi_enreg=information about MPI parallelization mband=maximum number of bands. ndtset_alloc=number of datasets, corrected for allocation of at least one data set tread(11)=flags indicating wether parallel input parameters were read from input file tread(1) : paral_kgb tread(6) : npfft tread(2) : npimage tread(7) : npband tread(3) : nppert tread(8) : bandpp tread(4) : np_spkpt tread(9) : use_slk tread(5) : nspinor tread(10): np_slk tread(11) : gpu_linalg_limit
SIDE EFFECTS
iexit= if incremented, an exit is required dtset%paral_kgb= flag for band-fft parallelism dtset%npimage = number of processors for parallelisation over image dtset%nppert = number of processors for parallelisation over perturbations dtset%npspinor = number of processors for parallelisation on spinor components dtset%np_spkpt = number of processors for parallelisation on spin / k points dtset%npfft = number of processors for parallelisation on fft grid dtset%npband = number of processors for parallelisation on bands dtset%nphf = number of processors for parallelisation on occupied states for fock exchange dtset%bandpp = internal parameter for lobpcg parallelisation algorithm dtset%use_slk = flag for ScalaPAck use dtset%np_slk = number of processors used in ScaLapack routines dtset%gpu_linalg_limit=threshold activating Linear Algebra on GPU
SOURCE
1116 subroutine finddistrproc(dtsets,filnam,idtset,iexit,mband,mpi_enreg,ndtset_alloc,tread) 1117 1118 !Arguments ------------------------------------ 1119 !scalars 1120 integer,intent(in) :: idtset,mband,ndtset_alloc 1121 integer,intent(inout) :: iexit 1122 type(dataset_type),intent(inout),target :: dtsets(0:ndtset_alloc) 1123 type(MPI_type),intent(inout) :: mpi_enreg 1124 !arrays 1125 integer,intent(in) :: tread(11) 1126 character(len=fnlen),intent(in) :: filnam(5) 1127 1128 !Local variables------------------------------- 1129 !scalars 1130 !128 should be a reasonable maximum for npfft (scaling is very poor for npfft>20) 1131 integer,parameter :: ALGO_NOT_SET=-1, ALGO_DEFAULT_PAR=2 1132 integer,parameter :: ALGO_CG=0, ALGO_LOBPCG_OLD=1, ALGO_LOBPCG_NEW=2, ALGO_CHEBFI=3, ALGO_CHEBFI_NEW=4 1133 integer,parameter :: NPFMAX=128,BLOCKSIZE_MAX=3000,MAXBAND_PRINT=10 1134 integer,parameter :: MAXCOUNT=250,MAXPRINT=10,MAXBENCH=25,MAXABIPY=25,NPF_CUTOFF=20 1135 real(dp),parameter :: relative_nband_range=0.025 1136 integer :: wf_algo,wf_algo_global,bpp,bpp_max,bpp_min,optdriver,autoparal,nblocks,blocksize 1137 integer :: npi_max,npi_min,npc,npc_max,npc_min 1138 integer :: np_sk,np_sk_max,np_sk_min,npp_max,npp_min 1139 integer :: nps,nps_max,nps_min,npf,npf_max,npf_min 1140 integer :: npb,npb_max,npb_min,max_ncpus,ount,paral_kgb 1141 integer :: work_size,nks_per_proc,tot_ncpus 1142 integer :: ib1,ib2,ibest,icount,ii,imin,jj,kk,mcount,mcount_eff,mpw 1143 integer :: n2,n3,ncell_eff,ncount,nimage_eff,nkpt_eff,npert_eff 1144 integer :: nproc,nproc1,nprocmin,np_slk,nthreads,use_linalg_gpu,omp_ncpus 1145 logical :: dtset_found,file_found,first_bpp,iam_master 1146 logical :: with_image,with_pert,with_kpt,with_spinor,with_fft,with_band,with_bandpp,with_thread 1147 real(dp):: acc_c,acc_k,acc_kgb,acc_kgb_0,acc_s,ecut_eff,eff,ucvol,weight0 1148 character(len=10) :: suffix 1149 character(len=20) :: strg 1150 character(len=500) :: msg,msgttl 1151 character(len=fnlen) :: filden 1152 type(hdr_type) :: hdr0 1153 !arrays 1154 integer :: idum(1),idum3(3),ngmax(3),ngmin(3) 1155 integer,allocatable :: nband_best(:),isort(:),jdtset_(:) 1156 integer,allocatable :: my_algo(:),my_distp(:,:),nproc_best(:) 1157 integer,pointer :: nkpt_rbz(:) 1158 real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3),rprimd(3,3) 1159 real(dp),allocatable :: weight(:) 1160 real(dp),pointer :: nband_rbz(:,:) 1161 type(dataset_type),pointer :: dtset 1162 1163 !****************************************************************** 1164 1165 DBG_ENTER("COLL") 1166 1167 !Select current dataset 1168 dtset => dtsets(idtset) 1169 1170 !Is automatic parallelization activated? 1171 autoparal = dtset%autoparal 1172 if (autoparal==0) return 1173 1174 !Is it available 1175 if ((dtset%usefock==1).AND.(dtset%nphf/=1)) then 1176 ABI_ERROR("autoparal>0 not available for Hartree-Fock or hybrid XC calculations!") 1177 end if 1178 if ((autoparal>1).and.dtset%wfoptalg/=4.and.dtset%wfoptalg/=14) then 1179 ABI_ERROR("autoparal>1 only available for the old LOBPCG algorithm (wfoptalg=4/14)!") 1180 end if 1181 1182 ! Unit number used for outputting the autoparal sections 1183 ount = ab_out 1184 1185 ! From the documentation: 1186 ! 1187 ! If autoparal > 1 and max_ncpus is greater than 0, ABINIT analyzes the 1188 ! efficiency of the process distribution for each possible number of processors 1189 ! from 2 to max_ncpus. After having printed out the efficiency, the code stops. 1190 1191 ! Handy local variables 1192 iam_master = (mpi_enreg%me==0) 1193 optdriver = dtset%optdriver 1194 max_ncpus = dtset%max_ncpus ; if (dtset%paral_kgb<0) max_ncpus=abs(dtset%paral_kgb) 1195 nthreads=xomp_get_max_threads() 1196 nproc=mpi_enreg%nproc 1197 if (max_ncpus>0) nproc = dtset%max_ncpus/nthreads 1198 if (xmpi_paral==0.and.max_ncpus<=0) nproc=1 1199 1200 nprocmin=2 1201 if (xmpi_paral==1.and.max_ncpus<=0) nprocmin=max(2,nproc-100) 1202 if (max_ncpus>0.and.autoparal/=0) nprocmin=1 1203 1204 wf_algo_global=ALGO_NOT_SET 1205 if (dtset%wfoptalg==0.and.tread(1)==1) wf_algo_global=ALGO_CG 1206 if (dtset%wfoptalg==4.or.dtset%wfoptalg==14) wf_algo_global=ALGO_LOBPCG_OLD 1207 if (dtset%wfoptalg==114) wf_algo_global=ALGO_LOBPCG_NEW 1208 if (dtset%wfoptalg==1) wf_algo_global=ALGO_CHEBFI 1209 if (dtset%wfoptalg==111) wf_algo_global=ALGO_CHEBFI_NEW 1210 1211 ! Some peculiar cases (with direct exit) 1212 ! MG: What is the meaning of max_ncpus < 0. This is not documented! 1213 if (max_ncpus<=0) then 1214 if (nproc==1.and.max_ncpus<=0) then 1215 if (tread(1)==0.or.xmpi_paral==0) dtset%paral_kgb= 0 1216 if (tread(2)==0.or.xmpi_paral==0) dtset%npimage = 1 1217 if (tread(3)==0.or.xmpi_paral==0) dtset%nppert = 1 1218 if (tread(4)==0.or.xmpi_paral==0) dtset%npspinor = 1 1219 if (tread(5)==0.or.xmpi_paral==0) dtset%np_spkpt = 1 1220 if (tread(6)==0.or.xmpi_paral==0) dtset%npfft = 1 1221 if (tread(7)==0.or.xmpi_paral==0) dtset%npband = 1 1222 if (tread(8)==0.or.xmpi_paral==0) dtset%bandpp = 1 1223 if (tread(9)==0.or.xmpi_paral==0) dtset%use_slk = 0 1224 if (tread(10)==0.or.xmpi_paral==0) dtset%np_slk = 1000000 1225 return 1226 end if 1227 if ((optdriver/=RUNL_GSTATE.and. optdriver/=RUNL_RESPFN.and. optdriver/=RUNL_GWLS).or. & 1228 (optdriver==RUNL_GSTATE.and.dtset%usewvl==1)) then 1229 dtset%paral_kgb= 0 1230 dtset%npimage = max(1,dtset%npimage) 1231 dtset%nppert = max(1,dtset%nppert) 1232 dtset%npspinor = max(1,dtset%npspinor) 1233 dtset%np_spkpt = max(1,dtset%np_spkpt) 1234 dtset%npfft = max(1,dtset%npfft) 1235 dtset%npband = max(1,dtset%npband) 1236 dtset%bandpp = max(1,dtset%bandpp) 1237 return 1238 end if 1239 end if 1240 1241 ! Need the metric tensor 1242 call mkrdim(dtset%acell_orig(1:3,1),dtset%rprim_orig(1:3,1:3,1),rprimd) 1243 call metric(gmet,gprimd,-1,rmet,rprimd,ucvol) 1244 1245 ! Determine some quantities related to plane waves 1246 ! - Crude estimation of the number of PW 1247 ! - Number of G vectors in each direction 1248 mpw=0;ngmin=0;ngmax=0 1249 if (optdriver==RUNL_GSTATE) then 1250 ecut_eff = dtset%ecut*dtset%dilatmx**2 1251 mpw = nint(ucvol*((two*ecut_eff)**1.5_dp)/(six*pi**2)) ! Crude estimation 1252 if (all(dtset%istwfk(1:dtset%nkpt)>1)) mpw=mpw/2+1 1253 call kpgcount(ecut_eff,dtset%exchn2n3d,gmet,dtset%istwfk,dtset%kpt,ngmax,ngmin,dtset%nkpt) 1254 write(msg,'(a,i0)') ' getmpw sequential formula gave: ',mpw 1255 call wrtout(std_out,msg) 1256 end if 1257 1258 ! Parallelization over images 1259 npi_min=1;npi_max=1;nimage_eff=1 1260 if (optdriver==RUNL_GSTATE) then 1261 nimage_eff=dtset%ndynimage 1262 if (dtset%ntimimage<=1) nimage_eff=dtset%nimage 1263 npi_min=max(1,dtset%npimage) 1264 npi_max=min(nproc,nimage_eff) 1265 if (tread(2)==1) npi_max=dtset%npimage 1266 end if 1267 1268 !Parallelization over k-points and spin components (GS) 1269 np_sk_min=1;np_sk_max=1;nkpt_eff=0 1270 if (optdriver==RUNL_GSTATE) then 1271 nkpt_eff=dtset%nkpt*dtset%nsppol 1272 np_sk_min=max(1,dtset%np_spkpt) 1273 np_sk_max=min(nproc,nkpt_eff) 1274 if (tread(4)==1) np_sk_max=dtset%np_spkpt 1275 end if 1276 1277 !Parallelization over perturbations, k-points and spin components (DFPT) 1278 npp_min=1;npp_max=1;npert_eff=1 1279 if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then 1280 if (dtset%paral_rf==1) then 1281 call dtset%get_npert_rbz(nband_rbz, nkpt_rbz, npert_eff) 1282 do jj=1,npert_eff 1283 ii=dtset%nsppol*nkpt_rbz(jj)*maxval(nband_rbz(:,jj)) 1284 nkpt_eff=max(nkpt_eff,ii) 1285 end do 1286 npp_min=max(1,dtset%nppert) 1287 npp_max=min(nproc,npert_eff) 1288 if (tread(3)==1) then 1289 npp_max=dtset%nppert 1290 if (npp_max>npert_eff) then 1291 npp_min=npert_eff;npp_max=npert_eff 1292 ABI_WARNING('nppert is bigger than npert; we set nppert=npert') 1293 end if 1294 end if 1295 np_sk_min=1 1296 np_sk_max=min(nproc,nkpt_eff) 1297 ABI_FREE(nkpt_rbz) 1298 ABI_FREE(nband_rbz) 1299 else 1300 nkpt_eff=nproc 1301 np_sk_min=nproc-5 1302 np_sk_max=nproc 1303 end if 1304 end if 1305 1306 !Parallelization over spinorial components 1307 nps_min=1;nps_max=1 1308 if (optdriver==RUNL_GSTATE) then 1309 nps_min=max(1,dtset%npspinor) 1310 nps_max=min(nproc,dtset%nspinor) 1311 if (tread(5)==1) nps_max=dtset%npspinor 1312 end if 1313 1314 !KGB Parallelization 1315 1316 npf_min=1;npf_max=1 1317 npb_min=1;npb_max=1 1318 bpp_min=1;bpp_max=1 1319 n2=0;n3=0 1320 if (optdriver==RUNL_GSTATE) then 1321 1322 ! >> FFT level 1323 npf_min=max(1,dtset%npfft) 1324 npf_min=min(npf_min,ngmin(2)) 1325 npf_max=min(nproc,NPFMAX) 1326 if (tread(6)==1) then 1327 npf_max=dtset%npfft 1328 if (npf_max>ngmin(2)) then 1329 write(msg,'(3a)') & 1330 "Value of npfft given in input file is too high for the FFT grid!",ch10,& 1331 "Action: decrease npfft or increase FFT grid (ecut, ngfft, ...)." 1332 ABI_ERROR(msg) 1333 end if 1334 end if 1335 npf_max=min(npf_max,ngmin(2)) 1336 ! Deactivate MPI FFT parallelism for GPU 1337 if (dtset%gpu_option/=ABI_GPU_DISABLED) then 1338 npf_min=1;npf_max=1 1339 end if 1340 !Deactivate MPI FFT parallelism for GPU 1341 if (tread(1)==1.and.dtset%paral_kgb==0) then 1342 npf_min=1;npf_max=1 1343 end if 1344 !Deactivate MPI FFT parallelism for multi-threaded LOBPCG / CHEBFI 1345 if ((wf_algo_global==ALGO_LOBPCG_NEW.or.wf_algo_global==ALGO_CHEBFI.or.wf_algo_global==ALGO_CHEBFI_NEW).and.nthreads>1) then 1346 npf_min=1;npf_max=1 1347 end if 1348 1349 ! Number of FFT procs has to be a multiple of FFT grid sizes 1350 ! In case of a restart from a density file, it has to be 1351 ! compatible with the FFT grid used for the density 1352 n2=dtset%ngfft(2) ; n3=dtset%ngfft(3) 1353 if (n2==0.and.n3==0) then 1354 if (dtset%getden/=0.or.dtset%irdden/=0.or.& 1355 & dtset%getkden/=0.or.dtset%irdkden/=0.or.dtset%iscf<0) then 1356 dtset_found=.false.;file_found=.false. 1357 !1-Try to find ngfft from previous dataset 1358 if (dtset%getden/=0.or.dtset%getkden/=0) then 1359 do ii=1,ndtset_alloc 1360 jj=dtset%getden;if (jj==0) jj=dtset%getkden 1361 if (jj<0) jj=dtset%jdtset+jj 1362 if (dtsets(ii)%jdtset==jj) then 1363 dtset_found=.true. 1364 n2=dtsets(ii)%ngfftdg(2);n3=dtsets(ii)%ngfftdg(3) 1365 end if 1366 end do 1367 end if 1368 !2-If not found, try to extract ngfft from density file 1369 if (.not.dtset_found) then 1370 !Retrieve file name 1371 if (dtset%getden/=0.or.dtset%irdden/=0) then 1372 suffix='_DEN';if (dtset%nimage>1) suffix='_IMG1_DEN' 1373 else if (dtset%getkden/=0.or.dtset%irdkden/=0) then 1374 suffix='_KDEN';if (dtset%nimage>1) suffix='_IMG1_KDEN' 1375 end if 1376 ABI_MALLOC(jdtset_,(0:ndtset_alloc)) 1377 jdtset_=0;if(ndtset_alloc/=0) jdtset_(0:ndtset_alloc)=dtsets(0:ndtset_alloc)%jdtset 1378 call mkfilename(filnam,filden,dtset%getden,idtset,dtset%irdden,jdtset_,ndtset_alloc,suffix,'den',ii) 1379 ABI_FREE(jdtset_) 1380 !Retrieve ngfft from file header 1381 idum3=0 1382 if (mpi_enreg%me==0) then 1383 inquire(file=trim(filden),exist=file_found) 1384 if (file_found) then 1385 call hdr_read_from_fname(hdr0,filden,ii,xmpi_comm_self) 1386 idum3(1:2)=hdr0%ngfft(2:3);if (file_found) idum3(3)=1 1387 call hdr0%free() 1388 ABI_WARNING("Cannot find filden "//filden) 1389 end if 1390 end if 1391 call xmpi_bcast(idum3,0,mpi_enreg%comm_world,ii) 1392 n2=idum3(1);n3=idum3(2);file_found=(idum3(3)/=0) 1393 end if 1394 end if 1395 end if 1396 1397 ! >> Band level 1398 npb_min=max(1,dtset%npband) 1399 npb_max=min(nproc,mband) 1400 if (tread(7)==1) npb_max=dtset%npband 1401 if (tread(1)==1.and.dtset%paral_kgb==0) then 1402 npb_min=1;npb_max=1 1403 end if 1404 1405 ! >> banddp level 1406 if (tread(8)==1) then 1407 bpp_min = dtset%bandpp 1408 else 1409 bpp_min = 1 1410 end if 1411 bpp_max=mband 1412 if (wf_algo_global==ALGO_LOBPCG_OLD) bpp_max=max(4,nint(mband/10.)) ! reasonable bandpp max 1413 if (tread(8)==1) bpp_max=dtset%bandpp 1414 if (wf_algo_global==ALGO_CHEBFI) bpp_min=1 ! bandpp not used with ChebFi 1415 if (wf_algo_global==ALGO_CHEBFI) bpp_max=1 1416 if (wf_algo_global==ALGO_CHEBFI_NEW) bpp_min=1 ! bandpp not used with ChebFi 1417 if (wf_algo_global==ALGO_CHEBFI_NEW) bpp_max=1 ! bandpp not used with ChebFi 1418 1419 end if ! RUNL_GSTATE 1420 1421 !Disable KGB parallelisation in some cases: 1422 ! - no GS 1423 ! - paral_kgb=0 present in input file 1424 ! - nstep=0 1425 ! - Hartree-Fock or hybrid calculation (for now on) 1426 if ( (optdriver/=RUNL_GSTATE).or.(dtset%paral_kgb==0.and.tread(1)==1).or. & 1427 (dtset%nstep==0).or.(dtset%usefock==1)) then 1428 nps_min=1; nps_max=1 1429 npf_min=1; npf_max=1 1430 npb_min=1; npb_max=1 1431 bpp_min=1; bpp_max=1 1432 end if 1433 1434 ! Which levels of parallelism do we have? 1435 with_image =(npi_min/=1.or.npi_max/=1) 1436 with_pert =(npp_min/=1.or.npp_max/=1) 1437 with_kpt =(np_sk_min/=1.or.np_sk_max/=1) 1438 with_spinor=(nps_min/=1.or.nps_max/=1) 1439 with_fft =(npf_min/=1.or.npf_max/=1) 1440 with_band =(npb_min/=1.or.npb_max/=1) 1441 with_bandpp=(bpp_min/=1.or.bpp_max/=1) 1442 with_thread=(nthreads>1) 1443 1444 !Allocate lists 1445 ABI_MALLOC(my_distp,(10,MAXCOUNT)) 1446 ABI_MALLOC(weight,(MAXCOUNT)) 1447 ABI_MALLOC(my_algo,(MAXCOUNT)) 1448 my_distp(1:7,:)=0;weight(:)=zero 1449 my_distp(8,:)=dtset%use_slk 1450 my_distp(9,:)=dtset%np_slk 1451 my_distp(10,:)=dtset%gpu_linalg_limit 1452 my_algo(:)=wf_algo_global 1453 icount=0;imin=1 1454 1455 !Cells= images or perturbations 1456 npc_min=1;npc_max=1;ncell_eff=1 1457 if (optdriver==RUNL_GSTATE) then 1458 ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max 1459 end if 1460 if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then 1461 ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max 1462 end if 1463 1464 !Loop over all possibilities 1465 !Computation of weight~"estimated acceleration" 1466 !================================================================ 1467 1468 !Cells= images or perturbations 1469 npc_min=1;npc_max=1;ncell_eff=1 1470 if (optdriver==RUNL_GSTATE) then 1471 ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max 1472 end if 1473 if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then 1474 ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max 1475 end if 1476 1477 !>>>>> CELLS 1478 do npc=npc_min,npc_max 1479 acc_c=one;if (npc>1) acc_c=0.99_dp*speedup_fdp(ncell_eff,npc) 1480 1481 ! >>>>> K-POINTS 1482 do np_sk=np_sk_min,np_sk_max 1483 ! -> for DFPT runs, impose that nsppol divides np_sk 1484 if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE]) .and. modulo(np_sk,dtset%nsppol)>0.and.np_sk>1) cycle 1485 acc_k=one;if (np_sk>1) acc_k=0.96_dp*speedup_fdp(nkpt_eff,np_sk) 1486 1487 ! >>>>> SPINORS 1488 do nps=nps_min,nps_max 1489 acc_s=one;if (nps>1) acc_s=0.85_dp*speedup_fdp(dtset%nspinor,nps) 1490 1491 ! >>>>> FFT 1492 do npf=npf_min,npf_max 1493 ! -> npf should divide ngfft if set (if unset, ngfft=0 so the modulo test is ok) 1494 if((modulo(n2,npf)>0).or.(modulo(n3,npf)>0)) cycle 1495 ! -> npf should be only divisible by 2, 3 or 5 1496 ii=npf 1497 do while (modulo(ii,2)==0) 1498 ii=ii/2 1499 end do 1500 do while (modulo(ii,3)==0) 1501 ii=ii/3 1502 end do 1503 do while (modulo(ii,5)==0) 1504 ii=ii/5 1505 end do 1506 if(ii/=1) cycle 1507 1508 ! Change algo if npfft>1 1509 wf_algo=wf_algo_global 1510 if (optdriver==RUNL_GSTATE.and.npf>1.and. wf_algo_global==ALGO_NOT_SET) wf_algo=ALGO_DEFAULT_PAR 1511 1512 ! FFT parallelism not compatible with multithreading 1513 if (wf_algo==ALGO_LOBPCG_NEW.or.wf_algo==ALGO_CHEBFI.or.wf_algo==ALGO_CHEBFI_NEW) then 1514 if (nthreads>1.and.npf>1) cycle 1515 end if 1516 1517 ! >>>>> BANDS 1518 do npb=npb_min,npb_max 1519 nproc1=npc*np_sk*nps*npf*npb 1520 if (nproc1<nprocmin) cycle 1521 if (nproc1>nproc) cycle 1522 if (modulo(mband,npb)>0) cycle 1523 1524 ! Change algo if npband>1 1525 if (optdriver==RUNL_GSTATE.and.npb>1.and. wf_algo_global==ALGO_NOT_SET) wf_algo=ALGO_DEFAULT_PAR 1526 1527 ! Base speedup 1528 acc_kgb_0=one;if (npb*npf*nthreads>1) acc_kgb_0=0.7_dp*speedup_fdp(mpw,(npb*npf*nthreads)) 1529 1530 if (npb*npf>4.and.wf_algo==ALGO_LOBPCG_OLD) then 1531 ! Promote npb=npf 1532 acc_kgb_0=acc_kgb_0*min((one*npf)/(one*npb),(one*npb)/(one*npf)) 1533 ! Promote npf<=20 1534 if (npf>20)then 1535 acc_kgb_0=acc_kgb_0* & 1536 & 0.2_dp+(one-0.2_dp)*(sin((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))) & 1537 & /((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))))**2 1538 end if 1539 end if 1540 1541 first_bpp=.true. 1542 do bpp=bpp_min,bpp_max 1543 1544 if (wf_algo==ALGO_LOBPCG_NEW) then 1545 blocksize=npb*bpp;nblocks=mband/blocksize 1546 if (modulo(bpp,nthreads)>0) cycle 1547 if ((bpp>1).and.(modulo(bpp,2)>0)) cycle 1548 if (modulo(mband,npb*bpp)>0) cycle 1549 else if (wf_algo==ALGO_LOBPCG_OLD) then 1550 blocksize=npb*bpp;nblocks=mband/blocksize 1551 if (modulo(mband/npb,bpp)>0) cycle 1552 if ((bpp>1).and.(modulo(bpp,2)>0)) cycle 1553 if (one*npb*bpp >max(1.,mband/3.).and.(mband>30)) cycle 1554 if (npb*npf<=4.and.(.not.first_bpp)) cycle 1555 else if (wf_algo==ALGO_CHEBFI .or. wf_algo==ALGO_CHEBFI_NEW) then 1556 !Nothing 1557 else 1558 if (bpp/=1.or.npb/=1) cycle 1559 end if 1560 1561 first_bpp=.false. 1562 1563 acc_kgb=acc_kgb_0 1564 ! OLD LOBPCG: promote bpp*npb>mband/3 1565 if (wf_algo==ALGO_LOBPCG_OLD) then 1566 if (npb*npf>4.and.mband>30) acc_kgb=acc_kgb*(one-(three*bpp*npb)/(one*mband)) 1567 end if 1568 ! NEW LOBPCG: promote minimal number of blocks 1569 ! promote block size <= BLOCKSIZE_MAX 1570 if (wf_algo==ALGO_LOBPCG_NEW) then 1571 acc_kgb=acc_kgb*(one-0.9_dp*dble(nblocks-1)/dble(mband-1)) 1572 if (blocksize>BLOCKSIZE_MAX) acc_kgb=acc_kgb*max(0.1_dp,one-dble(blocksize)/dble(10*BLOCKSIZE_MAX)) 1573 if (nthreads==1) then 1574 ! Promote npband vs bandpp & npfft 1575 if (blocksize>1) acc_kgb=acc_kgb*(0.1_dp*bpp+0.9_dp-blocksize)/(one-blocksize) 1576 if (npb*npf>4.and.mband>100) acc_kgb=acc_kgb*(one-0.8_dp*((three*bpp*npb)/(one*mband)-one)**2) 1577 tot_ncpus=max(npb,npf);if (tot_ncpus==2) tot_ncpus=0 1578 acc_kgb=acc_kgb*(one-0.8_dp*((dble(npb)/dble(npf))-2_dp)**2/(tot_ncpus-2_dp)**2) 1579 eff=max(npf,20);acc_kgb=acc_kgb*(one-0.8_dp*min(one,(eff-20)**2)) 1580 end if 1581 end if 1582 1583 ! CHEBFI: promote npfft=npband and nband>=npfft 1584 if (wf_algo==ALGO_CHEBFI .or. wf_algo==ALGO_CHEBFI_NEW) then 1585 if (npf>1) then 1586 if (npb>npf) then 1587 acc_kgb=acc_kgb*(one-0.8_dp*0.25_dp*((dble(npb)/dble(npf))-one)**2/(nproc1-one)**2) 1588 else 1589 acc_kgb=acc_kgb*(one-0.8_dp*nproc1**2*((dble(npb)/dble(npf))-one)**2/(nproc1-one)**2) 1590 end if 1591 end if 1592 end if 1593 1594 ! Resulting "weight" 1595 ! weight0=acc_c*acc_k*acc_s*acc_kgb 1596 weight0=nproc1*(acc_c+acc_k+acc_s+acc_kgb)/(npc+np_sk+nps+(npf*npb)) 1597 1598 ! Store data 1599 icount=icount+1 1600 if (icount<=MAXCOUNT) then 1601 my_algo(icount)=merge(ALGO_CG,wf_algo,wf_algo==ALGO_NOT_SET) 1602 my_distp(1:7,icount)=(/npc,np_sk,nps,npf,npb,bpp,nproc1/) 1603 weight(icount)=weight0 1604 if (weight0<weight(imin)) imin=icount 1605 else 1606 if (weight0>weight(imin)) then 1607 my_algo(imin)=merge(ALGO_CG,wf_algo,wf_algo==ALGO_NOT_SET) 1608 my_distp(1:7,imin)=(/npc,np_sk,nps,npf,npb,bpp,nproc1/) 1609 weight(imin)=weight0 1610 idum=minloc(weight);imin=idum(1) 1611 end if 1612 end if 1613 1614 end do ! bpp 1615 end do ! npb 1616 end do ! npf 1617 end do ! nps 1618 end do ! np_sk 1619 end do ! npc 1620 1621 !Compute number of selected distributions 1622 mcount_eff=icount 1623 mcount=min(mcount_eff,MAXCOUNT) 1624 1625 !Stop if no solution found 1626 if (mcount==0) then 1627 ! Override here the 0 default value changed in indefo1 1628 dtset%npimage = max(1,dtset%npimage) 1629 dtset%nppert = max(1,dtset%nppert) 1630 dtset%np_spkpt = max(1,dtset%np_spkpt) 1631 dtset%npspinor = max(1,dtset%npspinor) 1632 dtset%npfft = max(1,dtset%npfft) 1633 dtset%npband = max(1,dtset%npband) 1634 dtset%bandpp = max(1,dtset%bandpp) 1635 write(msg,'(a,i0,2a,i0,a)') & 1636 & 'Your input dataset does not let Abinit find an appropriate process distribution with nCPUs=',nproc*nthreads,ch10, & 1637 & 'Try to comment all the np* vars and set max_ncpus=',nthreads*nproc,' to have advices on process distribution.' 1638 ABI_WARNING(msg) 1639 if (max_ncpus>0) call wrtout(ab_out,msg, do_flush=.True.) 1640 iexit=iexit+1 1641 end if 1642 1643 !Sort data by increasing weight 1644 if (mcount>0) then 1645 ABI_MALLOC(isort,(mcount)) 1646 isort=(/(ii,ii=1,mcount)/) 1647 call sort_dp(mcount,weight,isort,tol6) 1648 ncount=min(mcount,MAXPRINT) 1649 end if 1650 1651 !Deduce a global value for paral_kgb 1652 paral_kgb=dtset%paral_kgb 1653 if (tread(1)==0) then 1654 if (any(my_algo(:)/=ALGO_CG)) paral_kgb=1 1655 end if 1656 1657 ! ====================================== 1658 ! Print output for abipy in Yaml format 1659 ! ====================================== 1660 1661 ! Please DO NOT CHANGE this part without contacting gmatteo first 1662 ! since ANY CHANGE can easily break the interface with AbiPy. 1663 if (iam_master .and. max_ncpus > 0.and. (mcount>0 .or. wf_algo_global == ALGO_CG)) then 1664 write(ount,'(2a)')ch10,"--- !Autoparal" 1665 if (optdriver==RUNL_GSTATE .and. paral_kgb == 0) then 1666 write(ount,"(a)")"# Autoparal section for GS run (band-by-band CG method)" 1667 else if (optdriver==RUNL_GSTATE) then 1668 write(ount,'(a)')'# Autoparal section for GS calculations with paral_kgb 1' 1669 else if (optdriver==RUNL_RESPFN) then 1670 write(ount,'(a)')'# Autoparal section for DFPT calculations' 1671 else if (optdriver==RUNL_LONGWAVE) then 1672 write(ount,'(a)')'# Autoparal section for LONGWAVE calculations' 1673 else 1674 ABI_ERROR(sjoin('Unsupported optdriver:', itoa(optdriver))) 1675 end if 1676 write(ount,"(a)") "info:" 1677 write(ount,"(a,i0)")" autoparal: ",autoparal 1678 write(ount,"(a,i0)")" paral_kgb: ",paral_kgb 1679 write(ount,"(a,i0)")" max_ncpus: ",max_ncpus 1680 write(ount,"(a,i0)")" nspinor: ",dtset%nspinor 1681 write(ount,"(a,i0)")" nsppol: ",dtset%nsppol 1682 write(ount,"(a,i0)")" nkpt: ",dtset%nkpt 1683 write(ount,"(a,i0)")" mband: ",mband 1684 write(ount,"(a)")"configurations:" 1685 1686 if (optdriver==RUNL_GSTATE.and.paral_kgb==0) then 1687 work_size = dtset%nkpt * dtset%nsppol 1688 do ii=1,max_ncpus 1689 if (ii > work_size) cycle 1690 do omp_ncpus=1,nthreads 1691 nks_per_proc = work_size / ii 1692 nks_per_proc = nks_per_proc + MOD(work_size, ii) 1693 eff = (one * work_size) / (ii * nks_per_proc) 1694 write(ount,"(a,i0)")" - tot_ncpus: ",ii * omp_ncpus 1695 write(ount,"(a,i0)")" mpi_ncpus: ",ii 1696 write(ount,"(a,i0)")" omp_ncpus: ",omp_ncpus 1697 write(ount,"(a,f12.9)")" efficiency: ",eff 1698 !write(ount,"(a,f12.2)")" mem_per_cpu: ",mempercpu_mb 1699 end do 1700 end do 1701 1702 else if (optdriver==RUNL_GSTATE) then 1703 omp_ncpus=nthreads 1704 do jj=mcount,mcount-min(ncount,MAXABIPY)+1,-1 1705 ii=isort(jj) 1706 tot_ncpus = my_distp(7,ii) 1707 eff = weight(jj) / tot_ncpus 1708 write(ount,'(a,i0)')' - tot_ncpus: ',tot_ncpus 1709 write(ount,'(a,i0)')' mpi_ncpus: ',tot_ncpus 1710 write(ount,"(a,i0)")" omp_ncpus: ",omp_ncpus 1711 write(ount,'(a,f12.9)')' efficiency: ',eff 1712 !write(ount,'(a,f12.2)')' mem_per_cpu: ',mempercpu_mb 1713 write(ount,'(a)' )' vars: {' 1714 write(ount,'(a,i0,a)')' npimage: ',my_distp(1,ii),',' 1715 ! Keep on using legacy npkpt instead of np_spkpt to maintain compatibility with AbiPy 1716 write(ount,'(a,i0,a)')' npkpt: ',my_distp(2,ii),',' 1717 !write(ount,'(a,i0,a)')' np_spkpt: ',my_distp(2,ii),',' 1718 write(ount,'(a,i0,a)')' npspinor: ',my_distp(3,ii),',' 1719 write(ount,'(a,i0,a)')' npfft: ', my_distp(4,ii),',' 1720 write(ount,'(a,i0,a)')' npband: ',my_distp(5,ii),',' 1721 write(ount,'(a,i0,a)')' bandpp: ',my_distp(6,ii),',' 1722 write(ount,'(a)') ' }' 1723 end do 1724 1725 else if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then 1726 do jj=mcount,mcount-min(ncount,MAXABIPY)+1,-1 1727 ii=isort(jj) 1728 tot_ncpus = my_distp(7,ii) 1729 eff = weight(jj) / tot_ncpus 1730 write(ount,'(a,i0)')' - tot_ncpus: ',tot_ncpus 1731 write(ount,'(a,i0)')' mpi_ncpus: ',tot_ncpus 1732 !write(ount,'(a,i0)')' omp_ncpus: ',omp_ncpus !OMP not supported (yet) 1733 write(ount,'(a,f12.9)')' efficiency: ',eff 1734 !write(ount,'(a,f12.2)')' mem_per_cpu: ',mempercpu_mb 1735 write(ount,'(a)' )' vars: {' 1736 write(ount,'(a,i0,a)')' nppert: ', my_distp(1,ii),',' 1737 ! Keep on using legacy npkpt instead of np_spkpt to maintain compatibility with AbiPy 1738 write(ount,'(a,i0,a)')' npkpt: ', my_distp(2,ii),',' 1739 !write(ount,'(a,i0,a)')' np_spkpt: ', my_distp(2,ii),',' 1740 write(ount,'(a)') ' }' 1741 end do 1742 end if 1743 write(ount,'(a)')"..." 1744 end if 1745 1746 !Print out tab with selected choices 1747 if (mcount>0.and.iam_master) then 1748 if (nthreads==1) then 1749 write(msg,'(a,1x,100("="),2a,i0,2a)') ch10,ch10,& 1750 & ' Searching for all possible proc distributions for this input with #CPUs<=',nthreads*nproc,':',ch10 1751 else 1752 write(msg,'(a,1x,100("="),2a,i0,a,i0,2a)') ch10,ch10,& 1753 & ' Searching for all possible proc distributions for this input with #CPUs<=',nthreads*nproc,& 1754 & ' and ',nthreads,' openMP threads:',ch10 1755 end if 1756 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1757 !Titles of columns 1758 msgttl='~' 1759 if (with_image) msgttl=trim(msgttl)//'~~~~~~~~~~~' 1760 if (with_pert) msgttl=trim(msgttl)//'~~~~~~~~~~~' 1761 msgttl=trim(msgttl)//'~~~~~~~~~~~~~' ! kpt 1762 if (with_spinor) msgttl=trim(msgttl)//'~~~~~~~~~~' 1763 if (with_fft) msgttl=trim(msgttl)//'~~~~~~~~~~~~~' 1764 if (with_band) msgttl=trim(msgttl)//'~~~~~~~~~~~~~' 1765 if (with_bandpp) msgttl=trim(msgttl)//'~~~~~~~~~~~~~' 1766 if (with_thread) msgttl=trim(msgttl)//'~~~~~~~~~~' 1767 msgttl=trim(msgttl)//'~~~~~~~~~~~~~' ! nproc 1768 if (with_thread) msgttl=trim(msgttl)//'~~~~~~~~~~~~~' 1769 msgttl=trim(msgttl)//'~~~~~~~~~~~' ! CPUs 1770 msgttl=' '//trim(msgttl) 1771 call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl) 1772 msg='|' 1773 if (with_image) msg=trim(msg)//' npimage|' 1774 if (with_pert) msg=trim(msg)//' nppert|' 1775 msg=trim(msg)//' np_spkpt|' 1776 if (with_spinor) msg=trim(msg)//' npspinor|' 1777 if (with_fft) msg=trim(msg)//' npfft|' 1778 if (with_band) msg=trim(msg)//' npband|' 1779 if (with_bandpp) msg=trim(msg)//' bandpp|' 1780 if (with_thread) msg=trim(msg)//' #Threads|' 1781 msg=trim(msg)//' #MPI(proc)|' 1782 if (with_thread) msg=trim(msg)//' #CPUs|' 1783 msg=trim(msg)//' WEIGHT|' 1784 msg=' '//trim(msg) 1785 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1786 msg='|' 1787 write(strg,'(i4,a,i4,a)') npi_min,'<<',npi_max,'|';if (with_image) msg=trim(msg)//trim(strg) 1788 write(strg,'(i4,a,i4,a)') npp_min,'<<',npp_max,'|';if (with_pert) msg=trim(msg)//trim(strg) 1789 write(strg,'(i5,a,i5,a)') np_sk_min,'<<',np_sk_max,'|'; msg=trim(msg)//trim(strg) 1790 write(strg,'(i5,a,i2,a)') nps_min,'<<',nps_max,'|';if (with_spinor) msg=trim(msg)//trim(strg) 1791 write(strg,'(i5,a,i5,a)') npf_min,'<<',npf_max,'|';if (with_fft) msg=trim(msg)//trim(strg) 1792 write(strg,'(i5,a,i5,a)') npb_min,'<<',npb_max,'|';if (with_band) msg=trim(msg)//trim(strg) 1793 write(strg,'(i5,a,i5,a)') bpp_min,'<<',bpp_max,'|';if (with_bandpp) msg=trim(msg)//trim(strg) 1794 write(strg,'(i9,a)' ) nthreads ,'|';if (with_thread) msg=trim(msg)//trim(strg) 1795 write(strg,'(i5,a,i5,a)') 1 ,'<<',nproc ,'|'; msg=trim(msg)//trim(strg) 1796 write(strg,'(i4,a,i6,a)') nthreads,'<<',nthreads*nproc,'|';if (with_thread) msg=trim(msg)//trim(strg) 1797 write(strg,'(a,i6,a)') ' <=',nthreads*nproc,'|'; msg=trim(msg)//trim(strg) 1798 msg=' '//trim(msg) 1799 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1800 call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl) 1801 !Loop over selected choices 1802 do jj=mcount,mcount-ncount+1,-1 1803 ii=isort(jj) 1804 msg='|' 1805 write(strg,'(i10,a)') my_distp(1,ii),'|';if (with_image) msg=trim(msg)//trim(strg) 1806 write(strg,'(i10,a)') my_distp(1,ii),'|';if (with_pert) msg=trim(msg)//trim(strg) 1807 write(strg,'(i12,a)') my_distp(2,ii),'|'; msg=trim(msg)//trim(strg) 1808 write(strg,'(i9,a)') my_distp(3,ii),'|';if (with_spinor) msg=trim(msg)//trim(strg) 1809 write(strg,'(i12,a)') my_distp(4,ii),'|';if (with_fft) msg=trim(msg)//trim(strg) 1810 write(strg,'(i12,a)') my_distp(5,ii),'|';if (with_band) msg=trim(msg)//trim(strg) 1811 write(strg,'(i12,a)') my_distp(6,ii),'|';if (with_bandpp) msg=trim(msg)//trim(strg) 1812 write(strg,'(i9,a)') nthreads ,'|';if (with_thread) msg=trim(msg)//trim(strg) 1813 write(strg,'(i12,a)') my_distp(7,ii),'|'; msg=trim(msg)//trim(strg) 1814 write(strg,'(i12,a)') nthreads*my_distp(7,ii),'|';if (with_thread) msg=trim(msg)//trim(strg) 1815 write(strg,'(f10.3,a)') weight(jj) ,'|'; msg=trim(msg)//trim(strg) 1816 msg=' '//trim(msg) 1817 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1818 end do 1819 !End of tab 1820 call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl) 1821 write(msg,'(a,i6,a,i6,a)')' Only the best possible choices for nproc are printed...' 1822 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1823 end if ! mcount>0 1824 1825 !Determine an optimal number of bands 1826 if (optdriver==RUNL_GSTATE.and. & 1827 & (any(my_algo(1:mcount)==ALGO_LOBPCG_OLD.or. & 1828 & my_algo(1:mcount)==ALGO_LOBPCG_NEW.or. & 1829 & my_algo(1:mcount)==ALGO_CHEBFI.or. & 1830 & my_algo(1:mcount)==ALGO_CHEBFI_NEW))) then 1831 if (mcount>0) then 1832 icount=isort(mcount) 1833 npc=my_distp(1,icount);np_sk=my_distp(2,icount) 1834 nps=my_distp(3,icount);npf=my_distp(4,icount) 1835 else 1836 npc=1;if (with_image ) npc=npi_min 1837 np_sk=1;if (with_kpt ) np_sk=np_sk_min 1838 nps=1;if (with_spinor) nps=nps_min 1839 npf=1;if (with_fft ) npf=npf_min 1840 end if 1841 nproc1=npc*np_sk*nps*npf 1842 msg=ch10//' >>> Possible (best) choices for the number of bands (nband) are:' 1843 if (with_image.or.with_kpt.or.with_spinor.or.with_fft) msg=trim(msg)//ch10//' with:' 1844 write(strg,'(a,i0)') ' npimage=' ,npc;if (with_image) msg=trim(msg)//trim(strg) 1845 write(strg,'(a,i0)') ' np_spkpt=' ,np_sk;if (with_kpt) msg=trim(msg)//trim(strg) 1846 write(strg,'(a,i0)') ' npspinor=',nps;if (with_spinor) msg=trim(msg)//trim(strg) 1847 write(strg,'(a,i0)') ' npfft=' ,npf;if (with_fft) msg=trim(msg)//trim(strg) 1848 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1849 ib1=mband-int(mband*relative_nband_range);if (my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)==ALGO_CHEBFI_NEW) ib1=mband 1850 ib2=mband+int(mband*relative_nband_range) 1851 ABI_MALLOC(nproc_best,(1+ib2-ib1)) 1852 ABI_MALLOC(nband_best,(1+ib2-ib1)) 1853 nproc_best(:)=1 1854 nband_best=(/(ii,ii=ib1,ib2)/) 1855 bpp=merge(1,nthreads,my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)==ALGO_CHEBFI_NEW) 1856 do ii=ib1,ib2 1857 do jj=1,nproc/nproc1 1858 ibest=1 1859 do kk=1,jj 1860 if (mod(jj,kk)/=0) cycle 1861 if (mod(ii,kk*bpp)==0) ibest=max(ibest,kk) 1862 end do 1863 nproc_best(1+ii-ib1)=max(nproc_best(1+ii-ib1),ibest) 1864 end do 1865 end do 1866 call sort_int(1+ib2-ib1,nproc_best,nband_best) 1867 kk=-1 1868 do ii=1+ib2-ib1,max(ib2-ib1-MAXBAND_PRINT,1),-1 1869 write(msg,'(3(a,i6),a,i3,a,i5,a)') ' nband=',nband_best(ii),' using ',nproc1*nproc_best(ii)*nthreads,& 1870 & ' CPUs =',nproc1*nproc_best(ii),' MPI x',nthreads,' threads (npband=',nproc_best(ii),')' 1871 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1872 if (nband_best(ii)==mband) kk=nproc_best(ii) 1873 end do 1874 if (kk==maxval(nproc_best(:))) then 1875 if (my_algo(icount)/=ALGO_CHEBFI .or. my_algo(icount)/=ALGO_CHEBFI_NEW) then 1876 write(msg,'(a,i6,a)') ' >>> The present nband value (',mband,') seems to be the best choice!' 1877 end if 1878 if (my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)/=ALGO_CHEBFI_NEW) then 1879 write(msg,'(a,i6,a)') ' >>> The present nband value (',mband,') seems to be a good choice!' 1880 end if 1881 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1882 end if 1883 ABI_FREE(nproc_best) 1884 ABI_FREE(nband_best) 1885 end if 1886 1887 if (optdriver==RUNL_GSTATE.and.(any(my_algo(1:mcount)==ALGO_CHEBFI .or. my_algo(1:mcount)==ALGO_CHEBFI_NEW))) then 1888 write(msg,'(5a)') & 1889 & ' >>> Note that with the "Chebyshev Filtering" algorithm, it is often',ch10,& 1890 & ' better to increase the number of bands (10% more or a few tens more).',ch10,& 1891 & ' Advice: increase nband and put nbdbuf input variable to (nband_new-nband_old).' 1892 call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg) 1893 end if 1894 1895 !Refinement of the process distribution by mean of a LinAlg routines benchmarking 1896 if (mcount>0.and.optdriver==RUNL_GSTATE.and.autoparal/=1) then 1897 icount=isort(mcount) 1898 if (autoparal/=3) then 1899 if (autoparal==2) then 1900 write(msg,'(5a,9(a10,a1))') ch10, & 1901 & ' Values below have been tested with respect to Linear Algebra performance;',ch10,& 1902 & ' Weights below are corrected according:',ch10,& 1903 & 'npimage','|','np_spkpt' ,'|','npspinor' ,'|','npfft' ,'|','npband','|',' bandpp ' ,'|',& 1904 & 'nproc' ,'|','weight','|','new weight','|' 1905 else 1906 write(msg,'(5a,11(a10,a1))') ch10, & 1907 & ' Values below have been tested with respect to Linear Algebra performance;',ch10,& 1908 & ' Weights below are corrected according:',ch10,& 1909 & 'npimage','|','np_spkpt' ,'|','npspinor' ,'|','npfft' ,'|','npband','|',' bandpp ' ,'|',& 1910 & 'nproc' ,'|','weight','|','new weight','|','best npslk','|','linalggpu' ,'|' 1911 end if 1912 call wrtout(std_out,msg);if (max_ncpus > 0) call wrtout(ab_out,msg) 1913 end if 1914 acc_k=zero 1915 ncount=min(MAXBENCH,mcount);if (autoparal==3) ncount=1 1916 do jj=mcount,mcount-ncount+1,-1 1917 ii=isort(jj) 1918 npf=my_distp(4,ii);npb=my_distp(5,ii);bpp=my_distp(6,ii) 1919 if ((npb*npf*bpp>1).and.(npf*npb<=mpi_enreg%nproc)) then 1920 use_linalg_gpu=dtset%gpu_option 1921 call compute_kgb_indicator(acc_kgb,bpp,xmpi_world,mband,mpw,npb,npf,np_slk,use_linalg_gpu) 1922 if (autoparal/=2) then 1923 my_distp(9,ii)=np_slk 1924 if (np_slk>0) my_distp(8,ii)=1 1925 ! * gpu_linalg_limit: 1926 ! No use of GPU: htgspw_01.outuge value ~2 *vectsize*blocksize**2 tested 1927 ! Use of GPU: tiny value ~0.5*vectsize*blocksize**2 tested 1928 my_distp(10,ii)=2*dtset%mpw*(npb*bpp)**2/npf 1929 if (use_linalg_gpu/=ABI_GPU_DISABLED) my_distp(10,ii)=my_distp(10,ii)/4 1930 end if 1931 if (abs(acc_k)<=tol12) acc_k=acc_kgb ! Ref value : the first one computed 1932 ! * Weight (corrected by 10% of the computed ratio) 1933 weight0=weight(jj)*(one + 0.1_dp*acc_k/acc_kgb) 1934 if (autoparal==2) then 1935 write(msg, '(7(i10,a1),f9.2,a2,f9.5,a2)') & 1936 & my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',& 1937 & my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |' 1938 else if (autoparal==3) then 1939 write(msg,'(a,5(a,i3))') ch10,' For npband=',npb,', npfft=',npf,' and bandpp=',bpp, & 1940 & ', compute_kgb_indicator recommends you to set np_slk=',my_distp(9,ii),& 1941 & ' and use_linalg_gpu=',use_linalg_gpu 1942 else 1943 write(msg, '(7(i10,a1),f9.2,a2,f9.5,a2,2(i10,a1))') & 1944 & my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',& 1945 & my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |',& 1946 & my_distp(9,ii),'|',use_linalg_gpu,'|' 1947 end if 1948 call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg) 1949 ! We store the best value in weight(mcount) and keep icount 1950 if (weight0 > weight(mcount)) then 1951 icount=ii;weight(mcount)=weight0 1952 end if 1953 end if 1954 end do 1955 end if 1956 1957 !Store new process distribution 1958 if (mcount>0.and.max_ncpus<=0) then 1959 icount=isort(mcount) 1960 nproc1=my_distp(7,icount) 1961 ! Work load distribution 1962 if (optdriver==RUNL_GSTATE) then 1963 dtset%npimage= my_distp(1,icount) 1964 dtset%nppert = 1 1965 dtset%np_spkpt = my_distp(2,icount) 1966 end if 1967 if (optdriver==RUNL_RESPFN) then 1968 dtset%npimage= 1 1969 dtset%nppert = my_distp(1,icount) 1970 dtset%np_spkpt = 1 1971 end if 1972 dtset%npspinor = my_distp(3,icount) 1973 dtset%npfft = my_distp(4,icount) 1974 dtset%npband = my_distp(5,icount) 1975 dtset%bandpp = my_distp(6,icount) 1976 if (dtset%wfoptalg==114.or.dtset%wfoptalg==14.or.dtset%wfoptalg==4) then !if LOBPCG 1977 dtset%nblock_lobpcg = mband / (dtset%npband*dtset%bandpp) 1978 end if 1979 if (tread(1)==0) dtset%paral_kgb= merge(0,1,my_algo(icount)==ALGO_CG) 1980 ! The following lines are mandatory : the DFT+DMFT must use ALL the 1981 ! available procs specified by the user. So nproc1=nproc. 1982 ! Works only if paral_kgb is not activated?? 1983 if (dtset%usedmft/=0.and.optdriver==RUNL_GSTATE) then 1984 if (dtset%paral_kgb==0) then 1985 dtset%npspinor = 1 ; dtset%npfft = 1 1986 dtset%npband = 1 ; dtset%bandpp = 1 1987 dtset%npimage = 1 1988 end if 1989 nproc1 = nproc 1990 end if 1991 if (dtset%npband*dtset%npfft*dtset%bandpp>1) dtset%paral_kgb=1 1992 ! LinAlg parameters: we change values only if they are not present in input file 1993 if (dtset%paral_kgb==1) then 1994 if (tread(9)==0) dtset%use_slk=my_distp(8,icount) 1995 if (tread(10)==0) dtset%np_slk=my_distp(9,icount) 1996 if (tread(11)==0) dtset%gpu_linalg_limit=my_distp(10,icount) 1997 end if 1998 ! New definition of "world" MPI communicator 1999 if (optdriver==RUNL_RESPFN.and.dtset%paral_rf==1) then 2000 nproc1=max(nproc1,dtset%nsppol*dtset%nkpt) ! Take into account the code in respfn.F90 2001 nproc1=min(nproc1,nproc) 2002 nproc1=(nproc1/dtset%nppert)*dtset%nppert 2003 end if 2004 call initmpi_world(mpi_enreg,nproc1) 2005 end if 2006 2007 !Final advice in case max_ncpus > 0 2008 if (max_ncpus>0.and.mcount>0) then 2009 write(msg,'(6a)') ch10,& 2010 ' Launch a parallel version of ABINIT with a distribution of processors among the above list,',ch10,& 2011 ' and the associated input variables (np_spkpt, npband, npfft, bandpp, etc.).',ch10,& 2012 ' The higher weight should be better.' 2013 call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg) 2014 end if 2015 2016 if (mcount>0) then 2017 ABI_FREE(isort) 2018 end if 2019 ABI_FREE(my_distp) 2020 ABI_FREE(my_algo) 2021 ABI_FREE(weight) 2022 2023 !Final line 2024 write(msg,'(a,100("="),2a)') " ",ch10,ch10 2025 call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg) 2026 2027 !max_ncpus requires a stop 2028 if (max_ncpus>0) then 2029 iexit = iexit + 1 ! will stop in the parent. 2030 end if 2031 2032 DBG_EXIT("COLL") 2033 2034 contains 2035 2036 real(dp) pure function speedup_fdp(nn, mm) 2037 ! Expected linear speedup for a nn-sized problem and mm processes 2038 integer,intent(in) :: nn, mm 2039 speedup_fdp = (one*nn) / (one* ((nn / mm) + merge(0, 1, mod(nn, mm) == 0))) 2040 end function speedup_fdp 2041 2042 end subroutine finddistrproc
ABINIT/m_mpi_setup [ Modules ]
NAME
m_mpi_setup
FUNCTION
Initialize MPI parameters and datastructures for parallel execution
COPYRIGHT
Copyright (C) 1999-2024 ABINIT group (FJ, MT, FD) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
16 #if defined HAVE_CONFIG_H 17 #include "config.h" 18 #endif 19 20 #include "abi_common.h" 21 22 module m_mpi_setup 23 24 use defs_basis 25 use m_distribfft 26 use m_xmpi 27 use m_xomp 28 use m_hdr 29 use m_sort 30 use m_errors 31 use m_abicore 32 33 use defs_abitypes, only : MPI_type 34 use m_fstrings, only : sjoin, itoa 35 use m_time, only : abi_wtime 36 use m_parser, only : intagm 37 use m_geometry, only : mkrdim, metric 38 use m_fftcore, only : fftalg_for_npfft, getng, kpgcount 39 use m_mpinfo, only : init_mpi_enreg, mpi_distrib_is_ok, initmpi_atom, proc_distrb_cycle, & 40 initmpi_grid, initmpi_pert, initmpi_img, distrb2, distrb2_hf, initmpi_world 41 use m_libpaw_tools, only : libpaw_write_comm_set 42 use m_dtset, only : dataset_type 43 use m_kg, only : getmpw 44 use m_dtfil, only : mkfilename 45 46 implicit none 47 48 private
ABINIT/mpi_setup [ Functions ]
NAME
mpi_setup
FUNCTION
Big loop on the datasets: - compute mgfft,mpw,nfft,... for this data set; - fill mpi_enreg *** At the output of this routine, all the dtsets input variables are known *** The content of dtsets should not be modified anymore afterwards.
INPUTS
filnam(5)=character strings giving file names ndtset= number of datasets to be read; if 0, no multi-dataset mode ndtset_alloc=number of datasets, corrected for allocation of at least one data set.
OUTPUT
dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables, some of which are initialized here, while other were already initialized previously.
SIDE EFFECTS
mpi_enregs=information about MPI parallelization
SOURCE
84 subroutine mpi_setup(dtsets,filnam,lenstr,mpi_enregs,ndtset,ndtset_alloc,string) 85 86 !Arguments ------------------------------------ 87 !scalars 88 integer,intent(in) :: lenstr,ndtset,ndtset_alloc 89 type(MPI_type),intent(inout) :: mpi_enregs(0:ndtset_alloc) 90 character(len=*),intent(in) :: string 91 !arrays 92 character(len=fnlen),intent(in) :: filnam(5) 93 type(dataset_type),intent(inout) :: dtsets(0:ndtset_alloc) 94 95 !Local variables ------------------------------- 96 !scalars 97 integer :: blocksize,exchn2n3d,iband,idtset,iexit,ii,iikpt,iikpt_modulo, prtvol 98 integer :: isppol,jdtset,marr,mband_lower,mband_upper 99 integer :: me_fft,mgfft,mgfftdg,mkmem,mpw,mpw_k,optdriver 100 integer :: mband_mem 101 integer :: nfft,nfftdg,nkpt,nkpt_me,npert,nproc,nproc_fft,nqpt 102 integer :: nspink,nsppol,nsym,paral_fft,response,tnband,tread0,usepaw,vectsize 103 integer :: fftalg,fftalga,fftalgc 104 #ifdef HAVE_LINALG_ELPA 105 integer :: icol,irow,np 106 #endif 107 logical :: fftalg_read,ortalg_read,paral_kgb_read,wfoptalg_read,do_check 108 real(dp) :: dilatmx,ecut,ecut_eff,ecutdg_eff,ucvol 109 character(len=500) :: msg 110 !arrays 111 integer :: ngfft(18),ngfftdg(18),ngfftc(3),tread(12) 112 integer,allocatable :: intarr(:),istwfk(:),symrel(:,:,:) 113 integer,allocatable :: mybands(:) 114 integer,pointer :: nkpt_rbz(:) 115 real(dp),parameter :: k0(3)=(/zero,zero,zero/) 116 real(dp) :: gmet(3,3),gprimd(3,3),kpt(3),qphon(3),rmet(3,3),rprimd(3,3) 117 real(dp),allocatable :: dprarr(:),kpt_with_shift(:,:) 118 real(dp),pointer :: nband_rbz(:,:) 119 character(len=6) :: nm_mkmem(3) 120 121 !************************************************************************* 122 123 DBG_ENTER("COLL") 124 125 iexit=0;mpw_k=0 126 127 call init_mpi_enreg(mpi_enregs(0)) 128 call initmpi_img(dtsets(0),mpi_enregs(0),-1) 129 130 do idtset=1,ndtset_alloc 131 call init_mpi_enreg(mpi_enregs(idtset)) 132 133 ! Handy read-only variables. 134 optdriver = dtsets(idtset)%optdriver 135 prtvol = dtsets(idtset)%prtvol 136 137 ! Read parallel input parameters 138 marr=max(12,dtsets(idtset)%npsp,dtsets(idtset)%nimage) 139 ABI_MALLOC(intarr,(marr)) 140 ABI_MALLOC(dprarr,(marr)) 141 nkpt =dtsets(idtset)%nkpt 142 nsppol=dtsets(idtset)%nsppol 143 jdtset=dtsets(idtset)%jdtset ; if(ndtset==0)jdtset=0 144 usepaw=dtsets(idtset)%usepaw 145 mband_upper=maxval(dtsets(idtset)%nband(1:nkpt*nsppol)) 146 mband_lower=minval(dtsets(idtset)%nband(1:nkpt*nsppol)) 147 148 ! Compute metric for this dataset 149 call mkrdim(dtsets(idtset)%acell_orig(1:3,1),dtsets(idtset)%rprim_orig(1:3,1:3,1),rprimd) 150 call metric(gmet,gprimd,-1,rmet,rprimd,ucvol) 151 152 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'max_ncpus',tread0,'INT') 153 if (tread0==1) dtsets(idtset)%max_ncpus=intarr(1) 154 155 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_atom',tread0,'INT') 156 if(tread0==1) dtsets(idtset)%paral_atom=intarr(1) 157 158 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_rf',tread0,'INT') 159 if (tread0==1.and.any(optdriver==[RUNL_RESPFN, RUNL_NONLINEAR])) dtsets(idtset)%paral_rf=intarr(1) 160 161 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npimage',tread(2),'INT') 162 if(tread(2)==1) dtsets(idtset)%npimage=intarr(1) 163 164 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nppert',tread(3),'INT') 165 if (tread(3)==1.and.optdriver==RUNL_RESPFN) dtsets(idtset)%nppert=intarr(1) 166 167 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'np_spkpt',tread(4),'INT') 168 if(tread(4)==1)then 169 dtsets(idtset)%np_spkpt=intarr(1) 170 else 171 ! npkpt is obsolete, but still read 172 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npkpt',tread(4),'INT') 173 if(tread(4)==1)then 174 dtsets(idtset)%np_spkpt=intarr(1) 175 endif 176 endif 177 178 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npspinor',tread(5),'INT') 179 if(tread(5)==1) dtsets(idtset)%npspinor=intarr(1) 180 181 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npfft',tread(6),'INT') 182 if(tread(6)==1) dtsets(idtset)%npfft=intarr(1) 183 184 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npband',tread(7),'INT') 185 if(tread(7)==1) dtsets(idtset)%npband=intarr(1) 186 187 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'bandpp',tread(8),'INT') 188 !Nband might have different values for different kpoint, but not bandpp. 189 !In this case, we just use the largest nband (mband_upper), and the input will probably fail 190 !at the bandpp check later on 191 if(tread(8)==1) then 192 dtsets(idtset)%bandpp=intarr(1) 193 ! check if nblock_lobpcg is read from the input, if so error msg 194 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nblock_lobpcg',tread0,'INT') 195 if (tread0==1) then 196 write(msg,'(3a)') 'Both bandpp and nblock_lobpcg are defined for the same dataset, this is confusing.',ch10,& 197 'Change the input to keep only nblock_lobpcg (preferably) or bandpp.' 198 ABI_ERROR(msg) 199 end if 200 if (dtsets(idtset)%wfoptalg==114.or.dtsets(idtset)%wfoptalg==14.or.dtsets(idtset)%wfoptalg==4) then !if LOBPCG 201 if (mod(mband_upper,dtsets(idtset)%bandpp*dtsets(idtset)%npband)==0) then 202 dtsets(idtset)%nblock_lobpcg=mband_upper/(dtsets(idtset)%bandpp*dtsets(idtset)%npband) 203 else 204 write(msg,'(5a)') 'mband_upper( =max_{kpt}(nband) ) should be a mutltiple of npband*bandpp.',ch10,& 205 'Change nband, npband or bandpp in the input.',ch10,& 206 'A simpler solution is to use nblock_lobpcg instead of bandpp.' 207 ABI_ERROR(msg) 208 end if 209 end if 210 else if (dtsets(idtset)%wfoptalg==114.or.dtsets(idtset)%wfoptalg==14.or.dtsets(idtset)%wfoptalg==4) then !if LOBPCG 211 if (mod(mband_upper,dtsets(idtset)%nblock_lobpcg*dtsets(idtset)%npband)==0) then 212 dtsets(idtset)%bandpp=mband_upper/(dtsets(idtset)%nblock_lobpcg*dtsets(idtset)%npband) 213 else 214 write(msg,'(3a)') 'mband_upper( =max_{kpt}(nband) ) should be a mutltiple of nblock_lobpcg*npband.',ch10,& 215 'Change nband, npband or nblock_lobpcg in the input.' 216 ABI_ERROR(msg) 217 end if 218 end if 219 220 ! Warning when using different number of bands for different kpoints (occopt=2) 221 if ( dtsets(idtset)%occopt==2 .and. dtsets(idtset)%nkpt>1 .and. & 222 & ((dtsets(idtset)%bandpp > 1) .or. (dtsets(idtset)%npband > 1)) ) then 223 write(msg, '(4a)' ) & 224 'When working with blocks of bands (bandpp>1 or npband>1),'& 225 &' the number of bands should be the same for every kpoints.',ch10,& 226 &' The run will most probably fail on a other check. If it does not fail, ignore this message.' 227 ABI_WARNING(msg) 228 end if 229 230 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'use_slk',tread(9),'INT') 231 if(tread(9)==1) dtsets(idtset)%use_slk=intarr(1) 232 233 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'np_slk',tread(10),'INT') 234 if(tread(10)==1) dtsets(idtset)%np_slk=intarr(1) 235 236 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'slk_rankpp',tread(12),'INT') 237 if(tread(12)==1) dtsets(idtset)%slk_rankpp=intarr(1) 238 239 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'pw_unbal_thresh',tread0,'DPR') 240 if(tread0==1) dtsets(idtset)%pw_unbal_thresh=dprarr(1) 241 mpi_enregs(idtset)%pw_unbal_thresh=dtsets(idtset)%pw_unbal_thresh 242 243 call intagm(dprarr,intarr,jdtset,marr,12,string(1:lenstr),'gpu_devices',tread0,'INT') 244 if(tread0==1) dtsets(idtset)%gpu_devices(1:12)=intarr(1:12) 245 246 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_kokkos_nthrd',tread0,'INT') 247 if(tread0==1) dtsets(idtset)%gpu_kokkos_nthrd=intarr(1) 248 249 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_linalg_limit',tread(11),'INT') 250 if(tread(11)==1) dtsets(idtset)%gpu_linalg_limit=intarr(1) 251 252 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_nl_distrib',tread0,'INT') 253 if(tread0==1) dtsets(idtset)%gpu_nl_distrib=intarr(1) 254 255 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_nl_splitsize',tread0,'INT') 256 if(tread0==1) dtsets(idtset)%gpu_nl_splitsize=intarr(1) 257 258 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nphf',tread0,'INT') 259 if(tread0==1) dtsets(idtset)%nphf=intarr(1) 260 261 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'autoparal',tread0,'INT') 262 if(tread0==1) dtsets(idtset)%autoparal=intarr(1) 263 264 ! Read paral_kgb and disable it if not supported in optdriver 265 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_kgb',tread(1),'INT') 266 paral_kgb_read=(tread(1)==1) 267 if (paral_kgb_read) dtsets(idtset)%paral_kgb=intarr(1) 268 if (xmpi_paral==0.and.dtsets(idtset)%paral_kgb==1) then 269 dtsets(idtset)%paral_kgb=0 270 write(msg, '(5a)' ) & 271 'When ABINIT is compiled without MPI flag,',ch10,& 272 'setting paral_kgb/=0 is useless. paral_kgb has been reset to 0.',ch10,& 273 'Action: modify compilation option or paral_kgb in the input file.' 274 ABI_WARNING(msg) 275 end if 276 if (ALL(optdriver /= [RUNL_GSTATE, RUNL_GWLS, RUNL_RTTDDFT]) .and. dtsets(idtset)%paral_kgb/=0) then 277 dtsets(idtset)%paral_kgb=0 278 write(msg, '(a,i0,a)') & 279 "paral_kgb != 0 is not available in optdriver ",optdriver,". Setting paral_kgb to 0" 280 ABI_COMMENT(msg) 281 end if 282 283 wfoptalg_read=.false. 284 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'wfoptalg',tread0,'INT') 285 if(tread0==1) then 286 dtsets(idtset)%wfoptalg=intarr(1) 287 wfoptalg_read=.true. 288 else 289 if (dtsets(idtset)%usepaw==0) dtsets(idtset)%wfoptalg=0 290 if (dtsets(idtset)%usepaw/=0) dtsets(idtset)%wfoptalg=10 291 if ((optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS).and.dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%wfoptalg=114 292 end if 293 294 ! Dump the list of irreducible perturbations and exit. 295 if (dtsets(idtset)%paral_rf==-1.and.optdriver/=RUNL_NONLINEAR) then 296 call dtsets(idtset)%get_npert_rbz(nband_rbz, nkpt_rbz, npert) 297 ABI_FREE(nband_rbz) 298 ABI_FREE(nkpt_rbz) 299 iexit = iexit + 1 300 end if 301 302 ! From total number of procs, compute all possible distributions 303 ! Ignore exit flag if GW/EPH calculations because autoparal section is performed in screening/sigma/bethe_salpeter/eph 304 if (any(optdriver == [RUNL_SCREENING, RUNL_SIGMA, RUNL_BSE, RUNL_EPH, RUNL_GWR, RUNL_NONLINEAR])) then 305 iexit = 0 306 else 307 call finddistrproc(dtsets,filnam,idtset,iexit,mband_upper,mpi_enregs(idtset),ndtset_alloc,tread) 308 end if 309 310 call initmpi_img(dtsets(idtset),mpi_enregs(idtset),-1) 311 nproc=mpi_enregs(idtset)%nproc_cell 312 313 ! Set paral_kgb to 1 when band-fft parallelism is activated 314 if (ANY(optdriver == [RUNL_GSTATE, RUNL_GWLS, RUNL_RTTDDFT])) then 315 if (mpi_enregs(idtset)%nproc_cell>1) then 316 if (dtsets(idtset)%npband>1.or.dtsets(idtset)%npfft>1) then 317 if (.not.paral_kgb_read) dtsets(idtset)%paral_kgb=1 318 end if 319 end if 320 end if 321 322 if ((optdriver/=RUNL_GSTATE.and.optdriver/=RUNL_GWLS.and.optdriver/=RUNL_RTTDDFT).and. & 323 & (dtsets(idtset)%np_spkpt/=1 .or.dtsets(idtset)%npband/=1.or.dtsets(idtset)%npfft/=1.or. & 324 & dtsets(idtset)%npspinor/=1.or.dtsets(idtset)%bandpp/=1)) then 325 !& .or.(dtsets(idtset)%iscf<0)) then 326 dtsets(idtset)%np_spkpt=1 ; dtsets(idtset)%npspinor=1 ; dtsets(idtset)%npfft=1 327 dtsets(idtset)%npband=1; dtsets(idtset)%bandpp=1 ; dtsets(idtset)%nphf=1 328 dtsets(idtset)%paral_kgb=0 329 ABI_COMMENT('For non ground state calculations, set bandpp, npfft, npband, npspinor, np_spkpt and nphf to 1') 330 end if 331 332 ! Take into account a possible change of paral_kgb (change of the default algorithm) 333 if (.not.wfoptalg_read) then 334 if (dtsets(idtset)%usepaw==0) dtsets(idtset)%wfoptalg=0 335 if (dtsets(idtset)%usepaw/=0) dtsets(idtset)%wfoptalg=10 336 if ((optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS).and.dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%wfoptalg=114 337 if (mod(dtsets(idtset)%wfoptalg,10)==4) then 338 do iikpt=1,dtsets(idtset)%nkpt 339 if (any(abs(dtsets(idtset)%kpt(:,iikpt))>tol8)) dtsets(idtset)%istwfk(iikpt)=1 340 end do 341 end if 342 end if 343 344 dtsets(idtset)%densfor_pred=2 345 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'densfor_pred',tread0,'INT') 346 if(tread0==1) then 347 dtsets(idtset)%densfor_pred=intarr(1) 348 else 349 if (dtsets(idtset)%paral_kgb==1) dtsets(idtset)%densfor_pred=6 350 end if 351 if((dtsets(idtset)%iscf==5.or.dtsets(idtset)%iscf==6) & 352 .and. dtsets(idtset)%ionmov==4 .and. dtsets(idtset)%densfor_pred/=3 )then 353 dtsets(idtset)%densfor_pred=3 354 write(msg, '(a,a,a)' )& 355 'When ionmov==4 and iscf==5 or 6, densfor_pred must be 3.',ch10,& 356 'Set densfor_pred to 3.' 357 ABI_COMMENT(msg) 358 end if 359 360 #ifdef HAVE_LOTF 361 ! LOTF need densfor_pred=2 362 if(dtsets(idtset)%ionmov==23) dtsets(idtset)%densfor_pred=2 363 #endif 364 365 if (usepaw==0) then 366 dtsets(idtset)%ortalg=2 367 else 368 dtsets(idtset)%ortalg=-2 369 end if 370 ortalg_read=.false. 371 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'ortalg',tread0,'INT') 372 if(tread0==1) then 373 dtsets(idtset)%ortalg=intarr(1) 374 ortalg_read=.true. 375 else if (dtsets(idtset)%wfoptalg>=10 .and. dtsets(idtset)%ortalg>0) then 376 dtsets(idtset)%ortalg=-dtsets(idtset)%ortalg 377 end if 378 379 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'iomode',tread0,'INT') 380 if(tread0==1) then 381 dtsets(idtset)%iomode=intarr(1) 382 else 383 if ((xmpi_mpiio==1).and.(dtsets(idtset)%paral_kgb==1)) dtsets(idtset)%iomode=IO_MODE_MPI 384 #ifdef HAVE_NETCDF_DEFAULT 385 dtsets(idtset)%iomode=IO_MODE_ETSF 386 #endif 387 end if 388 389 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'pawmixdg',tread0,'INT') 390 if(tread0==1) then 391 dtsets(idtset)%pawmixdg=intarr(1) 392 else if (dtsets(idtset)%npfft>1.and.usepaw==1) then 393 dtsets(idtset)%pawmixdg=1 394 end if 395 396 ! Cycle if the processor is not used 397 if (mpi_enregs(idtset)%me<0.or.iexit>0) then 398 ABI_FREE(intarr) 399 ABI_FREE(dprarr) 400 cycle 401 end if 402 403 ! Set cprj_in_memory to zero if paral_kgb has been activated by autoparal 404 if (dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%cprj_in_memory = 0 405 406 response=0 407 if (dtsets(idtset)%rfddk/=0 .or. dtsets(idtset)%rf2_dkdk/=0 .or. dtsets(idtset)%rf2_dkde/=0 .or. & 408 & dtsets(idtset)%rfelfd/=0 .or. dtsets(idtset)%rfphon/=0 .or. dtsets(idtset)%rfstrs/=0 .or. & 409 & dtsets(idtset)%rfuser/=0 .or. dtsets(idtset)%rfmagn/=0) response=1 410 411 ! If no MPI, set all npxxx variables to 1 412 if (nproc==1) then 413 dtsets(idtset)%np_spkpt = 1 ; dtsets(idtset)%npband = 1 414 dtsets(idtset)%npfft = 1 ; dtsets(idtset)%npspinor = 1 415 dtsets(idtset)%nphf = 1 416 end if 417 418 ! --IF CUDA AND RECURSION:ONLY BAND PARALLELISATION 419 if(dtsets(idtset)%tfkinfunc==2 .and. nproc/=1)then 420 dtsets(idtset)%npband = dtsets(idtset)%npband*dtsets(idtset)%np_spkpt*dtsets(idtset)%npspinor*dtsets(idtset)%npfft 421 dtsets(idtset)%bandpp=1 422 dtsets(idtset)%np_spkpt = 1 423 dtsets(idtset)%npfft = 1 424 dtsets(idtset)%npspinor = 1 425 write(msg, '(5a,i6,a)' )& 426 'If the recursion scheme is used (tfkinfunc==2),',ch10,& 427 'only the band parallelisation is active, we set:',ch10,& 428 'npfft= 1, np_spkpt= 1, npband=',dtsets(idtset)%npband,' .' 429 ABI_WARNING(msg) 430 end if 431 432 if (dtsets(idtset)%npspinor>=2.and.dtsets(idtset)%nspinor==1) then 433 dtsets(idtset)%npspinor=1 434 dtsets(idtset)%npfft=2*dtsets(idtset)%npfft 435 write(msg,'(3a)')& 436 'npspinor is bigger than nspinor !',ch10,& 437 'We set npspinor to 1 ; we set npfft to 2*npfft' 438 ABI_WARNING(msg) 439 end if 440 441 ! Some checks on parallelization data 442 if(dtsets(idtset)%paral_kgb < 0 ) then 443 cycle 444 else if(dtsets(idtset)%paral_kgb/=0.and.(dtsets(idtset)%bandpp/=1.or.dtsets(idtset)%npband/=1.or.& 445 dtsets(idtset)%npfft/=1.or.dtsets(idtset)%np_spkpt/=1.or.dtsets(idtset)%npspinor/=1))then 446 if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor > nproc )then 447 write(msg,'(7a)')& 448 'The product of np_spkpt, npfft, npband and npspinor is bigger than the number of processors.',ch10,& 449 'The user-defined values of np_spkpt, npfft, npband or npspinor will be modified,',ch10,& 450 'in order to bring this product below nproc .',ch10,& 451 'At present, only a very simple algorithm is used ...' 452 ABI_WARNING(msg) 453 454 if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npband*dtsets(idtset)%npspinor <= nproc) then 455 dtsets(idtset)%npfft=1 456 ABI_WARNING('Set npfft to 1') 457 else if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npspinor <= nproc)then 458 dtsets(idtset)%npfft=1 459 dtsets(idtset)%npband=1 460 ABI_WARNING('Set npfft and npband to 1') 461 else if(dtsets(idtset)%np_spkpt <= nproc)then 462 dtsets(idtset)%npfft=1 463 dtsets(idtset)%npband=1 464 dtsets(idtset)%npspinor=1 465 ABI_WARNING('Set npfft ,npband and npspinor to 1') 466 else 467 dtsets(idtset)%npfft=1 468 dtsets(idtset)%npband=1 469 dtsets(idtset)%np_spkpt=1 470 dtsets(idtset)%npspinor=1 471 ABI_WARNING('Set npfft, npband, nspinor and np_spkpt to 1') 472 end if 473 else if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor < nproc)then 474 write(msg,'(a,5i6,4a)')& 475 'np_spkpt,npband,npspinor,npfft,nproc=',& 476 & dtsets(idtset)%np_spkpt,dtsets(idtset)%npfft,dtsets(idtset)%npband,dtsets(idtset)%npspinor,nproc,ch10,& 477 'The number of processors must not be greater than npfft*npband*np_spkpt*npspinor ',ch10,& 478 'when npfft or np_spkpt or npband or npspinor are chosen manually in the input file.' 479 ABI_ERROR(msg) 480 end if 481 end if 482 483 ! LOBPCG and ChebFi need paral_kgb=1 in parallel 484 if ((dtsets(idtset)%npband*dtsets(idtset)%npfft>1).and. & 485 & (mod(dtsets(idtset)%wfoptalg,10)==1.or.mod(dtsets(idtset)%wfoptalg,10)==4)) then 486 dtsets(idtset)%paral_kgb=1 487 end if 488 489 ! Check size of Scalapack communicator 490 #ifdef HAVE_LINALG_ELPA 491 if(dtsets(idtset)%paral_kgb>0.and.dtsets(idtset)%np_slk>0) then 492 np=min(dtsets(idtset)%np_slk,dtsets(idtset)%npband*dtsets(idtset)%npfft*dtsets(idtset)%npspinor) 493 irow=int(sqrt(float(np))) 494 do while(mod(np,irow)/=0) 495 irow=irow-1 496 end do 497 icol=nproc/irow 498 if (icol>mband_lower) then 499 do while(icol>mband_lower) 500 icol=icol-1 501 do while(mod(np,icol)/=0) 502 icol=icol-1 503 end do 504 end do 505 dtsets(idtset)%np_slk=icol 506 write(msg,'(5a,i6,a)')& 507 'The number of band*fft*spinor processors was not consistent with',ch10,& 508 'the size of communicator used for ELPA library (np_slk).',ch10,& 509 'np_slk value has been adjusted to ',dtsets(idtset)%np_slk,'.' 510 ABI_COMMENT(msg) 511 end if 512 end if 513 #endif 514 515 !Additional check in case of a parallelized Hartree-Fock calculation 516 ! %usefock == option to perform Fock exchange calculation 517 ! %nphf == number of processors for Fock exchange calculation 518 if ((dtsets(idtset)%usefock==1).and.(dtsets(idtset)%nphf/=1)) then 519 520 if ((dtsets(idtset)%nphf<0).or.(dtsets(idtset)%nphf==0)) then 521 ABI_ERROR('The value of variable nphf should be a non negative integer.') 522 end if 523 if (dtsets(idtset)%paral_kgb/=0) then 524 ABI_ERROR('Option paral_kgb should be turned off (value 0) for a parallelized Hartree-Fock calculation.') 525 end if 526 if (response/=0) then 527 ABI_ERROR('A response function calculation is not yet possible with a parallelized Hartree-Fock calculation.') 528 end if 529 if (dtsets(idtset)%npspinor>1) then 530 ABI_ERROR('The parallelism on spinors is not supported by a parallelized Hartree-Fock calculation.') 531 end if 532 if (dtsets(idtset)%np_spkpt*dtsets(idtset)%nphf > nproc )then 533 write(msg,'(a,3(a,i0))') ch10,& 534 'The product of variables np_spkpt and nphf is bigger than the number of processors: np_spkpt= ',& 535 dtsets(idtset)%np_spkpt,' nphf= ',dtsets(idtset)%nphf ,' and nproc= ', nproc 536 ABI_ERROR(msg) 537 end if 538 end if ! Fock 539 540 !When using chebfi, the number of blocks is equal to the number of processors 541 if((dtsets(idtset)%wfoptalg == 1) .or. (dtsets(idtset)%wfoptalg == 111)) then 542 !Nband might have different values for different kpoint, but not bandpp. 543 !In this case, we just use the largest nband (mband_upper), and the input will probably fail 544 !at the bandpp check later on 545 dtsets(idtset)%bandpp = mband_upper / dtsets(idtset)%npband 546 if(tread(8)==1) then 547 write(msg, '(a,i8,3a)' ) & 548 'bandpp has been internally set to ',dtsets(idtset)%bandpp,'.',ch10,& 549 'Indeed, there is no need to specify bandpp in the input when using chebfi (wfoptalg=1,111).' 550 ABI_COMMENT(msg) 551 end if 552 end if 553 554 !Check parallelization in case of RTTDDFT 555 !In particular ensure that bandpp = nband / npband 556 if (optdriver == RUNL_RTTDDFT) then 557 dtsets(idtset)%bandpp = mband_upper / dtsets(idtset)%npband 558 if ( tread(8) == 1 ) then 559 write(msg, '(a,a)') 'Setting bandpp is useless in RT-TDDFT because it is automatically set to nband/npband.', ch10 560 ABI_WARNING(msg) 561 end if 562 if (dtsets(idtset)%npfft/=1) then 563 dtsets(idtset)%npfft=1 564 write(msg, '(a,a)') 'RT-TDDFT is not compatible with FFT-parallelization. Remove npfft or set it to 1.', ch10 565 ABI_ERROR(msg) 566 end if 567 if (dtsets(idtset)%npspinor/=1) then 568 dtsets(idtset)%npspinor=1 569 write(msg, '(a,a)') 'RT-TDDFT is not compatible with spinor parallelization. Remove npspinor or set it to 1.', ch10 570 ABI_ERROR(msg) 571 end if 572 if (dtsets(idtset)%nphf/=1) then 573 dtsets(idtset)%nphf=1 574 write(msg, '(a,a)') 'RT-TDDFT is not compatible with HF parallelization. Remove nphf or set it to 1.', ch10 575 ABI_ERROR(msg) 576 end if 577 end if 578 579 ! Set mpi_enreg 580 mpi_enregs(idtset)%paral_kgb=dtsets(idtset)%paral_kgb 581 if(dtsets(idtset)%paral_kgb/=0)then 582 mpi_enregs(idtset)%nproc_spkpt=dtsets(idtset)%np_spkpt 583 mpi_enregs(idtset)%nproc_fft=dtsets(idtset)%npfft 584 mpi_enregs(idtset)%nproc_band=dtsets(idtset)%npband 585 mpi_enregs(idtset)%nproc_spinor=min(dtsets(idtset)%npspinor,dtsets(idtset)%nspinor) 586 mpi_enregs(idtset)%bandpp=dtsets(idtset)%bandpp 587 ! Additional setting in case of hybrid functional calculation => not yet tested (CMartins) 588 ! if (dtsets(idtset)%usefock==1) then 589 ! mpi_enregs(idtset)%nproc_hf = dtsets(idtset)%nphf 590 ! if (dtsets(idtset)%nphf>1) mpi_enregs(idtset)%paral_hf=1 591 ! end if 592 else 593 mpi_enregs(idtset)%bandpp = dtsets(idtset)%bandpp 594 ! Additional setting in case of a Fock exchange of PBE0 calculation 595 if (dtsets(idtset)%usefock==1) then 596 if (dtsets(idtset)%nphf>1) mpi_enregs(idtset)%paral_hf=1 597 mpi_enregs(idtset)%nproc_hf = dtsets(idtset)%nphf 598 if (dtsets(idtset)%np_spkpt/=1) then 599 mpi_enregs(idtset)%nproc_spkpt = dtsets(idtset)%np_spkpt 600 else 601 mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell/mpi_enregs(idtset)%nproc_hf 602 end if 603 else 604 mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell 605 end if 606 end if 607 608 if(dtsets(idtset)%paral_kgb>=0) then 609 610 ! Compute processor distribution over perturbations 611 mpi_enregs(idtset)%paral_pert=dtsets(idtset)%paral_rf 612 if (mpi_enregs(idtset)%paral_pert==1) then 613 dtsets(idtset)%nppert=max(1,dtsets(idtset)%nppert) 614 if(dtsets(idtset)%nppert>mpi_enregs(idtset)%nproc) then 615 ABI_ERROR('The number of processors must not be smaller than nppert !') 616 end if 617 call initmpi_pert(dtsets(idtset),mpi_enregs(idtset)) 618 mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell 619 nproc=mpi_enregs(idtset)%nproc_cell 620 end if 621 ! Cycle if the processor is not used 622 if (mpi_enregs(idtset)%me<0) then 623 ABI_FREE(intarr) 624 ABI_FREE(dprarr) 625 cycle 626 end if 627 628 ! Compute processor distribution over kpt (and eventually band-fft) 629 call initmpi_grid(mpi_enregs(idtset)) 630 if(dtsets(idtset)%usewvl==1) mpi_enregs(idtset)%comm_fft=mpi_enregs(idtset)%comm_cell 631 632 ! Initialize tabs used for k/spin parallelism (with sequential-type values) 633 ABI_MALLOC(mpi_enregs(idtset)%proc_distrb,(nkpt,mband_upper,nsppol)) 634 ABI_MALLOC(mpi_enregs(idtset)%my_kpttab,(nkpt)) 635 mpi_enregs(idtset)%proc_distrb(:,:,:)=0 636 mpi_enregs(idtset)%my_kpttab(:)=(/(ii,ii=1,nkpt)/) 637 mpi_enregs(idtset)%my_isppoltab(:)=1;if (dtsets(idtset)%nsppol==1) mpi_enregs(idtset)%my_isppoltab(2)=0 638 639 ! HF or hybrid calculation : initialization of the array distrb_hf 640 if (dtsets(idtset)%usefock==1) then 641 ABI_MALLOC(mpi_enregs(idtset)%distrb_hf,(dtsets(idtset)%nkpthf,dtsets(idtset)%nbandhf,1)) 642 ! The dimension of distrb_hf are given by %nkpthf and %nbandhf. 643 ! We assume that there will be no dependence in spinpol for all the occupied states. 644 mpi_enregs(idtset)%distrb_hf=0 645 end if 646 647 ! Define k-points distribution (determine who I am) 648 ! Note that nkpt_me may differ from processor to processor 649 ! This fact will NOT be taken into account when 650 ! the memory needs will be evaluated in the subroutine memory. 651 ! Also, the reduction of k points due to symmetry in RF calculations 652 ! is NOT taken into account. This should be changed later ... 653 nkpt_me=nkpt 654 mband_mem=0 655 if(xmpi_paral==1 .and. dtsets(idtset)%usewvl == 0) then 656 nkpt_me=0 657 if(response==0 .or. (response==1 .and. dtsets(idtset)%efmas==1))then 658 mpi_enregs(idtset)%paralbd=0 659 call distrb2(mband_upper,mband_mem,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset)) 660 do iikpt=1,nkpt 661 if(.not.(proc_distrb_cycle(mpi_enregs(idtset)%proc_distrb,iikpt,1,1,-1,mpi_enregs(idtset)%me_kpt)))& 662 & nkpt_me=nkpt_me+1 663 end do ! ikpt=1,nkpt 664 ! HF or hybrid calculation : define the occupied states distribution (in array distrb_hf) 665 if (dtsets(idtset)%usefock==1) then 666 call distrb2_hf(dtsets(idtset)%nbandhf,dtsets(idtset)%nkpthf,nproc,nsppol,mpi_enregs(idtset)) 667 end if 668 else ! response==1 669 ! TODO: check or remove the following comment which seems outdated 670 ! Wrongly assumes that the number of elements of the 671 ! k-point sets of the two spin polarizations is the maximal 672 ! value of one of these k-point sets ... 673 ! This is to be corrected when RF is implemented 674 ! for spin-polarized case. 675 ! ENDTODO 676 mpi_enregs(idtset)%paralbd=1 677 ! nproc=mpi_enregs(idtset)%nproc_cell*mpi_enregs(idtset)%nproc_pert 678 call distrb2(mband_upper,mband_mem,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset)) 679 do isppol=1,nsppol 680 nspink=0 681 do iikpt=1,nkpt 682 do iband=1,dtsets(idtset)%nband(iikpt+(isppol-1)*nkpt) 683 if(mpi_enregs(idtset)%proc_distrb(iikpt,iband,isppol)==mpi_enregs(idtset)%me_cell)then 684 nspink=nspink+1 685 exit 686 end if 687 end do ! iband 688 end do ! iikpt 689 if(nspink>nkpt_me)nkpt_me=nspink 690 end do ! isppol 691 ! Is nband present in input file or automatically estimated ? 692 tnband=0 693 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nband',tnband,'INT') 694 ! If the number of bands was estimated, there might be a side effect 695 ! when the definitive number of bands is known. k points 696 ! might be attributed to different processors than the present 697 ! proc_distrb describes. At most, the number of k points could increase by 1 ... 698 if(tnband==0)nkpt_me=nkpt_me+1 699 ! In any case, the maximal number of k points is nkpt 700 if(nkpt_me>nkpt)nkpt_me=nkpt 701 702 ! mband_mem 703 ABI_MALLOC (mybands, (mband_upper)) 704 mband_mem = 0 705 do isppol=1,nsppol 706 do iikpt=1,nkpt 707 mybands = 0 708 do iband=1,dtsets(idtset)%nband(iikpt+(isppol-1)*nkpt) 709 if(mpi_enregs(idtset)%proc_distrb(iikpt,iband,isppol)==mpi_enregs(idtset)%me_band)then 710 mybands(iband)=1 711 end if 712 end do ! iband 713 mband_mem = max(mband_mem, sum(mybands)) 714 end do ! iikpt 715 end do ! isppol 716 ABI_FREE (mybands) 717 end if ! response case 718 end if 719 end if 720 if (mband_mem == 0) mband_mem = mband_upper 721 dtsets(idtset)%mband_mem = mband_mem 722 723 ! Take care of mkmems. Use the generic name -mkmem- for mkmem as well as mkqmem 724 ! and mk1mem. 725 nm_mkmem(1)='mkmem ' 726 nm_mkmem(2)='mkqmem' 727 nm_mkmem(3)='mk1mem' 728 729 do ii=1,3 730 731 ! Read in mkmem here if it is in the input file 732 ! TODO: mkmem is not supported any longer. These variables can be removed. 733 if(ii==1)then 734 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mkmem',tread0,'INT') 735 else if(ii==2)then 736 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mkqmem',tread0,'INT') 737 else if(ii==3)then 738 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mk1mem',tread0,'INT') 739 end if 740 741 742 ! Note that mkmem is used as a dummy variable, representing mkmem as well 743 ! as mkqmem, and mk1mem. 744 if(tread0==1) then 745 mkmem=intarr(1) 746 if (mkmem<0) then 747 ! mkmem is unreasonable; must be zero or positive 748 write(msg, '(4a,i0,4a)')& 749 nm_mkmem(ii),' must be positive or null but ',nm_mkmem(ii),' =',mkmem,ch10,& 750 'Use default ',nm_mkmem(ii),' = nkpt .' 751 ABI_WARNING(msg) 752 mkmem=nkpt 753 end if 754 755 else 756 757 ! mkmem was not set in the input file so default to incore solution 758 !write(msg,'(6a)') & 759 !'mpi_setup: ',nm_mkmem(ii),' undefined in the input file.','Use default ',nm_mkmem(ii),' = nkpt' 760 !call wrtout(std_out, msg) 761 mkmem=nkpt 762 end if 763 764 ! Check whether nkpt distributed on the processors <= mkmem; 765 ! if so then may run entirely in core, 766 ! avoiding i/o to disk for wavefunctions and kg data. 767 ! mkmem/=0 to avoid i/o; mkmem==0 to use disk i/o for nkpt>=1. 768 if (nkpt_me<=mkmem .and. mkmem/=0 ) then 769 write(msg, '(a,i0,a,a,a,i0,a)' ) & 770 ' mpi_setup: With nkpt_me=',nkpt_me,' and ',nm_mkmem(ii),' = ',mkmem,', ground state wf handled in core.' 771 if (prtvol > 0) call wrtout(std_out,msg) 772 if(nkpt_me<mkmem .and. nkpt_me/=0)then 773 write(msg,'(3a)')' Resetting ',nm_mkmem(ii),' to nkpt_me to save memory space.' 774 mkmem=nkpt_me 775 if (prtvol > 0) call wrtout(std_out,msg) 776 end if 777 else if(mkmem/=0)then 778 write(msg, '(a,i0,3a,i0,5a)' ) & 779 ' mpi_setup: With nkpt_me=',nkpt_me,'and ',nm_mkmem(ii),' = ',mkmem,& 780 ' ground state wf require disk i/o.',ch10,& 781 ' Resetting ',nm_mkmem(ii),' to zero to save memory space.' 782 mkmem=0 783 if (prtvol > 0) call wrtout(std_out,msg) 784 end if 785 if(dtsets(idtset)%usewvl == 0 .or. dtsets(idtset)%usepaw==1)then 786 if(ii==1)dtsets(idtset)%mkmem=mkmem 787 end if 788 if(ii==2)dtsets(idtset)%mkqmem=mkmem 789 if(ii==3)dtsets(idtset)%mk1mem=mkmem 790 791 if(dtsets(idtset)%usewvl == 1 .and. dtsets(idtset)%usepaw==1 )then 792 if(dtsets(idtset)%mkmem .ne. dtsets(idtset)%nkpt) then 793 ABI_ERROR("mkmem is not allowed for WVL+PAW") 794 end if 795 end if 796 797 end do ! End the loop on the three possiblities mkmem, mkqmem, mk1mem. 798 799 if(dtsets(idtset)%paral_kgb==1) mpi_enregs(idtset)%paralbd=0 800 801 ! Check if some MPI processes are empty (MBPT codes uses a complete different MPI algorithm) 802 do_check = all(optdriver /= [RUNL_SCREENING, RUNL_SIGMA, RUNL_BSE, RUNL_EPH, RUNL_GWR]) 803 if (dtsets(idtset)%usewvl == 0 .and. do_check) then 804 if (.not.mpi_distrib_is_ok(mpi_enregs(idtset),mband_upper,& 805 dtsets(idtset)%nkpt,dtsets(idtset)%mkmem,nsppol,msg=msg)) then 806 write(msg,'(5a)') trim(msg),ch10,& 807 'YOU ARE STRONGLY ADVISED TO ACTIVATE AUTOMATIC PARALLELIZATION!',ch10,& 808 'USE "AUTOPARAL=1" IN THE INPUT FILE.' 809 ABI_WARNING(msg) 810 end if 811 end if 812 813 ! call mpi_setup1(dtsets(idtset),jdtset,lenstr,mband_upper,mpi_enregs(idtset),string) 814 ! Printing of processor distribution 815 ! MPIWF : here, set up the complete ngfft, containing the information 816 ! for the parallelisation of the FFT 817 call abi_io_redirect(new_io_comm=mpi_enregs(idtset)%comm_world) 818 call libpaw_write_comm_set(mpi_enregs(idtset)%comm_world) 819 820 ! Default values for sequential case 821 paral_fft=0; nproc_fft=1; me_fft=0 822 823 if(dtsets(idtset)%usewvl == 0)then 824 if(optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS) then 825 paral_fft=1 ! parallelisation over FFT 826 if (mpi_enregs(idtset)%nproc_cell>0) then 827 if(mpi_enregs(idtset)%paral_kgb == 1) then 828 829 if((dtsets(idtset)%gpu_option/=ABI_GPU_DISABLED).and.(mpi_enregs(idtset)%nproc_fft/=1))then 830 write(msg,'(3a,i0)') & 831 'When the use of GPU is on, the number of FFT processors, npfft, must be 1',ch10,& 832 'However, npfft=',mpi_enregs(idtset)%nproc_fft 833 ABI_ERROR(msg) 834 end if 835 836 if(modulo(dtsets(idtset)%ngfft(2),mpi_enregs(idtset)%nproc_fft)/=0)then 837 write(msg,'(3a,i0,a,i0)') & 838 'The number of FFT processors, npfft, should be a multiple of ngfft(2).',ch10,& 839 'However, npfft=',mpi_enregs(idtset)%nproc_fft,' and ngfft(2)=',dtsets(idtset)%ngfft(2) 840 ABI_BUG(msg) 841 end if 842 843 do iikpt=1,nkpt*nsppol 844 iikpt_modulo = modulo(iikpt,nkpt)+1 845 if ((dtsets(idtset)%istwfk(iikpt_modulo)==2)) then !.and.(dtsets(idtset)%ngfft(7)==401)) then 846 if ((mpi_enregs(idtset)%bandpp==0).or. & 847 ((mpi_enregs(idtset)%bandpp/=1).and.(modulo(mpi_enregs(idtset)%bandpp,2)/=0))) then 848 write(msg,'(3a,i0)') & 849 'The number bandpp should be 1 or a multiple of 2',ch10,& 850 'However, bandpp=',mpi_enregs(idtset)%bandpp 851 ABI_BUG(msg) 852 end if 853 if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then 854 write(msg,'(3a,i0,a,i0)') & 855 'The number of bands for the k-point, nband_k, should be a multiple of nproc_band*bandpp.',ch10,& 856 'However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', & 857 mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp 858 ABI_BUG(msg) 859 end if 860 else if ((dtsets(idtset)%istwfk(iikpt_modulo)==2) .and. (dtsets(idtset)%ngfft(7)==400)) then 861 ABI_BUG('The fftalg=400 with istwfk=2 is not valid') 862 else 863 if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then 864 write(msg,'(3a,i0,a,i0)') & 865 'The number of band for the k-point, nband_k, should be a multiple of nproc_band*bandpp.',ch10,& 866 'However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', & 867 mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp 868 ABI_BUG(msg) 869 end if 870 if ((mpi_enregs(idtset)%bandpp==0)) then 871 write(msg,'(a,i0,2a,i0,2a,i0)')& 872 'The number bandpp should not be 0 with fftalg=',dtsets(idtset)%ngfft(7),ch10,& 873 'and istwfk=',dtsets(idtset)%istwfk(iikpt_modulo),ch10,& 874 'However, bandpp=',mpi_enregs(idtset)%bandpp 875 ABI_BUG(msg) 876 end if 877 end if 878 end do 879 880 if (xmpi_paral==1) then 881 if(modulo(nkpt*nsppol,mpi_enregs(idtset)%nproc_spkpt)/=0)then 882 write(msg,'(3a,i0,a,i0)') & 883 'The number of KPT processors, np_spkpt, should be a multiple of nkpt*nsppol.',ch10,& 884 'However, np_spkpt=',mpi_enregs(idtset)%nproc_spkpt,' and nkpt*nsppol=',nkpt*nsppol 885 ABI_WARNING(msg) 886 end if 887 end if 888 else 889 do iikpt=1,nkpt*nsppol 890 iikpt_modulo = modulo(iikpt,nkpt)+1 891 if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then 892 write(msg,'(3a,i0,a,i0)') & 893 'The number of band for the k-point, nband_k, should be a multiple of npband*bandpp.',ch10,& 894 'However, nband_k=',dtsets(idtset)%nband(iikpt),' and npband*bandpp=', & 895 mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp 896 ABI_BUG(msg) 897 end if 898 end do 899 end if 900 end if 901 nproc_fft=mpi_enregs(idtset)%nproc_fft 902 me_fft=mpi_enregs(idtset)%me_fft 903 end if 904 end if 905 906 ! Compute mgfft,mpw,nfft for this data set (it is dependent of mpi_enreg) 907 ABI_MALLOC(istwfk,(nkpt)) 908 ABI_MALLOC(kpt_with_shift,(3,nkpt)) 909 910 ! Set the default value of fftalg for given npfft but allow the user to override it. 911 ! Warning: If you need to change npfft, **DO IT** before this point so that here we get the correct fftalg 912 dtsets(idtset)%ngfft(7) = fftalg_for_npfft(dtsets(idtset)%npfft) 913 dtsets(idtset)%ngfftdg(7) = fftalg_for_npfft(dtsets(idtset)%npfft) 914 915 fftalg_read=.false. 916 call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'fftalg',tread0,'INT') 917 918 if (tread0==1) then 919 dtsets(idtset)%ngfft(7)=intarr(1) 920 if (usepaw==1) dtsets(idtset)%ngfftdg(7)=intarr(1) 921 fftalg_read=.true. 922 end if 923 924 ecut =dtsets(idtset)%ecut 925 dilatmx =dtsets(idtset)%dilatmx 926 ngfft(:) =dtsets(idtset)%ngfft(:) 927 istwfk(:)=dtsets(idtset)%istwfk(1:nkpt) 928 nsym =dtsets(idtset)%nsym 929 930 nqpt=dtsets(idtset)%nqpt 931 qphon(:)=zero;if(nqpt/=0) qphon(:)=dtsets(idtset)%qptn(:) 932 933 ABI_MALLOC(symrel,(3,3,nsym)) 934 symrel(:,:,1:nsym)=dtsets(idtset)%symrel(:,:,1:nsym) 935 ecut_eff=ecut*dilatmx**2 936 937 if (usepaw==1) call wrtout(std_out,'getng is called for the coarse grid:') 938 kpt=k0; if (response==1.and.usepaw==1) kpt=qphon ! this is temporary 939 940 call getng(dtsets(idtset)%boxcutmin,dtsets(idtset)%chksymtnons,ecut_eff,gmet,kpt,me_fft,mgfft,nfft,& 941 & ngfft,nproc_fft,nsym,paral_fft,symrel,dtsets(idtset)%tnons,& 942 & gpu_option=dtsets(idtset)%gpu_option) 943 944 dtsets(idtset)%ngfft(:)=ngfft(:) 945 dtsets(idtset)%mgfft=mgfft 946 dtsets(idtset)%nfft=nfft 947 kpt_with_shift(:,:)=dtsets(idtset)%kpt(:,1:nkpt)/dtsets(idtset)%kptnrm 948 949 exchn2n3d=dtsets(idtset)%exchn2n3d 950 nproc_fft=ngfft(10) ; me_fft=ngfft(11) 951 fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10) 952 953 ! Initialize tables for MPI-FFT. 954 call init_distribfft(mpi_enregs(idtset)%distribfft,'c',mpi_enregs(idtset)%nproc_fft,ngfft(2),ngfft(3)) 955 956 if(response/=0)then 957 ! This value of mpw is used in the first part of respfn.f 958 call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw_k,nkpt) 959 end if 960 if(nqpt/=0)then 961 kpt_with_shift(1,:)=kpt_with_shift(1,:)+qphon(1) 962 kpt_with_shift(2,:)=kpt_with_shift(2,:)+qphon(2) 963 kpt_with_shift(3,:)=kpt_with_shift(3,:)+qphon(3) 964 end if 965 if (dtsets(idtset)%usewvl == 0) then 966 call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw,nkpt) 967 ! Allocate tables for parallel IO of the wavefunctions. 968 if( xmpi_mpiio==1 .and. mpi_enregs(idtset)%paral_kgb == 1 .and. & 969 & any(dtsets(idtset)%iomode == [IO_MODE_MPI, IO_MODE_ETSF])) then 970 ABI_MALLOC(mpi_enregs(idtset)%my_kgtab,(mpw,dtsets(idtset)%mkmem)) 971 end if 972 else 973 mpw = 0 974 end if 975 976 ! The dimensioning, in the RF case, should be done only with mpw, 977 ! but mpw is used in the first part of respfn.f, and should at least 978 ! be equal to mpw_k . The chosen way to code is not optimal, only convenient : 979 ! it leads to a small waste of memory. 980 if(response/=0 .and. mpw_k>mpw)mpw=mpw_k 981 dtsets(idtset)%ngfft(:)=ngfft(:) 982 983 ! Initialize ngfftc to the initial guess for the coarse mesh 984 ngfftc(:) = 2 985 986 ! In case of PAW, compute fine FFT parameters 987 if (usepaw==1) then 988 ecutdg_eff=dtsets(idtset)%pawecutdg*dtsets(idtset)%dilatmx**2 989 ngfftdg(:)=dtsets(idtset)%ngfftdg(:) 990 call wrtout(std_out,'getng is called for the fine grid:') 991 ! Start with the coarse mesh as an initial guess for the fine mesh 992 ! This ensures that the fine mesh will not be any coarser than the coarse mesh in each dimension 993 ngfftc(:) = ngfft(1:3) 994 kpt=k0; if (response==1.and.usepaw==1) kpt=qphon ! this is temporary 995 996 call getng(dtsets(idtset)%bxctmindg,dtsets(idtset)%chksymtnons,& 997 & ecutdg_eff,gmet,kpt,me_fft,mgfftdg,& 998 & nfftdg,ngfftdg,nproc_fft,nsym,paral_fft,symrel,dtsets(idtset)%tnons,ngfftc,& 999 & gpu_option=dtsets(idtset)%gpu_option) 1000 1001 dtsets(idtset)%ngfftdg(:)=ngfftdg(:) 1002 dtsets(idtset)%mgfftdg=mgfftdg 1003 dtsets(idtset)%nfftdg=nfftdg 1004 ! Compute fft distribution for fine grid 1005 fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10) 1006 call init_distribfft(mpi_enregs(idtset)%distribfft,'f', mpi_enregs(idtset)%nproc_fft,ngfftdg(2),ngfftdg(3)) 1007 end if 1008 1009 dtsets(idtset)%mpw=mpw 1010 ABI_FREE(symrel) 1011 ABI_FREE(istwfk) 1012 ABI_FREE(kpt_with_shift) 1013 ABI_FREE(intarr) 1014 ABI_FREE(dprarr) 1015 1016 ! Initialize data for the parallelization over atomic sites (PAW) 1017 if (dtsets(idtset)%natom==1) dtsets(idtset)%paral_atom=0 1018 if (dtsets(idtset)%usepaw==0) dtsets(idtset)%paral_atom=0 1019 if (dtsets(idtset)%usewvl/=0) dtsets(idtset)%paral_atom=0 1020 if (dtsets(idtset)%usedmft==1) dtsets(idtset)%paral_atom=0 1021 if (optdriver/=RUNL_GSTATE.and.optdriver/=RUNL_RESPFN.and.optdriver/=RUNL_GWLS) dtsets(idtset)%paral_atom=0 1022 if (dtsets(idtset)%macro_uj/=0) dtsets(idtset)%paral_atom=0 1023 1024 call initmpi_atom(dtsets(idtset),mpi_enregs(idtset)) 1025 1026 ! In case of the use of a GPU (Cuda), some defaults can change 1027 ! according to a threshold on matrix sizes 1028 if (dtsets(idtset)%gpu_option/=ABI_GPU_DISABLED.or.dtsets(idtset)%gpu_option==ABI_GPU_UNKNOWN) then 1029 if (optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS) then 1030 vectsize=dtsets(idtset)%mpw*dtsets(idtset)%nspinor/dtsets(idtset)%npspinor 1031 if (all(dtsets(idtset)%istwfk(:)==2)) vectsize=2*vectsize 1032 blocksize=dtsets(idtset)%npband*dtsets(idtset)%bandpp 1033 if (dtsets(idtset)%paral_kgb==0) blocksize=dtsets(idtset)%npfft 1034 if ((vectsize*blocksize**2)>=dtsets(idtset)%gpu_linalg_limit) then 1035 if (.not.wfoptalg_read) then 1036 dtsets(idtset)%wfoptalg=14 1037 if (tread(8)==1) then ! bandpp read, so set nblock_lobpcg 1038 dtsets(idtset)%nblock_lobpcg=mband_upper/(dtsets(idtset)%bandpp*dtsets(idtset)%npband) 1039 else ! bandpp set by nblock_lobpcg 1040 dtsets(idtset)%bandpp=mband_upper/(dtsets(idtset)%nblock_lobpcg*dtsets(idtset)%npband) 1041 end if 1042 if (.not.fftalg_read) then 1043 dtsets(idtset)%ngfft(7) = fftalg_for_npfft(dtsets(idtset)%npfft) 1044 if (usepaw==1) dtsets(idtset)%ngfftdg(7) = fftalg_for_npfft(dtsets(idtset)%npfft) 1045 end if 1046 if (.not.ortalg_read) dtsets(idtset)%ortalg=-abs(dtsets(idtset)%ortalg) 1047 end if 1048 end if 1049 end if 1050 end if 1051 1052 ! initialize data for the parallelization for WVL: 1053 if(dtsets(idtset)%usewvl==1) then 1054 mpi_enregs(idtset)%comm_wvl=mpi_enregs(idtset)%comm_cell 1055 mpi_enregs(idtset)%nproc_wvl=xmpi_comm_size(mpi_enregs(idtset)%comm_wvl) 1056 mpi_enregs(idtset)%me_wvl=xmpi_comm_rank(mpi_enregs(idtset)%comm_wvl) 1057 end if 1058 1059 end do 1060 1061 !This is not a very clean exit in case of paral_kgb<0 1062 if (iexit/=0)then 1063 ABI_STOP("Stopping now!") 1064 end if 1065 1066 DBG_EXIT("COLL") 1067 1068 end subroutine mpi_setup