TABLE OF CONTENTS


ABINIT/compute_kgb_indicator [ Functions ]

[ Top ] [ Functions ]

NAME

 compute_kgb_indicator

FUNCTION

 Only for "KGB" parallelism (LOBPCG algorithm for Ground-state):
  Give an indicator of performance for a given distribution of processors
  (npband, npfft and bandpp).
  Determine best choice of parameters for Scalapack and/or Magma Linear Algebra routines.

INPUTS

  bandpp=internal lobpcg optimization variable
  glb_comm=communicator for global MPI communications
  mband=maximum number of bands.
  mband=maximum number of plane waves
  npband=number of processor 'band'
  npfft = number of processor 'fft'
  uselinalggpu=indicate if we also test the gpu linear algebra

OUTPUT

  acc_kgb = indicator of performance
  npslk = number of process to used in communicators

SIDE EFFECTS

 This routine can be used to find an indicator in order to refine automatic process distribution.
   This indicator is returned in acc_kgb
 This routine can be used to find the optimal values of np_slk parameter (ScaLapack)
   and wheter or not we should use Magma for Linear Algebra in lobpcgwf

SOURCE

2001 subroutine compute_kgb_indicator(acc_kgb,bandpp,glb_comm,mband,mpw,npband,npfft,npslk,uselinalggpu)
2002 
2003  use m_abi_linalg
2004 
2005 !Arguments ------------------------------------
2006 !scalars
2007  integer,intent(in) :: bandpp,glb_comm,mband,mpw,npband,npfft
2008  integer,intent(inout) :: npslk,uselinalggpu
2009  real(dp),intent(inout) :: acc_kgb
2010 
2011 !Local variables-------------------------------
2012 !scalars
2013  integer,parameter :: max_number_of_npslk=10,max_number_of_iter=10
2014  integer :: blocksize,bigorder,ierr,ii,islk,islk1,iter,jj,keep_gpu
2015  integer :: kgb_comm,my_rank,np_slk,np_slk_max,np_slk_best,nranks
2016  integer :: use_lapack_gpu,use_slk,vectsize,wfoptalg
2017  real(dp) :: min_eigen,min_ortho,time_xeigen,time_xortho
2018  character(len=500) :: msg
2019 !arrays
2020  integer,allocatable :: ranks(:),val_npslk(:)
2021  real(dp),allocatable :: eigen(:),grama(:,:),gramb(:,:)
2022  complex(dpc),allocatable :: blockvectorbx(:,:),blockvectorx(:,:),sqgram(:,:)
2023 
2024 !******************************************************************
2025 
2026  DBG_ENTER("COLL")
2027 
2028 #ifdef DEBUG_MODE
2029  write(msg,'(a,3i3)') 'compute_kgb_indicator : (bpp,npb,npf) = ', bandpp, npband, npfft
2030  call wrtout(std_out,msg,'PERS')
2031 #endif
2032 
2033 !Create local communicator for test
2034  if (xmpi_paral==1) then
2035    nranks=npfft*npband
2036    ABI_MALLOC(ranks,(nranks))
2037    ranks=(/((my_rank-1),my_rank=1,nranks)/)
2038    kgb_comm=xmpi_subcomm(glb_comm,nranks,ranks,my_rank_in_group=my_rank)
2039    ABI_FREE(ranks)
2040  else
2041    kgb_comm=xmpi_comm_self
2042    my_rank=0
2043  end if
2044 
2045 !Only for process in the new subgroup
2046  if (my_rank/=xmpi_undefined) then
2047 
2048 !  We enforce vectsize >=blocksize  (This is not true in lobpcg but
2049 !  these are rare cases and this simplify the matrix constructions below...)
2050    blocksize=npband*bandpp
2051    vectsize=max(1+mpw/(npband*npfft),blocksize)
2052    bigorder=3*blocksize
2053 
2054    ABI_MALLOC(blockvectorx,(vectsize,blocksize))
2055    ABI_MALLOC(blockvectorbx,(vectsize,blocksize))
2056    ABI_MALLOC(sqgram,(blocksize,blocksize))
2057    ABI_MALLOC(grama,(2*bigorder,bigorder))
2058    ABI_MALLOC(gramb,(2*bigorder,bigorder))
2059    ABI_MALLOC(eigen,(bigorder))
2060    ABI_MALLOC(val_npslk,(max_number_of_npslk)) ! not too much values tested
2061 
2062    min_eigen=greatest_real
2063    min_ortho=greatest_real
2064    np_slk_best=-1 ; np_slk_max=0
2065 #ifdef HAVE_LINALG_SCALAPACK
2066    np_slk_max=min(max_number_of_npslk,npband*npfft)
2067 #endif
2068 
2069 !  Preselect a range of available np_slk values
2070    val_npslk(1:)=0 ; val_npslk(2)=1
2071    do islk=3,np_slk_max
2072      np_slk=val_npslk(islk-1)*2
2073      do while ((modulo(npband*npfft,np_slk)>0).and.(np_slk<(npband*npfft)))
2074        np_slk=np_slk+1
2075      end do
2076      if(np_slk>(npband*npfft).or.np_slk>mband) exit
2077      val_npslk(islk)=np_slk
2078    end do
2079    np_slk_max=islk-1
2080 
2081 !  Loop over np_slk values
2082    islk1=1
2083 #ifdef HAVE_LINALG_MAGMA
2084    islk1=1-uselinalggpu
2085 #endif
2086    do islk=islk1,np_slk_max
2087 
2088      time_xortho=zero ; time_xeigen=zero
2089 
2090      use_slk=0
2091      if (islk==0) then
2092 !      This is the test for the GPU
2093        use_lapack_gpu=1 ; np_slk=0
2094      else
2095        use_lapack_gpu=0 ; np_slk=val_npslk(islk)
2096        if (np_slk>0) use_slk=1
2097      end if
2098 
2099 !    Initialize linalg parameters for this np_slk value
2100 !    For the first np_slk value, everything is initialized
2101 !    For the following np_slk values, only Scalapack parameters are updated
2102      wfoptalg=14 ! Simulate use of LOBPCG
2103      call abi_linalg_init(bigorder,RUNL_GSTATE,wfoptalg,1,&
2104 &                         use_lapack_gpu,use_slk,np_slk,kgb_comm)
2105 
2106 !    We could do mband/blocksize iter as in lobpcg but it's too long
2107      do iter=1,max_number_of_iter
2108 
2109 !      Build matrixes
2110        do ii=1,vectsize
2111          do jj=1,blocksize
2112            if (ii>jj) then
2113              blockvectorx(ii,jj) =czero
2114              blockvectorbx(ii,jj)=czero
2115            else
2116              blockvectorx(ii,jj) =cone
2117              blockvectorbx(ii,jj)=cone
2118            end if
2119          end do
2120        end do
2121        grama=zero;gramb=zero
2122        do jj=1,bigorder
2123          do ii=jj,bigorder
2124            if (ii==jj) then
2125              grama(2*ii-1,jj)=one
2126              gramb(2*ii-1,jj)=one
2127            else
2128              grama(2*ii-1:2*ii,jj)=one
2129              grama(2*jj-1,ii)= one
2130              grama(2*jj  ,ii)=-one
2131            end if
2132          end do
2133        end do
2134 
2135 !      Call to abi_xorthonormalize
2136        time_xortho=time_xortho-abi_wtime()
2137        call abi_xorthonormalize(blockvectorx,blockvectorbx,blocksize,kgb_comm,sqgram,vectsize)
2138        time_xortho = time_xortho + abi_wtime()
2139 
2140 !      Call to abi_xhegv
2141        time_xeigen=time_xeigen-abi_wtime()
2142        call abi_xhegv(1,'v','u',bigorder,grama,bigorder,gramb,bigorder,eigen,&
2143 &       x_cplx=2,use_slk=use_slk,use_gpu=use_lapack_gpu)
2144        time_xeigen=time_xeigen+abi_wtime()
2145 
2146      end do ! iter
2147 
2148 !    Finalize linalg parameters for this np_slk value
2149 !    For the last np_slk value, everything is finalized
2150 !    For the previous np_slk values, only Scalapack parameters are updated
2151      call abi_linalg_finalize()
2152 
2153      time_xortho= time_xortho*mband/blocksize
2154      time_xeigen= time_xeigen*mband/blocksize
2155      if (time_xortho<min_ortho) min_ortho=time_xortho
2156      if (time_xeigen<min_eigen) then
2157        min_eigen=time_xeigen
2158        np_slk_best=np_slk
2159        keep_gpu=use_lapack_gpu
2160      end if
2161 
2162    end do ! np_slk
2163 
2164 #ifdef DEBUG_MODE
2165    write(msg,'(2(a,es15.3),a,i3)') ' In the best case, xortho took ',min_ortho,&
2166     ' and xeigen took ',min_eigen,' for np_slk=',np_slk_best
2167    call wrtout(std_out,msg,'PERS')
2168 #endif
2169 
2170 !  Final values to be sent to others process
2171    acc_kgb=min_ortho+four*min_eigen
2172    npslk=max(np_slk_best,1)
2173    uselinalggpu=keep_gpu
2174 
2175    ABI_FREE(blockvectorx)
2176    ABI_FREE(blockvectorbx)
2177    ABI_FREE(sqgram)
2178    ABI_FREE(grama)
2179    ABI_FREE(gramb)
2180    ABI_FREE(eigen)
2181    ABI_FREE(val_npslk)
2182 
2183  end if ! my_rank in group
2184 
2185 !Free local MPI communicator
2186  call xmpi_comm_free(kgb_comm)
2187 
2188 !Broadcast of results to be sure every process has them
2189  call xmpi_bcast(acc_kgb,0,glb_comm,ierr)
2190  call xmpi_bcast(npslk,0,glb_comm,ierr)
2191  call xmpi_bcast(uselinalggpu,0,glb_comm,ierr)
2192 
2193 #ifndef DEBUG_MODE
2194  ABI_UNUSED(msg)
2195 #endif
2196 
2197  DBG_EXIT("COLL")
2198 
2199 end subroutine compute_kgb_indicator

ABINIT/finddistrproc [ Functions ]

[ Top ] [ Functions ]

NAME

 finddistrproc

FUNCTION

   Given a total number of processors, find a suitable distribution
   that fill all the different levels of parallelization
   (npimage, nppert, np_spkpt, npspinor, npband, npfft, bandpp)
   Also determine parameters of parallel Linear Algebra routines
   (use_slk, np_slk, gpu_linalg_limit)

INPUTS

  dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables,
   for all datasets; at this stage only datasets with index lower than
   idtset are already initalized
  filnam(5)=character strings giving file names
  idtset=number of the current dataset
  mpi_enreg=information about MPI parallelization
  mband=maximum number of bands.
  ndtset_alloc=number of datasets, corrected for allocation of at least one data set
  tread(11)=flags indicating wether parallel input parameters were read from input file
            tread(1)  : paral_kgb      tread(6) : npfft
            tread(2)  : npimage        tread(7) : npband
            tread(3)  : nppert         tread(8) : bandpp
            tread(4)  : np_spkpt       tread(9) : use_slk
            tread(5)  : nspinor        tread(10): np_slk
            tread(11) : gpu_linalg_limit

SIDE EFFECTS

  iexit= if incremented, an exit is required
  dtset%paral_kgb= flag for band-fft parallelism
  dtset%npimage  = number of processors for parallelisation over image
  dtset%nppert   = number of processors for parallelisation over perturbations
  dtset%npspinor = number of processors for parallelisation on spinor components
  dtset%np_spkpt  = number of processors for parallelisation on spin / k points
  dtset%npfft    = number of processors for parallelisation on fft grid
  dtset%npband   = number of processors for parallelisation on bands
  dtset%nphf     = number of processors for parallelisation on occupied states for fock exchange
  dtset%bandpp   = internal parameter for lobpcg parallelisation algorithm
  dtset%use_slk  = flag for ScalaPAck use
  dtset%np_slk   = number of processors used in ScaLapack routines
  dtset%gpu_linalg_limit=threshold activating Linear Algebra on GPU

SOURCE

1056  subroutine finddistrproc(dtsets,filnam,idtset,iexit,mband,mpi_enreg,ndtset_alloc,tread)
1057 
1058 !Arguments ------------------------------------
1059 !scalars
1060  integer,intent(in) :: idtset,mband,ndtset_alloc
1061  integer,intent(inout) :: iexit
1062  type(dataset_type),intent(inout),target :: dtsets(0:ndtset_alloc)
1063  type(MPI_type),intent(inout) :: mpi_enreg
1064 !arrays
1065  integer,intent(in) :: tread(11)
1066  character(len=fnlen),intent(in) :: filnam(5)
1067 
1068 !Local variables-------------------------------
1069 !scalars
1070 !128 should be a reasonable maximum for npfft (scaling is very poor for npfft>20)
1071  integer,parameter :: ALGO_NOT_SET=-1, ALGO_DEFAULT_PAR=2
1072  integer,parameter :: ALGO_CG=0, ALGO_LOBPCG_OLD=1, ALGO_LOBPCG_NEW=2, ALGO_CHEBFI=3, ALGO_CHEBFI_NEW=4
1073  integer,parameter :: NPFMAX=128,BLOCKSIZE_MAX=3000,MAXBAND_PRINT=10
1074  integer,parameter :: MAXCOUNT=250,MAXPRINT=10,MAXBENCH=25,MAXABIPY=25,NPF_CUTOFF=20
1075  real(dp),parameter :: relative_nband_range=0.025
1076  integer :: wf_algo,wf_algo_global,bpp,bpp_max,bpp_min,optdriver,autoparal,nblocks,blocksize
1077  integer :: npi_max,npi_min,npc,npc_max,npc_min
1078  integer :: np_sk,np_sk_max,np_sk_min,npp_max,npp_min
1079  integer :: nps,nps_max,nps_min,npf,npf_max,npf_min
1080  integer :: npb,npb_max,npb_min,max_ncpus,ount,paral_kgb
1081  integer :: work_size,nks_per_proc,tot_ncpus
1082  integer :: ib1,ib2,ibest,icount,ii,imin,jj,kk,mcount,mcount_eff,mpw
1083  integer :: n2,n3,ncell_eff,ncount,nimage_eff,nkpt_eff,npert_eff
1084  integer :: nproc,nproc1,nprocmin,np_slk,nthreads,use_linalg_gpu,omp_ncpus
1085  logical :: dtset_found,file_found,first_bpp,iam_master
1086  logical :: with_image,with_pert,with_kpt,with_spinor,with_fft,with_band,with_bandpp,with_thread
1087  real(dp):: acc_c,acc_k,acc_kgb,acc_kgb_0,acc_s,ecut_eff,eff,ucvol,weight0
1088  character(len=9) :: suffix
1089  character(len=20) :: strg
1090  character(len=500) :: msg,msgttl
1091  character(len=fnlen) :: filden
1092  type(hdr_type) :: hdr0
1093 !arrays
1094  integer :: idum(1),idum3(3),ngmax(3),ngmin(3)
1095  integer,allocatable :: nband_best(:),isort(:),jdtset_(:)
1096  integer,allocatable :: my_algo(:),my_distp(:,:),nproc_best(:)
1097  integer,pointer :: nkpt_rbz(:)
1098  real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3),rprimd(3,3)
1099  real(dp),allocatable :: weight(:)
1100  real(dp),pointer :: nband_rbz(:,:)
1101  type(dataset_type),pointer :: dtset
1102 
1103 !******************************************************************
1104 
1105  DBG_ENTER("COLL")
1106 
1107 !Select current dataset
1108  dtset => dtsets(idtset)
1109 
1110 !Is automatic parallelization activated?
1111  autoparal = dtset%autoparal
1112  if (autoparal==0) return
1113 
1114 !Is it available
1115  if ((dtset%usefock==1).AND.(dtset%nphf/=1)) then
1116    ABI_ERROR("autoparal>0 not available for Hartree-Fock or hybrid XC calculations!")
1117  end if
1118  if ((autoparal>1).and.dtset%wfoptalg/=4.and.dtset%wfoptalg/=14) then
1119    ABI_ERROR("autoparal>1 only available for the old LOBPCG algorithm (wfoptalg=4/14)!")
1120  end if
1121 
1122  ! Unit number used for outputting the autoparal sections
1123  ount = ab_out
1124 
1125  ! From the documentation:
1126  !
1127  !   If autoparal > 1 and max_ncpus is greater than 0, ABINIT analyzes the
1128  !   efficiency of the process distribution for each possible number of processors
1129  !   from 2 to max_ncpus. After having printed out the efficiency, the code stops.
1130 
1131  ! Handy local variables
1132  iam_master = (mpi_enreg%me==0)
1133  optdriver = dtset%optdriver
1134  max_ncpus = dtset%max_ncpus ; if (dtset%paral_kgb<0) max_ncpus=abs(dtset%paral_kgb)
1135  nthreads=xomp_get_max_threads()
1136  nproc=mpi_enreg%nproc
1137  if (max_ncpus>0) nproc = dtset%max_ncpus/nthreads
1138  if (xmpi_paral==0.and.max_ncpus<=0) nproc=1
1139 
1140  nprocmin=2
1141  if (xmpi_paral==1.and.max_ncpus<=0) nprocmin=max(2,nproc-100)
1142  if (max_ncpus>0.and.autoparal/=0) nprocmin=1
1143 
1144  wf_algo_global=ALGO_NOT_SET
1145  if (dtset%wfoptalg==0.and.tread(1)==1) wf_algo_global=ALGO_CG
1146  if (dtset%wfoptalg==4.or.dtset%wfoptalg==14) wf_algo_global=ALGO_LOBPCG_OLD
1147  if (dtset%wfoptalg==114) wf_algo_global=ALGO_LOBPCG_NEW
1148  if (dtset%wfoptalg==1) wf_algo_global=ALGO_CHEBFI
1149  if (dtset%wfoptalg==111) wf_algo_global=ALGO_CHEBFI_NEW
1150 
1151  ! Some peculiar cases (with direct exit)
1152  ! MG: What is the meaning of max_ncpus < 0. This is not documented!
1153  if (max_ncpus<=0) then
1154    if (nproc==1.and.max_ncpus<=0) then
1155      if (tread(1)==0.or.xmpi_paral==0) dtset%paral_kgb= 0
1156      if (tread(2)==0.or.xmpi_paral==0) dtset%npimage  = 1
1157      if (tread(3)==0.or.xmpi_paral==0) dtset%nppert   = 1
1158      if (tread(4)==0.or.xmpi_paral==0) dtset%npspinor = 1
1159      if (tread(5)==0.or.xmpi_paral==0) dtset%np_spkpt = 1
1160      if (tread(6)==0.or.xmpi_paral==0) dtset%npfft    = 1
1161      if (tread(7)==0.or.xmpi_paral==0) dtset%npband   = 1
1162      if (tread(8)==0.or.xmpi_paral==0) dtset%bandpp   = 1
1163      if (tread(9)==0.or.xmpi_paral==0) dtset%use_slk  = 0
1164      if (tread(10)==0.or.xmpi_paral==0) dtset%np_slk  = 1000000
1165      return
1166    end if
1167    if ((optdriver/=RUNL_GSTATE.and. optdriver/=RUNL_RESPFN.and. optdriver/=RUNL_GWLS).or. &
1168        (optdriver==RUNL_GSTATE.and.dtset%usewvl==1)) then
1169      dtset%paral_kgb= 0
1170      dtset%npimage  = max(1,dtset%npimage)
1171      dtset%nppert   = max(1,dtset%nppert)
1172      dtset%npspinor = max(1,dtset%npspinor)
1173      dtset%np_spkpt = max(1,dtset%np_spkpt)
1174      dtset%npfft    = max(1,dtset%npfft)
1175      dtset%npband   = max(1,dtset%npband)
1176      dtset%bandpp   = max(1,dtset%bandpp)
1177      return
1178    end if
1179  end if
1180 
1181  ! Need the metric tensor
1182  call mkrdim(dtset%acell_orig(1:3,1),dtset%rprim_orig(1:3,1:3,1),rprimd)
1183  call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)
1184 
1185  ! Determine some quantities related to plane waves
1186  !  - Crude estimation of the number of PW
1187  !  - Number of G vectors in each direction
1188  mpw=0;ngmin=0;ngmax=0
1189  if (optdriver==RUNL_GSTATE) then
1190    ecut_eff = dtset%ecut*dtset%dilatmx**2
1191    mpw = nint(ucvol*((two*ecut_eff)**1.5_dp)/(six*pi**2)) ! Crude estimation
1192    if (all(dtset%istwfk(1:dtset%nkpt)>1)) mpw=mpw/2+1
1193    call kpgcount(ecut_eff,dtset%exchn2n3d,gmet,dtset%istwfk,dtset%kpt,ngmax,ngmin,dtset%nkpt)
1194    write(msg,'(a,i0)') ' getmpw sequential formula gave: ',mpw
1195    call wrtout(std_out,msg)
1196  end if
1197 
1198  ! Parallelization over images
1199  npi_min=1;npi_max=1;nimage_eff=1
1200  if (optdriver==RUNL_GSTATE) then
1201    nimage_eff=dtset%ndynimage
1202    if (dtset%ntimimage<=1) nimage_eff=dtset%nimage
1203    npi_min=max(1,dtset%npimage)
1204    npi_max=min(nproc,nimage_eff)
1205    if (tread(2)==1) npi_max=dtset%npimage
1206  end if
1207 
1208 !Parallelization over k-points and spin components (GS)
1209  np_sk_min=1;np_sk_max=1;nkpt_eff=0
1210  if (optdriver==RUNL_GSTATE) then
1211    nkpt_eff=dtset%nkpt*dtset%nsppol
1212    np_sk_min=max(1,dtset%np_spkpt)
1213    np_sk_max=min(nproc,nkpt_eff)
1214    if (tread(4)==1) np_sk_max=dtset%np_spkpt
1215  end if
1216 
1217 !Parallelization over perturbations, k-points and spin components (DFPT)
1218  npp_min=1;npp_max=1;npert_eff=1
1219  if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then
1220    if (dtset%paral_rf==1) then
1221      call dtset%get_npert_rbz(nband_rbz, nkpt_rbz, npert_eff)
1222      do jj=1,npert_eff
1223        ii=dtset%nsppol*nkpt_rbz(jj)*maxval(nband_rbz(:,jj))
1224        nkpt_eff=max(nkpt_eff,ii)
1225      end do
1226      npp_min=max(1,dtset%nppert)
1227      npp_max=min(nproc,npert_eff)
1228      if (tread(3)==1) then
1229        npp_max=dtset%nppert
1230        if (npp_max>npert_eff) then
1231          npp_min=npert_eff;npp_max=npert_eff
1232          ABI_WARNING('nppert is bigger than npert; we set nppert=npert')
1233        end if
1234      end if
1235      np_sk_min=1
1236      np_sk_max=min(nproc,nkpt_eff)
1237      ABI_FREE(nkpt_rbz)
1238      ABI_FREE(nband_rbz)
1239    else
1240      nkpt_eff=nproc
1241      np_sk_min=nproc-5
1242      np_sk_max=nproc
1243    end if
1244  end if
1245 
1246 !Parallelization over spinorial components
1247  nps_min=1;nps_max=1
1248  if (optdriver==RUNL_GSTATE) then
1249    nps_min=max(1,dtset%npspinor)
1250    nps_max=min(nproc,dtset%nspinor)
1251    if (tread(5)==1) nps_max=dtset%npspinor
1252  end if
1253 
1254 !KGB Parallelization
1255 
1256  npf_min=1;npf_max=1
1257  npb_min=1;npb_max=1
1258  bpp_min=1;bpp_max=1
1259  n2=0;n3=0
1260  if (optdriver==RUNL_GSTATE) then
1261 
1262 !  >> FFT level
1263    npf_min=max(1,dtset%npfft)
1264    npf_min=min(npf_min,ngmin(2))
1265    npf_max=min(nproc,NPFMAX)
1266    if (tread(6)==1) then
1267      npf_max=dtset%npfft
1268      if (npf_max>ngmin(2)) then
1269        write(msg,'(3a)') &
1270         "Value of npfft given in input file is too high for the FFT grid!",ch10,&
1271         "Action: decrease npfft or increase FFT grid (ecut, ngfft, ...)."
1272        ABI_ERROR(msg)
1273      end if
1274    end if
1275    npf_max=min(npf_max,ngmin(2))
1276    ! Deactivate MPI FFT parallelism for GPU
1277    if (dtset%use_gpu_cuda==1) then
1278      npf_min=1;npf_max=1
1279    end if
1280    !Deactivate MPI FFT parallelism for GPU
1281    if (tread(1)==1.and.dtset%paral_kgb==0) then
1282      npf_min=1;npf_max=1
1283    end if
1284    !Deactivate MPI FFT parallelism for multi-threaded LOBPCG / CHEBFI
1285    if ((wf_algo_global==ALGO_LOBPCG_NEW.or.wf_algo_global==ALGO_CHEBFI.or.wf_algo_global==ALGO_CHEBFI_NEW).and.nthreads>1) then
1286      npf_min=1;npf_max=1
1287    end if
1288 
1289    ! Number of FFT procs has to be a multiple of FFT grid sizes
1290    ! In case of a restart from a density file, it has to be
1291    ! compatible with the FFT grid used for the density
1292    n2=dtset%ngfft(2) ; n3=dtset%ngfft(3)
1293    if (n2==0.and.n3==0.and.(dtset%getden/=0.or.dtset%irdden/=0.or.dtset%iscf<0)) then
1294      dtset_found=.false.;file_found=.false.
1295      !1-Try to find ngfft from previous dataset
1296      if (dtset%getden/=0) then
1297        do ii=1,ndtset_alloc
1298          jj=dtset%getden;if (jj<0) jj=dtset%jdtset+jj
1299          if (dtsets(ii)%jdtset==jj) then
1300            dtset_found=.true.
1301            n2=dtsets(ii)%ngfftdg(2);n3=dtsets(ii)%ngfftdg(3)
1302          end if
1303        end do
1304      end if
1305      !2-If not found, try to extract ngfft from density file
1306      if (.not.dtset_found) then
1307        !Retrieve file name
1308        suffix='_DEN';if (dtset%nimage>1) suffix='_IMG1_DEN'
1309        ABI_MALLOC(jdtset_,(0:ndtset_alloc))
1310        jdtset_=0;if(ndtset_alloc/=0) jdtset_(0:ndtset_alloc)=dtsets(0:ndtset_alloc)%jdtset
1311        call mkfilename(filnam,filden,dtset%getden,idtset,dtset%irdden,jdtset_,ndtset_alloc,suffix,'den',ii)
1312        ABI_FREE(jdtset_)
1313        !Retrieve ngfft from file header
1314        idum3=0
1315        if (mpi_enreg%me==0) then
1316          inquire(file=trim(filden),exist=file_found)
1317          if (file_found) then
1318            call hdr_read_from_fname(hdr0,filden,ii,xmpi_comm_self)
1319            idum3(1:2)=hdr0%ngfft(2:3);if (file_found) idum3(3)=1
1320            call hdr0%free()
1321            ABI_WARNING("Cannot find filden"//filden)
1322          end if
1323        end if
1324        call xmpi_bcast(idum3,0,mpi_enreg%comm_world,ii)
1325        n2=idum3(1);n3=idum3(2);file_found=(idum3(3)/=0)
1326      end if
1327    end if
1328 
1329 !  >> Band level
1330    npb_min=max(1,dtset%npband)
1331    npb_max=min(nproc,mband)
1332    if (tread(7)==1) npb_max=dtset%npband
1333    if (tread(1)==1.and.dtset%paral_kgb==0) then
1334      npb_min=1;npb_max=1
1335    end if
1336 
1337 !  >> banddp level
1338    bpp_min=max(1,dtset%bandpp)
1339    bpp_max=mband
1340    if (wf_algo_global==ALGO_LOBPCG_OLD) bpp_max=max(4,nint(mband/10.)) ! reasonable bandpp max
1341    if (tread(8)==1) bpp_max=dtset%bandpp
1342    if (wf_algo_global==ALGO_CHEBFI) bpp_min=1 ! bandpp not used with ChebFi
1343    if (wf_algo_global==ALGO_CHEBFI) bpp_max=1
1344    if (wf_algo_global==ALGO_CHEBFI_NEW) bpp_min=1 ! bandpp not used with ChebFi
1345    if (wf_algo_global==ALGO_CHEBFI_NEW) bpp_max=1 ! bandpp not used with ChebFi
1346 
1347  end if ! RUNL_GSTATE
1348 
1349 !Disable KGB parallelisation in some cases:
1350 !  - no GS
1351 !  - paral_kgb=0 present in input file
1352 !  - nstep=0
1353 !  - Hartree-Fock or hybrid calculation (for now on)
1354  if ( (optdriver/=RUNL_GSTATE).or.(dtset%paral_kgb==0.and.tread(1)==1).or. &
1355       (dtset%nstep==0).or.(dtset%usefock==1)) then
1356    nps_min=1; nps_max=1
1357    npf_min=1; npf_max=1
1358    npb_min=1; npb_max=1
1359    bpp_min=1; bpp_max=1
1360  end if
1361 
1362  ! Which levels of parallelism do we have?
1363  with_image =(npi_min/=1.or.npi_max/=1)
1364  with_pert  =(npp_min/=1.or.npp_max/=1)
1365  with_kpt   =(np_sk_min/=1.or.np_sk_max/=1)
1366  with_spinor=(nps_min/=1.or.nps_max/=1)
1367  with_fft   =(npf_min/=1.or.npf_max/=1)
1368  with_band  =(npb_min/=1.or.npb_max/=1)
1369  with_bandpp=(bpp_min/=1.or.bpp_max/=1)
1370  with_thread=(nthreads>1)
1371 
1372 !Allocate lists
1373  ABI_MALLOC(my_distp,(10,MAXCOUNT))
1374  ABI_MALLOC(weight,(MAXCOUNT))
1375  ABI_MALLOC(my_algo,(MAXCOUNT))
1376  my_distp(1:7,:)=0;weight(:)=zero
1377  my_distp(8,:)=dtset%use_slk
1378  my_distp(9,:)=dtset%np_slk
1379  my_distp(10,:)=dtset%gpu_linalg_limit
1380  my_algo(:)=wf_algo_global
1381  icount=0;imin=1
1382 
1383 !Cells= images or perturbations
1384  npc_min=1;npc_max=1;ncell_eff=1
1385  if (optdriver==RUNL_GSTATE) then
1386    ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max
1387  end if
1388  if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then
1389    ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max
1390  end if
1391 
1392 !Loop over all possibilities
1393 !Computation of weight~"estimated acceleration"
1394 !================================================================
1395 
1396 !Cells= images or perturbations
1397  npc_min=1;npc_max=1;ncell_eff=1
1398  if (optdriver==RUNL_GSTATE) then
1399    ncell_eff=nimage_eff;npc_min=npi_min;npc_max=npi_max
1400  end if
1401  if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then
1402    ncell_eff=npert_eff;npc_min=npp_min;npc_max=npp_max
1403  end if
1404 
1405 !>>>>> CELLS
1406  do npc=npc_min,npc_max
1407    acc_c=one;if (npc>1) acc_c=0.99_dp*speedup_fdp(ncell_eff,npc)
1408 
1409 !  >>>>> K-POINTS
1410    do np_sk=np_sk_min,np_sk_max
1411 !    -> for DFPT runs, impose that nsppol divides np_sk
1412      if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE]) .and. modulo(np_sk,dtset%nsppol)>0.and.np_sk>1) cycle
1413      acc_k=one;if (np_sk>1) acc_k=0.96_dp*speedup_fdp(nkpt_eff,np_sk)
1414 
1415 !    >>>>> SPINORS
1416      do nps=nps_min,nps_max
1417        acc_s=one;if (nps>1) acc_s=0.85_dp*speedup_fdp(dtset%nspinor,nps)
1418 
1419 !      >>>>> FFT
1420        do npf=npf_min,npf_max
1421 !        -> npf should divide ngfft if set (if unset, ngfft=0 so the modulo test is ok)
1422          if((modulo(n2,npf)>0).or.(modulo(n3,npf)>0)) cycle
1423 !        -> npf should be only divisible by 2, 3 or 5
1424          ii=npf
1425          do while (modulo(ii,2)==0)
1426            ii=ii/2
1427          end do
1428          do while (modulo(ii,3)==0)
1429            ii=ii/3
1430          end do
1431          do while (modulo(ii,5)==0)
1432            ii=ii/5
1433          end do
1434          if(ii/=1) cycle
1435 
1436 !        Change algo if npfft>1
1437          wf_algo=wf_algo_global
1438          if (optdriver==RUNL_GSTATE.and.npf>1.and. wf_algo_global==ALGO_NOT_SET) wf_algo=ALGO_DEFAULT_PAR
1439 
1440 !        FFT parallelism not compatible with multithreading
1441          if (wf_algo==ALGO_LOBPCG_NEW.or.wf_algo==ALGO_CHEBFI.or.wf_algo==ALGO_CHEBFI_NEW) then
1442            if (nthreads>1.and.npf>1) cycle
1443          end if
1444 
1445 !        >>>>> BANDS
1446          do npb=npb_min,npb_max
1447            nproc1=npc*np_sk*nps*npf*npb
1448            if (nproc1<nprocmin)     cycle
1449            if (nproc1>nproc)        cycle
1450            if (modulo(mband,npb)>0) cycle
1451 
1452 !          Change algo if npband>1
1453            if (optdriver==RUNL_GSTATE.and.npb>1.and. wf_algo_global==ALGO_NOT_SET) wf_algo=ALGO_DEFAULT_PAR
1454 
1455 !          Base speedup
1456            acc_kgb_0=one;if (npb*npf*nthreads>1) acc_kgb_0=0.7_dp*speedup_fdp(mpw,(npb*npf*nthreads))
1457 
1458            if (npb*npf>4.and.wf_algo==ALGO_LOBPCG_OLD) then
1459 !            Promote npb=npf
1460              acc_kgb_0=acc_kgb_0*min((one*npf)/(one*npb),(one*npb)/(one*npf))
1461 !            Promote npf<=20
1462              if (npf>20)then
1463                acc_kgb_0=acc_kgb_0* &
1464 &                 0.2_dp+(one-0.2_dp)*(sin((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))) &
1465 &                 /((pi*(npf-NPF_CUTOFF))/(one*(NPFMAX-NPF_CUTOFF))))**2
1466              end if
1467            end if
1468 
1469            first_bpp=.true.
1470            do bpp=bpp_min,bpp_max
1471 
1472              if (wf_algo==ALGO_LOBPCG_NEW) then
1473                blocksize=npb*bpp;nblocks=mband/blocksize
1474                if (modulo(bpp,nthreads)>0) cycle
1475                if ((bpp>1).and.(modulo(bpp,2)>0)) cycle
1476                if (modulo(mband,npb*bpp)>0) cycle
1477              else if (wf_algo==ALGO_LOBPCG_OLD) then
1478                blocksize=npb*bpp;nblocks=mband/blocksize
1479                if (modulo(mband/npb,bpp)>0) cycle
1480                if ((bpp>1).and.(modulo(bpp,2)>0)) cycle
1481                if (one*npb*bpp >max(1.,mband/3.).and.(mband>30)) cycle
1482                if (npb*npf<=4.and.(.not.first_bpp)) cycle
1483              else if (wf_algo==ALGO_CHEBFI .or. wf_algo==ALGO_CHEBFI_NEW) then
1484                !Nothing
1485              else
1486                if (bpp/=1.or.npb/=1) cycle
1487              end if
1488 
1489              first_bpp=.false.
1490 
1491              acc_kgb=acc_kgb_0
1492 !            OLD LOBPCG: promote bpp*npb>mband/3
1493              if (wf_algo==ALGO_LOBPCG_OLD) then
1494                if (npb*npf>4.and.mband>30) acc_kgb=acc_kgb*(one-(three*bpp*npb)/(one*mband))
1495              end if
1496 !            NEW LOBPCG: promote minimal number of blocks
1497 !                        promote block size <= BLOCKSIZE_MAX
1498              if (wf_algo==ALGO_LOBPCG_NEW) then
1499                acc_kgb=acc_kgb*(one-0.9_dp*dble(nblocks-1)/dble(mband-1))
1500                if (blocksize>BLOCKSIZE_MAX) acc_kgb=acc_kgb*max(0.1_dp,one-dble(blocksize)/dble(10*BLOCKSIZE_MAX))
1501                if (nthreads==1) then
1502 !                Promote npband vs bandpp & npfft
1503                  if (blocksize>1) acc_kgb=acc_kgb*(0.1_dp*bpp+0.9_dp-blocksize)/(one-blocksize)
1504                  if (npb*npf>4.and.mband>100) acc_kgb=acc_kgb*(one-0.8_dp*((three*bpp*npb)/(one*mband)-one)**2)
1505                  tot_ncpus=max(npb,npf);if (tot_ncpus==2) tot_ncpus=0
1506                  acc_kgb=acc_kgb*(one-0.8_dp*((dble(npb)/dble(npf))-2_dp)**2/(tot_ncpus-2_dp)**2)
1507                  eff=max(npf,20);acc_kgb=acc_kgb*(one-0.8_dp*min(one,(eff-20)**2))
1508                end if
1509              end if
1510 
1511 !            CHEBFI: promote npfft=npband and nband>=npfft
1512              if (wf_algo==ALGO_CHEBFI .or. wf_algo==ALGO_CHEBFI_NEW) then
1513                if (npf>1) then
1514                  if (npb>npf) then
1515                    acc_kgb=acc_kgb*(one-0.8_dp*0.25_dp*((dble(npb)/dble(npf))-one)**2/(nproc1-one)**2)
1516                  else
1517                    acc_kgb=acc_kgb*(one-0.8_dp*nproc1**2*((dble(npb)/dble(npf))-one)**2/(nproc1-one)**2)
1518                  end if
1519                end if
1520              end if
1521 
1522 !            Resulting "weight"
1523 !            weight0=acc_c*acc_k*acc_s*acc_kgb
1524              weight0=nproc1*(acc_c+acc_k+acc_s+acc_kgb)/(npc+np_sk+nps+(npf*npb))
1525 
1526 !            Store data
1527              icount=icount+1
1528              if (icount<=MAXCOUNT) then
1529                my_algo(icount)=merge(ALGO_CG,wf_algo,wf_algo==ALGO_NOT_SET)
1530                my_distp(1:7,icount)=(/npc,np_sk,nps,npf,npb,bpp,nproc1/)
1531                weight(icount)=weight0
1532                if (weight0<weight(imin)) imin=icount
1533              else
1534                if (weight0>weight(imin)) then
1535                  my_algo(imin)=merge(ALGO_CG,wf_algo,wf_algo==ALGO_NOT_SET)
1536                  my_distp(1:7,imin)=(/npc,np_sk,nps,npf,npb,bpp,nproc1/)
1537                  weight(imin)=weight0
1538                  idum=minloc(weight);imin=idum(1)
1539                end if
1540              end if
1541 
1542            end do ! bpp
1543          end do ! npb
1544        end do ! npf
1545      end do ! nps
1546    end do ! np_sk
1547  end do ! npc
1548 
1549 !Compute number of selected distributions
1550  mcount_eff=icount
1551  mcount=min(mcount_eff,MAXCOUNT)
1552 
1553 !Stop if no solution found
1554  if (mcount==0) then
1555 !  Override here the 0 default value changed in indefo1
1556    dtset%npimage  = max(1,dtset%npimage)
1557    dtset%nppert   = max(1,dtset%nppert)
1558    dtset%np_spkpt = max(1,dtset%np_spkpt)
1559    dtset%npspinor = max(1,dtset%npspinor)
1560    dtset%npfft    = max(1,dtset%npfft)
1561    dtset%npband   = max(1,dtset%npband)
1562    dtset%bandpp   = max(1,dtset%bandpp)
1563    write(msg,'(a,i0,2a,i0,a)')  &
1564 &  'Your input dataset does not let Abinit find an appropriate process distribution with nCPUs=',nproc*nthreads,ch10, &
1565 &  'Try to comment all the np* vars and set max_ncpus=',nthreads*nproc,' to have advices on process distribution.'
1566    ABI_WARNING(msg)
1567    if (max_ncpus>0) call wrtout(ab_out,msg, do_flush=.True.)
1568    iexit=iexit+1
1569  end if
1570 
1571 !Sort data by increasing weight
1572  if (mcount>0) then
1573    ABI_MALLOC(isort,(mcount))
1574    isort=(/(ii,ii=1,mcount)/)
1575    call sort_dp(mcount,weight,isort,tol6)
1576    ncount=min(mcount,MAXPRINT)
1577  end if
1578 
1579 !Deduce a global value for paral_kgb
1580  paral_kgb=dtset%paral_kgb
1581  if (tread(1)==0) then
1582    if (any(my_algo(:)/=ALGO_CG)) paral_kgb=1
1583  end if
1584 
1585  ! ======================================
1586  ! Print output for abipy in Yaml format
1587  ! ======================================
1588 
1589  ! Please DO NOT CHANGE this part without contacting gmatteo first
1590  ! since ANY CHANGE can easily break the interface with AbiPy.
1591  if (iam_master .and. max_ncpus > 0.and. (mcount>0 .or. wf_algo_global == ALGO_CG)) then
1592    write(ount,'(2a)')ch10,"--- !Autoparal"
1593    if (optdriver==RUNL_GSTATE .and. paral_kgb == 0) then
1594      write(ount,"(a)")"# Autoparal section for GS run (band-by-band CG method)"
1595    else if (optdriver==RUNL_GSTATE) then
1596      write(ount,'(a)')'# Autoparal section for GS calculations with paral_kgb 1'
1597    else if (optdriver==RUNL_RESPFN) then
1598      write(ount,'(a)')'# Autoparal section for DFPT calculations'
1599    else if (optdriver==RUNL_LONGWAVE) then
1600      write(ount,'(a)')'# Autoparal section for LONGWAVE calculations'
1601    else
1602      ABI_ERROR(sjoin('Unsupported optdriver:', itoa(optdriver)))
1603    end if
1604    write(ount,"(a)")   "info:"
1605    write(ount,"(a,i0)")"    autoparal: ",autoparal
1606    write(ount,"(a,i0)")"    paral_kgb: ",paral_kgb
1607    write(ount,"(a,i0)")"    max_ncpus: ",max_ncpus
1608    write(ount,"(a,i0)")"    nspinor: ",dtset%nspinor
1609    write(ount,"(a,i0)")"    nsppol: ",dtset%nsppol
1610    write(ount,"(a,i0)")"    nkpt: ",dtset%nkpt
1611    write(ount,"(a,i0)")"    mband: ",mband
1612    write(ount,"(a)")"configurations:"
1613 
1614    if (optdriver==RUNL_GSTATE.and.paral_kgb==0) then
1615      work_size = dtset%nkpt * dtset%nsppol
1616      do ii=1,max_ncpus
1617        if (ii > work_size) cycle
1618        do omp_ncpus=1,nthreads
1619          nks_per_proc = work_size / ii
1620          nks_per_proc = nks_per_proc + MOD(work_size, ii)
1621          eff = (one * work_size) / (ii * nks_per_proc)
1622          write(ount,"(a,i0)")"    - tot_ncpus: ",ii * omp_ncpus
1623          write(ount,"(a,i0)")"      mpi_ncpus: ",ii
1624          write(ount,"(a,i0)")"      omp_ncpus: ",omp_ncpus
1625          write(ount,"(a,f12.9)")"      efficiency: ",eff
1626          !write(ount,"(a,f12.2)")"      mem_per_cpu: ",mempercpu_mb
1627        end do
1628      end do
1629 
1630    else if (optdriver==RUNL_GSTATE) then
1631      omp_ncpus=nthreads
1632      do jj=mcount,mcount-min(ncount,MAXABIPY)+1,-1
1633        ii=isort(jj)
1634        tot_ncpus = my_distp(7,ii)
1635        eff = weight(jj) / tot_ncpus
1636        write(ount,'(a,i0)')'    - tot_ncpus: ',tot_ncpus
1637        write(ount,'(a,i0)')'      mpi_ncpus: ',tot_ncpus
1638        write(ount,"(a,i0)")"      omp_ncpus: ",omp_ncpus
1639        write(ount,'(a,f12.9)')'      efficiency: ',eff
1640        !write(ount,'(a,f12.2)')'      mem_per_cpu: ',mempercpu_mb
1641        write(ount,'(a)'   )'      vars: {'
1642        write(ount,'(a,i0,a)')'            npimage: ',my_distp(1,ii),','
1643        ! Keep on using legacy npkpt instead of np_spkpt to maintain compatibility with AbiPy
1644        write(ount,'(a,i0,a)')'            npkpt: ',my_distp(2,ii),','
1645        !write(ount,'(a,i0,a)')'            np_spkpt: ',my_distp(2,ii),','
1646        write(ount,'(a,i0,a)')'            npspinor: ',my_distp(3,ii),','
1647        write(ount,'(a,i0,a)')'            npfft: ', my_distp(4,ii),','
1648        write(ount,'(a,i0,a)')'            npband: ',my_distp(5,ii),','
1649        write(ount,'(a,i0,a)')'            bandpp: ',my_distp(6,ii),','
1650        write(ount,'(a)')   '            }'
1651      end do
1652 
1653    else if (any(optdriver == [RUNL_RESPFN, RUNL_LONGWAVE])) then
1654      do jj=mcount,mcount-min(ncount,MAXABIPY)+1,-1
1655        ii=isort(jj)
1656        tot_ncpus = my_distp(7,ii)
1657        eff = weight(jj) / tot_ncpus
1658        write(ount,'(a,i0)')'    - tot_ncpus: ',tot_ncpus
1659        write(ount,'(a,i0)')'      mpi_ncpus: ',tot_ncpus
1660        !write(ount,'(a,i0)')'      omp_ncpus: ',omp_ncpus !OMP not supported  (yet)
1661        write(ount,'(a,f12.9)')'      efficiency: ',eff
1662        !write(ount,'(a,f12.2)')'      mem_per_cpu: ',mempercpu_mb
1663        write(ount,'(a)'   )'      vars: {'
1664        write(ount,'(a,i0,a)')'             nppert: ', my_distp(1,ii),','
1665        ! Keep on using legacy npkpt instead of np_spkpt to maintain compatibility with AbiPy
1666        write(ount,'(a,i0,a)')'             npkpt: ', my_distp(2,ii),','
1667        !write(ount,'(a,i0,a)')'             np_spkpt: ', my_distp(2,ii),','
1668        write(ount,'(a)')   '            }'
1669       end do
1670    end if
1671    write(ount,'(a)')"..."
1672  end if
1673 
1674 !Print out tab with selected choices
1675  if (mcount>0.and.iam_master) then
1676    if (nthreads==1) then
1677      write(msg,'(a,1x,100("="),2a,i0,2a)') ch10,ch10,&
1678 &     ' Searching for all possible proc distributions for this input with #CPUs<=',nthreads*nproc,':',ch10
1679    else
1680      write(msg,'(a,1x,100("="),2a,i0,a,i0,2a)')  ch10,ch10,&
1681 &     ' Searching for all possible proc distributions for this input with #CPUs<=',nthreads*nproc,&
1682 &     ' and ',nthreads,' openMP threads:',ch10
1683    end if
1684    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1685    !Titles of columns
1686    msgttl='~'
1687    if (with_image)  msgttl=trim(msgttl)//'~~~~~~~~~~~'
1688    if (with_pert)   msgttl=trim(msgttl)//'~~~~~~~~~~~'
1689    msgttl=trim(msgttl)//'~~~~~~~~~~~~~' ! kpt
1690    if (with_spinor) msgttl=trim(msgttl)//'~~~~~~~~~~'
1691    if (with_fft)    msgttl=trim(msgttl)//'~~~~~~~~~~~~~'
1692    if (with_band)   msgttl=trim(msgttl)//'~~~~~~~~~~~~~'
1693    if (with_bandpp) msgttl=trim(msgttl)//'~~~~~~~~~~~~~'
1694    if (with_thread) msgttl=trim(msgttl)//'~~~~~~~~~~'
1695    msgttl=trim(msgttl)//'~~~~~~~~~~~~~' ! nproc
1696    if (with_thread) msgttl=trim(msgttl)//'~~~~~~~~~~~~~'
1697    msgttl=trim(msgttl)//'~~~~~~~~~~~'   ! CPUs
1698    msgttl=' '//trim(msgttl)
1699    call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl)
1700    msg='|'
1701    if (with_image)  msg=trim(msg)//'   npimage|'
1702    if (with_pert)   msg=trim(msg)//'    nppert|'
1703    msg=trim(msg)//'       np_spkpt|'
1704    if (with_spinor) msg=trim(msg)//' npspinor|'
1705    if (with_fft)    msg=trim(msg)//'       npfft|'
1706    if (with_band)   msg=trim(msg)//'      npband|'
1707    if (with_bandpp) msg=trim(msg)//'      bandpp|'
1708    if (with_thread) msg=trim(msg)//' #Threads|'
1709    msg=trim(msg)//'  #MPI(proc)|'
1710    if (with_thread) msg=trim(msg)//'       #CPUs|'
1711    msg=trim(msg)//'    WEIGHT|'
1712    msg=' '//trim(msg)
1713    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1714    msg='|'
1715    write(strg,'(i4,a,i4,a)') npi_min,'<<',npi_max,'|';if (with_image)  msg=trim(msg)//trim(strg)
1716    write(strg,'(i4,a,i4,a)') npp_min,'<<',npp_max,'|';if (with_pert)   msg=trim(msg)//trim(strg)
1717    write(strg,'(i5,a,i5,a)') np_sk_min,'<<',np_sk_max,'|';                 msg=trim(msg)//trim(strg)
1718    write(strg,'(i5,a,i2,a)') nps_min,'<<',nps_max,'|';if (with_spinor) msg=trim(msg)//trim(strg)
1719    write(strg,'(i5,a,i5,a)') npf_min,'<<',npf_max,'|';if (with_fft)    msg=trim(msg)//trim(strg)
1720    write(strg,'(i5,a,i5,a)') npb_min,'<<',npb_max,'|';if (with_band)   msg=trim(msg)//trim(strg)
1721    write(strg,'(i5,a,i5,a)') bpp_min,'<<',bpp_max,'|';if (with_bandpp) msg=trim(msg)//trim(strg)
1722    write(strg,'(i9,a)'     ) nthreads            ,'|';if (with_thread) msg=trim(msg)//trim(strg)
1723    write(strg,'(i5,a,i5,a)') 1      ,'<<',nproc  ,'|';                 msg=trim(msg)//trim(strg)
1724    write(strg,'(i4,a,i6,a)') nthreads,'<<',nthreads*nproc,'|';if (with_thread) msg=trim(msg)//trim(strg)
1725    write(strg,'(a,i6,a)')   '  <=',nthreads*nproc,'|';                 msg=trim(msg)//trim(strg)
1726    msg=' '//trim(msg)
1727    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1728    call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl)
1729    !Loop over selected choices
1730    do jj=mcount,mcount-ncount+1,-1
1731      ii=isort(jj)
1732      msg='|'
1733      write(strg,'(i10,a)') my_distp(1,ii),'|';if (with_image)  msg=trim(msg)//trim(strg)
1734      write(strg,'(i10,a)') my_distp(1,ii),'|';if (with_pert)   msg=trim(msg)//trim(strg)
1735      write(strg,'(i12,a)') my_distp(2,ii),'|';                 msg=trim(msg)//trim(strg)
1736      write(strg,'(i9,a)')  my_distp(3,ii),'|';if (with_spinor) msg=trim(msg)//trim(strg)
1737      write(strg,'(i12,a)') my_distp(4,ii),'|';if (with_fft)    msg=trim(msg)//trim(strg)
1738      write(strg,'(i12,a)') my_distp(5,ii),'|';if (with_band)   msg=trim(msg)//trim(strg)
1739      write(strg,'(i12,a)') my_distp(6,ii),'|';if (with_bandpp) msg=trim(msg)//trim(strg)
1740      write(strg,'(i9,a)')  nthreads      ,'|';if (with_thread) msg=trim(msg)//trim(strg)
1741      write(strg,'(i12,a)') my_distp(7,ii),'|';                 msg=trim(msg)//trim(strg)
1742      write(strg,'(i12,a)') nthreads*my_distp(7,ii),'|';if (with_thread) msg=trim(msg)//trim(strg)
1743      write(strg,'(f10.3,a)') weight(jj)  ,'|';                 msg=trim(msg)//trim(strg)
1744      msg=' '//trim(msg)
1745      call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1746    end do
1747    !End of tab
1748    call wrtout(std_out,msgttl);if(max_ncpus>0) call wrtout(ab_out,msgttl)
1749    write(msg,'(a,i6,a,i6,a)')' Only the best possible choices for nproc are printed...'
1750    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1751  end if ! mcount>0
1752 
1753 !Determine an optimal number of bands
1754  if (optdriver==RUNL_GSTATE.and. &
1755 &    (any(my_algo(1:mcount)==ALGO_LOBPCG_OLD.or. &
1756 &         my_algo(1:mcount)==ALGO_LOBPCG_NEW.or. &
1757 &         my_algo(1:mcount)==ALGO_CHEBFI.or. &
1758 &         my_algo(1:mcount)==ALGO_CHEBFI_NEW))) then
1759    if (mcount>0) then
1760      icount=isort(mcount)
1761      npc=my_distp(1,icount);np_sk=my_distp(2,icount)
1762      nps=my_distp(3,icount);npf=my_distp(4,icount)
1763    else
1764      npc=1;if (with_image ) npc=npi_min
1765      np_sk=1;if (with_kpt   ) np_sk=np_sk_min
1766      nps=1;if (with_spinor) nps=nps_min
1767      npf=1;if (with_fft   ) npf=npf_min
1768    end if
1769    nproc1=npc*np_sk*nps*npf
1770    msg=ch10//' >>> Possible (best) choices for the number of bands (nband) are:'
1771    if (with_image.or.with_kpt.or.with_spinor.or.with_fft) msg=trim(msg)//ch10//'     with:'
1772    write(strg,'(a,i0)') ' npimage=' ,npc;if (with_image)  msg=trim(msg)//trim(strg)
1773    write(strg,'(a,i0)') ' np_spkpt=' ,np_sk;if (with_kpt)    msg=trim(msg)//trim(strg)
1774    write(strg,'(a,i0)') ' npspinor=',nps;if (with_spinor) msg=trim(msg)//trim(strg)
1775    write(strg,'(a,i0)') ' npfft='   ,npf;if (with_fft)    msg=trim(msg)//trim(strg)
1776    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1777    ib1=mband-int(mband*relative_nband_range);if (my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)==ALGO_CHEBFI_NEW) ib1=mband
1778    ib2=mband+int(mband*relative_nband_range)
1779    ABI_MALLOC(nproc_best,(1+ib2-ib1))
1780    ABI_MALLOC(nband_best,(1+ib2-ib1))
1781    nproc_best(:)=1
1782    nband_best=(/(ii,ii=ib1,ib2)/)
1783    bpp=merge(1,nthreads,my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)==ALGO_CHEBFI_NEW)
1784    do ii=ib1,ib2
1785      do jj=1,nproc/nproc1
1786        ibest=1
1787        do kk=1,jj
1788          if (mod(jj,kk)/=0) cycle
1789          if (mod(ii,kk*bpp)==0) ibest=max(ibest,kk)
1790        end do
1791        nproc_best(1+ii-ib1)=max(nproc_best(1+ii-ib1),ibest)
1792      end do
1793    end do
1794    call sort_int(1+ib2-ib1,nproc_best,nband_best)
1795    kk=-1
1796    do ii=1+ib2-ib1,max(ib2-ib1-MAXBAND_PRINT,1),-1
1797      write(msg,'(3(a,i6),a,i3,a,i5,a)') '     nband=',nband_best(ii),' using ',nproc1*nproc_best(ii)*nthreads,&
1798 &        ' CPUs =',nproc1*nproc_best(ii),' MPI x',nthreads,' threads (npband=',nproc_best(ii),')'
1799      call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1800      if (nband_best(ii)==mband) kk=nproc_best(ii)
1801    end do
1802    if (kk==maxval(nproc_best(:))) then
1803      if (my_algo(icount)/=ALGO_CHEBFI .or. my_algo(icount)/=ALGO_CHEBFI_NEW) then
1804        write(msg,'(a,i6,a)') ' >>> The present nband value (',mband,') seems to be the best choice!'
1805      end if
1806      if (my_algo(icount)==ALGO_CHEBFI .or. my_algo(icount)/=ALGO_CHEBFI_NEW) then
1807        write(msg,'(a,i6,a)') ' >>> The present nband value (',mband,') seems to be a good choice!'
1808      end if
1809      call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1810    end if
1811    ABI_FREE(nproc_best)
1812    ABI_FREE(nband_best)
1813  end if
1814 
1815  if (optdriver==RUNL_GSTATE.and.(any(my_algo(1:mcount)==ALGO_CHEBFI .or. my_algo(1:mcount)==ALGO_CHEBFI_NEW))) then
1816    write(msg,'(5a)') &
1817 &   ' >>> Note that with the "Chebyshev Filtering" algorithm, it is often',ch10,&
1818 &   '     better to increase the number of bands (10% more or a few tens more).',ch10,&
1819 &   '     Advice: increase nband and put nbdbuf input variable to (nband_new-nband_old).'
1820    call wrtout(std_out,msg);if(max_ncpus>0) call wrtout(ab_out,msg)
1821  end if
1822 
1823 !Refinement of the process distribution by mean of a LinAlg routines benchmarking
1824  if (mcount>0.and.optdriver==RUNL_GSTATE.and.autoparal/=1) then
1825    icount=isort(mcount)
1826    if (autoparal/=3) then
1827      if (autoparal==2) then
1828        write(msg,'(5a,9(a10,a1))') ch10, &
1829 &       ' Values below have been tested with respect to Linear Algebra performance;',ch10,&
1830 &       ' Weights below are corrected according:',ch10,&
1831 &       'npimage','|','np_spkpt' ,'|','npspinor'  ,'|','npfft'     ,'|','npband','|',' bandpp ' ,'|',&
1832 &       'nproc'  ,'|','weight','|','new weight','|'
1833      else
1834        write(msg,'(5a,11(a10,a1))') ch10, &
1835 &       ' Values below have been tested with respect to Linear Algebra performance;',ch10,&
1836 &       ' Weights below are corrected according:',ch10,&
1837 &       'npimage','|','np_spkpt' ,'|','npspinor'  ,'|','npfft'     ,'|','npband','|',' bandpp ' ,'|',&
1838 &       'nproc'  ,'|','weight','|','new weight','|','best npslk','|','linalggpu' ,'|'
1839      end if
1840      call wrtout(std_out,msg);if (max_ncpus > 0) call wrtout(ab_out,msg)
1841    end if
1842    acc_k=zero
1843    ncount=min(MAXBENCH,mcount);if (autoparal==3) ncount=1
1844    do jj=mcount,mcount-ncount+1,-1
1845      ii=isort(jj)
1846      npf=my_distp(4,ii);npb=my_distp(5,ii);bpp=my_distp(6,ii)
1847      if ((npb*npf*bpp>1).and.(npf*npb<=mpi_enreg%nproc)) then
1848        use_linalg_gpu=dtset%use_gpu_cuda
1849        call compute_kgb_indicator(acc_kgb,bpp,xmpi_world,mband,mpw,npb,npf,np_slk,use_linalg_gpu)
1850        if (autoparal/=2) then
1851          my_distp(9,ii)=np_slk
1852          if (np_slk>0) my_distp(8,ii)=1
1853 !        * gpu_linalg_limit:
1854 !        No use of GPU: htgspw_01.outuge value ~2  *vectsize*blocksize**2 tested
1855 !        Use of GPU:    tiny value ~0.5*vectsize*blocksize**2 tested
1856          my_distp(10,ii)=2*dtset%mpw*(npb*bpp)**2/npf
1857          if (use_linalg_gpu==1) my_distp(10,ii)=my_distp(10,ii)/4
1858        end if
1859        if (abs(acc_k)<=tol12) acc_k=acc_kgb ! Ref value : the first one computed
1860 !      * Weight (corrected by 10% of the computed ratio)
1861        weight0=weight(jj)*(one + 0.1_dp*acc_k/acc_kgb)
1862        if (autoparal==2) then
1863          write(msg, '(7(i10,a1),f9.2,a2,f9.5,a2)') &
1864 &         my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',&
1865 &         my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |'
1866        else if (autoparal==3) then
1867          write(msg,'(a,5(a,i3))') ch10,' For npband=',npb,', npfft=',npf,' and bandpp=',bpp, &
1868 &         ', compute_kgb_indicator recommends you to set np_slk=',my_distp(9,ii),&
1869 &         ' and use_linalg_gpu=',use_linalg_gpu
1870        else
1871          write(msg, '(7(i10,a1),f9.2,a2,f9.5,a2,2(i10,a1))') &
1872 &         my_distp(1,ii),'|',my_distp(2,ii),'|',my_distp(3,ii),'|',my_distp(4,ii),'|',&
1873 &         my_distp(5,ii),'|',my_distp(6,ii),'|',my_distp(7,ii),'|',weight(jj),'=>', weight0,' |',&
1874 &         my_distp(9,ii),'|',use_linalg_gpu,'|'
1875        end if
1876        call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg)
1877 !      We store the best value in weight(mcount) and keep icount
1878        if (weight0 > weight(mcount)) then
1879          icount=ii;weight(mcount)=weight0
1880        end if
1881      end if
1882    end do
1883  end if
1884 
1885 !Store new process distribution
1886  if (mcount>0.and.max_ncpus<=0) then
1887    icount=isort(mcount)
1888    nproc1=my_distp(7,icount)
1889 !  Work load distribution
1890    if (optdriver==RUNL_GSTATE) then
1891      dtset%npimage= my_distp(1,icount)
1892      dtset%nppert = 1
1893      dtset%np_spkpt  = my_distp(2,icount)
1894    end if
1895    if (optdriver==RUNL_RESPFN) then
1896      dtset%npimage= 1
1897      dtset%nppert = my_distp(1,icount)
1898      dtset%np_spkpt  = 1
1899    end if
1900    dtset%npspinor = my_distp(3,icount)
1901    dtset%npfft    = my_distp(4,icount)
1902    dtset%npband   = my_distp(5,icount)
1903    dtset%bandpp   = my_distp(6,icount)
1904    if (tread(1)==0)  dtset%paral_kgb= merge(0,1,my_algo(icount)==ALGO_CG)
1905 !  The following lines are mandatory : the DFT+DMFT must use ALL the
1906 !  available procs specified by the user. So nproc1=nproc.
1907 !  Works only if paral_kgb is not activated??
1908    if (dtset%usedmft/=0.and.optdriver==RUNL_GSTATE) then
1909      if (dtset%paral_kgb==0) then
1910        dtset%npspinor = 1 ; dtset%npfft    = 1
1911        dtset%npband   = 1 ; dtset%bandpp   = 1
1912        dtset%npimage  = 1
1913      end if
1914      nproc1 = nproc
1915    end if
1916    if (dtset%npband*dtset%npfft*dtset%bandpp>1) dtset%paral_kgb=1
1917 !  LinAlg parameters: we change values only if they are not present in input file
1918    if (dtset%paral_kgb==1) then
1919      if (tread(9)==0) dtset%use_slk=my_distp(8,icount)
1920      if (tread(10)==0) dtset%np_slk=my_distp(9,icount)
1921      if (tread(11)==0) dtset%gpu_linalg_limit=my_distp(10,icount)
1922    end if
1923 !  New definition of "world" MPI communicator
1924    if (optdriver==RUNL_RESPFN.and.dtset%paral_rf==1) then
1925      nproc1=max(nproc1,dtset%nsppol*dtset%nkpt) ! Take into account the code in respfn.F90
1926      nproc1=min(nproc1,nproc)
1927      nproc1=(nproc1/dtset%nppert)*dtset%nppert
1928    end if
1929    call initmpi_world(mpi_enreg,nproc1)
1930  end if
1931 
1932 !Final advice in case max_ncpus > 0
1933  if (max_ncpus>0.and.mcount>0) then
1934    write(msg,'(6a)') ch10,&
1935    ' Launch a parallel version of ABINIT with a distribution of processors among the above list,',ch10,&
1936    ' and the associated input variables (np_spkpt, npband, npfft, bandpp, etc.).',ch10,&
1937    ' The higher weight should be better.'
1938    call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg)
1939  end if
1940 
1941  if (mcount>0) then
1942    ABI_FREE(isort)
1943  end if
1944  ABI_FREE(my_distp)
1945  ABI_FREE(my_algo)
1946  ABI_FREE(weight)
1947 
1948 !Final line
1949  write(msg,'(a,100("="),2a)') " ",ch10,ch10
1950  call wrtout(std_out,msg);if (max_ncpus>0) call wrtout(ab_out,msg)
1951 
1952 !max_ncpus requires a stop
1953  if (max_ncpus>0) then
1954    iexit = iexit + 1 ! will stop in the parent.
1955  end if
1956 
1957  DBG_EXIT("COLL")
1958 
1959 contains
1960 
1961 real(dp) pure function speedup_fdp(nn, mm)
1962   ! Expected linear speedup for a nn-sized problem and mm processes
1963   integer,intent(in) :: nn, mm
1964   speedup_fdp = (one*nn) / (one* ((nn / mm) + merge(0, 1, mod(nn, mm) == 0)))
1965 end function speedup_fdp
1966 
1967 end subroutine finddistrproc

ABINIT/m_mpi_setup [ Modules ]

[ Top ] [ Modules ]

NAME

 m_mpi_setup

FUNCTION

  Initialize MPI parameters and datastructures for parallel execution

COPYRIGHT

  Copyright (C) 1999-2022 ABINIT group (FJ, MT, FD)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

16 #if defined HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include "abi_common.h"
21 
22 module m_mpi_setup
23 
24  use defs_basis
25  use m_distribfft
26  use m_xmpi
27  use m_xomp
28  use m_hdr
29  use m_sort
30  use m_errors
31  use m_abicore
32 
33  use defs_abitypes,  only : MPI_type
34  use m_fstrings,     only : sjoin, itoa
35  use m_time,         only : abi_wtime
36  use m_parser,       only : intagm
37  use m_geometry,     only : mkrdim, metric
38  use m_fftcore,      only : fftalg_for_npfft, getng,  kpgcount
39  use m_mpinfo,       only : init_mpi_enreg, mpi_distrib_is_ok, initmpi_atom, proc_distrb_cycle, &
40                             initmpi_grid, initmpi_pert, initmpi_img, distrb2, distrb2_hf, initmpi_world
41  use m_libpaw_tools, only : libpaw_write_comm_set
42  use m_dtset,        only : dataset_type
43  use m_kg,           only : getmpw
44  use m_dtfil,        only : mkfilename
45 
46  implicit none
47 
48  private

ABINIT/mpi_setup [ Functions ]

[ Top ] [ Functions ]

NAME

 mpi_setup

FUNCTION

 Big loop on the datasets:
 - compute mgfft,mpw,nfft,... for this data set;
 - fill mpi_enreg
  *** At the output of this routine, all the dtsets input variables are known ***
 The content of dtsets should not be modified anymore afterwards.

INPUTS

  filnam(5)=character strings giving file names
  ndtset= number of datasets to be read; if 0, no multi-dataset mode
  ndtset_alloc=number of datasets, corrected for allocation of at least
      one data set.

OUTPUT

  dtsets(0:ndtset_alloc)=<type datafiles_type>contains all input variables,
   some of which are initialized here, while other were already
   initialized previously.

SIDE EFFECTS

   mpi_enregs=information about MPI parallelization

SOURCE

  84 subroutine mpi_setup(dtsets,filnam,lenstr,mpi_enregs,ndtset,ndtset_alloc,string)
  85 
  86 !Arguments ------------------------------------
  87 !scalars
  88  integer,intent(in) :: lenstr,ndtset,ndtset_alloc
  89  type(MPI_type),intent(inout) :: mpi_enregs(0:ndtset_alloc)
  90  character(len=*),intent(in) :: string
  91 !arrays
  92  character(len=fnlen),intent(in) :: filnam(5)
  93  type(dataset_type),intent(inout) :: dtsets(0:ndtset_alloc)
  94 
  95 !Local variables -------------------------------
  96 !scalars
  97  integer :: blocksize,exchn2n3d,iband,idtset,iexit,ii,iikpt,iikpt_modulo, prtvol
  98  integer :: isppol,jdtset,marr,mband_lower,mband_upper
  99  integer :: me_fft,mgfft,mgfftdg,mkmem,mpw,mpw_k,optdriver
 100  integer :: mband_mem
 101  integer :: nfft,nfftdg,nkpt,nkpt_me,npert,nproc,nproc_fft,nqpt
 102  integer :: nspink,nsppol,nsym,paral_fft,response,tnband,tread0,usepaw,vectsize
 103  integer :: fftalg,fftalga,fftalgc
 104 #ifdef HAVE_LINALG_ELPA
 105  integer :: icol,irow,np
 106 #endif
 107  logical :: fftalg_read,ortalg_read,paral_kgb_read,wfoptalg_read,do_check
 108  real(dp) :: dilatmx,ecut,ecut_eff,ecutdg_eff,ucvol
 109  character(len=500) :: msg
 110 !arrays
 111  integer :: ngfft(18),ngfftdg(18),ngfftc(3),tread(12)
 112  integer,allocatable :: intarr(:),istwfk(:),symrel(:,:,:)
 113  integer,allocatable :: mybands(:)
 114  integer,pointer :: nkpt_rbz(:)
 115  real(dp),parameter :: k0(3)=(/zero,zero,zero/)
 116  real(dp) :: gmet(3,3),gprimd(3,3),kpt(3),qphon(3),rmet(3,3),rprimd(3,3)
 117  real(dp),allocatable :: dprarr(:),kpt_with_shift(:,:)
 118  real(dp),pointer :: nband_rbz(:,:)
 119  character(len=6) :: nm_mkmem(3)
 120 
 121 !*************************************************************************
 122 
 123  DBG_ENTER("COLL")
 124 
 125  iexit=0;mpw_k=0
 126 
 127  call init_mpi_enreg(mpi_enregs(0))
 128  call initmpi_img(dtsets(0),mpi_enregs(0),-1)
 129 
 130  do idtset=1,ndtset_alloc
 131    call init_mpi_enreg(mpi_enregs(idtset))
 132 
 133    ! Handy read-only variables.
 134    optdriver = dtsets(idtset)%optdriver
 135    prtvol = dtsets(idtset)%prtvol
 136 
 137 !  Read parallel input parameters
 138    marr=max(5,dtsets(idtset)%npsp,dtsets(idtset)%nimage)
 139    ABI_MALLOC(intarr,(marr))
 140    ABI_MALLOC(dprarr,(marr))
 141    nkpt  =dtsets(idtset)%nkpt
 142    nsppol=dtsets(idtset)%nsppol
 143    jdtset=dtsets(idtset)%jdtset ; if(ndtset==0)jdtset=0
 144    usepaw=dtsets(idtset)%usepaw
 145    mband_upper=maxval(dtsets(idtset)%nband(1:nkpt*nsppol))
 146    mband_lower=minval(dtsets(idtset)%nband(1:nkpt*nsppol))
 147 
 148 !  Compute metric for this dataset
 149    call mkrdim(dtsets(idtset)%acell_orig(1:3,1),dtsets(idtset)%rprim_orig(1:3,1:3,1),rprimd)
 150    call metric(gmet,gprimd,-1,rmet,rprimd,ucvol)
 151 
 152    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'max_ncpus',tread0,'INT')
 153    if (tread0==1) dtsets(idtset)%max_ncpus=intarr(1)
 154 
 155    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_atom',tread0,'INT')
 156    if(tread0==1) dtsets(idtset)%paral_atom=intarr(1)
 157 
 158    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_rf',tread0,'INT')
 159    if (tread0==1.and.any(optdriver==[RUNL_RESPFN, RUNL_NONLINEAR])) dtsets(idtset)%paral_rf=intarr(1)
 160 
 161    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npimage',tread(2),'INT')
 162    if(tread(2)==1) dtsets(idtset)%npimage=intarr(1)
 163 
 164    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nppert',tread(3),'INT')
 165    if (tread(3)==1.and.optdriver==RUNL_RESPFN) dtsets(idtset)%nppert=intarr(1)
 166 
 167    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'np_spkpt',tread(4),'INT')
 168    if(tread(4)==1)then
 169      dtsets(idtset)%np_spkpt=intarr(1)
 170    else
 171 !    npkpt is obsolete, but still read
 172      call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npkpt',tread(4),'INT')
 173      if(tread(4)==1)then
 174        dtsets(idtset)%np_spkpt=intarr(1)
 175      endif
 176    endif
 177 
 178    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npspinor',tread(5),'INT')
 179    if(tread(5)==1) dtsets(idtset)%npspinor=intarr(1)
 180 
 181    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npfft',tread(6),'INT')
 182    if(tread(6)==1) dtsets(idtset)%npfft=intarr(1)
 183 
 184    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'npband',tread(7),'INT')
 185    if(tread(7)==1) dtsets(idtset)%npband=intarr(1)
 186 
 187    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'bandpp',tread(8),'INT')
 188    if(tread(8)==1) dtsets(idtset)%bandpp=intarr(1)
 189 
 190    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'use_slk',tread(9),'INT')
 191    if(tread(9)==1) dtsets(idtset)%use_slk=intarr(1)
 192 
 193    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'np_slk',tread(10),'INT')
 194    if(tread(10)==1) dtsets(idtset)%np_slk=intarr(1)
 195 
 196    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'slk_rankpp',tread(12),'INT')
 197    if(tread(12)==1) dtsets(idtset)%slk_rankpp=intarr(1)
 198 
 199    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'pw_unbal_thresh',tread0,'DPR')
 200    if(tread0==1) dtsets(idtset)%pw_unbal_thresh=dprarr(1)
 201    mpi_enregs(idtset)%pw_unbal_thresh=dtsets(idtset)%pw_unbal_thresh
 202 
 203    call intagm(dprarr,intarr,jdtset,marr,5,string(1:lenstr),'gpu_devices',tread0,'INT')
 204    if(tread0==1) dtsets(idtset)%gpu_devices(1:5)=intarr(1:5)
 205 
 206    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'gpu_linalg_limit',tread(11),'INT')
 207    if(tread(11)==1) dtsets(idtset)%gpu_linalg_limit=intarr(1)
 208 
 209 
 210    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nphf',tread0,'INT')
 211    if(tread0==1) dtsets(idtset)%nphf=intarr(1)
 212 
 213    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'autoparal',tread0,'INT')
 214    if(tread0==1) dtsets(idtset)%autoparal=intarr(1)
 215 
 216 !  Read paral_kgb and disable it if not supported in optdriver
 217    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'paral_kgb',tread(1),'INT')
 218    paral_kgb_read=(tread(1)==1)
 219    if (paral_kgb_read) dtsets(idtset)%paral_kgb=intarr(1)
 220    if (xmpi_paral==0.and.dtsets(idtset)%paral_kgb==1) then
 221      dtsets(idtset)%paral_kgb=0
 222      write(msg, '(5a)' ) &
 223      'When ABINIT is compiled without MPI flag,',ch10,&
 224      'setting paral_kgb/=0 is useless. paral_kgb has been reset to 0.',ch10,&
 225      'Action: modify compilation option or paral_kgb in the input file.'
 226      ABI_WARNING(msg)
 227    end if
 228    if (ALL(optdriver /= [RUNL_GSTATE, RUNL_GWLS, RUNL_RTTDDFT]) .and. dtsets(idtset)%paral_kgb/=0) then
 229      dtsets(idtset)%paral_kgb=0
 230      write(msg, '(a,i0,a)') &
 231       "paral_kgb != 0 is not available in optdriver ",optdriver,". Setting paral_kgb to 0"
 232      ABI_COMMENT(msg)
 233    end if
 234 
 235    wfoptalg_read=.false.
 236    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'wfoptalg',tread0,'INT')
 237    if(tread0==1) then
 238      dtsets(idtset)%wfoptalg=intarr(1)
 239      wfoptalg_read=.true.
 240    else
 241      if (dtsets(idtset)%usepaw==0) dtsets(idtset)%wfoptalg=0
 242      if (dtsets(idtset)%usepaw/=0) dtsets(idtset)%wfoptalg=10
 243      if ((optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS).and.dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%wfoptalg=114
 244    end if
 245 
 246    ! Dump the list of irreducible perturbations and exit.
 247    if (dtsets(idtset)%paral_rf==-1.and.optdriver/=RUNL_NONLINEAR) then
 248      call dtsets(idtset)%get_npert_rbz(nband_rbz, nkpt_rbz, npert)
 249      ABI_FREE(nband_rbz)
 250      ABI_FREE(nkpt_rbz)
 251      iexit = iexit + 1
 252    end if
 253 
 254    ! From total number of procs, compute all possible distributions
 255    ! Ignore exit flag if GW/EPH calculations because autoparal section is performed in screening/sigma/bethe_salpeter/eph
 256    if (any(optdriver == [RUNL_SCREENING, RUNL_SIGMA, RUNL_BSE, RUNL_EPH, RUNL_GWR, RUNL_NONLINEAR])) then
 257      iexit = 0
 258    else
 259      call finddistrproc(dtsets,filnam,idtset,iexit,mband_upper,mpi_enregs(idtset),ndtset_alloc,tread)
 260    end if
 261 
 262    call initmpi_img(dtsets(idtset),mpi_enregs(idtset),-1)
 263    nproc=mpi_enregs(idtset)%nproc_cell
 264 
 265 !  Set paral_kgb to 1 when band-fft parallelism is activated
 266    if (ANY(optdriver == [RUNL_GSTATE, RUNL_GWLS, RUNL_RTTDDFT])) then
 267      if (mpi_enregs(idtset)%nproc_cell>1) then
 268        if (dtsets(idtset)%npband>1.or.dtsets(idtset)%npfft>1) then
 269          if (.not.paral_kgb_read) dtsets(idtset)%paral_kgb=1
 270        end if
 271      end if
 272    end if
 273      
 274    if ((optdriver/=RUNL_GSTATE.and.optdriver/=RUNL_GWLS.and.optdriver/=RUNL_RTTDDFT).and. &
 275 &   (dtsets(idtset)%np_spkpt/=1   .or.dtsets(idtset)%npband/=1.or.dtsets(idtset)%npfft/=1.or. &
 276 &   dtsets(idtset)%npspinor/=1.or.dtsets(idtset)%bandpp/=1)) then
 277 !&   .or.(dtsets(idtset)%iscf<0)) then
 278      dtsets(idtset)%np_spkpt=1 ; dtsets(idtset)%npspinor=1 ; dtsets(idtset)%npfft=1
 279      dtsets(idtset)%npband=1; dtsets(idtset)%bandpp=1  ; dtsets(idtset)%nphf=1
 280      dtsets(idtset)%paral_kgb=0
 281      ABI_COMMENT('For non ground state calculations, set bandpp, npfft, npband, npspinor, np_spkpt and nphf to 1')
 282    end if
 283 
 284 !  Take into account a possible change of paral_kgb (change of the default algorithm)
 285    if (.not.wfoptalg_read) then
 286      if (dtsets(idtset)%usepaw==0) dtsets(idtset)%wfoptalg=0
 287      if (dtsets(idtset)%usepaw/=0) dtsets(idtset)%wfoptalg=10
 288      if ((optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS).and.dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%wfoptalg=114
 289      if (mod(dtsets(idtset)%wfoptalg,10)==4) then
 290        do iikpt=1,dtsets(idtset)%nkpt
 291          if (any(abs(dtsets(idtset)%kpt(:,iikpt))>tol8)) dtsets(idtset)%istwfk(iikpt)=1
 292        end do
 293      end if
 294    end if
 295 
 296    dtsets(idtset)%densfor_pred=2
 297    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'densfor_pred',tread0,'INT')
 298    if(tread0==1) then
 299      dtsets(idtset)%densfor_pred=intarr(1)
 300    else
 301      if (dtsets(idtset)%paral_kgb==1) dtsets(idtset)%densfor_pred=6
 302    end if
 303    if((dtsets(idtset)%iscf==5.or.dtsets(idtset)%iscf==6) &
 304       .and. dtsets(idtset)%ionmov==4 .and. dtsets(idtset)%densfor_pred/=3 )then
 305      dtsets(idtset)%densfor_pred=3
 306      write(msg, '(a,a,a)' )&
 307      'When ionmov==4 and iscf==5 or 6, densfor_pred must be 3.',ch10,&
 308      'Set densfor_pred to 3.'
 309      ABI_COMMENT(msg)
 310    end if
 311 
 312 #ifdef HAVE_LOTF
 313 !  LOTF need densfor_pred=2
 314    if(dtsets(idtset)%ionmov==23) dtsets(idtset)%densfor_pred=2
 315 #endif
 316 
 317    if (usepaw==0) then
 318      dtsets(idtset)%ortalg=2
 319    else
 320      dtsets(idtset)%ortalg=-2
 321    end if
 322    ortalg_read=.false.
 323    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'ortalg',tread0,'INT')
 324    if(tread0==1) then
 325      dtsets(idtset)%ortalg=intarr(1)
 326      ortalg_read=.true.
 327    else if (dtsets(idtset)%wfoptalg>=10 .and. dtsets(idtset)%ortalg>0) then
 328      dtsets(idtset)%ortalg=-dtsets(idtset)%ortalg
 329    end if
 330 
 331    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'iomode',tread0,'INT')
 332    if(tread0==1) then
 333      dtsets(idtset)%iomode=intarr(1)
 334    else
 335      if ((xmpi_mpiio==1).and.(dtsets(idtset)%paral_kgb==1)) dtsets(idtset)%iomode=IO_MODE_MPI
 336 #ifdef HAVE_NETCDF_DEFAULT
 337      dtsets(idtset)%iomode=IO_MODE_ETSF
 338 #endif
 339    end if
 340 
 341    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'pawmixdg',tread0,'INT')
 342    if(tread0==1) then
 343      dtsets(idtset)%pawmixdg=intarr(1)
 344    else if (dtsets(idtset)%npfft>1.and.usepaw==1) then
 345      dtsets(idtset)%pawmixdg=1
 346    end if
 347      
 348 !  Cycle if the processor is not used
 349    if (mpi_enregs(idtset)%me<0.or.iexit>0) then
 350      ABI_FREE(intarr)
 351      ABI_FREE(dprarr)
 352      cycle
 353    end if
 354 
 355    ! Set cprj_in_memory to zero if paral_kgb has been activated by autoparal
 356    if (dtsets(idtset)%paral_kgb/=0) dtsets(idtset)%cprj_in_memory = 0
 357 
 358    response=0
 359    if (dtsets(idtset)%rfddk/=0 .or. dtsets(idtset)%rf2_dkdk/=0 .or. dtsets(idtset)%rf2_dkde/=0 .or. &
 360 &   dtsets(idtset)%rfelfd/=0 .or. dtsets(idtset)%rfphon/=0 .or. dtsets(idtset)%rfstrs/=0 .or. &
 361 &   dtsets(idtset)%rfuser/=0 .or. dtsets(idtset)%rfmagn/=0) response=1
 362 
 363 !  If no MPI, set all npxxx variables to 1
 364    if (nproc==1) then
 365      dtsets(idtset)%np_spkpt    = 1 ; dtsets(idtset)%npband   = 1
 366      dtsets(idtset)%npfft    = 1 ; dtsets(idtset)%npspinor = 1
 367      dtsets(idtset)%nphf     = 1
 368    end if
 369 
 370 !    --IF CUDA AND RECURSION:ONLY BAND PARALLELISATION
 371    if(dtsets(idtset)%tfkinfunc==2 .and. nproc/=1)then
 372      dtsets(idtset)%npband = dtsets(idtset)%npband*dtsets(idtset)%np_spkpt*dtsets(idtset)%npspinor*dtsets(idtset)%npfft
 373      dtsets(idtset)%np_spkpt = 1
 374      dtsets(idtset)%npfft = 1
 375      dtsets(idtset)%npspinor = 1
 376      write(msg, '(5a,i6,a)' )&
 377      'If HAVE_GPU_CUDA and recursion are used ',ch10,&
 378      'only the band parallelisation is active, we set:',ch10,&
 379      'npfft= 1, np_spkpt= 1, npband=',dtsets(idtset)%npband,' .'
 380      ABI_WARNING(msg)
 381    end if
 382 
 383    if (dtsets(idtset)%npspinor>=2.and.dtsets(idtset)%nspinor==1) then
 384      dtsets(idtset)%npspinor=1
 385      dtsets(idtset)%npfft=2*dtsets(idtset)%npfft
 386      write(msg,'(3a)')&
 387      'npspinor is bigger than nspinor !',ch10,&
 388      'We set npspinor to 1 ; we set npfft to 2*npfft'
 389      ABI_WARNING(msg)
 390    end if
 391 
 392 !  Some checks on parallelization data
 393    if(dtsets(idtset)%paral_kgb < 0 ) then
 394      cycle
 395    else if(dtsets(idtset)%paral_kgb/=0.and.(dtsets(idtset)%bandpp/=1.or.dtsets(idtset)%npband/=1.or.&
 396            dtsets(idtset)%npfft/=1.or.dtsets(idtset)%np_spkpt/=1.or.dtsets(idtset)%npspinor/=1))then
 397      if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor > nproc )then
 398        write(msg,'(7a)')&
 399        'The product of np_spkpt, npfft, npband and npspinor is bigger than the number of processors.',ch10,&
 400        'The user-defined values of np_spkpt, npfft, npband or npspinor will be modified,',ch10,&
 401        'in order to bring this product below nproc .',ch10,&
 402        'At present, only a very simple algorithm is used ...'
 403        ABI_WARNING(msg)
 404 
 405        if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npband*dtsets(idtset)%npspinor <= nproc) then
 406          dtsets(idtset)%npfft=1
 407          ABI_WARNING('Set npfft to 1')
 408        else if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npspinor <= nproc)then
 409          dtsets(idtset)%npfft=1
 410          dtsets(idtset)%npband=1
 411          ABI_WARNING('Set npfft and npband to 1')
 412        else if(dtsets(idtset)%np_spkpt <= nproc)then
 413          dtsets(idtset)%npfft=1
 414          dtsets(idtset)%npband=1
 415          dtsets(idtset)%npspinor=1
 416          ABI_WARNING('Set npfft ,npband and npspinor to 1')
 417        else
 418          dtsets(idtset)%npfft=1
 419          dtsets(idtset)%npband=1
 420          dtsets(idtset)%np_spkpt=1
 421          dtsets(idtset)%npspinor=1
 422          ABI_WARNING('Set npfft, npband, nspinor and np_spkpt to 1')
 423        end if
 424      else if(dtsets(idtset)%np_spkpt*dtsets(idtset)%npfft*dtsets(idtset)%npband*dtsets(idtset)%npspinor < nproc)then
 425        write(msg,'(a,5i6,4a)')&
 426        'np_spkpt,npband,npspinor,npfft,nproc=',&
 427 &      dtsets(idtset)%np_spkpt,dtsets(idtset)%npfft,dtsets(idtset)%npband,dtsets(idtset)%npspinor,nproc,ch10,&
 428        'The number of processors must not be greater than npfft*npband*np_spkpt*npspinor ',ch10,&
 429        'when npfft or np_spkpt or npband or npspinor are chosen manually in the input file.'
 430        ABI_ERROR(msg)
 431      end if
 432    end if
 433 
 434 !  LOBPCG and ChebFi need paral_kgb=1 in parallel
 435    if ((dtsets(idtset)%npband*dtsets(idtset)%npfft>1).and. &
 436 &   (mod(dtsets(idtset)%wfoptalg,10)==1.or.mod(dtsets(idtset)%wfoptalg,10)==4)) then
 437      dtsets(idtset)%paral_kgb=1
 438    end if
 439 
 440 !  Check size of Scalapack communicator
 441 #ifdef HAVE_LINALG_ELPA
 442    if(dtsets(idtset)%paral_kgb>0.and.dtsets(idtset)%np_slk>0) then
 443      np=min(dtsets(idtset)%np_slk,dtsets(idtset)%npband*dtsets(idtset)%npfft*dtsets(idtset)%npspinor)
 444      irow=int(sqrt(float(np)))
 445      do while(mod(np,irow)/=0)
 446        irow=irow-1
 447      end do
 448      icol=nproc/irow
 449      if (icol>mband_lower) then
 450        do while(icol>mband_lower)
 451          icol=icol-1
 452          do while(mod(np,icol)/=0)
 453            icol=icol-1
 454          end do
 455        end do
 456        dtsets(idtset)%np_slk=icol
 457        write(msg,'(5a,i6,a)')&
 458        'The number of band*fft*spinor processors was not consistent with',ch10,&
 459        'the size of communicator used for ELPA library (np_slk).',ch10,&
 460        'np_slk value has been adjusted to ',dtsets(idtset)%np_slk,'.'
 461        ABI_COMMENT(msg)
 462      end if
 463    end if
 464 #endif
 465 
 466    !Additional check in case of a parallelized Hartree-Fock calculation
 467    !   %usefock == option to perform Fock exchange calculation
 468    !   %nphf   == number of processors for Fock exchange calculation
 469    if ((dtsets(idtset)%usefock==1).and.(dtsets(idtset)%nphf/=1)) then
 470 
 471      if ((dtsets(idtset)%nphf<0).or.(dtsets(idtset)%nphf==0)) then
 472        ABI_ERROR('The value of variable nphf should be a non negative integer.')
 473      end if
 474      if (dtsets(idtset)%paral_kgb/=0) then
 475        ABI_ERROR('Option paral_kgb should be turned off (value 0) for a parallelized Hartree-Fock calculation.')
 476      end if
 477      if (response/=0) then
 478        ABI_ERROR('A response function calculation is not yet possible with a parallelized Hartree-Fock calculation.')
 479      end if
 480      if (dtsets(idtset)%npspinor>1) then
 481        ABI_ERROR('The parallelism on spinors is not supported by a parallelized Hartree-Fock calculation.')
 482      end if
 483      if (dtsets(idtset)%np_spkpt*dtsets(idtset)%nphf > nproc )then
 484        write(msg,'(a,3(a,i0))') ch10,&
 485        'The product of variables np_spkpt and nphf is bigger than the number of processors: np_spkpt= ',&
 486        dtsets(idtset)%np_spkpt,' nphf= ',dtsets(idtset)%nphf  ,' and nproc= ', nproc
 487        ABI_ERROR(msg)
 488      end if
 489    end if ! Fock
 490 
 491    !When using chebfi, the number of blocks is equal to the number of processors
 492    if((dtsets(idtset)%wfoptalg == 1) .or. (dtsets(idtset)%wfoptalg == 111)) then
 493      !Nband might have different values for different kpoint, but not bandpp.
 494      !In this case, we just use the largest nband, and the input will probably fail
 495      !at the bandpp check later on
 496      dtsets(idtset)%bandpp = mband_upper / dtsets(idtset)%npband
 497    end if
 498 
 499    !Check parallelization in case of RTTDDFT
 500    !In particular ensure that bandpp = nband / npband
 501    if (optdriver == RUNL_RTTDDFT) then
 502       dtsets(idtset)%bandpp = mband_upper / dtsets(idtset)%npband
 503       if ( tread(8) == 1 ) then
 504          write(msg, '(a,a)') 'Setting bandpp is useless in RT-TDDFT because it is automatically set to nband/npband.', ch10
 505          ABI_WARNING(msg)
 506       end if
 507       if (dtsets(idtset)%npfft/=1) then
 508          dtsets(idtset)%npfft=1
 509          write(msg, '(a,a)') 'RT-TDDFT is not compatible with FFT-parallelization. Remove npfft or set it to 1.', ch10
 510          ABI_ERROR(msg)
 511       end if
 512       if (dtsets(idtset)%npspinor/=1) then
 513          dtsets(idtset)%npspinor=1
 514          write(msg, '(a,a)') 'RT-TDDFT is not compatible with spinor parallelization. Remove npspinor or set it to 1.', ch10
 515          ABI_ERROR(msg)
 516       end if
 517       if (dtsets(idtset)%nphf/=1) then
 518          dtsets(idtset)%nphf=1
 519          write(msg, '(a,a)') 'RT-TDDFT is not compatible with HF parallelization. Remove nphf or set it to 1.', ch10
 520          ABI_ERROR(msg)
 521       end if
 522    end if
 523 
 524 !  Set mpi_enreg
 525    mpi_enregs(idtset)%paral_kgb=dtsets(idtset)%paral_kgb
 526    if(dtsets(idtset)%paral_kgb/=0)then
 527      mpi_enregs(idtset)%nproc_spkpt=dtsets(idtset)%np_spkpt
 528      mpi_enregs(idtset)%nproc_fft=dtsets(idtset)%npfft
 529      mpi_enregs(idtset)%nproc_band=dtsets(idtset)%npband
 530      mpi_enregs(idtset)%nproc_spinor=min(dtsets(idtset)%npspinor,dtsets(idtset)%nspinor)
 531      mpi_enregs(idtset)%bandpp=dtsets(idtset)%bandpp
 532 !    Additional setting in case of hybrid functional calculation => not yet tested (CMartins)
 533 !     if (dtsets(idtset)%usefock==1) then
 534 !       mpi_enregs(idtset)%nproc_hf = dtsets(idtset)%nphf
 535 !       if (dtsets(idtset)%nphf>1) mpi_enregs(idtset)%paral_hf=1
 536 !     end if
 537    else
 538      mpi_enregs(idtset)%bandpp = dtsets(idtset)%bandpp
 539 !    Additional setting in case of a Fock exchange of PBE0 calculation
 540      if (dtsets(idtset)%usefock==1) then
 541        if (dtsets(idtset)%nphf>1) mpi_enregs(idtset)%paral_hf=1
 542        mpi_enregs(idtset)%nproc_hf = dtsets(idtset)%nphf
 543        if (dtsets(idtset)%np_spkpt/=1) then
 544          mpi_enregs(idtset)%nproc_spkpt = dtsets(idtset)%np_spkpt
 545        else
 546          mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell/mpi_enregs(idtset)%nproc_hf
 547        end if
 548      else
 549        mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell
 550      end if
 551    end if
 552 
 553    if(dtsets(idtset)%paral_kgb>=0) then
 554 
 555 !    Compute processor distribution over perturbations
 556      mpi_enregs(idtset)%paral_pert=dtsets(idtset)%paral_rf
 557      if (mpi_enregs(idtset)%paral_pert==1) then
 558        dtsets(idtset)%nppert=max(1,dtsets(idtset)%nppert)
 559        if(dtsets(idtset)%nppert>mpi_enregs(idtset)%nproc) then
 560          ABI_ERROR('The number of processors must not be smaller than nppert !')
 561        end if
 562        call initmpi_pert(dtsets(idtset),mpi_enregs(idtset))
 563        mpi_enregs(idtset)%nproc_spkpt = mpi_enregs(idtset)%nproc_cell
 564        nproc=mpi_enregs(idtset)%nproc_cell
 565      end if
 566 !    Cycle if the processor is not used
 567      if (mpi_enregs(idtset)%me<0) then
 568        ABI_FREE(intarr)
 569        ABI_FREE(dprarr)
 570        cycle
 571      end if
 572 
 573 !    Compute processor distribution over kpt (and eventually band-fft)
 574      call initmpi_grid(mpi_enregs(idtset))
 575      if(dtsets(idtset)%usewvl==1) mpi_enregs(idtset)%comm_fft=mpi_enregs(idtset)%comm_cell
 576 
 577 !    Initialize tabs used for k/spin parallelism (with sequential-type values)
 578      ABI_MALLOC(mpi_enregs(idtset)%proc_distrb,(nkpt,mband_upper,nsppol))
 579      ABI_MALLOC(mpi_enregs(idtset)%my_kpttab,(nkpt))
 580      mpi_enregs(idtset)%proc_distrb(:,:,:)=0
 581      mpi_enregs(idtset)%my_kpttab(:)=(/(ii,ii=1,nkpt)/)
 582      mpi_enregs(idtset)%my_isppoltab(:)=1;if (dtsets(idtset)%nsppol==1) mpi_enregs(idtset)%my_isppoltab(2)=0
 583 
 584 !    HF or hybrid calculation : initialization of the array distrb_hf
 585      if (dtsets(idtset)%usefock==1) then
 586        ABI_MALLOC(mpi_enregs(idtset)%distrb_hf,(dtsets(idtset)%nkpthf,dtsets(idtset)%nbandhf,1))
 587 !      The dimension of distrb_hf are given by %nkpthf and %nbandhf.
 588 !      We assume that there will be no dependence in spinpol for all the occupied states.
 589        mpi_enregs(idtset)%distrb_hf=0
 590      end if
 591 
 592 !    Define k-points distribution (determine who I am)
 593 !    Note that nkpt_me may differ from processor to processor
 594 !    This fact will NOT be taken into account when
 595 !    the memory needs will be evaluated in the subroutine memory.
 596 !    Also, the reduction of k points due to symmetry in RF calculations
 597 !    is NOT taken into account. This should be changed later ...
 598      nkpt_me=nkpt
 599      mband_mem=0
 600      if(xmpi_paral==1 .and. dtsets(idtset)%usewvl == 0) then
 601        nkpt_me=0
 602        if(response==0 .or. (response==1 .and. dtsets(idtset)%efmas==1))then
 603          mpi_enregs(idtset)%paralbd=0
 604          call distrb2(mband_upper,mband_mem,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset))
 605          do iikpt=1,nkpt
 606            if(.not.(proc_distrb_cycle(mpi_enregs(idtset)%proc_distrb,iikpt,1,1,-1,mpi_enregs(idtset)%me_kpt)))&
 607 &           nkpt_me=nkpt_me+1
 608          end do ! ikpt=1,nkpt
 609 !        HF or hybrid calculation : define the occupied states distribution (in array distrb_hf)
 610          if (dtsets(idtset)%usefock==1) then
 611            call distrb2_hf(dtsets(idtset)%nbandhf,dtsets(idtset)%nkpthf,nproc,nsppol,mpi_enregs(idtset))
 612          end if
 613        else ! response==1
 614 !  TODO: check or remove the following comment which seems outdated
 615 !        Wrongly assumes that the number of elements of the
 616 !        k-point sets of the two spin polarizations is the maximal
 617 !        value of one of these k-point sets ...
 618 !        This is to be corrected when RF is implemented
 619 !        for spin-polarized case.
 620 !  ENDTODO
 621          mpi_enregs(idtset)%paralbd=1
 622 !        nproc=mpi_enregs(idtset)%nproc_cell*mpi_enregs(idtset)%nproc_pert
 623          call distrb2(mband_upper,mband_mem,dtsets(idtset)%nband,nkpt,nproc,nsppol,mpi_enregs(idtset))
 624          do isppol=1,nsppol
 625            nspink=0
 626            do iikpt=1,nkpt
 627              do iband=1,dtsets(idtset)%nband(iikpt+(isppol-1)*nkpt)
 628                if(mpi_enregs(idtset)%proc_distrb(iikpt,iband,isppol)==mpi_enregs(idtset)%me_cell)then
 629                  nspink=nspink+1
 630                  exit
 631                end if
 632              end do ! iband
 633            end do ! iikpt
 634            if(nspink>nkpt_me)nkpt_me=nspink
 635          end do ! isppol
 636 !        Is nband present in input file or automatically estimated ?
 637          tnband=0
 638          call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'nband',tnband,'INT')
 639 !        If the number of bands was estimated, there might be a side effect
 640 !        when the definitive number of bands is known. k points
 641 !        might be attributed to different processors than the present
 642 !        proc_distrb describes. At most, the number of k points could increase by 1 ...
 643          if(tnband==0)nkpt_me=nkpt_me+1
 644 !        In any case, the maximal number of k points is nkpt
 645          if(nkpt_me>nkpt)nkpt_me=nkpt
 646 
 647 !        mband_mem
 648          ABI_MALLOC (mybands, (mband_upper))
 649          mband_mem = 0
 650          do isppol=1,nsppol
 651            do iikpt=1,nkpt
 652              mybands = 0
 653              do iband=1,dtsets(idtset)%nband(iikpt+(isppol-1)*nkpt)
 654                if(mpi_enregs(idtset)%proc_distrb(iikpt,iband,isppol)==mpi_enregs(idtset)%me_band)then
 655                  mybands(iband)=1
 656                end if
 657              end do ! iband
 658              mband_mem = max(mband_mem, sum(mybands))
 659            end do ! iikpt
 660          end do ! isppol
 661          ABI_FREE (mybands)
 662        end if ! response case
 663      end if
 664    end if
 665    if (mband_mem == 0) mband_mem = mband_upper
 666    dtsets(idtset)%mband_mem = mband_mem
 667 
 668 !  Take care of mkmems. Use the generic name -mkmem- for mkmem as well as mkqmem
 669 !  and mk1mem.
 670    nm_mkmem(1)='mkmem '
 671    nm_mkmem(2)='mkqmem'
 672    nm_mkmem(3)='mk1mem'
 673 
 674    do ii=1,3
 675 
 676 !    Read in mkmem here if it is in the input file
 677 !    TODO: mkmem is not supported any longer. These variables can be removed.
 678      if(ii==1)then
 679        call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mkmem',tread0,'INT')
 680      else if(ii==2)then
 681        call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mkqmem',tread0,'INT')
 682      else if(ii==3)then
 683        call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'mk1mem',tread0,'INT')
 684      end if
 685 
 686 
 687 !    Note that mkmem is used as a dummy variable, representing mkmem as well
 688 !    as mkqmem, and mk1mem.
 689      if(tread0==1) then
 690        mkmem=intarr(1)
 691        if (mkmem<0) then
 692 !        mkmem is unreasonable; must be zero or positive
 693          write(msg, '(4a,i0,4a)')&
 694          nm_mkmem(ii),' must be positive or null but ',nm_mkmem(ii),' =',mkmem,ch10,&
 695          'Use default ',nm_mkmem(ii),' = nkpt .'
 696          ABI_WARNING(msg)
 697          mkmem=nkpt
 698        end if
 699 
 700      else
 701 
 702        !  mkmem was not set in the input file so default to incore solution
 703        !write(msg,'(6a)') &
 704        !'mpi_setup: ',nm_mkmem(ii),' undefined in the input file.','Use default ',nm_mkmem(ii),' = nkpt'
 705        !call wrtout(std_out, msg)
 706        mkmem=nkpt
 707      end if
 708 
 709 !    Check whether nkpt distributed on the processors <= mkmem;
 710 !    if so then may run entirely in core,
 711 !    avoiding i/o to disk for wavefunctions and kg data.
 712 !    mkmem/=0 to avoid i/o; mkmem==0 to use disk i/o for nkpt>=1.
 713      if (nkpt_me<=mkmem .and. mkmem/=0 ) then
 714        write(msg, '(a,i0,a,a,a,i0,a)' ) &
 715         ' mpi_setup: With nkpt_me=',nkpt_me,' and ',nm_mkmem(ii),' = ',mkmem,', ground state wf handled in core.'
 716        if (prtvol > 0) call wrtout(std_out,msg)
 717        if(nkpt_me<mkmem .and. nkpt_me/=0)then
 718          write(msg,'(3a)')' Resetting ',nm_mkmem(ii),' to nkpt_me to save memory space.'
 719          mkmem=nkpt_me
 720          if (prtvol > 0) call wrtout(std_out,msg)
 721        end if
 722      else if(mkmem/=0)then
 723        write(msg, '(a,i0,3a,i0,5a)' ) &
 724        ' mpi_setup: With nkpt_me=',nkpt_me,'and ',nm_mkmem(ii),' = ',mkmem,&
 725        ' ground state wf require disk i/o.',ch10,&
 726        ' Resetting ',nm_mkmem(ii),' to zero to save memory space.'
 727        mkmem=0
 728        if (prtvol > 0) call wrtout(std_out,msg)
 729      end if
 730      if(dtsets(idtset)%usewvl == 0 .or. dtsets(idtset)%usepaw==1)then
 731        if(ii==1)dtsets(idtset)%mkmem=mkmem
 732      end if
 733      if(ii==2)dtsets(idtset)%mkqmem=mkmem
 734      if(ii==3)dtsets(idtset)%mk1mem=mkmem
 735 
 736      if(dtsets(idtset)%usewvl == 1 .and. dtsets(idtset)%usepaw==1 )then
 737        if(dtsets(idtset)%mkmem .ne. dtsets(idtset)%nkpt) then
 738          ABI_ERROR("mkmem is not allowed for WVL+PAW")
 739        end if
 740      end if
 741 
 742    end do  ! End the loop on the three possiblities mkmem, mkqmem, mk1mem.
 743 
 744    if(dtsets(idtset)%paral_kgb==1) mpi_enregs(idtset)%paralbd=0
 745 
 746 !  Check if some MPI processes are empty (MBPT codes uses a complete different MPI algorithm)
 747    do_check = all(optdriver /= [RUNL_SCREENING, RUNL_SIGMA, RUNL_BSE, RUNL_EPH, RUNL_GWR])
 748    if (dtsets(idtset)%usewvl == 0 .and. do_check) then
 749      if (.not.mpi_distrib_is_ok(mpi_enregs(idtset),mband_upper,&
 750           dtsets(idtset)%nkpt,dtsets(idtset)%mkmem,nsppol,msg=msg)) then
 751        write(msg,'(5a)') trim(msg),ch10,&
 752          'YOU ARE STRONGLY ADVISED TO ACTIVATE AUTOMATIC PARALLELIZATION!',ch10,&
 753          'USE "AUTOPARAL=1" IN THE INPUT FILE.'
 754        ABI_WARNING(msg)
 755      end if
 756    end if
 757 
 758 !  call mpi_setup1(dtsets(idtset),jdtset,lenstr,mband_upper,mpi_enregs(idtset),string)
 759 !  Printing of processor distribution
 760 !  MPIWF : here, set up the complete ngfft, containing the information
 761 !  for the parallelisation of the FFT
 762    call abi_io_redirect(new_io_comm=mpi_enregs(idtset)%comm_world)
 763    call libpaw_write_comm_set(mpi_enregs(idtset)%comm_world)
 764 
 765 !  Default values for sequential case
 766    paral_fft=0; nproc_fft=1; me_fft=0
 767 
 768    if(dtsets(idtset)%usewvl == 0)then
 769      if(optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS) then
 770        paral_fft=1           ! parallelisation over FFT
 771        if (mpi_enregs(idtset)%nproc_cell>0) then
 772          if(mpi_enregs(idtset)%paral_kgb == 1) then
 773 
 774            if((dtsets(idtset)%use_gpu_cuda==1).and.(mpi_enregs(idtset)%nproc_fft/=1))then
 775              write(msg,'(3a,i0)') &
 776              'When use_gpu_cuda is on, the number of FFT processors, npfft, must be 1',ch10,&
 777              'However, npfft=',mpi_enregs(idtset)%nproc_fft
 778              ABI_ERROR(msg)
 779            end if
 780 
 781            if(modulo(dtsets(idtset)%ngfft(2),mpi_enregs(idtset)%nproc_fft)/=0)then
 782              write(msg,'(3a,i0,a,i0)') &
 783              'The number of FFT processors, npfft, should be a multiple of ngfft(2).',ch10,&
 784              'However, npfft=',mpi_enregs(idtset)%nproc_fft,' and ngfft(2)=',dtsets(idtset)%ngfft(2)
 785              ABI_BUG(msg)
 786            end if
 787 
 788            do iikpt=1,nkpt*nsppol
 789              iikpt_modulo = modulo(iikpt,nkpt)+1
 790              if ((dtsets(idtset)%istwfk(iikpt_modulo)==2)) then !.and.(dtsets(idtset)%ngfft(7)==401)) then
 791                if ((mpi_enregs(idtset)%bandpp==0).or. &
 792                ((mpi_enregs(idtset)%bandpp/=1).and.(modulo(mpi_enregs(idtset)%bandpp,2)/=0))) then
 793                  write(msg,'(3a,i0)') &
 794                  'The number bandpp should be 1 or a multiple of 2',ch10,&
 795                  'However, bandpp=',mpi_enregs(idtset)%bandpp
 796                  ABI_BUG(msg)
 797                end if
 798                if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
 799                  write(msg,'(3a,i0,a,i0)') &
 800                  'The number of bands for the k-point, nband_k, should be a multiple of nproc_band*bandpp.',ch10,&
 801                  'However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', &
 802                  mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
 803                  ABI_BUG(msg)
 804                end if
 805              else if ((dtsets(idtset)%istwfk(iikpt_modulo)==2) .and. (dtsets(idtset)%ngfft(7)==400)) then
 806                ABI_BUG('The fftalg=400 with istwfk=2 is not valid')
 807              else
 808                if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
 809                  write(msg,'(3a,i0,a,i0)') &
 810                  'The number of band for the k-point, nband_k, should be a multiple of nproc_band*bandpp.',ch10,&
 811                  'However, nband_k=',dtsets(idtset)%nband(iikpt),' and nproc_band*bandpp=', &
 812                  mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
 813                  ABI_BUG(msg)
 814                end if
 815                if ((mpi_enregs(idtset)%bandpp==0)) then
 816                  write(msg,'(a,i0,2a,i0,2a,i0)')&
 817                  'The number bandpp should not be 0 with fftalg=',dtsets(idtset)%ngfft(7),ch10,&
 818                  'and istwfk=',dtsets(idtset)%istwfk(iikpt_modulo),ch10,&
 819                  'However, bandpp=',mpi_enregs(idtset)%bandpp
 820                  ABI_BUG(msg)
 821                end if
 822              end if
 823            end do
 824 
 825            if (xmpi_paral==1) then
 826              if(modulo(nkpt*nsppol,mpi_enregs(idtset)%nproc_spkpt)/=0)then
 827                write(msg,'(3a,i0,a,i0)') &
 828                'The number of KPT processors, np_spkpt, should be a multiple of nkpt*nsppol.',ch10,&
 829                'However, np_spkpt=',mpi_enregs(idtset)%nproc_spkpt,' and nkpt*nsppol=',nkpt*nsppol
 830                ABI_WARNING(msg)
 831              end if
 832            end if
 833          else
 834            do iikpt=1,nkpt*nsppol
 835              iikpt_modulo = modulo(iikpt,nkpt)+1
 836              if(modulo(dtsets(idtset)%nband(iikpt),mpi_enregs(idtset)%nproc_band*mpi_enregs(idtset)%bandpp)/=0)then
 837                write(msg,'(3a,i0,a,i0)') &
 838                'The number of band for the k-point, nband_k, should be a multiple of npband*bandpp.',ch10,&
 839                'However, nband_k=',dtsets(idtset)%nband(iikpt),' and npband*bandpp=', &
 840                mpi_enregs(idtset)%nproc_band* mpi_enregs(idtset)%bandpp
 841                ABI_BUG(msg)
 842              end if
 843            end do
 844          end if
 845        end if
 846        nproc_fft=mpi_enregs(idtset)%nproc_fft
 847        me_fft=mpi_enregs(idtset)%me_fft
 848      end if
 849    end if
 850 
 851 !  Compute mgfft,mpw,nfft for this data set (it is dependent of mpi_enreg)
 852    ABI_MALLOC(istwfk,(nkpt))
 853    ABI_MALLOC(kpt_with_shift,(3,nkpt))
 854 
 855    ! Set the default value of fftalg for given npfft but allow the user to override it.
 856    ! Warning: If you need to change npfft, **DO IT** before this point so that here we get the correct fftalg
 857    dtsets(idtset)%ngfft(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
 858    dtsets(idtset)%ngfftdg(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
 859 
 860    fftalg_read=.false.
 861    call intagm(dprarr,intarr,jdtset,marr,1,string(1:lenstr),'fftalg',tread0,'INT')
 862 
 863    if (tread0==1) then
 864      dtsets(idtset)%ngfft(7)=intarr(1)
 865      if (usepaw==1) dtsets(idtset)%ngfftdg(7)=intarr(1)
 866      fftalg_read=.true.
 867    end if
 868 
 869    ecut     =dtsets(idtset)%ecut
 870    dilatmx  =dtsets(idtset)%dilatmx
 871    ngfft(:) =dtsets(idtset)%ngfft(:)
 872    istwfk(:)=dtsets(idtset)%istwfk(1:nkpt)
 873    nsym     =dtsets(idtset)%nsym
 874 
 875    nqpt=dtsets(idtset)%nqpt
 876    qphon(:)=zero;if(nqpt/=0) qphon(:)=dtsets(idtset)%qptn(:)
 877 
 878    ABI_MALLOC(symrel,(3,3,nsym))
 879    symrel(:,:,1:nsym)=dtsets(idtset)%symrel(:,:,1:nsym)
 880    ecut_eff=ecut*dilatmx**2
 881 
 882    if (usepaw==1) call wrtout(std_out,'getng is called for the coarse grid:')
 883    kpt=k0; if (response==1.and.usepaw==1) kpt=qphon ! this is temporary
 884 
 885    call getng(dtsets(idtset)%boxcutmin,dtsets(idtset)%chksymtnons,ecut_eff,gmet,kpt,me_fft,mgfft,nfft,&
 886 &   ngfft,nproc_fft,nsym,paral_fft,symrel,dtsets(idtset)%tnons,&
 887 &   use_gpu_cuda=dtsets(idtset)%use_gpu_cuda)
 888 
 889    dtsets(idtset)%ngfft(:)=ngfft(:)
 890    dtsets(idtset)%mgfft=mgfft
 891    dtsets(idtset)%nfft=nfft
 892    kpt_with_shift(:,:)=dtsets(idtset)%kpt(:,1:nkpt)/dtsets(idtset)%kptnrm
 893 
 894    exchn2n3d=dtsets(idtset)%exchn2n3d
 895    nproc_fft=ngfft(10) ; me_fft=ngfft(11)
 896    fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10)
 897 
 898    ! Initialize tables for MPI-FFT.
 899    call init_distribfft(mpi_enregs(idtset)%distribfft,'c',mpi_enregs(idtset)%nproc_fft,ngfft(2),ngfft(3))
 900 
 901    if(response/=0)then
 902 !    This value of mpw is used in the first part of respfn.f
 903      call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw_k,nkpt)
 904    end if
 905    if(nqpt/=0)then
 906      kpt_with_shift(1,:)=kpt_with_shift(1,:)+qphon(1)
 907      kpt_with_shift(2,:)=kpt_with_shift(2,:)+qphon(2)
 908      kpt_with_shift(3,:)=kpt_with_shift(3,:)+qphon(3)
 909    end if
 910    if (dtsets(idtset)%usewvl == 0) then
 911      call getmpw(ecut_eff,exchn2n3d,gmet,istwfk,kpt_with_shift,mpi_enregs(idtset),mpw,nkpt)
 912      ! Allocate tables for parallel IO of the wavefunctions.
 913      if( xmpi_mpiio==1 .and. mpi_enregs(idtset)%paral_kgb == 1 .and. &
 914 &     any(dtsets(idtset)%iomode == [IO_MODE_MPI, IO_MODE_ETSF])) then
 915        ABI_MALLOC(mpi_enregs(idtset)%my_kgtab,(mpw,dtsets(idtset)%mkmem))
 916      end if
 917    else
 918      mpw = 0
 919    end if
 920 
 921 !  The dimensioning, in the RF case, should be done only with mpw,
 922 !  but mpw is used in the first part of respfn.f, and should at least
 923 !  be equal to mpw_k . The chosen way to code is not optimal, only convenient :
 924 !  it leads to a small waste of memory.
 925    if(response/=0 .and. mpw_k>mpw)mpw=mpw_k
 926    dtsets(idtset)%ngfft(:)=ngfft(:)
 927 
 928 !  Initialize ngfftc to the initial guess for the coarse mesh
 929    ngfftc(:) = 2
 930 
 931 !  In case of PAW, compute fine FFT parameters
 932    if (usepaw==1) then
 933      ecutdg_eff=dtsets(idtset)%pawecutdg*dtsets(idtset)%dilatmx**2
 934      ngfftdg(:)=dtsets(idtset)%ngfftdg(:)
 935      call wrtout(std_out,'getng is called for the fine grid:')
 936 !    Start with the coarse mesh as an initial guess for the fine mesh
 937 !    This ensures that the fine mesh will not be any coarser than the coarse mesh in each dimension
 938      ngfftc(:) = ngfft(1:3)
 939      kpt=k0; if (response==1.and.usepaw==1) kpt=qphon  ! this is temporary
 940 
 941      call getng(dtsets(idtset)%bxctmindg,dtsets(idtset)%chksymtnons,&
 942 &     ecutdg_eff,gmet,kpt,me_fft,mgfftdg,&
 943 &     nfftdg,ngfftdg,nproc_fft,nsym,paral_fft,symrel,dtsets(idtset)%tnons,ngfftc,&
 944 &     use_gpu_cuda=dtsets(idtset)%use_gpu_cuda)
 945 
 946      dtsets(idtset)%ngfftdg(:)=ngfftdg(:)
 947      dtsets(idtset)%mgfftdg=mgfftdg
 948      dtsets(idtset)%nfftdg=nfftdg
 949 !    Compute fft distribution for fine grid
 950      fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10)
 951      call init_distribfft(mpi_enregs(idtset)%distribfft,'f', mpi_enregs(idtset)%nproc_fft,ngfftdg(2),ngfftdg(3))
 952    end if
 953 
 954    dtsets(idtset)%mpw=mpw
 955    ABI_FREE(symrel)
 956    ABI_FREE(istwfk)
 957    ABI_FREE(kpt_with_shift)
 958    ABI_FREE(intarr)
 959    ABI_FREE(dprarr)
 960 
 961 !  Initialize data for the parallelization over atomic sites (PAW)
 962    if (dtsets(idtset)%natom==1) dtsets(idtset)%paral_atom=0
 963    if (dtsets(idtset)%usepaw==0) dtsets(idtset)%paral_atom=0
 964    if (dtsets(idtset)%usewvl/=0) dtsets(idtset)%paral_atom=0
 965    if (dtsets(idtset)%usedmft==1) dtsets(idtset)%paral_atom=0
 966    if (optdriver/=RUNL_GSTATE.and.optdriver/=RUNL_RESPFN.and.optdriver/=RUNL_GWLS) dtsets(idtset)%paral_atom=0
 967    if (dtsets(idtset)%macro_uj/=0) dtsets(idtset)%paral_atom=0
 968 
 969    call initmpi_atom(dtsets(idtset),mpi_enregs(idtset))
 970 
 971 !  In case of the use of a GPU (Cuda), some defaults can change
 972 !  according to a threshold on matrix sizes
 973    if (dtsets(idtset)%use_gpu_cuda==1.or.dtsets(idtset)%use_gpu_cuda==-1) then
 974      if (optdriver==RUNL_GSTATE.or.optdriver==RUNL_GWLS) then
 975        vectsize=dtsets(idtset)%mpw*dtsets(idtset)%nspinor/dtsets(idtset)%npspinor
 976        if (all(dtsets(idtset)%istwfk(:)==2)) vectsize=2*vectsize
 977        blocksize=dtsets(idtset)%npband*dtsets(idtset)%bandpp
 978        if (dtsets(idtset)%paral_kgb==0) blocksize=dtsets(idtset)%npfft
 979        if ((vectsize*blocksize**2)>=dtsets(idtset)%gpu_linalg_limit) then
 980          if (.not.wfoptalg_read) then
 981            dtsets(idtset)%wfoptalg=14
 982            if (.not.fftalg_read) then
 983              dtsets(idtset)%ngfft(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
 984              if (usepaw==1) dtsets(idtset)%ngfftdg(7) = fftalg_for_npfft(dtsets(idtset)%npfft)
 985            end if
 986            if (.not.ortalg_read) dtsets(idtset)%ortalg=-abs(dtsets(idtset)%ortalg)
 987          end if
 988        end if
 989      end if
 990    end if
 991 
 992 !  initialize data for the parallelization for WVL:
 993    if(dtsets(idtset)%usewvl==1) then
 994      mpi_enregs(idtset)%comm_wvl=mpi_enregs(idtset)%comm_cell
 995      mpi_enregs(idtset)%nproc_wvl=xmpi_comm_size(mpi_enregs(idtset)%comm_wvl)
 996      mpi_enregs(idtset)%me_wvl=xmpi_comm_rank(mpi_enregs(idtset)%comm_wvl)
 997    end if
 998 
 999  end do
1000 
1001 !This is not a very clean exit in case of paral_kgb<0
1002  if (iexit/=0)then
1003    ABI_STOP("Stopping now!")
1004  end if
1005 
1006  DBG_EXIT("COLL")
1007 
1008 end subroutine mpi_setup