ABINIT/m_exc_itdiago [ Modules ]

NAME

 m_exc_itdiago

FUNCTION

  Iterative diagonalization of the BSE Hamiltonian with a band-by-band conjugate-gradient method

COPYRIGHT

  Copyright (C) 2008-2024 ABINIT group (MG)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

SOURCE

16 #if defined HAVE_CONFIG_H
17 #include "config.h"
18 #endif
19 
20 #include "abi_common.h"
21 
22 
23 MODULE m_exc_itdiago
24 
25  use defs_basis
26  use m_bs_defs
27  use m_errors
28  use m_abicore
29  use m_linalg_interfaces
30  use m_hdr
31  use m_xmpi
32 #ifdef HAVE_MPI2
33  use mpi
34 #endif
35 
36  use m_io_tools,      only : open_file
37  use m_time,          only : cwtime
38  use m_numeric_tools, only : stats_t, stats_eval
39  use m_hide_lapack,   only : xhpev !xheev,
40  use m_bse_io,        only : exc_read_rcblock
41 
42  implicit none
43 
44  private
45 
46 #ifdef HAVE_MPI1
47  include 'mpif.h'
48 #endif
49 
50  public :: exc_iterative_diago    ! Calculates eigenvalues and eigenvectors of the Resonant BSE Hamiltonian

m_exc_itdiago/convergence_degree [ Functions ]

NAME

 convergence_degree

FUNCTION

  Return the degree of convergence from the input residual.

INPUTS

  resid=Residual.

OUTPUT

  convergence_degree=STRICT, MEDIUM or WORST, depending on whether resid<tolwfr_,
    resid<tolwfr_*10**5, or neither.

SOURCE

1151 function convergence_degree(resid)
1152 
1153 !Arguments
1154  integer :: convergence_degree
1155  real(dp),intent(in) :: resid
1156 
1157 !************************************************************************
1158 
1159  if (resid<tolwfr_) then
1160    convergence_degree = STRICT
1161  else
1162    convergence_degree = WORST
1163    if (resid<tolwfr_*10**5) convergence_degree = MEDIUM
1164  end if
1165 
1166 end function convergence_degree

m_exc_itdiago/exc_check_phi_block [ Functions ]

NAME

 exc_check_phi_block

FUNCTION

  Debugging tool: checks the mutual consistency (orthonormality) of the distributed trial
  eigenstates stored in phi_block and reports the maximum deviation.

INPUTS

  string=Label identifying the caller, printed in the consistency report.

OUTPUT

  Only writing (consistency report on std_out).

SOURCE

1184 subroutine exc_check_phi_block(string)
1185 
1186 !Arguments ------------------------------------
1187 !scalars
1188  character(len=*),intent(in) :: string
1189 
1190 !Local variables ------------------------------
1191 !scalars
1192  integer :: ii,jj,ierr
1193  real(dp) :: err,rdum
1194 !arrays
1195  complex(dpc),allocatable :: lbuff(:,:)
1196 
1197 !************************************************************************
1198 
1199  if (.not. DEBUGME) return
1200 
1201 #if 0
1202  ABI_MALLOC(lbuff,(hexc_size,nstates))
1203  err = -one
1204  do irank=1,nproc-1
1205    call xmpi_exch(phi_block,hexc_size*nstates,irank,lbuff,master,comm,11,ierr)
1206    if (my_rank==master) then
1207      lbuff = lbuff-phi_block
1208      err = MAX(err,MAXVAL(MAXVAL(ABS(lbuff),DIM=1)))
1209    end if
1210    call xmpi_barrier(comm)
1211  end do
1212  ABI_FREE(lbuff)
1213 #else
1214 
1215  ABI_MALLOC(lbuff,(nstates,nstates))
1216  lbuff=czero
1217  do jj=1,nstates
1218    do ii=1,jj
1219      lbuff(ii,jj) = DOT_PRODUCT( phi_block(my_t1:my_t2,ii), phi_block(my_t1:my_t2,jj) )
1220    end do
1221  end do
1222  call xmpi_sum(lbuff,comm,ierr)
1223 
1224  err = -one
1225  do jj=1,nstates
1226    do ii=1,jj
1227      if (ii==jj) then
1228        rdum =  ABS(lbuff(ii,jj)-one)
1229      else
1230        rdum =  ABS(lbuff(ii,jj))
1231      end if
1232      err = MAX(err,rdum)
1233    end do
1234  end do
1235  ABI_FREE(lbuff)
1236 #endif
1237 
1238  if (my_rank==master) then
1239    write(std_out,*)"After ",TRIM(string),", MAX inconsistency error in phi_block= ",err
1240  end if
1241 
1242  !write(std_out,*)"master casts its own data"
1243  !call xmpi_bcast(phi_block,master,comm,ierr)
1244 
1245 end subroutine exc_check_phi_block

m_exc_itdiago/exc_cholesky_ortho [ Functions ]

NAME

 exc_cholesky_ortho

FUNCTION

  This routine orthonormalizes the trial eigenstates using the Cholesky algorithm.
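
  As an illustration, the sketch below performs the same three steps (overlap matrix,
  Cholesky factorization, triangular solve) on a single process, with hypothetical sizes
  and plain LAPACK/BLAS calls (ZPOTRF + ZTRSM); the actual routine works on the
  distributed phi_block and MPI-reduces the overlap matrix before the factorization.

    program cholesky_ortho_sketch
      implicit none
      integer, parameter :: dp = kind(1.0d0)
      integer, parameter :: n = 8, m = 3        ! hypothetical: hexc_size, nstates
      complex(dp) :: phi(n,m), overlap(m,m), eye(m,m)
      real(dp)    :: re(n,m), im(n,m)
      integer     :: info, i

      call random_number(re); call random_number(im)
      phi = cmplx(re, im, kind=dp)              ! random trial vectors

      ! 1) overlap_ij = <phi_i|phi_j>
      overlap = matmul(conjg(transpose(phi)), phi)

      ! 2) Cholesky factorization: overlap = U^H U (upper triangle referenced)
      call zpotrf('U', m, overlap, m, info)
      if (info /= 0) stop 'ZPOTRF failed'

      ! 3) Solve X U = phi; on exit phi holds the orthonormalized vectors
      call ztrsm('Right', 'Upper', 'N', 'N', n, m, (1.0_dp,0.0_dp), overlap, m, phi, n)

      ! Check: phi^H phi should now be the identity matrix
      eye = matmul(conjg(transpose(phi)), phi)
      do i = 1, m
        eye(i,i) = eye(i,i) - (1.0_dp,0.0_dp)
      end do
      write(*,*) 'max deviation from orthonormality: ', maxval(abs(eye))
    end program cholesky_ortho_sketch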

SIDE EFFECTS

   phi_block(my_t1:my_t2,nstates)=Contains the trial eigenstates.

SOURCE

1057 subroutine exc_cholesky_ortho()
1058 
1059 !Local variables ------------------------------
1060  integer :: my_info,ii,jj,ipack,ierr
1061  logical,parameter :: use_unpacked = .False.
1062 !arrays
1063  complex(dpc),allocatable :: overlap(:,:),povlp(:)
1064 
1065 !************************************************************************
1066 
1067  ! 1) overlap_ij =  <phi_i|phi_j>
1068  ABI_MALLOC(overlap, (nstates,nstates))
1069 
1070  if (use_unpacked) then
1071    overlap = czero
1072 
1073    call ZGEMM('C','N',nstates,nstates,my_nt,cone,phi_block,my_nt,phi_block,my_nt,czero,overlap,nstates)
1074    call xmpi_sum(overlap,comm,ierr)
1075 
1076    do ii=1,nstates
1077      overlap(ii,ii)=REAL(overlap(ii,ii),kind=dp)
1078    end do
1079 
1080    ! 2) Cholesky factorization: overlap = U^H U with U upper triangle matrix.
1081    call ZPOTRF('U',nstates,overlap,nstates,my_info)
1082    if (my_info/=0)  then
1083      write(msg,'(a,i3)')' ZPOTRF returned info= ',my_info
1084      ABI_ERROR(msg)
1085    end if
1086 
1087  else
1088    ! 1) Calculate overlap_ij =  <phi_i|phi_j> in packed form.
1089    ABI_MALLOC(povlp,(nstates*(nstates+1)/2))
1090    povlp = czero; ipack=0
1091    do jj=1,nstates
1092      do ii=1,jj
1093        ipack=ipack+1
1094        povlp(ipack) = DOT_PRODUCT( phi_block(my_t1:my_t2,ii), phi_block(my_t1:my_t2,jj) )
1095        if (ii==jj) povlp(ipack) = REAL(povlp(ipack),kind=dp)
1096      end do
1097    end do
1098    call xmpi_sum(povlp,comm,ierr)
1099 
1100    ! 2) Cholesky factorization: overlap = U^H U with U upper triangle matrix.
1101    call ZPPTRF("U",nstates,povlp,my_info)
1102    if (my_info/=0)  then
1103      write(msg,'(a,i3)')' ZPPTRF returned info= ',my_info
1104      ABI_ERROR(msg)
1105    end if
1106    !call xmpi_sum(povlp,comm,ierr)
1107    !povlp=povlp/nproc
1108 
1109    !unpack povlp to prepare call to ZTRSM.
1110    ipack=0
1111    do jj=1,nstates
1112      do ii=1,jj
1113        ipack=ipack+1
1114        if (ii/=jj) then
1115          overlap(ii,jj)=      povlp(ipack)
1116          overlap(jj,ii)=CONJG(povlp(ipack))
1117        else
1118          overlap(ii,ii)=REAL(povlp(ipack),kind=dp)
1119        end if
1120      end do
1121    end do
1122    ABI_FREE(povlp)
1123  end if
1124 
1125  ! Check if this can be done with Scalapack. Direct PZTRSM is not provided
1126 
1127  ! 3) Solve X U = phi_block, on exit the phi_block treated by this node is orthonormalized.
1128  !call ZTRSM('R','U','N','N',hexc_size,nstates,cone,overlap,nstates,phi_block,hexc_size)
1129  call ZTRSM('Right','Upper','Normal','Normal',my_nt,nstates,cone,overlap,nstates,phi_block,my_nt)
1130  ABI_FREE(overlap)
1131 
1132 end subroutine exc_cholesky_ortho

m_exc_itdiago/exc_init_phi_block [ Functions ]

NAME

 exc_init_phi_block

FUNCTION

  Initialize the trial eigenstates, either from file or by filling them with
  random numbers if no restart file is available.
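
  When no restart file is given, the filling is deterministic and uses integer arithmetic
  only, so a given (transition, state) entry receives the same value regardless of how the
  rows are distributed over the MPI ranks. A standalone sketch of the scheme, with
  hypothetical sizes (n, nstates):

    program folded_random_sketch
      implicit none
      integer, parameter :: n = 12, nstates = 2   ! hypothetical sizes
      integer :: ii, state, seed, fold1, fold2, foldre, foldim
      complex :: phi(n, nstates)

      do state = 1, nstates
        do ii = 1, n
          seed   = ii + (state-1)*n               ! different seed for each entry
          fold1  = mod(seed,3)+mod(seed,5)+mod(seed,7)+mod(seed,11)+mod(seed,13)    ! 0..34
          fold2  = mod(seed,17)+mod(seed,19)+mod(seed,23)+mod(seed,29)+mod(seed,31) ! 0..114
          foldre = mod(fold1+fold2,21)            ! 0..20
          foldim = mod(3*fold1+2*fold2,19)        ! 0..18
          phi(ii,state) = cmplx(foldre, foldim)
        end do
      end do
      write(*,'(2f6.1)') phi(1:4,1)
    end program folded_random_sketch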

INPUTS

  ihexc_fname=
    Name of the file from which the eigenvectors will be read.
    Empty string to initialize trial eigenvectors with random numbers.

SIDE EFFECTS

   phi_block(my_t1:my_t2,nstates)=Contains the trial eigenstates.

SOURCE

652 subroutine exc_init_phi_block(ihexc_fname,use_mpio,comm)
653 
654 !Arguments ------------------------------------
655 !scalars
656  integer,intent(in) :: comm
657  logical,intent(in) :: use_mpio
658  character(len=*),intent(in) :: ihexc_fname
659 
660 !Local variables ------------------------------
661  integer :: eig_unt,hexc_size_restart,ii,state,seed
662  integer ::  fold1,fold2,foldim,foldre
663  real(dp) :: cputime,walltime,gflops
664  character(len=500) :: errmsg
665 !arrays
666  complex(dpc),allocatable :: buffer_dpc(:)
667 #ifdef HAVE_MPI_IO
668  integer:: amode,mpi_fh,mpi_err,old_type,etype,eig_type,my_nel,ierr,my_nrows
669  integer(XMPI_OFFSET_KIND) :: ehdr_offset,my_offset,my_offpad,fmarker
670  integer(XMPI_OFFSET_KIND),allocatable :: bsize_frecord(:)
671  integer :: array_of_sizes(2),array_of_subsizes(2),array_of_starts(2)
672 #endif
673 
674 !************************************************************************
675 
676  if (LEN_TRIM(ihexc_fname) == 0) then
677    call wrtout(std_out," Initializing eigenvectors with random numbers","COLL")
678    !
679    ! Use random number generator. For portability, use only integer numbers
680    ! The series of couples (fold1,fold2) is periodic with a period of
681    ! 3x5x7x11x13x17x19x23x29x31, that is, larger than 2**32 (the range of a 32-bit integer).
682    ! fold1 is between 0 and 34, fold2 is between 0 and 114. As sums of five
683    ! uniform random variables, their distributions are close to Gaussians.
684    ! The Gaussian distributions are folded in order to get back to a uniform distribution:
685    ! foldre is between 0 and 20, foldim is between 0 and 18.
686    !
687    do state=1,nstates
688      do ii=my_t1,my_t2
689        seed=ii+(state-1)*hexc_size ! Different seed for different transitions and bands
690        fold1 =mod(seed,3)+mod(seed,5)+mod(seed,7)+mod(seed,11)+mod(seed,13)
691        fold2 =mod(seed,17)+mod(seed,19)+mod(seed,23)+mod(seed,29)+mod(seed,31)
692        foldre=mod(fold1+fold2,21)
693        foldim=mod(3*fold1+2*fold2,19)
694 
695        phi_block(ii,state) = DCMPLX(foldre,foldim)
696      end do
697    end do
698 
699  else
700 
701    call cwtime(cputime, walltime, gflops, "start")
702 
703    if (.not.use_mpio) then
704      call wrtout(std_out," Initializing eigenvectors from file: "//TRIM(ihexc_fname)//" using Fortran IO.","COLL")
705 
706      if (open_file(ihexc_fname,msg,newunit=eig_unt,form='unformatted',status="old") /=0 ) then
707        ABI_ERROR(msg)
708      end if
709 
710      read(eig_unt, err=10, iomsg=errmsg) hexc_size_restart
711      ABI_CHECK(hexc_size_restart==hexc_size,"hexc_size_restart /= hexc_size")
712      read(eig_unt, err=10, iomsg=errmsg) !skip DCMPLX(exevl(1:hexc_size))
713 
714      ABI_MALLOC(buffer_dpc,(hexc_size))
715      do ii=1,nstates
716        read(eig_unt, err=10, iomsg=errmsg) buffer_dpc
717        phi_block(my_t1:my_t2,ii) = buffer_dpc(my_t1:my_t2)
718      end do
719      ABI_FREE(buffer_dpc)
720 
721      close(eig_unt, err=10, iomsg=errmsg)
722    else
723      call wrtout(std_out," Initializing eigenvectors from file: "//TRIM(ihexc_fname)//" using MPI-IO.","COLL")
724 #ifdef HAVE_MPI_IO
725      !
726      ! Open the file with MPI-IO
727      amode=MPI_MODE_RDONLY
728 
729      call MPI_FILE_OPEN(comm, ihexc_fname, amode, MPI_INFO_NULL, mpi_fh, mpi_err)
730      msg = " MPI_IO error opening file: "//TRIM(ihexc_fname)
731      ABI_CHECK_MPI(mpi_err,msg)
732 
733      ! Move the file pointer to skip the first two records.
734      ehdr_offset=0
735      call xmpio_read_frm(mpi_fh,ehdr_offset,xmpio_collective,fmarker,mpi_err)
736      write(std_out,*)"fmarker first record ",fmarker
737      call xmpio_read_frm(mpi_fh,ehdr_offset,xmpio_collective,fmarker,mpi_err)
738      write(std_out,*)"fmarker first record ",fmarker
739      !%call hdr_mpio_skip(mpi_fh,fform,ehdr_offset)
740      !%ehdr_offset = 4*xmpio_bsize_frm + xmpio_bsize_int + nstates*xmpio_bsize_dpc
741 
742      etype=MPI_BYTE; old_type=MPI_DOUBLE_COMPLEX
743 
744      my_nrows=my_t2-my_t1+1; old_type=MPI_DOUBLE_COMPLEX
745      array_of_sizes    = (/hexc_size,nstates/)
746      array_of_subsizes = (/my_nrows,nstates/)
747      array_of_starts   = (/my_t1,1/)
748      call xmpio_create_fsubarray_2D(array_of_sizes,array_of_subsizes,array_of_starts,old_type,eig_type,my_offpad,mpi_err)
749      ABI_CHECK_MPI(mpi_err,"fsubarray_2D")
750      !
751      ! Each node uses a different offset to skip the header and the blocks written by the other CPUs.
752      my_offset = ehdr_offset + my_offpad
753 
754      call MPI_FILE_SET_VIEW(mpi_fh, my_offset, etype, eig_type, 'native', MPI_INFO_NULL, mpi_err)
755      ABI_CHECK_MPI(mpi_err,"SET_VIEW")
756 
757      call MPI_TYPE_FREE(eig_type,mpi_err)
758      ABI_CHECK_MPI(mpi_err,"MPI_TYPE_FREE")
759 
760      my_nel = my_nrows*nstates
761      call MPI_FILE_READ_ALL(mpi_fh, phi_block, my_nel, MPI_DOUBLE_COMPLEX, MPI_STATUS_IGNORE, mpi_err)
762      ABI_CHECK_MPI(mpi_err,"FILE_READ")
763 
764      ! It seems that per-rank (non-collective) calls make the code hang.
765      ! Check the Fortran markers collectively.
766      ABI_MALLOC(bsize_frecord,(nstates))
767      bsize_frecord = hexc_size * xmpi_bsize_dpc
768      ! ehdr_offset points to the end of the header.
769      call xmpio_check_frmarkers(mpi_fh,ehdr_offset,xmpio_collective,nstates,bsize_frecord,ierr)
770      ABI_CHECK(ierr==0,"Error in Fortran markers")
771      ABI_FREE(bsize_frecord)
772      !
773      ! Close the file.
774      call MPI_FILE_CLOSE(mpi_fh, mpi_err)
775      ABI_CHECK_MPI(mpi_err,"FILE_CLOSE")
776 #else
777      ABI_ERROR("You should not be here")
778 #endif
779    end if
780 
781    call cwtime(cputime, walltime, gflops, "stop")
782    write(msg,'(2(a,f9.1),a)')" IO operation completed. cpu_time: ",cputime,"[s], walltime: ",walltime," [s]"
783    call wrtout(std_out, msg, "COLL", do_flush=.True.)
784  end if
785 
786  return
787 
788  ! Handle IO-error
789 10 continue
790  ABI_ERROR(errmsg)
791 
792 end subroutine exc_init_phi_block

m_exc_itdiago/exc_iterative_diago [ Functions ]

NAME

  exc_iterative_diago

FUNCTION

  Calculates eigenvalues and eigenvectors of the Hermitian excitonic Hamiltonian (coupling is neglected).

INPUTS

  Bsp
    %nreh=Rank of the resonant block of the Hamiltonian.
    %nstates=Number of eigenstates required.
    %nline=Max number of line minimizations.
    %tolwfr=Tolerance on the residuals.
    %nbdbuf=Number of states in the buffer excluded from the convergence check.
    %nstep=Maximum number of iterations.
  BS_files<excfiles>=Datatype storing names and files used in the Bethe-Salpeter code.
    %exh=Name of the file storing the excitonic resonant part.
    %out_eig=Name of the file where the final results are stored.
    %in_eig=Name of the file used to initialize the calculation.
  comm=MPI communicator.

OUTPUT

  Eigenvalues and eigenvectors are written to the file %out_eig.

NOTES

  Concerning the approach followed to parallelize this routine: the most important
  bottleneck is represented by the storage of the excitonic Hamiltonian, since a
  large number of k-points is needed to obtain converged exciton energies.
  The number of required eigenstates is usually much smaller than the rank of the full matrix;
  this is especially true if we are only interested in the binding energy of the
  exciton or in the excitonic states close to the single-particle gap.
  Good scaling and good performance should therefore be obtained by distributing
  the rows of the excitonic Hamiltonian among the nodes while the required
  eigenvectors are duplicated on each node. The memory needed to store the eigenvectors
  scales like nreh*nstates, where nreh is the rank of the Hamiltonian, and this might
  render the calculation unfeasible when nstates is large.
  On the other hand, having the complete set of trial eigenvectors on each node makes it possible
  to parallelize tasks such as the application of the Hamiltonian as well as the orthogonalization
  and the sub-space rotation; the latter two operations represent the most CPU-demanding part of
  standard KS calculations, as they scale with the third power of the number of atoms.
  The conjugate direction and the gradient, as well as Hphi, are not distributed, since the line
  minimization requires the evaluation of <cg_dir|H_exc|cg_dir>.
  Note that this routine has been written with a homogeneous network of machines in mind.
  A network made of different CPUs may lead to unpredictable results, as each node has
  to check the convergence of the calculation.
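
  For reference, the line minimization coded in the SOURCE below follows the cgwf recipe:
  with |phi> the current trial state and |d> the normalized conjugate direction (cg_dir),

    etrial = <phi|H_exc|phi>,  dhc = Re <phi|H_exc|d>,  dhd = <d|H_exc|d>
    tan(2*theta) = 2*dhc / (etrial - dhd)
    |phi_new> = cos(theta)*|phi> + sin(theta)*|d>

  As an illustration of the nreh*nstates scaling mentioned above: nreh = 10^5 transitions and
  nstates = 10^3 requested states correspond to 10^5 * 10^3 * 16 bytes ~ 1.6 GB for a single
  double-precision complex copy of the eigenvectors.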

SOURCE

105 subroutine exc_iterative_diago(BSp,BS_files,Hdr_bse,prtvol,comm)
106 
107 !Arguments ------------------------------------
108 !scalars
109  integer,intent(in) :: comm,prtvol
110  type(excparam),intent(in) :: BSp
111  type(excfiles),intent(in) ::  BS_files
112  type(Hdr_type),intent(in) :: Hdr_bse
113 
114 !Local variables ------------------------------
115 !scalars
116  integer,parameter :: STRICT=2,MEDIUM=1,WORST=0
117  integer,parameter :: master=0
118  integer(i8b) :: bsize_hmat,bsize_phi_block
119  integer :: hexc_size,nstates,nline,nbdbuf,nstep
120  integer :: cg_nsteps,my_t1,my_t2,my_nt
121  integer :: ii,jj,state,line,max_nline
122  integer :: cg_step,nbdbuf_,nsppol,ierr,nproc,my_rank
123  real(dp) :: exc_gap,exc_maxene,norm,etrial,etrial_old,deltae,tolwfr
124 ! real(dp) :: deold
125  real(dp) :: dhd,dhc,den,fac,poly,xx,root,swap,tan2th,diff,tolwfr_
126  !complex(dpc) :: cg_gamma,dotgg,old_dotgg
127  real(dp) :: cg_gamma,dotgg,old_dotgg
128  real(dp) :: max_resid,costh,sinth
129  complex(dpc) :: zz,kprc
130  logical,parameter :: DEBUGME=.False.
131  logical :: use_mpio,is_resonant,diago_is_real
132  character(len=500) :: msg
133  character(len=fnlen) :: hexc_fname,ihexc_fname,oeig_fname
134  type(stats_t) :: stats
135 !arrays
136  integer :: nline_for(Bsp%nstates),convergence_of(Bsp%nstates)
137  real(dp) :: resid(Bsp%nstates),exc_energy(Bsp%nstates),rbuf2(2)
138 ! real(dp),allocatable :: gsc(:,:),cg(:,:)
139  !complex,allocatable :: hexc(:,:)
140  complex(dpc),allocatable :: hexc(:,:),hji(:),vec_tmp(:)
141  complex(dpc),ABI_CONTIGUOUS pointer :: my_phi(:)
142  real(dp),allocatable :: hexc_diagonal(:)
143  complex(dpc),target,allocatable :: phi_block(:,:)
144  complex(dpc),allocatable :: hphi(:) !,buffer_dpc(:)
145  complex(dpc),allocatable :: cg_dir(:),grad_dir(:),prc_dir(:)
146  complex(dpc),allocatable :: old_cg_dir(:)
147 
148 !************************************************************************
149 
150  DBG_ENTER("COLL")
151 
152  if (Bsp%use_coupling>0) then
153    ABI_ERROR("CG Method does not support coupling")
154  end if
155 
156  nsppol = Hdr_bse%nsppol
157  if (Hdr_bse%nsppol == 2) then
158    ABI_WARNING("nsppol==2 with cg method is still under development")
159  end if
160 
161  nproc = xmpi_comm_size(comm); my_rank = xmpi_comm_rank(comm)
162 
163  use_mpio=.FALSE.
164 #ifdef HAVE_MPI_IO
165  use_mpio = (nproc > 1)
166 #endif
167  use_mpio=.FALSE.
168  !use_mpio = .TRUE.
169 
170  hexc_size = SUM(Bsp%nreh)
171  nstates= Bsp%nstates
172  nline  = Bsp%nline
173  nbdbuf = Bsp%nbdbuf
174  nstep  = Bsp%niter
175  tolwfr = Bsp%cg_tolwfr
176 
177  write(msg,'(a,i0)')' Iterative diagonalization of the resonant excitonic Hamiltonian, Matrix size= ',hexc_size
178  call wrtout(std_out,msg,"COLL")
179  call wrtout(ab_out,msg,"COLL")
180 
181  ABI_CHECK(hexc_size>=nproc,"hexc_size<nproc!")
183  ABI_CHECK(nstates <= hexc_size,"nstates cannot be greater than hexc_size!")
183 
184  ! Divide the columns of the Hamiltonian among the nodes.
185  call xmpi_split_work(hexc_size,comm,my_t1,my_t2)
186 
187  my_nt = my_t2-my_t1+1
188  write(msg,'(a,i0,a)')" Will handle ",my_nt," columns of the excitonic Hamiltonian. "
189  call wrtout(std_out,msg,"PERS")
190 
191  tolwfr_ = tolwfr
192  if (tolwfr < 10.0_dp**(-30)) then
193    tolwfr_ = tol12
194    write(msg,'(2(a,es12.4))')" Input tolwfr= ",tolwfr," Using tolwfr= ",tolwfr_
195    ABI_WARNING(msg)
196  end if
197 
198  cg_nsteps = nstep
199  if (cg_nsteps<=0) then
200    cg_nsteps = 30
201    write(msg,'(2(a,es12.4))')" Input nstep= ",nstep," Using cg_nsteps= ",cg_nsteps
202    ABI_WARNING(msg)
203  end if
204 
205  nbdbuf_ = nbdbuf
206  if (nbdbuf<=0) then
207    nbdbuf_ = 4
208    write(msg,'(2(a,i0))')" Input nbdbuf= ",nbdbuf," Using nbdbuf= ",nbdbuf_
209    ABI_WARNING(msg)
210  end if
211 
212  write(msg,"(4(a,i0,a),a,es12.4)")&
213 &  " cg_nsteps: ",cg_nsteps,ch10,&
214 &  " nstates:   ",nstates,ch10,&
215 &  " nline:     ",nline,ch10,&
216 &  " nbdbuf:    ",nbdbuf_,ch10,&
217 &  " tolwfr:    ",tolwfr_
218  call wrtout(std_out,msg,"COLL")
219  call wrtout(ab_out,msg,"COLL")
220 
221  bsize_hmat = 2*dpc*hexc_size*my_nt
222  write(msg,'(a,f8.1,a)')' Allocating excitonic Hamiltonian. Memory requested: ',bsize_hmat*b2Mb,' Mb.'
223  call wrtout(std_out,msg,"COLL")
224 
225  ABI_MALLOC(hexc_diagonal,(my_t1:my_t2))
226  ABI_MALLOC_OR_DIE(hexc,(hexc_size,my_t1:my_t2), ierr)
227  !
228  ! Read and construct full excitonic Hamiltonian using Hermiticity.
229  if (BS_files%in_hreso /= BSE_NOFILE) then
230    hexc_fname = BS_files%in_hreso
231  else
232    hexc_fname = BS_files%out_hreso
233  end if
234  !
235  ! Read the resonant block from file.
236  is_resonant=.TRUE.
237  diago_is_real=(.not.BSp%have_complex_ene)
238  call exc_read_rcblock(hexc_fname,Bsp,is_resonant,diago_is_real,nsppol,Bsp%nreh,hexc_size,my_t1,my_t2,hexc,use_mpio,comm)
239  !
240  ! Save diagonal part for preconditioning.
241  do jj=my_t1,my_t2
242    hexc_diagonal(jj) = REAL(hexc(jj,jj),kind=dp)
243  end do
244  !
245  ! === Initialisation of the excitonic wavefunctions ===
246  ! Two cases are possible.
247  !   1) Fill trial eigenvectors with random numbers
248  !      One needs to initialize wfs in such a way to avoid symmetry traps,
249  !      and to avoid linear dependencies between wavefunctions
250  !   2) Read eigenstates generated by a previous calculation.
251 
252  bsize_phi_block = 2*dpc*my_nt*nstates
253  write(msg,'(a,f8.1,a)')' Allocating BSE eigenvectors. Memory requested: ',bsize_phi_block*b2Mb,' Mb.'
254  call wrtout(std_out,msg,"COLL",do_flush=.True.)
255 
256  ABI_MALLOC_OR_DIE(phi_block,(my_t1:my_t2,nstates), ierr)
257 
258  ihexc_fname = ""
259  if (BS_files%in_eig /= BSE_NOFILE) ihexc_fname = BS_files%in_eig
260 
261  call exc_init_phi_block(ihexc_fname,use_mpio,comm)
262  !
263  ! =========================
264  ! === Orthogonalization ===
265  ! =========================
266  call exc_cholesky_ortho()
267  call exc_check_phi_block("First cholesky ortho")
268 
269  ! * Sub-space rotation.
270  call exc_subspace_rotation()
271  call exc_check_phi_block("First subspace_rotation")
272  !
273  ! ===========================
274  ! ==== Conjugate gradient ===
275  ! ===========================
276  ABI_MALLOC(hphi,(hexc_size))
277  ABI_MALLOC(cg_dir,(hexc_size))
278  ABI_MALLOC(old_cg_dir,(hexc_size))
279  ABI_MALLOC(grad_dir,(hexc_size))
280  ABI_MALLOC(prc_dir,(hexc_size))
281 
282  max_nline=nline
283  resid(:)=HUGE(one); nline_for(1:nstates)=max_nline; convergence_of(1:nstates)=WORST
284 
285  do cg_step=1,cg_nsteps
286    do state=1,nstates
287 
288      if (prtvol>=10) then ! Tell us what is going on:
289        write(msg,'(a,i6,2x,a,i3,a)')' --- exc_iterative_diago is called for state ',state,'for',nline_for(state),' lines'
290        call wrtout(std_out,msg,'PERS')
291      end if
292 
293      ! Extraction of the vector that is iteratively updated.
294      my_phi => phi_block(my_t1:my_t2,state)
295 
296      do line=1,nline_for(state)
297        ! Compute etrial=<phi|H|phi> and the residual [H-etrial]|phi>.
298        hphi = czero
299 #ifdef FC_NVHPC
300 !Buggy NVHPC compiler
301        do jj=1,my_t2-my_t1+1;do ii=1,hexc_size
302          hphi(ii)=hphi(ii)+hexc(ii,jj+my_t1-1)*my_phi(jj)
303        enddo ; enddo
304 #else
305        hphi = MATMUL(hexc, my_phi)
306 #endif
307        call xmpi_sum(hphi,comm,ierr)
308 
309        etrial = DOT_PRODUCT(my_phi, hphi(my_t1:my_t2))
310        call xmpi_sum(etrial,comm,ierr)
311        exc_energy(state) =  etrial
312 
313        ! Compute residual (squared) norm.
314        grad_dir(my_t1:my_t2) = hphi(my_t1:my_t2) - etrial*my_phi
315        resid(state) =  DOT_PRODUCT(grad_dir(my_t1:my_t2), grad_dir(my_t1:my_t2))
316        call xmpi_sum(resid(state),comm,ierr)
317        convergence_of(state) = convergence_degree(resid(state))
318        !
319        ! Check that etrial is decreasing on succeeding lines:
320        if (line>1 .and. (etrial > etrial_old+tol12)) then
321          write(msg,'(a,i8,a,1p,e14.6,a1,3x,a,1p,e14.6,a1)')&
322 &          'New trial exc_energy at line ',line,' = ',etrial,ch10,&
323 &          'is higher than former:',etrial_old,ch10
324          ABI_WARNING(msg)
325        end if
326        etrial_old = etrial
327        !
328        ! If residual sufficiently small stop line minimization.
329        if (convergence_of(state)==STRICT) then
330          if (prtvol>=10) then
331            write(msg,'(a,i4,a,i2,a,es12.4)')&
332 &            ' exc_iterative_diago: state ',state,' converged after ',line,&
333 &            ' line minimizations : resid =',resid(state)
334            call wrtout(std_out,msg,'PERS')
335          end if
336          EXIT !line
337        end if
338 
339        ! === PROJECT THE STEEPEST DESCENT DIRECTION OVER THE SUBSPACE ORTHOGONAL TO OTHER BANDS ===
340        ! The following projection over the subspace orthogonal to occupied bands
341        ! is optional. It is a bit more accurate, but doubles the number of N^3 ops.
342        ! It is done only if ortalg>=0.
343 
344        ! Project the steepest descent direction: direc(2,npw)=<G|H|Cnk> - \sum_{(i<=n)} <G|H|Cik> , normalized.
345 
346        ! Grad_dir is already orthogonal to this band
347        ABI_MALLOC(hji,(nstates))
348        hji=czero
349 
350        ! MG TODO: Don't know why we sum over i<=n here!
351        do jj=1,nstates
352          if (jj/=state) hji(jj) = DOT_PRODUCT(phi_block(:,jj), hphi(my_t1:my_t2) )
353        end do
354        call xmpi_sum(hji,comm,ierr)
355 
356        do jj=1,nstates
357          if (jj/=state) grad_dir(my_t1:my_t2) = grad_dir(my_t1:my_t2) - hji(jj)*phi_block(:,jj)
358        end do
359        ABI_FREE(hji)
360        !
361        ! === PRECONDITION THE STEEPEST DESCENT DIRECTION ===
362        den = DOT_PRODUCT(grad_dir(my_t1:my_t2), hexc_diagonal(my_t1:my_t2)*grad_dir(my_t1:my_t2) )
363        call xmpi_sum(den,comm,ierr)
364 
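           ! The loop below applies the Teter polynomial ratio with x = hexc_diagonal(ii)/den:
           !   poly = 27 + 18*x + 12*x**2 + 8*x**3
           !   kprc = poly/(poly + 16*x**4) * 4/(3*den)
           ! i.e. kprc ~ 4/(3*den) for small x and goes to zero for large x.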
365        do ii=my_t1,my_t2
366          ! Teter polynomial ratio, modified according to Kresse, Furthmuller, PRB 54, 11169 (1996) [[cite:Kresse1996]]
367          xx = hexc_diagonal(ii)/den
368          poly=27._dp+xx*(18._dp+xx*(12._dp+xx*8._dp))
369          fac=poly/(poly+16._dp*xx**4)
370          kprc = fac*four/(three*den)
371          prc_dir(ii) = kprc * grad_dir(ii)
372        end do
373        !
374        ! * PROJECT THE PRECOND. STEEPEST DESCENT DIRECTION OVER THE SUBSPACE ORTHOGONAL TO OTHER BANDS.
375        ABI_MALLOC(hji,(nstates))
376        hji=czero
377        do jj=1,nstates
378          hji(jj) = DOT_PRODUCT(phi_block(:,jj), prc_dir(my_t1:my_t2) )
379        end do
380        call xmpi_sum(hji,comm,ierr)
381 
382        do jj=1,nstates
383          prc_dir(my_t1:my_t2) = prc_dir(my_t1:my_t2) - hji(jj)*phi_block(:,jj)
384        end do
385        ABI_FREE(hji)
386        !
387        ! === COMPUTE THE CONJUGATE-GRADIENT ===
388        dotgg = DOT_PRODUCT(prc_dir(my_t1:my_t2),grad_dir(my_t1:my_t2))
389        call xmpi_sum(dotgg,comm,ierr)
390 
391        if (line==1) then ! At first iteration, cg_gamma is set to zero
392          cg_gamma=zero
393          old_dotgg=dotgg
394          cg_dir = prc_dir
395          old_cg_dir = cg_dir
396        else
397          cg_gamma=dotgg/old_dotgg
398          old_dotgg=dotgg
399          !write(std_out,*)"cg_gamma= ",cg_gamma
400          !cg_dir = prc_dir + cg_gamma*cg_dir
401          cg_dir = prc_dir + cg_gamma*old_cg_dir !TODO check this, anyhow it is much faster.
402        old_cg_dir = cg_dir  ! old_cg_dir stores the previous CG direction; cg_dir will be orthonormalized to the band
403        end if
404        !
405        ! === PROJECTION OF THE CONJUGATED GRADIENT ===
406        zz = DOT_PRODUCT(my_phi, cg_dir(my_t1:my_t2))
407        call xmpi_sum(zz,comm,ierr)
408        cg_dir(my_t1:my_t2) = cg_dir(my_t1:my_t2) -zz*my_phi(:)
409 
410        norm = DOT_PRODUCT(cg_dir(my_t1:my_t2), cg_dir(my_t1:my_t2) )
411        call xmpi_sum(norm,comm,ierr)
412        norm = SQRT(norm)
413        cg_dir = cg_dir/norm ! Have to normalize it.
414 
415        ! Line minimization of the Rayleigh quotient.
416        ABI_MALLOC(vec_tmp,(hexc_size))
417        vec_tmp=czero
418 #ifdef FC_NVHPC
419 !Buggy NVHPC compiler
420        do jj=my_t1,my_t2;do ii=1,hexc_size
421          vec_tmp(ii)=vec_tmp(ii)+hexc(ii,jj)*cg_dir(jj)
422        enddo ; enddo
423 #else
424        vec_tmp = MATMUL(hexc, cg_dir(my_t1:my_t2))
425 #endif
426        call xmpi_sum(vec_tmp,comm,ierr)
427 
428        !if (my_rank==master) then
429        !  write(777,*)"cg_step, state, line",cg_step, state, line
430        !  write(777,*)vec_tmp
431        !end if
432 
433        dhd = DOT_PRODUCT( cg_dir(my_t1:my_t2), vec_tmp(my_t1:my_t2))  ! is this always real?
434        dhc = REAL( DOT_PRODUCT( my_phi, vec_tmp(my_t1:my_t2) ))
435        ABI_FREE(vec_tmp)
436 
437        rbuf2 = (/dhd,dhc/)
438        call xmpi_sum(rbuf2,comm,ierr)
439        dhd = rbuf2(1)
440        dhc = rbuf2(2)
441 
442        !write(201*(my_rank+1),*)"cg_step, state, line dotgg dhd dhc",cg_step,state,line,dotgg,dhd,dhc
443 
444 ! Taken from cgwf
445        ! Compute tan(2 theta),sin(theta) and cos(theta)
446        tan2th=2.0_dp*dhc/(etrial-dhd)
447 
448        if (abs(tan2th)<1.d-05) then
449          costh=1.0_dp-0.125_dp*tan2th**2
450          sinth=0.5_dp*tan2th*(1.0_dp-0.375_dp*tan2th**2)
451          ! Check that result is above machine precision
452          ! FIXME: This part is not safe on clusters made of different machines or with different compilation options.
453          if (abs(sinth)<epsilon(0._dp)) then
454            write(msg, '(a,es16.4)' ) ' exc_iterative_diago: converged with tan2th= ',tan2th
455            call wrtout(std_out,msg,'PERS')
456            EXIT !Exit from the loop on line
457          end if
458 
459        else
460          root =sqrt(1.0_dp+tan2th**2)
461          costh=sqrt(0.5_dp+0.5_dp/root)
462          sinth=sign(sqrt(0.5_dp-0.5_dp/root),tan2th)
463        end if
464        !
465        ! Check for lower of two possible roots (same sign as curvature at theta where slope is zero)
466        diff=(etrial-dhd)
467        if (diff>zero) then !   Swap c and d if value of diff is positive
468          swap=costh
469          costh=-sinth
470          sinth=swap
471          if (prtvol<0 .or. prtvol>=10) then
472            write(msg,'(a,i4,es16.6)')' exc_iterative_diago: swap roots, line,diff= ',line,diff
473            call wrtout(std_out,msg,'PERS')
474          end if
475        end if
476        !
477        ! === GENERATE NEW |wf>, H|wf>  =============
478        my_phi = costh*my_phi + sinth*cg_dir(my_t1:my_t2)
479        !write(100*(my_rank+1),*)"cg_step state, line costh sinth etrial",cg_step,state,line,costh,sinth,etrial
480 !end  taken from cgwf
481 
482        !norm = SQRT( DOT_PRODUCT(my_phi,my_phi) )
483        !my_phi = my_phi /norm
484        !write(std_out,*)norm
485        !write(std_out,*)DOT_PRODUCT(hphi,my_phi),cos(theta_min)
486 
487        ! ======================================================================
488        ! =========== CHECK CONVERGENCE AGAINST TRIAL ENERGY ===================
489        ! ======================================================================
490        ! Compute delta(E)
491        !deltae=chc*(costh**2-1._dp)+dhd*sinth**2+2._dp*costh*sinth*dhc
492        deltae=etrial*(costh**2-1._dp)+dhd*sinth**2+2._dp*costh*sinth*dhc
493 
494        ! Check convergence and eventually exit
495        ! if (line==1) then
496        !   deold=deltae
497        ! else if (abs(deltae)<0.005_dp*abs(deold) .and. line/=nline_for(state))then
498        !  if (prtvol>=10)then
499        !   write(msg, '(a,i4,1x,a,1p,e12.4,a,e12.4,a)' ) &
500        !&   ' cgwf: line',line,&
501        !&   ' deltae=',deltae,' < 0.005*',deold,' =>skip lines'
502        !   call wrtout(std_out,msg,'PERS')
503        !  end if
504        !  exc_energy(state) = exc_energy(state) + deltae
505        !  EXIT
506        ! end if
507      end do ! LOOP FOR A GIVEN BAND. Note that there are three "exit" instructions inside
508      ! Modify nline_for(state) according to converge degree.
509      !if (convergence_of(state) == STRICT) nline_for(state) = MAX(max_nline-2,2)
510      !if (convergence_of(state) == MEDIUM) nline_for(state) = MAX(max_nline-1,2)
511      !if (convergence_of(state) == WORST ) nline_for(state) = max_nline
512    end do !state
513 
514    if (prtvol>2) then
515      do ii=0,(nstates-1)/8
516        write(msg,'(a,8es10.2)')' res:',(resid(state),state=1+ii*8,MIN(nstates,8+ii*8))
517        call wrtout(std_out,msg,'COLL')
518      end do
519      do ii=0,(nstates-1)/8
520        write(msg,'(a,8es10.2)')' ene:',(exc_energy(state),state=1+ii*8,MIN(nstates,8+ii*8))
521        call wrtout(std_out,msg,'COLL')
522      end do
523    end if
524 
525    write(msg,'(a,i0)')"After cg_step: ",cg_step
526    call exc_check_phi_block(msg)
527 
528    ! Find largest residual over bands and Print residuals
529    max_resid=MAXVAL( resid(:MAX(1,nstates-nbdbuf_)) )
530 
531    if (max_resid < tolwfr_) then
532      write(msg,'(a,i0,2(a,es10.2),a,i0,a)')&
533 &      " After ",cg_step," iterations, max_resid= ",max_resid," < tolwfr= ",tolwfr_," ( Excluding nbdbuf= ",nbdbuf_,")"
534      call wrtout(std_out,msg,'COLL')
535      EXIT ! cg_step
536    end if
537 
538    if (cg_step==1.or.MOD(cg_step,1)==0) then ! NB: MOD(cg_step,1)==0 is always true, so this is done at every step
539      call wrtout(std_out," Subspace rotation + exc_cholesky_ortho ","COLL")
540 
541      call exc_subspace_rotation()
542      call exc_cholesky_ortho()
543 
544      !mcg=hexc_size; mgsc=hexc_size; useoverlap=0
545      !allocate(cg(2,mcg),gsc(2,mgsc*useoverlap))
546      !do ii=1,nstates
547      ! cg(1,:) = REAL (phi_block(:,ii))
548      ! cg(2,:) = AIMAG(phi_block(:,ii))
549      ! call fxphas(cg,gsc,0,0,1,mcg,mgsc,MPI_enreg_seq,1,hexc_size,useoverlap)
550      ! phi_block(:,ii)=CMPLX(cg(1,:),cg(2,:))
551      !end do
552      !deallocate(cg,gsc)
553 
554    end if
555 
556 !XG20141126 : At present, keep this fake test. Seems that there is some bug on shiva
557 !(compiler ?) so that the result is erroneous without it. Even suppressing the
558 !"stop" instruction triggers the bug ...
559      if(cg_step==cg_nsteps+1)then
560        write(std_out,*)' One should not be here : cg_step==cg_nsteps+1 '
561        stop
562      endif
563 
564  end do !cg_step
565 
566  ! Release some memory before entering RMM-DIIS
567  ABI_FREE(hphi)
568  ABI_FREE(cg_dir)
569  ABI_FREE(old_cg_dir)
570  ABI_FREE(grad_dir)
571  ABI_FREE(prc_dir)
572 
573  do ii=0,(nstates-1)/8
574    write(msg,'(a,8es10.2)')' res:',(resid(state),state=1+ii*8,min(nstates,8+ii*8))
575    call wrtout(std_out,msg,'COLL')
576  end do
577  do ii=0,(nstates-1)/8
578    write(msg,'(a,8es10.2)')' ene:',(exc_energy(state),state=1+ii*8,min(nstates,8+ii*8))
579    call wrtout(std_out,msg,'COLL')
580  end do
581 
582  stats = stats_eval(resid)
583 
584  write(msg,"(4(a,es10.2))")&
585 &  ". Residuals: min value: ",stats%min,", Max value: ",stats%max,", mean: ",stats%mean,", stdev: ",stats%stdev
586  call wrtout(std_out,msg)
587  call wrtout(ab_out,msg)
588 
589  if (max_resid > tolwfr_) then
590    write(msg,'(2a,i5,2a,2(a,es10.2),a,i3,a)')ch10,&
591 &    " WARNING: conjugate-gradient not converged after ",cg_step," iterations.",ch10,&
592 &    " max_resid= ",max_resid," > tolwfr= ",tolwfr_," ( Excluding nbdbuf= ",nbdbuf_,")"
593    call wrtout(ab_out,msg,'COLL')
594    call wrtout(std_out,msg,'COLL')
595  end if
596 
597  exc_gap    = MINVAL(exc_energy)
598  exc_maxene = MAXVAL(exc_energy)
599 
600  write(msg,'(a,2(a,f7.2,2a))')ch10,&
601 &  " First excitonic eigenvalue= ",exc_gap*Ha_eV,   " [eV]. ",ch10,&
602 &  " Last  excitonic eigenvalue= ",exc_maxene*Ha_eV," [eV]. ",ch10
603  call wrtout(std_out,msg,"COLL")
604  call wrtout(ab_out,msg,"COLL")
605 
606  call exc_check_phi_block("END OF CONJUGATE-GRADIENT")
607 
608  ABI_FREE(hexc)
609  ABI_FREE(hexc_diagonal)
610  !
611  ! =====================================
612  ! ==== Write final results on file ====
613  ! =====================================
614  oeig_fname = BS_files%out_eig
615  if (oeig_fname== BSE_NOFILE) then
616    ABI_WARNING("oeig_fname was set to "//TRIM(BSE_NOFILE))
617    oeig_fname = TRIM(BS_files%out_basename)//"_BSEIG"
618    ABI_WARNING("using oeig_fname : "//TRIM(oeig_fname))
619  end if
620 
621  call exc_write_phi_block(oeig_fname,use_mpio)
622 
623  ABI_FREE(phi_block)
624 
625  call xmpi_barrier(comm)
626 
627  DBG_EXIT("COLL")
628 
629 CONTAINS  !===========================================================

m_exc_itdiago/exc_subspace_rotation [ Functions ]

NAME

 exc_subspace_rotation

FUNCTION

  This routine performs the subspace rotation (Rayleigh-Ritz step): the trial eigenstates are
  rotated with the eigenvectors of the Hamiltonian projected onto the subspace they span.
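
  A minimal single-process sketch of this Rayleigh-Ritz step, with hypothetical sizes, a dense
  random Hermitian matrix in place of the distributed hexc, and plain LAPACK zheev instead of
  the packed xhpev wrapper used below:

    program subspace_rotation_sketch
      implicit none
      integer, parameter :: dp = kind(1.0d0)
      integer, parameter :: n = 8, m = 3        ! hypothetical: hexc_size, nstates
      complex(dp) :: h(n,n), phi(n,m), sub_ham(m,m), work(4*m)
      real(dp)    :: ene(m), rwork(3*m-2), re(n,n), im(n,n), rp(n,m), ip(n,m)
      integer     :: info

      ! Random Hermitian test matrix and trial vectors (already orthonormal in the real routine)
      call random_number(re); call random_number(im)
      h = cmplx(re, im, kind=dp); h = (h + conjg(transpose(h)))/2
      call random_number(rp); call random_number(ip)
      phi = cmplx(rp, ip, kind=dp)

      ! sub_ham_ij = <phi_i|H|phi_j>: the small nstates x nstates subspace Hamiltonian
      sub_ham = matmul(conjg(transpose(phi)), matmul(h, phi))

      ! Diagonalize the subspace Hamiltonian
      call zheev('V', 'U', m, sub_ham, m, ene, work, size(work), rwork, info)
      if (info /= 0) stop 'ZHEEV failed'

      ! Rotate: the new trial vectors are linear combinations of the old ones
      phi = matmul(phi, sub_ham)
      write(*,*) 'subspace eigenvalues: ', ene
    end program subspace_rotation_sketch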

SIDE EFFECTS

   phi_block(my_t1:my_t2,nstates)=Contains the trial eigenstates.

SOURCE

 960 subroutine exc_subspace_rotation()
 961 
 962 !Local variables ------------------------------
 963  integer :: ii,jj,ipack,ierr
 964 !arrays
 965  real(dp),allocatable :: sub_ene(:)
 966 ! real(dp),allocatable :: evec(:,:)
 967  complex(dpc),allocatable :: sub_ham(:,:),sub_pham(:),hphi_tot(:)
 968 ! complex(dpc),allocatable :: phi_tmp(:,:)
 969 
 970 !************************************************************************
 971 
 972  ! * Sub-space rotation. Calculate <phi_i|H|phi_j> in packed form.
 973  ! TODO: this part can be rewritten using BLAS3 routines.
 974 
 975  ABI_MALLOC(hphi_tot,(hexc_size))
 976  ABI_MALLOC(sub_pham,(nstates*(nstates+1)/2))
 977  sub_pham=czero; ipack=0
 978 
 979  do jj=1,nstates
 980    hphi_tot = czero
 981    hphi_tot(:) = MATMUL(hexc, phi_block(:,jj))
 982    call xmpi_sum(hphi_tot,comm,ierr)
 983 
 984    do ii=1,jj
 985      ipack=ipack+1
 986      sub_pham(ipack) = DOT_PRODUCT(phi_block(my_t1:my_t2,ii), hphi_tot(my_t1:my_t2) )
 987      if (ii==jj) sub_pham(ipack) = REAL(sub_pham(ipack),kind=dp)
 988    end do
 989  end do
 990  call xmpi_sum(sub_pham,comm,ierr)
 991 
 992  ABI_MALLOC(sub_ham,(nstates,nstates))
 993  sub_ham=czero
 994  ABI_MALLOC(sub_ene,(nstates))
 995 
 996  call xhpev("Vectors","Upper",nstates,sub_pham,sub_ene,sub_ham,nstates) !,comm)
 997 
 998  ABI_FREE(hphi_tot)
 999  ABI_FREE(sub_pham)
1000  ABI_FREE(sub_ene)
1001 
1002  !do ii=1,nstates
1003  ! norm = DOT_PRODUCT(sub_ham(:,ii),sub_ham(:,ii))
1004  ! write(std_out,*)"norm subspac",norm
1005  ! sub_ham(:,ii) = sub_ham(:,ii)/norm
1006  !end do
1007 
1008  !allocate(evec(2*nstates,nstates))
1009 
1010  !do ii=1,nstates
1011  ! do jj=1,nstates
1012  ! evec(jj,  ii) = REAL (sub_ham(jj,ii))
1013  ! evec(jj+1,ii) = AIMAG(sub_ham(jj,ii))
1014  ! end do
1015  !end do
1016 
1017  !call normev(evec,nstates,nstates)
1018 
1019  !do ii=1,nstates
1020  ! do jj=1,nstates
1021  !  sub_ham(jj,ii) = CMPLX( evec(jj,ii),evec(jj+1,ii) )
1022  ! end do
1023  !end do
1024  !deallocate(evec)
1025 
1026 #if 0
1027  ABI_MALLOC(phi_tmp,(my_nt,nstates))
1028  phi_tmp = phi_block
1029 
1030  call ZGEMM('N','N',my_nt,nstates,nstates,cone,phi_tmp,my_nt,sub_ham,nstates,czero,phi_block,my_nt)
1031 
1032  ABI_FREE(phi_tmp)
1033 #else
1034  phi_block = MATMUL(phi_block,sub_ham)
1035 #endif
1036 
1037  ABI_FREE(sub_ham)
1038 
1039 end subroutine exc_subspace_rotation

m_exc_itdiago/exc_write_phi_block [ Functions ]

NAME

 exc_write_phi_block

FUNCTION

  Write phi_block to the output file oeig_fname, using either plain Fortran IO or MPI-IO.
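
  For reference, the Fortran-IO branch below produces a sequential unformatted file with the
  following record layout (the MPI-IO branch writes the same eigenvector records, but its
  header contains only nstates and the energies):

    record 1             : do_ep_lifetime                       (logical)
    record 2             : hexc_size, nstates                   (integers)
    record 3             : exc_energy(1:nstates)                (double complex)
    records 4..nstates+3 : one eigenvector of length hexc_size  (double complex, one per state)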

SOURCE

806 subroutine exc_write_phi_block(oeig_fname,use_mpio)
807 
808 !Arguments ------------------------------------
809 !scalars
810  character(len=*),intent(in) :: oeig_fname
811  logical,intent(in) :: use_mpio
812 
813 !Local variables ------------------------------
814  integer :: eig_unt,state,mpi_err !,fform
815  real(dp) :: cputime,walltime,gflops
816  character(len=500) :: msg,errmsg
817 ! type(Hdr_type) :: hexc_Hdr
818  logical :: do_ep_lifetime
819 !!arrays
820  complex(dpc),allocatable :: buffer_dpc(:)
821 #ifdef HAVE_MPI_IO
822  integer:: amode,mpi_fh,old_type,etype,eig_type,my_nel,ierr,my_nrows
823  integer(XMPI_OFFSET_KIND) :: ehdr_offset,my_offset,my_offpad,fmarker
824  integer(XMPI_OFFSET_KIND),allocatable :: bsize_frecord(:)
825  integer :: array_of_sizes(2),array_of_subsizes(2),array_of_starts(2)
826 #endif
827 
828 !************************************************************************
829 
830  do_ep_lifetime = .FALSE.
831 
832  call cwtime(cputime, walltime, gflops, "start")
833 
834  if (.not.use_mpio) then
835 
836    ! * Master writes the header.
837    if (my_rank==master) then
838      call wrtout(std_out," Writing eigenstates on file "//TRIM(oeig_fname)//" via Fortran-IO","COLL")
839      if (open_file(oeig_fname,msg, newunit=eig_unt, form='unformatted') /= 0) then
840        ABI_ERROR(msg)
841      end if
842      write(eig_unt, err=10, iomsg=errmsg) do_ep_lifetime
843      write(eig_unt, err=10, iomsg=errmsg) hexc_size, nstates
844      write(eig_unt, err=10, iomsg=errmsg) CMPLX(exc_energy(1:nstates),kind=dpc)
845    end if
846 
847    ! Wavefunctions are gathered on the master node band-by-band.
848    ! TODO bands should be treated in blocks to minimize the number of MPI calls.
849    ABI_MALLOC_OR_DIE(buffer_dpc,(hexc_size), ierr)
850 
851    do state=1,nstates
852      buffer_dpc=czero
853      buffer_dpc(my_t1:my_t2) = phi_block(:,state)
854      call xmpi_sum_master(buffer_dpc,master,comm,mpi_err)
855      if (my_rank==master) write(eig_unt, err=10, iomsg=errmsg) buffer_dpc(1:hexc_size)
856    end do
857    ABI_FREE(buffer_dpc)
858 
859    if (my_rank==master) close(eig_unt, err=10, iomsg=errmsg)
860 
861  else
862 #ifdef HAVE_MPI_IO
863    call wrtout(std_out," Writing eigenstates on file "//TRIM(oeig_fname)//" with MPI-IO","COLL")
864 
865    ! Write the header.
866    if (my_rank==master) then
867      ! Write header using Fortran primitives.
868      if (open_file(oeig_fname,msg,newunit=eig_unt,form='unformatted') /= 0) then
869        ABI_ERROR(msg)
870      end if
871      write(eig_unt, err=10, iomsg=errmsg) nstates
872      write(eig_unt, err=10, iomsg=errmsg) CMPLX(exc_energy(1:nstates),kind=dpc)
873      ! TODO: change setup_bse so that Hdr_bse reflects the parameters of the run.
874      close(eig_unt, err=10, iomsg=errmsg)
875    end if
876 
877    call xmpi_barrier(comm)
878 
879    ! Open the file with MPI-IO
880    amode=MPI_MODE_RDWR
881 
882    call MPI_FILE_OPEN(comm, oeig_fname, amode, MPI_INFO_NULL, mpi_fh, mpi_err)
883    msg = " MPI_IO error opening file: "//TRIM(oeig_fname)
884    ABI_CHECK_MPI(mpi_err,msg)
885 
886    ! Move the file pointer to skip the first two records.
887    ehdr_offset=0
888    call xmpio_read_frm(mpi_fh,ehdr_offset,xmpio_collective,fmarker,mpi_err)
889    !write(std_out,*)"fmarker first record ",fmarker
890    call xmpio_read_frm(mpi_fh,ehdr_offset,xmpio_collective,fmarker,mpi_err)
891    !write(std_out,*)"fmarker first record ",fmarker
892    !$call hdr_mpio_skip(mpi_fh,fform,ehdr_offset)
893    !$ehdr_offset = 4*xmpio_bsize_frm + xmpio_bsize_int + nstates*xmpio_bsize_dpc
894 
895    etype=MPI_BYTE; old_type=MPI_DOUBLE_COMPLEX
896 
897    my_nrows=my_t2-my_t1+1; old_type=MPI_DOUBLE_COMPLEX
898    array_of_sizes    = (/hexc_size,nstates/)
899    array_of_subsizes = (/my_nrows,nstates/)
900    array_of_starts   = (/my_t1,1/)
901    call xmpio_create_fsubarray_2D(array_of_sizes,array_of_subsizes,array_of_starts,old_type,eig_type,my_offpad,mpi_err)
902    ABI_CHECK_MPI(mpi_err,"fsubarray_2D")
903    !
904    ! Each node uses a different offset to skip the header and the blocks written by the other CPUs.
905    my_offset = ehdr_offset + my_offpad
906 
907    call MPI_FILE_SET_VIEW(mpi_fh, my_offset, etype, eig_type, 'native', MPI_INFO_NULL, mpi_err)
908    ABI_CHECK_MPI(mpi_err,"SET_VIEW")
909 
910    call MPI_TYPE_FREE(eig_type,mpi_err)
911    ABI_CHECK_MPI(mpi_err,"MPI_TYPE_FREE")
912 
913    my_nel = my_nrows*nstates
914    call MPI_FILE_WRITE_ALL(mpi_fh, phi_block, my_nel, MPI_DOUBLE_COMPLEX, MPI_STATUS_IGNORE, mpi_err)
915    ABI_CHECK_MPI(mpi_err,"FILE_WRITE")
916 
917    ! It seems that per-rank (non-collective) calls make the code hang.
918    ABI_MALLOC(bsize_frecord,(nstates))
919    bsize_frecord = hexc_size * xmpi_bsize_dpc
920    ! ehdr_offset points to the end of the header.
921    call xmpio_write_frmarkers(mpi_fh,ehdr_offset,xmpio_collective,nstates,bsize_frecord,ierr)
922    ABI_CHECK(ierr==0,"Error while writing Fortran markers")
923    ABI_FREE(bsize_frecord)
924    !
925    ! Close the file.
926    call MPI_FILE_CLOSE(mpi_fh, mpi_err)
927    ABI_CHECK_MPI(mpi_err,"FILE_CLOSE")
928 #else
929    ABI_ERROR("MPI-IO support not enabled")
930 #endif
931  end if
932 
933  call cwtime(cputime, walltime, gflops, "stop")
934  write(msg,'(2(a,f9.1),a)')" IO operation completed. cpu_time: ",cputime,"[s], walltime: ",walltime," [s]"
935  call wrtout(std_out, msg, "COLL", do_flush=.True.)
936 
937  return
938 
939  ! Handle IO-error
940 10 continue
941  ABI_ERROR(errmsg)
942 
943 end subroutine exc_write_phi_block