TABLE OF CONTENTS
- ABINIT/m_exc_itdiago
- m_exc_itdiago/convergence_degree
- m_exc_itdiago/exc_check_phi_block
- m_exc_itdiago/exc_cholesky_ortho
- m_exc_itdiago/exc_init_phi_block
- m_exc_itdiago/exc_iterative_diago
- m_exc_itdiago/exc_subspace_rotation
- m_exc_itdiago/exc_write_phi_block
ABINIT/m_exc_itdiago [ Modules ]
NAME
m_exc_itdiago
FUNCTION
Iterative diagonalization of the BSE Hamiltonian with the band-by-band conjugate-gradient method.
COPYRIGHT
Copyright (C) 2008-2024 ABINIT group (MG) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
#if defined HAVE_CONFIG_H
#include "config.h"
#endif

#include "abi_common.h"


MODULE m_exc_itdiago

 use defs_basis
 use m_bs_defs
 use m_errors
 use m_abicore
 use m_linalg_interfaces
 use m_hdr
 use m_xmpi
#ifdef HAVE_MPI2
 use mpi
#endif

 use m_io_tools,      only : open_file
 use m_time,          only : cwtime
 use m_numeric_tools, only : stats_t, stats_eval
 use m_hide_lapack,   only : xhpev !xheev,
 use m_bse_io,        only : exc_read_rcblock

 implicit none

 private

#ifdef HAVE_MPI1
 include 'mpif.h'
#endif

 public :: exc_iterative_diago  ! Calculates eigenvalues and eigenvectors of the Resonant BSE Hamiltonian
m_exc_itdiago/convergence_degree [ Functions ]
[ Top ] [ m_exc_itdiago ] [ Functions ]
NAME
convergence_degree
FUNCTION
Return the degree of convergence from the input residual.
INPUTS
resid=Residual.
OUTPUT
convergence_degree=One of the flags STRICT, MEDIUM or WORST, depending on resid.
SOURCE
function convergence_degree(resid)

!Arguments
 integer :: convergence_degree
 real(dp),intent(in) :: resid

!************************************************************************

 if (resid<tolwfr_) then
   convergence_degree = STRICT
 else
   convergence_degree = WORST
   if (resid<tolwfr_*10**5) convergence_degree = MEDIUM
 end if

end function convergence_degree
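As a concrete illustration of the thresholds (just evaluating the function, not part of the source): with tolwfr_ = 1.0e-12, a residual of 1.0e-14 returns STRICT, a residual of 1.0e-9 (below tolwfr_*10**5 = 1.0e-7) returns MEDIUM, and a residual of 1.0e-5 returns WORST.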
m_exc_itdiago/exc_check_phi_block [ Functions ]
[ Top ] [ m_exc_itdiago ] [ Functions ]
NAME
exc_check_phi_block
FUNCTION
Debugging tool: checks the consistency of phi_block by reporting the maximum deviation of the overlap matrix <phi_i|phi_j> from the identity.
INPUTS
OUTPUT
SOURCE
subroutine exc_check_phi_block(string)

!Arguments ------------------------------------
!scalars
 character(len=*),intent(in) :: string

!Local variables ------------------------------
!scalars
 integer :: ii,jj,ierr
 real(dp) :: err,rdum
!arrays
 complex(dpc),allocatable :: lbuff(:,:)

!************************************************************************

 if (.not. DEBUGME) return

#if 0
 ABI_MALLOC(lbuff,(hexc_size,nstates))
 err = -one
 do irank=1,nproc-1
   call xmpi_exch(phi_block,hexc_size*nstates,irank,lbuff,master,comm,11,ierr)
   if (my_rank==master) then
     lbuff = lbuff-phi_block
     err = MAX(err,MAXVAL(MAXVAL(ABS(lbuff),DIM=1)))
   end if
   call xmpi_barrier(comm)
 end do
 ABI_FREE(lbuff)
#else

 ABI_MALLOC(lbuff,(nstates,nstates))
 lbuff=czero
 do jj=1,nstates
   do ii=1,jj
     lbuff(ii,jj) = DOT_PRODUCT( phi_block(my_t1:my_t2,ii), phi_block(my_t1:my_t2,jj) )
   end do
 end do
 call xmpi_sum(lbuff,comm,ierr)

 err = -one
 do jj=1,nstates
   do ii=1,jj
     if (ii==jj) then
       rdum = ABS(lbuff(ii,jj)-one)
     else
       rdum = ABS(lbuff(ii,jj))
     end if
     err = MAX(err,rdum)
   end do
 end do
 ABI_FREE(lbuff)
#endif

 if (my_rank==master) then
   write(std_out,*)"After ",TRIM(string),", MAX inconsistency error in phi_block= ",err
 end if

 !write(std_out,*)"master casts its own data"
 !call xmpi_bcast(phi_block,master,comm,ierr)

end subroutine exc_check_phi_block
m_exc_itdiago/exc_cholesky_ortho [ Functions ]
[ Top ] [ m_exc_itdiago ] [ Functions ]
NAME
exc_cholesky_ortho
FUNCTION
This routine performs the orthonormalization of the trial eigenstates using the Cholesky algorithm.
SIDE EFFECTS
phi_block(my_t1:my_t2,nstates)=Contains the trial eigenstates.
SOURCE
subroutine exc_cholesky_ortho()

!Local variables ------------------------------
 integer :: my_info,ii,jj,ipack,ierr
 logical,parameter :: use_unpacked = .False.
!arrays
 complex(dpc),allocatable :: overlap(:,:),povlp(:)

!************************************************************************

 ! 1) overlap_ij = <phi_i|phi_j>
 ABI_MALLOC(overlap, (nstates,nstates))

 if (use_unpacked) then
   overlap = czero

   call ZGEMM('C','N',nstates,nstates,my_nt,cone,phi_block,my_nt,phi_block,my_nt,czero,overlap,nstates)
   call xmpi_sum(overlap,comm,ierr)

   do ii=1,nstates
     overlap(ii,ii)=REAL(overlap(ii,ii),kind=dp)
   end do

   ! 2) Cholesky factorization: overlap = U^H U with U upper triangle matrix.
   call ZPOTRF('U',nstates,overlap,nstates,my_info)
   if (my_info/=0) then
     write(msg,'(a,i3)')' ZPOTRF returned info= ',my_info
     ABI_ERROR(msg)
   end if

 else
   ! 1) Calculate overlap_ij = <phi_i|phi_j> in packed form.
   ABI_MALLOC(povlp,(nstates*(nstates+1)/2))
   povlp = czero; ipack=0
   do jj=1,nstates
     do ii=1,jj
       ipack=ipack+1
       povlp(ipack) = DOT_PRODUCT( phi_block(my_t1:my_t2,ii), phi_block(my_t1:my_t2,jj) )
       if (ii==jj) povlp(ipack) = REAL(povlp(ipack),kind=dp)
     end do
   end do
   call xmpi_sum(povlp,comm,ierr)

   ! 2) Cholesky factorization: overlap = U^H U with U upper triangle matrix.
   call ZPPTRF("U",nstates,povlp,my_info)
   if (my_info/=0) then
     write(msg,'(a,i3)')' ZPPTRF returned info= ',my_info
     ABI_ERROR(msg)
   end if
   !call xmpi_sum(povlp,comm,ierr)
   !povlp=povlp/nproc

   ! Unpack povlp to prepare call to ZTRSM.
   ipack=0
   do jj=1,nstates
     do ii=1,jj
       ipack=ipack+1
       if (ii/=jj) then
         overlap(ii,jj)= povlp(ipack)
         overlap(jj,ii)=CONJG(povlp(ipack))
       else
         overlap(ii,ii)=REAL(povlp(ipack),kind=dp)
       end if
     end do
   end do
   ABI_FREE(povlp)
 end if

 ! Check if this can be done with Scalapack. Direct PZTRSM is not provided.

 ! 3) Solve X U = phi_block; on exit the phi_block treated by this node is orthonormalized.
 !call ZTRSM('R','U','N','N',hexc_size,nstates,cone,overlap,nstates,phi_block,hexc_size)
 call ZTRSM('Right','Upper','Normal','Normal',my_nt,nstates,cone,overlap,nstates,phi_block,my_nt)
 ABI_FREE(overlap)

end subroutine exc_cholesky_ortho
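In matrix notation, the three steps above amount to the following (a restatement of the code, not part of the source; \Phi collects the trial states as columns):

  S = \Phi^H \Phi = U^H U          (overlap + Cholesky factorization, ZPPTRF/ZPOTRF)
  \Phi' = \Phi U^{-1}              (triangular solve, ZTRSM)
  \Phi'^H \Phi' = U^{-H} (U^H U) U^{-1} = 1

so the columns of \Phi' are orthonormal by construction.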
m_exc_itdiago/exc_init_phi_block [ Functions ]
[ Top ] [ m_exc_itdiago ] [ Functions ]
NAME
exc_init_phi_block
FUNCTION
Initialize the eigenstates, either from file or, if no restart file is available, by filling them with random numbers.
INPUTS
ihexc_fname= Name of the file from which the eigenvectors will be read. Empty string to initialize trial eigenvectors with random numbers.
SIDE EFFECTS
phi_block(my_t1:my_t2,nstates)=Contains the trial eigenstates.
SOURCE
subroutine exc_init_phi_block(ihexc_fname,use_mpio,comm)

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: comm
 logical,intent(in) :: use_mpio
 character(len=*),intent(in) :: ihexc_fname

!Local variables ------------------------------
 integer :: eig_unt,hexc_size_restart,ii,state,seed
 integer :: fold1,fold2,foldim,foldre
 real(dp) :: cputime,walltime,gflops
 character(len=500) :: errmsg
!arrays
 complex(dpc),allocatable :: buffer_dpc(:)
#ifdef HAVE_MPI_IO
 integer:: amode,mpi_fh,mpi_err,old_type,etype,eig_type,my_nel,ierr,my_nrows
 integer(XMPI_OFFSET_KIND) :: ehdr_offset,my_offset,my_offpad,fmarker
 integer(XMPI_OFFSET_KIND),allocatable :: bsize_frecord(:)
 integer :: array_of_sizes(2),array_of_subsizes(2),array_of_starts(2)
#endif

!************************************************************************

 if (LEN_TRIM(ihexc_fname) == 0) then
   call wrtout(std_out," Initializing eigenvectors with random numbers","COLL")
   !
   ! Use random number generator. For portability, use only integer numbers.
   ! The series of couples (fold1,fold2) is periodic with a period of
   ! 3x5x7x11x13x17x19x23x29x31, that is, larger than 2**32, the largest integer*4.
   ! fold1 is between 0 and 34, fold2 is between 0 and 114. As sums of five
   ! uniform random variables, their distributions are close to Gaussians.
   ! The Gaussian distributions are folded, in order to get back to a uniform distribution.
   ! foldre is between 0 and 20, foldim is between 0 and 18.
   !
   do state=1,nstates
     do ii=my_t1,my_t2
       seed=ii+(state-1)*hexc_size ! Different seed for different transitions and bands
       fold1 =mod(seed,3)+mod(seed,5)+mod(seed,7)+mod(seed,11)+mod(seed,13)
       fold2 =mod(seed,17)+mod(seed,19)+mod(seed,23)+mod(seed,29)+mod(seed,31)
       foldre=mod(fold1+fold2,21)
       foldim=mod(3*fold1+2*fold2,19)

       phi_block(ii,state) = DCMPLX(foldre,foldim)
     end do
   end do

 else

   call cwtime(cputime, walltime, gflops, "start")

   if (.not.use_mpio) then
     call wrtout(std_out," Initializing eigenvectors from file: "//TRIM(ihexc_fname)//" using Fortran IO.","COLL")

     if (open_file(ihexc_fname,msg,newunit=eig_unt,form='unformatted',status="old") /=0 ) then
       ABI_ERROR(msg)
     end if

     read(eig_unt, err=10, iomsg=errmsg) hexc_size_restart
     ABI_CHECK(hexc_size_restart==hexc_size,"hexc_size_restart /= hexc_size")
     read(eig_unt, err=10, iomsg=errmsg) !skip DCMPLX(exevl(1:hexc_size))

     ABI_MALLOC(buffer_dpc,(hexc_size))
     do ii=1,nstates
       read(eig_unt, err=10, iomsg=errmsg) buffer_dpc
       phi_block(my_t1:my_t2,ii) = buffer_dpc(my_t1:my_t2)
     end do
     ABI_FREE(buffer_dpc)

     close(eig_unt, err=10, iomsg=errmsg)
   else
     call wrtout(std_out," Initializing eigenvectors from file: "//TRIM(ihexc_fname)//" using MPI-IO.","COLL")
#ifdef HAVE_MPI_IO
     !
     ! Open the file with MPI-IO
     amode=MPI_MODE_RDONLY

     call MPI_FILE_OPEN(comm, ihexc_fname, amode, MPI_INFO_NULL, mpi_fh, mpi_err)
     msg = " MPI_IO error opening file: "//TRIM(ihexc_fname)
     ABI_CHECK_MPI(mpi_err,msg)

     ! Move the file pointer to skip the first two records.
     ehdr_offset=0
     call xmpio_read_frm(mpi_fh,ehdr_offset,xmpio_collective,fmarker,mpi_err)
     write(std_out,*)"fmarker first record ",fmarker
     call xmpio_read_frm(mpi_fh,ehdr_offset,xmpio_collective,fmarker,mpi_err)
     write(std_out,*)"fmarker second record ",fmarker
     !%call hdr_mpio_skip(mpi_fh,fform,ehdr_offset)
     !%ehdr_offset = 4*xmpio_bsize_frm + xmpio_bsize_int + nstates*xmpio_bsize_dpc

     etype=MPI_BYTE; old_type=MPI_DOUBLE_COMPLEX

     my_nrows=my_t2-my_t1+1; old_type=MPI_DOUBLE_COMPLEX
     array_of_sizes    = (/hexc_size,nstates/)
     array_of_subsizes = (/my_nrows,nstates/)
     array_of_starts   = (/my_t1,1/)
     call xmpio_create_fsubarray_2D(array_of_sizes,array_of_subsizes,array_of_starts,old_type,eig_type,my_offpad,mpi_err)
     ABI_CHECK_MPI(mpi_err,"fsubarray_2D")
     !
     ! Each node uses a different offset to skip the header and the blocks written by the other CPUs.
     my_offset = ehdr_offset + my_offpad

     call MPI_FILE_SET_VIEW(mpi_fh, my_offset, etype, eig_type, 'native', MPI_INFO_NULL, mpi_err)
     ABI_CHECK_MPI(mpi_err,"SET_VIEW")

     call MPI_TYPE_FREE(eig_type,mpi_err)
     ABI_CHECK_MPI(mpi_err,"MPI_TYPE_FREE")

     my_nel = my_nrows*nstates
     call MPI_FILE_READ_ALL(mpi_fh, phi_block, my_nel, MPI_DOUBLE_COMPLEX, MPI_STATUS_IGNORE, mpi_err)
     ABI_CHECK_MPI(mpi_err,"FILE_READ")

     ! It seems that personal calls make the code stuck.
     ! Check the Fortran markers.
     ABI_MALLOC(bsize_frecord,(nstates))
     bsize_frecord = hexc_size * xmpi_bsize_dpc
     ! ehdr_offset points to the end of the header.
     call xmpio_check_frmarkers(mpi_fh,ehdr_offset,xmpio_collective,nstates,bsize_frecord,ierr)
     ABI_CHECK(ierr==0,"Error in Fortran markers")
     ABI_FREE(bsize_frecord)
     !
     ! Close the file.
     call MPI_FILE_CLOSE(mpi_fh, mpi_err)
     ABI_CHECK_MPI(mpi_err,"FILE_CLOSE")
#else
     ABI_ERROR("You should not be here")
#endif
   end if

   call cwtime(cputime, walltime, gflops, "stop")
   write(msg,'(2(a,f9.1),a)')" IO operation completed. cpu_time: ",cputime,"[s], walltime: ",walltime," [s]"
   call wrtout(std_out, msg, "COLL", do_flush=.True.)
 end if

 return

 ! Handle IO-error
10 continue
 ABI_ERROR(errmsg)

end subroutine exc_init_phi_block
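As a concrete check of the folding recipe above (just evaluating the formulas, not part of the source): for seed = 1, each mod(seed,p) equals 1, so fold1 = 5 and fold2 = 5, giving foldre = mod(10,21) = 10 and foldim = mod(3*5+2*5,19) = mod(25,19) = 6, hence the entry (10.0,6.0). Since the seed depends only on the global transition index and the state index, the initialization is reproducible across runs and independent of the number of MPI processes.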
m_exc_itdiago/exc_iterative_diago [ Functions ]
[ Top ] [ m_exc_itdiago ] [ Functions ]
NAME
exc_iterative_diago
FUNCTION
Calculates eigenvalues and eigenvectors of the Hermitian excitonic Hamiltonian (coupling is neglected).
INPUTS
Bsp<excparam>=Parameters of the Bethe-Salpeter calculation.
  %nreh=Rank of the resonant block of the Hamiltonian.
  %nstates=Number of eigenstates required.
  %nline=Max number of line minimizations.
  %cg_tolwfr=Tolerance on the residuals.
  %nbdbuf=Number of buffer states excluded from the convergence check.
  %niter=Max number of CG iterations.
BS_files<excfiles>=Datatype storing the names of the files used in the Bethe-Salpeter code.
  %exh=Name of the file storing the excitonic resonant part.
  %out_eig=Name of the file where the final results are stored.
  %in_eig=Name of the file used to initialize the calculation.
comm=MPI communicator.
OUTPUT
Eigenvalues and eigenvectors are written to the file BS_files%out_eig.
NOTES
Concerning the approach followed to parallelize this routine: the most important bottleneck is represented by the storage of the excitonic Hamiltonian, since a large number of k-points is needed to obtain converged exciton energies. The number of required eigenstates is usually much smaller than the rank of the full matrix; this is especially true if we are only interested in the binding energy of the exciton or in the excitonic states close to the single-particle gap. Therefore good scaling and good performance should be obtained by distributing the rows of the excitonic Hamiltonian among the nodes while the required eigenvectors are duplicated on each node. The memory needed to store the eigenvectors scales like nreh*nstates, where nreh is the rank of the Hamiltonian, and this might render the calculation unfeasible when nstates is large. On the other hand, having the complete set of trial eigenvectors on each node makes it possible to parallelize tasks such as the application of the Hamiltonian as well as the orthogonalization and the sub-space rotation; the latter two algorithms represent the most CPU-demanding part of standard KS calculations, as they scale with the third power of the number of atoms. The conjugate direction and the gradient, as well as Hphi, are not distributed, since the line minimization requires the evaluation of <cg_dir|H_exc|cg_dir>. Note that this routine has been written with a homogeneous network of machines in mind: a network made of different CPUs will lead to unpredictable results, as each node has to check the convergence of the calculation.
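A minimal sketch of the distributed pattern described above (illustrative only, not part of the source; the names mirror the module variables, each rank owning the columns my_t1:my_t2 of hexc and the full-length work vectors):

   ! Contribution of this rank's columns to H|phi>.
   hphi = MATMUL(hexc, my_phi)              ! hexc(hexc_size,my_t1:my_t2) times local slice
   call xmpi_sum(hphi, comm, ierr)          ! allreduce: full H|phi> on every rank
   ! The Rayleigh quotient <phi|H|phi> is assembled the same way from local slices.
   etrial = DOT_PRODUCT(my_phi, hphi(my_t1:my_t2))
   call xmpi_sum(etrial, comm, ierr)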
SOURCE
subroutine exc_iterative_diago(BSp,BS_files,Hdr_bse,prtvol,comm)

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: comm,prtvol
 type(excparam),intent(in) :: BSp
 type(excfiles),intent(in) :: BS_files
 type(Hdr_type),intent(in) :: Hdr_bse

!Local variables ------------------------------
!scalars
 integer,parameter :: STRICT=2,MEDIUM=1,WORST=0
 integer,parameter :: master=0
 integer(i8b) :: bsize_hmat,bsize_phi_block
 integer :: hexc_size,nstates,nline,nbdbuf,nstep
 integer :: cg_nsteps,my_t1,my_t2,my_nt
 integer :: ii,jj,state,line,max_nline
 integer :: cg_step,nbdbuf_,nsppol,ierr,nproc,my_rank
 real(dp) :: exc_gap,exc_maxene,norm,etrial,etrial_old,deltae,tolwfr
! real(dp) :: deold
 real(dp) :: dhd,dhc,den,fac,poly,xx,root,swap,tan2th,diff,tolwfr_
!complex(dpc) :: cg_gamma,dotgg,old_dotgg
 real(dp) :: cg_gamma,dotgg,old_dotgg
 real(dp) :: max_resid,costh,sinth
 complex(dpc) :: zz,kprc
 logical,parameter :: DEBUGME=.False.
 logical :: use_mpio,is_resonant,diago_is_real
 character(len=500) :: msg
 character(len=fnlen) :: hexc_fname,ihexc_fname,oeig_fname
 type(stats_t) :: stats
!arrays
 integer :: nline_for(Bsp%nstates),convergence_of(Bsp%nstates)
 real(dp) :: resid(Bsp%nstates),exc_energy(Bsp%nstates),rbuf2(2)
! real(dp),allocatable :: gsc(:,:),cg(:,:)
!complex,allocatable :: hexc(:,:)
 complex(dpc),allocatable :: hexc(:,:),hji(:),vec_tmp(:)
 complex(dpc),ABI_CONTIGUOUS pointer :: my_phi(:)
 real(dp),allocatable :: hexc_diagonal(:)
 complex(dpc),target,allocatable :: phi_block(:,:)
 complex(dpc),allocatable :: hphi(:) !,buffer_dpc(:)
 complex(dpc),allocatable :: cg_dir(:),grad_dir(:),prc_dir(:)
 complex(dpc),allocatable :: old_cg_dir(:)

!************************************************************************

 DBG_ENTER("COLL")

 if (Bsp%use_coupling>0) then
   ABI_ERROR("CG Method does not support coupling")
 end if

 nsppol = Hdr_bse%nsppol
 if (Hdr_bse%nsppol == 2) then
   ABI_WARNING("nsppol==2 with cg method is still under development")
 end if

 nproc = xmpi_comm_size(comm); my_rank = xmpi_comm_rank(comm)

 use_mpio=.FALSE.
#ifdef HAVE_MPI_IO
 use_mpio = (nproc > 1)
#endif
 use_mpio=.FALSE.
 !use_mpio = .TRUE.

 hexc_size = SUM(Bsp%nreh)
 nstates= Bsp%nstates
 nline  = Bsp%nline
 nbdbuf = Bsp%nbdbuf
 nstep  = Bsp%niter
 tolwfr = Bsp%cg_tolwfr

 write(msg,'(a,i0)')' Iterative diagonalization of the resonant excitonic Hamiltonian, Matrix size= ',hexc_size
 call wrtout(std_out,msg,"COLL")
 call wrtout(ab_out,msg,"COLL")

 ABI_CHECK(hexc_size>=nproc,"hexc_size<nproc!")
 ABI_CHECK(nstates <= hexc_size,"nstates cannot be greater than hexc size!")

 ! Divide the columns of the Hamiltonian among the nodes.
 call xmpi_split_work(hexc_size,comm,my_t1,my_t2)

 my_nt = my_t2-my_t1+1
 write(msg,'(a,i0,a)')" Will handle ",my_nt," columns of the excitonic Hamiltonian."
 call wrtout(std_out,msg,"PERS")

 tolwfr_ = tolwfr
 if (tolwfr < 10**(-30)) then
   tolwfr_ = tol12
   write(msg,'(2(a,es12.4))')" Input tolwfr= ",tolwfr," Using tolwfr= ",tolwfr_
   ABI_WARNING(msg)
 end if

 cg_nsteps = nstep
 if (cg_nsteps<=0) then
   cg_nsteps = 30
   write(msg,'(2(a,i0))')" Input nstep= ",nstep," Using cg_nsteps= ",cg_nsteps
   ABI_WARNING(msg)
 end if

 nbdbuf_ = nbdbuf
 if (nbdbuf<=0) then
   nbdbuf_ = 4
   write(msg,'(2(a,i0))')" Input nbdbuf= ",nbdbuf," Using nbdbuf= ",nbdbuf_
   ABI_WARNING(msg)
 end if

 write(msg,"(4(a,i0,a),a,es12.4)")&
&  " cg_nsteps: ",cg_nsteps,ch10,&
&  " nstates: ",nstates,ch10,&
&  " nline: ",nline,ch10,&
&  " nbdbuf: ",nbdbuf_,ch10,&
&  " tolwfr: ",tolwfr_
 call wrtout(std_out,msg,"COLL")
 call wrtout(ab_out,msg,"COLL")

 bsize_hmat = 2*dpc*hexc_size*my_nt
 write(msg,'(a,f8.1,a)')' Allocating excitonic Hamiltonian. Memory requested: ',bsize_hmat*b2Mb,' Mb.'
 call wrtout(std_out,msg,"COLL")

 ABI_MALLOC(hexc_diagonal,(my_t1:my_t2))
 ABI_MALLOC_OR_DIE(hexc,(hexc_size,my_t1:my_t2), ierr)
 !
 ! Read and construct full excitonic Hamiltonian using Hermiticity.
 if (BS_files%in_hreso /= BSE_NOFILE) then
   hexc_fname = BS_files%in_hreso
 else
   hexc_fname = BS_files%out_hreso
 end if
 !
 ! Read the resonant block from file.
 is_resonant=.TRUE.
 diago_is_real=(.not.BSp%have_complex_ene)
 call exc_read_rcblock(hexc_fname,Bsp,is_resonant,diago_is_real,nsppol,Bsp%nreh,hexc_size,my_t1,my_t2,hexc,use_mpio,comm)
 !
 ! Save diagonal part for preconditioning.
 do jj=my_t1,my_t2
   hexc_diagonal(jj) = REAL(hexc(jj,jj),kind=dp)
 end do
 !
 ! === Initialisation of the excitonic wavefunctions ===
 ! Two cases are possible:
 !   1) Fill trial eigenvectors with random numbers.
 !      One needs to initialize wfs in such a way to avoid symmetry traps,
 !      and to avoid linear dependencies between wavefunctions.
 !   2) Read eigenstates generated by a previous calculation.

 bsize_phi_block = 2*spc*my_nt*nstates
 write(msg,'(a,f8.1,a)')' Allocating BSE eigenvectors. Memory requested: ',bsize_phi_block*b2Mb,' Mb.'
 call wrtout(std_out,msg,"COLL",do_flush=.True.)

 ABI_MALLOC_OR_DIE(phi_block,(my_t1:my_t2,nstates), ierr)

 ihexc_fname = ""
 if (BS_files%in_eig /= BSE_NOFILE) ihexc_fname = BS_files%in_eig

 call exc_init_phi_block(ihexc_fname,use_mpio,comm)
 !
 ! =========================
 ! === Orthogonalization ===
 ! =========================
 call exc_cholesky_ortho()
 call exc_check_phi_block("First cholesky ortho")

 ! * Sub-space rotation.
 call exc_subspace_rotation()
 call exc_check_phi_block("First subspace_rotation")
 !
 ! ===========================
 ! ==== Conjugate gradient ===
 ! ===========================
 ABI_MALLOC(hphi,(hexc_size))
 ABI_MALLOC(cg_dir,(hexc_size))
 ABI_MALLOC(old_cg_dir,(hexc_size))
 ABI_MALLOC(grad_dir,(hexc_size))
 ABI_MALLOC(prc_dir,(hexc_size))

 max_nline=nline
 resid(:)=HUGE(one); nline_for(1:nstates)=max_nline; convergence_of(1:nstates)=WORST

 do cg_step=1,cg_nsteps
   do state=1,nstates

     if (prtvol>=10) then ! Tell us what is going on:
       write(msg,'(a,i6,2x,a,i3,a)')' --- exc_iterative_diago is called for state ',state,'for',nline_for(state),' lines'
       call wrtout(std_out,msg,'PERS')
     end if

     ! Extraction of the vector that is iteratively updated.
     my_phi => phi_block(my_t1:my_t2,state)

     do line=1,nline_for(state)
       ! Compute etrial=<phi|H|phi> and the residual [H-etrial]|phi>.
       hphi = czero
#ifdef FC_NVHPC
       ! Buggy NVHPC compiler
       do jj=1,my_t2-my_t1+1; do ii=1,hexc_size
         hphi(ii)=hphi(ii)+hexc(ii,jj)*my_phi(jj)
       enddo; enddo
#else
       hphi = MATMUL(hexc, my_phi)
#endif
       call xmpi_sum(hphi,comm,ierr)

       etrial = DOT_PRODUCT(my_phi, hphi(my_t1:my_t2))
       call xmpi_sum(etrial,comm,ierr)
       exc_energy(state) = etrial

       ! Compute residual (squared) norm.
       grad_dir(my_t1:my_t2) = hphi(my_t1:my_t2) - etrial*my_phi
       resid(state) = DOT_PRODUCT(grad_dir(my_t1:my_t2), grad_dir(my_t1:my_t2))
       call xmpi_sum(resid(state),comm,ierr)
       convergence_of(state) = convergence_degree(resid(state))
       !
       ! Check that etrial is decreasing on succeeding lines:
       if (line>1 .and. (etrial > etrial_old+tol12)) then
         write(msg,'(a,i8,a,1p,e14.6,a1,3x,a,1p,e14.6,a1)')&
&          'New trial exc_energy at line ',line,' = ',etrial,ch10,&
&          'is higher than former:',etrial_old,ch10
         ABI_WARNING(msg)
       end if
       etrial_old = etrial
       !
       ! If residual sufficiently small stop line minimization.
       if (convergence_of(state)==STRICT) then
         if (prtvol>=10) then
           write(msg,'(a,i4,a,i2,a,es12.4)')&
&            ' exc_iterative_diago: state ',state,' converged after ',line,&
&            ' line minimizations : resid =',resid(state)
           call wrtout(std_out,msg,'PERS')
         end if
         EXIT !line
       end if

       ! === PROJECT THE STEEPEST DESCENT DIRECTION OVER THE SUBSPACE ORTHOGONAL TO OTHER BANDS ===
       ! The following projection over the subspace orthogonal to occupied bands
       ! is optional. It is a bit more accurate, but doubles the number of N^3 ops.
       ! It is done only if ortalg>=0.

       ! Project the steepest descent direction: direc(2,npw)=<G|H|Cnk> - \sum_{(i<=n)} <G|H|Cik>, normalized.

       ! Grad_dir is already orthogonal to this band.
       ABI_MALLOC(hji,(nstates))
       hji=czero

       ! MG TODO Don't know why here we sum over i<=n!!!!!!!!
       do jj=1,nstates
         if (jj/=state) hji(jj) = DOT_PRODUCT(phi_block(:,jj), hphi(my_t1:my_t2) )
       end do
       call xmpi_sum(hji,comm,ierr)

       do jj=1,nstates
         if (jj/=state) grad_dir(my_t1:my_t2) = grad_dir(my_t1:my_t2) - hji(jj)*phi_block(:,jj)
       end do
       ABI_FREE(hji)
       !
       ! === PRECONDITION THE STEEPEST DESCENT DIRECTION ===
       den = DOT_PRODUCT(grad_dir(my_t1:my_t2), hexc_diagonal(my_t1:my_t2)*grad_dir(my_t1:my_t2) )
       call xmpi_sum(den,comm,ierr)

       do ii=my_t1,my_t2
         ! Teter polynomial ratio, modified according to Kresse, Furthmuller, PRB 54, 11169 (1996) [[cite:Kresse1996]]
         xx = hexc_diagonal(ii)/den
         poly=27._dp+xx*(18._dp+xx*(12._dp+xx*8._dp))
         fac=poly/(poly+16._dp*xx**4)
         kprc = fac*four/(three*den)
         prc_dir(ii) = kprc * grad_dir(ii)
       end do
       !
       ! * PROJECT THE PRECONDITIONED STEEPEST DESCENT DIRECTION OVER THE SUBSPACE ORTHOGONAL TO OTHER BANDS.
       ABI_MALLOC(hji,(nstates))
       hji=czero
       do jj=1,nstates
         hji(jj) = DOT_PRODUCT(phi_block(:,jj), prc_dir(my_t1:my_t2) )
       end do
       call xmpi_sum(hji,comm,ierr)

       do jj=1,nstates
         prc_dir(my_t1:my_t2) = prc_dir(my_t1:my_t2) - hji(jj)*phi_block(:,jj)
       end do
       ABI_FREE(hji)
       !
       ! === COMPUTE THE CONJUGATE-GRADIENT ===
       dotgg = DOT_PRODUCT(prc_dir(my_t1:my_t2),grad_dir(my_t1:my_t2))
       call xmpi_sum(dotgg,comm,ierr)

       if (line==1) then ! At first iteration, cg_gamma is set to zero.
         cg_gamma=zero
         old_dotgg=dotgg
         cg_dir = prc_dir
         old_cg_dir = cg_dir
       else
         cg_gamma=dotgg/old_dotgg
         old_dotgg=dotgg
         !write(std_out,*)"cg_gamma= ",cg_gamma
         !cg_dir = prc_dir + cg_gamma*cg_dir
         cg_dir = prc_dir + cg_gamma*old_cg_dir ! TODO check this, anyhow it is much faster.
         old_cg_dir = cg_dir ! old_cg_dir stores the previous CG direction; cg_dir will be orthonormalized to the band.
       end if
       !
       ! === PROJECTION OF THE CONJUGATED GRADIENT ===
       zz = DOT_PRODUCT(my_phi, cg_dir(my_t1:my_t2))
       call xmpi_sum(zz,comm,ierr)
       cg_dir(my_t1:my_t2) = cg_dir(my_t1:my_t2) - zz*my_phi(:)

       norm = DOT_PRODUCT(cg_dir(my_t1:my_t2), cg_dir(my_t1:my_t2) )
       call xmpi_sum(norm,comm,ierr)
       norm = SQRT(norm)
       cg_dir = cg_dir/norm ! Have to normalize it.

       ! Line minimization of the Rayleigh functional.
       ABI_MALLOC(vec_tmp,(hexc_size))
       vec_tmp=czero
#ifdef FC_NVHPC
       ! Buggy NVHPC compiler
       do jj=my_t1,my_t2; do ii=1,hexc_size
         vec_tmp(ii)=vec_tmp(ii)+hexc(ii,jj-my_t1+1)*cg_dir(jj)
       enddo; enddo
#else
       vec_tmp = MATMUL(hexc, cg_dir(my_t1:my_t2))
#endif
       call xmpi_sum(vec_tmp,comm,ierr)

       !if (my_rank==master) then
       !  write(777,*)"cg_step, state, line",cg_step, state, line
       !  write(777,*)vec_tmp
       !end if

       dhd = DOT_PRODUCT( cg_dir(my_t1:my_t2), vec_tmp(my_t1:my_t2)) ! is this always real?
       dhc = REAL( DOT_PRODUCT( my_phi, vec_tmp(my_t1:my_t2) ))
       ABI_FREE(vec_tmp)

       rbuf2 = (/dhd,dhc/)
       call xmpi_sum(rbuf2,comm,ierr)
       dhd = rbuf2(1)
       dhc = rbuf2(2)

       !write(201*(my_rank+1),*)"cg_step, state, line dotgg dhd dhc",cg_step,state,line,dotgg,dhd,dhc

       ! Taken from cgwf.
       ! Compute tan(2 theta), sin(theta) and cos(theta).
       tan2th=2.0_dp*dhc/(etrial-dhd)

       if (abs(tan2th)<1.d-05) then
         costh=1.0_dp-0.125_dp*tan2th**2
         sinth=0.5_dp*tan2th*(1.0_dp-0.375_dp*tan2th**2)
         ! Check that result is above machine precision.
         ! FIXME This part is not safe on clusters made of different machines or different compilation options.
         if (abs(sinth)<epsilon(0._dp)) then
           write(msg, '(a,es16.4)' ) ' exc_iterative_diago: converged with tan2th= ',tan2th
           call wrtout(std_out,msg,'PERS')
           EXIT ! Exit from the loop on line.
         end if

       else
         root =sqrt(1.0_dp+tan2th**2)
         costh=sqrt(0.5_dp+0.5_dp/root)
         sinth=sign(sqrt(0.5_dp-0.5_dp/root),tan2th)
       end if
       !
       ! Check for lower of two possible roots (same sign as curvature at theta where slope is zero).
       diff=(etrial-dhd)
       if (diff>zero) then ! Swap c and d if value of diff is positive.
         swap=costh
         costh=-sinth
         sinth=swap
         if (prtvol<0 .or. prtvol>=10) then
           write(msg,'(a,i4,es16.6)')' exc_iterative_diago: swap roots, line,diff= ',line,diff
           call wrtout(std_out,msg,'PERS')
         end if
       end if
       !
       ! === GENERATE NEW |wf>, H|wf> ===
       my_phi = costh*my_phi + sinth*cg_dir(my_t1:my_t2)
       !write(100*(my_rank+1),*)"cg_step state, line costh sinth etrial",cg_step,state,line,costh,sinth,etrial
       ! End taken from cgwf.

       !norm = SQRT( DOT_PRODUCT(my_phi,my_phi) )
       !my_phi = my_phi/norm
       !write(std_out,*)norm
       !write(std_out,*)DOT_PRODUCT(hphi,my_phi),cos(theta_min)

       ! ======================================================================
       ! =========== CHECK CONVERGENCE AGAINST TRIAL ENERGY ===================
       ! ======================================================================
       ! Compute delta(E)
       !deltae=chc*(costh**2-1._dp)+dhd*sinth**2+2._dp*costh*sinth*dhc
       deltae=etrial*(costh**2-1._dp)+dhd*sinth**2+2._dp*costh*sinth*dhc

       ! Check convergence and eventually exit.
       !if (line==1) then
       !  deold=deltae
       !else if (abs(deltae)<0.005_dp*abs(deold) .and. line/=nline_for(state))then
       !  if (prtvol>=10)then
       !    write(msg, '(a,i4,1x,a,1p,e12.4,a,e12.4,a)' ) &
       !&     ' cgwf: line',line,&
       !&     ' deltae=',deltae,' < 0.005*',deold,' =>skip lines'
       !    call wrtout(std_out,msg,'PERS')
       !  end if
       !  exc_energy(state) = exc_energy(state) + deltae
       !  EXIT
       !end if
     end do ! LOOP FOR A GIVEN BAND. Note that there are three "exit" instructions inside.

     ! Modify nline_for(state) according to convergence degree.
     !if (convergence_of(state) == STRICT) nline_for(state) = MAX(max_nline-2,2)
     !if (convergence_of(state) == MEDIUM) nline_for(state) = MAX(max_nline-1,2)
     !if (convergence_of(state) == WORST ) nline_for(state) = max_nline
   end do !state

   if (prtvol>2) then
     do ii=0,(nstates-1)/8
       write(msg,'(a,8es10.2)')' res:',(resid(state),state=1+ii*8,MIN(nstates,8+ii*8))
       call wrtout(std_out,msg,'COLL')
     end do
     do ii=0,(nstates-1)/8
       write(msg,'(a,8es10.2)')' ene:',(exc_energy(state),state=1+ii*8,MIN(nstates,8+ii*8))
       call wrtout(std_out,msg,'COLL')
     end do
   end if

   write(msg,'(a,i0)')"After cg_step: ",cg_step
   call exc_check_phi_block(msg)

   ! Find largest residual over bands and print residuals.
   max_resid=MAXVAL( resid(:MAX(1,nstates-nbdbuf_)) )

   if (max_resid < tolwfr_) then
     write(msg,'(a,i0,2(a,es10.2),a,i0,a)')&
&      " After ",cg_step," iterations, max_resid= ",max_resid," < tolwfr= ",tolwfr_," ( Excluding nbdbuf= ",nbdbuf_,")"
     call wrtout(std_out,msg,'COLL')
     EXIT ! cg_step
   end if

   if (cg_step==1.or.MOD(cg_step,1)==0) then
     call wrtout(std_out," Subspace rotation + exc_cholesky_ortho ","COLL")

     call exc_subspace_rotation()
     call exc_cholesky_ortho()

     !mcg=hexc_size; mgsc=hexc_size; useoverlap=0
     !allocate(cg(2,mcg),gsc(2,mgsc*useoverlap))
     !do ii=1,nstates
     !  cg(1,:) = REAL (phi_block(:,ii))
     !  cg(2,:) = AIMAG(phi_block(:,ii))
     !  call fxphas(cg,gsc,0,0,1,mcg,mgsc,MPI_enreg_seq,1,hexc_size,useoverlap)
     !  phi_block(:,ii)=CMPLX(cg(1,:),cg(2,:))
     !end do
     !deallocate(cg,gsc)

   end if

   !XG20141126 : At present, keep this fake test. Seems that there is some bug on shiva
   !(compiler ?) so that the result is erroneous without it. Even suppressing the
   !"stop" instruction triggers the bug ...
   if(cg_step==cg_nsteps+1)then
     write(std_out,*)' One should not be here : cg_step==cg_nsteps+1 '
     stop
   endif

 end do !cg_step

 ! Release some memory before entering RMM-DIIS.
 ABI_FREE(hphi)
 ABI_FREE(cg_dir)
 ABI_FREE(old_cg_dir)
 ABI_FREE(grad_dir)
 ABI_FREE(prc_dir)

 do ii=0,(nstates-1)/8
   write(msg,'(a,8es10.2)')' res:',(resid(state),state=1+ii*8,min(nstates,8+ii*8))
   call wrtout(std_out,msg,'COLL')
 end do
 do ii=0,(nstates-1)/8
   write(msg,'(a,8es10.2)')' ene:',(exc_energy(state),state=1+ii*8,min(nstates,8+ii*8))
   call wrtout(std_out,msg,'COLL')
 end do

 stats = stats_eval(resid)

 write(msg,"(4(a,es10.2))")&
&  ". Residuals: min value: ",stats%min,", Max value: ",stats%max,", mean: ",stats%mean,", stdev: ",stats%stdev
 call wrtout(std_out,msg)
 call wrtout(ab_out,msg)

 if (max_resid > tolwfr_) then
   write(msg,'(2a,i5,2a,2(a,es10.2),a,i3,a)')ch10,&
&    " WARNING: conjugate-gradient not converged after ",cg_step," iterations.",ch10,&
&    " max_resid= ",max_resid," > tolwfr= ",tolwfr_," ( Excluding nbdbuf= ",nbdbuf_,")"
   call wrtout(ab_out,msg,'COLL')
   call wrtout(std_out,msg,'COLL')
 end if

 exc_gap    = MINVAL(exc_energy)
 exc_maxene = MAXVAL(exc_energy)

 write(msg,'(a,2(a,f7.2,2a))')ch10,&
&  " First excitonic eigenvalue= ",exc_gap*Ha_eV, " [eV]. ",ch10,&
&  " Last excitonic eigenvalue= ",exc_maxene*Ha_eV," [eV]. ",ch10
 call wrtout(std_out,msg,"COLL")
 call wrtout(ab_out,msg,"COLL")

 call exc_check_phi_block("END OF CONJUGATE-GRADIENT")

 ABI_FREE(hexc)
 ABI_FREE(hexc_diagonal)
 !
 ! =====================================
 ! ==== Write final results on file ====
 ! =====================================
 oeig_fname = BS_files%out_eig
 if (oeig_fname== BSE_NOFILE) then
   ABI_WARNING("oeig_fname was set to "//TRIM(BSE_NOFILE))
   oeig_fname = TRIM(BS_files%out_basename)//"_BSEIG"
   ABI_WARNING("using oeig_fname : "//TRIM(oeig_fname))
 end if

 call exc_write_phi_block(oeig_fname,use_mpio)

 ABI_FREE(phi_block)

 call xmpi_barrier(comm)

 DBG_EXIT("COLL")

CONTAINS !===========================================================
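For reference, the line minimization implemented above can be written compactly (a restatement of the code, with E = etrial, d_{hd} = dhd, d_{hc} = dhc, and |d> = cg_dir normalized and orthogonal to |phi>):

  |\phi(\theta)\rangle = \cos\theta\,|\phi\rangle + \sin\theta\,|d\rangle
  E(\theta) = E\cos^2\theta + d_{hd}\sin^2\theta + 2\,d_{hc}\sin\theta\cos\theta

Setting dE/d\theta = 0 gives \tan 2\theta = 2\,d_{hc}/(E - d_{hd}), which is the tan2th evaluated in the code, and the energy change of the rotation is \Delta E = E(\cos^2\theta - 1) + d_{hd}\sin^2\theta + 2\cos\theta\sin\theta\,d_{hc}, i.e. the deltae expression.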
m_exc_itdiago/exc_subspace_rotation [ Functions ]
[ Top ] [ m_exc_itdiago ] [ Functions ]
NAME
exc_subspace_rotation
FUNCTION
This routine performs the subspace rotation.
SIDE EFFECTS
phi_block(my_t1:my_t2,nstates)=Contains the trial eigenstates.
SOURCE
subroutine exc_subspace_rotation()

!Local variables ------------------------------
 integer :: ii,jj,ipack,ierr
!arrays
 real(dp),allocatable :: sub_ene(:)
! real(dp),allocatable :: evec(:,:)
 complex(dpc),allocatable :: sub_ham(:,:),sub_pham(:),hphi_tot(:)
! complex(dpc),allocatable :: phi_tmp(:,:)

!************************************************************************

 ! * Sub-space rotation. Calculate <phi_i|H|phi_j> in packed form.
 ! TODO: this part can be rewritten using BLAS3 routines.

 ABI_MALLOC(hphi_tot,(hexc_size))
 ABI_MALLOC(sub_pham,(nstates*(nstates+1)/2))
 sub_pham=czero; ipack=0

 do jj=1,nstates
   hphi_tot = czero
   hphi_tot(:) = MATMUL(hexc, phi_block(:,jj))
   call xmpi_sum(hphi_tot,comm,ierr)

   do ii=1,jj
     ipack=ipack+1
     sub_pham(ipack) = DOT_PRODUCT(phi_block(my_t1:my_t2,ii), hphi_tot(my_t1:my_t2) )
     if (ii==jj) sub_pham(ipack) = REAL(sub_pham(ipack),kind=dp)
   end do
 end do
 call xmpi_sum(sub_pham,comm,ierr)

 ABI_MALLOC(sub_ham,(nstates,nstates))
 sub_ham=czero
 ABI_MALLOC(sub_ene,(nstates))

 call xhpev("Vectors","Upper",nstates,sub_pham,sub_ene,sub_ham,nstates) !,comm)

 ABI_FREE(hphi_tot)
 ABI_FREE(sub_pham)
 ABI_FREE(sub_ene)

 !do ii=1,nstates
 !  norm = DOT_PRODUCT(sub_ham(:,ii),sub_ham(:,ii))
 !  write(std_out,*)"norm subspac",norm
 !  sub_ham(:,ii) = sub_ham(:,ii)/norm
 !end do

 !allocate(evec(2*nstates,nstates))

 !do ii=1,nstates
 !  do jj=1,nstates
 !    evec(jj, ii) = REAL (sub_ham(jj,ii))
 !    evec(jj+1,ii) = AIMAG(sub_ham(jj,ii))
 !  end do
 !end do

 !call normev(evec,nstates,nstates)

 !do ii=1,nstates
 !  do jj=1,nstates
 !    sub_ham(jj,ii) = CMPLX( evec(jj,ii),evec(jj+1,ii) )
 !  end do
 !end do
 !deallocate(evec)

#if 0
 ABI_MALLOC(phi_tmp,(my_nt,nstates))
 phi_tmp = phi_block

 call ZGEMM('N','N',my_nt,nstates,nstates,cone,phi_tmp,my_nt,sub_ham,nstates,czero,phi_block,my_nt)

 ABI_FREE(phi_tmp)
#else
 phi_block = MATMUL(phi_block,sub_ham)
#endif

 ABI_FREE(sub_ham)

end subroutine exc_subspace_rotation
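In formulas, this Rayleigh-Ritz step computes the subspace Hamiltonian and rotates the block (a restatement of the code above, not part of the source):

  \bar{H}_{ij} = \langle\phi_i|H_{exc}|\phi_j\rangle                 (sub_pham, packed upper triangle)
  \bar{H}\,Z = Z\,\mathrm{diag}(\epsilon_1,\dots,\epsilon_{nstates})   (xhpev)
  \Phi \leftarrow \Phi\,Z                                            (MATMUL/ZGEMM)

so that on exit the trial states diagonalize H_exc within the subspace they span.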
m_exc_itdiago/exc_write_phi_block [ Functions ]
[ Top ] [ m_exc_itdiago ] [ Functions ]
NAME
exc_write_phi_block
FUNCTION
Write phi_block on the Fortran file oeig_fname.
SOURCE
subroutine exc_write_phi_block(oeig_fname,use_mpio)

!Arguments ------------------------------------
!scalars
 character(len=*),intent(in) :: oeig_fname
 logical,intent(in) :: use_mpio

!Local variables ------------------------------
 integer :: eig_unt,state,mpi_err !,fform
 real(dp) :: cputime,walltime,gflops
 character(len=500) :: msg,errmsg
! type(Hdr_type) :: hexc_Hdr
 logical :: do_ep_lifetime
!!arrays
 complex(dpc),allocatable :: buffer_dpc(:)
#ifdef HAVE_MPI_IO
 integer:: amode,mpi_fh,old_type,etype,eig_type,my_nel,ierr,my_nrows
 integer(XMPI_OFFSET_KIND) :: ehdr_offset,my_offset,my_offpad,fmarker
 integer(XMPI_OFFSET_KIND),allocatable :: bsize_frecord(:)
 integer :: array_of_sizes(2),array_of_subsizes(2),array_of_starts(2)
#endif

!************************************************************************

 do_ep_lifetime = .FALSE.

 call cwtime(cputime, walltime, gflops, "start")

 if (.not.use_mpio) then

   ! * Master writes the header.
   if (my_rank==master) then
     call wrtout(std_out," Writing eigenstates on file "//TRIM(oeig_fname)//" via Fortran-IO","COLL")
     if (open_file(oeig_fname,msg, newunit=eig_unt, form='unformatted') /= 0) then
       ABI_ERROR(msg)
     end if
     write(eig_unt, err=10, iomsg=errmsg) do_ep_lifetime
     write(eig_unt, err=10, iomsg=errmsg) hexc_size, nstates
     write(eig_unt, err=10, iomsg=errmsg) CMPLX(exc_energy(1:nstates),kind=dpc)
   end if

   ! Wavefunctions are gathered on the master node band-by-band.
   ! TODO bands should be treated in blocks to minimize the number of MPI calls.
   ABI_MALLOC_OR_DIE(buffer_dpc,(hexc_size), ierr)

   do state=1,nstates
     buffer_dpc=czero
     buffer_dpc(my_t1:my_t2) = phi_block(:,state)
     call xmpi_sum_master(buffer_dpc,master,comm,mpi_err)
     if (my_rank==master) write(eig_unt, err=10, iomsg=errmsg) buffer_dpc(1:hexc_size)
   end do
   ABI_FREE(buffer_dpc)

   if (my_rank==master) close(eig_unt, err=10, iomsg=errmsg)

 else
#ifdef HAVE_MPI_IO
   call wrtout(std_out," Writing eigenstates on file "//TRIM(oeig_fname)//" with MPI-IO","COLL")

   ! Write the header.
   if (my_rank==master) then
     ! Write header using Fortran primitives.
     if (open_file(oeig_fname,msg,newunit=eig_unt,form='unformatted') /= 0) then
       ABI_ERROR(msg)
     end if
     write(eig_unt, err=10, iomsg=errmsg) nstates
     write(eig_unt, err=10, iomsg=errmsg) CMPLX(exc_energy(1:nstates),kind=dpc)
     ! TODO: change setup_bse so that Hdr_bse reflects the parameters of the run.
     close(eig_unt, err=10, iomsg=errmsg)
   end if

   call xmpi_barrier(comm)

   ! Open the file with MPI-IO
   amode=MPI_MODE_RDWR

   call MPI_FILE_OPEN(comm, oeig_fname, amode, MPI_INFO_NULL, mpi_fh, mpi_err)
   msg = " MPI_IO error opening file: "//TRIM(oeig_fname)
   ABI_CHECK_MPI(mpi_err,msg)

   ! Move the file pointer to skip the first two records.
   ehdr_offset=0
   call xmpio_read_frm(mpi_fh,ehdr_offset,xmpio_collective,fmarker,mpi_err)
   !write(std_out,*)"fmarker first record ",fmarker
   call xmpio_read_frm(mpi_fh,ehdr_offset,xmpio_collective,fmarker,mpi_err)
   !write(std_out,*)"fmarker second record ",fmarker
   !$call hdr_mpio_skip(mpi_fh,fform,ehdr_offset)
   !$ehdr_offset = 4*xmpio_bsize_frm + xmpio_bsize_int + nstates*xmpio_bsize_dpc

   etype=MPI_BYTE; old_type=MPI_DOUBLE_COMPLEX

   my_nrows=my_t2-my_t1+1; old_type=MPI_DOUBLE_COMPLEX
   array_of_sizes    = (/hexc_size,nstates/)
   array_of_subsizes = (/my_nrows,nstates/)
   array_of_starts   = (/my_t1,1/)
   call xmpio_create_fsubarray_2D(array_of_sizes,array_of_subsizes,array_of_starts,old_type,eig_type,my_offpad,mpi_err)
   ABI_CHECK_MPI(mpi_err,"fsubarray_2D")
   !
   ! Each node uses a different offset to skip the header and the blocks written by the other CPUs.
   my_offset = ehdr_offset + my_offpad

   call MPI_FILE_SET_VIEW(mpi_fh, my_offset, etype, eig_type, 'native', MPI_INFO_NULL, mpi_err)
   ABI_CHECK_MPI(mpi_err,"SET_VIEW")

   call MPI_TYPE_FREE(eig_type,mpi_err)
   ABI_CHECK_MPI(mpi_err,"MPI_TYPE_FREE")

   my_nel = my_nrows*nstates
   call MPI_FILE_WRITE_ALL(mpi_fh, phi_block, my_nel, MPI_DOUBLE_COMPLEX, MPI_STATUS_IGNORE, mpi_err)
   ABI_CHECK_MPI(mpi_err,"FILE_WRITE")

   ! It seems that personal calls make the code stuck.
   ABI_MALLOC(bsize_frecord,(nstates))
   bsize_frecord = hexc_size * xmpi_bsize_dpc
   ! ehdr_offset points to the end of the header.
   call xmpio_write_frmarkers(mpi_fh,ehdr_offset,xmpio_collective,nstates,bsize_frecord,ierr)
   ABI_CHECK(ierr==0,"Error while writing Fortran markers")
   ABI_FREE(bsize_frecord)
   !
   ! Close the file.
   call MPI_FILE_CLOSE(mpi_fh, mpi_err)
   ABI_CHECK_MPI(mpi_err,"FILE_CLOSE")
#else
   ABI_ERROR("MPI-IO support not enabled")
#endif
 end if

 call cwtime(cputime, walltime, gflops, "stop")
 write(msg,'(2(a,f9.1),a)')" IO operation completed. cpu_time: ",cputime,"[s], walltime: ",walltime," [s]"
 call wrtout(std_out, msg, "COLL", do_flush=.True.)

 return

 ! Handle IO-error
10 continue
 ABI_ERROR(errmsg)

end subroutine exc_write_phi_block