TABLE OF CONTENTS
- ABINIT/ccfft
- ABINIT/fft_init_counters
- ABINIT/fft_output_counters
- ABINIT/fft_stop_counters
- ABINIT/fourdp
- ABINIT/fourwf
- ABINIT/m_fft
- m_fft/fft_allow_ialltoall
- m_fft/fft_poisson
- m_fft/fft_ug_dp
- m_fft/fft_ug_dpc
- m_fft/fft_ug_sp
- m_fft/fft_ug_spc
- m_fft/fft_ur_dp
- m_fft/fft_ur_dpc
- m_fft/fft_ur_spc
- m_fft/fft_use_lib_threads
- m_fft/fftbox_execute_ip_dpc
- m_fft/fftbox_execute_ip_spc
- m_fft/fftbox_execute_op_dpc
- m_fft/fftbox_execute_op_spc
- m_fft/fftbox_mpi_utests
- m_fft/fftbox_plan3_free
- m_fft/fftbox_plan3_from_ngfft
- m_fft/fftbox_plan3_init
- m_fft/fftbox_plan3_t
- m_fft/fftbox_utests
- m_fft/fftmpi_u
- m_fft/fftpac
- m_fft/fftpad_dpc
- m_fft/fftpad_spc
- m_fft/fftu_mpi_utests
- m_fft/fftu_utests
- m_fft/fourdp_6d
- m_fft/fourdp_mpi
- m_fft/fourwf_mpi
- m_fft/indirect_parallel_Fourier
- m_fft/uplan_execute_gr_dpc
- m_fft/uplan_execute_gr_spc
- m_fft/uplan_execute_rg_dpc
- m_fft/uplan_execute_rg_spc
- m_fft/uplan_free
- m_fft/uplan_init
- m_fft/uplan_t
- m_fft/zerosym
ABINIT/ccfft [ Functions ]
NAME
ccfft
FUNCTION
Carry out complex-to-complex Fourier transforms between real and reciprocal (G) space. Library of such routines. Include machine-dependent F90 routines used with fftalg=200.
INPUTS
fftalga=govern the choice of the fft routine to be used if 1: SGoedecker routine if 2: Machine dependent routine, depending on the precompilation options if 3: FFTW library routine if 4: new SGoedecker routine, version 2002 Warning : the second and third dimensions of the Fourier space array are switched, compared to the usual case fftcache=size of the cache (kB) isign= Integer specifying which sign to be used for the transformation. must be either +1 or -1. ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft n1,n2,n3=Actual integer dimensions (see ngfft) for the 3D sequence. Physical dimension of the transform. n4,n5,n6=Leading dimensions. Generally, n6 is not different to n3. ndat=number of FFT to do in // option= 1 if call from fourwf, 2 if call from other routine work1(2,n4*n5*n6)=Array to be transformed.
OUTPUT
inplace = 0 if result is in work2 ; =1 if result is in work1 (machine dependent) normalized=0 if the backward (isign=-1) FFT is not normalized, so has to be normalized outside of ccfft =1 otherwise work2(2,n4*n5*n6)=transformed array in case inplace=0.
SIDE EFFECTS
work1(2,n4*n5*n6)=at input, array to be transformed at output, transformed array (in case inplace=1)
NOTES
precompilation definitions : -D(machine_list) : (case fftalga=200) choice of machine-dependent FFT library, if permitted -DHAVE_FFT_FFTW2 : (case fftalga=300) activate the FFTW lib -Dnolib : (case fftalga=200) call SGoedecker routine, instead of machine-dependent one More about fftalga=200 Library routines for the following platforms have been implemented : Compaq/DEC HP (in place FFT) SGI (in place FFT) NEC (in place FFT) For all the other platforms, or if the CPP directive nolib is activated, one uses the fft routine from S. Goedecker.
SOURCE
3371 subroutine ccfft(ngfft,isign,n1,n2,n3,n4,n5,n6,ndat,option,work1,work2,comm_fft) 3372 3373 3374 !Arguments ------------------------------------ 3375 !scalars 3376 integer,intent(in) :: isign,n1,n2,n3,n4,n5,n6,ndat,option,comm_fft 3377 !arrays 3378 integer,intent(in) :: ngfft(18) 3379 real(dp),intent(inout) :: work1(2,n4*n5*n6*ndat) 3380 real(dp),intent(inout) :: work2(2,n4*n5*n6*ndat) !vz_i 3381 3382 !Local variables ------------------------------ 3383 !scalars 3384 integer,parameter :: cplex2=2 3385 integer :: fftalg,fftalga,fftalgb,fftalgc,fftcache 3386 integer :: nd2proc,nd3proc,nproc_fft 3387 character(len=500) :: msg 3388 3389 !************************************************************************* 3390 3391 !print *, "in ccfft" 3392 nproc_fft=ngfft(10) 3393 fftcache=ngfft(8); fftalg =ngfft(7); fftalga =fftalg/100; fftalgb=mod(fftalg,100)/10; fftalgc=mod(fftalg,10) 3394 3395 if(fftalga==2)then 3396 ABI_ERROR("Machine dependent FFTs are not supported anymore") 3397 3398 else if(fftalga==3)then 3399 ABI_ERROR("Old interface with FFTW2 is not supported anymore") 3400 3401 else if(fftalga<1 .or. fftalga>4)then 3402 write(msg, '(a,a,a,i5,a,a)' )& 3403 'The allowed values of fftalg(A) are 1, 2, 3, and 4 .',ch10,& 3404 'The actual value of fftalg(A) is',fftalga,ch10,& 3405 'Action: check the value of fftalg in your input file.' 3406 ABI_ERROR(msg) 3407 end if 3408 3409 ! This routine will be removed ASAP. 3410 ! Do not add new FFT libraries without previous discussion with Matteo Giantomassi 3411 ! inplace==1 or normalize==2 are not supported anymore in the caller (fourwf, fourdp) 3412 !inplace=0; normalized=0 3413 3414 if (fftalga/=4) then 3415 ! Call Stefan Goedecker FFT 3416 call sg_fft_cc(fftcache,n1,n2,n3,n4,n5,n6,ndat,isign,work1,work2) 3417 3418 else if (fftalga==4) then 3419 ! Call new version of Stefan Goedecker FFT 3420 nd2proc=((n2-1)/nproc_fft) +1 3421 nd3proc=((n6-1)/nproc_fft) +1 3422 3423 if (isign==1) then 3424 ! Fourier to real space (backward) 3425 call sg2002_back(cplex2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,option,work1,work2,comm_fft) 3426 else 3427 ! isign=-1, real space to Fourier (forward) 3428 call sg2002_forw(cplex2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,option,work1,work2,comm_fft) 3429 end if 3430 end if 3431 3432 end subroutine ccfft
ABINIT/fft_init_counters [ Functions ]
NAME
fft_init_counters
FUNCTION
SOURCE
4673 subroutine fft_init_counters() 4674 4675 fourdp_counter = 0 4676 fourwf_counter = 0 4677 4678 end subroutine fft_init_counters
ABINIT/fft_output_counters [ Functions ]
NAME
fft_output_counters
FUNCTION
SOURCE
4705 subroutine fft_output_counters(nbandtot, mpi_enreg) 4706 4707 !Arguments ------------------------------------ 4708 integer,intent(in) :: nbandtot 4709 type(MPI_type),intent(in) :: mpi_enreg 4710 4711 !Local variables------------------------------- 4712 !scalars 4713 character(len=500) :: msg 4714 integer :: cnt,ierr 4715 !arrays 4716 4717 call wrtout([std_out,ab_out],'') 4718 write(msg,'(a)') ' --- FFT COUNTERS ------------------------------------------------------------' 4719 call wrtout([std_out,ab_out], msg) 4720 write(msg,'(a,i6)') ' total Number of Bands : NB = ',nbandtot 4721 call wrtout([std_out,ab_out], msg) 4722 write(msg,'(a)') ' | total count (TC) | TC/NB' 4723 call wrtout([std_out,ab_out], msg) 4724 write(msg,'(a)') ' -----------------------------------------------------------------------------' 4725 call wrtout([std_out,ab_out], msg) 4726 call xmpi_sum(fourwf_counter,mpi_enreg%comm_kpt,ierr) 4727 cnt=fourdp_counter 4728 if (cnt>0) then 4729 write(msg,'(a,i16,a)') ' fourdp | ',cnt,' |' 4730 call wrtout([std_out,ab_out], msg) 4731 end if 4732 cnt=fourwf_counter 4733 if (cnt>0) then 4734 write(msg,'(a,i16,a,f16.1)') ' fourwf | ',cnt,' | ',dble(cnt)/nbandtot 4735 call wrtout([std_out,ab_out], msg) 4736 end if 4737 write(msg,'(a)') ' -----------------------------------------------------------------------------' 4738 call wrtout([std_out,ab_out], msg) 4739 4740 end subroutine fft_output_counters
ABINIT/fft_stop_counters [ Functions ]
NAME
fft_stop_counters
FUNCTION
SOURCE
4689 subroutine fft_stop_counters() 4690 4691 fourdp_counter = -1 4692 fourwf_counter = -1 4693 4694 end subroutine fft_stop_counters
ABINIT/fourdp [ Functions ]
NAME
fourdp
FUNCTION
Conduct Fourier transform of REAL or COMPLEX function f(r)=fofr defined on fft grid in real space, to create complex f(G)=fofg defined on full fft grid in reciprocal space, in full storage mode, or the reverse operation. For the reverse operation, the final data is divided by nfftot. REAL case when cplex=1, COMPLEX case when cplex=2 Usually used for density and potentials. There are two different possibilities: fftalgb=0 means using the complex-to-complex FFT routine, irrespective of the value of cplex fftalgb=1 means using a real-to-complex FFT or a complex-to-complex FFT, depending on the value of cplex. The only real-to-complex FFT available is from SGoedecker library.
INPUTS
cplex=1 if fofr is real, 2 if fofr is complex isign=sign of Fourier transform exponent: current convention uses +1 for transforming from G to r -1 for transforming from r to G. mpi_enreg=information about MPI parallelization nfft=(effective) number of FFT grid points (for this processor) ndat=Number of functions to transform ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft tim_fourdp=timing code of the calling routine (can be set to 0 if not attributed)
SIDE EFFECTS
Input/Output fofg(2,nfft)=f(G), complex. fofr(cplex*nfft)=input function f(r) (real or complex)
SOURCE
2972 subroutine fourdp(cplex, fofg, fofr, isign, mpi_enreg, nfft, ndat, ngfft, tim_fourdp) 2973 2974 !Arguments ------------------------------------ 2975 !scalars 2976 integer,intent(in) :: cplex,isign,nfft,ndat,tim_fourdp 2977 type(MPI_type),intent(in) :: mpi_enreg 2978 !arrays 2979 integer,intent(in) :: ngfft(18) 2980 real(dp),intent(inout) :: fofg(2,nfft,ndat),fofr(cplex*nfft,ndat) 2981 2982 !Local variables------------------------------- 2983 !scalars 2984 integer :: fftalg,fftalga,fftalgb,fftcache,i1,i2,i3,base,idat 2985 integer :: n1,n1half1,n1halfm,n2,n2half1,n3,n4 2986 integer :: n4half1,n5,n5half1,n6 !nd2proc,nd3proc,i3_local,i2_local, 2987 integer :: comm_fft,nproc_fft,me_fft 2988 real(dp) :: xnorm 2989 character(len=500) :: msg 2990 !arrays 2991 integer, ABI_CONTIGUOUS pointer :: fftn2_distrib(:),ffti2_local(:) 2992 integer, ABI_CONTIGUOUS pointer :: fftn3_distrib(:),ffti3_local(:) 2993 real(dp) :: tsec(2) 2994 real(dp),allocatable :: work1(:,:,:,:,:),work2(:,:,:,:,:) 2995 real(dp),allocatable :: workf(:,:,:,:,:),workr(:,:,:,:,:) 2996 2997 ! ************************************************************************* 2998 2999 ABI_CHECK(ndat == 1, "ndat != 1 should be tested") 3000 3001 ! Keep track of timing 3002 call timab(1260+tim_fourdp,1,tsec) 3003 3004 if (fourdp_counter>=0) then 3005 fourdp_counter = fourdp_counter + ndat 3006 end if 3007 3008 n1=ngfft(1); n2=ngfft(2); n3=ngfft(3) 3009 n4=ngfft(4); n5=ngfft(5); n6=ngfft(6) 3010 me_fft=ngfft(11); nproc_fft=ngfft(10) 3011 comm_fft = mpi_enreg%comm_fft 3012 !write(std_out,*)"fourdp, nx,ny,nz,nfft =",n1,n2,n3,nfft 3013 3014 fftcache=ngfft(8) 3015 fftalg =ngfft(7); fftalga =fftalg/100; fftalgb =mod(fftalg,100)/10 3016 3017 xnorm=one/dble(n1*n2*n3) 3018 !write(std_out,*)' fourdp :me_fft',me_fft,'nproc_fft',nproc_fft,'nfft',nfft 3019 3020 if (fftalgb /= 0 .and. fftalgb /= 1) then 3021 write(msg, '(a,i0,5a)' )& 3022 'The input algorithm number fftalg= ',fftalg,' is not allowed.',ch10,& 3023 'The second digit (fftalg(B)) must be 0 or 1.',ch10,& 3024 'Action: change fftalg in your input file.' 3025 ABI_BUG(msg) 3026 end if 3027 3028 if (fftalgb == 1 .and. ALL(fftalga /= [1,3,4,5])) then 3029 write(msg,'(a,i0,5a)')& 3030 'The input algorithm number fftalg= ',fftalg,' is not allowed.',ch10,& 3031 'When fftalg(B) is 1, the allowed values for fftalg(A) are 1 and 4.',ch10,& 3032 'Action: change fftalg in your input file.' 3033 ABI_BUG(msg) 3034 end if 3035 3036 if (n4<n1.or.n5<n2.or.n6<n3) then 3037 write(msg,'(a,3(i0,1x),a,3(i0,1x))')' Each of n4,n5,n6=',n4,n5,n6,'must be >= n1, n2, n3 =',n1,n2,n3 3038 ABI_BUG(msg) 3039 end if 3040 3041 ! Get the distrib associated with this fft_grid => for i2 and i3 planes 3042 call ptabs_fourdp(mpi_enreg,n2,n3,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local) 3043 3044 ! Branch immediately depending on nproc_fft 3045 if (nproc_fft > 1) then 3046 call fourdp_mpi(cplex,nfft,ngfft,ndat,isign,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 3047 goto 100 3048 end if 3049 3050 if (fftalga == FFT_FFTW3) then 3051 ! Call sequential or MPI FFTW3 version. 3052 if (nproc_fft == 1) then 3053 !call wrtout(std_out,"FFTW3 SEQFOURDP") 3054 call fftw3_seqfourdp(cplex,n1,n2,n3,n1,n2,n3,ndat,isign,fofg,fofr) 3055 else 3056 !call wrtout(std_out,"FFTW3 MPIFOURDP") 3057 call fftw3_mpifourdp(cplex,nfft,ngfft,ndat,isign,& 3058 fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 3059 end if 3060 ! Accumulate timing and return 3061 call timab(1260+tim_fourdp,2,tsec); return 3062 end if 3063 3064 if (fftalga == FFT_DFTI) then 3065 ! Call sequential or MPI MKL. 3066 if (nproc_fft == 1) then 3067 call dfti_seqfourdp(cplex,n1,n2,n3,n1,n2,n3,ndat,isign,fofg,fofr) 3068 else 3069 ABI_ERROR("MPI fourdp with MKL cluster DFT not implemented") 3070 end if 3071 ! Accumulate timing and return 3072 call timab(1260+tim_fourdp,2,tsec); return 3073 end if 3074 3075 ! Here, deal with the new SG FFT, complex-to-complex case 3076 if (fftalga==FFT_SG2002 .and. (fftalgb==0 .or. cplex==2)) then 3077 call sg2002_mpifourdp(cplex,nfft,ngfft,ndat,isign,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 3078 !call sg2002_seqfourdp(cplex,nfft,ngfft,ndat,isign,fftn2_fofg,fofr) 3079 end if 3080 3081 ! Here, deal with the new SG FFT, with real-to-complex 3082 if (fftalga==FFT_SG2002 .and. fftalgb==1 .and. cplex==1) then 3083 ABI_CHECK(nproc_fft == 1,"fftalg 41x does not support nproc_fft > 1") 3084 ABI_CHECK(ndat == 1, "ndat must be 1") 3085 3086 n1half1=n1/2+1; n1halfm=(n1+1)/2 3087 n2half1=n2/2+1 3088 ! n4half1 or n5half1 are the odd integers >= n1half1 or n2half1 3089 n4half1=(n1half1/2)*2+1 3090 n5half1=(n2half1/2)*2+1 3091 ABI_MALLOC(workr, (2,n4half1,n5,n6,ndat)) 3092 ABI_MALLOC(workf, (2,n4,n6,n5half1,ndat)) 3093 3094 if (isign==1) then 3095 do idat=1,ndat 3096 do i3=1,n3 3097 do i2=1,n2half1 3098 base=n1*(i2-1+n2*(i3-1)) 3099 do i1=1,n1 3100 workf(1,i1,i3,i2,idat) = fofg(1,i1+base,idat) 3101 workf(2,i1,i3,i2,idat) = fofg(2,i1+base,idat) 3102 end do 3103 end do 3104 end do 3105 end do 3106 3107 !nd2proc=((n2-1)/nproc_fft) +1 3108 !nd3proc=((n6-1)/nproc_fft) +1 3109 3110 ! change the call? n5half1 et n6 ? 3111 call sg2002_back(cplex,ndat,n1,n2,n3,n4,n5,n6,n4half1,n5half1,n6,2,workf,workr,comm_fft) 3112 3113 do idat=1,ndat 3114 do i3=1,n3 3115 do i2=1,n2 3116 base=n1*(i2-1+n2*(i3-1)) 3117 do i1=1,n1half1-1 3118 ! copy data 3119 fofr(2*i1-1+base, idat) = workr(1,i1,i2,i3,idat) 3120 fofr(2*i1 +base, idat) = workr(2,i1,i2,i3,idat) 3121 end do 3122 ! If n1 odd, must add last data 3123 if((2*n1half1-2)/=n1)then 3124 fofr(n1+base, idat) = workr(1,n1half1,i2,i3,idat) 3125 end if 3126 end do 3127 end do 3128 end do 3129 3130 else if (isign==-1) then 3131 do idat=1,ndat 3132 do i3=1,n3 3133 do i2=1,n2 3134 base=n1*(i2-1+n2*(i3-1)) 3135 do i1=1,n1half1-1 3136 workr(1,i1,i2,i3,idat)=fofr(2*i1-1+base,idat) 3137 workr(2,i1,i2,i3,idat)=fofr(2*i1 +base,idat) 3138 end do 3139 ! If n1 odd, must add last data 3140 if((2*n1half1-2)/=n1)then 3141 workr(1,n1half1,i2,i3,idat)=fofr(n1+base,idat) 3142 workr(2,n1half1,i2,i3,idat)=zero 3143 end if 3144 end do 3145 end do 3146 end do 3147 3148 call sg2002_forw(cplex,ndat,n1,n2,n3,n4,n5,n6,n4half1,n5half1,n6,2,workr,workf,comm_fft) 3149 3150 ! Transfer fft output to the original fft box 3151 do idat=1,ndat 3152 do i3=1,n3 3153 3154 do i2=1,n2half1 3155 base=n1*(i2-1+n2*(i3-1)) 3156 do i1=1,n1 3157 fofg(1,i1+base,idat) = workf(1,i1,i3,i2,idat)*xnorm 3158 fofg(2,i1+base,idat) = workf(2,i1,i3,i2,idat)*xnorm 3159 end do 3160 end do 3161 3162 ! Complete missing values with complex conjugate 3163 ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1. 3164 if(n2half1>2)then 3165 do i2=2,n2+1-n2half1 3166 base=n1*((n2+2-i2)-1) 3167 if(i3/=1)base=base+n1*n2*((n3+2-i3)-1) 3168 fofg(1,1+base,idat)= workf(1,1,i3,i2,idat)*xnorm 3169 fofg(2,1+base,idat)=-workf(2,1,i3,i2,idat)*xnorm 3170 do i1=2,n1 3171 fofg(1,n1+2-i1+base,idat)= workf(1,i1,i3,i2,idat)*xnorm 3172 fofg(2,n1+2-i1+base,idat)=-workf(2,i1,i3,i2,idat)*xnorm 3173 end do 3174 end do 3175 end if 3176 3177 end do 3178 end do 3179 3180 end if ! isign 3181 ABI_FREE(workr) 3182 ABI_FREE(workf) 3183 end if 3184 3185 ! Here, one calls the complex-to-complex FFT subroutine 3186 if( (fftalgb==0 .or. cplex==2) .and. fftalga/=4 )then 3187 ABI_CHECK(ndat == 1, "ndat must be 1") 3188 3189 ABI_MALLOC(work1, (2,n4,n5,n6,ndat)) 3190 ABI_MALLOC(work2, (2,n4,n5,n6,ndat)) 3191 3192 if (isign==1) then 3193 3194 ! Transfer fofg to the expanded fft box 3195 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base) 3196 do idat=1,ndat 3197 do i3=1,n3 3198 do i2=1,n2 3199 base=n1*(i2-1+n2*(i3-1)) 3200 do i1=1,n1 3201 work1(1,i1,i2,i3,idat) = fofg(1,i1+base,idat) 3202 work1(2,i1,i2,i3,idat) = fofg(2,i1+base,idat) 3203 end do 3204 end do 3205 end do 3206 end do 3207 3208 ! Call Goedecker C2C FFT 3209 !call sg_fft_cc(fftcache,n1,n2,n3,n4,n5,n6,ndat,isign,work1,work2) 3210 call ccfft(ngfft,isign,n1,n2,n3,n4,n5,n6,ndat,2,work1,work2,comm_fft) 3211 3212 ! Take data from expanded box and put it in the original box. 3213 if (cplex==1) then 3214 ! REAL case 3215 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base) 3216 do idat=1,ndat 3217 do i3=1,n3 3218 do i2=1,n2 3219 base=n1*(i2-1+n2*(i3-1)) 3220 do i1=1,n1 3221 fofr(i1+base,idat) = work2(1,i1,i2,i3,idat) 3222 end do 3223 end do 3224 end do 3225 end do 3226 3227 else 3228 ! COMPLEX case 3229 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base) 3230 do idat=1,ndat 3231 do i3=1,n3 3232 do i2=1,n2 3233 base=2*n1*(i2-1+n2*(i3-1)) 3234 do i1=1,n1 3235 fofr(2*i1-1+base, idat) = work2(1,i1,i2,i3,idat) 3236 fofr(2*i1 +base, idat) = work2(2,i1,i2,i3,idat) 3237 end do 3238 end do 3239 end do 3240 end do 3241 end if 3242 3243 else if (isign==-1) then 3244 3245 ! Insert fofr into the augmented fft box 3246 if (cplex==1) then 3247 ! REAL case copy data 3248 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base) 3249 do idat=1,ndat 3250 do i3=1,n3 3251 do i2=1,n2 3252 base=n1*(i2-1+n2*(i3-1)) 3253 do i1=1,n1 3254 work1(1,i1,i2,i3,idat) = fofr(i1+base,idat) 3255 work1(2,i1,i2,i3,idat) = zero 3256 end do 3257 end do 3258 end do 3259 end do 3260 else 3261 ! COMPLEX case copy data 3262 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base) 3263 do idat=1,ndat 3264 do i3=1,n3 3265 do i2=1,n2 3266 base=2*n1*(i2-1+n2*(i3-1)) 3267 do i1=1,n1 3268 work1(1,i1,i2,i3, idat) = fofr(2*i1-1+base, idat) 3269 work1(2,i1,i2,i3, idat) = fofr(2*i1 +base, idat) 3270 end do 3271 end do 3272 end do 3273 end do 3274 end if ! cplex 3275 3276 ! Call Stefan Goedecker C2C FFT 3277 !call sg_fft_cc(fftcache,n1,n2,n3,n4,n5,n6,ndat,isign,work1,work2) 3278 call ccfft(ngfft,isign,n1,n2,n3,n4,n5,n6,ndat,2,work1,work2,comm_fft) 3279 3280 ! Transfer fft output to the original fft box 3281 !$OMP PARALLEL DO COLLAPSE(2) PRIVATE(base) 3282 do idat=1,ndat 3283 do i3=1,n3 3284 do i2=1,n2 3285 base=n1*(i2-1+n2*(i3-1)) 3286 do i1=1,n1 3287 fofg(1,i1+base,idat) = work2(1,i1,i2,i3,idat)*xnorm 3288 fofg(2,i1+base,idat) = work2(2,i1,i2,i3,idat)*xnorm 3289 end do 3290 end do 3291 end do 3292 end do 3293 3294 end if ! isign 3295 3296 ABI_FREE(work1) 3297 ABI_FREE(work2) 3298 end if ! End simple algorithm 3299 3300 ! Here sophisticated algorithm based on S. Goedecker routines, only for the REAL case. 3301 ! Take advantage of the fact that fofr is real, and that fofg has corresponding symmetry properties. 3302 if( (fftalgb==1 .and. cplex==1) .and. fftalga/=4 )then 3303 ABI_CHECK(nproc_fft == 1,"nproc > 1 not supported") 3304 do idat=1,ndat 3305 call sg_fft_rc(cplex,fofg(1,1,idat),fofr(1,idat),isign,nfft,ngfft) 3306 end do 3307 end if 3308 3309 100 call timab(1260+tim_fourdp,2,tsec) 3310 3311 end subroutine fourdp
ABINIT/fourwf [ Functions ]
NAME
fourwf
FUNCTION
Carry out composite Fourier transforms between real and reciprocal (G) space. Wavefunctions, contained in a sphere in reciprocal space, can be FFT to real space. They can also be FFT from real space to a sphere. Also, the density maybe accumulated, and a local potential can be applied. The different options are : - option=0 --> reciprocal to real space and output the result. - option=1 --> reciprocal to real space and accumulate the density. - option=2 --> reciprocal to real space, apply the local potential to the wavefunction in real space and produce the result in reciprocal space. - option=3 --> real space to reciprocal space. NOTE that in this case, fftalg=1x1 MUST be used. This may be changed in the future. The different sections of this routine corresponds to different algorithms, used independently of each others : - fftalg=xx0 : use simple complex-to-complex routines, without zero padding (rather simple, so can be used to understand how fourwf.f works); - fftalg=1x1 : use S Goedecker routines, with zero padding (7/12 savings in execution time); - fftalg=1x2 : call even more sophisticated coding also based on S Goedecker routines This routine contains many parts that differ only by small details, in order to treat each case with the better speed. Also for better speed, it uses no F90 construct, except the allocate command and for zeroing arrays.
INPUTS
cplex= if 1 , denpot is real, if 2 , denpot is complex (cplex=2 only allowed for option=2, and istwf_k=1) not relevant if option=0 or option=3, so cplex=0 can be used to minimize memory fofgin(2,npwin)=holds input wavefunction in G vector basis sphere. (intent(in) but the routine sphere can modify it for another iflag) gboundin(2*mgfft+8,2)=sphere boundary info for reciprocal to real space gboundout(2*mgfft+8,2)=sphere boundary info for real to reciprocal space istwf_k=option parameter that describes the storage of wfs kg_kin(3,npwin)=reduced planewave coordinates, input kg_kout(3,npwout)=reduced planewave coordinates, output mgfft=maximum size of 1D FFTs mpi_enreg=information about MPI parallelization ndat=number of FFT to do in // ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft npwin=number of elements in fofgin array (for option 0, 1 and 2) npwout=number of elements in fofgout array (for option 2 and 3) n4,n5,n6=ngfft(4),ngfft(5),ngfft(6), dimensions of fofr. option= if 0: do direct FFT if 1: do direct FFT, then sum the density if 2: do direct FFT, multiply by the potential, then do reverse FFT if 3: do reverse FFT only tim_fourwf=timing code of the calling routine (can be set to 0 if not attributed) weight_r=weight to be used for the accumulation of the density in real space (needed only when option=1) weight_i=weight to be used for the accumulation of the density in real space (needed only when option=1 and (fftalg=4 and fftalgc/=0)) [weight_array_r]= -- optional -- same as weight_r when ndat>1 weight_array_r(i)=weight_r to be used for band i at present only used for the GPU version [weight_array_i]= -- optional -- same as weight_i when ndat>1 weight_array_i(i)=weight_i to be used for band i at present only used for the GPU version [fofginb(2,npwin)]=holds second input wavefunction in G vector basis sphere. (intent(in) but the routine sphere can modify it for another iflag) (for non diagonal occupation) [use_ndo] = use non diagonal occupations. [gpu_option] = GPU implementation to use, i.e. cuda, openMP, ... (0=not using GPU)
OUTPUT
(see side effects)
SIDE EFFECTS
Input/Output for option==0, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; fofr(2,n4,n5,n6*ndat) contains the output Fourier Transform of fofgin; no use of denpot, fofgout and npwout. for option==1, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; denpot(cplex*n4,n5,n6) contains the input density at input, and the updated density at output (accumulated); no use of fofgout and npwout. for option==2, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere; denpot(cplex*n4,n5,n6) contains the input local potential; fofgout(2,npwout*ndat) contains the output function; for option==3, fofr(2,n4,n5,n6*ndat) contains the input real space wavefunction; fofgout(2,npwout*ndat) contains its output Fourier transform; no use of fofgin and npwin.
NOTES
DO NOT CHANGE THE API OF THIS FUNCTION. If you need a specialized routine for the FFT of the wavefunctions, create a wrapper that uses fourwf to accomplish your task. This routine, indeed, has already too many parameters and each change in the API requires a careful modification of the different wrappers used for specialized FFTs such as FFTW3 and MKL-DFTI
SOURCE
2288 subroutine fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,& 2289 kg_kin,kg_kout,mgfft,mpi_enreg,ndat,ngfft,npwin,npwout,n4,n5,n6,option,& 2290 tim_fourwf,weight_r,weight_i, & 2291 weight_array_r,weight_array_i,gpu_option,use_ndo,fofginb) ! Optional arguments 2292 2293 !Arguments ------------------------------------ 2294 !scalars 2295 integer,intent(in) :: cplex,istwf_k,mgfft,n4,n5,n6,ndat,npwin,npwout,option 2296 integer,intent(in) :: tim_fourwf 2297 integer,intent(in),optional :: gpu_option,use_ndo 2298 real(dp),intent(in) :: weight_r,weight_i 2299 real(dp),intent(in),optional,target :: weight_array_r(ndat),weight_array_i(ndat) 2300 type(MPI_type),intent(in) :: mpi_enreg 2301 !arrays 2302 integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2) 2303 integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18) 2304 real(dp),intent(inout) :: denpot(cplex*n4,n5,n6),fofgin(2,npwin*ndat) 2305 real(dp),intent(inout),optional :: fofginb(:,:) ! (2,npwin*ndat) 2306 real(dp),intent(inout) :: fofr(2,n4,n5,n6*ndat) 2307 real(dp),intent(out) :: fofgout(2,npwout*ndat) 2308 2309 !Local variables------------------------------- 2310 !scalars 2311 integer :: fftalg,fftalga,fftalgc,fftcache,i1,i2,i2_local,i3,i3_local,i3_glob,idat,ier 2312 integer :: iflag,ig,comm_fft,me_g0,me_fft,n1,n2,n3,nd2proc,nd3proc 2313 integer :: nfftot,nproc_fft,option_ccfft,paral_kgb,gpu_option_ 2314 real(dp) :: fim,fre,xnorm 2315 character(len=500) :: msg 2316 logical :: luse_ndo 2317 !arrays 2318 integer,parameter :: shiftg0(3)=0 2319 integer,parameter :: symmE(3,3)=reshape([1,0,0,0,1,0,0,0,1],[3,3]) 2320 integer, ABI_CONTIGUOUS pointer :: fftn2_distrib(:),ffti2_local(:) 2321 integer, ABI_CONTIGUOUS pointer :: fftn3_distrib(:),ffti3_local(:) 2322 real(dp) :: tsec(2) 2323 real(dp),allocatable :: work1(:,:,:,:),work2(:,:,:,:),work3(:,:,:,:) 2324 real(dp),allocatable :: work4(:,:,:,:),work_sum(:,:,:,:) 2325 real(dp),pointer :: weight_ptr_r(:),weight_ptr_i(:) 2326 2327 ! ************************************************************************* 2328 2329 ! Accumulate timing 2330 call timab(840+tim_fourwf,1,tsec) 2331 2332 if (fourwf_counter>=0) then 2333 fourwf_counter = fourwf_counter + ndat 2334 if (option==2) fourwf_counter = fourwf_counter + ndat 2335 end if 2336 2337 n1=ngfft(1); n2=ngfft(2); n3=ngfft(3); nfftot=n1*n2*n3 2338 fftcache=ngfft(8) 2339 fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10) 2340 me_fft=ngfft(11) 2341 nproc_fft=ngfft(10) 2342 2343 comm_fft = mpi_enreg%comm_fft; me_g0 = mpi_enreg%me_g0_fft 2344 paral_kgb = mpi_enreg%paral_kgb 2345 2346 !if (ndat/=1) then 2347 ! write(std_out,*)fftalg 2348 ! ABI_ERROR("Really? I thought nobody uses ndat > 1") 2349 !end if 2350 2351 !if (weight_r /= weight_i) then 2352 ! write(std_out,*)fftalg 2353 ! ABI_ERROR("Really? I thought nobody uses weight_r != weight_i") 2354 !end if 2355 2356 !if (option == 0 .and. fftalgc == 0) then 2357 ! ABI_ERROR("Option 0 is buggy when fftalgc ==0 is used!") 2358 !end if 2359 2360 !GPU version of fourwf 2361 gpu_option_=ABI_GPU_DISABLED 2362 if (PRESENT(gpu_option)) gpu_option_=gpu_option 2363 2364 if(gpu_option_/=ABI_GPU_DISABLED) then 2365 if (present(weight_array_r)) then 2366 weight_ptr_r => weight_array_r 2367 else 2368 ABI_MALLOC(weight_ptr_r,(ndat)) 2369 weight_ptr_r(:)=weight_r 2370 end if 2371 if (present(weight_array_i)) then 2372 weight_ptr_i => weight_array_i 2373 else 2374 ABI_MALLOC(weight_ptr_i,(ndat)) 2375 weight_ptr_i(:)=weight_i 2376 end if 2377 if(gpu_option_==ABI_GPU_LEGACY) then 2378 #if defined HAVE_GPU_CUDA 2379 call gpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,& 2380 kg_kin,kg_kout,mgfft,mpi_enreg,ndat,ngfft,npwin,npwout,n4,n5,n6,option,& 2381 paral_kgb,tim_fourwf,weight_ptr_r,weight_ptr_i) !,use_ndo,fofginb) 2382 #endif 2383 else if(gpu_option_==ABI_GPU_KOKKOS) then 2384 #if defined HAVE_GPU_CUDA && defined HAVE_YAKL 2385 call gpu_fourwf_managed(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,& 2386 kg_kin,kg_kout,mgfft,mpi_enreg,ndat,ngfft,npwin,npwout,n4,n5,n6,option,& 2387 paral_kgb,tim_fourwf,weight_ptr_r,weight_ptr_i) !,use_ndo,fofginb) 2388 #endif 2389 else if(gpu_option_==ABI_GPU_OPENMP) then 2390 #ifdef HAVE_OPENMP_OFFLOAD 2391 call ompgpu_fourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,& 2392 kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,n4,n5,n6,option,& 2393 weight_ptr_r,weight_ptr_i) !,use_ndo,fofginb) 2394 #endif 2395 end if 2396 if (.not.present(weight_array_r)) then 2397 ABI_FREE(weight_ptr_r) 2398 end if 2399 if (.not.present(weight_array_i)) then 2400 ABI_FREE(weight_ptr_i) 2401 end if 2402 call timab(840+tim_fourwf,2,tsec) 2403 return 2404 end if ! GPU 2405 2406 if ((fftalgc < 0 .or. fftalgc > 2)) then 2407 write(msg, '(a,i0,5a)' )& 2408 'The input algorithm number fftalg= ',fftalg,' is not allowed.',ch10,& 2409 'The third digit, fftalg(C), must be 0, 1, or 2',ch10,& 2410 'Action: change fftalg in your input file.' 2411 ABI_ERROR(msg) 2412 end if 2413 2414 if (fftalgc /= 0 .and. ALL(fftalga /= [1,3,4,5])) then 2415 write(msg, '(a,i0,5a)' )& 2416 'The input algorithm number fftalg= ',fftalg,' is not allowed.',ch10,& 2417 'The first digit must be 1,3,4 when the last digit is not 0.',ch10,& 2418 'Action: change fftalg in your input file.' 2419 ABI_ERROR(msg) 2420 end if 2421 2422 if (option < 0 .or. option > 3)then 2423 write(msg, '(a,i0,3a)' )& 2424 'The option number ',option,' is not allowed.',ch10,& 2425 'Only option=0, 1, 2 or 3 are allowed presently.' 2426 ABI_ERROR(msg) 2427 end if 2428 2429 if (option == 1 .and. cplex /= 1) then 2430 write(msg, '(3a,i0,a)' )& 2431 'With the option number 1, cplex must be 1,',ch10,& 2432 'but it is cplex= ',cplex,'.' 2433 ABI_ERROR(msg) 2434 end if 2435 2436 if (option==2 .and. (cplex/=1 .and. cplex/=2)) then 2437 write(msg, '(3a,i0,a)' )& 2438 'With the option number 2, cplex must be 1 or 2,',ch10,& 2439 'but it is cplex= ',cplex,'.' 2440 ABI_ERROR(msg) 2441 end if 2442 2443 ! DMFT uses its own FFT algorithm (that should be wrapped in a different routine!) 2444 luse_ndo=.false. 2445 if (present(use_ndo).and.present(fofginb)) then 2446 if(use_ndo==1) then 2447 luse_ndo=.true. 2448 if((size(fofginb,2)==0)) then 2449 write(msg, '(a,a,a,i4,i5)' )& 2450 'fofginb has a dimension equal to zero and use_ndo==1',ch10,& 2451 'Action: check dimension of fofginb',size(fofginb,2),use_ndo 2452 ABI_ERROR(msg) 2453 end if 2454 end if 2455 end if 2456 2457 if (luse_ndo) then 2458 if (.not.(fftalgc==2 .and. option/=3)) then 2459 ABI_ERROR("luse_ndo but not .not.(fftalgc==2 .and. option/=3)") 2460 end if 2461 ABI_CHECK(nproc_fft==1, "DMFT with nproc_fft != 1") 2462 ABI_CHECK(ndat == 1, "use_ndo and ndat != 1 not coded") 2463 2464 call sg_fftrisc_2(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,& 2465 istwf_k,kg_kin,kg_kout,& 2466 mgfft,ngfft,npwin,npwout,n4,n5,n6,option,weight_r,weight_2=weight_i,luse_ndo=luse_ndo,fofgin_p=fofginb) 2467 goto 100 2468 end if 2469 2470 ! Get the distrib associated with this fft_grid => for i2 and i3 planes 2471 call ptabs_fourwf(mpi_enreg,n2,n3,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local) 2472 2473 ! Branch immediately depending on nproc_fft 2474 if (nproc_fft > 1 .and. fftalg /= 412) then 2475 call fourwf_mpi(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,& 2476 istwf_k,kg_kin,kg_kout,me_g0,mgfft,ngfft,mpi_enreg%distribfft,n1,n2,n3,npwin,npwout,& 2477 n4,n5,n6,ndat,option,weight_r,weight_i,comm_fft) 2478 goto 100 2479 end if 2480 2481 select case (fftalga) 2482 2483 case (FFT_FFTW3) 2484 if (luse_ndo) ABI_ERROR("luse_ndo not supported by FFTW3") 2485 if (nproc_fft == 1) then 2486 ! call wrtout(std_out, "FFTW3_SEQFOURWF") 2487 call fftw3_seqfourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,& 2488 kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,n4,n5,n6,option,weight_r,weight_i) 2489 else 2490 ABI_ERROR("Not coded") 2491 end if 2492 2493 case (FFT_DFTI) 2494 if (luse_ndo) ABI_ERROR("luse_ndo not supported by DFTI") 2495 if (nproc_fft == 1) then 2496 ! call wrtout(std_out, "DFTI_SEQFOURWF") 2497 call dfti_seqfourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,& 2498 kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,n4,n5,n6,option,weight_r,weight_i) 2499 else 2500 ABI_ERROR("Not coded") 2501 end if 2502 2503 case default 2504 ! TODO: Clean this code! 2505 2506 ! Here, use routines that make forwards FFT separately of backwards FFT, 2507 ! in particular, usual 3DFFT library routines, called in ccfft. 2508 if (fftalgc==0 .or. (fftalgc==1 .and. fftalga/=4) .or. & 2509 (fftalgc==2 .and. fftalga/=4 .and. option==3) )then 2510 2511 ABI_MALLOC(work1,(2,n4,n5,n6*ndat)) 2512 2513 if (option/=3)then 2514 ! Insert fofgin into the fft box (array fofr) 2515 2516 if (fftalga/=4)then 2517 iflag=1 2518 call sphere(fofgin,ndat,npwin,fofr,n1,n2,n3,n4,n5,n6,kg_kin,istwf_k,iflag,me_g0,shiftg0,symmE,one) 2519 2520 else if (fftalga==4 .and. fftalgc==0) then 2521 ! Note the switch of n5 and n6, as they are only 2522 ! needed to dimension work2 inside "sphere" 2523 ABI_MALLOC(work2,(2,n4,n6,n5*ndat)) 2524 2525 iflag=2 2526 nd2proc=((n2-1)/nproc_fft) +1 2527 nd3proc=((n6-1)/nproc_fft) +1 2528 ABI_MALLOC(work3,(2,n4,n6,nd2proc*ndat)) 2529 ABI_MALLOC(work4,(2,n4,n5,nd3proc*ndat)) 2530 2531 if (istwf_k == 1 .and. paral_kgb==1) then 2532 ! sphere dont need a big array 2533 work3=zero 2534 call sphere_fft(fofgin,ndat,npwin,work3,n1,n2,n3,n4,n6,kg_kin,& 2535 mpi_enreg%distribfft%tab_fftwf2_local,nd2proc) 2536 else 2537 ! sphere needs a big array and communications 2538 if (nproc_fft == 1 .and. ndat == 1 .and. istwf_k == 1) then 2539 ! dimensions of tab work3 and work2 are identical no need to use work2 2540 work3=zero 2541 call sphere(fofgin,ndat,npwin,work3,n1,n2,n3,n4,n6,nd2proc,& 2542 kg_kin,istwf_k,iflag,me_g0,shiftg0,symmE,one) 2543 else 2544 work2=zero 2545 call sphere(fofgin,ndat,npwin,work2,n1,n2,n3,n4,n6,n5,& 2546 kg_kin,istwf_k,iflag,me_g0,shiftg0,symmE,one) 2547 2548 if (paral_kgb==1 .and. istwf_k > 1) then 2549 ! Collect G-vectors on each node 2550 work3=zero 2551 ABI_MALLOC(work_sum,(2,n4,n6,n5*ndat)) 2552 call timab(48,1,tsec) 2553 call xmpi_sum(work2,work_sum,2*n4*n6*n5*ndat,comm_fft,ier) 2554 call timab(48,2,tsec) 2555 2556 ! Extract my list of G-vectors needed for MPI-FFT. 2557 do idat=1,ndat 2558 do i2=1,n2 2559 if( fftn2_distrib(i2) == me_fft) then 2560 i2_local = ffti2_local(i2) + nd2proc*(idat-1) 2561 do i3=1,n3 2562 do i1=1,n1 2563 work3(1,i1,i3,i2_local)=work_sum(1,i1,i3,i2+n5*(idat-1)) 2564 work3(2,i1,i3,i2_local)=work_sum(2,i1,i3,i2+n5*(idat-1)) 2565 end do 2566 end do 2567 end if 2568 end do 2569 end do 2570 ABI_FREE(work_sum) 2571 end if 2572 2573 if (paral_kgb/=1) then 2574 do idat=1,ndat 2575 do i2=1,n2 2576 do i3=1,n3 2577 do i1=1,n1 2578 work3(1,i1,i3,i2+nd2proc*(idat-1))=work2(1,i1,i3,i2+n5*(idat-1)) 2579 work3(2,i1,i3,i2+nd2proc*(idat-1))=work2(2,i1,i3,i2+n5*(idat-1)) 2580 end do 2581 end do 2582 end do 2583 end do 2584 end if 2585 end if 2586 end if 2587 if (paral_kgb==1) then 2588 option_ccfft=1 2589 else 2590 option_ccfft=2 2591 end if 2592 end if 2593 2594 ! Fourier transform fofr (reciprocal to real space) 2595 ! The output might be in work1 or fofr, depending on inplace 2596 if (fftalgc==0) then 2597 if (fftalga/=4) then 2598 ! Call usual 3DFFT library routines 2599 call ccfft(ngfft,+1,n1,n2,n3,n4,n5,n6,ndat,2,fofr,work1,comm_fft) 2600 else 2601 ! SG simplest complex-to-complex routine 2602 call ccfft(ngfft,+1,n1,n2,n3,n4,n5,n6,ndat,option_ccfft,work3,work4,comm_fft) 2603 ABI_FREE(work2) 2604 ABI_FREE(work3) 2605 end if 2606 else 2607 ! Call SG routine, with zero padding 2608 call sg_fftpad(fftcache,mgfft,n1,n2,n3,n4,n5,n6,ndat,gboundin,+1,fofr,work1) 2609 end if 2610 end if ! option/=3 2611 2612 ! Note that if option==0 everything is alright already, the output is available in fofr. 2613 ! MG: TODO: Rewrite this mess in human-readable form! 2614 if (option==0) then 2615 if (fftalgc==0) then 2616 if (fftalga/=4) then 2617 call DCOPY(2*n4*n5*n6*ndat,work1,1,fofr,1) 2618 else 2619 call DCOPY(2*n4*n5*n6*ndat,work4,1,fofr,1) 2620 end if 2621 else 2622 ! Results are copied to fofr. 2623 call DCOPY(2*n4*n5*n6*ndat,work1,1,fofr,1) 2624 end if 2625 end if 2626 2627 if (option==1) then 2628 ! Accumulate density 2629 if ((fftalgc==0) .and. (fftalga==4)) then 2630 do idat=1,ndat 2631 do i3=1,n3 2632 if( me_fft == fftn3_distrib(i3) ) then 2633 i3_local = ffti3_local(i3) + nd3proc*(idat-1) 2634 do i2=1,n2 2635 do i1=1,n1 2636 denpot(i1,i2,i3)=denpot(i1,i2,i3)+& 2637 weight_r*work4(1,i1,i2,i3_local)**2+& 2638 weight_i*work4(2,i1,i2,i3_local)**2 2639 end do 2640 end do 2641 end if 2642 end do 2643 end do ! idat 2644 else 2645 call cg_addtorho(n1,n2,n3,n4,n5,n6,ndat,weight_r,weight_i,work1,denpot) 2646 end if 2647 end if ! option==1 2648 2649 if (option==2) then 2650 ! Apply local potential 2651 if (cplex==1) then 2652 2653 if ((fftalgc==0) .and. (fftalga==4)) then 2654 !$OMP PARALLEL DO PRIVATE(i3_local,i3_glob) 2655 do idat=1,ndat 2656 do i3=1,n3 2657 if( me_fft == fftn3_distrib(i3) ) then 2658 i3_local = ffti3_local(i3) + nd3proc*(idat-1) 2659 i3_glob = i3+n3*(idat-1) 2660 do i2=1,n2 2661 do i1=1,n1 2662 fofr(1,i1,i2,i3_glob)= denpot(i1,i2,i3)*work4(1,i1,i2,i3_local) 2663 fofr(2,i1,i2,i3_glob)= denpot(i1,i2,i3)*work4(2,i1,i2,i3_local) 2664 end do 2665 end do 2666 end if 2667 end do 2668 end do 2669 end if 2670 if ((fftalgc/=0) .or. (fftalga/=4)) then 2671 !$OMP PARALLEL DO PRIVATE(i3_glob) 2672 do idat=1,ndat 2673 do i3=1,n3 2674 if( me_fft == fftn3_distrib(i3) ) then 2675 i3_glob = i3+n3*(idat-1) 2676 do i2=1,n2 2677 do i1=1,n1 2678 fofr(1,i1,i2,i3_glob)=denpot(i1,i2,i3)*work1(1,i1,i2,i3+n3*(idat-1)) 2679 fofr(2,i1,i2,i3_glob)=denpot(i1,i2,i3)*work1(2,i1,i2,i3+n3*(idat-1)) 2680 end do 2681 end do 2682 end if 2683 end do 2684 end do 2685 end if 2686 2687 else if (cplex==2) then 2688 if ((fftalgc==0) .and. (fftalga==4)) then 2689 !$OMP PARALLEL DO PRIVATE(fre,fim,i3_local,i3_glob) 2690 do idat=1,ndat 2691 do i3=1,n3 2692 if( me_fft == fftn3_distrib(i3) ) then 2693 i3_local = ffti3_local(i3) + nd3proc*(idat-1) 2694 i3_glob = i3+n3*(idat-1) 2695 do i2=1,n2 2696 do i1=1,n1 2697 fre=work4(1,i1,i2,i3_local) 2698 fim=work4(2,i1,i2,i3_local) 2699 fofr(1,i1,i2,i3_glob)=denpot(2*i1-1,i2,i3)*fre -denpot(2*i1,i2,i3)*fim 2700 fofr(2,i1,i2,i3_glob)=denpot(2*i1-1,i2,i3)*fim +denpot(2*i1,i2,i3)*fre 2701 end do 2702 end do 2703 end if 2704 end do 2705 end do 2706 end if 2707 2708 if ((fftalgc/=0) .or. (fftalga/=4)) then 2709 !$OMP PARALLEL DO PRIVATE(fre,fim,i3_glob) 2710 do idat=1,ndat 2711 do i3=1,n3 2712 if( me_fft == fftn3_distrib(i3) ) then 2713 i3_glob = i3+n3*(idat-1) 2714 do i2=1,n2 2715 do i1=1,n1 2716 fre=work1(1,i1,i2,i3+n3*(idat-1)) 2717 fim=work1(2,i1,i2,i3+n3*(idat-1)) 2718 fofr(1,i1,i2,i3_glob)=denpot(2*i1-1,i2,i3)*fre -denpot(2*i1,i2,i3)*fim 2719 fofr(2,i1,i2,i3_glob)=denpot(2*i1-1,i2,i3)*fim +denpot(2*i1,i2,i3)*fre 2720 end do 2721 end do 2722 end if 2723 end do 2724 end do 2725 end if 2726 end if ! cplex=2 2727 2728 end if ! option=2 2729 2730 ! The data for option==2 or option==3 is now in fofr. 2731 if (option==2 .or. option==3) then 2732 2733 if (fftalgc==0) then 2734 ! Call usual 3DFFT library routines or SG simplest complex-to-complex routine 2735 if (fftalga==FFT_SG2002) then 2736 ABI_FREE(work1) 2737 ABI_MALLOC(work1,(2,n4,n6,n5*ndat)) 2738 end if 2739 2740 if (option==3 .or. fftalga/=4) then 2741 call ccfft(ngfft,-1,n1,n2,n3,n4,n5,n6,ndat,2,fofr,work1,comm_fft) 2742 else 2743 ! creation of small arrays 2744 ! nd3proc=((n5-1)/nproc_fft) +1 2745 nd3proc=((n6-1)/nproc_fft) +1 2746 nd2proc=((n2-1)/nproc_fft) +1 2747 ABI_MALLOC(work3,(2,n4,n5,nd3proc*ndat)) 2748 ABI_MALLOC(work2,(2,n4,n6,nd2proc*ndat)) 2749 2750 if (paral_kgb==1) then 2751 2752 if (cplex==1) then 2753 do idat=1,ndat 2754 do i3=1,n3 2755 if( me_fft == fftn3_distrib(i3) ) then 2756 i3_local = ffti3_local(i3) + nd3proc*(idat-1) 2757 do i2=1,n2 2758 do i1=1,n1 2759 work3(1,i1,i2,i3_local)=denpot(i1,i2,i3)*work4(1,i1,i2,i3_local) 2760 work3(2,i1,i2,i3_local)=denpot(i1,i2,i3)*work4(2,i1,i2,i3_local) 2761 end do 2762 end do 2763 end if 2764 end do 2765 end do 2766 else 2767 do idat=1,ndat 2768 do i3=1,n3 2769 if( me_fft == fftn3_distrib(i3) ) then 2770 i3_local = ffti3_local(i3) + nd3proc*(idat-1) 2771 do i2=1,n2 2772 do i1=1,n1 2773 fre=work4(1,i1,i2,i3_local) 2774 fim=work4(2,i1,i2,i3_local) 2775 work3(1,i1,i2,i3_local) = denpot(2*i1-1,i2,i3)*fre-denpot(2*i1,i2,i3)*fim 2776 work3(2,i1,i2,i3_local) = denpot(2*i1-1,i2,i3)*fim+denpot(2*i1,i2,i3)*fre 2777 end do 2778 end do 2779 end if 2780 end do 2781 end do 2782 end if 2783 option_ccfft=1 2784 2785 else 2786 if (nproc_fft /=1 .or. ndat /= 1 ) then 2787 do idat=1,ndat 2788 do i3=1,n3 2789 do i2=1,n2 2790 do i1=1,n1 2791 work3(1,i1,i2,i3+nd3proc*(idat-1))=fofr(1,i1,i2,i3+n3*(idat-1)) 2792 work3(2,i1,i2,i3+nd3proc*(idat-1))=fofr(2,i1,i2,i3+n3*(idat-1)) 2793 end do 2794 end do 2795 end do 2796 end do 2797 option_ccfft=2 2798 end if 2799 end if 2800 2801 if (paral_kgb==1) then 2802 call ccfft(ngfft,-1,n1,n2,n3,n4,n5,n6,ndat,option_ccfft,work3,work2,comm_fft) 2803 else 2804 if (nproc_fft /=1 .or. ndat /= 1 ) then 2805 call ccfft(ngfft,-1,n1,n2,n3,n4,n5,n6,ndat,option_ccfft,work3,work2,comm_fft) 2806 else 2807 call ccfft(ngfft,-1,n1,n2,n3,n4,n5,n6,ndat,option_ccfft,fofr,work1,comm_fft) 2808 end if 2809 end if 2810 2811 ! load of work1 2812 if ((paral_kgb==1) .and. ( istwf_k > 1 )) work1(:,:,:,:)=zero 2813 2814 if (paral_kgb==1) then 2815 if ( istwf_k > 1 ) then 2816 do idat=1,ndat 2817 do i2=1,n2 2818 if( me_fft == fftn2_distrib(i2) ) then 2819 i2_local = ffti2_local(i2) + nd2proc*(idat-1) 2820 do i3=1,n3 2821 do i1=1,n1 2822 work1(1,i1,i3,i2+n5*(idat-1))= work2(1,i1,i3,i2_local) 2823 work1(2,i1,i3,i2+n5*(idat-1))= work2(2,i1,i3,i2_local) 2824 end do 2825 end do 2826 end if 2827 end do 2828 end do 2829 end if 2830 2831 else 2832 if (nproc_fft /=1 .or. ndat /= 1 ) then 2833 do idat=1,ndat 2834 do i2=1,n2 2835 do i3=1,n3 2836 do i1=1,n1 2837 work1(1,i1,i3,i2+n5*(idat-1))=work2(1,i1,i3,i2+nd2proc*(idat-1)) 2838 work1(2,i1,i3,i2+n5*(idat-1))=work2(2,i1,i3,i2+nd2proc*(idat-1)) 2839 end do 2840 end do 2841 end do 2842 end do 2843 end if 2844 end if 2845 ABI_FREE(work3) 2846 if ((paral_kgb==1) .and. ( istwf_k > 1 )) then 2847 call timab(48,1,tsec) 2848 call xmpi_sum(work1,comm_fft,ier) 2849 call timab(48,2,tsec) 2850 end if 2851 end if 2852 2853 else 2854 ! Call SG routine, with zero padding 2855 call sg_fftpad(fftcache,mgfft,n1,n2,n3,n4,n5,n6,ndat,gboundout,-1,fofr,work1) 2856 end if 2857 2858 xnorm = one/dble(nfftot) 2859 2860 if (fftalga/=4) then 2861 call cg_box2gsph(n1,n2,n3,n4,n5,n6,ndat,npwout,kg_kout,work1,fofgout, rscal=xnorm) 2862 else 2863 ! if fftalga==4 2864 if ((paral_kgb==1) .and. ( istwf_k == 1 )) then 2865 !$OMP PARALLEL DO PRIVATE(i1,i2,i3,i2_local) 2866 do idat=1,ndat 2867 do ig=1,npwout 2868 i1=kg_kout(1,ig); if(i1<0)i1=i1+n1 ; i1=i1+1 2869 i2=kg_kout(2,ig); if(i2<0)i2=i2+n2 ; i2=i2+1 2870 i3=kg_kout(3,ig); if(i3<0)i3=i3+n3 ; i3=i3+1 2871 i2_local = ffti2_local(i2) + nd2proc*(idat-1) 2872 fofgout(1,ig+npwout*(idat-1))= work2(1,i1,i3,i2_local)*xnorm 2873 fofgout(2,ig+npwout*(idat-1))= work2(2,i1,i3,i2_local)*xnorm 2874 end do 2875 end do 2876 ABI_FREE(work2) 2877 else 2878 !$OMP PARALLEL DO PRIVATE(i1,i2,i3) 2879 do idat=1,ndat 2880 do ig=1,npwout 2881 i1=kg_kout(1,ig); if(i1<0)i1=i1+n1; i1=i1+1 2882 i2=kg_kout(2,ig); if(i2<0)i2=i2+n2; i2=i2+1 2883 i3=kg_kout(3,ig); if(i3<0)i3=i3+n3; i3=i3+1 2884 fofgout(1,ig+npwout*(idat-1))=work1(1,i1,i3,i2+n5*(idat-1))*xnorm 2885 fofgout(2,ig+npwout*(idat-1))=work1(2,i1,i3,i2+n5*(idat-1))*xnorm 2886 end do 2887 end do 2888 end if 2889 end if ! fftalga 2890 end if ! if option==2 or 3 2891 2892 ABI_SFREE(work1) 2893 end if 2894 2895 ! Here, call more specialized 3-dimensional fft 2896 ! (zero padding as well as maximize cache reuse) based on S Goedecker routines. 2897 ! Specially tuned for cache architectures. 2898 if (fftalga==FFT_SG .and. fftalgc==2 .and. option/=3) then 2899 call sg_fftrisc(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,& 2900 istwf_k,kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,n4,n5,n6,option,weight_r,weight_i) 2901 end if 2902 2903 ! Here, call new FFT from S Goedecker, also sophisticated specialized 3-dimensional fft 2904 ! (zero padding as well as maximize cache reuse) 2905 if (fftalga==FFT_SG2002 .and. fftalgc/=0) then 2906 ! The args are not the same as fourwf, but might be 2907 call fourwf_mpi(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,& 2908 istwf_k,kg_kin,kg_kout,me_g0,mgfft,ngfft,mpi_enreg%distribfft,n1,n2,n3,npwin,npwout,& 2909 n4,n5,n6,ndat,option,weight_r,weight_i,comm_fft) 2910 end if 2911 2912 if (option==0.and.(fftalga==FFT_SG.or.fftalga==FFT_SG2002)) then 2913 ! In these cases, add the periodic image of the borders so all fofr components are computed 2914 do i3=1,n3 2915 if (n1==n4-1) then 2916 do i2=1,n2 2917 fofr(:,n4,i2,i3)=fofr(:,1,i2,i3) 2918 end do 2919 end if 2920 if (n2==n5-1) then 2921 fofr(:,:,n5,i3)=fofr(:,:,1,i3) 2922 end if 2923 end do 2924 end if 2925 2926 ABI_SFREE(work4) 2927 ABI_SFREE(work2) 2928 end select 2929 2930 ! Accumulate timing 2931 100 continue 2932 call timab(840+tim_fourwf,2,tsec) 2933 2934 end subroutine fourwf
ABINIT/m_fft [ Modules ]
NAME
m_fft
FUNCTION
This module provides driver routines for sequential FFTs (OpenMP threads are supported). It also defines generic interfaces for single or double precision FFTs.
COPYRIGHT
Copyright (C) 2009-2024 ABINIT group (MG, MM, GZ, MT, MF, XG, PT, FF) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
17 #if defined HAVE_CONFIG_H 18 #include "config.h" 19 #endif 20 21 #include "abi_common.h" 22 23 MODULE m_fft 24 25 use defs_basis 26 use m_abicore 27 use m_errors 28 use m_xomp 29 use m_xmpi 30 use m_cplxtools 31 use m_cgtools 32 use m_sgfft 33 use m_sg2002 34 use m_fftw3 35 use m_dfti 36 #if defined HAVE_MPI2 37 use mpi 38 #endif 39 40 use defs_abitypes, only : MPI_type 41 use defs_fftdata, only : mg 42 use m_time, only : cwtime, timab 43 use m_numeric_tools, only : r2c 44 use m_fstrings, only : sjoin, itoa 45 use m_geometry, only : metric 46 use m_hide_blas, only : xscal 47 use m_fftcore, only : get_cache_kb, kpgsph, get_kg, sphere_fft, sphere_fft1, sphere, change_istwfk, & 48 fftalg_info, fftalg_has_mpi, print_ngfft, getng, sphereboundary 49 use m_mpinfo, only : destroy_mpi_enreg, ptabs_fourdp, ptabs_fourwf, initmpi_seq 50 use m_distribfft, only : distribfft_type, init_distribfft, destroy_distribfft 51 52 #if defined HAVE_GPU_CUDA 53 ! MG: Had to comment this line to avoid "Ambiguous reference to c_ptr on buda2 with CUDA 54 use m_manage_cuda 55 #endif 56 use m_ompgpu_fourwf 57 58 use, intrinsic :: iso_c_binding 59 60 implicit none 61 62 private 63 64 #if defined HAVE_MPI1 65 include 'mpif.h' 66 #endif 67 68 public :: fft_ug ! Driver for zero-padded FFTs u(g) --> u(r) 69 public :: fft_ur ! Driver for zero-padded FFTs u(r) --> u(g) 70 public :: fftpad ! Driver for (low-level) zero-padded FFTs, note that fft_ug is the preferred interface. 71 public :: fft_poisson ! Solve the poisson equation in G-space starting from n(r). 72 public :: fourdp_6d ! Calculate a 6-dimensional Fast Fourier Transform 73 public :: fftpac ! Copy to change the stride of a three-dimension array for more efficient FFT. 74 public :: indirect_parallel_Fourier 75 76 public :: fft_use_lib_threads 77 public :: fft_allow_ialltoall ! Allow the use of non-blocking IALLOTOALL in MPI-FFTs algorithms 78 public :: zerosym ! Symmetrize an array on the FFT grid by vanishing some term on the boundaries. 79 80 ! Main entry points. 81 public :: fourdp 82 public :: fourwf 83 84 integer,public,save :: fourdp_counter = -1 85 integer,public,save :: fourwf_counter = -1 86 public :: fft_init_counters 87 public :: fft_stop_counters 88 public :: fft_output_counters 89 90 ! Driver routines for MPI version. 91 public :: fourdp_mpi ! MPI FFT of densities/potentials on the full box. 92 public :: fourwf_mpi ! specialized MPI-FFT for wavefunctions. 93 !public :: fftmpi_u 94 95 interface fft_ug 96 module procedure fft_ug_sp 97 module procedure fft_ug_dp 98 module procedure fft_ug_spc 99 module procedure fft_ug_dpc 100 end interface fft_ug 101 102 interface fft_ur 103 module procedure fft_ur_dp 104 module procedure fft_ur_spc 105 module procedure fft_ur_dpc 106 end interface fft_ur 107 108 interface fftpad 109 module procedure fftpad_spc 110 module procedure fftpad_dpc 111 end interface fftpad
m_fft/fft_allow_ialltoall [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fft_allow_ialltoall
FUNCTION
Allow the use of non-blocking IALLOTOALL in MPI-FFTs algorithms Mainly used for profiling purposes.
SOURCE
271 subroutine fft_allow_ialltoall(bool) 272 273 !Arguments ------------------------------------ 274 logical,intent(in) :: bool 275 276 ! ************************************************************************* 277 278 ALLOW_IALLTOALL = bool 279 #ifndef HAVE_MPI_IALLTOALL 280 ALLOW_IALLTOALL = .False. 281 #endif 282 283 end subroutine fft_allow_ialltoall
m_fft/fft_poisson [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fft_poisson
FUNCTION
Driver routine to solve the Poisson equation in G-space given the density, n(r), in real space of the FFT box.
INPUTS
ngfft(18)=Info on the 3D FFT. cplex=1 if fofr is real, 2 if fofr is complex nx,ny,nz=Number of FFT points along the three directions. ldx,ldy,ldz=Leading dimension of the array nr and vg. ndat = Number of densities vg(nx*ny*nz)=Potential in reciprocal space.
SIDE EFFECTS
nr(cplex*ldx*ldy*ldz*ndat) input: n(r) (real or complex) output: the hartree potential in real space
NOTES
vg is given on the FFT mesh instead of the augmented mesh [ldx,ldy,ldz] in order to simplify the interface with the other routines operating of vg
SOURCE
1111 subroutine fft_poisson(ngfft, cplex, nx, ny, nz, ldx, ldy, ldz, ndat, vg, nr) 1112 1113 !Arguments ------------------------------------ 1114 !scalars 1115 integer,intent(in) :: cplex,nx,ny,nz,ldx,ldy,ldz,ndat 1116 integer,intent(in) :: ngfft(18) 1117 !arrays 1118 real(dp),intent(inout) :: nr(cplex*ldx*ldy*ldz*ndat) 1119 real(dp),intent(in) :: vg(nx*ny*nz) 1120 1121 !Local variables------------------------------- 1122 integer :: fftalga, fftcache 1123 1124 ! ************************************************************************* 1125 1126 fftalga = ngfft(7)/100; fftcache = ngfft(8) 1127 1128 select case (fftalga) 1129 case (FFT_SG, FFT_SG2002) 1130 ! Note: according to my tests fftalg 1xx is always faster than 4xx in sequential 1131 ! hence it does not make sense to provide a fallback for fftalg 4xx. 1132 ! Use external FFT libraries if you want to run at top-level speed. 1133 call sg_poisson(fftcache,cplex,nx,ny,nz,ldx,ldy,ldz,ndat,vg,nr) 1134 1135 case (FFT_FFTW3) 1136 call fftw3_poisson(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,vg,nr) 1137 1138 !case (FFT_DFTI) 1139 ! call dfti_poisson(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,vg,nr) 1140 1141 case default 1142 ABI_BUG(sjoin("Wrong value for fftalga: ",itoa(fftalga))) 1143 end select 1144 1145 end subroutine fft_poisson
m_fft/fft_ug_dp [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fft_ug_dp
FUNCTION
Driver routine for G-->R transform of wavefunctions with zero-padded FFT. TARGET: double precision real arrays with Re/Im.
INPUTS
See fft_ug_dpc
SOURCE
647 subroutine fft_ug_dp(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug, ur) 648 649 !Arguments ------------------------------------ 650 !scalars 651 integer,intent(in) :: npw_k,nfft,nspinor,istwf_k,mgfft,ndat 652 !arrays 653 integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k) 654 real(dp),target,intent(in) :: ug(*) !2*npw_k*nspinor*ndat) 655 real(dp),target,intent(out) :: ur(*) !2*nfft*nspinor*ndat) 656 657 !Local variables------------------------------- 658 complex(dp),contiguous,pointer :: ug_cplx(:), ur_cplx(:) 659 660 ! ************************************************************************* 661 662 call C_F_pointer(c_loc(ug), ug_cplx, shape=[npw_k*nspinor*ndat]) 663 call C_F_pointer(c_loc(ur), ur_cplx, shape=[nfft*nspinor*ndat]) 664 665 call fft_ug_dpc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug_cplx, ur_cplx) 666 667 end subroutine fft_ug_dp
m_fft/fft_ug_dpc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fft_ug_dpc
FUNCTION
Driver routine for G-->R transform of wavefunctions with zero-padded FFT. TARGET: double precision arrays
INPUTS
npw_k=number of plane waves for this k-point. nfft=Number of FFT points. nspinor=number of spinorial components ndat=Numer of wavefunctions to transform. mgfft=Max number of FFT divisions ngfft(18)=information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft istwfk=Option describing the storage of the wavefunction. (at present must be 1) kg_k(3,npw_k)=G-vectors in reduced coordinates gbound_k_k(2*mgfft+8,2)=Table for padded-FFT. See sphereboundary. ug(npw_k*nspinor*ndat)=wavefunctions in reciprocal space
OUTPUT
ur(nfft*nspinor*ndat)=wavefunctions in real space.
SOURCE
741 subroutine fft_ug_dpc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug, ur) 742 743 !Arguments ------------------------------------ 744 !scalars 745 integer,intent(in) :: npw_k,nfft,nspinor,istwf_k,mgfft,ndat 746 !arrays 747 integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k) 748 complex(dpc),intent(in) :: ug(*) !npw_k*nspinor*ndat) 749 complex(dpc),intent(out) :: ur(*) !nfft*nspinor*ndat) 750 751 ! ************************************************************************* 752 753 #include "fftug_driver.finc" 754 755 end subroutine fft_ug_dpc
m_fft/fft_ug_sp [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fft_ug_sp
FUNCTION
Driver routine for G-->R transform of wavefunctions with zero-padded FFT. TARGET: single precision real arrays with Re/Im.
INPUTS
See fft_ug_dpc
SOURCE
611 subroutine fft_ug_sp(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug, ur) 612 613 !Arguments ------------------------------------ 614 !scalars 615 integer,intent(in) :: npw_k,nfft,nspinor,istwf_k,mgfft,ndat 616 !arrays 617 integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k) 618 real(sp),target,intent(in) :: ug(2*npw_k*nspinor*ndat) 619 real(sp),target,intent(out) :: ur(2*nfft*nspinor*ndat) 620 621 !Local variables------------------------------- 622 complex(sp),contiguous,pointer :: ug_cplx(:), ur_cplx(:) 623 624 ! ************************************************************************* 625 626 call C_F_pointer(c_loc(ug), ug_cplx, shape=[npw_k*nspinor*ndat]) 627 call C_F_pointer(c_loc(ur), ur_cplx, shape=[nfft*nspinor*ndat]) 628 629 call fft_ug_spc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug_cplx, ur_cplx) 630 631 end subroutine fft_ug_sp
m_fft/fft_ug_spc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fft_ug_spc
FUNCTION
Driver routine for G-->R transform of wavefunctions with zero-padded FFT. TARGET: single precision arrays
INPUTS
npw_k=number of plane waves for this k-point. nfft=Number of FFT points. nspinor=number of spinorial components ndat=Numer of wavefunctions to transform. mgfft=Max number of FFT divisions ngfft(18)=information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft istwfk=Option describing the storage of the wavefunction. (at present must be 1) kg_k(3,npw_k)=G-vectors in reduced coordinates gbound_k_k(2*mgfft+8,2)=Table for padded-FFT. See sphereboundary. ug(npw_k*nspinor*ndat)=wavefunctions in reciprocal space
OUTPUT
ur(nfft*nspinor*ndat)=wavefunctions in real space.
SOURCE
697 subroutine fft_ug_spc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ug, ur) 698 699 !Arguments ------------------------------------ 700 !scalars 701 integer,intent(in) :: npw_k,nfft,nspinor,istwf_k,mgfft,ndat 702 !arrays 703 integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k) 704 complex(spc),intent(in) :: ug(*) !npw_k*nspinor*ndat) 705 complex(spc),intent(out) :: ur(*) !nfft*nspinor*ndat) 706 707 ! ************************************************************************* 708 709 #include "fftug_driver.finc" 710 711 end subroutine fft_ug_spc
m_fft/fft_ur_dp [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fft_ur_dp
FUNCTION
Compute ndat zero-padded FFTs from R- to G-space . Mainly used for the transform of wavefunctions. TARGET: double precision real arrays with re/im
INPUTS
See fft_ur_dpc
SOURCE
772 subroutine fft_ur_dp(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ur, ug) 773 774 !Arguments ------------------------------------ 775 !scalars 776 integer,intent(in) :: npw_k,nfft,nspinor,ndat,istwf_k,mgfft 777 !arrays 778 integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2), kg_k(3,npw_k) 779 real(dp),target,intent(inout) :: ur(*) !2*nfft*nspinor*ndat) 780 real(dp),target,intent(out) :: ug(*) !2*npw_k*nspinor*ndat) 781 782 !Local variables------------------------------- 783 complex(dp),contiguous,pointer :: ug_cplx(:), ur_cplx(:) 784 785 ! ************************************************************************* 786 787 call C_F_pointer(c_loc(ug), ug_cplx, shape=[npw_k*nspinor*ndat]) 788 call C_F_pointer(c_loc(ur), ur_cplx, shape=[nfft*nspinor*ndat]) 789 call fft_ur_dpc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ur_cplx, ug_cplx) 790 791 end subroutine fft_ur_dp
m_fft/fft_ur_dpc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fft_ur_dpc
FUNCTION
Compute ndat zero-padded FFTs from R- to G-space . Mainly used for the transform of wavefunctions. TARGET: dpc complex arrays
INPUTS
npw_k=number of plane waves for this k-point. nfft=Number of FFT points. nspinor=number of spinorial components ndat=Number of wavefunctions to transform. mgfft=Max number of FFT divisions ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft istwfk=Option describing the storage of the wavefunction. (at present must be 1) kg_k(3,npw_k)=G-vectors in reduced coordinates gbound_k(2*mgfft+8,2)=Table for padded-FFT. See sphereboundary.
SIDE EFFECTS
ur(nfft*nspinor*ndat)=In input: wavefunctions in real space Destroyed in output. Do not use ur anymore!
OUTPUT
ug(npw_k*nspinor*ndat)=wavefunctions in reciprocal space given on the G-sphere.
SOURCE
873 subroutine fft_ur_dpc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ur, ug) 874 875 !Arguments ------------------------------------ 876 !scalars 877 integer,intent(in) :: npw_k,nfft,nspinor,ndat,istwf_k,mgfft 878 !arrays 879 integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k) 880 complex(dpc),intent(inout) :: ur(*) ! nfft*nspinor*ndat) 881 complex(dpc),intent(out) :: ug(*) ! npw_k*nspinor*ndat) 882 883 ! ************************************************************************* 884 885 #include "fftur_driver.finc" 886 887 end subroutine fft_ur_dpc
m_fft/fft_ur_spc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fft_ur_spc
FUNCTION
Compute ndat zero-padded FFTs from R- to G-space . Mainly used for the transform of wavefunctions. TARGET: spc complex arrays
INPUTS
npw_k=number of plane waves for this k-point. nfft=Number of FFT points. nspinor=number of spinorial components ndat=Number of wavefunctions to transform. mgfft=Max number of FFT divisions ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft istwfk=Option describing the storage of the wavefunction. (at present must be 1) kg_k(3,npw_k)=G-vectors in reduced coordinates gbound_k(2*mgfft+8,2)=Table for padded-FFT. See sphereboundary.
SIDE EFFECTS
ur(nfft*nspinor*ndat)=In input: wavefunctions in real space Destroyed in output. Do not use ur anymore!
OUTPUT
ug(npw_k*nspinor*ndat)=wavefunctions in reciprocal space given on the G-sphere.
SOURCE
825 subroutine fft_ur_spc(npw_k, nfft, nspinor, ndat, mgfft, ngfft, istwf_k, kg_k, gbound_k, ur, ug) 826 827 !Arguments ------------------------------------ 828 !scalars 829 integer,intent(in) :: npw_k,nfft,nspinor,ndat,istwf_k,mgfft 830 !arrays 831 integer,intent(in) :: ngfft(18),gbound_k(2*mgfft+8,2),kg_k(3,npw_k) 832 complex(spc),intent(inout) :: ur(*) !nfft*nspinor*ndat) 833 complex(spc),intent(out) :: ug(*) !npw_k*nspinor*ndat) 834 835 ! ************************************************************************* 836 837 #include "fftur_driver.finc" 838 839 end subroutine fft_ur_spc
m_fft/fft_use_lib_threads [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fft_use_lib_threads
FUNCTION
INPUTS
OUTPUT
SOURCE
1162 subroutine fft_use_lib_threads(logvar) 1163 1164 !Arguments ------------------------------------ 1165 !scalars 1166 logical,intent(in) :: logvar 1167 1168 ! ************************************************************************* 1169 1170 call dfti_use_lib_threads(logvar) 1171 call fftw3_use_lib_threads(logvar) 1172 1173 end subroutine fft_use_lib_threads
m_fft/fftbox_execute_ip_dpc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftbox_execute_ip_dpc
FUNCTION
In-place FFT transform of complex arrays Call (FFTW3|DFTI) routines if available, otherwise fallback to SG routines TARGET: dpc arrays
INPUTS
plan<fftbox_plan3_t>=Structure with the parameters defining the transform. isign= Sign of the exponential in the FFT [iscale]= 0 if G --> R FFT should not be scaled. Default: 1 i.e. scale
SIDE EFFECTS
ff(plan%ldxyz*plan%batch_size) = In input: the data to transform. Changed in output, filled with the FFT results.
SOURCE
460 subroutine fftbox_execute_ip_dpc(plan, ff, isign, ndat, iscale) 461 462 !Arguments ------------------------------------ 463 !scalars 464 class(fftbox_plan3_t),target,intent(inout) :: plan 465 integer,intent(in) :: isign 466 integer,optional,intent(in) :: ndat, iscale 467 !arrays 468 complex(dpc),target,intent(inout) :: ff(*) 469 470 ! ************************************************************************* 471 472 integer :: ndat__, iscale__ 473 ndat__ = plan%batch_size; if (present(ndat) ) ndat__ = ndat 474 ABI_DEFAULT(iscale__, iscale, 1) 475 476 #if defined HAVE_GPU_CUDA 477 if (plan%gpu_option /= ABI_GPU_DISABLED) then 478 call xgpu_fftbox_c2c_ip(plan%dims, plan%embed, ndat__, isign, dpc, iscale__, c_loc(ff), & 479 plan%gpu_plan_ip_dpc, plan%gpu_data_ip_dpc) 480 return 481 end if 482 #endif 483 484 ! CPU version 485 #include "fftbox_ip_driver.finc" 486 487 end subroutine fftbox_execute_ip_dpc
m_fft/fftbox_execute_ip_spc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftbox_execute_ip_spc
FUNCTION
In-place FFT transform of complex array. Call (FFTW3|DFTI) routines if available, otherwise fallback to SG routines TARGET: spc arrays
INPUTS
plan<fftbox_plan3_t>=Structure with the parameters defining the transform. isign= Sign of the exponential in the FFT [iscale]= 0 if G --> R FFT should not be scaled. Default: 1 i.e. scale
SIDE EFFECTS
ff(plan%ldxyz*plan%batch_size) = In input: the data to transform. Changed in output, filled with the FFT results.
SOURCE
407 subroutine fftbox_execute_ip_spc(plan, ff, isign, ndat, iscale) 408 409 !Arguments ------------------------------------ 410 !scalars 411 class(fftbox_plan3_t),target,intent(inout) :: plan 412 integer,intent(in) :: isign 413 integer,optional,intent(in) :: ndat, iscale 414 !arrays 415 complex(spc),target,intent(inout) :: ff(*) 416 417 ! ************************************************************************* 418 419 integer :: ndat__, iscale__ 420 ndat__ = plan%batch_size; if (present(ndat) ) ndat__ = ndat 421 ABI_DEFAULT(iscale__, iscale, 1) 422 423 #if defined HAVE_GPU_CUDA 424 if (plan%gpu_option /= ABI_GPU_DISABLED) then 425 call xgpu_fftbox_c2c_ip(plan%dims, plan%embed, ndat__, isign, spc, iscale__, c_loc(ff), & 426 plan%gpu_plan_ip_spc, plan%gpu_data_ip_spc) 427 return 428 end if 429 #endif 430 431 ! CPU version 432 #include "fftbox_ip_driver.finc" 433 434 end subroutine fftbox_execute_ip_spc
m_fft/fftbox_execute_op_dpc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftbox_execute_op_dpc
FUNCTION
Out-of-place FFT transform of complex arrays. Call (FFTW3|DFTI) routines if available, otherwise fallback to SG routines TARGET: dpc arrays
INPUTS
plan<fftbox_plan3_t>=Structure with the parameters defining the transform. ff(plan%ldxyz*plan%batch_size)=The input array to be transformed. isign= Sign of the exponential in the FFT [iscale]= 0 if G --> R FFT should not be scaled. Default: 1 i.e. scale
OUTPUT
gg(plan%ldxyz*plan%batch_size)= The FFT results.
SOURCE
565 subroutine fftbox_execute_op_dpc(plan, ff, gg, isign, ndat, iscale) 566 567 !Arguments ------------------------------------ 568 !scalars 569 class(fftbox_plan3_t),intent(inout) :: plan 570 integer,intent(in) :: isign 571 integer,optional,intent(in) :: ndat, iscale 572 !arrays 573 complex(dpc),target,intent(in) :: ff(*) 574 complex(dpc),target,intent(inout) :: gg(*) 575 576 ! ************************************************************************* 577 578 integer :: ndat__, iscale__ 579 ndat__ = plan%batch_size; if (present(ndat) ) ndat__ = ndat 580 ABI_DEFAULT(iscale__, iscale, 1) 581 582 #if defined HAVE_GPU_CUDA 583 if (plan%gpu_option /= ABI_GPU_DISABLED) then 584 call xgpu_fftbox_c2c_op(plan%dims, plan%embed, ndat__, isign, dpc, iscale__, c_loc(ff), c_loc(gg), & 585 plan%gpu_plan_op_dpc, plan%gpu_idata_op_dpc, plan%gpu_odata_op_dpc) 586 return 587 end if 588 #endif 589 590 ! CPU version 591 #include "fftbox_op_driver.finc" 592 593 end subroutine fftbox_execute_op_dpc
m_fft/fftbox_execute_op_spc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftbox_execute_op_spc
FUNCTION
Out-of-place FFT transform of complex arrays. Call (FFTW3|DFTI) routines if available, otherwise fallback to SG routines TARGET: spc arrays
INPUTS
plan<fftbox_plan3_t>=Structure with the parameters defining the transform. ff(plan%ldxyz*plan%batch_size)=The input array to be transformed. isign= Sign of the exponential in the FFT [iscale]= 0 if G --> R FFT should not be scaled. Default: 1 i.e. scale
OUTPUT
gg(plan%ldxyz*plan%batch_size)= The FFT results.
SOURCE
512 subroutine fftbox_execute_op_spc(plan, ff, gg, isign, ndat, iscale) 513 514 !Arguments ------------------------------------ 515 !scalars 516 class(fftbox_plan3_t),intent(inout) :: plan 517 integer,intent(in) :: isign 518 integer,optional,intent(in) :: ndat, iscale 519 !arrays 520 complex(spc),target,intent(in) :: ff(*) 521 complex(spc),target,intent(inout) :: gg(*) 522 523 ! ************************************************************************* 524 525 integer :: ndat__, iscale__ 526 ndat__ = plan%batch_size; if (present(ndat) ) ndat__ = ndat 527 ABI_DEFAULT(iscale__, iscale, 1) 528 529 #if defined HAVE_GPU_CUDA 530 if (plan%gpu_option /= ABI_GPU_DISABLED) then 531 call xgpu_fftbox_c2c_op(plan%dims, plan%embed, ndat__, isign, spc, iscale__, c_loc(ff), c_loc(gg), & 532 plan%gpu_plan_op_spc, plan%gpu_idata_op_spc, plan%gpu_odata_op_spc) 533 return 534 end if 535 #endif 536 537 ! CPU version 538 #include "fftbox_op_driver.finc" 539 540 end subroutine fftbox_execute_op_spc
m_fft/fftbox_mpi_utests [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftbox_mpi_utests
FUNCTION
Driver routine for unit tests of the MPI FFT routines.
INPUTS
fftalg =fftalg input variable. cplex=1 for r2c, 2 for c2c transforms. ndat = Number of transform to execute nthreads = Number of OpenMP threads. comm_fft=MPI communicator for the FFT [unit]=Output Unit number (DEFAULT std_out)
OUTPUT
nfailed=number of failures.
SOURCE
1674 function fftbox_mpi_utests(fftalg, cplex, ndat, nthreads, comm_fft, unit) result(nfailed) 1675 1676 !Arguments ----------------------------------- 1677 !scalars 1678 integer,intent(in) :: fftalg,cplex,ndat,nthreads,comm_fft 1679 integer,optional,intent(in) :: unit 1680 integer :: nfailed 1681 1682 !Local variables------------------------------- 1683 !scalars 1684 integer,parameter :: NSETS=6 1685 integer :: ierr,old_nthreads,ount,iset,mpierr,nfft,me_fft 1686 integer :: nproc_fft,fftalga,fftalgc,n1,n2,n3,n4,n5,n6 1687 real(dp),parameter :: ATOL_DP=tol12 1688 real(dp) :: max_abserr 1689 real(dp) :: ctime,wtime,gflops 1690 character(len=500) :: msg,info,library,cplex_mode,padding_mode 1691 type(distribfft_type),target :: fftabs 1692 !arrays 1693 integer :: pars(6,NSETS),ngfft(18) 1694 integer, ABI_CONTIGUOUS pointer :: fftn2_distrib(:),ffti2_local(:) 1695 integer, ABI_CONTIGUOUS pointer :: fftn3_distrib(:),ffti3_local(:) 1696 real(dp),allocatable :: fofg(:,:),fofr(:),fofr_copy(:) 1697 1698 ! ************************************************************************* 1699 1700 ount = std_out; if (PRESENT(unit)) ount = unit 1701 nfailed = 0 1702 1703 if (nthreads>0) then 1704 old_nthreads = xomp_get_max_threads() 1705 call xomp_set_num_threads(nthreads) 1706 end if 1707 1708 ! These values must be compatible with all the FFT routines. 1709 ! SG library is the most restrictive (only powers of 2,3,5). 1710 pars = RESHAPE( [ & 1711 12, 18, 15, 12, 18, 15, & 1712 12, 18, 15, 13, 19, 16, & 1713 12, 18, 15, 13, 19, 15, & 1714 12, 18, 15, 12, 18, 16, & 1715 12, 18, 15, 13, 18, 15, & 1716 12, 18, 15, 15, 21, 18 & 1717 ], [6, NSETS] ) 1718 1719 fftalga=fftalg/100; fftalgc=mod(fftalg,10) 1720 1721 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 1722 1723 call fftalg_info(fftalg,library,cplex_mode,padding_mode) 1724 1725 !do iset=1,SIZE(pars,DIM=2) 1726 do iset=1,1 1727 n1=pars(1,iset); n2=pars(2,iset); n3=pars(3,iset) 1728 ! For the time being, ngfft(2) and ngfft(3) must be multiple of nproc_fft 1729 n2 = n2 * nproc_fft; n3 = n3 * nproc_fft 1730 !n4=pars(4,iset); n5=pars(5,iset); n6=pars(6,iset) 1731 n4=n1; n5=n2; n6=n3 1732 1733 ! Init ngfft 1734 ! TODO Propagate more info via ngfft, define helper functions, write routine to get fftcache 1735 ngfft = 0 1736 ngfft(1:7) = [n1,n2,n3,n4,n5,n6,fftalg] 1737 ngfft(8)= get_cache_kb() ! cache 1738 ngfft(9)=1 ! paral_fft_ 1739 ngfft(10)=nproc_fft ! nproc_fft 1740 ngfft(11)=xmpi_comm_rank(comm_fft) ! me_fft 1741 ngfft(12)=ngfft(2)/nproc_fft ! n2proc 1742 ngfft(13)=ngfft(3)/nproc_fft ! n3proc 1743 1744 !call print_ngfft(ngfft,"ngfft for MPI-fourdp",unit=std_out,mode_paral="COLL",prtvol=0) 1745 1746 ! Allocate arrays, fill fofr with random numbers and keep a copy. 1747 nfft = (n1 * n2 * n3) / nproc_fft 1748 ABI_MALLOC(fofg, (2,nfft*ndat)) 1749 ABI_MALLOC(fofr, (cplex*nfft*ndat)) 1750 ABI_MALLOC(fofr_copy, (cplex*nfft*ndat)) 1751 1752 call RANDOM_NUMBER(fofr) 1753 fofr_copy = fofr 1754 1755 call init_distribfft(fftabs,"c",nproc_fft,n2,n3) 1756 fftn2_distrib => fftabs%tab_fftdp2_distrib 1757 ffti2_local => fftabs%tab_fftdp2_local 1758 fftn3_distrib => fftabs%tab_fftdp3_distrib 1759 ffti3_local => fftabs%tab_fftdp3_local 1760 1761 call cwtime(ctime,wtime,gflops,"start") 1762 1763 select case (fftalga) 1764 1765 case (FFT_SG2002) 1766 call sg2002_mpifourdp(cplex,nfft,ngfft,ndat,-1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 1767 call sg2002_mpifourdp(cplex,nfft,ngfft,ndat,+1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 1768 1769 case (FFT_FFTW3) 1770 call fftw3_mpifourdp(cplex,nfft,ngfft,ndat,-1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 1771 call fftw3_mpifourdp(cplex,nfft,ngfft,ndat,+1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 1772 1773 !case (FFT_DFTI) 1774 ! call dfti_mpifourdp(cplex,nfft,ngfft,ndat,-1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 1775 ! call dfti_mpifourdp(cplex,nfft,ngfft,ndat,+1,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 1776 1777 case default 1778 ABI_BUG(sjoin("fftalg: ", itoa(fftalg), " does not support MPI-FFT")) 1779 end select 1780 1781 call cwtime(ctime,wtime,gflops,"stop") 1782 if (me_fft == 0) then 1783 !write(std_out,'(a,i0,2(a,f10.4))')"fftalg: ",fftalg,", cpu_time: ",ctime,", wall_time: ",wtime 1784 end if 1785 1786 ! Check if F^{-1} F = I within ATOL_DP 1787 ierr = COUNT(ABS(fofr - fofr_copy) > ATOL_DP) 1788 call xmpi_sum(ierr,comm_fft,mpierr) 1789 nfailed = nfailed + ierr 1790 1791 if (cplex == 1) info = sjoin(library,"r2c --> c2r :") 1792 if (cplex == 2) info = sjoin(library,"c2c :") 1793 1794 if (ierr /= 0) then 1795 ! Compute the maximum of the absolute error. 1796 max_abserr = MAXVAL(ABS(fofr - fofr_copy)) 1797 call xmpi_max(max_abserr,comm_fft,mpierr) 1798 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 1799 else 1800 write(msg,"(a)")" OK" 1801 end if 1802 call wrtout(ount,sjoin(info, msg)) 1803 1804 call destroy_distribfft(fftabs) 1805 1806 ABI_FREE(fofg) 1807 ABI_FREE(fofr_copy) 1808 ABI_FREE(fofr) 1809 end do 1810 1811 if (nthreads > 0) call xomp_set_num_threads(old_nthreads) 1812 1813 end function fftbox_mpi_utests
m_fft/fftbox_plan3_free [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftbox_plan3_free
FUNCTION
Free dynamic memory.
SOURCE
359 subroutine fftbox_plan3_free(plan) 360 361 !Arguments ------------------------------------ 362 class(fftbox_plan3_t),target,intent(inout) :: plan 363 364 ! ************************************************************************* 365 366 ABI_UNUSED(plan%ldxyz) 367 368 #if defined HAVE_GPU_CUDA 369 call gpu_planpp_free(plan%gpu_plan_ip_spc) 370 call devpp_free(plan%gpu_data_ip_spc) 371 call gpu_planpp_free(plan%gpu_plan_ip_dpc) 372 call devpp_free(plan%gpu_data_ip_dpc) 373 call gpu_planpp_free(plan%gpu_plan_op_spc) 374 call devpp_free(plan%gpu_idata_op_spc) 375 call devpp_free(plan%gpu_odata_op_spc) 376 call gpu_planpp_free(plan%gpu_plan_op_dpc) 377 call devpp_free(plan%gpu_idata_op_dpc) 378 call devpp_free(plan%gpu_odata_op_dpc) 379 #endif 380 381 end subroutine fftbox_plan3_free
m_fft/fftbox_plan3_from_ngfft [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftbox_plan3_from_ngfft
FUNCTION
Initialize plan from ngfft.
SOURCE
335 subroutine fftbox_plan3_from_ngfft(plan, ngfft, batch_size, gpu_option) 336 337 !Arguments ------------------------------------ 338 class(fftbox_plan3_t),intent(out) :: plan 339 integer,intent(in) :: ngfft(18), batch_size, gpu_option 340 341 ! ************************************************************************* 342 343 call plan%init(batch_size, ngfft(1:3), ngfft(4:6), ngfft(7), ngfft(8), gpu_option) 344 345 end subroutine fftbox_plan3_from_ngfft
m_fft/fftbox_plan3_init [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftbox_plan3_init
FUNCTION
Initialize the plan with the options passed to the fttbox_ routines. Low-level constructor.
INPUTS
See fftbox_plan3_t for the meaning of the different arguments.
SOURCE
301 subroutine fftbox_plan3_init(plan, batch_size, dims, embed, fftalg, fftcache, gpu_option) 302 303 !Arguments ------------------------------------ 304 !scalars 305 class(fftbox_plan3_t),intent(out) :: plan 306 integer,intent(in) :: batch_size, fftalg, fftcache, gpu_option 307 !arrays 308 integer,intent(in) :: dims(3), embed(3) 309 310 ! ************************************************************************* 311 312 plan%batch_size = batch_size 313 plan%dims = dims ! ngfft(1:3) 314 plan%embed = embed ! ngfft(4:6) 315 plan%fftalg = fftalg ! ngfft(7) 316 if (fftcache > 0) plan%fftcache = fftcache ! ngfft(8) 317 plan%gpu_option = gpu_option 318 plan%nfft = product(plan%dims) 319 plan%ldxyz = product(plan%embed) 320 321 end subroutine fftbox_plan3_init
m_fft/fftbox_plan3_t [ Types ]
NAME
fftbox_plan3_t
FUNCTION
Options passed to the fftbox_* routines to perform dense 3d FFTs.
SOURCE
125 type,public :: fftbox_plan3_t 126 127 integer :: fftalg = 112 ! The library to call. 128 integer :: fftcache = 16 ! Cache size in kB. Only used in SG routines. 129 integer :: nfft = -1 ! Total number of points in the FFT box. 130 integer :: ldxyz = -1 ! Physical dimension of the array to transform 131 integer :: batch_size = -1 ! MAXIMUM number of FFTs associated to the plan. 132 integer :: dims(3) = -1 ! The number of FFT divisions. 133 integer :: embed(3) = -1 ! Leading dimensions of the input, output arrays. 134 integer :: gpu_option = ABI_GPU_DISABLED ! /= 0 if FFTs should be offloaded to the GPU 135 136 type(c_ptr) :: gpu_plan_ip_spc = c_null_ptr 137 type(c_ptr) :: gpu_data_ip_spc = c_null_ptr 138 type(c_ptr) :: gpu_plan_ip_dpc = c_null_ptr 139 type(c_ptr) :: gpu_data_ip_dpc = c_null_ptr 140 type(c_ptr) :: gpu_plan_op_spc = c_null_ptr 141 type(c_ptr) :: gpu_idata_op_spc = c_null_ptr 142 type(c_ptr) :: gpu_odata_op_spc = c_null_ptr 143 type(c_ptr) :: gpu_plan_op_dpc = c_null_ptr 144 type(c_ptr) :: gpu_idata_op_dpc = c_null_ptr 145 type(c_ptr) :: gpu_odata_op_dpc = c_null_ptr 146 147 contains 148 149 procedure :: init => fftbox_plan3_init ! Low-level constructor 150 procedure :: from_ngfft => fftbox_plan3_from_ngfft ! Build object from ngfft. 151 procedure :: execute_ip_spc => fftbox_execute_ip_spc 152 procedure :: execute_ip_dpc => fftbox_execute_ip_dpc 153 procedure :: execute_op_spc => fftbox_execute_op_spc 154 procedure :: execute_op_dpc => fftbox_execute_op_dpc 155 156 ! Main entry point for performing FFTs on the full box 157 ! complex-to-complex version, operating on complex arrays 158 generic :: execute => execute_ip_spc, & 159 execute_ip_dpc, & 160 execute_op_spc, & 161 execute_op_dpc 162 163 procedure :: free => fftbox_plan3_free 164 ! Free dynamic memory 165 166 end type fftbox_plan3_t
m_fft/fftbox_utests [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftbox_utests
FUNCTION
Driver routine for base unitary tests of the FFT routines (sequential version).
INPUTS
fftalg =fftalg input variable. ndat = Number of transform to execute nthreads = Number of OpenMP threads. gpu_option= GPU version to active (0: no GPU). [unit]=Output Unit number (DEFAULT std_out)
OUTPUT
nfailed=number of failures.
SOURCE
1197 integer function fftbox_utests(fftalg, ndat, nthreads, gpu_option, unit) result(nfailed) 1198 1199 !Arguments ----------------------------------- 1200 !scalars 1201 integer,intent(in) :: fftalg, ndat, nthreads, gpu_option 1202 integer,optional,intent(in) :: unit 1203 1204 !Local variables------------------------------- 1205 !scalars 1206 integer,parameter :: NSETS=6, fftcache0 = 0 1207 integer :: ifft,ierr,ldxyz,old_nthreads,ount,cplex 1208 integer :: iset,nx,ny,nz,ldx,ldy,ldz,fftalga,fftalgc 1209 !integer :: ix,iy,iz,padat,dat 1210 real(dp),parameter :: ATOL_SP=tol6,ATOL_DP=tol12 ! Tolerances on the absolute error 1211 real(dp) :: max_abserr 1212 character(len=500) :: msg,info,library,cplex_mode,padding_mode 1213 type(fftbox_plan3_t) :: box_plan 1214 !arrays 1215 integer :: pars(6,NSETS) 1216 real(dp) :: crand(2) 1217 real(dp),allocatable :: fofg(:),fofr_ref(:),fofr(:) 1218 complex(dpc),allocatable :: ff(:),ff_ref(:),gg(:) 1219 complex(spc),allocatable :: ffsp(:),ff_refsp(:),ggsp(:) 1220 1221 ! ************************************************************************* 1222 1223 nfailed = 0 1224 ount = std_out; if (PRESENT(unit)) ount = unit 1225 1226 if (nthreads > 0) then 1227 old_nthreads = xomp_get_max_threads() 1228 call xomp_set_num_threads(nthreads) 1229 end if 1230 1231 ! These values must be compatible with all the FFT routines. 1232 ! SG library is the most restrictive (only powers of 2,3,5). 1233 pars = RESHAPE( [ & 1234 12, 18, 15, 12, 18, 15, & 1235 12, 18, 15, 13, 19, 16, & 1236 12, 18, 15, 13, 19, 15, & 1237 12, 18, 15, 12, 18, 16, & 1238 12, 18, 15, 13, 18, 15, & 1239 12, 18, 15, 15, 21, 18 & 1240 ], [6, NSETS]) 1241 1242 fftalga=fftalg/100; fftalgc=mod(fftalg,10) 1243 1244 call fftalg_info(fftalg, library, cplex_mode, padding_mode) 1245 1246 do iset=1,SIZE(pars,DIM=2) 1247 nx =pars(1,iset); ny=pars(2,iset); nz=pars(3,iset) 1248 ldx=pars(4,iset); ldy=pars(5,iset); ldz=pars(6,iset) 1249 1250 ! Create the FFT plan 1251 call box_plan%init(ndat, pars(1,iset), pars(4,iset), fftalg, fftcache0, gpu_option) 1252 1253 ldxyz = ldx*ldy*ldz 1254 ! 1255 ! ====================================== 1256 ! === TEST the single precision version 1257 ! ====================================== 1258 ABI_MALLOC(ff_refsp, (ldxyz*ndat)) 1259 ABI_MALLOC(ffsp, (ldxyz*ndat)) 1260 ABI_MALLOC(ggsp, (ldxyz*ndat)) 1261 1262 do ifft=1,ldxyz*ndat 1263 call RANDOM_NUMBER(crand) 1264 ff_refsp(ifft) = DCMPLX(crand(1), crand(2)) 1265 end do 1266 1267 ! Set the augmentation region to zero to avoid SIGFPE, as FFTW3 wrappers use zscal to scale the results. 1268 call cplx_setaug_zero_spc(nx,ny,nz,ldx,ldy,ldz,ndat,ff_refsp) 1269 ffsp = ff_refsp 1270 1271 ! in-place version. 1272 call box_plan%execute(ffsp, +1) 1273 call box_plan%execute(ffsp, -1) 1274 1275 ! do it twice to test GPU version 1276 !call box_plan%execute(ffsp, +1) 1277 !call box_plan%execute(ffsp, -1) 1278 1279 ierr = COUNT(ABS(ffsp - ff_refsp) > ATOL_SP) 1280 nfailed = nfailed + ierr 1281 1282 info = sjoin(library, "c2c_ip_spc :") 1283 if (ierr /= 0) then 1284 max_abserr = MAXVAL(ABS(ffsp - ff_refsp)) 1285 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 1286 else 1287 write(msg,"(a)")" OK" 1288 end if 1289 call wrtout(ount,sjoin(info,msg)) 1290 1291 ! out-of-place version. 1292 ffsp = ff_refsp 1293 call box_plan%execute(ffsp, ggsp, +1) 1294 call box_plan%execute(ggsp, ffsp, -1) 1295 1296 ierr = COUNT(ABS(ffsp - ff_refsp) > ATOL_SP) 1297 nfailed = nfailed + ierr 1298 1299 info = sjoin(library, "c2c_op_spc :") 1300 if (ierr /= 0) then 1301 max_abserr = MAXVAL(ABS(ffsp - ff_refsp)) 1302 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 1303 else 1304 write(msg,"(a)")" OK" 1305 end if 1306 call wrtout(ount, sjoin(info, msg)) 1307 1308 ABI_FREE(ff_refsp) 1309 ABI_FREE(ffsp) 1310 ABI_FREE(ggsp) 1311 1312 ! ======================================= 1313 ! === TEST the double precision version 1314 ! ======================================= 1315 ABI_MALLOC(ff_ref, (ldxyz*ndat)) 1316 ABI_MALLOC(ff, (ldxyz*ndat)) 1317 ABI_MALLOC(gg, (ldxyz*ndat)) 1318 1319 do ifft=1,ldxyz*ndat 1320 call RANDOM_NUMBER(crand) 1321 ff_ref(ifft) = DCMPLX(crand(1), crand(2)) 1322 end do 1323 1324 ! Set the augmentation region to zero to avoid SIGFPE, as FFTW3 wrappers use zscal to scale the results. 1325 call cplx_setaug_zero_dpc(nx,ny,nz,ldx,ldy,ldz,ndat,ff_ref) 1326 ff = ff_ref 1327 1328 ! in-place version. 1329 call box_plan%execute(ff, +1) 1330 call box_plan%execute(ff, -1) 1331 1332 ierr = COUNT(ABS(ff - ff_ref) > ATOL_DP) 1333 nfailed = nfailed + ierr 1334 1335 info = sjoin(library, "c2c_ip_dpc :") 1336 if (ierr /= 0) then 1337 max_abserr = MAXVAL(ABS(ff - ff_ref)) 1338 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 1339 else 1340 write(msg,"(a)")" OK" 1341 end if 1342 call wrtout(ount,sjoin(info, msg)) 1343 1344 ! out-of-place version. 1345 ff = ff_ref 1346 call box_plan%execute(ff, gg, +1) 1347 call box_plan%execute(gg, ff, -1) 1348 1349 ierr = COUNT(ABS(ff - ff_ref) > ATOL_DP) 1350 nfailed = nfailed + ierr 1351 1352 info = sjoin(library, "c2c_op_dpc :") 1353 if (ierr /= 0) then 1354 max_abserr = MAXVAL(ABS(ff - ff_ref)) 1355 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 1356 else 1357 write(msg,"(a)")" OK" 1358 end if 1359 call wrtout(ount, sjoin(info, msg)) 1360 1361 ABI_FREE(ff_ref) 1362 ABI_FREE(ff) 1363 ABI_FREE(gg) 1364 1365 call box_plan%free() 1366 !stop 1367 1368 do cplex=1,2 1369 !if (fftalga == FFT_FFTW3 .and. ndat > 1 .and. cplex==1) then 1370 ! call wrtout(ount,"Warning: fourdp with FFTW3-wrappers, cplex=2 and ndat>1, might crash if MKL is used") 1371 ! !CYCLE 1372 !end if 1373 ABI_MALLOC(fofg, (2*ldxyz*ndat)) 1374 ABI_MALLOC(fofr_ref, (cplex*ldxyz*ndat)) 1375 ABI_MALLOC(fofr, (cplex*ldxyz*ndat)) 1376 1377 call RANDOM_NUMBER(fofr_ref) 1378 !call cg_setaug_zero(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,fofr_ref) 1379 fofr = fofr_ref 1380 1381 select case (fftalga) 1382 case (FFT_FFTW3) 1383 call fftw3_seqfourdp(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,-1,fofg,fofr) 1384 call fftw3_seqfourdp(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,+1,fofg,fofr) 1385 1386 case (FFT_DFTI) 1387 call dfti_seqfourdp(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,-1,fofg,fofr) 1388 call dfti_seqfourdp(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,+1,fofg,fofr) 1389 1390 case default 1391 ! TODO 1392 continue 1393 end select 1394 1395 !call cg_setaug_zero(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,fofr) 1396 1397 ierr = COUNT(ABS(fofr - fofr_ref) > ATOL_DP) 1398 nfailed = nfailed + ierr 1399 1400 write(info,"(a,i1,a)")sjoin(library, "fourdp (cplex "),cplex,") :" 1401 !write(info,"(2a,i1,a,i0,a)")trim(library), "fourdp (cplex ", cplex,"), ndata = ",ndat," :" 1402 if (ierr /= 0) then 1403 max_abserr = MAXVAL(ABS(fofr - fofr_ref)) 1404 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 1405 else 1406 write(msg,"(a)")" OK" 1407 end if 1408 call wrtout(ount,sjoin(info, msg)) 1409 1410 ABI_FREE(fofg) 1411 ABI_FREE(fofr_ref) 1412 ABI_FREE(fofr) 1413 end do 1414 end do 1415 1416 if (nthreads > 0) call xomp_set_num_threads(old_nthreads) 1417 1418 end function fftbox_utests
m_fft/fftmpi_u [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftmpi_u
FUNCTION
**** DO NOT USE IT **** Wrapper function used to perform the MPI FFT of ndat wavefunctions. Mainly used for unit tests and prototyping. Better integration will be provided afterwards.
INPUTS
See fourwf_mpi
OUTPUT
See fourwf_mpi
SOURCE
4053 ! The final interface should be: 4054 !subroutine fftmpi_u(npw_k,n4,n5,n6,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,fftabs,isign,fofg,fofr) 4055 4056 subroutine fftmpi_u(npw_k,n4,n5,n6,ndat,mgfft,ngfft,& 4057 istwf_k,gbound_k,kg_k,me_g0,distribfft,isign,fofg,fofr,comm_fft,cplexwf) 4058 4059 !Arguments ------------------------------------ 4060 !scalars 4061 integer,intent(in) :: istwf_k,mgfft,n4,n5,n6,ndat,npw_k 4062 integer,intent(in) :: isign,comm_fft,me_g0,cplexwf 4063 type(distribfft_type),intent(in) :: distribfft 4064 !arrays 4065 integer,intent(in) :: gbound_k(2*mgfft+8,2),ngfft(18) 4066 integer,intent(in) :: kg_k(3,npw_k) 4067 real(dp),intent(inout) :: fofg(2,npw_k*ndat),fofr(2,n4,n5,n6*ndat) 4068 4069 !Local variables------------------------------- 4070 !scalars 4071 integer,parameter :: npw0=0,cplex0=0 4072 integer :: n1,n2,n3 4073 real(dp),parameter :: weight_r=one,weight_i=one 4074 !arrays 4075 integer :: dummy_kg(0,0) 4076 real(dp) :: dummy_denpot(0,0,0),dummy_fofg(0,0) 4077 4078 ! ************************************************************************* 4079 4080 n1 = ngfft(1); n2 = ngfft(2); n3 = ngfft(3) 4081 4082 if (isign == 1) then 4083 ! option 0 G --> R 4084 call fourwf_mpi(cplex0,dummy_denpot,fofg,dummy_fofg,fofr,& 4085 gbound_k,gbound_k,istwf_k,kg_k,dummy_kg,me_g0,mgfft,ngfft,distribfft,n1,n2,n3,& 4086 npw_k,npw0,n4,n5,n6,ndat,0,weight_r,weight_i,comm_fft,cplexwf=cplexwf) 4087 else 4088 ! option 3 R --> G 4089 call fourwf_mpi(cplex0,dummy_denpot,dummy_fofg,fofg,fofr,& 4090 gbound_k,gbound_k,istwf_k,dummy_kg,kg_k,me_g0,mgfft,ngfft,distribfft,n1,n2,n3,& 4091 npw0,npw_k,n4,n5,n6,ndat,3,weight_r,weight_i,comm_fft,cplexwf=cplexwf) 4092 end if 4093 4094 end subroutine fftmpi_u
m_fft/fftpac [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftpac
FUNCTION
Allow for data copying to modify the stride (dimensioning) of a three-dimensional array, for more efficient three dimensional fft. NOTE that the arrays are in REAL space. Note that arrays aa and bb may be the same array (start at the same address). The array aa also incorporate a spin variable. MG FIXME: THIS IS **VERY BAD** AS FORTRAN DOES NOT ALLOW FOR ALIASING
INPUTS
ispden=actual spin-density of interest nspden=number of spin-density components n1,n2,n3=actual data dimensions, dimensions of complex array a nd1,nd2,nd3=array dimensions of (larger) array b ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft option= see description of side effects
OUTPUT
(see side effects)
SIDE EFFECTS
aa & bb arrays are treated as input or output depending on option: option=1 aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) real case option=2 aa(n1*n2*n3,ispden) --> bb(nd1,nd2,nd3) real case option=10 aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) complex case like option 1 real part option=11 aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) complex case like option 1 imag part
SOURCE
4418 subroutine fftpac(ispden,mpi_enreg,nspden,n1,n2,n3,nd1,nd2,nd3,ngfft,aa,bb,option) 4419 4420 !Arguments ------------------------------------ 4421 !scalars 4422 integer,intent(in) :: ispden,n1,n2,n3,nd1,nd2,nd3,nspden,option 4423 type(mpi_type),intent(in) :: mpi_enreg 4424 !arrays 4425 integer,intent(in) :: ngfft(18) 4426 real(dp),intent(inout) :: aa(n1*n2*n3/ngfft(10),nspden),bb(nd1,nd2,nd3) 4427 4428 !Local variables------------------------------- 4429 !scalars 4430 integer :: i1,i2,i3,index,me_fft,nproc_fft 4431 character(len=500) :: msg 4432 !arrays 4433 integer, ABI_CONTIGUOUS pointer :: fftn2_distrib(:),ffti2_local(:) 4434 integer, ABI_CONTIGUOUS pointer :: fftn3_distrib(:),ffti3_local(:) 4435 4436 ! ************************************************************************* 4437 4438 me_fft=ngfft(11); nproc_fft=ngfft(10) 4439 4440 if (option==1.or.option==2) then 4441 if (nd1<n1.or.nd2<n2.or.nd3<n3) then 4442 write(msg,'(a,3i0,2a,3i0,a)')& 4443 'Each of nd1,nd2,nd3=',nd1,nd2,nd3,ch10,& 4444 'must be >= n1, n2, n3 =',n1,n2,n3,'.' 4445 ABI_BUG(msg) 4446 end if 4447 else 4448 if (2*nd1<n1.or.nd2<n2.or.nd3<n3) then 4449 write(msg,'(a,3i0,2a,3i0,a)')& 4450 'Each of 2*nd1,nd2,nd3=',2*nd1,nd2,nd3,ch10,& 4451 'must be >= (n1, n2, n3) =',n1,n2,n3,'.' 4452 ABI_BUG(msg) 4453 end if 4454 end if 4455 4456 ! Get the distrib associated with this fft_grid 4457 call ptabs_fourdp(mpi_enreg,n2,n3,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local) 4458 4459 if (option==1) then 4460 ! aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) real case 4461 do i3=1,n3 4462 if (me_fft==fftn3_distrib(i3)) then 4463 do i2=1,n2 4464 do i1=1,n1 4465 aa(i1+n1*(i2-1+n2*(ffti3_local(i3)-1)),ispden)=bb(i1,i2,i3) 4466 end do 4467 end do 4468 end if 4469 end do 4470 4471 else if (option==2) then 4472 ! option=2 aa(n1*n2*n3,ispden) --> bb(nd1,nd2,nd3) real case 4473 ! Here we avoid corrupting the data in a while writing to b in the 4474 ! case in which a and b are same array. 4475 ! Also: replace "trash" data with 0 s to avoid floating point 4476 ! exceptions when this data is actually manipulated in fft. 4477 do i3=nd3,n3+1,-1 4478 do i2=nd2,1,-1 4479 do i1=nd1,1,-1 4480 bb(i1,i2,i3)=0.d0 4481 end do 4482 end do 4483 end do 4484 do i3=n3,1,-1 4485 if (me_fft==fftn3_distrib(i3)) then 4486 do i2=nd2,n2+1,-1 4487 do i1=nd1,1,-1 4488 bb(i1,i2,i3)=0.d0 4489 end do 4490 end do 4491 do i2=n2,1,-1 4492 do i1=nd1,n1+1,-1 4493 bb(i1,i2,i3)=0.d0 4494 end do 4495 do i1=n1,1,-1 4496 bb(i1,i2,i3)=aa(i1+n1*(i2-1+n2*(ffti3_local(i3) - 1)),ispden) 4497 end do 4498 end do 4499 end if 4500 end do 4501 ! MF 4502 else if (option==10 .or. option==11) then 4503 ! option=10 aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) complex case like option 1 real part 4504 ! option=11 aa(n1*n2*n3,ispden) <-- bb(nd1,nd2,nd3) complex case like option 1 imag part 4505 index=1 4506 if(option==11) index=2 4507 do i3=1,n3 4508 do i2=1,n2 4509 do i1=1,n1/2 4510 aa(index,ispden)=bb(i1,i2,i3) 4511 index=index+2 4512 end do 4513 end do 4514 end do 4515 ! MF 4516 else 4517 write(msg,'(a,i0,a)')' Bad option =',option,'.' 4518 ABI_BUG(msg) 4519 end if 4520 4521 end subroutine fftpac
m_fft/fftpad_dpc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftpad_dpc
FUNCTION
Driver routine used to transform COMPLEX arrays using 3D zero-padded FFTs. TARGET: DPC arrays
INPUTS
ngfft(18)=Info on the 3D FFT. nx,ny,nz=Logical dimensions of the FFT mesh. ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts). ndat=Number of FFTs mgfft=MAX(nx,ny,nz), only used to dimension gbound isign=The sign of the transform. gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point. See sphereboundary for more info.
SIDE EFFECTS
ff(ldx*ldy*ldz*ndat)= input: The array with the data to be transformed. output: The results of the FFT.
SOURCE
1008 subroutine fftpad_dpc(ff, ngfft, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound) 1009 1010 !Arguments ------------------------------------ 1011 !scalars 1012 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign 1013 !arrays 1014 integer,intent(in) :: ngfft(18),gbound(2*mgfft+8,2) 1015 complex(dpc),target,intent(inout) :: ff(ldx*ldy*ldz*ndat) 1016 1017 !Local variables------------------------------- 1018 !scalars 1019 integer :: fftalg,fftalga,fftalgc,ncount 1020 integer :: ivz !vz_d 1021 character(len=500) :: msg 1022 !arrays 1023 real(dp),allocatable :: fofr(:,:,:,:,:) 1024 real(dp),allocatable :: fofrvz(:,:) !vz_d 1025 real(dp),ABI_CONTIGUOUS pointer :: fpt_ftarr(:,:,:,:,:) 1026 1027 ! ************************************************************************* 1028 1029 fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=MOD(fftalg,10) 1030 1031 select case (fftalga) 1032 1033 case (FFT_FFTW3) 1034 call fftw3_fftpad(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound) 1035 1036 case (FFT_DFTI) 1037 call dfti_fftpad(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound) 1038 1039 case (FFT_SG) 1040 ! Goedecker"s routines. 1041 ! TODO: sg_fftpad is not the fastest routine, here I should call sg_fftrisc but I need 1042 ! kg_kin that is not available in rho_tw_g, actually one should pass G-G0 due to 1043 ! the shift introduced by the umklapp. 1044 ncount = ldx*ldy*ldz*ndat 1045 1046 ABI_MALLOC(fofr, (2,ldx,ldy,ldz,ndat)) 1047 ! call ZCOPY(ncount,ff,1,fofr,1) !vz_d 1048 ! call DCOPY(2*ncount,ff,1,fofr,1) ! MG 1049 ! alternatif of ZCOPY from vz 1050 ABI_MALLOC(fofrvz,(2,ncount)) !vz_d 1051 do ivz=1,ncount !vz_d 1052 fofrvz(1,ivz)= real(ff(ivz)) !vz_d 1053 fofrvz(2,ivz)=aimag(ff(ivz)) !vz_d 1054 end do !vz_d 1055 call DCOPY(2*ncount,fofrvz,1,fofr,1) !vz_d 1056 ABI_FREE(fofrvz) !vz_d 1057 1058 call C_F_pointer(C_loc(ff),fpt_ftarr, shape=(/2,ldx,ldy,ldz,ndat/)) 1059 1060 ! MT nov. 2012: with xlf, need to put explicit boundaries for the 4th dimension 1061 ! this looks like a compiler bug... 1062 !if (size(fpt_ftarr)==2*size(ff)) then 1063 call sg_fftpad(ngfft(8),mgfft,nx,ny,nz,ldx,ldy,ldz,ndat,gbound,isign,fofr,fpt_ftarr) 1064 !else 1065 ! call sg_fftpad(ngfft(8),mgfft,nx,ny,nz,ldx,ldy,ldz,ndat,gbound,isign,fofr,fpt_ftarr(:,:,:,1:ldz,dat)) 1066 !end if 1067 1068 ABI_FREE(fofr) 1069 1070 if (isign==-1) then ! Here there might be numerical exceptions due to the holes. 1071 call xscal(ncount,one/(nx*ny*nz),ff,1) 1072 end if 1073 1074 case default 1075 write(msg,'(a,i0,a)')"fftalga = ", fftalga," not coded " 1076 ABI_ERROR(msg) 1077 end select 1078 1079 end subroutine fftpad_dpc
m_fft/fftpad_spc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftpad_spc
FUNCTION
Driver routine used to transform COMPLEX arrays using 3D zero-padded FFTs. TARGET: single-precision complex.
INPUTS
ngfft(18)=Info on the 3D FFT. nx,ny,nz=Logical dimensions of the FFT mesh. ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts). ndat=Number of FFTs mgfft=MAX(nx,ny,nz), only used to dimension gbound isign=The sign of the transform. gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point. See sphereboundary for more info.
SIDE EFFECTS
ff(ldx*ldy*ldz*ndat)= input: The array with the data to be transformed. output: The results of the FFT.
SOURCE
917 subroutine fftpad_spc(ff, ngfft, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound) 918 919 !Arguments ------------------------------------ 920 !scalars 921 integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign 922 !arrays 923 integer,intent(in) :: ngfft(18),gbound(2*mgfft+8,2) 924 complex(spc),target,intent(inout) :: ff(ldx*ldy*ldz*ndat) 925 926 !Local variables------------------------------- 927 !scalars 928 integer :: fftalg,fftalga,fftalgc,ncount,p 929 character(len=500) :: msg 930 !arrays 931 real(dp),allocatable :: fofr(:,:),ftarr(:,:) 932 933 ! ************************************************************************* 934 935 fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=MOD(fftalg,10) 936 937 select case (fftalga) 938 939 case (FFT_FFTW3) 940 call fftw3_fftpad(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound) 941 942 case (FFT_DFTI) 943 call dfti_fftpad(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound) 944 945 case (FFT_SG) 946 ! Goedecker"s routines. 947 ! TODO: sg_fftpad is not the fastest routine, here I should call sg_fftrisc but I need 948 ! kg_kin that are not available in rho_tw_g, actually one should pass G-G0 due to 949 ! the shift introduced by the umklapp. 950 ncount = ldx*ldy*ldz*ndat 951 952 ABI_MALLOC(fofr,(2,ldx*ldy*ldz*ndat)) 953 ABI_MALLOC(ftarr,(2,ldx*ldy*ldz*ndat)) 954 955 do p=1,ldx*ldy*ldz*ndat 956 fofr(1,p) = REAL(ff(p)) 957 fofr(2,p) = AIMAG(ff(p)) 958 end do 959 960 call sg_fftpad(ngfft(8),mgfft,nx,ny,nz,ldx,ldy,ldz,ndat,gbound,isign,fofr,ftarr) 961 ! 962 ! Copy the results. 963 do p=1,ldx*ldy*ldz*ndat 964 ff(p) = CMPLX(ftarr(1,p), ftarr(2,p)) 965 end do 966 967 if (isign==-1) then ! Here there might be numerical exceptions due to the holes. 968 ff = ff/(nx*ny*nz) 969 end if 970 971 ABI_FREE(ftarr) 972 ABI_FREE(fofr) 973 974 case default 975 write(msg,'(a,i0,a)')"fftalga = ", fftalga," not coded " 976 ABI_ERROR(msg) 977 end select 978 979 end subroutine fftpad_spc
m_fft/fftu_mpi_utests [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftu_mpi_utests
FUNCTION
Unit tests for the FFTs of wavefunctions (MPI version).
INPUTS
OUTPUT
nfailed=number of failed tests.
SOURCE
1832 function fftu_mpi_utests(fftalg, ecut, rprimd, ndat, nthreads, comm_fft, paral_kgb, unit) result(nfailed) 1833 1834 !Arguments ------------------------------------ 1835 !scalars 1836 integer,intent(in) :: fftalg,ndat,nthreads,comm_fft,paral_kgb 1837 integer :: nfailed 1838 integer,optional,intent(in) :: unit 1839 real(dp),intent(in) :: ecut 1840 !arrays 1841 real(dp),intent(in) :: rprimd(3,3) 1842 1843 !Local variables------------------------------- 1844 !scalars 1845 integer,parameter :: nsym1=1,npw0=0,cplex_one=1,istwfk_one=1 1846 integer :: n1,n2,n3,idat,n4,n5,n6,ierr,npw_k,full_npw_k,istwf_npw_k,cplexwf 1847 integer :: mgfft,istwf_k,ikpt,old_nthreads,ount,isign,fftalga,fftalgc 1848 integer :: ig,i1,i2,i3,i3_glob,i3dat,nd3proc,i3_local,g0sender 1849 integer :: step,me_g0,me_fft,nproc_fft,mpierr,nfft,cplex,chksymtnons 1850 real(dp),parameter :: boxcutmin2=two,ATOL_DP=tol12,RTOL_DP=tol3 ! Tolerances on the absolute and relative error 1851 real(dp),parameter :: weight_r=one,weight_i=one 1852 real(dp) :: max_abserr,max_relerr,ucvol,relerr,den,refden 1853 real(dp) :: ctime,wtime,gflops 1854 character(len=500) :: msg,info,library,cplex_mode,padding_mode 1855 type(distribfft_type) :: fftabs 1856 !arrays 1857 integer :: symrel(3,3,nsym1),ngfft(18) 1858 integer,allocatable :: full_kg_k(:,:),istw_kg_k(:,:) 1859 integer,allocatable :: gbound_k(:,:),kg_k(:,:) 1860 real(dp) :: dummy_fofg(0,0) !dummy_denpot(0,0,0) 1861 real(dp) :: kpoint(3),kpoints(3,2) 1862 real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3),tnons(3,nsym1) 1863 real(dp),allocatable :: fofg(:,:),ref_fofg(:,:),fofg_out(:,:),fofr(:,:,:,:) 1864 real(dp),allocatable :: density(:,:,:),pot(:,:,:),invpot(:,:,:) 1865 real(dp),allocatable :: full_fofg(:,:),istwf_fofg(:,:) 1866 1867 ! ************************************************************************* 1868 1869 nfailed = 0 1870 1871 ount = std_out; if (PRESENT(unit)) ount = unit 1872 1873 if (nthreads > 0) then 1874 old_nthreads = xomp_get_max_threads() 1875 call xomp_set_num_threads(nthreads) 1876 end if 1877 1878 if (.not. fftalg_has_mpi(fftalg)) then 1879 write(msg,'(a,i0,a)')"fftalg: ",fftalg," does not support MPI" 1880 ABI_ERROR(msg) 1881 end if 1882 1883 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 1884 1885 symrel = reshape([1,0,0,0,1,0,0,0,1],[3,3,nsym1]) 1886 tnons=zero 1887 chksymtnons=0 1888 call metric(gmet,gprimd,-1,rmet,rprimd,ucvol) 1889 1890 kpoints = RESHAPE( [ & 1891 0.1, 0.2, 0.3, & 1892 0.0, 0.0, 0.0 ], [3,2] ) 1893 1894 call fftalg_info(fftalg,library,cplex_mode,padding_mode) 1895 1896 fftalga=fftalg/100; fftalgc=mod(fftalg,10) 1897 1898 do ikpt=1,size(kpoints, dim=2) 1899 kpoint = kpoints(:,ikpt) 1900 1901 ! Get full G-sphere (no MPI distribution, no time-reversal) 1902 call get_kg(kpoint,istwfk_one,ecut,gmet,full_npw_k,full_kg_k) 1903 1904 ! Get full G-sphere (no MPI distribution, use time-reversal if possible) 1905 istwf_k = set_istwfk(kpoint) 1906 call get_kg(kpoint,istwf_k,ecut,gmet,istwf_npw_k,istw_kg_k) 1907 1908 ! Get FFT box dims. 1909 ngfft = -1 1910 ngfft(7) = fftalg 1911 ngfft(8) = get_cache_kb() 1912 1913 call getng(boxcutmin2,chksymtnons,ecut,gmet,kpoint,me_fft,mgfft,nfft,ngfft,nproc_fft,nsym1,& 1914 paral_kgb,symrel,tnons,unit=dev_null) 1915 1916 n1 = ngfft(1); n2 = ngfft(2); n3 = ngfft(3) 1917 ! Do not use augmentation. 1918 !ngfft(4:6) = ngfft(1:3) 1919 n4 = ngfft(4); n5 = ngfft(5); n6 = ngfft(6) 1920 1921 call print_ngfft(ngfft,"ngfft for MPI-fourwf",unit=std_out,mode_paral="COLL",prtvol=0) 1922 1923 ! Compute FFT distribution tables. 1924 call init_distribfft(fftabs,"c",nproc_fft,n2,n3) 1925 1926 ! Set to 1 if this node owns G = 0. 1927 me_g0 = 0; if (fftabs%tab_fftwf2_distrib(1) == me_fft) me_g0 = 1 1928 g0sender = fftabs%tab_fftwf2_distrib(1) 1929 1930 ! Allocate u(g) on the full sphere, initialize with random numbers. 1931 ! and broadcast full data to the other nodes. 1932 ABI_MALLOC(full_fofg, (2,full_npw_k*ndat)) 1933 1934 if (me_fft == g0sender) then 1935 if (istwf_k == 1) then 1936 call RANDOM_NUMBER(full_fofg) 1937 !full_fofg = one 1938 1939 else if (istwf_k == 2) then 1940 ! Special treatment for real wavefunctions. 1941 ! Fill the irreducible G-vectors first so that we have the u(g) of a real u(r) 1942 ! Then get u(g) on the full G-sphere. 1943 ! TODO: Sequential version OK, SIGSEV if mpi_ncpus > 1! 1944 ABI_MALLOC(istwf_fofg, (2,istwf_npw_k*ndat)) 1945 call RANDOM_NUMBER(istwf_fofg) 1946 ! Enforce real u in G-space. 1947 do idat=1,ndat 1948 istwf_fofg(2,1+(idat-1)*istwf_npw_k) = zero 1949 end do 1950 1951 ! from istwfk 2 to 1. 1952 call change_istwfk(istwf_npw_k,istw_kg_k,istwf_k,full_npw_k,full_kg_k,istwfk_one,n1,n2,n3,ndat,istwf_fofg,full_fofg) 1953 ABI_FREE(istwf_fofg) 1954 1955 else 1956 ABI_ERROR("istwf_k /= [1,2] not available in MPI-FFT mode") 1957 end if 1958 end if 1959 1960 call xmpi_bcast(full_fofg,g0sender,comm_fft,ierr) 1961 1962 ! Compute sphere boundaries for zero-padded FFTs 1963 ABI_MALLOC(gbound_k,(2*mgfft+8,2)) 1964 call sphereboundary(gbound_k,istwfk_one,full_kg_k,mgfft,full_npw_k) 1965 1966 ! Extract my G-vectors from full_kg_k and store them in kg_k. 1967 !write(std_out,*)"fftwf2_distrib",fftabs%tab_fftwf2_distrib 1968 do step=1,2 1969 if (step == 2) then 1970 ! Allocate my u(g) and my Gs. 1971 ! Set fofg to zero to bypass a bug with XLF! 1972 ABI_MALLOC(kg_k, (3, npw_k)) 1973 ABI_CALLOC(fofg, (2,npw_k*ndat)) 1974 end if 1975 1976 npw_k = 0 1977 do ig=1,full_npw_k 1978 i1=full_kg_k(1,ig); if(i1<0) i1=i1+n1; i1=i1+1 1979 i2=full_kg_k(2,ig); if(i2<0) i2=i2+n2; i2=i2+1 1980 i3=full_kg_k(3,ig); if(i3<0) i3=i3+n3; i3=i3+1 1981 if (fftabs%tab_fftwf2_distrib(i2) == me_fft) then 1982 npw_k = npw_k + 1 1983 if (step == 2) then 1984 kg_k(:,npw_k) = full_kg_k(:,ig) 1985 fofg(:,npw_k) = full_fofg(:,ig) 1986 end if 1987 end if 1988 end do 1989 end do ! step 1990 1991 ABI_FREE(istw_kg_k) 1992 1993 !write(std_out,*)"dist",trim(itoa(me_fft)),fftabs%tab_fftdp3_distrib(:) !== me_fft) 1994 !write(std_out,*)"local",trim(itoa(me_fft)),fftabs%tab_fftdp3_local(:) 1995 1996 ! Allocate my local portion of u(r), and keep a copy my u(g). 1997 ABI_MALLOC(fofr, (2,n4,n5,n6*ndat)) 1998 ABI_MALLOC(ref_fofg, (2,npw_k*ndat)) 1999 ref_fofg = fofg 2000 2001 call cwtime(ctime,wtime,gflops,"start") 2002 2003 ! ---------------------------------------------------------------- 2004 ! Test the the backward and the forward transform of wavefunctions 2005 ! ---------------------------------------------------------------- 2006 ! c2c or [c2r, r2c] 2007 cplexwf = 2; if (istwf_k==2) cplexwf = 1 2008 ! FIXME: 2009 ! There's a bug somewhere in the MPI routines if cplexwf == 1 is used and ndat > 1 2010 ! Not a serious problem at present since cplexwf==1 is never used in abinit. 2011 if (ndat>1) cplexwf = 2 2012 if (nproc_fft > 1) cplexwf = 2 2013 cplexwf = 2 2014 !cplexwf = 2; if (istwf_k==2) cplexwf = 1 2015 2016 do isign = 1,-1,-2 2017 call fftmpi_u(npw_k,n4,n5,n6,ndat,mgfft,ngfft,& 2018 istwfk_one,gbound_k,kg_k,me_g0,fftabs,isign,fofg,fofr,comm_fft,cplexwf=cplexwf) 2019 end do 2020 2021 ! The final interface should be: 2022 !subroutine fftmpi_u(npw_k,n4,n5,n6,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,fftabs,isign,fofg,fofr) 2023 2024 ! Parallel version (must support augmentation in forf(:,:,:,:), hence we have a different API wrt the seq case! 2025 !call fftmpi_ug(npw_k,n4,n5,n6,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,fftabs,fofg,fofr) 2026 !call fftmpi_ur(npw_k,n4,n5,n6,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,fftabs,fofr,fofgout) 2027 2028 ! Seq version. 2029 !call fft_ug(npw_k,nxyz,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ugsp,ursp) 2030 !call fft_ur(npw_k,nxyz,nspinor,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ursp,ugsp) 2031 2032 call cwtime(ctime,wtime,gflops,"stop") 2033 if (me_fft == 0) then 2034 !write(std_out,'(a,i0,3(a,f10.4))')"fftalg: ",fftalg,", ecut: ",ecut,", cpu_time: ",ctime,", wall_time: ",wtime 2035 end if 2036 2037 ! Check if F^{-1} F = I within ATOL_DP 2038 ierr = COUNT(ABS(fofg - ref_fofg) > ATOL_DP) 2039 call xmpi_sum(ierr,comm_fft,mpierr) 2040 nfailed = nfailed + ierr 2041 2042 write(info,"(a,i1,a)")sjoin(library,"fftu_mpi, istwfk "),istwf_k," :" 2043 2044 if (ierr /= 0) then 2045 max_abserr = MAXVAL(ABS(fofg - ref_fofg)) 2046 call xmpi_max(max_abserr,comm_fft,mpierr) 2047 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 2048 else 2049 write(msg,"(a)")" OK" 2050 end if 2051 call wrtout(ount,sjoin(info, msg)) 2052 2053 ! ------------------------------------------- 2054 ! Test the accumulation of density (option 1) 2055 ! ------------------------------------------- 2056 ABI_CALLOC(density, (cplex_one*n4,n5,n6)) 2057 !fofg = one 2058 2059 ! Accumulate density. Does not work if cplexwf==1 2060 call fourwf_mpi(cplex_one,density,fofg,dummy_fofg,fofr,& 2061 gbound_k,gbound_k,istwfk_one,kg_k,kg_k,me_g0,mgfft,ngfft,fftabs,n1,n2,n3,& 2062 npw_k,npw_k,n4,n5,n6,ndat,1,weight_r,weight_i,comm_fft,cplexwf=cplexwf) 2063 2064 ! Recompute u(r) 2065 ! call fourwf_mpi(cplex_one,density,fofg,dummy_fofg,fofr,& 2066 !& gbound_k,gbound_k,istwf_one,kg_k,kg_k,me_g0,mgfft,ngfft,fftabs,n1,n2,n3,& 2067 !& npw_k,npw_k,n4,n5,n6,ndat,0,weight_r,weight_i,comm_fft,cplexwf=cplexwf) 2068 ! call xmpi_sum(density,comm_fft,ierr) 2069 ! if (me_fft == 0) write(std_out,*)"sum density", sum(density) 2070 2071 !do i3=1,n6 2072 ! write(110+me_fft,*)i3,density(:,:,i3) 2073 !end do 2074 2075 max_relerr = zero 2076 2077 ! FIXME: This is wrong if ndat > 1 2078 ! Must unify the treatment of fofr and rhor on the augmented mesh (n4,n5,n6) 2079 ! back_wf and for_wf return a MPI distributed arrays but fofr is allocated with 2080 ! the global dimensions. 2081 nd3proc=(n3-1)/nproc_fft+1 2082 do i3=1,n3 2083 !do i3=1,nd3proc 2084 if( me_fft == fftabs%tab_fftdp3_distrib(i3) ) then 2085 i3_local = fftabs%tab_fftdp3_local(i3) !+ nd3proc*(idat-1) 2086 !i3_local = i3 2087 i3_glob = i3 2088 !i3_glob = i3_local 2089 !i3_glob = i3 + nd3proc * me_fft 2090 !i3dat = i3 + n6 * (idat-1) 2091 do i2=1,n2 2092 do i1=1,n1 2093 den = density(i1,i2,i3_glob) 2094 refden = zero 2095 do idat=1,ndat 2096 i3dat = i3_local + n6 * (idat-1) 2097 refden = refden + weight_r*fofr(1,i1,i2,i3dat)**2+ weight_i*fofr(2,i1,i2,i3dat)**2 2098 end do 2099 relerr = abs(refden - den) 2100 if (abs(refden) < tol12) refden = tol12 2101 relerr = relerr / abs(refden) 2102 max_relerr = max(max_relerr, relerr) 2103 !if (relerr > RTOL_DP) write(std_out,*)trim(itoa(me_fft)),i1,i2,i3,idat,den,refden 2104 end do 2105 end do 2106 end if 2107 end do 2108 2109 call xmpi_max(max_relerr,comm_fft,mpierr) 2110 2111 write(info,"(a,i1,a)")sjoin(library,"accrho_mpi, istwfk "),istwf_k," :" 2112 if (max_relerr > RTOL_DP) then 2113 write(msg,"(a,es9.2,a)")" FAILED (max_relerr = ",max_relerr,")" 2114 else 2115 write(msg,"(a)")" OK" 2116 end if 2117 call wrtout(ount, sjoin(info, msg)) 2118 2119 ABI_FREE(density) 2120 2121 ! ------------------------------------------- 2122 ! Test the application of the local potential 2123 ! ------------------------------------------- 2124 cplex = 1 2125 ABI_MALLOC(pot, (cplex*n4,n5,n6)) 2126 ABI_MALLOC(invpot, (cplex*n4,n5,n6)) 2127 2128 if (me_fft == g0sender) then 2129 !pot = fofr(1,:,:,:) 2130 !call RANDOM_NUMBER(pot) 2131 !where (abs(pot) < tol4) pot = tol4 2132 ! Simplest potential ever (G=0 to reduce errors due to aliasing) 2133 pot = four 2134 invpot = one/pot 2135 end if 2136 2137 call xmpi_bcast(pot,g0sender,comm_fft,ierr) 2138 call xmpi_bcast(invpot,g0sender,comm_fft,ierr) 2139 2140 ABI_MALLOC(fofg_out, (2,npw_k*ndat)) 2141 2142 ! Compute fofg_out = <G|pot(r)|fofg> 2143 call fourwf_mpi(cplex,pot,fofg,fofg_out,fofr,& 2144 & gbound_k,gbound_k,istwfk_one,kg_k,kg_k,me_g0,mgfft,ngfft,fftabs,n1,n2,n3,& 2145 & npw_k,npw_k,n4,n5,n6,ndat,2,weight_r,weight_i,comm_fft,cplexwf=cplexwf) 2146 2147 ! Compute fofg = <G|1/pot(r)|fofg_out> 2148 call fourwf_mpi(cplex,invpot,fofg_out,fofg,fofr,& 2149 & gbound_k,gbound_k,istwfk_one,kg_k,kg_k,me_g0,mgfft,ngfft,fftabs,n1,n2,n3,& 2150 & npw_k,npw_k,n4,n5,n6,ndat,2,weight_r,weight_i,comm_fft,cplexwf=cplexwf) 2151 2152 ! Check if we got the initial u(g) within ATOL_DP 2153 ierr = COUNT(ABS(fofg - ref_fofg) > ATOL_DP) 2154 call xmpi_sum(ierr,comm_fft,mpierr) 2155 nfailed = nfailed + ierr 2156 2157 write(info,"(a,i1,a)")sjoin(library,"<G|vloc|u>, istwfk "),istwf_k," :" 2158 2159 if (ierr /= 0) then 2160 max_abserr = MAXVAL(ABS(fofg - ref_fofg)) 2161 call xmpi_max(max_abserr,comm_fft,mpierr) 2162 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 2163 !if (me_fft == 0) write(std_out,*)(fofg(:,ig),ref_fofg(:,ig), ig=1,npw_k*ndat) 2164 else 2165 write(msg,"(a)")" OK" 2166 end if 2167 call wrtout(ount, sjoin(info, msg)) 2168 2169 ABI_FREE(fofg_out) 2170 ABI_FREE(pot) 2171 ABI_FREE(invpot) 2172 ABI_FREE(kg_k) 2173 ABI_FREE(gbound_k) 2174 ABI_FREE(fofg) 2175 ABI_FREE(ref_fofg) 2176 ABI_FREE(fofr) 2177 ABI_FREE(full_kg_k) 2178 ABI_FREE(full_fofg) 2179 2180 call destroy_distribfft(fftabs) 2181 end do 2182 2183 if (nthreads > 0) call xomp_set_num_threads(old_nthreads) 2184 2185 end function fftu_mpi_utests
m_fft/fftu_utests [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fftu_utests
FUNCTION
Unit tests for the FFTs of wavefunctions (sequential version).
INPUTS
OUTPUT
nfailed=number of failed tests.
SOURCE
1437 function fftu_utests(ecut, ngfft, rprimd, ndat, nthreads, unit) result(nfailed) 1438 1439 !Arguments ------------------------------------ 1440 !scalars 1441 integer,intent(in) :: ndat,nthreads 1442 integer :: nfailed 1443 integer,optional,intent(in) :: unit 1444 real(dp),intent(in) :: ecut 1445 !arrays 1446 integer,intent(in) :: ngfft(18) 1447 real(dp),intent(in) :: rprimd(3,3) 1448 1449 !Local variables------------------------------- 1450 !scalars 1451 integer,parameter :: nspinor1=1,mkmem1=1,exchn2n3d0=0,ikg0=0 1452 integer :: nx,ny,nz,nxyz,ldx,ldy,ldz,ierr,npw_k,mgfft,istwf_k,ikpt,ldxyz,ipw,old_nthreads,ount 1453 integer :: fftalg,npw_k_test 1454 real(dp),parameter :: ATOL_SP=tol6, ATOL_DP=tol12 ! Tolerances on the absolute error 1455 real(dp) :: max_abserr,ucvol 1456 character(len=500) :: msg,info,library,cplex_mode,padding_mode 1457 !arrays 1458 integer :: kg_dum(3,0) 1459 integer,allocatable :: gbound_k(:,:),kg_k(:,:) 1460 real(dp) :: kpoint(3),crand(2),kpoints(3,9) 1461 real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3) 1462 real(dp),allocatable :: cg(:,:),cg_ref(:,:),cr(:,:) 1463 complex(spc),allocatable :: ugsp(:),ug_refsp(:),ursp(:) 1464 complex(dpc),allocatable :: ug(:),ug_ref(:),ur(:) 1465 type(MPI_type) :: MPI_enreg_seq 1466 1467 ! ************************************************************************* 1468 1469 ount = std_out; if (PRESENT(unit)) ount = unit 1470 1471 nfailed = 0 1472 fftalg = ngfft(7) 1473 1474 if (nthreads > 0) then 1475 old_nthreads = xomp_get_max_threads() 1476 call xomp_set_num_threads(nthreads) 1477 end if 1478 1479 call metric(gmet,gprimd,-1,rmet,rprimd,ucvol) 1480 1481 nx = ngfft(1); ny = ngfft(2); nz = ngfft(3) 1482 ldx = ngfft(4); ldy = ngfft(5); ldz = ngfft(6) 1483 mgfft = MAXVAL(ngfft(1:3)) 1484 1485 nxyz = nx* ny* nz 1486 ldxyz = ldx*ldy*ldz 1487 1488 ABI_CALLOC(cg_ref, (2, ldxyz*ndat)) 1489 ABI_CALLOC(cg, (2, ldxyz*ndat)) 1490 ABI_CALLOC(cr, (2, ldxyz*ndat)) 1491 1492 ABI_CALLOC(ug_ref, (ldxyz*ndat)) 1493 ABI_CALLOC(ug, (ldxyz*ndat)) 1494 ABI_CALLOC(ur, (ldxyz*ndat)) 1495 1496 ABI_CALLOC(ug_refsp, (ldxyz*ndat)) 1497 ABI_CALLOC(ugsp, (ldxyz*ndat)) 1498 ABI_CALLOC(ursp, (ldxyz*ndat)) 1499 1500 kpoints = RESHAPE([ & 1501 0.1, 0.2, 0.3, & 1502 0.0, 0.0, 0.0, & 1503 0.5, 0.0, 0.0, & 1504 0.0, 0.0, 0.5, & 1505 0.5, 0.0, 0.5, & 1506 0.0, 0.5, 0.0, & 1507 0.5, 0.5, 0.0, & 1508 0.0, 0.5, 0.5, & 1509 0.5, 0.5, 0.5], [3, 9]) 1510 1511 call fftalg_info(fftalg,library,cplex_mode,padding_mode) 1512 1513 call initmpi_seq(MPI_enreg_seq) 1514 1515 do ikpt=1,SIZE(kpoints,DIM=2) 1516 kpoint = kpoints(:,ikpt) 1517 istwf_k = set_istwfk(kpoint) 1518 1519 ! Calculate the number of G-vectors for this k-point. 1520 call kpgsph(ecut,exchn2n3d0,gmet,ikg0,0,istwf_k,kg_dum,kpoint,0,MPI_enreg_seq,0,npw_k) 1521 1522 ! Allocate and calculate the set of G-vectors. 1523 ABI_MALLOC(kg_k,(3,npw_k)) 1524 call kpgsph(ecut,exchn2n3d0,gmet,ikg0,0,istwf_k,kg_k,kpoint,mkmem1,MPI_enreg_seq,npw_k,npw_k_test) 1525 1526 ABI_MALLOC(gbound_k,(2*mgfft+8,2)) 1527 call sphereboundary(gbound_k,istwf_k,kg_k,mgfft,npw_k) 1528 1529 !if (istwf_k==2) then 1530 ! do ipw=1,npw_k 1531 ! write(std_out,*)ipw, kg_k(:,ipw) 1532 ! end do 1533 ! stop 1534 !end if 1535 1536 #if 0 1537 !TODO 1538 ! ================================================ 1539 ! === Test the double precision 2*real version === 1540 ! ================================================ 1541 do ipw=1,npw_k*ndat 1542 call RANDOM_NUMBER(crand) 1543 cg_ref(:,ipw) = crand(:) 1544 end do 1545 1546 if (istwf_k == 2) then 1547 do ipw=1,npw_k*ndat,npw_k 1548 cg_ref(2,ipw) = zero 1549 end do 1550 end if 1551 1552 cg = cg_ref 1553 1554 call fft_ug_dp(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,cg,cr) 1555 call fft_ur_dp(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,cr,cg) 1556 1557 ierr = COUNT(ABS(cg - cg_ref) > ATOL_DP) 1558 nfailed = nfailed + ierr 1559 1560 write(info,"(a,i1,a)")sjoin(library,"fftu_dp, istwfk "),istwf_k," :" 1561 if (ierr /= 0) then 1562 max_abserr = MAXVAL(ABS(cg - cg_ref)) 1563 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 1564 else 1565 write(msg,"(a)")" OK" 1566 end if 1567 call wrtout(ount,sjoin(info, msg)) 1568 #endif 1569 1570 ! ================================================= 1571 ! === Test the single precision complex version === 1572 ! ================================================= 1573 do ipw=1,npw_k*ndat 1574 call RANDOM_NUMBER(crand) 1575 ug_refsp(ipw) = CMPLX(crand(1), crand(2)) 1576 end do 1577 1578 if (istwf_k == 2) then 1579 do ipw=1,npw_k*ndat,npw_k 1580 ug_refsp(ipw) = REAL(ug_refsp(ipw)) 1581 end do 1582 end if 1583 1584 ugsp = ug_refsp 1585 1586 call fft_ug(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ugsp,ursp) 1587 call fft_ur(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ursp,ugsp) 1588 1589 ierr = COUNT(ABS(ugsp - ug_refsp) > ATOL_SP) 1590 nfailed = nfailed + ierr 1591 1592 write(info,"(a,i1,a)")sjoin(library,"fftu_spc, istwfk "),istwf_k," :" 1593 if (ierr /= 0) then 1594 max_abserr = MAXVAL(ABS(ugsp - ug_refsp)) 1595 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 1596 else 1597 write(msg,"(a)")" OK" 1598 end if 1599 call wrtout(ount,sjoin(info, msg)) 1600 1601 ! ================================================= 1602 ! === Test the double precision complex version === 1603 ! ================================================= 1604 do ipw=1,npw_k*ndat 1605 call RANDOM_NUMBER(crand) 1606 ug_ref(ipw) = DCMPLX(crand(1), crand(2)) 1607 end do 1608 1609 if (istwf_k == 2) then 1610 do ipw=1,npw_k*ndat,npw_k 1611 ug_ref(ipw) = REAL(ug_ref(ipw)) 1612 end do 1613 end if 1614 1615 ug = ug_ref 1616 1617 call fft_ug(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ug,ur) 1618 call fft_ur(npw_k,nxyz,nspinor1,ndat,mgfft,ngfft,istwf_k,kg_k,gbound_k,ur,ug) 1619 1620 ierr = COUNT(ABS(ug - ug_ref) > ATOL_DP) 1621 nfailed = nfailed + ierr 1622 1623 write(info,"(a,i1,a)")sjoin(library,"fftu_dpc, istwfk "),istwf_k," :" 1624 if (ierr /= 0) then 1625 max_abserr = MAXVAL(ABS(ug - ug_ref)) 1626 write(msg,"(a,es9.2,a)")" FAILED (max_abserr = ",max_abserr,")" 1627 else 1628 write(msg,"(a)")" OK" 1629 end if 1630 call wrtout(ount, sjoin(info, msg)) 1631 1632 ABI_FREE(kg_k) 1633 ABI_FREE(gbound_k) 1634 end do 1635 1636 ABI_FREE(cg_ref) 1637 ABI_FREE(cg) 1638 ABI_FREE(cr) 1639 ABI_FREE(ug_ref) 1640 ABI_FREE(ug) 1641 ABI_FREE(ur) 1642 ABI_FREE(ug_refsp) 1643 ABI_FREE(ugsp) 1644 ABI_FREE(ursp) 1645 call destroy_mpi_enreg(MPI_enreg_seq) 1646 1647 if (nthreads > 0) call xomp_set_num_threads(old_nthreads) 1648 1649 end function fftu_utests
m_fft/fourdp_6d [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fourdp_6d
FUNCTION
Calculate a 6-dimensional Fast Fourier Transform isign=-1 : A(G1,G2) = Sum(r1,r2) A(r1,r2) exp(-iG1.r1) exp(+iG2.r2) ^ ^ isign=+1 : A(r1,r2) = Sum(G1,G2) A(G1,G2) exp(+iG1.r1) exp(-iG2.r2) ^ ^ isign=-1 and isign=1 form a transform/inverse-transform pair: calling one then the other will take you back to the original function, multiplied by a factor of (nl*nm*nn)**2. ------------------------------------------------------------------ input: a: A(r1,r2) [overwritten] output: a: A(G1,G2) in the format IGFFT ------------------------------------------------------------------
INPUTS
OUTPUT
SIDE EFFECTS
NOTES
SOURCE
4295 subroutine fourdp_6d(cplex,matrix,isign,MPI_enreg,nfft,ngfft,tim_fourdp) 4296 4297 !Arguments ------------------------------------ 4298 !scalars 4299 integer,intent(in) :: cplex,isign,nfft,tim_fourdp 4300 type(MPI_type),intent(in) :: MPI_enreg 4301 !arrays 4302 integer,intent(in) :: ngfft(18) 4303 complex(gwpc),intent(inout) :: matrix(nfft,nfft) 4304 4305 !Local variables------------------------------- 4306 !scalars 4307 !integer,parameter :: cplex=2 4308 integer :: i1,i2,i3,ifft 4309 integer :: n1,n2,n3 4310 !arrays 4311 real(dp),allocatable :: fofg(:,:),fofr(:) 4312 4313 ! ************************************************************************* 4314 4315 !TODO check normalization factor, it is better if we use the GW conventions. 4316 n1 = ngfft(1) 4317 n2 = ngfft(2) 4318 n3 = ngfft(3) 4319 4320 ABI_MALLOC(fofg,(2,nfft)) 4321 ABI_MALLOC(fofr,(cplex*nfft)) 4322 4323 do i3=0,n3-1 4324 do i2=0,n2-1 4325 do i1=0,n1-1 4326 4327 ifft=1+i1+i2*n1+i3*n1*n2 4328 if (isign==1) then 4329 ! G1 -> r1 transform for each G2 to form A(r1,G2) 4330 fofg(1,:)=REAL (matrix(:,ifft)) 4331 fofg(2,:)=AIMAG(matrix(:,ifft)) 4332 else if (isign==-1) then 4333 ! r1 -> G1 transform for each r2 to form A(G1,r2) 4334 fofr(1:nfft) =REAL (matrix(:,ifft)) 4335 fofr(nfft+1:2*nfft)=AIMAG(matrix(:,ifft)) 4336 else 4337 ABI_ERROR("Wrong isign") 4338 end if 4339 4340 call fourdp(cplex,fofg,fofr,isign,MPI_enreg,nfft,1,ngfft,tim_fourdp) 4341 4342 if (isign==1) then ! Save A(r1,G2) 4343 matrix(:,ifft)=CMPLX(fofr(1:nfft),fofr(nfft+1:2*nfft)) 4344 else if (isign==-1) then ! Save A(G1,r2) 4345 matrix(:,ifft)=CMPLX(fofg(1,:),fofg(2,:)) 4346 end if 4347 4348 end do 4349 end do 4350 end do 4351 4352 do i3=0,n3-1 4353 do i2=0,n2-1 4354 do i1=0,n1-1 4355 4356 ifft=1+i1+i2*n1+i3*n1*n2 4357 if (isign==1) then 4358 ! Do the G2 -> r2 transform of A(r1,G2) to get A(r1,r2) 4359 fofr(1:nfft )=REAL (matrix(ifft,:)) 4360 fofr(nfft+1:2*nfft)=AIMAG(matrix(ifft,:)) 4361 else if (isign==-1) then 4362 ! Do the r2 -> G2 transform of A(G1,r2) to get A(G1,G2) 4363 fofg(1,:)=REAL (matrix(ifft,:)) 4364 fofg(2,:)=AIMAG(matrix(ifft,:)) 4365 end if 4366 4367 call fourdp(2,fofg,fofr,-isign,MPI_enreg,nfft,1,ngfft,tim_fourdp) 4368 4369 if (isign==1) then 4370 matrix(ifft,:)=CMPLX(fofg(1,:),fofg(2,:)) 4371 else if (isign==-1) then 4372 matrix(ifft,:)=CMPLX(fofr(1:nfft),fofr(nfft+1:2*nfft)) 4373 end if 4374 4375 end do 4376 end do 4377 end do 4378 4379 ABI_FREE(fofg) 4380 ABI_FREE(fofr) 4381 4382 end subroutine fourdp_6d
m_fft/fourdp_mpi [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fourdp_mpi
FUNCTION
Conduct Fourier transform of REAL or COMPLEX function f(r)=fofr defined on fft grid in real space, to create complex f(G)=fofg defined on full fft grid in reciprocal space, in full storage mode, or the reverse operation. For the reverse operation, the final data is divided by nfftot. REAL case when cplex=1, COMPLEX case when cplex=2 Usually used for density and potentials.
INPUTS
cplex=1 if fofr is real, 2 if fofr is complex nfft=(effective) number of FFT grid points (for this processor) ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft ndat=Numbre of FFT transforms isign=sign of Fourier transform exponent: current convention uses +1 for transforming from G to r -1 for transforming from r to G. fftn2_distrib(2),ffti2_local(2) fftn3_distrib(3),ffti3_local(3) comm_fft=MPI communicator
SIDE EFFECTS
Input/Output fofg(2,nfft)=f(G), complex. fofr(cplex*nfft)=input function f(r) (real or complex)
TODO
Write simplified API for sequential version.
SOURCE
3471 subroutine fourdp_mpi(cplex,nfft,ngfft,ndat,isign,& 3472 & fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 3473 3474 !Arguments ------------------------------------ 3475 !scalars 3476 integer,intent(in) :: cplex,isign,nfft,ndat,comm_fft 3477 !arrays 3478 integer,intent(in) :: ngfft(18) 3479 integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2)) 3480 integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3)) 3481 real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat) 3482 3483 !Local variables------------------------------- 3484 !scalars 3485 integer :: fftalg,fftalga,fftalgc 3486 character(len=500) :: msg 3487 3488 ! ************************************************************************* 3489 3490 fftalg=ngfft(7); fftalga=fftalg/100 ; fftalgc=mod(fftalg,10) 3491 3492 select case (fftalga) 3493 case (FFT_SG2002) 3494 call sg2002_mpifourdp(cplex,nfft,ngfft,ndat,isign,& 3495 fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 3496 3497 case (FFT_FFTW3) 3498 call fftw3_mpifourdp(cplex,nfft,ngfft,ndat,isign,& 3499 fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 3500 3501 ! TODO 3502 !case (FFT_DFTI) 3503 ! call dfti_mpifourdp(cplex,nfft,ngfft,ndat,isign,& 3504 !& fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 3505 3506 case default 3507 write(msg,"(a,i0)")"Wrong fftalg: ",fftalg 3508 ABI_BUG(msg) 3509 end select 3510 3511 end subroutine fourdp_mpi
m_fft/fourwf_mpi [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
fourwf_mpi
FUNCTION
Carry out composite Fourier transforms between real and reciprocal (G) space. Wavefunctions, contained in a sphere in reciprocal space, can be FFT to real space. They can also be FFT from real space to a sphere. Also, the density maybe accumulated, and a local potential can be applied. The different options are : - reciprocal to real space and output the result (option=0), - reciprocal to real space and accumulate the density (option=1) - reciprocal to real space, apply the local potential to the wavefunction in real space and produce the result in reciprocal space (option=2) - real space to reciprocal space (option=3). Schedule of operations - fftalgc=1 : use separate forward and backward transforms (7/12 savings in execution time); - fftalgc=2 : in case of option=1 and option=2, use routines for composite operation even faster than 1x1 Also for better speed, it uses no F90 construct, except the allocate command and for zeroing arrays.
INPUTS
cplex= if 1 , denpot is real, if 2 , denpot is complex (cplex=2 only allowed for option=2, and istwf_k=1) not relevant if option=0 or option=3, so cplex=0 can be used to minimize memory fftalgc=1 or 2 => simple or composite FFT applications fofgin(2,npwin)=holds input wavefunction in G vector basis sphere. (intent(in) but the routine sphere can modify it for another iflag) gboundin(2*mgfft+8,2)=sphere boundary info for reciprocal to real space gboundout(2*mgfft+8,2)=sphere boundary info for real to reciprocal space istwf_k=option parameter that describes the storage of wfs kg_kin(3,npwin)=reduced planewave coordinates, input kg_kout(3,npwout)=reduced planewave coordinates, output me_g0=1 if this MPI node treats the Gamma, 0 otherwise mgfft=maximum size of 1D FFTs ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft distribfft<distribfft_type>=Tables needed for the FFT parallelism n1,n2,n3=1D FFT sizes npwin=number of elements in fofgin array (for option 0, 1 and 2) npwout=number of elements in fofgout array (for option 2 and 3) n4,n5,n6=dimensions of fofr. ndat=Number of FFTs option= if 0: do direct FFT if 1: do direct FFT, then sum the density if 2: do direct FFT, multiply by the potential, then do reverse FFT if 3: do reverse FFT only weight_r=weight to be used for the accumulation of the density in real space (needed only when option=1) weight_i=weight to be used for the accumulation of the density in real space (needed only when option=1) comm_fft=MPI communicator. [cplexwf]= 1 c2r or r2c can be used. Default: 2 i.e. use complex-to-complex FFTs.
OUTPUT
(see side effects)
SIDE EFFECTS
Input/Output for option==0, fofgin(2,npwin)=holds input wavefunction in G sphere; fofr(2,n4,n5,n6) contains the output Fourier Transform of fofgin; no use of denpot, fofgout and npwout. for option==1, fofgin(2,npwin)=holds input wavefunction in G sphere; denpot(cplex*n4,n5,n6) contains the input density at input, and the updated density at output (accumulated); no use of fofgout and npwout. for option==2, fofgin(2,npwin)=holds input wavefunction in G sphere; denpot(cplex*n4,n5,n6) contains the input local potential; fofgout(2,npwout) contains the output function; for option==3, fofr(2,n4,n5,n6) contains the input real space wavefunction; fofgout(2,npwout) contains its output Fourier transform; no use of fofgin and npwin.
SOURCE
3597 subroutine fourwf_mpi(cplex,denpot,fofgin,fofgout,fofr,& 3598 & gboundin,gboundout,istwf_k,kg_kin,kg_kout,me_g0,mgfft,ngfft,distribfft,n1,n2,n3,& 3599 & npwin,npwout,n4,n5,n6,ndat,option,weight_r,weight_i,comm_fft,cplexwf) 3600 3601 3602 !Arguments ------------------------------------ 3603 !scalars 3604 integer,intent(in) :: cplex,istwf_k,mgfft,n1,n2,n3,n4,n5,n6,npwin,ndat 3605 integer,intent(in) :: npwout,option,comm_fft,me_g0 3606 integer,intent(in),optional :: cplexwf 3607 real(dp),intent(in) :: weight_r,weight_i 3608 type(distribfft_type),intent(in) :: distribfft 3609 !arrays 3610 integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2),ngfft(18) 3611 integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout) 3612 real(dp),intent(inout) :: denpot(cplex*n4,n5,n6),fofgin(2,npwin*ndat) 3613 real(dp),intent(inout) :: fofr(2,n4,n5,n6*ndat) 3614 real(dp),intent(out) :: fofgout(2,npwout*ndat) 3615 3616 !Local variables------------------------------- 3617 !scalars 3618 integer :: fftalg,fftalga,fftalgc,idat 3619 integer :: cplexwf_,i1,i2,i3,iflag,ig,igdat,me_fft,m1i,m1o,m2i,m2o,m3i 3620 integer :: m3o,max1i,max1o,max2i,max2i_plus,max2o,max2o_plus 3621 integer :: max3i,max3o,md1,md1i,md1o,md2,md2i,md2o,md2proc 3622 integer :: md3,md3i,md3o,min1i,min1o,min2i,min2i_moins,min2o,min2o_moins,min3i,min3o 3623 integer :: nd3proc,nproc_fft,n6eff,i3_glob,i2_loc,i2dat_loc,i3dat 3624 integer,save :: nwrites_ialltoall=0 3625 logical :: use_ialltoall 3626 real(dp) :: fim,fre,xnorm 3627 character(len=500) :: msg 3628 !arrays 3629 integer,parameter :: shiftg0(3)=0 3630 integer,parameter :: symmE(3,3)=reshape([1,0,0,0,1,0,0,0,1],[3,3]) 3631 ! real(dp) :: tsec(2) 3632 real(dp) :: weight_array_r(ndat), weight_array_i(ndat) 3633 real(dp),allocatable :: workf(:,:,:,:) 3634 3635 ! ************************************************************************* 3636 3637 !call timab(540,1,tsec) 3638 3639 fftalg=ngfft(7); fftalga=fftalg/100 ; fftalgc=mod(fftalg,10) 3640 3641 me_fft=ngfft(11); nproc_fft=ngfft(10) 3642 !write(std_out,*)"nproc_fft",nproc_fft 3643 ! Risky since I don't know if this entry is initialized 3644 ! Continue to pass comm_fft explicitly. 3645 !comm_fft = ngfft(14) 3646 3647 if (fftalgc<1 .or. fftalgc>2) then 3648 write(msg,'(a,i0,3a)')& 3649 'The input algorithm number fftalgc=',fftalgc,' is not allowed with MPI-FFT. Must be 1 or 2',ch10,& 3650 'Action: change fftalgc in your input file.' 3651 ABI_ERROR(msg) 3652 end if 3653 3654 if (option<0 .or. option>3) then 3655 write(msg,'(a,i0,3a)')& 3656 'The option number',option,' is not allowed.',ch10,& 3657 'Only option=0, 1, 2 or 3 are allowed presently.' 3658 ABI_ERROR(msg) 3659 end if 3660 3661 if (option==1 .and. cplex/=1) then 3662 write(msg,'(a,i0,a)')& 3663 'With the option number 1, cplex must be 1 but it is cplex=',cplex,'.' 3664 ABI_ERROR(msg) 3665 end if 3666 3667 if ( ALL(cplex/=(/1,2/)) .and. ANY(option==(/1,2/)) ) then 3668 write(msg,'(a,i0,a)')' When option is (1,2) cplex must be 1 or 2, but it is cplex=',cplex,'.' 3669 ABI_ERROR(msg) 3670 end if 3671 3672 !write(std_out,*)"in fourwf_mpi with fftalg: ",fftalg,fftalgc 3673 3674 ! We use the non-blocking version if IALLTOALL is available and ndat > 1. 3675 use_ialltoall = .False. 3676 #ifdef HAVE_MPI_IALLTOALL 3677 use_ialltoall = (ndat > 1) 3678 #endif 3679 use_ialltoall = (use_ialltoall .and. ALLOW_IALLTOALL) 3680 if (use_ialltoall .and. nwrites_ialltoall==0) then 3681 nwrites_ialltoall = 1 3682 call wrtout(std_out, "- Will use non-blocking ialltoall for MPI-FFT") 3683 end if 3684 3685 md1i=0; md2i=0; md3i=0; m2i=0 3686 md1o=0; md2o=0; md3o=0; m2o=0 3687 3688 if (option/=3) then 3689 ! Compute the dimensions of the small-box enclosing the input G-sphere 3690 max1i=gboundin(2,1); min1i=gboundin(1,1) 3691 max2i=gboundin(4,1); min2i=gboundin(3,1) 3692 3693 if(istwf_k==2 .or. istwf_k==4 .or. istwf_k==6 .or. istwf_k==8)then 3694 max1i=max(max1i,-min1i) 3695 min1i=-max1i 3696 else if (istwf_k==3 .or. istwf_k==5 .or. istwf_k==7 .or. istwf_k==9) then 3697 max1i=max(max1i,-min1i-1) 3698 min1i=-max1i-1 3699 end if 3700 if (istwf_k>=2 .and. istwf_k<=5) then 3701 max2i=max(max2i,-min2i) 3702 min2i=-max2i 3703 else if (istwf_k>=6 .and. istwf_k<=9) then 3704 max2i=max(max2i,-min2i-1) 3705 min2i=-max2i-1 3706 end if 3707 3708 max3i=gboundin(4,2); min3i=gboundin(3,2) 3709 3710 ! Compute arrays size and leading dimensions to avoid cache trashing 3711 m1i=max1i-min1i+1; md1i=2*(m1i/2)+1 3712 m2i=max2i-min2i+1; md2i=2*(m2i/2)+1 3713 3714 !if (.False.) then 3715 if (nproc_fft/=1) then 3716 ! Increase max2i in order to have m2i divisible by nproc_fft 3717 min2i_moins=(((m2i-1)/nproc_fft+1)*nproc_fft-m2i)/2 3718 max2i_plus=((m2i-1)/nproc_fft+1)*nproc_fft-m2i-min2i_moins 3719 ! max2i=max2i+((m2i-1)/nproc_fft+1)*nproc_fft-m2i 3720 max2i=max2i+max2i_plus 3721 min2i=min2i-min2i_moins 3722 ! careful, to be checked and make sure the max and min are smaller than size of box 3723 m2i=max2i-min2i+1; md2i=2*(m2i/2)+1 3724 end if 3725 ABI_CHECK(m2i <= n2, "m2i > n2") 3726 3727 m3i=max3i-min3i+1; md3i=2*(m3i/2)+1 3728 end if 3729 3730 if (option==2 .or. option==3) then 3731 ! Compute the dimensions of the small-box enclosing the output G-sphere 3732 max1o=gboundout(2,1); min1o=gboundout(1,1) 3733 max2o=gboundout(4,1); min2o=gboundout(3,1) 3734 3735 if (istwf_k==2 .or. istwf_k==4 .or. istwf_k==6 .or. istwf_k==8) then 3736 max1o=max(max1o,-min1o) 3737 min1o=-max1o 3738 else if (istwf_k==3 .or. istwf_k==5 .or. istwf_k==7 .or. istwf_k==9) then 3739 max1o=max(max1o,-min1o-1) 3740 min1o=-max1o-1 3741 end if 3742 if (istwf_k>=2 .and. istwf_k<=5) then 3743 max2o=max(max2o,-min2o) 3744 min2o=-max2o 3745 else if (istwf_k>=6 .and. istwf_k<=9) then 3746 max2o=max(max2o,-min2o-1) 3747 min2o=-max2o-1 3748 end if 3749 3750 max3o=gboundout(4,2); min3o=gboundout(3,2) 3751 3752 ! Compute arrays size and leading dimensions to avoid cache trashing 3753 m1o=max1o-min1o+1; md1o=2*(m1o/2)+1 3754 m2o=max2o-min2o+1; md2o=2*(m2o/2)+1 3755 3756 if (nproc_fft/=1) then 3757 ! Increase max2o in order to have m2o divisible by nproc_fft 3758 min2o_moins=(((m2o-1)/nproc_fft+1)*nproc_fft-m2o)/2 3759 max2o_plus=((m2o-1)/nproc_fft+1)*nproc_fft-m2o-min2o_moins 3760 ! max2o=max2o+((m2o-1)/nproc_fft+1)*nproc_fft-m2o 3761 max2o=max2o+max2o_plus 3762 min2o=min2o-min2o_moins 3763 ! careful, to be checked and make sure the max and min are smaller than size of box 3764 m2o=max2o-min2o+1; md2o=2*(m2o/2)+1 3765 end if 3766 ABI_CHECK(m2o <= n2, "m2o > n2") 3767 3768 m3o=max3o-min3o+1; md3o=2*(m3o/2)+1 3769 end if 3770 3771 md1=max(md1i,md1o) 3772 md2=max(md2i,md2o) 3773 md3=max(md3i,md3o) 3774 3775 md2proc=(max(m2i,m2o)-1)/nproc_fft+1 3776 n6eff=(n6-1)/nproc_fft+1 3777 !write(std_out,*)'fourwf_mpi : max1i,max2i,max3i=',max1i,max2i,max3i 3778 !write(std_out,*)'fourwf_mpi : min1i,min2i,min3i=',min1i,min2i,min3i 3779 !write(std_out,*)'fourwf_mpi : m1i,m2i,m3i=',m1i,m2i,m3i 3780 3781 ! Allocate work array in G-space (note exchange 3 <--> 2) 3782 ABI_MALLOC(workf,(2,md1,md3,md2proc*ndat)) 3783 3784 if (option/=3) then 3785 ! Insert fofgin into the **small** box (array workf) : 3786 ! Note the switch of md3 and md2, as they are only needed to dimension workf inside "sphere" 3787 3788 if (nproc_fft > 1) then 3789 if (istwf_k/=1 )then 3790 write(msg,'(a,i0,a)')'The value of istwf_k: ',istwf_k,' is not allowed. Only istwf_k=1 is allowed in MPI-FFT' 3791 !ABI_WARNING(msg) 3792 ABI_ERROR(msg) 3793 end if 3794 call sphere_fft1(fofgin,ndat,npwin,workf,m1i,m2i,m3i,md1,md3,md2proc,kg_kin,distribfft%tab_fftwf2_local) 3795 else 3796 iflag=2 3797 call sphere(fofgin,ndat,npwin,workf,m1i,m2i,m3i,md1,md3,md2proc,kg_kin,istwf_k,iflag,me_g0,shiftg0,symmE,one) 3798 end if 3799 end if 3800 3801 ! Can we use c2r or r2c? 3802 cplexwf_=2; if (istwf_k==2) cplexwf_=1 3803 if (present(cplexwf)) cplexwf_ = cplexwf 3804 3805 if (option==0 .or. ((option==1.or.option==2) .and. fftalgc==1) .or. option==3) then 3806 ! Treat non-composite operations 3807 3808 if (option/=3) then 3809 ! Fourier transform workf(2,md1,md3,md2proc*ndat) to fofr (reciprocal to real space). 3810 ! FIXME: This is buggy if cplexwx==1 3811 3812 select case (fftalga) 3813 3814 case (FFT_SG2002) 3815 3816 ! do idat=1,ndat 3817 ! call sg2002_mpiback_wf(cplexwf_,1,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,& 3818 ! & max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3, & 3819 ! & workf(:,:,:,(idat-1)*md2proc+1:idat*md2proc), & 3820 ! & fofr(:,:,:,(idat-1)*n6eff+1:idat*n6eff),comm_fft) 3821 ! enddo 3822 call sg2002_mpiback_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,& 3823 max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,workf,fofr,comm_fft) 3824 3825 case (FFT_FFTW3) 3826 3827 if (use_ialltoall) then 3828 call fftw3_mpiback_manywf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,& 3829 max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,workf,fofr,comm_fft) 3830 else 3831 call fftw3_mpiback_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,& 3832 max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,workf,fofr,comm_fft) 3833 end if 3834 3835 !case (FFT_DFTI) 3836 ! call dfti_mpiback_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,& 3837 ! & max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,workf,fofr,comm_fft) 3838 case default 3839 ABI_ERROR("fftalga does not provide MPI back_wf") 3840 end select 3841 3842 end if ! option 3843 3844 nd3proc=(n3-1)/nproc_fft+1 3845 3846 if (option==1) then 3847 ! Accumulate density 3848 do idat=1,ndat 3849 do i3=1,nd3proc 3850 i3_glob = i3 + nd3proc * me_fft 3851 i3dat = i3 + n6eff * (idat-1) 3852 do i2=1,n2 3853 do i1=1,n1 3854 denpot(i1,i2,i3_glob) = denpot(i1,i2,i3_glob) & 3855 + (weight_r*fofr(1,i1,i2,i3dat)**2+ weight_i*fofr(2,i1,i2,i3dat)**2) 3856 end do 3857 end do 3858 end do 3859 end do 3860 end if ! option==1 3861 3862 if (option==2) then 3863 3864 ! Apply local potential 3865 if (cplex==1) then 3866 do idat=1,ndat 3867 do i3=1,nd3proc 3868 i3_glob = i3 + nd3proc * me_fft 3869 i3dat = i3 + n6eff * (idat-1) 3870 do i2=1,n2 3871 do i1=1,n1 3872 fofr(1,i1,i2,i3dat)=denpot(i1,i2,i3_glob)*fofr(1,i1,i2,i3dat) 3873 fofr(2,i1,i2,i3dat)=denpot(i1,i2,i3_glob)*fofr(2,i1,i2,i3dat) 3874 end do 3875 end do 3876 end do 3877 end do 3878 3879 else if (cplex==2) then 3880 do idat=1,ndat 3881 do i3=1,(n3-1)/nproc_fft+1 3882 i3_glob = i3 + nd3proc * me_fft 3883 i3dat = i3 + n6eff * (idat-1) 3884 do i2=1,n2 3885 do i1=1,n1 3886 fre=fofr(1,i1,i2,i3dat) 3887 fim=fofr(2,i1,i2,i3dat) 3888 fofr(1,i1,i2,i3dat)=denpot(2*i1-1,i2,i3_glob)*fre -denpot(2*i1,i2,i3_glob)*fim 3889 fofr(2,i1,i2,i3dat)=denpot(2*i1-1,i2,i3_glob)*fim +denpot(2*i1,i2,i3_glob)*fre 3890 end do 3891 end do 3892 end do 3893 end do 3894 end if ! cplex 3895 3896 end if ! option==2 3897 3898 if (option==2 .or. option==3) then 3899 ! Fourier transform fofr to workf (real to reciprocal space) 3900 ! output in workf(2,md1,md3,md2proc*ndat) 3901 3902 select case (fftalga) 3903 case (FFT_SG2002) 3904 3905 call sg2002_mpiforw_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,& 3906 max1o,max2o,max3o,m1o,m2o,m3o,md1,md2proc,md3,fofr,workf,comm_fft) 3907 3908 case (FFT_FFTW3) 3909 3910 if (use_ialltoall) then 3911 call fftw3_mpiforw_manywf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,& 3912 max1o,max2o,max3o,m1o,m2o,m3o,md1,md2proc,md3,fofr,workf,comm_fft) 3913 else 3914 call fftw3_mpiforw_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,& 3915 max1o,max2o,max3o,m1o,m2o,m3o,md1,md2proc,md3,fofr,workf,comm_fft) 3916 end if 3917 3918 !case (FFT_DFTI) 3919 ! call dfti_mpiforw_wf(cplexwf_,ndat,n1,n2,n3,n4,n5,(n6-1)/nproc_fft+1,& 3920 ! &max1o,max2o,max3o,m1o,m2o,m3o,md1,md2proc,md3,fofr,workf,comm_fft) 3921 3922 case default 3923 ABI_ERROR("fftalga does not provide MPI back_wf") 3924 end select 3925 3926 end if 3927 3928 else if (fftalgc==2 .and. (option==1 .or. option==2)) then 3929 ! Treat composite operations 3930 3931 select case (option) 3932 case (1) 3933 !ABI_CHECK(weight_r == weight_i,"weight_r != weight_i") 3934 weight_array_r(:)=weight_r 3935 weight_array_i(:)=weight_i 3936 3937 select case (fftalga) 3938 case (FFT_SG2002) 3939 ! Note that here we don' fill fofr. Don't know if someone in 3940 ! abinit uses option 1 to get both fofr as well as denpot 3941 call sg2002_accrho(cplexwf_,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,& 3942 max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,comm_fft,nproc_fft,me_fft,& 3943 workf,denpot,weight_array_r,weight_array_i) 3944 3945 case (FFT_FFTW3) 3946 call fftw3_accrho(cplexwf_,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,& 3947 max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,comm_fft,nproc_fft,me_fft,& 3948 workf,denpot,weight_array_r, weight_array_i) 3949 3950 case default 3951 ABI_ERROR("fftalga does not provide accrho") 3952 end select 3953 3954 case (2) 3955 !write(std_out,*)fftalg,option,cplex 3956 !ABI_CHECK(cplex==1,"cplex!=2 with fftalg 412 is buggy") 3957 3958 select case (fftalga) 3959 3960 case (FFT_SG2002) 3961 3962 if (use_ialltoall) then 3963 call sg2002_applypot_many(cplexwf_,cplex,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,& 3964 max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,& 3965 max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,denpot,workf) 3966 3967 else 3968 call sg2002_applypot(cplexwf_,cplex,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,& 3969 max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,& 3970 max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,denpot,workf) 3971 endif 3972 3973 case (FFT_FFTW3) 3974 3975 if (use_ialltoall) then 3976 3977 call fftw3_applypot_many(cplexwf_,cplex,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,& 3978 max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,& 3979 max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,denpot,workf) 3980 3981 else 3982 call fftw3_applypot(cplexwf_,cplex,ndat,n1,n2,n3,n4,n5,n6,(n6-1)/nproc_fft+1,& 3983 max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,& 3984 max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,denpot,workf) 3985 end if 3986 3987 case default 3988 ABI_ERROR("fftalga does not provide applypot") 3989 end select 3990 3991 case default 3992 write(msg,"(a,i0,a)")"Option ",option," is not supported when fftalgc == 2" 3993 ABI_ERROR(msg) 3994 end select 3995 3996 end if ! End of composite operations 3997 3998 if (option==2 .or. option==3) then 3999 ! From FFT box to kg_kout. 4000 xnorm=one/dble(n1*n2*n3) 4001 4002 if (nproc_fft > 1) then 4003 do idat=1,ndat 4004 do ig=1,npwout 4005 i1=kg_kout(1,ig); if(i1<0) i1=i1+m1o; i1=i1+1 4006 i2=kg_kout(2,ig); if(i2<0) i2=i2+m2o; i2=i2+1 4007 i3=kg_kout(3,ig); if(i3<0) i3=i3+m3o; i3=i3+1 4008 4009 igdat = ig + (idat-1) * npwout 4010 i2_loc = (i2-1)/nproc_fft +1 4011 i2dat_loc = i2_loc + (idat-1) * md2proc 4012 4013 fofgout(1,igdat)=workf(1,i1,i3,i2dat_loc)*xnorm 4014 fofgout(2,igdat)=workf(2,i1,i3,i2dat_loc)*xnorm 4015 end do 4016 end do 4017 else 4018 ! Warning: This call is buggy if istwfk > 2 4019 iflag=-2 4020 call sphere(fofgout,ndat,npwout,workf,m1o,m2o,m3o,md1,md3,md2proc,kg_kout,istwf_k,iflag,& 4021 me_g0,shiftg0,symmE,xnorm) 4022 end if 4023 end if ! if option==2 or 3 4024 4025 ABI_FREE(workf) 4026 4027 !call timab(540,2,tsec) 4028 4029 end subroutine fourwf_mpi
m_fft/indirect_parallel_Fourier [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
indirect_parallel_Fourier
FUNCTION
The purpose of this routine is to transfer data from right to left right(:,index(i))=left(:,i) The difficulty is that right and left are distributed among processors We will suppose that the distribution is done as a density in Fourier space We first order the right hand side data according to the processor in which they are going to be located in the left hand side. This is done is a way such that a mpi_alltoall put the data on the correct processor. We also transfer their future adress. A final ordering put everything in place
INPUTS
index(sizeindex)= global adress for the transfer from right to left left(2,nleft)=left hand side mpi_enreg=information about MPI parallelization ngleft(18)=contain all needed information about 3D FFT for the left hand side see ~abinit/doc/variables/vargs.htm#ngfft ngright(18)=contain all needed information about 3D FFT for the right hand side see ~abinit/doc/variables/vargs.htm#ngfft nleft=second dimension of left array (for this processor) nright=second dimension of right array (for this processor) sizeindex=size of the index array (different form nright, because it is global to all proccessors)
OUTPUT
left(2,nleft)=the elements of the right hand side, at the correct palce in the correct processor
NOTES
A lot of things to improve.
SOURCE
4557 subroutine indirect_parallel_Fourier(index,left,mpi_enreg,ngleft,ngright,nleft,nright,paral_kgb,right,sizeindex) 4558 4559 !Arguments --------------------------------------------- 4560 !scalars 4561 integer,intent(in) :: ngleft(18),ngright(18),nleft,nright,paral_kgb,sizeindex 4562 type(MPI_type),intent(in) :: mpi_enreg 4563 !arrays 4564 integer,intent(in) :: index(sizeindex) 4565 real(dp),intent(in) :: right(2,nright) 4566 real(dp),intent(inout) :: left(2,nleft) 4567 4568 !Local variables --------------------------------------- 4569 !scalars 4570 integer :: ierr,i_global,ileft,iright,iright_global 4571 integer :: j,j1,j2,j3,j_global,jleft_global 4572 integer :: jleft_local,me_fft,n1l,n2l,n3l,n1r,n2r,n3r,nd2l,nd2r 4573 integer :: nproc_fft,proc_dest,r2,siz_slice_max 4574 !arrays 4575 integer,allocatable :: index_recv(:),index_send(:),siz_slice(:), ffti2r_global(:) 4576 integer, ABI_CONTIGUOUS pointer :: fftn2l_distrib(:),ffti2l_local(:) 4577 integer, ABI_CONTIGUOUS pointer :: fftn3l_distrib(:),ffti3l_local(:) 4578 integer, ABI_CONTIGUOUS pointer :: fftn2r_distrib(:),ffti2r_local(:) 4579 integer, ABI_CONTIGUOUS pointer :: fftn3r_distrib(:),ffti3r_local(:) 4580 real(dp),allocatable :: right_send(:,:),right_recv(:,:) 4581 4582 ! ************************************************************************* 4583 n1r=ngright(1);n2r=ngright(2);n3r=ngright(3) 4584 n1l=ngleft(1) ;n2l=ngleft(2) ;n3l=ngleft(3) 4585 nproc_fft=mpi_enreg%nproc_fft; me_fft=mpi_enreg%me_fft 4586 nd2r=n2r/nproc_fft; nd2l=n2l/nproc_fft 4587 4588 !Get the distrib associated with the left fft_grid 4589 call ptabs_fourdp(mpi_enreg,n2l,n3l,fftn2l_distrib,ffti2l_local,fftn3l_distrib,ffti3l_local) 4590 4591 !Get the distrib associated with the right fft_grid 4592 call ptabs_fourdp(mpi_enreg,n2r,n3r,fftn2r_distrib,ffti2r_local,fftn3r_distrib,ffti3r_local) 4593 4594 !Precompute local --> global corespondance 4595 ABI_MALLOC(ffti2r_global,(nd2r)) 4596 ffti2r_global(:) = -1 4597 do j2=1,n2r 4598 if( fftn2r_distrib(j2) == me_fft ) then 4599 ffti2r_global( ffti2r_local(j2) ) = j2 4600 end if 4601 end do 4602 4603 4604 ABI_MALLOC(siz_slice,(nproc_fft)) 4605 siz_slice(:)=0 4606 do i_global=1,sizeindex !look for the maximal size of slice of data 4607 j_global=index(i_global)!; write(std_out,*) j_global,i_global 4608 if(j_global /=0) then 4609 !use the fact that (j-1)=i1 + n1l*(j2l-1 + n2l*(j3l-1)) 4610 proc_dest= fftn2l_distrib( modulo((j_global-1)/n1l,n2l) + 1) 4611 siz_slice(proc_dest+1)=siz_slice(proc_dest+1)+1 4612 !write(std_out,*) 'in indirect proc',proc_dest,siz_slice(proc_dest+1) 4613 end if 4614 end do 4615 siz_slice_max=maxval(siz_slice) !This value could be made smaller by looking locally 4616 !and performing a allgather with a max 4617 !write(std_out,*) 'siz_slice,sizeindex,siz_slice',siz_slice(:),sizeindex,siz_slice_max 4618 !write(std_out,*) 'sizeindex,nright,nleft',sizeindex,nright,nleft 4619 ABI_MALLOC(right_send,(2,nproc_fft*siz_slice_max)) 4620 ABI_MALLOC(index_send,(nproc_fft*siz_slice_max)) 4621 siz_slice(:)=0; index_send(:)=0; right_send(:,:)=zero 4622 do iright=1,nright 4623 j=iright-1;j1=modulo(j,n1r);j2=modulo(j/n1r,nd2r);j3=j/(n1r*nd2r) 4624 j2 = ffti2r_global(j2+1) - 1 4625 iright_global=n1r*(n2r*j3+j2)+j1+1 4626 jleft_global=index(iright_global) 4627 if(jleft_global/=0)then 4628 j=jleft_global-1;j1=modulo(j,n1l);j2=modulo(j/n1l,n2l);j3=j/(n1l*n2l); r2=ffti2l_local(j2+1)-1 4629 jleft_local=n1l*(nd2l*j3+r2)+j1+1 4630 proc_dest=fftn2l_distrib(j2+1) 4631 siz_slice(proc_dest+1)=siz_slice(proc_dest+1)+1 4632 right_send(:,proc_dest*siz_slice_max+siz_slice(proc_dest+1))=right(:,iright) 4633 index_send(proc_dest*siz_slice_max+siz_slice(proc_dest+1))=jleft_local 4634 !write(std_out,*) 'loop ir',jleft_local,jleft_global,iright_global,iright 4635 end if 4636 end do 4637 ABI_MALLOC(right_recv,(2,nproc_fft*siz_slice_max)) 4638 ABI_MALLOC(index_recv,(nproc_fft*siz_slice_max)) 4639 #if defined HAVE_MPI 4640 if(paral_kgb == 1) then 4641 call mpi_alltoall (right_send,2*siz_slice_max, & 4642 & MPI_double_precision, & 4643 & right_recv,2*siz_slice_max, & 4644 & MPI_double_precision,mpi_enreg%comm_fft,ierr) 4645 call mpi_alltoall (index_send,siz_slice_max, & 4646 & MPI_integer, & 4647 & index_recv,siz_slice_max, & 4648 & MPI_integer,mpi_enreg%comm_fft,ierr) 4649 endif 4650 #endif 4651 do ileft=1,siz_slice_max*nproc_fft 4652 !write(std_out,*)index_recv(ileft) 4653 if(index_recv(ileft) /=0 ) left(:,index_recv(ileft))=right_recv(:,ileft) 4654 end do 4655 ABI_FREE(right_recv) 4656 ABI_FREE(index_recv) 4657 ABI_FREE(right_send) 4658 ABI_FREE(index_send) 4659 ABI_FREE(siz_slice) 4660 ABI_FREE(ffti2r_global) 4661 4662 end subroutine indirect_parallel_Fourier
m_fft/uplan_execute_gr_dpc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
uplan_execute_gr_dpc
FUNCTION
INPUTS
SOURCE
4884 subroutine uplan_execute_gr_dpc(uplan, ndat, ug, ur, isign, iscale) 4885 4886 !Arguments ------------------------------------ 4887 class(uplan_t),intent(in) :: uplan 4888 integer,intent(in) :: ndat 4889 complex(dp),intent(in) :: ug(uplan%npw*uplan%nspinor*ndat) 4890 complex(dp),intent(out) :: ur(uplan%nfft*uplan%nspinor*ndat) 4891 integer,optional,intent(in) :: isign, iscale 4892 4893 !Local variables------------------------------- 4894 integer :: isign__, iscale__, nx, ny, nz, ldx, ldy, ldz, fftalg, fftalga, fftalgc, fftcache 4895 4896 ! ************************************************************************* 4897 4898 ABI_CHECK_ILEQ(ndat, uplan%batch_size, "ndat > batch_size!") 4899 ABI_CHECK_IEQ(dp, uplan%kind, "Incosistent kind!") 4900 4901 isign__ = +1; if (present(isign)) isign__ = isign 4902 iscale__ = 0; if (present(iscale)) iscale__ = iscale 4903 4904 fftalg = uplan%ngfft(7); fftcache = uplan%ngfft(8); fftalga = fftalg/100; fftalgc = mod(fftalg, 10) 4905 nx = uplan%ngfft(1); ny = uplan%ngfft(2); nz = uplan%ngfft(3) 4906 ldx = nx; ldy = ny; ldz = nz ! No augmentation, the caller does not support it. 4907 4908 if (uplan%gpu_option == ABI_GPU_DISABLED) then 4909 select case (fftalga) 4910 case (FFT_FFTW3) 4911 call fftw3_fftug(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, & 4912 uplan%istwfk, uplan%mgfft, uplan%kg_k, uplan%gbound, ug, ur, & 4913 isign=isign__, iscale=iscale__) 4914 case (FFT_DFTI) 4915 call dfti_fftug(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, & 4916 uplan%istwfk, uplan%mgfft, uplan%kg_k, uplan%gbound, ug, ur, & 4917 isign=isign__, iscale=iscale__) 4918 case default 4919 ABI_ERROR(sjoin("Wrong fftalga:", itoa(fftalga))) 4920 end select 4921 4922 else 4923 NOT_IMPLEMENTED_ERROR() 4924 end if 4925 4926 end subroutine uplan_execute_gr_dpc
m_fft/uplan_execute_gr_spc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
uplan_execute_gr_spc
FUNCTION
INPUTS
SOURCE
4827 subroutine uplan_execute_gr_spc(uplan, ndat, ug, ur, isign, iscale) 4828 4829 !Arguments ------------------------------------ 4830 class(uplan_t),intent(in) :: uplan 4831 integer,intent(in) :: ndat 4832 complex(sp),intent(in) :: ug(uplan%npw*uplan%nspinor*ndat) 4833 complex(sp),intent(out) :: ur(uplan%nfft*uplan%nspinor*ndat) 4834 integer,optional,intent(in) :: isign, iscale 4835 4836 !Local variables------------------------------- 4837 integer :: isign__, iscale__, nx, ny, nz, ldx, ldy, ldz, fftalg, fftalga, fftalgc, fftcache 4838 4839 ! ************************************************************************* 4840 4841 ABI_CHECK_ILEQ(ndat, uplan%batch_size, "ndat > batch_size!") 4842 ABI_CHECK_IEQ(sp, uplan%kind, "Incosistent kind!") 4843 4844 isign__ = +1; if (present(isign)) isign__ = isign 4845 iscale__ = 0; if (present(iscale)) iscale__ = iscale 4846 4847 fftalg = uplan%ngfft(7); fftcache = uplan%ngfft(8); fftalga = fftalg/100; fftalgc = mod(fftalg, 10) 4848 nx = uplan%ngfft(1); ny = uplan%ngfft(2); nz = uplan%ngfft(3) 4849 ldx = nx; ldy = ny; ldz = nz ! No augmentation, the caller does not support it. 4850 4851 if (uplan%gpu_option == ABI_GPU_DISABLED) then 4852 select case (fftalga) 4853 case (FFT_FFTW3) 4854 call fftw3_fftug(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, & 4855 uplan%istwfk, uplan%mgfft, uplan%kg_k, uplan%gbound, ug, ur, & 4856 isign=isign__, iscale=iscale__) 4857 case (FFT_DFTI) 4858 call dfti_fftug(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, & 4859 uplan%istwfk, uplan%mgfft, uplan%kg_k, uplan%gbound, ug, ur, & 4860 isign=isign__, iscale=iscale__) 4861 case default 4862 ABI_ERROR(sjoin("Wrong fftalga:", itoa(fftalga))) 4863 end select 4864 4865 else 4866 NOT_IMPLEMENTED_ERROR() 4867 end if 4868 4869 end subroutine uplan_execute_gr_spc
m_fft/uplan_execute_rg_dpc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
uplan_execute_rg_dpc
FUNCTION
INPUTS
SOURCE
4996 subroutine uplan_execute_rg_dpc(uplan, ndat, ur, ug, isign, iscale) 4997 4998 !Arguments ------------------------------------ 4999 class(uplan_t),intent(in) :: uplan 5000 integer,intent(in) :: ndat 5001 complex(dp),intent(inout) :: ur(uplan%nfft*uplan%nspinor*ndat) 5002 complex(dp),intent(out) :: ug(uplan%npw*uplan%nspinor*ndat) 5003 integer,optional,intent(in) :: isign, iscale 5004 5005 !Local variables------------------------------- 5006 integer :: isign__, iscale__, nx, ny, nz, ldx, ldy, ldz, fftalg, fftalga, fftalgc, fftcache 5007 ! ************************************************************************* 5008 5009 ABI_CHECK_ILEQ(ndat, uplan%batch_size, "ndat > batch_size!") 5010 ABI_CHECK_IEQ(dp, uplan%kind, "Incosistent kind!") 5011 5012 isign__ = -1; if (present(isign)) isign__ = isign 5013 iscale__ = 1; if (present(iscale)) iscale__ = iscale 5014 5015 fftalg = uplan%ngfft(7); fftcache = uplan%ngfft(8); fftalga = fftalg/100; fftalgc = mod(fftalg, 10) 5016 nx = uplan%ngfft(1); ny = uplan%ngfft(2); nz = uplan%ngfft(3) 5017 ldx = nx; ldy = ny; ldz = nz ! No augmentation, the caller does not support it. 5018 5019 if (uplan%gpu_option == ABI_GPU_DISABLED) then 5020 select case (fftalga) 5021 case (FFT_FFTW3) 5022 call fftw3_fftur(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, uplan%istwfk, uplan%mgfft, & 5023 uplan%kg_k, uplan%gbound, ur, ug, isign=isign__, iscale=iscale__) 5024 case (FFT_DFTI) 5025 call dfti_fftur(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, uplan%istwfk, uplan%mgfft, & 5026 uplan%kg_k, uplan%gbound, ur, ug, isign=isign__, iscale=iscale__) 5027 case default 5028 ABI_ERROR(sjoin("Wrong fftalga:", itoa(fftalga))) 5029 end select 5030 5031 else 5032 NOT_IMPLEMENTED_ERROR() 5033 end if 5034 5035 end subroutine uplan_execute_rg_dpc
m_fft/uplan_execute_rg_spc [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
uplan_execute_rg_spc
FUNCTION
INPUTS
SOURCE
4941 subroutine uplan_execute_rg_spc(uplan, ndat, ur, ug, isign, iscale) 4942 4943 !Arguments ------------------------------------ 4944 class(uplan_t),intent(in) :: uplan 4945 integer,intent(in) :: ndat 4946 complex(sp),intent(inout) :: ur(uplan%nfft*uplan%nspinor*ndat) 4947 complex(sp),intent(out) :: ug(uplan%npw*uplan%nspinor*ndat) 4948 integer,optional,intent(in) :: isign, iscale 4949 4950 !Local variables------------------------------- 4951 integer :: isign__, iscale__, nx, ny, nz, ldx, ldy, ldz, fftalg, fftalga, fftalgc, fftcache 4952 4953 ! ************************************************************************* 4954 4955 ABI_CHECK_ILEQ(ndat, uplan%batch_size, "ndat > batch_size!") 4956 ABI_CHECK_IEQ(sp, uplan%kind, "Incosistent kind!") 4957 4958 isign__ = -1; if (present(isign)) isign__ = isign 4959 iscale__ = 1; if (present(iscale)) iscale__ = iscale 4960 4961 fftalg = uplan%ngfft(7); fftcache = uplan%ngfft(8); fftalga = fftalg/100; fftalgc = mod(fftalg, 10) 4962 nx = uplan%ngfft(1); ny = uplan%ngfft(2); nz = uplan%ngfft(3) 4963 ldx = nx; ldy = ny; ldz = nz ! No augmentation, the caller does not support it. 4964 4965 if (uplan%gpu_option == ABI_GPU_DISABLED) then 4966 select case (fftalga) 4967 case (FFT_FFTW3) 4968 call fftw3_fftur(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, uplan%istwfk, uplan%mgfft, & 4969 uplan%kg_k, uplan%gbound, ur, ug, isign=isign__, iscale=iscale__) 4970 case (FFT_DFTI) 4971 call dfti_fftur(fftalg, fftcache, uplan%npw, nx, ny, nz, ldx, ldy, ldz, uplan%nspinor*ndat, uplan%istwfk, uplan%mgfft, & 4972 uplan%kg_k, uplan%gbound, ur, ug, isign=isign__, iscale=iscale__) 4973 case default 4974 ABI_ERROR(sjoin("Wrong fftalga:", itoa(fftalga))) 4975 end select 4976 4977 else 4978 NOT_IMPLEMENTED_ERROR() 4979 end if 4980 4981 end subroutine uplan_execute_rg_spc
m_fft/uplan_free [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
uplan_free
FUNCTION
INPUTS
SOURCE
4801 subroutine uplan_free(uplan) 4802 4803 !Arguments ------------------------------------ 4804 class(uplan_t),intent(inout) :: uplan 4805 ! ************************************************************************* 4806 4807 ABI_SFREE(uplan%gbound) 4808 if (uplan%gpu_option /= ABI_GPU_DISABLED) then 4809 ! Free memory on the GPU 4810 end if 4811 4812 end subroutine uplan_free
m_fft/uplan_init [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
uplan_init
FUNCTION
INPUTS
SOURCE
4755 subroutine uplan_init(uplan, npw, nspinor, batch_size, ngfft, istwfk, kg_k, kind, gpu_option) 4756 4757 !Arguments ------------------------------------ 4758 !scalars 4759 class(uplan_t),intent(out) :: uplan 4760 integer,intent(in) :: npw, nspinor, batch_size, istwfk, kind, gpu_option 4761 !arrays 4762 integer,intent(in) :: ngfft(18) 4763 integer,target,intent(in) :: kg_k(3,npw) 4764 4765 ! ************************************************************************* 4766 4767 uplan%npw = npw 4768 uplan%nspinor = nspinor 4769 uplan%istwfk = istwfk 4770 uplan%batch_size = batch_size 4771 uplan%kind = kind 4772 uplan%gpu_option = gpu_option 4773 uplan%ngfft = ngfft 4774 uplan%mgfft = maxval(ngfft(1:3)) 4775 uplan%nfft = product(ngfft(1:3)) 4776 uplan%kg_k => kg_k 4777 4778 ABI_MALLOC(uplan%gbound, (2 * uplan%mgfft + 8, 2)) 4779 call sphereboundary(uplan%gbound, uplan%istwfk, uplan%kg_k, uplan%mgfft, uplan%npw) 4780 4781 if (uplan%gpu_option /= ABI_GPU_DISABLED) then 4782 ! Allocate memory on the device and transfer data. 4783 NOT_IMPLEMENTED_ERROR() 4784 end if 4785 4786 end subroutine uplan_init
m_fft/uplan_t [ Types ]
NAME
uplan_t
FUNCTION
SOURCE
207 type, public :: uplan_t 208 209 integer :: npw = -1 210 integer :: nspinor = -1 211 integer :: batch_size = -1 ! MAXIMUM number of FFTs associated to the plan. 212 integer :: istwfk = -1 213 integer :: kind = -1 214 integer :: gpu_option = ABI_GPU_DISABLED ! /= 0 if FFTs should be offloaded to the GPU. 215 integer :: nfft = -1 ! Total number of points in the FFT box. 216 integer :: mgfft = -1 217 integer :: ngfft(18) 218 integer, contiguous, pointer :: kg_k(:,:) 219 integer, allocatable :: gbound(:,:) 220 221 contains 222 procedure :: init => uplan_init ! Build object 223 procedure :: free => uplan_free ! Free dynamic memory 224 225 procedure :: execute_gr_spc => uplan_execute_gr_spc 226 procedure :: execute_gr_dpc => uplan_execute_gr_dpc 227 procedure :: execute_rg_spc => uplan_execute_rg_spc 228 procedure :: execute_rg_dpc => uplan_execute_rg_dpc 229 230 ! Main entry point for performing FFTs on the full box 231 ! complex-to-complex version, operating on complex arrays 232 generic :: execute_gr => execute_gr_spc, & 233 execute_gr_dpc 234 235 generic :: execute_rg => execute_rg_spc, & 236 execute_rg_dpc 237 end type uplan_t
m_fft/zerosym [ Functions ]
[ Top ] [ m_fft ] [ Functions ]
NAME
zerosym
FUNCTION
Symmetrize an array on the FFT grid by vanishing some term on the boundaries.
INPUTS
cplex= if 1, input array is REAL, if 2, input array is COMPLEX mpi_enreg=information about MPI parallelization n1,n2,n3=FFT dimensions nfft=n1*n2*n3 ig1,ig2,ig3=optional arguments= indexes of unbalanced g-vectors to cancel if not present, ig1=1+n1/2, ig2=1+n2/2, ig3=1+n3/2 for even n1,n2,n3 if igj=-1, nothing is done in direction j
OUTPUT
(see side effects)
SIDE EFFECTS
array(cplex,n1*n2*n3)=complex array to be symetrized
SOURCE
4120 subroutine zerosym(array,cplex,n1,n2,n3, & 4121 ig1,ig2,ig3,comm_fft,distribfft) ! Optional arguments 4122 4123 4124 !Arguments ------------------------------------ 4125 !scalars 4126 integer,intent(in) :: cplex,n1,n2,n3 4127 integer,optional,intent(in) :: ig1,ig2,ig3,comm_fft 4128 type(distribfft_type),intent(in),target,optional :: distribfft 4129 !arrays 4130 real(dp),intent(inout) :: array(cplex,n1*n2*n3) 4131 4132 !Local variables------------------------------- 4133 !scalars 4134 integer :: i1,i2,i3,ifft,ifft_proc,index,j,j1,j2,j3,me_fft,nd2 4135 integer :: nproc_fft,n1sel,nn12,n2sel,n3sel,r2 4136 !arrays 4137 integer, ABI_CONTIGUOUS pointer :: fftn2_distrib(:),ffti2_local(:) 4138 4139 ! ********************************************************************** 4140 4141 DBG_ENTER("COLL") 4142 4143 me_fft=0;nproc_fft=1 4144 if (present(comm_fft)) then 4145 me_fft=xmpi_comm_rank(comm_fft) 4146 nproc_fft=xmpi_comm_size(comm_fft) 4147 end if 4148 nd2=(n2-1)/nproc_fft+1 4149 nn12=n1*n2 4150 4151 !Get the distrib associated with this fft_grid 4152 if (present(distribfft)) then 4153 if (n2== distribfft%n2_coarse) then 4154 fftn2_distrib => distribfft%tab_fftdp2_distrib 4155 ffti2_local => distribfft%tab_fftdp2_local 4156 else if(n2 == distribfft%n2_fine) then 4157 fftn2_distrib => distribfft%tab_fftdp2dg_distrib 4158 ffti2_local => distribfft%tab_fftdp2dg_local 4159 else 4160 ABI_BUG("Unable to find an allocated distrib for this fft grid") 4161 end if 4162 else 4163 ABI_MALLOC(fftn2_distrib,(n2)) 4164 ABI_MALLOC(ffti2_local,(n2)) 4165 fftn2_distrib=0;ffti2_local=(/(i2,i2=1,n2)/) 4166 end if 4167 4168 if (present(ig1)) then 4169 n1sel=ig1 4170 else if (mod(n1,2)==0) then 4171 n1sel=1+n1/2 4172 else 4173 n1sel=-1 4174 end if 4175 if (present(ig2)) then 4176 n2sel=ig2 4177 else if (mod(n2,2)==0) then 4178 n2sel=1+n2/2 4179 else 4180 n2sel=-1 4181 end if 4182 if (present(ig3)) then 4183 n3sel=ig3 4184 else if (mod(n3,2)==0) then 4185 n3sel=1+n3/2 4186 else 4187 n3sel=-1 4188 end if 4189 4190 if (n1sel>0) then 4191 index=n1sel-nn12-n1 4192 do i3=1,n3 4193 index=index+nn12;ifft=index 4194 do i2=1,n2 4195 ifft=ifft+n1 4196 if (nproc_fft>1) then 4197 ! MPIWF: consider ifft only if it is treated by the current proc and compute its adress 4198 j=ifft-1;j1=modulo(j,n1);j2=modulo(j/n1,n2);j3=j/(n1*n2) !;r2=modulo(j2,nd2) 4199 if(fftn2_distrib(j2+1)==me_fft) then ! MPIWF this ifft is to be treated by me_fft 4200 r2= ffti2_local(j2+1) - 1 4201 ifft_proc=n1*(nd2*j3+r2)+j1+1 !this is ifft in the current proc 4202 array(:,ifft_proc)=zero 4203 end if 4204 else 4205 array(:,ifft)=zero 4206 end if 4207 end do 4208 end do 4209 end if 4210 4211 if (n2sel>0) then 4212 index=n1*n2sel-nn12-n1 4213 do i3=1,n3 4214 index=index+nn12;ifft=index 4215 do i1=1,n1 4216 ifft=ifft+1 4217 if (nproc_fft>1) then 4218 ! MPIWF: consider ifft only if it is treated by the current proc and compute its adress 4219 j=ifft-1;j1=modulo(j,n1);j2=modulo(j/n1,n2);j3=j/(n1*n2); 4220 if(fftn2_distrib(j2+1)==me_fft) then ! MPIWF this ifft is to be treated by me_fft 4221 r2= ffti2_local(j2+1) - 1 4222 ifft_proc=n1*(nd2*j3+r2)+j1+1 !this is ifft in the current proc 4223 array(:,ifft_proc)=zero 4224 end if 4225 else 4226 array(:,ifft)=zero 4227 end if 4228 end do 4229 end do 4230 end if 4231 4232 if (n3sel>0) then 4233 index=nn12*n3sel-nn12-n1 4234 do i2=1,n2 4235 index=index+n1;ifft=index 4236 do i1=1,n1 4237 ifft=ifft+1 4238 if (nproc_fft>1) then 4239 ! MPIWF: consider ifft only if it is treated by the current proc and compute its adress 4240 j=ifft-1;j1=modulo(j,n1);j2=modulo(j/n1,n2);j3=j/(n1*n2) 4241 if(fftn2_distrib(j2+1)==me_fft) then ! MPIWF this ifft is to be treated by me_fft 4242 r2= ffti2_local(j2+1) - 1 4243 ifft_proc=n1*(nd2*j3+r2)+j1+1 !this is ifft in the current proc 4244 array(:,ifft_proc)=zero 4245 end if 4246 else 4247 array(:,ifft)=zero 4248 end if 4249 end do 4250 end do 4251 end if 4252 4253 if (.not.present(distribfft)) then 4254 ABI_FREE(fftn2_distrib) 4255 ABI_FREE(ffti2_local) 4256 end if 4257 4258 DBG_EXIT("COLL") 4259 4260 end subroutine zerosym