TABLE OF CONTENTS


ABINIT/m_fftw3 [ Modules ]

[ Top ] [ Modules ]

NAME

 m_fftw3

FUNCTION

  This module provides wrappers for the FFTW3 routines: in-place and out-of-place version.

COPYRIGHT

 Copyright (C) 2009-2024 ABINIT group (MG, FD)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .

NOTES

  1) MPI parallelism is in testing stage
  2) For better performance the FFT divisions should contain small factors  [2, 3, 5, 7, 11]

SOURCE

 20 #if defined HAVE_CONFIG_H
 21 #include "config.h"
 22 #endif
 23 
 24 #include "abi_common.h"
 25 
 26 ! It seems that MKL wrappers do not like the advanced interfaces for
 27 ! r2c and c2r transforms although they work fine if the true FFTW3 library is used.
 28 !#define DEV_RC_BUG
 29 #undef DEV_RC_BUG
 30 
 31 #define FFTLIB "FFTW3"
 32 #define FFT_PREF(name) CONCAT(fftw3_,name)
 33 #define SPAWN_THREADS_HERE(ndat, nthreads) fftw3_spawn_threads_here(ndat, nthreads)
 34 #define FFT_DOUBLE 1
 35 #define FFT_SINGLE 2
 36 #define FFT_MIXPREC 3
 37 
 38 MODULE m_fftw3
 39 
 40  use defs_basis
 41  use m_abicore
 42  use m_errors
 43  use m_xomp
 44  use m_xmpi
 45  use m_hide_blas
 46  use m_cgtools
 47  use m_cplxtools
 48  use m_distribfft
 49  use m_fftcore
 50  use, intrinsic :: iso_c_binding
 51 
 52  use m_time,           only : timab
 53  use m_numeric_tools,  only : imax_loc
 54  use defs_abitypes,    only : MPI_type
 55  use m_mpinfo,         only : ptabs_fourwf
 56  use m_fstrings,       only : strcat, itoa, sjoin
 57  use m_fft_mesh,       only : zpad_t, zpad_init, zpad_free
 58 
 59  implicit none
 60 
 61 #ifdef HAVE_FFTW3_MPI
 62  include 'fftw3-mpi.f03'
 63 #endif
 64 
 65 !This should be done but MKL fftw hasn't always this include file
 66 !#ifdef HAVE_FFT_FFTW3
 67 ! include 'fftw3.f03'
 68 !#endif
 69 
 70  private
 71 
 72 ! Entry points for client code
 73  public :: fftw3_seqfourdp      ! 3D FFT of lengths nx, ny, nz. Mainly used for densities or potentials.
 74  public :: fftw3_seqfourwf      ! FFT transform of wavefunctions (high-level interface).
 75  public :: fftw3_fftrisc
 76  public :: fftw3_fftrisc_mixprec ! Mixed precision version of fftrisc: input/output in dp, computation done in sp.
 77  public :: fftw3_fftug          ! G-->R. 3D zero-padded FFT of lengths nx, ny, nz. Mainly used for wavefunctions
 78  public :: fftw3_fftur          ! R-->G, 3D zero-padded FFT of lengths nx, ny, nz. Mainly used for wavefunctions
 79  public :: fftw3_use_lib_threads
 80  public :: fftw3_spawn_threads_here
 81 
 82  public :: fftw3_mpifourdp
 83 
 84 ! Low-level routines.
 85  public :: fftw3_cleanup        ! Reset FFTW to the pristine state it was in when you started your program,
 86  public :: fftw3_init_threads   ! one-time initialization required to use FFTW3 threads.
 87  public :: fftw3_set_nthreads   ! Set the number of threads you want FFTW3 to use when HAVE_FFT_FFTW3_THREADS is defined.
 88  public :: fftw3_r2c_op         ! Real to complex transform (out-of-place version).
 89  public :: fftw3_c2r_op         ! Complex to real transform (out-of-place version).
 90  public :: fftw3_c2c_op         ! complex to complex transform (out-of-place version).
 91  public :: fftw3_c2c_ip         ! complex to complex transform (in-place version).
 92  public :: fftw3_many_dft_op    ! Driver routine for many out-of-place 3D complex-to-complex FFTs.
 93  public :: fftw3_many_dft_ip    ! Driver routine for many in-place 3D complex-to-complex FFTs.
 94  public :: fftw3_fftpad         ! Driver routines for zero-padded FFT of wavefunctions.
 95  public :: fftw3_fftpad_dp      ! Driver routines for zero-padded FFT of wavefunctions.
 96  public :: fftw3_fftug_dp       ! Driver routines for zero-padded FFT of wavefunctions.
 97  public :: fftw3_poisson        ! Solve the poisson equation in G-space starting from n(r).
 98 
 99  ! MPI version
100  public :: fftw3_mpiback_wf
101  public :: fftw3_mpiback_manywf
102  public :: fftw3_mpiforw_wf
103  public :: fftw3_mpiforw_manywf
104  public :: fftw3_mpiback
105  public :: fftw3_mpiforw
106  public :: fftw3_applypot
107  public :: fftw3_applypot_many
108  public :: fftw3_accrho
109 
110 #ifdef HAVE_FFTW3_MPI
111 ! flags copied from fftw3.f
112  integer,public,parameter :: ABI_FFTW_FORWARD = FFTW_FORWARD
113  integer,public,parameter :: ABI_FFTW_BACKWARD = FFTW_BACKWARD
114  integer,public,parameter :: ABI_FFTW_ESTIMATE = FFTW_ESTIMATE
115  integer,public,parameter :: ABI_FFTW_MEASURE = FFTW_MEASURE
116  ! end flags copied from fftw3.f
117  integer,public,parameter :: ABI_FFTW_MPI_TRANSPOSED_IN = FFTW_MPI_TRANSPOSED_IN
118  integer,public,parameter :: ABI_FFTW_MPI_TRANSPOSED_OUT = FFTW_MPI_TRANSPOSED_OUT
119  ! end flags copies from fftw3-mpi.f03
120 #else
121  integer,public,parameter :: ABI_FFTW_FORWARD = -1
122  integer,public,parameter :: ABI_FFTW_BACKWARD = +1
123  integer,public,parameter :: ABI_FFTW_ESTIMATE = 64
124  integer,public,parameter :: ABI_FFTW_MEASURE = 0
125 ! end flags copied from fftw3.f
126  integer,public,parameter :: ABI_FFTW_MPI_TRANSPOSED_IN = 536870912
127  integer,public,parameter :: ABI_FFTW_MPI_TRANSPOSED_OUT = 1073741824
128 ! end flags copies from fftw3-mpi.f03
129 #endif
130 
131 ! ==========================================================================================
132 ! ==== Variables introduced for the FFTW3 interface in abinit. Not belonging to fftw3.f ====
133 ! ==========================================================================================
134 
135  integer,public,parameter :: NULL_PLAN = 0
136  ! MKL wrappers might return NULL_PLAN if a particular FFTW3 feature is not available
137 
138  integer,public,parameter :: KIND_FFTW_PLAN = 8
139  ! It should be at least integer*@SIZEOF_INT_P@
140  ! MKL wrappers requires it to be integer*8, so do _not_ use C_INTPTR_T.
141 
142 #ifdef HAVE_FFTW3_THREADS
143  integer,private,save :: THREADS_INITED = 0
144  ! 1 if treads have been initialized. 0 otherwise.
145 #endif
146 
147  logical,private,save :: USE_LIB_THREADS = .FALSE.

m_fftw3/cplan_many_dft [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

FUNCTION

INPUTS

SIDE EFFECTS

SOURCE

2611 !! FIXME  technically it should be intent(inout) since FFTW3 can destroy the input for particular flags.
2612 
2613 function cplan_many_dft(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,sign,flags,nthreads) result(plan)
2614 
2615 !Arguments ------------------------------------
2616 !scalars
2617  integer,intent(in) :: rank,howmany,istride,ostride, sign,flags,idist,odist,nthreads
2618  integer,intent(in) :: n(rank),inembed(rank),onembed(rank)
2619  integer(KIND_FFTW_PLAN) :: plan
2620 !arrays
2621  complex(spc) :: fin(*),fout(*)
2622 
2623 !Local variables-------------------------------
2624  character(len=500) :: msg,frmt
2625 
2626 ! *************************************************************************
2627 
2628 !$OMP CRITICAL (OMPC_cplan_many_dft)
2629  call fftw3_set_nthreads(nthreads)
2630 
2631  call sfftw_plan_many_dft(plan, rank, n, howmany, &
2632 &  fin, inembed, istride, idist, fout, onembed, ostride, odist, sign, flags)
2633 !$OMP END CRITICAL (OMPC_cplan_many_dft)
2634 
2635  if (plan==NULL_PLAN) then ! handle the error
2636    call wrtout(std_out, "sfftw_plan_many_dft returned NULL_PLAN (complex version)")
2637    write(frmt,*)"(a,",rank,"(1x,i0),3(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))"
2638    write(msg,frmt)&
2639 &    " n = ",n," howmany = ",howmany," sign = ",sign," flags = ",flags,ch10,&
2640 &    " inembed = ",inembed," istride = ",istride," idist =",idist,ch10,     &
2641 &    " onembed = ",onembed," ostride = ",ostride," odist =",idist,ch10
2642    call wrtout(std_out, msg)
2643    ABI_ERROR("Check FFTW library and/or abinit code")
2644  end if
2645 
2646 end function cplan_many_dft

m_fftw3/dplan_many_dft_1D [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

FUNCTION

INPUTS

SIDE EFFECTS

SOURCE

2514 function dplan_many_dft_1D(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,sign,flags,nthreads) result(plan)
2515 
2516 !Arguments ------------------------------------
2517 !scalars
2518  integer,intent(in) :: rank,howmany,istride,ostride, sign,flags,idist,odist,nthreads
2519  integer,intent(in) :: n(rank),inembed(rank),onembed(rank)
2520  integer(KIND_FFTW_PLAN) :: plan
2521 !arrays
2522  real(dp) :: fin(*),fout(*)
2523 
2524 !Local variables-------------------------------
2525  character(len=500) :: msg,frmt
2526 
2527 ! *************************************************************************
2528 
2529 !$OMP CRITICAL (OMPC_dfftw_plan_many_dft_1D)
2530  call fftw3_set_nthreads(nthreads)
2531 
2532  call dfftw_plan_many_dft(plan, rank, n, howmany, &
2533 &  fin, inembed, istride, idist, fout, onembed, ostride, odist, sign, flags)
2534 !$OMP END CRITICAL (OMPC_dfftw_plan_many_dft_1D)
2535 
2536  if (plan==NULL_PLAN) then
2537    call wrtout(std_out, "dfftw_plan_many_dft returned NULL_PLAN!")
2538    write(frmt,*)"(a,",rank,"(1x,i0),3(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))"
2539    write(msg,frmt)&
2540 &    " n= ",n," howmany= ",howmany," sign= ",sign," flags= ",flags,ch10,&
2541 &    " inembed= ",inembed," istride= ",istride," idist=",idist,ch10,    &
2542 &    " onembed= ",onembed," ostride= ",ostride," odist=",idist,ch10
2543    call wrtout(std_out, msg)
2544    ABI_ERROR("Check FFTW library and/or abinit code")
2545  end if
2546 
2547 end function dplan_many_dft_1D

m_fftw3/dplan_many_dft_2D [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

FUNCTION

INPUTS

SIDE EFFECTS

SOURCE

2563 function dplan_many_dft_2D(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,sign,flags,nthreads) result(plan)
2564 
2565 !Arguments ------------------------------------
2566 !scalars
2567  integer,intent(in) :: rank,howmany,istride,ostride, sign,flags,idist,odist,nthreads
2568  integer,intent(in) :: n(rank),inembed(rank),onembed(rank)
2569  integer(KIND_FFTW_PLAN) :: plan
2570 !arrays
2571  real(dp) :: fin(2,*),fout(2,*)
2572 
2573 !Local variables-------------------------------
2574  character(len=500) :: msg,frmt
2575 
2576 ! *************************************************************************
2577 
2578 !$OMP CRITICAL (OMPC_dfftw_plan_many_dft_2D)
2579  call fftw3_set_nthreads(nthreads)
2580 
2581  call dfftw_plan_many_dft(plan, rank, n, howmany, &
2582 &  fin, inembed, istride, idist, fout, onembed, ostride, odist, sign, flags)
2583 !$OMP END CRITICAL (OMPC_dfftw_plan_many_dft_2D)
2584 
2585  if (plan==NULL_PLAN) then
2586    call wrtout(std_out, "dfftw_plan_many_dft returned NULL_PLAN!")
2587    write(frmt,*)"(a,",rank,"(1x,i0),3(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))"
2588    write(msg,frmt)&
2589 &    " n= ",n," howmany= ",howmany," sign= ",sign," flags= ",flags,ch10,&
2590 &    " inembed= ",inembed," istride= ",istride," idist=",idist,ch10,    &
2591 &    " onembed= ",onembed," ostride= ",ostride," odist=",idist,ch10
2592    call wrtout(std_out, msg)
2593    ABI_ERROR("Check FFTW library and/or abinit code")
2594  end if
2595 
2596 end function dplan_many_dft_2D

m_fftw3/dplan_many_dft_c2r [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

FUNCTION

INPUTS

SIDE EFFECTS

SOURCE

2763 function dplan_many_dft_c2r(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,flags, nthreads) result(plan)
2764 
2765 !Arguments ------------------------------------
2766 !scalars
2767  integer,intent(in) :: rank,howmany,istride,ostride,flags,idist,odist,nthreads
2768  integer,intent(in) :: n(rank),inembed(rank),onembed(rank)
2769  integer(KIND_FFTW_PLAN) :: plan
2770 !arrays
2771  real(dp) :: fin(*),fout(*)
2772 
2773 !Local variables-------------------------------
2774  character(len=500) :: msg,frmt
2775 
2776 ! *************************************************************************
2777 
2778 !$OMP CRITICAL (OMPC_dplan_many_dft_c2r)
2779  call fftw3_set_nthreads(nthreads)
2780 
2781  call dfftw_plan_many_dft_c2r(plan, rank, n, howmany, &
2782 &  fin, inembed, istride, idist, fout, onembed, ostride, odist, flags)
2783 !$OMP END CRITICAL (OMPC_dplan_many_dft_c2r)
2784 
2785  if (plan==NULL_PLAN) then ! handle the error.
2786    call wrtout(std_out, "dfftw_plan_many_dft_c2r returned NULL_PLAN")
2787    write(frmt,*)"(a,",rank,"(1x,i0),2(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))"
2788    write(msg,frmt)&
2789 &    " n = ",n," howmany = ",howmany," flags = ",flags,ch10,&
2790 &    " inembed = ",inembed," istride = ",istride," idist = ",idist,ch10,&
2791 &    " onembed = ",onembed," ostride = ",ostride," odist = ",idist,ch10
2792    call wrtout(std_out, msg)
2793    ABI_ERROR("Check FFTW library and/or abinit code")
2794  end if
2795 
2796 end function dplan_many_dft_c2r

m_fftw3/dplan_many_dft_r2c [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

FUNCTION

INPUTS

SIDE EFFECTS

SOURCE

2711 !! FIXME  technically it should be intent(inout) since FFTW3 can destroy the input
2712 !! for particular flags.
2713 
2714 function dplan_many_dft_r2c(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,flags,nthreads) result(plan)
2715 
2716 !Arguments ------------------------------------
2717 !scalars
2718  integer,intent(in) :: rank,howmany,istride,ostride,flags,idist,odist,nthreads
2719  integer,intent(in) :: n(rank),inembed(rank),onembed(rank)
2720  integer(KIND_FFTW_PLAN) :: plan
2721 !arrays
2722  real(dp) :: fin(*),fout(*)
2723 
2724 !Local variables-------------------------------
2725  character(len=500) :: msg,frmt
2726 
2727 ! *************************************************************************
2728 
2729 !$OMP CRITICAL (OMPC_dplan_many_dft_r2c)
2730  call fftw3_set_nthreads(nthreads)
2731 
2732  call dfftw_plan_many_dft_r2c(plan, rank, n, howmany, &
2733 &  fin, inembed, istride, idist, fout, onembed, ostride, odist, flags)
2734 !$OMP END CRITICAL (OMPC_dplan_many_dft_r2c)
2735 
2736  if (plan==NULL_PLAN) then ! handle the error.
2737    call wrtout(std_out, "dfftw_plan_many_dft_r2c returned NULL_PLAN")
2738    write(frmt,*)"(a,",rank,"(1x,i0),2(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))"
2739    write(msg,frmt)&
2740 &    " n = ",n," howmany = ",howmany," flags = ",flags,ch10,&
2741 &    " inembed = ",inembed," istride = ",istride," idist = ",idist,ch10,&
2742 &    " onembed = ",onembed," ostride = ",ostride," odist = ",idist,ch10
2743    call wrtout(std_out, msg)
2744    ABI_ERROR("Check FFTW library and/or abinit code")
2745  end if
2746 
2747 end function dplan_many_dft_r2c

m_fftw3/fftw3_accrho [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_accrho

FUNCTION

 Accumulates the real space density rho from the ndat wavefunctions zf
 by transforming zf into real space and adding all the amplitudes squared

 INPUTS:
   ZF: input array (note the switch of i2 and i3)
         real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
         imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
   i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
   then, if m1 > max1+1, one has min1=max1-m1+1 and
   i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
   i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF
   md2proc=((md2-1)/nproc_fft)+1 ! maximal number of small box 2nd dim slices for one proc
   weight(ndat)= weight for the density accumulation

 OUTPUTS:
    RHOoutput(i1,i2,i3) = RHOinput(i1,i2,i3) + sum on idat of (FFT(ZF))**2 *weight
        i1=1,n1 , i2=1,n2 , i3=1,n3
   comm_fft: MPI communicator
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG
    nd1,nd2,nd3: Dimension of RHO
   nd3proc=((nd3-1)/nproc_fft)+1 ! maximal number of big box 3rd dim slices for one proc

 NOTES:
   PERFORMANCE CONSIDERATIONS:
   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

5793 subroutine fftw3_accrho(cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,&
5794 &  max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft,zf,rho,weight_r,weight_i)
5795 
5796 !Arguments ------------------------------------
5797  integer,intent(in) :: cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc
5798  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft
5799  real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat)
5800  real(dp),intent(in) :: weight_r(ndat) , weight_i(ndat)
5801  real(dp),intent(inout) :: rho(nd1,nd2,nd3)
5802 
5803 !Local variables-------------------------------
5804 !scalars
5805 #ifdef HAVE_FFTW3
5806  integer,parameter :: unused0=0
5807  integer :: j,i1,idat,ierr,j3glob
5808  integer :: ioption,j2,j3,j2st,jp2st,lzt,m1zt,ma,mb,n1dfft,nnd3
5809  integer :: m2eff,ncache,n1eff,jeff,includelast,lot1,lot2,lot3,nthreads
5810  integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest
5811  integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest
5812  integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest
5813  character(len=500) :: msg
5814 !arrays
5815  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
5816  real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions
5817  real(dp) :: tsec(2)
5818 
5819 ! *************************************************************************
5820 
5821  !ioption=0 ! This was in the old version.
5822  ioption=1 ! This one is needed to be compatible with paral_kgb
5823 
5824  !nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
5825 
5826 ! find cache size that gives optimal performance on machine
5827  ncache=2*max(n1,n2,n3,1024)
5828  if (ncache/(2*max(n1,n2,n3)) < 1) then
5829     write(msg,"(5a)") &
5830 &     'ncache has to be enlarged to be able to hold at',ch10,&
5831 &     'least one 1-d FFT of each size even though this will',ch10,&
5832 &     'reduce the performance for shorter transform lengths'
5833     ABI_ERROR(msg)
5834  end if
5835 
5836 !Effective m1 and m2 (complex-to-complex or real-to-complex)
5837  n1eff=n1; m2eff=m2 ; m1zt=n1
5838  if (cplexwf==1) then
5839    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
5840  end if
5841 
5842  lzt=m2eff
5843  if (mod(m2eff,2) == 0) lzt=lzt+1
5844  if (mod(m2eff,4) == 0) lzt=lzt+1
5845 
5846  ! maximal number of big box 3rd dim slices for all procs
5847  nnd3=nd3proc*nproc_fft
5848 
5849  ABI_MALLOC(zw,(2,ncache/2))
5850  ABI_MALLOC(zt,(2,lzt,m1zt))
5851  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3))
5852  if (nproc_fft > 1)  then
5853    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3))
5854  end if
5855 
5856  ! Create plans.
5857  ! The prototype for sfftw_plan_many_dft is:
5858  ! sfftw_plan_many_dft(rank, n, howmany,
5859  !   fin,  iembed, istride, idist,
5860  !   fout, oembed, ostride, odist, isign, my_flags)
5861 
5862  lot3=ncache/(2*n3)
5863  lot1=ncache/(2*n1)
5864  lot2=ncache/(2*n2)
5865 
5866  nthreads = xomp_get_num_threads(open_parallel=.TRUE.)
5867  !nthreads = 1
5868 
5869  bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
5870 &    zw, [ncache/2], lot3, 1,                          &
5871 &    zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5872 
5873  if (mod(m1, lot3) /= 0) then
5874    bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1, lot3), &
5875 &      zw, [ncache/2], lot3, 1,                                    &
5876 &      zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5877  end if
5878 
5879  bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, &
5880 &    zw, [ncache/2],  lot1, 1,                         &
5881 &    zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5882 
5883  if (mod(m2eff, lot1) /= 0) then
5884    bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2eff, lot1), &
5885 &      zw, [ncache/2],  lot1, 1,                                      &
5886 &      zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5887  end if
5888 
5889  ! FIXME THis won't work if ixplexwf == 1
5890  bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, &
5891 &    zw, [ncache/2], lot2, 1,                          &
5892 &    zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5893 
5894  if (mod(n1eff, lot2) /= 0) then
5895    bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), &
5896 &      zw, [ncache/2], lot2, 1,                                      &
5897 &      zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5898  end if
5899 
5900  do idat=1,ndat
5901    ! transform along z axis
5902    ! input: I1,I3,J2,(Jp2)
5903    !lot=ncache/(4*n3)
5904 
5905    ! Loop over the y planes treated by this node and trasform n1ddft G_z lines.
5906    do j2=1,md2proc
5907      if (me_fft*md2proc+j2 <= m2eff) then ! MG REMOVED TO BE COSISTENT WITH BACK_WF
5908        do i1=1,m1,lot3
5909          ma=i1
5910          mb=min(i1+(lot3-1),m1)
5911          n1dfft=mb-ma+1
5912 
5913          ! zero-pad n1dfft G_z lines
5914          !  input: G1,G3,G2,(Gp2)
5915          ! output: G1,R3,G2,(Gp2)
5916          call fill_cent(md1,md3,lot3,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw)
5917 
5918          ! Transform along z.
5919          if (n1dfft == lot3) then
5920            call dfftw_execute_dft(bw_plan3_lot, zw, zw)
5921          else
5922            call dfftw_execute_dft(bw_plan3_rest, zw, zw)
5923          end if
5924 
5925          ! Local rotation.
5926          ! input:  G1,R3,G2,(Gp2)
5927          ! output: G1,G2,R3,(Gp2)
5928          call scramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zw,zmpi2)
5929        end do
5930      end if
5931    end do
5932 
5933    ! Interprocessor data transposition
5934    ! input:  G1,G2,R3,Rp3,(Gp2)
5935    ! output: G1,G2,R3,Gp2,(Rp3)
5936    if (nproc_fft > 1) then
5937      call timab(543,1,tsec)
5938      call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc, &
5939 &                       zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr)
5940      call timab(543,2,tsec)
5941    end if
5942 
5943    ! Loop over the z treated by this node.
5944    do j3=1,nd3proc
5945      j3glob = j3 + me_fft*nd3proc
5946 
5947      if (me_fft*nd3proc+j3 <= n3) then
5948        Jp2st=1; J2st=1
5949 
5950        ! Loop over G_y in the small box.
5951        do j=1,m2eff,lot1
5952          ma=j
5953          mb=min(j+(lot1-1),m2eff)
5954          n1dfft=mb-ma+1
5955 
5956          ! Zero-pad input.
5957          ! input:  G1,G2,R3,JG2,(Rp3)
5958          ! output: G2,G1,R3,JG2,(Rp3)
5959          if (nproc_fft == 1) then
5960           call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,&
5961 &           md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw,unused0, unused0,unused0)
5962          else
5963           call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,&
5964 &           md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw, unused0,unused0,unused0)
5965          end if
5966 
5967          ! Transform along x
5968          ! input:  G2,G1,R3,(Rp3)
5969          ! output: G2,R1,R3,(Rp3)
5970          if (n1dfft == lot1) then
5971            call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1))
5972          else
5973            call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1))
5974          end if
5975 
5976        end do
5977 
5978        ! Transform along y axis (take into account c2c or c2r case).
5979        ! Must loop over the full box.
5980        !lot=ncache/(4*n2)
5981        ! FIXME THis won't work
5982        if (cplexwf==1) then
5983          if (mod(lot2,2) /=0) lot2=lot2-1 ! needed to introduce jeff
5984        end if
5985 
5986        do j=1,n1eff,lot2
5987          ma=j
5988          mb=min(j+(lot2-1),n1eff)
5989          n1dfft=mb-ma+1
5990          jeff=j
5991          includelast=1
5992 
5993          if (cplexwf==1) then
5994            jeff=2*j-1
5995            includelast=1
5996            if (mb==n1eff .and. n1eff*2/=n1) includelast=0
5997          end if
5998 
5999          ! Zero-pad the input.
6000          ! input:  G2,R1,R3,(Rp3)
6001          ! output: R1,G2,R3,(Rp3)
6002          if (cplexwf==2) then
6003            call switch_cent(n1dfft,max2,m2,n2,lot2,n1,lzt,zt(1,1,j),zw)
6004          else
6005            call switchreal_cent(includelast,n1dfft,max2,n2,lot2,m1zt,lzt,zt(1,1,jeff),zw)
6006          end if
6007 
6008          if (n1dfft == lot2) then
6009            call dfftw_execute_dft(bw_plan2_lot, zw, zw)
6010          else
6011            call dfftw_execute_dft(bw_plan2_rest, zw, zw)
6012          end if
6013 
6014          ! Accumulate
6015          call addrho(cplexwf,includelast,nd1,nd2,n2,lot2,n1dfft,&
6016 &          zw,rho(jeff,1,j3glob),weight_r(idat),weight_i(idat))
6017        end do
6018        ! output: i1,i2,j3,(jp3)
6019 
6020       end if
6021     end do ! j3
6022  end do ! idat
6023 
6024  call dfftw_destroy_plan(bw_plan3_lot)
6025  if (mod(m1, lot3) /= 0) then
6026    call dfftw_destroy_plan(bw_plan3_rest)
6027  end if
6028 
6029  call dfftw_destroy_plan(bw_plan1_lot)
6030  if (mod(m2eff, lot1) /= 0) then
6031    call dfftw_destroy_plan(bw_plan1_rest)
6032  end if
6033 
6034  call dfftw_destroy_plan(bw_plan2_lot)
6035  if (mod(n1eff, lot2) /= 0) then
6036    call dfftw_destroy_plan(bw_plan2_rest)
6037  end if
6038 
6039  ABI_FREE(zmpi2)
6040  ABI_FREE(zw)
6041  ABI_FREE(zt)
6042  if (nproc_fft > 1)  then
6043    ABI_FREE(zmpi1)
6044  end if
6045 
6046 #else
6047  ABI_ERROR("FFTW3 support not activated")
6048  ABI_UNUSED((/cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc/))
6049  ABI_UNUSED((/ max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft/))
6050  ABI_UNUSED((/zf(1,1,1,1,1),rho(1,1,1),weight_r(1),weight_i(1)/))
6051 #endif
6052 
6053 end subroutine fftw3_accrho

m_fftw3/fftw3_alloc_complex1d_dpc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_alloc_complex1d_dpc

SOURCE

2999 #ifdef HAVE_FFTW3
3000 
3001 subroutine fftw3_alloc_complex1d_dpc(size,cptr,fptr)
3002 
3003 !Arguments ------------------------------------
3004 !scalars
3005  integer,intent(in) :: size
3006  complex(dpc),ABI_CONTIGUOUS pointer :: fptr(:)
3007  type(C_PTR),intent(out) :: cptr
3008 
3009 ! *************************************************************************
3010 
3011  cptr = fftw_malloc( INT(2*size*C_DOUBLE, KIND=C_SIZE_T))
3012  if (.not. C_ASSOCIATED(cptr)) then
3013    ABI_ERROR("fftw_malloc returned NULL!")
3014  end if
3015 
3016  call c_f_pointer(cptr, fptr, [size])
3017 
3018 end subroutine fftw3_alloc_complex1d_dpc

m_fftw3/fftw3_alloc_complex1d_spc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_alloc_complex1d_spc

SOURCE

2967 #ifdef HAVE_FFTW3
2968 
2969 subroutine fftw3_alloc_complex1d_spc(size,cptr,fptr)
2970 
2971 !Arguments ------------------------------------
2972 !scalars
2973  integer,intent(in) :: size
2974  complex(spc),ABI_CONTIGUOUS pointer :: fptr(:)
2975  type(C_PTR),intent(out) :: cptr
2976 
2977 ! *************************************************************************
2978 
2979  cptr = fftw_malloc( INT(2*size*C_FLOAT, KIND=C_SIZE_T))
2980  if (.not. C_ASSOCIATED(cptr)) then
2981    ABI_ERROR("fftw_malloc returned NULL!")
2982  end if
2983 
2984  call c_f_pointer(cptr, fptr, [size])
2985 
2986 end subroutine fftw3_alloc_complex1d_spc

m_fftw3/fftw3_alloc_real1d_dp [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_alloc_real1d_dp

FUNCTION

SOURCE

2903 #ifdef HAVE_FFTW3
2904 
2905 subroutine fftw3_alloc_real1d_dp(size,cptr,fptr)
2906 
2907 !Arguments ------------------------------------
2908 !scalars
2909  integer,intent(in) :: size
2910  real(dp),ABI_CONTIGUOUS pointer :: fptr(:)
2911  type(C_PTR),intent(out) :: cptr
2912 
2913 ! *************************************************************************
2914 
2915  cptr = fftw_malloc( INT(size*C_DOUBLE, KIND=C_SIZE_T))
2916  if (.not. C_ASSOCIATED(cptr)) then
2917    ABI_ERROR("fftw_malloc returned NULL!")
2918  end if
2919 
2920  call c_f_pointer(cptr, fptr, [size])
2921 
2922 end subroutine fftw3_alloc_real1d_dp

m_fftw3/fftw3_alloc_real2d_dp [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_alloc_real2d_dp

SOURCE

2935 #ifdef HAVE_FFTW3
2936 
2937 subroutine fftw3_alloc_real2d_dp(shape,cptr,fptr)
2938 
2939 !Arguments ------------------------------------
2940 !scalars
2941  integer,intent(in) :: shape(2)
2942  real(dp),ABI_CONTIGUOUS pointer :: fptr(:,:)
2943  type(C_PTR),intent(out) :: cptr
2944 
2945 ! *************************************************************************
2946 
2947  cptr = fftw_malloc( INT(product(shape)*C_DOUBLE, KIND=C_SIZE_T))
2948  if (.not. C_ASSOCIATED(cptr)) then
2949    ABI_ERROR("fftw_malloc returned NULL!")
2950  end if
2951 
2952  call c_f_pointer(cptr, fptr, shape)
2953 
2954 end subroutine fftw3_alloc_real2d_dp

m_fftw3/fftw3_applypot [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_applypot

FUNCTION

 Applies the local real space potential to multiple wavefunctions in Fourier space

INPUTS

   ZF: Wavefunction (input/output) (note the switch of i2 and i3)
        real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
        imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
   i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
   then, if m1 > max1+1, one has min1=max1-m1+1 and
   i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
   i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF (input as well as output), distributed on different procs
   md2proc=((md2-1)/nproc_fft)+1  maximal number of small box 2nd dim slices for one proc

   POT: Potential
        POT(cplex*i1,i2,i3)
        cplex=1 or 2 ,  i1=1,n1 , i2=1,n2 , i3=1,n3
   nd1,nd2,nd3: dimension of pot
   comm_fft: MPI communicator
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG

 NOTES:
   PERFORMANCE CONSIDERATIONS:
   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

5328 subroutine fftw3_applypot(cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,&
5329 &  max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,&
5330 &  max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,pot,zf)
5331 
5332 !Arguments ------------------------------------
5333  integer,intent(in) :: cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc
5334  integer,intent(in) :: max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3
5335  integer,intent(in) :: max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft
5336  real(dp),intent(in) :: pot(cplex*nd1,nd2,nd3)
5337  real(dp),intent(inout) :: zf(2,md1,md3,md2proc,ndat)
5338 
5339 !Local variables-------------------------------
5340 !scalars
5341 #ifdef HAVE_FFTW3
5342  integer,parameter :: unused0=0
5343  integer :: j,i1,i2,i3,idat,ierr,j3glob,nthreads
5344  integer :: ioption,j2,j3,lzt,m1zt,ma,mb,n1dfft,nnd3,lot1,lot2,lot3
5345  integer :: m2eff,ncache,n1eff,i1inv,i2inv,i3inv,jeff,includelast,j2stb
5346  integer :: jx,j2stf,Jp2stb,Jp2stf,m2ieff,m2oeff
5347  integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest
5348  integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest
5349  integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest
5350  integer(KIND_FFTW_PLAN) :: fw_plan1_lot,fw_plan1_rest
5351  integer(KIND_FFTW_PLAN) :: fw_plan2_lot,fw_plan2_rest
5352  integer(KIND_FFTW_PLAN) :: fw_plan3_lot,fw_plan3_rest
5353  character(len=500) :: msg
5354 !arrays
5355  real(dp) :: tsec(2)
5356  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
5357  real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions
5358 ! FFT work arrays
5359 
5360 ! *************************************************************************
5361 
5362  !ioption=0 ! This was in the old version.
5363  ioption=1 ! This one is needed to be compatible with paral_kgb
5364 
5365  ncache=2*max(n1,n2,n3,1024)
5366  if (ncache/(2*max(n1,n2,n3)) < 1) then
5367    write(msg,"(5a)") &
5368 &    'ncache has to be enlarged to be able to hold at',ch10,&
5369 &    'least one 1-d FFT of each size even though this will',ch10,&
5370 &    'reduce the performance for shorter transform lengths'
5371    ABI_ERROR(msg)
5372  end if
5373 
5374  !call wrtout(std_out,"applypot standard ALLTOALL + FFTW3")
5375 
5376  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
5377  n1eff=n1; m2ieff=m2i; m2oeff=m2o; m1zt=n1
5378  if (cplexwf==1) then
5379    n1eff=(n1+1)/2; m2ieff=m2i/2+1; m2oeff=m2o/2+1; m1zt=2*(n1/2+1)
5380  end if
5381 
5382  m2eff=max(m2ieff,m2oeff)
5383  lzt=m2eff
5384  if (mod(m2eff,2) == 0) lzt=lzt+1
5385  if (mod(m2eff,4) == 0) lzt=lzt+1
5386 
5387  ! maximal number of big box 3rd dim slices for all procs
5388  nnd3=nd3proc*nproc_fft
5389 
5390  ABI_MALLOC(zw,(2,ncache/2))
5391  ABI_MALLOC(zt,(2,lzt,m1zt))
5392  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3))
5393  if (nproc_fft > 1)  then
5394    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3))
5395  end if
5396 
5397  lot3=ncache/(2*n3)
5398  lot1=ncache/(2*n1)
5399  lot2=ncache/(2*n2)
5400 
5401  ! The prototype for sfftw_plan_many_dft is:
5402  ! sfftw_plan_many_dft(rank, n, howmany,
5403  !   fin,  iembed, istride, idist,
5404  !   fout, oembed, ostride, odist, isign, my_flags)
5405 
5406  nthreads = xomp_get_num_threads(open_parallel=.TRUE.)
5407  !nthreads = 1
5408 
5409  ! Create plans for G --> R (see back_wf)
5410  bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
5411 &    zw, [ncache/2], lot3, 1,                          &
5412 &    zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5413 
5414  if (mod(m1i, lot3) /= 0) then
5415    bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1i, lot3),&
5416 &      zw, [ncache/2], lot3, 1,                                    &
5417 &      zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5418  end if
5419 
5420  bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, &
5421 &    zw, [ncache/2],  lot1, 1,                         &
5422 &    zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5423 
5424  if (mod(m2ieff, lot1) /= 0) then
5425    bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2ieff, lot1), &
5426 &      zw, [ncache/2],  lot1, 1,                                       &
5427 &      zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5428  end if
5429 
5430  !TODO this won't work if iclexwf==1
5431  ! Recheck this
5432  bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, &
5433 &    zw, [ncache/2], lot2, 1,                          &
5434 &    zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5435 
5436  if (mod(n1eff, lot2) /= 0) then
5437    bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), &
5438 &      zw, [ncache/2], lot2, 1,                                      &
5439 &      zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
5440  end if
5441 
5442  ! Create plans for G --> R (see forw_wf)
5443  fw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
5444 &    zw, [ncache/2], lot3, 1,                          &
5445 &    zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5446 
5447  if (mod(m1o, lot3) /= 0) then
5448    fw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1o, lot3),&
5449 &    zw, [ncache/2], lot3, 1,                                      &
5450 &    zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5451  end if
5452 
5453  fw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1,&
5454 &    zt, [lzt, m1zt], lzt,  1,                        &
5455 &    zw, [ncache/2],  lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5456 
5457  if (mod(m2oeff, lot1) /= 0) then
5458    fw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2oeff, lot1),&
5459 &    zt, [lzt, m1zt], lzt,  1,                                        &
5460 &    zw, [ncache/2],  lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5461  end if
5462 
5463  fw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2,&
5464 &    zw, [ncache/2], lot2, 1,                         &
5465 &    zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5466 
5467  if (mod(n1eff, lot2) /= 0) then
5468    fw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2),&
5469 &    zw, [ncache/2], lot2, 1,                                       &
5470 &    zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5471  end if
5472 
5473  do idat=1,ndat
5474    !
5475    ! transform along z axis
5476    ! input: G1,G3,G2,(Gp2)
5477    do j2=1,md2proc
5478      if (me_fft*md2proc+j2 <= m2ieff) then
5479        do i1=1,m1i,lot3
5480          ma=i1
5481          mb=min(i1+(lot3-1),m1i)
5482          n1dfft=mb-ma+1
5483 
5484          ! zero-pad n1dfft G_z lines
5485          ! input: G1,G3,G2,(Gp2)
5486          call fill_cent(md1,md3,lot3,n1dfft,max3i,m3i,n3,zf(1,i1,1,j2,idat),zw)
5487 
5488          if (n1dfft == lot3) then
5489            call dfftw_execute_dft(bw_plan3_lot, zw, zw)
5490          else
5491            call dfftw_execute_dft(bw_plan3_rest, zw, zw)
5492          end if
5493 
5494          ! Local rotation.
5495          ! input:  G1,R3,G2,(Gp2)
5496          ! output: G1,G2,R3,(Gp2)
5497          call scramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zw,zmpi2)
5498        end do
5499      end if
5500    end do
5501 
5502    ! Interprocessor data transposition
5503    ! input:  G1,G2,R3,Rp3,(Gp2)
5504    ! output: G1,G2,R3,Gp2,(Rp3)
5505    if (nproc_fft > 1) then
5506      call timab(543,1,tsec)
5507      call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc,&
5508 &                       zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr)
5509      call timab(543,2,tsec)
5510    end if
5511 
5512    do j3=1,nd3proc
5513      j3glob = j3 + me_fft*nd3proc
5514      if (me_fft*nd3proc+j3 <= n3) then
5515        Jp2stb=1; J2stb=1
5516        Jp2stf=1; J2stf=1
5517 
5518        ! transform along x axis
5519        do j=1,m2ieff,lot1
5520          ma=j
5521          mb=min(j+(lot1-1),m2ieff)
5522          n1dfft=mb-ma+1
5523 
5524          ! Zero-pad input.
5525          ! input:  G1,G2,R3,G2,(Rp3)
5526          ! output: G2,G1,R3,G2,(Rp3)
5527          if (nproc_fft == 1) then
5528            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot1,max1i,md1,m1i,n1,&
5529 &           md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw, unused0, unused0, unused0)
5530          else
5531            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot1,max1i,md1,m1i,n1,&
5532 &           md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw, unused0, unused0, unused0)
5533          end if
5534 
5535          ! Transform along x
5536          ! input:  G2,G1,R3,(Rp3)
5537          ! output: G2,R1,R3,(Rp3)
5538          if (n1dfft == lot1) then
5539            call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1))
5540          else
5541            call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1))
5542          end if
5543        end do
5544 
5545        ! Transform along y axis (take into account c2c or c2r case).
5546        ! Must loop over the full box.
5547        !TODO this won't work
5548        if (cplexwf==1) then
5549          if(mod(lot2,2).ne.0) lot2=lot2-1 ! needed to introduce jeff
5550        end if
5551 
5552        do j=1,n1eff,lot2
5553          ma=j
5554          mb=min(j+(lot2-1),n1eff)
5555          n1dfft=mb-ma+1
5556          jeff=j
5557          includelast=1
5558 
5559          if (cplexwf==1) then
5560            jeff=2*j-1
5561            includelast=1
5562            if (mb==n1eff .and. n1eff*2/=n1) includelast=0
5563          end if
5564 
5565          ! Zero-pad the input.
5566          !  input: G2,R1,R3,(Rp3)
5567          ! output: R1,G2,R3,(Rp3)
5568          if (cplexwf==2) then
5569            call switch_cent(n1dfft,max2i,m2i,n2,lot2,n1,lzt,zt(1,1,jeff),zw)
5570          else
5571            call switchreal_cent(includelast,n1dfft,max2i,n2,lot2,m1zt,lzt,zt(1,1,jeff),zw)
5572          end if
5573 
5574          ! input:  R1,G2,R3,(Rp3)
5575          ! output: R1,R2,R3,(Rp3)
5576          ! Be careful here
5577          if (n1dfft == lot2) then
5578            call dfftw_execute_dft(bw_plan2_lot, zw, zw)
5579          else
5580            call dfftw_execute_dft(bw_plan2_rest, zw, zw)
5581          end if
5582 
5583          ! Multiply with potential in real space
5584          jx=cplex*(jeff-1)+1
5585          call multpot(cplexwf,cplex,includelast,nd1,nd2,n2,lot2,n1dfft,pot(jx,1,j3glob),zw)
5586 
5587          ! TRANSFORM BACK IN FOURIER SPACE
5588          ! transform along y axis
5589          ! input: R1,R2,R3,(Rp3)
5590          if (n1dfft == lot2) then
5591            call dfftw_execute_dft(fw_plan2_lot,  zw, zw)
5592          else
5593            call dfftw_execute_dft(fw_plan2_rest, zw, zw)
5594          end if
5595 
5596          ! input: R1,G2,R3,(Rp3)
5597          ! output: G2,R1,R3,(Rp3)
5598          if (cplexwf==2) then
5599            call unswitch_cent(n1dfft,max2o,m2o,n2,lot2,n1,lzt,zw,zt(1,1,jeff))
5600          else
5601            call unswitchreal_cent(n1dfft,max2o,n2,lot2,n1,lzt,zw,zt(1,1,jeff))
5602          end if
5603        end do ! j
5604 
5605        ! transform along x axis
5606        ! input:  R2,R1,R3,(Rp3)
5607        ! output: R2,G1,R3,(Rp3)
5608        do j=1,m2oeff,lot1
5609          ma=j
5610          mb=min(j+(lot1-1),m2oeff)
5611          n1dfft=mb-ma+1
5612 
5613          if (n1dfft == lot1) then
5614            call dfftw_execute_dft(fw_plan1_lot,  zt(1,j,1), zw)
5615          else
5616            call dfftw_execute_dft(fw_plan1_rest, zt(1,j,1), zw)
5617          end if
5618 
5619          ! input:  G2,G1,R3,Gp2,(Rp3)
5620          ! output: G1,G2,R3,Gp2,(Rp3)
5621          if (nproc_fft == 1) then
5622            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot1,max1o,md1,m1o,n1,&
5623 &           md2proc,nd3proc,nproc_fft,ioption,zw,zmpi2)
5624          else
5625            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot1,max1o,md1,m1o,n1,&
5626 &           md2proc,nd3proc,nproc_fft,ioption,zw,zmpi1)
5627          end if
5628        end do ! j
5629      end if
5630    end do
5631 
5632    ! Interprocessor data transposition
5633    ! input:  G1,G2,R3,Gp2,(Rp3)
5634    ! output: G1,G2,R3,Rp3,(Gp2)
5635    if (nproc_fft > 1) then
5636      call timab(544,1,tsec)
5637      call xmpi_alltoall(zmpi1,2*md1*md2proc*nd3proc, &
5638 &                       zmpi2,2*md1*md2proc*nd3proc,comm_fft,ierr)
5639      call timab(544,2,tsec)
5640    end if
5641 
5642    ! transform along z axis
5643    ! input: G1,G2,R3,(Gp2)
5644    !lot=ncache/(4*n3)
5645    do j2=1,md2proc
5646      if (me_fft*md2proc+j2 <= m2oeff) then
5647        do i1=1,m1o,lot3
5648          ma=i1
5649          mb=min(i1+(lot3-1),m1o)
5650          n1dfft=mb-ma+1
5651 
5652          ! input:  G1,G2,R3,(Gp2)
5653          ! output: G1,R3,G2,(Gp2)
5654          call unscramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zmpi2,zw)
5655 
5656           if (n1dfft == lot3) then
5657             call dfftw_execute_dft(fw_plan3_lot, zw, zw)
5658           else
5659             call dfftw_execute_dft(fw_plan3_rest, zw, zw)
5660           end if
5661 
5662          call unfill_cent(md1,md3,lot3,n1dfft,max3o,m3o,n3,zw,zf(1,i1,1,j2,idat))
5663          ! output: G1,G3,G2,(Gp2)
5664        end do
5665      end if
5666    end do
5667 
5668    ! Complete missing values with complex conjugate
5669    ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
5670    if (cplexwf==1) then
5671      do i3=1,m3o
5672        i3inv=m3o+2-i3
5673        if (i3==1) i3inv=1
5674        if (m2oeff>1)then
5675          do i2=2,m2oeff
5676            i2inv=m2o+2-i2
5677            zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)
5678            zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)
5679            do i1=2,m1o
5680              i1inv=m1o+2-i1
5681              zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)
5682              zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)
5683            end do
5684          end do
5685        end if
5686      end do
5687    end if
5688 
5689  end do ! idat
5690 
5691  call dfftw_destroy_plan(bw_plan3_lot)
5692  if (mod(m1i, lot3) /= 0) then
5693    call dfftw_destroy_plan(bw_plan3_rest)
5694  end if
5695 
5696  call dfftw_destroy_plan(bw_plan1_lot)
5697  if (mod(m2ieff, lot1) /= 0) then
5698    call dfftw_destroy_plan(bw_plan1_rest)
5699  end if
5700 
5701  call dfftw_destroy_plan(bw_plan2_lot)
5702  if (mod(n1eff, lot2) /= 0) then
5703    call dfftw_destroy_plan(bw_plan2_rest)
5704  end if
5705 
5706  call dfftw_destroy_plan(fw_plan3_lot)
5707  if (mod(m1o, lot3) /= 0) then
5708    call dfftw_destroy_plan(fw_plan3_rest)
5709  end if
5710 
5711  call dfftw_destroy_plan(fw_plan1_lot)
5712  if (mod(m2oeff, lot1) /= 0) then
5713    call dfftw_destroy_plan(fw_plan1_rest)
5714  end if
5715 
5716  call dfftw_destroy_plan(fw_plan2_lot)
5717  if (mod(n1eff, lot2) /= 0) then
5718    call dfftw_destroy_plan(fw_plan2_rest)
5719  end if
5720 
5721  ABI_FREE(zmpi2)
5722  ABI_FREE(zw)
5723  ABI_FREE(zt)
5724  if (nproc_fft > 1)  then
5725    ABI_FREE(zmpi1)
5726  end if
5727 
5728 #else
5729  ABI_ERROR("FFTW3 support not activated")
5730  ABI_UNUSED((/cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc/))
5731  ABI_UNUSED((/max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3/))
5732  ABI_UNUSED((/max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft/))
5733  ABI_UNUSED((/pot(1,1,1),zf(1,1,1,1,1)/))
5734 #endif
5735 
5736 end subroutine fftw3_applypot

m_fftw3/fftw3_applypot_many [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_applypot_many

FUNCTION

 Applies the local real space potential to multiple wavefunctions in Fourier space

INPUTS

   ZF: Wavefunction (input/output) (note the switch of i2 and i3)
        real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
        imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
   i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
   then, if m1 > max1+1, one has min1=max1-m1+1 and
   i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
   i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF (input as well as output), distributed on different procs
   md2proc=((md2-1)/nproc_fft)+1  maximal number of small box 2nd dim slices for one proc

   POT: Potential
        POT(cplex*i1,i2,i3)
        cplex=1 or 2 ,  i1=1,n1 , i2=1,n2 , i3=1,n3
   nd1,nd2,nd3: dimension of pot
   comm_fft: MPI communicator
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG

 NOTES:
   PERFORMANCE CONSIDERATIONS:
   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

6798 subroutine fftw3_applypot_many(cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,&
6799 &  max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,&
6800 &  max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,pot,zf)
6801 
6802 !Arguments ------------------------------------
6803  integer,intent(in) :: cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc
6804  integer,intent(in) :: max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3
6805  integer,intent(in) :: max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft
6806  real(dp),intent(in) :: pot(cplex*nd1,nd2,nd3)
6807  real(dp),intent(inout) :: zf(2,md1,md3,md2proc,ndat)
6808 
6809 !Local variables-------------------------------
6810 !scalars
6811 #ifdef HAVE_FFTW3
6812  integer,parameter :: unused0=0
6813  integer :: j,i1,i2,i3,idat,ierr,j3glob,nthreads
6814  integer :: ioption,j2,j3,lzt,m1zt,ma,mb,n1dfft,nnd3,lot1,lot2,lot3
6815  integer :: m2eff,ncache,n1eff,i1inv,i2inv,i3inv,jeff,includelast,j2stb
6816  integer :: jx,j2stf,Jp2stb,Jp2stf,m2ieff,m2oeff
6817  integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest
6818  integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest
6819  integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest
6820  integer(KIND_FFTW_PLAN) :: fw_plan1_lot,fw_plan1_rest
6821  integer(KIND_FFTW_PLAN) :: fw_plan2_lot,fw_plan2_rest
6822  integer(KIND_FFTW_PLAN) :: fw_plan3_lot,fw_plan3_rest
6823  character(len=500) :: msg
6824 !arrays
6825  integer :: requests(ndat)
6826  real(dp) :: tsec(2)
6827  real(dp) ABI_ASYNC, allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI
6828  real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions
6829 ! FFT work arrays
6830 
6831 ! *************************************************************************
6832 
6833  !ioption=0 ! This was in the old version.
6834  ioption=1 ! This one is needed to be compatible with paral_kgb
6835 
6836  ncache=2*max(n1,n2,n3,1024)
6837  if (ncache/(2*max(n1,n2,n3)) < 1) then
6838    write(msg,"(5a)") &
6839 &    'ncache has to be enlarged to be able to hold at',ch10,&
6840 &    'least one 1-d FFT of each size even though this will',ch10,&
6841 &    'reduce the performance for shorter transform lengths'
6842    ABI_ERROR(msg)
6843  end if
6844 
6845  !call wrtout(std_out,"applypot with non-blocking IALLTOALL + FFTW3")
6846  !write(std_out,"(a,i0)")"in applypot_many with ndat: ",ndat
6847 
6848  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
6849  n1eff=n1; m2ieff=m2i; m2oeff=m2o; m1zt=n1
6850  if (cplexwf==1) then
6851    n1eff=(n1+1)/2; m2ieff=m2i/2+1; m2oeff=m2o/2+1; m1zt=2*(n1/2+1)
6852  end if
6853 
6854  m2eff=max(m2ieff,m2oeff)
6855  lzt=m2eff
6856  if (mod(m2eff,2) == 0) lzt=lzt+1
6857  if (mod(m2eff,4) == 0) lzt=lzt+1
6858 
6859  ! maximal number of big box 3rd dim slices for all procs
6860  nnd3=nd3proc*nproc_fft
6861 
6862  ABI_MALLOC(zw,(2,ncache/2))
6863  ABI_MALLOC(zt,(2,lzt,m1zt))
6864  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat))
6865  if (nproc_fft > 1)  then
6866    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat))
6867  end if
6868 
6869  lot3=ncache/(2*n3)
6870  lot1=ncache/(2*n1)
6871  lot2=ncache/(2*n2)
6872 
6873  nthreads = xomp_get_num_threads(open_parallel=.TRUE.)
6874  !nthreads = 1
6875 
6876  ! The prototype for sfftw_plan_many_dft is:
6877  ! sfftw_plan_many_dft(rank, n, howmany,
6878  !   fin,  iembed, istride, idist,
6879  !   fout, oembed, ostride, odist, isign, my_flags)
6880 
6881  ! Create plans for G --> R (see back_wf)
6882  bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
6883 &    zw, [ncache/2], lot3, 1,                          &
6884 &    zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6885 
6886  if (mod(m1i, lot3) /= 0) then
6887    bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1i, lot3),&
6888 &      zw, [ncache/2], lot3, 1,                                    &
6889 &      zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6890  end if
6891 
6892  bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, &
6893 &    zw, [ncache/2],  lot1, 1,                         &
6894 &    zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6895 
6896  if (mod(m2ieff, lot1) /= 0) then
6897    bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2ieff, lot1), &
6898 &      zw, [ncache/2],  lot1, 1,                                       &
6899 &      zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6900  end if
6901 
6902  !TODO this won't work if iclexwf==1
6903  ! Recheck this
6904  bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, &
6905 &    zw, [ncache/2], lot2, 1,                          &
6906 &    zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6907 
6908  if (mod(n1eff, lot2) /= 0) then
6909    bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), &
6910 &      zw, [ncache/2], lot2, 1,                                      &
6911 &      zw, [ncache/2], lot2, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6912  end if
6913 
6914  ! Create plans for G --> R (see forw_wf)
6915  fw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
6916 &    zw, [ncache/2], lot3, 1,                          &
6917 &    zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6918 
6919  if (mod(m1o, lot3) /= 0) then
6920    fw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1o, lot3),&
6921 &    zw, [ncache/2], lot3, 1,                                      &
6922 &    zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6923  end if
6924 
6925  fw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1,&
6926 &    zt, [lzt, m1zt], lzt,  1,                        &
6927 &    zw, [ncache/2],  lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6928 
6929  if (mod(m2oeff, lot1) /= 0) then
6930    fw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2oeff, lot1),&
6931 &    zt, [lzt, m1zt], lzt,  1,                                        &
6932 &    zw, [ncache/2],  lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6933  end if
6934 
6935  fw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2,&
6936 &    zw, [ncache/2], lot2, 1,                         &
6937 &    zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6938 
6939  if (mod(n1eff, lot2) /= 0) then
6940    fw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2),&
6941 &    zw, [ncache/2], lot2, 1,                                       &
6942 &    zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6943  end if
6944 
6945  ! Here we take advantage of non-blocking IALLTOALL:
6946  ! Perform the first step of MPI-FFT for ndat wavefunctions.
6947  do idat=1,ndat
6948    !
6949    ! transform along z axis
6950    ! input: G1,G3,G2,(Gp2)
6951    do j2=1,md2proc
6952      if (me_fft*md2proc+j2 <= m2ieff) then
6953        do i1=1,m1i,lot3
6954          ma=i1
6955          mb=min(i1+(lot3-1),m1i)
6956          n1dfft=mb-ma+1
6957 
6958          ! zero-pad n1dfft G_z lines
6959          ! input: G1,G3,G2,(Gp2)
6960          call fill_cent(md1,md3,lot3,n1dfft,max3i,m3i,n3,zf(1,i1,1,j2,idat),zw)
6961 
6962          if (n1dfft == lot3) then
6963            call dfftw_execute_dft(bw_plan3_lot, zw, zw)
6964          else
6965            call dfftw_execute_dft(bw_plan3_rest, zw, zw)
6966          end if
6967 
6968          ! Local rotation.
6969          ! input:  G1,R3,G2,(Gp2)
6970          ! output: G1,G2,R3,(Gp2)
6971          call scramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zw,zmpi2(:,:,:,:,idat))
6972        end do
6973      end if
6974    end do
6975 
6976    ! Interprocessor data transposition
6977    ! input:  G1,G2,R3,Rp3,(Gp2)
6978    ! output: G1,G2,R3,Gp2,(Rp3)
6979    if (nproc_fft > 1) then
6980      call timab(543,1,tsec)
6981      call xmpi_ialltoall(zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,&
6982 &                        zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat))
6983      call timab(543,2,tsec)
6984    end if
6985  end do ! idat
6986 
6987  ! The second step of MPI-FFT
6988  do idat=1,ndat
6989     ! Make sure communication is completed.
6990     if (nproc_fft>1) call xmpi_wait(requests(idat),ierr)
6991 
6992    do j3=1,nd3proc
6993      j3glob = j3 + me_fft*nd3proc
6994      if (me_fft*nd3proc+j3 <= n3) then
6995        Jp2stb=1; J2stb=1
6996        Jp2stf=1; J2stf=1
6997 
6998        ! transform along x axis
6999        do j=1,m2ieff,lot1
7000          ma=j
7001          mb=min(j+(lot1-1),m2ieff)
7002          n1dfft=mb-ma+1
7003 
7004          ! Zero-pad input.
7005          ! input:  G1,G2,R3,G2,(Rp3)
7006          ! output: G2,G1,R3,G2,(Rp3)
7007          if (nproc_fft == 1) then
7008            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot1,max1i,md1,m1i,n1,&
7009 &           md2proc,nd3proc,nproc_fft,ioption,zmpi2(:,:,:,:,idat),zw, unused0, unused0, unused0)
7010          else
7011            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot1,max1i,md1,m1i,n1,&
7012 &           md2proc,nd3proc,nproc_fft,ioption,zmpi1(:,:,:,:,idat),zw, unused0, unused0, unused0)
7013          end if
7014 
7015          ! Transform along x
7016          ! input:  G2,G1,R3,(Rp3)
7017          ! output: G2,R1,R3,(Rp3)
7018          if (n1dfft == lot1) then
7019            call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1))
7020          else
7021            call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1))
7022          end if
7023        end do
7024 
7025        ! Transform along y axis (take into account c2c or c2r case).
7026        ! Must loop over the full box.
7027        !TODO this won't work
7028        if (cplexwf==1) then
7029          if(mod(lot2,2).ne.0) lot2=lot2-1 ! needed to introduce jeff
7030        end if
7031 
7032        do j=1,n1eff,lot2
7033          ma=j
7034          mb=min(j+(lot2-1),n1eff)
7035          n1dfft=mb-ma+1
7036          jeff=j
7037          includelast=1
7038 
7039          if (cplexwf==1) then
7040            jeff=2*j-1
7041            includelast=1
7042            if (mb==n1eff .and. n1eff*2/=n1) includelast=0
7043          end if
7044 
7045          ! Zero-pad the input.
7046          !  input: G2,R1,R3,(Rp3)
7047          ! output: R1,G2,R3,(Rp3)
7048          if (cplexwf==2) then
7049            call switch_cent(n1dfft,max2i,m2i,n2,lot2,n1,lzt,zt(1,1,jeff),zw)
7050          else
7051            call switchreal_cent(includelast,n1dfft,max2i,n2,lot2,m1zt,lzt,zt(1,1,jeff),zw)
7052          end if
7053 
7054          ! input:  R1,G2,R3,(Rp3)
7055          ! output: R1,R2,R3,(Rp3)
7056          ! Be careful here
7057          if (n1dfft == lot2) then
7058            call dfftw_execute_dft(bw_plan2_lot, zw, zw)
7059          else
7060            call dfftw_execute_dft(bw_plan2_rest, zw, zw)
7061          end if
7062 
7063          ! Multiply with potential in real space
7064          jx=cplex*(jeff-1)+1
7065          call multpot(cplexwf,cplex,includelast,nd1,nd2,n2,lot2,n1dfft,pot(jx,1,j3glob),zw)
7066 
7067          ! TRANSFORM BACK IN FOURIER SPACE
7068          ! transform along y axis
7069          ! input: R1,R2,R3,(Rp3)
7070          if (n1dfft == lot2) then
7071            call dfftw_execute_dft(fw_plan2_lot,  zw, zw)
7072          else
7073            call dfftw_execute_dft(fw_plan2_rest, zw, zw)
7074          end if
7075 
7076          !  input: R1,G2,R3,(Rp3)
7077          ! output: G2,R1,R3,(Rp3)
7078          if (cplexwf==2) then
7079            call unswitch_cent(n1dfft,max2o,m2o,n2,lot2,n1,lzt,zw,zt(1,1,jeff))
7080          else
7081            call unswitchreal_cent(n1dfft,max2o,n2,lot2,n1,lzt,zw,zt(1,1,jeff))
7082          end if
7083        end do ! j
7084 
7085        ! transform along x axis
7086        ! input:  R2,R1,R3,(Rp3)
7087        ! output: R2,G1,R3,(Rp3)
7088        do j=1,m2oeff,lot1
7089          ma=j
7090          mb=min(j+(lot1-1),m2oeff)
7091          n1dfft=mb-ma+1
7092 
7093          if (n1dfft == lot1) then
7094            call dfftw_execute_dft(fw_plan1_lot,  zt(1,j,1), zw)
7095          else
7096            call dfftw_execute_dft(fw_plan1_rest, zt(1,j,1), zw)
7097          end if
7098 
7099          ! input:  G2,G1,R3,Gp2,(Rp3)
7100          ! output: G1,G2,R3,Gp2,(Rp3)
7101          if (nproc_fft == 1) then
7102            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot1,max1o,md1,m1o,n1,&
7103 &           md2proc,nd3proc,nproc_fft,ioption,zw,zmpi2(:,:,:,:,idat))
7104          else
7105            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot1,max1o,md1,m1o,n1,&
7106 &           md2proc,nd3proc,nproc_fft,ioption,zw,zmpi1(:,:,:,:,idat))
7107          end if
7108        end do ! j
7109      end if
7110    end do
7111 
7112    ! Interprocessor data transposition
7113    ! input:  G1,G2,R3,Gp2,(Rp3)
7114    ! output: G1,G2,R3,Rp3,(Gp2)
7115    if (nproc_fft > 1) then
7116      call timab(544,1,tsec)
7117      call xmpi_ialltoall(zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc, &
7118 &                        zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat))
7119      call timab(544,2,tsec)
7120    end if
7121  end do
7122 
7123  do idat=1,ndat
7124    if (nproc_fft>1) call xmpi_wait(requests(idat),ierr)
7125    ! transform along z axis
7126    ! input: G1,G2,R3,(Gp2)
7127    !lot=ncache/(4*n3)
7128    do j2=1,md2proc
7129      if (me_fft*md2proc+j2 <= m2oeff) then
7130        do i1=1,m1o,lot3
7131          ma=i1
7132          mb=min(i1+(lot3-1),m1o)
7133          n1dfft=mb-ma+1
7134 
7135          ! input:  G1,G2,R3,(Gp2)
7136          ! output: G1,R3,G2,(Gp2)
7137          call unscramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zmpi2(:,:,:,:,idat),zw)
7138 
7139           if (n1dfft == lot3) then
7140             call dfftw_execute_dft(fw_plan3_lot, zw, zw)
7141           else
7142             call dfftw_execute_dft(fw_plan3_rest, zw, zw)
7143           end if
7144 
7145          call unfill_cent(md1,md3,lot3,n1dfft,max3o,m3o,n3,zw,zf(1,i1,1,j2,idat))
7146          ! output: G1,G3,G2,(Gp2)
7147        end do
7148      end if
7149    end do
7150 
7151    ! Complete missing values with complex conjugate
7152    ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
7153    if (cplexwf==1) then
7154      do i3=1,m3o
7155        i3inv=m3o+2-i3
7156        if (i3==1) i3inv=1
7157        if (m2oeff>1)then
7158          do i2=2,m2oeff
7159            i2inv=m2o+2-i2
7160            zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)
7161            zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)
7162            do i1=2,m1o
7163              i1inv=m1o+2-i1
7164              zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)
7165              zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)
7166            end do
7167          end do
7168        end if
7169      end do
7170    end if
7171 
7172  end do ! idat
7173 
7174  call dfftw_destroy_plan(bw_plan3_lot)
7175  if (mod(m1i, lot3) /= 0) then
7176    call dfftw_destroy_plan(bw_plan3_rest)
7177  end if
7178 
7179  call dfftw_destroy_plan(bw_plan1_lot)
7180  if (mod(m2ieff, lot1) /= 0) then
7181    call dfftw_destroy_plan(bw_plan1_rest)
7182  end if
7183 
7184  call dfftw_destroy_plan(bw_plan2_lot)
7185  if (mod(n1eff, lot2) /= 0) then
7186    call dfftw_destroy_plan(bw_plan2_rest)
7187  end if
7188 
7189  call dfftw_destroy_plan(fw_plan3_lot)
7190  if (mod(m1o, lot3) /= 0) then
7191    call dfftw_destroy_plan(fw_plan3_rest)
7192  end if
7193 
7194  call dfftw_destroy_plan(fw_plan1_lot)
7195  if (mod(m2oeff, lot1) /= 0) then
7196    call dfftw_destroy_plan(fw_plan1_rest)
7197  end if
7198 
7199  call dfftw_destroy_plan(fw_plan2_lot)
7200  if (mod(n1eff, lot2) /= 0) then
7201    call dfftw_destroy_plan(fw_plan2_rest)
7202  end if
7203 
7204  ABI_FREE(zmpi2)
7205  ABI_FREE(zw)
7206  ABI_FREE(zt)
7207  if (nproc_fft > 1)  then
7208    ABI_FREE(zmpi1)
7209  end if
7210 
7211 #else
7212  ABI_ERROR("FFTW3 support not activated")
7213  ABI_UNUSED((/cplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc/))
7214  ABI_UNUSED((/max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3/))
7215  ABI_UNUSED((/max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft/))
7216  ABI_UNUSED((/pot(1,1,1),zf(1,1,1,1,1)/))
7217 #endif
7218 
7219 end subroutine fftw3_applypot_many

m_fftw3/fftw3_c2c_ip_dpc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_c2c_ip_dpc

FUNCTION

 Driver routine for in-place 3D complex-complex FFT.

INPUTS

 nx,ny,nz=Number of points along the three directions.
 ldx,ldy,ldz=Physical dimensions of the array.
 ndat=Number of FFTs to be done.
 iscale=0 if G --> R FFT should not be scaled.
 isign= +1 : ff(G) => ff(R); -1 : ff(R) => ff(G)
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

SIDE EFFECTS

  ff(ldx*ldy*ldz*ndat)=
    In input: the complex array to be transformed.
    In output: the Fourier transformed in the space specified by isign.

SOURCE

1561 subroutine fftw3_c2c_ip_dpc(nx, ny, nz, ldx, ldy, ldz, ndat, iscale, isign, ff, fftw_flags)
1562 
1563 !Arguments ------------------------------------
1564 !scalars
1565  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,iscale,isign
1566  integer,optional,intent(in) :: fftw_flags
1567 !arrays
1568  complex(dpc),intent(inout) :: ff(ldx*ldy*ldz*ndat)
1569 
1570 #ifdef HAVE_FFTW3
1571 !Local variables-------------------------------
1572 !scalars
1573  integer,parameter :: rank3=3,nt_all=-1
1574  integer :: my_flags,dist,stride
1575  integer(KIND_FFTW_PLAN) :: my_plan
1576 !arrays
1577  integer :: embed(rank3),n(rank3)
1578 
1579 ! *************************************************************************
1580 
1581  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags=fftw_flags
1582 
1583  stride = 1
1584  dist   = ldx*ldy*ldz
1585  embed  = [ldx, ldy, ldz]
1586  n      = [nx, ny, nz]
1587 
1588  my_plan = fftw3_plan_many_dft(rank3, n, ndat, ff, embed, stride, dist, ff, embed, stride, dist, isign, my_flags, nt_all)
1589 
1590  ! Now perform the 3D FFT via FFTW.
1591  call dfftw_execute_dft(my_plan, ff, ff)
1592 
1593  call fftw3_destroy_plan(my_plan)
1594 
1595  ! -1, FFTW returns not normalized FTs
1596  if (isign == ABI_FFTW_FORWARD .and. iscale /= 0) then
1597    call ZDSCAL(ldx*ldy*ldz*ndat, one/(nx*ny*nz), ff, 1)
1598  end if
1599 
1600 #else
1601  ABI_ERROR("FFTW3 support not activated")
1602  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,isign/))
1603  ABI_UNUSED(ff)
1604  if (PRESENT(fftw_flags)) then
1605    ABI_UNUSED(fftw_flags)
1606  end if
1607 #endif
1608 
1609 end subroutine fftw3_c2c_ip_dpc

m_fftw3/fftw3_c2c_ip_spc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_c2c_ip_spc

FUNCTION

 Driver routine for in-place 3D complex-complex FFT.
 TARGET: Simple precision complex arrays.

INPUTS

 nx,ny,nz=Number of points along the three directions.
 ldx,ldy,ldz=Physical dimensions of the array.
 ndat=Number of FFTs to be done.
 iscale=0 if G --> R FFT should not be scaled.
 isign= +1 : ff(G) => ff(R); -1 : ff(R) => ff(G)
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

SIDE EFFECTS

  ff(ldx*ldy*ldz*ndat)=
    In input: the complex array to be transformed.
    In output: the Fourier transformed in the space specified by isign.

SOURCE

1426 subroutine fftw3_c2c_ip_spc(nx, ny, nz, ldx, ldy, ldz, ndat, iscale, isign, ff, fftw_flags)
1427 
1428 !Arguments ------------------------------------
1429 !scalars
1430  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,iscale,isign
1431  integer,optional,intent(in) :: fftw_flags
1432 !arrays
1433  complex(spc),intent(inout) :: ff(ldx*ldy*ldz*ndat)
1434 
1435 #ifdef HAVE_FFTW3
1436 !Local variables-------------------------------
1437 !scalars
1438  integer,parameter :: rank3=3,nt_all=-1
1439  integer :: my_flags,dist,stride
1440  integer(KIND_FFTW_PLAN) :: my_plan
1441 !arrays
1442  integer :: embed(rank3),n(rank3)
1443 
1444 ! *************************************************************************
1445 
1446  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags=fftw_flags
1447 
1448  stride = 1
1449  dist   = ldx*ldy*ldz
1450  embed  = [ldx, ldy, ldz]
1451  n      = [nx, ny, nz]
1452 
1453  my_plan = fftw3_plan_many_dft(rank3, n, ndat, ff, embed, stride, dist, ff, embed, stride, dist, isign, my_flags, nt_all)
1454 
1455  ! Now perform the 3D FFT via FFTW.
1456  call sfftw_execute_dft(my_plan, ff, ff)
1457 
1458  call fftw3_destroy_plan(my_plan)
1459 
1460  if (isign == ABI_FFTW_FORWARD .and. iscale /= 0) then ! -1, FFTW returns not normalized FTs
1461    call xscal(ldx*ldy*ldz*ndat, REAL(one/(nx*ny*nz),KIND=sp), ff, 1)
1462  end if
1463 
1464 #else
1465  ABI_ERROR("FFTW3 support not activated")
1466  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,isign/))
1467  ABI_UNUSED(ff)
1468  if (PRESENT(fftw_flags)) then
1469    ABI_UNUSED(fftw_flags)
1470  end if
1471 #endif
1472 
1473 end subroutine fftw3_c2c_ip_spc

m_fftw3/fftw3_c2c_op_dpc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_c2c_op_dpc

FUNCTION

 Driver routine for out-of-place 3D complex-complex FFT of lengths nx, ny, nz.
 TARGET: single precision complex arrays

INPUTS

 nx,ny,nz=Number of points along the three directions.
 ldx,ldy,ldz=Physical dimensions of the array.
 ndat=Number of FFTs to be done.
 iscale=0 if G --> R FFT should not be scaled.
 isign= +1 : ff(G) => gg(R); -1 : ff(R) => gg(G)
 ff(ldx*ldy*ldz*ndat)=The array to be transformed.
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

OUTPUT

 gg(ldx*ldy*ldz*ndat)=The FFT of ff.

SOURCE

1714 subroutine fftw3_c2c_op_dpc(nx, ny, nz, ldx, ldy, ldz, ndat, iscale, isign, ff, gg, fftw_flags)
1715 
1716 !Arguments ------------------------------------
1717 !scalars
1718  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,isign,ndat,iscale
1719  integer,optional,intent(in) :: fftw_flags
1720 !arrays
1721  complex(dpc),intent(in) :: ff(ldx*ldy*ldz*ndat)
1722  complex(dpc),intent(out) :: gg(ldx*ldy*ldz*ndat)
1723 
1724 #ifdef HAVE_FFTW3
1725 !Local variables-------------------------------
1726 !scalars
1727  integer,parameter :: rank3=3,nt_all=-1
1728  integer :: my_flags,dist,stride
1729  integer(KIND_FFTW_PLAN) :: my_plan
1730 !arrays
1731  integer :: embed(rank3),n(rank3)
1732 
1733 ! *************************************************************************
1734 
1735  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
1736 
1737  stride = 1
1738  dist   = ldx*ldy*ldz
1739  embed  = [ldx, ldy, ldz]
1740  n      = [nx, ny, nz]
1741 
1742  my_plan = fftw3_plan_many_dft(rank3, n, ndat, ff, embed, stride, dist, gg, embed, stride, dist, isign, my_flags, nt_all)
1743 
1744  ! Now perform the 3D FFT via FFTW.
1745  call dfftw_execute_dft(my_plan, ff, gg)
1746 
1747  call fftw3_destroy_plan(my_plan)
1748 
1749  if (isign == ABI_FFTW_FORWARD .and. iscale /= 0) then ! -1, FFTW returns not normalized FTs
1750    call xscal(ldx*ldy*ldz*ndat, one/(nx*ny*nz), gg, 1)
1751  end if
1752 
1753 #else
1754  ABI_ERROR("FFTW3 support not activated")
1755  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,isign/))
1756  ABI_UNUSED(ff)
1757  ABI_UNUSED(gg)
1758  if (PRESENT(fftw_flags)) then
1759    ABI_UNUSED(fftw_flags)
1760  end if
1761 #endif
1762 
1763 end subroutine fftw3_c2c_op_dpc

m_fftw3/fftw3_c2c_op_spc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_c2c_op_spc

FUNCTION

 Driver routine for out-of-place 3D complex-complex FFT of lengths nx, ny, nz.
 TARGET: single precision complex arrays

INPUTS

 nx,ny,nz=Number of points along the three directions.
 ldx,ldy,ldz=Physical dimensions of the array.
 ndat=Number of FFTs to be done.
 iscale=0 if G --> R FFT should not be scaled.
 isign= +1 : ff(G) => gg(R); -1 : ff(R) => gg(G)
 ff(ldx*ldy*ldz*ndat)=The array to be transformed.
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

OUTPUT

 gg(ldx*ldy*ldz*ndat)=The FFT of ff.

SOURCE

1637 subroutine fftw3_c2c_op_spc(nx, ny, nz, ldx, ldy, ldz, ndat, iscale, isign, ff, gg, fftw_flags)
1638 
1639 !Arguments ------------------------------------
1640 !scalars
1641  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,iscale,isign,ndat
1642  integer,optional,intent(in) :: fftw_flags
1643 !arrays
1644  complex(spc),intent(in) :: ff(ldx*ldy*ldz*ndat)
1645  complex(spc),intent(out) :: gg(ldx*ldy*ldz*ndat)
1646 
1647 #ifdef HAVE_FFTW3
1648 !Local variables-------------------------------
1649 !scalars
1650  integer,parameter :: rank3=3,nt_all=-1
1651  integer :: my_flags,dist,stride
1652  integer(KIND_FFTW_PLAN) :: my_plan
1653 !arrays
1654  integer :: embed(rank3),n(rank3)
1655 
1656 ! *************************************************************************
1657 
1658  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
1659 
1660  stride = 1
1661  dist   = ldx*ldy*ldz
1662  embed  = [ldx, ldy, ldz]
1663  n      = [nx, ny, nz]
1664 
1665  my_plan = fftw3_plan_many_dft(rank3, n, ndat, ff, embed, stride, dist, gg, embed, stride, dist, isign, my_flags, nt_all)
1666 
1667  ! Now perform the 3D FFT via FFTW.
1668  call sfftw_execute_dft(my_plan, ff, gg)
1669 
1670  call fftw3_destroy_plan(my_plan)
1671 
1672  if (isign == ABI_FFTW_FORWARD .and. iscale /= 0) then ! -1, FFTW returns not normalized FTs
1673    call xscal(ldx*ldy*ldz*ndat, REAL(one/(nx*ny*nz), KIND=sp), gg, 1)
1674  end if
1675 
1676 #else
1677  ABI_ERROR("FFTW3 support not activated")
1678  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,isign/))
1679  ABI_UNUSED(ff)
1680  ABI_UNUSED(gg)
1681  if (PRESENT(fftw_flags)) then
1682    ABI_UNUSED(fftw_flags)
1683  end if
1684 #endif
1685 
1686 end subroutine fftw3_c2c_op_spc

m_fftw3/fftw3_c2r_op [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_c2r_op

FUNCTION

 Driver routine for out-of-place 3D complex-to-real FFT of lengths nx, ny, nz.

INPUTS

 nx,ny,nz=Number of point along the three directions.
 ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts).
 ndat=Number of FFTs to be done.
 ff(2*ldx*ldy*ldz*ndat)=The complex array to be transformed.
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

OUTPUT

 gg(ldx*ldy*ldz*ndat)=The backwards real FFT of ff.

NOTES

  FIXME For the time-being. No augmentation of the mesh to reduce memory conflicts, as MKL crashes
  if the advanced interface is used.

SOURCE

1943 subroutine fftw3_c2r_op(nx,ny,nz,ldx,ldy,ldz,ndat,ff,gg,fftw_flags)
1944 
1945 !Arguments ------------------------------------
1946 !scalars
1947  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat
1948  integer,optional,intent(in) :: fftw_flags
1949 !arrays
1950  real(dp),intent(in) :: ff(2,ldx*ldy*ldz*ndat)
1951  real(dp),intent(out) :: gg(ldx*ldy*ldz*ndat)
1952 
1953 #ifdef HAVE_FFTW3
1954 !Local variables-------------------------------
1955 !scalars
1956  integer,parameter :: rank3=3,nt_all=-1
1957  integer :: nhp,my_flags,padx,i2,i3,igp,igf,idat,padatf,padatp,idist,odist,stride
1958  integer(KIND_FFTW_PLAN) :: my_plan
1959 !arrays
1960  integer :: inembed(rank3),onembed(rank3),n(rank3)
1961  real(dp),allocatable :: ff_hp(:,:)
1962 
1963 ! *************************************************************************
1964 
1965 #ifdef DEV_RC_BUG
1966  if (ANY( (/nx,ny,nz/) /= (/ldx,ldy,ldz/) )) then
1967    ABI_ERROR("Augmentation not supported")
1968  end if
1969 #endif
1970 
1971  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
1972 
1973  stride  = 1
1974  nhp     = (nx/2+1)*ny*nz
1975  idist   = nhp
1976  odist   = ldx*ldy*ldz
1977  n       = (/nx,ny,nz/)
1978  inembed = (/(nx/2+1),ny,nz/)
1979  onembed = (/ldx,ldy,ldz/)
1980 
1981  ! Fill the Hermitian part: Hermitian redundancy: out[i] is the conjugate of out[n-i]
1982  ABI_MALLOC(ff_hp,(2,nhp*ndat))
1983 
1984  padx = (nx/2+1)
1985  do idat=1,ndat
1986    padatf=(idat-1)*ldx*ldy*ldz
1987    padatp=(idat-1)*padx*ny*nz
1988 !$OMP PARALLEL DO PRIVATE(igf,igp)
1989    do i3=1,nz
1990      do i2=1,ny
1991        igf = (i3-1)*ldx*ldy + (i2-1)*ldx   + padatf
1992        igp = (i3-1)*padx*ny + (i2-1)*padx  + padatp
1993        ff_hp(:,igp+1:igp+padx) = ff(:,igf+1:igf+padx)
1994      end do
1995    end do
1996  end do
1997 
1998  ! NOTE: The c2r transform destroys its input array even for out-of-place transforms.
1999 #ifdef DEV_RC_BUG
2000  if (ndat/=1) ABI_ERROR("ndat/=1 + MKL not coded")
2001  call dfftw_plan_dft_c2r_3d(my_plan, nx, ny, nz, ff_hp, gg, my_flags)
2002  if (my_plan==NULL_PLAN) then
2003    ABI_ERROR("dfftw_plan_dft_c2r_3d returned NULL_PLAN")
2004  end if
2005 #else
2006  my_plan = dplan_many_dft_c2r(rank3, n, ndat, ff_hp, inembed, stride, idist, gg, onembed, stride, odist, my_flags, nt_all)
2007 #endif
2008 
2009  ! Now perform the 3D FFT via FFTW. c2r are always ABI_FFTW_BACKWARD
2010  call dfftw_execute_dft_c2r(my_plan, ff_hp, gg)
2011 
2012  call fftw3_destroy_plan(my_plan)
2013  ABI_FREE(ff_hp)
2014 
2015 #else
2016  ABI_ERROR("FFTW3 support not activated")
2017  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz/))
2018  ABI_UNUSED(ff(1,1))
2019  ABI_UNUSED(gg(1))
2020  if (PRESENT(fftw_flags)) then
2021    ABI_UNUSED(fftw_flags)
2022  end if
2023 #endif
2024 
2025 end subroutine fftw3_c2r_op

m_fftw3/fftw3_cleanup [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_cleanup

FUNCTION

  Reset FFTW to the pristine state it was in when you started your program,
  All existing plans become undefined.

NOTES

  FFTW planner saves some other persistent data, such as the accumulated wisdom and a list of
  algorithms available in the current configuration. If you want to deallocate all of that and reset
  FFTW to the pristine state it was in when you started your program, you can call fftw3_cleanup();
  After calling fftw3_cleanup, all existing plans become undefined, and you should not attempt to
  execute them nor to destroy them. You can however create and execute/destroy new plans, in which case
  FFTW starts accumulating wisdom information again.
  fftw3_cleanup does not deallocate your plans, however. To prevent memory leaks, you must still call
  fftw_destroy_plan before executing fftw3_cleanup

SOURCE

2204 subroutine fftw3_cleanup()
2205 
2206 ! *************************************************************************
2207 
2208 #ifdef HAVE_FFTW3_MPI
2209  call fftw_mpi_cleanup()
2210 #endif
2211 #ifdef HAVE_FFTW3_THREADS
2212  if (THREADS_INITED==1) then
2213    call dfftw_cleanup_threads()
2214    THREADS_INITED = 0
2215  end if
2216 #elif defined HAVE_FFTW3
2217  call dfftw_cleanup()
2218 #else
2219  ABI_ERROR("FFTW3 support not activated")
2220 #endif
2221 
2222 end subroutine fftw3_cleanup

m_fftw3/fftw3_destroy_plan [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_destroy_plan

FUNCTION

  Release the memory allocate for the plan.

INPUTS

SOURCE

2238 subroutine fftw3_destroy_plan(plan)
2239 
2240 !Arguments ------------------------------------
2241 !scalars
2242  integer(KIND_FFTW_PLAN),intent(in) :: plan
2243 
2244 ! *************************************************************************
2245 
2246 #ifdef HAVE_FFTW3
2247 !$OMP CRITICAL (OMPC_fftw3_destroy_plan)
2248  call dfftw_destroy_plan(plan)
2249 !$OMP END CRITICAL (OMPC_fftw3_destroy_plan)
2250 
2251 #else
2252  if (.FALSE.) write(std_out,*)plan
2253 #endif
2254 
2255 end subroutine fftw3_destroy_plan

m_fftw3/fftw3_execute_dft_dp [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_execute_dft_dp

FUNCTION

INPUTS

OUTPUT

NOTES

  This interface is used to perform complex to complex FFT with real arrays
  containing the real and imaginary part. I have to admit that this interface
  is a bit ambiguous since FFTW3 provides routines for real-to-real transforms.

SOURCE

2820 #ifdef HAVE_FFTW3
2821 
2822 subroutine fftw3_execute_dft_dp(plan, in, out)
2823 
2824 !Arguments ------------------------------------
2825 !scalars
2826  integer(KIND_FFTW_PLAN),intent(in) :: plan
2827  real(C_DOUBLE),intent(inout) :: in(*)
2828  real(C_DOUBLE),intent(out) :: out(*)
2829 
2830 ! *************************************************************************
2831 
2832  call dfftw_execute_dft(plan, in, out)
2833 
2834 end subroutine fftw3_execute_dft_dp

m_fftw3/fftw3_execute_dft_dpc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_execute_dft_dpc

SOURCE

2874 #ifdef HAVE_FFTW3
2875 
2876 subroutine fftw3_execute_dft_dpc(plan, in, out)
2877 
2878 !Arguments ------------------------------------
2879 !scalars
2880  integer(KIND_FFTW_PLAN),intent(in) :: plan
2881  complex(C_DOUBLE_COMPLEX),intent(inout) :: in(*)
2882  complex(C_DOUBLE_COMPLEX),intent(out) :: out(*)
2883 
2884 ! *************************************************************************
2885 
2886  call dfftw_execute_dft(plan, in, out)
2887 
2888 end subroutine fftw3_execute_dft_dpc

m_fftw3/fftw3_execute_dft_spc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_execute_dft_spc

SOURCE

2847 #ifdef HAVE_FFTW3
2848 
2849 subroutine fftw3_execute_dft_spc(plan, in, out)
2850 
2851 !Arguments ------------------------------------
2852 !scalars
2853  integer(KIND_FFTW_PLAN),intent(in) :: plan
2854  complex(C_FLOAT_COMPLEX),intent(inout) :: in(*)
2855  complex(C_FLOAT_COMPLEX),intent(out) :: out(*)
2856 
2857 ! *************************************************************************
2858 
2859  call sfftw_execute_dft(plan, in, out)
2860 
2861 end subroutine fftw3_execute_dft_spc

m_fftw3/fftw3_fftpad_dp [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_fftpad_dp

FUNCTION

  This routine transforms wavefunctions using 3D zero-padded FFTs with FFTW3.
  The 3D ffts are computed only on lines and planes which have non zero elements.
  These lines and planes are defined by the two vectors do_fft_x(ldy*nz) and do_fft_y(nz)
  FFT transform is in-place.

INPUTS

   nx,ny,nz=Logical dimensions of the FFT mesh.
   ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts).
   ndat=Number of FFT transforms.
   mgfft=MAX(nx,ny,nz), only used to dimension gbound
   isign=The sign of the transform.
   gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point.
     See sphereboundary for more info.

SIDE EFFECTS

   ff(2*ldx*ldy*ldz*ndat)=
     input: The array with the data to be transformed.
     output: The results of the FFT.

SOURCE

2406 subroutine fftw3_fftpad_dp(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound, iscale)
2407 
2408 !Arguments ------------------------------------
2409 !scalars
2410  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign
2411 !arrays
2412  integer,intent(in) :: gbound(2*mgfft+8,2)
2413  real(dp),intent(inout) :: ff(2*ldx*ldy*ldz*ndat)
2414  integer,optional,intent(in) :: iscale
2415 
2416 !Local variables-------------------------------
2417 !scalars
2418 #ifdef HAVE_FFTW3
2419  integer,parameter :: dst=2
2420  integer :: iscale__
2421  real(dp) :: fact
2422 
2423 ! *************************************************************************
2424 
2425  iscale__ = merge(1, 0, isign == -1); if (present(iscale)) iscale__ = iscale
2426 
2427 #include "fftw3_fftpad.finc"
2428 
2429 #else
2430  ABI_ERROR("FFTW3 support not activated")
2431  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,mgfft,isign/))
2432  ABI_UNUSED(gbound(1,1))
2433  ABI_UNUSED(ff(1))
2434 #endif
2435 
2436 end subroutine fftw3_fftpad_dp

m_fftw3/fftw3_fftpad_dpc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_fftpad_dpc

FUNCTION

  This routine transforms wavefunctions using 3D zero-padded FFTs with FFTW3.
  The 3D ffts are computed only on lines and planes which have non zero elements.
  These lines and planes are defined by the two vectors do_fft_x(ldy*nz) and do_fft_y(nz)
  FFT transform is in-place. Target: complex arrays.

INPUTS

   nx,ny,nz=Logical dimensions of the FFT mesh.
   ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts).
   ndat=Number of FFT transforms.
   mgfft=MAX(nx,ny,nz), only used to dimension gbound.
   isign=The sign of the transform.
   gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point.
     See sphereboundary for more info.

SIDE EFFECTS

  ff(ldx*ldy*ldz*ndat)=
    input: The array with the data to be transformed.
    output: The results of the FFT.

SOURCE

2467 subroutine fftw3_fftpad_dpc(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound, iscale)
2468 
2469 !Arguments ------------------------------------
2470 !scalars
2471  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign
2472  integer,optional,intent(in) :: iscale
2473 !arrays
2474  integer,intent(in) :: gbound(2*mgfft+8,2)
2475  complex(dpc),intent(inout) :: ff(ldx*ldy*ldz*ndat)
2476 
2477 #ifdef HAVE_FFTW3
2478 !Local variables-------------------------------
2479  integer,parameter :: dst=1
2480  integer :: iscale__
2481  real(dp) :: fact
2482 
2483 ! *************************************************************************
2484 
2485  iscale__ = merge(1, 0, isign == -1); if (present(iscale)) iscale__ = iscale
2486 
2487 #include "fftw3_fftpad.finc"
2488 
2489 #else
2490  ABI_ERROR("FFTW3 support not activated")
2491  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign/))
2492  ABI_UNUSED(gbound(1,1))
2493  ABI_UNUSED(ff(1))
2494 #endif
2495 
2496 end subroutine fftw3_fftpad_dpc

m_fftw3/fftw3_fftpad_spc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_fftpad_spc

FUNCTION

  This routine transforms wavefunctions using 3D zero-padded FFTs with FFTW3.
  The 3D ffts are computed only on lines and planes which have non zero elements.
  These lines and planes are defined by the two vectors do_fft_x(ldy*nz) and do_fft_y(nz)
  FFT transform is in-place. Target: complex arrays.

INPUTS

   nx,ny,nz=Logical dimensions of the FFT mesh.
   ldx,ldy,ldz=Physical dimension of the f array (to avoid cache conflicts).
   ndat=Number of FFT transforms.
   mgfft=MAX(nx,ny,nz), only used to dimension gbound.
   isign=The sign of the transform.
   gbound(2*mgfft+8,2)= The boundaries of the basis sphere of G vectors at a given k-point.
     See sphereboundary for more info.

SIDE EFFECTS

  ff(ldx*ldy*ldz*ndat)=
    input: The array with the data to be transformed.
    output: The results of the FFT.

SOURCE

1504 subroutine fftw3_fftpad_spc(ff, nx, ny, nz, ldx, ldy, ldz, ndat, mgfft, isign, gbound, iscale)
1505 
1506 !Arguments ------------------------------------
1507 !scalars
1508  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign
1509 !arrays
1510  integer,intent(in) :: gbound(2*mgfft+8,2)
1511  complex(spc),intent(inout) :: ff(ldx*ldy*ldz*ndat)
1512  integer,optional,intent(in) :: iscale
1513 
1514 #ifdef HAVE_FFTW3
1515 !Local variables-------------------------------
1516  integer,parameter :: dst=1
1517  integer :: iscale__
1518  real(sp) :: fact
1519 
1520 ! *************************************************************************
1521 
1522  iscale__ = merge(1, 0, isign == -1); if (present(iscale)) iscale__ = iscale
1523 
1524 #include "fftw3_fftpad.finc"
1525 
1526 #else
1527  ABI_ERROR("FFTW3 support not activated")
1528  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,isign/))
1529  ABI_UNUSED(gbound(1,1))
1530  ABI_UNUSED(ff(1))
1531 #endif
1532 
1533 end subroutine fftw3_fftpad_spc

m_fftw3/fftw3_fftrisc_dp [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_fftrisc_dp

FUNCTION

 Carry out Fourier transforms between real and reciprocal (G) space,
 for wavefunctions, contained in a sphere in reciprocal space,
 in both directions. Also accomplish some post-processing.

NOTES

 Specifically uses rather sophisticated algorithms, based on S Goedecker
 routines, specialized for superscalar RISC architecture.
 Zero padding : saves 7/12 execution time
 Bi-dimensional data locality in most of the routine : cache reuse
 For k-point (0 0 0) : takes advantage of symmetry of data.
 Note however that no blocking is used, in both 1D z-transform
 or subsequent 2D transform. This should be improved.

INPUTS

  cplex= if 1 , denpot is real, if 2 , denpot is complex
     (cplex=2 only allowed for option=2 when istwf_k=1)
     one can also use cplex=0 if option=0 or option=3
  fofgin(2,npwin)=holds input wavefunction in G vector basis sphere.
  gboundin(2*mgfft+8,2)=sphere boundary info for reciprocal to real space
  gboundout(2*mgfft+8,2)=sphere boundary info for real to reciprocal space
  istwf_k=option parameter that describes the storage of wfs
  kg_kin(3,npwin)=reduced planewave coordinates, input
  kg_kout(3,npwout)=reduced planewave coordinates, output
  mgfft=maximum size of 1D FFTs
  ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
  npwin=number of elements in fofgin array (for option 0, 1 and 2)
  npwout=number of elements in fofgout array (for option 2 and 3)
  ldx,ldy,ldz=ngfft(4),ngfft(5),ngfft(6), dimensions of fofr.
  option= if 0: do direct FFT
          if 1: do direct FFT, then sum the density
          if 2: do direct FFT, multiply by the potential, then do reverse FFT
          if 3: do reverse FFT only
  weight=weight to be used for the accumulation of the density in real space
          (needed only when option=1)

OUTPUT

  (see side effects)

OPTIONS

  The different options are:
  - reciprocal to real space and output the result (when option=0),
  - reciprocal to real space and accumulate the density (when option=1) or
  - reciprocal to real space, apply the local potential to the wavefunction
    in real space and produce the result in reciprocal space (when option=2)
  - real space to reciprocal space (when option=3).
  option=0 IS NOT ALLOWED when istwf_k>2
  option=3 IS NOT ALLOWED when istwf_k>=2

SIDE EFFECTS

  for option==0, fofgin(2,npwin)=holds input wavefunction in G sphere;
                 fofr(2,ldx,ldy,ldz) contains the Fourier Transform of fofgin;
                 no use of denpot, fofgout and npwout.
  for option==1, fofgin(2,npwin)=holds input wavefunction in G sphere;
                 denpot(cplex*ldx,ldy,ldz) contains the input density at input,
                 and the updated density at output;
                 no use of fofgout and npwout.
  for option==2, fofgin(2,npwin)=holds input wavefunction in G sphere;
                 denpot(cplex*ldx,ldy,ldz) contains the input local potential;
                 fofgout(2,npwout) contains the output function;
  for option==3, fofr(2,ldx,ldy,ldz) contains the real space wavefunction;
                 fofgout(2,npwout) contains its Fourier transform;
                 no use of fofgin and npwin.

SOURCE

892 subroutine fftw3_fftrisc_dp(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
893                             mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option, &
894                             weight_r, weight_i, abi_convention, iscale)
895 
896 !Arguments ------------------------------------
897 !scalars
898  integer,intent(in) :: cplex,istwf_k,mgfft,ldx,ldy,ldz,npwin,npwout,option
899  real(dp),intent(in) :: weight_r,weight_i
900 !arrays
901  integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2)
902  integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18)
903  real(dp),intent(in) :: fofgin(2,npwin)
904  real(dp),intent(inout) :: denpot(cplex*ldx,ldy,ldz),fofr(2,ldx*ldy*ldz)
905  real(dp),intent(inout) :: fofgout(2,npwout)
906  logical,optional,intent(in) :: abi_convention
907  integer,optional,intent(in) :: iscale
908 
909 ! *************************************************************************
910 
911 #ifdef HAVE_FFTW3
912 
913 #undef  FFT_PRECISION
914 #undef  MYKIND
915 #undef  MYCZERO
916 #undef  MYCMPLX
917 #undef  MYCONJG
918 
919 #define FFT_PRECISION FFT_DOUBLE
920 #define MYKIND DPC
921 #define MYCZERO (0._dp,0._dp)
922 #define MYCMPLX  DCMPLX
923 #define MYCONJG  DCONJG
924 
925 #include "fftw3_fftrisc.finc"
926 
927 #else
928  ABI_ERROR("FFTW3 support not activated")
929  ABI_UNUSED((/cplex,gboundin(1,1),gboundout(1,1),istwf_k,kg_kin(1,1),kg_kout(1,1)/))
930  ABI_UNUSED((/mgfft,ngfft(1),npwin,npwout,ldx,ldy,ldz,option/))
931  ABI_UNUSED((/denpot(1,1,1),fofgin(1,1),fofgout(1,1),fofr(1,1),weight_r,weight_i/))
932 #endif
933 
934 end subroutine fftw3_fftrisc_dp

m_fftw3/fftw3_fftrisc_mixprec [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_fftrisc_mixprec

FUNCTION

  Mixed precision version of fftrisc: input/output in dp, computation done in sp.
  See fftw3_fftrisc_dp for API docs.

SOURCE

949 subroutine fftw3_fftrisc_mixprec(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
950                                  mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option, &
951                                  weight_r,weight_i, abi_convention, iscale) ! optional
952 
953 !Arguments ------------------------------------
954 !scalars
955  integer,intent(in) :: cplex,istwf_k,mgfft,ldx,ldy,ldz,npwin,npwout,option
956  real(dp),intent(in) :: weight_r,weight_i
957 !arrays
958  integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2)
959  integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18)
960  real(dp),intent(in) :: fofgin(2,npwin)
961  real(dp),intent(inout) :: denpot(cplex*ldx,ldy,ldz),fofr(2,ldx*ldy*ldz)
962  real(dp),intent(inout) :: fofgout(2,npwout)
963  logical,optional,intent(in) :: abi_convention
964  integer,optional,intent(in) :: iscale
965 
966 ! *************************************************************************
967 
968 #ifdef HAVE_FFTW3
969 
970 #undef  FFT_PRECISION
971 #undef  MYKIND
972 #undef  MYCZERO
973 #undef  MYCMPLX
974 #undef  MYCONJG
975 
976 #define FFT_PRECISION FFT_MIXPREC
977 #define MYKIND SPC
978 #define MYCZERO (0._sp,0._sp)
979 #define MYCMPLX  CMPLX
980 #define MYCONJG  CONJG
981 
982 #include "fftw3_fftrisc.finc"
983 
984 #else
985  ABI_ERROR("FFTW3 support not activated")
986  ABI_UNUSED((/cplex,gboundin(1,1),gboundout(1,1),istwf_k,kg_kin(1,1),kg_kout(1,1)/))
987  ABI_UNUSED((/mgfft,ngfft(1),npwin,npwout,ldx,ldy,ldz,option/))
988  ABI_UNUSED((/denpot(1,1,1),fofgin(1,1),fofgout(1,1),fofr(1,1),weight_r,weight_i/))
989 #endif
990 
991 end subroutine fftw3_fftrisc_mixprec

m_fftw3/fftw3_fftrisc_sp [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_fftrisc_sp

FUNCTION

 Carry out Fourier transforms between real and reciprocal (G) space,
 for wavefunctions, contained in a sphere in reciprocal space,
 in both directions. Also accomplish some post-processing.
 See fftw3_fftrisc_dp for API doc.

SOURCE

773 subroutine fftw3_fftrisc_sp(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
774                             mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option, &
775                             weight_r,weight_i, abi_convention, iscale)
776 
777 !Arguments ------------------------------------
778 !scalars
779  integer,intent(in) :: cplex,istwf_k,mgfft,ldx,ldy,ldz,npwin,npwout,option
780  real(dp),intent(in) :: weight_i,weight_r
781 !arrays
782  integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2)
783  integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18)
784  real(sp),intent(in) :: fofgin(2,npwin)
785  real(dp),intent(inout) :: denpot(cplex*ldx,ldy,ldz)
786  real(sp),intent(inout) :: fofr(2,ldx*ldy*ldz)
787  real(sp),intent(inout) :: fofgout(2,npwout)
788  logical,optional,intent(in) :: abi_convention
789  integer,optional,intent(in) :: iscale
790 
791 ! *************************************************************************
792 
793 #ifdef HAVE_FFTW3
794 
795 #undef  FFT_PRECISION
796 #undef  MYKIND
797 #undef  MYCZERO
798 #undef  MYCMPLX
799 #undef  MYCONJG
800 
801 #define FFT_PRECISION FFT_SINGLE
802 #define MYKIND SPC
803 #define MYCZERO (0._sp,0._sp)
804 #define MYCMPLX  CMPLX
805 #define MYCONJG  CONJG
806 
807 #include "fftw3_fftrisc.finc"
808 
809 #else
810  ABI_ERROR("FFTW3 support not activated")
811  ABI_UNUSED((/cplex,gboundin(1,1),gboundout(1,1),istwf_k,kg_kin(1,1),kg_kout(1,1)/))
812  ABI_UNUSED((/mgfft,ngfft(1),npwin,npwout,ldx,ldy,ldz,option/))
813  ABI_UNUSED((/denpot(1,1,1),weight_r,weight_i/))
814  ABI_UNUSED((/fofgin(1,1),fofgout(1,1),fofr(1,1)/))
815 #endif
816 
817 end subroutine fftw3_fftrisc_sp

m_fftw3/fftw3_fftug_dp [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_fftug_dp

FUNCTION

 Compute ndat zero-padded FFTs from G to R space.
 Mainly used for the transform of wavefunctions.
 TARGET: dp arrays with real and imaginary part

INPUTS

 fftalg=FFT algorith (see input variable)
 fftcache=size of the cache (kB)
 npw_k=number of plane waves for this k-point.
 nx,ny,nz=Number of point along the three directions.
 ldx,ldy,ldz=Leading dimensions of the array.
 ndat=Number of transforms
 istwf_k=Option describing the storage of the wavefunction.
 mgfft=Max number of FFT divisions (used to dimension gbound)
 kg_k(3,npw_k)=G-vectors in reduced coordinates
 gbound(2*mgfft+8,2)=Table for zero-padded FFT. See sphereboundary.
 ug(npw_k*ndat)=wavefunctions in reciprocal space.

OUTPUT

  ur(ldx*ldy*ldz*ndat)=wavefunctions in real space.

SOURCE

1023 subroutine fftw3_fftug_dp(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, &
1024                           istwf_k, mgfft, kg_k,gbound, ug, ur, &
1025                           isign, iscale)  ! optional
1026 
1027 !Arguments ------------------------------------
1028 !scalars
1029  integer,intent(in) :: fftalg,fftcache
1030  integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft
1031 !arrays
1032  integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k)
1033  real(dp),target,intent(in) :: ug(2*npw_k*ndat)
1034  real(dp),target,intent(inout) :: ur(2*ldx*ldy*ldz*ndat)
1035  integer,optional,intent(in) :: isign, iscale
1036 
1037 #ifdef HAVE_FFTW3
1038 !Local variables-------------------------------
1039 !scalars
1040  integer,parameter :: dist=2
1041  integer :: iscale__, isign__
1042  real(dp) :: fofgout(2,0)
1043  real(dp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:)
1044 
1045 ! *************************************************************************
1046 
1047  iscale__ = 0; if (present(iscale)) iscale__ = iscale
1048  isign__ = +1; if (present(isign)) isign__ = isign
1049 
1050 #undef TK_PREF
1051 #define TK_PREF(name) CONCAT(cg_,name)
1052 
1053 #undef  FFT_PRECISION
1054 #define FFT_PRECISION FFT_DOUBLE
1055 
1056 #include "fftug.finc"
1057 
1058 #undef  FFT_PRECISION
1059 
1060 #else
1061  ! Silence compiler warning
1062  ABI_ERROR("FFT_FFTW3 support not activated")
1063  ABI_UNUSED((/fftalg,fftcache,npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1),iscale,isign/))
1064  ABI_UNUSED((/ug(1),ur(1)/))
1065 #endif
1066 
1067 end subroutine fftw3_fftug_dp

m_fftw3/fftw3_fftug_dpc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_fftug_dpc

FUNCTION

 Compute ndat zero-padded FFTs.
 Mainly used for the transform of wavefunctions.
 TARGET: DPC arrays
 See fftw3_fftug_dp for API docs.

SOURCE

1146 subroutine fftw3_fftug_dpc(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, &
1147                            istwf_k, mgfft, kg_k, gbound, ug, ur, &
1148                            isign, iscale)  ! optional
1149 
1150 !Arguments ------------------------------------
1151 !scalars
1152  integer,intent(in) :: fftalg,fftcache
1153  integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft
1154 !arrays
1155  integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k)
1156  complex(dpc),target,intent(in) :: ug(npw_k*ndat)
1157  complex(dpc),target,intent(inout) :: ur(ldx*ldy*ldz*ndat)
1158  integer,optional,intent(in) :: isign, iscale
1159 
1160 #ifdef HAVE_FFTW3
1161 !Local variables-------------------------------
1162 !scalars
1163  integer,parameter :: dist=1
1164  integer :: iscale__, isign__
1165 !arrays
1166  real(dp) :: fofgout(2,0)
1167  real(dp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:)
1168 
1169 ! *************************************************************************
1170 
1171  iscale__ = 0; if (present(iscale)) iscale__ = iscale
1172  isign__ = +1; if (present(isign)) isign__ = isign
1173 
1174 #undef TK_PREF
1175 #define TK_PREF(name) CONCAT(cplx_,name)
1176 
1177 #undef  FFT_PRECISION
1178 #define FFT_PRECISION FFT_DOUBLE
1179 
1180 #include "fftug.finc"
1181 
1182 #undef  FFT_PRECISION
1183 
1184 #else
1185  ! Silence compiler warning
1186  ABI_ERROR("FFTW3 support not activated")
1187  ABI_UNUSED((/fftalg,fftcache,npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1)/))
1188  ABI_UNUSED((/ug(1),ur(1)/))
1189 #endif
1190 
1191 end subroutine fftw3_fftug_dpc

m_fftw3/fftw3_fftug_spc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_fftug_spc

FUNCTION

 Compute ndat zero-padded FFTs from G-->R.
 Mainly used for the transform of wavefunctions.
 TARGET: spc arrays
 See fftw3_fftug_dp for API docs.

SOURCE

1084 subroutine fftw3_fftug_spc(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, &
1085                            istwf_k, mgfft, kg_k, gbound, ug, ur, &
1086                            isign, iscale) ! optional
1087 
1088 !Arguments ------------------------------------
1089 !scalars
1090  integer,intent(in) :: fftalg,fftcache
1091  integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft
1092  integer,optional,intent(in) :: isign, iscale
1093 !arrays
1094  integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k)
1095  complex(spc),target,intent(in) :: ug(npw_k*ndat)
1096  complex(spc),target,intent(inout) :: ur(ldx*ldy*ldz*ndat)
1097 
1098 #ifdef HAVE_FFTW3
1099 !Local variables-------------------------------
1100 !scalars
1101  integer,parameter :: dist=1
1102  integer :: iscale__, isign__
1103 !arrays
1104  real(sp) :: fofgout(2,0)
1105  real(sp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:)
1106 
1107 ! *************************************************************************
1108 
1109  iscale__ = 0; if (present(iscale)) iscale__ = iscale
1110  isign__ = +1; if (present(isign)) isign__ = isign
1111 
1112 #undef TK_PREF
1113 #define TK_PREF(name) CONCAT(cplx_,name)
1114 
1115 #undef  FFT_PRECISION
1116 #define FFT_PRECISION FFT_SINGLE
1117 
1118 #include "fftug.finc"
1119 
1120 #undef  FFT_PRECISION
1121 
1122 #else
1123  ! Silence compiler warning
1124  ABI_ERROR("FFTW3 support not activated")
1125  ABI_UNUSED((/fftalg,fftcache,npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1)/))
1126  ABI_UNUSED((/ug(1),ur(1)/))
1127 #endif
1128 
1129 end subroutine fftw3_fftug_spc

m_fftw3/fftw3_fftur_dp [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_fftur_dp

FUNCTION

 Compute ndat zero-padded FFTs from R- to G-space .
 Mainly used for the transform of wavefunctions.
 TARGET: dp arrays

INPUTS

 fftalg=FFT algorith (see input variable)
 fftcache=size of the cache (kB)
 npw_k=number of plane waves for this k-point.
 nx,ny,nz=Number of point along the three directions.
 ldx,ldy,ldz=Leading dimensions of the array.
 ndat=Number of transforms
 istwf_k=Option describing the storage of the wavefunction.
 mgfft=Max number of FFT divisions (used to dimension gbound)
 kg_k(3,npw_k)=G-vectors in reduced coordinates
 gbound(2*mgfft+8,2)=Table for padded-FFT. See sphereboundary.

 SIDE EFFECT
 ur(ldx*ldy*ldz*ndat)= In input: wavefunctions in real space.
                       Destroyed in output. Do not use it anymore!

OUTPUT

 ug(npw_k*ndat)=wavefunctions in reciprocal space.

SOURCE

1225 subroutine fftw3_fftur_dp(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, &
1226                           istwf_k, mgfft, kg_k, gbound, ur, ug, &
1227                           isign, iscale) ! optional
1228 
1229 !Arguments ------------------------------------
1230 !scalars
1231  integer,intent(in) :: fftalg,fftcache
1232  integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft
1233  integer,optional,intent(in) :: isign, iscale
1234 !arrays
1235  integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k)
1236  real(dp),target,intent(inout) :: ur(2*ldx*ldy*ldz*ndat)
1237  real(dp),target,intent(inout) :: ug(2*npw_k*ndat)
1238 
1239 #ifdef HAVE_FFTW3
1240 !Local variables-------------------------------
1241 !scalars
1242  integer,parameter :: dist=2
1243  integer :: iscale__, isign__
1244 !arrays
1245  real(dp) :: dum_ugin(2,0)
1246  real(dp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:)
1247 
1248 ! *************************************************************************
1249 
1250  iscale__ = 1; if (present(iscale)) iscale__ = iscale
1251  isign__ = -1; if (present(isign)) isign__ = isign
1252 
1253 #undef TK_PREF
1254 #define TK_PREF(name) CONCAT(cg_,name)
1255 
1256 #undef  FFT_PRECISION
1257 #define FFT_PRECISION FFT_DOUBLE
1258 
1259 #include "fftur.finc"
1260 
1261 #undef  FFT_PRECISION
1262 
1263 #else
1264  ! Silence compiler warning
1265  ABI_ERROR("FFTW3 support not activated")
1266  ABI_UNUSED((/fftalg,fftcache/))
1267  ABI_UNUSED((/npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1)/))
1268  ABI_UNUSED((/ug(1),ur(1)/))
1269 #endif
1270 
1271 end subroutine fftw3_fftur_dp

m_fftw3/fftw3_fftur_dpc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_fftur_dpc

FUNCTION

 Compute ndat zero-padded FFTs from R ro G.
 Mainly used for the transform of wavefunctions.
 TARGET: DPC arrays
 See fftw3_fftur_dp for API doc.

SOURCE

1351 subroutine fftw3_fftur_dpc(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, &
1352                            istwf_k, mgfft, kg_k, gbound, ur, ug, &
1353                            isign, iscale) ! optional
1354 
1355 !Arguments ------------------------------------
1356 !scalars
1357  integer,intent(in) :: fftalg,fftcache
1358  integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft
1359  integer,optional,intent(in) :: isign, iscale
1360 !arrays
1361  integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k)
1362  complex(dpc),target,intent(inout) :: ur(ldx*ldy*ldz*ndat)
1363  complex(dpc),target,intent(inout) :: ug(npw_k*ndat)
1364 
1365 #ifdef HAVE_FFTW3
1366 !Local variables-------------------------------
1367 !scalars
1368  integer,parameter :: dist=1
1369  integer :: iscale__, isign__
1370 !arrays
1371  real(dp) :: dum_ugin(2,0)
1372  real(dp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:)
1373 
1374 ! *************************************************************************
1375 
1376  iscale__ = 1; if (present(iscale)) iscale__ = iscale
1377  isign__ = -1; if (present(isign)) isign__ = isign
1378 
1379 #undef TK_PREF
1380 #define TK_PREF(name) CONCAT(cplx_,name)
1381 
1382 #undef  FFT_PRECISION
1383 #define FFT_PRECISION FFT_DOUBLE
1384 
1385 #include "fftur.finc"
1386 
1387 #undef  FFT_PRECISION
1388 
1389 #else
1390  ! Silence compiler warning
1391  ABI_ERROR("FFTW3 support not activated")
1392  ABI_UNUSED((/fftalg,fftcache/))
1393  ABI_UNUSED((/npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1)/))
1394  ABI_UNUSED((/ug(1),ur(1)/))
1395 #endif
1396 
1397 end subroutine fftw3_fftur_dpc

m_fftw3/fftw3_fftur_spc [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_fftur_spc

FUNCTION

 Compute ndat zero-padded FFTs from R- to G-space .
 Mainly used for the transform of wavefunctions.
 TARGET: spc arrays
 See fftw3_fftur_dp for API doc.

SOURCE

1288 subroutine fftw3_fftur_spc(fftalg, fftcache, npw_k, nx, ny, nz, ldx, ldy, ldz, ndat, &
1289                            istwf_k, mgfft, kg_k, gbound, ur, ug, &
1290                            isign, iscale) ! optional
1291 
1292 !Arguments ------------------------------------
1293 !scalars
1294  integer,intent(in) :: fftalg,fftcache
1295  integer,intent(in) :: npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft
1296  integer,optional,intent(in) :: isign, iscale
1297 !arrays
1298  integer,intent(in) :: gbound(2*mgfft+8,2),kg_k(3,npw_k)
1299  complex(spc),target,intent(inout) :: ur(ldx*ldy*ldz*ndat)
1300  complex(spc),target,intent(inout) :: ug(npw_k*ndat)
1301 
1302 #ifdef HAVE_FFTW3
1303 !Local variables-------------------------------
1304 !scalars
1305  integer,parameter :: dist=1
1306  integer :: iscale__, isign__
1307 !arrays
1308  real(sp) :: dum_ugin(2,0)
1309  real(sp),ABI_CONTIGUOUS pointer :: real_ug(:,:),real_ur(:,:)
1310 
1311 ! *************************************************************************
1312 
1313  iscale__ = 1; if (present(iscale)) iscale__ = iscale
1314  isign__ = -1; if (present(isign)) isign__ = isign
1315 
1316 #undef TK_PREF
1317 #define TK_PREF(name) CONCAT(cplx_,name)
1318 
1319 #undef  FFT_PRECISION
1320 #define FFT_PRECISION FFT_SINGLE
1321 
1322 #include "fftur.finc"
1323 
1324 #undef  FFT_PRECISION
1325 
1326 #else
1327  ! Silence compiler warning
1328  ABI_ERROR("FFTW3 support not activated")
1329  ABI_UNUSED((/fftalg,fftcache/))
1330  ABI_UNUSED((/npw_k,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_k(1,1),gbound(1,1)/))
1331  ABI_UNUSED((/ug(1),ur(1)/))
1332 #endif
1333 
1334 end subroutine fftw3_fftur_spc

m_fftw3/fftw3_init_threads [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_init_threads

FUNCTION

  This function performs the one-time initialization required to use FFTW3 threads.
  It does nothing if HAVE_FFT_FFTW3_THREADS is not defined.

INPUTS

SIDE EFFECTS

  The one-time initialization required to use FFTW3 threads is performed when the routine
  is called for the first time.

SOURCE

2276 subroutine fftw3_init_threads()
2277 
2278 !Local variables ------------------------------
2279 !scalars
2280 #ifdef HAVE_FFTW3_THREADS
2281  integer :: iret
2282 #endif
2283 
2284 ! *************************************************************************
2285 
2286 #ifdef HAVE_FFTW3_THREADS
2287  if (THREADS_INITED==0) then
2288    !call wrtout(std_out,"Calling dfftw_init_threads()")
2289    call dfftw_init_threads(iret)
2290 
2291    if (iret==0) then
2292      ABI_WARNING(" dfftw_init_threads returned 0; threaded FFTW3 is not being used!")
2293    else
2294      THREADS_INITED=1
2295    end if
2296    call fftw3_set_nthreads()
2297  end if
2298 
2299 #ifndef HAVE_OPENMP
2300   ABI_WARNING("Using FFTW3 with threads but HAVE_OPENMP is not defined!")
2301 #endif
2302 #endif
2303 
2304 #ifdef HAVE_FFTW3_MPI
2305   !call wrtout(std_out,"Calling fftw_mpi_init()")
2306   call fftw_mpi_init()
2307 #endif
2308 
2309 end subroutine fftw3_init_threads

m_fftw3/fftw3_many_dft_ip [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_many_dft_ip

FUNCTION

 Driver routine for many in-place 3D complex-to-complex FFTs of lengths nx, ny, nz.

INPUTS

 nx,ny,nz=Number of points along the three directions.
 ldx,ldy,ldz=Physical dimension of the finout array (to avoid cache conflicts).
 ndat=Number of FFTs to be done.
 isign=sign of Fourier transform exponent: current convention uses
   +1 for transforming from G to r,
   -1 for transforming from r to G.
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

OUTPUT

 finout(2,ldx*ldy*ldz*ndat)=
   In input: The complex array to be transformed.
   In output: The FFT results.

SOURCE

2132 subroutine fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,isign,finout,fftw_flags)
2133 
2134 !Arguments ------------------------------------
2135 !scalars
2136  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,isign
2137  integer,optional,intent(in) :: fftw_flags
2138 !arrays
2139  real(dp),intent(inout) :: finout(2*ldx*ldy*ldz*ndat)
2140 
2141 #ifdef HAVE_FFTW3
2142 !Local variables-------------------------------
2143 !scalars
2144  integer,parameter :: rank3=3,nt_all=-1
2145  integer :: my_flags,dist,stride
2146  integer(KIND_FFTW_PLAN) :: my_plan
2147 !arrays
2148  integer :: embed(rank3),n(rank3)
2149 
2150 ! *************************************************************************
2151 
2152  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
2153 
2154  stride = 1
2155  dist   = ldx*ldy*ldz
2156  embed  = [ldx, ldy, ldz]
2157  n      = [nx, ny, nz]
2158 
2159  my_plan = fftw3_plan_many_dft(rank3, n, ndat, finout, embed, stride, dist, finout, embed, stride, dist, isign, my_flags, nt_all)
2160 
2161  ! Now perform the 3D FFT via FFTW.
2162  call dfftw_execute_dft(my_plan, finout, finout)
2163  call fftw3_destroy_plan(my_plan)
2164 
2165  ! -1, FFTW returns not normalized FTs
2166  if (isign == ABI_FFTW_FORWARD) then
2167   call ZDSCAL(ldx*ldy*ldz*ndat, one/(nx*ny*nz), finout, 1)
2168  end if
2169 
2170 #else
2171  ABI_ERROR("FFTW3 support not activated")
2172  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,isign/))
2173  if (PRESENT(fftw_flags)) then
2174    ABI_UNUSED(fftw_flags)
2175  end if
2176  ABI_UNUSED(finout(1))
2177 #endif
2178 
2179 end subroutine fftw3_many_dft_ip

m_fftw3/fftw3_many_dft_op [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_many_dft_op

FUNCTION

 Driver routine for many out-of-place 3D complex-to-complex FFTs of lengths nx, ny, nz.

INPUTS

 nx,ny,nz=Number of points along the three directions.
 ldx,ldy,ldz=Physical dimension of the fin and fout arrays (to avoid cache conflicts).
 ndat=Number of FFTs to be done.
 fin(2*ldx*ldy*ldz*ndat)=The complex array to be transformed.
 isign=sign of Fourier transform exponent: current convention uses
   +1 for transforming from G to r,
   -1 for transforming from r to G.
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

OUTPUT

 fout(2,ldx*ldy*ldz*ndat)=The Fourier transform of fin.

SOURCE

2053 subroutine fftw3_many_dft_op(nx,ny,nz,ldx,ldy,ldz,ndat,isign,fin,fout,fftw_flags)
2054 
2055 !Arguments ------------------------------------
2056 !scalars
2057  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,isign
2058  integer,optional,intent(in) :: fftw_flags
2059 !arrays
2060  real(dp),intent(in) :: fin(2*ldx*ldy*ldz*ndat)
2061  real(dp),intent(out) :: fout(2*ldx*ldy*ldz*ndat)
2062 
2063 #ifdef HAVE_FFTW3
2064 !Local variables-------------------------------
2065 !scalars
2066  integer,parameter :: rank3=3,nt_all=-1
2067  integer :: my_flags,dist,stride
2068  integer(KIND_FFTW_PLAN) :: my_plan
2069 !arrays
2070  integer :: embed(rank3),n(rank3)
2071 
2072 ! *************************************************************************
2073 
2074  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
2075 
2076  stride = 1
2077  dist   = ldx*ldy*ldz
2078  embed  = [ldx, ldy, ldz]
2079  n      = [nx, ny, nz]
2080 
2081  my_plan = fftw3_plan_many_dft(rank3, n, ndat, fin, embed, stride, dist, fout, embed, stride, dist, isign, my_flags, nt_all)
2082 
2083  ! Now perform the 3D FFT via FFTW.
2084  call dfftw_execute_dft(my_plan, fin, fout)
2085 
2086  call fftw3_destroy_plan(my_plan)
2087 
2088  ! -1, FFTW returns not normalized FTs
2089  if (isign == ABI_FFTW_FORWARD) then
2090   call ZDSCAL(ldx*ldy*ldz*ndat, one/(nx*ny*nz), fout, 1)
2091  end if
2092 
2093 #else
2094  ABI_ERROR("FFTW3 support not activated")
2095  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,isign/))
2096  if (PRESENT(fftw_flags)) then
2097    ABI_UNUSED(fftw_flags)
2098  end if
2099  ABI_UNUSED(fin(1))
2100  ABI_UNUSED(fout(1))
2101 #endif
2102 
2103 end subroutine fftw3_many_dft_op

m_fftw3/fftw3_mpiback [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_mpiback

FUNCTION

   CALCULATES THE DISCRETE FOURIER TRANSFORM  in parallel using MPI/OpenMP

   ZR(I1,I2,I3)= \sum_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2)

 Adopt standard convention that isign=1 for backward transform

 INPUTS:
    option= 1 if call from fourwf, 2 if call from other routine
    cplex=1 for real --> complex, 2 for complex --> complex
    ZF: input array in G-space (note the switch of i2 and i3)

         real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
         imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)

         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
 OUTPUTS:
    ZR: output array in R space.

         ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
         ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))

         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat

    nproc_fft: number of processors used as returned by MPI_COMM_SIZE
    me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
              The detailed table with allowed transform lengths can
              be found in subroutine CTRIG
    nd1,nd2,nd3: Dimension of ZF and ZR
    nd2proc=((nd2-1)/nproc_fft)+1 maximal number of 2nd dim slices
    nd3proc=((nd3-1)/nproc_fft)+1 maximal number of 3rd dim slices

 NOTES:
   The maximum number of processors that can reasonably be used is max(n2,n3)
   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

4664 subroutine fftw3_mpiback(cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,zf,zr,comm_fft)
4665 
4666 !Arguments ------------------------------------
4667 ! real space input
4668  integer,intent(in) :: cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,comm_fft
4669  real(dp),intent(in) :: zf(2,nd1,nd3,nd2proc,ndat)
4670  real(dp),intent(out) :: zr(2,nd1eff,nd2,nd3proc,ndat)
4671 
4672 !Local variables-------------------------------
4673 !scalaras
4674 #ifdef HAVE_FFTW3
4675  integer :: j,i1,idat,ierr,includelast,j2,j2st,j3,jeff,jp2st,lzt,nthreads
4676  integer :: ma,mb,n1dfft,n1eff,n2eff,n1zt,ncache,nnd3,nproc_fft,me_fft,lot1,lot2,lot3
4677  integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest
4678  integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest
4679  integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest
4680  character(len=500) :: msg
4681 !arrays
4682  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
4683  real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions
4684 
4685 ! *************************************************************************
4686 
4687  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
4688 
4689  ! find cache size that gives optimal performance on machine
4690  ncache=2*max(n1,n2,n3,1024)
4691 
4692  if (ncache/(2*max(n1,n2,n3))<1) then
4693    write(msg,'(5a)') &
4694 &    'ncache has to be enlarged to be able to hold at',ch10, &
4695 &    'least one 1-d FFT of each size even though this will',ch10,&
4696 &    'reduce the performance for shorter transform lengths'
4697    ABI_ERROR(msg)
4698  end if
4699 
4700 ! check input
4701  if (nd1<n1 .or. nd2<n2 .or. nd3<n3) then
4702    ABI_ERROR("nd1<n1 .or. nd2<n2 .or. nd3<n3")
4703  end if
4704 
4705  ! Effective n1 and n2 (complex-to-complex or real-to-complex)
4706  n1eff=n1; n2eff=n2; n1zt=n1
4707  if (cplex==1) then
4708    n1eff=(n1+1)/2; n2eff=n2/2+1 ; n1zt=2*(n1/2+1)
4709  end if
4710 
4711  lzt=n2eff
4712  if (mod(n2eff,2) == 0) lzt=lzt+1
4713  if (mod(n2eff,4) == 0) lzt=lzt+1
4714 
4715 ! maximal number of big box 3rd dim slices for all procs
4716  nnd3=nd3proc*nproc_fft
4717 
4718  ABI_MALLOC(zw,(2,ncache/2))
4719  ABI_MALLOC(zt,(2,lzt,n1zt))
4720  ABI_MALLOC(zmpi2,(2,n1,nd2proc,nnd3))
4721  if (nproc_fft>1)  then
4722    ABI_MALLOC(zmpi1,(2,n1,nd2proc,nnd3))
4723  end if
4724 
4725 !DEBUG
4726 ! write(std_out,'(a,3i4)' )'back,zf n1,n2,n3',n1,n2,n3
4727 ! write(std_out,'(a,3i4)' )'nd1,nd2,nd3proc',nd1,nd2,nd3proc
4728 ! write(std_out,'(a,3i4)' )'m1,m2,m3',m1,m2,m3
4729 ! write(std_out,'(a,3i4)' )'max1,max2,max3',max1,max2,max3
4730 ! write(std_out,'(a,3i4)' )'md1,md2proc,md3',md1,md2proc,md3
4731 ! write(std_out,'(a,3i4)' )'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt
4732 !ENDDEBUG
4733 
4734  ! Create plans.
4735  ! The prototype for sfftw_plan_many_dft is:
4736  ! sfftw_plan_many_dft(rank, n, howmany,
4737  !   fin,  iembed, istride, idist,
4738  !   fout, oembed, ostride, odist, isign, my_flags)
4739 
4740  lot3=ncache/(2*n3)
4741  lot1=ncache/(2*n1)
4742  lot2=ncache/(2*n2)
4743 
4744  nthreads = xomp_get_num_threads(open_parallel=.TRUE.)
4745  !nthreads = 1
4746 
4747  bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
4748 &    zw, [ncache/2], lot3, 1,                          &
4749 &    zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4750 
4751  if (mod(n1, lot3) /= 0) then
4752    bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(n1, lot3), &
4753 &      zw, [ncache/2], lot3, 1,                                    &
4754 &      zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4755  end if
4756 
4757  bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, &
4758 &    zw, [ncache/2],  lot1, 1,                         &
4759 &    zt, [lzt, n1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4760 
4761  if (mod(n2eff, lot1) /= 0) then
4762    bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(n2eff, lot1), &
4763 &      zw, [ncache/2], lot1, 1,                                       &
4764 &      zt, [lzt, n1zt],   lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4765  end if
4766 
4767  bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, &
4768 &    zw, [ncache/2], lot2, 1,                          &
4769 &    zr, [nd1eff,nd2,nd3proc,ndat], nd1eff, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4770 
4771  if (mod(n1eff, lot2) /= 0) then
4772    bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), &
4773 &      zw, [ncache/2], lot2, 1,                                      &
4774 &      zr, [nd1eff,nd2,nd3proc,ndat], nd1eff, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4775  end if
4776 
4777  do idat=1,ndat
4778    ! transform along z axis
4779    ! input: I1,I3,J2,(Jp2)
4780 
4781    do j2=1,nd2proc
4782      if (me_fft*nd2proc+j2 <= n2eff) then
4783 
4784        do i1=1,n1,lot3
4785          ma=i1
4786          mb=min(i1+(lot3-1),n1)
4787          n1dfft=mb-ma+1
4788 
4789          ! input:  G1,G3,G2,(Gp2)
4790          ! output: G1,R3,G2,(Gp2)
4791          call fill(nd1,nd3,lot3,n1dfft,n3,zf(1,i1,1,j2,idat),zw)
4792 
4793          if (n1dfft == lot3) then
4794            call dfftw_execute_dft(bw_plan3_lot, zw, zw)
4795          else
4796            call dfftw_execute_dft(bw_plan3_rest, zw, zw)
4797          end if
4798 
4799          ! input:  G1,R3,G2,(Gp2)
4800          ! output: G1,G2,R3,(Gp2)
4801          call scramble(i1,j2,lot3,n1dfft,n1,n3,nd2proc,nd3,zw,zmpi2)
4802        end do
4803      end if
4804    end do
4805 
4806    ! Interprocessor data transposition
4807    ! input:  G1,G2,R3,Rp3,(Gp2)
4808    ! output: G1,G2,G3,Gp2,(Rp3)
4809    if (nproc_fft>1) then
4810      call xmpi_alltoall(zmpi2,2*n1*nd2proc*nd3proc, &
4811 &                       zmpi1,2*n1*nd2proc*nd3proc,comm_fft,ierr)
4812    end if
4813 
4814    do j3=1,nd3proc
4815      if (me_fft*nd3proc+j3 <= n3) then
4816        Jp2st=1; J2st=1
4817 
4818        ! transform along x axis
4819        do j=1,n2eff,lot1
4820          ma=j
4821          mb=min(j+(lot1-1),n2eff)
4822          n1dfft=mb-ma+1
4823 
4824          ! input:  G1,G2,R3,Gp2,(Rp3)
4825          ! output: G2,G1,R3,Jp2,(Rp3)
4826          if (nproc_fft == 1) then
4827            call mpiswitch(j3,n1dfft,Jp2st,J2st,lot1,n1,nd2proc,nd3proc,nproc_fft,option,zmpi2,zw)
4828          else
4829            call mpiswitch(j3,n1dfft,Jp2st,J2st,lot1,n1,nd2proc,nd3proc,nproc_fft,option,zmpi1,zw)
4830          end if
4831 
4832          ! input:  G2,G1,R3,(Rp3)
4833          ! output: G2,R1,R3,(Rp3)
4834          if (n1dfft == lot1) then
4835            call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1))
4836          else
4837            call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1))
4838          end if
4839 
4840        end do
4841 
4842        ! transform along y axis
4843        do j=1,n1eff,lot2
4844          ma=j
4845          mb=min(j+(lot2-1),n1eff)
4846          n1dfft=mb-ma+1
4847          includelast=1
4848          if (cplex==1) then
4849           jeff=2*j-1
4850           includelast=1
4851           if (mb==n1eff .and. n1eff*2/=n1) includelast=0
4852          end if
4853 
4854          ! input:  G2,R1,R3,(Rp3)
4855          ! output: R1,G2,R3,(Rp3)
4856          if (cplex==2) then
4857            call switch(n1dfft,n2,lot2,n1,lzt,zt(1,1,j),zw)
4858          else
4859            call switchreal(includelast,n1dfft,n2,n2eff,lot2,n1zt,lzt,zt(1,1,jeff),zw)
4860          end if
4861 
4862          if (n1dfft == lot2) then
4863            call dfftw_execute_dft(bw_plan2_lot, zw, zr(1,j,1,j3,idat))
4864          else
4865            call dfftw_execute_dft(bw_plan2_rest, zw, zr(1,j,1,j3,idat))
4866          end if
4867        end do
4868        ! output: R1,R2,R3,(Rp3)
4869 
4870      end if
4871    end do
4872  end do ! idat
4873 
4874  call dfftw_destroy_plan(bw_plan3_lot)
4875  if (mod(n1, lot3) /= 0) then
4876    call dfftw_destroy_plan(bw_plan3_rest)
4877  end if
4878 
4879  call dfftw_destroy_plan(bw_plan1_lot)
4880  if (mod(n2eff, lot1) /= 0) then
4881    call dfftw_destroy_plan(bw_plan1_rest)
4882  end if
4883 
4884  call dfftw_destroy_plan(bw_plan2_lot)
4885  if (mod(n1eff, lot2) /= 0) then
4886    call dfftw_destroy_plan(bw_plan2_rest)
4887  end if
4888 
4889  ABI_FREE(zmpi2)
4890  ABI_FREE(zw)
4891  ABI_FREE(zt)
4892  if (nproc_fft>1)  then
4893    ABI_FREE(zmpi1)
4894  end if
4895 
4896 #else
4897  ABI_ERROR("FFTW3 support not activated")
4898  ABI_UNUSED((/cplex,ndat,n1,n2,n3,nd1,nd2,nd1eff,nd2proc,nd3proc,option,comm_fft/))
4899  ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/))
4900 #endif
4901 
4902 end subroutine fftw3_mpiback

m_fftw3/fftw3_mpiback_manywf [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_mpiback_manywf

FUNCTION

   Does multiple 3-dim backward FFTs from Fourier into real space
   Adopt standard convention that isign=1 for backward transform

   CALCULATES THE DISCRETE FOURIER TRANSFORM ZF(I1,I2,I3)=

   S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2)

   in parallel using MPI/OpenMP.

 INPUTS:
    cplexwf=1 if wavefunction is real, 2 if complex
    ndat=Number of wavefunctions to transform.
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
              The detailed table with allowed transform lengths can be found in subroutine CTRIG
    nd1,nd2,nd3: Leading Dimension of ZR
    nd3proc=((nd3-1)/nproc_fft)+1 maximal number of big box 3rd dim slices for one proc
    max1 is positive or zero; m1 >=max1+1
      i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
      then, if m1 > max1+1, one has min1=max1-m1+1 and
      i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
    max2 and max3 have a similar definition of range
    m1,m2,m3=Size of the box enclosing the G-sphere.
    md1,md2,md3: Dimension of ZF given on the **small** FFT box.
    md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc
    nproc_fft: number of processors used as returned by MPI_COMM_SIZE
    comm_fft=MPI communicator for the FFT.
    ZF: input array (note the switch of i2 and i3)
          real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
          imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)

 OUTPUTS
    ZR: output array
          ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
          ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
        i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat

NOTES

   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

6116 subroutine fftw3_mpiback_manywf(cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&
6117 &  max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zf,zr,comm_fft)
6118 
6119 !Arguments ------------------------------------
6120  integer,intent(in) :: cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc
6121  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft
6122  real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat)
6123  real(dp),intent(out) :: zr(2,nd1,nd2,nd3proc,ndat)
6124 
6125 #ifdef HAVE_FFTW3
6126 !Local variables-------------------------------
6127  integer,parameter :: nt1=1
6128  integer :: j,i1,i2,idat,ierr,includelast,nthreads
6129  integer :: ioption,j2,j3,j2st,jp2st,jeff,lzt,m1zt,ma,mb,n1dfft,nnd3
6130  integer :: lot1,lot2,lot3
6131  integer :: m2eff,ncache,n1eff,n1half,nproc_fft,me_fft
6132  integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest
6133  integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest
6134  integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest
6135  !type(C_PTR) :: zw_cptr,zt_cptr
6136  character(len=500) :: msg
6137 !arrays
6138  integer :: requests(ndat)
6139  real(dp) ABI_ASYNC, allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:)  ! work arrays for MPI
6140  real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions
6141  !real(dp),ABI_CONTIGUOUS pointer :: zw(:,:),zt(:,:,:)
6142 ! FFT work arrays
6143  real(dp) :: tsec(2)
6144 
6145 ! *************************************************************************
6146 
6147  !call wrtout(std_out,"mpiback with non-blocking IALLTOALL + FFTW3")
6148 
6149 
6150  ! FIXME must provide a default value but which one?
6151  ! ioption = 0
6152  ioption = 1
6153  !if (paral_kgb==1) ioption=1
6154 
6155  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
6156 
6157  ! Find cache size that gives optimal performance on machine
6158  ncache=2*max(n1,n2,n3,1024)
6159  if (ncache/(2*max(n1,n2,n3))<1) then
6160    write(msg,"(5a)") &
6161 &    'ncache has to be enlarged to be able to hold at',ch10, &
6162 &    'least one 1-d FFT of each size even though this will',ch10,&
6163 &    'reduce the performance for shorter transform lengths'
6164     ABI_ERROR(msg)
6165  end if
6166 
6167  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
6168  n1eff=n1; m2eff=m2; m1zt=n1
6169  if (cplexwf==1) then
6170    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
6171  end if
6172 
6173  lzt=m2eff
6174  if (mod(m2eff,2)==0) lzt=lzt+1
6175  if (mod(m2eff,4)==0) lzt=lzt+1
6176 
6177  ! maximal number of big box 3rd dim slices for all procs
6178  nnd3=nd3proc*nproc_fft
6179 
6180  ! Allocate cache work array and work arrays for MPI transpositions.
6181  ABI_MALLOC(zw,(2,ncache/2))
6182  ABI_MALLOC(zt,(2,lzt,m1zt))
6183 
6184  !call fftw3_alloc_real([2,ncache/2],zw_cptr,zw)
6185  !call fftw3_alloc_real([2,lzt,m1zt],zt_cptr,zt)
6186 
6187  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat))
6188  if (nproc_fft>1)  then
6189    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat))
6190  end if
6191 
6192  ! Create plans.
6193  ! The prototype for sfftw_plan_many_dft is:
6194  ! sfftw_plan_many_dft(rank, n, howmany,
6195  !   fin,  iembed, istride, idist,
6196  !   fout, oembed, ostride, odist, isign, my_flags)
6197 
6198  lot3=ncache/(2*n3)
6199  lot1=ncache/(2*n1)
6200  lot2=ncache/(2*n2)
6201 
6202  nthreads = xomp_get_num_threads(open_parallel=.TRUE.)
6203  !nthreads = 1
6204 
6205  bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
6206 &    zw, [ncache/2], lot3, 1,                          &
6207 &    zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6208 
6209  if (mod(m1, lot3) /= 0) then
6210    bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1, lot3), &
6211 &      zw, [ncache/2], lot3, 1,                                    &
6212 &      zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6213  end if
6214 
6215  bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, &
6216 &    zw, [ncache/2],  lot1, 1,                         &
6217 &    zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6218 
6219  if (mod(m2eff, lot1) /= 0) then
6220    bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2eff, lot1), &
6221 &      zw, [ncache/2],  lot1, 1,                                      &
6222 &      zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6223  end if
6224 
6225  bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, &
6226 &    zw, [ncache/2], lot2, 1,                          &
6227 &    zr, [nd1,nd2,nd3proc,ndat], nd1, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6228 
6229  if (mod(n1eff, lot2) /= 0) then
6230    bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), &
6231 &      zw, [ncache/2], lot2, 1,                                      &
6232 &      zr, [nd1,nd2,nd3proc,ndat], nd1, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
6233  end if
6234 
6235  do idat=1,ndat
6236     ! transform along z axis
6237     ! input: G1,G3,G2,(Gp2)
6238 
6239     ! Loop over the y planes treated by this node and trasform n1ddft G_z lines.
6240     do j2=1,md2proc
6241       ! if (me_fft*md2proc+j2<=m2eff) then !a faire plus tard
6242       do i1=1,m1,lot3
6243         ma=i1
6244         mb=min(i1+(lot3-1),m1)
6245         n1dfft=mb-ma+1
6246 
6247         ! zero-pad n1dfft G_z lines
6248         ! input:  G1,G3,G2,(Gp2)
6249         ! output: G1,R3,G2,(Gp2)
6250         call fill_cent(md1,md3,lot3,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw)
6251 
6252         ! Transform along z.
6253         if (n1dfft == lot3) then
6254           call dfftw_execute_dft(bw_plan3_lot, zw, zw)
6255         else
6256           call dfftw_execute_dft(bw_plan3_rest, zw, zw)
6257         end if
6258 
6259         ! Local rotation.
6260         ! input:  G1,R3,G2,(Gp2)
6261         ! output: G1,G2,R3,(Gp2)
6262         call scramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zw,zmpi2(:,:,:,:,idat))
6263       end do
6264     end do ! j2
6265 
6266     ! Interprocessor data transposition
6267     ! input:  G1,G2,R3,Rp3,(Gp2)
6268     ! output: G1,G2,R3,Gp2,(Rp3)
6269     if (nproc_fft>1) then
6270       call timab(543,1,tsec)
6271       call xmpi_ialltoall(zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc, &
6272 &                         zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat))
6273       call timab(543,2,tsec)
6274     end if
6275  end do
6276 
6277  do idat=1,ndat
6278     if (nproc_fft>1) call xmpi_wait(requests(idat),ierr)
6279     ! Loop over the z treated by this node.
6280     do j3=1,nd3proc
6281       if (me_fft*nd3proc+j3 <= n3) then
6282         Jp2st=1; J2st=1
6283 
6284         ! Loop over G_y in the small box.
6285         do j=1,m2eff,lot1
6286           ma=j
6287           mb=min(j+(lot1-1),m2eff)
6288           n1dfft=mb-ma+1
6289 
6290           ! Zero-pad input.
6291           ! input:  G1,G2,R3,JG2,(Rp3)
6292           ! output: G2,G1,R3,JG2,(Rp3)
6293           if (nproc_fft==1) then
6294             call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,&
6295 &             md2proc,nd3proc,nproc_fft,ioption,zmpi2(:,:,:,:,idat),zw,max2,m2,n2)
6296           else
6297             call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,&
6298 &             md2proc,nd3proc,nproc_fft,ioption,zmpi1(:,:,:,:,idat),zw,max2,m2,n2)
6299           end if
6300 
6301           ! Transform along x
6302           ! input:  G2,G1,R3,(Rp3)
6303           ! output: G2,R1,R3,(Rp3)
6304           if (n1dfft == lot1) then
6305             call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1))
6306           else
6307             call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1))
6308           end if
6309 
6310         end do ! j
6311 
6312         ! Transform along y axis (take into account c2c or c2r case).
6313         ! Must loop over the full box.
6314         do j=1,n1eff,lot2
6315           ma=j
6316           mb=min(j+(lot2-1),n1eff)
6317           n1dfft=mb-ma+1
6318           includelast=1
6319 
6320           if (cplexwf==1) then
6321             jeff=2*j-1
6322             if (mb==n1eff .and. n1eff*2/=n1) includelast=0
6323           end if
6324 
6325           ! Zero-pad the input.
6326           ! input:  G2,R1,R3,(Rp3)
6327           ! output: R1,G2,R3,(Rp3)
6328           if (cplexwf==2) then
6329             call switch_cent(n1dfft,max2,m2,n2,lot2,n1,lzt,zt(1,1,j),zw)
6330           else
6331             call switchreal_cent(includelast,n1dfft,max2,n2,lot2,m1zt,lzt,zt(1,1,jeff),zw)
6332           end if
6333 
6334           ! input:  R1,G2,R3,(Rp3)
6335           ! output: R1,R2,R3,(Rp3)
6336           if (n1dfft == lot2) then
6337             call dfftw_execute_dft(bw_plan2_lot, zw, zr(1,j,1,j3,idat))
6338           else
6339             call dfftw_execute_dft(bw_plan2_rest, zw, zr(1,j,1,j3,idat))
6340           end if
6341 
6342         end do
6343 
6344         ! Treat real wavefunctions.
6345         if (cplexwf==1) then
6346           n1half=n1/2
6347           ! If odd
6348           if (n1half*2/=n1) then
6349             do i2=1,n2
6350               zr(1,n1,i2,j3,idat)=zr(1,n1eff,i2,j3,idat)
6351               zr(2,n1,i2,j3,idat)=zero
6352             end do
6353           end if
6354           do i2=1,n2
6355             do i1=n1half,1,-1
6356               zr(1,2*i1-1,i2,j3,idat)=zr(1,i1,i2,j3,idat)
6357               zr(1,2*i1  ,i2,j3,idat)=zr(2,i1,i2,j3,idat)
6358               zr(2,2*i1-1,i2,j3,idat)=zero
6359               zr(2,2*i1  ,i2,j3,idat)=zero
6360             end do
6361           end do
6362         end if
6363 
6364       end if
6365 
6366    end do ! j3
6367  end do ! idat
6368 
6369  call dfftw_destroy_plan(bw_plan3_lot)
6370  if (mod(m1, lot3) /= 0) then
6371    call dfftw_destroy_plan(bw_plan3_rest)
6372  end if
6373 
6374  call dfftw_destroy_plan(bw_plan1_lot)
6375  if (mod(m2eff, lot1) /= 0) then
6376    call dfftw_destroy_plan(bw_plan1_rest)
6377  end if
6378 
6379  call dfftw_destroy_plan(bw_plan2_lot)
6380  if (mod(n1eff, lot2) /= 0) then
6381    call dfftw_destroy_plan(bw_plan2_rest)
6382  end if
6383 
6384  ABI_FREE(zmpi2)
6385  ABI_FREE(zw)
6386  ABI_FREE(zt)
6387  if (nproc_fft>1)  then
6388    ABI_FREE(zmpi1)
6389  end if
6390 
6391 #else
6392  ABI_ERROR("FFTW3 support not activated")
6393  ABI_UNUSED((/cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc/))
6394  ABI_UNUSED((/ max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft/))
6395  ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/))
6396 #endif
6397 
6398 end subroutine fftw3_mpiback_manywf

m_fftw3/fftw3_mpiback_wf [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_mpiback_wf

FUNCTION

   Does multiple 3-dim backward FFTs from Fourier into real space
   Adopt standard convention that isign=1 for backward transform

   CALCULATES THE DISCRETE FOURIER TRANSFORM ZF(I1,I2,I3)=

   S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2)

   in parallel using MPI/OpenMP.

 INPUTS:
    cplexwf=1 if wavefunction is real, 2 if complex
    ndat=Number of wavefunctions to transform.
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
              The detailed table with allowed transform lengths can be found in subroutine CTRIG
    nd1,nd2,nd3: Leading Dimension of ZR
    nd3proc=((nd3-1)/nproc_fft)+1 maximal number of big box 3rd dim slices for one proc
    max1 is positive or zero; m1 >=max1+1
      i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
      then, if m1 > max1+1, one has min1=max1-m1+1 and
      i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
    max2 and max3 have a similar definition of range
    m1,m2,m3=Size of the box enclosing the G-sphere.
    md1,md2,md3: Dimension of ZF given on the **small** FFT box.
    md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc
    nproc_fft: number of processors used as returned by MPI_COMM_SIZE
    comm_fft=MPI communicator for the FFT.
    ZF: input array (note the switch of i2 and i3)
          real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
          imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)

 OUTPUTS
    ZR: output array
          ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
          ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
        i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat

NOTES

   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

3970 subroutine fftw3_mpiback_wf(cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&
3971 &  max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zf,zr,comm_fft)
3972 
3973 !Arguments ------------------------------------
3974  integer,intent(in) :: cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc
3975  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft
3976  real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat)
3977  real(dp),intent(out) :: zr(2,nd1,nd2,nd3proc,ndat)
3978 
3979 #ifdef HAVE_FFTW3
3980 !Local variables-------------------------------
3981  integer,parameter :: nt1=1
3982  integer :: j,i1,i2,idat,ierr,includelast
3983  integer :: ioption,j2,j3,j2st,jp2st,jeff,lzt,m1zt,ma,mb,n1dfft,nnd3
3984  integer :: lot1,lot2,lot3
3985  integer :: m2eff,ncache,n1eff,n1half,nproc_fft,me_fft,nthreads
3986  integer(KIND_FFTW_PLAN) :: bw_plan1_lot,bw_plan1_rest
3987  integer(KIND_FFTW_PLAN) :: bw_plan2_lot,bw_plan2_rest
3988  integer(KIND_FFTW_PLAN) :: bw_plan3_lot,bw_plan3_rest
3989  !type(C_PTR) :: zw_cptr,zt_cptr
3990  character(len=500) :: msg
3991 !arrays
3992  real(dp),allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:)  ! work arrays for MPI
3993  real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions
3994  !real(dp),ABI_CONTIGUOUS pointer :: zw(:,:),zt(:,:,:)
3995 ! FFT work arrays
3996  real(dp) :: tsec(2)
3997 
3998 ! *************************************************************************
3999 
4000  !call wrtout(std_out,"mpiback standard ALLTOALL + FFTW3")
4001 
4002  ! FIXME must provide a default value but which one?
4003  ! ioption = 0
4004  ioption = 1
4005  !if (paral_kgb==1) ioption=1
4006 
4007  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
4008 
4009  ! Find cache size that gives optimal performance on machine
4010  ncache=2*max(n1,n2,n3,1024)
4011  if (ncache/(2*max(n1,n2,n3))<1) then
4012    write(msg,"(5a)") &
4013 &    'ncache has to be enlarged to be able to hold at',ch10, &
4014 &    'least one 1-d FFT of each size even though this will',ch10,&
4015 &    'reduce the performance for shorter transform lengths'
4016     ABI_ERROR(msg)
4017  end if
4018 
4019  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
4020  n1eff=n1; m2eff=m2; m1zt=n1
4021  if (cplexwf==1) then
4022    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
4023  end if
4024 
4025  lzt=m2eff
4026  if (mod(m2eff,2)==0) lzt=lzt+1
4027  if (mod(m2eff,4)==0) lzt=lzt+1
4028 
4029  ! maximal number of big box 3rd dim slices for all procs
4030  nnd3=nd3proc*nproc_fft
4031 
4032  ! Allocate cache work array and work arrays for MPI transpositions.
4033  ABI_MALLOC(zw,(2,ncache/2))
4034  ABI_MALLOC(zt,(2,lzt,m1zt))
4035 
4036  !call fftw3_alloc_real([2,ncache/2],zw_cptr,zw)
4037  !call fftw3_alloc_real([2,lzt,m1zt],zt_cptr,zt)
4038 
4039  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3))
4040  if (nproc_fft>1)  then
4041    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3))
4042  end if
4043 
4044 !DEBUG
4045 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': fftw3_mpiback_wf,zf n1,n2,n3',n1,n2,n3
4046 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': nd1,nd2,nd3proc',nd1,nd2,nd3proc
4047 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': m1,m2,m3',m1,m2,m3
4048 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': max1,max2,max3',max1,max2,max3
4049 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': md1,md2proc,md3',md1,md2proc,md3
4050 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt
4051 !ENDDEBUG
4052 
4053  ! Create plans.
4054  ! The prototype for sfftw_plan_many_dft is:
4055  ! sfftw_plan_many_dft(rank, n, howmany,
4056  !   fin,  iembed, istride, idist,
4057  !   fout, oembed, ostride, odist, isign, my_flags)
4058 
4059  lot3=ncache/(2*n3)
4060  lot1=ncache/(2*n1)
4061  lot2=ncache/(2*n2)
4062 
4063  nthreads = xomp_get_num_threads(open_parallel=.TRUE.)
4064  !nthreads = 1
4065 
4066  bw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
4067 &    zw, [ncache/2], lot3, 1,                          &
4068 &    zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4069 
4070  if (mod(m1, lot3) /= 0) then
4071    bw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1, lot3), &
4072 &      zw, [ncache/2], lot3, 1,                                    &
4073 &      zw, [ncache/2], lot3, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4074  end if
4075 
4076  bw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, &
4077 &    zw, [ncache/2],  lot1, 1,                         &
4078 &    zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4079 
4080  if (mod(m2eff, lot1) /= 0) then
4081    bw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2eff, lot1), &
4082 &      zw, [ncache/2],  lot1, 1,                                      &
4083 &      zt, [lzt, m1zt], lzt, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4084  end if
4085 
4086  bw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, &
4087 &    zw, [ncache/2], lot2, 1,                          &
4088 &    zr, [nd1,nd2,nd3proc,ndat], nd1, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4089 
4090  if (mod(n1eff, lot2) /= 0) then
4091    bw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), &
4092 &      zw, [ncache/2], lot2, 1,                                      &
4093 &      zr, [nd1,nd2,nd3proc,ndat], nd1, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
4094  end if
4095 
4096  do idat=1,ndat
4097     ! transform along z axis
4098     ! input: G1,G3,G2,(Gp2)
4099 
4100     ! Loop over the y planes treated by this node and trasform n1ddft G_z lines.
4101     do j2=1,md2proc
4102       ! if (me_fft*md2proc+j2<=m2eff) then !a faire plus tard
4103       do i1=1,m1,lot3
4104         ma=i1
4105         mb=min(i1+(lot3-1),m1)
4106         n1dfft=mb-ma+1
4107 
4108         ! zero-pad n1dfft G_z lines
4109         ! input:  G1,G3,G2,(Gp2)
4110         ! output: G1,R3,G2,(Gp2)
4111         call fill_cent(md1,md3,lot3,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw)
4112 
4113         ! Transform along z.
4114         if (n1dfft == lot3) then
4115           call dfftw_execute_dft(bw_plan3_lot, zw, zw)
4116         else
4117           call dfftw_execute_dft(bw_plan3_rest, zw, zw)
4118         end if
4119 
4120         ! Local rotation.
4121         ! input:  G1,R3,G2,(Gp2)
4122         ! output: G1,G2,R3,(Gp2)
4123         call scramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zw,zmpi2)
4124       end do
4125     end do ! j2
4126 
4127     ! Interprocessor data transposition
4128     ! input:  G1,G2,R3,Rp3,(Gp2)
4129     ! output: G1,G2,R3,Gp2,(Rp3)
4130     if (nproc_fft>1) then
4131       call timab(543,1,tsec)
4132       call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc, &
4133 &                        zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr)
4134       call timab(543,2,tsec)
4135     end if
4136 
4137     ! Loop over the z treated by this node.
4138     do j3=1,nd3proc
4139       if (me_fft*nd3proc+j3 <= n3) then
4140         Jp2st=1; J2st=1
4141 
4142         ! Loop over G_y in the small box.
4143         do j=1,m2eff,lot1
4144           ma=j
4145           mb=min(j+(lot1-1),m2eff)
4146           n1dfft=mb-ma+1
4147 
4148           ! Zero-pad input.
4149           ! input:  G1,G2,R3,JG2,(Rp3)
4150           ! output: G2,G1,R3,JG2,(Rp3)
4151           if (nproc_fft==1) then
4152             call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,&
4153 &             md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw,max2,m2,n2)
4154           else
4155             call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,&
4156 &             md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw,max2,m2,n2)
4157           end if
4158 
4159           ! Transform along x
4160           ! input:  G2,G1,R3,(Rp3)
4161           ! output: G2,R1,R3,(Rp3)
4162           if (n1dfft == lot1) then
4163             call dfftw_execute_dft(bw_plan1_lot, zw, zt(1,j,1))
4164           else
4165             call dfftw_execute_dft(bw_plan1_rest, zw, zt(1,j,1))
4166           end if
4167 
4168         end do ! j
4169 
4170         ! Transform along y axis (take into account c2c or c2r case).
4171         ! Must loop over the full box.
4172         do j=1,n1eff,lot2
4173           ma=j
4174           mb=min(j+(lot2-1),n1eff)
4175           n1dfft=mb-ma+1
4176           includelast=1
4177 
4178           if (cplexwf==1) then
4179             jeff=2*j-1
4180             if (mb==n1eff .and. n1eff*2/=n1) includelast=0
4181           end if
4182 
4183           ! Zero-pad the input.
4184           ! input:  G2,R1,R3,(Rp3)
4185           ! output: R1,G2,R3,(Rp3)
4186           if (cplexwf==2) then
4187             call switch_cent(n1dfft,max2,m2,n2,lot2,n1,lzt,zt(1,1,j),zw)
4188           else
4189             call switchreal_cent(includelast,n1dfft,max2,n2,lot2,m1zt,lzt,zt(1,1,jeff),zw)
4190           end if
4191 
4192           ! input:  R1,G2,R3,(Rp3)
4193           ! output: R1,R2,R3,(Rp3)
4194           if (n1dfft == lot2) then
4195             call dfftw_execute_dft(bw_plan2_lot, zw, zr(1,j,1,j3,idat))
4196           else
4197             call dfftw_execute_dft(bw_plan2_rest, zw, zr(1,j,1,j3,idat))
4198           end if
4199 
4200         end do
4201 
4202         ! Treat real wavefunctions.
4203         if (cplexwf==1) then
4204           n1half=n1/2
4205           ! If odd
4206           if (n1half*2/=n1) then
4207             do i2=1,n2
4208               zr(1,n1,i2,j3,idat)=zr(1,n1eff,i2,j3,idat)
4209               zr(2,n1,i2,j3,idat)=zero
4210             end do
4211           end if
4212           do i2=1,n2
4213             do i1=n1half,1,-1
4214               zr(1,2*i1-1,i2,j3,idat)=zr(1,i1,i2,j3,idat)
4215               zr(1,2*i1  ,i2,j3,idat)=zr(2,i1,i2,j3,idat)
4216               zr(2,2*i1-1,i2,j3,idat)=zero
4217               zr(2,2*i1  ,i2,j3,idat)=zero
4218             end do
4219           end do
4220         end if
4221 
4222       end if
4223    end do ! j3
4224  end do ! idat
4225 
4226  call dfftw_destroy_plan(bw_plan3_lot)
4227  if (mod(m1, lot3) /= 0) then
4228    call dfftw_destroy_plan(bw_plan3_rest)
4229  end if
4230 
4231  call dfftw_destroy_plan(bw_plan1_lot)
4232  if (mod(m2eff, lot1) /= 0) then
4233    call dfftw_destroy_plan(bw_plan1_rest)
4234  end if
4235 
4236  call dfftw_destroy_plan(bw_plan2_lot)
4237  if (mod(n1eff, lot2) /= 0) then
4238    call dfftw_destroy_plan(bw_plan2_rest)
4239  end if
4240 
4241  ABI_FREE(zmpi2)
4242  ABI_FREE(zw)
4243  ABI_FREE(zt)
4244  if (nproc_fft>1)  then
4245    ABI_FREE(zmpi1)
4246  end if
4247 
4248 #else
4249  ABI_ERROR("FFTW3 support not activated")
4250  ABI_UNUSED((/cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc/))
4251  ABI_UNUSED((/ max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft/))
4252  ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/))
4253 #endif
4254 
4255 end subroutine fftw3_mpiback_wf

m_fftw3/fftw3_mpiforw [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_mpiforw

FUNCTION

   Adopt standard convention that isign=-1 for forward transform
   CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)=
   S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3)
   in parallel using MPI/OpenMP and BLAS library calls.

INPUTS

    ZR: input array
         ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
         ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
 OUTPUTS
    ZF: output array (note the switch of i2 and i3)
         real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
         imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
    nproc_fft: number of processors used as returned by MPI_COMM_SIZE
    me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
     n1,n2,n3: logical dimension of the transform. As transform lengths
               most products of the prime factors 2,3,5 are allowed.
              The detailed table with allowed transform lengths can
              be found in subroutine CTRIG
     nd1,nd2,nd3: Dimension of ZR and ZF
    nd2proc=((nd2-1)/nproc_fft)+1 maximal number of 2nd dim slices
    nd3proc=((nd3-1)/nproc_fft)+1 maximal number of 3rd dim slices

NOTES

  SHOULD describe nd1eff
  SHOULD put cplex and nd1eff in OMP declarations
  SHOULD describe the change of value of nd2prod

  The maximum number of processors that can reasonably be used is max(n2,n3)

  It is very important to find the optimal
  value of NCACHE. NCACHE determines the size of the work array ZW, that
  has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
  The optimal value of ncache can easily be determined by numerical
  experimentation. A too large value of ncache leads to a dramatic
  and sudden decrease of performance, a too small value to a to a
  slow and less dramatic decrease of performance. If NCACHE is set
  to a value so small, that not even a single one dimensional transform
  can be done in the workarray zw, the program stops with an error message.

SOURCE

4957 subroutine fftw3_mpiforw(cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,zr,zf,comm_fft)
4958 
4959 !Arguments ------------------------------------
4960 !scalars
4961  integer,intent(in) :: cplex,comm_fft
4962  integer,intent(in) :: ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option
4963 !arrays
4964  real(dp),intent(in) :: zr(2,nd1eff,nd2,nd3proc,ndat)
4965  real(dp),intent(out) :: zf(2,nd1,nd3,nd2proc,ndat)
4966 
4967 !Local variables-------------------------------
4968 !scalars
4969 #ifdef HAVE_FFTW3
4970  integer :: j,i1,idat,ierr,j2,j2st,j3,jp2st,lzt,nthreads
4971  integer :: ma,mb,n1dfft,n1eff,n2eff,n1zt,ncache,nnd3,nproc_fft,me_fft,lot1,lot2,lot3
4972  integer(KIND_FFTW_PLAN) :: fw_plan1_lot,fw_plan1_rest
4973  integer(KIND_FFTW_PLAN) :: fw_plan2_lot,fw_plan2_rest
4974  integer(KIND_FFTW_PLAN) :: fw_plan3_lot,fw_plan3_rest
4975  character(len=500) :: msg
4976 !arrays
4977  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
4978  real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions
4979 
4980 ! *************************************************************************
4981 
4982  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
4983 
4984  ! find cache size that gives optimal performance on machine
4985  ncache=2*max(n1,n2,n3,1024)
4986  if (ncache/(2*max(n1,n2,n3))<1) then
4987    write(msg,'(5a)')&
4988 &     'ncache has to be enlarged to be able to hold at',ch10, &
4989 &     'least one 1-d FFT of each size even though this will',ch10,&
4990 &     'reduce the performance for shorter transform lengths'
4991    ABI_ERROR(msg)
4992  end if
4993 
4994  ! check input
4995  if (nd1<n1 .or. nd2<n2 .or. nd3<n3) then
4996    ABI_ERROR("forw: assertion error nd1<n1 .or. nd2<n2 .or. nd3<n3")
4997  end if
4998 
4999 !Effective n1 and n2 (complex-to-complex or real-to-complex)
5000  n1eff=n1; n2eff=n2; n1zt=n1
5001  if (cplex==1) then
5002    n1eff=(n1+1)/2; n2eff=n2/2+1; n1zt=2*(n1/2+1)
5003  end if
5004 
5005  lzt=n2eff
5006  if (mod(n2eff,2) == 0) lzt=lzt+1
5007  if (mod(n2eff,4) == 0) lzt=lzt+1
5008 
5009  ! maximal number of big box 3rd dim slices for all procs
5010  nnd3=nd3proc*nproc_fft
5011 
5012  ABI_MALLOC(zw,(2,ncache/2))
5013  ABI_MALLOC(zt,(2,lzt,n1zt))
5014  ABI_MALLOC(zmpi2,(2,n1,nd2proc,nnd3))
5015  if (nproc_fft>1)  then
5016    ABI_MALLOC(zmpi1,(2,n1,nd2proc,nnd3))
5017  end if
5018 
5019  ! Create plans.
5020  ! The prototype for sfftw_plan_many_dft is:
5021  ! sfftw_plan_many_dft(rank, n, howmany,
5022  !   fin,  iembed, istride, idist,
5023  !   fout, oembed, ostride, odist, isign, my_flags)
5024 
5025  lot1=ncache/(2*n1)
5026  lot2=ncache/(2*n2)
5027  lot3=ncache/(2*n3)
5028 
5029  nthreads = xomp_get_num_threads(open_parallel=.TRUE.)
5030  !nthreads = 1
5031 
5032  fw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
5033 &    zw, [ncache/2], lot3, 1,                          &
5034 &    zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5035 
5036  if (mod(n1, lot3) /= 0) then
5037    fw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(n1, lot3), &
5038 &    zw, [ncache/2], lot3, 1,                                      &
5039 &    zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5040  end if
5041 
5042  fw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, &
5043 &    zt, [lzt, n1zt],   lzt,  1,                       &
5044 &    zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5045 
5046  if (mod(n2eff, lot1) /= 0) then
5047    fw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(n2eff, lot1), &
5048 &    zt, [lzt, n1zt],   lzt, 1,                                       &
5049 &    zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5050  end if
5051 
5052  fw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, &
5053 &    zr, [nd1eff,nd2,nd3proc,ndat], nd1eff, 1,         &
5054 &    zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5055 
5056  if (mod(n1eff, lot2) /= 0) then
5057    fw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), &
5058 &    zr, [nd1eff,nd2,nd3proc,ndat], nd1eff, 1,                       &
5059 &    zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
5060  end if
5061 
5062  do idat=1,ndat
5063 
5064    do j3=1,nd3proc
5065      if (me_fft*(nd3proc)+j3 <= n3) then
5066        Jp2st=1; J2st=1
5067 
5068        ! transform along y axis
5069        ! input: R1,R2,R3,(Rp3)
5070        do j=1,n1eff,lot2
5071          ma=j
5072          mb=min(j+(lot2-1),n1eff)
5073          n1dfft=mb-ma+1
5074 
5075          if (n1dfft == lot2) then
5076            call dfftw_execute_dft(fw_plan2_lot,  zr(1,j,1,j3,idat), zw)
5077          else
5078            call dfftw_execute_dft(fw_plan2_rest, zr(1,j,1,j3,idat), zw)
5079          end if
5080 
5081          !  input: R1,G2,R3,(Rp3)
5082          ! output: G2,R1,R3,(Rp3)
5083          if (cplex==2) then
5084            call unswitch(n1dfft,n2,lot2,n1zt,lzt,zw,zt(1,1,j))
5085          else
5086            call unswitchreal(n1dfft,n2,n2eff,lot2,n1zt,lzt,zw,zt(1,1,2*j-1))
5087          end if
5088        end do
5089 
5090        ! transform along x axis
5091        ! input: G2,R1,R3,(Rp3)
5092        do j=1,n2eff,lot1
5093          ma=j
5094          mb=min(j+(lot1-1),n2eff)
5095          n1dfft=mb-ma+1
5096 
5097          if (n1dfft == lot1) then
5098            call dfftw_execute_dft(fw_plan1_lot,  zt(1,j,1), zw)
5099          else
5100            call dfftw_execute_dft(fw_plan1_rest, zt(1,j,1), zw)
5101          end if
5102 
5103          ! input:  G2,G1,R3,Gp2,(Rp3)
5104          ! output: G1,G2,R3,Gp2,(Rp3)
5105          ! write(std_out,*) 'J2st,Jp2st',J2st,Jp2st
5106          if (nproc_fft == 1) then
5107            call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot1,n1,nd2proc,nd3proc,nproc_fft,option,zw,zmpi2)
5108          else
5109            call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot1,n1,nd2proc,nd3proc,nproc_fft,option,zw,zmpi1)
5110          end if
5111        end do
5112 
5113      end if
5114    end do ! j3
5115 
5116    ! Interprocessor data transposition
5117    ! input:  G1,G2,R3,Gp2,(Rp3)
5118    ! output: G1,G2,R3,Rp3,(Gp2)
5119    if (nproc_fft>1) then
5120      call xmpi_alltoall(zmpi1,2*n1*nd2proc*nd3proc, &
5121 &                       zmpi2,2*n1*nd2proc*nd3proc,comm_fft,ierr)
5122    end if
5123 
5124    ! transform along z axis
5125    ! input: G1,G2,R3,(Gp2)
5126 
5127    do j2=1,nd2proc
5128      if (me_fft*(nd2proc)+j2 <= n2eff) then
5129        do i1=1,n1,lot3
5130          ma=i1
5131          mb=min(i1+(lot3-1),n1)
5132          n1dfft=mb-ma+1
5133 
5134          ! input:  G1,G2,R3,(Gp2)
5135          ! output: G1,R3,G2,(Gp2)
5136          call unscramble(i1,j2,lot3,n1dfft,n1,n3,nd2proc,nd3,zmpi2,zw)
5137 
5138          if (n1dfft == lot3) then
5139            call dfftw_execute_dft(fw_plan3_lot, zw, zw)
5140          else
5141            call dfftw_execute_dft(fw_plan3_rest, zw, zw)
5142          end if
5143 
5144          call unfill(nd1,nd3,lot3,n1dfft,n3,zw,zf(1,i1,1,j2,idat))
5145          ! output: G1,G3,G2,(Gp2)
5146        end do
5147      end if
5148    end do
5149 
5150  end do ! idat
5151 
5152  call dfftw_destroy_plan(fw_plan3_lot)
5153  if (mod(n1, lot3) /= 0) then
5154    call dfftw_destroy_plan(fw_plan3_rest)
5155  end if
5156 
5157  call dfftw_destroy_plan(fw_plan1_lot)
5158  if (mod(n2eff, lot1) /= 0) then
5159    call dfftw_destroy_plan(fw_plan1_rest)
5160  end if
5161 
5162  call dfftw_destroy_plan(fw_plan2_lot)
5163  if (mod(n1eff, lot2) /= 0) then
5164    call dfftw_destroy_plan(fw_plan2_rest)
5165  end if
5166 
5167  ABI_FREE(zmpi2)
5168  ABI_FREE(zw)
5169  ABI_FREE(zt)
5170  if (nproc_fft>1)  then
5171    ABI_FREE(zmpi1)
5172  end if
5173 
5174 #else
5175  ABI_ERROR("FFTW3 support not activated")
5176  ABI_UNUSED((/cplex,ndat,n1,n2,n3,nd1,nd2,nd1eff,nd2proc,nd3proc,option,comm_fft/))
5177  ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/))
5178 #endif
5179 
5180 end subroutine fftw3_mpiforw

m_fftw3/fftw3_mpiforw_manywf [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_mpiforw_manywf

FUNCTION

   Does multiple 3-dim backward FFTs from real into Fourier space
   Adopt standard convention that isign=-1 for forward transform
   CALCULATES THE DISCRETE FOURIERTRANSFORM

   ZF(I1,I3,I2)=S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3)

   in parallel using MPI/OpenMP.

 INPUT:
   ZR: input array
        ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
        ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
        i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
   NOTE that ZR is changed by the routine

   n1,n2,n3: logical dimension of the transform. As transform lengths
             most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG
   nd1,nd2,nd3: Dimension of ZR
   nd3proc=((nd3-1)/nproc_fft)+1  maximal number of big box 3rd dim slices for one proc

 OUTPUT:
   ZF: output array (note the switch of i2 and i3)
        real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
        imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
     i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
     then, if m1 > max1+1, one has min1=max1-m1+1 and
     i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
     i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF
   md2proc=((md2-1)/nproc_fft)+1  maximal number of small box 2nd dim slices for one proc
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc-1] rank of the processor in the FFT communicator.
   comm_fft=MPI communicator for parallel FFT.

NOTES

  The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

  It is very important to find the optimal
  value of NCACHE. NCACHE determines the size of the work array ZW, that
  has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
  The optimal value of ncache can easily be determined by numerical
  experimentation. A too large value of ncache leads to a dramatic
  and sudden decrease of performance, a too small value to a to a
  slow and less dramatic decrease of performance. If NCACHE is set
  to a value so small, that not even a single one dimensional transform
  can be done in the workarray zw, the program stops with an error message.

SOURCE

6462 subroutine fftw3_mpiforw_manywf(cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&
6463 &        max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zr,zf,comm_fft)
6464 
6465 !Arguments ------------------------------------
6466 !scalars
6467  integer,intent(in) :: cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc
6468  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft
6469 !arrays
6470  real(dp),intent(inout) :: zr(2,nd1,nd2,nd3proc,ndat)
6471  real(dp),intent(out) :: zf(2,md1,md3,md2proc,ndat)
6472 
6473 !Local variables-------------------------------
6474 !scalars
6475 #ifdef HAVE_FFTW3
6476  integer :: j,i1,i2,i3,idat,ierr,nproc_fft,me_fft
6477  integer :: ioption,j2,j3,j2st,jp2st,lot1,lot2,lot3,lzt,m1zt,ma,mb,n1dfft,nnd3
6478  integer :: m2eff,ncache,n1eff,n1half,i1inv,i2inv,i3inv,nthreads
6479  integer(KIND_FFTW_PLAN) :: fw_plan1_lot,fw_plan1_rest
6480  integer(KIND_FFTW_PLAN) :: fw_plan2_lot,fw_plan2_rest
6481  integer(KIND_FFTW_PLAN) :: fw_plan3_lot,fw_plan3_rest
6482  character(len=500) :: msg
6483 !arrays
6484  integer :: requests(ndat)
6485  real(dp) ABI_ASYNC, allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI
6486  real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions
6487 ! FFT work arrays
6488  real(dp) :: tsec(2)
6489 
6490 ! *************************************************************************
6491 
6492  ! FIXME must provide a default value but which one?
6493  !ioption = 0
6494  ioption = 1
6495  !if (paral_kgb==1) ioption=1
6496 
6497  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
6498 
6499  ! find cache size that gives optimal performance on machine
6500  ncache=2*max(n1,n2,n3,1024)
6501  !ncache=2*max(n1,n2,n3,16*1024)
6502 
6503  if (ncache/(2*max(n1,n2,n3))<1) then
6504    write(msg,'(5a)') &
6505 &    'ncache has to be enlarged to be able to hold at',ch10, &
6506 &    'least one 1-d FFT of each size even though this will',ch10,&
6507 &    'reduce the performance for shorter transform lengths'
6508    ABI_ERROR(msg)
6509  end if
6510 
6511  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
6512  n1eff=n1; m2eff=m2; m1zt=n1
6513  if (cplexwf==1) then
6514    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
6515  end if
6516 
6517  lzt=m2eff
6518  if (mod(m2eff,2)==0) lzt=lzt+1
6519  if (mod(m2eff,4)==0) lzt=lzt+1
6520 
6521  ! maximal number of big box 3rd dim slices for all procs
6522  nnd3=nd3proc*nproc_fft
6523 
6524  ABI_MALLOC(zw,(2,ncache/2))
6525  ABI_MALLOC(zt,(2,lzt,m1zt))
6526  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat))
6527  if (nproc_fft>1)  then
6528    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat))
6529  end if
6530 
6531  ! Create plans.
6532  ! The prototype for sfftw_plan_many_dft is:
6533  ! sfftw_plan_many_dft(rank, n, howmany,
6534  !   fin,  iembed, istride, idist,
6535  !   fout, oembed, ostride, odist, isign, my_flags)
6536 
6537  lot2=ncache/(2*n2)
6538  lot1=ncache/(2*n1)
6539  lot3=ncache/(2*n3)
6540 
6541  nthreads = xomp_get_num_threads(open_parallel=.TRUE.)
6542  !nthreads = 1
6543 
6544  fw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
6545 &    zw, [ncache/2], lot3, 1,                          &
6546 &    zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6547 
6548  if (mod(m1, lot3) /= 0) then
6549    fw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1, lot3), &
6550 &    zw, [ncache/2], lot3, 1,                                      &
6551 &    zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6552  end if
6553 
6554  fw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, &
6555 &    zt, [lzt, m1zt],   lzt,  1,                       &
6556 &    zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6557 
6558  if (mod(m2eff, lot1) /= 0) then
6559    fw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2eff, lot1), &
6560 &    zt, [lzt, m1zt],   lzt, 1,                                       &
6561 &    zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6562  end if
6563 
6564  fw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, &
6565 &    zr, [nd1,nd2,nd3proc,ndat], nd1, 1,               &
6566 &    zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6567 
6568  if (mod(n1eff, lot2) /= 0) then
6569    fw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), &
6570 &    zr, [nd1,nd2,nd3proc,ndat], nd1, 1,                             &
6571 &    zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
6572  end if
6573 
6574  do idat=1,ndat
6575    ! Loop over the z-planes treated by this node
6576    do j3=1,nd3proc
6577 
6578      if (me_fft*nd3proc+j3 <= n3) then
6579        Jp2st=1
6580        J2st=1
6581 
6582        ! Treat real wavefunctions.
6583        if (cplexwf==1) then
6584          n1half=n1/2
6585          do i2=1,n2
6586            do i1=1,n1half
6587              zr(1,i1,i2,j3,idat)=zr(1,2*i1-1,i2,j3,idat)
6588              zr(2,i1,i2,j3,idat)=zr(1,2*i1  ,i2,j3,idat)
6589            end do
6590          end do
6591          ! If odd
6592          if(n1half*2/=n1)then
6593            do i2=1,n2
6594              zr(1,n1eff,i2,j3,idat)=zr(1,n1,i2,j3,idat)
6595              zr(2,n1eff,i2,j3,idat)=zero
6596            end do
6597          end if
6598        end if
6599 
6600        ! transform along y axis
6601        ! input: R1,R2,R3,(Rp3)
6602        ! input: R1,G2,R3,(Rp3)
6603        do j=1,n1eff,lot2
6604          ma=j
6605          mb=min(j+(lot2-1),n1eff)
6606          n1dfft=mb-ma+1
6607 
6608          if (n1dfft == lot2) then
6609            call dfftw_execute_dft(fw_plan2_lot,  zr(1,j,1,j3,idat), zw)
6610          else
6611            call dfftw_execute_dft(fw_plan2_rest, zr(1,j,1,j3,idat), zw)
6612          end if
6613 
6614          ! input:  R1,G2,R3,(Rp3)
6615          ! output: G2,R1,R3,(Rp3)
6616          if (cplexwf==2) then
6617            call unswitch_cent(n1dfft,max2,m2,n2,lot2,n1,lzt,zw,zt(1,1,j))
6618          else
6619            call unswitchreal_cent(n1dfft,max2,n2,lot2,n1,lzt,zw,zt(1,1,2*j-1))
6620          end if
6621        end do
6622 
6623        ! transform along x axis
6624        ! input: G2,R1,R3,(Rp3)
6625        do j=1,m2eff,lot1
6626          ma=j
6627          mb=min(j+(lot1-1),m2eff)
6628          n1dfft=mb-ma+1
6629 
6630          if (n1dfft == lot1) then
6631            call dfftw_execute_dft(fw_plan1_lot,  zt(1,j,1), zw)
6632          else
6633            call dfftw_execute_dft(fw_plan1_rest, zt(1,j,1), zw)
6634          end if
6635          ! output: G2,G1,R3,(Rp3)
6636 
6637          ! input:  G2,G1,R3,Gp2,(Rp3)
6638          ! output: G1,G2,R3,Gp2,(Rp3)
6639          if (nproc_fft==1) then
6640            call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,&
6641 &            md2proc,nd3proc,nproc_fft,ioption,zw,zmpi2(:,:,:,:,idat))
6642          else
6643            call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,&
6644 &            md2proc,nd3proc,nproc_fft,ioption,zw,zmpi1(:,:,:,:,idat))
6645          end if
6646        end do
6647      end if
6648    end do ! j3
6649 
6650    ! Interprocessor data transposition
6651    ! input:  G1,G2,R3,Gp2,(Rp3)
6652    ! output: G1,G2,R3,Rp3,(Gp2)
6653    if (nproc_fft>1) then
6654      call timab(544,1,tsec)
6655      call xmpi_ialltoall(zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc, &
6656 &                        zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat))
6657      call timab(544,2,tsec)
6658    end if
6659  end do
6660 
6661  do idat=1,ndat
6662     if (nproc_fft>1) call xmpi_wait(requests(idat),ierr)
6663    ! transform along z axis
6664    ! input: G1,G2,R3,(Gp2)
6665 
6666    do j2=1,md2proc
6667      if (me_fft*md2proc+j2 <= m2eff) then
6668        ! write(std_out,*)' forwf_wf : before unscramble, j2,md2proc,me_fft,m2=',j2,md2proc,me_fft,m2
6669        do i1=1,m1,lot3
6670          ma=i1
6671          mb=min(i1+(lot3-1),m1)
6672          n1dfft=mb-ma+1
6673 
6674          ! input:  G1,G2,R3,(Gp2)
6675          ! output: G1,R3,G2,(Gp2)
6676          call unscramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zmpi2(:,:,:,:,idat),zw)
6677 
6678          if (n1dfft == lot3) then
6679            call dfftw_execute_dft(fw_plan3_lot, zw, zw)
6680          else
6681            call dfftw_execute_dft(fw_plan3_rest, zw, zw)
6682          end if
6683 
6684          call unfill_cent(md1,md3,lot3,n1dfft,max3,m3,n3,zw,zf(1,i1,1,j2,idat))
6685          ! output: G1,G3,G2,(Gp2)
6686        end do
6687      end if
6688    end do
6689 
6690    if (cplexwf==1) then
6691      ! Complete missing values with complex conjugate
6692      ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
6693      do i3=1,m3
6694        i3inv=m3+2-i3
6695        if(i3==1)i3inv=1
6696 
6697        if (m2eff>1) then
6698          do i2=2,m2eff
6699            i2inv=m2+2-i2
6700            zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)
6701            zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)
6702            do i1=2,m1
6703              i1inv=m1+2-i1
6704              zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)
6705              zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)
6706            end do
6707          end do
6708        end if
6709      end do
6710    end if
6711 
6712  end do ! idat
6713 
6714  call dfftw_destroy_plan(fw_plan3_lot)
6715  if (mod(m1, lot3) /= 0) then
6716    call dfftw_destroy_plan(fw_plan3_rest)
6717  end if
6718 
6719  call dfftw_destroy_plan(fw_plan1_lot)
6720  if (mod(m2eff, lot1) /= 0) then
6721    call dfftw_destroy_plan(fw_plan1_rest)
6722  end if
6723 
6724  call dfftw_destroy_plan(fw_plan2_lot)
6725  if (mod(n1eff, lot2) /= 0) then
6726    call dfftw_destroy_plan(fw_plan2_rest)
6727  end if
6728 
6729  ABI_FREE(zmpi2)
6730  ABI_FREE(zw)
6731  ABI_FREE(zt)
6732  if (nproc_fft>1)  then
6733    ABI_FREE(zmpi1)
6734  end if
6735 
6736 #else
6737  ABI_ERROR("FFTW3 support not activated")
6738  ABI_UNUSED((/cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc/))
6739  ABI_UNUSED((/max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft/))
6740  ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/))
6741 #endif
6742 
6743 end subroutine fftw3_mpiforw_manywf

m_fftw3/fftw3_mpiforw_wf [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_mpiforw_wf

FUNCTION

   Does multiple 3-dim backward FFTs from real into Fourier space
   Adopt standard convention that isign=-1 for forward transform
   CALCULATES THE DISCRETE FOURIERTRANSFORM

   ZF(I1,I3,I2)=S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3)

   in parallel using MPI/OpenMP.

 INPUT:
   ZR: input array
        ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
        ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
        i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
   NOTE that ZR is changed by the routine

   n1,n2,n3: logical dimension of the transform. As transform lengths
             most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG
   nd1,nd2,nd3: Dimension of ZR
   nd3proc=((nd3-1)/nproc_fft)+1  maximal number of big box 3rd dim slices for one proc

 OUTPUT:
   ZF: output array (note the switch of i2 and i3)
        real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
        imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
     i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
     then, if m1 > max1+1, one has min1=max1-m1+1 and
     i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
     i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF
   md2proc=((md2-1)/nproc_fft)+1  maximal number of small box 2nd dim slices for one proc
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc-1] rank of the processor in the FFT communicator.
   comm_fft=MPI communicator for parallel FFT.

NOTES

  The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

  It is very important to find the optimal
  value of NCACHE. NCACHE determines the size of the work array ZW, that
  has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
  The optimal value of ncache can easily be determined by numerical
  experimentation. A too large value of ncache leads to a dramatic
  and sudden decrease of performance, a too small value to a to a
  slow and less dramatic decrease of performance. If NCACHE is set
  to a value so small, that not even a single one dimensional transform
  can be done in the workarray zw, the program stops with an error message.

SOURCE

4319 subroutine fftw3_mpiforw_wf(cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&
4320 &        max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zr,zf,comm_fft)
4321 
4322 !Arguments ------------------------------------
4323 !scalars
4324  integer,intent(in) :: cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc
4325  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft
4326 !arrays
4327  real(dp),intent(inout) :: zr(2,nd1,nd2,nd3proc,ndat)
4328  real(dp),intent(out) :: zf(2,md1,md3,md2proc,ndat)
4329 
4330 !Local variables-------------------------------
4331 !scalars
4332 #ifdef HAVE_FFTW3
4333  integer :: j,i1,i2,i3,idat,ierr,nproc_fft,me_fft,nthreads
4334  integer :: ioption,j2,j3,j2st,jp2st,lot1,lot2,lot3,lzt,m1zt,ma,mb,n1dfft,nnd3
4335  integer :: m2eff,ncache,n1eff,n1half,i1inv,i2inv,i3inv
4336  integer(KIND_FFTW_PLAN) :: fw_plan1_lot,fw_plan1_rest
4337  integer(KIND_FFTW_PLAN) :: fw_plan2_lot,fw_plan2_rest
4338  integer(KIND_FFTW_PLAN) :: fw_plan3_lot,fw_plan3_rest
4339  character(len=500) :: msg
4340 !arrays
4341  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
4342  real(dp),allocatable :: zw(:,:),zt(:,:,:) ! cache work array and array for transpositions
4343 ! FFT work arrays
4344  real(dp) :: tsec(2)
4345 
4346 ! *************************************************************************
4347 
4348  ! FIXME must provide a default value but which one?
4349  !ioption = 0
4350  ioption = 1
4351  !if (paral_kgb==1) ioption=1
4352 
4353  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
4354 
4355  ! find cache size that gives optimal performance on machine
4356  ncache=2*max(n1,n2,n3,1024)
4357  !ncache=2*max(n1,n2,n3,16*1024)
4358 
4359  if (ncache/(2*max(n1,n2,n3))<1) then
4360    write(msg,'(5a)') &
4361 &    'ncache has to be enlarged to be able to hold at',ch10, &
4362 &    'least one 1-d FFT of each size even though this will',ch10,&
4363 &    'reduce the performance for shorter transform lengths'
4364    ABI_ERROR(msg)
4365  end if
4366 
4367  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
4368  n1eff=n1; m2eff=m2; m1zt=n1
4369  if (cplexwf==1) then
4370    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
4371  end if
4372 
4373  lzt=m2eff
4374  if (mod(m2eff,2)==0) lzt=lzt+1
4375  if (mod(m2eff,4)==0) lzt=lzt+1
4376 
4377  ! maximal number of big box 3rd dim slices for all procs
4378  nnd3=nd3proc*nproc_fft
4379 
4380  ABI_MALLOC(zw,(2,ncache/2))
4381  ABI_MALLOC(zt,(2,lzt,m1zt))
4382  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3))
4383  if (nproc_fft>1)  then
4384    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3))
4385  end if
4386 
4387 !DEBUG
4388 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'fftw3_mpiforw_wf, enter, i1,i2,i3,zr,n1,n2,n3',n1,n2,n3
4389 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'nd1,nd2,nd3proc',nd1,nd2,nd3proc
4390 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'m1,m2,m3',m1,m2,m3
4391 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'max1,max2,max3',max1,max2,max3
4392 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'md1,md2proc,md3',md1,md2proc,md3
4393 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt
4394 !ENDDEBUG
4395 
4396  ! Create plans.
4397  ! The prototype for sfftw_plan_many_dft is:
4398  ! sfftw_plan_many_dft(rank, n, howmany,
4399  !   fin,  iembed, istride, idist,
4400  !   fout, oembed, ostride, odist, isign, my_flags)
4401 
4402  lot2=ncache/(2*n2)
4403  lot1=ncache/(2*n1)
4404  lot3=ncache/(2*n3)
4405 
4406  nthreads = xomp_get_num_threads(open_parallel=.TRUE.)
4407  !nthreads = 1
4408 
4409  fw_plan3_lot = dplan_many_dft_2D(1, [n3], lot3, &
4410 &    zw, [ncache/2], lot3, 1,                          &
4411 &    zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE,nthreads)
4412 
4413  if (mod(m1, lot3) /= 0) then
4414    fw_plan3_rest = dplan_many_dft_2D(1, [n3], mod(m1, lot3), &
4415 &    zw, [ncache/2], lot3, 1,                                      &
4416 &    zw, [ncache/2], lot3, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
4417  end if
4418 
4419  fw_plan1_lot = dplan_many_dft_2D(1, [n1], lot1, &
4420 &    zt, [lzt, m1zt],   lzt,  1,                       &
4421 &    zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
4422 
4423  if (mod(m2eff, lot1) /= 0) then
4424    fw_plan1_rest = dplan_many_dft_2D(1, [n1], mod(m2eff, lot1), &
4425 &    zt, [lzt, m1zt],   lzt, 1,                                       &
4426 &    zw, [ncache/2], lot1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
4427  end if
4428 
4429  fw_plan2_lot = dplan_many_dft_2D(1, [n2], lot2, &
4430 &    zr, [nd1,nd2,nd3proc,ndat], nd1, 1,               &
4431 &    zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
4432 
4433  if (mod(n1eff, lot2) /= 0) then
4434    fw_plan2_rest = dplan_many_dft_2D(1, [n2], mod(n1eff,lot2), &
4435 &    zr, [nd1,nd2,nd3proc,ndat], nd1, 1,                             &
4436 &    zw, [ncache/2], lot2, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
4437  end if
4438 
4439  do idat=1,ndat
4440    ! Loop over the z-planes treated by this node
4441    do j3=1,nd3proc
4442 
4443      if (me_fft*nd3proc+j3 <= n3) then
4444        Jp2st=1
4445        J2st=1
4446 
4447        ! Treat real wavefunctions.
4448        if (cplexwf==1) then
4449          n1half=n1/2
4450          do i2=1,n2
4451            do i1=1,n1half
4452              zr(1,i1,i2,j3,idat)=zr(1,2*i1-1,i2,j3,idat)
4453              zr(2,i1,i2,j3,idat)=zr(1,2*i1  ,i2,j3,idat)
4454            end do
4455          end do
4456          ! If odd
4457          if(n1half*2/=n1)then
4458            do i2=1,n2
4459              zr(1,n1eff,i2,j3,idat)=zr(1,n1,i2,j3,idat)
4460              zr(2,n1eff,i2,j3,idat)=zero
4461            end do
4462          end if
4463        end if
4464 
4465        ! transform along y axis
4466        ! input: R1,R2,R3,(Rp3)
4467        ! input: R1,G2,R3,(Rp3)
4468        do j=1,n1eff,lot2
4469          ma=j
4470          mb=min(j+(lot2-1),n1eff)
4471          n1dfft=mb-ma+1
4472 
4473          if (n1dfft == lot2) then
4474            call dfftw_execute_dft(fw_plan2_lot,  zr(1,j,1,j3,idat), zw)
4475          else
4476            call dfftw_execute_dft(fw_plan2_rest, zr(1,j,1,j3,idat), zw)
4477          end if
4478 
4479          ! input:  R1,G2,R3,(Rp3)
4480          ! output: G2,R1,R3,(Rp3)
4481          if (cplexwf==2) then
4482            call unswitch_cent(n1dfft,max2,m2,n2,lot2,n1,lzt,zw,zt(1,1,j))
4483          else
4484            call unswitchreal_cent(n1dfft,max2,n2,lot2,n1,lzt,zw,zt(1,1,2*j-1))
4485          end if
4486        end do
4487 
4488        ! transform along x axis
4489        ! input: G2,R1,R3,(Rp3)
4490        do j=1,m2eff,lot1
4491          ma=j
4492          mb=min(j+(lot1-1),m2eff)
4493          n1dfft=mb-ma+1
4494 
4495          if (n1dfft == lot1) then
4496            call dfftw_execute_dft(fw_plan1_lot,  zt(1,j,1), zw)
4497          else
4498            call dfftw_execute_dft(fw_plan1_rest, zt(1,j,1), zw)
4499          end if
4500          ! output: G2,G1,R3,(Rp3)
4501 
4502          ! input:  G2,G1,R3,Gp2,(Rp3)
4503          ! output: G1,G2,R3,Gp2,(Rp3)
4504          if (nproc_fft==1) then
4505            call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,&
4506 &            md2proc,nd3proc,nproc_fft,ioption,zw,zmpi2)
4507          else
4508            call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot1,max1,md1,m1,n1,&
4509 &            md2proc,nd3proc,nproc_fft,ioption,zw,zmpi1)
4510          end if
4511        end do
4512 
4513       end if
4514     end do ! j3
4515 
4516     ! Interprocessor data transposition
4517     ! input:  G1,G2,R3,Gp2,(Rp3)
4518     ! output: G1,G2,R3,Rp3,(Gp2)
4519     if (nproc_fft>1) then
4520       call timab(544,1,tsec)
4521       call xmpi_alltoall(zmpi1,2*md1*md2proc*nd3proc, &
4522 &                        zmpi2,2*md1*md2proc*nd3proc,comm_fft,ierr)
4523       call timab(544,2,tsec)
4524     end if
4525 
4526     ! transform along z axis
4527     ! input: G1,G2,R3,(Gp2)
4528 
4529     do j2=1,md2proc
4530       if (me_fft*md2proc+j2 <= m2eff) then
4531         ! write(std_out,*)' forwf_wf : before unscramble, j2,md2proc,me_fft,m2=',j2,md2proc,me_fft,m2
4532         do i1=1,m1,lot3
4533           ma=i1
4534           mb=min(i1+(lot3-1),m1)
4535           n1dfft=mb-ma+1
4536 
4537           ! input:  G1,G2,R3,(Gp2)
4538           ! output: G1,R3,G2,(Gp2)
4539           call unscramble(i1,j2,lot3,n1dfft,md1,n3,md2proc,nnd3,zmpi2,zw)
4540 
4541           if (n1dfft == lot3) then
4542             call dfftw_execute_dft(fw_plan3_lot, zw, zw)
4543           else
4544             call dfftw_execute_dft(fw_plan3_rest, zw, zw)
4545           end if
4546 
4547           call unfill_cent(md1,md3,lot3,n1dfft,max3,m3,n3,zw,zf(1,i1,1,j2,idat))
4548           ! output: G1,G3,G2,(Gp2)
4549         end do
4550       end if
4551     end do
4552 
4553     if (cplexwf==1) then
4554       ! Complete missing values with complex conjugate
4555       ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
4556       do i3=1,m3
4557         i3inv=m3+2-i3
4558         if(i3==1)i3inv=1
4559 
4560         if (m2eff>1) then
4561           do i2=2,m2eff
4562             i2inv=m2+2-i2
4563             zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)
4564             zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)
4565             do i1=2,m1
4566               i1inv=m1+2-i1
4567               zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)
4568               zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)
4569             end do
4570           end do
4571         end if
4572       end do
4573     end if
4574 
4575  end do ! idat
4576 
4577  call dfftw_destroy_plan(fw_plan3_lot)
4578  if (mod(m1, lot3) /= 0) then
4579    call dfftw_destroy_plan(fw_plan3_rest)
4580  end if
4581 
4582  call dfftw_destroy_plan(fw_plan1_lot)
4583  if (mod(m2eff, lot1) /= 0) then
4584    call dfftw_destroy_plan(fw_plan1_rest)
4585  end if
4586 
4587  call dfftw_destroy_plan(fw_plan2_lot)
4588  if (mod(n1eff, lot2) /= 0) then
4589    call dfftw_destroy_plan(fw_plan2_rest)
4590  end if
4591 
4592  ABI_FREE(zmpi2)
4593  ABI_FREE(zw)
4594  ABI_FREE(zt)
4595  if (nproc_fft>1)  then
4596    ABI_FREE(zmpi1)
4597  end if
4598 
4599 #else
4600  ABI_ERROR("FFTW3 support not activated")
4601  ABI_UNUSED((/cplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc/))
4602  ABI_UNUSED((/max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft/))
4603  ABI_UNUSED((/zf(1,1,1,1,1),zr(1,1,1,1,1)/))
4604 #endif
4605 
4606 end subroutine fftw3_mpiforw_wf

m_fftw3/fftw3_mpifourdp_c2c [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_mpifourdp_c2c

FUNCTION

 Driver routine for many out-of-place 3D complex-to-complex FFTs of lengths n1, n2, n3.

INPUTS

 cplex=1 if fofr is real, 2 if fofr is complex
 nfft=(effective) number of FFT grid points (for this processor)
 ndat=Number of FFTs to be done.
 isign=sign of Fourier transform exponent: current convention uses
   +1 for transforming from G to r,
   -1 for transforming from r to G.
 fftn2_distrib(n2)=  rank of the processor which own fft planes in 2nd dimension for fourdp
 ffti2_local(n2) = local i2 indices in fourdp
 fftn3_distrib(n3) = rank of the processor which own fft planes in 3rd dimension for fourdp
 ffti3_local(n3) = local i3 indices in fourdp
 comm_fft=MPI communicator for the FFT
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
 fin(2*ldx*ldy*ldz*ndat)=The complex array to be transformed.

TODO

   Add c2r and r2c version.

SIDE EFFECTS

 Input/Output
 fofg(2,nfft*ndat)=f(G), complex.
 fofr(cplex*nfft*ndat)=input function f(r) (real or complex)

SOURCE

3795 subroutine fftw3_mpifourdp_c2c(cplex,nfft,ngfft,ndat,isign,&
3796 &  fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags)
3797 
3798 !Arguments ------------------------------------
3799 !scalars
3800  integer,intent(in) :: cplex,isign,nfft,ndat,comm_fft
3801  integer,optional,intent(in) :: fftw_flags
3802 !arrays
3803  integer,intent(in) :: ngfft(18)
3804  integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2))
3805  integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3))
3806  real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat)
3807 
3808 #ifdef HAVE_FFTW3_MPI
3809 !Local variables-------------------------------
3810 !scalars
3811  integer,parameter :: rank3=3
3812  integer :: n1,n2,n3,n4,n5,n6,nd2proc,nd3proc,my_flags,me_fft,nproc_fft
3813  integer(C_INTPTR_T) :: alloc_local,local_n0,local_0_start,local_n1,local_1_start
3814  type(C_PTR) :: plan,cptr_cdata
3815 !arrays
3816  integer(C_INTPTR_T) :: fft_sizes(4)
3817  complex(C_DOUBLE_COMPLEX), ABI_CONTIGUOUS pointer :: f03_cdata(:)
3818 
3819 !*************************************************************************
3820 
3821  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
3822 
3823  n1=ngfft(1); n2=ngfft(2); n3=ngfft(3)
3824  ! No augmentation as FFTW3 does not support it
3825  n4=n1; n5=n2; n6=n3
3826  me_fft=ngfft(11); nproc_fft=ngfft(10)
3827 
3828  nd2proc=((n2-1)/nproc_fft) +1
3829  nd3proc=((n6-1)/nproc_fft) +1
3830 
3831  ! Get local data size and allocate (note dimension reversal, we call the C interface directly!)
3832  fft_sizes = [n3,n2,n1,ndat]
3833 
3834  ! Use TRANSPOSED_OUT
3835  my_flags = ior(ABI_FFTW_ESTIMATE, ABI_FFTW_MPI_TRANSPOSED_OUT)
3836 
3837  if (isign == ABI_FFTW_BACKWARD) then
3838    ! G --> R, Exchange n2 and n3
3839    fft_sizes = [n2,n3,n1,ndat]
3840    !my_flags = ior(ABI_FFTW_ESTIMATE, ABI_FFTW_MPI_TRANSPOSED_IN)
3841  end if
3842 
3843  alloc_local = fftw_mpi_local_size_many_transposed(&
3844 &      rank3,fft_sizes(1:3),fft_sizes(4), &
3845 &      FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, comm_fft, &
3846 &      local_n0,local_0_start, &
3847 &      local_n1,local_1_start)
3848 
3849  ! C to F
3850  !local_0_start = local_0_start + 1
3851  !local_1_start = local_1_start + 1
3852  !write(std_out,*)"local_n0,local_0_start,alloc_local",local_n0,local_0_start,alloc_local
3853  !write(std_out,*)"local_n1,local_1_start,alloc_local",local_n1,local_1_start,alloc_local
3854 
3855  ! Allocate cptr_cdata, associate to F pointer and build the plane.
3856  cptr_cdata = fftw_alloc_complex(alloc_local)
3857 
3858  call c_f_pointer(cptr_cdata, f03_cdata, [alloc_local])
3859 
3860  plan = fftw_mpi_plan_many_dft(rank3,fft_sizes(1:3),fft_sizes(4), &
3861 &                              FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, &
3862 &                              f03_cdata,f03_cdata,comm_fft,isign,my_flags)
3863 
3864  select case (isign)
3865  case (ABI_FFTW_BACKWARD)
3866      ! G --> R
3867      ABI_CHECK(local_n0 == nd2proc, "local_n0 != nd2proc")
3868 
3869      call mpifft_fg2dbox_dpc(nfft,ndat,fofg,n1,n2,n3,n4,nd2proc,n6,fftn2_distrib,ffti2_local,me_fft,f03_cdata)
3870 
3871      ! Compute transform.
3872      call fftw_mpi_execute_dft(plan, f03_cdata, f03_cdata)
3873 
3874      call mpifft_dbox2fr_dpc(n1,n2,n3,n4,n5,nd3proc,ndat,fftn3_distrib,ffti3_local,me_fft,f03_cdata,cplex,nfft,fofr)
3875 
3876  case (ABI_FFTW_FORWARD)
3877      ! R --> G
3878      ABI_CHECK(local_n0 == nd3proc, "local_n0 != nd3proc")
3879 
3880      call mpifft_fr2dbox_dpc(cplex,nfft,ndat,fofr,n1,n2,n3,n4,n5,nd3proc,fftn3_distrib,ffti3_local,me_fft,f03_cdata)
3881 
3882      ! Compute transform.
3883      call fftw_mpi_execute_dft(plan, f03_cdata, f03_cdata)
3884 
3885      ! Scale results.
3886      call mpifft_dbox2fg_dpc(n1,n2,n3,n4,nd2proc,n6,ndat,fftn2_distrib,ffti2_local,me_fft,f03_cdata,nfft,fofg)
3887 
3888  case default
3889    ABI_ERROR("Wrong sign")
3890  end select
3891 
3892  call fftw_destroy_plan(plan)
3893  call fftw_free(cptr_cdata)
3894 
3895 #else
3896  ABI_ERROR("FFTW3_MPI support not activated")
3897  ABI_UNUSED((/cplex,nfft,ngfft(1),ndat,isign,comm_fft/))
3898  ABI_UNUSED((/fftn2_distrib(1),ffti2_local(1)/))
3899  ABI_UNUSED((/fftn3_distrib(1),ffti3_local(1)/))
3900  if (PRESENT(fftw_flags)) then
3901     ABI_UNUSED(fftw_flags)
3902  end if
3903  ABI_UNUSED(fofg(1,1))
3904  ABI_UNUSED(fofr(1))
3905 #endif
3906 
3907 end subroutine fftw3_mpifourdp_c2c

m_fftw3/fftw3_mpifourdp_c2r [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_mpifourdp_c2r

FUNCTION

 Driver routine for transposed out-of-place 3D complex-to-real FFT of lengths nx, ny, nz.

INPUTS

 nx,ny,nz=Number of point along the three directions.
 ndat=Number of FFTs to be done.
 fofg(2,nx*ny*nz*ndat)=The complex array to be transformed.
 comm_fft=MPI communicator.
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

OUTPUT

 fofr(2,nx*ny*nz*ndat)=The backwards real FFT of ff.

NOTES

 LOCAL DATA IN FOURIER SPACE : TRANSPOSED ORDER
 real space     --> dim = [  nx  | ny | nz/np_fft]
 fourier  space --> dim = [ nx/2 | nz | ny/np_ff ]

SOURCE

3398 subroutine fftw3_mpifourdp_c2r(nfft,ngfft,ndat,&
3399   fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags)
3400 
3401 !Arguments ------------------------------------
3402 !scalars
3403  integer,intent(in) :: nfft,ndat,comm_fft
3404  integer,optional,intent(in) :: fftw_flags
3405 !arrays
3406  integer,intent(in) :: ngfft(18)
3407  integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2))
3408  integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3))
3409  real(dp),intent(in) :: fofg(2,nfft*ndat)
3410  real(dp),intent(out) :: fofr(nfft*ndat)
3411 
3412 !Local variables-------------------------------
3413 #ifdef HAVE_FFTW3_MPI
3414 !scalars
3415  integer,parameter :: rank3=3
3416  integer :: nx,ny,nz,nproc_fft
3417  type(C_PTR) :: plan_bw, cdata_cplx,cdata_real
3418  integer(C_INTPTR_T) :: i,j,jdat,k,alloc_local,fft_sizes(4),demi_nx,base,idat,kdat
3419  integer(C_INTPTR_T) :: local_n0, local_0_start, local_n1, local_1_start
3420 !arrays
3421  complex(C_DOUBLE_COMPLEX), ABI_CONTIGUOUS pointer :: data_cplx(:,:,:)
3422  real(C_DOUBLE), ABI_CONTIGUOUS pointer :: data_real(:,:,:)
3423 
3424 ! *************************************************************************
3425 
3426  !ABI_CHECK(ndat==1, "ndat > 1 not implemented yet")
3427 
3428  nx=ngfft(1); ny=ngfft(2); nz=ngfft(3)
3429  nproc_fft = xmpi_comm_size(comm_fft)
3430 
3431  demi_nx = nx/2 + 1
3432  fft_sizes(1)=nz
3433  fft_sizes(2)=ny
3434  fft_sizes(3)=demi_nx
3435  fft_sizes(4)=ndat
3436 
3437  alloc_local = fftw_mpi_local_size_many_transposed(&
3438 &      rank3,fft_sizes(1:3),fft_sizes(4), &
3439 &      FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, comm_fft, &
3440 &      local_n0,local_0_start, &
3441 &      local_n1,local_1_start)
3442 
3443  cdata_cplx = fftw_alloc_complex(alloc_local)
3444  cdata_real = fftw_alloc_real(alloc_local*2)
3445 
3446 ! OLD BY FDHAM
3447  ! dimensions are  (x/2,z,y) in Fourier's Space
3448  call c_f_pointer(cdata_cplx, data_cplx, [demi_nx  ,fft_sizes(1),local_n1])
3449  ! dimensions in real space : (nx,ny,nz/nproc)
3450  call c_f_pointer(cdata_real, data_real, [2*demi_nx,fft_sizes(2),local_n0])
3451 
3452  ! dimensions are  (x/2,z,y) in Fourier's Space
3453  !call c_f_pointer(cdata_cplx, data_cplx, [demi_nx  ,fft_sizes(1),local_n0])
3454 
3455  !! dimensions in real space : (nx,ny,nz/nproc)
3456  !call c_f_pointer(cdata_real, data_real, [2*demi_nx,fft_sizes(2),local_n1])
3457 
3458  fft_sizes(3)=nx
3459  plan_bw =  fftw_mpi_plan_many_dft_c2r(&
3460 &      rank3,fft_sizes(1:3),fft_sizes(4), &
3461 &      FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, &
3462 &      data_cplx, data_real , &
3463 &      comm_fft,ior(ABI_FFTW_ESTIMATE,ABI_FFTW_MPI_TRANSPOSED_IN))
3464 
3465  do idat=1,ndat
3466    do k=1, nz
3467      do j=1, ny / nproc_fft
3468        jdat = j + (idat-1) * ny / nproc_fft
3469        base = nx*((j-1) + (ny/nproc_fft)*(k-1)) + (idat-1) * nfft
3470        do i=1, demi_nx
3471          data_cplx(i,k,jdat) = CMPLX(fofg(1, i + base), fofg(2, i + base), kind=C_DOUBLE_COMPLEX)
3472        end do
3473      end do
3474    end do
3475  end do
3476 
3477  ! compute transform (as many times as desired)
3478  call fftw_mpi_execute_dft_c2r(plan_bw, data_cplx, data_real)
3479 
3480  do idat=1,ndat
3481    do k=1,local_n0
3482      kdat = k + (idat - 1) * local_n0
3483      do j=1,ny
3484        base = nx*((j-1) + ny*(k-1)) + (idat - 1) * nfft
3485        do i=1,nx
3486          fofr(i+base) = data_real(i,j,kdat)
3487        end do
3488      end do
3489    end do
3490  end do
3491 
3492  call fftw_destroy_plan(plan_bw)
3493  call fftw_free(cdata_cplx)
3494  call fftw_free(cdata_real)
3495 
3496 #else
3497  ABI_ERROR("FFTW3_MPI support not activated")
3498  ABI_UNUSED((/nfft,ngfft(1),ndat,comm_fft/))
3499  ABI_UNUSED((/fftn2_distrib(1),ffti2_local(1)/))
3500  ABI_UNUSED((/fftn3_distrib(1),ffti3_local(1)/))
3501  if (PRESENT(fftw_flags)) then
3502     ABI_UNUSED(fftw_flags)
3503  end if
3504  ABI_UNUSED(fofg(1,1))
3505  ABI_UNUSED(fofr(1))
3506 #endif
3507 
3508 end subroutine fftw3_mpifourdp_c2r

m_fftw3/fftw3_mpifourdp_r2c [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_mpifourdp_r2c

FUNCTION

 Driver routine for out-of-place 3D real-to-complex FFT of lengths nx, ny, nz.

INPUTS

 fofr(nx*ny*nz*ndat)=The real array to be transformed.
 ndat=Number of FFTs to be done.
 comm_fft=MPI communicator for the FFT.
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

OUTPUT

 fofg(2,nx*ny*nz*ndat)=The forward FFT of ff.

NOTES

 LOCAL DATA FOR FOURIER TRANSFORMS : TRANSPOSED ORDER AND DISTRIBUTED
 real space     --> dim = [  nx  | ny | nz/np_fft ]
 fourier  space --> dim = [  nx | nz | ny/np_fft ]
 we can't take in account the symetric of the real case because after
 fft have been computed, the symetric data needed are dispatched over
 other process in parallel

SOURCE

3541 subroutine fftw3_mpifourdp_r2c(nfft,ngfft,ndat,&
3542   fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags)
3543 
3544 !Arguments ------------------------------------
3545 !scalars
3546  integer,intent(in) :: nfft,ndat,comm_fft
3547  integer,optional,intent(in) :: fftw_flags
3548 !arrays
3549  integer,intent(in) :: ngfft(18)
3550  integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2))
3551  integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3))
3552  real(dp),intent(in) :: fofr(nfft*ndat)
3553  real(dp),intent(out) :: fofg(2,nfft*ndat)
3554 
3555 !Local variables-------------------------------
3556 #ifdef HAVE_FFTW3_MPI
3557  !scalars
3558  integer,parameter :: rank3=3
3559  integer :: my_flags,nproc_fft,nx,ny,nz
3560  integer(C_INTPTR_T) :: i,j,k,base,alloc_local,i1,i2,i3,igf,idat,kdat,i2dat,padatf
3561  integer(C_INTPTR_T) :: local_n0,local_0_start,local_n1,local_1_start
3562  real(dp) :: factor_fft
3563  type(C_PTR) :: plan_fw,cdata_cplx,cdata_real
3564 !arrays
3565  complex(C_DOUBLE_COMPLEX), ABI_CONTIGUOUS pointer :: data_cplx(:,:,:),data_real(:,:,:)
3566  integer(C_INTPTR_T) :: fft_sizes(4)
3567 
3568 ! *************************************************************************
3569 
3570  nproc_fft = xmpi_comm_size(comm_fft)
3571 
3572  nx=ngfft(1); ny=ngfft(2); nz=ngfft(3)
3573 
3574  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
3575 
3576  fft_sizes(1)=nz
3577  fft_sizes(2)=ny
3578  fft_sizes(3)=nx
3579  fft_sizes(4)=ndat
3580 
3581  ! Get parallel sizes
3582  alloc_local = fftw_mpi_local_size_many_transposed(&
3583 &      rank3,fft_sizes(1:3),fft_sizes(4), &
3584 &      FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, comm_fft, &
3585 &      local_n0,local_0_start, &
3586 &      local_n1,local_1_start)
3587 
3588  ! Allocate data and reference it
3589 
3590  ! local data in real space     --> dim = [nx | ny | nz/nproc_fft]
3591  cdata_real = fftw_alloc_complex(alloc_local)
3592  call c_f_pointer(cdata_real, data_real, [fft_sizes(3),fft_sizes(2),local_n0])
3593 
3594  ! local data in Fourier space --> dim = [nx | nz | ny/nproc_fft]
3595  cdata_cplx = fftw_alloc_complex(alloc_local)
3596  call c_f_pointer(cdata_cplx, data_cplx, [fft_sizes(3),fft_sizes(1),local_n1])
3597 
3598  ! TODO: Use true real to complex API!
3599  ! Create Plan C2C (nx,ny,nz)
3600  plan_fw =  fftw_mpi_plan_many_dft(&
3601 &      rank3,fft_sizes(1:3),fft_sizes(4), &
3602 &      FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, &
3603 &      data_real, data_cplx , &
3604 &      comm_fft,ABI_FFTW_FORWARD,ior(ABI_FFTW_ESTIMATE,ABI_FFTW_MPI_TRANSPOSED_OUT))
3605 
3606  ! Copy input data in correct format
3607  do idat=1,ndat
3608    do k=1,local_n0
3609      kdat = k + (idat-1) * local_n0
3610      do j=1, ny
3611        base = nx*((j-1) + ny*(k-1)) + (idat-1) * nfft
3612        do i=1, nx
3613          data_real(i,j,kdat) = CMPLX(fofr(i+base),zero, kind=C_DOUBLE_COMPLEX)
3614        end do
3615      end do
3616    end do
3617  end do
3618 
3619  ! Compute transform
3620  call fftw_mpi_execute_dft(plan_fw, data_real, data_cplx)
3621 
3622  factor_fft = one / (nx*ny*nz)
3623 
3624  do idat=1,ndat
3625     padatf=(idat-1)*nfft
3626     do i3=1,nz
3627        do i2=1,ny/nproc_fft ! equivalent a local_n1
3628           i2dat = i2 + (idat-1) * ny/nproc_fft
3629           do i1=1,nx
3630              igf = i1 + nx*( (i2-1) + (i3-1)*ny/nproc_fft  ) + padatf
3631              fofg(1,igf) = real(data_cplx(i1,i3,i2dat)) * factor_fft
3632              fofg(2,igf) =aimag(data_cplx(i1,i3,i2dat)) * factor_fft
3633           end do
3634        end do
3635     end do
3636  end do
3637 
3638  call fftw_destroy_plan(plan_fw)
3639  call fftw_free(cdata_cplx)
3640  call fftw_free(cdata_real)
3641 
3642 #else
3643  ABI_ERROR("FFTW3_MPI support not activated")
3644  ABI_UNUSED((/nfft,ngfft(1),ndat,comm_fft/))
3645  ABI_UNUSED((/fftn2_distrib(1),ffti2_local(1)/))
3646  ABI_UNUSED((/fftn3_distrib(1),ffti3_local(1)/))
3647  if (PRESENT(fftw_flags)) then
3648     ABI_UNUSED(fftw_flags)
3649  end if
3650  ABI_UNUSED(fofg(1,1))
3651  ABI_UNUSED(fofr(1))
3652 #endif
3653 
3654 end subroutine fftw3_mpifourdp_r2c

m_fftw3/fftw3_plan3_t [ Types ]

[ Top ] [ m_fftw3 ] [ Types ]

NAME

 fftw3_plan3_t

FUNCTION

  Structure storing the pointer to the FFTW plan as well as the options used to generate it.

SOURCE

161  type,private :: fftw3_plan3_t
162    integer :: isign=0                           ! Sign of the exponential in the FFT
163    integer :: ndat=-1                           ! Number of FFTs associated to the plan
164    integer :: flags=-HUGE(0)                    ! FFTW3 flags used to construct the plan.
165    integer(KIND_FFTW_PLAN) :: plan=NULL_PLAN    ! FFTW3 plan.
166    integer :: nthreads=1                        ! The number of threads associated to the plan.
167    integer :: idist=-1
168    integer :: odist=-1
169    integer :: istride=-1
170    integer :: ostride=-1
171    integer :: n(3)=-1                           ! The number of FFT divisions.
172    integer :: inembed(3)=-1
173    integer :: onembed(3)=-1
174    !integer(C_INT) :: alignment(2)              ! The alignment of the arrays used to construct the plan.
175  end type fftw3_plan3_t

m_fftw3/fftw3_poisson [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_poisson

FUNCTION

  Solve the Poisson equation in G-space given the density, n(r),
  in real space of the FFT box.

INPUTS

 cplex=1 if fofr is real, 2 if fofr is complex
 nx,ny,nz=Number of FFT points along the three directions.
 ldx,ldy,ldz=Leading dimension of the array nr and vg.
 ndat = Number of densities
 vg(nx*ny*nz)=Potential in reciprocal space.

SIDE EFFECTS

 nr(cplex*ldx*ldy*ldz*ndat)
    input: n(r) (real or complex)
    output: the hartree potential in real space

NOTES

   vg is given on the FFT mesh instead of the augmented mesh [ldx,ldy,ldz]
   in order to simplify the interface with the other routines operating of vg

SOURCE

7250 subroutine fftw3_poisson(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,vg,nr)
7251 
7252 !Arguments ------------------------------------
7253 !scalars
7254  integer,intent(in) :: cplex,nx,ny,nz,ldx,ldy,ldz,ndat
7255 !arrays
7256  real(dp),intent(inout) :: nr(cplex*ldx*ldy*ldz*ndat)
7257  real(dp),intent(in) :: vg(nx*ny*nz)
7258 
7259 #ifdef HAVE_FFTW3
7260 !Local variables-------------------------------
7261 !scalars
7262  integer,parameter :: rank1=1,rank2=2
7263  integer :: ii,jj,kk,sidx,ig,ir,vgbase,ypad
7264  integer, parameter :: nthreads=1
7265  integer(KIND_FFTW_PLAN) :: bw_plan_xy,bw_plan3
7266  integer(KIND_FFTW_PLAN) :: fw_plan_xy,fw_plan3
7267  real(dp) :: fft_fact,vg_fftfact
7268 
7269 ! *************************************************************************
7270 
7271  !write(std_out,*)"in poisson"
7272  ABI_CHECK(cplex==2,"cplex!=2 not coded")
7273  ABI_CHECK(ndat==1,"ndat!=1 not coded")
7274 
7275  fft_fact = one/(nx*ny*nz)
7276 
7277  ! The prototype for sfftw_plan_many_dft is:
7278  ! sfftw_plan_many_dft(n, howmany,
7279  !   fin,  iembed, istride, idist,
7280  !   fout, oembed, ostride, odist, isign, my_flags)
7281 
7282  ! 1) ldx*ldy transforms along Rz.
7283  fw_plan3 = fftw3_plan_many_dft(rank1, (/nz/), ldx*ldy, & ! We have to visit the entire augmented x-y plane!
7284 &   nr, (/ldx, ldy, ldz/), ldx*ldy, 1,                  &
7285 &   nr, (/ldx, ldy, ldz/), ldx*ldy, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
7286 
7287  call fftw3_execute_dft(fw_plan3, nr, nr) ! Now we have nr(x,y,Gz)
7288  call fftw3_destroy_plan(fw_plan3)
7289 
7290  ! R --> G Transforms in x-y plane
7291  fw_plan_xy = fftw3_plan_many_dft(rank2, [nx,ny], 1, &
7292 &     nr, (/ldx, ldy, ldz/), 1, 1,                   &
7293 &     nr, (/ldx, ldy, ldz/), 1, 1, ABI_FFTW_FORWARD, ABI_FFTW_ESTIMATE, nthreads)
7294 
7295  ! G --> R Transforms in x-y plane
7296  bw_plan_xy = fftw3_plan_many_dft(rank2, [nx, ny], 1, &
7297 &     nr, (/ldx, ldy, ldz/), 1, 1,                    &
7298 &     nr, (/ldx, ldy, ldz/), 1, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
7299 
7300  ! Loop on z-planes.
7301  do kk=1,nz
7302    sidx = 1 + cplex*(kk-1)*ldx*ldy  !+ cplex*(dat-1) * ldx*ldy*ldz
7303 
7304    call fftw3_execute_dft(fw_plan_xy, nr(sidx:), nr(sidx:))
7305 
7306    ! At this point we have nr(Gx,Gy,Gz) on the current plane.
7307    ! Multiply by vc(Gx,Gy,Gz) and then back transform immediately to get vc(x,y,Gz)
7308    ! Note that nr is complex whereas vg is real.
7309    ! Besides, FFTW returns not normalized FTs if sign=-1 so we have to scale by fft_fact
7310    vgbase = (kk-1)*nx*ny !;vgbase = (kk-1)*ldx*ldy
7311 
7312    ig = 0
7313    do jj=1,ny
7314      ypad = cplex*(jj-1)*ldx + sidx
7315      do ii=1,nx
7316        ig = ig + 1
7317        vg_fftfact = vg(vgbase+ig) * fft_fact
7318 
7319        ir = cplex*(ii-1) + ypad
7320        nr(ir:ir+1) = nr(ir:ir+1) * vg_fftfact
7321      end do
7322    end do
7323 
7324    call fftw3_execute_dft(bw_plan_xy, nr(sidx:), nr(sidx:))
7325  end do
7326 
7327  ! Free plans
7328  call fftw3_destroy_plan(fw_plan_xy)
7329  call fftw3_destroy_plan(bw_plan_xy)
7330 
7331  ! Final transforms of vc(x,y,Gz) along Gz to get vc(x,y,z)
7332  bw_plan3 = fftw3_plan_many_dft(rank1, (/nz/), ldx*ldy, & ! We have to visit the entire augmented x-y plane!
7333 &   nr, (/ldx, ldy, ldz/), ldx*ldy, 1,                  &
7334 &   nr, (/ldx, ldy, ldz/), ldx*ldy, 1, ABI_FFTW_BACKWARD, ABI_FFTW_ESTIMATE, nthreads)
7335 
7336  call fftw3_execute_dft(bw_plan3, nr, nr)
7337  call fftw3_destroy_plan(bw_plan3)
7338 
7339 #else
7340  ABI_UNUSED((/cplex,nx,ny,nz,ldx,ldy,ldz,ndat/))
7341  ABI_UNUSED((/nr(1),vg(1)/))
7342 #endif
7343 
7344 end subroutine fftw3_poisson
7345 !!**
7346 
7347 !----------------------------------------------------------------------
7348 
7349 END MODULE m_fftw3

m_fftw3/fftw3_r2c_op [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_r2c_op

FUNCTION

 Driver routine for out-of-place 3D real-to-complex FFT of lengths nx, ny, nz.

INPUTS

 nx,ny,nz=Number of points along the three directions.
 ldx,ldy,ldz=Physical dimensions of the f array (to avoid cache conflicts).
 ff(ldx*ldy*ldz*ndat)=The real array to be transformed.
 ndat=Number of FFTs to be done.
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

OUTPUT

 gg(2,nx*ny*nz*ndat)=The forward FFT of ff.

NOTES

  FIXME For the time-being. No augmentation of the mesh to reduce memory conflicts, as MKL crashes
  if the advanced interface is used.

SOURCE

1792 subroutine fftw3_r2c_op(nx,ny,nz,ldx,ldy,ldz,ndat,ff,gg,fftw_flags)
1793 
1794 !Arguments ------------------------------------
1795 !scalars
1796  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat
1797  integer,optional,intent(in) :: fftw_flags
1798 !arrays
1799  real(dp),intent(in) :: ff(ldx*ldy*ldz*ndat)
1800  real(dp),intent(out) :: gg(2,ldx*ldy*ldz*ndat)
1801 
1802 #ifdef HAVE_FFTW3
1803 !Local variables-------------------------------
1804 !scalars
1805  integer,parameter :: rank3=3,nt_all=-1
1806  integer :: nhp,my_flags,idist,odist,padx,i1,i2,i3,igp,igf,imgf,stride
1807  integer :: i1inv,i2inv,i3inv,idat,padatf
1808  integer(KIND_FFTW_PLAN) :: my_plan
1809 !arrays
1810  integer :: inembed(rank3),onembed(rank3),n(rank3)
1811  integer,allocatable :: i1inver(:),i2inver(:),i3inver(:)
1812  real(dp),allocatable :: gg_hp(:,:)
1813 
1814 ! *************************************************************************
1815 
1816  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
1817 
1818  idist = ldx*ldy*ldz
1819  nhp = (nx/2+1)*ny*nz
1820  odist = nhp
1821 
1822  stride = 1
1823  n      = (/nx,ny,nz/)
1824  inembed= (/ldx,ldy,ldz/)
1825  onembed= (/(nx/2+1),ny,nz/)
1826 
1827  ABI_MALLOC(gg_hp,(2,nhp*ndat))
1828 
1829 #ifdef DEV_RC_BUG
1830  if (ndat/=1) ABI_ERROR("ndat/=1 + MKL not coded")
1831 
1832  if (ANY( n /= inembed )) then
1833    ABI_ERROR("Augmentation not supported")
1834  end if
1835 
1836  call dfftw_plan_dft_r2c_3d(my_plan, nx, ny, nz, ff, gg_hp, my_flags)
1837  if (my_plan==NULL_PLAN) then
1838    ABI_ERROR("dfftw_plan_dft_r2c_3d returned NULL_PLAN")
1839  end if
1840 
1841  !fftw_plan fftw_plan_many_dft_r2c(int rank3, const int *n, int howmany,
1842  !  double *in, const int *inembed, int istride, int idist,
1843  !  fftw_complex *out, const int *onembed, int ostride, int odist, unsigned flags);
1844 #else
1845  my_plan = dplan_many_dft_r2c(rank3, n, ndat, ff, inembed, stride, idist, gg_hp, onembed, stride, odist, my_flags, nt_all)
1846 #endif
1847 
1848  ! Now perform the 3D FFT via FFTW. r2c are always ABI_FFTW_FORWARD
1849  call dfftw_execute_dft_r2c(my_plan, ff, gg_hp)
1850 
1851  call fftw3_destroy_plan(my_plan)
1852 
1853  ! FFTW returns not normalized FTs
1854  call ZDSCAL(nhp*ndat, one/(nx*ny*nz), gg_hp, 1)
1855 
1856  ! Reconstruct full FFT: Hermitian redundancy: out[i] is the conjugate of out[n-i]
1857  padx = (nx/2+1)
1858  ABI_MALLOC(i1inver,(padx))
1859  ABI_MALLOC(i2inver,(ny))
1860  ABI_MALLOC(i3inver,(nz))
1861 
1862  i1inver(1)=1
1863  do i1=2,padx
1864    i1inver(i1)=nx+2-i1
1865  end do
1866 
1867  i2inver(1)=1
1868  do i2=2,ny
1869    i2inver(i2)=ny+2-i2
1870  end do
1871 
1872  i3inver(1)=1
1873  do i3=2,nz
1874    i3inver(i3)=nz+2-i3
1875  end do
1876 
1877  igp=0
1878  do idat=1,ndat
1879    padatf = (idat-1)*ldx*ldy*ldz
1880    do i3=1,nz
1881      i3inv = i3inver(i3)
1882      do i2=1,ny
1883        i2inv = i2inver(i2)
1884        do i1=1,padx
1885          igp = igp+1
1886          igf = i1 + (i3-1)*ldx*ldy + (i2-1)*ldx + padatf
1887          gg(:,igf) =  gg_hp(:,igp)
1888          i1inv = i1inver(i1)
1889          if (i1inv/=i1) then
1890            imgf = i1inv + (i3inv-1)*ldx*ldy + (i2inv-1)*ldx + padatf
1891            gg(1,imgf) =  gg_hp(1,igp)
1892            gg(2,imgf) = -gg_hp(2,igp)
1893          end if
1894        end do
1895      end do
1896    end do
1897  end do
1898 
1899  ABI_FREE(i1inver)
1900  ABI_FREE(i2inver)
1901  ABI_FREE(i3inver)
1902  ABI_FREE(gg_hp)
1903 
1904 #else
1905  ABI_ERROR("FFTW3 support not activated")
1906  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz/))
1907  ABI_UNUSED(ff)
1908  ABI_UNUSED(gg(1,1))
1909  if (PRESENT(fftw_flags)) then
1910    ABI_UNUSED(fftw_flags)
1911  end if
1912 #endif
1913 
1914 end subroutine fftw3_r2c_op

m_fftw3/fftw3_seqfourdp [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_seqfourdp

FUNCTION

 Driver routine for 3D FFT of lengths nx, ny, nz. Mainly used for densities or potentials.
 FFT Transform is out-of-place

INPUTS

 cplex=1 if fofr is real, 2 if fofr is complex
 nx,ny,nz=Number of point along the three directions.
 ldx,ldy,ldz=Leading dimension of the array.
 ndat = Number of FFTS
 isign= +1 : fofg(G) => fofr(R);
        -1 : fofr(R) => fofg(G)
 fofg(2,ldx*ldy*ldz*ndat)=The array to be transformed.
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator. Defaults to FFTW_ESTIMATE.

OUTPUT

 fofr(cplex,ldx*ldy*ldz*ndat)=The FFT of fofg

SOURCE

294 subroutine fftw3_seqfourdp(cplex,nx,ny,nz,ldx,ldy,ldz,ndat,isign,fofg,fofr,fftw_flags)
295 
296 !Arguments ------------------------------------
297 !scalars
298  integer,intent(in) :: cplex,nx,ny,nz,ldx,ldy,ldz,ndat,isign
299  integer,optional,intent(in) :: fftw_flags
300 !arrays
301  real(dp),intent(inout) :: fofg(2*ldx*ldy*ldz*ndat)
302  real(dp),intent(inout) :: fofr(cplex*ldx*ldy*ldz*ndat)
303 
304 !Local variables-------------------------------
305 !scalars
306  integer,parameter :: iscale1 = 1
307  integer :: my_flags,ii,jj
308  complex(spc), allocatable :: work_sp(:)
309 
310 ! *************************************************************************
311 
312  my_flags = ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
313 
314  select case (cplex)
315  case (2)
316    ! Complex to Complex.
317    if (fftcore_mixprec == 1) then
318      ! Mixed precision: copyin + in-place + copyout
319      ABI_MALLOC(work_sp, (ldx*ldy*ldz*ndat))
320      if (isign == ABI_FFTW_BACKWARD) then ! +1
321        work_sp(:) = cmplx(fofg(1::2), fofg(2::2), kind=spc)
322      else if (isign == ABI_FFTW_FORWARD) then ! -1
323        work_sp(:) = cmplx(fofr(1::2), fofr(2::2), kind=spc)
324      else
325        ABI_BUG("Wrong isign")
326      end if
327 
328      call fftw3_c2c_ip_spc(nx, ny, nz, ldx, ldy, ldz, ndat, iscale1, isign, work_sp, fftw_flags=my_flags)
329 
330      if (isign == ABI_FFTW_BACKWARD) then ! +1
331        jj = 1
332        do ii=1,ldx*ldy*ldz*ndat
333          fofr(jj) = real(work_sp(ii), kind=dp)
334          fofr(jj+1) = aimag(work_sp(ii))
335          jj = jj + 2
336        end do
337      else if (isign == ABI_FFTW_FORWARD) then ! -1
338        jj = 1
339        do ii=1,ldx*ldy*ldz*ndat
340          fofg(jj) = real(work_sp(ii), kind=dp)
341          fofg(jj+1) = aimag(work_sp(ii))
342          jj = jj + 2
343        end do
344      end if
345      ABI_FREE(work_sp)
346 
347    else
348      ! double precision version.
349      select case (isign)
350      case (ABI_FFTW_BACKWARD) ! +1
351        call fftw3_many_dft_op(nx,ny,nz,ldx,ldy,ldz,ndat,isign,fofg,fofr,fftw_flags=my_flags)
352      case (ABI_FFTW_FORWARD)  ! -1
353        call fftw3_many_dft_op(nx,ny,nz,ldx,ldy,ldz,ndat,isign,fofr,fofg,fftw_flags=my_flags)
354      case default
355        ABI_BUG("Wrong isign")
356      end select
357    end if
358 
359  case (1)
360    ! Real case.
361    select case (isign)
362    case (ABI_FFTW_FORWARD)
363      ! -1; R --> G
364      call fftw3_r2c_op(nx,ny,nz,ldx,ldy,ldz,ndat,fofr,fofg,fftw_flags=my_flags)
365    case (ABI_FFTW_BACKWARD)
366      ! +1; G --> R
367      call fftw3_c2r_op(nx,ny,nz,ldx,ldy,ldz,ndat,fofg,fofr,fftw_flags=my_flags)
368    case default
369      ABI_BUG("Wrong isign")
370    end select
371 
372  case default
373    ABI_BUG(" Wrong value for cplex")
374  end select
375 
376 end subroutine fftw3_seqfourdp

m_fftw3/fftw3_seqfourwf [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_seqfourwf

FUNCTION

 Carry out composite Fourier transforms between real and reciprocal (G) space.
 Wavefunctions, contained in a sphere in reciprocal space,
 can be FFT to real space. They can also be FFT from real space
 to a sphere. Also, the density maybe accumulated, and a local potential can be applied.

 The different options are :
 - option=0 --> reciprocal to real space and output the result.
 - option=1 --> reciprocal to real space and accumulate the density.
 - option=2 --> reciprocal to real space, apply the local potential to the wavefunction
                in real space and produce the result in reciprocal space.
 - option=3 --> real space to reciprocal space.
                NOTE that in this case, fftalg=1x1 MUST be used. This may be changed in the future.

INPUTS

 cplex= if 1 , denpot is real, if 2 , denpot is complex
    (cplex=2 only allowed for option=2, and istwf_k=1)
    not relevant if option=0 or option=3, so cplex=0 can be used to minimize memory
 fofgin(2,npwin)=holds input wavefunction in G vector basis sphere.
                 (intent(in) but the routine sphere can modify it for another iflag)
 gboundin(2*mgfft+8,2)=sphere boundary info for reciprocal to real space
 gboundout(2*mgfft+8,2)=sphere boundary info for real to reciprocal space
 istwf_k=option parameter that describes the storage of wfs
 kg_kin(3,npwin)=reduced planewave coordinates, input
 kg_kout(3,npwout)=reduced planewave coordinates, output
 mgfft=maximum size of 1D FFTs
 ndat=number of FFT to do in //
 ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 npwin=number of elements in fofgin array (for option 0, 1 and 2)
 npwout=number of elements in fofgout array (for option 2 and 3)
 ldx,ldy,ldz=ngfft(4),ngfft(5),ngfft(6), dimensions of fofr.
 option= if 0: do direct FFT
         if 1: do direct FFT, then sum the density
         if 2: do direct FFT, multiply by the potential, then do reverse FFT
         if 3: do reverse FFT only
 weight_r=weight to be used for the accumulation of the density in real space
         (needed only when option=1)

OUTPUT

  (see side effects)

SIDE EFFECTS

 Input/Output
 for option==0, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere;
                fofr(2,ldx*ldy*ldz) contains the output Fourier Transform of fofgin;
                no use of denpot, fofgout and npwout.
 for option==1, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere;
                denpot(cplex*ldx,ldy,ldz) contains the input density at input,
                and the updated density at output (accumulated);
                no use of fofgout and npwout.
 for option==2, fofgin(2,npwin*ndat)=holds input wavefunction in G sphere;
                denpot(cplex*ldx,ldy,ldz) contains the input local potential;
                fofgout(2,npwout*ndat) contains the output function;
 for option==3, fofr(2,ldx*ldy*ldz*ndat) contains the input real space wavefunction;
                fofgout(2,npwout*ndat) contains its output Fourier transform;
                no use of fofgin and npwin.

SOURCE

444 subroutine fftw3_seqfourwf(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k, &
445                           kg_kin,kg_kout,mgfft,ndat,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
446 
447 !Arguments ------------------------------------
448 !scalars
449  integer,intent(in) :: cplex,istwf_k,ldx,ldy,ldz,ndat,npwin,npwout,option,mgfft
450  real(dp),intent(in) :: weight_i,weight_r
451 !arrays
452  integer,intent(in) :: gboundin(2*mgfft+8,2),gboundout(2*mgfft+8,2)
453  integer,intent(in) :: kg_kin(3,npwin),kg_kout(3,npwout),ngfft(18)
454  real(dp),intent(inout) :: denpot(cplex*ldx,ldy,ldz),fofgin(2,npwin*ndat)
455  real(dp),intent(inout) :: fofr(2,ldx*ldy*ldz*ndat)
456  real(dp),intent(out) :: fofgout(2,npwout*ndat)
457 
458 !Local variables-------------------------------
459 !scalars
460  integer,parameter :: me_g0=1,ndat1=1
461  integer :: nx,ny,nz,fftalg,fftalga,fftalgc,fftcache,dat,ptg,ptr,ptgin,ptgout,nthreads
462  character(len=500) :: msg
463  logical :: use_fftrisc
464 !arrays
465  !real(dp),allocatable :: saveden(:,:,:)
466 #if 0
467  logical :: use_fftbox
468  integer,parameter :: shiftg(3)=(/0,0,0/)
469  integer :: symm(3,3)
470 #endif
471 
472 ! *************************************************************************
473 
474  if (all(option /= [0, 1, 2, 3])) then
475    write(msg,'(a,i0,a)')' Option:',option,' is not allowed. Only option=0, 1, 2 or 3 are allowed presently.'
476    ABI_ERROR(msg)
477  end if
478 
479  if (option == 1 .and. cplex /= 1) then
480    ABI_ERROR(sjoin("With option number 1, cplex must be 1 but it is cplex:", itoa(cplex)))
481  end if
482 
483  if (option==2 .and. (cplex/=1 .and. cplex/=2)) then
484    ABI_ERROR(sjoin("With the option number 2, cplex must be 1 or 2, but it is cplex:", itoa(cplex)))
485  end if
486 
487  nx=ngfft(1); ny=ngfft(2); nz=ngfft(3)
488  fftalg=ngfft(7); fftalga=fftalg/100; fftalgc=mod(fftalg,10)
489  fftcache=ngfft(8)
490 
491  use_fftrisc = (fftalgc==2)
492  if (istwf_k==2.and.option==3) use_fftrisc = .FALSE.
493  if (istwf_k>2.and.ANY(option==(/0,3/))) use_fftrisc = .FALSE.
494 
495  nthreads = xomp_get_num_threads(open_parallel=.TRUE.)
496 
497  if (use_fftrisc) then
498    !call wrtout(std_out, calling fftw3_fftrisc")
499 
500    if (ndat == 1) then
501      if (fftcore_mixprec == 0) then
502        call fftw3_fftrisc_dp(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
503          mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
504      else
505        call fftw3_fftrisc_mixprec(cplex,denpot,fofgin,fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
506          mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
507      end if
508 
509    else
510      ! All this boilerplate code is needed because the caller might pass zero-sized arrays
511      ! for the arguments that are not referenced and we don't want to have problems at run-time.
512      ! Moreover option 1 requires a special treatment when threads are started at this level.
513 
514      SELECT CASE (option)
515      CASE (0)
516        !
517        ! fofgin -> fofr, no use of denpot, fofgout and npwout.
518        if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then
519          do dat=1,ndat
520            ptg = 1 + (dat-1)*npwin
521            ptr = 1 + (dat-1)*ldx*ldy*ldz
522            call fftw3_fftrisc_dp(cplex,denpot,fofgin(1,ptg),fofgout,fofr(1,ptr),gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
523 &            mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
524          end do
525        else
526 !$OMP PARALLEL DO PRIVATE(ptg,ptr)
527          do dat=1,ndat
528            ptg = 1 + (dat-1)*npwin
529            ptr = 1 + (dat-1)*ldx*ldy*ldz
530            call fftw3_fftrisc_dp(cplex,denpot,fofgin(1,ptg),fofgout,fofr(1,ptr),gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
531 &            mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
532          end do
533        end if
534 
535      CASE (1)
536        !fofgin -> local ur and accumulate density in denpot
537        ! TODO this is delicate part to do in parallel, as one should OMP reduce denpot.
538        ! but this causes problems with the stack.
539 
540        do dat=1,ndat
541          ptg = 1 + (dat-1)*npwin
542          ptr = 1 + (dat-1)*ldx*ldy*ldz
543          call fftw3_fftrisc_dp(cplex,denpot,fofgin(1,ptg),fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
544 &          mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
545        end do
546 
547        ! This version doesn't seem efficient
548        !!!  !$OMP PARALLEL PRIVATE(ptg,ptr,saveden)
549        !!!         ABI_MALLOC(saveden, (ldx,ldy,ldz))
550        !!!         saveden = zero
551        !!!  !$OMP DO
552        !!!         do dat=1,ndat
553        !!!           ptg = 1 + (dat-1)*npwin
554        !!!           ptr = 1 + (dat-1)*ldx*ldy*ldz
555        !!!           call fftw3_fftrisc_dp(cplex,saveden,fofgin(1,ptg),fofgout,fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
556        !!!  &          mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r)
557        !!!         end do
558        !!!  !$OMP END DO NOWAIT
559        !!!  !$OMP CRITICAL (OMPC_addrho)
560        !!!         denpot = denpot + saveden
561        !!!  !$OMP END CRITICAL (OMPC_addrho)
562        !!!         ABI_FREE(saveden)
563        !!!  !$OMP END PARALLEL
564 
565      CASE (2)
566        ! <G|vloc(r)|fofgin(r)> in fofgout
567        if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then
568          do dat=1,ndat
569            ptgin  = 1 + (dat-1)*npwin
570            ptgout = 1 + (dat-1)*npwout
571            if (fftcore_mixprec == 0) then
572              call fftw3_fftrisc_dp(cplex,denpot,fofgin(1,ptgin),fofgout(1,ptgout),fofr,gboundin,gboundout,&
573                  istwf_k,kg_kin,kg_kout,mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
574            else
575              call fftw3_fftrisc_mixprec(cplex,denpot,fofgin(1,ptgin),fofgout(1,ptgout),fofr,gboundin,gboundout,&
576                  istwf_k,kg_kin,kg_kout,mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
577            end if
578          end do
579        else
580 !$OMP PARALLEL DO PRIVATE(ptgin,ptgout)
581          do dat=1,ndat
582            ptgin  = 1 + (dat-1)*npwin
583            ptgout = 1 + (dat-1)*npwout
584            call fftw3_fftrisc_dp(cplex,denpot,fofgin(1,ptgin),fofgout(1,ptgout),fofr,gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
585 &            mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
586          end do
587        end if
588 
589      CASE (3)
590        ! fofr -> fofgout
591        if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then
592          do dat=1,ndat
593            ptr    = 1 + (dat-1)*ldx*ldy*ldz
594            ptgout = 1 + (dat-1)*npwout
595            call fftw3_fftrisc_dp(cplex,denpot,fofgin,fofgout(1,ptgout),fofr(1,ptr),gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
596 &            mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
597          end do
598        else
599 !$OMP PARALLEL DO PRIVATE(ptr,ptgout)
600          do dat=1,ndat
601            ptr    = 1 + (dat-1)*ldx*ldy*ldz
602            ptgout = 1 + (dat-1)*npwout
603            call fftw3_fftrisc_dp(cplex,denpot,fofgin,fofgout(1,ptgout),fofr(1,ptr),gboundin,gboundout,istwf_k,kg_kin,kg_kout,&
604 &            mgfft,ngfft,npwin,npwout,ldx,ldy,ldz,option,weight_r,weight_i)
605          end do
606        end if
607 
608      CASE DEFAULT
609        write(msg,'(a,i0,a)')'Option',option,' is not allowed. Only option=0, 1, 2 or 3 are allowed presently.'
610        ABI_ERROR(msg)
611      END SELECT
612 
613    end if
614 
615  else
616 
617 #if 1
618    SELECT CASE (option)
619    CASE (0)
620      !
621      ! FFT u(g) --> u(r)
622      if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then
623        call fftw3_fftug_dp(fftalg,fftcache,npwin,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_kin,gboundin,fofgin,fofr)
624      else
625 !$OMP PARALLEL DO PRIVATE(ptg, ptr)
626        do dat=1,ndat
627          ptg = 1 + (dat-1)*npwin
628          ptr = 1 + (dat-1)*ldx*ldy*ldz
629          call fftw3_fftug_dp(fftalg,fftcache,npwin,nx,ny,nz,ldx,ldy,ldz,ndat1,&
630 &          istwf_k,mgfft,kg_kin,gboundin,fofgin(1,ptg),fofr(1,ptr))
631        end do
632      end if
633 
634    CASE (1)
635      ! TODO this is delicate part to do in parallel, as one should OMP reduce denpot.
636      call fftw3_fftug_dp(fftalg,fftcache,npwin,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_kin,gboundin,fofgin,fofr)
637      call cg_addtorho(nx,ny,nz,ldx,ldy,ldz,ndat,weight_r,weight_i,fofr,denpot)
638 
639    CASE (2)
640 
641      if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then
642        call fftw3_fftug_dp(fftalg,fftcache,npwin,nx,ny,nz,ldx,ldy,ldz,ndat,istwf_k,mgfft,kg_kin,gboundin,fofgin,fofr)
643        call cg_vlocpsi(nx,ny,nz,ldx,ldy,ldz,ndat,cplex,denpot,fofr)
644 
645        !  The data for option==2 is now in fofr.
646        call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,-1,gboundout)
647 
648        call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat,npwout,kg_kout,fofr,fofgout)
649      else
650 
651 !$OMP PARALLEL DO PRIVATE(ptg, ptr)
652        do dat=1,ndat
653          ptg = 1 + (dat-1)*npwin
654          ptr = 1 + (dat-1)*ldx*ldy*ldz
655          call fftw3_fftug_dp(fftalg,fftcache,npwin,nx,ny,nz,ldx,ldy,ldz,ndat1,&
656 &          istwf_k,mgfft,kg_kin,gboundin,fofgin(1,ptg),fofr(1,ptr))
657 
658          call cg_vlocpsi(nx,ny,nz,ldx,ldy,ldz,ndat1,cplex,denpot,fofr(1,ptr))
659 
660          !  The data for option==2 is now in fofr.
661          call fftw3_fftpad_dp(fofr(1,ptr),nx,ny,nz,ldx,ldy,ldz,ndat1,mgfft,-1,gboundout)
662 
663          ptg = 1 + (dat-1)*npwout
664          call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat1,npwout,kg_kout,fofr(1,ptr),fofgout(1,ptg))
665        end do
666      end if
667 
668    CASE (3)
669      !  The data for option==3 is already in fofr.
670      if (.not.fftw3_spawn_threads_here(ndat,nthreads)) then
671        call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,-1,gboundout)
672        call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat,npwout,kg_kout,fofr,fofgout)
673      else
674 !$OMP PARALLEL DO PRIVATE(ptg, ptr)
675        do dat=1,ndat
676          ptg = 1 + (dat-1)*npwout
677          ptr = 1 + (dat-1)*ldx*ldy*ldz
678          call fftw3_fftpad_dp(fofr(1,ptr),nx,ny,nz,ldx,ldy,ldz,ndat1,mgfft,-1,gboundout)
679          call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat1,npwout,kg_kout,fofr(1,ptr),fofgout(1,ptg))
680        end do
681      end if
682 
683    CASE DEFAULT
684      write(msg,'(a,i0,a)')'Option',option,' is not allowed. Only option=0, 1, 2 or 3 are allowed presently.'
685      ABI_ERROR(msg)
686    END SELECT
687 
688 
689 #else
690    symm=0; symm(1,1)=1; symm(2,2)=1; symm(3,3)=1
691    use_fftbox = .FALSE.
692 #ifdef HAVE_OPENMP
693    use_fftbox = (ndat>1)
694 #endif
695    !use_fftbox = .TRUE.
696 
697    SELECT CASE (option)
698    CASE (0)
699      !
700      ! FFT u(g) --> u(r)
701      call sphere(fofgin,ndat,npwin,fofr,nx,ny,nz,ldx,ldy,ldz,kg_kin,istwf_k,1,me_g0,shiftg,symm,one)
702 
703      if (use_fftbox) then
704        call fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,ABI_FFTW_BACKWARD,fofr)
705      else
706        call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,ABI_FFTW_BACKWARD,gboundin)
707      end if
708 
709    CASE (1)
710      ! TODO this is delicate part to do in parallel, as one should OMP reduce denpot.
711 
712      call sphere(fofgin,ndat,npwin,fofr,nx,ny,nz,ldx,ldy,ldz,kg_kin,istwf_k,1,me_g0,shiftg,symm,one)
713 
714      if (use_fftbox) then
715        call fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,ABI_FFTW_BACKWARD,fofr)
716      else
717        call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,ABI_FFTW_BACKWARD,gboundin)
718      end if
719 
720      call cg_addtorho(nx,ny,nz,ldx,ldy,ldz,ndat,weight_r,weight_i,fofr,denpot)
721 
722    CASE (2)
723 
724      call sphere(fofgin,ndat,npwin,fofr,nx,ny,nz,ldx,ldy,ldz,kg_kin,istwf_k,1,me_g0,shiftg,symm,one)
725 
726      if (use_fftbox) then
727        call fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,ABI_FFTW_BACKWARD,fofr)
728      else
729        call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,ABI_FFTW_BACKWARD,gboundin)
730      end if
731 
732      call cg_vlocpsi(nx,ny,nz,ldx,ldy,ldz,ndat,cplex,denpot,fofr)
733 
734      ! The data for option==2 is now in fofr.
735      if (use_fftbox) then
736        call fftw3_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,ABI_FFTW_FORWARD,fofr)
737      else
738        call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,ABI_FFTW_FORWARD,gboundout)
739      end if
740 
741      call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat,npwout,kg_kout,fofr,fofgout)
742 
743    CASE (3)
744      !  The data for option==3 is already in fofr.
745      call fftw3_fftpad_dp(fofr,nx,ny,nz,ldx,ldy,ldz,ndat,mgfft,ABI_FFTW_FORWARD,gboundout)
746 
747      call cg_box2gsph(nx,ny,nz,ldx,ldy,ldz,ndat,npwout,kg_kout,fofr,fofgout)
748 
749    CASE DEFAULT
750      write(msg,'(a,i0,a)')'Option',option,' is not allowed. Only option=0, 1, 2 or 3 are allowed presently.'
751      ABI_ERROR(msg)
752    END SELECT
753 #endif
754  end if
755 
756 end subroutine fftw3_seqfourwf

m_fftw3/fftw3_set_nthreads [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3_set_nthreads

FUNCTION

  This function sets the number of threads you want FFTW3 to use (or actually, the maximum number).
  It also performs any one-time initialization required to use FFTW3 threads.
  All plans subsequently created with any planner routine will use nthreads threads.
  If you pass an nthreads argument of 1 (the default), threads are disabled for subsequent plans.
  It does nothing if HAVE_FFT_FFTW3_THREADS is not defined.

INPUTS

  [nthreads]=The number of threads you want FFTW3 to use.  Default xomp_get_max_threads()

SOURCE

2330 subroutine fftw3_set_nthreads(nthreads)
2331 
2332 !Arguments ------------------------------------
2333  integer,optional,intent(in) :: nthreads
2334 
2335 !Local variables ------------------------------
2336 !scalars
2337 #ifdef HAVE_FFTW3_THREADS
2338  integer :: istat,nt
2339  integer,parameter :: enough=1
2340  integer,save :: nwarns=0
2341 #endif
2342 
2343 ! *************************************************************************
2344 
2345 #ifdef HAVE_FFTW3_THREADS
2346  if (THREADS_INITED==0) then
2347    ABI_WARNING("Threads are not initialized")
2348  end if
2349 
2350  if (PRESENT(nthreads)) then
2351    if (nthreads<=0) then
2352      nt = xomp_get_max_threads()
2353    else
2354      nt = nthreads
2355    end if
2356  else
2357    nt = xomp_get_max_threads()
2358  end if
2359 
2360  call dfftw_plan_with_nthreads(nt)
2361 
2362 #ifndef HAVE_OPENMP
2363   if (nwarns <= enough) then
2364     nwarns = nwarns + 1
2365     ABI_WARNING("Using FFTW3 with threads but HAVE_OPENMP is not defined!")
2366   end if
2367 #endif
2368 
2369 #else
2370  if (PRESENT(nthreads)) then
2371    ABI_UNUSED(nthreads)
2372  end if
2373 #endif
2374 
2375 end subroutine fftw3_set_nthreads

m_fftw3/fftw3_spawn_threads_here [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_spawn_threads_here

FUNCTION

  Helper function that returns true if FFT calls should be OMP
  parallelized in the client code.

INPUTS

  ndat=Number of FFT transforms to do
  nthreads = Number of threads available

SOURCE

3039 function fftw3_spawn_threads_here(ndat,nthreads) result(ans)
3040 
3041 !Arguments ------------------------------------
3042 !scalars
3043  integer,intent(in) :: ndat,nthreads
3044  logical :: ans
3045 
3046 ! *************************************************************************
3047 
3048  ans = .FALSE.
3049 #ifdef HAVE_OPENMP
3050  ans = (nthreads > 1 .and. MOD(ndat,nthreads) == 0 .and. .not. USE_LIB_THREADS)
3051 #else
3052  ABI_UNUSED((/ndat,nthreads/))
3053 #endif
3054 
3055 end function fftw3_spawn_threads_here

m_fftw3/fftw3_use_lib_threads [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

 fftw3_use_lib_threads

FUNCTION

INPUTS

SOURCE

3070 subroutine fftw3_use_lib_threads(logvar)
3071 
3072 !Arguments ------------------------------------
3073 !scalars
3074  logical,intent(in) :: logvar
3075 
3076 ! *************************************************************************
3077 
3078  USE_LIB_THREADS = logvar
3079 
3080 end subroutine fftw3_use_lib_threads

m_fftw3/fftw3mpi_many_dft_ip [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3mpi_many_dft_ip

FUNCTION

 Driver routine for many out-of-place 3D complex-to-complex FFTs of lengths nx, ny, nz.

INPUTS

 nx,ny,nz=Number of points along the three directions.
 ldx,ldy,ldz=Physical dimension of the fin and fout arrays (to avoid cache conflicts).
 ndat=Number of FFTs to be done.
 fin(2*ldx*ldy*ldz*ndat)=The complex array to be transformed.
 isign=sign of Fourier transform exponent: current convention uses
   +1 for transforming from G to r,
   -1 for transforming from r to G.
 comm_fft=MPI communicator for the FFT
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

TODO

  Remove me

OUTPUT

 fout(2,ldx*ldy*ldz*ndat)=The Fourier transform of fin.

SOURCE

3211 subroutine fftw3mpi_many_dft_ip(nx,ny,nz,ldx,ldy,ldz,ndat,isign,fin,fout,comm_fft,fftw_flags)
3212 
3213 !Arguments ------------------------------------
3214 !scalars
3215  integer,intent(in) :: nx,ny,nz,ldx,ldy,ldz,ndat,isign,comm_fft
3216  integer,optional,intent(in) :: fftw_flags
3217 !arrays
3218  real(dp),intent(in) :: fin(2,ldx,ldy,ldz*ndat)
3219  real(dp),intent(out) :: fout(2,ldx,ldy,ldz*ndat)
3220 
3221 #ifdef HAVE_FFTW3_MPI
3222 !Local variables-------------------------------
3223 !scalars
3224  integer,parameter :: rank3=3
3225  integer :: my_flags
3226  real(dp):: factor_fft
3227 !arrays
3228  type(C_PTR) :: plan, cdata
3229  complex(C_DOUBLE_COMPLEX), ABI_CONTIGUOUS pointer :: data(:,:,:)
3230  integer(C_INTPTR_T) :: i, j, k, alloc_local, local_n0, local_0_start,fft_sizes(4)
3231 
3232 !*************************************************************************
3233 
3234  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
3235 
3236  ! get local data size and allocate (note dimension reversal)
3237  fft_sizes = [nz,ny,nx,ndat]
3238 
3239  alloc_local = fftw_mpi_local_size_many( &
3240 &      rank3,fft_sizes(1:3),fft_sizes(4),&
3241 &      FFTW_MPI_DEFAULT_BLOCK, comm_fft, &
3242 &      local_n0,local_0_start)
3243 
3244  ! Allocate cdata, build the plane and copy data: fin --> data
3245  cdata = fftw_alloc_complex(alloc_local)
3246  call c_f_pointer(cdata, data, [fft_sizes(3),fft_sizes(2), local_n0])
3247 
3248  plan = fftw_mpi_plan_many_dft(rank3,fft_sizes(1:3),fft_sizes(4), &
3249 &                               FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, &
3250 &                               data,data,comm_fft,isign,my_flags)
3251 
3252  do k=1, local_n0*ndat
3253     do j=1, ny
3254        do i=1, nx
3255           data(i,j,k) = CMPLX( fin(1,i,j,k),fin(2,i,j,k),C_DOUBLE_COMPLEX)
3256        end do
3257     end do
3258  end do
3259 
3260  ! Compute transform.
3261  call fftw_mpi_execute_dft(plan, data, data)
3262 
3263  if(isign==ABI_FFTW_FORWARD) then
3264     ! Scale results.
3265     factor_fft = one / (nx*ny*nz)
3266     do k=1, local_n0*ndat
3267        do j=1, ny
3268           do i=1, nx
3269              fout(1,i,j,k) =  real(data(i,j,k)) * factor_fft
3270              fout(2,i,j,k) = aimag(data(i,j,k)) * factor_fft
3271           end do
3272        end do
3273     end do
3274  end if
3275 
3276  call fftw_destroy_plan(plan)
3277  call fftw_free(cdata)
3278 
3279 #else
3280  ABI_ERROR("FFTW3_MPI support not activated")
3281  ABI_UNUSED((/nx,ny,nz,ldx,ldy,ldz,ndat,isign/))
3282  ABI_UNUSED(comm_fft)
3283  if (PRESENT(fftw_flags)) then
3284     ABI_UNUSED(fftw_flags)
3285  end if
3286  ABI_UNUSED(fin(1,1,1,1))
3287  ABI_UNUSED(fout(1,1,1,1))
3288 #endif
3289 
3290 end subroutine fftw3mpi_many_dft_ip

m_fftw3/fftw3mpi_many_dft_tr [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftw3mpi_many_dft_tr

FUNCTION

 Driver routine for many out-of-place 3D complex-to-complex FFTs of lengths nx, ny, nz.

INPUTS

 nx,ny,nz=Number of points along the three directions.
 ldx,ldy,ldz=Physical dimension of the fin and fout arrays (to avoid cache conflicts).
 ndat=Number of FFTs to be done.
 fin(2*ldx*ldy*ldz*ndat)=The complex array to be transformed.
 isign=sign of Fourier transform exponent: current convention uses
   +1 for transforming from G to r,
   -1 for transforming from r to G.
 comm_fft=MPI communicator for the FFT.
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.
   Defaults to ABI_FFTW_ESTIMATE.

TODO

  Remove me

OUTPUT

 fout(2,ldx*ldy*ldz*ndat)=The Fourier transform of fin.

SOURCE

3322 subroutine fftw3mpi_many_dft_tr(nx,ny,nz,ndat,isign,fin,fout,comm_fft,fftw_flags)
3323 
3324 !Arguments ------------------------------------
3325 !scalars
3326  integer,intent(in) :: nx,ny,nz,ndat,isign,comm_fft
3327  integer,optional,intent(in) :: fftw_flags
3328 !arrays
3329  complex(C_DOUBLE_COMPLEX),ABI_CONTIGUOUS pointer  :: fin(:,:,:)
3330  complex(C_DOUBLE_COMPLEX),ABI_CONTIGUOUS pointer :: fout(:,:,:)
3331 
3332 !Local variables-------------------------------
3333 #ifdef HAVE_FFTW3_MPI
3334 !scalars
3335  integer :: my_flags
3336  !FFTWMPI stuff
3337  type(C_PTR) :: plan
3338  integer(C_INTPTR_T) :: fft_sizes(4)
3339 
3340 !*************************************************************************
3341 
3342  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
3343  my_flags = ior(my_flags,FFTW_DESTROY_INPUT)
3344 
3345  fft_sizes(1)=nz
3346  fft_sizes(2)=ny
3347  fft_sizes(3)=nx
3348  fft_sizes(4)=ndat
3349 
3350  plan = fftw_mpi_plan_many_dft(3,fft_sizes(1:3),fft_sizes(4), &
3351 &                              FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, &
3352 &                              fin,fout,comm_fft,isign,my_flags)
3353 
3354 !Compute transform (as many times as desired)
3355  call fftw_mpi_execute_dft(plan, fin, fout)
3356  call fftw_destroy_plan(plan)
3357 
3358 #else
3359  ABI_ERROR("FFTW3_MPI support not activated")
3360  ABI_UNUSED((/nx,ny,nz,ndat,isign,comm_fft/))
3361  if (PRESENT(fftw_flags)) then
3362     ABI_UNUSED(fftw_flags)
3363  end if
3364  ABI_UNUSED(fin(1,1,1))
3365  ABI_UNUSED(fout(1,1,1))
3366 #endif
3367 
3368 end subroutine fftw3mpi_many_dft_tr

m_fftw3/fftwmpi_free_work_array [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftwmpi_free_work_array

FUNCTION

  routine for freeing fftw work arrray

INPUTS

OUTPUT

 cdata_f,cdata_r: C pointers to free for fourier andreal data

SOURCE

3161 subroutine fftwmpi_free_work_array(cdata_f,cdata_r)
3162 
3163 !Arguments ------------------------------------
3164 !scalars
3165  type(C_PTR), intent(inout) :: cdata_f,cdata_r
3166 
3167 ! *************************************************************************
3168 
3169 #ifdef HAVE_FFTW3_MPI
3170  call fftw_free(cdata_r)
3171  call fftw_free(cdata_f)
3172 #else
3173  ABI_ERROR("FFTW3_MPI support not activated")
3174  if(.false.) then
3175    cdata_r = C_NULL_PTR; cdata_f = C_NULL_PTR
3176  end if
3177 #endif
3178 
3179 end subroutine fftwmpi_free_work_array

m_fftw3/fftwmpi_get_work_array [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  fftwmpi_get_work_array

FUNCTION

 Driver routine for allocate fftw work arrray for 3D complex-to-complex FFTs of lengths nx, ny, nz.

INPUTS

 nx,ny,nz=Number of points along the three directions.
 ndat=Number of FFTs to be done.
 comm_fft=MPI communicator.

OUTPUT

 cdata_f,cdata_r: C pointers to use for fourier andreal data
 n0,n0_tr : local size on the shared dimension (nz or ny if transposed mode is used)
 offset,offset_tr : offset per process in continuous tabx

SOURCE

3104 subroutine fftwmpi_get_work_array(cdata_f,cdata_r,rank,nx,ny,nz,ndat,comm_fft,n0,offset,n0_tr,offset_tr)
3105 
3106 !Arguments ------------------------------------
3107 !scalars
3108  integer,intent(in) :: nx,ny,nz,ndat,rank,comm_fft
3109  integer(C_INTPTR_T), intent(out) :: n0, offset, n0_tr, offset_tr
3110  type(C_PTR), intent(out) :: cdata_f,cdata_r
3111 
3112 !Local variables-------------------------------
3113 #ifdef HAVE_FFTW3_MPI
3114 !scalars
3115  integer(C_INTPTR_T) :: alloc_local
3116 !arrays
3117  integer(C_INTPTR_T) :: fft_sizes(4)
3118 
3119 ! *************************************************************************
3120 
3121  ! Dimensions are inverted here (C interface).
3122  fft_sizes(1)=nz
3123  fft_sizes(2)=ny
3124  fft_sizes(3)=nx
3125  fft_sizes(4)=ndat
3126 
3127  alloc_local = fftw_mpi_local_size_many_transposed(rank,fft_sizes(1:3),fft_sizes(4), &
3128 &      FFTW_MPI_DEFAULT_BLOCK,FFTW_MPI_DEFAULT_BLOCK, comm_fft, &
3129 &      n0,offset, &
3130 &      n0_tr,offset_tr)
3131 
3132  cdata_f = fftw_alloc_complex(alloc_local)
3133  cdata_r = fftw_alloc_complex(alloc_local)
3134 
3135 #else
3136   ABI_ERROR("FFTW3_MPI support not activated")
3137   ABI_UNUSED((/nx,ny,nz,ndat,rank,comm_fft/))
3138   cdata_f = C_NULL_PTR; cdata_r = C_NULL_PTR
3139   n0 = 0; offset = 0; n0_tr = 0; offset_tr = 0
3140 #endif
3141 
3142 end subroutine fftwmpi_get_work_array

m_fftw3/old_fftw3_mpifourdp [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

  old_fftw3_mpifourdp

FUNCTION

INPUTS

 cplex=1 if fofr is real, 2 if fofr is complex
 nfft=(effective) number of FFT grid points (for this processor)
 ndat=Number of FFTs to be done.
 isign= +1 : fofg(G) => fofr(R);
        -1 : fofr(R) => fofg(G)
 fftn2_distrib(n2)=  rank of the processor which own fft planes in 2nd dimension for fourdp
 ffti2_local(n2) = local i2 indices in fourdp
 fftn3_distrib(n3) = rank of the processor which own fft planes in 3rd dimension for fourdp
 ffti3_local(n3) = local i3 indices in fourdp
 comm_fft=MPI communicator for the FFT
 [fftw_flags]=Flags used to create the plan. They can be combined with the "+" operator.

SIDE EFFECTS

 Input/Output
 fofg(2,nfft*ndat)=f(G), complex.
 fofr(cplex*nfft*ndat)=input function f(r) (real or complex)

SOURCE

3685 subroutine old_fftw3_mpifourdp(cplex,nfft,ngfft,ndat,isign,&
3686   fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags)
3687 
3688 !Arguments ------------------------------------
3689 !scalars
3690  integer,intent(in) :: cplex,nfft,ndat,isign,comm_fft
3691  integer,optional,intent(in) :: fftw_flags
3692 !arrays
3693  integer,intent(in) :: ngfft(18)
3694  integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2))
3695  integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3))
3696  real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat)
3697 
3698 #ifdef HAVE_FFTW3_MPI
3699 !Local variables-------------------------------
3700 !scalars
3701  integer :: nx,ny,nz,my_flags
3702 
3703 ! *************************************************************************
3704 
3705  my_flags=ABI_FFTW_ESTIMATE; if (PRESENT(fftw_flags)) my_flags= fftw_flags
3706 
3707  nx=ngfft(1); ny=ngfft(2); nz=ngfft(3)
3708  !me_fft=ngfft(11); nproc_fft=ngfft(10)
3709 
3710  select case (cplex)
3711 
3712  case (1)
3713 
3714    ! Complex to Complex.
3715    ! This one is ok when ndat > 1
3716    !call fftw3_mpifourdp_c2c(cplex,nfft,ngfft,ndat,isign,&
3717    !& fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags=my_flags)
3718    !return
3719 
3720    ! r2c or c2r case.
3721    ! FIXME this one is buggy when ndat > 1
3722    select case (isign)
3723    case (ABI_FFTW_FORWARD)
3724      ! +1; R --> G
3725     call fftw3_mpifourdp_r2c(nfft,ngfft,ndat,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,&
3726 &     fofg,fofr,comm_fft,fftw_flags=my_flags)
3727 
3728    case (ABI_FFTW_BACKWARD)
3729      ! -1; G --> R
3730     call fftw3_mpifourdp_c2r(nfft,ngfft,ndat,fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,&
3731 &     fofg,fofr,comm_fft,fftw_flags=my_flags)
3732 
3733    case default
3734      ABI_BUG("Wrong isign")
3735    end select
3736 
3737  case (2)
3738    ! Complex to Complex.
3739    call fftw3_mpifourdp_c2c(cplex,nfft,ngfft,ndat,isign,&
3740 &    fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft,fftw_flags=my_flags)
3741 
3742  case default
3743    ABI_BUG(" Wrong value for cplex")
3744  end select
3745 
3746 #else
3747  ABI_ERROR("FFTW3_MPI support not activated")
3748  ABI_UNUSED((/cplex,nfft,ngfft(1),ndat,isign,comm_fft/))
3749  ABI_UNUSED((/fftn2_distrib(1),ffti2_local(1)/))
3750  ABI_UNUSED((/fftn3_distrib(1),ffti3_local(1)/))
3751  if (PRESENT(fftw_flags)) then
3752     ABI_UNUSED(fftw_flags)
3753  end if
3754  ABI_UNUSED(fofg(1,1))
3755  ABI_UNUSED(fofr(1))
3756 #endif
3757 
3758 end subroutine old_fftw3_mpifourdp

m_fftw3/zplan_many_dft [ Functions ]

[ Top ] [ m_fftw3 ] [ Functions ]

NAME

FUNCTION

INPUTS

SIDE EFFECTS

SOURCE

2661 !! FIXME  technically it should be intent(inout) since FFTW3 can destroy the input for particular flags.
2662 
2663 function zplan_many_dft(rank,n,howmany,fin,inembed,istride,idist,fout,onembed,ostride,odist,sign,flags,nthreads) result(plan)
2664 
2665 !Arguments ------------------------------------
2666 !scalars
2667  integer,intent(in) :: rank,howmany,istride,ostride, sign,flags,idist,odist,nthreads
2668  integer,intent(in) :: n(rank),inembed(rank),onembed(rank)
2669  integer(KIND_FFTW_PLAN) :: plan
2670 !arrays
2671  complex(dpc) :: fin(*),fout(*)
2672 
2673 !Local variables-------------------------------
2674  character(len=500) :: msg,frmt
2675 
2676 ! *************************************************************************
2677 
2678 !$OMP CRITICAL (OMPC_zplan_many_dft)
2679  call fftw3_set_nthreads(nthreads)
2680 
2681  call dfftw_plan_many_dft(plan, rank, n, howmany, &
2682 &  fin, inembed, istride, idist, fout, onembed, ostride, odist, sign, flags)
2683 !$OMP END CRITICAL (OMPC_zplan_many_dft)
2684 
2685  if (plan==NULL_PLAN) then ! handle the error
2686    call wrtout(std_out, "dfftw_plan_many_dft returned NULL_PLAN (complex version)")
2687    write(frmt,*)"(a,",rank,"(1x,i0),3(a,i0),a,2(a,",rank,"(1x,i0),2(a,i0),a))"
2688    write(msg,frmt)&
2689 &    " n = ",n," howmany = ",howmany," sign = ",sign," flags = ",flags,ch10,&
2690 &    " inembed = ",inembed," istride = ",istride," idist =",idist,ch10,     &
2691 &    " onembed = ",onembed," ostride = ",ostride," odist =",idist,ch10
2692    call wrtout(std_out, msg)
2693    ABI_ERROR("Check FFTW library and/or abinit code")
2694  end if
2695 
2696 end function zplan_many_dft

m_m_fftw3/fftw3_mpifourdp [ Functions ]

[ Top ] [ Functions ]

NAME

 fftw3_mpifourdp

FUNCTION

 Conduct Fourier transform of REAL or COMPLEX function f(r)=fofr defined on
 fft grid in real space, to create complex f(G)=fofg defined on full fft grid
 in reciprocal space, in full storage mode, or the reverse operation.
 For the reverse operation, the final data is divided by nfftot.
 REAL case when cplex=1, COMPLEX case when cplex=2
 Usually used for density and potentials.

INPUTS

 cplex=1 if fofr is real, 2 if fofr is complex
 nfft=(effective) number of FFT grid points (for this processor)
 ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 ndat=Numbre of FFT transforms
 isign=sign of Fourier transform exponent: current convention uses
    +1 for transforming from G to r
    -1 for transforming from r to G.
 fftn2_distrib(2),ffti2_local(2)
 fftn3_distrib(3),ffti3_local(3)
 comm_fft=MPI communicator

SIDE EFFECTS

 Input/Output
 fofg(2,nfft)=f(G), complex.
 fofr(cplex*nfft)=input function f(r) (real or complex)

SOURCE

5216 subroutine fftw3_mpifourdp(cplex,nfft,ngfft,ndat,isign,&
5217 &  fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
5218 
5219 !Arguments ------------------------------------
5220 !scalars
5221  integer,intent(in) :: cplex,isign,nfft,ndat,comm_fft
5222 !arrays
5223  integer,intent(in) :: ngfft(18)
5224  integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2))
5225  integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3))
5226  real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat)
5227 
5228 !Local variables-------------------------------
5229 !scalars
5230  integer :: n1,n2,n3,n4,n5,n6,nd2proc,nd3proc,nproc_fft,me_fft
5231 !arrays
5232  real(dp),allocatable :: workf(:,:,:,:,:),workr(:,:,:,:,:)
5233 
5234 ! *************************************************************************
5235 
5236  ! Note the only c2c is supported in parallel.
5237  n1=ngfft(1); n2=ngfft(2); n3=ngfft(3)
5238  n4=ngfft(4); n5=ngfft(5); n6=ngfft(6)
5239  me_fft=ngfft(11); nproc_fft=ngfft(10)
5240 
5241  nd2proc=((n2-1)/nproc_fft) +1
5242  nd3proc=((n6-1)/nproc_fft) +1
5243  ABI_MALLOC(workr,(2,n4,n5,nd3proc,ndat))
5244  ABI_MALLOC(workf,(2,n4,n6,nd2proc,ndat))
5245 
5246  ! Complex to Complex
5247  ! TODO: Complex to Real
5248  select case (isign)
5249  case (1)
5250    ! G --> R
5251    call mpifft_fg2dbox(nfft,ndat,fofg,n1,n2,n3,n4,nd2proc,n6,fftn2_distrib,ffti2_local,me_fft,workf)
5252 
5253    call fftw3_mpiback(2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,2,workf,workr,comm_fft)
5254 
5255    call mpifft_dbox2fr(n1,n2,n3,n4,n5,nd3proc,ndat,fftn3_distrib,ffti3_local,me_fft,workr,cplex,nfft,fofr)
5256 
5257  case (-1)
5258    ! R --> G
5259    call mpifft_fr2dbox(cplex,nfft,ndat,fofr,n1,n2,n3,n4,n5,nd3proc,fftn3_distrib,ffti3_local,me_fft,workr)
5260 
5261    call fftw3_mpiforw(2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,2,workr,workf,comm_fft)
5262 
5263    ! Transfer FFT output to the original fft box.
5264    call mpifft_dbox2fg(n1,n2,n3,n4,nd2proc,n6,ndat,fftn2_distrib,ffti2_local,me_fft,workf,nfft,fofg)
5265 
5266  case default
5267    ABI_BUG("Wrong isign")
5268  end select
5269 
5270  ABI_FREE(workr)
5271  ABI_FREE(workf)
5272 
5273 end subroutine fftw3_mpifourdp