TABLE OF CONTENTS


ABINIT/fftprof [ Programs ]

[ Top ] [ Programs ]

NAME

 fftprof

FUNCTION

  Utility for profiling the FFT libraries supported by ABINIT.

COPYRIGHT

 Copyright (C) 2004-2024 ABINIT group (MG)
 This file is distributed under the terms of the
 GNU General Public License, see ~abinit/COPYING
 or http://www.gnu.org/copyleft/gpl.txt .

OUTPUT

  Timing analysis of the different FFT libraries and algorithms.

NOTES

  Point-group symmetries are not taken into account in getng during the generation
  of the FFT mesh. Therefore the FFT mesh might differ from the one
  found by abinit for the same cutoff and Bravais lattice (actually it might be smaller).

 Description of the input file (Fortran NAMELIST):

   &CONTROL
     tasks = string specifying the tasks to perform i.e. the routines that should be tested or profiled.
             allowed values:
                 fourdp --> Test FFT transforms of density and potentials on the full box.
                 fourwf --> Test FFT transforms of wavefunctions using the zero-padded algorithm.
                 gw_fft --> Test the FFT transforms used in the GW code.
                 all    --> Test all FFT routines (DEFAULT)
     fftalgs = list of fftalg values (used to select the FFT libraries to use, see abinit doc for more info)
     use_gpu_fftalgs = for each fftalg in fftalgs, it is set to 0 if the GPU version doesnt have
                       to be used, or to the value selecting the GPU implementation (see defs_basis.F90)
     ncalls = integer defining the number of calls for each tests. The final Wall time and CPU time
              are computed by averaging the final results over ncalls executions.
     max_nthreads = Maximum number of threads (DEFAULT 1, meaningful only if the code
                    uses threaded external libraries or OpenMP parallelization)
     ndat   = integer specifying how many FFT transforms should be executed for each call to the FFT routine
              (same meaning as the ndat input variable passed to fourwf)
     necut  = Used if tasks = "bench". Specifies the number of cutoff energies to profile (see also ecut_arth)
     ecut_arth = Used if tasks = "bench". Used to generate an arithmetic progression of cutoff energies
                 whose starting value is ecut_arth(1) and whose step is ecut_arth(2)

  &SYSTEM
     ecut = cutoff energy for wavefunctions (real, Hartree units)
     rprimd = Direct lattice vectors in Bohr. (3,3) matrix in Fortran column-major order
     kpoint = real(3) vector specifying the reduced coordinates of the k-point of the wavefunction (used if tasks = "fourwf").
               The value of the k-point defines the storage scheme (istwfk) of the u(G) coefficients and therefore
               the FFT algorithm used to perform the transform u(G) <--> u(R) in fourwf.
     osc_ecut  = cutoff energy (Hartree) for the oscillator matrix elements computed in the GW code
                 Corresponds to the input variables (ecuteps, ecutsigx) used in the main code.
     nsym     =Number of symmetry operations (DEFAULT 1)
     symrel(3,3,nsym) = Symmetry operation in real space used to select
               the FFT mesh in the routine getng (default: Identity matrix)
               NOTE that the real tnons is not taken into account anyhow: tnons is set to zero.

SOURCE

 59 #if defined HAVE_CONFIG_H
 60 #include "config.h"
 61 #endif
 62 
 63 #include "abi_common.h"
 64 
 65 program fftprof
 66 
 67  use defs_basis
 68  use m_xmpi
 69  use m_xomp
 70  use m_errors
 71  use m_abicore
 72  use m_dfti
 73 #ifdef HAVE_GPU_CUDA
 74  use m_gpu_toolbox
 75  use m_manage_cuda
 76 #endif
 77 
 78  use defs_abitypes,  only : MPI_type
 79  use m_build_info,   only : abinit_version
 80  use m_fstrings,     only : lower
 81  use m_specialmsg,   only : specialmsg_getcount, herald
 82  use m_argparse,     only : get_arg !, get_arg_list, get_start_step_num
 83  use m_io_tools,     only : flush_unit
 84  use m_geometry,     only : metric
 85  use m_fftcore,      only : get_cache_kb, get_kg, fftalg_isavailable, fftalg_has_mpi, getng, fftcore_set_mixprec
 86  use m_fft,          only : fft_use_lib_threads, fftbox_utests, fftu_utests, fftbox_mpi_utests, fftu_mpi_utests
 87  use m_fftw3,        only : fftw3_init_threads
 88  use m_fft_prof,     only : fft_test_t, fft_prof_t, fft_tests_free, fftprof_ncalls_per_test, fftprofs_free, &
 89 & fftprofs_print, prof_fourdp, prof_fourwf, prof_rhotwg
 90  use m_mpinfo,       only : destroy_mpi_enreg, initmpi_seq
 91 
 92  implicit none
 93 
 94 !Arguments -----------------------------------
 95 !scalars
 96  integer,parameter :: MAX_NFFTALGS = 50, MAX_NSYM = 48
 97  integer,parameter :: paral_kgb0 = 0, me_fft0 = 0, nproc_fft1 = 1, master = 0
 98  integer :: ii,fftcache,it,cplex,ntests,option_fourwf,osc_npw
 99  integer :: map2sphere,use_padfft,isign,nthreads,comm,nprocs,my_rank
100  integer :: iset,iall,inplace,nsets,avail,ith,idx,ut_nfft,ut_mgfft
101  integer :: nfftalgs,alg,fftalg,fftalga,fftalgc,nfailed,ierr,paral_kgb,abimem_level
102  character(len=24) :: codename
103  character(len=500) :: header,msg
104  real(dp),parameter :: boxcutmin2 = two
105  real(dp) :: ucvol, abimem_limit_mb
106  logical :: test_fourdp,test_fourwf,test_gw,do_seq_bench,do_seq_utests
107  logical :: iam_master,do_mpi_utests !,do_mpi_bench,
108  type(MPI_type) :: MPI_enreg
109 !arrays
110  integer :: ut_ngfft(18)
111 #ifdef HAVE_GPU_CUDA
112  integer :: gpu_devices(12)
113 #endif
114  real(dp),parameter :: gamma_point(3) = zero
115  real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3)
116  type(FFT_test_t),allocatable :: Ftest(:)
117  type(FFT_prof_t),allocatable :: Ftprof(:)
118  integer,allocatable :: osc_gvec(:,:), fft_setups(:,:),fourwf_params(:,:)
119 ! ==========  INPUT FILE ==============
120  integer :: ncalls = 10, max_nthreads = 1, ndat = 1, necut = 0, nsym = 1
121  integer :: mixprec = 0, gpu_option = 0, init_gpu_flavor
122  character(len=500) :: tasks="all"
123  integer :: fftalgs(MAX_NFFTALGS) = 0, use_gpu_fftalgs(MAX_NFFTALGS) = 0
124  integer :: symrel(3,3,MAX_NSYM) = 0
125  real(dp),parameter :: k0(3)=(/zero,zero,zero/)
126  real(dp) :: ecut = 30, osc_ecut = 3
127  real(dp) :: ecut_arth(2) = zero
128  real(dp) :: rprimd(3,3)
129  real(dp) :: kpoint(3) = (/0.1,0.2,0.3/)
130  real(dp) :: tnons(3,MAX_NSYM) = zero
131  logical :: use_lib_threads = .FALSE.
132  namelist /CONTROL/ tasks, ncalls, max_nthreads, ndat, fftalgs, use_gpu_fftalgs, &
133                     necut, ecut_arth, use_lib_threads, mixprec
134  namelist /SYSTEM/ ecut, rprimd, kpoint, osc_ecut, nsym, symrel
135 
136 ! *************************************************************************
137 
138  ! Change communicator for I/O (mandatory!)
139  call abi_io_redirect(new_io_comm=xmpi_world)
140 
141  call xmpi_init()
142  comm = xmpi_world
143  my_rank = xmpi_comm_rank(comm); nprocs = xmpi_comm_size(comm)
144  iam_master = (my_rank == master)
145 
146  ! Initialize memory profiling if it is activated
147  ! if a full abimem.mocc report is desired, set the argument of abimem_init to "2" instead of "0"
148  ! note that abimem.mocc files can easily be multiple GB in size so don't use this option normally
149  ABI_CHECK(get_arg("abimem-level", abimem_level, msg, default=0) == 0, msg)
150  ABI_CHECK(get_arg("abimem-limit-mb", abimem_limit_mb, msg, default=20.0_dp) == 0, msg)
151 #ifdef HAVE_MEM_PROFILING
152  call abimem_init(abimem_level, limit_mb=abimem_limit_mb)
153 #endif
154 
155  if (iam_master) then
156    ! Print header and read input file.
157    codename='FFTPROF'//REPEAT(' ',17)
158    call herald(codename,abinit_version,std_out)
159 
160    write(std_out,'(a)')" Tool for profiling and testing the FFT libraries used in ABINIT."
161    write(std_out,'(a)')" Allowed options are: "
162    write(std_out,'(a)')"   fourdp --> Test FFT transforms of density and potentials on the full box."
163    write(std_out,'(a)')"   fourwf --> Test FFT transforms of wavefunctions using the zero-pad algorithm."
164    write(std_out,'(a)')"   gw_fft --> Test the FFT transforms used in the GW code."
165    write(std_out,'(a)')"   all    --> Test all FFT routines."
166    !write(std_out,'(a)')"   seq-utests --> Units tests for sequential FFTs."
167    !write(std_out,'(a)')"   mpi-utests --> Units tests for MPI FFTs."
168    !write(std_out,'(a)')"   seq-bench  --> Benchmark mode for sequential FFTs."
169    !write(std_out,'(a)')"   mpi-bench  --> Benchmark mode for MPI FFTs."
170    write(std_out,'(a)')" "
171 
172    !Read input file
173    read(std_in, NML=CONTROL)
174    read(std_in, NML=SYSTEM)
175    !write(std_out, NML=CONTROL)
176    !write(std_out, NML=SYSTEM)
177    call lower(tasks)
178  end if
179 
180  ! Broadcast variables.
181  if (nprocs > 1) then
182    ! CONTROL
183    call xmpi_bcast(tasks,master,comm,ierr)
184    call xmpi_bcast(ncalls,master,comm,ierr)
185    call xmpi_bcast(max_nthreads,master,comm,ierr)
186    call xmpi_bcast(ndat,master,comm,ierr)
187    call xmpi_bcast(fftalgs,master,comm,ierr)
188    call xmpi_bcast(use_gpu_fftalgs,master,comm,ierr)
189    call xmpi_bcast(necut,master,comm,ierr)
190    call xmpi_bcast(ecut_arth,master,comm,ierr)
191    call xmpi_bcast(use_lib_threads,master,comm,ierr)
192    call xmpi_bcast(mixprec,master,comm,ierr)
193    ! SYSTEM
194    call xmpi_bcast(ecut,master,comm,ierr)
195    call xmpi_bcast(rprimd,master,comm,ierr)
196    call xmpi_bcast(kpoint,master,comm,ierr)
197    call xmpi_bcast(osc_ecut,master,comm,ierr)
198    call xmpi_bcast(nsym,master,comm,ierr)
199    call xmpi_bcast(symrel,master,comm,ierr)
200  end if
201 
202  ! Set precision for FFT libs.
203  ii = fftcore_set_mixprec(mixprec)
204 
205  ! replace "+" with white spaces
206  do ii=1,LEN_TRIM(tasks)
207    if (tasks(ii:ii) == "+") tasks(ii:ii) = " "
208  end do
209  !%call str_replace_chars(tasks,"+"," ")
210  tasks = " "//TRIM(tasks)//""
211 
212  iall=INDEX (tasks,"all")
213  test_fourdp = (iall>0 .or. INDEX(tasks," fourdp")>0 )
214  test_fourwf = (iall>0 .or. INDEX(tasks," fourwf")>0 )
215  test_gw     = (iall>0 .or. INDEX(tasks," gw_fft")>0 )
216 
217  do_seq_utests = INDEX(tasks," utests") >0
218  do_seq_bench  = INDEX(tasks," bench") >0
219  do_mpi_utests = INDEX(tasks," mpi-utests")>0
220  !do_mpi_bench  = INDEX(tasks," mpi-bench")>0
221 
222 #ifdef HAVE_FFTW3_THREADS
223  call fftw3_init_threads()
224 #endif
225 
226 #ifndef HAVE_OPENMP
227  ABI_CHECK(max_nthreads <= 1, "nthreads>1 but OMP support is not enabled!")
228 #endif
229  if (max_nthreads>1 .and. iam_master) call xomp_show_info(std_out)
230 
231  call fft_use_lib_threads(use_lib_threads)
232  !write(std_out,*)"use_lib_threads: ",use_lib_threads
233 
234  init_gpu_flavor = maxval(use_gpu_fftalgs)
235 #if defined HAVE_GPU_CUDA
236  if (init_gpu_flavor /= ABI_GPU_DISABLED) then
237    gpu_devices(:)=-1
238    call setdevice_cuda(gpu_devices, init_gpu_flavor)
239    ABI_CHECK(init_gpu_flavor /= ABI_GPU_DISABLED, "Cannot find any free GPU device!")
240    ! Cublas initialization
241    call gpu_linalg_init()
242  end if
243 #endif
244 
245  if (do_mpi_utests) then
246    ! Execute unit tests for MPI FFTs and terminate execution.
247    call wrtout(std_out, "=== MPI FFT Unit Tests ===")
248 
249    ! List the FFT libraries that will be tested.
250    ! Goedecker FFTs are always available, other libs are optional.
251    nfftalgs = COUNT(fftalgs /= 0)
252    ABI_CHECK(nfftalgs > 0, "fftalgs must be specified")
253 
254    nfailed = 0; nthreads = 0
255    do ii=1,nfftalgs
256      fftalg = fftalgs(ii); gpu_option = use_gpu_fftalgs(ii)
257      do paral_kgb=1,1
258        write(msg,"(5(a,i0))")&
259         "MPI fftu_utests with fftalg = ",fftalg,", paral_kgb = ",paral_kgb," ndat = ",ndat,", nthreads = ",nthreads
260        call wrtout(std_out, msg)
261        nfailed = nfailed + fftu_mpi_utests(fftalg,ecut,rprimd,ndat,nthreads,comm,paral_kgb)
262      end do
263    end do
264 
265    nfailed = 0; nthreads = 0
266    do ii=1,nfftalgs
267      fftalg = fftalgs(ii); gpu_option = use_gpu_fftalgs(ii)
268      do cplex=1,2
269        write(msg,"(4(a,i0))")&
270          "MPI fftbox_utests with fftalg = ",fftalg,", cplex = ",cplex," ndat = ",ndat,", nthreads = ",nthreads
271        call wrtout(std_out, msg)
272        nfailed = nfailed + fftbox_mpi_utests(fftalg=fftalg,cplex=cplex,ndat=ndat,nthreads=nthreads,comm_fft=comm)
273      end do
274    end do
275 
276    write(msg,'(a,i0)')"Total number of failed tests = ",nfailed
277    call wrtout(std_out, msg)
278    goto 100 ! Jump to xmpi_end
279  end if
280 
281  call initmpi_seq(MPI_enreg)
282  call metric(gmet,gprimd,std_out,rmet,rprimd,ucvol)
283 
284  ! symmetries (if given) are used for defining the FFT mesh.
285  if (nsym==1 .and. ALL(symrel==0)) then
286    symrel(:,:,1) = RESHAPE([1,0,0,0,1,0,0,0,1], [3,3])
287  end if
288 
289  ! Set the number of calls for test.
290  ncalls=ABS(ncalls); if (ncalls==0) ncalls=10
291  call fftprof_ncalls_per_test(ncalls)
292 
293  ! List the FFT libraries that will be tested.
294  ! Goedecker FFTs are always available, other libs are optional.
295  nfftalgs = count(fftalgs /= 0)
296  ABI_CHECK(nfftalgs > 0, "fftalgs must be specified")
297 
298  ntests = max_nthreads * nfftalgs
299 
300  ! First dimension contains [fftalg, fftcache, ndat, nthreads, available, gpu_option]
301  ABI_MALLOC(fft_setups, (6, ntests))
302 
303  ! Default Goedecker library.
304  idx=0
305  do alg=1,nfftalgs
306    fftalg = fftalgs(alg); gpu_option = use_gpu_fftalgs(alg)
307    fftalga = fftalg/100; fftalgc = mod(fftalg, 10)
308    avail = merge(1, 0, fftalg_isavailable(fftalg))
309    !fftcache is machine-dependent.
310    fftcache = get_cache_kb()
311    do ith=1,max_nthreads
312      idx = idx + 1
313      fft_setups(:,idx) = [fftalg, fftcache, ndat, ith, avail, gpu_option]
314    end do
315  end do
316 
317  ! Init Ftest objects.
318  ABI_MALLOC(Ftest,(ntests))
319  ABI_MALLOC(Ftprof,(ntests))
320 
321  do it=1,ntests
322    call Ftest(it)%init(fft_setups(:,it), kpoint, ecut, boxcutmin2, rprimd, nsym, symrel, MPI_enreg)
323     ! Ftest%results is allocated using nfftot and the consistency btw libs is tested assuming an equal number of FFT-points.
324    if ( ANY(Ftest(it)%ngfft(1:3) /= Ftest(1)%ngfft(1:3)) ) then
325      ABI_ERROR("Consistency check assumes equal FFT meshes. Cannot continue")
326    end if
327    if (it == 1) then
328      call Ftest(it)%print() !,header)
329    else if (fft_setups(1,it) /= fft_setups(1,it-1)) then
330      call Ftest(it)%print() !,header)
331    end if
332  end do
333  !
334  ! =======================
335  ! ==== fourdp timing ====
336  ! =======================
337  if (test_fourdp) then
338    do isign=-1,1,2
339      do cplex=1,2
340        do it=1,ntests
341          call Ftest(it)%time_fourdp(isign, cplex, header, Ftprof(it))
342        end do
343        call fftprofs_print(Ftprof, header)
344        call fftprofs_free(Ftprof)
345      end do
346    end do
347  end if
348  !
349  ! =======================
350  ! ==== fourwf timing ====
351  ! =======================
352  if (test_fourwf) then
353    ! possible combinations of (option, cplex) supported in fourwf.
354    ! (cplex=2 only allowed for option=2, and istwf_k=1)
355    nsets=4; if (Ftest(1)%istwf_k==1) nsets=5
356    ABI_MALLOC(fourwf_params,(2,nsets))
357    fourwf_params(:,1) = [0, 0]
358    fourwf_params(:,2) = [1, 1]
359    fourwf_params(:,3) = [2, 1]
360    fourwf_params(:,4) = [3, 0]
361    if (nsets==5) fourwf_params(:,5) = [2, 2]
362 
363    do iset=1,nsets
364      option_fourwf = fourwf_params(1,iset)
365      cplex         = fourwf_params(2,iset)
366      do it=1,ntests
367        call Ftest(it)%time_fourwf(cplex, option_fourwf, header, Ftprof(it))
368      end do
369      call fftprofs_print(Ftprof, header)
370      call fftprofs_free(Ftprof)
371    end do
372    ABI_FREE(fourwf_params)
373  end if
374  !
375  ! ==========================
376  ! ==== Test GW routines ====
377  ! ==========================
378  ! These routines are used in the GW part, FFTW3 is expected to
379  ! be more efficient as the conversion complex(:) <--> real(2,:) is not needed.
380  if (test_gw) then
381 
382    ! fourdp timing with complex arrays
383    do isign=-1,1,2
384      do inplace=0,1
385        do it=1,ntests
386          call Ftest(it)%time_fftbox(isign, inplace, header, Ftprof(it))
387        end do
388        call fftprofs_print(Ftprof, header)
389        call fftprofs_free(Ftprof)
390      end do
391    end do
392 
393    ! zero padded FFTs with complex arrays
394    do isign=-1,1,2
395      do it=1,ntests
396        call Ftest(it)%time_fftu(isign, header, Ftprof(it))
397      end do
398      call fftprofs_print(Ftprof, header)
399      call fftprofs_free(Ftprof)
400    end do
401 
402    ! rho_tw_g timing
403    ABI_CHECK(osc_ecut > zero, "osc_ecut <= zero!")
404 
405    call get_kg(gamma_point,1,osc_ecut,gmet,osc_npw,osc_gvec)
406    ! TODO should reorder by shells to be consistent with the GW part!
407    ! Moreover I guess this ordering is more efficient when we have
408    ! to map the box to the G-sphere!
409    map2sphere=1; !map2sphere=0
410 
411    do use_padfft=0,1
412      do it=1,ntests
413        call Ftest(it)%time_rhotwg(map2sphere, use_padfft, osc_npw, osc_gvec, header, Ftprof(it))
414      end do
415      call fftprofs_print(Ftprof, header)
416      call fftprofs_free(Ftprof)
417    end do
418 
419    ABI_FREE(osc_gvec)
420  end if ! test_gw
421 
422  if (do_seq_utests) then
423    call wrtout(std_out, "=== FFT Unit Tests ===")
424 
425    nfailed = 0
426    do idx=1,ntests
427      ! fft_setups(:,idx) = [fftalg,fftcache,ndat,ith,avail,gpu_option]
428      fftalg   = fft_setups(1, idx)
429      fftcache = fft_setups(2, idx)
430      ndat     = fft_setups(3, idx)
431      nthreads = fft_setups(4, idx)
432      ! Skip the test if library is not available.
433      if (fft_setups(5,idx) == 0) CYCLE
434      gpu_option = fft_setups(6, idx)
435 
436      write(msg,"(3(a,i0))")"fftbox_utests with fftalg = ",fftalg,", ndat = ",ndat,", nthreads = ",nthreads
437      call wrtout(std_out, msg)
438 
439      nfailed = nfailed + fftbox_utests(fftalg, ndat, nthreads, gpu_option)
440 
441      ! Initialize ngfft(7:8) here.
442      ut_ngfft = -1
443      ut_ngfft(7) = fftalg
444      ut_ngfft(8) = fftcache
445 
446      call getng(boxcutmin2,0,ecut,gmet,k0,me_fft0,ut_mgfft,ut_nfft,ut_ngfft,nproc_fft1,nsym,&
447        paral_kgb0,symrel,tnons,unit=dev_null)
448 
449      write(msg,"(3(a,i0))")"fftu_utests with fftalg = ",fftalg,", ndat = ",ndat,", nthreads = ",nthreads
450      call wrtout(std_out, msg)
451 
452      nfailed = nfailed + fftu_utests(ecut, ut_ngfft, rprimd, ndat, nthreads)
453    end do
454 
455    write(msg,'(a,i0)')"Total number of failed tests = ",nfailed
456    call wrtout(std_out, msg)
457  end if
458 
459  ! Benchmarks for the sequential version.
460  if (do_seq_bench) then
461    call wrtout(std_out, "Entering benchmark mode")
462    write(std_out,*)"ecut_arth",ecut_arth,", necut ",necut
463 
464    if (INDEX(tasks, "bench_fourdp") > 0) then
465      isign=1; cplex=1
466      call prof_fourdp(fft_setups, isign, cplex, necut, ecut_arth, boxcutmin2, rprimd, nsym, symrel, MPI_enreg)
467 
468      isign=1; cplex=2
469      call prof_fourdp(fft_setups, isign, cplex, necut, ecut_arth, boxcutmin2, rprimd, nsym, symrel, MPI_enreg)
470    end if
471 
472    if (INDEX(tasks, "bench_fourwf") > 0) then
473      cplex=2; option_fourwf=0
474      call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
475 
476      cplex=1; option_fourwf=1
477      call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
478 
479      cplex=1; option_fourwf=2
480      call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
481 
482      cplex=2; option_fourwf=2
483      call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
484 
485      cplex=0; option_fourwf=3
486      call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg)
487    end if
488 
489    if (INDEX(tasks, "bench_rhotwg") > 0) then
490      map2sphere = 1; use_padfft = 1
491      call prof_rhotwg(fft_setups,map2sphere,use_padfft,necut,ecut_arth,osc_ecut,boxcutmin2,&
492       rprimd,nsym,symrel,gmet,MPI_enreg)
493 
494      map2sphere = 1; use_padfft = 0
495      call prof_rhotwg(fft_setups,map2sphere,use_padfft,necut,ecut_arth,osc_ecut,boxcutmin2,&
496       rprimd,nsym,symrel,gmet,MPI_enreg)
497    end if
498  end if
499  !
500  !===============================
501  !=== End of run, free memory ===
502  !===============================
503  call wrtout(std_out,ch10//" Analysis completed.")
504 
505  ABI_FREE(fft_setups)
506  call fft_tests_free(Ftest)
507  ABI_FREE(Ftest)
508  call fftprofs_free(Ftprof)
509  ABI_FREE(Ftprof)
510  call destroy_mpi_enreg(MPI_enreg)
511 
512 #if defined HAVE_GPU_CUDA
513  if (init_gpu_flavor /= ABI_GPU_DISABLED) then
514    call gpu_linalg_shutdown()
515    call unsetdevice_cuda(init_gpu_flavor)
516  end if
517 #endif
518 
519  call flush_unit(std_out)
520  call abinit_doctor("__fftprof")
521  100 call xmpi_end()
522 
523  end program fftprof