TABLE OF CONTENTS
ABINIT/fftprof [ Programs ]
NAME
fftprof
FUNCTION
Utility for profiling the FFT libraries supported by ABINIT.
COPYRIGHT
Copyright (C) 2004-2024 ABINIT group (MG) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
OUTPUT
Timing analysis of the different FFT libraries and algorithms.
NOTES
Point-group symmetries are not taken into account in getng during the generation of the FFT mesh. Therefore the FFT mesh might differ from the one found by abinit for the same cutoff and Bravais lattice (actually it might be smaller). Description of the input file (Fortran NAMELIST): &CONTROL tasks = string specifying the tasks to perform i.e. the routines that should be tested or profiled. allowed values: fourdp --> Test FFT transforms of density and potentials on the full box. fourwf --> Test FFT transforms of wavefunctions using the zero-padded algorithm. gw_fft --> Test the FFT transforms used in the GW code. all --> Test all FFT routines (DEFAULT) fftalgs = list of fftalg values (used to select the FFT libraries to use, see abinit doc for more info) use_gpu_fftalgs = for each fftalg in fftalgs, it is set to 0 if the GPU version doesnt have to be used, or to the value selecting the GPU implementation (see defs_basis.F90) ncalls = integer defining the number of calls for each tests. The final Wall time and CPU time are computed by averaging the final results over ncalls executions. max_nthreads = Maximum number of threads (DEFAULT 1, meaningful only if the code uses threaded external libraries or OpenMP parallelization) ndat = integer specifying how many FFT transforms should be executed for each call to the FFT routine (same meaning as the ndat input variable passed to fourwf) necut = Used if tasks = "bench". Specifies the number of cutoff energies to profile (see also ecut_arth) ecut_arth = Used if tasks = "bench". Used to generate an arithmetic progression of cutoff energies whose starting value is ecut_arth(1) and whose step is ecut_arth(2) &SYSTEM ecut = cutoff energy for wavefunctions (real, Hartree units) rprimd = Direct lattice vectors in Bohr. (3,3) matrix in Fortran column-major order kpoint = real(3) vector specifying the reduced coordinates of the k-point of the wavefunction (used if tasks = "fourwf"). The value of the k-point defines the storage scheme (istwfk) of the u(G) coefficients and therefore the FFT algorithm used to perform the transform u(G) <--> u(R) in fourwf. osc_ecut = cutoff energy (Hartree) for the oscillator matrix elements computed in the GW code Corresponds to the input variables (ecuteps, ecutsigx) used in the main code. nsym =Number of symmetry operations (DEFAULT 1) symrel(3,3,nsym) = Symmetry operation in real space used to select the FFT mesh in the routine getng (default: Identity matrix) NOTE that the real tnons is not taken into account anyhow: tnons is set to zero.
SOURCE
59 #if defined HAVE_CONFIG_H 60 #include "config.h" 61 #endif 62 63 #include "abi_common.h" 64 65 program fftprof 66 67 use defs_basis 68 use m_xmpi 69 use m_xomp 70 use m_errors 71 use m_abicore 72 use m_dfti 73 #ifdef HAVE_GPU_CUDA 74 use m_gpu_toolbox 75 use m_manage_cuda 76 #endif 77 78 use defs_abitypes, only : MPI_type 79 use m_build_info, only : abinit_version 80 use m_fstrings, only : lower 81 use m_specialmsg, only : specialmsg_getcount, herald 82 use m_argparse, only : get_arg !, get_arg_list, get_start_step_num 83 use m_io_tools, only : flush_unit 84 use m_geometry, only : metric 85 use m_fftcore, only : get_cache_kb, get_kg, fftalg_isavailable, fftalg_has_mpi, getng, fftcore_set_mixprec 86 use m_fft, only : fft_use_lib_threads, fftbox_utests, fftu_utests, fftbox_mpi_utests, fftu_mpi_utests 87 use m_fftw3, only : fftw3_init_threads 88 use m_fft_prof, only : fft_test_t, fft_prof_t, fft_tests_free, fftprof_ncalls_per_test, fftprofs_free, & 89 & fftprofs_print, prof_fourdp, prof_fourwf, prof_rhotwg 90 use m_mpinfo, only : destroy_mpi_enreg, initmpi_seq 91 92 implicit none 93 94 !Arguments ----------------------------------- 95 !scalars 96 integer,parameter :: MAX_NFFTALGS = 50, MAX_NSYM = 48 97 integer,parameter :: paral_kgb0 = 0, me_fft0 = 0, nproc_fft1 = 1, master = 0 98 integer :: ii,fftcache,it,cplex,ntests,option_fourwf,osc_npw 99 integer :: map2sphere,use_padfft,isign,nthreads,comm,nprocs,my_rank 100 integer :: iset,iall,inplace,nsets,avail,ith,idx,ut_nfft,ut_mgfft 101 integer :: nfftalgs,alg,fftalg,fftalga,fftalgc,nfailed,ierr,paral_kgb,abimem_level 102 character(len=24) :: codename 103 character(len=500) :: header,msg 104 real(dp),parameter :: boxcutmin2 = two 105 real(dp) :: ucvol, abimem_limit_mb 106 logical :: test_fourdp,test_fourwf,test_gw,do_seq_bench,do_seq_utests 107 logical :: iam_master,do_mpi_utests !,do_mpi_bench, 108 type(MPI_type) :: MPI_enreg 109 !arrays 110 integer :: ut_ngfft(18) 111 #ifdef HAVE_GPU_CUDA 112 integer :: gpu_devices(12) 113 #endif 114 real(dp),parameter :: gamma_point(3) = zero 115 real(dp) :: gmet(3,3),gprimd(3,3),rmet(3,3) 116 type(FFT_test_t),allocatable :: Ftest(:) 117 type(FFT_prof_t),allocatable :: Ftprof(:) 118 integer,allocatable :: osc_gvec(:,:), fft_setups(:,:),fourwf_params(:,:) 119 ! ========== INPUT FILE ============== 120 integer :: ncalls = 10, max_nthreads = 1, ndat = 1, necut = 0, nsym = 1 121 integer :: mixprec = 0, gpu_option = 0, init_gpu_flavor 122 character(len=500) :: tasks="all" 123 integer :: fftalgs(MAX_NFFTALGS) = 0, use_gpu_fftalgs(MAX_NFFTALGS) = 0 124 integer :: symrel(3,3,MAX_NSYM) = 0 125 real(dp),parameter :: k0(3)=(/zero,zero,zero/) 126 real(dp) :: ecut = 30, osc_ecut = 3 127 real(dp) :: ecut_arth(2) = zero 128 real(dp) :: rprimd(3,3) 129 real(dp) :: kpoint(3) = (/0.1,0.2,0.3/) 130 real(dp) :: tnons(3,MAX_NSYM) = zero 131 logical :: use_lib_threads = .FALSE. 132 namelist /CONTROL/ tasks, ncalls, max_nthreads, ndat, fftalgs, use_gpu_fftalgs, & 133 necut, ecut_arth, use_lib_threads, mixprec 134 namelist /SYSTEM/ ecut, rprimd, kpoint, osc_ecut, nsym, symrel 135 136 ! ************************************************************************* 137 138 ! Change communicator for I/O (mandatory!) 139 call abi_io_redirect(new_io_comm=xmpi_world) 140 141 call xmpi_init() 142 comm = xmpi_world 143 my_rank = xmpi_comm_rank(comm); nprocs = xmpi_comm_size(comm) 144 iam_master = (my_rank == master) 145 146 ! Initialize memory profiling if it is activated 147 ! if a full abimem.mocc report is desired, set the argument of abimem_init to "2" instead of "0" 148 ! note that abimem.mocc files can easily be multiple GB in size so don't use this option normally 149 ABI_CHECK(get_arg("abimem-level", abimem_level, msg, default=0) == 0, msg) 150 ABI_CHECK(get_arg("abimem-limit-mb", abimem_limit_mb, msg, default=20.0_dp) == 0, msg) 151 #ifdef HAVE_MEM_PROFILING 152 call abimem_init(abimem_level, limit_mb=abimem_limit_mb) 153 #endif 154 155 if (iam_master) then 156 ! Print header and read input file. 157 codename='FFTPROF'//REPEAT(' ',17) 158 call herald(codename,abinit_version,std_out) 159 160 write(std_out,'(a)')" Tool for profiling and testing the FFT libraries used in ABINIT." 161 write(std_out,'(a)')" Allowed options are: " 162 write(std_out,'(a)')" fourdp --> Test FFT transforms of density and potentials on the full box." 163 write(std_out,'(a)')" fourwf --> Test FFT transforms of wavefunctions using the zero-pad algorithm." 164 write(std_out,'(a)')" gw_fft --> Test the FFT transforms used in the GW code." 165 write(std_out,'(a)')" all --> Test all FFT routines." 166 !write(std_out,'(a)')" seq-utests --> Units tests for sequential FFTs." 167 !write(std_out,'(a)')" mpi-utests --> Units tests for MPI FFTs." 168 !write(std_out,'(a)')" seq-bench --> Benchmark mode for sequential FFTs." 169 !write(std_out,'(a)')" mpi-bench --> Benchmark mode for MPI FFTs." 170 write(std_out,'(a)')" " 171 172 !Read input file 173 read(std_in, NML=CONTROL) 174 read(std_in, NML=SYSTEM) 175 !write(std_out, NML=CONTROL) 176 !write(std_out, NML=SYSTEM) 177 call lower(tasks) 178 end if 179 180 ! Broadcast variables. 181 if (nprocs > 1) then 182 ! CONTROL 183 call xmpi_bcast(tasks,master,comm,ierr) 184 call xmpi_bcast(ncalls,master,comm,ierr) 185 call xmpi_bcast(max_nthreads,master,comm,ierr) 186 call xmpi_bcast(ndat,master,comm,ierr) 187 call xmpi_bcast(fftalgs,master,comm,ierr) 188 call xmpi_bcast(use_gpu_fftalgs,master,comm,ierr) 189 call xmpi_bcast(necut,master,comm,ierr) 190 call xmpi_bcast(ecut_arth,master,comm,ierr) 191 call xmpi_bcast(use_lib_threads,master,comm,ierr) 192 call xmpi_bcast(mixprec,master,comm,ierr) 193 ! SYSTEM 194 call xmpi_bcast(ecut,master,comm,ierr) 195 call xmpi_bcast(rprimd,master,comm,ierr) 196 call xmpi_bcast(kpoint,master,comm,ierr) 197 call xmpi_bcast(osc_ecut,master,comm,ierr) 198 call xmpi_bcast(nsym,master,comm,ierr) 199 call xmpi_bcast(symrel,master,comm,ierr) 200 end if 201 202 ! Set precision for FFT libs. 203 ii = fftcore_set_mixprec(mixprec) 204 205 ! replace "+" with white spaces 206 do ii=1,LEN_TRIM(tasks) 207 if (tasks(ii:ii) == "+") tasks(ii:ii) = " " 208 end do 209 !%call str_replace_chars(tasks,"+"," ") 210 tasks = " "//TRIM(tasks)//"" 211 212 iall=INDEX (tasks,"all") 213 test_fourdp = (iall>0 .or. INDEX(tasks," fourdp")>0 ) 214 test_fourwf = (iall>0 .or. INDEX(tasks," fourwf")>0 ) 215 test_gw = (iall>0 .or. INDEX(tasks," gw_fft")>0 ) 216 217 do_seq_utests = INDEX(tasks," utests") >0 218 do_seq_bench = INDEX(tasks," bench") >0 219 do_mpi_utests = INDEX(tasks," mpi-utests")>0 220 !do_mpi_bench = INDEX(tasks," mpi-bench")>0 221 222 #ifdef HAVE_FFTW3_THREADS 223 call fftw3_init_threads() 224 #endif 225 226 #ifndef HAVE_OPENMP 227 ABI_CHECK(max_nthreads <= 1, "nthreads>1 but OMP support is not enabled!") 228 #endif 229 if (max_nthreads>1 .and. iam_master) call xomp_show_info(std_out) 230 231 call fft_use_lib_threads(use_lib_threads) 232 !write(std_out,*)"use_lib_threads: ",use_lib_threads 233 234 init_gpu_flavor = maxval(use_gpu_fftalgs) 235 #if defined HAVE_GPU_CUDA 236 if (init_gpu_flavor /= ABI_GPU_DISABLED) then 237 gpu_devices(:)=-1 238 call setdevice_cuda(gpu_devices, init_gpu_flavor) 239 ABI_CHECK(init_gpu_flavor /= ABI_GPU_DISABLED, "Cannot find any free GPU device!") 240 ! Cublas initialization 241 call gpu_linalg_init() 242 end if 243 #endif 244 245 if (do_mpi_utests) then 246 ! Execute unit tests for MPI FFTs and terminate execution. 247 call wrtout(std_out, "=== MPI FFT Unit Tests ===") 248 249 ! List the FFT libraries that will be tested. 250 ! Goedecker FFTs are always available, other libs are optional. 251 nfftalgs = COUNT(fftalgs /= 0) 252 ABI_CHECK(nfftalgs > 0, "fftalgs must be specified") 253 254 nfailed = 0; nthreads = 0 255 do ii=1,nfftalgs 256 fftalg = fftalgs(ii); gpu_option = use_gpu_fftalgs(ii) 257 do paral_kgb=1,1 258 write(msg,"(5(a,i0))")& 259 "MPI fftu_utests with fftalg = ",fftalg,", paral_kgb = ",paral_kgb," ndat = ",ndat,", nthreads = ",nthreads 260 call wrtout(std_out, msg) 261 nfailed = nfailed + fftu_mpi_utests(fftalg,ecut,rprimd,ndat,nthreads,comm,paral_kgb) 262 end do 263 end do 264 265 nfailed = 0; nthreads = 0 266 do ii=1,nfftalgs 267 fftalg = fftalgs(ii); gpu_option = use_gpu_fftalgs(ii) 268 do cplex=1,2 269 write(msg,"(4(a,i0))")& 270 "MPI fftbox_utests with fftalg = ",fftalg,", cplex = ",cplex," ndat = ",ndat,", nthreads = ",nthreads 271 call wrtout(std_out, msg) 272 nfailed = nfailed + fftbox_mpi_utests(fftalg=fftalg,cplex=cplex,ndat=ndat,nthreads=nthreads,comm_fft=comm) 273 end do 274 end do 275 276 write(msg,'(a,i0)')"Total number of failed tests = ",nfailed 277 call wrtout(std_out, msg) 278 goto 100 ! Jump to xmpi_end 279 end if 280 281 call initmpi_seq(MPI_enreg) 282 call metric(gmet,gprimd,std_out,rmet,rprimd,ucvol) 283 284 ! symmetries (if given) are used for defining the FFT mesh. 285 if (nsym==1 .and. ALL(symrel==0)) then 286 symrel(:,:,1) = RESHAPE([1,0,0,0,1,0,0,0,1], [3,3]) 287 end if 288 289 ! Set the number of calls for test. 290 ncalls=ABS(ncalls); if (ncalls==0) ncalls=10 291 call fftprof_ncalls_per_test(ncalls) 292 293 ! List the FFT libraries that will be tested. 294 ! Goedecker FFTs are always available, other libs are optional. 295 nfftalgs = count(fftalgs /= 0) 296 ABI_CHECK(nfftalgs > 0, "fftalgs must be specified") 297 298 ntests = max_nthreads * nfftalgs 299 300 ! First dimension contains [fftalg, fftcache, ndat, nthreads, available, gpu_option] 301 ABI_MALLOC(fft_setups, (6, ntests)) 302 303 ! Default Goedecker library. 304 idx=0 305 do alg=1,nfftalgs 306 fftalg = fftalgs(alg); gpu_option = use_gpu_fftalgs(alg) 307 fftalga = fftalg/100; fftalgc = mod(fftalg, 10) 308 avail = merge(1, 0, fftalg_isavailable(fftalg)) 309 !fftcache is machine-dependent. 310 fftcache = get_cache_kb() 311 do ith=1,max_nthreads 312 idx = idx + 1 313 fft_setups(:,idx) = [fftalg, fftcache, ndat, ith, avail, gpu_option] 314 end do 315 end do 316 317 ! Init Ftest objects. 318 ABI_MALLOC(Ftest,(ntests)) 319 ABI_MALLOC(Ftprof,(ntests)) 320 321 do it=1,ntests 322 call Ftest(it)%init(fft_setups(:,it), kpoint, ecut, boxcutmin2, rprimd, nsym, symrel, MPI_enreg) 323 ! Ftest%results is allocated using nfftot and the consistency btw libs is tested assuming an equal number of FFT-points. 324 if ( ANY(Ftest(it)%ngfft(1:3) /= Ftest(1)%ngfft(1:3)) ) then 325 ABI_ERROR("Consistency check assumes equal FFT meshes. Cannot continue") 326 end if 327 if (it == 1) then 328 call Ftest(it)%print() !,header) 329 else if (fft_setups(1,it) /= fft_setups(1,it-1)) then 330 call Ftest(it)%print() !,header) 331 end if 332 end do 333 ! 334 ! ======================= 335 ! ==== fourdp timing ==== 336 ! ======================= 337 if (test_fourdp) then 338 do isign=-1,1,2 339 do cplex=1,2 340 do it=1,ntests 341 call Ftest(it)%time_fourdp(isign, cplex, header, Ftprof(it)) 342 end do 343 call fftprofs_print(Ftprof, header) 344 call fftprofs_free(Ftprof) 345 end do 346 end do 347 end if 348 ! 349 ! ======================= 350 ! ==== fourwf timing ==== 351 ! ======================= 352 if (test_fourwf) then 353 ! possible combinations of (option, cplex) supported in fourwf. 354 ! (cplex=2 only allowed for option=2, and istwf_k=1) 355 nsets=4; if (Ftest(1)%istwf_k==1) nsets=5 356 ABI_MALLOC(fourwf_params,(2,nsets)) 357 fourwf_params(:,1) = [0, 0] 358 fourwf_params(:,2) = [1, 1] 359 fourwf_params(:,3) = [2, 1] 360 fourwf_params(:,4) = [3, 0] 361 if (nsets==5) fourwf_params(:,5) = [2, 2] 362 363 do iset=1,nsets 364 option_fourwf = fourwf_params(1,iset) 365 cplex = fourwf_params(2,iset) 366 do it=1,ntests 367 call Ftest(it)%time_fourwf(cplex, option_fourwf, header, Ftprof(it)) 368 end do 369 call fftprofs_print(Ftprof, header) 370 call fftprofs_free(Ftprof) 371 end do 372 ABI_FREE(fourwf_params) 373 end if 374 ! 375 ! ========================== 376 ! ==== Test GW routines ==== 377 ! ========================== 378 ! These routines are used in the GW part, FFTW3 is expected to 379 ! be more efficient as the conversion complex(:) <--> real(2,:) is not needed. 380 if (test_gw) then 381 382 ! fourdp timing with complex arrays 383 do isign=-1,1,2 384 do inplace=0,1 385 do it=1,ntests 386 call Ftest(it)%time_fftbox(isign, inplace, header, Ftprof(it)) 387 end do 388 call fftprofs_print(Ftprof, header) 389 call fftprofs_free(Ftprof) 390 end do 391 end do 392 393 ! zero padded FFTs with complex arrays 394 do isign=-1,1,2 395 do it=1,ntests 396 call Ftest(it)%time_fftu(isign, header, Ftprof(it)) 397 end do 398 call fftprofs_print(Ftprof, header) 399 call fftprofs_free(Ftprof) 400 end do 401 402 ! rho_tw_g timing 403 ABI_CHECK(osc_ecut > zero, "osc_ecut <= zero!") 404 405 call get_kg(gamma_point,1,osc_ecut,gmet,osc_npw,osc_gvec) 406 ! TODO should reorder by shells to be consistent with the GW part! 407 ! Moreover I guess this ordering is more efficient when we have 408 ! to map the box to the G-sphere! 409 map2sphere=1; !map2sphere=0 410 411 do use_padfft=0,1 412 do it=1,ntests 413 call Ftest(it)%time_rhotwg(map2sphere, use_padfft, osc_npw, osc_gvec, header, Ftprof(it)) 414 end do 415 call fftprofs_print(Ftprof, header) 416 call fftprofs_free(Ftprof) 417 end do 418 419 ABI_FREE(osc_gvec) 420 end if ! test_gw 421 422 if (do_seq_utests) then 423 call wrtout(std_out, "=== FFT Unit Tests ===") 424 425 nfailed = 0 426 do idx=1,ntests 427 ! fft_setups(:,idx) = [fftalg,fftcache,ndat,ith,avail,gpu_option] 428 fftalg = fft_setups(1, idx) 429 fftcache = fft_setups(2, idx) 430 ndat = fft_setups(3, idx) 431 nthreads = fft_setups(4, idx) 432 ! Skip the test if library is not available. 433 if (fft_setups(5,idx) == 0) CYCLE 434 gpu_option = fft_setups(6, idx) 435 436 write(msg,"(3(a,i0))")"fftbox_utests with fftalg = ",fftalg,", ndat = ",ndat,", nthreads = ",nthreads 437 call wrtout(std_out, msg) 438 439 nfailed = nfailed + fftbox_utests(fftalg, ndat, nthreads, gpu_option) 440 441 ! Initialize ngfft(7:8) here. 442 ut_ngfft = -1 443 ut_ngfft(7) = fftalg 444 ut_ngfft(8) = fftcache 445 446 call getng(boxcutmin2,0,ecut,gmet,k0,me_fft0,ut_mgfft,ut_nfft,ut_ngfft,nproc_fft1,nsym,& 447 paral_kgb0,symrel,tnons,unit=dev_null) 448 449 write(msg,"(3(a,i0))")"fftu_utests with fftalg = ",fftalg,", ndat = ",ndat,", nthreads = ",nthreads 450 call wrtout(std_out, msg) 451 452 nfailed = nfailed + fftu_utests(ecut, ut_ngfft, rprimd, ndat, nthreads) 453 end do 454 455 write(msg,'(a,i0)')"Total number of failed tests = ",nfailed 456 call wrtout(std_out, msg) 457 end if 458 459 ! Benchmarks for the sequential version. 460 if (do_seq_bench) then 461 call wrtout(std_out, "Entering benchmark mode") 462 write(std_out,*)"ecut_arth",ecut_arth,", necut ",necut 463 464 if (INDEX(tasks, "bench_fourdp") > 0) then 465 isign=1; cplex=1 466 call prof_fourdp(fft_setups, isign, cplex, necut, ecut_arth, boxcutmin2, rprimd, nsym, symrel, MPI_enreg) 467 468 isign=1; cplex=2 469 call prof_fourdp(fft_setups, isign, cplex, necut, ecut_arth, boxcutmin2, rprimd, nsym, symrel, MPI_enreg) 470 end if 471 472 if (INDEX(tasks, "bench_fourwf") > 0) then 473 cplex=2; option_fourwf=0 474 call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg) 475 476 cplex=1; option_fourwf=1 477 call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg) 478 479 cplex=1; option_fourwf=2 480 call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg) 481 482 cplex=2; option_fourwf=2 483 call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg) 484 485 cplex=0; option_fourwf=3 486 call prof_fourwf(fft_setups,cplex,option_fourwf,kpoint,necut,ecut_arth,boxcutmin2,rprimd,nsym,symrel,MPI_enreg) 487 end if 488 489 if (INDEX(tasks, "bench_rhotwg") > 0) then 490 map2sphere = 1; use_padfft = 1 491 call prof_rhotwg(fft_setups,map2sphere,use_padfft,necut,ecut_arth,osc_ecut,boxcutmin2,& 492 rprimd,nsym,symrel,gmet,MPI_enreg) 493 494 map2sphere = 1; use_padfft = 0 495 call prof_rhotwg(fft_setups,map2sphere,use_padfft,necut,ecut_arth,osc_ecut,boxcutmin2,& 496 rprimd,nsym,symrel,gmet,MPI_enreg) 497 end if 498 end if 499 ! 500 !=============================== 501 !=== End of run, free memory === 502 !=============================== 503 call wrtout(std_out,ch10//" Analysis completed.") 504 505 ABI_FREE(fft_setups) 506 call fft_tests_free(Ftest) 507 ABI_FREE(Ftest) 508 call fftprofs_free(Ftprof) 509 ABI_FREE(Ftprof) 510 call destroy_mpi_enreg(MPI_enreg) 511 512 #if defined HAVE_GPU_CUDA 513 if (init_gpu_flavor /= ABI_GPU_DISABLED) then 514 call gpu_linalg_shutdown() 515 call unsetdevice_cuda(init_gpu_flavor) 516 end if 517 #endif 518 519 call flush_unit(std_out) 520 call abinit_doctor("__fftprof") 521 100 call xmpi_end() 522 523 end program fftprof