./52_fft_mpi_noabirule/m

ABINIT/m_sg2002
m_sg2002/ctrig
m_sg2002/fftstp
m_sg2002/sg2002_accrho
m_sg2002/sg2002_applypot
m_sg2002/sg2002_applypot_many
m_sg2002/sg2002_back
m_sg2002/sg2002_forw
m_sg2002/sg2002_mpiback_wf
m_sg2002/sg2002_mpiforw_wf
m_sg2002/sg2002_mpifourdp

ABINIT/m_sg2002 [ Modules ]

NAME

  m_sg2002

FUNCTION

  Copyright (C) 2002-2007 Stefan Goedecker, CEA Grenoble
  Copyright (C) 2014-2024 ABINIT group (XG)
  This file is distributed under the terms of the
  GNU General Public License, see ~abinit/COPYING
  or http://www.gnu.org/copyleft/gpl.txt .

NOTES

SOURCE

18 #if defined HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include "abi_common.h"
23 
24 module m_sg2002
25 
26  use defs_basis
27  use defs_fftdata
28  use m_abicore
29  use m_errors
30  use m_xmpi
31 
32  use m_time,         only : timab
33  use m_fstrings,     only : itoa
34  use m_fftcore,      only : sphere_fft1, fill, scramble, switchreal, switch, mpiswitch,&
35 &                           unfill, unscramble, unswitchreal, unswitch, unmpiswitch,&
36 &                           fill_cent, switch_cent, switchreal_cent, mpiswitch_cent, multpot, addrho,&
37 &                           unfill_cent, unswitchreal_cent, unswitch_cent, unmpiswitch_cent, unscramble,&
38 &                           mpifft_fg2dbox, mpifft_dbox2fr, mpifft_fr2dbox, mpifft_dbox2fg
39 
40  implicit none
41 
42  private
43 
44  ! Public API:
45  !public :: sg2002_seqfourdp   ! seq-FFT of densities and potentials.
46  public :: sg2002_mpifourdp    ! MPI-FFT of densities and potentials.
47  !public :: mpi_fourwf
48  !public :: sg2002_seqfourwf   ! seq-FFT of wavefunctions.
49  !public :: sg2002_mpifourwf   ! MPI-FFT of wavefunctions.
50 
51 ! Low-level tools.
52 ! These procedure shouls be accessed via a wrapper that selected the library via fftalg
53  public :: sg2002_back           ! G --> R for densities and potentials
54  public :: sg2002_forw           ! R --> G for densities and potentials
55  public :: sg2002_mpiback_wf     ! G --> R for wavefunctions
56  public :: sg2002_mpiforw_wf     ! R --> G for wavefunctions
57  public :: sg2002_applypot       ! Compute <G|vloc|u> where u is given in reciprocal space.
58  public :: sg2002_applypot_many  ! Compute <G|vloc|u> where u is given in reciprocal space.
59  public :: sg2002_accrho         ! Compute rho = weigth_r*Re(u(r))**2 + weigth_i*Im(u(r))**2
60 
61 contains

m_sg2002/ctrig [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  ctrig

FUNCTION

INPUTS

OUTPUT

SOURCE

2513 subroutine ctrig(n,trig,after,before,now,isign,ic)
2514 
2515  implicit none
2516 
2517 !Arguments ------------------------------------
2518  integer,intent(in) :: n,isign
2519  integer,intent(inout) :: ic
2520  integer,intent(inout) :: after(mdata),before(mdata),now(mdata)
2521  real(dp),intent(inout) :: trig(2,n)
2522 
2523 !Local variables-------------------------------
2524 !scalars
2525  integer :: i,itt,j,nh
2526  real(dp) :: angle,trigc,trigs
2527 
2528 ! *************************************************************************
2529 
2530  do i=1,ndata
2531    if (n.eq.ifftdata(1,i)) then
2532      ic=0
2533      do j=1,(mdata-1)
2534        itt=ifftdata(1+j,i)
2535        if (itt.gt.1) then
2536          ic=ic+1
2537          now(j)=ifftdata(1+j,i)
2538        else
2539          goto 1000
2540        end if
2541      end do
2542      goto 1000
2543    end if
2544  end do
2545 
2546  write(std_out,*) 'VALUE OF',n,'NOT ALLOWED FOR FFT, ALLOWED VALUES ARE:'
2547 37 format(15(i5))
2548  write(std_out,37) (ifftdata(1,i),i=1,ndata)
2549  ABI_ERROR("Aborting now")
2550 
2551 1000 continue
2552  after(1)=1
2553  before(ic)=1
2554  do i=2,ic
2555    after(i)=after(i-1)*now(i-1)
2556    before(ic-i+1)=before(ic-i+2)*now(ic-i+2)
2557  end do
2558 
2559  angle=isign*two_pi/n
2560  if (mod(n,2).eq.0) then
2561    nh=n/2
2562    trig(1,1)=one
2563    trig(2,1)=zero
2564    trig(1,nh+1)=-one
2565    trig(2,nh+1)=zero
2566    do i=1,nh-1
2567      trigc=cos(i*angle)
2568      trigs=sin(i*angle)
2569      trig(1,i+1)=trigc
2570      trig(2,i+1)=trigs
2571      trig(1,n-i+1)=trigc
2572      trig(2,n-i+1)=-trigs
2573    end do
2574  else
2575    nh=(n-1)/2
2576    trig(1,1)=one
2577    trig(2,1)=zero
2578    do i=1,nh
2579      trigc=cos(i*angle)
2580      trigs=sin(i*angle)
2581      trig(1,i+1)=trigc
2582      trig(2,i+1)=trigs
2583      trig(1,n-i+1)=trigc
2584      trig(2,n-i+1)=-trigs
2585    end do
2586  end if
2587 
2588 end subroutine ctrig

m_sg2002/fftstp [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  fftstp

FUNCTION

INPUTS

   mm
   n1dfft
   m
   nn
   n
   zin
   trig
   after
   now
   before
   isign

OUTPUT

   zout

SOURCE

2615 subroutine fftstp(mm,n1dfft,m,nn,n,zin,zout,trig,after,now,before,isign)
2616 
2617  implicit none
2618 
2619 !Arguments ------------------------------------
2620  integer,intent(in) :: after,before,mm,n1dfft,m,nn,n,now,isign
2621  real(dp),intent(in) :: trig(2,n),zin(2,mm,m)
2622  real(dp),intent(inout) :: zout(2,nn,n)
2623 
2624 !Local variables-------------------------------
2625  integer :: atn,atb,ia,ias,ib,itrig,itt,j,nin1,nin2,nin3,nin4,nin5,nin6,nin7,nin8
2626  integer :: nout1,nout2,nout3,nout4,nout5,nout6,nout7,nout8
2627  real(dp) :: am,ap,bm,bp,ci3,ci4,ci5,ci6,ci7,ci8,cm,cos2,cos4,cp,cr2,cr3,cr4,cr5,cr6,cr7,cr8
2628  real(dp) :: dm,bb,ci2,dpp,r,r2,r25,r3,r34,r4,r5,r6,r7,r8,rt2i,s,r1,s1,s2,s3,s25,s34,s4,s5,s6,s7,s8
2629  real(dp) :: sin2,ui1,ui2,ui3,ur1,ur2,ur3,sin4,vi1,vi2,vi3,vr1,vr2,vr3
2630 
2631 ! *************************************************************************
2632         atn=after*now
2633         atb=after*before
2634 
2635 !         sqrt(.5d0)
2636         rt2i=half_sqrt2
2637         if (now.eq.2) then
2638         ia=1
2639         nin1=ia-after
2640         nout1=ia-atn
2641         do ib=1,before
2642           nin1=nin1+after
2643           nin2=nin1+atb
2644           nout1=nout1+atn
2645           nout2=nout1+after
2646             do j=1,n1dfft
2647             r1=zin(1,j,nin1)
2648             s1=zin(2,j,nin1)
2649             r2=zin(1,j,nin2)
2650             s2=zin(2,j,nin2)
2651             zout(1,j,nout1)= r2 + r1
2652             zout(2,j,nout1)= s2 + s1
2653             zout(1,j,nout2)= r1 - r2
2654             zout(2,j,nout2)= s1 - s2
2655           enddo
2656         enddo
2657         do 2000,ia=2,after
2658         ias=ia-1
2659         if (2*ias.eq.after) then
2660                 if (isign.eq.1) then
2661                         nin1=ia-after
2662                         nout1=ia-atn
2663                         do ib=1,before
2664                           nin1=nin1+after
2665                           nin2=nin1+atb
2666                           nout1=nout1+atn
2667                           nout2=nout1+after
2668                           do j=1,n1dfft
2669                             r1=zin(1,j,nin1)
2670                             s1=zin(2,j,nin1)
2671                             r2=zin(2,j,nin2)
2672                             s2=zin(1,j,nin2)
2673                             zout(1,j,nout1)= r1 - r2
2674                             zout(2,j,nout1)= s2 + s1
2675                             zout(1,j,nout2)= r2 + r1
2676                             zout(2,j,nout2)= s1 - s2
2677                           enddo
2678                         enddo
2679                 else
2680                         nin1=ia-after
2681                         nout1=ia-atn
2682                         do ib=1,before
2683                           nin1=nin1+after
2684                           nin2=nin1+atb
2685                           nout1=nout1+atn
2686                           nout2=nout1+after
2687                             do j=1,n1dfft
2688                             r1=zin(1,j,nin1)
2689                             s1=zin(2,j,nin1)
2690                             r2=zin(2,j,nin2)
2691                             s2=zin(1,j,nin2)
2692                             zout(1,j,nout1)= r2 + r1
2693                             zout(2,j,nout1)= s1 - s2
2694                             zout(1,j,nout2)= r1 - r2
2695                             zout(2,j,nout2)= s2 + s1
2696                           enddo
2697                         enddo
2698                 end if
2699         else if (4*ias.eq.after) then
2700                 if (isign.eq.1) then
2701                         nin1=ia-after
2702                         nout1=ia-atn
2703                         do ib=1,before
2704                         nin1=nin1+after
2705                         nin2=nin1+atb
2706                         nout1=nout1+atn
2707                         nout2=nout1+after
2708                         do j=1,n1dfft
2709                         r1=zin(1,j,nin1)
2710                         s1=zin(2,j,nin1)
2711                         r=zin(1,j,nin2)
2712                         s=zin(2,j,nin2)
2713                         r2=(r - s)*rt2i
2714                         s2=(r + s)*rt2i
2715                         zout(1,j,nout1)= r2 + r1
2716                         zout(2,j,nout1)= s2 + s1
2717                         zout(1,j,nout2)= r1 - r2
2718                         zout(2,j,nout2)= s1 - s2
2719                         enddo
2720                         enddo
2721                 else
2722                         nin1=ia-after
2723                         nout1=ia-atn
2724                         do ib=1,before
2725                         nin1=nin1+after
2726                         nin2=nin1+atb
2727                         nout1=nout1+atn
2728                         nout2=nout1+after
2729                         do j=1,n1dfft
2730                         r1=zin(1,j,nin1)
2731                         s1=zin(2,j,nin1)
2732                         r=zin(1,j,nin2)
2733                         s=zin(2,j,nin2)
2734                         r2=(r + s)*rt2i
2735                         s2=(s - r)*rt2i
2736                         zout(1,j,nout1)= r2 + r1
2737                         zout(2,j,nout1)= s2 + s1
2738                         zout(1,j,nout2)= r1 - r2
2739                         zout(2,j,nout2)= s1 - s2
2740                         enddo
2741                         enddo
2742                 end if
2743         else if (4*ias.eq.3*after) then
2744                 if (isign.eq.1) then
2745                         nin1=ia-after
2746                         nout1=ia-atn
2747                         do ib=1,before
2748                         nin1=nin1+after
2749                         nin2=nin1+atb
2750                         nout1=nout1+atn
2751                         nout2=nout1+after
2752                         do j=1,n1dfft
2753                         r1=zin(1,j,nin1)
2754                         s1=zin(2,j,nin1)
2755                         r=zin(1,j,nin2)
2756                         s=zin(2,j,nin2)
2757                         r2=(r + s)*rt2i
2758                         s2=(r - s)*rt2i
2759                         zout(1,j,nout1)= r1 - r2
2760                         zout(2,j,nout1)= s2 + s1
2761                         zout(1,j,nout2)= r2 + r1
2762                         zout(2,j,nout2)= s1 - s2
2763                         enddo
2764                         enddo
2765                 else
2766                         nin1=ia-after
2767                         nout1=ia-atn
2768                         do ib=1,before
2769                         nin1=nin1+after
2770                         nin2=nin1+atb
2771                         nout1=nout1+atn
2772                         nout2=nout1+after
2773                         do j=1,n1dfft
2774                         r1=zin(1,j,nin1)
2775                         s1=zin(2,j,nin1)
2776                         r=zin(1,j,nin2)
2777                         s=zin(2,j,nin2)
2778                         r2=(s - r)*rt2i
2779                         s2=(r + s)*rt2i
2780                         zout(1,j,nout1)= r2 + r1
2781                         zout(2,j,nout1)= s1 - s2
2782                         zout(1,j,nout2)= r1 - r2
2783                         zout(2,j,nout2)= s2 + s1
2784                         enddo
2785                         enddo
2786                 end if
2787         else
2788                 itrig=ias*before+1
2789                 cr2=trig(1,itrig)
2790                 ci2=trig(2,itrig)
2791                 nin1=ia-after
2792                 nout1=ia-atn
2793                 do ib=1,before
2794                 nin1=nin1+after
2795                 nin2=nin1+atb
2796                 nout1=nout1+atn
2797                 nout2=nout1+after
2798                 do j=1,n1dfft
2799                 r1=zin(1,j,nin1)
2800                 s1=zin(2,j,nin1)
2801                 r=zin(1,j,nin2)
2802                 s=zin(2,j,nin2)
2803                 r2=r*cr2 - s*ci2
2804                 s2=r*ci2 + s*cr2
2805                 zout(1,j,nout1)= r2 + r1
2806                 zout(2,j,nout1)= s2 + s1
2807                 zout(1,j,nout2)= r1 - r2
2808                 zout(2,j,nout2)= s1 - s2
2809                 enddo
2810                 enddo
2811         end if
2812 2000        continue
2813         else if (now.eq.4) then
2814         if (isign.eq.1) then
2815                 ia=1
2816                 nin1=ia-after
2817                 nout1=ia-atn
2818                 do ib=1,before
2819                 nin1=nin1+after
2820                 nin2=nin1+atb
2821                 nin3=nin2+atb
2822                 nin4=nin3+atb
2823                 nout1=nout1+atn
2824                 nout2=nout1+after
2825                 nout3=nout2+after
2826                 nout4=nout3+after
2827                 do j=1,n1dfft
2828                 r1=zin(1,j,nin1)
2829                 s1=zin(2,j,nin1)
2830                 r2=zin(1,j,nin2)
2831                 s2=zin(2,j,nin2)
2832                 r3=zin(1,j,nin3)
2833                 s3=zin(2,j,nin3)
2834                 r4=zin(1,j,nin4)
2835                 s4=zin(2,j,nin4)
2836                 r=r1 + r3
2837                 s=r2 + r4
2838                 zout(1,j,nout1) = r + s
2839                 zout(1,j,nout3) = r - s
2840                 r=r1 - r3
2841                 s=s2 - s4
2842                 zout(1,j,nout2) = r - s
2843                 zout(1,j,nout4) = r + s
2844                 r=s1 + s3
2845                 s=s2 + s4
2846                 zout(2,j,nout1) = r + s
2847                 zout(2,j,nout3) = r - s
2848                 r=s1 - s3
2849                 s=r2 - r4
2850                 zout(2,j,nout2) = r + s
2851                 zout(2,j,nout4) = r - s
2852                 enddo
2853                 enddo
2854                 do 4000,ia=2,after
2855                 ias=ia-1
2856                 if (2*ias.eq.after) then
2857                         nin1=ia-after
2858                         nout1=ia-atn
2859                         do ib=1,before
2860                         nin1=nin1+after
2861                         nin2=nin1+atb
2862                         nin3=nin2+atb
2863                         nin4=nin3+atb
2864                         nout1=nout1+atn
2865                         nout2=nout1+after
2866                         nout3=nout2+after
2867                         nout4=nout3+after
2868                         do j=1,n1dfft
2869                         r1=zin(1,j,nin1)
2870                         s1=zin(2,j,nin1)
2871                         r=zin(1,j,nin2)
2872                         s=zin(2,j,nin2)
2873                         r2=(r-s)*rt2i
2874                         s2=(r+s)*rt2i
2875                         r3=zin(2,j,nin3)
2876                         s3=zin(1,j,nin3)
2877                         r=zin(1,j,nin4)
2878                         s=zin(2,j,nin4)
2879                         r4=(r + s)*rt2i
2880                         s4=(r - s)*rt2i
2881                         r=r1 - r3
2882                         s=r2 - r4
2883                         zout(1,j,nout1) = r + s
2884                         zout(1,j,nout3) = r - s
2885                         r=r1 + r3
2886                         s=s2 - s4
2887                         zout(1,j,nout2) = r - s
2888                         zout(1,j,nout4) = r + s
2889                         r=s1 + s3
2890                         s=s2 + s4
2891                         zout(2,j,nout1) = r + s
2892                         zout(2,j,nout3) = r - s
2893                         r=s1 - s3
2894                         s=r2 + r4
2895                         zout(2,j,nout2) = r + s
2896                         zout(2,j,nout4) = r - s
2897                         enddo
2898                         enddo
2899                 else
2900                         itt=ias*before
2901                         itrig=itt+1
2902                         cr2=trig(1,itrig)
2903                         ci2=trig(2,itrig)
2904                         itrig=itrig+itt
2905                         cr3=trig(1,itrig)
2906                         ci3=trig(2,itrig)
2907                         itrig=itrig+itt
2908                         cr4=trig(1,itrig)
2909                         ci4=trig(2,itrig)
2910                         nin1=ia-after
2911                         nout1=ia-atn
2912                         do ib=1,before
2913                         nin1=nin1+after
2914                         nin2=nin1+atb
2915                         nin3=nin2+atb
2916                         nin4=nin3+atb
2917                         nout1=nout1+atn
2918                         nout2=nout1+after
2919                         nout3=nout2+after
2920                         nout4=nout3+after
2921                         do j=1,n1dfft
2922                         r1=zin(1,j,nin1)
2923                         s1=zin(2,j,nin1)
2924                         r=zin(1,j,nin2)
2925                         s=zin(2,j,nin2)
2926                         r2=r*cr2 - s*ci2
2927                         s2=r*ci2 + s*cr2
2928                         r=zin(1,j,nin3)
2929                         s=zin(2,j,nin3)
2930                         r3=r*cr3 - s*ci3
2931                         s3=r*ci3 + s*cr3
2932                         r=zin(1,j,nin4)
2933                         s=zin(2,j,nin4)
2934                         r4=r*cr4 - s*ci4
2935                         s4=r*ci4 + s*cr4
2936                         r=r1 + r3
2937                         s=r2 + r4
2938                         zout(1,j,nout1) = r + s
2939                         zout(1,j,nout3) = r - s
2940                         r=r1 - r3
2941                         s=s2 - s4
2942                         zout(1,j,nout2) = r - s
2943                         zout(1,j,nout4) = r + s
2944                         r=s1 + s3
2945                         s=s2 + s4
2946                         zout(2,j,nout1) = r + s
2947                         zout(2,j,nout3) = r - s
2948                         r=s1 - s3
2949                         s=r2 - r4
2950                         zout(2,j,nout2) = r + s
2951                         zout(2,j,nout4) = r - s
2952                         enddo
2953                         enddo
2954                 end if
2955 4000                continue
2956         else
2957                 ia=1
2958                 nin1=ia-after
2959                 nout1=ia-atn
2960                 do ib=1,before
2961                 nin1=nin1+after
2962                 nin2=nin1+atb
2963                 nin3=nin2+atb
2964                 nin4=nin3+atb
2965                 nout1=nout1+atn
2966                 nout2=nout1+after
2967                 nout3=nout2+after
2968                 nout4=nout3+after
2969                 do j=1,n1dfft
2970                 r1=zin(1,j,nin1)
2971                 s1=zin(2,j,nin1)
2972                 r2=zin(1,j,nin2)
2973                 s2=zin(2,j,nin2)
2974                 r3=zin(1,j,nin3)
2975                 s3=zin(2,j,nin3)
2976                 r4=zin(1,j,nin4)
2977                 s4=zin(2,j,nin4)
2978                 r=r1 + r3
2979                 s=r2 + r4
2980                 zout(1,j,nout1) = r + s
2981                 zout(1,j,nout3) = r - s
2982                 r=r1 - r3
2983                 s=s2 - s4
2984                 zout(1,j,nout2) = r + s
2985                 zout(1,j,nout4) = r - s
2986                 r=s1 + s3
2987                 s=s2 + s4
2988                 zout(2,j,nout1) = r + s
2989                 zout(2,j,nout3) = r - s
2990                 r=s1 - s3
2991                 s=r2 - r4
2992                 zout(2,j,nout2) = r - s
2993                 zout(2,j,nout4) = r + s
2994                 enddo
2995                 enddo
2996                 do 4100,ia=2,after
2997                 ias=ia-1
2998                 if (2*ias.eq.after) then
2999                         nin1=ia-after
3000                         nout1=ia-atn
3001                         do ib=1,before
3002                         nin1=nin1+after
3003                         nin2=nin1+atb
3004                         nin3=nin2+atb
3005                         nin4=nin3+atb
3006                         nout1=nout1+atn
3007                         nout2=nout1+after
3008                         nout3=nout2+after
3009                         nout4=nout3+after
3010                         do j=1,n1dfft
3011                         r1=zin(1,j,nin1)
3012                         s1=zin(2,j,nin1)
3013                         r=zin(1,j,nin2)
3014                         s=zin(2,j,nin2)
3015                         r2=(r + s)*rt2i
3016                         s2=(s - r)*rt2i
3017                         r3=zin(2,j,nin3)
3018                         s3=zin(1,j,nin3)
3019                         r=zin(1,j,nin4)
3020                         s=zin(2,j,nin4)
3021                         r4=(s - r)*rt2i
3022                         s4=(r + s)*rt2i
3023                         r=r1 + r3
3024                         s=r2 + r4
3025                         zout(1,j,nout1) = r + s
3026                         zout(1,j,nout3) = r - s
3027                         r=r1 - r3
3028                         s=s2 + s4
3029                         zout(1,j,nout2) = r + s
3030                         zout(1,j,nout4) = r - s
3031                         r=s1 - s3
3032                         s=s2 - s4
3033                         zout(2,j,nout1) = r + s
3034                         zout(2,j,nout3) = r - s
3035                         r=s1 + s3
3036                         s=r2 - r4
3037                         zout(2,j,nout2) = r - s
3038                         zout(2,j,nout4) = r + s
3039                         enddo
3040                         enddo
3041                 else
3042                         itt=ias*before
3043                         itrig=itt+1
3044                         cr2=trig(1,itrig)
3045                         ci2=trig(2,itrig)
3046                         itrig=itrig+itt
3047                         cr3=trig(1,itrig)
3048                         ci3=trig(2,itrig)
3049                         itrig=itrig+itt
3050                         cr4=trig(1,itrig)
3051                         ci4=trig(2,itrig)
3052                         nin1=ia-after
3053                         nout1=ia-atn
3054                         do ib=1,before
3055                         nin1=nin1+after
3056                         nin2=nin1+atb
3057                         nin3=nin2+atb
3058                         nin4=nin3+atb
3059                         nout1=nout1+atn
3060                         nout2=nout1+after
3061                         nout3=nout2+after
3062                         nout4=nout3+after
3063                         do j=1,n1dfft
3064                         r1=zin(1,j,nin1)
3065                         s1=zin(2,j,nin1)
3066                         r=zin(1,j,nin2)
3067                         s=zin(2,j,nin2)
3068                         r2=r*cr2 - s*ci2
3069                         s2=r*ci2 + s*cr2
3070                         r=zin(1,j,nin3)
3071                         s=zin(2,j,nin3)
3072                         r3=r*cr3 - s*ci3
3073                         s3=r*ci3 + s*cr3
3074                         r=zin(1,j,nin4)
3075                         s=zin(2,j,nin4)
3076                         r4=r*cr4 - s*ci4
3077                         s4=r*ci4 + s*cr4
3078                         r=r1 + r3
3079                         s=r2 + r4
3080                         zout(1,j,nout1) = r + s
3081                         zout(1,j,nout3) = r - s
3082                         r=r1 - r3
3083                         s=s2 - s4
3084                         zout(1,j,nout2) = r + s
3085                         zout(1,j,nout4) = r - s
3086                         r=s1 + s3
3087                         s=s2 + s4
3088                         zout(2,j,nout1) = r + s
3089                         zout(2,j,nout3) = r - s
3090                         r=s1 - s3
3091                         s=r2 - r4
3092                         zout(2,j,nout2) = r - s
3093                         zout(2,j,nout4) = r + s
3094                         enddo
3095                         enddo
3096                 end if
3097 4100                continue
3098         end if
3099         else if (now.eq.8) then
3100         if (isign.eq.-1) then
3101                 ia=1
3102                         nin1=ia-after
3103                         nout1=ia-atn
3104                         do ib=1,before
3105                         nin1=nin1+after
3106                         nin2=nin1+atb
3107                         nin3=nin2+atb
3108                         nin4=nin3+atb
3109                         nin5=nin4+atb
3110                         nin6=nin5+atb
3111                         nin7=nin6+atb
3112                         nin8=nin7+atb
3113                         nout1=nout1+atn
3114                         nout2=nout1+after
3115                         nout3=nout2+after
3116                         nout4=nout3+after
3117                         nout5=nout4+after
3118                         nout6=nout5+after
3119                         nout7=nout6+after
3120                         nout8=nout7+after
3121                         do j=1,n1dfft
3122                         r1=zin(1,j,nin1)
3123                         s1=zin(2,j,nin1)
3124                         r2=zin(1,j,nin2)
3125                         s2=zin(2,j,nin2)
3126                         r3=zin(1,j,nin3)
3127                         s3=zin(2,j,nin3)
3128                         r4=zin(1,j,nin4)
3129                         s4=zin(2,j,nin4)
3130                         r5=zin(1,j,nin5)
3131                         s5=zin(2,j,nin5)
3132                         r6=zin(1,j,nin6)
3133                         s6=zin(2,j,nin6)
3134                         r7=zin(1,j,nin7)
3135                         s7=zin(2,j,nin7)
3136                         r8=zin(1,j,nin8)
3137                         s8=zin(2,j,nin8)
3138                         r=r1 + r5
3139                         s=r3 + r7
3140                         ap=r + s
3141                         am=r - s
3142                         r=r2 + r6
3143                         s=r4 + r8
3144                         bp=r + s
3145                         bm=r - s
3146                         r=s1 + s5
3147                         s=s3 + s7
3148                         cp=r + s
3149                         cm=r - s
3150                         r=s2 + s6
3151                         s=s4 + s8
3152                         dpp=r + s
3153                         dm=r - s
3154                         zout(1,j,nout1) = ap + bp
3155                         zout(2,j,nout1) = cp + dpp
3156                         zout(1,j,nout5) = ap - bp
3157                         zout(2,j,nout5) = cp - dpp
3158                         zout(1,j,nout3) = am + dm
3159                         zout(2,j,nout3) = cm - bm
3160                         zout(1,j,nout7) = am - dm
3161                         zout(2,j,nout7) = cm + bm
3162                         r=r1 - r5
3163                         s=s3 - s7
3164                         ap=r + s
3165                         am=r - s
3166                         r=s1 - s5
3167                         s=r3 - r7
3168                         bp=r + s
3169                         bm=r - s
3170                         r=s4 - s8
3171                         s=r2 - r6
3172                         cp=r + s
3173                         cm=r - s
3174                         r=s2 - s6
3175                         s=r4 - r8
3176                         dpp=r + s
3177                         dm=r - s
3178                         r = ( cp + dm)*rt2i
3179                         s = ( dm - cp)*rt2i
3180                         cp= ( cm + dpp)*rt2i
3181                         dpp = ( cm - dpp)*rt2i
3182                         zout(1,j,nout2) = ap + r
3183                         zout(2,j,nout2) = bm + s
3184                         zout(1,j,nout6) = ap - r
3185                         zout(2,j,nout6) = bm - s
3186                         zout(1,j,nout4) = am + cp
3187                         zout(2,j,nout4) = bp + dpp
3188                         zout(1,j,nout8) = am - cp
3189                         zout(2,j,nout8) = bp - dpp
3190                         enddo
3191                         enddo
3192                 do 8000,ia=2,after
3193                 ias=ia-1
3194                         itt=ias*before
3195                         itrig=itt+1
3196                         cr2=trig(1,itrig)
3197                         ci2=trig(2,itrig)
3198                         itrig=itrig+itt
3199                         cr3=trig(1,itrig)
3200                         ci3=trig(2,itrig)
3201                         itrig=itrig+itt
3202                         cr4=trig(1,itrig)
3203                         ci4=trig(2,itrig)
3204                         itrig=itrig+itt
3205                         cr5=trig(1,itrig)
3206                         ci5=trig(2,itrig)
3207                         itrig=itrig+itt
3208                         cr6=trig(1,itrig)
3209                         ci6=trig(2,itrig)
3210                         itrig=itrig+itt
3211                         cr7=trig(1,itrig)
3212                         ci7=trig(2,itrig)
3213                         itrig=itrig+itt
3214                         cr8=trig(1,itrig)
3215                         ci8=trig(2,itrig)
3216                         nin1=ia-after
3217                         nout1=ia-atn
3218                         do ib=1,before
3219                         nin1=nin1+after
3220                         nin2=nin1+atb
3221                         nin3=nin2+atb
3222                         nin4=nin3+atb
3223                         nin5=nin4+atb
3224                         nin6=nin5+atb
3225                         nin7=nin6+atb
3226                         nin8=nin7+atb
3227                         nout1=nout1+atn
3228                         nout2=nout1+after
3229                         nout3=nout2+after
3230                         nout4=nout3+after
3231                         nout5=nout4+after
3232                         nout6=nout5+after
3233                         nout7=nout6+after
3234                         nout8=nout7+after
3235                         do j=1,n1dfft
3236                         r1=zin(1,j,nin1)
3237                         s1=zin(2,j,nin1)
3238                         r=zin(1,j,nin2)
3239                         s=zin(2,j,nin2)
3240                         r2=r*cr2 - s*ci2
3241                         s2=r*ci2 + s*cr2
3242                         r=zin(1,j,nin3)
3243                         s=zin(2,j,nin3)
3244                         r3=r*cr3 - s*ci3
3245                         s3=r*ci3 + s*cr3
3246                         r=zin(1,j,nin4)
3247                         s=zin(2,j,nin4)
3248                         r4=r*cr4 - s*ci4
3249                         s4=r*ci4 + s*cr4
3250                         r=zin(1,j,nin5)
3251                         s=zin(2,j,nin5)
3252                         r5=r*cr5 - s*ci5
3253                         s5=r*ci5 + s*cr5
3254                         r=zin(1,j,nin6)
3255                         s=zin(2,j,nin6)
3256                         r6=r*cr6 - s*ci6
3257                         s6=r*ci6 + s*cr6
3258                         r=zin(1,j,nin7)
3259                         s=zin(2,j,nin7)
3260                         r7=r*cr7 - s*ci7
3261                         s7=r*ci7 + s*cr7
3262                         r=zin(1,j,nin8)
3263                         s=zin(2,j,nin8)
3264                         r8=r*cr8 - s*ci8
3265                         s8=r*ci8 + s*cr8
3266                         r=r1 + r5
3267                         s=r3 + r7
3268                         ap=r + s
3269                         am=r - s
3270                         r=r2 + r6
3271                         s=r4 + r8
3272                         bp=r + s
3273                         bm=r - s
3274                         r=s1 + s5
3275                         s=s3 + s7
3276                         cp=r + s
3277                         cm=r - s
3278                         r=s2 + s6
3279                         s=s4 + s8
3280                         dpp=r + s
3281                         dm=r - s
3282                         zout(1,j,nout1) = ap + bp
3283                         zout(2,j,nout1) = cp + dpp
3284                         zout(1,j,nout5) = ap - bp
3285                         zout(2,j,nout5) = cp - dpp
3286                         zout(1,j,nout3) = am + dm
3287                         zout(2,j,nout3) = cm - bm
3288                         zout(1,j,nout7) = am - dm
3289                         zout(2,j,nout7) = cm + bm
3290                         r=r1 - r5
3291                         s=s3 - s7
3292                         ap=r + s
3293                         am=r - s
3294                         r=s1 - s5
3295                         s=r3 - r7
3296                         bp=r + s
3297                         bm=r - s
3298                         r=s4 - s8
3299                         s=r2 - r6
3300                         cp=r + s
3301                         cm=r - s
3302                         r=s2 - s6
3303                         s=r4 - r8
3304                         dpp=r + s
3305                         dm=r - s
3306                         r = ( cp + dm)*rt2i
3307                         s = ( dm - cp)*rt2i
3308                         cp= ( cm + dpp)*rt2i
3309                         dpp = ( cm - dpp)*rt2i
3310                         zout(1,j,nout2) = ap + r
3311                         zout(2,j,nout2) = bm + s
3312                         zout(1,j,nout6) = ap - r
3313                         zout(2,j,nout6) = bm - s
3314                         zout(1,j,nout4) = am + cp
3315                         zout(2,j,nout4) = bp + dpp
3316                         zout(1,j,nout8) = am - cp
3317                         zout(2,j,nout8) = bp - dpp
3318                         enddo
3319                         enddo
3320 8000                continue
3321 
3322         else
3323                 ia=1
3324                         nin1=ia-after
3325                         nout1=ia-atn
3326                         do ib=1,before
3327                         nin1=nin1+after
3328                         nin2=nin1+atb
3329                         nin3=nin2+atb
3330                         nin4=nin3+atb
3331                         nin5=nin4+atb
3332                         nin6=nin5+atb
3333                         nin7=nin6+atb
3334                         nin8=nin7+atb
3335                         nout1=nout1+atn
3336                         nout2=nout1+after
3337                         nout3=nout2+after
3338                         nout4=nout3+after
3339                         nout5=nout4+after
3340                         nout6=nout5+after
3341                         nout7=nout6+after
3342                         nout8=nout7+after
3343                         do j=1,n1dfft
3344                         r1=zin(1,j,nin1)
3345                         s1=zin(2,j,nin1)
3346                         r2=zin(1,j,nin2)
3347                         s2=zin(2,j,nin2)
3348                         r3=zin(1,j,nin3)
3349                         s3=zin(2,j,nin3)
3350                         r4=zin(1,j,nin4)
3351                         s4=zin(2,j,nin4)
3352                         r5=zin(1,j,nin5)
3353                         s5=zin(2,j,nin5)
3354                         r6=zin(1,j,nin6)
3355                         s6=zin(2,j,nin6)
3356                         r7=zin(1,j,nin7)
3357                         s7=zin(2,j,nin7)
3358                         r8=zin(1,j,nin8)
3359                         s8=zin(2,j,nin8)
3360                         r=r1 + r5
3361                         s=r3 + r7
3362                         ap=r + s
3363                         am=r - s
3364                         r=r2 + r6
3365                         s=r4 + r8
3366                         bp=r + s
3367                         bm=r - s
3368                         r=s1 + s5
3369                         s=s3 + s7
3370                         cp=r + s
3371                         cm=r - s
3372                         r=s2 + s6
3373                         s=s4 + s8
3374                         dpp=r + s
3375                         dm=r - s
3376                         zout(1,j,nout1) = ap + bp
3377                         zout(2,j,nout1) = cp + dpp
3378                         zout(1,j,nout5) = ap - bp
3379                         zout(2,j,nout5) = cp - dpp
3380                         zout(1,j,nout3) = am - dm
3381                         zout(2,j,nout3) = cm + bm
3382                         zout(1,j,nout7) = am + dm
3383                         zout(2,j,nout7) = cm - bm
3384                         r= r1 - r5
3385                         s=-s3 + s7
3386                         ap=r + s
3387                         am=r - s
3388                         r=s1 - s5
3389                         s=r7 - r3
3390                         bp=r + s
3391                         bm=r - s
3392                         r=-s4 + s8
3393                         s= r2 - r6
3394                         cp=r + s
3395                         cm=r - s
3396                         r=-s2 + s6
3397                         s= r4 - r8
3398                         dpp=r + s
3399                         dm=r - s
3400                         r = ( cp + dm)*rt2i
3401                         s = ( cp - dm)*rt2i
3402                         cp= ( cm + dpp)*rt2i
3403                         dpp= ( dpp - cm)*rt2i
3404                         zout(1,j,nout2) = ap + r
3405                         zout(2,j,nout2) = bm + s
3406                         zout(1,j,nout6) = ap - r
3407                         zout(2,j,nout6) = bm - s
3408                         zout(1,j,nout4) = am + cp
3409                         zout(2,j,nout4) = bp + dpp
3410                         zout(1,j,nout8) = am - cp
3411                         zout(2,j,nout8) = bp - dpp
3412                         enddo
3413                         enddo
3414 
3415                 do 8001,ia=2,after
3416                 ias=ia-1
3417                         itt=ias*before
3418                         itrig=itt+1
3419                         cr2=trig(1,itrig)
3420                         ci2=trig(2,itrig)
3421                         itrig=itrig+itt
3422                         cr3=trig(1,itrig)
3423                         ci3=trig(2,itrig)
3424                         itrig=itrig+itt
3425                         cr4=trig(1,itrig)
3426                         ci4=trig(2,itrig)
3427                         itrig=itrig+itt
3428                         cr5=trig(1,itrig)
3429                         ci5=trig(2,itrig)
3430                         itrig=itrig+itt
3431                         cr6=trig(1,itrig)
3432                         ci6=trig(2,itrig)
3433                         itrig=itrig+itt
3434                         cr7=trig(1,itrig)
3435                         ci7=trig(2,itrig)
3436                         itrig=itrig+itt
3437                         cr8=trig(1,itrig)
3438                         ci8=trig(2,itrig)
3439                         nin1=ia-after
3440                         nout1=ia-atn
3441                         do ib=1,before
3442                         nin1=nin1+after
3443                         nin2=nin1+atb
3444                         nin3=nin2+atb
3445                         nin4=nin3+atb
3446                         nin5=nin4+atb
3447                         nin6=nin5+atb
3448                         nin7=nin6+atb
3449                         nin8=nin7+atb
3450                         nout1=nout1+atn
3451                         nout2=nout1+after
3452                         nout3=nout2+after
3453                         nout4=nout3+after
3454                         nout5=nout4+after
3455                         nout6=nout5+after
3456                         nout7=nout6+after
3457                         nout8=nout7+after
3458                         do j=1,n1dfft
3459                         r1=zin(1,j,nin1)
3460                         s1=zin(2,j,nin1)
3461                         r=zin(1,j,nin2)
3462                         s=zin(2,j,nin2)
3463                         r2=r*cr2 - s*ci2
3464                         s2=r*ci2 + s*cr2
3465                         r=zin(1,j,nin3)
3466                         s=zin(2,j,nin3)
3467                         r3=r*cr3 - s*ci3
3468                         s3=r*ci3 + s*cr3
3469                         r=zin(1,j,nin4)
3470                         s=zin(2,j,nin4)
3471                         r4=r*cr4 - s*ci4
3472                         s4=r*ci4 + s*cr4
3473                         r=zin(1,j,nin5)
3474                         s=zin(2,j,nin5)
3475                         r5=r*cr5 - s*ci5
3476                         s5=r*ci5 + s*cr5
3477                         r=zin(1,j,nin6)
3478                         s=zin(2,j,nin6)
3479                         r6=r*cr6 - s*ci6
3480                         s6=r*ci6 + s*cr6
3481                         r=zin(1,j,nin7)
3482                         s=zin(2,j,nin7)
3483                         r7=r*cr7 - s*ci7
3484                         s7=r*ci7 + s*cr7
3485                         r=zin(1,j,nin8)
3486                         s=zin(2,j,nin8)
3487                         r8=r*cr8 - s*ci8
3488                         s8=r*ci8 + s*cr8
3489                         r=r1 + r5
3490                         s=r3 + r7
3491                         ap=r + s
3492                         am=r - s
3493                         r=r2 + r6
3494                         s=r4 + r8
3495                         bp=r + s
3496                         bm=r - s
3497                         r=s1 + s5
3498                         s=s3 + s7
3499                         cp=r + s
3500                         cm=r - s
3501                         r=s2 + s6
3502                         s=s4 + s8
3503                         dpp=r + s
3504                         dm=r - s
3505                         zout(1,j,nout1) = ap + bp
3506                         zout(2,j,nout1) = cp + dpp
3507                         zout(1,j,nout5) = ap - bp
3508                         zout(2,j,nout5) = cp - dpp
3509                         zout(1,j,nout3) = am - dm
3510                         zout(2,j,nout3) = cm + bm
3511                         zout(1,j,nout7) = am + dm
3512                         zout(2,j,nout7) = cm - bm
3513                         r= r1 - r5
3514                         s=-s3 + s7
3515                         ap=r + s
3516                         am=r - s
3517                         r=s1 - s5
3518                         s=r7 - r3
3519                         bp=r + s
3520                         bm=r - s
3521                         r=-s4 + s8
3522                         s= r2 - r6
3523                         cp=r + s
3524                         cm=r - s
3525                         r=-s2 + s6
3526                         s= r4 - r8
3527                         dpp=r + s
3528                         dm=r - s
3529                         r = ( cp + dm)*rt2i
3530                         s = ( cp - dm)*rt2i
3531                         cp= ( cm + dpp)*rt2i
3532                         dpp= ( dpp - cm)*rt2i
3533                         zout(1,j,nout2) = ap + r
3534                         zout(2,j,nout2) = bm + s
3535                         zout(1,j,nout6) = ap - r
3536                         zout(2,j,nout6) = bm - s
3537                         zout(1,j,nout4) = am + cp
3538                         zout(2,j,nout4) = bp + dpp
3539                         zout(1,j,nout8) = am - cp
3540                         zout(2,j,nout8) = bp - dpp
3541                         enddo
3542                         enddo
3543 8001                continue
3544 
3545         end if
3546         else if (now.eq.3) then
3547 !         .5d0*sqrt(3.d0)
3548         bb=isign*0.8660254037844387d0
3549         ia=1
3550         nin1=ia-after
3551         nout1=ia-atn
3552         do ib=1,before
3553         nin1=nin1+after
3554         nin2=nin1+atb
3555         nin3=nin2+atb
3556         nout1=nout1+atn
3557         nout2=nout1+after
3558         nout3=nout2+after
3559         do j=1,n1dfft
3560         r1=zin(1,j,nin1)
3561         s1=zin(2,j,nin1)
3562         r2=zin(1,j,nin2)
3563         s2=zin(2,j,nin2)
3564         r3=zin(1,j,nin3)
3565         s3=zin(2,j,nin3)
3566         r=r2 + r3
3567         s=s2 + s3
3568         zout(1,j,nout1) = r + r1
3569         zout(2,j,nout1) = s + s1
3570         r1=r1 - .5d0*r
3571         s1=s1 - .5d0*s
3572         r2=bb*(r2-r3)
3573         s2=bb*(s2-s3)
3574         zout(1,j,nout2) = r1 - s2
3575         zout(2,j,nout2) = s1 + r2
3576         zout(1,j,nout3) = r1 + s2
3577         zout(2,j,nout3) = s1 - r2
3578         enddo
3579         enddo
3580         do 3000,ia=2,after
3581         ias=ia-1
3582         if (4*ias.eq.3*after) then
3583         if (isign.eq.1) then
3584                 nin1=ia-after
3585                 nout1=ia-atn
3586                 do ib=1,before
3587                 nin1=nin1+after
3588                 nin2=nin1+atb
3589                 nin3=nin2+atb
3590                 nout1=nout1+atn
3591                 nout2=nout1+after
3592                 nout3=nout2+after
3593                 do j=1,n1dfft
3594                 r1=zin(1,j,nin1)
3595                 s1=zin(2,j,nin1)
3596                 r2=zin(2,j,nin2)
3597                 s2=zin(1,j,nin2)
3598                 r3=zin(1,j,nin3)
3599                 s3=zin(2,j,nin3)
3600                 r=r3 + r2
3601                 s=s2 - s3
3602                 zout(1,j,nout1) = r1 - r
3603                 zout(2,j,nout1) = s + s1
3604                 r1=r1 + .5d0*r
3605                 s1=s1 - .5d0*s
3606                 r2=bb*(r2-r3)
3607                 s2=bb*(s2+s3)
3608                 zout(1,j,nout2) = r1 - s2
3609                 zout(2,j,nout2) = s1 - r2
3610                 zout(1,j,nout3) = r1 + s2
3611                 zout(2,j,nout3) = s1 + r2
3612                 enddo
3613                 enddo
3614         else
3615                 nin1=ia-after
3616                 nout1=ia-atn
3617                 do ib=1,before
3618                 nin1=nin1+after
3619                 nin2=nin1+atb
3620                 nin3=nin2+atb
3621                 nout1=nout1+atn
3622                 nout2=nout1+after
3623                 nout3=nout2+after
3624                 do j=1,n1dfft
3625                 r1=zin(1,j,nin1)
3626                 s1=zin(2,j,nin1)
3627                 r2=zin(2,j,nin2)
3628                 s2=zin(1,j,nin2)
3629                 r3=zin(1,j,nin3)
3630                 s3=zin(2,j,nin3)
3631                 r=r2 - r3
3632                 s=s2 + s3
3633                 zout(1,j,nout1) = r + r1
3634                 zout(2,j,nout1) = s1 - s
3635                 r1=r1 - .5d0*r
3636                 s1=s1 + .5d0*s
3637                 r2=bb*(r2+r3)
3638                 s2=bb*(s2-s3)
3639                 zout(1,j,nout2) = r1 + s2
3640                 zout(2,j,nout2) = s1 + r2
3641                 zout(1,j,nout3) = r1 - s2
3642                 zout(2,j,nout3) = s1 - r2
3643                 enddo
3644                 enddo
3645         end if
3646         else if (8*ias.eq.3*after) then
3647         if (isign.eq.1) then
3648                 nin1=ia-after
3649                 nout1=ia-atn
3650                 do ib=1,before
3651                 nin1=nin1+after
3652                 nin2=nin1+atb
3653                 nin3=nin2+atb
3654                 nout1=nout1+atn
3655                 nout2=nout1+after
3656                 nout3=nout2+after
3657                 do j=1,n1dfft
3658                 r1=zin(1,j,nin1)
3659                 s1=zin(2,j,nin1)
3660                 r=zin(1,j,nin2)
3661                 s=zin(2,j,nin2)
3662                 r2=(r - s)*rt2i
3663                 s2=(r + s)*rt2i
3664                 r3=zin(2,j,nin3)
3665                 s3=zin(1,j,nin3)
3666                 r=r2 - r3
3667                 s=s2 + s3
3668                 zout(1,j,nout1) = r + r1
3669                 zout(2,j,nout1) = s + s1
3670                 r1=r1 - .5d0*r
3671                 s1=s1 - .5d0*s
3672                 r2=bb*(r2+r3)
3673                 s2=bb*(s2-s3)
3674                 zout(1,j,nout2) = r1 - s2
3675                 zout(2,j,nout2) = s1 + r2
3676                 zout(1,j,nout3) = r1 + s2
3677                 zout(2,j,nout3) = s1 - r2
3678                 enddo
3679                 enddo
3680         else
3681                 nin1=ia-after
3682                 nout1=ia-atn
3683                 do ib=1,before
3684                 nin1=nin1+after
3685                 nin2=nin1+atb
3686                 nin3=nin2+atb
3687                 nout1=nout1+atn
3688                 nout2=nout1+after
3689                 nout3=nout2+after
3690                 do j=1,n1dfft
3691                 r1=zin(1,j,nin1)
3692                 s1=zin(2,j,nin1)
3693                 r=zin(1,j,nin2)
3694                 s=zin(2,j,nin2)
3695                 r2=(r + s)*rt2i
3696                 s2=(s - r)*rt2i
3697                 r3=zin(2,j,nin3)
3698                 s3=zin(1,j,nin3)
3699                 r=r2 + r3
3700                 s=s2 - s3
3701                 zout(1,j,nout1) = r + r1
3702                 zout(2,j,nout1) = s + s1
3703                 r1=r1 - .5d0*r
3704                 s1=s1 - .5d0*s
3705                 r2=bb*(r2-r3)
3706                 s2=bb*(s2+s3)
3707                 zout(1,j,nout2) = r1 - s2
3708                 zout(2,j,nout2) = s1 + r2
3709                 zout(1,j,nout3) = r1 + s2
3710                 zout(2,j,nout3) = s1 - r2
3711                 enddo
3712                 enddo
3713         end if
3714         else
3715         itt=ias*before
3716         itrig=itt+1
3717         cr2=trig(1,itrig)
3718         ci2=trig(2,itrig)
3719         itrig=itrig+itt
3720         cr3=trig(1,itrig)
3721         ci3=trig(2,itrig)
3722         nin1=ia-after
3723         nout1=ia-atn
3724         do ib=1,before
3725         nin1=nin1+after
3726         nin2=nin1+atb
3727         nin3=nin2+atb
3728         nout1=nout1+atn
3729         nout2=nout1+after
3730         nout3=nout2+after
3731         do j=1,n1dfft
3732         r1=zin(1,j,nin1)
3733         s1=zin(2,j,nin1)
3734         r=zin(1,j,nin2)
3735         s=zin(2,j,nin2)
3736         r2=r*cr2 - s*ci2
3737         s2=r*ci2 + s*cr2
3738         r=zin(1,j,nin3)
3739         s=zin(2,j,nin3)
3740         r3=r*cr3 - s*ci3
3741         s3=r*ci3 + s*cr3
3742         r=r2 + r3
3743         s=s2 + s3
3744         zout(1,j,nout1) = r + r1
3745         zout(2,j,nout1) = s + s1
3746         r1=r1 - .5d0*r
3747         s1=s1 - .5d0*s
3748         r2=bb*(r2-r3)
3749         s2=bb*(s2-s3)
3750         zout(1,j,nout2) = r1 - s2
3751         zout(2,j,nout2) = s1 + r2
3752         zout(1,j,nout3) = r1 + s2
3753         zout(2,j,nout3) = s1 - r2
3754         enddo
3755         enddo
3756         end if
3757 3000        continue
3758         else if (now==5) then
3759 !         cos(2.d0*pi/5.d0)
3760         cos2=0.3090169943749474d0
3761 !         cos(4.d0*pi/5.d0)
3762         cos4=-0.8090169943749474d0
3763 !        sin(2.d0*pi/5.d0)
3764         sin2=isign*0.9510565162951536d0
3765 !         sin(4.d0*pi/5.d0)
3766         sin4=isign*0.5877852522924731d0
3767         ia=1
3768         nin1=ia-after
3769         nout1=ia-atn
3770         do ib=1,before
3771         nin1=nin1+after
3772         nin2=nin1+atb
3773         nin3=nin2+atb
3774         nin4=nin3+atb
3775         nin5=nin4+atb
3776         nout1=nout1+atn
3777         nout2=nout1+after
3778         nout3=nout2+after
3779         nout4=nout3+after
3780         nout5=nout4+after
3781         do j=1,n1dfft
3782         r1=zin(1,j,nin1)
3783         s1=zin(2,j,nin1)
3784         r2=zin(1,j,nin2)
3785         s2=zin(2,j,nin2)
3786         r3=zin(1,j,nin3)
3787         s3=zin(2,j,nin3)
3788         r4=zin(1,j,nin4)
3789         s4=zin(2,j,nin4)
3790         r5=zin(1,j,nin5)
3791         s5=zin(2,j,nin5)
3792         r25 = r2 + r5
3793         r34 = r3 + r4
3794         s25 = s2 - s5
3795         s34 = s3 - s4
3796         zout(1,j,nout1) = r1 + r25 + r34
3797         r = r1 + cos2*r25 + cos4*r34
3798         s = sin2*s25 + sin4*s34
3799         zout(1,j,nout2) = r - s
3800         zout(1,j,nout5) = r + s
3801         r = r1 + cos4*r25 + cos2*r34
3802         s = sin4*s25 - sin2*s34
3803         zout(1,j,nout3) = r - s
3804         zout(1,j,nout4) = r + s
3805         r25 = r2 - r5
3806         r34 = r3 - r4
3807         s25 = s2 + s5
3808         s34 = s3 + s4
3809         zout(2,j,nout1) = s1 + s25 + s34
3810         r = s1 + cos2*s25 + cos4*s34
3811         s = sin2*r25 + sin4*r34
3812         zout(2,j,nout2) = r + s
3813         zout(2,j,nout5) = r - s
3814         r = s1 + cos4*s25 + cos2*s34
3815         s = sin4*r25 - sin2*r34
3816         zout(2,j,nout3) = r + s
3817         zout(2,j,nout4) = r - s
3818         enddo
3819         enddo
3820         do 5000,ia=2,after
3821         ias=ia-1
3822         if (8*ias.eq.5*after) then
3823                 if (isign.eq.1) then
3824                         nin1=ia-after
3825                         nout1=ia-atn
3826                         do ib=1,before
3827                         nin1=nin1+after
3828                         nin2=nin1+atb
3829                         nin3=nin2+atb
3830                         nin4=nin3+atb
3831                         nin5=nin4+atb
3832                         nout1=nout1+atn
3833                         nout2=nout1+after
3834                         nout3=nout2+after
3835                         nout4=nout3+after
3836                         nout5=nout4+after
3837                         do j=1,n1dfft
3838                         r1=zin(1,j,nin1)
3839                         s1=zin(2,j,nin1)
3840                         r=zin(1,j,nin2)
3841                         s=zin(2,j,nin2)
3842                         r2=(r - s)*rt2i
3843                         s2=(r + s)*rt2i
3844                         r3=zin(2,j,nin3)
3845                         s3=zin(1,j,nin3)
3846                         r=zin(1,j,nin4)
3847                         s=zin(2,j,nin4)
3848                         r4=(r + s)*rt2i
3849                         s4=(r - s)*rt2i
3850                         r5=zin(1,j,nin5)
3851                         s5=zin(2,j,nin5)
3852                         r25 = r2 - r5
3853                         r34 = r3 + r4
3854                         s25 = s2 + s5
3855                         s34 = s3 - s4
3856                         zout(1,j,nout1) = r1 + r25 - r34
3857                         r = r1 + cos2*r25 - cos4*r34
3858                         s = sin2*s25 + sin4*s34
3859                         zout(1,j,nout2) = r - s
3860                         zout(1,j,nout5) = r + s
3861                         r = r1 + cos4*r25 - cos2*r34
3862                         s = sin4*s25 - sin2*s34
3863                         zout(1,j,nout3) = r - s
3864                         zout(1,j,nout4) = r + s
3865                         r25 = r2 + r5
3866                         r34 = r4 - r3
3867                         s25 = s2 - s5
3868                         s34 = s3 + s4
3869                         zout(2,j,nout1) = s1 + s25 + s34
3870                         r = s1 + cos2*s25 + cos4*s34
3871                         s = sin2*r25 + sin4*r34
3872                         zout(2,j,nout2) = r + s
3873                         zout(2,j,nout5) = r - s
3874                         r = s1 + cos4*s25 + cos2*s34
3875                         s = sin4*r25 - sin2*r34
3876                         zout(2,j,nout3) = r + s
3877                         zout(2,j,nout4) = r - s
3878                         enddo
3879                         enddo
3880                 else
3881                         nin1=ia-after
3882                         nout1=ia-atn
3883                         do ib=1,before
3884                         nin1=nin1+after
3885                         nin2=nin1+atb
3886                         nin3=nin2+atb
3887                         nin4=nin3+atb
3888                         nin5=nin4+atb
3889                         nout1=nout1+atn
3890                         nout2=nout1+after
3891                         nout3=nout2+after
3892                         nout4=nout3+after
3893                         nout5=nout4+after
3894                         do j=1,n1dfft
3895                         r1=zin(1,j,nin1)
3896                         s1=zin(2,j,nin1)
3897                         r=zin(1,j,nin2)
3898                         s=zin(2,j,nin2)
3899                         r2=(r + s)*rt2i
3900                         s2=(s - r)*rt2i
3901                         r3=zin(2,j,nin3)
3902                         s3=zin(1,j,nin3)
3903                         r=zin(1,j,nin4)
3904                         s=zin(2,j,nin4)
3905                         r4=(s - r)*rt2i
3906                         s4=(r + s)*rt2i
3907                         r5=zin(1,j,nin5)
3908                         s5=zin(2,j,nin5)
3909                         r25 = r2 - r5
3910                         r34 = r3 + r4
3911                         s25 = s2 + s5
3912                         s34 = s4 - s3
3913                         zout(1,j,nout1) = r1 + r25 + r34
3914                         r = r1 + cos2*r25 + cos4*r34
3915                         s = sin2*s25 + sin4*s34
3916                         zout(1,j,nout2) = r - s
3917                         zout(1,j,nout5) = r + s
3918                         r = r1 + cos4*r25 + cos2*r34
3919                         s = sin4*s25 - sin2*s34
3920                         zout(1,j,nout3) = r - s
3921                         zout(1,j,nout4) = r + s
3922                         r25 = r2 + r5
3923                         r34 = r3 - r4
3924                         s25 = s2 - s5
3925                         s34 = s3 + s4
3926                         zout(2,j,nout1) = s1 + s25 - s34
3927                         r = s1 + cos2*s25 - cos4*s34
3928                         s = sin2*r25 + sin4*r34
3929                         zout(2,j,nout2) = r + s
3930                         zout(2,j,nout5) = r - s
3931                         r = s1 + cos4*s25 - cos2*s34
3932                         s = sin4*r25 - sin2*r34
3933                         zout(2,j,nout3) = r + s
3934                         zout(2,j,nout4) = r - s
3935                         enddo
3936                         enddo
3937                 end if
3938         else
3939                 ias=ia-1
3940                 itt=ias*before
3941                 itrig=itt+1
3942                 cr2=trig(1,itrig)
3943                 ci2=trig(2,itrig)
3944                 itrig=itrig+itt
3945                 cr3=trig(1,itrig)
3946                 ci3=trig(2,itrig)
3947                 itrig=itrig+itt
3948                 cr4=trig(1,itrig)
3949                 ci4=trig(2,itrig)
3950                 itrig=itrig+itt
3951                 cr5=trig(1,itrig)
3952                 ci5=trig(2,itrig)
3953                 nin1=ia-after
3954                 nout1=ia-atn
3955                 do ib=1,before
3956                 nin1=nin1+after
3957                 nin2=nin1+atb
3958                 nin3=nin2+atb
3959                 nin4=nin3+atb
3960                 nin5=nin4+atb
3961                 nout1=nout1+atn
3962                 nout2=nout1+after
3963                 nout3=nout2+after
3964                 nout4=nout3+after
3965                 nout5=nout4+after
3966                 do j=1,n1dfft
3967                 r1=zin(1,j,nin1)
3968                 s1=zin(2,j,nin1)
3969                 r=zin(1,j,nin2)
3970                 s=zin(2,j,nin2)
3971                 r2=r*cr2 - s*ci2
3972                 s2=r*ci2 + s*cr2
3973                 r=zin(1,j,nin3)
3974                 s=zin(2,j,nin3)
3975                 r3=r*cr3 - s*ci3
3976                 s3=r*ci3 + s*cr3
3977                 r=zin(1,j,nin4)
3978                 s=zin(2,j,nin4)
3979                 r4=r*cr4 - s*ci4
3980                 s4=r*ci4 + s*cr4
3981                 r=zin(1,j,nin5)
3982                 s=zin(2,j,nin5)
3983                 r5=r*cr5 - s*ci5
3984                 s5=r*ci5 + s*cr5
3985                 r25 = r2 + r5
3986                 r34 = r3 + r4
3987                 s25 = s2 - s5
3988                 s34 = s3 - s4
3989                 zout(1,j,nout1) = r1 + r25 + r34
3990                 r = r1 + cos2*r25 + cos4*r34
3991                 s = sin2*s25 + sin4*s34
3992                 zout(1,j,nout2) = r - s
3993                 zout(1,j,nout5) = r + s
3994                 r = r1 + cos4*r25 + cos2*r34
3995                 s = sin4*s25 - sin2*s34
3996                 zout(1,j,nout3) = r - s
3997                 zout(1,j,nout4) = r + s
3998                 r25 = r2 - r5
3999                 r34 = r3 - r4
4000                 s25 = s2 + s5
4001                 s34 = s3 + s4
4002                 zout(2,j,nout1) = s1 + s25 + s34
4003                 r = s1 + cos2*s25 + cos4*s34
4004                 s = sin2*r25 + sin4*r34
4005                 zout(2,j,nout2) = r + s
4006                 zout(2,j,nout5) = r - s
4007                 r = s1 + cos4*s25 + cos2*s34
4008                 s = sin4*r25 - sin2*r34
4009                 zout(2,j,nout3) = r + s
4010                 zout(2,j,nout4) = r - s
4011                 enddo
4012                 enddo
4013         end if
4014 5000        continue
4015        else if (now.eq.6) then
4016 !         .5d0*sqrt(3.d0)
4017         bb=isign*0.8660254037844387d0
4018 
4019         ia=1
4020         nin1=ia-after
4021         nout1=ia-atn
4022         do ib=1,before
4023         nin1=nin1+after
4024         nin2=nin1+atb
4025         nin3=nin2+atb
4026         nin4=nin3+atb
4027         nin5=nin4+atb
4028         nin6=nin5+atb
4029         nout1=nout1+atn
4030         nout2=nout1+after
4031         nout3=nout2+after
4032         nout4=nout3+after
4033         nout5=nout4+after
4034         nout6=nout5+after
4035         do j=1,n1dfft
4036         r2=zin(1,j,nin3)
4037         s2=zin(2,j,nin3)
4038         r3=zin(1,j,nin5)
4039         s3=zin(2,j,nin5)
4040         r=r2 + r3
4041         s=s2 + s3
4042         r1=zin(1,j,nin1)
4043         s1=zin(2,j,nin1)
4044         ur1 = r + r1
4045         ui1 = s + s1
4046         r1=r1 - .5d0*r
4047         s1=s1 - .5d0*s
4048         r=r2-r3
4049         s=s2-s3
4050         ur2 = r1 - s*bb
4051         ui2 = s1 + r*bb
4052         ur3 = r1 + s*bb
4053         ui3 = s1 - r*bb
4054 
4055         r2=zin(1,j,nin6)
4056         s2=zin(2,j,nin6)
4057         r3=zin(1,j,nin2)
4058         s3=zin(2,j,nin2)
4059         r=r2 + r3
4060         s=s2 + s3
4061         r1=zin(1,j,nin4)
4062         s1=zin(2,j,nin4)
4063         vr1 = r + r1
4064         vi1 = s + s1
4065         r1=r1 - .5d0*r
4066         s1=s1 - .5d0*s
4067         r=r2-r3
4068         s=s2-s3
4069         vr2 = r1 - s*bb
4070         vi2 = s1 + r*bb
4071         vr3 = r1 + s*bb
4072         vi3 = s1 - r*bb
4073 
4074         zout(1,j,nout1)=ur1+vr1
4075         zout(2,j,nout1)=ui1+vi1
4076         zout(1,j,nout5)=ur2+vr2
4077         zout(2,j,nout5)=ui2+vi2
4078         zout(1,j,nout3)=ur3+vr3
4079         zout(2,j,nout3)=ui3+vi3
4080         zout(1,j,nout4)=ur1-vr1
4081         zout(2,j,nout4)=ui1-vi1
4082         zout(1,j,nout2)=ur2-vr2
4083         zout(2,j,nout2)=ui2-vi2
4084         zout(1,j,nout6)=ur3-vr3
4085         zout(2,j,nout6)=ui3-vi3
4086         enddo
4087         enddo
4088 
4089         else
4090           ABI_ERROR('error fftstp')
4091         end if
4092 
4093 end subroutine fftstp

m_sg2002/sg2002_accrho [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

 sg2002_accrho

FUNCTION

 Accumulates the real space density rho from the ndat wavefunctions zf
 by transforming zf into real space and adding all the amplitudes squared

 INPUTS:
   ZF: input array (note the switch of i2 and i3)
         real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
         imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
   i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
   then, if m1 > max1+1, one has min1=max1-m1+1 and
   i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
   i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF
   md2proc=((md2-1)/nproc_fft)+1 ! maximal number of small box 2nd dim slices for one proc
   weight(ndat)= weight for the density accumulation

 OUTPUTS:
    RHOoutput(i1,i2,i3) = RHOinput(i1,i2,i3) + sum on idat of (Re(FFT(ZF))**2 *weight_r + weight_i*Im(FFT(ZF))**2
        i1=1,n1 , i2=1,n2 , i3=1,n3
   comm_fft: MPI communicator
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG
    nd1,nd2,nd3: Dimension of RHO
   nd3proc=((nd3-1)/nproc_fft)+1 ! maximal number of big box 3rd dim slices for one proc

 NOTES:
   PERFORMANCE CONSIDERATIONS:
   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

2264 subroutine sg2002_accrho(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,&
2265 &  max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft,zf,rho,weight_r,weight_i)
2266 
2267  implicit none
2268 
2269 !Arguments ------------------------------------
2270  integer,intent(in) :: icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc
2271  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft
2272  real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat)
2273  real(dp),intent(in) :: weight_r(ndat), weight_i(ndat)
2274  real(dp),intent(inout) :: rho(nd1,nd2,nd3)
2275 
2276 !Local variables-------------------------------
2277 !scalars
2278  integer,parameter :: unused0=0
2279  integer :: i,j,i1,ic1,ic2,ic3,idat,ierr,inzee,j3glob
2280  integer :: ioption,j2,j3,j2st,jp2st,lot,lzt,m1zt,ma,mb,n1dfft,nnd3
2281  integer :: m2eff,ncache,n1eff,jeff,includelast
2282 !arrays
2283  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
2284  real(dp), allocatable :: zt(:,:,:)  ! work arrays for transpositions
2285  real(dp), allocatable :: zw(:,:,:) ! cache work array
2286  real(dp) :: tsec(2)
2287 ! FFT work arrays
2288  real(dp), allocatable, dimension(:,:) :: trig1,trig2,trig3
2289  integer, allocatable, dimension(:) :: after1,now1,before1, after2,now2,before2,after3,now3,before3
2290 
2291 ! *************************************************************************
2292 
2293  !ioption=0 ! This was in the old version.
2294  ioption=1 ! This one is needed to be compatible with paral_kgb
2295 
2296  !nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
2297 
2298 ! find cache size that gives optimal performance on machine
2299  ncache=4*max(n1,n2,n3,1024)
2300  if (ncache/(4*max(n1,n2,n3)) < 1) then
2301     write(std_out,*) &
2302 &     'ncache has to be enlarged to be able to hold at', &
2303 &     'least one 1-d FFT of each size even though this will', &
2304 &     'reduce the performance for shorter transform lengths'
2305     ABI_ERROR("Aborting now")
2306  end if
2307 
2308 !Effective m1 and m2 (complex-to-complex or real-to-complex)
2309  n1eff=n1; m2eff=m2 ; m1zt=n1
2310  if (icplexwf==1) then
2311    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
2312  end if
2313 
2314  lzt=m2eff
2315  if (mod(m2eff,2) == 0) lzt=lzt+1
2316  if (mod(m2eff,4) == 0) lzt=lzt+1
2317 
2318  ! maximal number of big box 3rd dim slices for all procs
2319  nnd3=nd3proc*nproc_fft
2320 
2321  ABI_MALLOC(trig1,(2,n1))
2322  ABI_MALLOC(after1,(mdata))
2323  ABI_MALLOC(now1,(mdata))
2324  ABI_MALLOC(before1,(mdata))
2325  ABI_MALLOC(trig2,(2,n2))
2326  ABI_MALLOC(after2,(mdata))
2327  ABI_MALLOC(now2,(mdata))
2328  ABI_MALLOC(before2,(mdata))
2329  ABI_MALLOC(trig3,(2,n3))
2330  ABI_MALLOC(after3,(mdata))
2331  ABI_MALLOC(now3,(mdata))
2332  ABI_MALLOC(before3,(mdata))
2333 
2334  ABI_MALLOC(zw,(2,ncache/4,2))
2335  ABI_MALLOC(zt,(2,lzt,m1zt))
2336  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3))
2337  if (nproc_fft > 1)  then
2338    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3))
2339  end if
2340 
2341  call ctrig(n3,trig3,after3,before3,now3,1,ic3)
2342  call ctrig(n1,trig1,after1,before1,now1,1,ic1)
2343  call ctrig(n2,trig2,after2,before2,now2,1,ic2)
2344 
2345  do idat=1,ndat
2346    ! transform along z axis
2347    ! input: I1,I3,J2,(Jp2)
2348    lot=ncache/(4*n3)
2349 
2350    ! Loop over the y planes treated by this node and trasform n1ddft G_z lines.
2351    do j2=1,md2proc
2352      if (me_fft*md2proc+j2 <= m2eff) then ! MG REMOVED TO BE COSISTENT WITH BACK_WF
2353        do i1=1,m1,lot
2354          ma=i1
2355          mb=min(i1+(lot-1),m1)
2356          n1dfft=mb-ma+1
2357 
2358          ! zero-pad n1dfft G_z lines
2359          !  input: G1,G3,G2,(Gp2)
2360          ! output: G1,R3,G2,(Gp2)
2361          call fill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw(1,1,1))
2362 
2363          ! Transform along z.
2364          inzee=1
2365          do i=1,ic3
2366            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
2367 &                       trig3,after3(i),now3(i),before3(i),1)
2368            inzee=3-inzee
2369          end do
2370 
2371          ! Local rotation.
2372          ! input:  G1,R3,G2,(Gp2)
2373          ! output: G1,G2,R3,(Gp2)
2374          call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2)
2375        end do
2376      end if
2377    end do
2378 
2379    ! Interprocessor data transposition
2380    ! input:  G1,G2,R3,Rp3,(Gp2)
2381    ! output: G1,G2,R3,Gp2,(Rp3)
2382    if (nproc_fft > 1) then
2383      call timab(543,1,tsec)
2384      call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc, &
2385 &                       zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr)
2386      call timab(543,2,tsec)
2387    end if
2388 
2389    ! Loop over the z treated by this node.
2390    do j3=1,nd3proc
2391      j3glob = j3 + me_fft*nd3proc
2392      !ABI_CHECK(j3glob <= n3, "j3glob")
2393 
2394      if (me_fft*nd3proc+j3 <= n3) then
2395        Jp2st=1; J2st=1
2396 
2397        lot=ncache/(4*n1)
2398 
2399        ! Loop over G_y in the small box.
2400        do j=1,m2eff,lot
2401          ma=j
2402          mb=min(j+(lot-1),m2eff)
2403          n1dfft=mb-ma+1
2404 
2405          ! Zero-pad input.
2406          ! input:  G1,G2,R3,JG2,(Rp3)
2407          ! output: G2,G1,R3,JG2,(Rp3)
2408 
2409          if (nproc_fft == 1) then
2410           call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
2411 &           md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw(1,1,1),unused0, unused0,unused0)
2412          else
2413           call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
2414 &           md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw(1,1,1), unused0,unused0,unused0)
2415          end if
2416 
2417          ! Transform along x
2418          ! input:  G2,G1,R3,(Rp3)
2419          ! output: G2,R1,R3,(Rp3)
2420          inzee=1
2421          do i=1,ic1-1
2422            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
2423 &                    trig1,after1(i),now1(i),before1(i),1)
2424            inzee=3-inzee
2425          end do
2426 
2427          i=ic1
2428          call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), &
2429 &                    trig1,after1(i),now1(i),before1(i),1)
2430        end do
2431 
2432        ! Transform along y axis (take into account c2c or c2r case).
2433        ! Must loop over the full box.
2434        lot=ncache/(4*n2)
2435        if (icplexwf==1) then
2436          if (mod(lot,2).ne.0) lot=lot-1 ! needed to introduce jeff
2437        end if
2438 
2439        do j=1,n1eff,lot
2440          ma=j
2441          mb=min(j+(lot-1),n1eff)
2442          n1dfft=mb-ma+1
2443          jeff=j
2444          includelast=1
2445 
2446          if (icplexwf==1) then
2447            jeff=2*j-1
2448            includelast=1
2449            if (mb==n1eff .and. n1eff*2/=n1) includelast=0
2450          end if
2451 
2452          ! Zero-pad the input.
2453          ! input:  G2,R1,R3,(Rp3)
2454          ! output: R1,G2,R3,(Rp3)
2455          if (icplexwf==2) then
2456            call switch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1))
2457          else
2458            call switchreal_cent(includelast,n1dfft,max2,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1))
2459          end if
2460 
2461          inzee=1
2462          do i=1,ic2
2463            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
2464 &                       trig2,after2(i),now2(i),before2(i),1)
2465            inzee=3-inzee
2466          end do
2467 
2468          ! Accumulate
2469          call addrho(icplexwf,includelast,nd1,nd2,n2,lot,n1dfft,&
2470 &          zw(1,1,inzee),rho(jeff,1,j3glob),weight_r(idat),weight_i(idat))
2471        end do
2472        ! output: i1,i2,j3,(jp3)
2473 
2474       end if
2475     end do ! j3
2476  end do ! idat
2477 
2478  ABI_FREE(trig1)
2479  ABI_FREE(after1)
2480  ABI_FREE(now1)
2481  ABI_FREE(before1)
2482  ABI_FREE(trig2)
2483  ABI_FREE(after2)
2484  ABI_FREE(now2)
2485  ABI_FREE(before2)
2486  ABI_FREE(trig3)
2487  ABI_FREE(after3)
2488  ABI_FREE(now3)
2489  ABI_FREE(before3)
2490 
2491  ABI_FREE(zmpi2)
2492  ABI_FREE(zw)
2493  ABI_FREE(zt)
2494  if (nproc_fft > 1)  then
2495    ABI_FREE(zmpi1)
2496  end if
2497 
2498 end subroutine sg2002_accrho

m_sg2002/sg2002_applypot [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_applypot

FUNCTION

 Applies the local real space potential to multiple wavefunctions in Fourier space

INPUTS

   ZF: Wavefunction (input/output) (note the switch of i2 and i3)
        real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
        imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
   i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
   then, if m1 > max1+1, one has min1=max1-m1+1 and
   i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
   i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF (input as well as output), distributed on different procs
   md2proc=((md2-1)/nproc_fft)+1  maximal number of small box 2nd dim slices for one proc

   POT: Potential
        POT(cplex*i1,i2,i3)
        cplex=1 or 2 ,  i1=1,n1 , i2=1,n2 , i3=1,n3
   nd1,nd2,nd3: dimension of pot
   comm_fft: MPI communicator
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG

 NOTES:
   PERFORMANCE CONSIDERATIONS:
   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

1419 subroutine sg2002_applypot(icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,&
1420 &  max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,&
1421 &  max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,pot,zf)
1422 
1423  implicit none
1424 
1425 !Arguments ------------------------------------
1426  integer,intent(in) :: icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc
1427  integer,intent(in) :: max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3
1428  integer,intent(in) :: max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft
1429  real(dp),intent(in) :: pot(cplex*nd1,nd2,nd3)
1430  real(dp),intent(inout) :: zf(2,md1,md3,md2proc,ndat)
1431 
1432 !Local variables-------------------------------
1433 !scalars
1434  integer,parameter :: unused0=0
1435  integer :: i,j,i1,i2,i3,ic1,ic2,ic3,idat,ierr,inzee,j3glob
1436  integer :: ioption,j2,j3,lot,lzt,m1zt,ma,mb,n1dfft,nnd3
1437  integer :: m2eff,ncache,n1eff,i1inv,i2inv,i3inv,jeff,includelast,j2stb
1438  integer :: jx,j2stf,Jp2stb,Jp2stf,m2ieff,m2oeff
1439 !arrays
1440  real(dp) :: tsec(2)
1441  real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions
1442  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
1443  real(dp), allocatable :: zw(:,:,:) ! cache work array
1444 ! FFT work arrays
1445  real(dp), allocatable, dimension(:,:) :: btrig1,btrig2,btrig3
1446  real(dp), allocatable, dimension(:,:) :: ftrig1,ftrig2,ftrig3
1447  integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3
1448 
1449 ! *************************************************************************
1450 
1451  !ioption=0 ! This was in the old version.
1452  ioption=1 ! This one is needed to be compatible with paral_kgb
1453 
1454  ncache=4*max(n1,n2,n3,1024)
1455  if (ncache/(4*max(n1,n2,n3)) < 1) then
1456    write(std_out,*) &
1457 &    'ncache has to be enlarged to be able to hold at', &
1458 &    'least one 1-d FFT of each size even though this will', &
1459 &    'reduce the performance for shorter transform lengths'
1460    ABI_ERROR("Aborting now")
1461  end if
1462 
1463  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
1464  n1eff=n1; m2ieff=m2i; m2oeff=m2o; m1zt=n1
1465  if (icplexwf==1) then
1466    n1eff=(n1+1)/2; m2ieff=m2i/2+1; m2oeff=m2o/2+1; m1zt=2*(n1/2+1)
1467  end if
1468 
1469  m2eff=max(m2ieff,m2oeff)
1470  lzt=m2eff
1471  if (mod(m2eff,2) == 0) lzt=lzt+1
1472  if (mod(m2eff,4) == 0) lzt=lzt+1
1473 
1474  ! maximal number of big box 3rd dim slices for all procs
1475  nnd3=nd3proc*nproc_fft
1476 
1477  ABI_MALLOC(btrig1,(2,n1))
1478  ABI_MALLOC(ftrig1,(2,n1))
1479  ABI_MALLOC(after1,(mdata))
1480  ABI_MALLOC(now1,(mdata))
1481  ABI_MALLOC(before1,(mdata))
1482  ABI_MALLOC(btrig2,(2,n2))
1483  ABI_MALLOC(ftrig2,(2,n2))
1484  ABI_MALLOC(after2,(mdata))
1485  ABI_MALLOC(now2,(mdata))
1486  ABI_MALLOC(before2,(mdata))
1487  ABI_MALLOC(btrig3,(2,n3))
1488  ABI_MALLOC(ftrig3,(2,n3))
1489  ABI_MALLOC(after3,(mdata))
1490  ABI_MALLOC(now3,(mdata))
1491  ABI_MALLOC(before3,(mdata))
1492 
1493  ABI_MALLOC(zw,(2,ncache/4,2))
1494  ABI_MALLOC(zt,(2,lzt,m1zt))
1495  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3))
1496  if (nproc_fft > 1)  then
1497    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3))
1498  end if
1499 
1500  call ctrig(n3,btrig3,after3,before3,now3,1,ic3)
1501  call ctrig(n1,btrig1,after1,before1,now1,1,ic1)
1502  call ctrig(n2,btrig2,after2,before2,now2,1,ic2)
1503 
1504  do j=1,n1
1505    ftrig1(1,j)= btrig1(1,j)
1506    ftrig1(2,j)=-btrig1(2,j)
1507  end do
1508  do j=1,n2
1509    ftrig2(1,j)= btrig2(1,j)
1510    ftrig2(2,j)=-btrig2(2,j)
1511  end do
1512  do j=1,n3
1513    ftrig3(1,j)= btrig3(1,j)
1514    ftrig3(2,j)=-btrig3(2,j)
1515  end do
1516 
1517  do idat=1,ndat
1518    !
1519    ! transform along z axis
1520    ! input: G1,G3,G2,(Gp2)
1521    lot=ncache/(4*n3)
1522    do j2=1,md2proc
1523      if (me_fft*md2proc+j2 <= m2ieff) then
1524        do i1=1,m1i,lot
1525          ma=i1
1526          mb=min(i1+(lot-1),m1i)
1527          n1dfft=mb-ma+1
1528 
1529          ! zero-pad n1dfft G_z lines
1530          ! input: G1,G3,G2,(Gp2)
1531          call fill_cent(md1,md3,lot,n1dfft,max3i,m3i,n3,zf(1,i1,1,j2,idat),zw(1,1,1))
1532 
1533          inzee=1
1534          do i=1,ic3
1535            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
1536 &                      btrig3,after3(i),now3(i),before3(i),1)
1537            inzee=3-inzee
1538          end do
1539 
1540          ! Local rotation.
1541          ! input:  G1,R3,G2,(Gp2)
1542          ! output: G1,G2,R3,(Gp2)
1543          call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2)
1544        end do
1545      end if
1546    end do
1547 
1548    ! Interprocessor data transposition
1549    ! input:  G1,G2,R3,Rp3,(Gp2)
1550    ! output: G1,G2,R3,Gp2,(Rp3)
1551    if (nproc_fft > 1) then
1552       call timab(543,1,tsec)
1553       call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc,&
1554 &                        zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr)
1555       call timab(543,2,tsec)
1556    end if
1557 
1558    do j3=1,nd3proc
1559      j3glob = j3 + me_fft*nd3proc
1560 
1561      if (me_fft*nd3proc+j3 <= n3) then
1562        Jp2stb=1; J2stb=1
1563        Jp2stf=1; J2stf=1
1564 
1565        ! transform along x axis
1566        lot=ncache/(4*n1)
1567 
1568        do j=1,m2ieff,lot
1569          ma=j
1570          mb=min(j+(lot-1),m2ieff)
1571          n1dfft=mb-ma+1
1572 
1573          ! Zero-pad input.
1574          ! input:  G1,G2,R3,G2,(Rp3)
1575          ! output: G2,G1,R3,G2,(Rp3)
1576          if (nproc_fft == 1) then
1577            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,&
1578 &           md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw(1,1,1), unused0, unused0, unused0)
1579          else
1580            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,&
1581 &           md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw(1,1,1), unused0, unused0, unused0)
1582          end if
1583 
1584          ! Transform along x
1585          ! input:  G2,G1,R3,(Rp3)
1586          ! output: G2,R1,R3,(Rp3)
1587          inzee=1
1588          do i=1,ic1-1
1589            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
1590 &                     btrig1,after1(i),now1(i),before1(i),1)
1591            inzee=3-inzee
1592          end do
1593 
1594          i=ic1
1595          call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), &
1596 &                    btrig1,after1(i),now1(i),before1(i),1)
1597        end do
1598 
1599        ! Transform along y axis (take into account c2c or c2r case).
1600        ! Must loop over the full box.
1601        lot=ncache/(4*n2)
1602 
1603        if (icplexwf==1) then
1604          if(mod(lot,2).ne.0)lot=lot-1 ! needed to introduce jeff
1605        end if
1606 
1607        do j=1,n1eff,lot
1608          ma=j
1609          mb=min(j+(lot-1),n1eff)
1610          n1dfft=mb-ma+1
1611          jeff=j
1612          includelast=1
1613 
1614          if (icplexwf==1) then
1615            jeff=2*j-1
1616            includelast=1
1617            if (mb==n1eff .and. n1eff*2/=n1) includelast=0
1618          end if
1619 
1620          ! Zero-pad the input.
1621          !  input: G2,R1,R3,(Rp3)
1622          ! output: R1,G2,R3,(Rp3)
1623          if (icplexwf==2) then
1624            call switch_cent(n1dfft,max2i,m2i,n2,lot,n1,lzt,zt(1,1,jeff),zw(1,1,1))
1625          else
1626            call switchreal_cent(includelast,n1dfft,max2i,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1))
1627          end if
1628 
1629          ! input:  R1,G2,R3,(Rp3)
1630          ! output: R1,R2,R3,(Rp3)
1631          inzee=1
1632          do i=1,ic2
1633            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
1634 &                       btrig2,after2(i),now2(i),before2(i),1)
1635             inzee=3-inzee
1636          end do
1637          ! output: R1,R2,R3,(Rp3)
1638 
1639          ! Multiply with potential in real space
1640          jx=cplex*(jeff-1)+1
1641          call multpot(icplexwf,cplex,includelast,nd1,nd2,n2,lot,n1dfft,pot(jx,1,j3glob),zw(1,1,inzee))
1642 
1643          ! TRANSFORM BACK IN FOURIER SPACE
1644          ! transform along y axis
1645          ! input: R1,R2,R3,(Rp3)
1646          do i=1,ic2
1647            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
1648 &                       ftrig2,after2(i),now2(i),before2(i),-1)
1649            inzee=3-inzee
1650          end do
1651 
1652          !  input: R1,G2,R3,(Rp3)
1653          ! output: G2,R1,R3,(Rp3)
1654          if (icplexwf==2) then
1655            call unswitch_cent(n1dfft,max2o,m2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff))
1656          else
1657            call unswitchreal_cent(n1dfft,max2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff))
1658          end if
1659 
1660        end do ! j
1661 
1662        ! transform along x axis
1663        ! input:  R2,R1,R3,(Rp3)
1664        ! output: R2,G1,R3,(Rp3)
1665        lot=ncache/(4*n1)
1666 
1667        do j=1,m2oeff,lot
1668          ma=j
1669          mb=min(j+(lot-1),m2oeff)
1670          n1dfft=mb-ma+1
1671          i=1
1672          call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), &
1673 &                    ftrig1,after1(i),now1(i),before1(i),-1)
1674 
1675          inzee=1
1676          do i=2,ic1
1677            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
1678 &                       ftrig1,after1(i),now1(i),before1(i),-1)
1679            inzee=3-inzee
1680          end do
1681 
1682          ! input:  G2,G1,R3,Gp2,(Rp3)
1683          ! output: G1,G2,R3,Gp2,(Rp3)
1684          if (nproc_fft == 1) then
1685            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,&
1686 &           md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi2)
1687          else
1688            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,&
1689 &           md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi1)
1690          end if
1691        end do ! j
1692      end if
1693    end do
1694 
1695    ! Interprocessor data transposition
1696    ! input:  G1,G2,R3,Gp2,(Rp3)
1697    ! output: G1,G2,R3,Rp3,(Gp2)
1698    if (nproc_fft > 1) then
1699      call timab(544,1,tsec)
1700      call xmpi_alltoall(zmpi1,2*md1*md2proc*nd3proc, &
1701 &                       zmpi2,2*md1*md2proc*nd3proc,comm_fft,ierr)
1702      call timab(544,2,tsec)
1703    end if
1704 
1705    ! transform along z axis
1706    ! input: G1,G2,R3,(Gp2)
1707    lot=ncache/(4*n3)
1708    do j2=1,md2proc
1709      if (me_fft*md2proc+j2 <= m2oeff) then
1710        do i1=1,m1o,lot
1711          ma=i1
1712          mb=min(i1+(lot-1),m1o)
1713          n1dfft=mb-ma+1
1714 
1715          ! input:  G1,G2,R3,(Gp2)
1716          ! output: G1,R3,G2,(Gp2)
1717          call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2,zw(1,1,1))
1718 
1719          inzee=1
1720          do i=1,ic3
1721            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
1722 &           ftrig3,after3(i),now3(i),before3(i),-1)
1723            inzee=3-inzee
1724          end do
1725 
1726          call unfill_cent(md1,md3,lot,n1dfft,max3o,m3o,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))
1727          ! output: G1,G3,G2,(Gp2)
1728        end do
1729      end if
1730    end do
1731 
1732    ! Complete missing values with complex conjugate
1733    ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
1734    if (icplexwf==1) then
1735      do i3=1,m3o
1736        i3inv=m3o+2-i3
1737        if (i3==1) i3inv=1
1738        if (m2oeff>1)then
1739          do i2=2,m2oeff
1740            i2inv=m2o+2-i2
1741            zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)
1742            zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)
1743            do i1=2,m1o
1744              i1inv=m1o+2-i1
1745              zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)
1746              zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)
1747            end do
1748          end do
1749        end if
1750      end do
1751    end if
1752 
1753  end do ! idat
1754 
1755  ABI_FREE(btrig1)
1756  ABI_FREE(ftrig1)
1757  ABI_FREE(after1)
1758  ABI_FREE(now1)
1759  ABI_FREE(before1)
1760  ABI_FREE(btrig2)
1761  ABI_FREE(ftrig2)
1762  ABI_FREE(after2)
1763  ABI_FREE(now2)
1764  ABI_FREE(before2)
1765  ABI_FREE(btrig3)
1766  ABI_FREE(ftrig3)
1767  ABI_FREE(after3)
1768  ABI_FREE(now3)
1769  ABI_FREE(before3)
1770 
1771  ABI_FREE(zmpi2)
1772  ABI_FREE(zw)
1773  ABI_FREE(zt)
1774  if (nproc_fft > 1)  then
1775    ABI_FREE(zmpi1)
1776  end if
1777 
1778 end subroutine sg2002_applypot

m_sg2002/sg2002_applypot_many [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_applypot_many

FUNCTION

 Applies the local real space potential to multiple wavefunctions in Fourier space

INPUTS

   ZF: Wavefunction (input/output) (note the switch of i2 and i3)
        real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
        imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
   i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
   then, if m1 > max1+1, one has min1=max1-m1+1 and
   i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
   i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF (input as well as output), distributed on different procs
   md2proc=((md2-1)/nproc_fft)+1  maximal number of small box 2nd dim slices for one proc

   POT: Potential
        POT(cplex*i1,i2,i3)
        cplex=1 or 2 ,  i1=1,n1 , i2=1,n2 , i3=1,n3
   nd1,nd2,nd3: dimension of pot
   comm_fft: MPI communicator
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG

 NOTES:
   PERFORMANCE CONSIDERATIONS:
   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

1833 subroutine sg2002_applypot_many(icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,&
1834 &  max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,&
1835 &  max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,pot,zf)
1836 
1837  implicit none
1838 
1839 !Arguments ------------------------------------
1840  integer,intent(in) :: icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc
1841  integer,intent(in) :: max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3
1842  integer,intent(in) :: max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft
1843  real(dp),intent(in) :: pot(cplex*nd1,nd2,nd3)
1844  real(dp),intent(inout) :: zf(2,md1,md3,md2proc,ndat)
1845 
1846 !Local variables-------------------------------
1847 !scalars
1848  integer,parameter :: unused0=0
1849  integer :: i,j,i1,i2,i3,ic1,ic2,ic3,idat,ierr,inzee,j3glob
1850  integer :: ioption,j2,j3,lot,lzt,m1zt,ma,mb,n1dfft,nnd3
1851  integer :: m2eff,ncache,n1eff,i1inv,i2inv,i3inv,jeff,includelast,j2stb
1852  integer :: jx,j2stf,Jp2stb,Jp2stf,m2ieff,m2oeff
1853 !arrays
1854  integer :: requests(ndat)
1855  real(dp) :: tsec(2)
1856  real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions
1857  real(dp), allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI
1858  real(dp), allocatable :: zw(:,:,:) ! cache work array
1859 ! FFT work arrays
1860  real(dp), allocatable, dimension(:,:) :: btrig1,btrig2,btrig3
1861  real(dp), allocatable, dimension(:,:) :: ftrig1,ftrig2,ftrig3
1862  integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3
1863 
1864 ! *************************************************************************
1865 
1866  !ioption=0 ! This was in the old version.
1867  ioption=1 ! This one is needed to be compatible with paral_kgb
1868 
1869  ! call timab(541,1,tsec)
1870  ncache=4*max(n1,n2,n3,1024)
1871  if (ncache/(4*max(n1,n2,n3)) < 1) then
1872    write(std_out,*) &
1873 &    'ncache has to be enlarged to be able to hold at', &
1874 &    'least one 1-d FFT of each size even though this will', &
1875 &    'reduce the performance for shorter transform lengths'
1876    ABI_ERROR("Aborting now")
1877  end if
1878 
1879  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
1880  n1eff=n1; m2ieff=m2i; m2oeff=m2o; m1zt=n1
1881  if (icplexwf==1) then
1882    n1eff=(n1+1)/2; m2ieff=m2i/2+1; m2oeff=m2o/2+1; m1zt=2*(n1/2+1)
1883  end if
1884 
1885  m2eff=max(m2ieff,m2oeff)
1886  lzt=m2eff
1887  if (mod(m2eff,2) == 0) lzt=lzt+1
1888  if (mod(m2eff,4) == 0) lzt=lzt+1
1889 
1890  ! maximal number of big box 3rd dim slices for all procs
1891  nnd3=nd3proc*nproc_fft
1892 
1893  ABI_MALLOC(btrig1,(2,n1))
1894  ABI_MALLOC(ftrig1,(2,n1))
1895  ABI_MALLOC(after1,(mdata))
1896  ABI_MALLOC(now1,(mdata))
1897  ABI_MALLOC(before1,(mdata))
1898  ABI_MALLOC(btrig2,(2,n2))
1899  ABI_MALLOC(ftrig2,(2,n2))
1900  ABI_MALLOC(after2,(mdata))
1901  ABI_MALLOC(now2,(mdata))
1902  ABI_MALLOC(before2,(mdata))
1903  ABI_MALLOC(btrig3,(2,n3))
1904  ABI_MALLOC(ftrig3,(2,n3))
1905  ABI_MALLOC(after3,(mdata))
1906  ABI_MALLOC(now3,(mdata))
1907  ABI_MALLOC(before3,(mdata))
1908 
1909  ABI_MALLOC(zw,(2,ncache/4,2))
1910  ABI_MALLOC(zt,(2,lzt,m1zt))
1911  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat))
1912  if (nproc_fft > 1)  then
1913    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat))
1914  end if
1915 
1916  call ctrig(n3,btrig3,after3,before3,now3,1,ic3)
1917  call ctrig(n1,btrig1,after1,before1,now1,1,ic1)
1918  call ctrig(n2,btrig2,after2,before2,now2,1,ic2)
1919 
1920  do j=1,n1
1921    ftrig1(1,j)= btrig1(1,j)
1922    ftrig1(2,j)=-btrig1(2,j)
1923  end do
1924  do j=1,n2
1925    ftrig2(1,j)= btrig2(1,j)
1926    ftrig2(2,j)=-btrig2(2,j)
1927  end do
1928  do j=1,n3
1929    ftrig3(1,j)= btrig3(1,j)
1930    ftrig3(2,j)=-btrig3(2,j)
1931  end do
1932 
1933  ! Here we take advantage of non-blocking IALLTOALL:
1934  ! Perform the first step of MPI-FFT for ndat wavefunctions.
1935  do idat=1,ndat
1936 
1937    !
1938    ! transform along z axis
1939    ! input: G1,G3,G2,(Gp2)
1940    lot=ncache/(4*n3)
1941    do j2=1,md2proc
1942      if (me_fft*md2proc+j2 <= m2ieff) then
1943        do i1=1,m1i,lot
1944          ma=i1
1945          mb=min(i1+(lot-1),m1i)
1946          n1dfft=mb-ma+1
1947 
1948          ! zero-pad n1dfft G_z lines
1949          ! input: G1,G3,G2,(Gp2)
1950          call fill_cent(md1,md3,lot,n1dfft,max3i,m3i,n3,zf(1,i1,1,j2,idat),zw(1,1,1))
1951 
1952          inzee=1
1953          do i=1,ic3
1954            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
1955 &                      btrig3,after3(i),now3(i),before3(i),1)
1956            inzee=3-inzee
1957          end do
1958 
1959          ! Local rotation.
1960          ! input:  G1,R3,G2,(Gp2)
1961          ! output: G1,G2,R3,(Gp2)
1962          call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2(:,:,:,:,idat))
1963        end do
1964      end if
1965    end do
1966 
1967    ! Interprocessor data transposition
1968    ! input:  G1,G2,R3,Rp3,(Gp2)
1969    ! output: G1,G2,R3,Gp2,(Rp3)
1970    if (nproc_fft > 1) then
1971       call timab(543,1,tsec)
1972       call xmpi_ialltoall(zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,&
1973 &                         zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat))
1974       call timab(543,2,tsec)
1975    end if
1976  end do ! idat
1977 
1978  ! The second step of MPI-FFT
1979  do idat=1,ndat
1980     ! Make sure communication is completed.
1981     if (nproc_fft>1) call xmpi_wait(requests(idat),ierr)
1982 
1983    do j3=1,nd3proc
1984      j3glob = j3 + me_fft*nd3proc
1985 
1986      if (me_fft*nd3proc+j3 <= n3) then
1987        Jp2stb=1; J2stb=1
1988        Jp2stf=1; J2stf=1
1989 
1990        ! transform along x axis
1991        lot=ncache/(4*n1)
1992 
1993        do j=1,m2ieff,lot
1994          ma=j
1995          mb=min(j+(lot-1),m2ieff)
1996          n1dfft=mb-ma+1
1997 
1998          ! Zero-pad input.
1999          ! input:  G1,G2,R3,G2,(Rp3)
2000          ! output: G2,G1,R3,G2,(Rp3)
2001          if (nproc_fft == 1) then
2002            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,&
2003 &           md2proc,nd3proc,nproc_fft,ioption,zmpi2(:,:,:,:,idat),zw(1,1,1), unused0, unused0, unused0)
2004          else
2005            call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,&
2006 &           md2proc,nd3proc,nproc_fft,ioption,zmpi1(:,:,:,:,idat),zw(1,1,1), unused0, unused0, unused0)
2007          end if
2008 
2009          ! Transform along x
2010          ! input:  G2,G1,R3,(Rp3)
2011          ! output: G2,R1,R3,(Rp3)
2012          inzee=1
2013          do i=1,ic1-1
2014            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
2015 &                     btrig1,after1(i),now1(i),before1(i),1)
2016            inzee=3-inzee
2017          end do
2018 
2019          i=ic1
2020          call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), &
2021 &                    btrig1,after1(i),now1(i),before1(i),1)
2022        end do
2023 
2024        ! Transform along y axis (take into account c2c or c2r case).
2025        ! Must loop over the full box.
2026        lot=ncache/(4*n2)
2027 
2028        if (icplexwf==1) then
2029          if(mod(lot,2).ne.0)lot=lot-1 ! needed to introduce jeff
2030        end if
2031 
2032        do j=1,n1eff,lot
2033          ma=j
2034          mb=min(j+(lot-1),n1eff)
2035          n1dfft=mb-ma+1
2036          jeff=j
2037          includelast=1
2038 
2039          if (icplexwf==1) then
2040            jeff=2*j-1
2041            includelast=1
2042            if (mb==n1eff .and. n1eff*2/=n1) includelast=0
2043          end if
2044 
2045          ! Zero-pad the input.
2046          !  input: G2,R1,R3,(Rp3)
2047          ! output: R1,G2,R3,(Rp3)
2048          if (icplexwf==2) then
2049            call switch_cent(n1dfft,max2i,m2i,n2,lot,n1,lzt,zt(1,1,jeff),zw(1,1,1))
2050          else
2051            call switchreal_cent(includelast,n1dfft,max2i,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1))
2052          end if
2053 
2054          ! input:  R1,G2,R3,(Rp3)
2055          ! output: R1,R2,R3,(Rp3)
2056          inzee=1
2057          do i=1,ic2
2058            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
2059 &                       btrig2,after2(i),now2(i),before2(i),1)
2060             inzee=3-inzee
2061          end do
2062          ! output: R1,R2,R3,(Rp3)
2063 
2064          ! Multiply with potential in real space
2065          jx=cplex*(jeff-1)+1
2066          call multpot(icplexwf,cplex,includelast,nd1,nd2,n2,lot,n1dfft,pot(jx,1,j3glob),zw(1,1,inzee))
2067 
2068          ! TRANSFORM BACK IN FOURIER SPACE
2069          ! transform along y axis
2070          ! input: R1,R2,R3,(Rp3)
2071          do i=1,ic2
2072            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
2073 &                       ftrig2,after2(i),now2(i),before2(i),-1)
2074            inzee=3-inzee
2075          end do
2076 
2077          !  input: R1,G2,R3,(Rp3)
2078          ! output: G2,R1,R3,(Rp3)
2079          if (icplexwf==2) then
2080            call unswitch_cent(n1dfft,max2o,m2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff))
2081          else
2082            call unswitchreal_cent(n1dfft,max2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff))
2083          end if
2084 
2085        end do ! j
2086 
2087        ! transform along x axis
2088        ! input:  R2,R1,R3,(Rp3)
2089        ! output: R2,G1,R3,(Rp3)
2090        lot=ncache/(4*n1)
2091 
2092        do j=1,m2oeff,lot
2093          ma=j
2094          mb=min(j+(lot-1),m2oeff)
2095          n1dfft=mb-ma+1
2096          i=1
2097          call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), &
2098 &                    ftrig1,after1(i),now1(i),before1(i),-1)
2099 
2100          inzee=1
2101          do i=2,ic1
2102            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
2103 &                       ftrig1,after1(i),now1(i),before1(i),-1)
2104            inzee=3-inzee
2105          end do
2106 
2107          ! input:  G2,G1,R3,Gp2,(Rp3)
2108          ! output: G1,G2,R3,Gp2,(Rp3)
2109          if (nproc_fft == 1) then
2110            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,&
2111 &           md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi2(:,:,:,:,idat))
2112          else
2113            call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,&
2114 &           md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi1(:,:,:,:,idat))
2115          end if
2116        end do ! j
2117      end if
2118    end do
2119 
2120    ! Interprocessor data transposition
2121    ! input:  G1,G2,R3,Gp2,(Rp3)
2122    ! output: G1,G2,R3,Rp3,(Gp2)
2123    if (nproc_fft > 1) then
2124      call timab(544,1,tsec)
2125      call xmpi_ialltoall(zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc, &
2126 &                        zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat))
2127      call timab(544,2,tsec)
2128    end if
2129  end do ! idat
2130 
2131  do idat=1,ndat
2132    if (nproc_fft>1) call xmpi_wait(requests(idat),ierr)
2133 
2134    ! transform along z axis
2135    ! input: G1,G2,R3,(Gp2)
2136    lot=ncache/(4*n3)
2137    do j2=1,md2proc
2138      if (me_fft*md2proc+j2 <= m2oeff) then
2139        do i1=1,m1o,lot
2140          ma=i1
2141          mb=min(i1+(lot-1),m1o)
2142          n1dfft=mb-ma+1
2143 
2144          ! input:  G1,G2,R3,(Gp2)
2145          ! output: G1,R3,G2,(Gp2)
2146          call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2(:,:,:,:,idat),zw(1,1,1))
2147 
2148          inzee=1
2149          do i=1,ic3
2150            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
2151 &           ftrig3,after3(i),now3(i),before3(i),-1)
2152            inzee=3-inzee
2153          end do
2154 
2155          call unfill_cent(md1,md3,lot,n1dfft,max3o,m3o,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))
2156          ! output: G1,G3,G2,(Gp2)
2157        end do
2158      end if
2159    end do
2160 
2161    ! Complete missing values with complex conjugate
2162    ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
2163    if (icplexwf==1) then
2164      do i3=1,m3o
2165        i3inv=m3o+2-i3
2166        if (i3==1) i3inv=1
2167        if (m2oeff>1)then
2168          do i2=2,m2oeff
2169            i2inv=m2o+2-i2
2170            zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)
2171            zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)
2172            do i1=2,m1o
2173              i1inv=m1o+2-i1
2174              zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)
2175              zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)
2176            end do
2177          end do
2178        end if
2179      end do
2180    end if
2181 
2182  end do ! idat
2183 
2184  ABI_FREE(btrig1)
2185  ABI_FREE(ftrig1)
2186  ABI_FREE(after1)
2187  ABI_FREE(now1)
2188  ABI_FREE(before1)
2189  ABI_FREE(btrig2)
2190  ABI_FREE(ftrig2)
2191  ABI_FREE(after2)
2192  ABI_FREE(now2)
2193  ABI_FREE(before2)
2194  ABI_FREE(btrig3)
2195  ABI_FREE(ftrig3)
2196  ABI_FREE(after3)
2197  ABI_FREE(now3)
2198  ABI_FREE(before3)
2199 
2200  ABI_FREE(zmpi2)
2201  ABI_FREE(zw)
2202  ABI_FREE(zt)
2203  if (nproc_fft > 1)  then
2204    ABI_FREE(zmpi1)
2205  end if
2206 
2207 end subroutine sg2002_applypot_many

m_sg2002/sg2002_back [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_back

FUNCTION

   CALCULATES THE DISCRETE FOURIER TRANSFORM  in parallel using MPI/OpenMP

   ZR(I1,I2,I3)= \sum_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2)

 Adopt standard convention that isign=1 for backward transform

 INPUTS:
    cplex=1 for real --> complex, 2 for complex --> complex
    ZF: input array in G-space (note the switch of i2 and i3)

         real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
         imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)

         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
 OUTPUTS:
    ZR: output array in R space.

         ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
         ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))

         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat

    nproc_fft: number of processors used as returned by MPI_COMM_SIZE
    me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
              The detailed table with allowed transform lengths can
              be found in subroutine CTRIG
    nd1,nd2,nd3: Dimension of ZF and ZR
    nd2proc=((nd2-1)/nproc_fft)+1 maximal number of 2nd dim slices
    nd3proc=((nd3-1)/nproc_fft)+1 maximal number of 3rd dim slices

 NOTES:
   The maximum number of processors that can reasonably be used is max(n2,n3)
   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
    half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

116 subroutine sg2002_back(cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,zf,zr,comm_fft)
117 
118  implicit none
119 
120 !Arguments ------------------------------------
121 ! real space input
122  integer,intent(in) :: cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,comm_fft
123  real(dp),intent(in) :: zf(2,nd1,nd3,nd2proc,ndat)
124  real(dp),intent(out) :: zr(2,nd1eff,nd2,nd3proc,ndat)
125 
126 !Local variables-------------------------------
127 !scalars
128  integer :: i,j,i1,ic1,ic2,ic3,idat,ierr,includelast,inzee,j2,j2st,j3,jeff,jp2st,lot,lzt
129  integer :: ma,mb,n1dfft,n1eff,n2eff,n1zt,ncache,nnd3,nproc_fft,me_fft
130  character(len=500) :: msg
131 !arrays
132  real(dp), allocatable :: zt(:,:,:)  ! work arrays for transpositions
133  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
134  real(dp), allocatable :: zw(:,:,:) ! cache work array
135  real(dp) :: tsec(2)
136 ! FFT work arrays
137  real(dp), allocatable, dimension(:,:) :: trig1,trig2,trig3
138  integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3
139 
140 ! *************************************************************************
141 
142  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
143 
144  ! find cache size that gives optimal performance on machine
145  ncache=4*max(n1,n2,n3,1024)
146 
147  if (ncache/(4*max(n1,n2,n3))<1) then
148    write(msg,'(5a)') &
149 &    'ncache has to be enlarged to be able to hold at',ch10, &
150 &    'least one 1-d FFT of each size even though this will',ch10,&
151 &    'reduce the performance for shorter transform lengths'
152    ABI_ERROR(msg)
153  end if
154 
155 ! check input
156  if (nd1<n1 .or. nd2<n2 .or. nd3<n3) then
157    ABI_ERROR("nd1<n1 .or. nd2<n2 .or. nd3<n3")
158  end if
159 
160  ! Effective n1 and n2 (complex-to-complex or real-to-complex)
161  n1eff=n1; n2eff=n2; n1zt=n1
162  if (cplex==1) then
163    n1eff=(n1+1)/2 ; n2eff=n2/2+1 ; n1zt=2*(n1/2+1)
164  end if
165 
166  lzt=n2eff
167  if (mod(n2eff,2) == 0) lzt=lzt+1
168  if (mod(n2eff,4) == 0) lzt=lzt+1
169 
170 ! maximal number of big box 3rd dim slices for all procs
171  nnd3=nd3proc*nproc_fft
172 
173  ABI_MALLOC(trig1,(2,n1))
174  ABI_MALLOC(after1,(mdata))
175  ABI_MALLOC(now1,(mdata))
176  ABI_MALLOC(before1,(mdata))
177  ABI_MALLOC(trig2,(2,n2))
178  ABI_MALLOC(after2,(mdata))
179  ABI_MALLOC(now2,(mdata))
180  ABI_MALLOC(before2,(mdata))
181  ABI_MALLOC(trig3,(2,n3))
182  ABI_MALLOC(after3,(mdata))
183  ABI_MALLOC(now3,(mdata))
184  ABI_MALLOC(before3,(mdata))
185  ABI_MALLOC(zw,(2,ncache/4,2))
186  ABI_MALLOC(zt,(2,lzt,n1zt))
187  ABI_MALLOC(zmpi2,(2,n1,nd2proc,nnd3))
188  if (nproc_fft>1)  then
189    ABI_MALLOC(zmpi1,(2,n1,nd2proc,nnd3))
190  end if
191 
192  call ctrig(n3,trig3,after3,before3,now3,1,ic3)
193  call ctrig(n1,trig1,after1,before1,now1,1,ic1)
194  call ctrig(n2,trig2,after2,before2,now2,1,ic2)
195 
196 !DEBUG
197 ! write(std_out,'(a,3i4)' )'sg2002_back,zf n1,n2,n3',n1,n2,n3
198 ! write(std_out,'(a,3i4)' )'nd1,nd2,nd3proc',nd1,nd2,nd3proc
199 ! write(std_out,'(a,3i4)' )'m1,m2,m3',m1,m2,m3
200 ! write(std_out,'(a,3i4)' )'max1,max2,max3',max1,max2,max3
201 ! write(std_out,'(a,3i4)' )'md1,md2proc,md3',md1,md2proc,md3
202 ! write(std_out,'(a,3i4)' )'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt
203 !ENDDEBUG
204 
205  do idat=1,ndat
206    ! transform along z axis
207    ! input: I1,I3,J2,(Jp2)
208    lot=ncache/(4*n3)
209 
210    do j2=1,nd2proc
211      if (me_fft*nd2proc+j2 <= n2eff) then
212 
213        do i1=1,n1,lot
214          ma=i1
215          mb=min(i1+(lot-1),n1)
216          n1dfft=mb-ma+1
217 
218          ! input: G1,G3,G2,(Gp2)
219          call fill(nd1,nd3,lot,n1dfft,n3,zf(1,i1,1,j2,idat),zw(1,1,1))
220 
221          inzee=1
222          do i=1,ic3
223            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
224 &                      trig3,after3(i),now3(i),before3(i),1)
225            inzee=3-inzee
226          end do
227 
228          ! input:  G1,R3,G2,(Gp2)
229          ! output: G1,G2,R3,(Gp2)
230          call scramble(i1,j2,lot,n1dfft,n1,n3,nd2proc,nd3,zw(1,1,inzee),zmpi2)
231        end do
232      end if
233    end do
234 
235    ! Interprocessor data transposition
236    ! input:  G1,G2,R3,Rp3,(Gp2)
237    ! output: G1,G2,G3,Gp2,(Rp3)
238    if (nproc_fft>1) then
239      call timab(543,1,tsec)
240      call xmpi_alltoall(zmpi2,2*n1*nd2proc*nd3proc, &
241 &                       zmpi1,2*n1*nd2proc*nd3proc,comm_fft,ierr)
242      call timab(543,2,tsec)
243    end if
244 
245    do j3=1,nd3proc
246      if (me_fft*nd3proc+j3 <= n3) then
247        Jp2st=1
248        J2st=1
249 
250        ! transform along x axis
251        lot=ncache/(4*n1)
252 
253        do j=1,n2eff,lot
254          ma=j
255          mb=min(j+(lot-1),n2eff)
256          n1dfft=mb-ma+1
257 
258          ! input:  G1,G2,R3,Gp2,(Rp3)
259          ! output: G2,G1,R3,Jp2,(Rp3)
260          if (nproc_fft == 1) then
261            call mpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zmpi2,zw(1,1,1))
262          else
263            call mpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zmpi1,zw(1,1,1))
264          end if
265 
266          ! input:  G2,G1,R3,(Rp3)
267          ! output: G2,R1,R3,(Rp3)
268          inzee=1
269          do i=1,ic1-1
270            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
271 &                       trig1,after1(i),now1(i),before1(i),1)
272            inzee=3-inzee
273          end do
274 
275          i=ic1
276          call fftstp(lot,n1dfft,n1,lzt,n1zt,zw(1,1,inzee),zt(1,j,1), &
277 &                    trig1,after1(i),now1(i),before1(i),1)
278        end do
279 
280        ! transform along y axis
281        lot=ncache/(4*n2)
282 
283        do j=1,n1eff,lot
284          ma=j
285          mb=min(j+(lot-1),n1eff)
286          n1dfft=mb-ma+1
287          includelast=1
288 
289          if (cplex==1) then
290           jeff=2*j-1
291           includelast=1
292           if (mb==n1eff .and. n1eff*2/=n1) includelast=0
293          end if
294 
295          ! input:  G2,R1,R3,(Rp3)
296          ! output: R1,G2,R3,(Rp3)
297          if (cplex==2) then
298            call switch(n1dfft,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1))
299          else
300            call switchreal(includelast,n1dfft,n2,n2eff,lot,n1zt,lzt,zt(1,1,jeff),zw(1,1,1))
301          end if
302 
303          inzee=1
304          do i=1,ic2-1
305            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
306 &                       trig2,after2(i),now2(i),before2(i),1)
307            inzee=3-inzee
308          end do
309 
310          i=ic2
311          call fftstp(lot,n1dfft,n2,nd1eff,nd2,zw(1,1,inzee),zr(1,j,1,j3,idat), &
312 &                    trig2,after2(i),now2(i),before2(i),1)
313 
314        end do
315        ! output: R1,R2,R3,(Rp3)
316 
317      end if
318    end do
319  end do ! idat
320 
321  ABI_FREE(trig1)
322  ABI_FREE(after1)
323  ABI_FREE(now1)
324  ABI_FREE(before1)
325  ABI_FREE(trig2)
326  ABI_FREE(after2)
327  ABI_FREE(now2)
328  ABI_FREE(before2)
329  ABI_FREE(trig3)
330  ABI_FREE(after3)
331  ABI_FREE(now3)
332  ABI_FREE(before3)
333  ABI_FREE(zmpi2)
334  ABI_FREE(zw)
335  ABI_FREE(zt)
336  if (nproc_fft>1)  then
337    ABI_FREE(zmpi1)
338  end if
339 
340 end subroutine sg2002_back

m_sg2002/sg2002_forw [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_forw

FUNCTION

   Adopt standard convention that isign=-1 for forward transform
   CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)=
   S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3)
   in parallel using MPI/OpenMP and BLAS library calls.

INPUTS

    ZR: input array
         ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
         ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
 OUTPUTS
    ZF: output array (note the switch of i2 and i3)
         real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
         imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
         i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
    nproc_fft: number of processors used as returned by MPI_COMM_SIZE
    me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK
     n1,n2,n3: logical dimension of the transform. As transform lengths
               most products of the prime factors 2,3,5 are allowed.
              The detailed table with allowed transform lengths can
              be found in subroutine CTRIG
     nd1,nd2,nd3: Dimension of ZR and ZF
    nd2proc=((nd2-1)/nproc_fft)+1 maximal number of 2nd dim slices
    nd3proc=((nd3-1)/nproc_fft)+1 maximal number of 3rd dim slices

NOTES

  SHOULD describe nd1eff
  SHOULD put cplex and nd1eff in OMP declarations
  SHOULD describe the change of value of nd2prod

  The maximum number of processors that can reasonably be used is max(n2,n3)

  It is very important to find the optimal
  value of NCACHE. NCACHE determines the size of the work array ZW, that
  has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
  The optimal value of ncache can easily be determined by numerical
  experimentation. A too large value of ncache leads to a dramatic
  and sudden decrease of performance, a too small value to a to a
  slow and less dramatic decrease of performance. If NCACHE is set
  to a value so small, that not even a single one dimensional transform
  can be done in the workarray zw, the program stops with an error message.

SOURCE

395 subroutine sg2002_forw(cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,zr,zf,comm_fft)
396 
397  implicit none
398 
399 !Arguments ------------------------------------
400 !scalars
401  integer,intent(in) :: cplex,comm_fft
402  integer,intent(in) :: ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option
403 !arrays
404  real(dp),intent(in) :: zr(2,nd1eff,nd2,nd3proc,ndat)
405  real(dp),intent(out) :: zf(2,nd1,nd3,nd2proc,ndat)
406 
407 !Local variables-------------------------------
408 !scalars
409  integer :: i,j,i1,ic1,ic2,ic3,idat,ierr,inzee,j2,j2st,j3,jp2st,lot,lzt
410  integer :: ma,mb,n1dfft,n1eff,n2eff,n1zt,ncache,nnd3,nproc_fft,me_fft
411  character(len=500) :: msg
412 !arrays
413  real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions
414  real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI
415  real(dp), allocatable :: zw(:,:,:) ! cache work array
416  real(dp) :: tsec(2)
417 ! FFT work arrays
418  real(dp), allocatable, dimension(:,:) :: trig1,trig2,trig3
419  integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3
420 
421 ! *************************************************************************
422 
423  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
424 
425  ! find cache size that gives optimal performance on machine
426  ncache=4*max(n1,n2,n3,1024)
427  if (ncache/(4*max(n1,n2,n3))<1) then
428    write(msg,'(5a)')&
429 &     'ncache has to be enlarged to be able to hold at',ch10, &
430 &     'least one 1-d FFT of each size even though this will',ch10,&
431 &     'reduce the performance for shorter transform lengths'
432    ABI_ERROR(msg)
433  end if
434 
435  ! check input
436  if (nd1<n1 .or. nd2<n2 .or. nd3<n3) then
437    ABI_ERROR("nd1<n1 .or. nd2<n2 .or. nd3<n3")
438  end if
439 
440 !Effective n1 and n2 (complex-to-complex or real-to-complex)
441  n1eff=n1; n2eff=n2; n1zt=n1
442  if (cplex==1) then
443    n1eff=(n1+1)/2; n2eff=n2/2+1; n1zt=2*(n1/2+1)
444  end if
445 
446  lzt=n2eff
447  if (mod(n2eff,2) == 0) lzt=lzt+1
448  if (mod(n2eff,4) == 0) lzt=lzt+1
449 
450  ! maximal number of big box 3rd dim slices for all procs
451  nnd3=nd3proc*nproc_fft
452 
453  ABI_MALLOC(trig1,(2,n1))
454  ABI_MALLOC(after1,(mdata))
455  ABI_MALLOC(now1,(mdata))
456  ABI_MALLOC(before1,(mdata))
457  ABI_MALLOC(trig2,(2,n2))
458  ABI_MALLOC(after2,(mdata))
459  ABI_MALLOC(now2,(mdata))
460  ABI_MALLOC(before2,(mdata))
461  ABI_MALLOC(trig3,(2,n3))
462  ABI_MALLOC(after3,(mdata))
463  ABI_MALLOC(now3,(mdata))
464  ABI_MALLOC(before3,(mdata))
465  ABI_MALLOC(zw,(2,ncache/4,2))
466  ABI_MALLOC(zt,(2,lzt,n1zt))
467  ABI_MALLOC(zmpi2,(2,n1,nd2proc,nnd3))
468  if (nproc_fft>1)  then
469    ABI_MALLOC(zmpi1,(2,n1,nd2proc,nnd3))
470  end if
471 
472  call ctrig(n2,trig2,after2,before2,now2,-1,ic2)
473  call ctrig(n1,trig1,after1,before1,now1,-1,ic1)
474  call ctrig(n3,trig3,after3,before3,now3,-1,ic3)
475 
476  do idat=1,ndat
477    do j3=1,nd3proc
478      if (me_fft*(nd3proc)+j3 <= n3) then
479        Jp2st=1; J2st=1
480 
481        ! transform along y axis
482        ! input: R1,R2,R3,(Rp3)
483        lot=ncache/(4*n2)
484 
485        do j=1,n1eff,lot
486          ma=j
487          mb=min(j+(lot-1),n1eff)
488          n1dfft=mb-ma+1
489          i=1
490          call fftstp(nd1eff,n1dfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), &
491 &                    trig2,after2(i),now2(i),before2(i),-1)
492 
493          inzee=1
494          do i=2,ic2
495            call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
496 &                       trig2,after2(i),now2(i),before2(i),-1)
497             inzee=3-inzee
498          end do
499 
500          !  input: R1,G2,R3,(Rp3)
501          ! output: G2,R1,R3,(Rp3)
502          if(cplex==2)then
503            call unswitch(n1dfft,n2,lot,n1zt,lzt,zw(1,1,inzee),zt(1,1,j))
504          else
505            call unswitchreal(n1dfft,n2,n2eff,lot,n1zt,lzt,zw(1,1,inzee),zt(1,1,2*j-1))
506          end if
507        end do
508 
509        ! transform along x axis
510        ! input: G2,R1,R3,(Rp3)
511        lot=ncache/(4*n1)
512 
513        do j=1,n2eff,lot
514          ma=j
515          mb=min(j+(lot-1),n2eff)
516          n1dfft=mb-ma+1
517 
518          i=1
519          call fftstp(lzt,n1dfft,n1zt,lot,n1,zt(1,j,1),zw(1,1,1), &
520 &                    trig1,after1(i),now1(i),before1(i),-1)
521 
522          inzee=1
523          do i=2,ic1
524            call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
525 &                     trig1,after1(i),now1(i),before1(i),-1)
526            inzee=3-inzee
527          end do
528          ! output: G2,G1,R3,(Rp3)
529 
530          ! input:  G2,G1,R3,Gp2,(Rp3)
531          ! output: G1,G2,R3,Gp2,(Rp3)
532          ! write(std_out,*) 'J2st,Jp2st',J2st,Jp2st
533          if (nproc_fft == 1) then
534            call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zw(1,1,inzee),zmpi2)
535          else
536            call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zw(1,1,inzee),zmpi1)
537          end if
538        end do
539 
540      end if
541    end do ! j3
542 
543    ! Interprocessor data transposition
544    ! input:  G1,G2,R3,Gp2,(Rp3)
545    ! output: G1,G2,R3,Rp3,(Gp2)
546    if (nproc_fft>1) then
547      call timab(544,1,tsec)
548      call xmpi_alltoall(zmpi1,2*n1*nd2proc*nd3proc, &
549 &                       zmpi2,2*n1*nd2proc*nd3proc,comm_fft,ierr)
550      call timab(544,2,tsec)
551    end if
552 
553    ! transform along z axis
554    ! input: G1,G2,R3,(Gp2)
555    lot=ncache/(4*n3)
556 
557    do j2=1,nd2proc
558      if (me_fft*(nd2proc)+j2 <= n2eff) then
559        do i1=1,n1,lot
560          ma=i1
561          mb=min(i1+(lot-1),n1)
562          n1dfft=mb-ma+1
563 
564          ! input:  G1,G2,R3,(Gp2)
565          ! output: G1,R3,G2,(Gp2)
566          call unscramble(i1,j2,lot,n1dfft,n1,n3,nd2proc,nd3,zmpi2,zw(1,1,1))
567 
568          inzee=1
569          do i=1,ic3
570            call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
571 &            trig3,after3(i),now3(i),before3(i),-1)
572            inzee=3-inzee
573          end do
574 
575          call unfill(nd1,nd3,lot,n1dfft,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))
576          ! output: G1,G3,G2,(Gp2)
577        end do
578      end if
579    end do
580 
581  end do ! idat
582 
583  ABI_FREE(trig1)
584  ABI_FREE(after1)
585  ABI_FREE(now1)
586  ABI_FREE(before1)
587  ABI_FREE(trig2)
588  ABI_FREE(after2)
589  ABI_FREE(now2)
590  ABI_FREE(before2)
591  ABI_FREE(trig3)
592  ABI_FREE(after3)
593  ABI_FREE(now3)
594  ABI_FREE(before3)
595  ABI_FREE(zmpi2)
596  ABI_FREE(zw)
597  ABI_FREE(zt)
598  if (nproc_fft>1)  then
599    ABI_FREE(zmpi1)
600  end if
601 
602 end subroutine sg2002_forw

m_sg2002/sg2002_mpiback_wf [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_mpiback_wf

FUNCTION

   Does multiple 3-dim backward FFTs from Fourier into real space
   Adopt standard convention that isign=1 for backward transform

   CALCULATES THE DISCRETE FOURIER TRANSFORM ZF(I1,I2,I3)=

   S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2)

   in parallel using MPI/OpenMP.

 INPUTS:
    icplexwf=1 if wavefunction is real, 2 if complex
    ndat=Number of wavefunctions to transform.
    n1,n2,n3: logical dimension of the transform. As transform lengths
              most products of the prime factors 2,3,5 are allowed.
              The detailed table with allowed transform lengths can be found in subroutine CTRIG
    nd1,nd2,nd3: Leading Dimension of ZR
    nd3proc=((nd3-1)/nproc_fft)+1 maximal number of big box 3rd dim slices for one proc
    max1 is positive or zero; m1 >=max1+1
      i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
      then, if m1 > max1+1, one has min1=max1-m1+1 and
      i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
    max2 and max3 have a similar definition of range
    m1,m2,m3=Size of the box enclosing the G-sphere.
    md1,md2,md3: Dimension of ZF given on the **small** FFT box.
    md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc
    nproc_fft: number of processors used as returned by MPI_COMM_SIZE
    comm_fft=MPI communicator for the FFT.
    ZF: input array (note the switch of i2 and i3)
          real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
          imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)

 OUTPUTS
    ZR: output array
          ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
          ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
        i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat

NOTES

   The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

   It is very important to find the optimal
   value of NCACHE. NCACHE determines the size of the work array ZW, that
   has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
   The optimal value of ncache can easily be determined by numerical
   experimentation. A too large value of ncache leads to a dramatic
   and sudden decrease of performance, a too small value to a to a
   slow and less dramatic decrease of performance. If NCACHE is set
   to a value so small, that not even a single one dimensional transform
   can be done in the workarray zw, the program stops with an error message.

SOURCE

665 subroutine sg2002_mpiback_wf(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&
666 &  max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zf,zr,comm_fft)
667 
668  implicit none
669 
670 !Arguments ------------------------------------
671  integer,intent(in) :: icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc
672  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft
673  real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat)
674  real(dp),intent(out) :: zr(2,nd1,nd2,nd3proc,ndat)
675 
676 !Local variables-------------------------------
677  integer :: i,j,i1,i2,ic1,ic2,ic3,idat,ierr,inzee,includelast
678  integer :: ioption,j2,j3,j2st,jp2st,jeff,lot,lzt,m1zt,ma,mb,n1dfft,nnd3
679  integer :: m2eff,ncache,n1eff,n1half,nproc_fft,me_fft
680  character(len=500) :: msg
681 !arrays
682  real(dp),allocatable :: zt(:,:,:) ! work arrays for transpositions
683  real(dp),allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:)  ! work arrays for MPI
684  real(dp),allocatable :: zw(:,:,:) ! cache work array
685 ! FFT work arrays
686  real(dp),allocatable :: trig1(:,:),trig2(:,:),trig3(:,:)
687  integer,allocatable :: after1(:),now1(:),before1(:),after2(:)
688  integer,allocatable :: now2(:),before2(:),after3(:),now3(:),before3(:)
689  real(dp) :: tsec(2)
690 
691 ! *************************************************************************
692 
693  ! call timab(541,1,tsec)
694  ! FIXME must provide a default value but which one?
695  ! ioption = 0
696  ioption = 1
697  !if (paral_kgb==1) ioption=1
698 
699  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
700 
701  ! Find cache size that gives optimal performance on machine
702  ncache=4*max(n1,n2,n3,1024)
703  if (ncache/(4*max(n1,n2,n3))<1) then
704    write(msg,"(5a)") &
705 &    'ncache has to be enlarged to be able to hold at',ch10, &
706 &    'least one 1-d FFT of each size even though this will',ch10,&
707 &    'reduce the performance for shorter transform lengths'
708     ABI_ERROR(msg)
709  end if
710 
711  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
712  n1eff=n1; m2eff=m2; m1zt=n1
713  if (icplexwf==1) then
714    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
715  end if
716 
717  lzt=m2eff
718  if (mod(m2eff,2)==0) lzt=lzt+1
719  if (mod(m2eff,4)==0) lzt=lzt+1
720 
721  ! maximal number of big box 3rd dim slices for all procs
722  nnd3=nd3proc*nproc_fft
723 
724  ABI_MALLOC(trig1,(2,n1))
725  ABI_MALLOC(after1,(mdata))
726  ABI_MALLOC(now1,(mdata))
727  ABI_MALLOC(before1,(mdata))
728  ABI_MALLOC(trig2,(2,n2))
729  ABI_MALLOC(after2,(mdata))
730  ABI_MALLOC(now2,(mdata))
731  ABI_MALLOC(before2,(mdata))
732  ABI_MALLOC(trig3,(2,n3))
733  ABI_MALLOC(after3,(mdata))
734  ABI_MALLOC(now3,(mdata))
735  ABI_MALLOC(before3,(mdata))
736 
737  ! Allocate cache work array and work arrays for MPI transpositions.
738  ABI_MALLOC(zw,(2,ncache/4,2))
739  ABI_MALLOC(zt,(2,lzt,m1zt))
740  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat))
741  if (nproc_fft>1)  then
742    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat))
743  end if
744 
745  ! Compute twiddle coefficients.
746  call ctrig(n3,trig3,after3,before3,now3,1,ic3)
747  call ctrig(n1,trig1,after1,before1,now1,1,ic1)
748  call ctrig(n2,trig2,after2,before2,now2,1,ic2)
749 
750 !DEBUG
751 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': sg2002_mpiback_wf,zf n1,n2,n3',n1,n2,n3
752 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': nd1,nd2,nd3proc',nd1,nd2,nd3proc
753 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': m1,m2,m3',m1,m2,m3
754 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': max1,max2,max3',max1,max2,max3
755 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': md1,md2proc,md3',md1,md2proc,md3
756 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt
757 !ENDDEBUG
758 
759  do idat=1,ndat
760 
761     ! transform along z axis
762     ! input: G1,G3,G2,(Gp2)
763     lot=ncache/(4*n3)
764 
765     zw(:,:,:)=zero
766     zt(:,:,:)=zero
767 
768     ! Loop over the y planes treated by this node and trasform n1ddft G_z lines.
769     do j2=1,md2proc
770 
771       ! if (me_fft*md2proc+j2<=m2eff) then !a faire plus tard
772 
773       do i1=1,m1,lot
774         ma=i1
775         mb=min(i1+(lot-1),m1)
776         n1dfft=mb-ma+1
777 
778         ! zero-pad n1dfft G_z lines
779         ! input:  G1,G3,G2,(Gp2)
780         ! output: G1,R3,G2,(Gp2)
781         call fill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw(1,1,1))
782 
783         ! Transform along z.
784         inzee=1
785         do i=1,ic3
786           call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
787 &                     trig3,after3(i),now3(i),before3(i),1)
788           inzee=3-inzee
789         end do
790 
791         ! Local rotation.
792         ! input:  G1,R3,G2,(Gp2)
793         ! output: G1,G2,R3,(Gp2)
794         call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2(:,:,:,:,idat))
795       end do
796       !
797     end do ! j2
798 
799     ! Interprocessor data transposition
800     ! input:  G1,G2,R3,Rp3,(Gp2)
801     ! output: G1,G2,R3,Gp2,(Rp3)
802     if (nproc_fft>1) then
803       call timab(543,1,tsec)
804       call xmpi_alltoall(zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc, &
805 &                        zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,ierr)
806       call timab(543,2,tsec)
807     end if
808 
809     ! Loop over the z treated by this node.
810     do j3=1,nd3proc
811       !j3glob = j3 + me_fft*nd3proc
812       if (me_fft*nd3proc+j3 <= n3) then
813         Jp2st=1; J2st=1
814 
815         lot=ncache/(4*n1)
816 
817         ! Loop over G_y in the small box.
818         do j=1,m2eff,lot
819           ma=j
820           mb=min(j+(lot-1),m2eff)
821           n1dfft=mb-ma+1
822 
823           ! Zero-pad input.
824           ! input:  G1,G2,R3,JG2,(Rp3)
825           ! output: G2,G1,R3,JG2,(Rp3)
826           if (nproc_fft==1) then
827             call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
828 &             md2proc,nd3proc,nproc_fft,ioption,zmpi2(:,:,:,:,idat),zw(1,1,1),max2,m2,n2)
829           else
830             call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
831 &             md2proc,nd3proc,nproc_fft,ioption,zmpi1(:,:,:,:,idat),zw(1,1,1),max2,m2,n2)
832           end if
833 
834           ! Transform along x
835           ! input:  G2,G1,R3,(Rp3)
836           ! output: G2,R1,R3,(Rp3)
837           inzee=1
838           do i=1,ic1-1
839             call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
840 &                       trig1,after1(i),now1(i),before1(i),1)
841             inzee=3-inzee
842           end do
843 
844           i=ic1
845           call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), &
846 &                     trig1,after1(i),now1(i),before1(i),1)
847         end do
848 
849         ! Transform along y axis (take into account c2c or c2r case).
850         ! Must loop over the full box.
851         lot=ncache/(4*n2)
852 
853         do j=1,n1eff,lot
854           ma=j
855           mb=min(j+(lot-1),n1eff)
856           n1dfft=mb-ma+1
857           includelast=1
858           if (icplexwf==1) then
859             jeff=2*j-1
860             if (mb==n1eff .and. n1eff*2/=n1) includelast=0
861           end if
862 
863           ! Zero-pad the input.
864           ! input:  G2,R1,R3,(Rp3)
865           ! output: R1,G2,R3,(Rp3)
866           if (icplexwf==2) then
867             call switch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1))
868           else
869             call switchreal_cent(includelast,n1dfft,max2,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1))
870           end if
871 
872           ! input:  R1,G2,R3,(Rp3)
873           ! output: R1,R2,R3,(Rp3)
874           inzee=1
875           do i=1,ic2-1
876             call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
877 &                        trig2,after2(i),now2(i),before2(i),1)
878             inzee=3-inzee
879           end do
880 
881           i=ic2
882 
883         call fftstp(lot,n1dfft,n2,nd1,nd2,zw(1,1,inzee),zr(1,j,1,j3,idat), &
884 &                     trig2,after2(i),now2(i),before2(i),1)
885 
886 
887         end do
888 
889         ! Treat real wavefunctions.
890         if (icplexwf==1) then
891           n1half=n1/2
892           ! If odd
893           if (n1half*2/=n1) then
894             do i2=1,n2
895               zr(1,n1,i2,j3,idat)=zr(1,n1eff,i2,j3,idat)
896               zr(2,n1,i2,j3,idat)=zero
897             end do
898           end if
899           do i2=1,n2
900             do i1=n1half,1,-1
901               zr(1,2*i1-1,i2,j3,idat)=zr(1,i1,i2,j3,idat)
902               zr(1,2*i1  ,i2,j3,idat)=zr(2,i1,i2,j3,idat)
903               zr(2,2*i1-1,i2,j3,idat)=zero
904               zr(2,2*i1  ,i2,j3,idat)=zero
905             end do
906           end do
907         end if
908 
909       end if
910 
911    end do ! j3
912  end do ! idat
913 
914  ABI_FREE(trig1)
915  ABI_FREE(after1)
916  ABI_FREE(now1)
917  ABI_FREE(before1)
918  ABI_FREE(trig2)
919  ABI_FREE(after2)
920  ABI_FREE(now2)
921  ABI_FREE(before2)
922  ABI_FREE(trig3)
923  ABI_FREE(after3)
924  ABI_FREE(now3)
925  ABI_FREE(before3)
926  ABI_FREE(zmpi2)
927  ABI_FREE(zw)
928  ABI_FREE(zt)
929  if (nproc_fft>1)  then
930    ABI_FREE(zmpi1)
931  end if
932 
933  !call timab(541,2,tsec)
934 
935 end subroutine sg2002_mpiback_wf

m_sg2002/sg2002_mpiforw_wf [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

  sg2002_mpiforw_wf

FUNCTION

   Does multiple 3-dim backward FFTs from real into Fourier space
   Adopt standard convention that isign=-1 for forward transform
   CALCULATES THE DISCRETE FOURIERTRANSFORM

   ZF(I1,I3,I2)=S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3)

   in parallel using MPI/OpenMP.

 INPUT:
   ZR: input array
        ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat))
        ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat))
        i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
   NOTE that ZR is changed by the routine

   n1,n2,n3: logical dimension of the transform. As transform lengths
             most products of the prime factors 2,3,5 are allowed.
             The detailed table with allowed transform lengths can
             be found in subroutine CTRIG
   nd1,nd2,nd3: Dimension of ZR
   nd3proc=((nd3-1)/nproc_fft)+1  maximal number of big box 3rd dim slices for one proc

 OUTPUT:
   ZF: output array (note the switch of i2 and i3)
        real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat)
        imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat)
   max1 is positive or zero ; m1 >=max1+1
     i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1
     then, if m1 > max1+1, one has min1=max1-m1+1 and
     i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1
     i2 and i3 have a similar definition of range
   idat=1,ndat
   md1,md2,md3: Dimension of ZF
   md2proc=((md2-1)/nproc_fft)+1  maximal number of small box 2nd dim slices for one proc
   nproc_fft: number of processors used as returned by MPI_COMM_SIZE
   me_fft: [0:nproc-1] rank of the processor in the FFT communicator.
   comm_fft=MPI communicator for parallel FFT.

NOTES

  The maximum number of processors that can reasonably be used is max(n2/2,n3/2)

  It is very important to find the optimal
  value of NCACHE. NCACHE determines the size of the work array ZW, that
  has to fit into cache. It has therefore to be chosen to equal roughly
   half the size of the physical cache in units of real*8 numbers.
  The optimal value of ncache can easily be determined by numerical
  experimentation. A too large value of ncache leads to a dramatic
  and sudden decrease of performance, a too small value to a to a
  slow and less dramatic decrease of performance. If NCACHE is set
  to a value so small, that not even a single one dimensional transform
  can be done in the workarray zw, the program stops with an error message.

SOURCE

 999 subroutine sg2002_mpiforw_wf(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,&
1000 &        max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zr,zf,comm_fft)
1001 
1002  implicit none
1003 
1004 !Arguments ------------------------------------
1005 !scalars
1006  integer,intent(in) :: icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc
1007  integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft
1008 !arrays
1009  real(dp),intent(inout) :: zr(2,nd1,nd2,nd3proc,ndat)
1010  real(dp),intent(out) :: zf(2,md1,md3,md2proc,ndat)
1011 
1012 !Local variables-------------------------------
1013 !scalars
1014  integer :: i,j,i1,i2,i3,ic1,ic2,ic3,idat,ierr,inzee,nproc_fft,me_fft
1015  integer :: ioption,j2,j3,j2st,jp2st,lot,lzt,m1zt,ma,mb,n1dfft,nnd3
1016  integer :: m2eff,ncache,n1eff,n1half,i1inv,i2inv,i3inv
1017  character(len=500) :: msg
1018 !arrays
1019  real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions
1020  real(dp), allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI
1021  real(dp), allocatable :: zw(:,:,:) ! cache work array
1022 ! FFT work arrays
1023  real(dp), allocatable :: trig1(:,:),trig2(:,:),trig3(:,:)
1024  integer, allocatable :: after1(:),now1(:),before1(:),after2(:),now2(:),before2(:),after3(:),now3(:),before3(:)
1025  real(dp) :: tsec(2)
1026 
1027 ! *************************************************************************
1028 
1029  ! call timab(542,1,tsec)
1030 
1031  ! FIXME must provide a default value but which one?
1032  !ioption = 0
1033  ioption = 1
1034  !if (paral_kgb==1) ioption=1
1035 
1036  nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft)
1037 
1038  ! find cache size that gives optimal performance on machine
1039  ncache=4*max(n1,n2,n3,1024)
1040  if (ncache/(4*max(n1,n2,n3))<1) then
1041    write(msg,'(5a)') &
1042 &    'ncache has to be enlarged to be able to hold at',ch10, &
1043 &    'least one 1-d FFT of each size even though this will',ch10,&
1044 &    'reduce the performance for shorter transform lengths'
1045    ABI_ERROR(msg)
1046  end if
1047 
1048  ! Effective m1 and m2 (complex-to-complex or real-to-complex)
1049  n1eff=n1; m2eff=m2; m1zt=n1
1050  if (icplexwf==1) then
1051    n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1)
1052  end if
1053 
1054  lzt=m2eff
1055  if (mod(m2eff,2)==0) lzt=lzt+1
1056  if (mod(m2eff,4)==0) lzt=lzt+1
1057 
1058  ! maximal number of big box 3rd dim slices for all procs
1059  nnd3=nd3proc*nproc_fft
1060 
1061  ABI_MALLOC(trig1,(2,n1))
1062  ABI_MALLOC(after1,(mdata))
1063  ABI_MALLOC(now1,(mdata))
1064  ABI_MALLOC(before1,(mdata))
1065  ABI_MALLOC(trig2,(2,n2))
1066  ABI_MALLOC(after2,(mdata))
1067  ABI_MALLOC(now2,(mdata))
1068  ABI_MALLOC(before2,(mdata))
1069  ABI_MALLOC(trig3,(2,n3))
1070  ABI_MALLOC(after3,(mdata))
1071  ABI_MALLOC(now3,(mdata))
1072  ABI_MALLOC(before3,(mdata))
1073  ABI_MALLOC(zw,(2,ncache/4,2))
1074  ABI_MALLOC(zt,(2,lzt,m1zt))
1075  ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat))
1076  if (nproc_fft>1)  then
1077    ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat))
1078  end if
1079 
1080  call ctrig(n2,trig2,after2,before2,now2,-1,ic2)
1081  call ctrig(n1,trig1,after1,before1,now1,-1,ic1)
1082  call ctrig(n3,trig3,after3,before3,now3,-1,ic3)
1083 
1084 !DEBUG
1085 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'sg2002_mpiforw_wf, enter', i1,i2,i3,zr,n1,n2,n3',n1,n2,n3
1086 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'nd1,nd2,nd3proc',nd1,nd2,nd3proc
1087 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'m1,m2,m3',m1,m2,m3
1088 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'max1,max2,max3',max1,max2,max3
1089 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'md1,md2proc,md3',md1,md2proc,md3
1090 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt
1091 !ENDDEBUG
1092 
1093   do idat=1,ndat
1094     ! Loop over the z-planes treated by this node
1095     do j3=1,nd3proc
1096 
1097        if (me_fft*nd3proc+j3 <= n3) then
1098          Jp2st=1
1099          J2st=1
1100 
1101          ! Treat real wavefunctions.
1102          if (icplexwf==1) then
1103            n1half=n1/2
1104            do i2=1,n2
1105              do i1=1,n1half
1106                zr(1,i1,i2,j3,idat)=zr(1,2*i1-1,i2,j3,idat)
1107                zr(2,i1,i2,j3,idat)=zr(1,2*i1  ,i2,j3,idat)
1108              end do
1109            end do
1110            ! If odd
1111            if(n1half*2/=n1)then
1112              do i2=1,n2
1113                zr(1,n1eff,i2,j3,idat)=zr(1,n1,i2,j3,idat)
1114                zr(2,n1eff,i2,j3,idat)=zero
1115              end do
1116            end if
1117          end if
1118 
1119          ! transform along y axis
1120          ! input: R1,R2,R3,(Rp3)
1121          ! input: R1,G2,R3,(Rp3)
1122          lot=ncache/(4*n2)
1123 
1124          do j=1,n1eff,lot
1125            ma=j
1126            mb=min(j+(lot-1),n1eff)
1127            n1dfft=mb-ma+1
1128            i=1
1129            call fftstp(nd1,n1dfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), &
1130 &                      trig2,after2(i),now2(i),before2(i),-1)
1131 
1132            inzee=1
1133            do i=2,ic2
1134              call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), &
1135 &                         trig2,after2(i),now2(i),before2(i),-1)
1136               inzee=3-inzee
1137            end do
1138 
1139            ! input:  R1,G2,R3,(Rp3)
1140            ! output: G2,R1,R3,(Rp3)
1141            if(icplexwf==2)then
1142              call unswitch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,j))
1143            else
1144              call unswitchreal_cent(n1dfft,max2,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,2*j-1))
1145            end if
1146          end do
1147 
1148          ! transform along x axis
1149          ! input: G2,R1,R3,(Rp3)
1150          lot=ncache/(4*n1)
1151 
1152          do j=1,m2eff,lot
1153            ma=j
1154            mb=min(j+(lot-1),m2eff)
1155            n1dfft=mb-ma+1
1156            i=1
1157            call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), &
1158 &                      trig1,after1(i),now1(i),before1(i),-1)
1159 
1160            inzee=1
1161            do i=2,ic1
1162              call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), &
1163 &                        trig1,after1(i),now1(i),before1(i),-1)
1164              inzee=3-inzee
1165            end do
1166            ! output: G2,G1,R3,(Rp3)
1167 
1168            ! input:  G2,G1,R3,Gp2,(Rp3)
1169            ! output: G1,G2,R3,Gp2,(Rp3)
1170            if (nproc_fft==1) then
1171              call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
1172 &              md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi2(:,:,:,:,idat))
1173            else
1174              call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,&
1175 &              md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi1(:,:,:,:,idat))
1176            end if
1177          end do
1178 
1179         end if
1180      end do ! j3
1181 
1182      ! Interprocessor data transposition
1183      ! input:  G1,G2,R3,Gp2,(Rp3)
1184      ! output: G1,G2,R3,Rp3,(Gp2)
1185      if (nproc_fft>1) then
1186         call timab(544,1,tsec)
1187         call xmpi_alltoall(zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc, &
1188 &                          zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,ierr)
1189 
1190         call timab(544,2,tsec)
1191      end if
1192 
1193      ! transform along z axis
1194      ! input: G1,G2,R3,(Gp2)
1195      lot=ncache/(4*n3)
1196 
1197      do j2=1,md2proc
1198        if (me_fft*md2proc+j2 <= m2eff) then
1199          ! write(std_out,*)' forwf_wf : before unscramble, j2,md2proc,me_fft,m2=',j2,md2proc,me_fft,m2
1200          do i1=1,m1,lot
1201            ma=i1
1202            mb=min(i1+(lot-1),m1)
1203            n1dfft=mb-ma+1
1204 
1205            ! input:  G1,G2,R3,(Gp2)
1206            ! output: G1,R3,G2,(Gp2)
1207            call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2(:,:,:,:,idat),zw(1,1,1))
1208 
1209            inzee=1
1210            do i=1,ic3
1211              call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), &
1212 &              trig3,after3(i),now3(i),before3(i),-1)
1213              inzee=3-inzee
1214            end do
1215 
1216            call unfill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat))
1217            ! output: G1,G3,G2,(Gp2)
1218          end do
1219        end if
1220      end do
1221 
1222      if (icplexwf==1) then
1223        ! Complete missing values with complex conjugate
1224        ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1.
1225        do i3=1,m3
1226          i3inv=m3+2-i3
1227          if(i3==1)i3inv=1
1228 
1229          if (m2eff>1) then
1230            do i2=2,m2eff
1231              i2inv=m2+2-i2
1232              zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat)
1233              zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat)
1234              do i1=2,m1
1235                i1inv=m1+2-i1
1236                zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat)
1237                zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat)
1238              end do
1239            end do
1240          end if
1241        end do
1242      end if
1243 
1244  end do ! idat
1245 
1246  ABI_FREE(trig1)
1247  ABI_FREE(after1)
1248  ABI_FREE(now1)
1249  ABI_FREE(before1)
1250  ABI_FREE(trig2)
1251  ABI_FREE(after2)
1252  ABI_FREE(now2)
1253  ABI_FREE(before2)
1254  ABI_FREE(trig3)
1255  ABI_FREE(after3)
1256  ABI_FREE(now3)
1257  ABI_FREE(before3)
1258  ABI_FREE(zmpi2)
1259  ABI_FREE(zw)
1260  ABI_FREE(zt)
1261  if (nproc_fft>1)  then
1262    ABI_FREE(zmpi1)
1263  end if
1264 
1265  !call timab(542,2,tsec)
1266 
1267 end subroutine sg2002_mpiforw_wf

m_sg2002/sg2002_mpifourdp [ Functions ]

[ Top ] [ m_sg2002 ] [ Functions ]

NAME

 sg2002_mpifourdp

FUNCTION

 Conduct Fourier transform of REAL or COMPLEX function f(r)=fofr defined on
 fft grid in real space, to create complex f(G)=fofg defined on full fft grid
 in reciprocal space, in full storage mode, or the reverse operation.
 For the reverse operation, the final data is divided by nfftot.
 REAL case when cplex=1, COMPLEX case when cplex=2
 Usually used for density and potentials.

INPUTS

 cplex=1 if fofr is real, 2 if fofr is complex
 nfft=(effective) number of FFT grid points (for this processor)
 ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft
 ndat=Numbre of FFT transforms
 isign=sign of Fourier transform exponent: current convention uses
    +1 for transforming from G to r
    -1 for transforming from r to G.
 fftn2_distrib(2),ffti2_local(2)
 fftn3_distrib(3),ffti3_local(3)
 comm_fft=MPI communicator

SIDE EFFECTS

 Input/Output
 fofg(2,nfft)=f(G), complex.
 fofr(cplex*nfft)=input function f(r) (real or complex)

TODO

  Write simplified API for sequential version.

SOURCE

1306 subroutine sg2002_mpifourdp(cplex,nfft,ngfft,ndat,isign,&
1307 &  fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft)
1308 
1309  implicit none
1310 
1311 !Arguments ------------------------------------
1312 !scalars
1313  integer,intent(in) :: cplex,isign,nfft,ndat,comm_fft
1314 !arrays
1315  integer,intent(in) :: ngfft(18)
1316  integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2))
1317  integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3))
1318  real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat)
1319 
1320 !Local variables-------------------------------
1321 !scalars
1322  integer :: n1,n2,n3,n4,n5,n6,nd2proc,nd3proc,nproc_fft,me_fft
1323 !arrays
1324  real(dp),allocatable :: workf(:,:,:,:,:),workr(:,:,:,:,:)
1325 
1326 ! *************************************************************************
1327 
1328  ! Note the only c2c is supported in parallel.
1329  n1=ngfft(1); n2=ngfft(2); n3=ngfft(3)
1330  n4=ngfft(4); n5=ngfft(5); n6=ngfft(6)
1331  me_fft=ngfft(11); nproc_fft=ngfft(10)
1332 
1333  nd2proc=((n2-1)/nproc_fft) +1
1334  nd3proc=((n6-1)/nproc_fft) +1
1335  ABI_MALLOC(workr,(2,n4,n5,nd3proc,ndat))
1336  ABI_MALLOC(workf,(2,n4,n6,nd2proc,ndat))
1337 
1338  ! Complex to Complex
1339  select case (isign)
1340  case (1)
1341    ! G --> R
1342    call mpifft_fg2dbox(nfft,ndat,fofg,n1,n2,n3,n4,nd2proc,n6,fftn2_distrib,ffti2_local,me_fft,workf)
1343 
1344    call sg2002_back(2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,2,workf,workr,comm_fft)
1345 
1346    call mpifft_dbox2fr(n1,n2,n3,n4,n5,nd3proc,ndat,fftn3_distrib,ffti3_local,me_fft,workr,cplex,nfft,fofr)
1347 
1348  case (-1)
1349    ! R --> G
1350    call mpifft_fr2dbox(cplex,nfft,ndat,fofr,n1,n2,n3,n4,n5,nd3proc,fftn3_distrib,ffti3_local,me_fft,workr)
1351 
1352    call sg2002_forw(2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,2,workr,workf,comm_fft)
1353 
1354    ! Transfer FFT output to the original fft box.
1355    call mpifft_dbox2fg(n1,n2,n3,n4,nd2proc,n6,ndat,fftn2_distrib,ffti2_local,me_fft,workf,nfft,fofg)
1356 
1357  case default
1358    ABI_BUG("Wrong isign")
1359  end select
1360 
1361  ABI_FREE(workr)
1362  ABI_FREE(workf)
1363 
1364 end subroutine sg2002_mpifourdp

TABLE OF CONTENTS

ABINIT/m_sg2002 [ Modules ]

m_sg2002/ctrig [ Functions ]

m_sg2002/fftstp [ Functions ]

m_sg2002/sg2002_accrho [ Functions ]

m_sg2002/sg2002_applypot [ Functions ]

m_sg2002/sg2002_applypot_many [ Functions ]

m_sg2002/sg2002_back [ Functions ]

m_sg2002/sg2002_forw [ Functions ]

m_sg2002/sg2002_mpiback_wf [ Functions ]

m_sg2002/sg2002_mpiforw_wf [ Functions ]

m_sg2002/sg2002_mpifourdp [ Functions ]