TABLE OF CONTENTS
- ABINIT/m_sg2002
- m_sg2002/ctrig
- m_sg2002/fftstp
- m_sg2002/sg2002_accrho
- m_sg2002/sg2002_applypot
- m_sg2002/sg2002_applypot_many
- m_sg2002/sg2002_back
- m_sg2002/sg2002_forw
- m_sg2002/sg2002_mpiback_wf
- m_sg2002/sg2002_mpiforw_wf
- m_sg2002/sg2002_mpifourdp
ABINIT/m_sg2002 [ Modules ]
NAME
m_sg2002
FUNCTION
COPYRIGHT
Copyright (C) 2002-2007 Stefan Goedecker, CEA Grenoble Copyright (C) 2014-2024 ABINIT group (XG) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
NOTES
SOURCE
18 #if defined HAVE_CONFIG_H 19 #include "config.h" 20 #endif 21 22 #include "abi_common.h" 23 24 module m_sg2002 25 26 use defs_basis 27 use defs_fftdata 28 use m_abicore 29 use m_errors 30 use m_xmpi 31 32 use m_time, only : timab 33 use m_fstrings, only : itoa 34 use m_fftcore, only : sphere_fft1, fill, scramble, switchreal, switch, mpiswitch,& 35 & unfill, unscramble, unswitchreal, unswitch, unmpiswitch,& 36 & fill_cent, switch_cent, switchreal_cent, mpiswitch_cent, multpot, addrho,& 37 & unfill_cent, unswitchreal_cent, unswitch_cent, unmpiswitch_cent, unscramble,& 38 & mpifft_fg2dbox, mpifft_dbox2fr, mpifft_fr2dbox, mpifft_dbox2fg 39 40 implicit none 41 42 private 43 44 ! Public API: 45 !public :: sg2002_seqfourdp ! seq-FFT of densities and potentials. 46 public :: sg2002_mpifourdp ! MPI-FFT of densities and potentials. 47 !public :: mpi_fourwf 48 !public :: sg2002_seqfourwf ! seq-FFT of wavefunctions. 49 !public :: sg2002_mpifourwf ! MPI-FFT of wavefunctions. 50 51 ! Low-level tools. 52 ! These procedure shouls be accessed via a wrapper that selected the library via fftalg 53 public :: sg2002_back ! G --> R for densities and potentials 54 public :: sg2002_forw ! R --> G for densities and potentials 55 public :: sg2002_mpiback_wf ! G --> R for wavefunctions 56 public :: sg2002_mpiforw_wf ! R --> G for wavefunctions 57 public :: sg2002_applypot ! Compute <G|vloc|u> where u is given in reciprocal space. 58 public :: sg2002_applypot_many ! Compute <G|vloc|u> where u is given in reciprocal space. 59 public :: sg2002_accrho ! Compute rho = weigth_r*Re(u(r))**2 + weigth_i*Im(u(r))**2 60 61 contains
m_sg2002/ctrig [ Functions ]
[ Top ] [ m_sg2002 ] [ Functions ]
NAME
ctrig
FUNCTION
INPUTS
OUTPUT
SOURCE
2513 subroutine ctrig(n,trig,after,before,now,isign,ic) 2514 2515 implicit none 2516 2517 !Arguments ------------------------------------ 2518 integer,intent(in) :: n,isign 2519 integer,intent(inout) :: ic 2520 integer,intent(inout) :: after(mdata),before(mdata),now(mdata) 2521 real(dp),intent(inout) :: trig(2,n) 2522 2523 !Local variables------------------------------- 2524 !scalars 2525 integer :: i,itt,j,nh 2526 real(dp) :: angle,trigc,trigs 2527 2528 ! ************************************************************************* 2529 2530 do i=1,ndata 2531 if (n.eq.ifftdata(1,i)) then 2532 ic=0 2533 do j=1,(mdata-1) 2534 itt=ifftdata(1+j,i) 2535 if (itt.gt.1) then 2536 ic=ic+1 2537 now(j)=ifftdata(1+j,i) 2538 else 2539 goto 1000 2540 end if 2541 end do 2542 goto 1000 2543 end if 2544 end do 2545 2546 write(std_out,*) 'VALUE OF',n,'NOT ALLOWED FOR FFT, ALLOWED VALUES ARE:' 2547 37 format(15(i5)) 2548 write(std_out,37) (ifftdata(1,i),i=1,ndata) 2549 ABI_ERROR("Aborting now") 2550 2551 1000 continue 2552 after(1)=1 2553 before(ic)=1 2554 do i=2,ic 2555 after(i)=after(i-1)*now(i-1) 2556 before(ic-i+1)=before(ic-i+2)*now(ic-i+2) 2557 end do 2558 2559 angle=isign*two_pi/n 2560 if (mod(n,2).eq.0) then 2561 nh=n/2 2562 trig(1,1)=one 2563 trig(2,1)=zero 2564 trig(1,nh+1)=-one 2565 trig(2,nh+1)=zero 2566 do i=1,nh-1 2567 trigc=cos(i*angle) 2568 trigs=sin(i*angle) 2569 trig(1,i+1)=trigc 2570 trig(2,i+1)=trigs 2571 trig(1,n-i+1)=trigc 2572 trig(2,n-i+1)=-trigs 2573 end do 2574 else 2575 nh=(n-1)/2 2576 trig(1,1)=one 2577 trig(2,1)=zero 2578 do i=1,nh 2579 trigc=cos(i*angle) 2580 trigs=sin(i*angle) 2581 trig(1,i+1)=trigc 2582 trig(2,i+1)=trigs 2583 trig(1,n-i+1)=trigc 2584 trig(2,n-i+1)=-trigs 2585 end do 2586 end if 2587 2588 end subroutine ctrig
m_sg2002/fftstp [ Functions ]
[ Top ] [ m_sg2002 ] [ Functions ]
NAME
fftstp
FUNCTION
INPUTS
mm n1dfft m nn n zin trig after now before isign
OUTPUT
zout
SOURCE
2615 subroutine fftstp(mm,n1dfft,m,nn,n,zin,zout,trig,after,now,before,isign) 2616 2617 implicit none 2618 2619 !Arguments ------------------------------------ 2620 integer,intent(in) :: after,before,mm,n1dfft,m,nn,n,now,isign 2621 real(dp),intent(in) :: trig(2,n),zin(2,mm,m) 2622 real(dp),intent(inout) :: zout(2,nn,n) 2623 2624 !Local variables------------------------------- 2625 integer :: atn,atb,ia,ias,ib,itrig,itt,j,nin1,nin2,nin3,nin4,nin5,nin6,nin7,nin8 2626 integer :: nout1,nout2,nout3,nout4,nout5,nout6,nout7,nout8 2627 real(dp) :: am,ap,bm,bp,ci3,ci4,ci5,ci6,ci7,ci8,cm,cos2,cos4,cp,cr2,cr3,cr4,cr5,cr6,cr7,cr8 2628 real(dp) :: dm,bb,ci2,dpp,r,r2,r25,r3,r34,r4,r5,r6,r7,r8,rt2i,s,r1,s1,s2,s3,s25,s34,s4,s5,s6,s7,s8 2629 real(dp) :: sin2,ui1,ui2,ui3,ur1,ur2,ur3,sin4,vi1,vi2,vi3,vr1,vr2,vr3 2630 2631 ! ************************************************************************* 2632 atn=after*now 2633 atb=after*before 2634 2635 ! sqrt(.5d0) 2636 rt2i=half_sqrt2 2637 if (now.eq.2) then 2638 ia=1 2639 nin1=ia-after 2640 nout1=ia-atn 2641 do ib=1,before 2642 nin1=nin1+after 2643 nin2=nin1+atb 2644 nout1=nout1+atn 2645 nout2=nout1+after 2646 do j=1,n1dfft 2647 r1=zin(1,j,nin1) 2648 s1=zin(2,j,nin1) 2649 r2=zin(1,j,nin2) 2650 s2=zin(2,j,nin2) 2651 zout(1,j,nout1)= r2 + r1 2652 zout(2,j,nout1)= s2 + s1 2653 zout(1,j,nout2)= r1 - r2 2654 zout(2,j,nout2)= s1 - s2 2655 enddo 2656 enddo 2657 do 2000,ia=2,after 2658 ias=ia-1 2659 if (2*ias.eq.after) then 2660 if (isign.eq.1) then 2661 nin1=ia-after 2662 nout1=ia-atn 2663 do ib=1,before 2664 nin1=nin1+after 2665 nin2=nin1+atb 2666 nout1=nout1+atn 2667 nout2=nout1+after 2668 do j=1,n1dfft 2669 r1=zin(1,j,nin1) 2670 s1=zin(2,j,nin1) 2671 r2=zin(2,j,nin2) 2672 s2=zin(1,j,nin2) 2673 zout(1,j,nout1)= r1 - r2 2674 zout(2,j,nout1)= s2 + s1 2675 zout(1,j,nout2)= r2 + r1 2676 zout(2,j,nout2)= s1 - s2 2677 enddo 2678 enddo 2679 else 2680 nin1=ia-after 2681 nout1=ia-atn 2682 do ib=1,before 2683 nin1=nin1+after 2684 nin2=nin1+atb 2685 nout1=nout1+atn 2686 nout2=nout1+after 2687 do j=1,n1dfft 2688 r1=zin(1,j,nin1) 2689 s1=zin(2,j,nin1) 2690 r2=zin(2,j,nin2) 2691 s2=zin(1,j,nin2) 2692 zout(1,j,nout1)= r2 + r1 2693 zout(2,j,nout1)= s1 - s2 2694 zout(1,j,nout2)= r1 - r2 2695 zout(2,j,nout2)= s2 + s1 2696 enddo 2697 enddo 2698 end if 2699 else if (4*ias.eq.after) then 2700 if (isign.eq.1) then 2701 nin1=ia-after 2702 nout1=ia-atn 2703 do ib=1,before 2704 nin1=nin1+after 2705 nin2=nin1+atb 2706 nout1=nout1+atn 2707 nout2=nout1+after 2708 do j=1,n1dfft 2709 r1=zin(1,j,nin1) 2710 s1=zin(2,j,nin1) 2711 r=zin(1,j,nin2) 2712 s=zin(2,j,nin2) 2713 r2=(r - s)*rt2i 2714 s2=(r + s)*rt2i 2715 zout(1,j,nout1)= r2 + r1 2716 zout(2,j,nout1)= s2 + s1 2717 zout(1,j,nout2)= r1 - r2 2718 zout(2,j,nout2)= s1 - s2 2719 enddo 2720 enddo 2721 else 2722 nin1=ia-after 2723 nout1=ia-atn 2724 do ib=1,before 2725 nin1=nin1+after 2726 nin2=nin1+atb 2727 nout1=nout1+atn 2728 nout2=nout1+after 2729 do j=1,n1dfft 2730 r1=zin(1,j,nin1) 2731 s1=zin(2,j,nin1) 2732 r=zin(1,j,nin2) 2733 s=zin(2,j,nin2) 2734 r2=(r + s)*rt2i 2735 s2=(s - r)*rt2i 2736 zout(1,j,nout1)= r2 + r1 2737 zout(2,j,nout1)= s2 + s1 2738 zout(1,j,nout2)= r1 - r2 2739 zout(2,j,nout2)= s1 - s2 2740 enddo 2741 enddo 2742 end if 2743 else if (4*ias.eq.3*after) then 2744 if (isign.eq.1) then 2745 nin1=ia-after 2746 nout1=ia-atn 2747 do ib=1,before 2748 nin1=nin1+after 2749 nin2=nin1+atb 2750 nout1=nout1+atn 2751 nout2=nout1+after 2752 do j=1,n1dfft 2753 r1=zin(1,j,nin1) 2754 s1=zin(2,j,nin1) 2755 r=zin(1,j,nin2) 2756 s=zin(2,j,nin2) 2757 r2=(r + s)*rt2i 2758 s2=(r - s)*rt2i 2759 zout(1,j,nout1)= r1 - r2 2760 zout(2,j,nout1)= s2 + s1 2761 zout(1,j,nout2)= r2 + r1 2762 zout(2,j,nout2)= s1 - s2 2763 enddo 2764 enddo 2765 else 2766 nin1=ia-after 2767 nout1=ia-atn 2768 do ib=1,before 2769 nin1=nin1+after 2770 nin2=nin1+atb 2771 nout1=nout1+atn 2772 nout2=nout1+after 2773 do j=1,n1dfft 2774 r1=zin(1,j,nin1) 2775 s1=zin(2,j,nin1) 2776 r=zin(1,j,nin2) 2777 s=zin(2,j,nin2) 2778 r2=(s - r)*rt2i 2779 s2=(r + s)*rt2i 2780 zout(1,j,nout1)= r2 + r1 2781 zout(2,j,nout1)= s1 - s2 2782 zout(1,j,nout2)= r1 - r2 2783 zout(2,j,nout2)= s2 + s1 2784 enddo 2785 enddo 2786 end if 2787 else 2788 itrig=ias*before+1 2789 cr2=trig(1,itrig) 2790 ci2=trig(2,itrig) 2791 nin1=ia-after 2792 nout1=ia-atn 2793 do ib=1,before 2794 nin1=nin1+after 2795 nin2=nin1+atb 2796 nout1=nout1+atn 2797 nout2=nout1+after 2798 do j=1,n1dfft 2799 r1=zin(1,j,nin1) 2800 s1=zin(2,j,nin1) 2801 r=zin(1,j,nin2) 2802 s=zin(2,j,nin2) 2803 r2=r*cr2 - s*ci2 2804 s2=r*ci2 + s*cr2 2805 zout(1,j,nout1)= r2 + r1 2806 zout(2,j,nout1)= s2 + s1 2807 zout(1,j,nout2)= r1 - r2 2808 zout(2,j,nout2)= s1 - s2 2809 enddo 2810 enddo 2811 end if 2812 2000 continue 2813 else if (now.eq.4) then 2814 if (isign.eq.1) then 2815 ia=1 2816 nin1=ia-after 2817 nout1=ia-atn 2818 do ib=1,before 2819 nin1=nin1+after 2820 nin2=nin1+atb 2821 nin3=nin2+atb 2822 nin4=nin3+atb 2823 nout1=nout1+atn 2824 nout2=nout1+after 2825 nout3=nout2+after 2826 nout4=nout3+after 2827 do j=1,n1dfft 2828 r1=zin(1,j,nin1) 2829 s1=zin(2,j,nin1) 2830 r2=zin(1,j,nin2) 2831 s2=zin(2,j,nin2) 2832 r3=zin(1,j,nin3) 2833 s3=zin(2,j,nin3) 2834 r4=zin(1,j,nin4) 2835 s4=zin(2,j,nin4) 2836 r=r1 + r3 2837 s=r2 + r4 2838 zout(1,j,nout1) = r + s 2839 zout(1,j,nout3) = r - s 2840 r=r1 - r3 2841 s=s2 - s4 2842 zout(1,j,nout2) = r - s 2843 zout(1,j,nout4) = r + s 2844 r=s1 + s3 2845 s=s2 + s4 2846 zout(2,j,nout1) = r + s 2847 zout(2,j,nout3) = r - s 2848 r=s1 - s3 2849 s=r2 - r4 2850 zout(2,j,nout2) = r + s 2851 zout(2,j,nout4) = r - s 2852 enddo 2853 enddo 2854 do 4000,ia=2,after 2855 ias=ia-1 2856 if (2*ias.eq.after) then 2857 nin1=ia-after 2858 nout1=ia-atn 2859 do ib=1,before 2860 nin1=nin1+after 2861 nin2=nin1+atb 2862 nin3=nin2+atb 2863 nin4=nin3+atb 2864 nout1=nout1+atn 2865 nout2=nout1+after 2866 nout3=nout2+after 2867 nout4=nout3+after 2868 do j=1,n1dfft 2869 r1=zin(1,j,nin1) 2870 s1=zin(2,j,nin1) 2871 r=zin(1,j,nin2) 2872 s=zin(2,j,nin2) 2873 r2=(r-s)*rt2i 2874 s2=(r+s)*rt2i 2875 r3=zin(2,j,nin3) 2876 s3=zin(1,j,nin3) 2877 r=zin(1,j,nin4) 2878 s=zin(2,j,nin4) 2879 r4=(r + s)*rt2i 2880 s4=(r - s)*rt2i 2881 r=r1 - r3 2882 s=r2 - r4 2883 zout(1,j,nout1) = r + s 2884 zout(1,j,nout3) = r - s 2885 r=r1 + r3 2886 s=s2 - s4 2887 zout(1,j,nout2) = r - s 2888 zout(1,j,nout4) = r + s 2889 r=s1 + s3 2890 s=s2 + s4 2891 zout(2,j,nout1) = r + s 2892 zout(2,j,nout3) = r - s 2893 r=s1 - s3 2894 s=r2 + r4 2895 zout(2,j,nout2) = r + s 2896 zout(2,j,nout4) = r - s 2897 enddo 2898 enddo 2899 else 2900 itt=ias*before 2901 itrig=itt+1 2902 cr2=trig(1,itrig) 2903 ci2=trig(2,itrig) 2904 itrig=itrig+itt 2905 cr3=trig(1,itrig) 2906 ci3=trig(2,itrig) 2907 itrig=itrig+itt 2908 cr4=trig(1,itrig) 2909 ci4=trig(2,itrig) 2910 nin1=ia-after 2911 nout1=ia-atn 2912 do ib=1,before 2913 nin1=nin1+after 2914 nin2=nin1+atb 2915 nin3=nin2+atb 2916 nin4=nin3+atb 2917 nout1=nout1+atn 2918 nout2=nout1+after 2919 nout3=nout2+after 2920 nout4=nout3+after 2921 do j=1,n1dfft 2922 r1=zin(1,j,nin1) 2923 s1=zin(2,j,nin1) 2924 r=zin(1,j,nin2) 2925 s=zin(2,j,nin2) 2926 r2=r*cr2 - s*ci2 2927 s2=r*ci2 + s*cr2 2928 r=zin(1,j,nin3) 2929 s=zin(2,j,nin3) 2930 r3=r*cr3 - s*ci3 2931 s3=r*ci3 + s*cr3 2932 r=zin(1,j,nin4) 2933 s=zin(2,j,nin4) 2934 r4=r*cr4 - s*ci4 2935 s4=r*ci4 + s*cr4 2936 r=r1 + r3 2937 s=r2 + r4 2938 zout(1,j,nout1) = r + s 2939 zout(1,j,nout3) = r - s 2940 r=r1 - r3 2941 s=s2 - s4 2942 zout(1,j,nout2) = r - s 2943 zout(1,j,nout4) = r + s 2944 r=s1 + s3 2945 s=s2 + s4 2946 zout(2,j,nout1) = r + s 2947 zout(2,j,nout3) = r - s 2948 r=s1 - s3 2949 s=r2 - r4 2950 zout(2,j,nout2) = r + s 2951 zout(2,j,nout4) = r - s 2952 enddo 2953 enddo 2954 end if 2955 4000 continue 2956 else 2957 ia=1 2958 nin1=ia-after 2959 nout1=ia-atn 2960 do ib=1,before 2961 nin1=nin1+after 2962 nin2=nin1+atb 2963 nin3=nin2+atb 2964 nin4=nin3+atb 2965 nout1=nout1+atn 2966 nout2=nout1+after 2967 nout3=nout2+after 2968 nout4=nout3+after 2969 do j=1,n1dfft 2970 r1=zin(1,j,nin1) 2971 s1=zin(2,j,nin1) 2972 r2=zin(1,j,nin2) 2973 s2=zin(2,j,nin2) 2974 r3=zin(1,j,nin3) 2975 s3=zin(2,j,nin3) 2976 r4=zin(1,j,nin4) 2977 s4=zin(2,j,nin4) 2978 r=r1 + r3 2979 s=r2 + r4 2980 zout(1,j,nout1) = r + s 2981 zout(1,j,nout3) = r - s 2982 r=r1 - r3 2983 s=s2 - s4 2984 zout(1,j,nout2) = r + s 2985 zout(1,j,nout4) = r - s 2986 r=s1 + s3 2987 s=s2 + s4 2988 zout(2,j,nout1) = r + s 2989 zout(2,j,nout3) = r - s 2990 r=s1 - s3 2991 s=r2 - r4 2992 zout(2,j,nout2) = r - s 2993 zout(2,j,nout4) = r + s 2994 enddo 2995 enddo 2996 do 4100,ia=2,after 2997 ias=ia-1 2998 if (2*ias.eq.after) then 2999 nin1=ia-after 3000 nout1=ia-atn 3001 do ib=1,before 3002 nin1=nin1+after 3003 nin2=nin1+atb 3004 nin3=nin2+atb 3005 nin4=nin3+atb 3006 nout1=nout1+atn 3007 nout2=nout1+after 3008 nout3=nout2+after 3009 nout4=nout3+after 3010 do j=1,n1dfft 3011 r1=zin(1,j,nin1) 3012 s1=zin(2,j,nin1) 3013 r=zin(1,j,nin2) 3014 s=zin(2,j,nin2) 3015 r2=(r + s)*rt2i 3016 s2=(s - r)*rt2i 3017 r3=zin(2,j,nin3) 3018 s3=zin(1,j,nin3) 3019 r=zin(1,j,nin4) 3020 s=zin(2,j,nin4) 3021 r4=(s - r)*rt2i 3022 s4=(r + s)*rt2i 3023 r=r1 + r3 3024 s=r2 + r4 3025 zout(1,j,nout1) = r + s 3026 zout(1,j,nout3) = r - s 3027 r=r1 - r3 3028 s=s2 + s4 3029 zout(1,j,nout2) = r + s 3030 zout(1,j,nout4) = r - s 3031 r=s1 - s3 3032 s=s2 - s4 3033 zout(2,j,nout1) = r + s 3034 zout(2,j,nout3) = r - s 3035 r=s1 + s3 3036 s=r2 - r4 3037 zout(2,j,nout2) = r - s 3038 zout(2,j,nout4) = r + s 3039 enddo 3040 enddo 3041 else 3042 itt=ias*before 3043 itrig=itt+1 3044 cr2=trig(1,itrig) 3045 ci2=trig(2,itrig) 3046 itrig=itrig+itt 3047 cr3=trig(1,itrig) 3048 ci3=trig(2,itrig) 3049 itrig=itrig+itt 3050 cr4=trig(1,itrig) 3051 ci4=trig(2,itrig) 3052 nin1=ia-after 3053 nout1=ia-atn 3054 do ib=1,before 3055 nin1=nin1+after 3056 nin2=nin1+atb 3057 nin3=nin2+atb 3058 nin4=nin3+atb 3059 nout1=nout1+atn 3060 nout2=nout1+after 3061 nout3=nout2+after 3062 nout4=nout3+after 3063 do j=1,n1dfft 3064 r1=zin(1,j,nin1) 3065 s1=zin(2,j,nin1) 3066 r=zin(1,j,nin2) 3067 s=zin(2,j,nin2) 3068 r2=r*cr2 - s*ci2 3069 s2=r*ci2 + s*cr2 3070 r=zin(1,j,nin3) 3071 s=zin(2,j,nin3) 3072 r3=r*cr3 - s*ci3 3073 s3=r*ci3 + s*cr3 3074 r=zin(1,j,nin4) 3075 s=zin(2,j,nin4) 3076 r4=r*cr4 - s*ci4 3077 s4=r*ci4 + s*cr4 3078 r=r1 + r3 3079 s=r2 + r4 3080 zout(1,j,nout1) = r + s 3081 zout(1,j,nout3) = r - s 3082 r=r1 - r3 3083 s=s2 - s4 3084 zout(1,j,nout2) = r + s 3085 zout(1,j,nout4) = r - s 3086 r=s1 + s3 3087 s=s2 + s4 3088 zout(2,j,nout1) = r + s 3089 zout(2,j,nout3) = r - s 3090 r=s1 - s3 3091 s=r2 - r4 3092 zout(2,j,nout2) = r - s 3093 zout(2,j,nout4) = r + s 3094 enddo 3095 enddo 3096 end if 3097 4100 continue 3098 end if 3099 else if (now.eq.8) then 3100 if (isign.eq.-1) then 3101 ia=1 3102 nin1=ia-after 3103 nout1=ia-atn 3104 do ib=1,before 3105 nin1=nin1+after 3106 nin2=nin1+atb 3107 nin3=nin2+atb 3108 nin4=nin3+atb 3109 nin5=nin4+atb 3110 nin6=nin5+atb 3111 nin7=nin6+atb 3112 nin8=nin7+atb 3113 nout1=nout1+atn 3114 nout2=nout1+after 3115 nout3=nout2+after 3116 nout4=nout3+after 3117 nout5=nout4+after 3118 nout6=nout5+after 3119 nout7=nout6+after 3120 nout8=nout7+after 3121 do j=1,n1dfft 3122 r1=zin(1,j,nin1) 3123 s1=zin(2,j,nin1) 3124 r2=zin(1,j,nin2) 3125 s2=zin(2,j,nin2) 3126 r3=zin(1,j,nin3) 3127 s3=zin(2,j,nin3) 3128 r4=zin(1,j,nin4) 3129 s4=zin(2,j,nin4) 3130 r5=zin(1,j,nin5) 3131 s5=zin(2,j,nin5) 3132 r6=zin(1,j,nin6) 3133 s6=zin(2,j,nin6) 3134 r7=zin(1,j,nin7) 3135 s7=zin(2,j,nin7) 3136 r8=zin(1,j,nin8) 3137 s8=zin(2,j,nin8) 3138 r=r1 + r5 3139 s=r3 + r7 3140 ap=r + s 3141 am=r - s 3142 r=r2 + r6 3143 s=r4 + r8 3144 bp=r + s 3145 bm=r - s 3146 r=s1 + s5 3147 s=s3 + s7 3148 cp=r + s 3149 cm=r - s 3150 r=s2 + s6 3151 s=s4 + s8 3152 dpp=r + s 3153 dm=r - s 3154 zout(1,j,nout1) = ap + bp 3155 zout(2,j,nout1) = cp + dpp 3156 zout(1,j,nout5) = ap - bp 3157 zout(2,j,nout5) = cp - dpp 3158 zout(1,j,nout3) = am + dm 3159 zout(2,j,nout3) = cm - bm 3160 zout(1,j,nout7) = am - dm 3161 zout(2,j,nout7) = cm + bm 3162 r=r1 - r5 3163 s=s3 - s7 3164 ap=r + s 3165 am=r - s 3166 r=s1 - s5 3167 s=r3 - r7 3168 bp=r + s 3169 bm=r - s 3170 r=s4 - s8 3171 s=r2 - r6 3172 cp=r + s 3173 cm=r - s 3174 r=s2 - s6 3175 s=r4 - r8 3176 dpp=r + s 3177 dm=r - s 3178 r = ( cp + dm)*rt2i 3179 s = ( dm - cp)*rt2i 3180 cp= ( cm + dpp)*rt2i 3181 dpp = ( cm - dpp)*rt2i 3182 zout(1,j,nout2) = ap + r 3183 zout(2,j,nout2) = bm + s 3184 zout(1,j,nout6) = ap - r 3185 zout(2,j,nout6) = bm - s 3186 zout(1,j,nout4) = am + cp 3187 zout(2,j,nout4) = bp + dpp 3188 zout(1,j,nout8) = am - cp 3189 zout(2,j,nout8) = bp - dpp 3190 enddo 3191 enddo 3192 do 8000,ia=2,after 3193 ias=ia-1 3194 itt=ias*before 3195 itrig=itt+1 3196 cr2=trig(1,itrig) 3197 ci2=trig(2,itrig) 3198 itrig=itrig+itt 3199 cr3=trig(1,itrig) 3200 ci3=trig(2,itrig) 3201 itrig=itrig+itt 3202 cr4=trig(1,itrig) 3203 ci4=trig(2,itrig) 3204 itrig=itrig+itt 3205 cr5=trig(1,itrig) 3206 ci5=trig(2,itrig) 3207 itrig=itrig+itt 3208 cr6=trig(1,itrig) 3209 ci6=trig(2,itrig) 3210 itrig=itrig+itt 3211 cr7=trig(1,itrig) 3212 ci7=trig(2,itrig) 3213 itrig=itrig+itt 3214 cr8=trig(1,itrig) 3215 ci8=trig(2,itrig) 3216 nin1=ia-after 3217 nout1=ia-atn 3218 do ib=1,before 3219 nin1=nin1+after 3220 nin2=nin1+atb 3221 nin3=nin2+atb 3222 nin4=nin3+atb 3223 nin5=nin4+atb 3224 nin6=nin5+atb 3225 nin7=nin6+atb 3226 nin8=nin7+atb 3227 nout1=nout1+atn 3228 nout2=nout1+after 3229 nout3=nout2+after 3230 nout4=nout3+after 3231 nout5=nout4+after 3232 nout6=nout5+after 3233 nout7=nout6+after 3234 nout8=nout7+after 3235 do j=1,n1dfft 3236 r1=zin(1,j,nin1) 3237 s1=zin(2,j,nin1) 3238 r=zin(1,j,nin2) 3239 s=zin(2,j,nin2) 3240 r2=r*cr2 - s*ci2 3241 s2=r*ci2 + s*cr2 3242 r=zin(1,j,nin3) 3243 s=zin(2,j,nin3) 3244 r3=r*cr3 - s*ci3 3245 s3=r*ci3 + s*cr3 3246 r=zin(1,j,nin4) 3247 s=zin(2,j,nin4) 3248 r4=r*cr4 - s*ci4 3249 s4=r*ci4 + s*cr4 3250 r=zin(1,j,nin5) 3251 s=zin(2,j,nin5) 3252 r5=r*cr5 - s*ci5 3253 s5=r*ci5 + s*cr5 3254 r=zin(1,j,nin6) 3255 s=zin(2,j,nin6) 3256 r6=r*cr6 - s*ci6 3257 s6=r*ci6 + s*cr6 3258 r=zin(1,j,nin7) 3259 s=zin(2,j,nin7) 3260 r7=r*cr7 - s*ci7 3261 s7=r*ci7 + s*cr7 3262 r=zin(1,j,nin8) 3263 s=zin(2,j,nin8) 3264 r8=r*cr8 - s*ci8 3265 s8=r*ci8 + s*cr8 3266 r=r1 + r5 3267 s=r3 + r7 3268 ap=r + s 3269 am=r - s 3270 r=r2 + r6 3271 s=r4 + r8 3272 bp=r + s 3273 bm=r - s 3274 r=s1 + s5 3275 s=s3 + s7 3276 cp=r + s 3277 cm=r - s 3278 r=s2 + s6 3279 s=s4 + s8 3280 dpp=r + s 3281 dm=r - s 3282 zout(1,j,nout1) = ap + bp 3283 zout(2,j,nout1) = cp + dpp 3284 zout(1,j,nout5) = ap - bp 3285 zout(2,j,nout5) = cp - dpp 3286 zout(1,j,nout3) = am + dm 3287 zout(2,j,nout3) = cm - bm 3288 zout(1,j,nout7) = am - dm 3289 zout(2,j,nout7) = cm + bm 3290 r=r1 - r5 3291 s=s3 - s7 3292 ap=r + s 3293 am=r - s 3294 r=s1 - s5 3295 s=r3 - r7 3296 bp=r + s 3297 bm=r - s 3298 r=s4 - s8 3299 s=r2 - r6 3300 cp=r + s 3301 cm=r - s 3302 r=s2 - s6 3303 s=r4 - r8 3304 dpp=r + s 3305 dm=r - s 3306 r = ( cp + dm)*rt2i 3307 s = ( dm - cp)*rt2i 3308 cp= ( cm + dpp)*rt2i 3309 dpp = ( cm - dpp)*rt2i 3310 zout(1,j,nout2) = ap + r 3311 zout(2,j,nout2) = bm + s 3312 zout(1,j,nout6) = ap - r 3313 zout(2,j,nout6) = bm - s 3314 zout(1,j,nout4) = am + cp 3315 zout(2,j,nout4) = bp + dpp 3316 zout(1,j,nout8) = am - cp 3317 zout(2,j,nout8) = bp - dpp 3318 enddo 3319 enddo 3320 8000 continue 3321 3322 else 3323 ia=1 3324 nin1=ia-after 3325 nout1=ia-atn 3326 do ib=1,before 3327 nin1=nin1+after 3328 nin2=nin1+atb 3329 nin3=nin2+atb 3330 nin4=nin3+atb 3331 nin5=nin4+atb 3332 nin6=nin5+atb 3333 nin7=nin6+atb 3334 nin8=nin7+atb 3335 nout1=nout1+atn 3336 nout2=nout1+after 3337 nout3=nout2+after 3338 nout4=nout3+after 3339 nout5=nout4+after 3340 nout6=nout5+after 3341 nout7=nout6+after 3342 nout8=nout7+after 3343 do j=1,n1dfft 3344 r1=zin(1,j,nin1) 3345 s1=zin(2,j,nin1) 3346 r2=zin(1,j,nin2) 3347 s2=zin(2,j,nin2) 3348 r3=zin(1,j,nin3) 3349 s3=zin(2,j,nin3) 3350 r4=zin(1,j,nin4) 3351 s4=zin(2,j,nin4) 3352 r5=zin(1,j,nin5) 3353 s5=zin(2,j,nin5) 3354 r6=zin(1,j,nin6) 3355 s6=zin(2,j,nin6) 3356 r7=zin(1,j,nin7) 3357 s7=zin(2,j,nin7) 3358 r8=zin(1,j,nin8) 3359 s8=zin(2,j,nin8) 3360 r=r1 + r5 3361 s=r3 + r7 3362 ap=r + s 3363 am=r - s 3364 r=r2 + r6 3365 s=r4 + r8 3366 bp=r + s 3367 bm=r - s 3368 r=s1 + s5 3369 s=s3 + s7 3370 cp=r + s 3371 cm=r - s 3372 r=s2 + s6 3373 s=s4 + s8 3374 dpp=r + s 3375 dm=r - s 3376 zout(1,j,nout1) = ap + bp 3377 zout(2,j,nout1) = cp + dpp 3378 zout(1,j,nout5) = ap - bp 3379 zout(2,j,nout5) = cp - dpp 3380 zout(1,j,nout3) = am - dm 3381 zout(2,j,nout3) = cm + bm 3382 zout(1,j,nout7) = am + dm 3383 zout(2,j,nout7) = cm - bm 3384 r= r1 - r5 3385 s=-s3 + s7 3386 ap=r + s 3387 am=r - s 3388 r=s1 - s5 3389 s=r7 - r3 3390 bp=r + s 3391 bm=r - s 3392 r=-s4 + s8 3393 s= r2 - r6 3394 cp=r + s 3395 cm=r - s 3396 r=-s2 + s6 3397 s= r4 - r8 3398 dpp=r + s 3399 dm=r - s 3400 r = ( cp + dm)*rt2i 3401 s = ( cp - dm)*rt2i 3402 cp= ( cm + dpp)*rt2i 3403 dpp= ( dpp - cm)*rt2i 3404 zout(1,j,nout2) = ap + r 3405 zout(2,j,nout2) = bm + s 3406 zout(1,j,nout6) = ap - r 3407 zout(2,j,nout6) = bm - s 3408 zout(1,j,nout4) = am + cp 3409 zout(2,j,nout4) = bp + dpp 3410 zout(1,j,nout8) = am - cp 3411 zout(2,j,nout8) = bp - dpp 3412 enddo 3413 enddo 3414 3415 do 8001,ia=2,after 3416 ias=ia-1 3417 itt=ias*before 3418 itrig=itt+1 3419 cr2=trig(1,itrig) 3420 ci2=trig(2,itrig) 3421 itrig=itrig+itt 3422 cr3=trig(1,itrig) 3423 ci3=trig(2,itrig) 3424 itrig=itrig+itt 3425 cr4=trig(1,itrig) 3426 ci4=trig(2,itrig) 3427 itrig=itrig+itt 3428 cr5=trig(1,itrig) 3429 ci5=trig(2,itrig) 3430 itrig=itrig+itt 3431 cr6=trig(1,itrig) 3432 ci6=trig(2,itrig) 3433 itrig=itrig+itt 3434 cr7=trig(1,itrig) 3435 ci7=trig(2,itrig) 3436 itrig=itrig+itt 3437 cr8=trig(1,itrig) 3438 ci8=trig(2,itrig) 3439 nin1=ia-after 3440 nout1=ia-atn 3441 do ib=1,before 3442 nin1=nin1+after 3443 nin2=nin1+atb 3444 nin3=nin2+atb 3445 nin4=nin3+atb 3446 nin5=nin4+atb 3447 nin6=nin5+atb 3448 nin7=nin6+atb 3449 nin8=nin7+atb 3450 nout1=nout1+atn 3451 nout2=nout1+after 3452 nout3=nout2+after 3453 nout4=nout3+after 3454 nout5=nout4+after 3455 nout6=nout5+after 3456 nout7=nout6+after 3457 nout8=nout7+after 3458 do j=1,n1dfft 3459 r1=zin(1,j,nin1) 3460 s1=zin(2,j,nin1) 3461 r=zin(1,j,nin2) 3462 s=zin(2,j,nin2) 3463 r2=r*cr2 - s*ci2 3464 s2=r*ci2 + s*cr2 3465 r=zin(1,j,nin3) 3466 s=zin(2,j,nin3) 3467 r3=r*cr3 - s*ci3 3468 s3=r*ci3 + s*cr3 3469 r=zin(1,j,nin4) 3470 s=zin(2,j,nin4) 3471 r4=r*cr4 - s*ci4 3472 s4=r*ci4 + s*cr4 3473 r=zin(1,j,nin5) 3474 s=zin(2,j,nin5) 3475 r5=r*cr5 - s*ci5 3476 s5=r*ci5 + s*cr5 3477 r=zin(1,j,nin6) 3478 s=zin(2,j,nin6) 3479 r6=r*cr6 - s*ci6 3480 s6=r*ci6 + s*cr6 3481 r=zin(1,j,nin7) 3482 s=zin(2,j,nin7) 3483 r7=r*cr7 - s*ci7 3484 s7=r*ci7 + s*cr7 3485 r=zin(1,j,nin8) 3486 s=zin(2,j,nin8) 3487 r8=r*cr8 - s*ci8 3488 s8=r*ci8 + s*cr8 3489 r=r1 + r5 3490 s=r3 + r7 3491 ap=r + s 3492 am=r - s 3493 r=r2 + r6 3494 s=r4 + r8 3495 bp=r + s 3496 bm=r - s 3497 r=s1 + s5 3498 s=s3 + s7 3499 cp=r + s 3500 cm=r - s 3501 r=s2 + s6 3502 s=s4 + s8 3503 dpp=r + s 3504 dm=r - s 3505 zout(1,j,nout1) = ap + bp 3506 zout(2,j,nout1) = cp + dpp 3507 zout(1,j,nout5) = ap - bp 3508 zout(2,j,nout5) = cp - dpp 3509 zout(1,j,nout3) = am - dm 3510 zout(2,j,nout3) = cm + bm 3511 zout(1,j,nout7) = am + dm 3512 zout(2,j,nout7) = cm - bm 3513 r= r1 - r5 3514 s=-s3 + s7 3515 ap=r + s 3516 am=r - s 3517 r=s1 - s5 3518 s=r7 - r3 3519 bp=r + s 3520 bm=r - s 3521 r=-s4 + s8 3522 s= r2 - r6 3523 cp=r + s 3524 cm=r - s 3525 r=-s2 + s6 3526 s= r4 - r8 3527 dpp=r + s 3528 dm=r - s 3529 r = ( cp + dm)*rt2i 3530 s = ( cp - dm)*rt2i 3531 cp= ( cm + dpp)*rt2i 3532 dpp= ( dpp - cm)*rt2i 3533 zout(1,j,nout2) = ap + r 3534 zout(2,j,nout2) = bm + s 3535 zout(1,j,nout6) = ap - r 3536 zout(2,j,nout6) = bm - s 3537 zout(1,j,nout4) = am + cp 3538 zout(2,j,nout4) = bp + dpp 3539 zout(1,j,nout8) = am - cp 3540 zout(2,j,nout8) = bp - dpp 3541 enddo 3542 enddo 3543 8001 continue 3544 3545 end if 3546 else if (now.eq.3) then 3547 ! .5d0*sqrt(3.d0) 3548 bb=isign*0.8660254037844387d0 3549 ia=1 3550 nin1=ia-after 3551 nout1=ia-atn 3552 do ib=1,before 3553 nin1=nin1+after 3554 nin2=nin1+atb 3555 nin3=nin2+atb 3556 nout1=nout1+atn 3557 nout2=nout1+after 3558 nout3=nout2+after 3559 do j=1,n1dfft 3560 r1=zin(1,j,nin1) 3561 s1=zin(2,j,nin1) 3562 r2=zin(1,j,nin2) 3563 s2=zin(2,j,nin2) 3564 r3=zin(1,j,nin3) 3565 s3=zin(2,j,nin3) 3566 r=r2 + r3 3567 s=s2 + s3 3568 zout(1,j,nout1) = r + r1 3569 zout(2,j,nout1) = s + s1 3570 r1=r1 - .5d0*r 3571 s1=s1 - .5d0*s 3572 r2=bb*(r2-r3) 3573 s2=bb*(s2-s3) 3574 zout(1,j,nout2) = r1 - s2 3575 zout(2,j,nout2) = s1 + r2 3576 zout(1,j,nout3) = r1 + s2 3577 zout(2,j,nout3) = s1 - r2 3578 enddo 3579 enddo 3580 do 3000,ia=2,after 3581 ias=ia-1 3582 if (4*ias.eq.3*after) then 3583 if (isign.eq.1) then 3584 nin1=ia-after 3585 nout1=ia-atn 3586 do ib=1,before 3587 nin1=nin1+after 3588 nin2=nin1+atb 3589 nin3=nin2+atb 3590 nout1=nout1+atn 3591 nout2=nout1+after 3592 nout3=nout2+after 3593 do j=1,n1dfft 3594 r1=zin(1,j,nin1) 3595 s1=zin(2,j,nin1) 3596 r2=zin(2,j,nin2) 3597 s2=zin(1,j,nin2) 3598 r3=zin(1,j,nin3) 3599 s3=zin(2,j,nin3) 3600 r=r3 + r2 3601 s=s2 - s3 3602 zout(1,j,nout1) = r1 - r 3603 zout(2,j,nout1) = s + s1 3604 r1=r1 + .5d0*r 3605 s1=s1 - .5d0*s 3606 r2=bb*(r2-r3) 3607 s2=bb*(s2+s3) 3608 zout(1,j,nout2) = r1 - s2 3609 zout(2,j,nout2) = s1 - r2 3610 zout(1,j,nout3) = r1 + s2 3611 zout(2,j,nout3) = s1 + r2 3612 enddo 3613 enddo 3614 else 3615 nin1=ia-after 3616 nout1=ia-atn 3617 do ib=1,before 3618 nin1=nin1+after 3619 nin2=nin1+atb 3620 nin3=nin2+atb 3621 nout1=nout1+atn 3622 nout2=nout1+after 3623 nout3=nout2+after 3624 do j=1,n1dfft 3625 r1=zin(1,j,nin1) 3626 s1=zin(2,j,nin1) 3627 r2=zin(2,j,nin2) 3628 s2=zin(1,j,nin2) 3629 r3=zin(1,j,nin3) 3630 s3=zin(2,j,nin3) 3631 r=r2 - r3 3632 s=s2 + s3 3633 zout(1,j,nout1) = r + r1 3634 zout(2,j,nout1) = s1 - s 3635 r1=r1 - .5d0*r 3636 s1=s1 + .5d0*s 3637 r2=bb*(r2+r3) 3638 s2=bb*(s2-s3) 3639 zout(1,j,nout2) = r1 + s2 3640 zout(2,j,nout2) = s1 + r2 3641 zout(1,j,nout3) = r1 - s2 3642 zout(2,j,nout3) = s1 - r2 3643 enddo 3644 enddo 3645 end if 3646 else if (8*ias.eq.3*after) then 3647 if (isign.eq.1) then 3648 nin1=ia-after 3649 nout1=ia-atn 3650 do ib=1,before 3651 nin1=nin1+after 3652 nin2=nin1+atb 3653 nin3=nin2+atb 3654 nout1=nout1+atn 3655 nout2=nout1+after 3656 nout3=nout2+after 3657 do j=1,n1dfft 3658 r1=zin(1,j,nin1) 3659 s1=zin(2,j,nin1) 3660 r=zin(1,j,nin2) 3661 s=zin(2,j,nin2) 3662 r2=(r - s)*rt2i 3663 s2=(r + s)*rt2i 3664 r3=zin(2,j,nin3) 3665 s3=zin(1,j,nin3) 3666 r=r2 - r3 3667 s=s2 + s3 3668 zout(1,j,nout1) = r + r1 3669 zout(2,j,nout1) = s + s1 3670 r1=r1 - .5d0*r 3671 s1=s1 - .5d0*s 3672 r2=bb*(r2+r3) 3673 s2=bb*(s2-s3) 3674 zout(1,j,nout2) = r1 - s2 3675 zout(2,j,nout2) = s1 + r2 3676 zout(1,j,nout3) = r1 + s2 3677 zout(2,j,nout3) = s1 - r2 3678 enddo 3679 enddo 3680 else 3681 nin1=ia-after 3682 nout1=ia-atn 3683 do ib=1,before 3684 nin1=nin1+after 3685 nin2=nin1+atb 3686 nin3=nin2+atb 3687 nout1=nout1+atn 3688 nout2=nout1+after 3689 nout3=nout2+after 3690 do j=1,n1dfft 3691 r1=zin(1,j,nin1) 3692 s1=zin(2,j,nin1) 3693 r=zin(1,j,nin2) 3694 s=zin(2,j,nin2) 3695 r2=(r + s)*rt2i 3696 s2=(s - r)*rt2i 3697 r3=zin(2,j,nin3) 3698 s3=zin(1,j,nin3) 3699 r=r2 + r3 3700 s=s2 - s3 3701 zout(1,j,nout1) = r + r1 3702 zout(2,j,nout1) = s + s1 3703 r1=r1 - .5d0*r 3704 s1=s1 - .5d0*s 3705 r2=bb*(r2-r3) 3706 s2=bb*(s2+s3) 3707 zout(1,j,nout2) = r1 - s2 3708 zout(2,j,nout2) = s1 + r2 3709 zout(1,j,nout3) = r1 + s2 3710 zout(2,j,nout3) = s1 - r2 3711 enddo 3712 enddo 3713 end if 3714 else 3715 itt=ias*before 3716 itrig=itt+1 3717 cr2=trig(1,itrig) 3718 ci2=trig(2,itrig) 3719 itrig=itrig+itt 3720 cr3=trig(1,itrig) 3721 ci3=trig(2,itrig) 3722 nin1=ia-after 3723 nout1=ia-atn 3724 do ib=1,before 3725 nin1=nin1+after 3726 nin2=nin1+atb 3727 nin3=nin2+atb 3728 nout1=nout1+atn 3729 nout2=nout1+after 3730 nout3=nout2+after 3731 do j=1,n1dfft 3732 r1=zin(1,j,nin1) 3733 s1=zin(2,j,nin1) 3734 r=zin(1,j,nin2) 3735 s=zin(2,j,nin2) 3736 r2=r*cr2 - s*ci2 3737 s2=r*ci2 + s*cr2 3738 r=zin(1,j,nin3) 3739 s=zin(2,j,nin3) 3740 r3=r*cr3 - s*ci3 3741 s3=r*ci3 + s*cr3 3742 r=r2 + r3 3743 s=s2 + s3 3744 zout(1,j,nout1) = r + r1 3745 zout(2,j,nout1) = s + s1 3746 r1=r1 - .5d0*r 3747 s1=s1 - .5d0*s 3748 r2=bb*(r2-r3) 3749 s2=bb*(s2-s3) 3750 zout(1,j,nout2) = r1 - s2 3751 zout(2,j,nout2) = s1 + r2 3752 zout(1,j,nout3) = r1 + s2 3753 zout(2,j,nout3) = s1 - r2 3754 enddo 3755 enddo 3756 end if 3757 3000 continue 3758 else if (now==5) then 3759 ! cos(2.d0*pi/5.d0) 3760 cos2=0.3090169943749474d0 3761 ! cos(4.d0*pi/5.d0) 3762 cos4=-0.8090169943749474d0 3763 ! sin(2.d0*pi/5.d0) 3764 sin2=isign*0.9510565162951536d0 3765 ! sin(4.d0*pi/5.d0) 3766 sin4=isign*0.5877852522924731d0 3767 ia=1 3768 nin1=ia-after 3769 nout1=ia-atn 3770 do ib=1,before 3771 nin1=nin1+after 3772 nin2=nin1+atb 3773 nin3=nin2+atb 3774 nin4=nin3+atb 3775 nin5=nin4+atb 3776 nout1=nout1+atn 3777 nout2=nout1+after 3778 nout3=nout2+after 3779 nout4=nout3+after 3780 nout5=nout4+after 3781 do j=1,n1dfft 3782 r1=zin(1,j,nin1) 3783 s1=zin(2,j,nin1) 3784 r2=zin(1,j,nin2) 3785 s2=zin(2,j,nin2) 3786 r3=zin(1,j,nin3) 3787 s3=zin(2,j,nin3) 3788 r4=zin(1,j,nin4) 3789 s4=zin(2,j,nin4) 3790 r5=zin(1,j,nin5) 3791 s5=zin(2,j,nin5) 3792 r25 = r2 + r5 3793 r34 = r3 + r4 3794 s25 = s2 - s5 3795 s34 = s3 - s4 3796 zout(1,j,nout1) = r1 + r25 + r34 3797 r = r1 + cos2*r25 + cos4*r34 3798 s = sin2*s25 + sin4*s34 3799 zout(1,j,nout2) = r - s 3800 zout(1,j,nout5) = r + s 3801 r = r1 + cos4*r25 + cos2*r34 3802 s = sin4*s25 - sin2*s34 3803 zout(1,j,nout3) = r - s 3804 zout(1,j,nout4) = r + s 3805 r25 = r2 - r5 3806 r34 = r3 - r4 3807 s25 = s2 + s5 3808 s34 = s3 + s4 3809 zout(2,j,nout1) = s1 + s25 + s34 3810 r = s1 + cos2*s25 + cos4*s34 3811 s = sin2*r25 + sin4*r34 3812 zout(2,j,nout2) = r + s 3813 zout(2,j,nout5) = r - s 3814 r = s1 + cos4*s25 + cos2*s34 3815 s = sin4*r25 - sin2*r34 3816 zout(2,j,nout3) = r + s 3817 zout(2,j,nout4) = r - s 3818 enddo 3819 enddo 3820 do 5000,ia=2,after 3821 ias=ia-1 3822 if (8*ias.eq.5*after) then 3823 if (isign.eq.1) then 3824 nin1=ia-after 3825 nout1=ia-atn 3826 do ib=1,before 3827 nin1=nin1+after 3828 nin2=nin1+atb 3829 nin3=nin2+atb 3830 nin4=nin3+atb 3831 nin5=nin4+atb 3832 nout1=nout1+atn 3833 nout2=nout1+after 3834 nout3=nout2+after 3835 nout4=nout3+after 3836 nout5=nout4+after 3837 do j=1,n1dfft 3838 r1=zin(1,j,nin1) 3839 s1=zin(2,j,nin1) 3840 r=zin(1,j,nin2) 3841 s=zin(2,j,nin2) 3842 r2=(r - s)*rt2i 3843 s2=(r + s)*rt2i 3844 r3=zin(2,j,nin3) 3845 s3=zin(1,j,nin3) 3846 r=zin(1,j,nin4) 3847 s=zin(2,j,nin4) 3848 r4=(r + s)*rt2i 3849 s4=(r - s)*rt2i 3850 r5=zin(1,j,nin5) 3851 s5=zin(2,j,nin5) 3852 r25 = r2 - r5 3853 r34 = r3 + r4 3854 s25 = s2 + s5 3855 s34 = s3 - s4 3856 zout(1,j,nout1) = r1 + r25 - r34 3857 r = r1 + cos2*r25 - cos4*r34 3858 s = sin2*s25 + sin4*s34 3859 zout(1,j,nout2) = r - s 3860 zout(1,j,nout5) = r + s 3861 r = r1 + cos4*r25 - cos2*r34 3862 s = sin4*s25 - sin2*s34 3863 zout(1,j,nout3) = r - s 3864 zout(1,j,nout4) = r + s 3865 r25 = r2 + r5 3866 r34 = r4 - r3 3867 s25 = s2 - s5 3868 s34 = s3 + s4 3869 zout(2,j,nout1) = s1 + s25 + s34 3870 r = s1 + cos2*s25 + cos4*s34 3871 s = sin2*r25 + sin4*r34 3872 zout(2,j,nout2) = r + s 3873 zout(2,j,nout5) = r - s 3874 r = s1 + cos4*s25 + cos2*s34 3875 s = sin4*r25 - sin2*r34 3876 zout(2,j,nout3) = r + s 3877 zout(2,j,nout4) = r - s 3878 enddo 3879 enddo 3880 else 3881 nin1=ia-after 3882 nout1=ia-atn 3883 do ib=1,before 3884 nin1=nin1+after 3885 nin2=nin1+atb 3886 nin3=nin2+atb 3887 nin4=nin3+atb 3888 nin5=nin4+atb 3889 nout1=nout1+atn 3890 nout2=nout1+after 3891 nout3=nout2+after 3892 nout4=nout3+after 3893 nout5=nout4+after 3894 do j=1,n1dfft 3895 r1=zin(1,j,nin1) 3896 s1=zin(2,j,nin1) 3897 r=zin(1,j,nin2) 3898 s=zin(2,j,nin2) 3899 r2=(r + s)*rt2i 3900 s2=(s - r)*rt2i 3901 r3=zin(2,j,nin3) 3902 s3=zin(1,j,nin3) 3903 r=zin(1,j,nin4) 3904 s=zin(2,j,nin4) 3905 r4=(s - r)*rt2i 3906 s4=(r + s)*rt2i 3907 r5=zin(1,j,nin5) 3908 s5=zin(2,j,nin5) 3909 r25 = r2 - r5 3910 r34 = r3 + r4 3911 s25 = s2 + s5 3912 s34 = s4 - s3 3913 zout(1,j,nout1) = r1 + r25 + r34 3914 r = r1 + cos2*r25 + cos4*r34 3915 s = sin2*s25 + sin4*s34 3916 zout(1,j,nout2) = r - s 3917 zout(1,j,nout5) = r + s 3918 r = r1 + cos4*r25 + cos2*r34 3919 s = sin4*s25 - sin2*s34 3920 zout(1,j,nout3) = r - s 3921 zout(1,j,nout4) = r + s 3922 r25 = r2 + r5 3923 r34 = r3 - r4 3924 s25 = s2 - s5 3925 s34 = s3 + s4 3926 zout(2,j,nout1) = s1 + s25 - s34 3927 r = s1 + cos2*s25 - cos4*s34 3928 s = sin2*r25 + sin4*r34 3929 zout(2,j,nout2) = r + s 3930 zout(2,j,nout5) = r - s 3931 r = s1 + cos4*s25 - cos2*s34 3932 s = sin4*r25 - sin2*r34 3933 zout(2,j,nout3) = r + s 3934 zout(2,j,nout4) = r - s 3935 enddo 3936 enddo 3937 end if 3938 else 3939 ias=ia-1 3940 itt=ias*before 3941 itrig=itt+1 3942 cr2=trig(1,itrig) 3943 ci2=trig(2,itrig) 3944 itrig=itrig+itt 3945 cr3=trig(1,itrig) 3946 ci3=trig(2,itrig) 3947 itrig=itrig+itt 3948 cr4=trig(1,itrig) 3949 ci4=trig(2,itrig) 3950 itrig=itrig+itt 3951 cr5=trig(1,itrig) 3952 ci5=trig(2,itrig) 3953 nin1=ia-after 3954 nout1=ia-atn 3955 do ib=1,before 3956 nin1=nin1+after 3957 nin2=nin1+atb 3958 nin3=nin2+atb 3959 nin4=nin3+atb 3960 nin5=nin4+atb 3961 nout1=nout1+atn 3962 nout2=nout1+after 3963 nout3=nout2+after 3964 nout4=nout3+after 3965 nout5=nout4+after 3966 do j=1,n1dfft 3967 r1=zin(1,j,nin1) 3968 s1=zin(2,j,nin1) 3969 r=zin(1,j,nin2) 3970 s=zin(2,j,nin2) 3971 r2=r*cr2 - s*ci2 3972 s2=r*ci2 + s*cr2 3973 r=zin(1,j,nin3) 3974 s=zin(2,j,nin3) 3975 r3=r*cr3 - s*ci3 3976 s3=r*ci3 + s*cr3 3977 r=zin(1,j,nin4) 3978 s=zin(2,j,nin4) 3979 r4=r*cr4 - s*ci4 3980 s4=r*ci4 + s*cr4 3981 r=zin(1,j,nin5) 3982 s=zin(2,j,nin5) 3983 r5=r*cr5 - s*ci5 3984 s5=r*ci5 + s*cr5 3985 r25 = r2 + r5 3986 r34 = r3 + r4 3987 s25 = s2 - s5 3988 s34 = s3 - s4 3989 zout(1,j,nout1) = r1 + r25 + r34 3990 r = r1 + cos2*r25 + cos4*r34 3991 s = sin2*s25 + sin4*s34 3992 zout(1,j,nout2) = r - s 3993 zout(1,j,nout5) = r + s 3994 r = r1 + cos4*r25 + cos2*r34 3995 s = sin4*s25 - sin2*s34 3996 zout(1,j,nout3) = r - s 3997 zout(1,j,nout4) = r + s 3998 r25 = r2 - r5 3999 r34 = r3 - r4 4000 s25 = s2 + s5 4001 s34 = s3 + s4 4002 zout(2,j,nout1) = s1 + s25 + s34 4003 r = s1 + cos2*s25 + cos4*s34 4004 s = sin2*r25 + sin4*r34 4005 zout(2,j,nout2) = r + s 4006 zout(2,j,nout5) = r - s 4007 r = s1 + cos4*s25 + cos2*s34 4008 s = sin4*r25 - sin2*r34 4009 zout(2,j,nout3) = r + s 4010 zout(2,j,nout4) = r - s 4011 enddo 4012 enddo 4013 end if 4014 5000 continue 4015 else if (now.eq.6) then 4016 ! .5d0*sqrt(3.d0) 4017 bb=isign*0.8660254037844387d0 4018 4019 ia=1 4020 nin1=ia-after 4021 nout1=ia-atn 4022 do ib=1,before 4023 nin1=nin1+after 4024 nin2=nin1+atb 4025 nin3=nin2+atb 4026 nin4=nin3+atb 4027 nin5=nin4+atb 4028 nin6=nin5+atb 4029 nout1=nout1+atn 4030 nout2=nout1+after 4031 nout3=nout2+after 4032 nout4=nout3+after 4033 nout5=nout4+after 4034 nout6=nout5+after 4035 do j=1,n1dfft 4036 r2=zin(1,j,nin3) 4037 s2=zin(2,j,nin3) 4038 r3=zin(1,j,nin5) 4039 s3=zin(2,j,nin5) 4040 r=r2 + r3 4041 s=s2 + s3 4042 r1=zin(1,j,nin1) 4043 s1=zin(2,j,nin1) 4044 ur1 = r + r1 4045 ui1 = s + s1 4046 r1=r1 - .5d0*r 4047 s1=s1 - .5d0*s 4048 r=r2-r3 4049 s=s2-s3 4050 ur2 = r1 - s*bb 4051 ui2 = s1 + r*bb 4052 ur3 = r1 + s*bb 4053 ui3 = s1 - r*bb 4054 4055 r2=zin(1,j,nin6) 4056 s2=zin(2,j,nin6) 4057 r3=zin(1,j,nin2) 4058 s3=zin(2,j,nin2) 4059 r=r2 + r3 4060 s=s2 + s3 4061 r1=zin(1,j,nin4) 4062 s1=zin(2,j,nin4) 4063 vr1 = r + r1 4064 vi1 = s + s1 4065 r1=r1 - .5d0*r 4066 s1=s1 - .5d0*s 4067 r=r2-r3 4068 s=s2-s3 4069 vr2 = r1 - s*bb 4070 vi2 = s1 + r*bb 4071 vr3 = r1 + s*bb 4072 vi3 = s1 - r*bb 4073 4074 zout(1,j,nout1)=ur1+vr1 4075 zout(2,j,nout1)=ui1+vi1 4076 zout(1,j,nout5)=ur2+vr2 4077 zout(2,j,nout5)=ui2+vi2 4078 zout(1,j,nout3)=ur3+vr3 4079 zout(2,j,nout3)=ui3+vi3 4080 zout(1,j,nout4)=ur1-vr1 4081 zout(2,j,nout4)=ui1-vi1 4082 zout(1,j,nout2)=ur2-vr2 4083 zout(2,j,nout2)=ui2-vi2 4084 zout(1,j,nout6)=ur3-vr3 4085 zout(2,j,nout6)=ui3-vi3 4086 enddo 4087 enddo 4088 4089 else 4090 ABI_ERROR('error fftstp') 4091 end if 4092 4093 end subroutine fftstp
m_sg2002/sg2002_accrho [ Functions ]
[ Top ] [ m_sg2002 ] [ Functions ]
NAME
sg2002_accrho
FUNCTION
Accumulates the real space density rho from the ndat wavefunctions zf by transforming zf into real space and adding all the amplitudes squared INPUTS: ZF: input array (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) max1 is positive or zero ; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 i2 and i3 have a similar definition of range idat=1,ndat md1,md2,md3: Dimension of ZF md2proc=((md2-1)/nproc_fft)+1 ! maximal number of small box 2nd dim slices for one proc weight(ndat)= weight for the density accumulation OUTPUTS: RHOoutput(i1,i2,i3) = RHOinput(i1,i2,i3) + sum on idat of (Re(FFT(ZF))**2 *weight_r + weight_i*Im(FFT(ZF))**2 i1=1,n1 , i2=1,n2 , i3=1,n3 comm_fft: MPI communicator nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Dimension of RHO nd3proc=((nd3-1)/nproc_fft)+1 ! maximal number of big box 3rd dim slices for one proc NOTES: PERFORMANCE CONSIDERATIONS: The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
2264 subroutine sg2002_accrho(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,& 2265 & max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft,zf,rho,weight_r,weight_i) 2266 2267 implicit none 2268 2269 !Arguments ------------------------------------ 2270 integer,intent(in) :: icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc 2271 integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft,nproc_fft,me_fft 2272 real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat) 2273 real(dp),intent(in) :: weight_r(ndat), weight_i(ndat) 2274 real(dp),intent(inout) :: rho(nd1,nd2,nd3) 2275 2276 !Local variables------------------------------- 2277 !scalars 2278 integer,parameter :: unused0=0 2279 integer :: i,j,i1,ic1,ic2,ic3,idat,ierr,inzee,j3glob 2280 integer :: ioption,j2,j3,j2st,jp2st,lot,lzt,m1zt,ma,mb,n1dfft,nnd3 2281 integer :: m2eff,ncache,n1eff,jeff,includelast 2282 !arrays 2283 real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI 2284 real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions 2285 real(dp), allocatable :: zw(:,:,:) ! cache work array 2286 real(dp) :: tsec(2) 2287 ! FFT work arrays 2288 real(dp), allocatable, dimension(:,:) :: trig1,trig2,trig3 2289 integer, allocatable, dimension(:) :: after1,now1,before1, after2,now2,before2,after3,now3,before3 2290 2291 ! ************************************************************************* 2292 2293 !ioption=0 ! This was in the old version. 2294 ioption=1 ! This one is needed to be compatible with paral_kgb 2295 2296 !nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 2297 2298 ! find cache size that gives optimal performance on machine 2299 ncache=4*max(n1,n2,n3,1024) 2300 if (ncache/(4*max(n1,n2,n3)) < 1) then 2301 write(std_out,*) & 2302 & 'ncache has to be enlarged to be able to hold at', & 2303 & 'least one 1-d FFT of each size even though this will', & 2304 & 'reduce the performance for shorter transform lengths' 2305 ABI_ERROR("Aborting now") 2306 end if 2307 2308 !Effective m1 and m2 (complex-to-complex or real-to-complex) 2309 n1eff=n1; m2eff=m2 ; m1zt=n1 2310 if (icplexwf==1) then 2311 n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1) 2312 end if 2313 2314 lzt=m2eff 2315 if (mod(m2eff,2) == 0) lzt=lzt+1 2316 if (mod(m2eff,4) == 0) lzt=lzt+1 2317 2318 ! maximal number of big box 3rd dim slices for all procs 2319 nnd3=nd3proc*nproc_fft 2320 2321 ABI_MALLOC(trig1,(2,n1)) 2322 ABI_MALLOC(after1,(mdata)) 2323 ABI_MALLOC(now1,(mdata)) 2324 ABI_MALLOC(before1,(mdata)) 2325 ABI_MALLOC(trig2,(2,n2)) 2326 ABI_MALLOC(after2,(mdata)) 2327 ABI_MALLOC(now2,(mdata)) 2328 ABI_MALLOC(before2,(mdata)) 2329 ABI_MALLOC(trig3,(2,n3)) 2330 ABI_MALLOC(after3,(mdata)) 2331 ABI_MALLOC(now3,(mdata)) 2332 ABI_MALLOC(before3,(mdata)) 2333 2334 ABI_MALLOC(zw,(2,ncache/4,2)) 2335 ABI_MALLOC(zt,(2,lzt,m1zt)) 2336 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3)) 2337 if (nproc_fft > 1) then 2338 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3)) 2339 end if 2340 2341 call ctrig(n3,trig3,after3,before3,now3,1,ic3) 2342 call ctrig(n1,trig1,after1,before1,now1,1,ic1) 2343 call ctrig(n2,trig2,after2,before2,now2,1,ic2) 2344 2345 do idat=1,ndat 2346 ! transform along z axis 2347 ! input: I1,I3,J2,(Jp2) 2348 lot=ncache/(4*n3) 2349 2350 ! Loop over the y planes treated by this node and trasform n1ddft G_z lines. 2351 do j2=1,md2proc 2352 if (me_fft*md2proc+j2 <= m2eff) then ! MG REMOVED TO BE COSISTENT WITH BACK_WF 2353 do i1=1,m1,lot 2354 ma=i1 2355 mb=min(i1+(lot-1),m1) 2356 n1dfft=mb-ma+1 2357 2358 ! zero-pad n1dfft G_z lines 2359 ! input: G1,G3,G2,(Gp2) 2360 ! output: G1,R3,G2,(Gp2) 2361 call fill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw(1,1,1)) 2362 2363 ! Transform along z. 2364 inzee=1 2365 do i=1,ic3 2366 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & 2367 & trig3,after3(i),now3(i),before3(i),1) 2368 inzee=3-inzee 2369 end do 2370 2371 ! Local rotation. 2372 ! input: G1,R3,G2,(Gp2) 2373 ! output: G1,G2,R3,(Gp2) 2374 call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2) 2375 end do 2376 end if 2377 end do 2378 2379 ! Interprocessor data transposition 2380 ! input: G1,G2,R3,Rp3,(Gp2) 2381 ! output: G1,G2,R3,Gp2,(Rp3) 2382 if (nproc_fft > 1) then 2383 call timab(543,1,tsec) 2384 call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc, & 2385 & zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr) 2386 call timab(543,2,tsec) 2387 end if 2388 2389 ! Loop over the z treated by this node. 2390 do j3=1,nd3proc 2391 j3glob = j3 + me_fft*nd3proc 2392 !ABI_CHECK(j3glob <= n3, "j3glob") 2393 2394 if (me_fft*nd3proc+j3 <= n3) then 2395 Jp2st=1; J2st=1 2396 2397 lot=ncache/(4*n1) 2398 2399 ! Loop over G_y in the small box. 2400 do j=1,m2eff,lot 2401 ma=j 2402 mb=min(j+(lot-1),m2eff) 2403 n1dfft=mb-ma+1 2404 2405 ! Zero-pad input. 2406 ! input: G1,G2,R3,JG2,(Rp3) 2407 ! output: G2,G1,R3,JG2,(Rp3) 2408 2409 if (nproc_fft == 1) then 2410 call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,& 2411 & md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw(1,1,1),unused0, unused0,unused0) 2412 else 2413 call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,& 2414 & md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw(1,1,1), unused0,unused0,unused0) 2415 end if 2416 2417 ! Transform along x 2418 ! input: G2,G1,R3,(Rp3) 2419 ! output: G2,R1,R3,(Rp3) 2420 inzee=1 2421 do i=1,ic1-1 2422 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & 2423 & trig1,after1(i),now1(i),before1(i),1) 2424 inzee=3-inzee 2425 end do 2426 2427 i=ic1 2428 call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), & 2429 & trig1,after1(i),now1(i),before1(i),1) 2430 end do 2431 2432 ! Transform along y axis (take into account c2c or c2r case). 2433 ! Must loop over the full box. 2434 lot=ncache/(4*n2) 2435 if (icplexwf==1) then 2436 if (mod(lot,2).ne.0) lot=lot-1 ! needed to introduce jeff 2437 end if 2438 2439 do j=1,n1eff,lot 2440 ma=j 2441 mb=min(j+(lot-1),n1eff) 2442 n1dfft=mb-ma+1 2443 jeff=j 2444 includelast=1 2445 2446 if (icplexwf==1) then 2447 jeff=2*j-1 2448 includelast=1 2449 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 2450 end if 2451 2452 ! Zero-pad the input. 2453 ! input: G2,R1,R3,(Rp3) 2454 ! output: R1,G2,R3,(Rp3) 2455 if (icplexwf==2) then 2456 call switch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1)) 2457 else 2458 call switchreal_cent(includelast,n1dfft,max2,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1)) 2459 end if 2460 2461 inzee=1 2462 do i=1,ic2 2463 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & 2464 & trig2,after2(i),now2(i),before2(i),1) 2465 inzee=3-inzee 2466 end do 2467 2468 ! Accumulate 2469 call addrho(icplexwf,includelast,nd1,nd2,n2,lot,n1dfft,& 2470 & zw(1,1,inzee),rho(jeff,1,j3glob),weight_r(idat),weight_i(idat)) 2471 end do 2472 ! output: i1,i2,j3,(jp3) 2473 2474 end if 2475 end do ! j3 2476 end do ! idat 2477 2478 ABI_FREE(trig1) 2479 ABI_FREE(after1) 2480 ABI_FREE(now1) 2481 ABI_FREE(before1) 2482 ABI_FREE(trig2) 2483 ABI_FREE(after2) 2484 ABI_FREE(now2) 2485 ABI_FREE(before2) 2486 ABI_FREE(trig3) 2487 ABI_FREE(after3) 2488 ABI_FREE(now3) 2489 ABI_FREE(before3) 2490 2491 ABI_FREE(zmpi2) 2492 ABI_FREE(zw) 2493 ABI_FREE(zt) 2494 if (nproc_fft > 1) then 2495 ABI_FREE(zmpi1) 2496 end if 2497 2498 end subroutine sg2002_accrho
m_sg2002/sg2002_applypot [ Functions ]
[ Top ] [ m_sg2002 ] [ Functions ]
NAME
sg2002_applypot
FUNCTION
Applies the local real space potential to multiple wavefunctions in Fourier space
INPUTS
ZF: Wavefunction (input/output) (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) max1 is positive or zero ; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 i2 and i3 have a similar definition of range idat=1,ndat md1,md2,md3: Dimension of ZF (input as well as output), distributed on different procs md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc POT: Potential POT(cplex*i1,i2,i3) cplex=1 or 2 , i1=1,n1 , i2=1,n2 , i3=1,n3 nd1,nd2,nd3: dimension of pot comm_fft: MPI communicator nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG NOTES: PERFORMANCE CONSIDERATIONS: The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
1419 subroutine sg2002_applypot(icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,& 1420 & max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,& 1421 & max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,pot,zf) 1422 1423 implicit none 1424 1425 !Arguments ------------------------------------ 1426 integer,intent(in) :: icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc 1427 integer,intent(in) :: max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3 1428 integer,intent(in) :: max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft 1429 real(dp),intent(in) :: pot(cplex*nd1,nd2,nd3) 1430 real(dp),intent(inout) :: zf(2,md1,md3,md2proc,ndat) 1431 1432 !Local variables------------------------------- 1433 !scalars 1434 integer,parameter :: unused0=0 1435 integer :: i,j,i1,i2,i3,ic1,ic2,ic3,idat,ierr,inzee,j3glob 1436 integer :: ioption,j2,j3,lot,lzt,m1zt,ma,mb,n1dfft,nnd3 1437 integer :: m2eff,ncache,n1eff,i1inv,i2inv,i3inv,jeff,includelast,j2stb 1438 integer :: jx,j2stf,Jp2stb,Jp2stf,m2ieff,m2oeff 1439 !arrays 1440 real(dp) :: tsec(2) 1441 real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions 1442 real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI 1443 real(dp), allocatable :: zw(:,:,:) ! cache work array 1444 ! FFT work arrays 1445 real(dp), allocatable, dimension(:,:) :: btrig1,btrig2,btrig3 1446 real(dp), allocatable, dimension(:,:) :: ftrig1,ftrig2,ftrig3 1447 integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3 1448 1449 ! ************************************************************************* 1450 1451 !ioption=0 ! This was in the old version. 1452 ioption=1 ! This one is needed to be compatible with paral_kgb 1453 1454 ncache=4*max(n1,n2,n3,1024) 1455 if (ncache/(4*max(n1,n2,n3)) < 1) then 1456 write(std_out,*) & 1457 & 'ncache has to be enlarged to be able to hold at', & 1458 & 'least one 1-d FFT of each size even though this will', & 1459 & 'reduce the performance for shorter transform lengths' 1460 ABI_ERROR("Aborting now") 1461 end if 1462 1463 ! Effective m1 and m2 (complex-to-complex or real-to-complex) 1464 n1eff=n1; m2ieff=m2i; m2oeff=m2o; m1zt=n1 1465 if (icplexwf==1) then 1466 n1eff=(n1+1)/2; m2ieff=m2i/2+1; m2oeff=m2o/2+1; m1zt=2*(n1/2+1) 1467 end if 1468 1469 m2eff=max(m2ieff,m2oeff) 1470 lzt=m2eff 1471 if (mod(m2eff,2) == 0) lzt=lzt+1 1472 if (mod(m2eff,4) == 0) lzt=lzt+1 1473 1474 ! maximal number of big box 3rd dim slices for all procs 1475 nnd3=nd3proc*nproc_fft 1476 1477 ABI_MALLOC(btrig1,(2,n1)) 1478 ABI_MALLOC(ftrig1,(2,n1)) 1479 ABI_MALLOC(after1,(mdata)) 1480 ABI_MALLOC(now1,(mdata)) 1481 ABI_MALLOC(before1,(mdata)) 1482 ABI_MALLOC(btrig2,(2,n2)) 1483 ABI_MALLOC(ftrig2,(2,n2)) 1484 ABI_MALLOC(after2,(mdata)) 1485 ABI_MALLOC(now2,(mdata)) 1486 ABI_MALLOC(before2,(mdata)) 1487 ABI_MALLOC(btrig3,(2,n3)) 1488 ABI_MALLOC(ftrig3,(2,n3)) 1489 ABI_MALLOC(after3,(mdata)) 1490 ABI_MALLOC(now3,(mdata)) 1491 ABI_MALLOC(before3,(mdata)) 1492 1493 ABI_MALLOC(zw,(2,ncache/4,2)) 1494 ABI_MALLOC(zt,(2,lzt,m1zt)) 1495 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3)) 1496 if (nproc_fft > 1) then 1497 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3)) 1498 end if 1499 1500 call ctrig(n3,btrig3,after3,before3,now3,1,ic3) 1501 call ctrig(n1,btrig1,after1,before1,now1,1,ic1) 1502 call ctrig(n2,btrig2,after2,before2,now2,1,ic2) 1503 1504 do j=1,n1 1505 ftrig1(1,j)= btrig1(1,j) 1506 ftrig1(2,j)=-btrig1(2,j) 1507 end do 1508 do j=1,n2 1509 ftrig2(1,j)= btrig2(1,j) 1510 ftrig2(2,j)=-btrig2(2,j) 1511 end do 1512 do j=1,n3 1513 ftrig3(1,j)= btrig3(1,j) 1514 ftrig3(2,j)=-btrig3(2,j) 1515 end do 1516 1517 do idat=1,ndat 1518 ! 1519 ! transform along z axis 1520 ! input: G1,G3,G2,(Gp2) 1521 lot=ncache/(4*n3) 1522 do j2=1,md2proc 1523 if (me_fft*md2proc+j2 <= m2ieff) then 1524 do i1=1,m1i,lot 1525 ma=i1 1526 mb=min(i1+(lot-1),m1i) 1527 n1dfft=mb-ma+1 1528 1529 ! zero-pad n1dfft G_z lines 1530 ! input: G1,G3,G2,(Gp2) 1531 call fill_cent(md1,md3,lot,n1dfft,max3i,m3i,n3,zf(1,i1,1,j2,idat),zw(1,1,1)) 1532 1533 inzee=1 1534 do i=1,ic3 1535 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & 1536 & btrig3,after3(i),now3(i),before3(i),1) 1537 inzee=3-inzee 1538 end do 1539 1540 ! Local rotation. 1541 ! input: G1,R3,G2,(Gp2) 1542 ! output: G1,G2,R3,(Gp2) 1543 call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2) 1544 end do 1545 end if 1546 end do 1547 1548 ! Interprocessor data transposition 1549 ! input: G1,G2,R3,Rp3,(Gp2) 1550 ! output: G1,G2,R3,Gp2,(Rp3) 1551 if (nproc_fft > 1) then 1552 call timab(543,1,tsec) 1553 call xmpi_alltoall(zmpi2,2*md1*md2proc*nd3proc,& 1554 & zmpi1,2*md1*md2proc*nd3proc,comm_fft,ierr) 1555 call timab(543,2,tsec) 1556 end if 1557 1558 do j3=1,nd3proc 1559 j3glob = j3 + me_fft*nd3proc 1560 1561 if (me_fft*nd3proc+j3 <= n3) then 1562 Jp2stb=1; J2stb=1 1563 Jp2stf=1; J2stf=1 1564 1565 ! transform along x axis 1566 lot=ncache/(4*n1) 1567 1568 do j=1,m2ieff,lot 1569 ma=j 1570 mb=min(j+(lot-1),m2ieff) 1571 n1dfft=mb-ma+1 1572 1573 ! Zero-pad input. 1574 ! input: G1,G2,R3,G2,(Rp3) 1575 ! output: G2,G1,R3,G2,(Rp3) 1576 if (nproc_fft == 1) then 1577 call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,& 1578 & md2proc,nd3proc,nproc_fft,ioption,zmpi2,zw(1,1,1), unused0, unused0, unused0) 1579 else 1580 call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,& 1581 & md2proc,nd3proc,nproc_fft,ioption,zmpi1,zw(1,1,1), unused0, unused0, unused0) 1582 end if 1583 1584 ! Transform along x 1585 ! input: G2,G1,R3,(Rp3) 1586 ! output: G2,R1,R3,(Rp3) 1587 inzee=1 1588 do i=1,ic1-1 1589 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & 1590 & btrig1,after1(i),now1(i),before1(i),1) 1591 inzee=3-inzee 1592 end do 1593 1594 i=ic1 1595 call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), & 1596 & btrig1,after1(i),now1(i),before1(i),1) 1597 end do 1598 1599 ! Transform along y axis (take into account c2c or c2r case). 1600 ! Must loop over the full box. 1601 lot=ncache/(4*n2) 1602 1603 if (icplexwf==1) then 1604 if(mod(lot,2).ne.0)lot=lot-1 ! needed to introduce jeff 1605 end if 1606 1607 do j=1,n1eff,lot 1608 ma=j 1609 mb=min(j+(lot-1),n1eff) 1610 n1dfft=mb-ma+1 1611 jeff=j 1612 includelast=1 1613 1614 if (icplexwf==1) then 1615 jeff=2*j-1 1616 includelast=1 1617 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 1618 end if 1619 1620 ! Zero-pad the input. 1621 ! input: G2,R1,R3,(Rp3) 1622 ! output: R1,G2,R3,(Rp3) 1623 if (icplexwf==2) then 1624 call switch_cent(n1dfft,max2i,m2i,n2,lot,n1,lzt,zt(1,1,jeff),zw(1,1,1)) 1625 else 1626 call switchreal_cent(includelast,n1dfft,max2i,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1)) 1627 end if 1628 1629 ! input: R1,G2,R3,(Rp3) 1630 ! output: R1,R2,R3,(Rp3) 1631 inzee=1 1632 do i=1,ic2 1633 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & 1634 & btrig2,after2(i),now2(i),before2(i),1) 1635 inzee=3-inzee 1636 end do 1637 ! output: R1,R2,R3,(Rp3) 1638 1639 ! Multiply with potential in real space 1640 jx=cplex*(jeff-1)+1 1641 call multpot(icplexwf,cplex,includelast,nd1,nd2,n2,lot,n1dfft,pot(jx,1,j3glob),zw(1,1,inzee)) 1642 1643 ! TRANSFORM BACK IN FOURIER SPACE 1644 ! transform along y axis 1645 ! input: R1,R2,R3,(Rp3) 1646 do i=1,ic2 1647 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & 1648 & ftrig2,after2(i),now2(i),before2(i),-1) 1649 inzee=3-inzee 1650 end do 1651 1652 ! input: R1,G2,R3,(Rp3) 1653 ! output: G2,R1,R3,(Rp3) 1654 if (icplexwf==2) then 1655 call unswitch_cent(n1dfft,max2o,m2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff)) 1656 else 1657 call unswitchreal_cent(n1dfft,max2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff)) 1658 end if 1659 1660 end do ! j 1661 1662 ! transform along x axis 1663 ! input: R2,R1,R3,(Rp3) 1664 ! output: R2,G1,R3,(Rp3) 1665 lot=ncache/(4*n1) 1666 1667 do j=1,m2oeff,lot 1668 ma=j 1669 mb=min(j+(lot-1),m2oeff) 1670 n1dfft=mb-ma+1 1671 i=1 1672 call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), & 1673 & ftrig1,after1(i),now1(i),before1(i),-1) 1674 1675 inzee=1 1676 do i=2,ic1 1677 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & 1678 & ftrig1,after1(i),now1(i),before1(i),-1) 1679 inzee=3-inzee 1680 end do 1681 1682 ! input: G2,G1,R3,Gp2,(Rp3) 1683 ! output: G1,G2,R3,Gp2,(Rp3) 1684 if (nproc_fft == 1) then 1685 call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,& 1686 & md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi2) 1687 else 1688 call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,& 1689 & md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi1) 1690 end if 1691 end do ! j 1692 end if 1693 end do 1694 1695 ! Interprocessor data transposition 1696 ! input: G1,G2,R3,Gp2,(Rp3) 1697 ! output: G1,G2,R3,Rp3,(Gp2) 1698 if (nproc_fft > 1) then 1699 call timab(544,1,tsec) 1700 call xmpi_alltoall(zmpi1,2*md1*md2proc*nd3proc, & 1701 & zmpi2,2*md1*md2proc*nd3proc,comm_fft,ierr) 1702 call timab(544,2,tsec) 1703 end if 1704 1705 ! transform along z axis 1706 ! input: G1,G2,R3,(Gp2) 1707 lot=ncache/(4*n3) 1708 do j2=1,md2proc 1709 if (me_fft*md2proc+j2 <= m2oeff) then 1710 do i1=1,m1o,lot 1711 ma=i1 1712 mb=min(i1+(lot-1),m1o) 1713 n1dfft=mb-ma+1 1714 1715 ! input: G1,G2,R3,(Gp2) 1716 ! output: G1,R3,G2,(Gp2) 1717 call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2,zw(1,1,1)) 1718 1719 inzee=1 1720 do i=1,ic3 1721 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & 1722 & ftrig3,after3(i),now3(i),before3(i),-1) 1723 inzee=3-inzee 1724 end do 1725 1726 call unfill_cent(md1,md3,lot,n1dfft,max3o,m3o,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat)) 1727 ! output: G1,G3,G2,(Gp2) 1728 end do 1729 end if 1730 end do 1731 1732 ! Complete missing values with complex conjugate 1733 ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1. 1734 if (icplexwf==1) then 1735 do i3=1,m3o 1736 i3inv=m3o+2-i3 1737 if (i3==1) i3inv=1 1738 if (m2oeff>1)then 1739 do i2=2,m2oeff 1740 i2inv=m2o+2-i2 1741 zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat) 1742 zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat) 1743 do i1=2,m1o 1744 i1inv=m1o+2-i1 1745 zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat) 1746 zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat) 1747 end do 1748 end do 1749 end if 1750 end do 1751 end if 1752 1753 end do ! idat 1754 1755 ABI_FREE(btrig1) 1756 ABI_FREE(ftrig1) 1757 ABI_FREE(after1) 1758 ABI_FREE(now1) 1759 ABI_FREE(before1) 1760 ABI_FREE(btrig2) 1761 ABI_FREE(ftrig2) 1762 ABI_FREE(after2) 1763 ABI_FREE(now2) 1764 ABI_FREE(before2) 1765 ABI_FREE(btrig3) 1766 ABI_FREE(ftrig3) 1767 ABI_FREE(after3) 1768 ABI_FREE(now3) 1769 ABI_FREE(before3) 1770 1771 ABI_FREE(zmpi2) 1772 ABI_FREE(zw) 1773 ABI_FREE(zt) 1774 if (nproc_fft > 1) then 1775 ABI_FREE(zmpi1) 1776 end if 1777 1778 end subroutine sg2002_applypot
m_sg2002/sg2002_applypot_many [ Functions ]
[ Top ] [ m_sg2002 ] [ Functions ]
NAME
sg2002_applypot_many
FUNCTION
Applies the local real space potential to multiple wavefunctions in Fourier space
INPUTS
ZF: Wavefunction (input/output) (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) max1 is positive or zero ; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 i2 and i3 have a similar definition of range idat=1,ndat md1,md2,md3: Dimension of ZF (input as well as output), distributed on different procs md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc POT: Potential POT(cplex*i1,i2,i3) cplex=1 or 2 , i1=1,n1 , i2=1,n2 , i3=1,n3 nd1,nd2,nd3: dimension of pot comm_fft: MPI communicator nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG NOTES: PERFORMANCE CONSIDERATIONS: The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
1833 subroutine sg2002_applypot_many(icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc,& 1834 & max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3,& 1835 & max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft,pot,zf) 1836 1837 implicit none 1838 1839 !Arguments ------------------------------------ 1840 integer,intent(in) :: icplexwf,cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd3proc 1841 integer,intent(in) :: max1i,max2i,max3i,m1i,m2i,m3i,md1,md2proc,md3 1842 integer,intent(in) :: max1o,max2o,max3o,m1o,m2o,m3o,comm_fft,nproc_fft,me_fft 1843 real(dp),intent(in) :: pot(cplex*nd1,nd2,nd3) 1844 real(dp),intent(inout) :: zf(2,md1,md3,md2proc,ndat) 1845 1846 !Local variables------------------------------- 1847 !scalars 1848 integer,parameter :: unused0=0 1849 integer :: i,j,i1,i2,i3,ic1,ic2,ic3,idat,ierr,inzee,j3glob 1850 integer :: ioption,j2,j3,lot,lzt,m1zt,ma,mb,n1dfft,nnd3 1851 integer :: m2eff,ncache,n1eff,i1inv,i2inv,i3inv,jeff,includelast,j2stb 1852 integer :: jx,j2stf,Jp2stb,Jp2stf,m2ieff,m2oeff 1853 !arrays 1854 integer :: requests(ndat) 1855 real(dp) :: tsec(2) 1856 real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions 1857 real(dp), allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI 1858 real(dp), allocatable :: zw(:,:,:) ! cache work array 1859 ! FFT work arrays 1860 real(dp), allocatable, dimension(:,:) :: btrig1,btrig2,btrig3 1861 real(dp), allocatable, dimension(:,:) :: ftrig1,ftrig2,ftrig3 1862 integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3 1863 1864 ! ************************************************************************* 1865 1866 !ioption=0 ! This was in the old version. 1867 ioption=1 ! This one is needed to be compatible with paral_kgb 1868 1869 ! call timab(541,1,tsec) 1870 ncache=4*max(n1,n2,n3,1024) 1871 if (ncache/(4*max(n1,n2,n3)) < 1) then 1872 write(std_out,*) & 1873 & 'ncache has to be enlarged to be able to hold at', & 1874 & 'least one 1-d FFT of each size even though this will', & 1875 & 'reduce the performance for shorter transform lengths' 1876 ABI_ERROR("Aborting now") 1877 end if 1878 1879 ! Effective m1 and m2 (complex-to-complex or real-to-complex) 1880 n1eff=n1; m2ieff=m2i; m2oeff=m2o; m1zt=n1 1881 if (icplexwf==1) then 1882 n1eff=(n1+1)/2; m2ieff=m2i/2+1; m2oeff=m2o/2+1; m1zt=2*(n1/2+1) 1883 end if 1884 1885 m2eff=max(m2ieff,m2oeff) 1886 lzt=m2eff 1887 if (mod(m2eff,2) == 0) lzt=lzt+1 1888 if (mod(m2eff,4) == 0) lzt=lzt+1 1889 1890 ! maximal number of big box 3rd dim slices for all procs 1891 nnd3=nd3proc*nproc_fft 1892 1893 ABI_MALLOC(btrig1,(2,n1)) 1894 ABI_MALLOC(ftrig1,(2,n1)) 1895 ABI_MALLOC(after1,(mdata)) 1896 ABI_MALLOC(now1,(mdata)) 1897 ABI_MALLOC(before1,(mdata)) 1898 ABI_MALLOC(btrig2,(2,n2)) 1899 ABI_MALLOC(ftrig2,(2,n2)) 1900 ABI_MALLOC(after2,(mdata)) 1901 ABI_MALLOC(now2,(mdata)) 1902 ABI_MALLOC(before2,(mdata)) 1903 ABI_MALLOC(btrig3,(2,n3)) 1904 ABI_MALLOC(ftrig3,(2,n3)) 1905 ABI_MALLOC(after3,(mdata)) 1906 ABI_MALLOC(now3,(mdata)) 1907 ABI_MALLOC(before3,(mdata)) 1908 1909 ABI_MALLOC(zw,(2,ncache/4,2)) 1910 ABI_MALLOC(zt,(2,lzt,m1zt)) 1911 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat)) 1912 if (nproc_fft > 1) then 1913 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat)) 1914 end if 1915 1916 call ctrig(n3,btrig3,after3,before3,now3,1,ic3) 1917 call ctrig(n1,btrig1,after1,before1,now1,1,ic1) 1918 call ctrig(n2,btrig2,after2,before2,now2,1,ic2) 1919 1920 do j=1,n1 1921 ftrig1(1,j)= btrig1(1,j) 1922 ftrig1(2,j)=-btrig1(2,j) 1923 end do 1924 do j=1,n2 1925 ftrig2(1,j)= btrig2(1,j) 1926 ftrig2(2,j)=-btrig2(2,j) 1927 end do 1928 do j=1,n3 1929 ftrig3(1,j)= btrig3(1,j) 1930 ftrig3(2,j)=-btrig3(2,j) 1931 end do 1932 1933 ! Here we take advantage of non-blocking IALLTOALL: 1934 ! Perform the first step of MPI-FFT for ndat wavefunctions. 1935 do idat=1,ndat 1936 1937 ! 1938 ! transform along z axis 1939 ! input: G1,G3,G2,(Gp2) 1940 lot=ncache/(4*n3) 1941 do j2=1,md2proc 1942 if (me_fft*md2proc+j2 <= m2ieff) then 1943 do i1=1,m1i,lot 1944 ma=i1 1945 mb=min(i1+(lot-1),m1i) 1946 n1dfft=mb-ma+1 1947 1948 ! zero-pad n1dfft G_z lines 1949 ! input: G1,G3,G2,(Gp2) 1950 call fill_cent(md1,md3,lot,n1dfft,max3i,m3i,n3,zf(1,i1,1,j2,idat),zw(1,1,1)) 1951 1952 inzee=1 1953 do i=1,ic3 1954 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & 1955 & btrig3,after3(i),now3(i),before3(i),1) 1956 inzee=3-inzee 1957 end do 1958 1959 ! Local rotation. 1960 ! input: G1,R3,G2,(Gp2) 1961 ! output: G1,G2,R3,(Gp2) 1962 call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2(:,:,:,:,idat)) 1963 end do 1964 end if 1965 end do 1966 1967 ! Interprocessor data transposition 1968 ! input: G1,G2,R3,Rp3,(Gp2) 1969 ! output: G1,G2,R3,Gp2,(Rp3) 1970 if (nproc_fft > 1) then 1971 call timab(543,1,tsec) 1972 call xmpi_ialltoall(zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,& 1973 & zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat)) 1974 call timab(543,2,tsec) 1975 end if 1976 end do ! idat 1977 1978 ! The second step of MPI-FFT 1979 do idat=1,ndat 1980 ! Make sure communication is completed. 1981 if (nproc_fft>1) call xmpi_wait(requests(idat),ierr) 1982 1983 do j3=1,nd3proc 1984 j3glob = j3 + me_fft*nd3proc 1985 1986 if (me_fft*nd3proc+j3 <= n3) then 1987 Jp2stb=1; J2stb=1 1988 Jp2stf=1; J2stf=1 1989 1990 ! transform along x axis 1991 lot=ncache/(4*n1) 1992 1993 do j=1,m2ieff,lot 1994 ma=j 1995 mb=min(j+(lot-1),m2ieff) 1996 n1dfft=mb-ma+1 1997 1998 ! Zero-pad input. 1999 ! input: G1,G2,R3,G2,(Rp3) 2000 ! output: G2,G1,R3,G2,(Rp3) 2001 if (nproc_fft == 1) then 2002 call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,& 2003 & md2proc,nd3proc,nproc_fft,ioption,zmpi2(:,:,:,:,idat),zw(1,1,1), unused0, unused0, unused0) 2004 else 2005 call mpiswitch_cent(j3,n1dfft,Jp2stb,J2stb,lot,max1i,md1,m1i,n1,& 2006 & md2proc,nd3proc,nproc_fft,ioption,zmpi1(:,:,:,:,idat),zw(1,1,1), unused0, unused0, unused0) 2007 end if 2008 2009 ! Transform along x 2010 ! input: G2,G1,R3,(Rp3) 2011 ! output: G2,R1,R3,(Rp3) 2012 inzee=1 2013 do i=1,ic1-1 2014 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & 2015 & btrig1,after1(i),now1(i),before1(i),1) 2016 inzee=3-inzee 2017 end do 2018 2019 i=ic1 2020 call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), & 2021 & btrig1,after1(i),now1(i),before1(i),1) 2022 end do 2023 2024 ! Transform along y axis (take into account c2c or c2r case). 2025 ! Must loop over the full box. 2026 lot=ncache/(4*n2) 2027 2028 if (icplexwf==1) then 2029 if(mod(lot,2).ne.0)lot=lot-1 ! needed to introduce jeff 2030 end if 2031 2032 do j=1,n1eff,lot 2033 ma=j 2034 mb=min(j+(lot-1),n1eff) 2035 n1dfft=mb-ma+1 2036 jeff=j 2037 includelast=1 2038 2039 if (icplexwf==1) then 2040 jeff=2*j-1 2041 includelast=1 2042 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 2043 end if 2044 2045 ! Zero-pad the input. 2046 ! input: G2,R1,R3,(Rp3) 2047 ! output: R1,G2,R3,(Rp3) 2048 if (icplexwf==2) then 2049 call switch_cent(n1dfft,max2i,m2i,n2,lot,n1,lzt,zt(1,1,jeff),zw(1,1,1)) 2050 else 2051 call switchreal_cent(includelast,n1dfft,max2i,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1)) 2052 end if 2053 2054 ! input: R1,G2,R3,(Rp3) 2055 ! output: R1,R2,R3,(Rp3) 2056 inzee=1 2057 do i=1,ic2 2058 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & 2059 & btrig2,after2(i),now2(i),before2(i),1) 2060 inzee=3-inzee 2061 end do 2062 ! output: R1,R2,R3,(Rp3) 2063 2064 ! Multiply with potential in real space 2065 jx=cplex*(jeff-1)+1 2066 call multpot(icplexwf,cplex,includelast,nd1,nd2,n2,lot,n1dfft,pot(jx,1,j3glob),zw(1,1,inzee)) 2067 2068 ! TRANSFORM BACK IN FOURIER SPACE 2069 ! transform along y axis 2070 ! input: R1,R2,R3,(Rp3) 2071 do i=1,ic2 2072 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & 2073 & ftrig2,after2(i),now2(i),before2(i),-1) 2074 inzee=3-inzee 2075 end do 2076 2077 ! input: R1,G2,R3,(Rp3) 2078 ! output: G2,R1,R3,(Rp3) 2079 if (icplexwf==2) then 2080 call unswitch_cent(n1dfft,max2o,m2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff)) 2081 else 2082 call unswitchreal_cent(n1dfft,max2o,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,jeff)) 2083 end if 2084 2085 end do ! j 2086 2087 ! transform along x axis 2088 ! input: R2,R1,R3,(Rp3) 2089 ! output: R2,G1,R3,(Rp3) 2090 lot=ncache/(4*n1) 2091 2092 do j=1,m2oeff,lot 2093 ma=j 2094 mb=min(j+(lot-1),m2oeff) 2095 n1dfft=mb-ma+1 2096 i=1 2097 call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), & 2098 & ftrig1,after1(i),now1(i),before1(i),-1) 2099 2100 inzee=1 2101 do i=2,ic1 2102 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & 2103 & ftrig1,after1(i),now1(i),before1(i),-1) 2104 inzee=3-inzee 2105 end do 2106 2107 ! input: G2,G1,R3,Gp2,(Rp3) 2108 ! output: G1,G2,R3,Gp2,(Rp3) 2109 if (nproc_fft == 1) then 2110 call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,& 2111 & md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi2(:,:,:,:,idat)) 2112 else 2113 call unmpiswitch_cent(j3,n1dfft,Jp2stf,J2stf,lot,max1o,md1,m1o,n1,& 2114 & md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi1(:,:,:,:,idat)) 2115 end if 2116 end do ! j 2117 end if 2118 end do 2119 2120 ! Interprocessor data transposition 2121 ! input: G1,G2,R3,Gp2,(Rp3) 2122 ! output: G1,G2,R3,Rp3,(Gp2) 2123 if (nproc_fft > 1) then 2124 call timab(544,1,tsec) 2125 call xmpi_ialltoall(zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc, & 2126 & zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,requests(idat)) 2127 call timab(544,2,tsec) 2128 end if 2129 end do ! idat 2130 2131 do idat=1,ndat 2132 if (nproc_fft>1) call xmpi_wait(requests(idat),ierr) 2133 2134 ! transform along z axis 2135 ! input: G1,G2,R3,(Gp2) 2136 lot=ncache/(4*n3) 2137 do j2=1,md2proc 2138 if (me_fft*md2proc+j2 <= m2oeff) then 2139 do i1=1,m1o,lot 2140 ma=i1 2141 mb=min(i1+(lot-1),m1o) 2142 n1dfft=mb-ma+1 2143 2144 ! input: G1,G2,R3,(Gp2) 2145 ! output: G1,R3,G2,(Gp2) 2146 call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2(:,:,:,:,idat),zw(1,1,1)) 2147 2148 inzee=1 2149 do i=1,ic3 2150 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & 2151 & ftrig3,after3(i),now3(i),before3(i),-1) 2152 inzee=3-inzee 2153 end do 2154 2155 call unfill_cent(md1,md3,lot,n1dfft,max3o,m3o,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat)) 2156 ! output: G1,G3,G2,(Gp2) 2157 end do 2158 end if 2159 end do 2160 2161 ! Complete missing values with complex conjugate 2162 ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1. 2163 if (icplexwf==1) then 2164 do i3=1,m3o 2165 i3inv=m3o+2-i3 2166 if (i3==1) i3inv=1 2167 if (m2oeff>1)then 2168 do i2=2,m2oeff 2169 i2inv=m2o+2-i2 2170 zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat) 2171 zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat) 2172 do i1=2,m1o 2173 i1inv=m1o+2-i1 2174 zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat) 2175 zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat) 2176 end do 2177 end do 2178 end if 2179 end do 2180 end if 2181 2182 end do ! idat 2183 2184 ABI_FREE(btrig1) 2185 ABI_FREE(ftrig1) 2186 ABI_FREE(after1) 2187 ABI_FREE(now1) 2188 ABI_FREE(before1) 2189 ABI_FREE(btrig2) 2190 ABI_FREE(ftrig2) 2191 ABI_FREE(after2) 2192 ABI_FREE(now2) 2193 ABI_FREE(before2) 2194 ABI_FREE(btrig3) 2195 ABI_FREE(ftrig3) 2196 ABI_FREE(after3) 2197 ABI_FREE(now3) 2198 ABI_FREE(before3) 2199 2200 ABI_FREE(zmpi2) 2201 ABI_FREE(zw) 2202 ABI_FREE(zt) 2203 if (nproc_fft > 1) then 2204 ABI_FREE(zmpi1) 2205 end if 2206 2207 end subroutine sg2002_applypot_many
m_sg2002/sg2002_back [ Functions ]
[ Top ] [ m_sg2002 ] [ Functions ]
NAME
sg2002_back
FUNCTION
CALCULATES THE DISCRETE FOURIER TRANSFORM in parallel using MPI/OpenMP ZR(I1,I2,I3)= \sum_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2) Adopt standard convention that isign=1 for backward transform INPUTS: cplex=1 for real --> complex, 2 for complex --> complex ZF: input array in G-space (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat OUTPUTS: ZR: output array in R space. ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Dimension of ZF and ZR nd2proc=((nd2-1)/nproc_fft)+1 maximal number of 2nd dim slices nd3proc=((nd3-1)/nproc_fft)+1 maximal number of 3rd dim slices NOTES: The maximum number of processors that can reasonably be used is max(n2,n3) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
116 subroutine sg2002_back(cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,zf,zr,comm_fft) 117 118 implicit none 119 120 !Arguments ------------------------------------ 121 ! real space input 122 integer,intent(in) :: cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,comm_fft 123 real(dp),intent(in) :: zf(2,nd1,nd3,nd2proc,ndat) 124 real(dp),intent(out) :: zr(2,nd1eff,nd2,nd3proc,ndat) 125 126 !Local variables------------------------------- 127 !scalars 128 integer :: i,j,i1,ic1,ic2,ic3,idat,ierr,includelast,inzee,j2,j2st,j3,jeff,jp2st,lot,lzt 129 integer :: ma,mb,n1dfft,n1eff,n2eff,n1zt,ncache,nnd3,nproc_fft,me_fft 130 character(len=500) :: msg 131 !arrays 132 real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions 133 real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI 134 real(dp), allocatable :: zw(:,:,:) ! cache work array 135 real(dp) :: tsec(2) 136 ! FFT work arrays 137 real(dp), allocatable, dimension(:,:) :: trig1,trig2,trig3 138 integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3 139 140 ! ************************************************************************* 141 142 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 143 144 ! find cache size that gives optimal performance on machine 145 ncache=4*max(n1,n2,n3,1024) 146 147 if (ncache/(4*max(n1,n2,n3))<1) then 148 write(msg,'(5a)') & 149 & 'ncache has to be enlarged to be able to hold at',ch10, & 150 & 'least one 1-d FFT of each size even though this will',ch10,& 151 & 'reduce the performance for shorter transform lengths' 152 ABI_ERROR(msg) 153 end if 154 155 ! check input 156 if (nd1<n1 .or. nd2<n2 .or. nd3<n3) then 157 ABI_ERROR("nd1<n1 .or. nd2<n2 .or. nd3<n3") 158 end if 159 160 ! Effective n1 and n2 (complex-to-complex or real-to-complex) 161 n1eff=n1; n2eff=n2; n1zt=n1 162 if (cplex==1) then 163 n1eff=(n1+1)/2 ; n2eff=n2/2+1 ; n1zt=2*(n1/2+1) 164 end if 165 166 lzt=n2eff 167 if (mod(n2eff,2) == 0) lzt=lzt+1 168 if (mod(n2eff,4) == 0) lzt=lzt+1 169 170 ! maximal number of big box 3rd dim slices for all procs 171 nnd3=nd3proc*nproc_fft 172 173 ABI_MALLOC(trig1,(2,n1)) 174 ABI_MALLOC(after1,(mdata)) 175 ABI_MALLOC(now1,(mdata)) 176 ABI_MALLOC(before1,(mdata)) 177 ABI_MALLOC(trig2,(2,n2)) 178 ABI_MALLOC(after2,(mdata)) 179 ABI_MALLOC(now2,(mdata)) 180 ABI_MALLOC(before2,(mdata)) 181 ABI_MALLOC(trig3,(2,n3)) 182 ABI_MALLOC(after3,(mdata)) 183 ABI_MALLOC(now3,(mdata)) 184 ABI_MALLOC(before3,(mdata)) 185 ABI_MALLOC(zw,(2,ncache/4,2)) 186 ABI_MALLOC(zt,(2,lzt,n1zt)) 187 ABI_MALLOC(zmpi2,(2,n1,nd2proc,nnd3)) 188 if (nproc_fft>1) then 189 ABI_MALLOC(zmpi1,(2,n1,nd2proc,nnd3)) 190 end if 191 192 call ctrig(n3,trig3,after3,before3,now3,1,ic3) 193 call ctrig(n1,trig1,after1,before1,now1,1,ic1) 194 call ctrig(n2,trig2,after2,before2,now2,1,ic2) 195 196 !DEBUG 197 ! write(std_out,'(a,3i4)' )'sg2002_back,zf n1,n2,n3',n1,n2,n3 198 ! write(std_out,'(a,3i4)' )'nd1,nd2,nd3proc',nd1,nd2,nd3proc 199 ! write(std_out,'(a,3i4)' )'m1,m2,m3',m1,m2,m3 200 ! write(std_out,'(a,3i4)' )'max1,max2,max3',max1,max2,max3 201 ! write(std_out,'(a,3i4)' )'md1,md2proc,md3',md1,md2proc,md3 202 ! write(std_out,'(a,3i4)' )'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt 203 !ENDDEBUG 204 205 do idat=1,ndat 206 ! transform along z axis 207 ! input: I1,I3,J2,(Jp2) 208 lot=ncache/(4*n3) 209 210 do j2=1,nd2proc 211 if (me_fft*nd2proc+j2 <= n2eff) then 212 213 do i1=1,n1,lot 214 ma=i1 215 mb=min(i1+(lot-1),n1) 216 n1dfft=mb-ma+1 217 218 ! input: G1,G3,G2,(Gp2) 219 call fill(nd1,nd3,lot,n1dfft,n3,zf(1,i1,1,j2,idat),zw(1,1,1)) 220 221 inzee=1 222 do i=1,ic3 223 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & 224 & trig3,after3(i),now3(i),before3(i),1) 225 inzee=3-inzee 226 end do 227 228 ! input: G1,R3,G2,(Gp2) 229 ! output: G1,G2,R3,(Gp2) 230 call scramble(i1,j2,lot,n1dfft,n1,n3,nd2proc,nd3,zw(1,1,inzee),zmpi2) 231 end do 232 end if 233 end do 234 235 ! Interprocessor data transposition 236 ! input: G1,G2,R3,Rp3,(Gp2) 237 ! output: G1,G2,G3,Gp2,(Rp3) 238 if (nproc_fft>1) then 239 call timab(543,1,tsec) 240 call xmpi_alltoall(zmpi2,2*n1*nd2proc*nd3proc, & 241 & zmpi1,2*n1*nd2proc*nd3proc,comm_fft,ierr) 242 call timab(543,2,tsec) 243 end if 244 245 do j3=1,nd3proc 246 if (me_fft*nd3proc+j3 <= n3) then 247 Jp2st=1 248 J2st=1 249 250 ! transform along x axis 251 lot=ncache/(4*n1) 252 253 do j=1,n2eff,lot 254 ma=j 255 mb=min(j+(lot-1),n2eff) 256 n1dfft=mb-ma+1 257 258 ! input: G1,G2,R3,Gp2,(Rp3) 259 ! output: G2,G1,R3,Jp2,(Rp3) 260 if (nproc_fft == 1) then 261 call mpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zmpi2,zw(1,1,1)) 262 else 263 call mpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zmpi1,zw(1,1,1)) 264 end if 265 266 ! input: G2,G1,R3,(Rp3) 267 ! output: G2,R1,R3,(Rp3) 268 inzee=1 269 do i=1,ic1-1 270 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & 271 & trig1,after1(i),now1(i),before1(i),1) 272 inzee=3-inzee 273 end do 274 275 i=ic1 276 call fftstp(lot,n1dfft,n1,lzt,n1zt,zw(1,1,inzee),zt(1,j,1), & 277 & trig1,after1(i),now1(i),before1(i),1) 278 end do 279 280 ! transform along y axis 281 lot=ncache/(4*n2) 282 283 do j=1,n1eff,lot 284 ma=j 285 mb=min(j+(lot-1),n1eff) 286 n1dfft=mb-ma+1 287 includelast=1 288 289 if (cplex==1) then 290 jeff=2*j-1 291 includelast=1 292 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 293 end if 294 295 ! input: G2,R1,R3,(Rp3) 296 ! output: R1,G2,R3,(Rp3) 297 if (cplex==2) then 298 call switch(n1dfft,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1)) 299 else 300 call switchreal(includelast,n1dfft,n2,n2eff,lot,n1zt,lzt,zt(1,1,jeff),zw(1,1,1)) 301 end if 302 303 inzee=1 304 do i=1,ic2-1 305 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & 306 & trig2,after2(i),now2(i),before2(i),1) 307 inzee=3-inzee 308 end do 309 310 i=ic2 311 call fftstp(lot,n1dfft,n2,nd1eff,nd2,zw(1,1,inzee),zr(1,j,1,j3,idat), & 312 & trig2,after2(i),now2(i),before2(i),1) 313 314 end do 315 ! output: R1,R2,R3,(Rp3) 316 317 end if 318 end do 319 end do ! idat 320 321 ABI_FREE(trig1) 322 ABI_FREE(after1) 323 ABI_FREE(now1) 324 ABI_FREE(before1) 325 ABI_FREE(trig2) 326 ABI_FREE(after2) 327 ABI_FREE(now2) 328 ABI_FREE(before2) 329 ABI_FREE(trig3) 330 ABI_FREE(after3) 331 ABI_FREE(now3) 332 ABI_FREE(before3) 333 ABI_FREE(zmpi2) 334 ABI_FREE(zw) 335 ABI_FREE(zt) 336 if (nproc_fft>1) then 337 ABI_FREE(zmpi1) 338 end if 339 340 end subroutine sg2002_back
m_sg2002/sg2002_forw [ Functions ]
[ Top ] [ m_sg2002 ] [ Functions ]
NAME
sg2002_forw
FUNCTION
Adopt standard convention that isign=-1 for forward transform CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)= S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3) in parallel using MPI/OpenMP and BLAS library calls.
INPUTS
ZR: input array ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat OUTPUTS ZF: output array (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc_fft-1] number of processor as returned by MPI_COMM_RANK n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Dimension of ZR and ZF nd2proc=((nd2-1)/nproc_fft)+1 maximal number of 2nd dim slices nd3proc=((nd3-1)/nproc_fft)+1 maximal number of 3rd dim slices
NOTES
SHOULD describe nd1eff SHOULD put cplex and nd1eff in OMP declarations SHOULD describe the change of value of nd2prod The maximum number of processors that can reasonably be used is max(n2,n3) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
395 subroutine sg2002_forw(cplex,ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option,zr,zf,comm_fft) 396 397 implicit none 398 399 !Arguments ------------------------------------ 400 !scalars 401 integer,intent(in) :: cplex,comm_fft 402 integer,intent(in) :: ndat,n1,n2,n3,nd1,nd2,nd3,nd1eff,nd2proc,nd3proc,option 403 !arrays 404 real(dp),intent(in) :: zr(2,nd1eff,nd2,nd3proc,ndat) 405 real(dp),intent(out) :: zf(2,nd1,nd3,nd2proc,ndat) 406 407 !Local variables------------------------------- 408 !scalars 409 integer :: i,j,i1,ic1,ic2,ic3,idat,ierr,inzee,j2,j2st,j3,jp2st,lot,lzt 410 integer :: ma,mb,n1dfft,n1eff,n2eff,n1zt,ncache,nnd3,nproc_fft,me_fft 411 character(len=500) :: msg 412 !arrays 413 real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions 414 real(dp), allocatable :: zmpi1(:,:,:,:),zmpi2(:,:,:,:) ! work arrays for MPI 415 real(dp), allocatable :: zw(:,:,:) ! cache work array 416 real(dp) :: tsec(2) 417 ! FFT work arrays 418 real(dp), allocatable, dimension(:,:) :: trig1,trig2,trig3 419 integer, allocatable, dimension(:) :: after1,now1,before1,after2,now2,before2,after3,now3,before3 420 421 ! ************************************************************************* 422 423 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 424 425 ! find cache size that gives optimal performance on machine 426 ncache=4*max(n1,n2,n3,1024) 427 if (ncache/(4*max(n1,n2,n3))<1) then 428 write(msg,'(5a)')& 429 & 'ncache has to be enlarged to be able to hold at',ch10, & 430 & 'least one 1-d FFT of each size even though this will',ch10,& 431 & 'reduce the performance for shorter transform lengths' 432 ABI_ERROR(msg) 433 end if 434 435 ! check input 436 if (nd1<n1 .or. nd2<n2 .or. nd3<n3) then 437 ABI_ERROR("nd1<n1 .or. nd2<n2 .or. nd3<n3") 438 end if 439 440 !Effective n1 and n2 (complex-to-complex or real-to-complex) 441 n1eff=n1; n2eff=n2; n1zt=n1 442 if (cplex==1) then 443 n1eff=(n1+1)/2; n2eff=n2/2+1; n1zt=2*(n1/2+1) 444 end if 445 446 lzt=n2eff 447 if (mod(n2eff,2) == 0) lzt=lzt+1 448 if (mod(n2eff,4) == 0) lzt=lzt+1 449 450 ! maximal number of big box 3rd dim slices for all procs 451 nnd3=nd3proc*nproc_fft 452 453 ABI_MALLOC(trig1,(2,n1)) 454 ABI_MALLOC(after1,(mdata)) 455 ABI_MALLOC(now1,(mdata)) 456 ABI_MALLOC(before1,(mdata)) 457 ABI_MALLOC(trig2,(2,n2)) 458 ABI_MALLOC(after2,(mdata)) 459 ABI_MALLOC(now2,(mdata)) 460 ABI_MALLOC(before2,(mdata)) 461 ABI_MALLOC(trig3,(2,n3)) 462 ABI_MALLOC(after3,(mdata)) 463 ABI_MALLOC(now3,(mdata)) 464 ABI_MALLOC(before3,(mdata)) 465 ABI_MALLOC(zw,(2,ncache/4,2)) 466 ABI_MALLOC(zt,(2,lzt,n1zt)) 467 ABI_MALLOC(zmpi2,(2,n1,nd2proc,nnd3)) 468 if (nproc_fft>1) then 469 ABI_MALLOC(zmpi1,(2,n1,nd2proc,nnd3)) 470 end if 471 472 call ctrig(n2,trig2,after2,before2,now2,-1,ic2) 473 call ctrig(n1,trig1,after1,before1,now1,-1,ic1) 474 call ctrig(n3,trig3,after3,before3,now3,-1,ic3) 475 476 do idat=1,ndat 477 do j3=1,nd3proc 478 if (me_fft*(nd3proc)+j3 <= n3) then 479 Jp2st=1; J2st=1 480 481 ! transform along y axis 482 ! input: R1,R2,R3,(Rp3) 483 lot=ncache/(4*n2) 484 485 do j=1,n1eff,lot 486 ma=j 487 mb=min(j+(lot-1),n1eff) 488 n1dfft=mb-ma+1 489 i=1 490 call fftstp(nd1eff,n1dfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), & 491 & trig2,after2(i),now2(i),before2(i),-1) 492 493 inzee=1 494 do i=2,ic2 495 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & 496 & trig2,after2(i),now2(i),before2(i),-1) 497 inzee=3-inzee 498 end do 499 500 ! input: R1,G2,R3,(Rp3) 501 ! output: G2,R1,R3,(Rp3) 502 if(cplex==2)then 503 call unswitch(n1dfft,n2,lot,n1zt,lzt,zw(1,1,inzee),zt(1,1,j)) 504 else 505 call unswitchreal(n1dfft,n2,n2eff,lot,n1zt,lzt,zw(1,1,inzee),zt(1,1,2*j-1)) 506 end if 507 end do 508 509 ! transform along x axis 510 ! input: G2,R1,R3,(Rp3) 511 lot=ncache/(4*n1) 512 513 do j=1,n2eff,lot 514 ma=j 515 mb=min(j+(lot-1),n2eff) 516 n1dfft=mb-ma+1 517 518 i=1 519 call fftstp(lzt,n1dfft,n1zt,lot,n1,zt(1,j,1),zw(1,1,1), & 520 & trig1,after1(i),now1(i),before1(i),-1) 521 522 inzee=1 523 do i=2,ic1 524 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & 525 & trig1,after1(i),now1(i),before1(i),-1) 526 inzee=3-inzee 527 end do 528 ! output: G2,G1,R3,(Rp3) 529 530 ! input: G2,G1,R3,Gp2,(Rp3) 531 ! output: G1,G2,R3,Gp2,(Rp3) 532 ! write(std_out,*) 'J2st,Jp2st',J2st,Jp2st 533 if (nproc_fft == 1) then 534 call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zw(1,1,inzee),zmpi2) 535 else 536 call unmpiswitch(j3,n1dfft,Jp2st,J2st,lot,n1,nd2proc,nd3proc,nproc_fft,option,zw(1,1,inzee),zmpi1) 537 end if 538 end do 539 540 end if 541 end do ! j3 542 543 ! Interprocessor data transposition 544 ! input: G1,G2,R3,Gp2,(Rp3) 545 ! output: G1,G2,R3,Rp3,(Gp2) 546 if (nproc_fft>1) then 547 call timab(544,1,tsec) 548 call xmpi_alltoall(zmpi1,2*n1*nd2proc*nd3proc, & 549 & zmpi2,2*n1*nd2proc*nd3proc,comm_fft,ierr) 550 call timab(544,2,tsec) 551 end if 552 553 ! transform along z axis 554 ! input: G1,G2,R3,(Gp2) 555 lot=ncache/(4*n3) 556 557 do j2=1,nd2proc 558 if (me_fft*(nd2proc)+j2 <= n2eff) then 559 do i1=1,n1,lot 560 ma=i1 561 mb=min(i1+(lot-1),n1) 562 n1dfft=mb-ma+1 563 564 ! input: G1,G2,R3,(Gp2) 565 ! output: G1,R3,G2,(Gp2) 566 call unscramble(i1,j2,lot,n1dfft,n1,n3,nd2proc,nd3,zmpi2,zw(1,1,1)) 567 568 inzee=1 569 do i=1,ic3 570 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & 571 & trig3,after3(i),now3(i),before3(i),-1) 572 inzee=3-inzee 573 end do 574 575 call unfill(nd1,nd3,lot,n1dfft,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat)) 576 ! output: G1,G3,G2,(Gp2) 577 end do 578 end if 579 end do 580 581 end do ! idat 582 583 ABI_FREE(trig1) 584 ABI_FREE(after1) 585 ABI_FREE(now1) 586 ABI_FREE(before1) 587 ABI_FREE(trig2) 588 ABI_FREE(after2) 589 ABI_FREE(now2) 590 ABI_FREE(before2) 591 ABI_FREE(trig3) 592 ABI_FREE(after3) 593 ABI_FREE(now3) 594 ABI_FREE(before3) 595 ABI_FREE(zmpi2) 596 ABI_FREE(zw) 597 ABI_FREE(zt) 598 if (nproc_fft>1) then 599 ABI_FREE(zmpi1) 600 end if 601 602 end subroutine sg2002_forw
m_sg2002/sg2002_mpiback_wf [ Functions ]
[ Top ] [ m_sg2002 ] [ Functions ]
NAME
sg2002_mpiback_wf
FUNCTION
Does multiple 3-dim backward FFTs from Fourier into real space Adopt standard convention that isign=1 for backward transform CALCULATES THE DISCRETE FOURIER TRANSFORM ZF(I1,I2,I3)= S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZF(j1,j3,j2) in parallel using MPI/OpenMP. INPUTS: icplexwf=1 if wavefunction is real, 2 if complex ndat=Number of wavefunctions to transform. n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Leading Dimension of ZR nd3proc=((nd3-1)/nproc_fft)+1 maximal number of big box 3rd dim slices for one proc max1 is positive or zero; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 max2 and max3 have a similar definition of range m1,m2,m3=Size of the box enclosing the G-sphere. md1,md2,md3: Dimension of ZF given on the **small** FFT box. md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc nproc_fft: number of processors used as returned by MPI_COMM_SIZE comm_fft=MPI communicator for the FFT. ZF: input array (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) OUTPUTS ZR: output array ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat
NOTES
The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
665 subroutine sg2002_mpiback_wf(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,& 666 & max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zf,zr,comm_fft) 667 668 implicit none 669 670 !Arguments ------------------------------------ 671 integer,intent(in) :: icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc 672 integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft 673 real(dp),intent(in) :: zf(2,md1,md3,md2proc,ndat) 674 real(dp),intent(out) :: zr(2,nd1,nd2,nd3proc,ndat) 675 676 !Local variables------------------------------- 677 integer :: i,j,i1,i2,ic1,ic2,ic3,idat,ierr,inzee,includelast 678 integer :: ioption,j2,j3,j2st,jp2st,jeff,lot,lzt,m1zt,ma,mb,n1dfft,nnd3 679 integer :: m2eff,ncache,n1eff,n1half,nproc_fft,me_fft 680 character(len=500) :: msg 681 !arrays 682 real(dp),allocatable :: zt(:,:,:) ! work arrays for transpositions 683 real(dp),allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI 684 real(dp),allocatable :: zw(:,:,:) ! cache work array 685 ! FFT work arrays 686 real(dp),allocatable :: trig1(:,:),trig2(:,:),trig3(:,:) 687 integer,allocatable :: after1(:),now1(:),before1(:),after2(:) 688 integer,allocatable :: now2(:),before2(:),after3(:),now3(:),before3(:) 689 real(dp) :: tsec(2) 690 691 ! ************************************************************************* 692 693 ! call timab(541,1,tsec) 694 ! FIXME must provide a default value but which one? 695 ! ioption = 0 696 ioption = 1 697 !if (paral_kgb==1) ioption=1 698 699 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 700 701 ! Find cache size that gives optimal performance on machine 702 ncache=4*max(n1,n2,n3,1024) 703 if (ncache/(4*max(n1,n2,n3))<1) then 704 write(msg,"(5a)") & 705 & 'ncache has to be enlarged to be able to hold at',ch10, & 706 & 'least one 1-d FFT of each size even though this will',ch10,& 707 & 'reduce the performance for shorter transform lengths' 708 ABI_ERROR(msg) 709 end if 710 711 ! Effective m1 and m2 (complex-to-complex or real-to-complex) 712 n1eff=n1; m2eff=m2; m1zt=n1 713 if (icplexwf==1) then 714 n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1) 715 end if 716 717 lzt=m2eff 718 if (mod(m2eff,2)==0) lzt=lzt+1 719 if (mod(m2eff,4)==0) lzt=lzt+1 720 721 ! maximal number of big box 3rd dim slices for all procs 722 nnd3=nd3proc*nproc_fft 723 724 ABI_MALLOC(trig1,(2,n1)) 725 ABI_MALLOC(after1,(mdata)) 726 ABI_MALLOC(now1,(mdata)) 727 ABI_MALLOC(before1,(mdata)) 728 ABI_MALLOC(trig2,(2,n2)) 729 ABI_MALLOC(after2,(mdata)) 730 ABI_MALLOC(now2,(mdata)) 731 ABI_MALLOC(before2,(mdata)) 732 ABI_MALLOC(trig3,(2,n3)) 733 ABI_MALLOC(after3,(mdata)) 734 ABI_MALLOC(now3,(mdata)) 735 ABI_MALLOC(before3,(mdata)) 736 737 ! Allocate cache work array and work arrays for MPI transpositions. 738 ABI_MALLOC(zw,(2,ncache/4,2)) 739 ABI_MALLOC(zt,(2,lzt,m1zt)) 740 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat)) 741 if (nproc_fft>1) then 742 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat)) 743 end if 744 745 ! Compute twiddle coefficients. 746 call ctrig(n3,trig3,after3,before3,now3,1,ic3) 747 call ctrig(n1,trig1,after1,before1,now1,1,ic1) 748 call ctrig(n2,trig2,after2,before2,now2,1,ic2) 749 750 !DEBUG 751 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': sg2002_mpiback_wf,zf n1,n2,n3',n1,n2,n3 752 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': nd1,nd2,nd3proc',nd1,nd2,nd3proc 753 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': m1,m2,m3',m1,m2,m3 754 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': max1,max2,max3',max1,max2,max3 755 ! write(std_out,'(2a,3i4)' )itoa(me_fft),': md1,md2proc,md3',md1,md2proc,md3 756 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt 757 !ENDDEBUG 758 759 do idat=1,ndat 760 761 ! transform along z axis 762 ! input: G1,G3,G2,(Gp2) 763 lot=ncache/(4*n3) 764 765 zw(:,:,:)=zero 766 zt(:,:,:)=zero 767 768 ! Loop over the y planes treated by this node and trasform n1ddft G_z lines. 769 do j2=1,md2proc 770 771 ! if (me_fft*md2proc+j2<=m2eff) then !a faire plus tard 772 773 do i1=1,m1,lot 774 ma=i1 775 mb=min(i1+(lot-1),m1) 776 n1dfft=mb-ma+1 777 778 ! zero-pad n1dfft G_z lines 779 ! input: G1,G3,G2,(Gp2) 780 ! output: G1,R3,G2,(Gp2) 781 call fill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zf(1,i1,1,j2,idat),zw(1,1,1)) 782 783 ! Transform along z. 784 inzee=1 785 do i=1,ic3 786 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & 787 & trig3,after3(i),now3(i),before3(i),1) 788 inzee=3-inzee 789 end do 790 791 ! Local rotation. 792 ! input: G1,R3,G2,(Gp2) 793 ! output: G1,G2,R3,(Gp2) 794 call scramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zw(1,1,inzee),zmpi2(:,:,:,:,idat)) 795 end do 796 ! 797 end do ! j2 798 799 ! Interprocessor data transposition 800 ! input: G1,G2,R3,Rp3,(Gp2) 801 ! output: G1,G2,R3,Gp2,(Rp3) 802 if (nproc_fft>1) then 803 call timab(543,1,tsec) 804 call xmpi_alltoall(zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc, & 805 & zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,ierr) 806 call timab(543,2,tsec) 807 end if 808 809 ! Loop over the z treated by this node. 810 do j3=1,nd3proc 811 !j3glob = j3 + me_fft*nd3proc 812 if (me_fft*nd3proc+j3 <= n3) then 813 Jp2st=1; J2st=1 814 815 lot=ncache/(4*n1) 816 817 ! Loop over G_y in the small box. 818 do j=1,m2eff,lot 819 ma=j 820 mb=min(j+(lot-1),m2eff) 821 n1dfft=mb-ma+1 822 823 ! Zero-pad input. 824 ! input: G1,G2,R3,JG2,(Rp3) 825 ! output: G2,G1,R3,JG2,(Rp3) 826 if (nproc_fft==1) then 827 call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,& 828 & md2proc,nd3proc,nproc_fft,ioption,zmpi2(:,:,:,:,idat),zw(1,1,1),max2,m2,n2) 829 else 830 call mpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,& 831 & md2proc,nd3proc,nproc_fft,ioption,zmpi1(:,:,:,:,idat),zw(1,1,1),max2,m2,n2) 832 end if 833 834 ! Transform along x 835 ! input: G2,G1,R3,(Rp3) 836 ! output: G2,R1,R3,(Rp3) 837 inzee=1 838 do i=1,ic1-1 839 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & 840 & trig1,after1(i),now1(i),before1(i),1) 841 inzee=3-inzee 842 end do 843 844 i=ic1 845 call fftstp(lot,n1dfft,n1,lzt,m1zt,zw(1,1,inzee),zt(1,j,1), & 846 & trig1,after1(i),now1(i),before1(i),1) 847 end do 848 849 ! Transform along y axis (take into account c2c or c2r case). 850 ! Must loop over the full box. 851 lot=ncache/(4*n2) 852 853 do j=1,n1eff,lot 854 ma=j 855 mb=min(j+(lot-1),n1eff) 856 n1dfft=mb-ma+1 857 includelast=1 858 if (icplexwf==1) then 859 jeff=2*j-1 860 if (mb==n1eff .and. n1eff*2/=n1) includelast=0 861 end if 862 863 ! Zero-pad the input. 864 ! input: G2,R1,R3,(Rp3) 865 ! output: R1,G2,R3,(Rp3) 866 if (icplexwf==2) then 867 call switch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zt(1,1,j),zw(1,1,1)) 868 else 869 call switchreal_cent(includelast,n1dfft,max2,n2,lot,m1zt,lzt,zt(1,1,jeff),zw(1,1,1)) 870 end if 871 872 ! input: R1,G2,R3,(Rp3) 873 ! output: R1,R2,R3,(Rp3) 874 inzee=1 875 do i=1,ic2-1 876 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & 877 & trig2,after2(i),now2(i),before2(i),1) 878 inzee=3-inzee 879 end do 880 881 i=ic2 882 883 call fftstp(lot,n1dfft,n2,nd1,nd2,zw(1,1,inzee),zr(1,j,1,j3,idat), & 884 & trig2,after2(i),now2(i),before2(i),1) 885 886 887 end do 888 889 ! Treat real wavefunctions. 890 if (icplexwf==1) then 891 n1half=n1/2 892 ! If odd 893 if (n1half*2/=n1) then 894 do i2=1,n2 895 zr(1,n1,i2,j3,idat)=zr(1,n1eff,i2,j3,idat) 896 zr(2,n1,i2,j3,idat)=zero 897 end do 898 end if 899 do i2=1,n2 900 do i1=n1half,1,-1 901 zr(1,2*i1-1,i2,j3,idat)=zr(1,i1,i2,j3,idat) 902 zr(1,2*i1 ,i2,j3,idat)=zr(2,i1,i2,j3,idat) 903 zr(2,2*i1-1,i2,j3,idat)=zero 904 zr(2,2*i1 ,i2,j3,idat)=zero 905 end do 906 end do 907 end if 908 909 end if 910 911 end do ! j3 912 end do ! idat 913 914 ABI_FREE(trig1) 915 ABI_FREE(after1) 916 ABI_FREE(now1) 917 ABI_FREE(before1) 918 ABI_FREE(trig2) 919 ABI_FREE(after2) 920 ABI_FREE(now2) 921 ABI_FREE(before2) 922 ABI_FREE(trig3) 923 ABI_FREE(after3) 924 ABI_FREE(now3) 925 ABI_FREE(before3) 926 ABI_FREE(zmpi2) 927 ABI_FREE(zw) 928 ABI_FREE(zt) 929 if (nproc_fft>1) then 930 ABI_FREE(zmpi1) 931 end if 932 933 !call timab(541,2,tsec) 934 935 end subroutine sg2002_mpiback_wf
m_sg2002/sg2002_mpiforw_wf [ Functions ]
[ Top ] [ m_sg2002 ] [ Functions ]
NAME
sg2002_mpiforw_wf
FUNCTION
Does multiple 3-dim backward FFTs from real into Fourier space Adopt standard convention that isign=-1 for forward transform CALCULATES THE DISCRETE FOURIERTRANSFORM ZF(I1,I3,I2)=S_(j1,j2,j3) EXP(isign*i*2*pi*(j1*i1/n1+j2*i2/n2+j3*i3/n3)) ZR(j1,j2,j3) in parallel using MPI/OpenMP. INPUT: ZR: input array ZR(1,i1,i2,i3,idat)=real(R(i1,i2,i3,idat)) ZR(2,i1,i2,i3,idat)=imag(R(i1,i2,i3,idat)) i1=1,n1 , i2=1,n2 , i3=1,n3 , idat=1,ndat NOTE that ZR is changed by the routine n1,n2,n3: logical dimension of the transform. As transform lengths most products of the prime factors 2,3,5 are allowed. The detailed table with allowed transform lengths can be found in subroutine CTRIG nd1,nd2,nd3: Dimension of ZR nd3proc=((nd3-1)/nproc_fft)+1 maximal number of big box 3rd dim slices for one proc OUTPUT: ZF: output array (note the switch of i2 and i3) real(F(i1,i3,i2,idat))=ZF(1,i1,i3,i2,idat) imag(F(i1,i3,i2,idat))=ZF(2,i1,i3,i2,idat) max1 is positive or zero ; m1 >=max1+1 i1= 1... max1+1 corresponds to positive and zero wavevectors 0 ... max1 then, if m1 > max1+1, one has min1=max1-m1+1 and i1= max1+2 ... m1 corresponds to negative wavevectors min1 ... -1 i2 and i3 have a similar definition of range idat=1,ndat md1,md2,md3: Dimension of ZF md2proc=((md2-1)/nproc_fft)+1 maximal number of small box 2nd dim slices for one proc nproc_fft: number of processors used as returned by MPI_COMM_SIZE me_fft: [0:nproc-1] rank of the processor in the FFT communicator. comm_fft=MPI communicator for parallel FFT.
NOTES
The maximum number of processors that can reasonably be used is max(n2/2,n3/2) It is very important to find the optimal value of NCACHE. NCACHE determines the size of the work array ZW, that has to fit into cache. It has therefore to be chosen to equal roughly half the size of the physical cache in units of real*8 numbers. The optimal value of ncache can easily be determined by numerical experimentation. A too large value of ncache leads to a dramatic and sudden decrease of performance, a too small value to a to a slow and less dramatic decrease of performance. If NCACHE is set to a value so small, that not even a single one dimensional transform can be done in the workarray zw, the program stops with an error message.
SOURCE
999 subroutine sg2002_mpiforw_wf(icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc,& 1000 & max1,max2,max3,m1,m2,m3,md1,md2proc,md3,zr,zf,comm_fft) 1001 1002 implicit none 1003 1004 !Arguments ------------------------------------ 1005 !scalars 1006 integer,intent(in) :: icplexwf,ndat,n1,n2,n3,nd1,nd2,nd3proc 1007 integer,intent(in) :: max1,max2,max3,m1,m2,m3,md1,md2proc,md3,comm_fft 1008 !arrays 1009 real(dp),intent(inout) :: zr(2,nd1,nd2,nd3proc,ndat) 1010 real(dp),intent(out) :: zf(2,md1,md3,md2proc,ndat) 1011 1012 !Local variables------------------------------- 1013 !scalars 1014 integer :: i,j,i1,i2,i3,ic1,ic2,ic3,idat,ierr,inzee,nproc_fft,me_fft 1015 integer :: ioption,j2,j3,j2st,jp2st,lot,lzt,m1zt,ma,mb,n1dfft,nnd3 1016 integer :: m2eff,ncache,n1eff,n1half,i1inv,i2inv,i3inv 1017 character(len=500) :: msg 1018 !arrays 1019 real(dp), allocatable :: zt(:,:,:) ! work arrays for transpositions 1020 real(dp), allocatable :: zmpi1(:,:,:,:,:),zmpi2(:,:,:,:,:) ! work arrays for MPI 1021 real(dp), allocatable :: zw(:,:,:) ! cache work array 1022 ! FFT work arrays 1023 real(dp), allocatable :: trig1(:,:),trig2(:,:),trig3(:,:) 1024 integer, allocatable :: after1(:),now1(:),before1(:),after2(:),now2(:),before2(:),after3(:),now3(:),before3(:) 1025 real(dp) :: tsec(2) 1026 1027 ! ************************************************************************* 1028 1029 ! call timab(542,1,tsec) 1030 1031 ! FIXME must provide a default value but which one? 1032 !ioption = 0 1033 ioption = 1 1034 !if (paral_kgb==1) ioption=1 1035 1036 nproc_fft = xmpi_comm_size(comm_fft); me_fft = xmpi_comm_rank(comm_fft) 1037 1038 ! find cache size that gives optimal performance on machine 1039 ncache=4*max(n1,n2,n3,1024) 1040 if (ncache/(4*max(n1,n2,n3))<1) then 1041 write(msg,'(5a)') & 1042 & 'ncache has to be enlarged to be able to hold at',ch10, & 1043 & 'least one 1-d FFT of each size even though this will',ch10,& 1044 & 'reduce the performance for shorter transform lengths' 1045 ABI_ERROR(msg) 1046 end if 1047 1048 ! Effective m1 and m2 (complex-to-complex or real-to-complex) 1049 n1eff=n1; m2eff=m2; m1zt=n1 1050 if (icplexwf==1) then 1051 n1eff=(n1+1)/2; m2eff=m2/2+1; m1zt=2*(n1/2+1) 1052 end if 1053 1054 lzt=m2eff 1055 if (mod(m2eff,2)==0) lzt=lzt+1 1056 if (mod(m2eff,4)==0) lzt=lzt+1 1057 1058 ! maximal number of big box 3rd dim slices for all procs 1059 nnd3=nd3proc*nproc_fft 1060 1061 ABI_MALLOC(trig1,(2,n1)) 1062 ABI_MALLOC(after1,(mdata)) 1063 ABI_MALLOC(now1,(mdata)) 1064 ABI_MALLOC(before1,(mdata)) 1065 ABI_MALLOC(trig2,(2,n2)) 1066 ABI_MALLOC(after2,(mdata)) 1067 ABI_MALLOC(now2,(mdata)) 1068 ABI_MALLOC(before2,(mdata)) 1069 ABI_MALLOC(trig3,(2,n3)) 1070 ABI_MALLOC(after3,(mdata)) 1071 ABI_MALLOC(now3,(mdata)) 1072 ABI_MALLOC(before3,(mdata)) 1073 ABI_MALLOC(zw,(2,ncache/4,2)) 1074 ABI_MALLOC(zt,(2,lzt,m1zt)) 1075 ABI_MALLOC(zmpi2,(2,md1,md2proc,nnd3,ndat)) 1076 if (nproc_fft>1) then 1077 ABI_MALLOC(zmpi1,(2,md1,md2proc,nnd3,ndat)) 1078 end if 1079 1080 call ctrig(n2,trig2,after2,before2,now2,-1,ic2) 1081 call ctrig(n1,trig1,after1,before1,now1,-1,ic1) 1082 call ctrig(n3,trig3,after3,before3,now3,-1,ic3) 1083 1084 !DEBUG 1085 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'sg2002_mpiforw_wf, enter', i1,i2,i3,zr,n1,n2,n3',n1,n2,n3 1086 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'nd1,nd2,nd3proc',nd1,nd2,nd3proc 1087 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'m1,m2,m3',m1,m2,m3 1088 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'max1,max2,max3',max1,max2,max3 1089 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'md1,md2proc,md3',md1,md2proc,md3 1090 ! write(std_out,'(2a,3i4)' )itoa(me_fft),'n1eff,m2eff,m1zt',n1eff,m2eff,m1zt 1091 !ENDDEBUG 1092 1093 do idat=1,ndat 1094 ! Loop over the z-planes treated by this node 1095 do j3=1,nd3proc 1096 1097 if (me_fft*nd3proc+j3 <= n3) then 1098 Jp2st=1 1099 J2st=1 1100 1101 ! Treat real wavefunctions. 1102 if (icplexwf==1) then 1103 n1half=n1/2 1104 do i2=1,n2 1105 do i1=1,n1half 1106 zr(1,i1,i2,j3,idat)=zr(1,2*i1-1,i2,j3,idat) 1107 zr(2,i1,i2,j3,idat)=zr(1,2*i1 ,i2,j3,idat) 1108 end do 1109 end do 1110 ! If odd 1111 if(n1half*2/=n1)then 1112 do i2=1,n2 1113 zr(1,n1eff,i2,j3,idat)=zr(1,n1,i2,j3,idat) 1114 zr(2,n1eff,i2,j3,idat)=zero 1115 end do 1116 end if 1117 end if 1118 1119 ! transform along y axis 1120 ! input: R1,R2,R3,(Rp3) 1121 ! input: R1,G2,R3,(Rp3) 1122 lot=ncache/(4*n2) 1123 1124 do j=1,n1eff,lot 1125 ma=j 1126 mb=min(j+(lot-1),n1eff) 1127 n1dfft=mb-ma+1 1128 i=1 1129 call fftstp(nd1,n1dfft,nd2,lot,n2,zr(1,j,1,j3,idat),zw(1,1,1), & 1130 & trig2,after2(i),now2(i),before2(i),-1) 1131 1132 inzee=1 1133 do i=2,ic2 1134 call fftstp(lot,n1dfft,n2,lot,n2,zw(1,1,inzee),zw(1,1,3-inzee), & 1135 & trig2,after2(i),now2(i),before2(i),-1) 1136 inzee=3-inzee 1137 end do 1138 1139 ! input: R1,G2,R3,(Rp3) 1140 ! output: G2,R1,R3,(Rp3) 1141 if(icplexwf==2)then 1142 call unswitch_cent(n1dfft,max2,m2,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,j)) 1143 else 1144 call unswitchreal_cent(n1dfft,max2,n2,lot,n1,lzt,zw(1,1,inzee),zt(1,1,2*j-1)) 1145 end if 1146 end do 1147 1148 ! transform along x axis 1149 ! input: G2,R1,R3,(Rp3) 1150 lot=ncache/(4*n1) 1151 1152 do j=1,m2eff,lot 1153 ma=j 1154 mb=min(j+(lot-1),m2eff) 1155 n1dfft=mb-ma+1 1156 i=1 1157 call fftstp(lzt,n1dfft,m1zt,lot,n1,zt(1,j,1),zw(1,1,1), & 1158 & trig1,after1(i),now1(i),before1(i),-1) 1159 1160 inzee=1 1161 do i=2,ic1 1162 call fftstp(lot,n1dfft,n1,lot,n1,zw(1,1,inzee),zw(1,1,3-inzee), & 1163 & trig1,after1(i),now1(i),before1(i),-1) 1164 inzee=3-inzee 1165 end do 1166 ! output: G2,G1,R3,(Rp3) 1167 1168 ! input: G2,G1,R3,Gp2,(Rp3) 1169 ! output: G1,G2,R3,Gp2,(Rp3) 1170 if (nproc_fft==1) then 1171 call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,& 1172 & md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi2(:,:,:,:,idat)) 1173 else 1174 call unmpiswitch_cent(j3,n1dfft,Jp2st,J2st,lot,max1,md1,m1,n1,& 1175 & md2proc,nd3proc,nproc_fft,ioption,zw(1,1,inzee),zmpi1(:,:,:,:,idat)) 1176 end if 1177 end do 1178 1179 end if 1180 end do ! j3 1181 1182 ! Interprocessor data transposition 1183 ! input: G1,G2,R3,Gp2,(Rp3) 1184 ! output: G1,G2,R3,Rp3,(Gp2) 1185 if (nproc_fft>1) then 1186 call timab(544,1,tsec) 1187 call xmpi_alltoall(zmpi1(:,:,:,:,idat),2*md1*md2proc*nd3proc, & 1188 & zmpi2(:,:,:,:,idat),2*md1*md2proc*nd3proc,comm_fft,ierr) 1189 1190 call timab(544,2,tsec) 1191 end if 1192 1193 ! transform along z axis 1194 ! input: G1,G2,R3,(Gp2) 1195 lot=ncache/(4*n3) 1196 1197 do j2=1,md2proc 1198 if (me_fft*md2proc+j2 <= m2eff) then 1199 ! write(std_out,*)' forwf_wf : before unscramble, j2,md2proc,me_fft,m2=',j2,md2proc,me_fft,m2 1200 do i1=1,m1,lot 1201 ma=i1 1202 mb=min(i1+(lot-1),m1) 1203 n1dfft=mb-ma+1 1204 1205 ! input: G1,G2,R3,(Gp2) 1206 ! output: G1,R3,G2,(Gp2) 1207 call unscramble(i1,j2,lot,n1dfft,md1,n3,md2proc,nnd3,zmpi2(:,:,:,:,idat),zw(1,1,1)) 1208 1209 inzee=1 1210 do i=1,ic3 1211 call fftstp(lot,n1dfft,n3,lot,n3,zw(1,1,inzee),zw(1,1,3-inzee), & 1212 & trig3,after3(i),now3(i),before3(i),-1) 1213 inzee=3-inzee 1214 end do 1215 1216 call unfill_cent(md1,md3,lot,n1dfft,max3,m3,n3,zw(1,1,inzee),zf(1,i1,1,j2,idat)) 1217 ! output: G1,G3,G2,(Gp2) 1218 end do 1219 end if 1220 end do 1221 1222 if (icplexwf==1) then 1223 ! Complete missing values with complex conjugate 1224 ! Inverse of ix is located at nx+2-ix , except for ix=1, for which it is 1. 1225 do i3=1,m3 1226 i3inv=m3+2-i3 1227 if(i3==1)i3inv=1 1228 1229 if (m2eff>1) then 1230 do i2=2,m2eff 1231 i2inv=m2+2-i2 1232 zf(1,1,i3inv,i2inv,idat)= zf(1,1,i3,i2,idat) 1233 zf(2,1,i3inv,i2inv,idat)=-zf(2,1,i3,i2,idat) 1234 do i1=2,m1 1235 i1inv=m1+2-i1 1236 zf(1,i1inv,i3inv,i2inv,idat)= zf(1,i1,i3,i2,idat) 1237 zf(2,i1inv,i3inv,i2inv,idat)=-zf(2,i1,i3,i2,idat) 1238 end do 1239 end do 1240 end if 1241 end do 1242 end if 1243 1244 end do ! idat 1245 1246 ABI_FREE(trig1) 1247 ABI_FREE(after1) 1248 ABI_FREE(now1) 1249 ABI_FREE(before1) 1250 ABI_FREE(trig2) 1251 ABI_FREE(after2) 1252 ABI_FREE(now2) 1253 ABI_FREE(before2) 1254 ABI_FREE(trig3) 1255 ABI_FREE(after3) 1256 ABI_FREE(now3) 1257 ABI_FREE(before3) 1258 ABI_FREE(zmpi2) 1259 ABI_FREE(zw) 1260 ABI_FREE(zt) 1261 if (nproc_fft>1) then 1262 ABI_FREE(zmpi1) 1263 end if 1264 1265 !call timab(542,2,tsec) 1266 1267 end subroutine sg2002_mpiforw_wf
m_sg2002/sg2002_mpifourdp [ Functions ]
[ Top ] [ m_sg2002 ] [ Functions ]
NAME
sg2002_mpifourdp
FUNCTION
Conduct Fourier transform of REAL or COMPLEX function f(r)=fofr defined on fft grid in real space, to create complex f(G)=fofg defined on full fft grid in reciprocal space, in full storage mode, or the reverse operation. For the reverse operation, the final data is divided by nfftot. REAL case when cplex=1, COMPLEX case when cplex=2 Usually used for density and potentials.
INPUTS
cplex=1 if fofr is real, 2 if fofr is complex nfft=(effective) number of FFT grid points (for this processor) ngfft(18)=contain all needed information about 3D FFT, see ~abinit/doc/variables/vargs.htm#ngfft ndat=Numbre of FFT transforms isign=sign of Fourier transform exponent: current convention uses +1 for transforming from G to r -1 for transforming from r to G. fftn2_distrib(2),ffti2_local(2) fftn3_distrib(3),ffti3_local(3) comm_fft=MPI communicator
SIDE EFFECTS
Input/Output fofg(2,nfft)=f(G), complex. fofr(cplex*nfft)=input function f(r) (real or complex)
TODO
Write simplified API for sequential version.
SOURCE
1306 subroutine sg2002_mpifourdp(cplex,nfft,ngfft,ndat,isign,& 1307 & fftn2_distrib,ffti2_local,fftn3_distrib,ffti3_local,fofg,fofr,comm_fft) 1308 1309 implicit none 1310 1311 !Arguments ------------------------------------ 1312 !scalars 1313 integer,intent(in) :: cplex,isign,nfft,ndat,comm_fft 1314 !arrays 1315 integer,intent(in) :: ngfft(18) 1316 integer,intent(in) :: fftn2_distrib(ngfft(2)),ffti2_local(ngfft(2)) 1317 integer,intent(in) :: fftn3_distrib(ngfft(3)),ffti3_local(ngfft(3)) 1318 real(dp),intent(inout) :: fofg(2,nfft*ndat),fofr(cplex*nfft*ndat) 1319 1320 !Local variables------------------------------- 1321 !scalars 1322 integer :: n1,n2,n3,n4,n5,n6,nd2proc,nd3proc,nproc_fft,me_fft 1323 !arrays 1324 real(dp),allocatable :: workf(:,:,:,:,:),workr(:,:,:,:,:) 1325 1326 ! ************************************************************************* 1327 1328 ! Note the only c2c is supported in parallel. 1329 n1=ngfft(1); n2=ngfft(2); n3=ngfft(3) 1330 n4=ngfft(4); n5=ngfft(5); n6=ngfft(6) 1331 me_fft=ngfft(11); nproc_fft=ngfft(10) 1332 1333 nd2proc=((n2-1)/nproc_fft) +1 1334 nd3proc=((n6-1)/nproc_fft) +1 1335 ABI_MALLOC(workr,(2,n4,n5,nd3proc,ndat)) 1336 ABI_MALLOC(workf,(2,n4,n6,nd2proc,ndat)) 1337 1338 ! Complex to Complex 1339 select case (isign) 1340 case (1) 1341 ! G --> R 1342 call mpifft_fg2dbox(nfft,ndat,fofg,n1,n2,n3,n4,nd2proc,n6,fftn2_distrib,ffti2_local,me_fft,workf) 1343 1344 call sg2002_back(2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,2,workf,workr,comm_fft) 1345 1346 call mpifft_dbox2fr(n1,n2,n3,n4,n5,nd3proc,ndat,fftn3_distrib,ffti3_local,me_fft,workr,cplex,nfft,fofr) 1347 1348 case (-1) 1349 ! R --> G 1350 call mpifft_fr2dbox(cplex,nfft,ndat,fofr,n1,n2,n3,n4,n5,nd3proc,fftn3_distrib,ffti3_local,me_fft,workr) 1351 1352 call sg2002_forw(2,ndat,n1,n2,n3,n4,n5,n6,n4,nd2proc,nd3proc,2,workr,workf,comm_fft) 1353 1354 ! Transfer FFT output to the original fft box. 1355 call mpifft_dbox2fg(n1,n2,n3,n4,nd2proc,n6,ndat,fftn2_distrib,ffti2_local,me_fft,workf,nfft,fofg) 1356 1357 case default 1358 ABI_BUG("Wrong isign") 1359 end select 1360 1361 ABI_FREE(workr) 1362 ABI_FREE(workf) 1363 1364 end subroutine sg2002_mpifourdp