TABLE OF CONTENTS
- ABINIT/m_abi_gpu_linalg
- m_abi_gpu_linalg/abi_gpu_work_resizeC
- m_abi_gpu_linalg/abi_gpu_work_resizeCptr
- m_abi_gpu_linalg/abi_gpu_work_resizeI
- m_abi_gpu_linalg/abi_gpu_work_resizeR
- m_abi_gpu_linalg/abi_gpu_xaxpy
- m_abi_gpu_linalg/abi_gpu_xcopy
- m_abi_gpu_linalg/abi_gpu_xgemm
- m_abi_gpu_linalg/abi_gpu_xgemm_strided
- m_abi_gpu_linalg/abi_gpu_xheevd
- m_abi_gpu_linalg/abi_gpu_xhegvd
- m_abi_gpu_linalg/abi_gpu_xpotrf
- m_abi_gpu_linalg/abi_gpu_xscal
- m_abi_gpu_linalg/abi_gpu_xsymm
- m_abi_gpu_linalg/abi_gpu_xtrsm
- m_abi_gpu_linalg/abi_gpu_zhemm
- m_abi_gpu_linalg/alloc_on_gpu
- m_abi_gpu_linalg/check_gpu_mem
- m_abi_gpu_linalg/copy_from_gpu
- m_abi_gpu_linalg/copy_gpu_to_gpu
- m_abi_gpu_linalg/copy_on_gpu
- m_abi_gpu_linalg/dealloc_on_gpu
- m_abi_gpu_linalg/gpu_allocated_impl
- m_abi_gpu_linalg/gpu_device_synchronize
- m_abi_gpu_linalg/gpu_linalg_init
- m_abi_gpu_linalg/gpu_linalg_shutdown
- m_abi_gpu_linalg/gpu_managed_ptr_status
- m_abi_gpu_linalg/gpu_memset
- m_abi_gpu_linalg/gpu_xaxpy
- m_abi_gpu_linalg/gpu_xcopy
- m_abi_gpu_linalg/gpu_xgemm
- m_abi_gpu_linalg/gpu_xscal
- m_abi_gpu_linalg/gpu_xsygvd
- m_abi_gpu_linalg/gpu_xsygvd_bufferSize
- m_abi_gpu_linalg/gpu_xtrsm
- m_abi_linalg/gpu_xorthonormalize
ABINIT/m_abi_gpu_linalg [ Modules ]
NAME
m_abi_gpu_linalg
FUNCTION
Interfaces of GPU subroutines wrapper
COPYRIGHT
Copyright (C) 2011-2024 ABINIT group (FDahm). This file is distributed under the terms of the GNU General Public License, see ~ABINIT/Infos/copyright or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
m_abi_gpu_linalg/abi_gpu_work_resizeC [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_work_resizeC
m_abi_gpu_linalg/abi_gpu_work_resizeCptr [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_work_resizeCptr
m_abi_gpu_linalg/abi_gpu_work_resizeI [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_work_resizeI
m_abi_gpu_linalg/abi_gpu_work_resizeR [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_work_resizeR
m_abi_gpu_linalg/abi_gpu_xaxpy [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_xaxpy
FUNCTION
Compute a BLAS-1 AXPY operation on GPU y = alpha * x + y
INPUTS
cplx = 1 if real 2 if complex size = vector size alpha = scalar complex value x_gpu = pointer to gpu memory location of array x incrx = stride between consecutive elements of x y_gpu = pointer to gpu memory location of array y incry = stride between consecutive elements of y
SOURCE
1851 subroutine abi_gpu_xaxpy_cptr(cplx, size, alpha, x, incrx, y, incry) 1852 1853 ! !Arguments ------------------------------------ 1854 integer, intent(in) :: cplx 1855 integer, intent(in) :: size 1856 complex(dpc), intent(in) :: alpha 1857 type(c_ptr), intent(in) :: x 1858 integer, intent(in) :: incrx 1859 type(c_ptr), intent(in) :: y 1860 integer, intent(in) :: incry 1861 1862 ! ************************************************************************* 1863 1864 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then 1865 ABI_BUG("You requested to run on CPU to a GPU wrapper :/") 1866 end if 1867 1868 #ifdef HAVE_GPU 1869 1870 call gpu_xaxpy(cplx, size, alpha, x, incrx, y, incry) 1871 1872 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 1873 ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them. 1874 ! Therefore, we issue a stream sync here to avoid 1875 !potential mistakes in calling context. 1876 call gpu_linalg_stream_synchronize() 1877 end if 1878 1879 #else 1880 ! Unused if GPU code disabled 1881 ABI_UNUSED((/cplx,incrx,incry,size/)) 1882 ABI_UNUSED((/alpha/)) 1883 ABI_UNUSED((/x,y/)) 1884 #endif 1885 1886 end subroutine abi_gpu_xaxpy_cptr
m_abi_gpu_linalg/abi_gpu_xcopy [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_xcopy
FUNCTION
Compute a BLAS-1 COPY operation on GPU y = x (copy x into y)
INPUTS
cplx = 1 if real 2 if complex size = input vector size x = pointer to gpu memory location of array x incrx = stride between consecutive elements of x y = pointer to gpu memory location of array y incry = stride between consecutive elements of y
SOURCE
2038 subroutine abi_gpu_xcopy_cptr(cplx, size, x, incrx, y, incry) 2039 2040 ! !Arguments ------------------------------------ 2041 integer, intent(in) :: cplx 2042 integer, intent(in) :: size 2043 type(c_ptr), intent(in) :: x 2044 integer, intent(in) :: incrx 2045 type(c_ptr), intent(in) :: y 2046 integer, intent(in) :: incry 2047 2048 ! ************************************************************************* 2049 2050 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then 2051 ABI_BUG("You requested to run on CPU to a GPU wrapper :/") 2052 end if 2053 2054 #ifdef HAVE_GPU 2055 2056 call gpu_xcopy(cplx, size, x, incrx, y, incry) 2057 2058 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 2059 ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them. 2060 ! Therefore, we issue a stream sync here to avoid 2061 !potential mistakes in calling context. 2062 call gpu_linalg_stream_synchronize() 2063 end if 2064 2065 #else 2066 ! Unused if GPU code disabled 2067 ABI_UNUSED((/cplx,incrx,incry,size/)) 2068 ABI_UNUSED((/x,y/)) 2069 #endif 2070 2071 end subroutine abi_gpu_xcopy_cptr
m_abi_gpu_linalg/abi_gpu_xgemm [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_xgemm
FUNCTION
Compute a scalar-matrix-matrix product and return a scalar-matrix product on GPU c = alpha * op(a) * op(b) + beta * c
INPUTS
cplx = 1 if real 2 if complex transa= from of op(a) to be used in the matrix multiplication transb= from of op(b) to be used in the matrix multiplication m = number of rows of the matrix op(a) and of the matrix c n = number of rows of the matrix op(b) and the number of columns of the matrix c k = number of columns of the matrix op(a) and the number of rows of the matrix op(b) alpha = alpha scalar coefficient for matrix op(a) a = pointer to gpu memory location of matrix a lda = first dimension of a b = pointer to gpu memory location of matrix b ldb = first dimension of b beta = beta scalar coefficient for matrix c c = pointer to gpu memory location of matrix c ldc = first dimension of c
OUTPUT
c = c matrix
SOURCE
776 subroutine abi_gpu_xgemm_cptr(cplx,transa,transb,m,n,k,alpha,a,lda,b,ldb,beta,c,ldc) 777 778 !Arguments ------------------------------------ 779 integer,intent(in) :: cplx,lda,ldb,ldc,m,n,k 780 complex(dpc),intent(in) :: alpha,beta 781 character(len=1),intent(in) :: transa,transb 782 type(c_ptr),intent(in) :: a,b 783 type(c_ptr),intent(in) :: c 784 785 ! ********************************************************************* 786 787 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then 788 ABI_BUG("You requested to run on CPU to a GPU wrapper :/") 789 end if 790 791 #ifdef HAVE_GPU 792 793 call gpu_xgemm(cplx,transa,transb,m,n,k,alpha,& 794 a,lda,b,ldb,beta,c,ldc) 795 796 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 797 ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them. 798 ! Therefore, we issue a stream sync here to avoid 799 !potential mistakes in calling context. 800 call gpu_linalg_stream_synchronize() 801 end if 802 803 #else 804 ! Unused if GPU code disabled 805 ABI_UNUSED((/cplx,lda,ldb,ldc,m,n,k/)) 806 ABI_UNUSED((/alpha,beta/)) 807 ABI_UNUSED((/transa,transb/)) 808 ABI_UNUSED((/a,b,c/)) 809 #endif 810 811 end subroutine abi_gpu_xgemm_cptr
m_abi_gpu_linalg/abi_gpu_xgemm_strided [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_xgemm_strided
FUNCTION
Compute a batched scalar-matrix-matrix product and return a scalar-matrix product on GPU. Meant to be used on non-contiguous matrixes with data is uniformly split in the same number of batches in each matrix. c = alpha * op(a) * op(b) + beta * c
INPUTS
cplx = 1 if real 2 if complex transa = from of op(a) to be used in the matrix multiplication transb = from of op(b) to be used in the matrix multiplication m = number of rows of the matrix op(a) and of the matrix c n = number of rows of the matrix op(b) and the number of columns of the matrix c k = number of columns of the matrix op(a) and the number of rows of the matrix op(b) alpha = alpha scalar coefficient for matrix op(a) a = pointer to gpu memory location of matrix a lda = first dimension of a strideA = stride between each batch in matrix a b = pointer to gpu memory location of matrix b ldb = first dimension of b strideB = stride between each batch in matrix b beta = beta scalar coefficient for matrix c c = pointer to gpu memory location of matrix c ldc = first dimension of c strideC = stride between each batch in matrix c batchCount = number of batches in any matrix
OUTPUT
c = c matrix
SOURCE
1003 subroutine abi_gpu_xgemm_strided_cptr(cplx,transa,transb,m,n,k,alpha,a,lda,strideA,b,ldb,strideB,beta,c,ldc,strideC,batchCount) 1004 1005 !Arguments ------------------------------------ 1006 integer,intent(in) :: cplx,lda,ldb,ldc,m,n,k 1007 integer,intent(in) :: strideA,strideB,strideC,batchCount 1008 complex(dpc),intent(in) :: alpha,beta 1009 character(len=1),intent(in) :: transa,transb 1010 type(c_ptr),intent(in) :: a,b 1011 type(c_ptr),intent(in) :: c 1012 1013 ! ********************************************************************* 1014 1015 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then 1016 ABI_BUG("You requested to run on CPU to a GPU wrapper :/") 1017 end if 1018 1019 #ifdef HAVE_GPU 1020 1021 call gpu_xgemm_strided_batched(cplx,transa,transb,m,n,k,alpha,& 1022 a,lda,strideA,b,ldb,strideB,beta,c,ldc,strideC,batchCount) 1023 1024 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 1025 ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them. 1026 ! Therefore, we issue a stream sync here to avoid 1027 !potential mistakes in calling context. 1028 call gpu_linalg_stream_synchronize() 1029 end if 1030 1031 #else 1032 ! Unused if GPU code disabled 1033 ABI_UNUSED((/cplx,lda,ldb,ldc,m,n,k/)) 1034 ABI_UNUSED((/strideA,strideB,strideC,batchCount/)) 1035 ABI_UNUSED((/alpha,beta/)) 1036 ABI_UNUSED((/transa,transb/)) 1037 ABI_UNUSED((/a,b,c/)) 1038 #endif 1039 1040 end subroutine abi_gpu_xgemm_strided_cptr
m_abi_gpu_linalg/abi_gpu_xheevd [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_xheevd
FUNCTION
Compute a LAPACK SYEVD operation on GPU compute eigen values/vectors of a real symmetric-definite eigenproblem See cusolver documentation https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-heevd See also LAPACK doc in reference implementation: https://github.com/Reference-LAPACK/lapack/blob/master/SRC/dheevd.f
INPUTS
cplx = 1 if real 2 if complex jobz = character, 'n'(eigenvalues only) or 'v' (eigenvalues + eigenvectors) uplo = character, 'u' or 'l' A_nrows = matrix size A = pointer to gpu memory location of matrix A lda = leading dimension of matrix A W = pointer to gpu memory location of matrix W (output eigen values) devInfo =
SOURCE
3003 subroutine abi_gpu_xheevd_cptr(cplx, jobz, uplo, A_nrows, & 3004 A, lda, & 3005 W, & 3006 devInfo) 3007 3008 ! Arguments ------------------------------------ 3009 integer, intent(in ) :: cplx 3010 character(len=1),intent(in ) :: jobz 3011 character(len=1),intent(in ) :: uplo 3012 integer, intent(in ) :: A_nrows 3013 type(c_ptr), intent(in ) :: A 3014 integer, intent(in ) :: lda 3015 type(c_ptr), intent(in) :: W 3016 integer, intent(inout) :: devInfo 3017 3018 ! Local variables ------------------------------ 3019 integer :: bufferSize 3020 type(c_ptr) :: gpu_ptr 3021 3022 ! ************************************************************************* 3023 3024 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then 3025 ABI_BUG("You requested to run on CPU to a GPU wrapper :/") 3026 end if 3027 3028 #ifdef HAVE_GPU 3029 3030 ! probe needed bufferSize 3031 call gpu_xsyevd_buffersize(cplx, jobz, uplo, & 3032 A_nrows, & 3033 A, lda, & 3034 W, & 3035 bufferSize) 3036 3037 select case(cplx) 3038 3039 case (1) 3040 ! resize work array if needed and retrieve work pointer to use 3041 if(abi_linalg_gpu_mode == ABI_GPU_LEGACY & 3042 .or. abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then 3043 call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize) 3044 gpu_ptr = c_loc(r_work_managed) 3045 else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 3046 #ifdef HAVE_OPENMP_GET_MAPPED_PTR 3047 call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize) 3048 gpu_ptr = xomp_get_mapped_ptr(c_loc(r_work)) 3049 #else 3050 call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(1,c_size_t)*bufferSize*dp) 3051 gpu_ptr = gpu_work 3052 #endif 3053 end if 3054 3055 case (2) 3056 ! resize work array if needed and retrieve work pointer to use 3057 if(abi_linalg_gpu_mode == ABI_GPU_LEGACY & 3058 .or. 
abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then 3059 call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize) 3060 gpu_ptr = c_loc(c_work_managed) 3061 else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 3062 #ifdef HAVE_OPENMP_GET_MAPPED_PTR 3063 call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize) 3064 gpu_ptr = xomp_get_mapped_ptr(c_loc(c_work)) 3065 #else 3066 call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(2,c_size_t)*bufferSize*dp) 3067 gpu_ptr = gpu_work 3068 #endif 3069 end if 3070 3071 end select 3072 3073 ! and compute (finally) 3074 call gpu_xsyevd(cplx, jobz, uplo, & 3075 A_nrows, & 3076 A, lda, & 3077 W, & 3078 gpu_ptr, bufferSize, devInfo) 3079 3080 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 3081 ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them. 3082 ! Therefore, we issue a stream sync here to avoid 3083 !potential mistakes in calling context. 3084 call gpu_linalg_stream_synchronize() 3085 end if 3086 3087 #else 3088 ! Unused if GPU code disabled 3089 ABI_UNUSED((/cplx,A_nrows,lda,devInfo,bufferSize/)) 3090 ABI_UNUSED((/jobz,uplo/)) 3091 ABI_UNUSED((/A,W,gpu_ptr/)) 3092 #endif 3093 3094 end subroutine abi_gpu_xheevd_cptr
m_abi_gpu_linalg/abi_gpu_xhegvd [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_xhegvd
FUNCTION
Compute a LAPACK SYGVD operation on GPU compute eigen values/vectors of a real generalized symmetric-definite eigenproblem See cusolver documentation https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-hegvd See also LAPACK doc in reference implementation: https://github.com/Reference-LAPACK/lapack/blob/master/SRC/dhegvd.f
INPUTS
cplx = 1 if real 2 if complex itype = integer, type of problem jobz = character, 'n'(eigenvalues only) or 'v' (eigenvalues + eigenvectors) uplo = character, 'u' or 'l' A_nrows = matrix size A = pointer to gpu memory location of matrix A lda = leading dimension of matrix A B = pointer to gpu memory location of matrix B ldb = leading dimension of matrix B W = pointer to gpu memory location of matrix W (output eigen values) devInfo =
SOURCE
2694 subroutine abi_gpu_xhegvd_cptr(cplx, itype, jobz, uplo, A_nrows, & 2695 A, lda, & 2696 B, ldb, & 2697 W, & 2698 devInfo) 2699 2700 ! Arguments ------------------------------------ 2701 integer, intent(in ) :: cplx 2702 integer, intent(in ) :: itype 2703 character(len=1),intent(in ) :: jobz 2704 character(len=1),intent(in ) :: uplo 2705 integer, intent(in ) :: A_nrows 2706 type(c_ptr), intent(in ) :: A 2707 integer, intent(in ) :: lda 2708 type(c_ptr), intent(in ) :: B 2709 integer, intent(in ) :: ldb 2710 type(c_ptr), intent(in ) :: W 2711 integer, intent(inout) :: devInfo 2712 2713 ! Local variables ------------------------------ 2714 integer :: bufferSize 2715 type(c_ptr) :: gpu_ptr 2716 2717 ! ************************************************************************* 2718 2719 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then 2720 ABI_BUG("You requested to run on CPU to a GPU wrapper :/") 2721 end if 2722 2723 #ifdef HAVE_GPU 2724 2725 ! probe needed bufferSize 2726 call gpu_xsygvd_buffersize(cplx, itype, jobz, uplo, & 2727 A_nrows, & 2728 A, lda, & 2729 B, ldb, & 2730 W, & 2731 bufferSize) 2732 2733 select case(cplx) 2734 2735 case (1) 2736 ! resize work array if needed and retrieve work pointer to use 2737 if(abi_linalg_gpu_mode == ABI_GPU_LEGACY & 2738 .or. abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then 2739 call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize) 2740 gpu_ptr = c_loc(r_work_managed) 2741 else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 2742 #ifdef HAVE_OPENMP_GET_MAPPED_PTR 2743 call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize) 2744 gpu_ptr = xomp_get_mapped_ptr(c_loc(r_work)) 2745 #else 2746 call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(1,c_size_t)*bufferSize*dp) 2747 gpu_ptr = gpu_work 2748 #endif 2749 end if 2750 2751 case (2) 2752 ! resize work array if needed and retrieve work pointer to use 2753 if(abi_linalg_gpu_mode == ABI_GPU_LEGACY & 2754 .or. 
abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then 2755 call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize) 2756 gpu_ptr = c_loc(c_work_managed) 2757 else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 2758 #ifdef HAVE_OPENMP_GET_MAPPED_PTR 2759 call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize) 2760 gpu_ptr = xomp_get_mapped_ptr(c_loc(c_work)) 2761 #else 2762 call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(2,c_size_t)*bufferSize*dp) 2763 gpu_ptr = gpu_work 2764 #endif 2765 end if 2766 2767 end select 2768 2769 ! and compute (finally) 2770 call gpu_xsygvd(cplx, itype, jobz, uplo, & 2771 A_nrows, & 2772 A, lda, & 2773 B, ldb, & 2774 W, & 2775 gpu_ptr, bufferSize, devInfo) 2776 2777 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 2778 ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them. 2779 ! Therefore, we issue a stream sync here to avoid 2780 !potential mistakes in calling context. 2781 call gpu_linalg_stream_synchronize() 2782 end if 2783 2784 #else 2785 ! Unused if GPU code disabled 2786 ABI_UNUSED((/cplx,itype,A_nrows,lda,ldb,devInfo,bufferSize/)) 2787 ABI_UNUSED((/jobz,uplo/)) 2788 ABI_UNUSED((/A,B,W,gpu_ptr/)) 2789 #endif 2790 2791 end subroutine abi_gpu_xhegvd_cptr
m_abi_gpu_linalg/abi_gpu_xpotrf [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_xpotrf
FUNCTION
Compute a LAPACK POTRF operation on GPU: the Cholesky factorization of a symmetric (Hermitian) positive-definite matrix. See cusolver documentation https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-potrf See also LAPACK doc in reference implementation: https://github.com/Reference-LAPACK/lapack/blob/master/SRC/dpotrf.f
INPUTS
cplx = 1 if real 2 if complex uplo = character, 'u' or 'l' A_nrows = matrix size A = pointer to gpu memory location of matrix A lda = leading dimension of matrix A devInfo =
SOURCE
subroutine abi_gpu_xpotrf_cptr(cplx, uplo, A_nrows, &
&                              A, lda, &
&                              devInfo)

! Arguments ------------------------------------
 integer,         intent(in   ) :: cplx    ! 1 for real data, 2 for complex data
 character(len=1),intent(in   ) :: uplo    ! 'u' or 'l'
 integer,         intent(in   ) :: A_nrows ! matrix order
 type(c_ptr),     intent(in   ) :: A       ! device pointer to matrix A (factorized in place)
 integer,         intent(in   ) :: lda     ! leading dimension of A
 integer,         intent(inout) :: devInfo ! solver status reported by the GPU library

! Local variables ------------------------------
 integer     :: bufferSize                 ! scratch size (in elements) required by the solver
 type(c_ptr) :: gpu_ptr                    ! device pointer to the scratch buffer actually used

! *************************************************************************

 ! Guard: this wrapper must only be reached when a GPU mode is active.
 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then
   ABI_BUG("You requested to run on CPU to a GPU wrapper :/")
 end if

#ifdef HAVE_GPU

 ! probe needed bufferSize
 call gpu_xpotrf_buffersize(cplx, uplo, &
&                           A_nrows, &
&                           A, lda, &
&                           bufferSize)

 ! Resize the module-level work array if needed and retrieve the device
 ! work pointer matching the active GPU programming model.
 select case(cplx)

 case (1)
   if(abi_linalg_gpu_mode == ABI_GPU_LEGACY &
&     .or. abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then
     call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize)
     gpu_ptr = c_loc(r_work_managed)
   else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_GET_MAPPED_PTR
     call abi_gpu_work_resize(r_work,r_work_managed,r_work_len,bufferSize)
     gpu_ptr = xomp_get_mapped_ptr(c_loc(r_work))
#else
     ! 1 double per real element
     call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(1,c_size_t)*bufferSize*dp)
     gpu_ptr = gpu_work
#endif
   end if

 case (2)
   if(abi_linalg_gpu_mode == ABI_GPU_LEGACY &
&     .or. abi_linalg_gpu_mode == ABI_GPU_KOKKOS) then
     call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize)
     gpu_ptr = c_loc(c_work_managed)
   else if(abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
#ifdef HAVE_OPENMP_GET_MAPPED_PTR
     call abi_gpu_work_resize(c_work,c_work_managed,c_work_len,bufferSize)
     gpu_ptr = xomp_get_mapped_ptr(c_loc(c_work))
#else
     ! BUGFIX: a complex element occupies 2 doubles, so the byte count must
     ! use a factor 2 (as in abi_gpu_xheevd_cptr/abi_gpu_xhegvd_cptr);
     ! the previous factor 1 allocated only half the required workspace.
     call abi_gpu_work_resizeCptr(gpu_work,gpu_work_len,INT(2,c_size_t)*bufferSize*dp)
     gpu_ptr = gpu_work
#endif
   end if

 end select

 ! and compute (finally)
 call gpu_xpotrf(cplx, uplo, &
&                A_nrows, &
&                A, lda, &
&                gpu_ptr, bufferSize, devInfo)

 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then
   ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them.
   ! Therefore, we issue a stream sync here to avoid
   ! potential mistakes in calling context.
   call gpu_linalg_stream_synchronize()
 end if

#else
 ! Unused if GPU code disabled
 ABI_UNUSED((/cplx,A_nrows,lda,devInfo,bufferSize/))
 ABI_UNUSED((/uplo/))
 ABI_UNUSED((/A,gpu_ptr/))
#endif

end subroutine abi_gpu_xpotrf_cptr
m_abi_gpu_linalg/abi_gpu_xscal [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_xscal
FUNCTION
Compute a BLAS-1 SCAL operation on GPU x = alpha * x
INPUTS
cplx = 1 if real 2 if complex size = vector size alpha = scalar complex value x_gpu = pointer to gpu memory location of array x incrx = stride between consecutive elements of x
SOURCE
1673 subroutine abi_gpu_xscal_cptr(cplx, size, alpha, x, incrx) 1674 1675 ! !Arguments ------------------------------------ 1676 integer, intent(in) :: cplx 1677 integer, intent(in) :: size 1678 complex(dpc), intent(in) :: alpha 1679 type(c_ptr), intent(in) :: x 1680 integer, intent(in) :: incrx 1681 1682 ! ************************************************************************* 1683 1684 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then 1685 ABI_BUG("You requested to run on CPU to a GPU wrapper :/") 1686 end if 1687 1688 #ifdef HAVE_GPU 1689 1690 call gpu_xscal(cplx, size, alpha, x, incrx) 1691 1692 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 1693 ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them. 1694 ! Therefore, we issue a stream sync here to avoid 1695 !potential mistakes in calling context. 1696 call gpu_linalg_stream_synchronize() 1697 end if 1698 1699 #else 1700 ! Unused if GPU code disabled 1701 ABI_UNUSED((/cplx,incrx,size/)) 1702 ABI_UNUSED((/alpha/)) 1703 ABI_UNUSED((/x/)) 1704 #endif 1705 1706 end subroutine abi_gpu_xscal_cptr
m_abi_gpu_linalg/abi_gpu_xsymm [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_xsymm
FUNCTION
Compute a symmetric scalar-matrix-matrix product and return a scalar-matrix product on GPU c = alpha * op(a) * op(b) + beta * c , if side == L c = alpha * op(b) * op(a) + beta * c , if side == R
INPUTS
cplx = 1 if real 2 if complex side = Specifies whether op(a) appears on the left or right of x for the operation to be performed as follows: L or l op(a)*x = alpha*b R or r x*op(a) = alpha*b uplo = Specifies whether the matrix a is an upper or lower triangular matrix as follows: U or u Matrix a is an upper triangular matrix. L or l Matrix a is a lower triangular matrix m = number of rows of the matrix op(a) and of the matrix c n = number of rows of the matrix op(b) and the number of columns of the matrix c alpha = alpha scalar coefficient for matrix op(a) a = pointer to gpu memory location of matrix a lda = first dimension of a b = pointer to gpu memory location of matrix b ldb = first dimension of b beta = beta scalar coefficient for matrix c c = pointer to gpu memory location of matrix c ldc = first dimension of c
OUTPUT
c = c matrix
SOURCE
1237 subroutine abi_gpu_xsymm_cptr(cplx,side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc) 1238 1239 !Arguments ------------------------------------ 1240 integer,intent(in) :: cplx,lda,ldb,ldc,m,n 1241 complex(dpc),intent(in) :: alpha,beta 1242 character(len=1),intent(in) :: side,uplo 1243 type(c_ptr),intent(in) :: a,b 1244 type(c_ptr),intent(in) :: c 1245 1246 ! ********************************************************************* 1247 1248 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then 1249 ABI_BUG("You requested to run on CPU to a GPU wrapper :/") 1250 end if 1251 1252 #ifdef HAVE_GPU 1253 1254 call gpu_xsymm(cplx,side,uplo,m,n,alpha,& 1255 a,lda,b,ldb,beta,c,ldc) 1256 1257 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 1258 ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them. 1259 ! Therefore, we issue a stream sync here to avoid 1260 !potential mistakes in calling context. 1261 call gpu_linalg_stream_synchronize() 1262 end if 1263 1264 #else 1265 ! Unused if GPU code disabled 1266 ABI_UNUSED((/cplx,lda,ldb,ldc,m,n/)) 1267 ABI_UNUSED((/alpha,beta/)) 1268 ABI_UNUSED((/side,uplo/)) 1269 ABI_UNUSED((/a,b,c/)) 1270 #endif 1271 1272 end subroutine abi_gpu_xsymm_cptr
m_abi_gpu_linalg/abi_gpu_xtrsm [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_xtrsm
FUNCTION
Solves a matrix equation (one matrix operand is triangular) on GPU. The xtrsm routines solve one of the following matrix equations op(a)*x = alpha*b or x*op(a) = alpha*b,
INPUTS
cplx= 1 if real 2 if complex side= Specifies whether op(a) appears on the left or right of x for the operation to be performed as follows: L or l op(a)*x = alpha*b R or r x*op(a) = alpha*b uplo= Specifies whether the matrix a is an upper or lower triangular matrix as follows: U or u Matrix a is an upper triangular matrix. L or l Matrix a is a lower triangular matrix transa= Specifies the form of op(a) to be used in the matrix multiplication as follows: N or n op(a) = a T or t op(a) = a' C or c op(a) = conjg(a') diag= Specifies whether or not a is unit triangular as follows: U or u Matrix a is assumed to be unit triangular. N or n Matrix a is not assumed to be unit triangular. m= Specifies the number of rows of b. The value of m must be at least zero n= Specifies the number of columns of b. The value of n must be at least zero alpha= Specifies the scalar alpha. When alpha is zero, then a is not referenced and b need not be set before entry. a = pointer to gpu memory location of array a, DIMENSION (lda, k), where k is m when side = 'L' or 'l' and is n when side = 'R' or 'r'. lda= Specifies the first dimension of a as declared in the calling (sub)program. When side = 'L' or 'l', then lda must be at least max(1, m), when side = 'R' or 'r', then lda must be at least max(1, n). b = pointer to gpu memory location of b Array, DIMENSION (ldb,n). Before entry, the leading m-by-n part of the array b must contain the right-hand side matrix b. ldb= Specifies the first dimension of b as declared in the calling (sub)program. The value of ldb must be at least max(1, m).
OUTPUT
b
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU_CUDA is not enabled the correct one is in 17_toolbox/gpu_linalg.cu
SOURCE
2253 subroutine abi_gpu_xtrsm_cptr(cplx,side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb) 2254 2255 ! !Arguments ------------------------------------ 2256 integer, intent(in) :: cplx,lda,ldb,m,n 2257 complex(dpc), intent(in) :: alpha 2258 character(len=1), intent(in) :: side,uplo,transa,diag 2259 type(c_ptr),intent(in) :: a 2260 type(c_ptr),intent(in) :: b 2261 2262 ! ********************************************************************* 2263 2264 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then 2265 ABI_BUG("You requested to run on CPU to a GPU wrapper :/") 2266 end if 2267 2268 #ifdef HAVE_GPU 2269 2270 call gpu_xtrsm(cplx,side,uplo,transa,diag,m,n,alpha,a,lda,b,ldb) 2271 2272 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 2273 ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them. 2274 ! Therefore, we issue a stream sync here to avoid 2275 !potential mistakes in calling context. 2276 call gpu_linalg_stream_synchronize() 2277 end if 2278 2279 #else 2280 ! Unused if GPU code disabled 2281 ABI_UNUSED((/cplx,lda,ldb,m,n/)) 2282 ABI_UNUSED((/alpha/)) 2283 ABI_UNUSED((/side,uplo,transa,diag/)) 2284 ABI_UNUSED((/a,b/)) 2285 #endif 2286 2287 end subroutine abi_gpu_xtrsm_cptr
m_abi_gpu_linalg/abi_gpu_zhemm [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
abi_gpu_zhemm
FUNCTION
Compute a Hermitian scalar-matrix-matrix product and return a scalar-matrix product on GPU c = alpha * op(a) * op(b) + beta * c , if side == L c = alpha * op(b) * op(a) + beta * c , if side == R
INPUTS
side = Specifies whether op(a) appears on the left or right of x for the operation to be performed as follows: L or l op(a)*x = alpha*b R or r x*op(a) = alpha*b uplo = Specifies whether the matrix a is an upper or lower triangular matrix as follows: U or u Matrix a is an upper triangular matrix. L or l Matrix a is a lower triangular matrix m = number of rows of the matrix op(a) and of the matrix c n = number of rows of the matrix op(b) and the number of columns of the matrix c alpha = alpha scalar coefficient for matrix op(a) a_gpu = pointer to gpu memory location of matrix a lda = first dimension of a b_gpu = pointer to gpu memory location of matrix b ldb = first dimension of b beta = beta scalar coefficient for matrix c c_gpu = pointer to gpu memory location of matrix c ldc = first dimension of c
OUTPUT
c = c matrix
SOURCE
1464 subroutine abi_gpu_zhemm_cptr(side,uplo,m,n,alpha,a,lda,b,ldb,beta,c,ldc) 1465 1466 !Arguments ------------------------------------ 1467 integer,intent(in) :: lda,ldb,ldc,m,n 1468 complex(dpc),intent(in) :: alpha,beta 1469 character(len=1),intent(in) :: side,uplo 1470 type(c_ptr),intent(in) :: a,b 1471 type(c_ptr),intent(in) :: c 1472 1473 ! ********************************************************************* 1474 1475 if (abi_linalg_gpu_mode == ABI_GPU_DISABLED) then 1476 ABI_BUG("You requested to run on CPU to a GPU wrapper :/") 1477 end if 1478 1479 #ifdef HAVE_GPU 1480 1481 call gpu_zhemm(side,uplo,m,n,alpha,& 1482 a,lda,b,ldb,beta,c,ldc) 1483 1484 if (abi_linalg_gpu_mode == ABI_GPU_OPENMP) then 1485 ! CUDA/HIP linalg calls are run asynchronously and OpenMP is unaware of them. 1486 ! Therefore, we issue a stream sync here to avoid 1487 !potential mistakes in calling context. 1488 call gpu_linalg_stream_synchronize() 1489 end if 1490 1491 #else 1492 ! Unused if GPU code disabled 1493 ABI_UNUSED((/lda,ldb,ldc,m,n/)) 1494 ABI_UNUSED((/alpha,beta/)) 1495 ABI_UNUSED((/side,uplo/)) 1496 ABI_UNUSED((/a,b,c/)) 1497 #endif 1498 1499 end subroutine abi_gpu_zhemm_cptr
m_abi_gpu_linalg/alloc_on_gpu [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
alloc_on_gpu
FUNCTION
Allocate size byte in gpu memory and returns in gpu_ptr this location
INPUTS
size= size in byte to allocate
OUTPUT
gpu_ptr= C_PTR on gpu memory location that has been allocated
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/dev_spec.cu
SOURCE
98 subroutine alloc_on_gpu(gpu_ptr,size) 99 100 !Arguments ------------------------------------ 101 type(c_ptr), intent(inout) :: gpu_ptr 102 integer(kind=c_size_t), intent(in) :: size ! size in bytes to allocate 103 104 ABI_UNUSED(gpu_ptr) 105 ABI_UNUSED(size) 106 107 end subroutine alloc_on_gpu
m_abi_gpu_linalg/check_gpu_mem [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
check_gpu_mem
FUNCTION
Print information about amount of free memory on GPU and total amount of memory on GPU (current device).
INPUTS
str is a string message (character array).
OUTPUT
None
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/dev_spec.cu
SOURCE
subroutine check_gpu_mem(str)

!Arguments ------------------------------------
 character (KIND=c_char), intent(in), target :: str(*)
!Local variables ------------------------------
 type(c_ptr) :: unused_ptr

 ! Dummy fallback (real version in 17_gpu_toolbox/dev_spec.cu);
 ! the dead branch below only silences "unused argument" warnings.
 if(.false.) unused_ptr=c_loc(str)

end subroutine check_gpu_mem
m_abi_gpu_linalg/copy_from_gpu [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
copy_from_gpu
FUNCTION
copy size byte from gpu memory (pointed by gpu_ptr) to cpu memory (pointed by cpu_ptr)
INPUTS
size_in_bytes = number of bytes to transfer; gpu_ptr = C_PTR to the GPU memory location to read from
OUTPUT
dtab = fortran tab which will contains data
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/dev_spec.cu
SOURCE
subroutine copy_from_gpu(dtab,gpu_ptr,size_in_bytes)

!Arguments ------------------------------------
 real(dp),dimension(*) :: dtab
 type(c_ptr) :: gpu_ptr
 integer(kind=c_size_t), intent(in) :: size_in_bytes ! size in bytes (to be transferred)

! *********************************************************************

 ! Dummy fallback: the real implementation is in 17_gpu_toolbox/dev_spec.cu.
 ! The original declared a local cpu_ptr solely to mark it unused; that dead
 ! local has been removed. The dead write below silences the "unused
 ! argument" warning for dtab.
 if(.false.) write(std_out,*) dtab(1)
 ABI_UNUSED(gpu_ptr)
 ABI_UNUSED(size_in_bytes)

end subroutine copy_from_gpu
m_abi_gpu_linalg/copy_gpu_to_gpu [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
copy_gpu_to_gpu
FUNCTION
copy size byte from gpu (src) to gpu (dest)
INPUTS
size_in_bytes = number of bytes to copy; src_gpu_ptr = C_PTR to the source GPU memory
OUTPUT
dest_gpu_ptr = C_PTR on gpu memory
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU_CUDA is not enabled the correct one is in 17_gpu_toolbox/dev_spec.cu
PARENTS
lobpcgwf,m_abi_gpu_linalg
SOURCE
subroutine copy_gpu_to_gpu(cpu_ptr,gpu_ptr,size_in_bytes)

!Arguments ------------------------------------
 type(c_ptr) :: cpu_ptr ! NOTE(review): despite its name, the routine doc describes this as the destination GPU pointer — confirm
 type(c_ptr) :: gpu_ptr ! source GPU pointer, per the routine documentation
 integer(kind=c_size_t), intent(in) :: size_in_bytes ! number of bytes to transfer

 ! Dummy fallback; the real version is in 17_gpu_toolbox/dev_spec.cu.
 ABI_UNUSED(size_in_bytes)
 ABI_UNUSED(gpu_ptr)
 ABI_UNUSED(cpu_ptr)

end subroutine copy_gpu_to_gpu
m_abi_gpu_linalg/copy_on_gpu [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
copy_on_gpu
FUNCTION
copy size byte from cpu (pointed by cpu_ptr) to gpu memory (pointed by gpu_ptr)
INPUTS
size_in_bytes = number of bytes to transfer; dtab = fortran array to copy from
OUTPUT
gpu_ptr= C_PTR on gpu memory location
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/dev_spec.cu
SOURCE
subroutine copy_on_gpu(dtab,gpu_ptr,size_in_bytes)

!Arguments ------------------------------------
 real(dp),dimension(*) :: dtab
 type(c_ptr) :: gpu_ptr
 integer(kind=c_size_t), intent(in) :: size_in_bytes ! size in bytes (to be transferred)

! *********************************************************************

 ! Dummy fallback: the real implementation is in 17_gpu_toolbox/dev_spec.cu.
 ! The original declared a local cpu_ptr solely to mark it unused; that dead
 ! local has been removed. The dead write below silences the "unused
 ! argument" warning for dtab.
 if(.false.) write(std_out,*) dtab(1)
 ABI_UNUSED(gpu_ptr)
 ABI_UNUSED(size_in_bytes)

end subroutine copy_on_gpu
m_abi_gpu_linalg/dealloc_on_gpu [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
dealloc_on_gpu
FUNCTION
free memory location pointed by gpu_ptr
INPUTS
OUTPUT
gpu_ptr= C_PTR on gpu memory location that has been allocated
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/dev_spec.cu
SOURCE
subroutine dealloc_on_gpu(gpu_ptr)

!Arguments ------------------------------------
 type(c_ptr) :: gpu_ptr

 ! Dummy fallback: nothing to free without GPU support;
 ! the real implementation is in 17_gpu_toolbox/dev_spec.cu.
 ABI_UNUSED(gpu_ptr)

end subroutine dealloc_on_gpu
m_abi_gpu_linalg/gpu_allocated_impl [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_allocated_impl
FUNCTION
Check if pointer points to allocated gpu device memory.
INPUTS
gpu_ptr= C_PTR on gpu memory location
OUTPUT
is_allocate= logical(c_bool) : true (if allocated), false (if not allocated)
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/dev_spec.cu
PARENTS
lobpcgwf
SOURCE
subroutine gpu_allocated_impl(gpu_ptr, is_allocated)

!Arguments ------------------------------------
 type(c_ptr) :: gpu_ptr
 logical(kind=c_bool), intent(out) :: is_allocated

 ! Without GPU support no device memory can ever be allocated,
 ! so this dummy fallback always reports "not allocated".
 is_allocated = .false.

 ABI_UNUSED(gpu_ptr)

end subroutine gpu_allocated_impl
m_abi_gpu_linalg/gpu_device_synchronize [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_device_synchronize
FUNCTION
Wait for any running operation, compute and memory transfer, to complete on GPU.
INPUTS
None
OUTPUT
None
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/dev_spec.cu
SOURCE
subroutine gpu_device_synchronize()
 use, intrinsic :: iso_c_binding
 implicit none
 ! Dummy fallback: there is nothing to wait for without a GPU;
 ! the real implementation is in 17_gpu_toolbox/dev_spec.cu.
end subroutine gpu_device_synchronize
m_abi_gpu_linalg/gpu_linalg_init [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_linalg_init
FUNCTION
Initialisation of the linear algebra environment on the GPU
INPUTS
OUTPUT
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/gpu_linalg.cu
SOURCE
subroutine gpu_linalg_init()

 ! Dummy fallback: no handles or streams to create without GPU support;
 ! the real implementation is in 17_gpu_toolbox/gpu_linalg.cu.

end subroutine gpu_linalg_init
m_abi_gpu_linalg/gpu_linalg_shutdown [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_linalg_shutdown
FUNCTION
Close the linear algebra environment on the GPU
INPUTS
OUTPUT
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/gpu_linalg.cu
SOURCE
subroutine gpu_linalg_shutdown()

 ! Dummy fallback: no handles or streams to release without GPU support;
 ! the real implementation is in 17_gpu_toolbox/gpu_linalg.cu.

end subroutine gpu_linalg_shutdown
m_abi_gpu_linalg/gpu_managed_ptr_status [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_managed_ptr_status_impl
FUNCTION
Print information about a managed pointer (host or device address when accessible).
INPUTS
gpu_ptr= C_PTR on gpu memory location
OUTPUT
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/dev_spec.cu
PARENTS
SOURCE
subroutine gpu_managed_ptr_status(gpu_ptr, str)

!Arguments ------------------------------------
 type(c_ptr) :: gpu_ptr
 character (KIND=c_char), intent(in), target :: str(*)
!Local variables ------------------------------
 type(c_ptr) :: unused_ptr

 ! Dummy fallback (real version in 17_gpu_toolbox/dev_spec.cu);
 ! the statements below only silence "unused argument" warnings.
 if(.false.) unused_ptr=c_loc(str)
 ABI_UNUSED(gpu_ptr)

end subroutine gpu_managed_ptr_status
m_abi_gpu_linalg/gpu_memset [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_memset
FUNCTION
Initializes or sets device memory to a value.
INPUTS
gpu_ptr= C_PTR on gpu memory location val= value used to initialized each bytes size= number of bytes to initialize
OUTPUT
gpu_ptr= C_PTR on gpu memory location
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/dev_spec.cu
PARENTS
lobpcgwf
SOURCE
subroutine gpu_memset(gpu_ptr, val, array_size)

!Arguments ------------------------------------
 type(c_ptr) :: gpu_ptr
 integer(kind=c_int32_t), intent(in) :: val        ! value used to fill the buffer
 integer(kind=c_size_t), intent(in) :: array_size  ! number of bytes to set

 ! Dummy fallback; the real version is in 17_gpu_toolbox/dev_spec.cu.
 ABI_UNUSED(array_size)
 ABI_UNUSED(val)
 ABI_UNUSED(gpu_ptr)

end subroutine gpu_memset
m_abi_gpu_linalg/gpu_xaxpy [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_xaxpy
FUNCTION
Compute a BLAS-1 AXPY operation on GPU y = alpha * x + y
INPUTS
cplx = 1 if real 2 if complex size = vector size alpha = scalar complex value x_gpu = pointer to gpu memory location of array x incrx = stride between consecutive elements of x y_gpu = pointer to gpu memory location of array y incry = stride between consecutive elements of y
SOURCE
subroutine gpu_xaxpy(cplx, size, alpha, x_gpu, incrx, y_gpu, incry)

! !Arguments ------------------------------------
 integer, intent(in) :: cplx          ! 1 for real data, 2 for complex
 integer, intent(in) :: size          ! number of vector elements
 complex(dpc), intent(in) :: alpha    ! scalar coefficient
 type(c_ptr), intent(in) :: x_gpu     ! device pointer to x
 integer, intent(in) :: incrx         ! stride between elements of x
 type(c_ptr), intent(inout) :: y_gpu  ! device pointer to y (updated in place)
 integer, intent(in) :: incry         ! stride between elements of y

 ! Dummy fallback used when GPU support is disabled.
 ABI_UNUSED((/cplx,size,incrx,incry/))
 ABI_UNUSED(alpha)
 ABI_UNUSED_A(x_gpu)
 ABI_UNUSED_A(y_gpu)
end subroutine gpu_xaxpy
m_abi_gpu_linalg/gpu_xcopy [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_xcopy
FUNCTION
Compute a BLAS-1 COPY operation on GPU y = x (copy x into y)
INPUTS
cplx = 1 if real 2 if complex size = input vector size x_gpu = pointer to gpu memory location of array x incrx = stride between consecutive elements of x y_gpu = pointer to gpu memory location of array y incry = stride between consecutive elements of y
SOURCE
subroutine gpu_xcopy(cplx, size, x_gpu, incrx, y_gpu, incry)

! !Arguments ------------------------------------
 integer, intent(in) :: cplx          ! 1 for real data, 2 for complex
 integer, intent(in) :: size          ! number of vector elements
 type(c_ptr), intent(in) :: x_gpu     ! device pointer to source x
 integer, intent(in) :: incrx         ! stride between elements of x
 type(c_ptr), intent(inout) :: y_gpu  ! device pointer to destination y
 integer, intent(in) :: incry         ! stride between elements of y

 ! Dummy fallback used when GPU support is disabled.
 ABI_UNUSED((/cplx,size,incrx,incry/))
 ABI_UNUSED_A(x_gpu)
 ABI_UNUSED_A(y_gpu)
end subroutine gpu_xcopy
m_abi_gpu_linalg/gpu_xgemm [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_xgemm
FUNCTION
Compute a scalar-matrix-matrix product and return a scalar-matrix product on GPU c = alpha * op(a) * op(b) + beta * c
INPUTS
cplx = 1 if real 2 if complex transa= form of op(a) to be used in the matrix multiplication transb= form of op(b) to be used in the matrix multiplication m = number of rows of the matrix op(a) and of the matrix c n = number of columns of the matrix op(b) and of the matrix c k = number of columns of the matrix op(a) and the number of rows of the matrix op(b) alpha = alpha scalar coefficient for matrix op(a) a_gpu = pointer to gpu memory location of matrix a lda = first dimension of a b_gpu = pointer to gpu memory location of matrix b ldb = first dimension of b beta = beta scalar coefficient for matrix c c_gpu = pointer to gpu memory location of matrix c ldc = first dimension of c
OUTPUT
c = c matrix
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/gpu_linalg.cu
SOURCE
subroutine gpu_xgemm(cplx,transa,transb,m,n,k,alpha,a_gpu,lda,b_gpu,ldb,beta,c_gpu,ldc)

!Arguments ------------------------------------
 integer,intent(in) :: cplx,lda,ldb,ldc,m,n,k
 complex(dpc),intent(in) :: alpha,beta
 character(len=1),intent(in) :: transa,transb
 type(c_ptr),intent(in) :: a_gpu,b_gpu
 type(c_ptr),intent(inout) :: c_gpu
!Local variables ------------------------------
 type(c_ptr) :: void_ptr

! *********************************************************************

 ! Dummy fallback (real version in 17_gpu_toolbox/gpu_linalg.cu);
 ! the dead branch below only silences "unused argument" warnings.
 if (.false.) then
   void_ptr=c_gpu;void_ptr=b_gpu;void_ptr=a_gpu
   write(std_out,*) transa,transb,cplx,lda,ldb,ldc,m,n,k,alpha,beta
 end if

end subroutine gpu_xgemm
m_abi_gpu_linalg/gpu_xscal [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_xscal
FUNCTION
Compute a BLAS-1 SCAL operation on GPU x = alpha * x
INPUTS
cplx = 1 if real 2 if complex size = vector size alpha = scalar complex value x_gpu = pointer to gpu memory location of array x incrx = stride between consecutive elements of x
SOURCE
subroutine gpu_xscal(cplx, size, alpha, x_gpu, incrx)

! !Arguments ------------------------------------
 integer, intent(in) :: cplx        ! 1 for real data, 2 for complex
 integer, intent(in) :: size        ! number of vector elements
 complex(dpc), intent(in) :: alpha  ! scalar multiplier
 type(c_ptr), intent(in) :: x_gpu   ! device pointer to x (scaled in place)
 integer, intent(in) :: incrx       ! stride between elements of x

 ! Dummy fallback used when GPU support is disabled.
 ABI_UNUSED((/cplx,size,incrx/))
 ABI_UNUSED(alpha)
 ABI_UNUSED_A(x_gpu)
end subroutine gpu_xscal
m_abi_gpu_linalg/gpu_xsygvd [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_xsygvd
FUNCTION
Compute a LAPACK SYGVD operation on GPU compute eigen values/vectors of a real generalized symmetric-definite eigenproblem See cusolver documentation https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-sygvd See also LAPACK doc in reference implementation: https://github.com/Reference-LAPACK/lapack/blob/master/SRC/dsygvd.f
INPUTS
cplx = 1 if real 2 if complex itype = integer, type of problem jobz = character, 'n'(eigenvalues only) or 'v' (eigenvalues + eigenvectors) uplo = character, 'u' or 'l' A_nrows = matrix size A_ptr = pointer to gpu memory location of matrix A lda = leading dimension of matrix A B_ptr = pointer to gpu memory location of matrix B ldb = leading dimension of matrix B W_ptr = pointer to gpu memory location of matrix W (output eigen values) work_ptr = lwork = devInfo =
SOURCE
subroutine gpu_xsygvd(cplx, itype, jobz, uplo, A_nrows, &
     &                A_ptr, lda, &
     &                B_ptr, ldb, &
     &                W_ptr, &
     &                work_ptr, lwork, &
     &                devInfo)

! Arguments ------------------------------------
 integer,         intent(in   ) :: cplx      ! 1 for real, 2 for complex
 integer,         intent(in   ) :: itype     ! type of the generalized eigenproblem
 character(len=1),intent(in   ) :: jobz      ! 'n' (eigenvalues) or 'v' (eigenvalues+vectors)
 character(len=1),intent(in   ) :: uplo      ! 'u' or 'l'
 integer,         intent(in   ) :: A_nrows   ! matrix order
 type(c_ptr),     intent(in   ) :: A_ptr     ! device pointer to matrix A
 integer,         intent(in   ) :: lda       ! leading dimension of A
 type(c_ptr),     intent(in   ) :: B_ptr     ! device pointer to matrix B
 integer,         intent(in   ) :: ldb       ! leading dimension of B
 type(c_ptr),     intent(inout) :: W_ptr     ! device pointer to output eigenvalues
 type(c_ptr),     intent(inout) :: work_ptr  ! device work buffer
 integer,         intent(in   ) :: lwork     ! work buffer size
 integer,         intent(inout) :: devInfo   ! solver status flag

 ! Dummy fallback used when GPU support is disabled.
 ABI_UNUSED((/A_ptr,B_ptr,W_ptr,work_ptr/))
 ABI_UNUSED((/jobz,uplo/))
 ABI_UNUSED((/cplx,itype,A_nrows,lda,ldb,lwork,devInfo/))
end subroutine gpu_xsygvd
m_abi_gpu_linalg/gpu_xsygvd_bufferSize [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_xsygvd_bufferSize
FUNCTION
Compute required size for auxiliary work buffer used internally by cusolver in cusolverDnDsygvd / cusolverDnZhegvd See cusolver documentation https://docs.nvidia.com/cuda/cusolver/index.html#cuSolverDN-lt-t-gt-sygvd
INPUTS
cplx = 1 if real 2 if complex itype = integer, type of problem jobz = character, 'n'(eigenvalues only) or 'v' (eigenvalues + eigenvectors) uplo = character, 'u' or 'l' A_nrows = matrix size A_ptr = pointer to gpu memory location of matrix A lda = leading dimension of matrix A B_ptr = pointer to gpu memory location of matrix B ldb = leading dimension of matrix B W_ptr = pointer to gpu memory location of matrix W (output eigen values) work_ptr = lwork = devInfo =
SOURCE
subroutine gpu_xsygvd_bufferSize(cplx, itype, jobz, uplo, A_nrows, &
     &                           A_ptr, lda, &
     &                           B_ptr, ldb, &
     &                           W_ptr, &
     &                           lwork)

! Arguments ------------------------------------
 integer,         intent(in   ) :: cplx     ! 1 for real, 2 for complex
 integer,         intent(in   ) :: itype    ! type of the generalized eigenproblem
 character(len=1),intent(in   ) :: jobz     ! 'n' (eigenvalues) or 'v' (eigenvalues+vectors)
 character(len=1),intent(in   ) :: uplo     ! 'u' or 'l'
 integer,         intent(in   ) :: A_nrows  ! matrix order
 type(c_ptr),     intent(in   ) :: A_ptr    ! device pointer to matrix A
 integer,         intent(in   ) :: lda      ! leading dimension of A
 type(c_ptr),     intent(in   ) :: B_ptr    ! device pointer to matrix B
 integer,         intent(in   ) :: ldb      ! leading dimension of B
 type(c_ptr),     intent(inout) :: W_ptr    ! device pointer to eigenvalue array
 integer,         intent(in   ) :: lwork    ! work buffer size

 ! Dummy fallback used when GPU support is disabled.
 ABI_UNUSED((/A_ptr,B_ptr,W_ptr/))
 ABI_UNUSED((/jobz,uplo/))
 ABI_UNUSED((/cplx,itype,A_nrows,lda,ldb,lwork/))
end subroutine gpu_xsygvd_bufferSize
m_abi_gpu_linalg/gpu_xtrsm [ Functions ]
[ Top ] [ m_abi_gpu_linalg ] [ Functions ]
NAME
gpu_xtrsm
FUNCTION
Solves a matrix equation (one matrix operand is triangular) on GPU. The xtrsm routines solve one of the following matrix equations op(a)*x = alpha*b or x*op(a) = alpha*b,
INPUTS
cplx= 1 if real 2 if complex side= Specifies whether op(a) appears on the left or right of x for the operation to be performed as follows: L or l op(a)*x = alpha*b R or r x*op(a) = alpha*b uplo= Specifies whether the matrix a is an upper or lower triangular matrix as follows: U or u Matrix a is an upper triangular matrix. L or l Matrix a is a lower triangular matrix transa= Specifies the form of op(a) to be used in the matrix multiplication as follows: N or n op(a) = a T or t op(a) = a' C or c op(a) = conjg(a') diag= Specifies whether or not a is unit triangular as follows: U or u Matrix a is assumed to be unit triangular. N or n Matrix a is not assumed to be unit triangular. m= Specifies the number of rows of b. The value of m must be at least zero n= Specifies the number of columns of b. The value of n must be at least zero alpha= Specifies the scalar alpha. When alpha is zero, then a is not referenced and b need not be set before entry. a_gpu = pointer to gpu memory location of array a, DIMENSION (lda, k), where k is m when side = 'L' or 'l' and is n when side = 'R' or 'r'. lda= Specifies the first dimension of a as declared in the calling (sub)program. When side = 'L' or 'l', then lda must be at least max(1, m), when side = 'R' or 'r', then lda must be at least max(1, n). b_gpu = pointer to gpu memory location of b Array, DIMENSION (ldb,n). Before entry, the leading m-by-n part of the array b must contain the right-hand side matrix b. ldb= Specifies the first dimension of b as declared in the calling (sub)program. The value of ldb must be at least max(1, m).
OUTPUT
b_gpu
SIDE EFFECTS
WARNING! : this routine is a dummy one when HAVE_GPU is not enabled the correct one is in 17_gpu_toolbox/gpu_linalg.cu
SOURCE
subroutine gpu_xtrsm(cplx,side,uplo,transa,diag,m,n,alpha,a_gpu,lda,b_gpu,ldb)

! !Arguments ------------------------------------
 integer, intent(in) :: cplx,lda,ldb,m,n
 complex(dpc), intent(in) :: alpha
 character(len=1), intent(in) :: side,uplo,transa,diag
 type(c_ptr),intent(in) :: a_gpu
 type(c_ptr),intent(inout) :: b_gpu
!Local variables ------------------------------
 type(c_ptr) :: void_ptr

! *********************************************************************

 ! Dummy fallback (real version in 17_gpu_toolbox/gpu_linalg.cu);
 ! the dead branch below only silences "unused argument" warnings.
 if (.false.) then
   void_ptr=b_gpu;void_ptr=a_gpu
   write(std_out,*) side,uplo,transa,diag,cplx,lda,ldb,m,n,alpha
 end if

end subroutine gpu_xtrsm
m_abi_linalg/gpu_xorthonormalize [ Functions ]
[ Top ] [ m_abi_linalg ] [ Functions ]
NAME
gpu_xorthonormalize
FUNCTION
This routine computes the overlap of two complex wavefunctions (for a given number of bands) and orthonormalizes it using gpu: - Computes the products of two rectangular matrices containing the wavefunctions psi and S.psi (where S is the overlap (with the PAW terms if necessary)). - Does a Cholesky decomposition of this overlap - rotates the initial matrix blockvectorx by the triangular matrix to have an orthonormal set of wavefunctions
INPUTS
blockvectorbx = matrix of dimension (blocksize,vectsize) as a GPU ptr (e.g. block of overlap*wavefunction) blocksize = dimension of matrices (e.g number of bands) spaceComm = communicator used for MPI parallelization vectsize = dimension of matrices (e.g number of G vector)
OUTPUT
sqgram = Choleski decomposition of transpose(blockvector)*blockvectorx as a GPU ptr
SIDE EFFECTS
blockvectorx = on input, matrix of dimension (vectsize,blocksize) as a GPU ptr (e.g block of wavefunction) blockvectorx = on output, orthonormalized wavefunction. as a GPU ptr
SOURCE
subroutine gpu_xorthonormalize(blockvectorx_gpu,blockvectorbx_gpu,blocksize,spaceComm,&
&                              sqgram_gpu,vectsize,&
&                              x_cplx,timopt,tim_xortho) ! optional arguments

 use, intrinsic :: iso_c_binding
 implicit none

!Arguments ------------------------------------
!scalars
 integer,intent(in) :: blocksize,spaceComm,vectsize,x_cplx
 integer, intent(in), optional :: timopt,tim_xortho
!arrays
 type(c_ptr),intent(inout) :: blockvectorbx_gpu, blockvectorx_gpu, sqgram_gpu
!Local variables-------------------------------
#if defined HAVE_GPU
 integer :: ierr,info
 real(dp), dimension(:,:),allocatable, target :: d_sqgram
 complex(dpc),dimension(:,:),allocatable, target :: z_sqgram
 character :: tr
 real(dp) :: tsec(2)
 ! Renamed from "size" (which shadowed the intrinsic of the same name).
 integer(c_size_t) :: nbytes
#else
 type(c_ptr) :: cptr_a
#endif
 character(len=500) :: message

! *********************************************************************

#if defined HAVE_GPU
 if (present(tim_xortho).and.present(timopt)) then
   if(abs(timopt)==3) then
     call timab(tim_xortho,1,tsec)
   end if
 end if

 ! Real case: overlap is symmetric, use plain transpose; complex case:
 ! overlap is Hermitian, use conjugate transpose.
 if ( x_cplx == 1 ) then
   tr='t'
   ABI_MALLOC(d_sqgram,(blocksize,blocksize))
 else
   tr='c'
   ABI_MALLOC(z_sqgram,(blocksize,blocksize))
 end if

 ! Overlap matrix: sqgram = blockvectorx^T(H) * blockvectorbx
 call gpu_xgemm(x_cplx,tr,'n',blocksize,blocksize,vectsize, &
&               cone,blockvectorx_gpu,vectsize,blockvectorbx_gpu,vectsize,czero,sqgram_gpu,blocksize)
 call gpu_device_synchronize()

 ! Byte count of the overlap matrix, computed once in c_size_t so the
 ! product cannot overflow a default integer; used by both branches below
 ! (previously the real branch re-derived it inline while the complex
 ! branch used a default-integer product).
 nbytes = int(x_cplx, c_size_t) * dp * blocksize * blocksize

 ! Bring the overlap back to the host, reduce over MPI, Cholesky-factorize,
 ! then push the factor back to the device.
 if ( x_cplx == 1 ) then
   call copy_from_gpu(d_sqgram, sqgram_gpu, nbytes)
   call xmpi_sum(d_sqgram,spaceComm,ierr)
   call abi_xpotrf('u',blocksize,d_sqgram,blocksize,info)
   call copy_on_gpu(d_sqgram, sqgram_gpu, nbytes)
 else
   call copy_from_gpu(z_sqgram, sqgram_gpu, nbytes)
   call xmpi_sum(z_sqgram,spaceComm,ierr)
   call abi_xpotrf('u',blocksize,z_sqgram,blocksize,info)
   call copy_on_gpu(z_sqgram, sqgram_gpu, nbytes)
 end if

 if (info /= 0 ) then
   write(message,'(a,i3)') ' xpotrf, info=',info
   ABI_WARNING(message)
 end if

 ! Rotate blockvectorx by the inverse of the upper Cholesky factor so the
 ! resulting block of wavefunctions is orthonormal.
 call gpu_xtrsm(x_cplx,'r','u','n','n',vectsize,blocksize,cone,sqgram_gpu,blocksize,&
&               blockvectorx_gpu,vectsize)

 if(x_cplx==1) then
   ABI_FREE(d_sqgram)
 else
   ABI_FREE(z_sqgram)
 end if
 if (present(tim_xortho).and.present(timopt)) then
   if(abs(timopt)==3) then
     call timab(tim_xortho,2,tsec)
   end if
 end if
 return

#else
 message=' This routine is not allowed when running on GPU is disabled !'
 ABI_BUG(message)
 if (.false.) then
   write(std_out,*) blocksize,vectsize,spaceComm,x_cplx
   if (present(timopt)) write(std_out,*) timopt
   if (present(tim_xortho)) write(std_out,*) tim_xortho
   cptr_a=blockvectorbx_gpu;cptr_a=blockvectorx_gpu;cptr_a=sqgram_gpu
 end if
#endif

end subroutine gpu_xorthonormalize