TABLE OF CONTENTS
- ABINIT/m_invovl
- m_invovl/apply_block
- m_invovl/apply_invovl
- m_invovl/destroy_invovl
- m_invovl/init_invovl
- m_invovl/invovl_kpt_type
- m_invovl/make_invovl
- m_invovl/make_invovl_kpt_gpu
- m_invovl/solve_inner
ABINIT/m_invovl [ Modules ]
NAME
m_invovl
FUNCTION
Provides functions to invert the overlap matrix S. Used by Chebyshev in PAW See paper by A. Levitt and M. Torrent for details S = 1 + projs * s_projs * projs' S^-1 = 1 + projs * inv_s_projs * projs', with inv_s_projs = - (s_projs^-1 + projs'*projs)^-1
COPYRIGHT
Copyright (C) 2013-2022 ABINIT group (AL) This file is distributed under the terms of the GNU General Public License, see ~abinit/COPYING or http://www.gnu.org/copyleft/gpl.txt .
SOURCE
20 #if defined HAVE_CONFIG_H 21 #include "config.h" 22 #endif 23 24 #include "abi_common.h" 25 26 ! nvtx related macro definition 27 #include "nvtx_macros.h" 28 29 MODULE m_invovl 30 31 use defs_basis 32 use m_errors 33 use m_xmpi 34 use m_abicore 35 36 use defs_abitypes, only : mpi_type 37 use m_time, only : timab 38 use m_hamiltonian, only : gs_hamiltonian_type 39 use m_bandfft_kpt, only : bandfft_kpt_get_ikpt 40 use m_pawcprj, only : pawcprj_type, pawcprj_alloc, pawcprj_free, pawcprj_axpby 41 use m_nonlop, only : nonlop 42 use m_prep_kgb, only : prep_nonlop 43 44 #ifdef HAVE_FC_ISO_C_BINDING 45 use, intrinsic :: iso_c_binding, only : c_ptr, c_int32_t, c_int64_t, c_float, c_double, c_size_t, c_loc 46 #endif 47 48 #if defined(HAVE_GPU_CUDA) && defined(HAVE_GPU_NVTX_V3) 49 use m_nvtx_data 50 #endif 51 52 implicit none 53 54 private 55 56 !public procedures. 57 public :: init_invovl 58 public :: make_invovl 59 public :: apply_invovl 60 public :: destroy_invovl
m_invovl/apply_block [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
apply_block
FUNCTION
Helper function: applies a block-diagonal matrix mat(lmnmax, lmnmax, ntypat)
INPUTS
SOURCE
950 subroutine apply_block(ham, cplx, mat, nprojs, ndat, x, y, block_sliced) 951 952 use m_abi_linalg 953 implicit none 954 955 integer,intent(in) :: ndat, nprojs, cplx 956 real(dp), intent(inout) :: x(cplx, nprojs, ndat), y(cplx, nprojs, ndat) 957 type(gs_hamiltonian_type),intent(in) :: ham 958 real(dp), intent(in) :: mat(cplx, ham%lmnmax, ham%lmnmax, ham%ntypat) 959 integer, intent(in) :: block_sliced 960 961 integer :: nlmn, shift, itypat, idat 962 963 ! ************************************************************************* 964 965 ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER_APPLY_BLOCK) 966 if (block_sliced == 1) then 967 968 do idat = 1, ndat 969 shift = 1 970 do itypat=1, ham%ntypat 971 nlmn = count(ham%indlmn(3,:,itypat)>0) 972 !! apply mat to all atoms at once 973 ! perform natom multiplications of size nlmn 974 ! compute y = mat*x 975 if(cplx == 2) then 976 call ZHEMM('L','U', nlmn, ham%nattyp(itypat), cone, & 977 & mat(:, :, :, itypat), ham%lmnmax, & 978 & x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn, czero, & 979 & y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn) 980 else 981 call DSYMM('L','U', nlmn, ham%nattyp(itypat), one, & 982 & mat(:, :, :, itypat), ham%lmnmax, & 983 & x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn, zero, & 984 & y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, idat), nlmn) 985 end if 986 shift = shift + nlmn*ham%nattyp(itypat) 987 end do 988 end do 989 990 else ! block_sliced = 0 991 992 shift = 1 993 do itypat=1, ham%ntypat 994 nlmn = count(ham%indlmn(3,:,itypat)>0) 995 !! apply mat to all atoms at once, all idat at once 996 ! perform natom multiplications of size nlmn 997 ! be careful here matrix extracted from x and y are not memory contiguous 998 ! ==> so in the GPU version we will need to adapt leading dimension 999 if(cplx == 2) then 1000 call ZHEMM('L','U', nlmn, ham%nattyp(itypat)*ndat, cone, & 1001 & mat(:, :, :, itypat), ham%lmnmax, & 1002 & x(:, 1:nlmn*ham%nattyp(itypat), 1:ndat), nlmn, czero, & 1003 & y(:, 1:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn) 1004 else 1005 call DSYMM('L','U', nlmn, ham%nattyp(itypat)*ndat, one, & 1006 & mat(:, :, :, itypat), ham%lmnmax, & 1007 & x(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn, zero, & 1008 & y(:, shift:shift+nlmn*ham%nattyp(itypat)-1, 1:ndat), nlmn) 1009 end if 1010 shift = shift + nlmn*ham%nattyp(itypat) 1011 end do 1012 1013 end if 1014 ABI_NVTX_END_RANGE() 1015 1016 end subroutine apply_block
m_invovl/apply_invovl [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
apply_invovl
FUNCTION
Applies the inverse of the overlap matrix to cwavef
INPUTS
SOURCE
629 subroutine apply_invovl(ham, cwavef, sm1cwavef, cwaveprj, npw, ndat, mpi_enreg, nspinor, block_sliced) 630 631 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA) 632 use, intrinsic :: iso_c_binding 633 #endif 634 635 implicit none 636 637 ! args 638 type(gs_hamiltonian_type), intent(in), target :: ham 639 integer, intent(in) :: npw, ndat 640 integer, intent(in) :: nspinor 641 integer, intent(in) :: block_sliced 642 real(dp), intent(inout) :: cwavef(2, npw*nspinor*ndat) ! TODO should be in, fix nonlop 643 type(mpi_type) :: mpi_enreg 644 real(dp), intent(inout) :: sm1cwavef(2, npw*nspinor*ndat) 645 type(pawcprj_type), intent(inout) :: cwaveprj(ham%natom,nspinor*ndat) 646 647 real(dp),allocatable, target :: proj(:,:,:),sm1proj(:,:,:),PtPsm1proj(:,:,:) 648 649 ! used to pass proj dimensions to cuda 650 integer(kind=c_int32_t) :: proj_dim(3) 651 integer(kind=c_int32_t) :: nattyp_dim 652 integer(kind=c_int32_t) :: indlmn_dim(3) 653 654 integer :: idat, iatom, nlmn, shift 655 real(dp) :: tsec(2) 656 657 integer :: choice, cpopt, paw_opt , cplx 658 character :: blas_transpose 659 type(pawcprj_type) :: cwaveprj_in(ham%natom,nspinor*ndat) 660 661 integer :: ikpt_this_proc 662 integer, parameter :: tim_nonlop = 13 663 ! dummies 664 real(dp) :: enlout(ndat), lambda_block(1), gvnlxc(1,1) 665 integer, parameter :: nnlout = 0, idir = 0, signs = 2 666 667 type(invovl_kpt_type), pointer :: invovl 668 669 integer, parameter :: & 670 & timer_apply_inv_ovl_opernla = 1630, & 671 & timer_apply_inv_ovl_opernlb = 1631, & 672 & timer_apply_inv_ovl_inv_s = 1632 673 674 ! ************************************************************************* 675 676 ikpt_this_proc=bandfft_kpt_get_ikpt() 677 invovl => invovl_kpt(ikpt_this_proc) 678 679 if(ham%istwf_k == 1) then 680 cplx = 2 681 blas_transpose = 'c' 682 else 683 cplx = 1 684 blas_transpose = 't' 685 end if 686 687 ABI_MALLOC(proj, (cplx,invovl%nprojs,nspinor*ndat)) 688 ABI_MALLOC(sm1proj, (cplx,invovl%nprojs,nspinor*ndat)) 689 ABI_MALLOC(PtPsm1proj, (cplx,invovl%nprojs,nspinor*ndat)) 690 proj = zero 691 sm1proj = zero 692 PtPsm1proj = zero 693 694 proj_dim = (/ size(proj,1), size(proj,2), size(proj,3) /) 695 696 nattyp_dim = size(ham%nattyp) 697 698 indlmn_dim = (/ size(ham%indlmn,1), size(ham%indlmn,2), size(ham%indlmn,3) /) 699 700 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING) 701 702 !! memory allocation of data used in solve_inner_gpu 703 !! note : this is actually done only once 704 if (ham%use_gpu_cuda==1) then 705 706 ! make sure to use sizes from apply_invovl 707 call f_gpu_apply_invovl_inner_alloc(proj_dim, ham%ntypat, 0) 708 709 ! TODO find a better place to put that initialization 710 call f_gpu_init_invovl_data(indlmn_dim, c_loc(ham%indlmn(1,1,1))) 711 712 end if 713 714 #endif 715 716 717 call timab(timer_apply_inv_ovl_opernla, 1, tsec) 718 719 call pawcprj_alloc(cwaveprj_in,0,ham%dimcprj) 720 721 ! get the cprj 722 ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP1) 723 choice = 0 ! only compute cprj, nothing else 724 cpopt = 0 ! compute and save cprj 725 paw_opt = 3 ! S nonlocal operator 726 if (mpi_enreg%paral_kgb==1) then 727 call prep_nonlop(choice,cpopt,cwaveprj_in,enlout,ham,idir,lambda_block,ndat,mpi_enreg,& 728 & nnlout,paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,already_transposed=.true.) 729 else 730 call nonlop(choice,cpopt,cwaveprj_in,enlout,ham,idir,lambda_block,mpi_enreg,ndat,nnlout,& 731 & paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc) 732 end if 733 ABI_NVTX_END_RANGE() 734 735 call timab(timer_apply_inv_ovl_opernla, 2, tsec) 736 call timab(timer_apply_inv_ovl_inv_s, 1, tsec) 737 738 ! copy cwaveprj_in to proj(:,:) 739 do idat=1, ndat*nspinor 740 shift = 0 741 do iatom = 1, ham%natom 742 nlmn = cwaveprj_in(iatom, idat)%nlmn 743 proj(1:cplx, shift+1:shift+nlmn, idat) = cwaveprj_in(iatom, idat)%cp(1:cplx, 1:nlmn) 744 shift = shift + nlmn 745 end do 746 end do 747 748 !multiply by S^1 749 ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER) 750 ! TODO : when solve_inner_gpu is ready, update the following to activate GPU computation 751 if (ham%use_gpu_cuda == 1) then 752 753 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA) 754 755 if (mpi_enreg%nproc_fft /= 1) then 756 ABI_ERROR("[66_wfs/m_invovl.F90:apply_invovl] nproc_fft must be 1, when GPU/CUDA is activated") 757 end if 758 759 call f_solve_inner_gpu(proj_dim, c_loc(proj(1,1,1)), & 760 & c_loc(sm1proj(1,1,1)), c_loc(PtPsm1proj(1,1,1)), & 761 & nattyp_dim, c_loc(ham%nattyp(1)), ham%ntypat, & 762 & ham%lmnmax, cplx, block_sliced) 763 764 #endif 765 766 else 767 768 call solve_inner(invovl, ham, cplx, mpi_enreg, proj, ndat*nspinor, sm1proj, PtPsm1proj, block_sliced) 769 sm1proj = - sm1proj 770 PtPsm1proj = - PtPsm1proj 771 end if 772 773 ABI_NVTX_END_RANGE() 774 775 ! copy sm1proj to cwaveprj(:,:) 776 do idat=1, ndat*nspinor 777 shift = 0 778 do iatom = 1, ham%natom 779 nlmn = cwaveprj(iatom, idat)%nlmn 780 cwaveprj(iatom, idat)%cp(1:cplx, 1:nlmn) = sm1proj(1:cplx, shift+1:shift+nlmn, idat) 781 shift = shift + nlmn 782 end do 783 end do 784 call timab(timer_apply_inv_ovl_inv_s, 2, tsec) 785 call timab(timer_apply_inv_ovl_opernlb, 1, tsec) 786 787 ! get the corresponding wf 788 ABI_NVTX_START_RANGE(NVTX_INVOVL_NONLOP2) 789 cpopt = 2 ! reuse cprj 790 choice = 7 ! get wf from cprj, without the application of S 791 paw_opt = 3 792 if (mpi_enreg%paral_kgb==1) then 793 call prep_nonlop(choice,cpopt,cwaveprj,enlout,ham,idir,lambda_block,ndat,mpi_enreg,nnlout,& 794 & paw_opt,signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc,already_transposed=.true.) 795 else 796 call nonlop(choice,cpopt,cwaveprj,enlout,ham,idir,lambda_block,mpi_enreg,ndat,nnlout,paw_opt,& 797 & signs,sm1cwavef,tim_nonlop,cwavef,gvnlxc) 798 end if 799 ABI_NVTX_END_RANGE() 800 801 call timab(timer_apply_inv_ovl_opernlb, 2, tsec) 802 803 ! copy PtPSm1proj to cwaveprj(:,:) 804 do idat=1, ndat*nspinor 805 shift = 0 806 do iatom = 1, ham%natom 807 nlmn = cwaveprj(iatom, idat)%nlmn 808 cwaveprj(iatom, idat)%cp(1:cplx, 1:nlmn) = PtPsm1proj(1:cplx, shift+1:shift+nlmn, idat) 809 shift = shift + nlmn 810 end do 811 end do 812 813 sm1cwavef = cwavef + sm1cwavef 814 call pawcprj_axpby(one, one, cwaveprj_in, cwaveprj) 815 call pawcprj_free(cwaveprj_in) 816 817 ABI_FREE(proj) 818 ABI_FREE(sm1proj) 819 ABI_FREE(PtPsm1proj) 820 821 end subroutine apply_invovl
m_invovl/destroy_invovl [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
destroy_invovl
FUNCTION
Destruction of the invovl_kpt array
INPUTS
nkpt= number of k-points
SOURCE
313 subroutine destroy_invovl(nkpt, use_gpu_cuda) 314 315 integer, intent(in) :: nkpt 316 integer, intent(in) :: use_gpu_cuda 317 integer :: ikpt 318 319 ! ************************************************************************* 320 321 ! TODO add cycling if kpt parallelism 322 do ikpt=1,nkpt 323 if(invovl_kpt(ikpt)%nprojs == -1) then 324 ! write(0, *) 'ERROR invovl_kpt is unallocated' 325 cycle 326 end if 327 328 ABI_FREE(invovl_kpt(ikpt)%gram_projs) 329 ABI_FREE(invovl_kpt(ikpt)%inv_sij) 330 ABI_FREE(invovl_kpt(ikpt)%inv_s_approx) 331 invovl_kpt(ikpt)%nprojs = -1 332 end do 333 334 ABI_FREE(invovl_kpt) 335 336 ABI_UNUSED(use_gpu_cuda) 337 338 #if defined(HAVE_GPU_CUDA) && defined(HAVE_FC_ISO_C_BINDING) 339 if (use_gpu_cuda == 1) then 340 call f_gpu_apply_invovl_inner_dealloc() 341 call f_gpu_apply_invovl_matrix_dealloc() 342 end if 343 #endif 344 345 end subroutine destroy_invovl
m_invovl/init_invovl [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
init_invovl
FUNCTION
Initalization of the invovl_kpt array
INPUTS
nkpt= number of k-points
SOURCE
286 subroutine init_invovl(nkpt) 287 288 integer, intent(in) :: nkpt 289 integer :: ikpt 290 291 ! ************************************************************************* 292 293 ABI_MALLOC(invovl_kpt, (nkpt)) 294 ! TODO add cycling if kpt parallelism 295 do ikpt=1,nkpt 296 invovl_kpt(ikpt)%nprojs = -1 297 end do 298 299 end subroutine init_invovl
m_invovl/invovl_kpt_type [ Types ]
[ Top ] [ m_invovl ] [ Types ]
NAME
invovl_kpt_type
FUNCTION
Contains information needed to invert the overlap matrix S
SOURCE
73 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA) 74 75 type, public :: invovl_kpt_type 76 77 integer(kind=c_int32_t) :: nprojs 78 ! total number of projectors 79 ! nlmn for a specific atom = count(indlmn(3,:,itypat)>0) 80 ! A value of -1 means that the following arrays are not allocated 81 82 real(kind=c_double), allocatable :: gram_projs(:,:,:) 83 ! gram_projs(cplx, nprojs, nprojs) 84 ! projs' * projs 85 86 real(kind=c_double), allocatable :: inv_sij(:,:,:,:) 87 ! inv_sij(cplx, lmnmax, lmnmax, ntypat) 88 ! inverse of ham%sij 89 90 real(kind=c_double), allocatable :: inv_s_approx(:,:,:,:) 91 ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat) 92 ! preconditionner 93 94 end type invovl_kpt_type 95 96 !> companion type to invovl_kpt_type to pass data to gpu/cuda 97 type, bind(c), public :: invovl_kpt_gpu_type 98 99 integer(kind=c_int32_t) :: nprojs 100 ! total number of projectors 101 ! nlmn for a specific atom = count(indlmn(3,:,itypat)>0) 102 ! A value of -1 means that the following arrays are not allocated 103 104 type(c_ptr) :: gram_projs 105 ! gram_projs(cplx, nprojs, nprojs) 106 ! projs' * projs 107 108 integer(kind=c_int32_t) :: gram_projs_dim(3) 109 110 type(c_ptr) :: inv_sij 111 ! inv_sij(cplx, lmnmax, lmnmax, ntypat) 112 ! inverse of ham%sij 113 114 integer(kind=c_int32_t) :: inv_sij_dim(4) 115 116 type(c_ptr) :: inv_s_approx 117 ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat) 118 ! preconditionner 119 120 integer(kind=c_int32_t) :: inv_s_approx_dim(4) 121 122 end type invovl_kpt_gpu_type 123 124 #else 125 126 type, public :: invovl_kpt_type 127 128 integer :: nprojs 129 ! total number of projectors 130 ! nlmn for a specific atom = count(indlmn(3,:,itypat)>0) 131 ! A value of -1 means that the following arrays are not allocated 132 133 real(dp), allocatable :: gram_projs(:,:,:) 134 ! gram_projs(cplx, nprojs, nprojs) 135 ! projs' * projs 136 137 real(dp), allocatable :: inv_sij(:,:,:,:) 138 ! inv_sij(cplx, lmnmax, lmnmax, ntypat) 139 ! inverse of ham%sij 140 141 real(dp), allocatable :: inv_s_approx(:,:,:,:) 142 ! inv_s_approx(cplx, lmnmax, lmnmax, ntypat) 143 ! preconditionner 144 145 end type invovl_kpt_type 146 147 #endif
m_invovl/make_invovl [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
make_invovl
FUNCTION
Builds of the invovl structure
INPUTS
SOURCE
359 subroutine make_invovl(ham, dimffnl, ffnl, ph3d, mpi_enreg) 360 361 use m_abi_linalg 362 implicit none 363 364 type(gs_hamiltonian_type),intent(in), target :: ham 365 integer, intent(in) :: dimffnl 366 real(dp),intent(in) :: ffnl(ham%npw_k,dimffnl,ham%lmnmax,ham%ntypat) 367 real(dp),intent(in) :: ph3d(2,ham%npw_k,ham%matblk) 368 type(mpi_type) :: mpi_enreg 369 370 real(dp) :: atom_projs(2, ham%npw_k, ham%lmnmax) 371 real(dp) :: temp(ham%npw_k) 372 complex(dpc), allocatable :: work(:) 373 real(dp), allocatable :: projs(:,:,:) 374 real(dp), allocatable :: gram_proj(:,:,:) 375 integer, allocatable :: ipiv(:) 376 377 integer :: itypat, ilmn, nlmn, jlmn, ia, iaph3d, shift 378 integer :: il, ilm, jlm, ipw, info, ierr, cplx 379 integer :: ikpt_this_proc 380 logical :: parity 381 real(dp) :: tsec(2) 382 character(len=500) :: message 383 character :: blas_transpose 384 385 type(invovl_kpt_type), pointer :: invovl 386 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA) 387 type(invovl_kpt_gpu_type) :: invovl_gpu 388 #endif 389 integer :: array_nprojs_pp(mpi_enreg%nproc_fft) 390 integer :: iproc, slice_size 391 real(dp), allocatable :: gramwork(:,:,:) 392 393 integer, parameter :: timer_mkinvovl = 1620, timer_mkinvovl_build_d = 1621, timer_mkinvovl_build_ptp = 1622 394 395 ! ************************************************************************* 396 397 !! S = 1 + PDP', so 398 !! S^-1 = 1 + P inv_s_projs P', with 399 !! inv_s_projs = - (D^-1 + P'*P)^-1 400 401 if(ham%istwf_k == 1) then 402 cplx = 2 403 blas_transpose = 'c' 404 else 405 cplx = 1 406 blas_transpose = 't' 407 end if 408 409 ikpt_this_proc=bandfft_kpt_get_ikpt() 410 invovl => invovl_kpt(ikpt_this_proc) 411 412 if(invovl%nprojs /= -1) then 413 ! We have been here before, cleanup before remaking 414 ABI_FREE(invovl%gram_projs) 415 ABI_FREE(invovl%inv_sij) 416 ABI_FREE(invovl%inv_s_approx) 417 invovl%nprojs = -1 418 end if 419 420 iaph3d = 1 421 422 call timab(timer_mkinvovl,1,tsec) 423 call timab(timer_mkinvovl_build_d,1,tsec) 424 425 ! build nprojs 426 invovl%nprojs = 0 427 do itypat=1,ham%ntypat 428 invovl%nprojs = invovl%nprojs + count(ham%indlmn(3,:,itypat)>0)*ham%nattyp(itypat) 429 end do 430 431 ABI_MALLOC(projs, (2, ham%npw_k, invovl%nprojs)) 432 ABI_MALLOC(invovl%inv_sij, (cplx, ham%lmnmax, ham%lmnmax, ham%ntypat)) 433 ABI_MALLOC(invovl%inv_s_approx, (cplx, ham%lmnmax, ham%lmnmax, ham%ntypat)) 434 ! workspace for inversion 435 ABI_MALLOC(ipiv, (ham%lmnmax)) 436 ABI_MALLOC(work, (64*ham%lmnmax)) 437 438 projs = zero 439 invovl%inv_sij = zero 440 invovl%inv_s_approx = zero 441 442 shift = 0 443 do itypat = 1, ham%ntypat 444 nlmn = count(ham%indlmn(3,:,itypat)>0) 445 !! unpack ham%sij into inv_sij 446 do jlmn = 1, nlmn 447 invovl%inv_sij(1, jlmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + jlmn, itypat) 448 jlm=ham%indlmn(4,jlmn, itypat) 449 do ilmn = 1, jlmn-1 450 ilm=ham%indlmn(4,ilmn, itypat) 451 if (ilm == jlm) then ! sparsity check 452 invovl%inv_sij(1, ilmn, jlmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + ilmn, itypat) 453 invovl%inv_sij(1, jlmn, ilmn, itypat) = ham%sij(jlmn*(jlmn-1)/2 + ilmn, itypat) 454 end if 455 end do 456 end do 457 458 ! Invert sij 459 if(cplx == 2) then 460 call ZHETRF('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info) 461 call ZHETRI('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, info) 462 else 463 call DSYTRF('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info) 464 call DSYTRI('U', nlmn, invovl%inv_sij(:,:,:,itypat), ham%lmnmax, ipiv, work, info) 465 end if 466 ! complete the matrix 467 do ilm=1, nlmn 468 do jlm=1, ilm-1 469 invovl%inv_sij(1,ilm,jlm,itypat) = invovl%inv_sij(1,jlm,ilm,itypat) 470 end do 471 end do 472 473 !! loop on atoms to build atom_projs and fill projs, s_projs 474 do ia = 1, ham%nattyp(itypat) 475 476 !! build atom_projs, from opernlb 477 !! P = 4pi/sqrt(ucvol)* conj(diag(ph3d)) * ffnl * diag(parity), with parity = (-i)^l 478 atom_projs(:,:,:) = 0 479 480 ! start from 4pi/sqrt(ucvol)*ffnl 481 ! atom_projs(1, :, 1:nlmn) = four_pi/sqrt(ham%ucvol) * ffnl(:, 1, 1:nlmn) 482 ! TODO vectorize (DCOPY with stride) 483 do ipw=1, ham%npw_k 484 atom_projs(1,ipw, 1:nlmn) = four_pi/sqrt(ham%ucvol) * ffnl(ipw, 1, 1:nlmn, itypat) 485 end do 486 487 ! multiply by (-i)^l 488 do ilmn=1,nlmn 489 il=mod(ham%indlmn(1,ilmn, itypat),4); 490 parity=(mod(il,2)==0) 491 if (il>1) then 492 ! multiply by -1 493 atom_projs(:,:,ilmn) = -atom_projs(:,:,ilmn) 494 end if 495 if(.not. parity) then 496 ! multiply by -i 497 temp = atom_projs(2,:,ilmn) 498 atom_projs(2,:,ilmn) = -atom_projs(1,:,ilmn) 499 atom_projs(1,:,ilmn) = temp 500 end if 501 end do 502 503 ! multiply by conj(ph3d) 504 do ilmn=1,nlmn 505 temp = atom_projs(1, :, ilmn) 506 atom_projs(1, :, ilmn) = atom_projs(1, :, ilmn) * ph3d(1, :, iaph3d) + atom_projs(2, :, ilmn) * ph3d(2, :, iaph3d) 507 atom_projs(2, :, ilmn) = atom_projs(2, :, ilmn) * ph3d(1, :, iaph3d) - temp * ph3d(2, :, iaph3d) 508 end do 509 510 ! me_g0 trick 511 if(ham%istwf_k == 2 .and. mpi_enreg%me_g0 == 1) then 512 atom_projs(1,1,:) = atom_projs(1,1,:) / sqrt2 513 atom_projs(2,1,:) = zero 514 end if 515 if(ham%istwf_k == 2) then 516 atom_projs(:,:,:) = atom_projs(:,:,:)*sqrt2 517 end if 518 519 520 !! atom_projs and typat_s_projs are built, copy them to projs and inv_s_projs 521 projs(:, :, shift+1:shift+nlmn) = atom_projs(:, :, 1:nlmn) 522 shift = shift + nlmn 523 524 ! build inv_s_approx = (D^-1+PtP)^-1 restricted to a single atom block 525 ! can be optimized (real, build directly from ffnl) 526 if(ia == 1) then 527 ! D^-1 528 invovl%inv_s_approx(1, :, :, itypat) = invovl%inv_sij(1, :, :, itypat) 529 ! + PtP 530 ABI_MALLOC(gram_proj, (cplx, nlmn, nlmn)) 531 call abi_xgemm(blas_transpose,'N', nlmn, nlmn, (3-cplx)*ham%npw_k, cone, atom_projs(:,:,1), (3-cplx)*ham%npw_k, & 532 & atom_projs(:,:,1), (3-cplx)*ham%npw_k, czero, gram_proj(:,:,1), nlmn,x_cplx=cplx) 533 call xmpi_sum(gram_proj,mpi_enreg%comm_bandspinorfft,ierr) 534 invovl%inv_s_approx(:,1:nlmn,1:nlmn,itypat) = invovl%inv_s_approx(:,1:nlmn,1:nlmn,itypat) + gram_proj(:,:,:) 535 ABI_FREE(gram_proj) 536 ! ^-1 537 if(cplx == 2) then 538 call ZHETRF('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info) 539 call ZHETRI('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, info) 540 else 541 call DSYTRF('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, (64*nlmn), info) 542 call DSYTRI('U', nlmn, invovl%inv_s_approx(:,:,:,itypat), ham%lmnmax, ipiv, work, info) 543 end if 544 ! complete lower triangle of matrix 545 do ilm=1, nlmn 546 do jlm=1, ilm-1 547 invovl%inv_s_approx(1, ilm, jlm, itypat) = invovl%inv_s_approx(1, jlm, ilm, itypat) 548 if(cplx == 2) then 549 invovl%inv_s_approx(2, ilm, jlm, itypat) = -invovl%inv_s_approx(2, jlm, ilm, itypat) 550 end if 551 end do 552 end do 553 end if 554 555 iaph3d = iaph3d + 1 556 end do 557 end do 558 ABI_FREE(ipiv) 559 ABI_FREE(work) 560 561 call timab(timer_mkinvovl_build_d, 2, tsec) 562 call timab(timer_mkinvovl_build_ptp, 1, tsec) 563 564 ! Compute P'P one column slice at a time (might be too big to fit in one proc) 565 if(mpi_enreg%paral_kgb == 1) then 566 ! Split the work evenly the fft processors 567 array_nprojs_pp(:) = invovl%nprojs / mpi_enreg%nproc_fft 568 ! not enough work, there's MOD(nprojs,mpi_enreg%nproc_fft) tasks left 569 ! assign them to the first ones 570 array_nprojs_pp(1:MOD(invovl%nprojs,mpi_enreg%nproc_fft)) = array_nprojs_pp(1:MOD(invovl%nprojs,mpi_enreg%nproc_fft)) + 1 571 else 572 array_nprojs_pp = invovl%nprojs 573 end if 574 ABI_MALLOC(invovl%gram_projs, (cplx,invovl%nprojs,array_nprojs_pp(mpi_enreg%me_fft+1))) 575 shift = 0 576 do iproc = 1, mpi_enreg%nproc_fft 577 ! compute local contribution to slice iproc of gram_projs 578 slice_size = array_nprojs_pp(iproc) 579 ABI_MALLOC(gramwork, (cplx,invovl%nprojs,slice_size)) 580 call abi_xgemm(blas_transpose,'N', invovl%nprojs, slice_size, (3-cplx)*ham%npw_k, cone, projs(:,:,1), (3-cplx)*ham%npw_k, & 581 & projs(:, :, shift+1), (3-cplx)*ham%npw_k, czero, gramwork(:,:,1), invovl%nprojs,x_cplx=cplx) 582 shift = shift + slice_size 583 ! reduce on proc i 584 call xmpi_sum_master(gramwork, iproc-1, mpi_enreg%comm_fft, ierr) 585 if(iproc == mpi_enreg%me_fft+1) then 586 invovl%gram_projs = gramwork 587 end if 588 ABI_FREE(gramwork) 589 end do 590 call xmpi_sum(invovl%gram_projs,mpi_enreg%comm_band,ierr) 591 592 call timab(timer_mkinvovl_build_ptp, 2, tsec) 593 call timab(timer_mkinvovl,2,tsec) 594 595 ABI_FREE(projs) 596 597 #if defined(HAVE_FC_ISO_C_BINDING) && defined(HAVE_GPU_CUDA) 598 599 ! upload inverse overlap matrices (sij and s_approx) to GPU memory 600 if (ham%use_gpu_cuda==1) then 601 ! allocate memory for sij and s_approx on GPU 602 call f_gpu_apply_invovl_matrix_alloc(cplx, invovl%nprojs, ham%lmnmax, ham%ntypat, 0) 603 604 invovl_gpu = make_invovl_kpt_gpu(invovl) 605 call f_upload_inverse_overlap(invovl_gpu, cplx, invovl%nprojs, ham%lmnmax, ham%ntypat) 606 write(message,*) 'Invovl uploaded to GPU memory' 607 call wrtout(std_out,message,'COLL') 608 end if 609 610 #endif 611 612 write(message,*) 'Invovl built' 613 call wrtout(std_out,message,'COLL') 614 615 end subroutine make_invovl
m_invovl/make_invovl_kpt_gpu [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
make_invovl_kpt
FUNCTION
Create a invovl_pkt_gpu_type from a cpu counter part for cuda interoperability
SOURCE
240 function make_invovl_kpt_gpu(invovl) result(invovl_gpu) 241 implicit none 242 type(invovl_kpt_type), intent(inout),target :: invovl 243 type(invovl_kpt_gpu_type) :: invovl_gpu 244 245 invovl_gpu%nprojs = invovl%nprojs 246 247 invovl_gpu%gram_projs = c_loc(invovl%gram_projs(1,1,1)) 248 invovl_gpu%gram_projs_dim = (/ & 249 & size(invovl%gram_projs,1), & 250 & size(invovl%gram_projs,2), & 251 & size(invovl%gram_projs,3) & 252 & /) 253 254 invovl_gpu%inv_sij = c_loc(invovl%inv_sij(1,1,1,1)) 255 invovl_gpu%inv_sij_dim = (/ & 256 & size(invovl%inv_sij,1), & 257 & size(invovl%inv_sij,2), & 258 & size(invovl%inv_sij,3), & 259 & size(invovl%inv_sij,4) & 260 & /) 261 262 invovl_gpu%inv_s_approx = c_loc(invovl%inv_s_approx(1,1,1,1)) 263 invovl_gpu%inv_s_approx_dim = (/ & 264 & size(invovl%inv_s_approx,1), & 265 & size(invovl%inv_s_approx,2), & 266 & size(invovl%inv_s_approx,3), & 267 & size(invovl%inv_s_approx,4) & 268 & /) 269 270 end function make_invovl_kpt_gpu
m_invovl/solve_inner [ Functions ]
[ Top ] [ m_invovl ] [ Functions ]
NAME
solve_inner
FUNCTION
Helper function: iteratively solves the inner system
INPUTS
SOURCE
834 subroutine solve_inner(invovl, ham, cplx, mpi_enreg, proj, ndat, sm1proj, PtPsm1proj, block_sliced) 835 836 use m_abi_linalg 837 implicit none 838 839 integer,intent(in) :: ndat,cplx 840 type(invovl_kpt_type), intent(in) :: invovl 841 real(dp), intent(inout) :: proj(cplx, invovl%nprojs,ndat) 842 real(dp), intent(inout) :: sm1proj(cplx, invovl%nprojs, ndat) 843 real(dp), intent(inout) :: PtPsm1proj(cplx, invovl%nprojs, ndat) 844 real(dp), allocatable :: temp_proj(:,:,:) 845 type(mpi_type), intent(in) :: mpi_enreg 846 type(gs_hamiltonian_type),intent(in) :: ham 847 integer, intent(in) :: block_sliced 848 849 integer :: array_nlmntot_pp(mpi_enreg%nproc_fft) 850 integer :: nlmntot_this_proc, ibeg, iend, ierr, i, nprojs 851 real(dp) :: resid(cplx, invovl%nprojs,ndat), precondresid(cplx, invovl%nprojs,ndat) 852 real(dp) :: normprojs(ndat), errs(ndat), maxerr, previous_maxerr 853 character(len=500) :: message 854 855 real(dp), parameter :: precision = 1e-16 ! maximum relative error. TODO: use tolwfr ? 856 real(dp) :: convergence_rate 857 integer :: additional_steps_to_take 858 859 ! ************************************************************************* 860 861 nprojs = invovl%nprojs 862 normprojs = SUM(SUM(proj**2, 1),1) 863 call xmpi_sum(normprojs, mpi_enreg%comm_fft, ierr) 864 865 ! Compute work distribution : split nprojs evenly between the fft processors 866 if(mpi_enreg%paral_kgb == 1) then 867 array_nlmntot_pp(:) = nprojs / mpi_enreg%nproc_fft 868 ! not enough work, there's MOD(nprojs,mpi_enreg%nproc_fft) tasks left 869 ! assign them to the first ones 870 array_nlmntot_pp(1:MOD(nprojs,mpi_enreg%nproc_fft)) = array_nlmntot_pp(1:MOD(nprojs,mpi_enreg%nproc_fft)) + 1 871 ibeg = SUM(array_nlmntot_pp(1:mpi_enreg%me_fft)) + 1 872 iend = ibeg + array_nlmntot_pp(1+mpi_enreg%me_fft) - 1 873 nlmntot_this_proc = iend - ibeg + 1 874 else 875 ibeg = 1 876 iend = nprojs 877 nlmntot_this_proc = nprojs 878 end if 879 880 ABI_MALLOC(temp_proj, (cplx, nlmntot_this_proc, ndat)) 881 882 ! first guess for sm1proj 883 call apply_block(ham, cplx, invovl%inv_s_approx, nprojs, ndat, proj, sm1proj, block_sliced) 884 885 ! Iterative refinement 886 ! TODO use a more efficient iterative algorithm than iterative refinement, use locking 887 additional_steps_to_take = -1 888 do i=1, 30 889 ! compute resid = proj - (D^-1 + PtP)sm1proj 890 call apply_block(ham, cplx, invovl%inv_sij, nprojs, ndat, sm1proj, resid, block_sliced) 891 temp_proj = sm1proj(:,ibeg:iend,:) 892 893 ! compute matrix multiplication : PtPsm1proj(:,:,1) = invovl%gram * temp_proj(:,:,1) 894 ABI_NVTX_START_RANGE(NVTX_INVOVL_INNER_GEMM) 895 call abi_xgemm('N', 'N', nprojs, ndat, nlmntot_this_proc, cone, & 896 & invovl%gram_projs(:,:,1), nprojs, & 897 & temp_proj(:,:,1), nlmntot_this_proc, czero, & 898 & PtPsm1proj(:,:,1), nprojs, & 899 & x_cplx=cplx) 900 call xmpi_sum(PtPsm1proj, mpi_enreg%comm_fft, ierr) 901 resid = proj - resid - Ptpsm1proj 902 ! exit check 903 errs = SUM(SUM(resid**2, 1),1) 904 call xmpi_sum(errs, mpi_enreg%comm_fft, ierr) 905 ABI_NVTX_END_RANGE() 906 907 maxerr = sqrt(MAXVAL(errs/normprojs)) 908 if(maxerr < precision .or. additional_steps_to_take == 1) then 909 exit 910 ! We might stall and never get to the specified precision because of machine errors. 911 ! If we got to 1e-10, extrapolate convergence rate and determine the number of additional 912 ! steps to take to reach precision 913 else if(maxerr < 1e-10 .and. additional_steps_to_take == -1) then 914 convergence_rate = -LOG(1e-10) / i 915 additional_steps_to_take = CEILING(-LOG(precision/1e-10)/convergence_rate) + 1 916 else if(additional_steps_to_take > 0) then 917 if(previous_maxerr<maxerr)exit 918 additional_steps_to_take = additional_steps_to_take - 1 919 end if 920 previous_maxerr=maxerr 921 922 ! add preconditionned residual 923 call apply_block(ham, cplx, invovl%inv_s_approx, nprojs, ndat, resid, precondresid, block_sliced) 924 sm1proj = sm1proj + precondresid 925 end do 926 927 if(maxerr >= precision .and. maxerr >= 1e-10) then 928 write(message, *) 'In invovl, max error was', maxerr, ' after 30 iterations' 929 ABI_WARNING(message) 930 else 931 ! write(message,'(a,i2,a,es13.5)') 'Iterative solver in invovl finished in ', i, ' iterations, error', maxerr 932 ! call wrtout(std_out,message,'COLL') 933 end if 934 935 ABI_FREE(temp_proj) 936 937 end subroutine solve_inner