From 73498ccdd1ebdc15885faf7387ce7143eb122d1e Mon Sep 17 00:00:00 2001 From: Tuomas Koskela Date: Wed, 13 Sep 2023 13:36:33 +0100 Subject: [PATCH 1/7] Clean-up duplicate and commented out code --- src/multiply_module.f90 | 100 +++++++++------------------------------- 1 file changed, 22 insertions(+), 78 deletions(-) diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index f2b83b7e9..c99ac82d2 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -146,7 +146,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) integer :: lab_const integer :: invdir,ierr,kpart,ind_part,ncover_yz,n_which,ipart,nnode integer :: icall,n_cont,kpart_next,ind_partN,k_off - integer :: icall2,stat,ilen2,lenb_rem + integer :: stat,ilen2,lenb_rem ! Remote variables to be allocated integer(integ),allocatable :: ibpart_rem(:) real(double),allocatable :: b_rem(:) @@ -164,10 +164,10 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) integer :: offset,sends,i,j integer, dimension(MPI_STATUS_SIZE) :: mpi_stat integer, allocatable, dimension(:) :: recv_part - - logical flag,call_flag real(double) :: t0,t1 + logical :: new_partition + call start_timer(tmr_std_matmult) call start_timer(tmr_std_allocation) @@ -220,85 +220,40 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ! !$omp private(kpart, icall, ind_part, ipart, nnode, b_rem, & ! !$omp lenb_rem, n_cont, part_array, ilen2, offset, & ! !$omp nbnab_rem, ibind_rem, ib_nd_acc_rem, ibseq_rem, & -! !$omp npxyz_rem, ibndimj_rem, k_off, icall2) +! !$omp npxyz_rem, ibndimj_rem, k_off) ! !$omp do ! #end if - do kpart = 1,a_b_c%ahalo%np_in_halo ! Main loop - !write(io_lun,*) 'Part: ',kpart,myid + main_loop: do kpart = 1,a_b_c%ahalo%np_in_halo + icall=1 ind_part = a_b_c%ahalo%lab_hcell(kpart) - !write(io_lun,*) 'ind_part: ',ind_part - if(kpart>1) then ! Is it a periodic image of the previous partition ? + new_partition = .true. + + ! Check if this is a periodic image of the previous partition + if(kpart>1) then if(ind_part.eq.a_b_c%ahalo%lab_hcell(kpart-1)) then - icall=0 - else ! Get the data - !write(io_lun,*) myid,' seq: ',size(a_b_c%parts%i_cc2seq) - ipart = a_b_c%parts%i_cc2seq(ind_part) - !write(io_lun,*) myid,' Alloc b_rem part: ',ipart - nnode = a_b_c%comms%neigh_node_list(kpart) - recv_part(nnode) = recv_part(nnode)+1 - !write(io_lun,*) myid,' Alloc b_rem node: ',nnode - !write(io_lun,*) myid,' Alloc b_rem icc: ', a_b_c%parts%i_cc2node(ind_part) - !write(io_lun,*) myid,' Alloc b_rem alloc: ',allocated(b_rem) - if(allocated(b_rem)) deallocate(b_rem) - if(a_b_c%parts%i_cc2node(ind_part)==myid+1) then - lenb_rem = a_b_c%bmat(ipart)%part_nd_nabs - else - lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode) - end if - allocate(b_rem(lenb_rem)) - call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& - n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& - mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) - !write(io_lun,*) 'b_rem: ',lenb_rem - ! Now point the _rem variables at the appropriate parts of - ! the array where we will receive the data - offset = 0 - nbnab_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ibind_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ib_nd_acc_rem => part_array(offset+1:offset+n_cont) - offset = offset+n_cont - ibseq_rem => part_array(offset+1:offset+ilen2) - offset = offset+ilen2 - npxyz_rem => part_array(offset+1:offset+3*ilen2) - offset = offset+3*ilen2 - ibndimj_rem => part_array(offset+1:offset+ilen2) - if(offset+ilen2>3*a_b_c%parts%mx_mem_grp+ & - 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then - call cq_abort('mat_mult: error pointing to part_array ',kpart) - end if - ! Create ibpart_rem - call end_part_comms(myid,n_cont,nbnab_rem,ibind_rem,npxyz_rem,& - ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) + new_partition = .false. end if - else ! Get the data - !write(io_lun,*) myid,' seq: ',size(a_b_c%parts%i_cc2seq) + end if + + if(new_partition) then + ! Get the data ipart = a_b_c%parts%i_cc2seq(ind_part) - !write(io_lun,*) myid,' Alloc b_rem part: ',ipart nnode = a_b_c%comms%neigh_node_list(kpart) recv_part(nnode) = recv_part(nnode)+1 - !write(io_lun,*) myid,' Alloc b_rem node: ',nnode - !write(io_lun,*) myid,' Alloc b_rem icc: ',a_b_c%parts%i_cc2node(ind_part) - !write(io_lun,*) myid,' Alloc b_rem alloc: ',allocated(b_rem) if(allocated(b_rem)) deallocate(b_rem) if(a_b_c%parts%i_cc2node(ind_part)==myid+1) then lenb_rem = a_b_c%bmat(ipart)%part_nd_nabs else lenb_rem = a_b_c%comms%ilen3rec(ipart,nnode) end if - call start_timer(tmr_std_allocation) allocate(b_rem(lenb_rem)) - call stop_timer(tmr_std_allocation) call prefetch(kpart,a_b_c%ahalo,a_b_c%comms,a_b_c%bmat,icall,& n_cont,part_array,a_b_c%bindex,b_rem,lenb_rem,b,myid,ilen2,& mx_msg_per_part,a_b_c%parts,a_b_c%prim,a_b_c%gcs,(recv_part(nnode)-1)*2) - lenb_rem = size(b_rem) - !write(io_lun,*) 'b_rem: ',lenb_rem - ! Now point the _rem variables at the appropriate parts of the array - ! where we will receive the data + ! Now point the _rem variables at the appropriate parts of + ! the array where we will receive the data offset = 0 nbnab_rem => part_array(offset+1:offset+n_cont) offset = offset+n_cont @@ -313,22 +268,14 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) ibndimj_rem => part_array(offset+1:offset+ilen2) if(offset+ilen2>3*a_b_c%parts%mx_mem_grp+ & 5*a_b_c%parts%mx_mem_grp*a_b_c%bmat(1)%mx_abs) then - call cq_abort('Error pointing to part_array !',kpart) + call cq_abort('mat_mult: error pointing to part_array ',kpart) end if + ! Create ibpart_rem call end_part_comms(myid,n_cont,nbnab_rem,ibind_rem,npxyz_rem,& ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) - end if ! End of the "if this isn't the first partition" loop + end if + k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs - icall2=1 - ! Check dimensions to be used in m_kern_min/max - !call check_mkm(kpart,a_b_c%ahalo%nh_part,a_b_c%ahalo%j_seq, & - ! a_b_c%ahalo%j_beg,& - ! ibind_rem,ibpart_rem,nbnab_rem,& - ! ibseq_rem,a_b_c%chalo%i_hbeg,k_off,icall2,& - ! a_b_c%ahalo%mx_part,a_b_c%gcs%mx_gcover,a_b_c%ahalo%mx_halo, & - ! a_b_c%parts%mx_mem_grp, & - ! a_b_c%bmat(1)%mx_abs,a_b_c%gcs%mx_mcover) - !if(icall2.eq.1) then ! If check is OK, do the mult if(a_b_c%mult_type.eq.1) then ! C is full mult call m_kern_max( k_off,kpart,ib_nd_acc_rem, ibind_rem,nbnab_rem,& ibpart_rem,ibseq_rem,ibndimj_rem,& @@ -342,10 +289,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & a_b_c%prim%mx_iprim, lena, lenb_rem, lenc) end if - !else - ! call cq_abort('mat_mult: error in check_mkm ',kpart) - !end if - end do ! End of the kpart=1,ahalo%np_in_halo loop ! + end do main_loop ! #ifdef OMP_M ! !$omp end do ! !$omp end parallel From 8364ba42d65b46211d1ab3bad6c80f741d87c708 Mon Sep 17 00:00:00 2001 From: Tuomas Koskela Date: Wed, 20 Sep 2023 15:54:40 +0100 Subject: [PATCH 2/7] Try using orphaned directives to optimize omp region creation https://stackoverflow.com/questions/35347944/fortran-openmp-with-subroutines-and-functions/35361665#35361665 --- src/multiply_kernel_ompGemm.f90 | 38 ++++++++++++++++----------------- src/multiply_module.f90 | 22 ++++++------------- 2 files changed, 24 insertions(+), 36 deletions(-) diff --git a/src/multiply_kernel_ompGemm.f90 b/src/multiply_kernel_ompGemm.f90 index b52e92691..e1123bc39 100644 --- a/src/multiply_kernel_ompGemm.f90 +++ b/src/multiply_kernel_ompGemm.f90 @@ -168,15 +168,15 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! OpenMP required indexing variables integer :: nd1_1st(at%mx_halo), nd2_1st(mx_absb) -!$omp parallel default(none) & -!$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -!$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -!$omp a, b, c) & -!$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & -!$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & -!$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & -!$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st, & -!$omp tempa, tempb, tempc, prend1, maxlen, sofar) +! !$omp parallel default(none) & +! !$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & +! !$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & +! !$omp a, b, c) & +! !$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & +! !$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & +! !$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & +! !$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st, & +! !$omp tempa, tempb, tempc, prend1, maxlen, sofar) allocate(tempa(1,1), tempc(1,1)) do k = 1, ahalo%nh_part(kpart) ! Loop over atoms k in current A-halo partn k_in_halo = ahalo%j_beg(kpart) + k - 1 @@ -297,7 +297,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & end do ! end of k = 1, nahpart if (allocated(tempa)) deallocate(tempa) if (allocated(tempc)) deallocate(tempc) -!$omp end parallel return end subroutine m_kern_max !!***** @@ -445,15 +444,15 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! mx_a = maxnsf ! maxlen = maxnsf * max(nbnab) -!$omp parallel default(none) & -!$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -!$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -!$omp a, b, c) & -!$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & -!$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & -!$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & -!$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st, & -!$omp tempb, tempc, maxlen, sofar) +! !$omp parallel default(none) & +! !$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & +! !$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & +! !$omp a, b, c) & +! !$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & +! !$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & +! !$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & +! !$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st, & +! !$omp tempb, tempc, maxlen, sofar) do k = 1, ahalo%nh_part(kpart) ! Loop over atoms k in current A-halo partn k_in_halo = ahalo%j_beg(kpart) + k - 1 k_in_part = ahalo%j_seq(k_in_halo) @@ -532,7 +531,6 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & end do !$omp end do end do -!$omp end parallel return end subroutine m_kern_min !!***** diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index c99ac82d2..4788ddb4c 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -213,19 +213,10 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) !write(io_lun,*) 'Returned ',a_b_c%ahalo%np_in_halo,myid ncover_yz=a_b_c%gcs%ncovery*a_b_c%gcs%ncoverz -! #ifdef OMP_M -! !$omp parallel default(none) & -! !$omp shared(a, b, c, a_b_c, myid, lena, lenc, tmr_std_allocation, & -! !$omp ncover_yz, ibpart_rem, atrans, usegemm) & -! !$omp private(kpart, icall, ind_part, ipart, nnode, b_rem, & -! !$omp lenb_rem, n_cont, part_array, ilen2, offset, & -! !$omp nbnab_rem, ibind_rem, ib_nd_acc_rem, ibseq_rem, & -! !$omp npxyz_rem, ibndimj_rem, k_off) -! !$omp do -! #end if - + !$omp parallel default(shared) main_loop: do kpart = 1,a_b_c%ahalo%np_in_halo + !$omp master icall=1 ind_part = a_b_c%ahalo%lab_hcell(kpart) new_partition = .true. @@ -274,8 +265,10 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) call end_part_comms(myid,n_cont,nbnab_rem,ibind_rem,npxyz_rem,& ibpart_rem,ncover_yz,a_b_c%gcs%ncoverz) end if - + k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs + !$omp end master + if(a_b_c%mult_type.eq.1) then ! C is full mult call m_kern_max( k_off,kpart,ib_nd_acc_rem, ibind_rem,nbnab_rem,& ibpart_rem,ibseq_rem,ibndimj_rem,& @@ -290,10 +283,7 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) a_b_c%prim%mx_iprim, lena, lenb_rem, lenc) end if end do main_loop -! #ifdef OMP_M -! !$omp end do -! !$omp end parallel -! #end if +!$omp end parallel call start_timer(tmr_std_allocation) if(allocated(b_rem)) deallocate(b_rem) call stop_timer(tmr_std_allocation) From 042c1743e1c2a57bbea7fb498b3fdbed5231de16 Mon Sep 17 00:00:00 2001 From: Tuomas Koskela Date: Wed, 20 Sep 2023 16:56:46 +0100 Subject: [PATCH 3/7] Add barriers --- src/multiply_kernel_ompGemm.f90 | 37 ++++++--------------------------- src/multiply_module.f90 | 7 ++++++- 2 files changed, 12 insertions(+), 32 deletions(-) diff --git a/src/multiply_kernel_ompGemm.f90 b/src/multiply_kernel_ompGemm.f90 index e1123bc39..018295608 100644 --- a/src/multiply_kernel_ompGemm.f90 +++ b/src/multiply_kernel_ompGemm.f90 @@ -184,7 +184,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & nbkbeg = ibaddr(k_in_part) nb_nd_kbeg = ib_nd_acc(k_in_part) nd3 = ahalo%ndimj(k_in_halo) - ! if (PRESENT(debug)) write (21+debug,*) 'Details1: ', k, nb_nd_kbeg ! for OpenMP sub-array indexing nd1_1st(1) = 0 do i = 2, at%n_hnab(k_in_halo) @@ -207,15 +206,15 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & prend1 = 0 !$omp do schedule(runtime) ! Loop over primary-set A-neighbours of k - do i = 1, at%n_hnab(k_in_halo) - ! nabeg = at%i_beg(k_in_halo) + i - 1 + A_i : do i = 1, at%n_hnab(k_in_halo) i_in_prim = at%i_prim(at%i_beg(k_in_halo)+i-1) nd1 = ahalo%ndimi(i_in_prim) nabeg = at%i_nd_beg(k_in_halo) + nd1_1st(i) if (nd1 /= prend1) then - deallocate(tempc, tempa) - allocate(tempa(nd1,nd3), tempc(nd1,maxlen)) - ! allocate(tempa(nd3,nd1), tempc(nd1,maxlen)) + deallocate(tempc) + deallocate(tempa) + allocate(tempa(nd1,nd3)) + allocate(tempc(nd1,maxlen)) end if tempa = zero tempb = zero @@ -224,38 +223,17 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & naaddr = nabeg + nd3 * (n1 - 1) do n3 = 1, nd3 tempa(n1,n3) = a(naaddr+n3-1) - ! tempa(n3,n1) = a(naaddr+n3-1) end do end do icad = (i_in_prim - 1) * chalo%ni_in_halo - ! nbbeg = nb_nd_kbeg sofar = 0 do j = 1, nbnab(k_in_part) ! Loop over B-neighbours of atom k - ! nbbeg = nbkbeg + j - 1 nd2 = bndim2(nbkbeg+j-1) nbbeg = nb_nd_kbeg + nd2_1st(j) j_in_halo = jbnab2ch(j) if (j_in_halo /= 0) then ncbeg = chalo%i_h2d(icad+j_in_halo) - ! nd2 = chalo%ndimj(j_in_halo) if (ncbeg /= 0) then ! multiplication of ndim x ndim blocks - ! if (present(debug)) & - ! write (21+debug,*) 'Details2: ', j, nd2, & - ! (nabeg-1)/(nd1*nd3), & - ! (ncbeg-1)/(nd1*nd2), & - ! (nbbeg-1)/(nd2*nd3) -!DIR$ NOPATTERN - !! do n2=1, nd2 - !! nbaddr = nbbeg+nd3*(n2-1) - !! ncaddr = ncbeg+nd1*(n2-1) - !! do n1=1, nd1 - !! naaddr=nabeg+nd3*(n1-1) - !! do n3=1, nd3 - !! c(ncaddr+n1-1) = c(ncaddr+n1-1) & - !! +a(naaddr+n3-1)*b(nbaddr+n3-1) - !! end do - !! end do - !! end do do n2 = 1, nd2 nbaddr = nbbeg + nd3 * (n2 - 1) do n3 = 1, nd3 @@ -267,9 +245,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & end if ! End of if (j_in_halo /= 0) end do ! End of 1, nbnab if (sofar > 0) then - ! m, n, k, alpha, a, lda, b, ldb, beta, c, ldc - ! call dgemm('t', 'n', nd1, sofar, nd3, 1.0_double, tempa, & - ! nd3, tempb, nd3,0.0_double, tempc, nd1) call dgemm('n', 'n', nd1, sofar, nd3, 1.0_double, tempa, & nd1, tempb, nd3, zero, tempc, nd1) end if @@ -291,7 +266,7 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & end if end if end do - end do ! end of i = 1, at%n_hnab + end do A_i !$omp end do deallocate(tempb) end do ! end of k = 1, nahpart diff --git a/src/multiply_module.f90 b/src/multiply_module.f90 index 4788ddb4c..268e9e5f5 100644 --- a/src/multiply_module.f90 +++ b/src/multiply_module.f90 @@ -267,7 +267,11 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) end if k_off=a_b_c%ahalo%lab_hcover(kpart) ! --- offset for pbcs + ! Omp master doesn't include a implicit barrier. We want master + ! to be finished with comms before calling the multiply kernels + ! hence the explicit barrier !$omp end master + !$omp barrier if(a_b_c%mult_type.eq.1) then ! C is full mult call m_kern_max( k_off,kpart,ib_nd_acc_rem, ibind_rem,nbnab_rem,& @@ -282,8 +286,9 @@ subroutine mat_mult(myid,a,lena,b,lenb,c,lenc,a_b_c,debug) a_b_c%bmat(1)%mx_abs,a_b_c%parts%mx_mem_grp, & a_b_c%prim%mx_iprim, lena, lenb_rem, lenc) end if + !$omp barrier end do main_loop -!$omp end parallel + !$omp end parallel call start_timer(tmr_std_allocation) if(allocated(b_rem)) deallocate(b_rem) call stop_timer(tmr_std_allocation) From 29a9d97eb7ae0889c06d4b864cb950c479c24d18 Mon Sep 17 00:00:00 2001 From: Tuomas Koskela Date: Fri, 22 Sep 2023 11:20:55 +0100 Subject: [PATCH 4/7] Make non-threaded kernels explicitly serial --- src/multiply_kernel_default.f90 | 5 +++++ src/multiply_kernel_gemm.f90 | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/src/multiply_kernel_default.f90 b/src/multiply_kernel_default.f90 index cc21736ae..072e9dfe3 100644 --- a/src/multiply_kernel_default.f90 +++ b/src/multiply_kernel_default.f90 @@ -162,6 +162,7 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & integer :: nd1, nd2, nd3 integer :: naaddr, nbaddr, ncaddr + !$omp single ! Loop over atoms k in current A-halo partn do k = 1, ahalo%nh_part(kpart) k_in_halo = ahalo%j_beg(kpart) + k - 1 @@ -216,6 +217,7 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & nabeg = nabeg + nd1 * nd3 end do ! End of i = 1, at%n_hnab end do ! End of k = 1, nahpart + !$omp end single return end subroutine m_kern_max !!***** @@ -353,6 +355,8 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & integer :: nd1, nd2, nd3 integer :: naaddr, nbaddr, ncaddr + !$omp single + ! Loop over atoms k in current A-halo partn do k = 1, ahalo%nh_part(kpart) k_in_halo = ahalo%j_beg(kpart) + k - 1 @@ -402,6 +406,7 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & nabeg = nabeg + nd1 * nd3 end do end do + !$omp end single return end subroutine m_kern_min !!***** diff --git a/src/multiply_kernel_gemm.f90 b/src/multiply_kernel_gemm.f90 index f0e3ced45..e852079d3 100644 --- a/src/multiply_kernel_gemm.f90 +++ b/src/multiply_kernel_gemm.f90 @@ -166,6 +166,7 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & integer :: sofar, maxlen, max2, prend1 external :: dgemm + !$omp single allocate(tempa(1,1), tempc(1,1)) do k = 1, ahalo%nh_part(kpart) ! Loop over atoms k in current A-halo partn k_in_halo = ahalo%j_beg(kpart) + k - 1 @@ -273,6 +274,7 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & end do ! end of k = 1, nahpart if (allocated(tempa)) deallocate(tempa) if (allocated(tempc)) deallocate(tempc) + !$omp end signle return end subroutine m_kern_max !!***** @@ -414,6 +416,7 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & real(double), allocatable, dimension(:,:) :: tempb, tempc external :: dgemm + !$omp single do k = 1, ahalo%nh_part(kpart) ! Loop over atoms k in current A-halo partn k_in_halo = ahalo%j_beg(kpart) + k - 1 k_in_part = ahalo%j_seq(k_in_halo) @@ -481,6 +484,7 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & nabeg = nabeg + nd1 * nd3 end do end do + !$omp end single return end subroutine m_kern_min !!***** From eeeac6a27b2af7f9bc6fe2293cffa22dbf145a7e Mon Sep 17 00:00:00 2001 From: Tuomas Koskela Date: Fri, 22 Sep 2023 12:15:18 +0100 Subject: [PATCH 5/7] Remove inner omp regions in ompDoik kernel --- src/multiply_kernel_ompDoik.f90 | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/src/multiply_kernel_ompDoik.f90 b/src/multiply_kernel_ompDoik.f90 index e1ff52e69..e2a3ba3d5 100644 --- a/src/multiply_kernel_ompDoik.f90 +++ b/src/multiply_kernel_ompDoik.f90 @@ -184,14 +184,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! OpenMP required indexing variables integer :: nd1_1st(at%mx_halo), nd2_1st(mx_absb) -!$omp parallel default(none) & -!$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -!$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -!$omp a, b, c) & -!$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & -!$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & -!$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & -!$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st) ! Loop over atoms k in current A-halo partn do k = 1, ahalo%nh_part(kpart) k_in_halo = ahalo%j_beg(kpart) + k - 1 @@ -218,7 +210,7 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & jseq = ibseq(nbkbeg+j-1) jbnab2ch(j) = chalo%i_halo(chalo%i_hbeg(jpart)+jseq-1) end do -!$omp do schedule(runtime) + !$omp do schedule(runtime) ! Loop over primary-set A-neighbours of k do i = 1, at%n_hnab(k_in_halo) ! nabeg = at%i_beg(k_in_halo) + i - 1 @@ -259,7 +251,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & end do ! End of i = 1, at%n_hnab !$omp end do end do ! End of k = 1, nahpart -!$omp end parallel return end subroutine m_kern_max !!***** @@ -405,16 +396,8 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! For OpenMP integer :: nd1_1st(at%mx_halo), nd2_1st(mx_absb) -!$omp parallel default(none) & -!$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -!$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -!$omp a, b, c) & -!$omp private(j_in_halo, k_in_halo, k_in_part, nbkbeg, & -!$omp nb_nd_kbeg, nd1, nd2, nd3, i, j, k, jpart, jseq, & -!$omp jbnab2ch, icad, nabeg, nbbeg, ncbeg, naaddr, nbaddr, & -!$omp ncaddr, n1, n2, n3, i_in_prim, nd1_1st, nd2_1st) ! Loop over atoms k in current A-halo partn -!$omp do schedule(runtime) + !$omp do schedule(runtime) do k = 1, ahalo%nh_part(kpart) k_in_halo = ahalo%j_beg(kpart) + k - 1 k_in_part = ahalo%j_seq(k_in_halo) @@ -472,7 +455,6 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & end do end do !$omp end do -!$omp end parallel return end subroutine m_kern_min !!***** From 4c7f37aeb53b437d28ece26643bb927fcea2c15e Mon Sep 17 00:00:00 2001 From: Tuomas Koskela Date: Thu, 28 Sep 2023 16:46:22 +0100 Subject: [PATCH 6/7] All omp kernels now use outer omp parallel region --- src/multiply_kernel_ompDoii.f90 | 18 ------------------ src/multiply_kernel_ompDojk.f90 | 18 ------------------ src/multiply_kernel_ompGemm.f90 | 18 ------------------ src/multiply_kernel_ompGemm_m.f90 | 23 +---------------------- 4 files changed, 1 insertion(+), 76 deletions(-) diff --git a/src/multiply_kernel_ompDoii.f90 b/src/multiply_kernel_ompDoii.f90 index 1b3a91c8e..2863bbd82 100644 --- a/src/multiply_kernel_ompDoii.f90 +++ b/src/multiply_kernel_ompDoii.f90 @@ -184,14 +184,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! OpenMP required indexing variables integer :: nd1_1st(at%mx_halo), nd2_1st(mx_absb) -!$omp parallel default(none) & -!$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -!$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -!$omp a, b, c) & -!$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & -!$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & -!$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & -!$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st) ! Loop over atoms k in current A-halo partn do k = 1, ahalo%nh_part(kpart) k_in_halo = ahalo%j_beg(kpart) + k - 1 @@ -257,7 +249,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & end if ! End of if(j_in_halo.ne.0) end do ! End of j = 1, nbnab end do ! End of i = 1, at%n_hnab -!$omp end do end do ! End of k = 1, nahpart !$omp end parallel return @@ -405,14 +396,6 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! For OpenMP integer :: nd1_1st(at%mx_halo), nd2_1st(mx_absb) -!$omp parallel default(none) & -!$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -!$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -!$omp a, b, c) & -!$omp private(j_in_halo, k_in_halo, k_in_part, nbkbeg, & -!$omp nb_nd_kbeg, nd1, nd2, nd3, i, j, k, jpart, jseq, & -!$omp jbnab2ch, icad, nabeg, nbbeg, ncbeg, naaddr, nbaddr, & -!$omp ncaddr, n1, n2, n3, i_in_prim, nd1_1st, nd2_1st) ! Loop over atoms k in current A-halo partn do k = 1, ahalo%nh_part(kpart) k_in_halo = ahalo%j_beg(kpart) + k - 1 @@ -472,7 +455,6 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & end do !$omp end do end do -!$omp end parallel return end subroutine m_kern_min !!***** diff --git a/src/multiply_kernel_ompDojk.f90 b/src/multiply_kernel_ompDojk.f90 index 11c0cc0ee..76bb2588c 100644 --- a/src/multiply_kernel_ompDojk.f90 +++ b/src/multiply_kernel_ompDojk.f90 @@ -184,14 +184,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! OpenMP required indexing variables integer :: nd1_1st(at%mx_halo), nd2_1st(mx_absb) -!$omp parallel default(none) & -!$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -!$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -!$omp a, b, c) & -!$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & -!$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & -!$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & -!$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st) ! Loop over atoms k in current A-halo partn do k = 1, ahalo%nh_part(kpart) k_in_halo = ahalo%j_beg(kpart) + k - 1 @@ -259,7 +251,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & !$omp end do end do ! End of i = 1, at%n_hnab end do ! End of k = 1, nahpart -!$omp end parallel return end subroutine m_kern_max !!***** @@ -405,14 +396,6 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! For OpenMP integer :: nd1_1st(at%mx_halo), nd2_1st(mx_absb) -!$omp parallel default(none) & -!$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -!$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -!$omp a, b, c) & -!$omp private(j_in_halo, k_in_halo, k_in_part, nbkbeg, & -!$omp nb_nd_kbeg, nd1, nd2, nd3, i, j, k, jpart, jseq, & -!$omp jbnab2ch, icad, nabeg, nbbeg, ncbeg, naaddr, nbaddr, & -!$omp ncaddr, n1, n2, n3, i_in_prim, nd1_1st, nd2_1st) ! Loop over atoms k in current A-halo partn !$omp do schedule(runtime) do k = 1, ahalo%nh_part(kpart) @@ -472,7 +455,6 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & end do end do !$omp end do -!$omp end parallel return end subroutine m_kern_min !!***** diff --git a/src/multiply_kernel_ompGemm.f90 b/src/multiply_kernel_ompGemm.f90 index 018295608..b9f3ef531 100644 --- a/src/multiply_kernel_ompGemm.f90 +++ b/src/multiply_kernel_ompGemm.f90 @@ -168,15 +168,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! OpenMP required indexing variables integer :: nd1_1st(at%mx_halo), nd2_1st(mx_absb) -! !$omp parallel default(none) & -! !$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -! !$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -! !$omp a, b, c) & -! !$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & -! !$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & -! !$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & -! !$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st, & -! !$omp tempa, tempb, tempc, prend1, maxlen, sofar) allocate(tempa(1,1), tempc(1,1)) do k = 1, ahalo%nh_part(kpart) ! Loop over atoms k in current A-halo partn k_in_halo = ahalo%j_beg(kpart) + k - 1 @@ -419,15 +410,6 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! mx_a = maxnsf ! maxlen = maxnsf * max(nbnab) -! !$omp parallel default(none) & -! !$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -! !$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -! !$omp a, b, c) & -! !$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & -! !$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & -! !$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & -! !$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st, & -! !$omp tempb, tempc, maxlen, sofar) do k = 1, ahalo%nh_part(kpart) ! Loop over atoms k in current A-halo partn k_in_halo = ahalo%j_beg(kpart) + k - 1 k_in_part = ahalo%j_seq(k_in_halo) diff --git a/src/multiply_kernel_ompGemm_m.f90 b/src/multiply_kernel_ompGemm_m.f90 index 6b369780a..fa933d6e9 100644 --- a/src/multiply_kernel_ompGemm_m.f90 +++ b/src/multiply_kernel_ompGemm_m.f90 @@ -168,16 +168,7 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! OpenMP required indexing variables integer :: nd1_1st(at%mx_halo), nd2_1st(mx_absb) -!$omp parallel default(none) & -!$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -!$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -!$omp a, b, c) & -!$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & -!$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & -!$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & -!$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st, & -!$omp tempa, tempb, tempc, maxnd1, maxnd2, maxnd3, & -!$omp maxlen, sofar) + ! Allocate tempa, tempb, tempc to largest possible size outside the loop maxnd1 = maxval(ahalo%ndimi) maxnd2 = maxval(bndim2) maxnd3 = maxval(ahalo%ndimj) @@ -276,7 +267,6 @@ subroutine m_kern_max(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & !$omp end do end do ! end of k = 1, nahpart deallocate(tempa, tempb, tempc) -!$omp end parallel return end subroutine m_kern_max !!***** @@ -420,16 +410,6 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & ! OpenMP required indexing variables integer :: nd1_1st(at%mx_halo), nd2_1st(mx_absb) -!$omp parallel default(none) & -!$omp shared(kpart, ibaddr, ib_nd_acc, nbnab, ibpart, ibseq, & -!$omp k_off, bndim2, mx_absb, mx_part, at, ahalo, chalo, & -!$omp a, b, c) & -!$omp private(i, j, k, j_in_halo, k_in_halo, k_in_part, nbkbeg, & -!$omp nb_nd_kbeg, nd1, nd2, nd3, jpart, jseq, jbnab2ch, & -!$omp nabeg, nbbeg, ncbeg, i_in_prim, icad, naaddr, & -!$omp nbaddr, ncaddr, n1, n2, n3, nd1_1st, nd2_1st, & -!$omp tempb, tempc, maxnd1, maxnd2, maxnd3, maxlen, & -!$omp sofar) maxnd1 = maxval(ahalo%ndimi) maxnd2 = maxval(bndim2) maxnd3 = maxval(ahalo%ndimj) @@ -503,7 +483,6 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & !$omp end do end do deallocate(tempb, tempc) -!$omp end parallel return end subroutine m_kern_min !!***** From f8c1918102dbc6aa7a6465c1efe735265f17bd99 Mon Sep 17 00:00:00 2001 From: Tuomas Koskela Date: Wed, 4 Oct 2023 10:48:59 +0100 Subject: [PATCH 7/7] Remove empty line --- src/multiply_kernel_default.f90 | 1 - 1 file changed, 1 deletion(-) diff --git a/src/multiply_kernel_default.f90 b/src/multiply_kernel_default.f90 index 072e9dfe3..0f1f46dcc 100644 --- a/src/multiply_kernel_default.f90 +++ b/src/multiply_kernel_default.f90 @@ -356,7 +356,6 @@ subroutine m_kern_min(k_off, kpart, ib_nd_acc, ibaddr, nbnab, & integer :: naaddr, nbaddr, ncaddr !$omp single - ! Loop over atoms k in current A-halo partn do k = 1, ahalo%nh_part(kpart) k_in_halo = ahalo%j_beg(kpart) + k - 1