diff --git a/Project.toml b/Project.toml index 3db72e564..b25d7c22c 100644 --- a/Project.toml +++ b/Project.toml @@ -6,6 +6,7 @@ version = "0.4.13" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232" CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" @@ -26,6 +27,7 @@ Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ROCmDeviceLibs_jll = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/src/array.jl b/src/array.jl index 83fb890dd..51ed8ea59 100644 --- a/src/array.jl +++ b/src/array.jl @@ -10,10 +10,7 @@ struct ROCKernelContext <: AbstractKernelContext end function GPUArrays.gpu_call(::ROCArrayBackend, f, args, threads::Int, blocks::Int; name::Union{String,Nothing}) groupsize, gridsize = threads, blocks * threads - wait(@roc groupsize=groupsize gridsize=gridsize f(ROCKernelContext(), args...)) -end -function GPUArrays.gpu_call(::ROCArrayBackend, f, args; elements::Int, name::Union{String,Nothing}=nothing) - wait(@roc groupsize=min(elements, 64) gridsize=elements f(ROCKernelContext(), args...)) + @roc groupsize=groupsize gridsize=gridsize name=name f(ROCKernelContext(), args...) 
end ## on-device @@ -21,11 +18,11 @@ end # indexing for (f, froc) in ( - (:blockidx, :blockIdx), - (:blockdim, :blockDim), - (:threadidx, :threadIdx), - (:griddim, :gridGroupDim) - ) + (:blockidx, :blockIdx), + (:blockdim, :blockDim), + (:threadidx, :threadIdx), + (:griddim, :gridGroupDim) +) @eval @inline GPUArrays.$f(::ROCKernelContext) = AMDGPU.$froc().x end @@ -50,7 +47,6 @@ end return end - # # Host abstractions # @@ -81,16 +77,34 @@ end unsafe_free!(xs::ROCArray) = Mem.free_if_live(xs.buf) -wait!(x::ROCArray) = wait!(x.syncstate) mark!(x::ROCArray, s) = mark!(x.syncstate, s) -wait!(xs::Vector{<:ROCArray}) = foreach(wait!, xs) -mark!(xs::Vector{<:ROCArray}, s) = foreach(x->mark!(x,s), xs) -wait!(xs::NTuple{N,<:ROCArray} where N) = foreach(wait!, xs) -mark!(xs::NTuple{N,<:ROCArray} where N, s) = foreach(x->mark!(x,s), xs) +mark!(xs::Vector{<:ROCArray}, s) = foreach(x -> mark!(x,s), xs) +mark!(xs::NTuple{N,<:ROCArray} where N, s) = foreach(x -> mark!(x,s), xs) + +wait!(x::ROCArray; hip::Bool = true, hsa::Bool = true) = wait!(x.syncstate; hip, hsa) +wait!(xs::Vector{<:ROCArray}; hip::Bool = true, hsa::Bool = true) = foreach(x -> wait!(x; hip, hsa), xs) +wait!(xs::NTuple{N,<:ROCArray} where N; hip::Bool = true, hsa::Bool = true) = foreach(x -> wait!(x; hip, hsa), xs) + +hsa_wait!(x::ROCArray) = wait!(x.syncstate; hip=false, hsa=true) +hsa_wait!(xs::Vector{<:ROCArray}) = foreach(x -> wait!(x; hip=false, hsa=true), xs) +hsa_wait!(xs::NTuple{N,<:ROCArray} where N) = foreach(x -> wait!(x; hip=false, hsa=true), xs) + +hip_wait!(x::ROCArray) = wait!(x.syncstate; hip=true, hsa=false) +hip_wait!(xs::Vector{<:ROCArray}) = foreach(x -> wait!(x; hip=true, hsa=false), xs) +hip_wait!(xs::NTuple{N,<:ROCArray} where N) = foreach(x -> wait!(x; hip=true, hsa=false), xs) + function Adapt.adapt_storage(::Runtime.WaitAdaptor, x::ROCArray) Runtime.wait!(x.syncstate) x end +function Adapt.adapt_storage(::Runtime.HIPWaitAdaptor, x::ROCArray) + Runtime.wait!(x.syncstate; hip=true, 
hsa=false) + x +end +function Adapt.adapt_storage(::Runtime.HSAWaitAdaptor, x::ROCArray) + Runtime.wait!(x.syncstate; hip=false, hsa=true) + x +end function Adapt.adapt_storage(ma::Runtime.MarkAdaptor, x::ROCArray) Runtime.mark!(x.syncstate, ma.s) x @@ -183,6 +197,7 @@ function Base.copyto!(dest::Array{T}, d_offset::Integer, @boundscheck checkbounds(dest, d_offset+amount-1) @boundscheck checkbounds(source, s_offset+amount-1) wait!(source) + synchronize() Mem.download!(pointer(dest, d_offset), Mem.view(source.buf, source.offset + (s_offset-1)*sizeof(T)), amount*sizeof(T)) diff --git a/src/blas/rocBLAS.jl b/src/blas/rocBLAS.jl index 9fd6135c3..0f8b4e86f 100644 --- a/src/blas/rocBLAS.jl +++ b/src/blas/rocBLAS.jl @@ -1,9 +1,9 @@ module rocBLAS using ..AMDGPU -import AMDGPU: wait!, mark!, librocblas, AnyROCArray +import AMDGPU: hsa_wait!, mark!, librocblas, AnyROCArray import AMDGPU: HandleCache, HIP, library_state -import .HIP: HIPContext, HIPStream, hipContext_t, hipStream_t, hipEvent_t +import .HIP: HIPContext, HIPStream, HIPEvent, hipContext_t, hipStream_t, hipEvent_t using LinearAlgebra using CEnum diff --git a/src/blas/wrappers.jl b/src/blas/wrappers.jl index 97fdcdf1d..779e50e32 100644 --- a/src/blas/wrappers.jl +++ b/src/blas/wrappers.jl @@ -57,10 +57,10 @@ for (fname, elty) in ((:rocblas_dcopy,:Float64), incx::Integer, DY::ROCArray{$elty}, incy::Integer) - wait!((DX,DY)) + hsa_wait!((DX,DY)) (; handle, stream) = lib_state() $(fname)(handle, n, DX, incx, DY, incy) |> check - mark!((DX,DY), stream) + mark!((DX,DY), HIPEvent(stream)) DY end end @@ -76,10 +76,10 @@ for (fname, elty) in ((:rocblas_dscal,:Float64), DA::$elty, DX::ROCArray{$elty}, incx::Integer) - wait!(DX) + hsa_wait!(DX) (; handle, stream) = lib_state() $(fname)(handle, n, Ref(DA), DX, incx) |> check - mark!(DX, stream) + mark!(DX, HIPEvent(stream)) DX end end @@ -92,10 +92,10 @@ for (fname, elty, celty) in ((:rocblas_sscal, :Float32, :ComplexF32), DA::$elty, DX::ROCArray{$celty}, incx::Integer) - 
wait!(DX) + hsa_wait!(DX) (; handle, stream) = lib_state() $(fname)(handle, 2*n, Ref(DA), DX, incx) |> check - mark!(DX, stream) + mark!(DX, HIPEvent(stream)) DX end end @@ -115,7 +115,7 @@ for (jname, fname, elty) in ((:dot,:rocblas_ddot,:Float64), DY::ROCArray{$elty}, incy::Integer) result = Ref{$elty}() - wait!((DX,DY)) + hsa_wait!((DX,DY)) $(fname)(handle(), n, DX, incx, DY, incy, result) |> check return result[] end @@ -132,7 +132,7 @@ for (fname, elty, ret_type) in ((:rocblas_dnrm2,:Float64,:Float64), X::ROCArray{$elty}, incx::Integer) result = Ref{$ret_type}() - wait!(X) + hsa_wait!(X) $(fname)(handle(), n, X, incx, result) |> check return result[] end @@ -151,7 +151,7 @@ for (fname, elty, ret_type) in ((:rocblas_dasum,:Float64,:Float64), X::ROCArray{$elty}, incx::Integer) result = Ref{$ret_type}() - wait!(X) + hsa_wait!(X) $(fname)(handle(), n, X, incx, result) |> check return result[] end @@ -171,10 +171,10 @@ for (fname, elty) in ((:rocblas_daxpy,:Float64), incx::Integer, dy::ROCArray{$elty}, incy::Integer) - wait!((dx,dy)) + hsa_wait!((dx,dy)) (; handle, stream) = lib_state() $(fname)(handle, n, Ref(alpha), dx, incx, dy, incy) |> check - mark!((dx,dy), stream) + mark!((dx,dy), HIPEvent(stream)) dy end end @@ -259,10 +259,10 @@ for (fname, elty) in ((:rocblas_dgemv,:Float64), lda = max(1,stride(A,2)) incx = stride(X,1) incy = stride(Y,1) - wait!((A,X,Y)) + hsa_wait!((A,X,Y)) (; handle, stream) = lib_state() $(fname)(handle, roctrans, m, n, Ref(alpha), A, lda, X, incx, Ref(beta), Y, incy) |> check - mark!((A,X,Y), stream) + mark!((A,X,Y), HIPEvent(stream)) Y end function gemv(trans::Char, alpha::($elty), A::ROCMatrix{$elty}, X::ROCVector{$elty}) @@ -298,10 +298,10 @@ for (fname, elty) in ((:rocblas_dgbmv,:Float64), lda = max(1,stride(A,2)) incx = stride(x,1) incy = stride(y,1) - wait!((A,x,y)) + hsa_wait!((A,x,y)) (; handle, stream) = lib_state() $(fname)(handle, roctrans, m, n, kl, ku, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check - 
mark!((A,x,y), stream) + mark!((A,x,y), HIPEvent(stream)) y end function gbmv(trans::Char, @@ -347,10 +347,10 @@ for (fname, elty) in ((:rocblas_dsymv,:Float64), lda = max(1,stride(A,2)) incx = stride(x,1) incy = stride(y,1) - wait!((A,x,y)) + hsa_wait!((A,x,y)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check - mark!((A,x,y), stream) + mark!((A,x,y), HIPEvent(stream)) y end function symv(uplo::Char, alpha::($elty), A::ROCMatrix{$elty}, x::ROCVector{$elty}) @@ -381,10 +381,10 @@ for (fname, elty) in ((:rocblas_zhemv,:ComplexF64), lda = max(1,stride(A,2)) incx = stride(x,1) incy = stride(y,1) - wait!((A,x,y)) + hsa_wait!((A,x,y)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check - mark!((A,x,y), stream) + mark!((A,x,y), HIPEvent(stream)) y end function hemv(uplo::Char, alpha::($elty), A::ROCMatrix{$elty}, @@ -420,10 +420,10 @@ for (fname, elty) in ((:rocblas_dsbmv,:Float64), lda = max(1,stride(A,2)) incx = stride(x,1) incy = stride(y,1) - wait!((A,x,y)) + hsa_wait!((A,x,y)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, k, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check - mark!((A,x,y), stream) + mark!((A,x,y), HIPEvent(stream)) y end function sbmv(uplo::Char, k::Integer, alpha::($elty), @@ -457,10 +457,10 @@ for (fname, elty) in ((:rocblas_zhbmv,:ComplexF64), lda = max(1,stride(A,2)) incx = stride(x,1) incy = stride(y,1) - wait!((A,x,y)) + hsa_wait!((A,x,y)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, k, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check - mark!((A,x,y), stream) + mark!((A,x,y), HIPEvent(stream)) y end function hbmv(uplo::Char, k::Integer, alpha::($elty), @@ -496,10 +496,10 @@ for (fname, elty) in ((:rocblas_stbmv,:Float32), if n != length(x) throw(DimensionMismatch("")) end lda = max(1,stride(A,2)) incx = stride(x,1) - wait!((A,x)) + hsa_wait!((A,x)) (; 
handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, rocdiag, n, k, A, lda, x, incx) |> check - mark!((A,x), stream) + mark!((A,x), HIPEvent(stream)) x end function tbmv(uplo::Char, @@ -533,10 +533,10 @@ for (fname, elty) in ((:rocblas_stbsv,:Float32), if n != length(x) throw(DimensionMismatch("")) end lda = max(1,stride(A,2)) incx = stride(x,1) - wait!((A,x)) + hsa_wait!((A,x)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, rocdiag, n, k, A, lda, x, incx) |> check - mark!((A,x), stream) + mark!((A,x), HIPEvent(stream)) x end function tbsv(uplo::Char, @@ -571,10 +571,10 @@ for (fname, elty) in ((:rocblas_dtrmv,:Float64), rocdiag = rocblasdiag(diag) lda = max(1,stride(A,2)) incx = stride(x,1) - wait!((A,x)) + hsa_wait!((A,x)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, rocdiag, n, A, lda, x, incx) |> check - mark!((A,x), stream) + mark!((A,x), HIPEvent(stream)) x end function trmv(uplo::Char, @@ -608,10 +608,10 @@ for (fname, elty) in ((:rocblas_dtrsv,:Float64), rocdiag = rocblasdiag(diag) lda = max(1,stride(A,2)) incx = stride(x,1) - wait!((A,x)) + hsa_wait!((A,x)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, rocdiag, n, A, lda, x, incx) |> check - mark!((A,x), stream) + mark!((A,x), HIPEvent(stream)) x end function trsv(uplo::Char, @@ -640,10 +640,10 @@ for (fname, elty) in ((:rocblas_dger,:Float64), incx = stride(x,1) incy = stride(y,1) lda = max(1,stride(A,2)) - wait!((x,y,A)) + hsa_wait!((x,y,A)) (; handle, stream) = lib_state() $(fname)(handle, m, n, Ref(alpha), x, incx, y, incy, A, lda) |> check - mark!((x,y,A), stream) + mark!((x,y,A), HIPEvent(stream)) A end end @@ -666,10 +666,10 @@ for (fname, elty) in ((:rocblas_dsyr,:Float64), length(x) == n || throw(DimensionMismatch("Length of vector must be the same as the matrix dimensions")) incx = stride(x,1) lda = max(1,stride(A,2)) - wait!((x,A)) + hsa_wait!((x,A)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, 
Ref(alpha), x, incx, A, lda) |> check - mark!((x,A), stream) + mark!((x,A), HIPEvent(stream)) A end end @@ -689,10 +689,10 @@ for (fname, elty) in ((:rocblas_zher,:ComplexF64), length(x) == n || throw(DimensionMismatch("Length of vector must be the same as the matrix dimensions")) incx = stride(x,1) lda = max(1,stride(A,2)) - wait!((x,A)) + hsa_wait!((x,A)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, Ref(alpha), x, incx, A, lda) |> check - mark!((x,A), stream) + mark!((x,A), HIPEvent(stream)) A end end @@ -715,10 +715,10 @@ for (fname, elty) in ((:rocblas_zher2,:ComplexF64), incx = stride(x,1) incy = stride(y,1) lda = max(1,stride(A,2)) - wait!((x,y,A)) + hsa_wait!((x,y,A)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, Ref(alpha), x, incx, y, incy, A, lda) |> check - mark!((x,y,A), stream) + mark!((x,y,A), HIPEvent(stream)) A end end @@ -748,12 +748,12 @@ for (fname, elty) in lda = max(1, stride(A, 2)) ldb = max(1, stride(B, 2)) ldc = max(1, stride(C, 2)) - wait!((A, B, C)) + hsa_wait!((A, B, C)) (; handle, stream) = lib_state() $(fname)( handle, rocblasop(transA), rocblasop(transB), m, n, k, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check - mark!((A, B, C), stream) + mark!((A, B, C), HIPEvent(stream)) C end function gemm(transA::Char, @@ -866,7 +866,7 @@ for (fname, elty) in ) m, k, n, lda, ldb, ldc = _check_gemm_batched_dims( transA, transB, A, B, C) - wait!((A, B, C)) + hsa_wait!((A, B, C)) batch_count = size(C, 3) a_broadcast = (size(A, 3) == 1) && (batch_count > 1) @@ -880,7 +880,7 @@ for (fname, elty) in handle, rocblasop(transA), rocblasop(transB), m, n, k, Ref(alpha), Ab, lda, Bb, ldb, Ref(beta), Cb, ldc, batch_count) |> check - mark!((A, B, C), stream) + mark!((A, B, C), HIPEvent(stream)) C end function gemm_batched( @@ -946,10 +946,10 @@ for (fname, elty) in strideB = stride(B, 3) strideC = stride(C, 3) batchCount = size(A, 3) - wait!((A,B,C)) + hsa_wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, 
roctransA, roctransB, m, n, k, Ref(alpha), A, lda, strideA, B, ldb, strideB, Ref(beta), C, ldc, strideC, batchCount) |> check - mark!((A,B,C), stream) + mark!((A,B,C), HIPEvent(stream)) C end function gemm_strided_batched(transA::Char, @@ -995,10 +995,10 @@ for (fname, elty) in ((:rocblas_dsymm,:Float64), lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - wait!((A,B,C)) + hsa_wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, rocside, rocuplo, m, n, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check - mark!((A,B,C), stream) + mark!((A,B,C), HIPEvent(stream)) C end function symm(side::Char, @@ -1038,10 +1038,10 @@ for (fname, elty) in ((:rocblas_dsyrk,:Float64), k = size(A, trans == 'N' ? 2 : 1) lda = max(1,stride(A,2)) ldc = max(1,stride(C,2)) - wait!((A,C)) + hsa_wait!((A,C)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, Ref(beta), C, ldc) |> check - mark!((A,C), stream) + mark!((A,C), HIPEvent(stream)) C end end @@ -1081,10 +1081,10 @@ for (fname, elty) in ((:rocblas_zhemm,:ComplexF64), lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - wait!((A,B,C)) + hsa_wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, rocside, rocuplo, m, n, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check - mark!((A,B,C), stream) + mark!((A,B,C), HIPEvent(stream)) C end function hemm(uplo::Char, @@ -1118,10 +1118,10 @@ for (fname, elty) in ((:rocblas_zherk,:ComplexF64), k = size(A, trans == 'N' ? 
2 : 1) lda = max(1,stride(A,2)) ldc = max(1,stride(C,2)) - wait!((A,C)) + hsa_wait!((A,C)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, Ref(beta), C, ldc) |> check - mark!((A,C), stream) + mark!((A,C), HIPEvent(stream)) C end function herk(uplo::Char, trans::Char, alpha::($elty), A::ROCVecOrMat{$elty}) @@ -1160,10 +1160,10 @@ for (fname, elty) in ((:rocblas_dsyr2k,:Float64), lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - wait!((A,B,C)) + hsa_wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check - mark!((A,B,C), stream) + mark!((A,B,C), HIPEvent(stream)) C end end @@ -1206,10 +1206,10 @@ for (fname, elty1, elty2) in ((:rocblas_zher2k,:ComplexF64,:Float64), lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - wait!((A,B,C)) + hsa_wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check - mark!((A,B,C), stream) + mark!((A,B,C), HIPEvent(stream)) C end function her2k(uplo::Char, @@ -1249,12 +1249,12 @@ for (mmname, smname, elty) in if nA != (side == 'L' ? m : n) throw(DimensionMismatch("trmm!")) end lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) - wait!((A,B)) + hsa_wait!((A,B)) (; handle, stream) = lib_state() $(mmname)( handle, rocside, rocuplo, roctransa, rocdiag, m, n, Ref(alpha), A, lda, B, ldb) |> check - mark!((A,B), stream) + mark!((A,B), HIPEvent(stream)) B end function trmm( @@ -1278,10 +1278,10 @@ for (mmname, smname, elty) in if nA != (side == 'L' ? 
m : n) throw(DimensionMismatch("trsm!")) end lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) - wait!((A,B)) + hsa_wait!((A,B)) (; handle, stream) = lib_state() $(smname)(handle, rocside, rocuplo, roctransa, rocdiag, m, n, Ref(alpha), A, lda, B, ldb) |> check - mark!((A,B), stream) + mark!((A,B), HIPEvent(stream)) B end function trsm( @@ -1325,10 +1325,10 @@ for (fname, elty) in ldb = max(1,stride(B[1],2)) Aptrs = device_batch(A) Bptrs = device_batch(B) - wait!((A,B)) + hsa_wait!((A,B)) (; handle, stream) = lib_state() $(fname)(handle, rocside, rocuplo, roctransa, rocdiag, m, n, Ref(alpha), Aptrs, lda, Bptrs, ldb, length(A)) |> check - mark!((A,B), stream) + mark!((A,B), HIPEvent(stream)) B end function trsm_batched(side::Char, @@ -1372,10 +1372,10 @@ for (fname, elty) in ((:rocblas_dgeam,:Float64), lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - wait!((A,B,C)) + hsa_wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, roctransa, roctransb, m, n, Ref(alpha), A, lda, Ref(beta), B, ldb, C, ldc) |> check - mark!((A,B,C), stream) + mark!((A,B,C), HIPEvent(stream)) C end function geam(transa::Char, @@ -1417,14 +1417,15 @@ for (fname, elty) in Aptrs = device_batch(A) info = ROCArray{Cint}(undef, length(A)) pivotArray = Pivot ? 
ROCArray{Int32}(undef, (n, length(A))) : C_NULL - wait!(A) + hsa_wait!(A) (; handle, stream) = lib_state() $(fname)(handle, n, Aptrs, lda, pivotArray, info, length(A)) |> check if( !Pivot ) pivotArray = ROCArray(zeros(Cint, (n, length(A)))) end - mark!((A, info), stream) - pivotArray != C_NULL && mark!(pivotArray, stream) + event = HIPEvent(stream) + mark!((A, info), event) + pivotArray != C_NULL && mark!(pivotArray, event) pivotArray, info, A end function getrf_batched(A::Array{ROCMatrix{$elty},1}, @@ -1459,11 +1460,11 @@ for (fname, elty) in Aptrs = device_batch(A) Cptrs = device_batch(C) info = ROCArray(zeros(Cint,length(A))) - wait!(A) - wait!(pivotArray) + hsa_wait!(A) + hsa_wait!(pivotArray) (; handle, stream) = lib_state() $(fname)(handle, n, Aptrs, lda, pivotArray, Cptrs, ldc, info, length(A)) |> check - mark!((A, pivotArray, info, C), stream) + mark!((A, pivotArray, info, C), HIPEvent(stream)) pivotArray, info, C end end @@ -1494,10 +1495,10 @@ for (fname, elty) in Aptrs = device_batch(A) Cptrs = device_batch(C) info = ROCArray(zeros(Cint,length(A))) - wait!(A) + hsa_wait!(A) (; handle, stream) = lib_state() $(fname)(handle, n, Aptrs, lda, Cptrs, ldc, info, length(A)) |> check - mark!((A, info, C), stream) + mark!((A, info, C), HIPEvent(stream)) info, C end end @@ -1522,13 +1523,13 @@ for (fname, elty) in end Tauptrs = device_batch(TauArray) info = zero(Cint) - wait!(A) + hsa_wait!(A) (; handle, stream) = lib_state() $(fname)(handle, m, n, Aptrs, lda, Tauptrs, Ref(info), length(A)) |> check if( info != 0 ) throw(ArgumentError,string("Invalid value at ",-info)) end - mark!((A, TauArray), stream) + mark!((A, TauArray), HIPEvent(stream)) TauArray, A end function geqrf_batched(A::Array{ROCMatrix{$elty},1}) @@ -1570,14 +1571,14 @@ for (fname, elty) in Cptrs = device_batch(C) info = zero(Cint) infoarray = ROCArray(zeros(Cint, length(A))) - wait!(A) - wait!(C) + hsa_wait!(A) + hsa_wait!(C) (; handle, stream) = lib_state() $(fname)(handle, roctrans, m, n, nrhs, 
Aptrs, lda, Cptrs, ldc, Ref(info), infoarray, length(A)) |> check if( info != 0 ) throw(ArgumentError,string("Invalid value at ",-info)) end - mark!((A, C, infoarray), stream) + mark!((A, C, infoarray), HIPEvent(stream)) A, C, infoarray end function gels_batched(trans::Char, @@ -1608,10 +1609,10 @@ for (fname, elty) in ((:rocblas_ddgmm,:Float64), lda = max(1,stride(A,2)) incx = stride(X,1) ldc = max(1,stride(C,2)) - wait!((A,X,C)) + hsa_wait!((A,X,C)) (; handle, stream) = lib_state() $(fname)(handle, rocside, m, n, A, lda, X, incx, C, ldc) |> check - mark!((A,X,C), stream) + mark!((A,X,C), HIPEvent(stream)) C end function dgmm(mode::Char, diff --git a/src/cache.jl b/src/cache.jl index 82a546aca..beda5fc83 100644 --- a/src/cache.jl +++ b/src/cache.jl @@ -115,7 +115,7 @@ function library_state( state = get!(() -> new_state(tls), states, tls.context) @noinline function update_stream(tls, state) - set_stream(new_handle, tls.stream) + set_stream(state.handle, tls.stream) return (; state.handle, tls.stream) end if state.stream != tls.stream diff --git a/src/device/gcn/array.jl b/src/device/gcn/array.jl index 8ba74490f..e16285a98 100644 --- a/src/device/gcn/array.jl +++ b/src/device/gcn/array.jl @@ -30,7 +30,7 @@ struct ROCDeviceArray{T,N,A} <: AbstractArray{T,N} ptr::LLVMPtr{T,A} # inner constructors, fully parameterized, exact types (ie. Int not <:Integer) - ROCDeviceArray{T,N,A}(shape::Dims{N}, ptr::LLVMPtr{T,A}) where {T,A,N} = new(shape,ptr) + ROCDeviceArray{T,N,A}(shape::Dims{N}, ptr::LLVMPtr{T,A}) where {T,A,N} = new(shape, ptr) end # Define `khash` function to reduce runtime dispatches. 
diff --git a/src/dnn/MIOpen.jl b/src/dnn/MIOpen.jl index b619748b4..08b60e41e 100644 --- a/src/dnn/MIOpen.jl +++ b/src/dnn/MIOpen.jl @@ -4,7 +4,7 @@ using ..AMDGPU import AMDGPU.Runtime.Mem import AMDGPU: ROCArray, ROCDevice, LockedObject import AMDGPU: HandleCache, HIP, library_state -import .HIP: hipStream_t +import .HIP: HIPEvent, hipStream_t using CEnum using GPUArrays diff --git a/src/dnn/activations.jl b/src/dnn/activations.jl index 1db7635c5..6d77d6619 100644 --- a/src/dnn/activations.jl +++ b/src/dnn/activations.jl @@ -122,7 +122,7 @@ function _activation( miopenActivationForward( handle, desc.handle, Ref{Float32}(1f0), xdesc.handle, x, Ref{Float32}(0f0), ydesc.handle, y) |> check - AMDGPU.mark!(y, stream) + AMDGPU.mark!(y, HIPEvent(stream)) y end @@ -137,6 +137,6 @@ function _∇activation( handle, desc, Ref{Float32}(1f0), ydesc.handle, y, dydesc.handle, dy, xdesc.handle, x, Ref{Float32}(0f0), dxdesc.handle, dx) |> check - AMDGPU.mark!(dx, stream) + AMDGPU.mark!(dx, HIPEvent(stream)) dx end diff --git a/src/dnn/batchnorm.jl b/src/dnn/batchnorm.jl index 2178beb2e..7fefdece1 100644 --- a/src/dnn/batchnorm.jl +++ b/src/dnn/batchnorm.jl @@ -38,7 +38,7 @@ function batchnorm_training( handle, mode, Ref{Float32}(1f0), Ref{Float32}(0f0), xdesc.handle, x, ydesc.handle, y, bndesc.handle, γ, β, factor, μ, ν, ϵ, μ_saved, ν_saved) |> check - AMDGPU.mark!(y, stream) + AMDGPU.mark!(y, HIPEvent(stream)) y, μ_saved, ν_saved end @@ -77,7 +77,7 @@ function batchnorm_inference( handle, mode, Ref{Float32}(1f0), Ref{Float32}(0f0), xdesc.handle, x, ydesc.handle, y, bndesc.handle, γ, β, μ, ν, ϵ) |> check - AMDGPU.mark!(y, stream) + AMDGPU.mark!(y, HIPEvent(stream)) y end @@ -101,7 +101,7 @@ function ∇batchnorm( Ref{Float32}(1f0), Ref{Float32}(0f0), xdesc.handle, x, dydesc.handle, dy, dxdesc.handle, dx, bndesc.handle, γ, dγ, dβ, ϵ, μ_saved, ν_saved) |> check - AMDGPU.mark!((dx, dγ, dβ), stream) + AMDGPU.mark!((dx, dγ, dβ), HIPEvent(stream)) dx, dγ, dβ end diff --git 
a/src/dnn/convolution.jl b/src/dnn/convolution.jl index 2cb491f5c..80a2c4396 100644 --- a/src/dnn/convolution.jl +++ b/src/dnn/convolution.jl @@ -132,7 +132,7 @@ function convolution!( handle, Ref{Float32}(1f0), xdesc.handle, x, wdesc.handle, w, cdesc.handle, perf_results.fwd_algo, Ref{Float32}(0f0), ydesc.handle, y, workspace.data.ptr, perf_results.memory) |> check - AMDGPU.mark!(y, stream) + AMDGPU.mark!(y, HIPEvent(stream)) y end @@ -176,7 +176,7 @@ function ∇convolution_weight!( handle, Ref{Float32}(1f0), dydesc.handle, dy, xdesc.handle, x, cdesc.handle, perf_algo.bwd_weights_algo, Ref{Float32}(0f0), ∇wdesc.handle, ∇w, workspace.data.ptr, perf_algo.memory) |> check - AMDGPU.mark!(∇w, stream) + AMDGPU.mark!(∇w, HIPEvent(stream)) ∇w end @@ -220,7 +220,7 @@ function ∇convolution_data!( handle, Ref{Float32}(1f0), dydesc.handle, dy, wdesc.handle, w, cdesc.handle, perf_algo.bwd_data_algo, Ref{Float32}(0f0), ∇xdesc.handle, ∇x, workspace.data.ptr, perf_algo.memory) |> check - AMDGPU.mark!(∇x, stream) + AMDGPU.mark!(∇x, HIPEvent(stream)) ∇x end diff --git a/src/dnn/pooling.jl b/src/dnn/pooling.jl index eeab61238..d4377b7d5 100644 --- a/src/dnn/pooling.jl +++ b/src/dnn/pooling.jl @@ -109,7 +109,7 @@ function pool!( handle, pdesc.handle, Ref{Float32}(alpha), xdesc.handle, x, Ref{Float32}(beta), ydesc.handle, y, do_backward, wptr, wsize) |> check - AMDGPU.mark!(y, stream) + AMDGPU.mark!(y, HIPEvent(stream)) y, workspace end @@ -127,6 +127,6 @@ function ∇pool!( ydesc.handle, y, dydesc.handle, dy, xdesc.handle, x, Ref{Float32}(beta), dxdesc.handle, dx, (isnothing(workspace) ? 
C_NULL : workspace.data.ptr)) |> check - AMDGPU.mark!(dx, stream) + AMDGPU.mark!(dx, HIPEvent(stream)) dx end diff --git a/src/dnn/softmax.jl b/src/dnn/softmax.jl index b24f73ecd..afafa5b61 100644 --- a/src/dnn/softmax.jl +++ b/src/dnn/softmax.jl @@ -63,7 +63,7 @@ function _softmax!( miopenSoftmaxForward_V2( handle, Ref{Float32}(1f0), xdesc.handle, x, Ref{Float32}(0f0), ydesc.handle, y, algo, MIOPEN_SOFTMAX_MODE_CHANNEL) |> check - AMDGPU.mark!(y, stream) + AMDGPU.mark!(y, HIPEvent(stream)) y end @@ -83,7 +83,7 @@ function _∇softmax!( handle, Ref{Float32}(1f0), ydesc.handle, y, dydesc.handle, dy, Ref{Float32}(0f0), dxdesc.handle, dx, algo, MIOPEN_SOFTMAX_MODE_CHANNEL) |> check - AMDGPU.mark!(dx, stream) + AMDGPU.mark!(dx, HIPEvent(stream)) dx end diff --git a/src/highlevel.jl b/src/highlevel.jl index 6e434fbf1..82b452b10 100644 --- a/src/highlevel.jl +++ b/src/highlevel.jl @@ -3,7 +3,7 @@ import AMDGPU: Runtime, Compiler import .Runtime: ROCDevice, ROCQueue, ROCExecutable, ROCKernel, ROCSignal, ROCKernelSignal, HSAError import .Runtime: ROCDim, ROCDim3 -import .Runtime: wait!, mark! +import .Runtime: wait!, hip_wait!, hsa_wait!, mark! import .Compiler: rocfunction export @roc, rocconvert, rocfunction @@ -602,7 +602,7 @@ macro roc(ex...) 
if $launch if $wait - foreach($wait!, ($(var_exprs...),)) + foreach($hip_wait!, ($(var_exprs...),)) end local $kernel_instance = $create_kernel($kernel; $(kernel_kwargs...)) local $signal = $create_event( diff --git a/src/hip/HIP.jl b/src/hip/HIP.jl index e80646571..d4c4e09a3 100644 --- a/src/hip/HIP.jl +++ b/src/hip/HIP.jl @@ -153,4 +153,66 @@ function priority(stream::hipStream_t) priority_to_symbol(priority[]) end +mutable struct HIPEvent + handle::hipEvent_t + stream::hipStream_t +end + +Base.:(==)(a::HIPEvent, b::HIPEvent) = a.handle == b.handle + +function record(event::HIPEvent) + hipEventRecord(event.handle, event.stream) |> check + return event +end + +function isdone(event::HIPEvent) + query = hipEventQuery(event.handle) + if query == hipSuccess + return true + elseif query == hipErrorNotReady + return false + else + throw(HIPError(query)) + end +end + +function non_blocking_synchronize(event::HIPEvent) + isdone(event) && return true + + # spin (initially without yielding to minimize latency) + spins = 0 + while spins < 256 + if spins < 32 + ccall(:jl_cpu_pause, Cvoid, ()) + # Temporary solution before we have gc transition support in codegen. 
+ ccall(:jl_gc_safepoint, Cvoid, ()) + else + yield() + end + isdone(event) && return true + spins += 1 + end + return false +end + +wait(event::HIPEvent) = hipEventSynchronize(event.handle) |> check + +function synchronize(event::HIPEvent) + non_blocking_synchronize(event) || wait(event) + return +end + +function HIPEvent(stream::hipStream_t; do_record::Bool = true) + event_ref = Ref{hipEvent_t}() + hipEventCreateWithFlags(event_ref, hipEventDisableTiming) |> check + event = HIPEvent(event_ref[], stream) + do_record && record(event) + + finalizer(event) do e + hipEventDestroy(e.handle) |> check + end + event +end +HIPEvent(stream::HIPStream; do_record::Bool = true) = HIPEvent(stream.stream; do_record) + end diff --git a/src/hip/libhip.jl b/src/hip/libhip.jl index d26cabc30..13050d771 100644 --- a/src/hip/libhip.jl +++ b/src/hip/libhip.jl @@ -42,6 +42,30 @@ function hipDeviceGetName(name::Ptr{Cuchar}, len::Cint, device::hipDevice_t) (Ptr{Cuchar}, Cint, hipDevice_t), name, len, device) end +function hipEventCreate(event_ref::Ref{hipEvent_t}) + ccall((:hipEventCreate, libhip), hipError_t, (Ptr{hipEvent_t},), event_ref) +end + +function hipEventCreateWithFlags(event_ref::Ref{hipEvent_t}, flags::hipEventFlag_t) + ccall((:hipEventCreateWithFlags, libhip), hipError_t, (Ptr{hipEvent_t}, Cuint), event_ref, flags) +end + +function hipEventDestroy(event::hipEvent_t) + ccall((:hipEventDestroy, libhip), hipError_t, (hipEvent_t,), event) +end + +function hipEventRecord(event::hipEvent_t, stream::hipStream_t) + ccall((:hipEventRecord, libhip), hipError_t, (hipEvent_t, hipStream_t), event, stream) +end + +function hipEventQuery(event::hipEvent_t) + ccall((:hipEventQuery, libhip), hipError_t, (hipEvent_t,), event) +end + +function hipEventSynchronize(event::hipEvent_t) + ccall((:hipEventSynchronize, libhip), hipError_t, (hipEvent_t,), event) +end + function hipStreamCreateWithPriority(stream_ref::Ref{hipStream_t}, flags::Cuint, priority::Cint) ccall((:hipStreamCreateWithPriority, 
libhip), hipError_t, (Ptr{hipStream_t}, Cuint, Cint), stream_ref, flags, priority) diff --git a/src/hip/libhip_common.jl b/src/hip/libhip_common.jl index 58031e072..4c809249a 100644 --- a/src/hip/libhip_common.jl +++ b/src/hip/libhip_common.jl @@ -1,5 +1,12 @@ +@cenum hipEventFlag_t::Cuint begin + hipEventDefault = 0 + hipEventDisableTiming = 2 + hipEventInterprocess = 4 +end + @cenum hipError_t::UInt32 begin hipSuccess = 0 + hipErrorInvalidValue = 1 hipErrorOutOfMemory = 2 hipErrorNotInitialized = 3 hipErrorDeinitialized = 4 @@ -7,7 +14,17 @@ hipErrorProfilerNotInitialized = 6 hipErrorProfilerAlreadyStarted = 7 hipErrorProfilerAlreadyStopped = 8 + hipErrorInvalidConfiguration = 9 + hipErrorInvalidPitchValue = 12 + hipErrorInvalidSymbol = 13 + hipErrorInvalidDevicePointer = 17 + hipErrorInvalidMemcpyDirection = 21 hipErrorInsufficientDriver = 35 + hipErrorMissingConfiguration = 52 + hipErrorPriorLaunchFailure = 53 + hipErrorInvalidDeviceFunction = 98 + hipErrorNoDevice = 100 + hipErrorInvalidDevice = 101 hipErrorInvalidImage = 200 hipErrorInvalidContext = 201 hipErrorContextAlreadyCurrent = 202 @@ -31,38 +48,37 @@ hipErrorSharedObjectSymbolNotFound = 302 hipErrorSharedObjectInitFailed = 303 hipErrorOperatingSystem = 304 - hipErrorSetOnActiveProcess = 305 hipErrorInvalidHandle = 400 + hipErrorIllegalState = 401 hipErrorNotFound = 500 + hipErrorNotReady = 600 hipErrorIllegalAddress = 700 - hipErrorInvalidSymbol = 701 - # Runtime Error Codes start here. 
- hipErrorMissingConfiguration = 1001 - hipErrorMemoryAllocation = 1002 - hipErrorInitializationError = 1003 - hipErrorLaunchFailure = 1004 - hipErrorPriorLaunchFailure = 1005 - hipErrorLaunchTimeOut = 1006 - hipErrorLaunchOutOfResources = 1007 - hipErrorInvalidDeviceFunction = 1008 - hipErrorInvalidConfiguration = 1009 - hipErrorInvalidDevice = 1010 - hipErrorInvalidValue = 1011 - hipErrorInvalidDevicePointer = 1017 - hipErrorInvalidMemcpyDirection = 1021 - hipErrorUnknown = 1030 - hipErrorInvalidResourceHandle = 1033 - hipErrorNotReady = 1034 - hipErrorNoDevice = 1038 - hipErrorPeerAccessAlreadyEnabled = 1050 - hipErrorPeerAccessNotEnabled = 1051 + hipErrorLaunchOutOfResources = 701 + hipErrorLaunchTimeOut = 702 + hipErrorPeerAccessAlreadyEnabled = 704 + hipErrorPeerAccessNotEnabled = 705 + hipErrorSetOnActiveProcess = 708 + hipErrorContextIsDestroyed = 709 + hipErrorAssert = 710 + hipErrorHostMemoryAlreadyRegistered = 712 + hipErrorHostMemoryNotRegistered = 713 + hipErrorLaunchFailure = 719 + hipErrorCooperativeLaunchTooLarge = 720 + hipErrorNotSupported = 801 + hipErrorStreamCaptureUnsupported = 900 + hipErrorStreamCaptureInvalidated = 901 + hipErrorStreamCaptureMerge = 902 + hipErrorStreamCaptureUnmatched = 903 + hipErrorStreamCaptureUnjoined = 904 + hipErrorStreamCaptureIsolation = 905 + hipErrorStreamCaptureImplicit = 906 + hipErrorCapturedEvent = 907 + hipErrorStreamCaptureWrongThread = 908 + hipErrorGraphExecUpdateFailure = 910 + hipErrorUnknown = 999 + # HSA Runtime Error Codes start here. 
hipErrorRuntimeMemory = 1052 hipErrorRuntimeOther = 1053 - hipErrorHostMemoryAlreadyRegistered = 1061 - hipErrorHostMemoryNotRegistered = 1062 - hipErrorMapBufferObjectFailed = 1071 - hipErrorAssert = 1081 - hipErrorNotSupported = 1082 hipErrorTbd end diff --git a/src/mapreduce.jl b/src/mapreduce.jl index e3e3de6b0..91c446fbe 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -169,8 +169,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyROCArray{T}, # perform the actual reduction if reduce_groups == 1 # we can cover the dimensions to reduce using a single group - wait(@roc gridsize=gridsize groupsize=items partial_mapreduce_device( - f, op, init, Val(items), Rreduce, Rother, R′, A)) + @roc gridsize=gridsize groupsize=items partial_mapreduce_device( + f, op, init, Val(items), Rreduce, Rother, R′, A) else # we need multiple steps to cover all values to reduce partial = similar(R, (size(R)..., reduce_groups)) @@ -178,8 +178,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyROCArray{T}, # without an explicit initializer we need to copy from the output container partial .= R end - wait(@roc gridsize=gridsize groupsize=items partial_mapreduce_device( - f, op, init, Val(items), Rreduce, Rother, partial, A)) + @roc gridsize=gridsize groupsize=items partial_mapreduce_device( + f, op, init, Val(items), Rreduce, Rother, partial, A) GPUArrays.mapreducedim!(identity, op, R′, partial; init=init) end diff --git a/src/rand/random.jl b/src/rand/random.jl index c27f48b73..1ec68d095 100644 --- a/src/rand/random.jl +++ b/src/rand/random.jl @@ -1,10 +1,5 @@ # interfacing with Random standard library -using Random - -using GPUArrays - - mutable struct RNG <: Random.AbstractRNG handle::rocrand_generator typ::rocrand_rng_type @@ -24,7 +19,6 @@ end Base.unsafe_convert(::Type{rocrand_generator}, rng::RNG) = rng.handle - ## seeding function Random.seed!(rng::RNG, seed=Base.rand(UInt64), offset=0) rocrand_set_seed(rng, seed) @@ -48,7 +42,7 @@ for (f,T) in 
((:rocrand_generate, :UInt32), (:rocrand_generate_char,:Cuchar), function Random.rand!(rng::RNG, A::ROCArray{$(T)}) wait!(A) $(f)(rng, A, length(A)) - mark!(A, C_NULL) + mark!(A, HIPEvent(stream())) return A end end @@ -60,12 +54,12 @@ function inplace_pow2(A, f) len = length(A) if len > 1 && ispow2(len) f(A) - mark!(A, C_NULL) + mark!(A, HIPEvent(stream())) else padlen = max(2, nextpow(2, len)) B = similar(A, padlen) f(B) - mark!(B, C_NULL) + mark!(B, HIPEvent(stream())) copyto!(A, 1, B, 1, len) AMDGPU.unsafe_free!(B) end @@ -156,24 +150,24 @@ rand_poisson(rng::RNG, T::PoissonType, dim1::Integer, dims::Integer...; kwargs.. rand_poisson(rng, T, Dims((dim1, dims...)); kwargs...) # rand_logn! and rand_poisson! without specified rng -rand_logn!(A::rocRAND.LognormalArray; kwargs...) = rand_logn!(default_rng(), A; kwargs...) -rand_poisson!(A::rocRAND.PoissonArray; kwargs...) = rand_poisson!(default_rng(), A; kwargs...) +rand_logn!(A::rocRAND.LognormalArray; kwargs...) = rand_logn!(handle(), A; kwargs...) +rand_poisson!(A::rocRAND.PoissonArray; kwargs...) = rand_poisson!(handle(), A; kwargs...) -rand_logn(T::rocRAND.LognormalType, dims::Dims; kwargs...) = rand_logn(default_rng(), T, dims; kwargs...) -rand_poisson(T::rocRAND.PoissonType, dims::Dims; kwargs...) = rand_poisson(default_rng(), T, dims; kwargs...) +rand_logn(T::rocRAND.LognormalType, dims::Dims; kwargs...) = rand_logn(handle(), T, dims; kwargs...) +rand_poisson(T::rocRAND.PoissonType, dims::Dims; kwargs...) = rand_poisson(handle(), T, dims; kwargs...) rand_logn(T::rocRAND.LognormalType, dim1::Integer, dims::Integer...; kwargs...) = - rand_logn(default_rng(), T, Dims((dim1, dims...)); kwargs...) + rand_logn(handle(), T, Dims((dim1, dims...)); kwargs...) rand_poisson(T::rocRAND.PoissonType, dim1::Integer, dims::Integer...; kwargs...) = - rand_poisson(default_rng(), T, Dims((dim1, dims...)); kwargs...) + rand_poisson(handle(), T, Dims((dim1, dims...)); kwargs...) 
rand_logn(T::Type, dim1::Integer, dims::Integer...; kwargs...) = rand_logn!(ROCArray{T}(undef, dim1, dims...); kwargs...) rand_poisson(T::Type, dim1::Integer, dims::Integer...; kwargs...) = rand_poisson!(ROCArray{T}(undef, dim1, dims...); kwargs...) rand_logn(dim1::Integer, dims::Integer...; kwargs...) = - rand_logn(default_rng(), Dims((dim1, dims...)); kwargs...) + rand_logn(handle(), Dims((dim1, dims...)); kwargs...) rand_poisson(dim1::Integer, dims::Integer...; kwargs...) = - rand_poisson(default_rng(), Dims((dim1, dims...)); kwargs...) + rand_poisson(handle(), Dims((dim1, dims...)); kwargs...) rand_logn(T::Type, dims::Dims; kwargs...) = rand_logn!(ROCArray{T}(undef, dims...); kwargs...) rand_poisson(T::Type, dims::Dims; kwargs...) = rand_poisson!(ROCArray{T}(undef, dims...); kwargs...) rand_logn!(A::ROCArray; kwargs...) = diff --git a/src/rand/rocRAND.jl b/src/rand/rocRAND.jl index 48e54c878..b2cc09b80 100644 --- a/src/rand/rocRAND.jl +++ b/src/rand/rocRAND.jl @@ -1,12 +1,14 @@ module rocRAND import ..AMDGPU -import .AMDGPU: ROCArray, HandleCache, librocrand, mark!, wait! +import .AMDGPU: ROCArray, HandleCache, librocrand, mark!, wait!, library_state import ..HSA import ..HIP -import .HIP: HIPContext, HIPStream, hipStream_t +import .HIP: HIPContext, HIPStream, HIPEvent, hipStream_t using CEnum +using GPUArrays +using Random export rand_logn!, rand_poisson!, rand_logn, rand_poisson @@ -21,43 +23,17 @@ end # stdlib Random integration include("random.jl") -# Copied from CUDA.jl/lib/curand/CURAND.jl +const IDLE_RNGS = HandleCache{HIPContext, RNG}() -# cache for created, but unused handles -const idle_rngs = HandleCache{HIPContext,RNG}() +lib_state() = library_state( + :rocRAND, RNG, IDLE_RNGS, + () -> RNG(), r -> return, # RNG destroys itself in finalizer. 
+ (nh, s) -> begin + Random.seed!(nh) + rocrand_set_stream(nh.handle, s) + end) -function default_rng() - tls = AMDGPU.task_local_state() - - # every task maintains library state per device - LibraryState = @NamedTuple{rng::RNG} - states = get!(task_local_storage(), :rocRAND) do - Dict{HIPContext,LibraryState}() - end::Dict{HIPContext,LibraryState} - - # get library state - @noinline function new_state(tls) - new_rng = pop!(idle_rngs, tls.context) do - RNG() - end - - finalizer(current_task()) do task - push!(idle_rngs, tls.context, new_rng) do - # no need to do anything, as the RNG is collected by its finalizer - end - end - - Random.seed!(new_rng) - - rocrand_set_stream(new_rng.handle, tls.stream) - - (; rng=new_rng) - end - state = get!(states, tls.context) do - new_state(tls) - end - - return state.rng -end +handle() = lib_state().handle +stream() = lib_state().stream end diff --git a/src/random.jl b/src/random.jl index f74971611..f8c40ea37 100644 --- a/src/random.jl +++ b/src/random.jl @@ -16,7 +16,7 @@ function GPUArrays.default_rng(::Type{<:ROCArray}) end gpuarrays_rng() = GPUArrays.default_rng(ROCArray) -const rocrand_rng = librocrand !== nothing ? rocRAND.default_rng : gpuarrays_rng +const rocrand_rng = librocrand !== nothing ? 
rocRAND.handle : gpuarrays_rng # the interface is split in two levels: # - functions that extend the Random standard library, and take an RNG as first argument, diff --git a/src/runtime/kernel.jl b/src/runtime/kernel.jl index 7b1e9734c..163884102 100644 --- a/src/runtime/kernel.jl +++ b/src/runtime/kernel.jl @@ -118,7 +118,34 @@ function ROCKernel(kernel #= ::HostKernel =#; localmem::Int=0) group_segment_size = executable_symbol_kernel_group_segment_size(exec_symbol) group_segment_size = UInt32(group_segment_size + localmem) private_segment_size = executable_symbol_kernel_private_segment_size(exec_symbol) + if private_segment_size > MAXIMUM_SCRATCH_ALLOCATION + @debug """ + Excessive scratch allocation requested: $(Base.format_bytes(private_segment_size)). + Reducing per-lane scratch to: $(Base.format_bytes(Int(MAXIMUM_SCRATCH_ALLOCATION))). + """ + private_segment_size = MAXIMUM_SCRATCH_ALLOCATION + end ROCKernel(device, exe, symbol, localmem, kernel_object, kernarg_segment_size, group_segment_size, private_segment_size, Ptr{Cvoid}(0)) end + +"Sets the maximum amount of per-lane scratch memory that can be allocated for a +kernel. Consider setting this to a value below 2^14 if encountering +`QueueError`s with the `HSA.STATUS_ERROR_OUT_OF_RESOURCES` code." 
+set_max_scratch!(scratch::Integer) = + @set_preferences!("max_scratch"=>scratch) +const MAXIMUM_SCRATCH_ALLOCATION = let + if haskey(ENV, "JULIA_AMDGPU_MAX_SCRATCH") + scratch = ENV["JULIA_AMDGPU_MAX_SCRATCH"] + scratch = if uppercase(scratch) == "MAX" + typemax(UInt32) + else + parse(UInt32, scratch) + end + set_max_scratch!(scratch) + scratch + else + UInt32(@load_preference("max_scratch", 8192)) + end +end::UInt32 diff --git a/src/runtime/queue.jl b/src/runtime/queue.jl index 80277f2f9..ccf8fd0b4 100644 --- a/src/runtime/queue.jl +++ b/src/runtime/queue.jl @@ -237,17 +237,17 @@ end function monitor_queue(queue::ROCQueue) kerns = queue.active_kernels::LinkedList{ROCKernelSignal} - while queue.active || length(kerns) > 0 + while queue.active || !isempty(kerns) # Fetch oldest signal, if any sig = lock(queue.lock) do - if length(kerns) > 0 - # Notify waiters that queue is running - notify(queue.running) - return first(kerns) - else + if isempty(kerns) # Reset event reset(queue.running) return nothing + else + # Notify waiters that queue is running + notify(queue.running) + return first(kerns) end end @@ -260,7 +260,7 @@ function monitor_queue(queue::ROCQueue) end # Move to the next kernel. Base.@lock queue.lock begin - kerns = next!(kerns) + next!(kerns) end else wait(queue.running) diff --git a/src/runtime/signal.jl b/src/runtime/signal.jl index 7b37ead73..115a1f8f5 100644 --- a/src/runtime/signal.jl +++ b/src/runtime/signal.jl @@ -113,9 +113,7 @@ function Base.wait( (diff_time > timeout) && throw(SignalTimeoutException(signal)) end - if queue !== nothing - ensure_active(queue) - end + isnothing(queue) || ensure_active(queue) # Allow another scheduled task to run. # This is especially needed in the case diff --git a/src/runtime/sync.jl b/src/runtime/sync.jl index 0f3579b88..561e06974 100644 --- a/src/runtime/sync.jl +++ b/src/runtime/sync.jl @@ -1,38 +1,91 @@ import ..AMDGPU: hip_configured "Tracks HSA signals and HIP streams to sync against." 
-struct SyncState +mutable struct SyncState signals::Vector{ROCKernelSignal} - streams::Vector{Ptr{Cvoid}} + events::Vector{HIP.HIPEvent} lock::Threads.ReentrantLock + + same_queue::Bool + same_stream::Bool end -SyncState() = SyncState(ROCKernelSignal[], Ptr{Cvoid}[], Threads.ReentrantLock()) +SyncState() = SyncState( + ROCKernelSignal[], HIP.HIPEvent[], Threads.ReentrantLock(), true, true) struct WaitAdaptor end +struct HIPWaitAdaptor end +struct HSAWaitAdaptor end struct MarkAdaptor{S} s::S end -function wait!(ss::SyncState) +function wait!(ss::SyncState; hip::Bool = true, hsa::Bool = true) lock(ss.lock) do - # FIXME: Use barrier_and on dedicated queue - foreach(wait, ss.signals) - empty!(ss.signals) - @static if hip_configured - for s in ss.streams - AMDGPU.HIP.@check AMDGPU.HIP.hipStreamSynchronize(s) - end - empty!(ss.streams) + # Force HSA wait if there are streams or if there are different queues. + hsa = hsa || !isempty(ss.events) || !ss.same_queue + # Force HIP wait if there are signals or if there are different streams. + hip = hip || !isempty(ss.signals) || !ss.same_stream + + if hsa + foreach(wait, ss.signals) + empty!(ss.signals) + else + # If not waiting on HSA, keep last signal if any. + # This will force a synchronization if HSA kernel is still running, + # but HIP stream was added to syncstate. + isempty(ss.signals) || (ss.signals = [last(ss.signals)];) + end + ss.same_queue = true + + if hip + foreach(AMDGPU.HIP.synchronize, ss.events) + empty!(ss.events) + else + # If not waiting on HIP, keep last stream if any. + # This will force a synchronization if HIP kernel is still running, + # but HSA signal was added to syncstate. 
+ isempty(ss.events) || (ss.events = [last(ss.events)];) end + ss.same_stream = true end return end -mark!(ss::SyncState, signal::ROCKernelSignal) = - lock(()->push!(ss.signals, signal), ss.lock) -mark!(ss::SyncState, stream::Ptr{Cvoid}) = - lock(()->push!(ss.streams, stream), ss.lock) -mark!(ss::SyncState, stream::HIP.HIPStream) = - mark!(ss, stream.stream) + +function mark!(ss::SyncState, signal::ROCKernelSignal) + lock(ss.lock) do + if isempty(ss.signals) + push!(ss.signals, signal) + else + last_signal = last(ss.signals) + # Add new signal only if it is not the same as the last one. + if last_signal.signal != signal.signal + ss.same_queue &= last_signal.queue == signal.queue + push!(ss.signals, signal) + end + end + end +end + +function mark!(ss::SyncState, event::HIP.HIPEvent) + lock(ss.lock) do + if !isempty(ss.events) + last_event = last(ss.events) + last_event.stream != event.stream && (ss.same_stream &= false;) + end + push!(ss.events, event) + end +end wait!(x) = Adapt.adapt(WaitAdaptor(), x) mark!(x, s) = Adapt.adapt(MarkAdaptor(s), x) # TODO constrain type of `s` + +""" +Wait on HIP streams in syncstate if any. +Otherwise, rely on HSA queue serialization. +""" +hip_wait!(x) = Adapt.adapt(HIPWaitAdaptor(), x) +""" +Wait on HSA signals in syncstate if any. +Otherwise, rely on HIP stream serialization. +""" +hsa_wait!(x) = Adapt.adapt(HSAWaitAdaptor(), x) diff --git a/test/rocarray/base.jl b/test/rocarray/base.jl index 1861e712c..f79dd6599 100644 --- a/test/rocarray/base.jl +++ b/test/rocarray/base.jl @@ -182,4 +182,38 @@ end @test refcount_live(A) == (0, false) end +@testset "Skip host wait" begin + # HSA signals. + + x = ROCArray(ones(16)) + @test isempty(x.syncstate.signals) + @test isempty(x.syncstate.streams) + broadcast!(cos, x, x) + @test length(x.syncstate.signals) == 1 + @test isempty(x.syncstate.streams) + broadcast!(cos, x, x) + # wait! before broadcast, skips waiting and we push new signal. 
+ @test length(x.syncstate.signals) == 2 + @test isempty(x.syncstate.streams) + broadcast!(cos, x, x) + # wait! before broadcast, skips waiting, leaves only last signal + # and we push new signal. + @test length(x.syncstate.signals) == 2 + @test isempty(x.syncstate.streams) + + # HIP streams. + + x = ROCArray(ones(Float32, 16, 16)) + y = ROCArray(zeros(Float32, 16, 16)) + @test isempty(y.syncstate.signals) + @test isempty(y.syncstate.streams) + LinearAlgebra.mul!(y, x, x) + @test isempty(y.syncstate.signals) + @test length(y.syncstate.streams) == 1 + LinearAlgebra.mul!(y, x, x) + @test isempty(y.syncstate.signals) + # Same stream, do not add more. + @test length(y.syncstate.streams) == 1 +end + end diff --git a/test/rocarray/nmf.jl b/test/rocarray/nmf.jl index a9d37a9b9..19377a6c8 100644 --- a/test/rocarray/nmf.jl +++ b/test/rocarray/nmf.jl @@ -17,15 +17,20 @@ for scale in (1:5:50) ncol = 2001 nrow = 1002*scale nfeatures = 12 + X = rand(Float32, nrow, ncol) W = rand(Float32, nrow, nfeatures) H = rand(Float32, nfeatures, ncol) cpu_res = step(X, W, H) + RX = ROCArray(X) RW = ROCArray(W) RH = ROCArray(H) + gpu_res = step(RX, RW, RH) @test Array(gpu_res) ≈ cpu_res + + AMDGPU.unsafe_free!.((RX, RW, RH, gpu_res)) end end diff --git a/test/runtests.jl b/test/runtests.jl index 802f966a3..fb11dc235 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -46,58 +46,58 @@ AMDGPU.versioninfo() @info "Running tests with $(length(ws)) workers" -push!(tests, "HSA" => ()->begin - include("hsa/error.jl") - include("hsa/utils.jl") - include("hsa/getinfo.jl") - include("hsa/device.jl") - include("hsa/queue.jl") - include("hsa/memory.jl") - include("hsa/hashing.jl") -end) -push!(tests, "Codegen" => ()->begin - include("codegen/synchronization.jl") - include("codegen/trap.jl") -end) -if AMDGPU.Runtime.LOGGING_STATIC_ENABLED - push!(tests, "Logging" => ()->include("logging.jl")) -else - @warn """ - Logging is statically disabled, skipping logging tests. 
- This can be fixed by calling `AMDGPU.Runtime.enable_logging!()` and re-running tests. - """ - @test_skip "Logging" -end -push!(tests, "Device Functions" => ()->begin - include("device/launch.jl") - include("device/array.jl") - include("device/vadd.jl") - include("device/memory.jl") - include("device/indexing.jl") - include("device/hostcall.jl") - include("device/output.jl") - include("device/globals.jl") - include("device/math.jl") - include("device/wavefront.jl") - include("device/execution_control.jl") - include("device/exceptions.jl") - # FIXME segfaults in a weird way (on check_ir) - # include("device/deps.jl") - include("device/queries.jl") -end) -push!(tests, "Multitasking" => ()->include("tls.jl")) -push!(tests, "ROCArray - Base" => ()->include("rocarray/base.jl")) +# push!(tests, "HSA" => ()->begin +# include("hsa/error.jl") +# include("hsa/utils.jl") +# include("hsa/getinfo.jl") +# include("hsa/device.jl") +# include("hsa/queue.jl") +# include("hsa/memory.jl") +# include("hsa/hashing.jl") +# end) +# push!(tests, "Codegen" => ()->begin +# include("codegen/synchronization.jl") +# include("codegen/trap.jl") +# end) +# if AMDGPU.Runtime.LOGGING_STATIC_ENABLED +# push!(tests, "Logging" => ()->include("logging.jl")) +# else +# @warn """ +# Logging is statically disabled, skipping logging tests. +# This can be fixed by calling `AMDGPU.Runtime.enable_logging!()` and re-running tests. 
+# """ +# @test_skip "Logging" +# end +# push!(tests, "Device Functions" => ()->begin +# include("device/launch.jl") +# include("device/array.jl") +# include("device/vadd.jl") +# include("device/memory.jl") +# include("device/indexing.jl") +# include("device/hostcall.jl") +# include("device/output.jl") +# include("device/globals.jl") +# include("device/math.jl") +# include("device/wavefront.jl") +# include("device/execution_control.jl") +# include("device/exceptions.jl") +# # FIXME segfaults in a weird way (on check_ir) +# # include("device/deps.jl") +# include("device/queries.jl") +# end) +# push!(tests, "Multitasking" => ()->include("tls.jl")) +# push!(tests, "ROCArray - Base" => ()->include("rocarray/base.jl")) push!(tests, "ROCArray - Broadcast" => ()->include("rocarray/broadcast.jl")) -if CI - push!(tests, "ROCm libraries are functional" => ()->begin - @test AMDGPU.functional(:rocblas) - @test AMDGPU.functional(:rocrand) - if !AMDGPU.use_artifacts - # We don't have artifacts for these - @test AMDGPU.functional(:rocfft) - end - end) -end +# if CI +# push!(tests, "ROCm libraries are functional" => ()->begin +# @test AMDGPU.functional(:rocblas) +# @test AMDGPU.functional(:rocrand) +# if !AMDGPU.use_artifacts +# # We don't have artifacts for these +# @test AMDGPU.functional(:rocfft) +# end +# end) +# end push!(tests, "rocBLAS" => ()->begin if AMDGPU.functional(:rocblas) include("rocarray/blas.jl") @@ -112,13 +112,14 @@ push!(tests, "rocRAND" => ()->begin @test_skip "rocRAND" end end) -push!(tests, "rocFFT" => ()->begin - if AMDGPU.functional(:rocfft) - include("rocarray/fft.jl") - else - @test_skip "rocFFT" - end -end) +# FIXME outdated library +# push!(tests, "rocFFT" => ()->begin +# if AMDGPU.functional(:rocfft) +# include("rocarray/fft.jl") +# else +# @test_skip "rocFFT" +# end +# end) push!(tests, "NMF" => ()->begin if AMDGPU.functional(:rocblas) include("rocarray/nmf.jl") @@ -126,23 +127,23 @@ push!(tests, "NMF" => ()->begin @test_skip "NMF" end end) 
-push!(tests, "MIOpen" => ()->begin - if AMDGPU.functional(:MIOpen) - include("dnn/miopen.jl") - else - @test_skip "MIOpen" - end -end) -push!(tests, "External Packages" => ()->include("external/forwarddiff.jl")) -for (i, name) in enumerate(keys(TestSuite.tests)) - push!(tests, "GPUArrays TestSuite - $name" => - ()->TestSuite.tests[name](ROCArray)) -end -push!(tests, "KernelAbstractions" => ()->begin - Testsuite.testsuite( - ROCBackend, "ROCM", AMDGPU, ROCArray, AMDGPU.ROCDeviceArray; - skip_tests=Set(["sparse"])) -end) +# push!(tests, "MIOpen" => ()->begin +# if AMDGPU.functional(:MIOpen) +# include("dnn/miopen.jl") +# else +# @test_skip "MIOpen" +# end +# end) +# push!(tests, "External Packages" => ()->include("external/forwarddiff.jl")) +# for (i, name) in enumerate(keys(TestSuite.tests)) +# push!(tests, "GPUArrays TestSuite - $name" => +# ()->TestSuite.tests[name](ROCArray)) +# end +# push!(tests, "KernelAbstractions" => ()->begin +# Testsuite.testsuite( +# ROCBackend, "ROCM", AMDGPU, ROCArray, AMDGPU.ROCDeviceArray; +# skip_tests=Set(["sparse"])) +# end) function run_worker(w) while !isempty(tests) diff --git a/test/tls.jl b/test/tls.jl index c6d0f31d1..e1a670d7f 100644 --- a/test/tls.jl +++ b/test/tls.jl @@ -162,3 +162,32 @@ if AMDGPU.functional(:rocfft) end if AMDGPU.functional(:MIOpen) end + +@testset "LinkedList" begin + list = AMDGPU.Runtime.LinkedList{Int}() + @test isempty(list) + + push!(list, 1) + @test length(list) == 1 + push!(list, 2) + @test length(list) == 2 + + @test first(list) == 1 + @test last(list) == 2 + + vec_list = Array(list) + @test length(vec_list) == 2 + + copy_list = copy(list) + @test length(copy_list) == 2 + + AMDGPU.Runtime.next!(list) + @test length(list) == 1 + @test first(list) == 2 + + AMDGPU.Runtime.next!(list) + @test isempty(list) + + @test first(copy_list) == 1 + @test last(copy_list) == 2 +end