Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ version = "0.4.13"
[deps]
AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
BinaryProvider = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
Expand All @@ -26,6 +27,7 @@ Preferences = "21216c6a-2e73-6563-6e65-726566657250"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
ROCmDeviceLibs_jll = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Expand Down
45 changes: 30 additions & 15 deletions src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,19 @@ struct ROCKernelContext <: AbstractKernelContext end

# Launch `f` through `@roc` for the ROC backend and block until `wait` returns.
# The group size is `threads` and the grid size is `blocks * threads`.
# The `name` keyword is accepted but is not used inside this body.
function GPUArrays.gpu_call(::ROCArrayBackend, f, args, threads::Int, blocks::Int; name::Union{String,Nothing})
    groupsize = threads
    gridsize = blocks * threads
    launched = @roc groupsize=groupsize gridsize=gridsize f(ROCKernelContext(), args...)
    return wait(launched)
end
# Element-count form of `gpu_call`: the launch size comes from the `elements`
# keyword rather than from explicit thread and block counts.
function GPUArrays.gpu_call(::ROCArrayBackend, f, args; elements::Int, name::Union{String,Nothing}=nothing)
# Launch with a group size capped at 64 and a grid size of `elements`, then wait on the result.
wait(@roc groupsize=min(elements, 64) gridsize=elements f(ROCKernelContext(), args...))
# NOTE(review): `groupsize` and `gridsize` are not assigned anywhere in this method,
# so unless they resolve to globals this second launch will throw. It looks like a
# line that belongs to the threads/blocks method -- confirm and remove or relocate.
@roc groupsize=groupsize gridsize=gridsize name=name f(ROCKernelContext(), args...)
end

## on-device

# indexing

# Generate the GPUArrays indexing functions for `ROCKernelContext`.
# Each pair is (GPUArrays function name, AMDGPU function name); the generated
# method returns the `.x` field of the corresponding AMDGPU call.
# The pair list appears exactly once: a repeated copy of the rows followed by
# an extra closing parenthesis made the loop unparseable.
for (f, froc) in (
        (:blockidx, :blockIdx),
        (:blockdim, :blockDim),
        (:threadidx, :threadIdx),
        (:griddim, :gridGroupDim),
    )
    @eval @inline GPUArrays.$f(::ROCKernelContext) = AMDGPU.$froc().x
end

Expand All @@ -50,7 +47,6 @@ end
return
end


#
# Host abstractions
#
Expand Down Expand Up @@ -81,16 +77,34 @@ end

# Hand the buffer behind `xs` to `Mem.free_if_live` and return its result.
function unsafe_free!(xs::ROCArray)
    return Mem.free_if_live(xs.buf)
end

# Synchronization helpers for `ROCArray`s. Each scalar method forwards to the
# array's `syncstate`; the `Vector` and `NTuple` methods apply the scalar method
# to every element and return `nothing` (the result of `foreach`).
#
# Every signature is defined exactly once. Keyword arguments do not take part
# in dispatch, so a separate zero-keyword `wait!(x::ROCArray)` would simply be
# overwritten by the keyword version below (and method overwriting is rejected
# during module precompilation). Calling `wait!(x)` still works through the
# keyword defaults.

# Forward `mark!` with payload `s` to the sync state of one or many arrays.
mark!(x::ROCArray, s) = mark!(x.syncstate, s)
mark!(xs::Vector{<:ROCArray}, s) = foreach(x -> mark!(x, s), xs)
mark!(xs::NTuple{N,<:ROCArray} where N, s) = foreach(x -> mark!(x, s), xs)

# Forward `wait!` to the sync state. The `hip` and `hsa` flags (both default to
# `true`) are passed through unchanged.
wait!(x::ROCArray; hip::Bool = true, hsa::Bool = true) = wait!(x.syncstate; hip, hsa)
wait!(xs::Vector{<:ROCArray}; hip::Bool = true, hsa::Bool = true) = foreach(x -> wait!(x; hip, hsa), xs)
wait!(xs::NTuple{N,<:ROCArray} where N; hip::Bool = true, hsa::Bool = true) = foreach(x -> wait!(x; hip, hsa), xs)

# `hsa_wait!` is `wait!` with the flags fixed to `hip=false, hsa=true`.
# The single-array method returns whatever `wait!` on the sync state returns;
# the collection methods visit every element and return `nothing`.
function hsa_wait!(x::ROCArray)
    return wait!(x.syncstate; hip=false, hsa=true)
end
function hsa_wait!(xs::Vector{<:ROCArray})
    for x in xs
        wait!(x; hip=false, hsa=true)
    end
    return nothing
end
function hsa_wait!(xs::NTuple{N,<:ROCArray} where N)
    for x in xs
        wait!(x; hip=false, hsa=true)
    end
    return nothing
end

# `hip_wait!` is `wait!` with the flags fixed to `hip=true, hsa=false`.
# The single-array method returns whatever `wait!` on the sync state returns;
# the collection methods visit every element and return `nothing`.
function hip_wait!(x::ROCArray)
    return wait!(x.syncstate; hip=true, hsa=false)
end
function hip_wait!(xs::Vector{<:ROCArray})
    for x in xs
        wait!(x; hip=true, hsa=false)
    end
    return nothing
end
function hip_wait!(xs::NTuple{N,<:ROCArray} where N)
    for x in xs
        wait!(x; hip=true, hsa=false)
    end
    return nothing
end

# Adapt hook for `Runtime.WaitAdaptor`: call `Runtime.wait!` on the array's
# sync state with default flags, then return the same array.
function Adapt.adapt_storage(::Runtime.WaitAdaptor, x::ROCArray)
    state = x.syncstate
    Runtime.wait!(state)
    return x
end
# Adapt hook for `Runtime.HIPWaitAdaptor`: call `Runtime.wait!` on the array's
# sync state with `hip=true, hsa=false`, then return the same array.
function Adapt.adapt_storage(::Runtime.HIPWaitAdaptor, x::ROCArray)
    state = x.syncstate
    Runtime.wait!(state; hip=true, hsa=false)
    return x
end
# Adapt hook for `Runtime.HSAWaitAdaptor`: call `Runtime.wait!` on the array's
# sync state with `hip=false, hsa=true`, then return the same array.
function Adapt.adapt_storage(::Runtime.HSAWaitAdaptor, x::ROCArray)
    state = x.syncstate
    Runtime.wait!(state; hip=false, hsa=true)
    return x
end
function Adapt.adapt_storage(ma::Runtime.MarkAdaptor, x::ROCArray)
Runtime.mark!(x.syncstate, ma.s)
x
Expand Down Expand Up @@ -183,6 +197,7 @@ function Base.copyto!(dest::Array{T}, d_offset::Integer,
@boundscheck checkbounds(dest, d_offset+amount-1)
@boundscheck checkbounds(source, s_offset+amount-1)
wait!(source)
synchronize()
Mem.download!(pointer(dest, d_offset),
Mem.view(source.buf, source.offset + (s_offset-1)*sizeof(T)),
amount*sizeof(T))
Expand Down
4 changes: 2 additions & 2 deletions src/blas/rocBLAS.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
module rocBLAS

using ..AMDGPU
import AMDGPU: wait!, mark!, librocblas, AnyROCArray
import AMDGPU: hsa_wait!, mark!, librocblas, AnyROCArray
import AMDGPU: HandleCache, HIP, library_state
import .HIP: HIPContext, HIPStream, hipContext_t, hipStream_t, hipEvent_t
import .HIP: HIPContext, HIPStream, HIPEvent, hipContext_t, hipStream_t, hipEvent_t

using LinearAlgebra
using CEnum
Expand Down
Loading