diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index a04fd4982..4353add8a 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -18,65 +18,28 @@ steps: agents: queue: "juliagpu" rocm: "*" + rocmgpu: "gfx1031" if: build.message !~ /\[skip docs\]/ timeout_in_minutes: 10 - # - label: "Julia 1.8 - GPUArrays 8" - # plugins: - # - JuliaCI/julia#v1: - # version: 1.8 - # - JuliaCI/julia-test#v1: - # - JuliaCI/julia-coverage#v1: - # codecov: true - # agents: - # queue: "juliagpu" - # rocm: "*" - # rocmgpu: "*" - # if: build.message !~ /\[skip tests\]/ - # command: "julia --project -e 'using Pkg; Pkg.update()'" - # soft_fail: true - # timeout_in_minutes: 180 - # env: - # JULIA_AMDGPU_CORE_MUST_LOAD: "1" - # JULIA_AMDGPU_HIP_MUST_LOAD: "1" - # - # - label: "Julia 1.8 - GPUArrays 8 - No artifacts" - # plugins: - # - JuliaCI/julia#v1: - # version: 1.8 - # - JuliaCI/julia-test#v1: - # - JuliaCI/julia-coverage#v1: - # codecov: true - # agents: - # queue: "juliagpu" - # rocm: "*" - # rocmgpu: "*" - # if: build.message !~ /\[skip tests\]/ - # command: "julia --project -e 'using Pkg; Pkg.update()'" - # soft_fail: true - # timeout_in_minutes: 180 - # env: - # JULIA_AMDGPU_CORE_MUST_LOAD: "1" - # JULIA_AMDGPU_HIP_MUST_LOAD: "1" - # JULIA_AMDGPU_DISABLE_ARTIFACTS: "1" - # - - label: "Julia 1.9 - GPUArrays 8" - plugins: - - JuliaCI/julia#v1: - version: 1.9-nightly - - JuliaCI/julia-test#v1: - - JuliaCI/julia-coverage#v1: - codecov: true - agents: - queue: "juliagpu" - julia_1.9: "*" - rocm: "*" - rocmgpu: "*" - if: build.message !~ /\[skip tests\]/ - command: "julia --project -e 'using Pkg; Pkg.update()'" - timeout_in_minutes: 180 - env: - JULIA_AMDGPU_CORE_MUST_LOAD: "1" - JULIA_AMDGPU_HIP_MUST_LOAD: "1" + + # - label: "Julia 1.9 - GPUArrays 8" + # plugins: + # - JuliaCI/julia#v1: + # version: 1.9-nightly + # - JuliaCI/julia-test#v1: + # - JuliaCI/julia-coverage#v1: + # codecov: true + # agents: + # queue: "juliagpu" + # julia_1.9: "*" + # rocm: "*" + # rocmgpu: "*" + # if: build.message !~ /\[skip tests\]/ + # command: "julia --project -e 'using Pkg; Pkg.update()'" + # timeout_in_minutes: 180 + # env: + # JULIA_AMDGPU_CORE_MUST_LOAD: "1" + # JULIA_AMDGPU_HIP_MUST_LOAD: "1" - label: "Julia 1.9 - GPUArrays 8 - No Artifacts" plugins: @@ -89,74 +52,16 @@ steps: queue: "juliagpu" julia_1.9: "*" rocm: "*" - rocmgpu: "*" + rocmgpu: "gfx1031" if: build.message !~ /\[skip tests\]/ command: "julia --project -e 'using Pkg; Pkg.update()'" timeout_in_minutes: 180 env: + JULIA_NUM_THREADS: 8 JULIA_AMDGPU_CORE_MUST_LOAD: "1" JULIA_AMDGPU_HIP_MUST_LOAD: "1" JULIA_AMDGPU_DISABLE_ARTIFACTS: "1" - - label: "Julia 1.9 - GPUArrays 8 - HIP Malloc" - plugins: - - JuliaCI/julia#v1: - version: 1.9-nightly - - JuliaCI/julia-test#v1: - - JuliaCI/julia-coverage#v1: - codecov: true - agents: - queue: "juliagpu" - julia_1.9: "*" - rocm: "*" - rocmgpu: "*" - if: build.message !~ /\[skip tests\]/ - command: "julia --project -e 'using Pkg; Pkg.update()'" - timeout_in_minutes: 180 - env: - JULIA_AMDGPU_CORE_MUST_LOAD: "1" - JULIA_AMDGPU_HIP_MUST_LOAD: "1" - JULIA_AMDGPU_USE_HIP_MALLOC_OVERRIDE: "1" - - # - label: "Julia nightly - GPUArrays 8" - # plugins: - # - JuliaCI/julia#v1: - # version: nightly - # - JuliaCI/julia-test#v1: - # - JuliaCI/julia-coverage#v1: - # codecov: true - # agents: - # queue: "juliagpu" - # rocm: "*" - # rocmgpu: "*" - # if: build.message !~ /\[skip tests\]/ - # command: "julia --project -e 'using Pkg; Pkg.update()'" - # soft_fail: true - # timeout_in_minutes: 180 - # env: - # JULIA_AMDGPU_CORE_MUST_LOAD: "1" - # JULIA_AMDGPU_HIP_MUST_LOAD: "1" - - # - label: "Julia nightly - GPUArrays 8 - No Artifacts" - # plugins: - # - JuliaCI/julia#v1: - # version: nightly - # - JuliaCI/julia-test#v1: - # - JuliaCI/julia-coverage#v1: - # codecov: true - # agents: - # queue: "juliagpu" - # rocm: "*" - # rocmgpu: "*" - # if: build.message !~ /\[skip tests\]/ - # command: "julia --project -e 'using Pkg; Pkg.update()'" - # soft_fail: true - # timeout_in_minutes: 180 - # env: - # JULIA_AMDGPU_CORE_MUST_LOAD: "1" - # JULIA_AMDGPU_HIP_MUST_LOAD: "1" - # JULIA_AMDGPU_DISABLE_ARTIFACTS: "1" - env: JULIA_AMDGPU_LOGGING_ENABLED: true SECRET_CODECOV_TOKEN: "lVqFGgrywYmQrILXBcP8i6TosP+q/W2oTDVLIdkWFWscd/a61oSVb8Tycq3qvngsrdmKU9EevdQo+1x+w7cu4IuTq63ahQc0RFgi4Q29hC52OgN2wFql984Cqq3T96P3jyV0ZljaRT+a+9AY0oWmmCph55amvvQ4DOMq3tfGDbp7gdueQvJmSYQGVT3/9Sjn4/esYppcKBGltQqQX2E7WrHLpnqRmsmjcSeZ/S/+PgPRb4ZnpBecAUP2d/MlPgKfP0ZUGbDlcbGu+ZDZNksxKIYuAlNrWPhpNAro7hACfEk4T5RRpNiwmJyXJZ8LUD8zNYIUKSmHjUtmqhNXgujWXA==;U2FsdGVkX1/v/P2Y7KZsvC55Au6eET37uDE6M5I6J275maix+SMD0EoJQ19cFp/lae+G8V7dvpPGfrh4hj2nOg==" diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml index e7487db37..2aa3ba45b 100644 --- a/.github/workflows/CompatHelper.yml +++ b/.github/workflows/CompatHelper.yml @@ -19,7 +19,7 @@ jobs: - uses: julia-actions/setup-julia@v1 with: # version: ${{ steps.julia_compat.outputs.version }} - version: "1.7-nightly" + version: "1.9" - name: Pkg.add("CompatHelper") run: julia -e 'using Pkg; Pkg.add("CompatHelper")' - name: CompatHelper.main() diff --git a/.github/workflows/ci-julia-1.7.yml b/.github/workflows/ci-julia-1.7.yml index ebc92e08f..a51440d92 100644 --- a/.github/workflows/ci-julia-1.7.yml +++ b/.github/workflows/ci-julia-1.7.yml @@ -1,4 +1,4 @@ -name: CI (Julia 1.7) +name: CI (Julia 1.9) on: push: branches: @@ -9,14 +9,14 @@ defaults: run: shell: bash jobs: - CI-julia-1-7: - name: CI-julia-1-7 + CI-julia-1-0: + name: CI-julia-1-9 runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: version: - - '1.7' + - '1.9' os: - ubuntu-latest - macOS-latest diff --git a/Project.toml b/Project.toml index 33272ff4c..e3d8998d4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "AMDGPU" uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e" authors = ["Julian P Samaroo ", "Valentin Churavy ", "Anton Smirnov "] -version = "0.4.15" +version = "0.5.0" [deps] AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c" @@ -19,14 +19,11 @@ LLVM_jll = "86de99a1-58d6-5da7-8064-bd56ce2e322c" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" -MsgPack = "99f44e22-a591-53d1-9472-aa23ef4bd671" -ObjectFile = "d8793406-e978-5875-9003-1fc021f44a92" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" ROCmDeviceLibs_jll = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" -Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" TimespanLogging = "a526e669-04d3-4846-9525-c66122c55f63" @@ -37,29 +34,26 @@ rocRAND_jll = "a6151927-a32b-54c0-bc8c-bbd7b3f1a996" rocSPARSE_jll = "8c6ce2ba-659c-5ec7-ba4c-37596cf1f22a" [compat] -AbstractFFTs = "0.5, 1.0" +AbstractFFTs = "1.0" Adapt = "3.0" BinaryProvider = "0.5" -CEnum = "0.2, 0.3, 0.4" +CEnum = "0.4" ExprTools = "0.1" -GPUArrays = "6, 7, 8" -GPUCompiler = "0.19" -HIP_jll = "4, 5" +GPUArrays = "8" +GPUCompiler = "0.21" +HIP_jll = "5" KernelAbstractions = "0.9.2" -LLD_jll = "12, 13, 14, 15" -LLVM = "5" -LLVM_jll = "12, 13, 14, 15" +LLD_jll = "14, 15" +LLVM = "6" +LLVM_jll = "14, 15" MacroTools = "0.5" -MsgPack = "1" -ObjectFile = "0.3" Preferences = "1" -ROCmDeviceLibs_jll = "4, 5" -Setfield = "0.5, 0.6, 0.7, 1.0" -SpecialFunctions = "1, 2" +ROCmDeviceLibs_jll = "5" +SpecialFunctions = "2" TimespanLogging = "0.1" UnsafeAtomicsLLVM = "0.1" -hsa_rocr_jll = "4, 5" -julia = "1.7" -rocBLAS_jll = "4, 5" -rocRAND_jll = "4, 5" -rocSPARSE_jll = "4, 5" +hsa_rocr_jll = "5" +julia = "1.9" +rocBLAS_jll = "5" +rocRAND_jll = "5" +rocSPARSE_jll = "5" diff --git a/README.md b/README.md index 9a610bf0d..b01ef015a 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,6 @@ [docs-dev-url]: https://amdgpu.juliagpu.org/dev - ## Quick start AMDGPU.jl can be installed with the Julia package manager. @@ -38,7 +37,7 @@ julia> import Pkg; Pkg.add("AMDGPU") ## Project Status -The package is tested against, and being developed for, Julia 1.7, 1.9, and above. +The package is tested against and being developed for Julia 1.9, and above. *Julia 1.8 is not supported and should not be used with AMDGPU.jl, as Julia 1.8 cannot properly handle the code generated by AMDGPU.jl.* diff --git a/deps/.gitignore b/deps/.gitignore deleted file mode 100644 index bb75281e1..000000000 --- a/deps/.gitignore +++ /dev/null @@ -1,6 +0,0 @@ -build.log -runtime/ -usr/ -ext.jl -ext.jl.bak -deps.jl diff --git a/deps/bindeps.jl b/deps/bindeps.jl deleted file mode 100644 index d1025101f..000000000 --- a/deps/bindeps.jl +++ /dev/null @@ -1,399 +0,0 @@ -# copied from CUDAdrv/deps/build.jl - -using Libdl -import Base: @invokelatest -using Preferences - -enable_artifacts!(flag::Bool=true) = - @set_preferences!("use_artifacts" => flag) -if haskey(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS") - enable_artifacts!(!parse(Bool, get(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS", "false"))) -end -const use_artifacts = @load_preference("use_artifacts", true) - -## library finding - -function find_roc_paths() - paths = split(get(ENV, "LD_LIBRARY_PATH", ""), ":") - paths = filter(path->path != "", paths) - paths = map(Base.Filesystem.abspath, paths) - push!(paths, "/opt/rocm/lib") # shim for Ubuntu rocm packages... - if haskey(ENV, "ROCM_PATH") - push!(paths, joinpath(ENV["ROCM_PATH"], "lib")) - end - return filter(isdir, paths) -end - -function find_rocm_library(lib, dirs, ext=dlext) - path = Libdl.find_library(lib) - if path != "" - return Libdl.dlpath(path) - end - for dir in dirs - files = readdir(dir) - for file in files - matched = startswith(basename(file), lib*".$ext") - if matched - return joinpath(dir, file) - end - end - end - return "" -end -function find_rocm_library(libs::Vector, dirs) - for lib in libs - path = find_rocm_library(lib, dirs) - if path != "" - return path - end - end - return "" -end - -function find_ld_lld() - paths = split(get(ENV, "PATH", ""), ":") - paths = filter(path->path != "", paths) - paths = map(Base.Filesystem.abspath, paths) - basedir = get(ENV, "ROCM_PATH", "/opt/rocm") - ispath(joinpath(basedir, "llvm/bin/ld.lld")) && - push!(paths, joinpath(basedir, "llvm/bin/")) - ispath(joinpath(basedir, "hcc/bin/ld.lld")) && - push!(paths, joinpath(basedir, "/hcc/bin/")) - ispath(joinpath(basedir, "opencl/bin/x86_64/ld.lld")) && - push!(paths, joinpath(basedir, "opencl/bin/x86_64/")) - for path in paths - exp_ld_path = joinpath(path, "ld.lld") - if ispath(exp_ld_path) - try - tmpfile = mktemp() - run(pipeline(`$exp_ld_path -v`; stdout=tmpfile[1])) - vstr = read(tmpfile[1], String) - rm(tmpfile[1]) - vstr = replace(vstr, "AMD " => "") - vstr_splits = split(vstr, ' ') - if VersionNumber(vstr_splits[2]) >= v"6.0.0" - return exp_ld_path - end - catch - @debug "bindeps: Failed running ld.lld in $exp_ld_path" - end - end - end - return "" -end - -function find_device_libs() - # Might be set by tools like Spack or the user - hip_devlibs_path = get(ENV, "HIP_DEVICE_LIB_PATH", "") - hip_devlibs_path !== "" && return hip_devlibs_path - devlibs_path = get(ENV, "DEVICE_LIB_PATH", "") - devlibs_path !== "" && return devlibs_path - - # The canonical location - if isdir("/opt/rocm/amdgcn/bitcode") - return "/opt/rocm/amdgcn/bitcode" - end - - # Search relative to LD_LIBRARY_PATH entries - paths = split(get(ENV, "LD_LIBRARY_PATH", ""), ":") - paths = filter(path->path != "", paths) - paths = map(Base.Filesystem.abspath, paths) - for path in paths - bitcode_path = joinpath(path, "../amdgcn/bitcode/") - if ispath(bitcode_path) - if isfile(joinpath(bitcode_path, "ocml.bc")) || - isfile(joinpath(bitcode_path, "ocml.amdgcn.bc")) - return bitcode_path - end - end - end - return nothing -end - -function detect_projects() - amdgpu_project = normpath(joinpath(@__DIR__, "..")) - current_project = Base.ACTIVE_PROJECT[] - julia_project = if Base.JLOptions().project != C_NULL - unsafe_string(Base.JLOptions().project) - elseif current_project !== nothing - current_project - else - amdgpu_project - end - return (;amdgpu_project, current_project, julia_project) -end -julia_exeflags(projects=detect_projects()) = - String["--startup-file=no", - "--project=$(projects.julia_project)"] -function julia_cmd_projects(jl_str) - projects = detect_projects() - - cmd = Base.julia_cmd() - append!(cmd.exec, julia_exeflags(projects)) - - (;amdgpu_project, current_project, julia_project) = projects - if current_project !== nothing - jl_str = "push!(LOAD_PATH, \"$current_project\");" * jl_str - end - jl_str = "push!(LOAD_PATH, \"$amdgpu_project\");" * jl_str - append!(cmd.exec, ("-e", jl_str)) - return cmd -end - -function populate_globals!(config) - for (key,val) in config - @eval const $key = $val - end -end - -const rocm_ext_libs = [ - (:rocblas, :rocBLAS_jll), - (:rocsparse, :rocSPARSE_jll), - (:rocsolver, nothing), - (:rocalution, nothing), - (:rocrand, :rocRAND_jll), - (:rocfft, nothing), - (:MIOpen, :MIOpen_jll), -] - -function bindeps_setup() - config = Dict{Symbol,Any}( - :use_artifacts => use_artifacts, - :configured => false, - :build_reason => "unknown", - :lld_configured => false, - :lld_build_reason => "unknown", - :lld_artifact => false, - :hsa_configured => false, - :hsa_build_reason => "unknown", - :hip_configured => false, - :hip_build_reason => "unknown", - :device_libs_configured => false, - :device_libs_build_reason => "unknown", - ) - for (name, _) in rocm_ext_libs - lib = Symbol(:lib, string(name)) - config[lib] = nothing - config[Symbol(name, :_configured)] = false - config[Symbol(name, :_build_reason)] = "unknown" - end - - ## discover stuff - - # check that we're running Linux - if !Sys.islinux() - @debug "Not running Linux, which is the only platform currently supported by the ROCm Runtime." - config[:build_reason] = "Unsupported OS: $(repr(Sys.KERNEL))" - @goto populate - end - - # Skip build if KFD is not available - if !ispath("/dev/kfd") - @debug "/dev/kfd not available, cannot use ROCm Runtime." - @goto populate - end - - # Find some paths for library search - roc_dirs = find_roc_paths() - - function safe_exec(str) - cmd = julia_cmd_projects(str) - success = false - error_str = mktemp() do path, _ - p = run(pipeline(cmd; stdout=path, stderr=path); wait=false) - wait(p) - success = p.exitcode == 0 - String(read(path)) - end - return success, error_str - end - function safe_import(pkg) - loaded, error_str = safe_exec("import $pkg") - if !loaded - return loaded, false, error_str - end - @eval import $pkg - available = @eval(isdefined($pkg, :is_available)) && @eval($pkg.is_available()) - return loaded, available, error_str - end - - # Find HSA runtime v1 - if use_artifacts - loaded, available, error_str = safe_import(:hsa_rocr_jll) - if loaded - if available - config[:libhsaruntime_path] = hsa_rocr_jll.libhsa_runtime64 - config[:hsa_configured] = true - else - config[:hsa_build_reason] = "hsa_rocr_jll is not available on this platform" - end - else - iob = IOBuffer() - println(iob, "`import hsa_rocr_jll` failed:") - print(iob, error_str) - config[:hsa_build_reason] = String(take!(iob)) - end - else - libhsaruntime_path = find_rocm_library("libhsa-runtime64", roc_dirs, "so.1") - if !isempty(something(libhsaruntime_path, "")) - loaded, error_str = safe_exec("using Libdl; dlopen(\"$libhsaruntime_path\")") - if loaded - config[:libhsaruntime_path] = libhsaruntime_path - config[:hsa_configured] = true - else - iob = IOBuffer() - println(iob, "Loading `libhsa-runtime64` v1 failed:") - print(iob, error_str) - config[:hsa_build_reason] = String(take!(iob)) - end - else - config[:hsa_build_reason] = "Could not find `libhsa-runtime64` v1 library" - end - end - if !config[:hsa_configured] - @goto populate - end - - ### Find ld.lld - if use_artifacts - loaded, available, error_str = safe_import(:LLD_jll) - if loaded - if available || (Base.libllvm_version < v"14" && @invokelatest(LLD_jll.LLVM_jll.is_available())) - if isdefined(LLD_jll, :lld_path) - config[:lld_path] = LLD_jll.lld_path - config[:lld_artifact] = true - config[:lld_configured] = true - else - config[:lld_build_reason] = "LLD_jll does not export `lld_path`" - end - else - config[:lld_build_reason] = "LLD_jll is not available on this platform" - end - else - iob = IOBuffer() - println(iob, "`import LLD_jll` failed:") - print(iob, error_str) - config[:lld_build_reason] = String(take!(iob)) - end - else - lld_path = find_ld_lld() - if !isempty(something(lld_path, "")) - # TODO: Validate ld.lld can compile programs - config[:lld_path] = lld_path - config[:lld_configured] = true - else - config[:lld_build_reason] = "Could not find `ld.lld` executable" - end - end - - ### Find device libraries - if use_artifacts - loaded, available, error_str = safe_import(:ROCmDeviceLibs_jll) - if loaded - if available - config[:device_libs_path] = ROCmDeviceLibs_jll.bitcode_path - config[:device_libs_configured] = true - else - config[:device_libs_build_reason] = "ROCmDeviceLibs_jll is not available on this platform" - end - else - iob = IOBuffer() - println(iob, "`import ROCmDeviceLibs_jll` failed:") - print(iob, error_str) - config[:device_libs_build_reason] = String(take!(iob)) - end - else - device_libs_path = find_device_libs() - if !isempty(something(device_libs_path, "")) - # TODO: Validate bitcode files - config[:device_libs_path] = device_libs_path - config[:device_libs_configured] = true - else - config[:device_libs_build_reason] = "Couldn't find bitcode files" - end - end - - ### Find HIP - if use_artifacts - loaded, available, error_str = safe_import(:HIP_jll) - if loaded - if available - config[:libhip_path] = HIP_jll.libamdhip64 - config[:hip_configured] = true - else - config[:hip_build_reason] = "HIP_jll is not available on this platform" - end - else - iob = IOBuffer() - println(iob, "`import HIP_jll` failed:") - print(iob, error_str) - config[:hip_build_reason] = String(take!(iob)) - end - else - libhip_path = find_rocm_library(["libamdhip64", "libhip_hcc"], roc_dirs) - if !isempty(something(libhip_path, "")) - loaded, error_str = safe_exec("using Libdl; dlopen(\"$libhip_path\")") - if loaded - config[:libhip_path] = libhip_path - config[:hip_configured] = true - else - iob = IOBuffer() - println(iob, "Loading HIP failed:") - print(iob, error_str) - config[:hip_build_reason] = String(take!(iob)) - end - else - config[:hip_build_reason] = "Could not find `libamdhip64` or `libhip_hcc` libraries" - end - end - if config[:hip_configured] - ### Find HIP-based libraries - @sync for (name, pkg) in rocm_ext_libs - @async begin - lib = Symbol(:lib, string(name)) - if use_artifacts - if pkg !== nothing - loaded, available, error_str = safe_import(pkg) - if loaded - if available - config[lib] = getfield(@eval($pkg), lib) - config[Symbol(name, :_configured)] = true - else - config[Symbol(name, :_build_reason)] = "$pkg is not available on this platform" - end - else - iob = IOBuffer() - println(iob, "`import $pkg` failed:") - print(iob, error_str) - config[Symbol(name, :_build_reason)] = String(take!(iob)) - end - end - else - libpath = find_rocm_library(string(lib), roc_dirs) - if !isempty(something(libpath, "")) - loaded, error_str = safe_exec("using Libdl; dlopen(\"$libpath\")") - if loaded - config[lib] = libpath - config[Symbol(name, :_configured)] = true - else - iob = IOBuffer() - println(iob, "Loading `$lib` failed:") - print(iob, error_str) - config[Symbol(name, :_build_reason)] = String(take!(iob)) - end - else - config[Symbol(name, :_build_reason)] = "Could not find `$lib` library" - end - end - end # @async - end - end - - config[:configured] = true - - @label populate - populate_globals!(config) -end - -# Load binary dependencies -bindeps_setup() diff --git a/docs/make.jl b/docs/make.jl index 302e16500..03dac8127 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -10,18 +10,18 @@ function main() pages = [ "Home" => "index.md", "Quick Start" => "quickstart.md", - "Devices/Agents" => "devices.md", - "Queues and Signals" => "queues_signals.md", - "Kernel Dependencies" => "kernel_deps.md", + "Devices" => "devices.md", + "Streams" => "streams.md", "Kernel Launch" => "kernel_launch.md", - "Global Variables" => "globals.md", "Exceptions" => "exceptions.md", - "Printing" => "printing.md", + "Profiling" => "profiling.md", "Memory" => "memory.md", + "Host-Call" => "hostcall.md", "Intrinsics" => [ "Execution Control" => "execution_control.md", "Wavefront Operations" => "wavefront_ops.md", ], + "Printing" => "printing.md", "Logging" => "logging.md", "API Reference" => "api.md" ] diff --git a/docs/src/assets/profiling-1.png b/docs/src/assets/profiling-1.png new file mode 100644 index 000000000..f565b6a13 Binary files /dev/null and b/docs/src/assets/profiling-1.png differ diff --git a/docs/src/assets/profiling-2.png b/docs/src/assets/profiling-2.png new file mode 100644 index 000000000..26e6653fa Binary files /dev/null and b/docs/src/assets/profiling-2.png differ diff --git a/docs/src/assets/profiling-3.png b/docs/src/assets/profiling-3.png new file mode 100644 index 000000000..50b052b61 Binary files /dev/null and b/docs/src/assets/profiling-3.png differ diff --git a/docs/src/assets/profiling-4.png b/docs/src/assets/profiling-4.png new file mode 100644 index 000000000..9c2f6088b Binary files /dev/null and b/docs/src/assets/profiling-4.png differ diff --git a/docs/src/devices.md b/docs/src/devices.md index 5a413f9c7..0cfbe8d2e 100644 --- a/docs/src/devices.md +++ b/docs/src/devices.md @@ -1,40 +1,43 @@ -# Devices/Agents +# Devices -In AMDGPU, all GPU devices (also known as "agents" in HSA parlance) are -auto-detected by the runtime, if they're supported. -There are three classes of devices: -- CPU -- GPU -- DSP - -In AMDGPU, we only support compilation and execution on **GPU** devices, -so we will henceforth limit discussion to those; -however, you may see a `kind` `Symbol` available in the APIs of many device -access functions, which defaults to `:gpu`, but could also be `:cpu` or `:dsp`. +In AMDGPU, all GPU devices are auto-detected by the runtime, if they're supported. AMDGPU maintains a global default device. -The default device is relevant for all kernel and GPUArray operations; -if one is not specified via `@roc` or an equivalent interface, +The default device is relevant for all kernel and GPUArray operations. +If one is not specified via `@roc` or an equivalent interface, then the default device is used for those operations, which affects compilation and kernel launch. !!! note "Task-Local Storage" - Since AMDGPU.jl relies on Task-Local Storage, this means that + AMDGPU.jl relies on Task-Local Storage, this means that default devices are default only within a given task. - Other tasks migh have different default devices if the user switched them. + Other tasks migh have different default devices if user switched them. + +The device bound to a current Julia task is accessible via [`AMDGPU.device()`](@ref). +The list of available devices can be queried with [`AMDGPU.devices`](@ref). + +If you have a [`HIPDevice`](@ref) object, you can also switch +the device with [`AMDGPU.device!`](@ref). +This will switch it **only within the task it is called from**. -The default device is accessible via [`AMDGPU.device()`](@ref). -This function returns a [`ROCDevice`](@ref), which is a handle that references the device. -The list of available devices can be queried with [`AMDGPU.devices`](@ref) to get -a list of all known and potentially usable devices. +```julia +xd1 = AMDGPU.ones(Float32, 16) # On `AMDGPU.device()` device. -If you have a [`ROCDevice`](@ref) object, you can also switch -the default device via [`AMDGPU.device!`](@ref). -This will switch it only within the task it is called from. +AMDGPU.device!(AMDGPU.devices()[2]) # Switch to second device. +xd2 = AMDPGU.ones(Float32, 16) # On second device. +``` -To select default device for newly created tasks, +To select a default device for newly created tasks, use [`AMDGPU.default_device!`](@ref). +```julia +AMDGPU.default_device!(AMDGPU.devices()[3]) # New tasks will use 3rd device by default. +Threads.@spawn begin + x = AMDGPU.ones(Float32, 16) # On third device. + return +end +``` + Additionally, devices have an associated numeric ID. The default device ID can be queried with [`AMDGPU.default_device_id`](@ref), which returns an `Int`. diff --git a/docs/src/exceptions.md b/docs/src/exceptions.md index 436407294..6ff55beb2 100644 --- a/docs/src/exceptions.md +++ b/docs/src/exceptions.md @@ -1,32 +1,56 @@ -# Kernel-thrown Exceptions +# Kernel Exceptions -Just like regular CPU-executed Julia functions, GPU kernels can throw -exceptions! For example, the following kernel will throw a `KernelException`: +Just like regular CPU-executed Julia functions, GPU kernels can throw exceptions! + +For example, the following kernel will throw an out-of-bounds exception: ```julia -function throwkernel(A) - A[0] = 1 -end -RA = ROCArray(zeros(Int,1)) -wait(@roc throwkernel(RA)) +julia> using AMDGPU + +julia> function kerr(x) + x[0] = 1 + return + end +kerr (generic function with 1 method) + +julia> x = ROCArray([1]); + +julia> @roc kerr(x); + +julia> AMDGPU.synchronize() +ERROR: GPU Kernel Exception +Stacktrace: + [1] error(s::String) + @ Base ./error.jl:35 + [2] throw_if_exception(dev::HIPDevice) + @ AMDGPU ~/.julia/dev/AMDGPU/src/exception_handler.jl:115 + [3] synchronize(stm::HIPStream) + @ AMDGPU ~/.julia/dev/AMDGPU/src/highlevel.jl:154 + [4] synchronize() + @ AMDGPU ~/.julia/dev/AMDGPU/src/highlevel.jl:154 + [5] top-level scope + @ REPL[5]:1 + +julia> @roc kerr(x) # Next kernel launch also throws. +ERROR: GPU Kernel Exception +Stacktrace: + [1] error(s::String) + @ Base ./error.jl:35 + [2] throw_if_exception(dev::HIPDevice) + @ AMDGPU ~/.julia/dev/AMDGPU/src/exception_handler.jl:115 + [3] #_#30 + @ ~/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:44 [inlined] + [4] (::AMDGPU.Runtime.HIPKernel{typeof(kerr), Tuple{AMDGPU.Device.ROCDeviceVector{Int64, 1}}})(args::ROCVector{Int64}) + @ AMDGPU.Runtime ~/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:41 + [5] top-level scope + @ ~/.julia/dev/AMDGPU/src/highlevel.jl:228 ``` -Kernels that hit an exception will write some exception information into a -pre-allocated list for the CPU to inspect. Once complete, the wavefront -throwing the exception will stop itself, but other wavefronts will continue -executing (possibly throwing their own exceptions, or not). - -Kernel-thrown exceptions are thrown on the CPU in the call to `wait(event)`, -where `event` is the returned value of `@roc` calls. When the kernel signals -that it's completed, the `wait` function will check if an exception flag has -been set, and if it has, will collect all of the relevant exception information -that the kernels set up. Unlike CPU execution, GPU kernel exceptions aren't -very user-customizable and pretty (for now!). They don't call `Base.show`, but -instead pass the LLVM function name of their exception handler (details in -`GPUCompiler`, `src/irgen.jl`). Therefore, the exact error that occured might -be a bit hard to figure out. - -If exception checking turns out to be too expensive for your needs, you can -disable those checks by passing the kwarg `check_exceptions=false` to the -`wait` call, which will skip any error checking (although it will still wait -for the kernel to signal completion). +Kernel-thrown exceptions are thrown during the +host synchronization `AMDGPU.synchronize` or on the next kernel launch. + +Kernels that hit an exception will write its information into a pre-allocated +host buffer. +Once complete, the wavefront throwing the exception will lock the buffer +to prevent other wavefronts from overwriting the exception and stop itself, +but other wavefronts will continue executing. diff --git a/docs/src/globals.md b/docs/src/globals.md deleted file mode 100644 index fcb3a3ea3..000000000 --- a/docs/src/globals.md +++ /dev/null @@ -1,70 +0,0 @@ -# Global Variables - -Most programmers are familiar with the concept of a "global variable": a -variable which is globally accessible to any function in the user's program. In -Julia, programmers are told to avoid using global variables (also known as -"globals") because of their tendency to introduce type instabilities. However, -they're often useful for sharing data between functions in distinct areas of -the user's program. - -In the JuliaGPU ecosystem, globals in the Julia sense are not available unless -their value is constant and inlinable into the function referencing them, as -all GPU kernels must be statically compileable. However, a different sort of -global variable is available which serves a very similar purpose. This variant -of global variable is statically typed and sized, and is accessible from: all -kernels with the same function signature (e.g. `mykernel(a::Int32, -b::Float64)`), the CPU host, and other devices and kernels when accessed by -pointer. - -Global variables can be created within kernels with the -[`AMDGPU.Device.get_global_pointer`](@ref) function, which both declares the -global variable, and returns a pointer to it (specifically a -`Core.LLVMPtr`). Once a kernel which declares a global is -compiled for GPU execution (either by [`@roc`](@ref) or [`rocfunction`](@ref)), -the global is allocated memory and made available to the kernel (during the -linking stage). Globals are unique by name, and so you shouldn't attempt to -call `get_global_pointer` with the same name but a different type; if you do, -undefined behavior will result. Like regular pointers in Julia, you can use -functions like `Base.unsafe_load` and `Base.unsafe_store!` to read from and -write to the global variable, respectively. - -As a concrete example of global variable usage, let's define a kernel which -creates a global and uses its value to increment the indices of an array: - -```julia -function my_kernel(A) - idx = AMDGPU.Device.workitemIdx().x - ptr = AMDGPU.Device.get_global_pointer(Val(:myglobal), Float32) - A[idx] += Base.unsafe_load(ptr) - nothing -end -``` - -In order to access and modify this global before the kernel is launched, we can -specify a hook function to `@roc` which will be passed the global pointer as an -argument: - -```julia -function myglobal_hook(gbl, mod, dev) - gbl_ptr = Base.unsafe_convert(Ptr{Float32}, gbl.ptr) - Base.unsafe_store!(gbl_ptr, 42f0) -end -RA = ROCArray(ones(Float32, 4)) -wait(@roc groupsize=4 global_hooks=(myglobal=myglobal_hook,) my_kernel(RA)) -``` - -In the above function, `gbl_ptr` is a pointer (specifically a `Ptr{Float32}`) -to the memory that represents the global variable `myglobal`. We can't -guarantee the initial value of an uninitialized global variable, so we need -to write a value to that global variable (in this case `42::Float32`). - -We can then read the values of `RA` and see that it's what we expect: - -```julia-repl -julia> A = Array(RA) -4-element ROCArray{Float32,1}: - 43.0 - 43.0 - 43.0 - 43.0 -``` diff --git a/docs/src/hostcall.md b/docs/src/hostcall.md new file mode 100644 index 000000000..c42609f5e --- /dev/null +++ b/docs/src/hostcall.md @@ -0,0 +1,70 @@ +# Hostcall + +Hostcalls provide a means for GPU-CPU communications within running kernels. + +AMDGPU.jl provides its own implementation of hostcalls, relying on HSA signals. +Currently, hostcalls are used for device-side allocations, printing and exception reporting. + +Some of the hostcalls (global hostcalls), are launched automatically, if their +usage is detected during compilation (e.g. device-side allocations, exception reporting). + +Hostcalls require careful usage, since they each spawn their own Tasks. +There should be no blocking operations during this time. + +For example, using non-blocking synchronization instead of blocking with +`AMDGPU.synchronize(; blocking=false)`. + +Non-blocking synchronization is also responsible for stopping global hostcalls, +otherwise the performance might degarde because of constant pooling +of HSA signals in a loop. + +## Example + +```julia +hc = Device.HostCallHolder(Float32, Tuple{Float32}) do x + return x + 42f0 +end + +function kernel!(y, hc) + y[1] = Device.hostcall!(hc, y[1]) + return +end + +y = ROCArray(Float32[0f0]) +@roc kernel!(y, hc) +AMDGPU.synchronize(; blocking=false) # Non-blocking sync to prevent hanging. + +@assert Array(y)[1] ≈ 42f0 +``` + +In this example, `HostCallHolder` is used to create and launch `HostCall`. +`HostCallHolder` contains the `HostCall` structure itself that is passed to kernel, +a task that is spawned on creation and some additional info for controlling +the lifetime of the task. + +First argument is a function we want to execute when we call the hostcall. +In this case we add `42f0` to input argument `x` and return the result. + +Second and third arguments are the return type `Float32` and the tuple of types +of input arguments `Tuple{Float32}`. + +`hostcall!` is used to execute the function on the host, wait on the result, +and obtain the return values. +At the moment, it is performed once per workgroup. + +## Continuous Host-Call + +By default, hostcalls can be used only once. +After executing the function on the host, the task finishes and exits. + +However, if you need your hostcall to live indefinitely, pass `continuous=true` +keyword argument to `HostCallHolder(...; continuous=true)`. + +To then stop the hostcall, call `Device.non_continuous!(hc)` +or `Device.finish!(hc)` on the `HostCallHolder`. + +The difference between them is that `non_continuous!` will allow calling +hostcall one more time before exiting, while `finish!` will exit immediately. + +`finish!` can be used on any `HostCallHolder` to force-exit the running +hostcall task. diff --git a/docs/src/kernel_deps.md b/docs/src/kernel_deps.md deleted file mode 100644 index e32e435e0..000000000 --- a/docs/src/kernel_deps.md +++ /dev/null @@ -1,53 +0,0 @@ -# Kernel Dependencies - -Unlike CUDA, ROCm does not have blocking queues; instead, all kernels placed on a queue will usually be processed and scheduled immediately. There is one exception: barrier packets may be placed on the queue to block the GPU's queue packet processor from proceeding until a given set of kernels has completed. These barriers come in two flavors: `barrier_and!` and `barrier_or!`. These functions can be called on a queue with a given set of kernel signals (those returned from `@roc`) to wait for all kernels or any one kernel to complete, respectively. - -Generally, the `barrier_and!` call should be the most useful tool for most users, since many codes require synchronization of all "threads of execution" at the end of one step before moving onto the next step. For example, the following code may look innocuous, but in fact the kernels might "race" and return unexpected results: - -```julia -function kernel(A) - A[1] += 1.0 - nothing -end - -RA = ROCArray(zeros(Float64, 1)) -@roc kernel(RA) -@roc kernel(RA) -@show Array(RA)[1] # could be 1.0 or 2.0 -``` - -To fix this example, we use a `barrier_and!` call to ensure proper ordering of execution: - -```julia -RA = ROCArray(zeros(Float64, 1)) -s1 = @roc kernel(RA) -barrier_and!([s1]) -s2 = @roc kernel(RA) -wait(s2) -@show Array(RA)[1] # will always be 2.0 -``` - -While likely less useful for most, `barrier_or!` can be useful in situations where any one of many "input" kernels can satisfy a condition necessary to allow later kernels to execute properly: - -```julia -function kernel1(A, i) - A[1] = i - nothing -end -function kernel2(A, i) - A[2] = i/A[1] -end - -RA = ROCArray(zeros(Float64, 2)) -s1 = @roc kernel1(RA, 1.0) -s2 = @roc kernel1(RA, 2.0) -barrier_or!([s1,s2]) -s3 = @roc kernel2(RA, 3.0) -wait(s3) -@show Array(RA)[1] # will either be 3.0 or 1.5, but will never throw due to divide-by-zero -``` - -!!! warning - Because of how barrier OR packets work, you can't use queue hardware to do a wait-any on more than 5 signals at a time. If more than 5 signals are specified, then the signals are split into sets of 5, and the total barrier won't be fulfilled until, for each set, one of the signals is satisfied. - - Contributions are welcome to workaround this issue, which will probably need to implemented in software either on the CPU or GPU side. diff --git a/docs/src/kernel_launch.md b/docs/src/kernel_launch.md index e1ec772cf..fba4d0bc3 100644 --- a/docs/src/kernel_launch.md +++ b/docs/src/kernel_launch.md @@ -4,11 +4,15 @@ While an almost arbitrarily large number of workitems can be executed per kernel launch, the hardware can only support executing a limited number of -wavefronts at one time. To alleviate this, the compiler calculates the +wavefronts at one time. + +To alleviate this, the compiler calculates the "occupancy" of each compiled kernel (which is the number of wavefronts that can be simultaneously executing on the GPU), and passes this information to the hardware; the hardware then launches a limited number of wavefronts at once, -based on the kernel's "occupancy" values. The rest of the wavefronts are not +based on the kernel's "occupancy" values. + +The rest of the wavefronts are not launched until hardware resources become available, which means that a kernel with better occupancy will see more of its wavefronts executing simultaneously (which often leads to better performance). Suffice to say, it's important to @@ -20,31 +24,15 @@ Like CUDA.jl, AMDGPU.jl has the ability to calculate kernel occupancy, with the ```julia kernel = @roc launch=false mykernel(args...) occupancy = AMDGPU.launch_configuration(kernel) +@show occupancy.gridsize @show occupancy.groupsize ``` Specifically, `launch_configuration` calculates the occupancy of `mykernel(args...)`, and then calculates an optimal groupsize based on the -occupancy. This value can then be used to select the groupsize for the kernel: - -```julia -wait(@roc groupsize=occupancy.groupsize mykernel(args...) -``` - -While it works, it's also pretty verbose. Conveniently, there's also a -mechanism to do all of the above automatically within `@roc`: +occupancy. +This value can then be used to select the groupsize for the kernel: ```julia -wait(@roc groupsize=:auto mykernel(args...)) -``` - -The above is safe to do in a hot path, as the occupancy is cached on a -per-kernel basis. - -There are also various other details available from the occupancy calculation, -such as SGPR, VGPR, and LDS usage, wavefront size, etc.: - -```julia -kernel = @roc launch=false mykernel(args...) -@show AMDGPU.Compiler.calculate_occupancy(kernel.fun, AMDGPU.default_device()) +@roc groupsize=occupancy.groupsize mykernel(args...) ``` diff --git a/docs/src/memory.md b/docs/src/memory.md index b345aab9c..72b6729fd 100644 --- a/docs/src/memory.md +++ b/docs/src/memory.md @@ -4,83 +4,88 @@ GPUs contain various kinds of memory, just like CPUs: -- Global: Globally accessible by all CUs on a GPU, and possibly accessible from outside of the GPU (by the CPU host, by other GPUs, by PCIe devices, etc.). Slowest form of memory. -- Constant: Same as global memory, but signals to the hardware that it can use special instructions to access and cache this memory. Can be changed between kernel invocations. -- Region: Also known as Global Data Store (GDS), all wavefronts on a CU can access the same memory region from the same address. Faster than Global/Constant. Automatically allocated by the compiler/runtime, not user accessible. -- Local: Also known as Local Data Store (LDS), all wavefronts in the same workgroup can access the same memory region from the same address. Faster than GDS. -- Private: Uses the hardware scratch space, and is private to each SIMD lane in a wavefront. Fastest form of traditional memory. +- Global: + Globally accessible by all CUs on a GPU, and possibly accessible + from outside of the GPU (by the CPU host, by other GPUs, by PCIe devices, + etc.). Slowest form of memory. +- Constant: + Same as global memory, but signals to the hardware that it can use + special instructions to access and cache this memory. + Can be changed between kernel invocations. +- Region: + Also known as Global Data Store (GDS), all wavefronts on a CU + can access the same memory region from the same address. + Faster than Global/Constant. + Automatically allocated by the compiler/runtime, not user accessible. +- Local: + Also known as Local Data Store (LDS), all wavefronts in the same workgroup + can access the same memory region from the same address. + Faster than GDS. +- Private: + Uses the hardware scratch space, and is private to each SIMD lane + in a wavefront. + Fastest form of traditional memory. + +## Local Memory + +Local memory may be allocated within a kernel by calling either: + +- `@ROCStaticLocalArray(T, dims)` - if `dims` is passed as a constant value, + known at compile-time. + E.g. `@ROCStaticLocalArray(Float32, 8)`. + +- `@ROCDynamicLocalArray(T, dims)` - otherwise. + E.g. `@ROCStaticLocalArray(Float32, length(X))`. + +Local memory is zero-initialized by default. +If this is unnecessary and undesired for performance reasons, +disable this, passing `false` as a last argument: +`@ROCStaticLocalArray(Float32, 8, false)` or +`@ROCStaticLocalArray(Float32, length(X), false)` -## Memory Queries - -Most useable memory can be queried via AMDGPU's APIs. There are two sets of -APIs for querying memory: the older "Regions" API, and the newer "Memory Pools" -API. The Regions API is able to query all kinds of memory segments for each -device: +Local memory does not need to be freed, as it is automatically freed by the +hardware. -```julia -for device in AMDGPU.devices() - foreach(println, AMDGPU.Runtime.regions(device)::Vector{AMDGPU.Runtime.ROCMemoryRegion}) -end -``` +If `@ROCDynamicLocalArray` is used, then local memory is dynamically allocated +at kernel execution time. +The `shmem` option to `@roc` must be set appropriately to ensure that +enough local memory is allocated by the hardware. -The Memory Pools API is currently only able to query Global and Group memory -segments, but is more reliable (due to getting more development attention from -AMD): +It is allocated in addition to the local memory that is statically allocated by +the kernel. ```julia -for device in AMDGPU.devices() - foreach(println, AMDGPU.Runtime.memory_pools(device)::Vector{AMDGPU.Runtime.ROCMemoryPool}) -end -``` - -Most of the details of each memory segment are available by printing the region -or memory pool in question; they may also be accessed programmatically with the -`AMDGPU.Runtime.region_*` and `AMDGPU.Runtime.pool_*` APIs. - -These details are generally not important to the average user, and are handled -automatically by AMDGPU when memory is allocated and freed. +function kernel(C, A) + # Allocate local memory dynamically + Ctmp = @ROCDynamicLocalArray(Float64, length(C)) + # Or, allocate local memory statically if the size is known ahead-of-time + Ctmp = @ROCStaticLocalArray(Float64, 8) # if we want 8 elements -## Memory Allocation/Deallocation + idx = AMDGPU.workitemIdx().x + Ctmp[idx] = A[idx] + C[1] + AMDGPU.Device.sync_workgroup() -Currently, we can explicitly allocate Global and Local memory from within -kernels, and Global from outside of kernels. Global memory allocations are done -with `AMDGPU.Mem.alloc`, like so: + C[idx] = Ctmp[idx] + return +end -```julia -buf = AMDGPU.Mem.alloc(device, bytes) -# Or with the extended API if a region or memory pool is already selected: -buf = AMDGPU.Mem.alloc(device, pool, bytes) -buf = AMDGPU.Mem.alloc(device, region, bytes) +... +# Note: The `shmem` option isn't necessary if `@ROCStaticLocalArray` is used +shmem = sizeof(Float64) * length(RC) +@roc groupsize=8 shmem=shmem kernel(RC, RA) ``` -`buf` in this example is a `Mem.Buffer`, which contains a pointer -to the allocated memory. The buffer can be converted to a pointer by doing -`Base.unsafe_convert(Ptr{Nothing}, buf)`, and may then be converted to the -appropriate pointer type, and loaded from/stored to. By default, memory is -allocated specifically on and for `device`, and is only accessible to that -device unless transferred using the various functions in the `Mem` module. If -memory should be globally accessible by the CPU and by all GPUs, the kwarg -`coherent=true` may be passed, which utilizes "finegrained" memory instead. -Memory should be freed once it's no longer in use with `Mem.free(buf)`. - -If allocations are done as -`Mem.alloc(device, bytes; coherent=false, slow_fallback=true)`, and the -allocation is larger than supported for that memory region, then coherent -memory will be automatically used (if possible) to service the allocation. This -can be disabled with `AMDGPU.Mem.enable_slow_allocation_fallback(false)` and -restarting Julia. - ## Device-Side Allocations -Global memory allocated by a kernel is automatically freed when the kernel -completes, which is done in the `wait` call on the host. This behavior can be -disabled by passing `cleanup=false` to `wait`. +Global memory may be allocated/freed dynamically from kernels by calling +`AMDGPU.Device.malloc(::Csize_t)::Ptr{Cvoid}` +and `AMDGPU.Device.free(::Ptr{Cvoid})`. + +This memory allocation/deallocation uses hostcalls to operate, +and so is relatively slow, but is also very useful. +See [Hostcall](@ref) section for more info about them. -Global memory may also be allocated and freed dynamically from kernels by -calling `AMDGPU.Device.malloc(::Csize_t)::Ptr{Cvoid}` and -`AMDGPU.Device.free(::Ptr{Cvoid})`. This memory allocation/deallocation uses -hostcalls to operate, and so is relatively slow, but is also very useful. -Currently, memory allocated with `AMDGPU.Device.malloc` is coherent by default. +Memory allocated with `AMDGPU.Device.malloc` is a host-pinned memory. Calls to `malloc` and `free` are performed once per workgroup, so ensure that enough memory has been allocated to feed the lanes that will be accessing it. @@ -88,68 +93,33 @@ As an example, here's how an array could be allocated on-device to store temporary results: ```julia -function kernel(C, A, B) - # Allocate memory dynamically and get a pointer to it - Ctmp_ptr = AMDGPU.Device.malloc(Csize_t(sizeof(Float64)*length(C))) - # Turn it (a pointer to Float64 elements in Global memory) into a device-side array +function kernel(C, A) + # Allocate memory dynamically and get a pointer to it. + Ctmp_ptr = AMDGPU.Device.malloc(Csize_t(sizeof(Float64) * length(C))) + # Turn a pointer into a device-side array. Ctmp = ROCDeviceArray(length(C), reinterpret(Core.LLVMPtr{Float64,1}, Ctmp_ptr)) + # Use it idx = AMDGPU.workitemIdx().x - Ctmp[idx] = A[idx] + B[idx] + C[1] + Ctmp[idx] = A[idx] + C[1] AMDGPU.Device.sync_workgroup() + C[idx] = Ctmp[idx] - # Make sure to free it + # Make sure to free it. AMDGPU.Device.free(Ctmp_ptr) - nothing + return end + RA = AMDGPU.rand(4) -RB = AMDGPU.rand(4) RC = AMDGPU.rand(4) RC_elem = Array(RC)[1] -wait(@roc groupsize=4 kernel(RC, RA, RB)) -@assert Array(RC) ≈ Array(RA) .+ Array(RB) .+ RC_elem +@roc groupsize=4 kernel(RC, RA) +@assert Array(RC) ≈ Array(RA) .+ RC_elem ``` -Local memory may be allocated within a kernel by calling either -`@ROCStaticLocalArray(T, dims)` or `@ROCDynamicLocalArray(T, dims)` - use the -former if `dims` is passed as a constant value, and otherwise use the latter. -Local memory does not need to be freed, as it is automatically freed by the -hardware. - -If `@ROCDynamicLocalArray` is used, then local memory is dynamically allocated -at kernel execution time; the `localmem` option to `@roc` must be set -appropriately to ensure that enough local memory is allocated by the hardware. -It is allocated in addition to the local memory that is statically allocated by -the kernel. - -```julia -function kernel(C, A, B) - # Allocate local memory dynamically - Ctmp = @ROCDynamicLocalArray(Float64, length(C)) - # Or, allocate local memory statically if the size is known ahead-of-time - # Ctmp = @ROCStaticLocalArray(Float64, 8) # if we want 8 elements - - # Use it - idx = AMDGPU.workitemIdx().x - Ctmp[idx] = A[idx] + B[idx] + C[1] - AMDGPU.Device.sync_workgroup() - C[idx] = Ctmp[idx] - nothing -end -# ... -# Note: The `localmem` option isn't necessary if `@ROCStaticLocalArray` is used -wait(@roc groupsize=4 localmem=sizeof(Float64)*length(RC) kernel(RC, RA, RB)) -``` - -Note that like CUDA's shared memory, AMDGPU's local memory is zero-initialized -automatically. If this behavior is unnecessary (and undesired for performance -reasons), zero-initialization can be disabled with `@ROCDynamicLocalArray(T, -dims, false)` or `@ROCStaticLocalArray(T, dims, false)` (the last argument -is `zeroinit`). - ## Memory Modification Intrinsics Like C, AMDGPU.jl provides the `memset!` and `memcpy!` intrinsics, which are useful for setting a memory region to a value, or copying one region to -another, respectively. Check `test/device/memory.jl` for examples of their -usage. +another, respectively. +Check `test/device/memory.jl` for examples of their usage. diff --git a/docs/src/profiling.md b/docs/src/profiling.md new file mode 100644 index 000000000..9ea7aaf32 --- /dev/null +++ b/docs/src/profiling.md @@ -0,0 +1,67 @@ +## rocprof + +rocprof allows profiling both HSA & HIP API calls. + +Let's profile simple copying kernel saved in `profile.jl` file: +```julia +using AMDGPU + +function mycopy!(dst, src) + i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x + if i ≤ length(dst) + @inbounds dst[i] = src[i] + end + return +end + +function main(N) + src = ROCArray{Float64}(undef, N) + dst = ROCArray{Float64}(undef, N) + nthreads = 256 + nblocks = cld(N, nthreads) + + for i in 1:10 + @roc groupsize=nthreads gridsize=nblocks mycopy!(dst, src) + AMDGPU.synchronize() + end + + AMDGPU.unsafe_free!(dst) + AMDGPU.unsafe_free!(src) + AMDGPU.synchronize() + return +end +main(2^24) +``` + +```bash +ENABLE_JITPROFILING=1 rocprof --hip-trace --hsa-trace julia ./profile.jl +``` + +This will produce `results.json` (among other files) which can be visualized +using [Perfetto UI](https://ui.perfetto.dev/). + +|Zoomed out|Zoomed in| +|:---:|:---:| +|![image](./assets/profiling-1.png)|![image](./assets/profiling-4.png)| + +Here we can clearly see that host synchronization after each kernel dispatch +causes poor device occupancy (empty spaces between kernel dispatches). + +We can fix this by moving synchronization outside the loop so that it happens only once. + +```julia + ... + for i in 1:10 + @roc groupsize=nthreads gridsize=nblocks mycopy!(dst, src) + end + AMDGPU.synchronize() + ... +``` + +Running profiling again and visualizing results we now see that +kernel launches are adjacent to each other and that the average +wall duaration is lower. + +|Zoomed out|Zoomed in| +|:---:|:---:| +|![image](./assets/profiling-2.png)|![image](./assets/profiling-3.png)| diff --git a/docs/src/queues_signals.md b/docs/src/queues_signals.md deleted file mode 100644 index 7e277f209..000000000 --- a/docs/src/queues_signals.md +++ /dev/null @@ -1,98 +0,0 @@ -# Queues - -Similar to CUDA streams, ROCm has the concept of queues, which are -buffers used to instruct the GPU hardware which kernels to launch. -ROCm queues are synchronous, like CUDA streams. - -Each device has a default queue associated, -which is accessible with [`AMDGPU.queue`](@ref). - -To specify which queue to launch a kernel on: - -- Using [`AMDGPU.queue!`](@ref), which will execute given function and reset - to the original queue after completion: - -```julia -q = AMDGPU.ROCQueue() -x = AMDGPU.queue!(() -> AMDGPU.ones(Float32, 16), q) -``` - -- Using `queue` argument to [`@roc`](@ref) macro: - -```julia -q = AMDGPU.ROCQueue() -@roc queue=q kernel(...) -``` - -Queues also have an inherent priority, which allows control of kernel -submission latency and on-device scheduling preference with respect to kernels -submitted on other queues. -There are three priorities: normal (the default), low, and high priority. - -Priority of the default `queue` can be set with [`AMDGPU.priority!`](@ref). -Alternatively, it can be set at queue creation time: - -```julia -low_prio_queue = ROCQueue(; priority=:low) -high_prio_queue = ROCQueue(; priority=:high) -normal_prio_queue = ROCQueue(; priority=:normal) # or just omit "priority" -``` - -To get kernels which are currently executing on a given queue, -use [`AMDGPU.active_kernels`](@ref). -It will return a `Vector{ROCKernelSignal}`, which can be inspected to -determine how many (and which) kernels are executing. - -If a kernel gets "stuck" and locks up the GPU (noticeable with 100% GPU usage in `rocm-smi`) -you can kill it and all other kernels associated with the queue it is running on -with [`AMDGPU.Runtime.kill_queue!(queue)`](@ref). -This can be "safely" done to the default queue (obtained via [`AMDGPU.queue`](@ref)), -since default queues are recreated as-needed. - -```@docs -AMDGPU.queue -AMDGPU.queue! -AMDGPU.priority! -AMDGPU.active_kernels -AMDGPU.ROCQueue -AMDGPU.Runtime.set_queue_pool_size! -AMDGPU.Runtime.kill_queue! -``` - -# Signals - -Unlike CUDA, ROCm kernels are tracked by an associated signal, which is -created and returned by `@roc`, and is `wait`ed on to track kernel completion. -Signals may also be used for manual synchronization (since they work for CPUs -and GPUs equally well). CPU usage is done with the `HSA.signal_*` functions, -and GPU usage is done with the `device_signal_*` and `hostcall_device_signal_*` -functions. For most signalling needs, consider using a hostcall instead. - -If custom signal handling is desired, signals can be manually constructed and -passed to `@roc`: - -```julia -# A kernel which waits on all signals in `sigs` -function multi_wait(sigs) - for i in 1:length(sigs) - AMDGPU.Device.hostcall_device_signal_wait(sigs[i], 0) - end - nothing -end - -# Create a set of signals -sigs = [ROCSignal() for i in 1:10] -# Get the device-safe signal handles -_sigs = ROCArray(map(sig->sig.signal, sigs)) - -# Launch multi-waiter ahead of time; this will block on the device -final_sig = @roc multi_wait(_sigs) - -# Associate kernels with signals -for sig in sigs - @roc signal=sig identity(nothing) -end - -# Wait on the multi-waiter -wait(final_sig) -``` diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md index 34fce1717..b6da54359 100644 --- a/docs/src/quickstart.md +++ b/docs/src/quickstart.md @@ -43,20 +43,23 @@ Pkg.test("AMDGPU") ## Running a simple kernel -As a simple test, we will try to add two random vectors and make sure that the results from the CPU and the GPU are indeed the same. +As a simple test, we will try to add two random vectors +and make sure that the results from the CPU and the GPU are indeed the same. We can start by first performing this simple calculation on the CPU: ```julia -N = 32 +N = 1024 a = rand(Float64, N) b = rand(Float64, N) c_cpu = a + b ``` -To do the same computation on the GPU, we first need to copy the two input arrays `a` and `b` to the device. +To do the same computation on the GPU, we first need to copy +the two input arrays `a` and `b` to the device. Toward that end, we will use the `ROCArray` type to represent our GPU arrays. -We can create the two arrays by passing the host data to the constructor as follows: +We can create the two arrays by passing the host data +to the constructor as follows: ```julia using AMDGPU @@ -70,67 +73,98 @@ We need to create one additional array `c_d` to store the results: c_d = similar(a_d) ``` -In this example, the postfix `_d` distinguishes a device memory object from its host memory counterpart. -This convention is completely arbitrary and you may name your device-side variables whatever you like; they are regular Julia variables. +In this example, the postfix `_d` distinguishes a device memory object +from its host memory counterpart. +This convention is completely arbitrary and you may name your +device-side variables whatever you like; they are regular Julia variables. Next, we will define the GPU kernel that does the actual computation: ```julia function vadd!(c, a, b) - i = workitemIdx().x + i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x c[i] = a[i] + b[i] return end ``` -This simple kernel starts by getting the current thread ID using [`workitemIdx`](@ref) and then performs the addition of the elements from `a` and `b`, storing the result in `c`. +The index of a single workitem can be uniquely identified by its grid index +(computed linearly as `(workgroupDim().x * (workgroupIdx().x - 1)) + workitemIdx().x` +when only a single dimension is used). + +The grid is the domain over which the *entire* kernel executes over. +The grid will be split into multiple workgroups by hardware automatically, +and the kernel does not complete until all workgroups complete. + +Like OpenCL, AMDGPU has the concept of "workitems", "workgroups", and the "grid". +A workitem is a single thread of execution, capable of performing arithmentic +operations. +Workitems are grouped into "wavefronts" ("warps" in CUDA) which +share the same compute unit, and execute the same instructions simulatenously. +The workgroup is a logical unit of compute supported by hardware +which comprises multiple wavefronts, which shares resources +(specifically local memory) and can be efficiently synchronized. +A workgroup may be executed by one or multiple hardware compute units, +making it often the only dimension of importance for smaller kernel launches. + +Notice how we explicitly specify that this function does not return a value +by adding the `return` statement. +This is necessary for all GPU kernels and we can enforce it by adding a `return`, +`return nothing`, or even `nothing` at the end of the kernel. +If this statement is omitted, Julia will attempt to return the value +of the last evaluated expression, in this case a `Float64`, +which will cause a compilation failure as kernels cannot return values. + +The easiest way to launch a GPU kernel is with the [`@roc`](@ref) macro, +specifying `groupsize` and `gridsize` to cover full array, +and calling it like a regular function: -Like OpenCL, AMDGPU has the concept of "workitems", "workgroups", and the "grid". A workitem is a single thread of execution, capable of performing arithmentic operations. Workitems are grouped into "wavefronts" ("warps" in CUDA) which share the same compute unit, and execute the same instructions simulatenously. The workgroup is a logical unit of compute supported by hardware which comprises multiple wavefronts, which shares resources (specifically local memory) and can be efficiently synchronized. A workgroup may be executed by one or multiple hardware compute units, making it often the only dimension of importance for smaller kernel launches. - -The grid is the domain over which the *entire* kernel executes over. The index of a single workitem can be uniquely identified by its grid index (computed linearly as `(workgroupDim().x * (workgroupIdx().x - 1)) + workitemIdx().x` when only a single dimension is used). The grid will be split into multiple workgroups by hardware automatically, and the kernel does not complete until all workgroups complete. - -Notice how we explicitly specify that this function does not return a value by adding the `return` statement. -This is necessary for all GPU kernels and we can enforce it by adding a `return`, `return nothing`, or even `nothing` at the end of the kernel. -If this statement is omitted, Julia will attempt to return the value of the last evaluated expression, in this case a `Float64`, which will cause a compilation failure as kernels cannot return values. +```julia +groupsize = 128 +gridsize = cld(length(c_d), groupsize) +@roc gridsize=gridsize groupsize=groupsize vadd!(c_d, a_d, b_d) +``` -The easiest way to launch a GPU kernel is with the [`@roc`](@ref) macro, specifying that we want a single work group with `N` work items and calling it like an ordinary function: +Keep in mind that kernel launches are asynchronous, +meaning that you need to do some kind of synchronization before you use the result. +For instance, you can call `AMDGPU.synchronize()`: ```julia @roc groupsize=N vadd!(c_d, a_d, b_d) +AMDGPU.synchronize() ``` -Keep in mind that kernel launches are asynchronous, meaning that you need to do some kind of synchronization before you use the result. -For instance, you can call `wait()` on the returned HSA signal value: +Finally, we can make sure that the results match, +by first copying the data to the host and then comparing it with the CPU results: ```julia -wait(@roc groupsize=N vadd!(c_d, a_d, b_d)) -``` +c = Array(c_d) -!!! warning "Naming conventions" - Throughout this example we use terms like "work group" and "work item". - These terms are used by the Khronos consortium and their APIs including OpenCL and Vulkan, as well as the HSA foundation. +using Test +@test isapprox(c, c_cpu) +``` - NVIDIA, on the other hand, uses some different terms in their CUDA API, which might be confusing to some users porting their kernels from CUDA to AMDGPU. - As a quick summary, here is a mapping of the most common terms: +## Naming conventions - | AMDGPU | CUDA | - |:---:|:---:| - | [`workitemIdx`](@ref) | [`threadIdx`](@ref) | - | [`workgroupIdx`](@ref) | [`blockIdx`](@ref) | - | [`workgroupDim`](@ref) | [`blockDim`](@ref) | - | [`gridItemDim`](@ref) | No equivalent | - | [`gridGroupDim`](@ref) | `gridDim` | - | `groupsize` | `threads` | - | `gridsize` | `blocks * threads` | - | `queue` | `stream` | +Throughout this example we use terms like "work group" and "work item". +These terms are used by the Khronos consortium and their APIs +including OpenCL and Vulkan, as well as the HSA foundation. - For compatibilty reasons, the symbols in the CUDA column (except for `gridItemDim`) are also supported by AMDGPU. +NVIDIA, on the other hand, uses some different terms in their CUDA API, +which might be confusing to some users porting their kernels from CUDA to AMDGPU. -Finally, we can make sure that the results match, by first copying the data to the host and then comparing it with the CPU results: +As a quick summary, here is a mapping of the most common terms: -```julia -c = Array(c_d) +| AMDGPU | CUDA | +|:---:|:---:| +| [`workitemIdx`](@ref) | [`threadIdx`](@ref) | +| [`workgroupIdx`](@ref) | [`blockIdx`](@ref) | +| [`workgroupDim`](@ref) | [`blockDim`](@ref) | +| [`gridItemDim`](@ref) | No equivalent | +| [`gridGroupDim`](@ref) | `gridDim` | +| `groupsize` | `threads` | +| `gridsize` | `blocks * threads` | +| `queue` | `stream` | -using Test -@test isapprox(c, c_cpu) -``` +For compatibilty reasons, the symbols in the CUDA column +(except for `gridItemDim`) are also supported by AMDGPU. diff --git a/docs/src/streams.md b/docs/src/streams.md new file mode 100644 index 000000000..88fdbaf09 --- /dev/null +++ b/docs/src/streams.md @@ -0,0 +1,55 @@ +# Streams + +Similar to CUDA streams, ROCm has HIP streams, +which are buffers used to instruct the GPU hardware which kernels to launch. +HIP streams are synchronous, like CUDA streams. + +Each device has a default stream associated, +which is accessible with `AMDGPU.stream()`. + +There are several ways to specify which stream to launch a kernel on: + +- Using [`AMDGPU.stream!`](@ref) to change default stream to be used + **within the same Julia task**. + +```julia +stream = AMDGPU.HIPStream() +AMDGPU.stream!(stream) # Change default stream to be used for subsequent operations. +AMDGPU.ones(Float32, 16) # Will be executed on `stream`. +``` + +- Using [`AMDGPU.stream!`](@ref) to execute given function and reset + to the original stream after completion: + +```julia +stream = AMDGPU.HIPStream() +x = AMDGPU.stream!(() -> AMDGPU.ones(Float32, 16), stream) +``` + +- Using `stream` argument to [`@roc`](@ref) macro: + +```julia +stream = AMDGPU.HIPStream() +@roc stream=stream kernel(...) +``` + +Streams also have an inherent priority, which allows control of kernel +submission latency and on-device scheduling preference with respect to kernels +submitted on other streams. +There are three priorities: normal (the default), low, and high priority. + +Priority of the default `stream` can be set with [`AMDGPU.priority!`](@ref). +Alternatively, it can be set at stream creation time: + +```julia +low_prio = HIPStream(:low) +high_prio = HIPStream(:high) +normal_prio = HIPStream(:normal) # or just omit "priority" +``` + +```@docs +AMDGPU.stream +AMDGPU.stream! +AMDGPU.priority! +AMDGPU.HIPStream +``` diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl index 03f12f217..e15c3ca6d 100644 --- a/src/AMDGPU.jl +++ b/src/AMDGPU.jl @@ -1,21 +1,19 @@ module AMDGPU -### Imports ### - +using Adapt using CEnum -using Libdl -using LLVM, LLVM.Interop using GPUCompiler using GPUArrays -using Adapt +using Libdl +using LLVM, LLVM.Interop +using Preferences +using Printf + import LinearAlgebra import Core: LLVMPtr -### Exports ### - -export ROCDevice, ROCQueue, ROCExecutable, ROCKernel, ROCSignal +export HIPDevice export has_rocm_gpu - export ROCArray, ROCVector, ROCMatrix, ROCVecOrMat export roc @@ -32,19 +30,39 @@ function Base.lock(f, x::LockedObject) end end +struct KernelState + # Exception reporting buffers. + exception_flag::Ptr{Int32} + gate::Ptr{UInt64} + buffers_counter::Ptr{Int32} + str_buffers_counter::Ptr{Int32} + buffers::Ptr{Ptr{Cvoid}} + string_buffers::Ptr{Ptr{Cvoid}} + n_buffers::Int32 + n_str_buffers::Int32 + + # Malloc/free hostcalls. + malloc_hc::Ptr{Cvoid} + free_hc::Ptr{Cvoid} + + # Print hostcalls. + output_context::Ptr{Cvoid} + printf_output_context::Ptr{Cvoid} +end + # Load HSA Runtime. const libhsaruntime = "libhsa-runtime64.so.1" include(joinpath("hsa", "HSA.jl")) -import .HSA: Agent, Queue, Executable, Status, Signal -# Load binary dependencies -include(joinpath(dirname(@__DIR__), "deps", "bindeps.jl")) +# Load binary dependencies. +include("discovery_utils.jl") +include("rocm_discovery.jl") +populate_globals!(bindeps_setup()) # Utilities include("utils.jl") # Load HIP -const libhip = "libamdhip64.so" include(joinpath("hip", "HIP.jl")) import .HIP: HIPContext, HIPDevice, HIPStream export HIPContext, HIPDevice, HIPStream @@ -53,16 +71,18 @@ include("cache.jl") module Runtime using ..CEnum - using Setfield - import ..HSA - import ..Adapt using ..GPUCompiler + + import ..Adapt import Preferences: @load_preference, @set_preferences! import TimespanLogging import TimespanLogging: timespan_start, timespan_finish + import ..HSA + import ..HIP import ..AMDGPU - import ..AMDGPU: getinfo, LockedObject, HIP + import ..AMDGPU: getinfo, LockedObject + import .HIP: HIPDevice struct Adaptor end @@ -71,39 +91,51 @@ module Runtime include(joinpath("runtime", "logging.jl")) include(joinpath("runtime", "error.jl")) - include(joinpath("runtime", "thread-utils.jl")) + include(joinpath("runtime", "hsa_device.jl")) include(joinpath("runtime", "device.jl")) - include(joinpath("runtime", "linked-list.jl")) - include(joinpath("runtime", "queue.jl")) - include(joinpath("runtime", "signal.jl")) include(joinpath("runtime", "dims.jl")) + module Mem - include(joinpath("runtime", "memory.jl")) + using Preferences + + import AMDGPU + import AMDGPU: HIP, HSA + import AMDGPU: Runtime + import .HIP: HIPDevice + import .Runtime: ROCDim, ROCDim3, check + + const refcounts_lock = Threads.ReentrantLock() + + abstract type AbstractAMDBuffer end + + include(joinpath("runtime", "memory", "utils.jl")) + include(joinpath("runtime", "memory", "hip.jl")) + include(joinpath("runtime", "memory", "refcount.jl")) end - include(joinpath("runtime", "executable.jl")) - include(joinpath("runtime", "hashing.jl")) - include(joinpath("runtime", "kernel.jl")) - include(joinpath("runtime", "kernel-signal.jl")) - include(joinpath("runtime", "launch.jl")) + include(joinpath("runtime", "execution.jl")) - include(joinpath("runtime", "sync.jl")) + include(joinpath("runtime", "hip-execution.jl")) include(joinpath("runtime", "fault.jl")) -end # module Runtime +end + import .Runtime: Mem -import .Runtime: ROCDevice, ROCQueue const ci_cache = GPUCompiler.CodeCache() Base.Experimental.@MethodTable(method_table) module Device - import ..HSA - import ..Runtime - import ..Mem - import Core: LLVMPtr using ..GPUCompiler using ..LLVM using ..LLVM.Interop + + import ..Adapt + import Core: LLVMPtr import ..LinearAlgebra + + import ..HSA + import ..HIP + import ..Runtime + import ..Mem import ..AMDGPU import .AMDGPU: method_table @@ -117,12 +149,11 @@ module Device include(joinpath("device", "runtime.jl")) include(joinpath("device", "quirks.jl")) end -import .Device: malloc, signal_exception, report_exception, report_oom, report_exception_frame -import .Device: ROCDeviceArray, AS, HostCall, hostcall! +import .Device: malloc, signal_exception, report_exception, report_oom, report_exception_frame, report_exception_name +import .Device: ROCDeviceArray, AS, HostCall, HostCallHolder, hostcall! import .Device: @ROCDynamicLocalArray, @ROCStaticLocalArray import .Device: workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim -import .Device: threadIdx, blockIdx, blockDim -import .Device: sync_workgroup +import .Device: threadIdx, blockIdx, blockDim, sync_workgroup import .Device: @rocprint, @rocprintln, @rocprintf export ROCDeviceArray, @ROCDynamicLocalArray, @ROCStaticLocalArray @@ -131,26 +162,27 @@ export workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim export sync_workgroup module Compiler + import Core: LLVMPtr + import LLD_jll + using ..GPUCompiler using ..LLVM - import ..Adapt - import Core: LLVMPtr using Printf import ..AMDGPU import ..AMDGPU: AS import ..Runtime import ..Device - import .Runtime: ROCDevice, ROCModule, ROCFunction - import .Runtime: Adaptor - import .Runtime: Mem + import ..HIP + import ..Mem - include(joinpath("compiler", "device-libs.jl")) - include(joinpath("compiler", "utils.jl")) - include(joinpath("compiler", "global-hooks.jl")) + include(joinpath("compiler", "zeroinit_lds.jl")) + include(joinpath("compiler", "device_libs.jl")) + include(joinpath("compiler", "exceptions.jl")) + include(joinpath("compiler", "output_context.jl")) + include(joinpath("compiler", "dynamic_memory.jl")) include(joinpath("compiler", "codegen.jl")) - include(joinpath("compiler", "occupancy.jl")) -end # module Compiler +end include("tls.jl") include("highlevel.jl") @@ -164,11 +196,10 @@ include("array.jl") include("conversions.jl") include("broadcast.jl") include("mapreduce.jl") +include("exception_handler.jl") allowscalar(x::Bool) = GPUArrays.allowscalar(x) -include("deprecations.jl") - ### Initialization and Shutdown ### const HSA_REFCOUNT = Threads.Atomic{UInt}(0) @@ -186,9 +217,6 @@ end # Load ROCm external libraries include(joinpath("blas", "rocBLAS.jl")) -#include(joinpath("sparse", "rocSPARSE.jl") -#include(joinpath("solver", "rocSOLVER.jl") -#include(joinpath("solver", "rocALUTION.jl") include(joinpath("rand", "rocRAND.jl")) include(joinpath("fft", "rocFFT.jl")) include(joinpath("dnn", "MIOpen.jl")) @@ -207,7 +235,7 @@ function __init__() if haskey(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS") env_use_artifacts = !parse(Bool, get(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS", "false")) - if use_artifacts != env_use_artifacts + if use_artifacts() != env_use_artifacts enable_artifacts!(env_use_artifacts) @warn """ The environment variable JULIA_AMDGPU_DISABLE_ARTIFACTS does not match the value from preferences. @@ -252,12 +280,9 @@ function __init__() end # Select the default device - for device in Runtime.fetch_devices() - if !isassigned(Runtime.DEFAULT_DEVICE) && device_type(device) == :gpu - Runtime.DEFAULT_DEVICE[] = device - break - end - end + Runtime.fetch_hsa_devices() + devs = Runtime.fetch_devices() + Runtime.set_default_device!(first(devs)) # Setup HSA fault handler Runtime.setup_fault_handler() @@ -269,7 +294,6 @@ function __init__() HSA runtime is unavailable, compilation and runtime functionality will be disabled. Reason: $hsa_build_reason """ - if parse(Bool, get(ENV, "JULIA_AMDGPU_CORE_MUST_LOAD", "0")) print_build_diagnostics() error("Failed to load HSA runtime, but HSA must load, bailing out") @@ -282,7 +306,6 @@ function __init__() LLD is unavailable, compilation functionality will be disabled. Reason: $lld_build_reason """ - if parse(Bool, get(ENV, "JULIA_AMDGPU_CORE_MUST_LOAD", "0")) print_build_diagnostics() error("Failed to find ld.lld, but ld.lld must exist, bailing out") @@ -295,7 +318,6 @@ function __init__() Device libraries are unavailable, device intrinsics will be disabled. Reason: $device_libs_build_reason """ - if parse(Bool, get(ENV, "JULIA_AMDGPU_CORE_MUST_LOAD", "0")) print_build_diagnostics() error("Failed to find Device Libs, but Device Libs must exist, bailing out") @@ -303,14 +325,11 @@ function __init__() end # Check whether HIP is available - if functional(:hip) - push!(Libdl.DL_LOAD_PATH, dirname(libhip_path)) - else + if !functional(:hip) @warn """ HIP library is unavailable, HIP integration will be disabled. Reason: $hip_build_reason """ - if parse(Bool, get(ENV, "JULIA_AMDGPU_HIP_MUST_LOAD", "0")) print_build_diagnostics() error("Failed to load HIP runtime, but HIP must load, bailing out") @@ -322,14 +341,15 @@ function __init__() "dense BLAS", "sparse BLAS", "linear solver", "fancy linear solver", "RNG", "FFT", "DNN/convolution") for ((name, pkg), purpose) in zip(rocm_ext_libs, descriptions) - if use_artifacts && pkg !== nothing && !functional(name) + if use_artifacts() && pkg !== nothing && !functional(name) # These are numerous and thus noisy build_reason = getfield(AMDGPU, Symbol(name, :_build_reason)) @debug """ - $pkg is unavailable, $purpose functionality will be disabled. Reason: $build_reason + $pkg is unavailable, $purpose functionality will be disabled. + Reason: $build_reason. """ end end end -end # module +end diff --git a/src/ROCKernels.jl b/src/ROCKernels.jl index a7041d3bd..5066682ad 100644 --- a/src/ROCKernels.jl +++ b/src/ROCKernels.jl @@ -37,7 +37,6 @@ end function KernelAbstractions.copyto!(::ROCBackend, A, B) GC.@preserve A B begin - # TODO: async copy copyto!(A, 1, B, 1, length(A)) end return nothing @@ -100,9 +99,7 @@ function (obj::Kernel{ROCBackend})(args...; ndrange=nothing, workgroupsize=nothi nthreads = length(workitems(iterspace)) nblocks == 0 && return nothing - AMDGPU.@roc( - groupsize=nthreads, gridsize=(nblocks * nthreads), - obj.f(ctx, args...)) + kernel(ctx, args...; groupsize=nthreads, gridsize=nblocks) return nothing end @@ -177,11 +174,12 @@ end AMDGPU.Device.sync_workgroup() end -@device_override @inline function __print(args...) - for arg in args - AMDGPU.Device.@rocprintf("%s", arg) - end -end +# TODO fix +# @device_override @inline function __print(args...) +# for arg in args +# AMDGPU.Device.@rocprintf("%s", arg) +# end +# end ### # GPU implementation of constant memory diff --git a/src/array.jl b/src/array.jl index c3b9bad26..c995f1f6f 100644 --- a/src/array.jl +++ b/src/array.jl @@ -8,12 +8,11 @@ struct ROCArrayBackend <: AbstractGPUBackend end struct ROCKernelContext <: AbstractKernelContext end -function GPUArrays.gpu_call(::ROCArrayBackend, f, args, threads::Int, blocks::Int; name::Union{String,Nothing}) - groupsize, gridsize = threads, blocks * threads - wait(@roc groupsize=groupsize gridsize=gridsize f(ROCKernelContext(), args...)) -end -function GPUArrays.gpu_call(::ROCArrayBackend, f, args; elements::Int, name::Union{String,Nothing}=nothing) - wait(@roc groupsize=min(elements, 64) gridsize=elements f(ROCKernelContext(), args...)) +@inline function GPUArrays.gpu_call( + ::ROCArrayBackend, f, args, threads::Int, blocks::Int; + name::Union{String, Nothing}, +) + @roc gridsize=blocks groupsize=threads name=name f(ROCKernelContext(), args...) end ## on-device @@ -21,11 +20,11 @@ end # indexing for (f, froc) in ( - (:blockidx, :blockIdx), - (:blockdim, :blockDim), - (:threadidx, :threadIdx), - (:griddim, :gridGroupDim) - ) + (:blockidx, :blockIdx), + (:blockdim, :blockDim), + (:threadidx, :threadIdx), + (:griddim, :gridGroupDim) +) @eval @inline GPUArrays.$f(::ROCKernelContext) = AMDGPU.$froc().x end @@ -50,54 +49,32 @@ end return end - # # Host abstractions # mutable struct ROCArray{T,N} <: AbstractGPUArray{T,N} - buf::Mem.Buffer + buf::Mem.HIPBuffer dims::Dims{N} offset::Int - syncstate::Runtime.SyncState function ROCArray{T,N}( - buf::Mem.Buffer, dims::Dims{N}; offset::Integer = 0, - syncstate::Runtime.SyncState = Runtime.SyncState(), + buf::Mem.HIPBuffer, dims::Dims{N}; offset::Integer = 0, ) where {T,N} @assert isbitstype(T) "ROCArray only supports bits types" - xs = new{T,N}(buf, dims, offset, syncstate) + xs = new{T,N}(buf, dims, offset) Mem.retain(buf) finalizer(_safe_free!, xs) return xs end end -_safe_free!(xs::ROCArray) = _safe_free!(xs.buf) -function _safe_free!(buf::Mem.Buffer) - Mem.release(buf) - return -end - -unsafe_free!(xs::ROCArray) = Mem.free_if_live(xs.buf) +_safe_free!(xs::ROCArray) = Mem.release(xs.buf; stream=default_stream()) -wait!(x::ROCArray) = wait!(x.syncstate) -mark!(x::ROCArray, s) = mark!(x.syncstate, s) -wait!(xs::Vector{<:ROCArray}) = foreach(wait!, xs) -mark!(xs::Vector{<:ROCArray}, s) = foreach(x->mark!(x,s), xs) -wait!(xs::NTuple{N,<:ROCArray} where N) = foreach(wait!, xs) -mark!(xs::NTuple{N,<:ROCArray} where N, s) = foreach(x->mark!(x,s), xs) -function Adapt.adapt_storage(::Runtime.WaitAdaptor, x::ROCArray) - Runtime.wait!(x.syncstate) - x -end -function Adapt.adapt_storage(ma::Runtime.MarkAdaptor, x::ROCArray) - Runtime.mark!(x.syncstate, ma.s) - x -end +unsafe_free!(xs::ROCArray) = Mem.free_if_live(xs.buf; stream=stream()) """ - device(A::ROCArray) -> ROCDevice + device(A::ROCArray) -> HIPDevice Return the device associated with the array `A`. """ @@ -119,8 +96,8 @@ AnyROCVecOrMat{T} = Union{AnyROCVector{T}, AnyROCMatrix{T}} # type and dimensionality specified, accepting dims as tuples of Ints function ROCArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N} - buf = Mem.alloc(prod(dims)*sizeof(T)) - ROCArray{T,N}(buf, dims) + buf = Mem.HIPBuffer(prod(dims) * sizeof(T); stream=stream()) + ROCArray{T, N}(buf, dims) end # type and dimensionality specified, accepting dims as series of Ints @@ -134,14 +111,10 @@ ROCArray{T}(::UndefInitializer, dims::Integer...) where {T} = # from Base arrays function ROCArray{T,N}(x::Array{T,N}, dims::Dims{N}) where {T,N} r = ROCArray{T,N}(undef, dims) - Mem.upload!(r.buf, pointer(x), sizeof(x)) + Mem.upload!(r.buf, pointer(x), sizeof(x); stream=stream()) return r end -# type as first argument -# FIXME: Remove me! -#ROCArray(::Type{T}, dims::Dims{N}) where {T,N} = ROCArray{T,N}(undef, dims) - # empty vector constructor ROCArray{T,1}() where {T} = ROCArray{T,1}(undef, 0) @@ -152,7 +125,6 @@ Base.similar(::ROCArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} = ROCArray{T ## array interface Base.elsize(::Type{<:ROCArray{T}}) where {T} = sizeof(T) - Base.size(x::ROCArray) = x.dims Base.sizeof(x::ROCArray) = Base.elsize(x) * length(x) @@ -162,9 +134,9 @@ ROCArray{T,N}(x::AbstractArray{S,N}) where {T,N,S} = ROCArray{T,N}(convert(Array{T}, x), size(x)) # underspecified constructors +ROCArray(A::AbstractArray{T,N}) where {T,N} = ROCArray{T,N}(A) ROCArray{T}(xs::AbstractArray{S,N}) where {T,N,S} = ROCArray{T,N}(xs) (::Type{ROCArray{T,N} where T})(x::AbstractArray{S,N}) where {S,N} = ROCArray{S,N}(x) -ROCArray(A::AbstractArray{T,N}) where {T,N} = ROCArray{T,N}(A) # idempotency ROCArray{T,N}(xs::ROCArray{T,N}) where {T,N} = xs @@ -173,44 +145,47 @@ ROCArray{T,N}(xs::ROCArray{T,N}) where {T,N} = xs Base.convert(::Type{T}, x::T) where T <: ROCArray = x - ## memory operations -function Base.copyto!(dest::Array{T}, d_offset::Integer, - source::ROCArray{T}, s_offset::Integer, - amount::Integer) where T +function Base.copyto!( + dest::Array{T}, d_offset::Integer, + source::ROCArray{T}, s_offset::Integer, amount::Integer; + async::Bool = false, +) where T amount == 0 && return dest @boundscheck checkbounds(dest, d_offset+amount-1) @boundscheck checkbounds(source, s_offset+amount-1) - wait!(source) - Mem.download!(pointer(dest, d_offset), - Mem.view(source.buf, source.offset + (s_offset-1)*sizeof(T)), - amount*sizeof(T)) + strm = stream() + Mem.download!( + pointer(dest, d_offset), + Mem.view(source.buf, source.offset + (s_offset - 1) * sizeof(T)), + amount * sizeof(T); stream=strm) + async || AMDGPU.synchronize(strm) dest end -function Base.copyto!(dest::ROCArray{T}, d_offset::Integer, - source::Array{T}, s_offset::Integer, - amount::Integer) where T +function Base.copyto!( + dest::ROCArray{T}, d_offset::Integer, + source::Array{T}, s_offset::Integer, amount::Integer, +) where T amount == 0 && return dest @boundscheck checkbounds(dest, d_offset+amount-1) @boundscheck checkbounds(source, s_offset+amount-1) - wait!(dest) - Mem.upload!(Mem.view(dest.buf, dest.offset + (d_offset-1)*sizeof(T)), - pointer(source, s_offset), - amount*sizeof(T)) + Mem.upload!( + Mem.view(dest.buf, dest.offset + (d_offset - 1) * sizeof(T)), + pointer(source, s_offset), amount * sizeof(T); stream=stream()) dest end -function Base.copyto!(dest::ROCArray{T}, d_offset::Integer, - source::ROCArray{T}, s_offset::Integer, - amount::Integer) where T +function Base.copyto!( + dest::ROCArray{T}, d_offset::Integer, + source::ROCArray{T}, s_offset::Integer, amount::Integer, +) where T amount == 0 && return dest - @boundscheck checkbounds(dest, d_offset+amount-1) - @boundscheck checkbounds(source, s_offset+amount-1) - wait!(dest) - wait!(source) - Mem.transfer!(Mem.view(dest.buf, dest.offset + (d_offset-1)*sizeof(T)), - Mem.view(source.buf, source.offset + (s_offset-1)*sizeof(T)), - amount*sizeof(T)) + @boundscheck checkbounds(dest, d_offset + amount - 1) + @boundscheck checkbounds(source, s_offset + amount - 1) + Mem.transfer!( + Mem.view(dest.buf, dest.offset + (d_offset - 1) * sizeof(T)), + Mem.view(source.buf, source.offset + (s_offset - 1) * sizeof(T)), + amount * sizeof(T); stream=stream()) dest end @@ -221,13 +196,22 @@ function Base.copy(X::ROCArray{T}) where T Xnew end -function Base.unsafe_wrap(::Type{<:ROCArray}, ptr::Ptr{T}, dims::NTuple{N,<:Integer}; device=device(), lock::Bool=true) where {T,N} +function Base.unsafe_wrap( + ::Type{<:ROCArray}, ptr::Ptr{T}, dims::NTuple{N, <:Integer}; + lock::Bool = true, +) where {T,N} @assert isbitstype(T) "Cannot wrap a non-bitstype pointer as a ROCArray" + # TODO specialize ROCArray on a buffer type and pass HostBuffer. sz = prod(dims) * sizeof(T) - device_ptr = lock ? Mem.lock(ptr, sz, device) : ptr - buf = Mem.Buffer(device_ptr, Ptr{Cvoid}(ptr), device_ptr, sz, device, false, false) - return ROCArray{T, N}(buf, dims) + dptr = if lock + HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped) |> HIP.check + Mem.device_ptr(Mem.HostBuffer(ptr, sz)) + else + Ptr{Cvoid}(ptr) + end + return ROCArray{T, N}(Mem.HIPBuffer(dptr, sz), dims) end + Base.unsafe_wrap(::Type{ROCArray{T}}, ptr::Ptr, dims; kwargs...) where T = unsafe_wrap(ROCArray, Base.unsafe_convert(Ptr{T}, ptr), dims; kwargs...) @@ -274,7 +258,7 @@ end end @inline function unsafe_contiguous_view(a::ROCArray{T}, I::NTuple{N,Base.ViewIndex}, dims::NTuple{M,Integer}) where {T,N,M} offset = Base.compute_offset1(a, 1, I) * sizeof(T) - ROCArray{T,M}(a.buf, dims; offset=a.offset+offset, syncstate=a.syncstate) + ROCArray{T,M}(a.buf, dims; offset=a.offset + offset) end @inline function unsafe_view(A, I, ::NonContiguous) @@ -295,36 +279,9 @@ function Base.reshape(a::ROCArray{T,M}, dims::NTuple{N,Int}) where {T,N,M} if N == M && dims == size(a) return a end - ROCArray{T,N}(a.buf, dims; offset=a.offset, syncstate=a.syncstate) -end - - -## fft - -#= -using AbstractFFTs - -# defining our own plan type is the easiest way to pass around the plans in FFTW interface -# without ambiguities - -struct FFTPlan{T} - p::T + ROCArray{T,N}(a.buf, dims; offset=a.offset) end -AbstractFFTs.plan_fft(A::ROCArray; kw_args...) = FFTPlan(plan_fft(A.data; kw_args...)) -AbstractFFTs.plan_fft!(A::ROCArray; kw_args...) = FFTPlan(plan_fft!(A.data; kw_args...)) -AbstractFFTs.plan_bfft!(A::ROCArray; kw_args...) = FFTPlan(plan_bfft!(A.data; kw_args...)) -AbstractFFTs.plan_bfft(A::ROCArray; kw_args...) = FFTPlan(plan_bfft(A.data; kw_args...)) -AbstractFFTs.plan_ifft!(A::ROCArray; kw_args...) = FFTPlan(plan_ifft!(A.data; kw_args...)) -AbstractFFTs.plan_ifft(A::ROCArray; kw_args...) = FFTPlan(plan_ifft(A.data; kw_args...)) - -function Base.:(*)(plan::FFTPlan, A::ROCArray) - x = plan.p * A.data - ROCArray(x) -end -=# - - ## GPUArrays interfaces GPUArrays.device(x::ROCArray) = x.buf.device @@ -387,7 +344,7 @@ zeros(T::Type, dims...) = fill!(ROCArray{T}(undef, dims...), zero(T)) # create a derived array (reinterpreted or reshaped) that's still a ROCArray # TODO: Move this to GPUArrays? @inline function _derived_array(::Type{T}, N::Int, a::ROCArray, osize::Dims) where {T} - return ROCArray{T,N}(a.buf, osize; offset=a.offset, syncstate=a.syncstate) + return ROCArray{T,N}(a.buf, osize; offset=a.offset) end ## reinterpret @@ -514,14 +471,16 @@ Note that this operation is only supported on managed buffers, i.e., not on arrays that are created by `unsafe_wrap`. """ function Base.resize!(A::ROCVector{T}, n::Integer) where T - if A.buf.host_ptr != C_NULL - throw(ArgumentError("Cannot resize an unowned `ROCVector`")) - end + # TODO + # 1. Specialize ROCArray on storage type. + # 2. Check that it is not HostBuffer. + # if A.buf.host_ptr != C_NULL + # throw(ArgumentError("Cannot resize an unowned `ROCVector`")) + # end # TODO: add additional space to allow for quicker resizing - if n == length(A) - return A - end + n == length(A) && return A + maxsize = n * sizeof(T) bufsize = if Base.isbitsunion(T) # type tag array past the data @@ -529,15 +488,15 @@ function Base.resize!(A::ROCVector{T}, n::Integer) where T else maxsize end - new_buf = Mem.alloc(A.buf.device, bufsize) + new_buf = Mem.HIPBuffer(bufsize; stream=stream()) + copy_size = min(length(A), n) * sizeof(T) - wait!(A) if copy_size > 0 - Mem.transfer!(new_buf, A.buf, copy_size) + Mem.transfer!(new_buf, A.buf, copy_size; stream=stream()) end # Release old buffer - _safe_free!(A.buf) + _safe_free!(A) # N.B. Manually retain new buffer (this is normally done in ROCArray ctor) Mem.retain(new_buf) diff --git a/src/blas/rocBLAS.jl b/src/blas/rocBLAS.jl index 9fd6135c3..5cffd00ae 100644 --- a/src/blas/rocBLAS.jl +++ b/src/blas/rocBLAS.jl @@ -1,7 +1,7 @@ module rocBLAS using ..AMDGPU -import AMDGPU: wait!, mark!, librocblas, AnyROCArray +import AMDGPU: librocblas, AnyROCArray import AMDGPU: HandleCache, HIP, library_state import .HIP: HIPContext, HIPStream, hipContext_t, hipStream_t, hipEvent_t diff --git a/src/blas/wrappers.jl b/src/blas/wrappers.jl index 97fdcdf1d..1413bd231 100644 --- a/src/blas/wrappers.jl +++ b/src/blas/wrappers.jl @@ -47,39 +47,35 @@ end # Level 1 ## copy -for (fname, elty) in ((:rocblas_dcopy,:Float64), - (:rocblas_scopy,:Float32), - (:rocblas_zcopy,:ComplexF64), - (:rocblas_ccopy,:ComplexF32)) +for (fname, elty) in ( + (:rocblas_dcopy,:Float64), + (:rocblas_scopy,:Float32), + (:rocblas_zcopy,:ComplexF64), + (:rocblas_ccopy,:ComplexF32), +) @eval begin - function blascopy!(n::Integer, - DX::ROCArray{$elty}, - incx::Integer, - DY::ROCArray{$elty}, - incy::Integer) - wait!((DX,DY)) - (; handle, stream) = lib_state() - $(fname)(handle, n, DX, incx, DY, incy) |> check - mark!((DX,DY), stream) - DY + function blascopy!( + n::Integer, DX::ROCArray{$elty}, incx::Integer, + DY::ROCArray{$elty}, incy::Integer, + ) + (; handle, stream) = lib_state() + $(fname)(handle, n, DX, incx, DY, incy) |> check + DY end end end ## scal -for (fname, elty) in ((:rocblas_dscal,:Float64), - (:rocblas_sscal,:Float32), - (:rocblas_zscal,:ComplexF64), - (:rocblas_cscal,:ComplexF32)) +for (fname, elty) in ( + (:rocblas_dscal,:Float64), + (:rocblas_sscal,:Float32), + (:rocblas_zscal,:ComplexF64), + (:rocblas_cscal,:ComplexF32), +) @eval begin - function scal!(n::Integer, - DA::$elty, - DX::ROCArray{$elty}, - incx::Integer) - wait!(DX) + function scal!(n::Integer, DA::$elty, DX::ROCArray{$elty}, incx::Integer) (; handle, stream) = lib_state() $(fname)(handle, n, Ref(DA), DX, incx) |> check - mark!(DX, stream) DX end end @@ -88,14 +84,9 @@ end for (fname, elty, celty) in ((:rocblas_sscal, :Float32, :ComplexF32), (:rocblas_dscal, :Float64, :ComplexF64)) @eval begin - function scal!(n::Integer, - DA::$elty, - DX::ROCArray{$celty}, - incx::Integer) - wait!(DX) + function scal!(n::Integer, DA::$elty, DX::ROCArray{$celty}, incx::Integer) (; handle, stream) = lib_state() $(fname)(handle, 2*n, Ref(DA), DX, incx) |> check - mark!(DX, stream) DX end end @@ -109,13 +100,11 @@ for (jname, fname, elty) in ((:dot,:rocblas_ddot,:Float64), (:dotu,:rocblas_zdotu,:ComplexF64), (:dotu,:rocblas_cdotu,:ComplexF32)) @eval begin - function $jname(n::Integer, - DX::ROCArray{$elty}, - incx::Integer, - DY::ROCArray{$elty}, - incy::Integer) + function $jname( + n::Integer, DX::ROCArray{$elty}, incx::Integer, + DY::ROCArray{$elty}, incy::Integer, + ) result = Ref{$elty}() - wait!((DX,DY)) $(fname)(handle(), n, DX, incx, DY, incy, result) |> check return result[] end @@ -128,11 +117,8 @@ for (fname, elty, ret_type) in ((:rocblas_dnrm2,:Float64,:Float64), (:rocblas_dznrm2,:ComplexF64,:Float64), (:rocblas_scnrm2,:ComplexF32,:Float32)) @eval begin - function nrm2(n::Integer, - X::ROCArray{$elty}, - incx::Integer) + function nrm2(n::Integer, X::ROCArray{$elty}, incx::Integer) result = Ref{$ret_type}() - wait!(X) $(fname)(handle(), n, X, incx, result) |> check return result[] end @@ -147,11 +133,8 @@ for (fname, elty, ret_type) in ((:rocblas_dasum,:Float64,:Float64), (:rocblas_dzasum,:ComplexF64,:Float64), (:rocblas_scasum,:ComplexF32,:Float32)) @eval begin - function asum(n::Integer, - X::ROCArray{$elty}, - incx::Integer) + function asum(n::Integer, X::ROCArray{$elty}, incx::Integer) result = Ref{$ret_type}() - wait!(X) $(fname)(handle(), n, X, incx, result) |> check return result[] end @@ -165,16 +148,12 @@ for (fname, elty) in ((:rocblas_daxpy,:Float64), (:rocblas_zaxpy,:ComplexF64), (:rocblas_caxpy,:ComplexF32)) @eval begin - function axpy!(n::Integer, - alpha::($elty), - dx::ROCArray{$elty}, - incx::Integer, - dy::ROCArray{$elty}, - incy::Integer) - wait!((dx,dy)) + function axpy!( + n::Integer, alpha::($elty), dx::ROCArray{$elty}, incx::Integer, + dy::ROCArray{$elty}, incy::Integer, + ) (; handle, stream) = lib_state() $(fname)(handle, n, Ref(alpha), dx, incx, dy, incy) |> check - mark!((dx,dy), stream) dy end end @@ -188,8 +167,10 @@ function axpy!( if minimum(rx) < 1 || maximum(rx) > length(x) || minimum(ry) < 1 || maximum(ry) > length(y) throw(BoundsError()) end - axpy!(length(rx), convert(T, alpha), pointer(x)+(first(rx)-1)*sizeof(T), - step(rx), pointer(y)+(first(ry)-1)*sizeof(T), step(ry)) + axpy!( + length(rx), convert(T, alpha), + pointer(x) + (first(rx) - 1) * sizeof(T), step(rx), + pointer(y) + (first(ry) - 1) * sizeof(T), step(ry)) y end @@ -244,12 +225,10 @@ for (fname, elty) in ((:rocblas_dgemv,:Float64), (:rocblas_zgemv,:ComplexF64), (:rocblas_cgemv,:ComplexF32)) @eval begin - function gemv!(trans::Char, - alpha::($elty), - A::ROCMatrix{$elty}, - X::ROCVector{$elty}, - beta::($elty), - Y::ROCVector{$elty}) + function gemv!( + trans::Char, alpha::($elty), A::ROCMatrix{$elty}, X::ROCVector{$elty}, + beta::($elty), Y::ROCVector{$elty}, + ) # handle trans roctrans = rocblasop(trans) m,n = size(A) @@ -257,12 +236,9 @@ for (fname, elty) in ((:rocblas_dgemv,:Float64), length(X) == (trans == 'N' ? n : m) && length(Y) == (trans == 'N' ? m : n) || throw(DimensionMismatch("")) # compute increments lda = max(1,stride(A,2)) - incx = stride(X,1) - incy = stride(Y,1) - wait!((A,X,Y)) + incx, incy = stride(X,1), stride(Y,1) (; handle, stream) = lib_state() $(fname)(handle, roctrans, m, n, Ref(alpha), A, lda, X, incx, Ref(beta), Y, incy) |> check - mark!((A,X,Y), stream) Y end function gemv(trans::Char, alpha::($elty), A::ROCMatrix{$elty}, X::ROCVector{$elty}) @@ -280,48 +256,35 @@ for (fname, elty) in ((:rocblas_dgbmv,:Float64), (:rocblas_zgbmv,:ComplexF64), (:rocblas_cgbmv,:ComplexF32)) @eval begin - function gbmv!(trans::Char, - m::Integer, - kl::Integer, - ku::Integer, - alpha::($elty), - A::ROCMatrix{$elty}, - x::ROCVector{$elty}, - beta::($elty), - y::ROCVector{$elty}) + function gbmv!( + trans::Char, m::Integer, kl::Integer, ku::Integer, alpha::($elty), + A::ROCMatrix{$elty}, x::ROCVector{$elty}, beta::($elty), y::ROCVector{$elty}, + ) # handle trans roctrans = rocblasop(trans) - n = size(A,2) + n = size(A, 2) # check dimensions length(x) == (trans == 'N' ? n : m) && length(y) == (trans == 'N' ? m : n) || throw(DimensionMismatch("")) # compute increments - lda = max(1,stride(A,2)) - incx = stride(x,1) - incy = stride(y,1) - wait!((A,x,y)) + lda = max(1, stride(A, 2)) + incx, incy = stride(x, 1), stride(y, 1) (; handle, stream) = lib_state() $(fname)(handle, roctrans, m, n, kl, ku, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check - mark!((A,x,y), stream) y end - function gbmv(trans::Char, - m::Integer, - kl::Integer, - ku::Integer, - alpha::($elty), - A::ROCMatrix{$elty}, - x::ROCVector{$elty}) + function gbmv( + trans::Char, m::Integer, kl::Integer, ku::Integer, alpha::($elty), + A::ROCMatrix{$elty}, x::ROCVector{$elty}, + ) # TODO: fix gbmv bug in julia n = size(A,2) leny = trans == 'N' ? m : n gbmv!(trans, m, kl, ku, alpha, A, x, zero($elty), similar(x, $elty, leny)) end - function gbmv(trans::Char, - m::Integer, - kl::Integer, - ku::Integer, - A::ROCMatrix{$elty}, - x::ROCVector{$elty}) + function gbmv( + trans::Char, m::Integer, kl::Integer, ku::Integer, + A::ROCMatrix{$elty}, x::ROCVector{$elty}, + ) gbmv(trans, m, kl, ku, one($elty), A, x) end end @@ -334,23 +297,18 @@ for (fname, elty) in ((:rocblas_dsymv,:Float64), (:rocblas_csymv,:ComplexF32)) # Note that the complex symv are not BLAS but auiliary functions in LAPACK @eval begin - function symv!(uplo::Char, - alpha::($elty), - A::ROCMatrix{$elty}, - x::ROCVector{$elty}, - beta::($elty), - y::ROCVector{$elty}) + function symv!( + uplo::Char, alpha::($elty), A::ROCMatrix{$elty}, x::ROCVector{$elty}, + beta::($elty), y::ROCVector{$elty}, + ) rocuplo = rocblasfill(uplo) m, n = size(A) if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end if m != length(x) || m != length(y) throw(DimensionMismatch("")) end - lda = max(1,stride(A,2)) - incx = stride(x,1) - incy = stride(y,1) - wait!((A,x,y)) + lda = max(1, stride(A, 2)) + incx, incy = stride(x, 1), stride(y,1) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check - mark!((A,x,y), stream) y end function symv(uplo::Char, alpha::($elty), A::ROCMatrix{$elty}, x::ROCVector{$elty}) @@ -367,24 +325,19 @@ end for (fname, elty) in ((:rocblas_zhemv,:ComplexF64), (:rocblas_chemv,:ComplexF32)) @eval begin - function hemv!(uplo::Char, - alpha::$elty, - A::ROCMatrix{$elty}, - x::ROCVector{$elty}, - beta::$elty, - y::ROCVector{$elty}) + function hemv!( + uplo::Char, alpha::$elty, A::ROCMatrix{$elty}, x::ROCVector{$elty}, + beta::$elty, y::ROCVector{$elty}, + ) # TODO: fix dimension check bug in julia rocuplo = rocblasfill(uplo) m, n = size(A) if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end if m != length(x) || m != length(y) throw(DimensionMismatch("")) end - lda = max(1,stride(A,2)) - incx = stride(x,1) - incy = stride(y,1) - wait!((A,x,y)) + lda = max(1, stride(A, 2)) + incx, incy = stride(x, 1), stride(y, 1) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check - mark!((A,x,y), stream) y end function hemv(uplo::Char, alpha::($elty), A::ROCMatrix{$elty}, @@ -404,26 +357,20 @@ end for (fname, elty) in ((:rocblas_dsbmv,:Float64), (:rocblas_ssbmv,:Float32)) @eval begin - function sbmv!(uplo::Char, - k::Integer, - alpha::($elty), - A::ROCMatrix{$elty}, - x::ROCVector{$elty}, - beta::($elty), - y::ROCVector{$elty}) + function sbmv!( + uplo::Char, k::Integer, alpha::($elty), A::ROCMatrix{$elty}, + x::ROCVector{$elty}, beta::($elty), y::ROCVector{$elty}, + ) rocuplo = rocblasfill(uplo) m, n = size(A) #if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end if !(1<=(1+k)<=n) throw(DimensionMismatch("Incorrect number of bands")) end if m < 1+k throw(DimensionMismatch("Array A has fewer than 1+k rows")) end if n != length(x) || n != length(y) throw(DimensionMismatch("")) end - lda = max(1,stride(A,2)) - incx = stride(x,1) - incy = stride(y,1) - wait!((A,x,y)) + lda = max(1, stride(A, 2)) + incx, incy = stride(x, 1), stride(y, 1) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, k, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check - mark!((A,x,y), stream) y end function sbmv(uplo::Char, k::Integer, alpha::($elty), @@ -442,25 +389,19 @@ end for (fname, elty) in ((:rocblas_zhbmv,:ComplexF64), (:rocblas_chbmv,:ComplexF32)) @eval begin - function hbmv!(uplo::Char, - k::Integer, - alpha::($elty), - A::ROCMatrix{$elty}, - x::ROCVector{$elty}, - beta::($elty), - y::ROCVector{$elty}) + function hbmv!( + uplo::Char, k::Integer, alpha::($elty), A::ROCMatrix{$elty}, + x::ROCVector{$elty}, beta::($elty), y::ROCVector{$elty}, + ) rocuplo = rocblasfill(uplo) m, n = size(A) if !(1<=(1+k)<=n) throw(DimensionMismatch("Incorrect number of bands")) end if m < 1+k throw(DimensionMismatch("Array A has fewer than 1+k rows")) end if n != length(x) || n != length(y) throw(DimensionMismatch("")) end - lda = max(1,stride(A,2)) - incx = stride(x,1) - incy = stride(y,1) - wait!((A,x,y)) + lda = max(1,stride(A, 2)) + incx, incy = stride(x, 1), stride(y, 1) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, k, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check - mark!((A,x,y), stream) y end function hbmv(uplo::Char, k::Integer, alpha::($elty), @@ -481,12 +422,10 @@ for (fname, elty) in ((:rocblas_stbmv,:Float32), (:rocblas_ztbmv,:ComplexF64), (:rocblas_ctbmv,:ComplexF32)) @eval begin - function tbmv!(uplo::Char, - trans::Char, - diag::Char, - k::Integer, - A::ROCMatrix{$elty}, - x::ROCVector{$elty}) + function tbmv!( + uplo::Char, trans::Char, diag::Char, k::Integer, + A::ROCMatrix{$elty}, x::ROCVector{$elty}, + ) rocuplo = rocblasfill(uplo) roctrans = rocblasop(trans) rocdiag = rocblasdiag(diag) @@ -496,17 +435,14 @@ for (fname, elty) in ((:rocblas_stbmv,:Float32), if n != length(x) throw(DimensionMismatch("")) end lda = max(1,stride(A,2)) incx = stride(x,1) - wait!((A,x)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, rocdiag, n, k, A, lda, x, incx) |> check - mark!((A,x), stream) x end - function tbmv(uplo::Char, - trans::Char, - diag::Char, - A::ROCMatrix{$elty}, - x::ROCVector{$elty}) + function tbmv( + uplo::Char, trans::Char, diag::Char, + A::ROCMatrix{$elty}, x::ROCVector{$elty}, + ) tbmv!(uplo, trans, diag, A, copy(x)) end end @@ -518,12 +454,10 @@ for (fname, elty) in ((:rocblas_stbsv,:Float32), (:rocblas_ztbsv,:ComplexF64), (:rocblas_ctbsv,:ComplexF32)) @eval begin - function tbsv!(uplo::Char, - trans::Char, - diag::Char, - k::Integer, - A::ROCMatrix{$elty}, - x::ROCVector{$elty}) + function tbsv!( + uplo::Char, trans::Char, diag::Char, k::Integer, + A::ROCMatrix{$elty}, x::ROCVector{$elty}, + ) rocuplo = rocblasfill(uplo) roctrans = rocblasop(trans) rocdiag = rocblasdiag(diag) @@ -533,18 +467,14 @@ for (fname, elty) in ((:rocblas_stbsv,:Float32), if n != length(x) throw(DimensionMismatch("")) end lda = max(1,stride(A,2)) incx = stride(x,1) - wait!((A,x)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, rocdiag, n, k, A, lda, x, incx) |> check - mark!((A,x), stream) x end - function tbsv(uplo::Char, - trans::Char, - diag::Char, - k::Integer, - A::ROCMatrix{$elty}, - x::ROCVector{$elty}) + function tbsv( + uplo::Char, trans::Char, diag::Char, k::Integer, + A::ROCMatrix{$elty}, x::ROCVector{$elty}, + ) tbsv!(uplo, trans, diag, k, A, copy(x)) end end @@ -556,11 +486,9 @@ for (fname, elty) in ((:rocblas_dtrmv,:Float64), (:rocblas_ztrmv,:ComplexF64), (:rocblas_ctrmv,:ComplexF32)) @eval begin - function trmv!(uplo::Char, - trans::Char, - diag::Char, - A::ROCMatrix{$elty}, - x::ROCVector{$elty}) + function trmv!( + uplo::Char, trans::Char, diag::Char, A::ROCMatrix{$elty}, x::ROCVector{$elty}, + ) m, n = size(A) if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end if n != length(x) @@ -571,17 +499,11 @@ for (fname, elty) in ((:rocblas_dtrmv,:Float64), rocdiag = rocblasdiag(diag) lda = max(1,stride(A,2)) incx = stride(x,1) - wait!((A,x)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, rocdiag, n, A, lda, x, incx) |> check - mark!((A,x), stream) x end - function trmv(uplo::Char, - trans::Char, - diag::Char, - A::ROCMatrix{$elty}, - x::ROCVector{$elty}) + function trmv(uplo::Char, trans::Char, diag::Char, A::ROCMatrix{$elty}, x::ROCVector{$elty}) trmv!(uplo, trans, diag, A, copy(x)) end end @@ -593,11 +515,7 @@ for (fname, elty) in ((:rocblas_dtrsv,:Float64), (:rocblas_ztrsv,:ComplexF64), (:rocblas_ctrsv,:ComplexF32)) @eval begin - function trsv!(uplo::Char, - trans::Char, - diag::Char, - A::ROCMatrix{$elty}, - x::ROCVector{$elty}) + function trsv!(uplo::Char, trans::Char, diag::Char, A::ROCMatrix{$elty}, x::ROCVector{$elty}) m, n = size(A) if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end if n != length(x) @@ -608,17 +526,11 @@ for (fname, elty) in ((:rocblas_dtrsv,:Float64), rocdiag = rocblasdiag(diag) lda = max(1,stride(A,2)) incx = stride(x,1) - wait!((A,x)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, rocdiag, n, A, lda, x, incx) |> check - mark!((A,x), stream) x end - function trsv(uplo::Char, - trans::Char, - diag::Char, - A::ROCMatrix{$elty}, - x::ROCVector{$elty}) + function trsv(uplo::Char, trans::Char, diag::Char, A::ROCMatrix{$elty}, x::ROCVector{$elty}) trsv!(uplo, trans, diag, A, copy(x)) end end @@ -630,20 +542,15 @@ for (fname, elty) in ((:rocblas_dger,:Float64), (:rocblas_zgerc,:ComplexF64), (:rocblas_cgerc,:ComplexF32)) @eval begin - function ger!(alpha::$elty, - x::ROCVector{$elty}, - y::ROCVector{$elty}, - A::ROCMatrix{$elty}) + function ger!(alpha::$elty, x::ROCVector{$elty}, y::ROCVector{$elty}, A::ROCMatrix{$elty}) m, n = size(A) m == length(x) || throw(DimensionMismatch("")) n == length(y) || throw(DimensionMismatch("")) incx = stride(x,1) incy = stride(y,1) lda = max(1,stride(A,2)) - wait!((x,y,A)) (; handle, stream) = lib_state() $(fname)(handle, m, n, Ref(alpha), x, incx, y, incy, A, lda) |> check - mark!((x,y,A), stream) A end end @@ -656,20 +563,15 @@ for (fname, elty) in ((:rocblas_dsyr,:Float64), (:rocblas_zsyr,:ComplexF64), (:rocblas_csyr,:ComplexF32)) @eval begin - function syr!(uplo::Char, - alpha::$elty, - x::ROCVector{$elty}, - A::ROCMatrix{$elty}) + function syr!(uplo::Char, alpha::$elty, x::ROCVector{$elty}, A::ROCMatrix{$elty}) rocuplo = rocblasfill(uplo) m, n = size(A) m == n || throw(DimensionMismatch("Matrix A is $m by $n but must be square")) length(x) == n || throw(DimensionMismatch("Length of vector must be the same as the matrix dimensions")) incx = stride(x,1) lda = max(1,stride(A,2)) - wait!((x,A)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, Ref(alpha), x, incx, A, lda) |> check - mark!((x,A), stream) A end end @@ -679,20 +581,15 @@ end for (fname, elty) in ((:rocblas_zher,:ComplexF64), (:rocblas_cher,:ComplexF32)) @eval begin - function her!(uplo::Char, - alpha::$elty, - x::ROCVector{$elty}, - A::ROCMatrix{$elty}) + function her!(uplo::Char, alpha::$elty, x::ROCVector{$elty}, A::ROCMatrix{$elty}) rocuplo = rocblasfill(uplo) m, n = size(A) m == n || throw(DimensionMismatch("Matrix A is $m by $n but must be square")) length(x) == n || throw(DimensionMismatch("Length of vector must be the same as the matrix dimensions")) incx = stride(x,1) lda = max(1,stride(A,2)) - wait!((x,A)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, Ref(alpha), x, incx, A, lda) |> check - mark!((x,A), stream) A end end @@ -702,11 +599,10 @@ end for (fname, elty) in ((:rocblas_zher2,:ComplexF64), (:rocblas_cher2,:ComplexF32)) @eval begin - function her2!(uplo::Char, - alpha::$elty, - x::ROCVector{$elty}, - y::ROCVector{$elty}, - A::ROCMatrix{$elty}) + function her2!( + uplo::Char, alpha::$elty, x::ROCVector{$elty}, + y::ROCVector{$elty}, A::ROCMatrix{$elty}, + ) rocuplo = rocblasfill(uplo) m, n = size(A) m == n || throw(DimensionMismatch("Matrix A is $m by $n but must be square")) @@ -715,10 +611,8 @@ for (fname, elty) in ((:rocblas_zher2,:ComplexF64), incx = stride(x,1) incy = stride(y,1) lda = max(1,stride(A,2)) - wait!((x,y,A)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, n, Ref(alpha), x, incx, y, incy, A, lda) |> check - mark!((x,y,A), stream) A end end @@ -748,27 +642,21 @@ for (fname, elty) in lda = max(1, stride(A, 2)) ldb = max(1, stride(B, 2)) ldc = max(1, stride(C, 2)) - wait!((A, B, C)) (; handle, stream) = lib_state() $(fname)( handle, rocblasop(transA), rocblasop(transB), m, n, k, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check - mark!((A, B, C), stream) C end - function gemm(transA::Char, - transB::Char, - alpha::($elty), - A::ROCMatrix{$elty}, - B::ROCMatrix{$elty}) + function gemm( + transA::Char, transB::Char, alpha::($elty), + A::ROCMatrix{$elty}, B::ROCMatrix{$elty}, + ) gemm!(transA, transB, alpha, A, B, zero($elty), similar(B, $elty, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1)))) end - function gemm(transA::Char, - transB::Char, - A::ROCMatrix{$elty}, - B::ROCMatrix{$elty}) + function gemm(transA::Char, transB::Char, A::ROCMatrix{$elty}, B::ROCMatrix{$elty}) gemm(transA, transB, one($elty), A, B) end end @@ -866,7 +754,6 @@ for (fname, elty) in ) m, k, n, lda, ldb, ldc = _check_gemm_batched_dims( transA, transB, A, B, C) - wait!((A, B, C)) batch_count = size(C, 3) a_broadcast = (size(A, 3) == 1) && (batch_count > 1) @@ -880,7 +767,6 @@ for (fname, elty) in handle, rocblasop(transA), rocblasop(transB), m, n, k, Ref(alpha), Ab, lda, Bb, ldb, Ref(beta), Cb, ldc, batch_count) |> check - mark!((A, B, C), stream) C end function gemm_batched( @@ -920,17 +806,14 @@ for (fname, elty) in (:rocblas_zgemmStridedBatched,:ComplexF64), (:rocblas_cgemmStridedBatched,:ComplexF32)) @eval begin - function gemm_strided_batched!(transA::Char, - transB::Char, - alpha::($elty), - A::ROCArray{$elty, 3}, - B::ROCArray{$elty, 3}, - beta::($elty), - C::ROCArray{$elty, 3}) + function gemm_strided_batched!( + transA::Char, transB::Char, alpha::($elty), + A::ROCArray{$elty, 3}, B::ROCArray{$elty, 3}, + beta::($elty), C::ROCArray{$elty, 3}, + ) m = size(A, transA == 'N' ? 1 : 2) k = size(A, transA == 'N' ? 2 : 1) n = size(B, transB == 'N' ? 2 : 1) - @assert size(A, 3) == size(B, 3) == size(C, 3) "Batch size mismatch" if m != size(C,1) || n != size(C,2) || k != size(B, transB == 'N' ? 1 : 2) @@ -946,24 +829,20 @@ for (fname, elty) in strideB = stride(B, 3) strideC = stride(C, 3) batchCount = size(A, 3) - wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, roctransA, roctransB, m, n, k, Ref(alpha), A, lda, strideA, B, ldb, strideB, Ref(beta), C, ldc, strideC, batchCount) |> check - mark!((A,B,C), stream) C end - function gemm_strided_batched(transA::Char, - transB::Char, - alpha::($elty), - A::ROCArray{$elty, 3}, - B::ROCArray{$elty, 3}) + function gemm_strided_batched( + transA::Char, transB::Char, alpha::($elty), + A::ROCArray{$elty, 3}, B::ROCArray{$elty, 3}, + ) C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), size(A, 3))) - gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C ) + gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C) end - function gemm_strided_batched(transA::Char, - transB::Char, - A::ROCArray{$elty, 3}, - B::ROCArray{$elty, 3}) + function gemm_strided_batched( + transA::Char, transB::Char, A::ROCArray{$elty, 3}, B::ROCArray{$elty, 3}, + ) gemm_strided_batched(transA, transB, one($elty), A, B) end end @@ -976,13 +855,11 @@ for (fname, elty) in ((:rocblas_dsymm,:Float64), (:rocblas_csymm,:ComplexF32)) # TODO: fix julia dimension checks in symm! @eval begin - function symm!(side::Char, - uplo::Char, - alpha::($elty), - A::ROCMatrix{$elty}, - B::ROCMatrix{$elty}, - beta::($elty), - C::ROCMatrix{$elty}) + function symm!( + side::Char, uplo::Char, alpha::($elty), + A::ROCMatrix{$elty}, B::ROCMatrix{$elty}, + beta::($elty), C::ROCMatrix{$elty}, + ) rocside = rocblasside(side) rocuplo = rocblasfill(uplo) k, nA = size(A) @@ -995,23 +872,17 @@ for (fname, elty) in ((:rocblas_dsymm,:Float64), lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, rocside, rocuplo, m, n, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check - mark!((A,B,C), stream) C end - function symm(side::Char, - uplo::Char, - alpha::($elty), - A::ROCMatrix{$elty}, - B::ROCMatrix{$elty}) + function symm( + side::Char, uplo::Char, alpha::($elty), + A::ROCMatrix{$elty}, B::ROCMatrix{$elty}, + ) symm!(side, uplo, alpha, A, B, zero($elty), similar(B)) end - function symm(side::Char, - uplo::Char, - A::ROCMatrix{$elty}, - B::ROCMatrix{$elty}) + function symm(side::Char, uplo::Char, A::ROCMatrix{$elty}, B::ROCMatrix{$elty}) symm(side, uplo, one($elty), A, B) end end @@ -1022,113 +893,93 @@ for (fname, elty) in ((:rocblas_dsyrk,:Float64), (:rocblas_ssyrk,:Float32), (:rocblas_zsyrk,:ComplexF64), (:rocblas_csyrk,:ComplexF32)) - @eval begin - function syrk!(uplo::Char, - trans::Char, - alpha::($elty), - A::ROCVecOrMat{$elty}, - beta::($elty), - C::ROCMatrix{$elty}) - rocuplo = rocblasfill(uplo) - roctrans = rocblasop(trans) - mC, n = size(C) - if mC != n throw(DimensionMismatch("C must be square")) end - nn = size(A, trans == 'N' ? 1 : 2) - if nn != n throw(DimensionMismatch("syrk!")) end - k = size(A, trans == 'N' ? 2 : 1) - lda = max(1,stride(A,2)) - ldc = max(1,stride(C,2)) - wait!((A,C)) - (; handle, stream) = lib_state() - $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, Ref(beta), C, ldc) |> check - mark!((A,C), stream) - C + @eval begin + function syrk!( + uplo::Char, trans::Char, alpha::($elty), + A::ROCVecOrMat{$elty}, beta::($elty), C::ROCMatrix{$elty}, + ) + rocuplo = rocblasfill(uplo) + roctrans = rocblasop(trans) + mC, n = size(C) + if mC != n throw(DimensionMismatch("C must be square")) end + nn = size(A, trans == 'N' ? 1 : 2) + if nn != n throw(DimensionMismatch("syrk!")) end + k = size(A, trans == 'N' ? 2 : 1) + lda = max(1,stride(A,2)) + ldc = max(1,stride(C,2)) + (; handle, stream) = lib_state() + $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, Ref(beta), C, ldc) |> check + C end end end -function syrk(uplo::Char, - trans::Char, - alpha::Number, - A::ROCVecOrMat) +function syrk(uplo::Char, trans::Char, alpha::Number, A::ROCVecOrMat) T = eltype(A) n = size(A, trans == 'N' ? 1 : 2) syrk!(uplo, trans, convert(T,alpha), A, zero(T), similar(A, T, (n, n))) end -syrk(uplo::Char, trans::Char, A::ROCVecOrMat) = syrk(uplo, trans, - one(eltype(A)), - A) +syrk(uplo::Char, trans::Char, A::ROCVecOrMat) = syrk(uplo, trans, one(eltype(A)), A) ## hemm for (fname, elty) in ((:rocblas_zhemm,:ComplexF64), (:rocblas_chemm,:ComplexF32)) - @eval begin - function hemm!(side::Char, - uplo::Char, - alpha::($elty), - A::ROCMatrix{$elty}, - B::ROCMatrix{$elty}, - beta::($elty), - C::ROCMatrix{$elty}) - rocside = rocblasside(side) - rocuplo = rocblasfill(uplo) - mA, nA = size(A) - m, n = size(B) - mC, nC = size(C) - if mA != nA throw(DimensionMismatch("A must be square")) end - if ((m != mC) || (n != nC)) throw(DimensionMismatch("B and C must have same dimensions")) end - if ((side == 'L') && (mA != m)) throw(DimensionMismatch("")) end - if ((side == 'R') && (mA != n)) throw(DimensionMismatch("")) end - lda = max(1,stride(A,2)) - ldb = max(1,stride(B,2)) - ldc = max(1,stride(C,2)) - wait!((A,B,C)) - (; handle, stream) = lib_state() - $(fname)(handle, rocside, rocuplo, m, n, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check - mark!((A,B,C), stream) - C - end - function hemm(uplo::Char, - trans::Char, - alpha::($elty), - A::ROCMatrix{$elty}, - B::ROCMatrix{$elty}) - m,n = size(B) - hemm!( uplo, trans, alpha, A, B, zero($elty), similar(B, $elty, (m,n) ) ) - end - hemm( uplo::Char, trans::Char, A::ROCMatrix{$elty}, B::ROCMatrix{$elty}) = hemm( uplo, trans, one($elty), A, B) + @eval begin + function hemm!( + side::Char, uplo::Char, alpha::($elty), A::ROCMatrix{$elty}, + B::ROCMatrix{$elty}, beta::($elty), C::ROCMatrix{$elty}, + ) + rocside = rocblasside(side) + rocuplo = rocblasfill(uplo) + mA, nA = size(A) + m, n = size(B) + mC, nC = size(C) + if mA != nA throw(DimensionMismatch("A must be square")) end + if ((m != mC) || (n != nC)) throw(DimensionMismatch("B and C must have same dimensions")) end + if ((side == 'L') && (mA != m)) throw(DimensionMismatch("")) end + if ((side == 'R') && (mA != n)) throw(DimensionMismatch("")) end + lda = max(1,stride(A,2)) + ldb = max(1,stride(B,2)) + ldc = max(1,stride(C,2)) + (; handle, stream) = lib_state() + $(fname)(handle, rocside, rocuplo, m, n, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check + C + end + function hemm(uplo::Char, trans::Char, alpha::($elty), A::ROCMatrix{$elty}, B::ROCMatrix{$elty}) + m,n = size(B) + hemm!( uplo, trans, alpha, A, B, zero($elty), similar(B, $elty, (m,n) ) ) + end + hemm( uplo::Char, trans::Char, A::ROCMatrix{$elty}, B::ROCMatrix{$elty}) = + hemm( uplo, trans, one($elty), A, B) end end ## herk for (fname, elty) in ((:rocblas_zherk,:ComplexF64), (:rocblas_cherk,:ComplexF32)) - @eval begin - function herk!(uplo::Char, - trans::Char, - alpha::($elty), - A::ROCVecOrMat{$elty}, - beta::($elty), - C::ROCMatrix{$elty}) - rocuplo = rocblasfill(uplo) - roctrans = rocblasop(trans) - mC, n = size(C) - if mC != n throw(DimensionMismatch("C must be square")) end - nn = size(A, trans == 'N' ? 1 : 2) - if nn != n throw(DimensionMismatch("syrk!")) end - k = size(A, trans == 'N' ? 2 : 1) - lda = max(1,stride(A,2)) - ldc = max(1,stride(C,2)) - wait!((A,C)) - (; handle, stream) = lib_state() - $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, Ref(beta), C, ldc) |> check - mark!((A,C), stream) - C - end - function herk(uplo::Char, trans::Char, alpha::($elty), A::ROCVecOrMat{$elty}) - n = size(A, trans == 'N' ? 1 : 2) - herk!(uplo, trans, alpha, A, zero($elty), similar(A, $elty, (n,n))) - end - herk(uplo::Char, trans::Char, A::ROCVecOrMat{$elty}) = herk(uplo, trans, one($elty), A) + @eval begin + function herk!( + uplo::Char, trans::Char, alpha::($elty), A::ROCVecOrMat{$elty}, + beta::($elty), C::ROCMatrix{$elty}, + ) + rocuplo = rocblasfill(uplo) + roctrans = rocblasop(trans) + mC, n = size(C) + if mC != n throw(DimensionMismatch("C must be square")) end + nn = size(A, trans == 'N' ? 1 : 2) + if nn != n throw(DimensionMismatch("syrk!")) end + k = size(A, trans == 'N' ? 2 : 1) + lda = max(1,stride(A,2)) + ldc = max(1,stride(C,2)) + (; handle, stream) = lib_state() + $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, Ref(beta), C, ldc) |> check + C + end + function herk(uplo::Char, trans::Char, alpha::($elty), A::ROCVecOrMat{$elty}) + n = size(A, trans == 'N' ? 1 : 2) + herk!(uplo, trans, alpha, A, zero($elty), similar(A, $elty, (n,n))) + end + herk(uplo::Char, trans::Char, A::ROCVecOrMat{$elty}) = + herk(uplo, trans, one($elty), A) end end @@ -1138,13 +989,10 @@ for (fname, elty) in ((:rocblas_dsyr2k,:Float64), (:rocblas_zsyr2k,:ComplexF64), (:rocblas_csyr2k,:ComplexF32)) @eval begin - function syr2k!(uplo::Char, - trans::Char, - alpha::($elty), - A::ROCVecOrMat{$elty}, - B::ROCVecOrMat{$elty}, - beta::($elty), - C::ROCMatrix{$elty}) + function syr2k!( + uplo::Char, trans::Char, alpha::($elty), A::ROCVecOrMat{$elty}, + B::ROCVecOrMat{$elty}, beta::($elty), C::ROCMatrix{$elty}, + ) # TODO: check size of B in julia (syr2k!) rocuplo = rocblasfill(uplo) roctrans = rocblasop(trans) @@ -1160,36 +1008,28 @@ for (fname, elty) in ((:rocblas_dsyr2k,:Float64), lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check - mark!((A,B,C), stream) C end end end -function syr2k(uplo::Char, - trans::Char, - alpha::Number, - A::ROCVecOrMat, - B::ROCVecOrMat) +function syr2k(uplo::Char, trans::Char, alpha::Number, A::ROCVecOrMat, B::ROCVecOrMat) T = eltype(A) n = size(A, trans == 'N' ? 1 : 2) syr2k!(uplo, trans, convert(T,alpha), A, B, zero(T), similar(A, T, (n, n))) end -syr2k(uplo::Char, trans::Char, A::ROCVecOrMat, B::ROCVecOrMat) = syr2k(uplo, trans, one(eltype(A)), A, B) +syr2k(uplo::Char, trans::Char, A::ROCVecOrMat, B::ROCVecOrMat) = + syr2k(uplo, trans, one(eltype(A)), A, B) ## her2k for (fname, elty1, elty2) in ((:rocblas_zher2k,:ComplexF64,:Float64), (:rocblas_cher2k,:ComplexF32,:Float32)) - @eval begin - function her2k!(uplo::Char, - trans::Char, - alpha::($elty1), - A::ROCVecOrMat{$elty1}, - B::ROCVecOrMat{$elty1}, - beta::($elty2), - C::ROCMatrix{$elty1}) + @eval begin + function her2k!( + uplo::Char, trans::Char, alpha::($elty1), A::ROCVecOrMat{$elty1}, + B::ROCVecOrMat{$elty1}, beta::($elty2), C::ROCMatrix{$elty1}, + ) # TODO: check size of B in julia (her2k!) rocuplo = rocblasfill(uplo) roctrans = rocblasop(trans) @@ -1206,24 +1046,19 @@ for (fname, elty1, elty2) in ((:rocblas_zher2k,:ComplexF64,:Float64), lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check - mark!((A,B,C), stream) C - end - function her2k(uplo::Char, - trans::Char, - alpha::($elty1), - A::ROCVecOrMat{$elty1}, - B::ROCVecOrMat{$elty1}) + end + function her2k( + uplo::Char, trans::Char, alpha::($elty1), + A::ROCVecOrMat{$elty1}, B::ROCVecOrMat{$elty1}, + ) n = size(A, trans == 'N' ? 1 : 2) her2k!(uplo, trans, alpha, A, B, zero($elty2), similar(A, $elty1, (n,n))) - end - her2k(uplo::Char, - trans::Char, - A::ROCVecOrMat{$elty1}, - B::ROCVecOrMat{$elty1}) = her2k(uplo, trans, one($elty1), A, B) + end + her2k(uplo::Char, trans::Char, A::ROCVecOrMat{$elty1}, B::ROCVecOrMat{$elty1}) = + her2k(uplo, trans, one($elty1), A, B) end end @@ -1249,12 +1084,10 @@ for (mmname, smname, elty) in if nA != (side == 'L' ? m : n) throw(DimensionMismatch("trmm!")) end lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) - wait!((A,B)) (; handle, stream) = lib_state() $(mmname)( handle, rocside, rocuplo, roctransa, rocdiag, m, n, Ref(alpha), A, lda, B, ldb) |> check - mark!((A,B), stream) B end function trmm( @@ -1278,10 +1111,8 @@ for (mmname, smname, elty) in if nA != (side == 'L' ? m : n) throw(DimensionMismatch("trsm!")) end lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) - wait!((A,B)) (; handle, stream) = lib_state() $(smname)(handle, rocside, rocuplo, roctransa, rocdiag, m, n, Ref(alpha), A, lda, B, ldb) |> check - mark!((A,B), stream) B end function trsm( @@ -1300,13 +1131,10 @@ for (fname, elty) in (:rocblas_ztrsmBatched,:ComplexF64), (:rocblas_ctrsmBatched,:ComplexF32)) @eval begin - function trsm_batched!(side::Char, - uplo::Char, - transa::Char, - diag::Char, - alpha::($elty), - A::Array{ROCMatrix{$elty},1}, - B::Array{ROCMatrix{$elty},1}) + function trsm_batched!( + side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty), + A::Array{ROCMatrix{$elty},1}, B::Array{ROCMatrix{$elty},1}, + ) rocside = rocblasside(side) rocuplo = rocblasfill(uplo) roctransa = rocblasop(transa) @@ -1325,19 +1153,14 @@ for (fname, elty) in ldb = max(1,stride(B[1],2)) Aptrs = device_batch(A) Bptrs = device_batch(B) - wait!((A,B)) (; handle, stream) = lib_state() $(fname)(handle, rocside, rocuplo, roctransa, rocdiag, m, n, Ref(alpha), Aptrs, lda, Bptrs, ldb, length(A)) |> check - mark!((A,B), stream) B end - function trsm_batched(side::Char, - uplo::Char, - transa::Char, - diag::Char, - alpha::($elty), - A::Array{ROCMatrix{$elty},1}, - B::Array{ROCMatrix{$elty},1}) + function trsm_batched( + side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty), + A::Array{ROCMatrix{$elty},1}, B::Array{ROCMatrix{$elty},1}, + ) trsm_batched!(side, uplo, transa, diag, alpha, A, copy(B) ) end end @@ -1352,14 +1175,11 @@ for (fname, elty) in ((:rocblas_dgeam,:Float64), (:rocblas_sgeam,:Float32), (:rocblas_zgeam,:ComplexF64), (:rocblas_cgeam,:ComplexF32)) - @eval begin - function geam!(transa::Char, - transb::Char, - alpha::($elty), - A::ROCMatrix{$elty}, - beta::($elty), - B::ROCMatrix{$elty}, - C::ROCMatrix{$elty}) + @eval begin + function geam!( + transa::Char, transb::Char, alpha::($elty), A::ROCMatrix{$elty}, + beta::($elty), B::ROCMatrix{$elty}, C::ROCMatrix{$elty}, + ) roctransa = rocblasop(transa) roctransb = rocblasop(transb) mA, nA = size(A) @@ -1372,18 +1192,14 @@ for (fname, elty) in ((:rocblas_dgeam,:Float64), lda = max(1,stride(A,2)) ldb = max(1,stride(B,2)) ldc = max(1,stride(C,2)) - wait!((A,B,C)) (; handle, stream) = lib_state() $(fname)(handle, roctransa, roctransb, m, n, Ref(alpha), A, lda, Ref(beta), B, ldb, C, ldc) |> check - mark!((A,B,C), stream) C - end - function geam(transa::Char, - transb::Char, - alpha::($elty), - A::ROCMatrix{$elty}, - beta::($elty), - B::ROCMatrix{$elty}) + end + function geam( + transa::Char, transb::Char, alpha::($elty), A::ROCMatrix{$elty}, + beta::($elty), B::ROCMatrix{$elty}, + ) m,n = size(B) if ((transb == 'T' || transb == 'C')) geam!( transa, transb, alpha, A, beta, B, similar(B, $elty, (n,m) ) ) @@ -1404,8 +1220,7 @@ for (fname, elty) in (:rocblas_zgetrfBatched,:ComplexF64), (:rocblas_cgetrfBatched,:ComplexF32)) @eval begin - function getrf_batched!(A::Array{ROCMatrix{$elty},1}, - Pivot::Bool) + function getrf_batched!(A::Array{ROCMatrix{$elty},1}, Pivot::Bool) for As in A m,n = size(As) if m != n @@ -1417,18 +1232,14 @@ for (fname, elty) in Aptrs = device_batch(A) info = ROCArray{Cint}(undef, length(A)) pivotArray = Pivot ? ROCArray{Int32}(undef, (n, length(A))) : C_NULL - wait!(A) (; handle, stream) = lib_state() $(fname)(handle, n, Aptrs, lda, pivotArray, info, length(A)) |> check if( !Pivot ) pivotArray = ROCArray(zeros(Cint, (n, length(A)))) end - mark!((A, info), stream) - pivotArray != C_NULL && mark!(pivotArray, stream) pivotArray, info, A end - function getrf_batched(A::Array{ROCMatrix{$elty},1}, - Pivot::Bool) + function getrf_batched(A::Array{ROCMatrix{$elty},1}, Pivot::Bool) newA = copy(A) pivotarray, info = getrf_batched!(newA, Pivot) pivotarray, info, newA @@ -1444,8 +1255,7 @@ for (fname, elty) in (:rocblas_zgetriBatched,:ComplexF64), (:rocblas_cgetriBatched,:ComplexF32)) @eval begin - function getri_batched(A::Array{ROCMatrix{$elty},1}, - pivotArray::ROCMatrix{Cint}) + function getri_batched(A::Array{ROCMatrix{$elty},1}, pivotArray::ROCMatrix{Cint}) for As in A m,n = size(As) if m != n @@ -1459,11 +1269,8 @@ for (fname, elty) in Aptrs = device_batch(A) Cptrs = device_batch(C) info = ROCArray(zeros(Cint,length(A))) - wait!(A) - wait!(pivotArray) (; handle, stream) = lib_state() $(fname)(handle, n, Aptrs, lda, pivotArray, Cptrs, ldc, info, length(A)) |> check - mark!((A, pivotArray, info, C), stream) pivotArray, info, C end end @@ -1494,10 +1301,8 @@ for (fname, elty) in Aptrs = device_batch(A) Cptrs = device_batch(C) info = ROCArray(zeros(Cint,length(A))) - wait!(A) (; handle, stream) = lib_state() $(fname)(handle, n, Aptrs, lda, Cptrs, ldc, info, length(A)) |> check - mark!((A, info, C), stream) info, C end end @@ -1522,13 +1327,11 @@ for (fname, elty) in end Tauptrs = device_batch(TauArray) info = zero(Cint) - wait!(A) (; handle, stream) = lib_state() $(fname)(handle, m, n, Aptrs, lda, Tauptrs, Ref(info), length(A)) |> check if( info != 0 ) throw(ArgumentError,string("Invalid value at ",-info)) end - mark!((A, TauArray), stream) TauArray, A end function geqrf_batched(A::Array{ROCMatrix{$elty},1}) @@ -1545,9 +1348,9 @@ for (fname, elty) in (:rocblas_zgelsBatched,:ComplexF64), (:rocblas_cgelsBatched,:ComplexF32)) @eval begin - function gels_batched!(trans::Char, - A::Array{ROCMatrix{$elty},1}, - C::Array{ROCMatrix{$elty},1}) + function gels_batched!( + trans::Char, A::Array{ROCMatrix{$elty},1}, C::Array{ROCMatrix{$elty},1}, + ) roctrans = rocblasop(trans) if( length(A) != length(C) ) throw(DimensionMismatch("")) @@ -1570,21 +1373,15 @@ for (fname, elty) in Cptrs = device_batch(C) info = zero(Cint) infoarray = ROCArray(zeros(Cint, length(A))) - wait!(A) - wait!(C) (; handle, stream) = lib_state() $(fname)(handle, roctrans, m, n, nrhs, Aptrs, lda, Cptrs, ldc, Ref(info), infoarray, length(A)) |> check if( info != 0 ) throw(ArgumentError,string("Invalid value at ",-info)) end - mark!((A, C, infoarray), stream) A, C, infoarray end - function gels_batched(trans::Char, - A::Array{ROCMatrix{$elty},1}, - C::Array{ROCMatrix{$elty},1}) + gels_batched(trans::Char, A::Array{ROCMatrix{$elty},1}, C::Array{ROCMatrix{$elty},1}) = gels_batched!(trans, copy(A), copy(C)) - end end end @@ -1594,10 +1391,9 @@ for (fname, elty) in ((:rocblas_ddgmm,:Float64), (:rocblas_zdgmm,:ComplexF64), (:rocblas_cdgmm,:ComplexF32)) @eval begin - function dgmm!(mode::Char, - A::ROCMatrix{$elty}, - X::ROCVector{$elty}, - C::ROCMatrix{$elty}) + function dgmm!( + mode::Char, A::ROCMatrix{$elty}, X::ROCVector{$elty}, C::ROCMatrix{$elty}, + ) rocside = rocblasside(mode) m, n = size(C) mA, nA = size(A) @@ -1608,15 +1404,11 @@ for (fname, elty) in ((:rocblas_ddgmm,:Float64), lda = max(1,stride(A,2)) incx = stride(X,1) ldc = max(1,stride(C,2)) - wait!((A,X,C)) (; handle, stream) = lib_state() $(fname)(handle, rocside, m, n, A, lda, X, incx, C, ldc) |> check - mark!((A,X,C), stream) C end - function dgmm(mode::Char, - A::ROCMatrix{$elty}, - X::ROCVector{$elty}) + function dgmm(mode::Char, A::ROCMatrix{$elty}, X::ROCVector{$elty}) m,n = size(A) dgmm!( mode, A, X, similar(A, $elty, (m,n) ) ) end diff --git a/src/cache.jl b/src/cache.jl index 82a546aca..beda5fc83 100644 --- a/src/cache.jl +++ b/src/cache.jl @@ -115,7 +115,7 @@ function library_state( state = get!(() -> new_state(tls), states, tls.context) @noinline function update_stream(tls, state) - set_stream(new_handle, tls.stream) + set_stream(state.handle, tls.stream) return (; state.handle, tls.stream) end if state.stream != tls.stream diff --git a/src/compiler/codegen.jl b/src/compiler/codegen.jl index d4751a3d1..bcfb1241e 100644 --- a/src/compiler/codegen.jl +++ b/src/compiler/codegen.jl @@ -1,257 +1,169 @@ -import .Device: ExceptionEntry, HostCall +struct HIPCompilerParams <: AbstractCompilerParams end -## GPUCompiler interface +const HIPCompilerConfig = CompilerConfig{GCNCompilerTarget, HIPCompilerParams} +const HIPCompilerJob = CompilerJob{GCNCompilerTarget, HIPCompilerParams} -struct ROCCompilerParams <: AbstractCompilerParams - device::ROCDevice - global_hooks::NamedTuple -end - -const ROCCompilerConfig = CompilerConfig{GCNCompilerTarget, ROCCompilerParams} -const ROCCompilerJob = CompilerJob{GCNCompilerTarget, ROCCompilerParams} - -# Caches for GPUCompiler. -const _compiler_cache = Dict{ROCDevice, Dict{UInt, Any}}() -const _compiler_configs = Dict{UInt, ROCCompilerConfig}() -const _kernel_instances = Dict{UInt, Any}() # HostKernel +const _hip_compiler_cache = Dict{HIP.HIPDevice, Dict{Any, HIP.HIPFunction}}() -function compiler_cache(dev::ROCDevice) - get!(() -> Dict{UInt, Any}(), _compiler_cache, dev) -end - -function compiler_config(dev::ROCDevice; kwargs...) - h = hash(dev, hash(kwargs)) - get!(() -> _compiler_config(dev; kwargs...), _compiler_configs, h) -end -function _compiler_config( - dev::ROCDevice; global_hooks, kernel::Bool = true, name=nothing, - always_inline=true, kwargs..., -) - isa = AMDGPU.default_isa(dev) - dev_isa, features = Runtime.llvm_arch_features(isa) +# hash(fun, hash(f, hash(tt))) => HIPKernel +const _kernel_instances = Dict{UInt, Runtime.HIPKernel}() - target = GCNCompilerTarget(; dev_isa, features) - params = ROCCompilerParams(dev, global_hooks) - CompilerConfig(target, params; kernel, name, always_inline) +function compiler_cache(dev::HIP.HIPDevice) + get!(() -> Dict{UInt, Any}(), _hip_compiler_cache, dev) end -GPUCompiler.runtime_module(@nospecialize(::ROCCompilerJob)) = AMDGPU +GPUCompiler.runtime_module(@nospecialize(::HIPCompilerJob)) = AMDGPU -GPUCompiler.ci_cache(@nospecialize(::ROCCompilerJob)) = AMDGPU.ci_cache +GPUCompiler.ci_cache(@nospecialize(::HIPCompilerJob)) = AMDGPU.ci_cache -GPUCompiler.method_table(@nospecialize(::ROCCompilerJob)) = AMDGPU.method_table +GPUCompiler.method_table(@nospecialize(::HIPCompilerJob)) = AMDGPU.method_table -# filter out functions from device libs -GPUCompiler.isintrinsic(@nospecialize(job::ROCCompilerJob), fn::String) = - invoke(GPUCompiler.isintrinsic, - Tuple{CompilerJob{GCNCompilerTarget}, typeof(fn)}, - job, fn) || - startswith(fn, "rocm") - -function GPUCompiler.process_module!(@nospecialize(job::ROCCompilerJob), mod::LLVM.Module) - invoke(GPUCompiler.process_module!, - Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod)}, - job, mod) - # Run this early (before optimization) to ensure we link OCKL - emit_exception_user!(mod) -end -function GPUCompiler.process_entry!(@nospecialize(job::ROCCompilerJob), mod::LLVM.Module, entry::LLVM.Function) - invoke(GPUCompiler.process_entry!, - Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod), typeof(entry)}, - job, mod, entry) - # Workaround for the lack of zeroinitializer support for LDS - zeroinit_lds!(mod, entry) -end -function GPUCompiler.finish_module!(@nospecialize(job::ROCCompilerJob), mod::LLVM.Module) - invoke(GPUCompiler.finish_module!, - Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod)}, - job, mod) - delete_exception_user!(mod) -end +GPUCompiler.kernel_state_type(@nospecialize(::HIPCompilerJob)) = AMDGPU.KernelState function GPUCompiler.link_libraries!( - @nospecialize(job::ROCCompilerJob), mod::LLVM.Module, + @nospecialize(job::HIPCompilerJob), mod::LLVM.Module, undefined_fns::Vector{String}, ) + # @show undefined_fns invoke(GPUCompiler.link_libraries!, - Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod), typeof(undefined_fns)}, - job, mod, undefined_fns) + Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod), typeof(undefined_fns)}, + job, mod, undefined_fns) + link_device_libs!(job.config.target, mod) +end + +function GPUCompiler.finish_ir!( + @nospecialize(job::HIPCompilerJob), mod::LLVM.Module, entry::LLVM.Function, +) + # @show collect(GPUCompiler.decls(mod)) + # TODO fixx link_device_libs!(job.config.target, mod) + return entry end -const rocfunction_lock = ReentrantLock() +function GPUCompiler.finish_module!( + @nospecialize(job::HIPCompilerJob), mod::LLVM.Module, entry::LLVM.Function, +) + entry = invoke(GPUCompiler.finish_module!, + Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod), typeof(entry)}, + job, mod, entry) + + # Workaround for the lack of zeroinitializer support for LDS. + zeroinit_lds!(mod, entry) + + # Force-inline exception-related functions. + # LLVM gets confused when not all functions are inlined, + # causing huge scratch memory usage. + # And GPUCompiler fails to inline all functions without forcing + # always-inline attributes on them. Add them here. + target_fns = ( + "signal_exception", "report_exception", "malloc", "__throw_") + inline_attr = EnumAttribute("alwaysinline") + for fn in LLVM.functions(mod) + any(occursin.(target_fns, LLVM.name(fn))) || continue + attrs = LLVM.function_attributes(fn) + inline_attr ∈ collect(attrs) || push!(attrs, inline_attr) + end + + return entry +end -""" - rocfunction(f, tt=Tuple{}; kwargs...) +function compiler_config( + dev::HIP.HIPDevice; kernel::Bool = true, + name::Union{String, Nothing} = nothing, always_inline::Bool = true, +) + hsa_isa = AMDGPU.default_isa(dev) + dev_isa, features = hsa_isa.arch_features -Low-level interface to compile a function invocation for the currently-active -GPU, returning a callable kernel object. For a higher-level interface, use -[`@roc`](@ref). + target = GCNCompilerTarget(; dev_isa, features) + params = HIPCompilerParams() + CompilerConfig(target, params; kernel, name, always_inline) +end -The following keyword arguments are supported: -- `name`: overrides the name that the kernel will have in the generated code -- `device`: chooses which device to compile the kernel for -- `global_hooks`: specifies maps from global variable name to initializer hook +const hipfunction_lock = ReentrantLock() -The output of this function is automatically cached, i.e. you can simply call -`rocfunction` in a hot path without degrading performance. New code will be -generated automatically, when function definitions change, or when different -types or keyword arguments are provided. -""" -function rocfunction( - f::F, tt::Type = Tuple{}; device::ROCDevice = AMDGPU.device(), - global_hooks = NamedTuple(), kwargs..., -) where {F <: Core.Function} - Base.@lock rocfunction_lock begin - @debug "Compiling $f($(join(tt.parameters, ", ")))" - Runtime.@log_start(:cached_compile, (;f=F, tt), nothing) +function hipfunction(f::F, tt::TT = Tuple{}; kwargs...) where {F <: Core.Function, TT} + Base.@lock hipfunction_lock begin + dev = AMDGPU.device() + cache = compiler_cache(dev) + config = compiler_config(dev; kwargs...) - cache = compiler_cache(device) - config = compiler_config(device; global_hooks, kwargs...) + source = methodinstance(F, tt) fun = GPUCompiler.cached_compilation( - cache, config, F, tt, compile, link)::ROCFunction + cache, source, config, hipcompile, hiplink) h = hash(fun, hash(f, hash(tt))) - kernel = get(_kernel_instances, h, Runtime.HostKernel{F,tt}(f, fun.mod, fun)) - Runtime.@log_finish(:cached_compile, (;f=F, tt), nothing) - return kernel::Runtime.HostKernel{F,tt} + kernel = get!(_kernel_instances, h) do + Runtime.HIPKernel{F, tt}(f, fun) + end + return kernel::Runtime.HIPKernel{F, tt} end end -# compile to GCN -function compile(@nospecialize(job::CompilerJob)) - Runtime.@log_start(:compile, (;fspec=job.source.specTypes), nothing) - JuliaContext() do ctx - obj, meta = GPUCompiler.compile(:obj, job; ctx) - # Find undefined globals and calculate sizes. - globals = map( - gbl -> Symbol(LLVM.name(gbl)) => llvmsize(eltype(value_type(gbl))), - filter!(isextinit, collect(LLVM.globals(meta.ir)))) - entry = LLVM.name(meta.entry) +function create_executable(obj) + lld = if AMDGPU.lld_artifact + `$(LLD_jll.lld()) -flavor gnu` + else + @assert !isempty(AMDGPU.lld_path) "ld.lld was not found; cannot link kernel" + `$(AMDGPU.lld_path)` + end - Runtime.@log_finish(:compile, (;fspec=job.source.specTypes), nothing) - return (; obj, entry, globals) + path_exe = mktemp() do path_o, io_o + write(io_o, obj) + flush(io_o) + path_exe = path_o * ".exe" + run(`$lld -shared -o $path_exe $path_o`) + path_exe end + return read(path_exe) end -function link(@nospecialize(job::CompilerJob), compiled) - Runtime.@log_start(:link, (;fspec=job.source.specTypes), nothing) - device = job.config.params.device - global_hooks = job.config.params.global_hooks - (;obj, entry, globals) = compiled - # create executable and kernel - obj = codeunits(obj) - exe = AMDGPU.create_executable(device, entry, obj; globals=globals) - mod = ROCModule(exe) - fun = ROCFunction(mod, entry, hash(job.source, UInt64(0))) - - # initialize globals from hooks - for gname in first.(globals) - hook = nothing - if haskey(default_global_hooks, gname) - hook = default_global_hooks[gname] - elseif haskey(global_hooks, gname) - hook = global_hooks[gname] - end - if hook !== nothing - @debug "Initializing global $gname" - Runtime.@log_start(:global_init, (;fspec=job.source.specTypes, gname), nothing) - gbl = Runtime.get_global(exe, gname) - hook(gbl, mod, device) - Runtime.@log_finish(:global_init, (;fspec=job.source.specTypes, gname), nothing) - else - @debug "Uninitialized global $gname" - continue - end +function hipcompile(@nospecialize(job::CompilerJob)) + obj, meta = JuliaContext() do ctx + GPUCompiler.compile(:obj, job) end - Runtime.@log_finish(:link, (;fspec=job.source.specTypes), nothing) - return fun -end + entry = LLVM.name(meta.entry) + globals = filter(isextinit, collect(LLVM.globals(meta.ir))) .|> LLVM.name -function zeroinit_lds!(mod::LLVM.Module, entry::LLVM.Function) - if LLVM.callconv(entry) != LLVM.API.LLVMAMDGPUKERNELCallConv - return entry + global_hostcall_names = ( + :malloc_hostcall, :free_hostcall, :print_hostcall, :printf_hostcall) + global_hostcalls = Symbol[] + for gbl in LLVM.globals(meta.ir), gbl_name in global_hostcall_names + occursin("__$gbl_name", LLVM.name(gbl)) || continue + push!(global_hostcalls, gbl_name) end - to_init = [] - for gbl in LLVM.globals(mod) - if startswith(LLVM.name(gbl), "__zeroinit") - as = LLVM.addrspace(value_type(gbl)) - if as == AMDGPU.Device.AS.Local - push!(to_init, gbl) - end - end + if !isempty(global_hostcalls) + @warn """Global hostcalls detected: $global_hostcalls. + Use `AMDGPU.synchronize(; blocking=false)` to synchronize and stop them. + Otherwise, performance might degrade. + """ maxlog=1 end - if length(to_init) > 0 - ctx = LLVM.context(mod) - T_void = LLVM.VoidType(ctx) - LLVM.@dispose builder=LLVM.IRBuilder(ctx) begin - # Make these the first operations we do - position!(builder, first(LLVM.instructions(first(LLVM.blocks(entry))))) - # Use memset to clear all values to 0 - for gbl in to_init - sz = llvmsize(eltype(value_type(gbl))) - if sz > 0 - LLVM.memset!(builder, gbl, ConstantInt(UInt8(0); ctx), ConstantInt(sz; ctx), LLVM.alignment(gbl)) - end - end + if !isempty(globals) + @warn """ + HIP backend does not support setting extinit globals. + But kernel `$entry` has following: + $globals - # Synchronize the workgroup to prevent races - sync_ft = LLVM.FunctionType(LLVM.VoidType(ctx)) - sync_f = LLVM.Function(mod, LLVM.Intrinsic("llvm.amdgcn.s.barrier")) - call!(builder, sync_ft, sync_f) - end + Compilation will likely fail. + """ end - - return entry + (; obj=create_executable(codeunits(obj)), entry, global_hostcalls) end -## exception codegen -# emit a global variable for storing the current exception status -function emit_exception_user!(mod::LLVM.Module) - # add a fake user for __ockl_hsa_signal_store and __ockl_hsa_signal_load - if !haskey(LLVM.functions(mod), "__fake_global_exception_flag_user") - ctx = LLVM.context(mod) - ft = LLVM.FunctionType(LLVM.VoidType(ctx)) - fn = LLVM.Function(mod, "__fake_global_exception_flag_user", ft) - IRBuilder(ctx) do builder - entry = BasicBlock(fn, "entry"; ctx) - position!(builder, entry) - T_nothing = LLVM.VoidType(ctx) - T_i32 = LLVM.Int32Type(ctx) - T_i64 = LLVM.Int64Type(ctx) - - T_signal_store = LLVM.FunctionType(T_nothing, [T_i64, T_i64, T_i32]) - signal_store = LLVM.Function(mod, "__ockl_hsa_signal_store", T_signal_store) - call!(builder, T_signal_store, signal_store, - [ConstantInt(0; ctx), ConstantInt(0; ctx), - #= __ATOMIC_RELEASE == 3 =# - ConstantInt(Int32(3); ctx)]) - - T_signal_load = LLVM.FunctionType(T_i64, [T_i64, T_i32]) - signal_load = LLVM.Function(mod, "__ockl_hsa_signal_load", T_signal_load) - loaded_value = call!(builder, T_signal_load, signal_load, - [ConstantInt(0; ctx), - #= __ATOMIC_ACQUIRE == 2 =# - ConstantInt(Int32(2); ctx)]) +function hiplink(@nospecialize(job::CompilerJob), compiled) + (; obj, entry, global_hostcalls) = compiled + mod = HIP.HIPModule(obj) + HIP.HIPFunction(mod, entry, global_hostcalls) +end - T_signal_cas = LLVM.FunctionType(T_i64, [T_i64, T_i64, T_i64, T_i32]) - signal_cas = LLVM.Function(mod, "__ockl_hsa_signal_cas", T_signal_cas) - loaded_value = call!(builder, T_signal_cas, signal_cas, - [ConstantInt(0; ctx), ConstantInt(0; ctx), ConstantInt(0; ctx), - #= __ATOMIC_ACQ_REL == 4 =# - ConstantInt(Int32(4); ctx)]) +function run_and_collect(cmd) + stdout = Pipe() + proc = run(pipeline(ignorestatus(cmd); stdout, stderr=stdout), wait=false) + close(stdout.in) - ret!(builder) - end - end - @assert haskey(LLVM.functions(mod), "__fake_global_exception_flag_user") -end -function delete_exception_user!(mod::LLVM.Module) - fns = LLVM.functions(mod) - if haskey(fns, "__fake_global_exception_flag_user") - unsafe_delete!(mod, fns["__fake_global_exception_flag_user"]) - end - @assert !haskey(LLVM.functions(mod), "__fake_global_exception_flag_user") + reader = Threads.@spawn String(read(stdout)) + Base.wait(proc) + log = strip(fetch(reader)) + return proc, log end diff --git a/src/compiler/device-libs.jl b/src/compiler/device_libs.jl similarity index 73% rename from src/compiler/device-libs.jl rename to src/compiler/device_libs.jl index 9837e118a..ad89d9870 100644 --- a/src/compiler/device-libs.jl +++ b/src/compiler/device_libs.jl @@ -1,43 +1,37 @@ ## ROCm device library -import AMDGPU: device_libs_path +import AMDGPU: libdevice_libs function load_and_link!(mod, path) - ctx = LLVM.context(mod) - lib = parse(LLVM.Module, read(path); ctx) + lib = parse(LLVM.Module, read(path)) for f in LLVM.functions(lib) # FIXME: We should be able to inline this, that we can't means # we are inserting calls to it late. - name = LLVM.name(f) - name == "__ockl_hsa_signal_store" && continue - name == "__ockl_hsa_signal_load" && continue - startswith(name, "__ockl_hsa_signal") && continue + startswith(LLVM.name(f), "__ockl_hsa_signal") && continue + attrs = function_attributes(f) inline = true - noinline_attr = EnumAttribute("noinline"; ctx) + noinline_attr = EnumAttribute("noinline") for attr in collect(attrs) if kind(attr) == kind(noinline_attr) inline = false break end end - if inline - push!(attrs, EnumAttribute("alwaysinline"; ctx)) - end + inline && push!(attrs, EnumAttribute("alwaysinline")) end # override triple and datalayout to avoid warnings triple!(lib, triple(mod)) datalayout!(lib, datalayout(mod)) - LLVM.link!(mod, lib) end function locate_lib(file) - file_path = joinpath(device_libs_path, file*".bc") + file_path = joinpath(libdevice_libs, file*".bc") if !ispath(file_path) - file_path = joinpath(device_libs_path, file*".amdgcn.bc") + file_path = joinpath(libdevice_libs, file*".amdgcn.bc") if !ispath(file_path) # failed to find matching bitcode file return nothing @@ -50,21 +44,13 @@ function link_device_libs!(target, mod::LLVM.Module) # TODO: only link if used # TODO: make these globally/locally configurable - device_libs_path === nothing && return + isnothing(libdevice_libs) && return # https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/9420f6380990b09851edc2a5f9cbfaa88742b449/doc/OCML.md#controls # Note: It seems we need to load in reverse order, to avoid LLVM deleting the globals from the module, before we use them. # 1. Load other libraries - libs = ( - "hc", - "hip", - "irif", - "ockl", - "opencl", - "ocml", - ) - + libs = ("hc", "hip", "irif", "ockl", "opencl", "ocml") for lib in libs lib_path = locate_lib(lib) lib_path === nothing && continue @@ -82,10 +68,15 @@ function link_device_libs!(target, mod::LLVM.Module) try load_and_link!(mod, lib) catch err - @warn "Failed to load/link OCLC core library for ISA $(target.dev_isa)" err=err + @warn "Failed to load/link OCLC core library `$lib` for ISA $(target.dev_isa)." err=err end - # 3. Load options libraries + # 3. Load OCLC ABI library (required for printing). + lib = locate_lib("oclc_abi_version_500") + @assert lib !== nothing + load_and_link!(mod, lib) + + # 4. Load options libraries options = Dict( :finite_only => false, :unsafe_math => false, diff --git a/src/compiler/dynamic_memory.jl b/src/compiler/dynamic_memory.jl new file mode 100644 index 000000000..3f42d71a3 --- /dev/null +++ b/src/compiler/dynamic_memory.jl @@ -0,0 +1,43 @@ +function create_malloc_hostcall!() + dev = AMDGPU.device() + _, buf = Device.named_perdevice_hostcall(dev, :malloc_hostcall) do + holder = Device.HostCallHolder( + Ptr{Cvoid}, Tuple{Csize_t}; continuous=true, + ) do bytesize + buf = Mem.HostBuffer(bytesize, HIP.hipHostAllocMapped) + dev_ptr = Mem.device_ptr(buf) + @assert buf.ptr == dev_ptr # TODO + return dev_ptr + end + + # Create host pinned memory and store HostCall in it. + # It will be then accessed by kernels from kernel state. + buf = Mem.HostBuffer(sizeof(holder.hc), HIP.hipHostAllocMapped) + ptr = Base.unsafe_convert( + Ptr{Device.HostCall{Ptr{Cvoid}, Tuple{Csize_t}}}, buf) + Base.unsafe_store!(ptr, holder.hc) + return holder, buf + end + return Mem.device_ptr(buf) +end + +function create_free_hostcall!() + dev = AMDGPU.device() + _, buf = Device.named_perdevice_hostcall(dev, :free_hostcall) do + holder = Device.HostCallHolder( + Nothing, Tuple{Ptr{Cvoid}}; continuous=true, + ) do ptr + ptr == C_NULL && return + # FIXME for some reason it hangs on free function in hostcall... + # HIP.hipHostFree(ptr) |> HIP.check + return + end + + buf = Mem.HostBuffer(sizeof(holder.hc), HIP.hipHostAllocMapped) + ptr = Base.unsafe_convert( + Ptr{Device.HostCall{Nothing, Tuple{Ptr{Cvoid}}}}, buf) + Base.unsafe_store!(ptr, holder.hc) + return holder, buf + end + return Mem.device_ptr(buf) +end diff --git a/src/compiler/exceptions.jl b/src/compiler/exceptions.jl new file mode 100644 index 000000000..7f764a1ae --- /dev/null +++ b/src/compiler/exceptions.jl @@ -0,0 +1,27 @@ +struct KernelException <: Exception + dev::HIP.HIPDevice +end + +function Base.showerror(io::IO, ex::KernelException) + print(io, "KernelException: exception thrown during kernel execution on `$(ex.dev)`.") +end + +const _exception_flags = Dict{HIP.HIPDevice, Mem.HostBuffer}() + +function create_exception!(mod::HIP.HIPModule) + exception_flag = get!(_exception_flags, mod.dev, + Mem.HostBuffer(sizeof(Int), HIP.hipHostAllocMapped)) + return Mem.device_ptr(exception_flag) +end + +# Check for exceptions on every synchronization. +function check_exceptions() + for (dev, buf) in _exception_flags + ptr = Base.unsafe_convert(Ptr{Int}, buf) + flag = unsafe_load(ptr) + if flag != 0 + unsafe_store!(ptr, 0) + throw(KernelException(dev)) + end + end +end diff --git a/src/compiler/global-hooks.jl b/src/compiler/global-hooks.jl deleted file mode 100644 index da7d1edb4..000000000 --- a/src/compiler/global-hooks.jl +++ /dev/null @@ -1,94 +0,0 @@ -const default_global_hooks = Dict{Symbol,Function}() - -default_global_hooks[:__global_output_context] = (gbl, mod, device) -> begin - # initialize global output context - gbl_ptr = Base.unsafe_convert(Ptr{AMDGPU.Device.GLOBAL_OUTPUT_CONTEXT_TYPE}, gbl) - oc = Device.OutputContext(stdout; device, name=:__global_output, timeout=nothing) - Base.unsafe_store!(gbl_ptr, oc) -end -default_global_hooks[:__global_printf_context] = (gbl, mod, device) -> begin - # initialize global printf context - # Return type of Int to force synchronizing behavior - args_type = Tuple{LLVMPtr{UInt8, AS.Global}} - ret_type = Int - gbl_ptr = Base.unsafe_convert(Ptr{HostCall{ret_type, args_type}}, gbl) - - hc = Device.named_perdevice_hostcall(device, :__global_printf) do - HostCall(ret_type, args_type; device, continuous=true, buf_len=2^16, timeout=nothing) do _ - fmt, all_args = unsafe_load(reinterpret(LLVMPtr{AMDGPU.Device.ROCPrintfBuffer,AS.Global}, hc.buf_ptr)) - - for args in all_args - args = map(x -> x isa Cstring ? unsafe_string(x) : x, args) - @debug "@rocprintf with $fmt and $(args)" - try - @eval @printf($fmt, $(args...)) - catch err - @error "@rocprintf error" exception=(err,catch_backtrace()) - end - end - return 0 - end - end - Base.unsafe_store!(gbl_ptr, hc) -end -default_global_hooks[:__global_exception_flag] = (gbl, mod, device) -> begin - # initialize global exception flag - gbl_ptr = Base.unsafe_convert(Ptr{Int64}, gbl) - Base.unsafe_store!(gbl_ptr, 0) -end -default_global_hooks[:__global_exception_ring] = (gbl, mod, device) -> begin - # initialize exception ring buffer - gbl_ptr = Base.unsafe_convert(Ptr{Ptr{ExceptionEntry}}, gbl) - ex_ptr = Base.unsafe_convert(Ptr{ExceptionEntry}, mod.exceptions) - unsafe_store!(gbl_ptr, ex_ptr) - - # setup initial slots - for i in 1:Runtime.MAX_EXCEPTIONS-1 - unsafe_store!(ex_ptr, ExceptionEntry(0, LLVMPtr{UInt8,1}(0))) - ex_ptr += sizeof(ExceptionEntry) - end - # setup tail slot - unsafe_store!(ex_ptr, ExceptionEntry(1, LLVMPtr{UInt8,1}(0))) -end -default_global_hooks[:__global_malloc_hostcall] = (gbl, mod, device) -> begin - # initialize malloc hostcall - args_type = Tuple{UInt64, Csize_t} - ret_type = Ptr{Cvoid} - gbl_ptr = Base.unsafe_convert(Ptr{HostCall{ret_type, args_type}}, gbl) - - hc = Device.named_perdevice_hostcall(device, :__global_malloc) do - HostCall(ret_type, args_type; device, continuous=true, timeout=nothing) do kern, sz - buf = Mem.alloc(device, sz; coherent=true) - # FIXME: Lock - push!(mod.metadata, Runtime.KernelMetadata(kern, buf)) - @debug "Allocated $(buf.ptr) ($sz bytes) for kernel $kern on device $device" - return buf.ptr - end - end - Base.unsafe_store!(gbl_ptr, hc) -end -default_global_hooks[:__global_free_hostcall] = (gbl, mod, device) -> begin - # initialize free hostcall - args_type = Tuple{UInt64, Ptr{Cvoid}} - ret_type = Nothing - gbl_ptr = Base.unsafe_convert(Ptr{HostCall{ret_type, args_type}}, gbl) - - hc = Device.named_perdevice_hostcall(device, :__global_free) do - HostCall(ret_type, args_type; device, continuous=true, timeout=nothing) do kern, ptr - # FIXME: Lock - for idx in length(mod.metadata):-1:1 - meta = mod.metadata[idx] - same_kern = meta.kern == kern - same_ptr = meta.buf.ptr == ptr - if same_kern && same_ptr - Mem.free(meta.buf) - deleteat!(mod.metadata, idx) - @debug "Freed $ptr ($(meta.buf.bytesize) bytes) for kernel $kern on device $device." - break - end - end - return nothing - end - end - Base.unsafe_store!(gbl_ptr, hc) -end diff --git a/src/compiler/occupancy.jl b/src/compiler/occupancy.jl deleted file mode 100644 index ea7c50d11..000000000 --- a/src/compiler/occupancy.jl +++ /dev/null @@ -1,150 +0,0 @@ -import ObjectFile -import ObjectFile: readmeta, Sections, section_name, section_size, section_offset -import MsgPack - -# TODO use LockedObject -const OCCUPANCY_CACHE = Dict{Tuple{AMDGPU.Runtime.ROCDevice, UInt64, Int, Int}, NamedTuple}() -const OCCUPANCY_CACHE_LOCK = Threads.ReentrantLock() - -function read_metadata(fun::ROCFunction) - path, io = mktemp(; cleanup=false) - write(io, fun.mod.exe.data) - close(io) - mv(path, path*".exe") # so that readmeta knows that this is an ELF file - path = path * ".exe" - try - return open(path, "r") do io - elf = readmeta(io) - note_sec_idx = findfirst(sec->section_name(sec) == ".note", Sections(elf)) - note_sec = Sections(elf)[note_sec_idx] - note_sec_size = section_size(note_sec) - - seek(io, section_offset(note_sec)) - off = position(io) - while position(io) - off < note_sec_size - name_size = read(io, UInt32) - desc_size = read(io, UInt32) - note_type = read(io, UInt32) - if note_type != 0x20 # NT_AMDGPU_METADATA - # Skip this note - seek(io, position(io) + name_size + 1 + desc_size) - continue - end - name = readuntil(io, '\0'); read(io, UInt8) - desc = Vector{UInt8}(undef, desc_size) - readbytes!(io, desc) - return MsgPack.unpack(desc) - end - end - finally - rm(path) - end - return nothing -end - -calculate_occupancy(kernel::Runtime.HostKernel; kwargs...) = - calculate_occupancy(kernel.fun; kwargs...) -function calculate_occupancy(fun::ROCFunction; input_block_size=1, localmem=0) - lock(OCCUPANCY_CACHE_LOCK) do - get!(OCCUPANCY_CACHE, (fun.mod.exe.device, fun.hash, input_block_size, localmem)) do - _calculate_occupancy(fun, fun.mod.exe.device; input_block_size, localmem) - end - end -end -function _calculate_occupancy(fun::ROCFunction, device::ROCDevice; input_block_size, localmem) - # Calculate occupancy - # Copied from https://github.com/ROCm-Developer-Tools/hipamd/blob/3ec1ccdbbbee7090ba854eddd1dee281973a4498/src/hip_platform.cpp#L301 - isa = first(Runtime.isas(device)) - if input_block_size == 1 - # We assume the user is requesting groupsize optimization - input_block_size = Runtime.isa_workgroup_max_size(isa) - end - arch = Runtime.architecture(isa) - arch_major, arch_minor, arch_stepping = if startswith(arch, "gfx8") - 8, parse(Int, "0x"*arch[5]), parse(Int, "0x"*arch[6:end]) - elseif startswith(arch, "gfx9") - 9, parse(Int, "0x"*arch[5]), parse(Int, "0x"*arch[6:end]) - elseif startswith(arch, "gfx10") - 10, parse(Int, "0x"*arch[6]), parse(Int, "0x"*arch[7:end]) - elseif startswith(arch, "gfx11") - 11, parse(Int, "0x"*arch[6]), parse(Int, "0x"*arch[7:end]) - else - error("Unsupported architecture: $arch") - end - meta = read_metadata(fun) - kernel_idx = findfirst(k->startswith(k[".symbol"], fun.entry), meta["amdhsa.kernels"]) - kernel = meta["amdhsa.kernels"][kernel_idx] - SGPR_count = Int(kernel[".sgpr_count"]) - VGPR_count = Int(kernel[".vgpr_count"]) - LDS_size = Int(kernel[".group_segment_fixed_size"]) - wavefront_size = Int(kernel[".wavefront_size"]) - # TODO: Print signature - @debug "Calculating occupancy of $(fun.entry) for $arch ($arch_major, $arch_minor, $arch_stepping)" SGPR_count VGPR_count LDS_size - max_waves_per_SIMD = arch_major <= 9 ? 8 : 16 - VGPR_waves = max_waves_per_SIMD - local max_VGPRs, VGPR_granularity - if arch_major <= 9 - if arch == "gfx90a" - max_VGPRs = 512 - VGPR_granularity = 8 - else - max_VGPRs = 256 - VGPR_granularity = 4 - end - else - max_VGPRs = 1024 - VGPR_granularity = 8 - end - - function align_up(x, y) - r = rem(x, y) - r > 0 && return x + y-r - return x - end - if VGPR_count > 0 - VGPR_waves = max_VGPRs ÷ align_up(VGPR_count, VGPR_granularity) - end - - GPR_waves = VGPR_waves - if SGPR_count > 0 - max_SGPRs = if arch_major < 8 - 512 - elseif arch_major < 10 - 800 - else - typemax(Int64) - end - SGPR_waves = max_SGPRs ÷ align_up(SGPR_count, 16) - GPR_waves = min(VGPR_waves, SGPR_waves) - end - - alu_occupancy = Runtime.device_num_simds_per_compute_unit(device) * min(max_waves_per_SIMD, GPR_waves) - alu_limited_threads = alu_occupancy * wavefront_size - - LDS_occupancy_wgs = typemax(Int) - total_used_LDS = LDS_size + localmem - if total_used_LDS != 0 - LDS_occupancy_wgs = Int(Runtime.device_local_memory_size(device) ÷ total_used_LDS) - end - - # Calculate how many blocks of input_block_size we can fit per CU - max_blocks_per_CU = alu_limited_threads ÷ align_up(input_block_size, wavefront_size) - max_blocks_per_CU = min(max_blocks_per_CU, LDS_occupancy_wgs) - best_block_size = Int(min(alu_limited_threads, align_up(input_block_size, wavefront_size))) - best_block_size = min(best_block_size, AMDGPU.Device._max_group_size) - best_blocks_per_CU = alu_limited_threads ÷ best_block_size - num_blocks_per_grid = Runtime.device_num_compute_units(device) * min(best_blocks_per_CU, LDS_occupancy_wgs) - - # TODO: Print signature - @debug "Occupancy of $(fun.entry) for $arch ($arch_major, $arch_minor, $arch_stepping)" max_blocks_per_CU best_block_size best_blocks_per_CU num_blocks_per_grid - return (;max_blocks_per_CU, - best_block_size, - best_blocks_per_CU, - num_blocks_per_grid, - GPR_waves, - alu_limited_threads, - SGPR_count, - VGPR_count, - LDS_size, - wavefront_size) -end diff --git a/src/compiler/output_context.jl b/src/compiler/output_context.jl new file mode 100644 index 000000000..6122b5256 --- /dev/null +++ b/src/compiler/output_context.jl @@ -0,0 +1,57 @@ +function create_output_context!(#= TODO mod::HIP.HIPModule =#) + dev = AMDGPU.device() + _, buf = Device.named_perdevice_hostcall(dev, :print_hostcall) do + buf_len = 2^16 + holder = Device.HostCallHolder( + Nothing, Tuple{LLVMPtr{Device.DeviceStaticString{buf_len}, AS.Global}}; + continuous=true, buf_len, + ) do _ + str_ptr = reinterpret( + LLVMPtr{Device.DeviceStaticString{buf_len}, AS.Global}, + holder.hc.buf_ptr) + Core.print(unsafe_load(str_ptr)) + return + end + + # Pointer to HostCall to be read from device. + buf = Mem.HostBuffer(sizeof(holder.hc), HIP.hipHostAllocMapped) + ptr = Base.unsafe_convert(Ptr{Device.OUTPUT_CONTEXT_TYPE}, buf) + Base.unsafe_store!(ptr, holder.hc) + return holder, buf + end + return Mem.device_ptr(buf) +end + +function create_printf_output_context!() + dev = AMDGPU.device() + _, buf = Device.named_perdevice_hostcall(dev, :printf_hostcall) do + holder = Device.HostCallHolder( + Nothing, Tuple{LLVMPtr{UInt8, AS.Global}}; + continuous=true, buf_len=2^16, + ) do _ + printf_buf_ptr = reinterpret( + LLVMPtr{Device.ROCPrintfBuffer, AS.Global}, + holder.hc.buf_ptr) + fmt, all_args = unsafe_load(printf_buf_ptr) + format = Printf.Format(fmt) + + for args in all_args + try + args = map(x -> x isa Cstring ? unsafe_string(x) : x, args) + formatted = Printf.format(format, args...) + Core.print(formatted) + catch err + @error "@rocprintf error" exception=(err, catch_backtrace()) + end + end + return + end + # Pointer to HostCall to be read from device. + buf = Mem.HostBuffer(sizeof(holder.hc), HIP.hipHostAllocMapped) + ptr = Base.unsafe_convert( + Ptr{Device.PRINTF_OUTPUT_CONTEXT_TYPE}, buf) + Base.unsafe_store!(ptr, holder.hc) + return holder, buf + end + return Mem.device_ptr(buf) +end diff --git a/src/compiler/utils.jl b/src/compiler/utils.jl deleted file mode 100644 index 94973f25a..000000000 --- a/src/compiler/utils.jl +++ /dev/null @@ -1,327 +0,0 @@ -# Tools for implementing device functionality - -# which Julia types map to a given LLVM type -const llvmtypes = Dict{Type,Symbol}( - Nothing => :void, - Bool => :i1, - Int8 => :i8, - Int16 => :i16, - Int32 => :i32, - Int64 => :i64, - UInt8 => :i8, - UInt16 => :i16, - UInt32 => :i32, - UInt64 => :i64, - Float32 => :float, - Float64 => :double, -) - -# which LLVM types map to a given Julia type -const jltypes = Dict{Symbol,Type}( - :void => Nothing, - :i1 => Bool, - :i8 => Int8, - :i16 => Int16, - :i32 => Int32, - :i64 => Int64, - :float => Float32, - :double => Float64 -) - -# Decode an expression of the form: -# -# function(arg::arg_type, arg::arg_type, ... arg::arg_type)::return_type -# -# Returns a tuple containing the function name, a vector of argument, a vector of argument -# types and the return type (all in symbolic form). -function decode_call(e) - @assert e.head == :(::) - - # decode the return type expression: single symbol (the LLVM type), or a tuple of 2 - # symbols (the LLVM and corresponding Julia type) - retspec = e.args[2] - if isa(retspec, Symbol) - rettype = retspec - else - @assert retspec.head == :tuple - @assert length(retspec.args) == 2 - rettype = (retspec.args[1], retspec.args[2]) - end - - call = e.args[1] - @assert call.head == :call - - fn = Symbol(call.args[1]) - args = Symbol[arg.args[1] for arg in call.args[2:end]] - argtypes = Symbol[arg.args[2] for arg in call.args[2:end]] - - return fn, args, argtypes, rettype -end - -# Generate a `llvmcall` statement calling an intrinsic specified as follows: -# -# intrinsic(arg::arg_type, arg::arg_type, ... arg::arg_type)::return_type [attr] -# -# The argument types should be valid LLVM type identifiers (eg. i32, float, double). -# Conversions to the corresponding Julia type are automatically generated; make sure the -# actual arguments are of the same type to make these conversions no-ops. The optional -# argument `attr` indicates which LLVM function attributes (such as `readnone` or `nounwind`) -# to add to the intrinsic declaration. - -# For example, the following call: -# `@wrap __some_intrinsic(x::float, y::double)::float` -# -# will yield the following `llvmcall`: -# ``` -# Base.llvmcall(("declare float @__somme__intr(float, double)", -# "%3 = call float @__somme__intr(float %0, double %1) -# ret float %3"), -# Float32, Tuple{Float32,Float64}, -# convert(Float32,x), convert(Float64,y)) -# ``` -macro wrap(call, attrs="") - intrinsic, args, argtypes, rettype = decode_call(call) - - # decide on intrinsic return type - if isa(rettype, Symbol) - # only LLVM return type specified, match against known LLVM/Julia type combinations - llvm_ret_typ = rettype - julia_ret_typ = jltypes[rettype] - else - # both specified (for when there is a mismatch, eg. i32 -> UInt32) - llvm_ret_typ = rettype[1] - julia_ret_typ = rettype[2] - end - - llvm_args = String["%$i" for i in 0:length(argtypes)] - if llvm_ret_typ == :void - llvm_ret_asgn = "" - llvm_ret = "void" - else - llvm_ret_var = "%$(length(argtypes)+1)" - llvm_ret_asgn = "$llvm_ret_var = " - llvm_ret = "$llvm_ret_typ $llvm_ret_var" - end - llvm_declargs = join(argtypes, ", ") - llvm_defargs = join(("$t $arg" for (t,arg) in zip(argtypes, llvm_args)), ", ") - - julia_argtypes = (jltypes[t] for t in argtypes) - julia_args = (:(convert($argtype, $(esc(arg)))) for (arg, argtype) in zip(args, julia_argtypes)) - - dest = ("""declare $llvm_ret_typ @$intrinsic($llvm_declargs)""", - """$llvm_ret_asgn call $llvm_ret_typ @$intrinsic($llvm_defargs) - ret $llvm_ret""") - - return quote - Base.llvmcall($dest, $julia_ret_typ, Tuple{$(julia_argtypes...)}, $(julia_args...)) - end -end - -# generalization of word-based primitives - -# extract bits from a larger value -@inline function extract_word(val, ::Val{i}) where {i} - extract_value(val, UInt32, Val(32*(i-1))) -end - -@generated function extract_value(val, ::Type{sub}, ::Val{offset}) where {sub, offset} - Context() do ctx - T_val = convert(LLVMType, val; ctx) - T_sub = convert(LLVMType, sub; ctx) - bytes = Core.sizeof(val) - T_int = LLVM.IntType(8*bytes; ctx) - - # create function - llvm_f, _ = create_function(T_sub, [T_val]) - mod = LLVM.parent(llvm_f) - - # generate IR - IRBuilder(ctx) do builder - entry = BasicBlock(llvm_f, "entry"; ctx) - position!(builder, entry) - equiv = bitcast!(builder, parameters(llvm_f)[1], T_int) - shifted = lshr!(builder, equiv, LLVM.ConstantInt(T_int, offset)) - # extracted = and!(builder, shifted, 2^32-1) - extracted = trunc!(builder, shifted, T_sub) - ret!(builder, extracted) - end - end - - call_function(llvm_f, UInt32, Tuple{val}, :val) -end - -# insert bits into a larger value -@inline function insert_word(val, word::UInt32, ::Val{i}) where {i} - insert_value(val, word, Val(32*(i-1))) -end - -@generated function insert_value(val, sub, ::Val{offset}) where {offset} - Context() do ctx - T_val = convert(LLVMType, val; ctx) - T_sub = convert(LLVMType, sub; ctx) - bytes = Core.sizeof(val) - T_out_int = LLVM.IntType(8*bytes; ctx) - - # create function - llvm_f, _ = create_function(T_val, [T_val, T_sub]) - mod = LLVM.parent(llvm_f) - - # generate IR - IRBuilder(ctx) do builder - entry = BasicBlock(llvm_f, "entry"; ctx) - position!(builder, entry) - equiv = bitcast!(builder, parameters(llvm_f)[1], T_out_int) - ext = zext!(builder, parameters(llvm_f)[2], T_out_int) - shifted = shl!(builder, ext, LLVM.ConstantInt(T_out_int, offset)) - inserted = or!(builder, equiv, shifted) - orig = bitcast!(builder, inserted, T_val) - ret!(builder, orig) - end - end - - call_function(llvm_f, val, Tuple{val, sub}, :val, :sub) -end - -# split the invocation of a function `op` on a value `val` with non-struct eltype -# into multiple smaller invocations on byte-sized partial values. -@generated function split_value_invocation(op::Function, val, args...) - # TODO: control of lower-limit - - ex = quote - Base.@_inline_meta - end - - # disassemble into words - words = Symbol[] - for i in 1:Core.sizeof(val)÷4 - word = Symbol("word$i") - push!(ex.args, :( $word = extract_word(val, Val($i)) )) - push!(words, word) - end - - # perform the operation - for word in words - push!(ex.args, :( $word = op($word, args...)) ) - end - - # reassemble - push!(ex.args, :( out = zero(val) )) - for (i,word) in enumerate(words) - push!(ex.args, :( out = insert_word(out, $word, Val($i)) )) - end - - push!(ex.args, :( out )) - - return ex -end - -# split the invocation of a function `op` on a value `val` -# by invoking the function on each of its fields -@generated function recurse_value_invocation(op::Function, val, args...) - ex = quote - Base.@_inline_meta - end - - fields = fieldnames(val) - if isempty(fields) - push!(ex.args, :( split_value_invocation(op, val, args...) )) - else - ctor = Expr(:new, val) - for field in fields - push!(ctor.args, :( - recurse_value_invocation(op, getfield(val, $(QuoteNode(field))), args...) )) - end - push!(ex.args, ctor) - end - - return ex -end - -# split the invocation of a function `op` on a pointer `ptr` with non-struct eltype -# into multiple smaller invocations on any supported pointer as listed in `supported_ptrs`. -@generated function split_pointer_invocation(op::Function, ptr, ::Type{supported_ptrs}, - args...) where {supported_ptrs} - T = eltype(ptr) - elsize(x) = Core.sizeof(eltype(x)) - supported_ptrs = reverse(Base.uniontypes(supported_ptrs)) - - ex = quote - Base.@_inline_meta - end - - # disassemble - vals = Tuple{Symbol,Int,Type}[] - offset = 0 - while offset < Core.sizeof(T) - val = Symbol("value.$(length(vals)+1)") - - # greedy selection of next pointer type - remaining = Core.sizeof(T)-offset - valid = filter(ptr->elsize(ptr)<=remaining, supported_ptrs) - if isempty(valid) - error("Cannot partition $T into values of $supported_typs") - end - ptr = first(sort(collect(valid); by=elsize, rev=true)) - - push!(vals, (val, offset, ptr)) - offset += elsize(ptr) - end - - # perform the operation - for (val, offset, ptr) in vals - subptr = :(convert($ptr, ptr+$offset)) - push!(ex.args, :( $val = op($subptr, args...)) ) - end - - # reassemble - push!(ex.args, :( out = zero($T) )) - for (val, offset, ptr) in vals - push!(ex.args, :( out = insert_value(out, $val, Val($offset)) )) - end - - push!(ex.args, :( out )) - - return ex -end - -# split the invocation of a function `op` on a pointer `ptr` -# by invoking the function on a pointer to each of its fields -@generated function recurse_pointer_invocation(op::Function, ptr, ::Type{supported_ptrs}, - args...) where {supported_ptrs} - T = eltype(ptr) - ex = quote - Base.@_inline_meta - end - fields = fieldnames(T) - - if isempty(fields) - push!(ex.args, :( split_pointer_invocation(op, ptr, supported_ptrs, args...) )) - else - ctor = Expr(:new, T) - for (i,field) in enumerate(fields) - field_typ = fieldtype(T, i) - field_offset = fieldoffset(T, i) - field_ptr_typ = :($(ptr.name.wrapper){$field_typ}) - # NOTE: this ctor is a leap of faith - subptr = :(convert($field_ptr_typ, ptr+$field_offset)) - push!(ctor.args, :( - recurse_pointer_invocation(op, $subptr, supported_ptrs, args...) )) - end - push!(ex.args, ctor) - end - - return ex -end - -# calculate the size of an LLVM type -llvmsize(::LLVM.LLVMHalf) = sizeof(Float16) -llvmsize(::LLVM.LLVMFloat) = sizeof(Float32) -llvmsize(::LLVM.LLVMDouble) = sizeof(Float64) -llvmsize(::LLVM.IntegerType) = Context() do ctx - div(Int(intwidth(GenericValue(LLVM.Int128Type(ctx), -1))), 8) -end -llvmsize(ty::LLVM.ArrayType) = length(ty)*llvmsize(eltype(ty)) -llvmsize(ty::LLVM.StructType) = ispacked(ty) ? sum(llvmsize(elem) for elem in elements(ty)) : 8*length(elements(ty)) # FIXME: Properly determine non-packed sizing -llvmsize(ty::LLVM.PointerType) = div(Sys.WORD_SIZE, 8) -llvmsize(ty::LLVM.VectorType) = size(ty) -llvmsize(ty) = error("Unknown size for type: $ty, typeof: $(typeof(ty))") diff --git a/src/compiler/zeroinit_lds.jl b/src/compiler/zeroinit_lds.jl new file mode 100644 index 000000000..06e4daa5d --- /dev/null +++ b/src/compiler/zeroinit_lds.jl @@ -0,0 +1,55 @@ +# Calculate the size of an LLVM type. +llvmsize(::LLVM.LLVMHalf) = sizeof(Float16) +llvmsize(::LLVM.LLVMFloat) = sizeof(Float32) +llvmsize(::LLVM.LLVMDouble) = sizeof(Float64) +function llvmsize(::LLVM.IntegerType) + div(Int(intwidth(GenericValue(LLVM.Int128Type(), -1))), 8) +end + +llvmsize(ty::LLVM.ArrayType) = length(ty) * llvmsize(eltype(ty)) +llvmsize(ty::LLVM.StructType) = ispacked(ty) ? + sum(llvmsize(elem) for elem in elements(ty)) : + 8 * length(elements(ty)) # FIXME: Properly determine non-packed sizing +llvmsize(ty::LLVM.PointerType) = div(Sys.WORD_SIZE, 8) +llvmsize(ty::LLVM.VectorType) = size(ty) +llvmsize(ty) = error("Unknown size for type: $ty, typeof: $(typeof(ty))") + +function zeroinit_lds!(mod::LLVM.Module, entry::LLVM.Function) + if LLVM.callconv(entry) != LLVM.API.LLVMAMDGPUKERNELCallConv + return entry + end + + to_init = [] + for gbl in LLVM.globals(mod) + if startswith(LLVM.name(gbl), "__zeroinit") + as = LLVM.addrspace(value_type(gbl)) + if as == AMDGPU.Device.AS.Local + push!(to_init, gbl) + end + end + end + isempty(to_init) && return entry + + @dispose builder=IRBuilder() begin + # Make these the first operations we do. + block = first(LLVM.blocks(entry)) + instruction = first(LLVM.instructions(block)) + position!(builder, instruction) + + # Use memset to clear all values to 0. + for gbl in to_init + sz = llvmsize(eltype(value_type(gbl))) + sz == 0 && continue + + LLVM.memset!(builder, gbl, + ConstantInt(UInt8(0)), ConstantInt(sz), + LLVM.alignment(gbl)) + end + + # Synchronize the workgroup to prevent races. + sync_ft = LLVM.FunctionType(LLVM.VoidType()) + sync_f = LLVM.Function(mod, LLVM.Intrinsic("llvm.amdgcn.s.barrier")) + call!(builder, sync_ft, sync_f) + end + return entry +end diff --git a/src/deprecations.jl b/src/deprecations.jl deleted file mode 100644 index 2a25fdb5c..000000000 --- a/src/deprecations.jl +++ /dev/null @@ -1,9 +0,0 @@ -@deprecate gridDim() gridItemDim() -@deprecate gridDimWG() gridGroupDim() -@deprecate HSAAgent ROCDevice -@deprecate HSAQueue ROCQueue -@deprecate HSASignal ROCSignal -@deprecate HSAStatusSignal ROCKernelSignal -@deprecate HSAKernelInstance ROCKernel -@deprecate HSARegion ROCMemoryRegion -@deprecate HSAMemoryPool ROCMemoryPool diff --git a/src/device/gcn/array.jl b/src/device/gcn/array.jl index 8ba74490f..305ad0e3e 100644 --- a/src/device/gcn/array.jl +++ b/src/device/gcn/array.jl @@ -28,17 +28,12 @@ ROCDeviceArray struct ROCDeviceArray{T,N,A} <: AbstractArray{T,N} shape::Dims{N} ptr::LLVMPtr{T,A} + len::Int # inner constructors, fully parameterized, exact types (ie. Int not <:Integer) - ROCDeviceArray{T,N,A}(shape::Dims{N}, ptr::LLVMPtr{T,A}) where {T,A,N} = new(shape,ptr) -end - -# Define `khash` function to reduce runtime dispatches. -function Runtime.khash(x::T, h::UInt=UInt(0)) where T <: AMDGPU.Device.ROCDeviceArray - for s in x.shape - h = hash(s, h) + function ROCDeviceArray{T,N,A}(shape::Dims{N}, ptr::LLVMPtr{T,A}) where {T,A,N} + new(shape, ptr, prod(shape)) end - Runtime.khash(x.ptr, h) end const ROCDeviceVector = ROCDeviceArray{T,1,A} where {T,A} @@ -67,11 +62,11 @@ ROCDeviceVector{T,A}(len::Integer, p::LLVMPtr{T,A}) where {T,A} Base.pointer(a::ROCDeviceArray) = a.ptr Base.pointer(a::ROCDeviceArray, i::Integer) = - pointer(a) + (i - 1) * Base.elsize(a) + pointer(a) + (i - 1) * Base.elsize(a) # TODO use _memory_offset(a, i) Base.elsize(::Type{<:ROCDeviceArray{T}}) where {T} = sizeof(T) Base.size(g::ROCDeviceArray) = g.shape -Base.length(g::ROCDeviceArray) = prod(g.shape) +Base.length(g::ROCDeviceArray) = g.len # conversions diff --git a/src/device/gcn/assertion.jl b/src/device/gcn/assertion.jl index 8d7c223db..d5988cf57 100644 --- a/src/device/gcn/assertion.jl +++ b/src/device/gcn/assertion.jl @@ -39,9 +39,9 @@ assert_counter = 0 @generated function rocassert_fail(::Val{msg}, ::Val{file}, ::Val{line}) where {msg, file, line} @dispose ctx=Context() begin - T_void = LLVM.VoidType(ctx) - T_int32 = LLVM.Int32Type(ctx) - T_pint8 = LLVM.PointerType(LLVM.Int8Type(ctx)) + T_void = LLVM.VoidType() + T_int32 = LLVM.Int32Type() + T_pint8 = LLVM.PointerType(LLVM.Int8Type()) # create function llvm_f, _ = create_function() @@ -49,8 +49,8 @@ assert_counter = 0 # generate IR - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) global assert_counter assert_counter += 1 @@ -58,7 +58,7 @@ assert_counter = 0 file = globalstring_ptr!(builder, String(file), "assert_file_$(assert_counter)") line = ConstantInt(T_int32, line) func = globalstring_ptr!(builder, "unknown", "assert_function_$(assert_counter)") - charSize = ConstantInt(Csize_t(1); ctx) + charSize = ConstantInt(Csize_t(1)) # invoke __assertfail and return # TODO: mark noreturn since we don't use ptxas? diff --git a/src/device/gcn/atomics.jl b/src/device/gcn/atomics.jl index 53f5486ef..b1a9d2e0b 100644 --- a/src/device/gcn/atomics.jl +++ b/src/device/gcn/atomics.jl @@ -29,14 +29,14 @@ atomic_store!(ptr::LLVMPtr, val, order=Val{:release}()) = @generated function llvm_atomic_op(::Val{binop}, ptr::LLVMPtr{T,A}, val::T) where {binop, T, A} @dispose ctx=Context() begin - T_val = convert(LLVMType, T; ctx) - T_ptr = convert(LLVMType, ptr; ctx) + T_val = convert(LLVMType, T) + T_ptr = convert(LLVMType, ptr) T_typed_ptr = LLVM.PointerType(T_val, A) llvm_f, _ = create_function(T_val, [T_ptr, T_val]) - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) typed_ptr = bitcast!(builder, parameters(llvm_f)[1], T_typed_ptr) @@ -96,14 +96,14 @@ end @generated function llvm_atomic_cas(ptr::LLVMPtr{T,A}, cmp::T, val::T) where {T, A} @dispose ctx=Context() begin - T_val = convert(LLVMType, T; ctx) - T_ptr = convert(LLVMType, ptr; ctx) + T_val = convert(LLVMType, T) + T_ptr = convert(LLVMType, ptr) T_typed_ptr = LLVM.PointerType(T_val, A) llvm_f, _ = create_function(T_val, [T_ptr, T_val, T_val]) - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) typed_ptr = bitcast!(builder, parameters(llvm_f)[1], T_typed_ptr) @@ -171,9 +171,11 @@ end """ atomic_cas!(ptr::LLVMPtr{T}, cmp::T, val::T) -Reads the value `old` located at address `ptr` and compare with `cmp`. If `old` equals to -`cmp`, stores `val` at the same address. Otherwise, doesn't change the value `old`. These -operations are performed in one atomic transaction. The function returns `old`. +Reads the value `old` located at address `ptr` and compare with `cmp`. +If `old` equals to `cmp`, stores `val` at the same address. +Otherwise, doesn't change the value `old`. +These operations are performed in one atomic transaction. +The function returns `old`. This operation is supported for values of type Int32, Int64, UInt32 and UInt64. """ diff --git a/src/device/gcn/execution_control.jl b/src/device/gcn/execution_control.jl index 52d053ec5..7648502bf 100644 --- a/src/device/gcn/execution_control.jl +++ b/src/device/gcn/execution_control.jl @@ -13,8 +13,8 @@ const completion_signal_base = _packet_offsets[findfirst(x->x==:completion_signa @generated function _completion_signal() @dispose ctx=Context() begin - T_int8 = LLVM.Int8Type(ctx) - T_int64 = LLVM.Int64Type(ctx) + T_int8 = LLVM.Int8Type() + T_int64 = LLVM.Int64Type() _as = convert(Int, AS.Constant) T_ptr_i8 = LLVM.PointerType(T_int8, _as) T_ptr_i64 = LLVM.PointerType(T_int64, _as) @@ -24,8 +24,8 @@ const completion_signal_base = _packet_offsets[findfirst(x->x==:completion_signa mod = LLVM.parent(llvm_f) # generate IR - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) # get the kernel dispatch pointer @@ -34,7 +34,7 @@ const completion_signal_base = _packet_offsets[findfirst(x->x==:completion_signa ptr = call!(builder, intr_typ, intr) # load the index - signal_ptr_i8 = inbounds_gep!(builder, T_int8, ptr, [ConstantInt(completion_signal_base; ctx)]) + signal_ptr_i8 = inbounds_gep!(builder, T_int8, ptr, [ConstantInt(completion_signal_base)]) signal_ptr = bitcast!(builder, signal_ptr_i8, T_ptr_i64) signal = load!(builder, T_int64, signal_ptr) ret!(builder, signal) diff --git a/src/device/gcn/helpers.jl b/src/device/gcn/helpers.jl index 9d378df35..cc9c3efe3 100644 --- a/src/device/gcn/helpers.jl +++ b/src/device/gcn/helpers.jl @@ -6,18 +6,18 @@ _packet_offsets = fieldoffset.(HSA.KernelDispatchPacket, 1:length(_packet_names) @dispose ctx=Context() begin inp_exprs = [:( inp_args[$i] ) for i in 1:length(inp_args)] inp_types = [inp_args...] - out_type = convert(LLVMType, out_arg.parameters[1]; ctx) + out_type = convert(LLVMType, out_arg.parameters[1]) # create function bool_types = map(x->x===Bool, inp_types) - T_bool = LLVM.Int1Type(ctx) - param_types = LLVMType[convert.(LLVMType, inp_types; ctx=ctx)...] + T_bool = LLVM.Int1Type() + param_types = LLVMType[convert.(LLVMType, inp_types)...] llvm_f, _ = create_function(out_type, param_types) mod = LLVM.parent(llvm_f) # generate IR - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) # call the intrinsic @@ -28,13 +28,13 @@ _packet_offsets = fieldoffset.(HSA.KernelDispatchPacket, 1:length(_packet_names) for idx in 1:length(param_types) if bool_types[idx] attrs = parameter_attributes(intr, idx) - push!(attrs, EnumAttribute("zeroext", 0; ctx)) + push!(attrs, EnumAttribute("zeroext", 0)) end end params = map(x->bool_types[x[1]] ? trunc!(builder, x[2], T_bool) : x[2], enumerate(parameters(llvm_f))) value = call!(builder, intr_ftype, intr, [params...]) if out_arg === Type{Bool} - ret!(builder, zext!(builder, value, convert(LLVMType, Bool; ctx))) + ret!(builder, zext!(builder, value, convert(LLVMType, Bool))) else ret!(builder, value) end diff --git a/src/device/gcn/hostcall.jl b/src/device/gcn/hostcall.jl index d5086a966..a71b51b50 100644 --- a/src/device/gcn/hostcall.jl +++ b/src/device/gcn/hostcall.jl @@ -26,7 +26,7 @@ const DEVICE_ERR_SENTINEL = 5 const HOST_ERR_SENTINEL = 6 const DEFAULT_HOSTCALL_TIMEOUT = Ref{Union{Float64, Nothing}}(nothing) -const DEFAULT_HOSTCALL_LATENCY = Ref{Float64}(0.01) +const DEFAULT_HOSTCALL_LATENCY = 0.01 include("hostcall_signal_helpers.jl") @@ -35,14 +35,15 @@ include("hostcall_signal_helpers.jl") GPU-compatible struct for making hostcalls. """ -struct HostCall{RT,AT} +struct HostCall{RT, AT} signal_handle::UInt64 - buf_ptr::LLVMPtr{UInt8,AS.Global} + buf_ptr::LLVMPtr{UInt8, AS.Global} buf_len::UInt end -function HostCall(RT::Type, AT::Type{<:Tuple}, signal_handle::UInt64; - device=AMDGPU.device(), buf_len=nothing) +function HostCall( + RT::Type, AT::Type{<:Tuple}, signal_handle::UInt64; buf_len = nothing, +) if isnothing(buf_len) buf_len = 0 for T in AT.parameters @@ -52,274 +53,101 @@ function HostCall(RT::Type, AT::Type{<:Tuple}, signal_handle::UInt64; end buf_len = max(sizeof(UInt64), buf_len) # make room for return buffer pointer - buf = Mem.alloc(device, buf_len; coherent=true) - buf_ptr = LLVMPtr{UInt8,AS.Global}(Base.unsafe_convert(Ptr{UInt8}, buf)) + buf = Mem.HostBuffer(buf_len, AMDGPU.HIP.hipHostAllocMapped) + buf_ptr = LLVMPtr{UInt8, AS.Global}(Base.unsafe_convert(Ptr{UInt8}, buf)) host_signal_store!(HSA.Signal(signal_handle), READY_SENTINEL) - HostCall{RT,AT}(signal_handle, buf_ptr, buf_len) + HostCall{RT, AT}(signal_handle, buf_ptr, buf_len) end -"Calls the host function stored in `hc` with arguments `args`." -@inline function hostcall!(hc::HostCall, args...) - hostcall!(Val{:group}(), hc, args...) +struct HostCallHolder + hc::HostCall + task::Task + finish::Ref{Bool} + continuous::Ref{Bool} end -@inline function hostcall!( - ::Val{mode}, hc::HostCall{RT, AT}, args..., -) where {mode, RT, AT} - hostcall_device_lock!(Val{mode}(), hc) - hostcall_device_write_args!(Val{mode}(), hc, args...) - return hostcall_device_trigger_and_return!(Val{mode}(), hc) -end - -macro device_execution_gate(mode, exec_ex) - if mode isa QuoteNode - mode = mode.value::Symbol - end - @assert mode in (:grid, :group, :wave, :lane) "Invalid mode: $mode" - ex = Expr(:block) - if mode == :grid - push!(ex.args, quote - # Must be on first item of first group - if $workgroupIdx().x != 1 || $workitemIdx().x != 1 - @goto gated_done - end - end) - elseif mode == :group - push!(ex.args, quote - # Must be on first item of each group - if $workitemIdx().x != 1 - @goto gated_done - end - end) - elseif mode == :wave - push!(ex.args, quote - # Must be on first lane of each wavefront of each group - if Core.Intrinsics.urem_int($workitemIdx().x - UInt32(1), - $wavefrontsize()) != 0 - @goto gated_done - end - end) - end - push!(ex.args, quote - $(esc(exec_ex)) - @label gated_done - $sync_workgroup() - end) - return ex -end - -@inline function hostcall_device_lock!(hc::HostCall) - hostcall_device_lock!(Val{:group}(), hc) -end - -@inline @generated function hostcall_device_lock!( - ::Val{mode}, hc::HostCall, -) where mode - return quote - @device_execution_gate $mode begin - # Acquire lock on hostcall buffer - hostcall_device_signal_wait_cas!(hc.signal_handle, READY_SENTINEL, DEVICE_LOCK_SENTINEL) - end - end -end - -@inline function hostcall_device_write_args!(hc::HostCall, args...) - hostcall_device_write_args!(Val{:group}(), hc, args...) -end - -@inline @generated function hostcall_device_write_args!( - ::Val{mode}, hc::HostCall{RT, AT}, args..., -) where {mode, RT, AT} - ex = Expr(:block) - - # Copy arguments into buffer - # Modified from CUDAnative src/device/cuda/dynamic_parallelism.jl - off = 1 - for i in 1:length(args) - T = args[i] - sz = sizeof(T) - # FIXME: Use proper alignment - ptr = :(reinterpret(LLVMPtr{$T,AS.Global}, hc.buf_ptr+$off-1)) - push!(ex.args, :(Base.unsafe_store!($ptr, args[$i]))) - off += sz - end - - return macroexpand(@__MODULE__, quote - @device_execution_gate $mode begin - $ex - end - end) -end - -@inline function hostcall_device_trigger_and_return!(hc::HostCall) - hostcall_device_trigger_and_return!(Val{:group}(), hc::HostCall) -end - -@inline @generated function hostcall_device_trigger_and_return!(::Val{mode}, hc::HostCall{RT, AT}) where {mode, RT, AT} - ex = Expr(:block) - @gensym shmem buf_ptr ret_ptr hostcall_return - - push!(ex.args, quote - if $RT !== Nothing - # FIXME: This is not valid without the @inline - # $shmem = $alloc_local($hostcall_return, $RT, 1) - # But this is fine (if slower) - $shmem = $get_global_pointer($(Val{hostcall_return}()), $RT) - end - - @device_execution_gate $mode begin - # Ensure arguments are written - $hostcall_device_signal_wait_cas!(hc.signal_handle, $DEVICE_LOCK_SENTINEL, $DEVICE_MSG_SENTINEL) - # Wait on host message - $hostcall_device_signal_wait(hc.signal_handle, $HOST_MSG_SENTINEL) - # Get return buffer and load first value - if $RT !== Nothing - $buf_ptr = reinterpret(LLVMPtr{LLVMPtr{$RT,AS.Global},AS.Global},hc.buf_ptr) - $ret_ptr = unsafe_load($buf_ptr) - if UInt64($ret_ptr) == 0 - $device_signal_store!(hc.signal_handle, $DEVICE_ERR_SENTINEL) - $signal_exception() - $trap() - end - unsafe_store!($shmem, unsafe_load($ret_ptr)::$RT) - end - $device_signal_store!(hc.signal_handle, $READY_SENTINEL) - end - if $RT !== Nothing - return unsafe_load($shmem) - else - return nothing - end - end) - - return ex -end - -@inline @generated function hostcall_device_args_size(args...) - sz = 0 - for arg in args - sz += sizeof(arg) - end - return sz -end - -@generated function hostcall_host_read_args(hc::HostCall{RT,AT}) where {RT,AT} - ex = Expr(:tuple) - - # Copy arguments into buffer - off = 1 - for i in 1:length(AT.parameters) - T = AT.parameters[i] - sz = sizeof(T) - # FIXME: Use correct alignment - push!(ex.args, quote - lref = Ref{$T}() - HSA.memory_copy(reinterpret(Ptr{Cvoid}, Base.unsafe_convert(Ptr{$T}, lref)), - reinterpret(Ptr{Cvoid}, hc.buf_ptr+$off - 1), - $sz) |> Runtime.check - lref[] - end) - off += sz - end - - return ex -end - -struct HostCallException <: Exception - reason::String - err::Union{Exception, Nothing} - bt::Union{Vector, Nothing} -end - -HostCallException(reason) = HostCallException(reason, nothing, backtrace()) - -HostCallException(reason, err) = HostCallException(reason, err, catch_backtrace()) - -function Base.showerror(io::IO, err::HostCallException) - print(io, "HostCallException: $(err.reason)") - if err.err !== nothing || err.bt !== nothing - print(io, ":\n") - end - err.err !== nothing && Base.showerror(io, err.err) - err.bt !== nothing && Base.show_backtrace(io, err.bt) -end - -const NAMED_PERDEVICE_HOSTCALLS = Dict{Runtime.ROCDevice, Dict{Symbol, HostCall}}() - -function named_perdevice_hostcall(func, device::Runtime.ROCDevice, name::Symbol) - lock(Runtime.RT_LOCK) do - hcs = get!(()->Dict{Symbol, HostCall}(), NAMED_PERDEVICE_HOSTCALLS, device) - return get!(func, hcs, name) - end -end +include("hostcall_utils.jl") """ - HostCall(func, return_type::Type, arg_types::Type{Tuple}) -> HostCall + HostCallHolder(func, return_type::Type, arg_types::Type{Tuple}) -> HostCall Construct a `HostCall` that executes `func` with the arguments passed from the -calling kernel. `func` must be passed arguments of types contained in -`arg_types`, and must return a value of type `return_type`, or else the -hostcall will fail with undefined behavior. +calling kernel. + +`func` must be passed arguments of types contained in `arg_types`, +and must return a value of type `return_type`, +or else the hostcall will fail with undefined behavior. Note: This API is currently experimental and is subject to change at any time. """ -function HostCall(func::Base.Callable, rettype::Type, argtypes::Type{<:Tuple}; return_task::Bool = false, - device=AMDGPU.device(), maxlat=DEFAULT_HOSTCALL_LATENCY[], - timeout=nothing, continuous=false, buf_len=nothing) - # Create raw HSA signal to avoid ROCSignal finalizer - # being called too early in the HostCall task. +function HostCallHolder( + func::Base.Callable, rettype::Type, argtypes::Type{<:Tuple}; + timeout = nothing, continuous = false, buf_len = nothing, + maxlat = DEFAULT_HOSTCALL_LATENCY, +) signal_ref = Ref{HSA.Signal}() HSA.signal_create(1, 0, C_NULL, signal_ref) |> Runtime.check signal = signal_ref[] AMDGPU.hsaref!() - hc = HostCall(rettype, argtypes, signal.handle; device, buf_len) + hc = HostCall(rettype, argtypes, signal.handle; buf_len) + finish_ref = Ref{Bool}(false) + continuous_ref = Ref{Bool}(continuous) tsk = Threads.@spawn begin - ret_buf = Ref{Mem.Buffer}() + ret_buf = Ref{Mem.HostBuffer}(Mem.HostBuffer()) ret_len = 0 + try while true - if !hostcall_host_wait(signal; maxlat=maxlat, timeout=timeout) - throw(HostCallException("Hostcall: Timeout on signal $signal")) + if !hostcall_host_wait(signal, finish_ref; maxlat, timeout) + Runtime.RT_EXITING[] && break + finish_ref[] && break + throw(HostCallException("Timeout on signal $signal")) end - if length(argtypes.parameters) > 0 - args = try + + args = if isempty(argtypes.parameters) + () + else + try hostcall_host_read_args(hc) catch err throw(HostCallException("Error getting arguments", err)) end - @debug "Hostcall: Got arguments of length $(length(args))" - else - args = () end + ret = try func(args...,) catch err throw(HostCallException("Error executing host function", err)) end - if typeof(ret) != rettype - throw(HostCallException("Host function result of wrong type: $(typeof(ret)), expected $rettype")) - end - if !isbits(ret) - throw(HostCallException("Host function result not isbits: $(typeof(ret))")) - end - @debug "Hostcall: Host function returning value of type $(typeof(ret))" + + typeof(ret) == rettype || throw(HostCallException(""" + Host function result of wrong type: + - returned: $(typeof(ret)) + - expected: $rettype + """)) + isbits(ret) || throw(HostCallException( + "Host function result not isbits: $(typeof(ret))")) + try - if isassigned(ret_buf) && (ret_len < sizeof(ret)) + if ret_buf[].ptr != C_NULL && ret_len < sizeof(ret) Mem.free(ret_buf[]) ret_len = sizeof(ret) - ret_buf[] = Mem.alloc(device, ret_len; coherent=true) - elseif !isassigned(ret_buf) + ret_buf[] = Mem.HostBuffer(ret_len, AMDGPU.HIP.hipHostAllocMapped) + elseif ret_buf[].ptr == C_NULL ret_len = sizeof(ret) - ret_buf[] = Mem.alloc(device, ret_len; coherent=true) + ret_buf[] = Mem.HostBuffer(ret_len, AMDGPU.HIP.hipHostAllocMapped) end + ret_ref = Ref{rettype}(ret) GC.@preserve ret_ref begin ret_ptr = Base.unsafe_convert(Ptr{Cvoid}, ret_buf[]) if sizeof(ret) > 0 - src_ptr = reinterpret(Ptr{Cvoid}, Base.unsafe_convert(Ptr{rettype}, ret_ref)) - HSA.memory_copy(ret_ptr, src_ptr, sizeof(ret)) |> Runtime.check + src_ptr = reinterpret(Ptr{Cvoid}, + Base.unsafe_convert(Ptr{rettype}, ret_ref)) + HSA.memory_copy( + ret_ptr, src_ptr, sizeof(ret)) |> Runtime.check end args_buf_ptr = reinterpret(Ptr{Ptr{Cvoid}}, hc.buf_ptr) @@ -327,10 +155,10 @@ function HostCall(func::Base.Callable, rettype::Type, argtypes::Type{<:Tuple}; r end host_signal_store!(signal, HOST_MSG_SENTINEL) catch err - throw(HostCallException("Error returning hostcall result", err)) + throw(HostCallException( + "Error returning hostcall result", err)) end - @debug "Hostcall: Host function return completed" - continuous || break + continuous_ref[] || break end catch err # Gracefully terminate all waiters @@ -342,17 +170,19 @@ function HostCall(func::Base.Callable, rettype::Type, argtypes::Type{<:Tuple}; r rethrow(err) end finally - # We need to free the memory buffers, but first we need to ensure that - # the device has read from these buffers. Therefore we wait either for - # READY_SENTINEL or else an error signal. + # We need to free the memory buffers, but first we need + # to ensure that the device has read from these buffers. + # Therefore we wait either for READY_SENTINEL or else an error signal. while !Runtime.RT_EXITING[] prev = host_signal_load(signal) - if prev == READY_SENTINEL || prev == HOST_ERR_SENTINEL || prev == DEVICE_ERR_SENTINEL - if isassigned(ret_buf) - Mem.free(ret_buf[]) - end + not_used = + prev == READY_SENTINEL || + prev == HOST_ERR_SENTINEL || + prev == DEVICE_ERR_SENTINEL + if not_used + Mem.free(ret_buf[]) # `free` checks for C_NULL. buf_ptr = reinterpret(Ptr{Cvoid}, hc.buf_ptr) - Mem.free(Mem.Buffer(buf_ptr, C_NULL, buf_ptr, 0, device, true, false)) + Mem.free(Mem.HostBuffer(buf_ptr, 0)) break end end @@ -360,29 +190,41 @@ function HostCall(func::Base.Callable, rettype::Type, argtypes::Type{<:Tuple}; r HSA.signal_destroy(signal) |> Runtime.check AMDGPU.hsaunref!() end + return end - - if return_task - return hc, tsk - else - return hc - end + HostCallHolder(hc, tsk, finish_ref, continuous_ref) end -function hostcall_host_wait(signal_handle::HSA.Signal; maxlat=DEFAULT_HOSTCALL_LATENCY[], timeout=DEFAULT_HOSTCALL_TIMEOUT[]) - @debug "Hostcall: Waiting on signal $signal_handle" +Adapt.adapt(to::Runtime.Adaptor, hc::HostCallHolder) = hc.hc + +non_continuous!(hc::HostCallHolder) = hc.continuous[] = false + +finish!(hc::HostCallHolder) = hc.finish[] = true + +Base.istaskdone(hc::HostCallHolder) = istaskdone(hc.task) + +function hostcall_host_wait( + signal_handle::HSA.Signal, finish_ref::Ref{Bool}; + maxlat=DEFAULT_HOSTCALL_LATENCY, timeout=DEFAULT_HOSTCALL_TIMEOUT[], +) + res::Bool = false start_time = time_ns() + while !Runtime.RT_EXITING[] + finish_ref[] && break prev = host_signal_load(signal_handle) + + # If device-sourced message is available, + # lock on host to prevent further writes from the device. + # If successfully locked on host, done waiting. if prev == DEVICE_MSG_SENTINEL prev = host_signal_cmpxchg!( signal_handle, DEVICE_MSG_SENTINEL, HOST_LOCK_SENTINEL) if prev == DEVICE_MSG_SENTINEL - @debug "Hostcall: Device message on signal $signal_handle" - return true + res = true + break end elseif prev == DEVICE_ERR_SENTINEL - @debug "Hostcall: Device error on signal $signal_handle" throw(HostCallException("Device error on signal $signal_handle")) end @@ -390,10 +232,13 @@ function hostcall_host_wait(signal_handle::HSA.Signal; maxlat=DEFAULT_HOSTCALL_L now_time = time_ns() diff_time = (now_time - start_time) / 10^9 if diff_time > timeout - @debug "Hostcall: Signal wait timeout on signal $signal_handle" - return false + res = false + break end end - sleep(maxlat) + + Libc.systemsleep(maxlat) + yield() end + return res end diff --git a/src/device/gcn/hostcall_utils.jl b/src/device/gcn/hostcall_utils.jl new file mode 100644 index 000000000..e2a95f207 --- /dev/null +++ b/src/device/gcn/hostcall_utils.jl @@ -0,0 +1,219 @@ +"Calls the host function stored in `hc` with arguments `args`." +@inline function hostcall!(hc::HostCall, args...) + hostcall!(Val{:group}(), hc, args...) +end + +@inline function hostcall!( + ::Val{mode}, hc::HostCall{RT, AT}, args..., +) where {mode, RT, AT} + hostcall_device_lock!(Val{mode}(), hc) + hostcall_device_write_args!(Val{mode}(), hc, args...) + return hostcall_device_trigger_and_return!(Val{mode}(), hc) +end + +macro device_execution_gate(mode, exec_ex) + if mode isa QuoteNode + mode = mode.value::Symbol + end + @assert mode in (:grid, :group, :wave, :lane) "Invalid mode: $mode" + ex = Expr(:block) + if mode == :grid + push!(ex.args, quote + # Must be on first item of first group + if $workgroupIdx().x != 1 || $workitemIdx().x != 1 + @goto gated_done + end + end) + elseif mode == :group + push!(ex.args, quote + # Must be on first item of each group + if $workitemIdx().x != 1 + @goto gated_done + end + end) + elseif mode == :wave + push!(ex.args, quote + # Must be on first lane of each wavefront of each group + is_not_first_lane = Core.Intrinsics.urem_int( + $workitemIdx().x - UInt32(1), $wavefrontsize()) != 0 + if is_not_first_lane + @goto gated_done + end + end) + end + push!(ex.args, quote + $(esc(exec_ex)) + @label gated_done + $sync_workgroup() + end) + return ex +end + +@inline function hostcall_device_lock!(hc::HostCall) + hostcall_device_lock!(Val{:group}(), hc) +end + +@inline @generated function hostcall_device_lock!( + ::Val{mode}, hc::HostCall, +) where mode + return quote + @device_execution_gate $mode begin + # Acquire lock on hostcall buffer + hostcall_device_signal_wait_cas!( + hc.signal_handle, READY_SENTINEL, DEVICE_LOCK_SENTINEL) + end + end +end + +@inline function hostcall_device_write_args!(hc::HostCall, args...) + hostcall_device_write_args!(Val{:group}(), hc, args...) +end + +@inline @generated function hostcall_device_write_args!( + ::Val{mode}, hc::HostCall{RT, AT}, args..., +) where {mode, RT, AT} + ex = Expr(:block) + + # Copy arguments into buffer + # Modified from CUDAnative src/device/cuda/dynamic_parallelism.jl + off = 1 + for i in 1:length(args) + T = args[i] + sz = sizeof(T) + # FIXME: Use proper alignment + ptr = :(reinterpret(LLVMPtr{$T,AS.Global}, hc.buf_ptr + $off - 1)) + push!(ex.args, :(Base.unsafe_store!($ptr, args[$i]))) + off += sz + end + + return macroexpand(@__MODULE__, quote + @device_execution_gate $mode begin + $ex + end + end) +end + +@inline function hostcall_device_trigger_and_return!(hc::HostCall) + hostcall_device_trigger_and_return!(Val{:group}(), hc::HostCall) +end + +@inline @generated function hostcall_device_trigger_and_return!( + ::Val{mode}, hc::HostCall{RT, AT}, +) where {mode, RT, AT} + ex = Expr(:block) + @gensym shmem buf_ptr ret_ptr + + push!(ex.args, quote + if $RT !== Nothing + $shmem = $alloc_local(:hostcall_return, $RT, 1) + end + + @device_execution_gate $mode begin + # Ensure arguments are written + $hostcall_device_signal_wait_cas!( + hc.signal_handle, $DEVICE_LOCK_SENTINEL, $DEVICE_MSG_SENTINEL) + # Wait on host message + $hostcall_device_signal_wait(hc.signal_handle, $HOST_MSG_SENTINEL) + # Get return buffer and load first value + if $RT !== Nothing + $buf_ptr = reinterpret(LLVMPtr{LLVMPtr{$RT,AS.Global}, AS.Global}, hc.buf_ptr) + $ret_ptr = unsafe_load($buf_ptr) + if UInt64($ret_ptr) == 0 + $device_signal_store!(hc.signal_handle, $DEVICE_ERR_SENTINEL) + $signal_exception() + $trap() + end + unsafe_store!($shmem, unsafe_load($ret_ptr)::$RT) + end + $device_signal_store!(hc.signal_handle, $READY_SENTINEL) + end + if $RT !== Nothing + return unsafe_load($shmem) + else + return nothing + end + end) + + return ex +end + +@inline @generated function hostcall_device_args_size(args...) + sz = 0 + for arg in args + sz += sizeof(arg) + end + return sz +end + +@generated function hostcall_host_read_args(hc::HostCall{RT,AT}) where {RT,AT} + ex = Expr(:tuple) + + # Copy arguments into buffer + off = 1 + for i in 1:length(AT.parameters) + T = AT.parameters[i] + sz = sizeof(T) + # FIXME: Use correct alignment + push!(ex.args, quote + lref = Ref{$T}() + HSA.memory_copy( + reinterpret(Ptr{Cvoid}, Base.unsafe_convert(Ptr{$T}, lref)), + reinterpret(Ptr{Cvoid}, hc.buf_ptr + $off - 1), $sz) |> Runtime.check + lref[] + end) + off += sz + end + + return ex +end + +struct HostCallException <: Exception + reason::String + err::Union{Exception, Nothing} + bt::Union{Vector, Nothing} +end + +HostCallException(reason) = HostCallException(reason, nothing, backtrace()) + +HostCallException(reason, err) = HostCallException(reason, err, catch_backtrace()) + +function Base.showerror(io::IO, err::HostCallException) + print(io, "HostCallException: $(err.reason)") + if err.err !== nothing || err.bt !== nothing + print(io, ":\n") + end + err.err !== nothing && Base.showerror(io, err.err) + err.bt !== nothing && Base.show_backtrace(io, err.bt) +end + +const NAMED_PERDEVICE_HOSTCALLS = Dict{ + HIP.HIPDevice, Dict{Symbol, Tuple{HostCallHolder, Mem.HostBuffer}}}() + +function named_perdevice_hostcall(func, dev::HIP.HIPDevice, name::Symbol) + Base.@lock Runtime.RT_LOCK begin + hcs = get!( + () -> Dict{Symbol, Tuple{HostCall, Mem.HostBuffer}}(), + NAMED_PERDEVICE_HOSTCALLS, dev) + get!(func, hcs, name) + end +end + +# TODO rename +function get_named_perdevice_hostcall(dev::HIP.HIPDevice, name::Symbol) + Base.@lock Runtime.RT_LOCK begin + hcs = get( + () -> Dict{Symbol, Tuple{HostCall, Mem.HostBuffer}}(), + NAMED_PERDEVICE_HOSTCALLS, dev) + get(hcs, name, nothing) + end +end + +function remove_perdevice_hostcall!(dev::HIP.HIPDevice, name::Symbol) + Base.@lock Runtime.RT_LOCK begin + dev_hcs = get(NAMED_PERDEVICE_HOSTCALLS, dev, nothing) + isnothing(dev_hcs) && return + + pop!(dev_hcs, name) + return + end +end diff --git a/src/device/gcn/indexing.jl b/src/device/gcn/indexing.jl index 938557ebb..4c09e720c 100644 --- a/src/device/gcn/indexing.jl +++ b/src/device/gcn/indexing.jl @@ -2,15 +2,15 @@ @generated function _index(::Val{fname}, ::Val{name}, ::Val{range}) where {fname, name, range} @dispose ctx=Context() begin - T_int32 = LLVM.Int32Type(ctx) + T_int32 = LLVM.Int32Type() # create function llvm_f, _ = create_function(T_int32) mod = LLVM.parent(llvm_f) # generate IR - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) # call the indexing intrinsic @@ -19,10 +19,9 @@ idx = call!(builder, intr_typ, intr) # attach range metadata - range_metadata = MDNode([ConstantInt(UInt32(range.start); ctx), - ConstantInt(UInt32(range.stop); ctx)]; - ctx) - metadata(idx)[LLVM.MD_range] = range_metadata + metadata(idx)[LLVM.MD_range] = MDNode([ + ConstantInt(UInt32(range.start)), + ConstantInt(UInt32(range.stop))]) ret!(builder, idx) end @@ -32,14 +31,14 @@ end @generated function _dim(::Val{base}, ::Val{off}, ::Val{range}, ::Type{T}) where {base, off, range, T} @dispose ctx=Context() begin - T_int8 = LLVM.Int8Type(ctx) - T_int32 = LLVM.Int32Type(ctx) + T_int8 = LLVM.Int8Type() + T_int32 = LLVM.Int32Type() _as = convert(Int, AS.Constant) T_ptr_i8 = LLVM.PointerType(T_int8, _as) T_ptr_i32 = LLVM.PointerType(T_int32, _as) - T_T = convert(LLVMType, T; ctx) + T_T = convert(LLVMType, T) T_ptr_T = LLVM.PointerType(T_T, _as) # create function @@ -47,8 +46,8 @@ end mod = LLVM.parent(llvm_f) # generate IR - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) # get the kernel dispatch pointer @@ -58,16 +57,15 @@ end # load the index offset = base + ((off - 1) * sizeof(T)) - idx_ptr_i8 = inbounds_gep!(builder, T_int8, ptr, [ConstantInt(offset; ctx)]) + idx_ptr_i8 = inbounds_gep!(builder, T_int8, ptr, [ConstantInt(offset)]) idx_ptr_T = bitcast!(builder, idx_ptr_i8, T_ptr_T) idx_T = load!(builder, T_T, idx_ptr_T) idx = zext!(builder, idx_T, T_int32) # attach range metadata - range_metadata = MDNode([ConstantInt(T(range.start); ctx), - ConstantInt(T(range.stop); ctx)]; - ctx) - metadata(idx_T)[LLVM.MD_range] = range_metadata + metadata(idx_T)[LLVM.MD_range] = MDNode([ + ConstantInt(T(range.start)), + ConstantInt(T(range.stop))]) ret!(builder, idx) end diff --git a/src/device/gcn/memory_dynamic.jl b/src/device/gcn/memory_dynamic.jl index 617b95e0a..a43f7e8c9 100644 --- a/src/device/gcn/memory_dynamic.jl +++ b/src/device/gcn/memory_dynamic.jl @@ -1,20 +1,20 @@ export malloc, free -malloc(sz) = device_malloc(sz) -function device_malloc(sz::Csize_t) - malloc_gbl = get_global_pointer(Val(:__global_malloc_hostcall), - HostCall{Ptr{Cvoid},Tuple{UInt64,Csize_t}}) - malloc_hc = Base.unsafe_load(malloc_gbl) - kern = _completion_signal() - ptr = hostcall!(malloc_hc, kern, sz) - return ptr +# @device_function function dm_alloc(sz::Csize_t) +# ccall("extern __ockl_dm_alloc", llvmcall, Ptr{Cvoid}, (Csize_t,), sz) +# end + +# @device_function function dm_free(ptr::Ptr{Cvoid}) +# ccall("extern __ockl_dm_free", llvmcall, Nothing, (Csize_t,), ptr) +# end + +function malloc(bytesize::Csize_t)::Ptr{Cvoid} + mhc = Base.unsafe_load(malloc_hc()) + return hostcall!(mhc, bytesize) end -free(ptr) = device_free(ptr) -function device_free(ptr::Ptr{Cvoid}) - free_gbl = get_global_pointer(Val(:__global_free_hostcall), - HostCall{Nothing,Tuple{UInt64,Ptr{Cvoid}}}) - free_hc = Base.unsafe_load(free_gbl) - kern = _completion_signal() - hostcall!(free_hc, kern, ptr) +function free(ptr::Ptr{Cvoid})::Nothing + fhc = Base.unsafe_load(free_hc()) + hostcall!(fhc, ptr) + return end diff --git a/src/device/gcn/memory_static.jl b/src/device/gcn/memory_static.jl index 58afd8868..26df9d614 100644 --- a/src/device/gcn/memory_static.jl +++ b/src/device/gcn/memory_static.jl @@ -1,7 +1,9 @@ "Allocates on-device memory statically from the specified address space." -@generated function alloc_special(::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}, ::Val{zeroinit}=Val{false}()) where {id,T,as,len,zeroinit} +@generated function alloc_special( + ::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}, ::Val{zeroinit} = Val{false}(), +) where {id,T,as,len,zeroinit} @dispose ctx=Context() begin - eltyp = convert(LLVMType, T; ctx) + eltyp = convert(LLVMType, T) # old versions of GPUArrays invoke _shmem with an integer id; make sure those are unique if !isa(id, String) || !isa(id, Symbol) @@ -11,7 +13,7 @@ id = "__zeroinit_" * id end - T_ptr_i8 = convert(LLVMType, LLVMPtr{T,as}; ctx) + T_ptr_i8 = convert(LLVMType, LLVMPtr{T,as}) # create a function llvm_f, _ = create_function(T_ptr_i8) @@ -30,17 +32,18 @@ end end - # by requesting a larger-than-datatype alignment, we might be able to vectorize. + # By requesting a larger-than-datatype alignment, + # we might be able to vectorize. # TODO: Make the alignment configurable alignment!(gv, Base.max(32, Base.datatype_alignment(T))) # generate IR - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) ptr_with_as = gep!(builder, gv_typ, gv, - [ConstantInt(0; ctx), ConstantInt(0; ctx)]) + [ConstantInt(0), ConstantInt(0)]) ptr = bitcast!(builder, ptr_with_as, T_ptr_i8) ret!(builder, ptr) end @@ -49,8 +52,10 @@ end end -@inline alloc_local(id, T, len, zeroinit=false) = alloc_special(Val{id}(), T, Val{AS.Local}(), Val{len}(), Val{zeroinit}()) -@inline alloc_scratch(id, T, len) = alloc_special(Val{id}(), T, Val{AS.Private}(), Val{len}(), Val{false}()) +@inline alloc_local(id, T, len, zeroinit=false) = + alloc_special(Val{id}(), T, Val{AS.Local}(), Val{len}(), Val{zeroinit}()) +@inline alloc_scratch(id, T, len) = + alloc_special(Val{id}(), T, Val{AS.Private}(), Val{len}(), Val{false}()) macro ROCStaticLocalArray(T, dims, zeroinit=true) zeroinit = zeroinit isa Expr ? zeroinit.args[1] : zeroinit @@ -59,9 +64,12 @@ macro ROCStaticLocalArray(T, dims, zeroinit=true) @gensym id len quote $len = prod($(esc(dims))) - $ROCDeviceArray($(esc(dims)), $alloc_local($(QuoteNode(Symbol(:ROCStaticLocalArray_, id))), $(esc(T)), $len, $zeroinit)) + $ROCDeviceArray($(esc(dims)), + $alloc_local($(QuoteNode(Symbol(:ROCStaticLocalArray_, id))), + $(esc(T)), $len, $zeroinit)) end end + macro ROCDynamicLocalArray(T, dims, zeroinit=true) if Base.libllvm_version < v"14" @warn "@ROCDynamicLocalArray is unsupported on LLVM <14\nUndefined behavior may result" @@ -73,7 +81,9 @@ macro ROCDynamicLocalArray(T, dims, zeroinit=true) @gensym id DA quote let - $DA = $ROCDeviceArray($(esc(dims)), $alloc_local($(QuoteNode(Symbol(:ROCDynamicLocalArray_, id))), $(esc(T)), 0, $zeroinit)) + $DA = $ROCDeviceArray($(esc(dims)), + $alloc_local($(QuoteNode(Symbol(:ROCDynamicLocalArray_, id))), + $(esc(T)), 0, $zeroinit)) if $zeroinit # Zeroinit doesn't work at the compiler level for dynamic LDS # allocations, so zero it here @@ -89,10 +99,10 @@ end @inline @generated function alloc_string(::Val{str}) where str @dispose ctx=Context() begin - T_pint8 = LLVM.PointerType(LLVM.Int8Type(ctx), AS.Global) + T_pint8 = LLVM.PointerType(LLVM.Int8Type(), AS.Global) llvm_f, _ = create_function(T_pint8) - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) str_ptr = globalstring_ptr!(builder, String(str)) ptr = addrspacecast!(builder, str_ptr, T_pint8) @@ -105,18 +115,18 @@ end # TODO: Support various types of len @inline @generated function memcpy!(dest_ptr::LLVMPtr{UInt8,DestAS}, src_ptr::LLVMPtr{UInt8,SrcAS}, len::LT) where {DestAS,SrcAS,LT<:Union{Int64,UInt64}} @dispose ctx=Context() begin - T_nothing = LLVM.VoidType(ctx) - T_pint8_dest = convert(LLVMType, dest_ptr; ctx) - T_pint8_src = convert(LLVMType, src_ptr; ctx) - T_int64 = convert(LLVMType, len; ctx) - T_int1 = LLVM.Int1Type(ctx) + T_nothing = LLVM.VoidType() + T_pint8_dest = convert(LLVMType, dest_ptr) + T_pint8_src = convert(LLVMType, src_ptr) + T_int64 = convert(LLVMType, len) + T_int1 = LLVM.Int1Type() llvm_f, _ = create_function(T_nothing, [T_pint8_dest, T_pint8_src, T_int64]) mod = LLVM.parent(llvm_f) T_intr = LLVM.FunctionType(T_nothing, [T_pint8_dest, T_pint8_src, T_int64, T_int1]) intr = LLVM.Function(mod, "llvm.memcpy.p$(DestAS)i8.p$(SrcAS)i8.i64", T_intr) - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) dest_ptr_i8 = parameters(llvm_f)[1] @@ -131,18 +141,18 @@ memcpy!(dest_ptr::LLVMPtr{T,DestAS}, src_ptr::LLVMPtr{T,SrcAS}, len::Integer) wh memcpy!(reinterpret(LLVMPtr{UInt8,DestAS}, dest_ptr), reinterpret(LLVMPtr{UInt8,SrcAS}, src_ptr), UInt64(len)) @inline @generated function memset!(dest_ptr::LLVMPtr{UInt8,DestAS}, value::UInt8, len::LT) where {DestAS,LT<:Union{Int64,UInt64}} @dispose ctx=Context() begin - T_nothing = LLVM.VoidType(ctx) - T_pint8_dest = convert(LLVMType, dest_ptr; ctx) - T_int8 = convert(LLVMType, value; ctx) - T_int64 = convert(LLVMType, len; ctx) - T_int1 = LLVM.Int1Type(ctx) + T_nothing = LLVM.VoidType() + T_pint8_dest = convert(LLVMType, dest_ptr) + T_int8 = convert(LLVMType, value) + T_int64 = convert(LLVMType, len) + T_int1 = LLVM.Int1Type() llvm_f, _ = create_function(T_nothing, [T_pint8_dest, T_int8, T_int64]) mod = LLVM.parent(llvm_f) T_intr = LLVM.FunctionType(T_nothing, [T_pint8_dest, T_int8, T_int64, T_int1]) intr = LLVM.Function(mod, "llvm.memset.p$(DestAS)i8.i64", T_intr) - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) call!(builder, T_intr, intr, [parameters(llvm_f)[1], parameters(llvm_f)[2], parameters(llvm_f)[3], ConstantInt(T_int1, 0)]) diff --git a/src/device/gcn/output.jl b/src/device/gcn/output.jl index f854b6ba0..667baae57 100644 --- a/src/device/gcn/output.jl +++ b/src/device/gcn/output.jl @@ -5,64 +5,11 @@ Base.unsafe_load(ptr::LLVMPtr{<:DeviceStaticString,AS.Global}) = unsafe_string(reinterpret(Cstring, ptr)) Base.unsafe_store!(ptr::LLVMPtr{<:DeviceStaticString,AS.Global}, x) = nothing -struct OutputContext{HC} - hostcall::HC -end -function OutputContext(io::IO=stdout; device=AMDGPU.device(), continuous=true, buf_len=2^16, name=nothing, kwargs...) - hc = if name !== nothing - named_perdevice_hostcall(device, name) do - create_output_context_hostcall(io; device, continuous, buf_len, kwargs...) - end - else - create_output_context_hostcall(io; device, continuous, buf_len, kwargs...) - end - return OutputContext(hc) -end -function create_output_context_hostcall(io; buf_len, kwargs...) - hc = HostCall(Int64, Tuple{LLVMPtr{DeviceStaticString{buf_len},AS.Global}}; buf_len, kwargs...) do bytes - str = unsafe_load(reinterpret(LLVMPtr{DeviceStaticString{buf_len},AS.Global}, hc.buf_ptr)) - print(io, str) - return Int64(length(str)) - end - return hc -end - -const GLOBAL_OUTPUT_CONTEXT_TYPE = OutputContext{HostCall{Int64,Tuple{LLVMPtr{DeviceStaticString{2^16},AS.Global}}}} - -### macros - -macro rocprint(str...) - if first(str) isa String || Meta.isexpr(first(str), :string) - # No OutputContext - @gensym oc_ptr oc - ex = quote - $oc_ptr = $get_global_pointer($(Val(:__global_output_context)), - $GLOBAL_OUTPUT_CONTEXT_TYPE) - $oc = Base.unsafe_load($oc_ptr) - end - push!(ex.args, rocprint(oc, str...)) - return esc(ex) - else - return esc(rocprint(first(str), str[2:end]...)) - end -end -macro rocprintln(str...) - if first(str) isa String || Meta.isexpr(first(str), :string) - # No OutputContext - @gensym oc_ptr oc - ex = quote - $oc_ptr = $get_global_pointer($(Val(:__global_output_context)), - $GLOBAL_OUTPUT_CONTEXT_TYPE) - $oc = Base.unsafe_load($oc_ptr) - end - push!(ex.args, rocprint(oc, str..., '\n')) - return esc(ex) - else - return esc(rocprint(first(str), str[2:end]..., '\n')) - end -end +const OUTPUT_CONTEXT_TYPE = HostCall{ + Nothing, Tuple{LLVMPtr{DeviceStaticString{2^16}, AS.Global}}} -### parse-time helpers +const PRINTF_OUTPUT_CONTEXT_TYPE = HostCall{ + Nothing, Tuple{LLVMPtr{UInt8, AS.Global}}} function rocprint(oc, str...) ex = Expr(:block) @@ -74,82 +21,107 @@ function rocprint(oc, str...) @assert s.head == :string push!(strs, s) end - push!(ex.args, :($hostcall_device_lock!($oc.hostcall))) + push!(ex.args, :($hostcall_device_lock!($oc))) N = 1 + # Write strings & null termination to hostcall buffer. for str in strs N = rocprint!(ex, N, oc, str) end rocprint!(ex, N, oc, '\0') - push!(ex.args, :($hostcall_device_trigger_and_return!($oc.hostcall))) + # Make host read args, execute function & wait for return. + push!(ex.args, :($hostcall_device_trigger_and_return!($oc))) push!(ex.args, :(nothing)) return ex end + function rocprint!(ex, N, oc, str::String) @gensym str_ptr push!(ex.args, :($str_ptr = $alloc_string($(Val(Symbol(str)))))) - push!(ex.args, :($memcpy!($oc.hostcall.buf_ptr+$(N-1), $str_ptr, $(length(str))))) - return N+length(str) + push!(ex.args, :($memcpy!( + $oc.buf_ptr + $(N - 1), $str_ptr, $(length(str))))) + return N + length(str) end + function rocprint!(ex, N, oc, char::Char) @assert length(codeunits(string(char))) == 1 "Multi-codeunit chars not yet implemented" byte = UInt8(char) - ptr = :(reinterpret($(LLVMPtr{UInt8,AS.Global}), $oc.hostcall.buf_ptr)) + ptr = :(reinterpret($(LLVMPtr{UInt8, AS.Global}), $oc.buf_ptr)) push!(ex.args, :(Base.unsafe_store!($ptr, $byte, $N))) - return N+1 + return N + 1 end + function rocprint!(ex, N, oc, iex::Expr) for arg in iex.args N = rocprint!(ex, N, oc, arg) end return N end -function rocprint!(ex, N, oc, sym::S) where S + +function rocprint!(ex, N, oc, ::S) where S error("Dynamic printing of $S only supported via @rocprintf") end -## @rocprintf - -# Serializes execution of a function within a wavefront -# From implementation by @jonathanvdc in CUDAnative.jl#419 -function wave_serialized(func::Function) - # Get the current thread's ID - thread_id = workitemIdx().x - 1 - - # Get the size of a wavefront - size = wavefrontsize() +macro rocprint(str...) + if first(str) isa String || Meta.isexpr(first(str), :string) + # No OutputContext + @gensym oc_ptr oc + ex = quote + $oc_ptr = $output_context() + $oc = Base.unsafe_load($oc_ptr) + end + push!(ex.args, rocprint(oc, str...)) + return esc(ex) + else + return esc(rocprint(first(str), str[2:end]...)) + end +end - local result - i = 0 - while i < size - if thread_id % size == i - result = func() +macro rocprintln(str...) + if first(str) isa String || Meta.isexpr(first(str), :string) + # No OutputContext + @gensym oc_ptr oc + ex = quote + $oc_ptr = $output_context() + $oc = Base.unsafe_load($oc_ptr) end - i += 1 + push!(ex.args, rocprint(oc, str..., '\n')) + return esc(ex) + else + return esc(rocprint(first(str), str[2:end]..., '\n')) end - return result end +# @rocprintf impementation. + struct ROCPrintfBuffer end + Base.sizeof(::ROCPrintfBuffer) = 0 -Base.unsafe_store!(::LLVMPtr{ROCPrintfBuffer,as} where as, x) = nothing -function Base.unsafe_load(ptr::LLVMPtr{ROCPrintfBuffer,as} where as) + +Base.unsafe_store!(::LLVMPtr{ROCPrintfBuffer, AS.Global}, x) = nothing + +# TODO add docs about format. +""" +Read from the printf buffer on the host from HostCall task. +""" +function Base.unsafe_load(ptr::LLVMPtr{ROCPrintfBuffer, AS.Global}) ptr = reinterpret(Ptr{UInt64}, ptr) - # Read number of argument blocks in buffer + # Read number of argument blocks in buffer. blocks = unsafe_load(ptr) ptr += sizeof(UInt64) - # Read pointer to format string + # Read pointer to format string. fmt_ptr = Ptr{UInt64}(unsafe_load(ptr)) ptr += sizeof(UInt64) - - # Read format string length + # Read format string length. fmt_len = unsafe_load(ptr) ptr += sizeof(UInt64) - # Read format string into host buffer + # Read format string into host buffer. fmt_buf = Vector{UInt8}(undef, fmt_len) - HSA.memory_copy(convert(Ptr{Cvoid}, pointer(fmt_buf)), convert(Ptr{Cvoid}, fmt_ptr), fmt_len) |> Runtime.check + HSA.memory_copy( + convert(Ptr{Cvoid}, pointer(fmt_buf)), + convert(Ptr{Cvoid}, fmt_ptr), fmt_len) |> Runtime.check fmt = String(fmt_buf) # Read arguments @@ -166,8 +138,7 @@ function Base.unsafe_load(ptr::LLVMPtr{ROCPrintfBuffer,as} where as) break end T = unsafe_pointer_to_objref(T_ptr) - - # Read argument + # Read argument. arg = unsafe_load(reinterpret(Ptr{T}, ptr)) push!(args, arg) ptr += sizeof(arg) @@ -176,59 +147,82 @@ function Base.unsafe_load(ptr::LLVMPtr{ROCPrintfBuffer,as} where as) block += 1 end - return (fmt, all_args) + return fmt, all_args end -function _rocprintf_fmt(ptr, fmt_ptr, fmt_len) +function _rocprintf_fmt(ptr::LLVMPtr{UInt64, AS.Global}, fmt_ptr, fmt_len::Int64) unsafe_store!(ptr, reinterpret(UInt64, fmt_ptr)) ptr += sizeof(UInt64) unsafe_store!(ptr, UInt64(fmt_len)) ptr += sizeof(UInt64) return ptr end -@generated function pointer_from_type(::Type{T}) where T - ptr = pointer_from_objref(T) - return UInt64(ptr) + +function _pointer_from_type(::Type{T}) where T + UInt64(pointer_from_objref(T)) end -function _rocprintf_arg(ptr, arg::T) where T - T_ptr = pointer_from_type(T) - unsafe_store!(ptr, T_ptr) + +function _rocprintf_arg(ptr::LLVMPtr{UInt64, AS.Global}, arg::T) where T + unsafe_store!(ptr, _pointer_from_type(T)) ptr += sizeof(UInt64) - unsafe_store!(reinterpret(LLVMPtr{T,1}, ptr), arg) + + unsafe_store!(reinterpret(LLVMPtr{T, AS.Global}, ptr), arg) ptr += sizeof(arg) - #= FIXME - ref_arg = Ref{T}(arg) - GC.@preserve ref_arg begin - ptr_arg = convert(DevicePtr{UInt8,AS.Global}, - convert(DevicePtr{T,AS.Global}, - Base.unsafe_convert(Ptr{T}, ref_arg))) - memcpy!(ptr, ptr_arg, sizeof(arg), Val(true)) - end - =# return ptr end -#= TODO: Not really useful until we can work with device-side strings -function _rocprintf_string(ptr, str::String) - @gensym T_str T_str_len str_ptr - quote - $T_str, $T_str_len = AMDGPU._rocprintf_T_str(String) - AMDGPU.Device.memcpy!($ptr, $T_str, $T_str_len) - $ptr += $T_str_len - unsafe_store!($ptr, UInt8(0)) - $ptr += 1 - $str_ptr = Base.unsafe_convert(DevicePtr{UInt8,AS.Generic}, $str_ptr) - $str_ptr = AMDGPU.Device.alloc_string($(Val(Symbol(str)))) - AMDGPU.Device.memcpy!($ptr, $str_ptr, $(length(str))) - $ptr += $(length(str)) - $ptr - end -end -@generated function _rocprintf_T_str(::Type{T}) where T - quote - (AMDGPU.Device.alloc_string($(Val(Symbol(repr(T))))), $(sizeof(repr(T)))) - end -end -=# + +# macro rocprintf(args...) +# mode = :group +# @assert first(args) isa Union{QuoteNode,String} "First argument must be an inline Symbol or String" +# if first(args) isa QuoteNode +# mode = args[1].value::Symbol +# args = args[2:end] +# @assert mode isa Symbol "Execution mode must be a Symbol" +# @assert mode in (:grid, :group, :wave, :lane) "Invalid execution mode: $mode" +# end + +# @assert first(args) isa String "@rocprintf format-string must be a String" +# fmt = args[1] +# args = args[2:end] + +# @gensym printf_hc device_ptr device_fmt_ptr write_size +# ex = quote +# # Load printf HostCall. +# $printf_hc = Base.unsafe_load($printf_output_context()) +# $device_ptr = reinterpret( +# $(LLVMPtr{UInt64, AS.Global}), $printf_hc.buf_ptr) +# # Allocate device-side format pointer. +# $device_fmt_ptr = $alloc_string($(Val(Symbol(fmt)))) +# # Lock hostcall buffer. +# $hostcall_device_lock!($printf_hc) +# # Write block count. +# Base.unsafe_store!($device_ptr, UInt64(1)) # TODO take into account mode +# $device_ptr += sizeof(UInt64) +# # Write fmt string pointer & its bytesize. +# $device_ptr = $_rocprintf_fmt( +# $device_ptr, $device_fmt_ptr, $(sizeof(fmt))) +# # Calculate total write size per args block. +# $write_size = +# $hostcall_device_args_size($(map(esc, args)...)) + # Space for arguments. +# $(length(args)) * sizeof(UInt64) + # Space for type tags. # TODO what if args are less than uint64? +# sizeof(UInt64) # Space for terminator. +# # TODO account for offset for different modes. +# end + +# # Write arguments & terminating null word. +# ex_args = Expr(:block) +# for arg in args +# push!(ex_args.args, :($device_ptr = $_rocprintf_arg( +# $device_ptr, $(esc(arg))))) +# end +# push!(ex_args.args, :(unsafe_store!($device_ptr, 0))) +# push!(ex.args, :(@device_execution_gate $mode $ex_args)) + +# # Submit & unlock hostcall buffer. +# push!(ex.args, :($hostcall_device_trigger_and_return!($printf_hc))) +# push!(ex.args, :(nothing)) +# ex +# end function unsafe_ceil(x, y) up = Core.Intrinsics.urem_int(x, y) > 0 @@ -237,13 +231,16 @@ end macro rocprintf(args...) mode = :group - @assert first(args) isa Union{QuoteNode,String} "First argument must be an inline Symbol or String" + @assert first(args) isa Union{QuoteNode,String} + "First argument must be an inline Symbol or String" if first(args) isa QuoteNode mode = args[1].value::Symbol args = args[2:end] @assert mode isa Symbol "Execution mode must be a Symbol" - @assert mode in (:grid, :group, :wave, :lane) "Invalid execution mode: $mode" + @assert mode in (:grid, :group, :wave, :lane) + "Invalid execution mode: $mode" end + @assert first(args) isa String "Format must be a String" fmt = args[1] args = args[2:end] @@ -264,10 +261,9 @@ macro rocprintf(args...) push!(ex.args, :($device_fmt_ptr = $alloc_string($(Val(Symbol(fmt)))))) # Load HostCall object - push!(ex.args, :($printf_hc = unsafe_load($get_global_pointer(Val(:__global_printf_context), - HostCall{Int64,Tuple{LLVMPtr{ROCPrintfBuffer,AS.Global}}})))) - push!(ex.args, :($device_ptr = reinterpret($(LLVMPtr{UInt64,AS.Global}), $printf_hc.buf_ptr))) - + push!(ex.args, :($printf_hc = unsafe_load($printf_output_context()))) + push!(ex.args, :($device_ptr = reinterpret( + $(LLVMPtr{UInt64,AS.Global}), $printf_hc.buf_ptr))) # Lock hostcall buffer push!(ex.args, :($hostcall_device_lock!($printf_hc))) @@ -278,21 +274,24 @@ macro rocprintf(args...) elseif mode == :group push!(ex.args, :(unsafe_store!($device_ptr, UInt64(1)))) elseif mode == :wave - waves_per_group = :($unsafe_ceil($workgroupDim().x, - $wavefrontsize())) - push!(ex.args, :(unsafe_store!($device_ptr, Base.unsafe_trunc(UInt64, $waves_per_group)))) + waves_per_group = :($unsafe_ceil($workgroupDim().x, $wavefrontsize())) + push!(ex.args, :(unsafe_store!( + $device_ptr, Base.unsafe_trunc(UInt64, $waves_per_group)))) elseif mode == :lane - push!(ex.args, :(unsafe_store!($device_ptr, Base.unsafe_trunc(UInt64, $workgroupDim().x)))) + push!(ex.args, :(unsafe_store!( + $device_ptr, Base.unsafe_trunc(UInt64, $workgroupDim().x)))) end push!(ex.args, :($device_ptr += sizeof(UInt64))) # Write format string pointer - push!(ex.args, :($device_ptr = $_rocprintf_fmt($device_ptr, $device_fmt_ptr, $(sizeof(fmt))))) + push!(ex.args, :($device_ptr = $_rocprintf_fmt( + $device_ptr, $device_fmt_ptr, $(sizeof(fmt))))) # Calculate total write size per args block - push!(ex.args, :($write_size = $hostcall_device_args_size($(map(esc, args)...)) # Space for arguments - + $(length(args))*sizeof(UInt64) + # Space for type tags - + sizeof(UInt64))) # Space for terminator + push!(ex.args, :($write_size = + $hostcall_device_args_size($(map(esc, args)...)) # Space for arguments + + $(length(args))*sizeof(UInt64) + # Space for type tags + + sizeof(UInt64))) # Space for terminator # Calulate offset into buffer # FIXME: Use y and z dims @@ -301,8 +300,8 @@ macro rocprintf(args...) elseif mode == :group :(0) elseif mode == :wave - wave_idx = :(Core.Intrinsics.udiv_int($workitemIdx().x - UInt32(1), - $wavefrontsize())) + wave_idx = :(Core.Intrinsics.udiv_int( + $workitemIdx().x - UInt32(1), $wavefrontsize())) :($wave_idx * $write_size) elseif mode == :lane lane_idx = :(workitemIdx().x - 1) @@ -313,7 +312,8 @@ macro rocprintf(args...) # Write arguments and terminating null word ex_args = Expr(:block) for arg in args - push!(ex_args.args, :($device_ptr = $_rocprintf_arg($device_ptr, $(esc(arg))))) + push!(ex_args.args, :($device_ptr = $_rocprintf_arg( + $device_ptr, $(esc(arg))))) end push!(ex_args.args, :(unsafe_store!($device_ptr, 0))) push!(ex.args, :(@device_execution_gate $mode $ex_args)) @@ -323,3 +323,57 @@ macro rocprintf(args...) push!(ex.args, :(nothing)) ex end + +@inline _to_linear(w, h, i, j, k) = + i + w * (j - 1 + ((k - 1) * h)) + +macro ⊡(exec_ex) + @gensym x y z value + quote + $x = $workitemIdx().x + ($workgroupIdx().x - UInt32(1)) * $workgroupDim().x + $y = $workitemIdx().y + ($workgroupIdx().y - UInt32(1)) * $workgroupDim().y + $z = $workitemIdx().z + ($workgroupIdx().z - UInt32(1)) * $workgroupDim().z + $value = _to_linear( + UInt64($gridItemDim().x), UInt64($gridItemDim().y), + UInt64($x), UInt64($y), UInt64($z)) + if gate!($value) + $(esc(exec_ex)) + end + end +end + +macro errprintf(args...) + fmt, args = args[1], args[2:end] + @assert fmt isa String "@errprintf format-string must be a String: $fmt." + + @gensym buffer_ptr device_fmt_str write_size + err_ex = quote + $buffer_ptr = $err_buffer!() + reinterpret(UInt64, $buffer_ptr) == 0 && return + $device_fmt_str = $alloc_string($(Val(Symbol(fmt)))) + # Write block count (compat with printf, not used). + Base.unsafe_store!($buffer_ptr, UInt64(1)) + $buffer_ptr += sizeof(UInt64) + # Write fmt string pointer & its bytesize. + $buffer_ptr = $_rocprintf_fmt( + $buffer_ptr, $device_fmt_str, $(sizeof(fmt))) + # Calculate total write size per args block. + $write_size = + $hostcall_device_args_size($(map(esc, args)...)) + # Space for arguments. + $(length(args)) * sizeof(UInt64) + # Space for type tags. + sizeof(UInt64) # Space for terminator. + end + + # Write arguments & terminating null word. + for arg in args + push!(err_ex.args, + :($buffer_ptr = $_rocprintf_arg($buffer_ptr, $(esc(arg))))) + end + push!(err_ex.args, :(unsafe_store!($buffer_ptr, 0))) + + # Pass through ⊡ gate. + ex = Expr(:block) + push!(ex.args, :(@⊡ $err_ex)) + push!(ex.args, :(nothing)) + ex +end diff --git a/src/device/globals.jl b/src/device/globals.jl index 5b6936460..125489965 100644 --- a/src/device/globals.jl +++ b/src/device/globals.jl @@ -5,8 +5,8 @@ # space. @inline @generated function get_global_pointer(::Val{global_name}, ::Type{T})::LLVMPtr{T} where {global_name, T} @dispose ctx=Context() begin - T_global = convert(LLVMType, T; ctx) - T_result = convert(LLVMType, Ptr{T}; ctx) + T_global = convert(LLVMType, T) + T_result = convert(LLVMType, Ptr{T}) # Create a thunk that computes a pointer to the global. llvm_f, _ = create_function(T_result) @@ -27,8 +27,8 @@ end # Generate IR that computes the global's address. - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") position!(builder, entry) # Cast the global variable's type to the result type. diff --git a/src/device/quirks.jl b/src/device/quirks.jl index 0322ebbb3..f85dfc69e 100644 --- a/src/device/quirks.jl +++ b/src/device/quirks.jl @@ -1,56 +1,57 @@ # Copied from CUDA.jl/src/device/quirks.jl -macro print_and_throw(arg) +macro print_and_throw(description) quote - str = $alloc_string($(Val(Symbol(arg)))) - $device_report_exception(reinterpret(Ptr{Cchar}, str)) - # FIXME: Report exception frames - signal_exception() + # FIXME + # all functions that take part in exception reporting are not inlined + # @errprintf($description) + throw(nothing) end end # math.jl -@device_override @noinline Base.Math.throw_complex_domainerror(f::Symbol, x) = - @print_and_throw "This operation requires a complex input to return a complex result" -@device_override @noinline Base.Math.throw_exp_domainerror(f::Symbol, x) = - @print_and_throw "Exponentiation yielding a complex result requires a complex argument" +@device_override Base.Math.throw_complex_domainerror(f::Symbol, x) = + @print_and_throw "This operation requires a complex input to return a complex result.\n" +@device_override Base.Math.throw_exp_domainerror(f::Symbol, x) = + @print_and_throw "Exponentiation yielding a complex result requires a complex argument.\n" # intfuncs.jl -@device_override @noinline Base.throw_domerr_powbysq(::Any, p) = - @print_and_throw "Cannot raise an integer to a negative power" -@device_override @noinline Base.throw_domerr_powbysq(::Integer, p) = - @print_and_throw "Cannot raise an integer to a negative power" -@device_override @noinline Base.throw_domerr_powbysq(::AbstractMatrix, p) = - @print_and_throw "Cannot raise an integer to a negative power" -@device_override @noinline Base.__throw_gcd_overflow(a, b) = - @print_and_throw "gcd overflow" +@device_override Base.throw_domerr_powbysq(::Any, p) = + @print_and_throw "Cannot raise an integer to a negative power.\n" +@device_override Base.throw_domerr_powbysq(::Integer, p) = + @print_and_throw "Cannot raise an integer to a negative power.\n" +@device_override Base.throw_domerr_powbysq(::AbstractMatrix, p) = + @print_and_throw "Cannot raise an integer to a negative power.\n" +@device_override Base.__throw_gcd_overflow(a, b) = + @print_and_throw "GCD overflow.\n" # checked.jl -@device_override @noinline Base.Checked.throw_overflowerr_binaryop(op, x, y) = - @print_and_throw "Binary operation overflowed" -@device_override @noinline Base.Checked.throw_overflowerr_negation(op, x, y) = - @print_and_throw "Negation overflowed" +@device_override Base.Checked.throw_overflowerr_binaryop(op, x, y) = + @print_and_throw "Binary operation overflowed.\n" +@device_override Base.Checked.throw_overflowerr_negation(op, x, y) = + @print_and_throw "Negation overflowed.\n" @device_override function Base.Checked.checked_abs(x::Base.Checked.SignedInt) - r = ifelse(x<0, -x, x) - r<0 && @print_and_throw("checked arithmetic: cannot compute |x|") + r = ifelse(x < 0, -x, x) + r < 0 && @print_and_throw("checked arithmetic: cannot compute |x|.\n") r end # boot.jl -@device_override @noinline Core.throw_inexacterror(f::Symbol, ::Type{T}, val) where {T} = - @print_and_throw "Inexact conversion" +@device_override Core.throw_inexacterror(f::Symbol, ::Type{T}, val) where {T} = + @print_and_throw "Inexact conversion.\n" # abstractarray.jl -@device_override @noinline Base.throw_boundserror(A, I) = - @print_and_throw "Out-of-bounds array access" +@device_override Base.throw_boundserror(A, I) = + @print_and_throw "Out-of-bounds array access.\n" # trig.jl -@device_override @noinline Base.Math.sincos_domain_error(x) = - @print_and_throw "sincos(x) is only defined for finite x." +@device_override Base.Math.sincos_domain_error(x) = + @print_and_throw "sincos(x) is only defined for finite x.\n" # multidimensional.jl -@device_override Base.@propagate_inbounds function Base.getindex(iter::CartesianIndices{N,R}, - I::Vararg{Int, N}) where {N,R} +@device_override Base.@propagate_inbounds function Base.getindex( + iter::CartesianIndices{N,R}, I::Vararg{Int, N}, +) where {N,R} @boundscheck checkbounds(iter, I...) index = map(iter.indices, I) do r, i @inbounds getindex(r, i) @@ -60,30 +61,31 @@ end # range.jl @eval begin - @device_override function Base.StepRangeLen{T,R,S,L}(ref::R, step::S, len::Integer, - offset::Integer=1) where {T,R,S,L} + @device_override function Base.StepRangeLen{T,R,S,L}( + ref::R, step::S, len::Integer, offset::Integer=1, + ) where {T,R,S,L} if T <: Integer && !isinteger(ref + step) - @print_and_throw("StepRangeLen{<:Integer} cannot have non-integer step") + @print_and_throw("StepRangeLen{<:Integer} cannot have non-integer step.\n") end len = convert(L, len) - len >= zero(len) || @print_and_throw("StepRangeLen length cannot be negative") + len >= zero(len) || @print_and_throw("StepRangeLen length cannot be negative.\n") offset = convert(L, offset) L1 = oneunit(typeof(len)) - L1 <= offset <= max(L1, len) || @print_and_throw("StepRangeLen: offset must be in [1,...]") - $( - Expr(:new, :(StepRangeLen{T,R,S,L}), :ref, :step, :len, :offset) - ) + L1 <= offset <= max(L1, len) || @print_and_throw("StepRangeLen: offset must be in [1,...].\n") + $(Expr(:new, :(StepRangeLen{T,R,S,L}), :ref, :step, :len, :offset)) end end # LinearAlgebra @static if VERSION >= v"1.8-" - @device_override function Base.setindex!(D::LinearAlgebra.Diagonal, v, i::Int, j::Int) + @device_override function Base.setindex!( + D::LinearAlgebra.Diagonal, v, i::Int, j::Int, + ) @boundscheck checkbounds(D, i, j) if i == j @inbounds D.diag[i] = v elseif !iszero(v) - @print_and_throw("cannot set off-diagonal entry to a nonzero value") + @print_and_throw("Cannot set off-diagonal entry to a nonzero value.\n") end return v end diff --git a/src/device/runtime.jl b/src/device/runtime.jl index cc0a037d2..3cf521c8a 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -1,84 +1,135 @@ +using Core: LLVMPtr # ROCm-specific runtime libraries - ## GPU runtime library # reset the runtime cache from global scope, so that any change triggers recompilation GPUCompiler.reset_runtime() -signal_exception() = device_signal_exception() -function device_signal_exception() - flag_ptr = get_global_pointer(Val(:__global_exception_flag), Int64) - unsafe_store!(flag_ptr, 1) +@inline @generated kernel_state() = GPUCompiler.kernel_state_value(AMDGPU.KernelState) + +exception_flag() = kernel_state().exception_flag + +function err_buffer!() + st = kernel_state() + counter_ptr = reinterpret(LLVMPtr{Int32, AS.Global}, st.buffers_counter) + idx = atomic_add!(counter_ptr, Int32(1)) + Int32(1) + idx > st.n_buffers && return reinterpret(LLVMPtr{UInt64, AS.Global}, 0) + + buf = unsafe_load(st.buffers, idx) + reinterpret(LLVMPtr{UInt64, AS.Global}, buf) +end + +function err_str_buffer!() + st = kernel_state() + counter_ptr = reinterpret(LLVMPtr{Int32, AS.Global}, st.str_buffers_counter) + idx = atomic_add!(counter_ptr, Int32(1)) + Int32(1) + idx > st.n_str_buffers && return reinterpret(LLVMPtr{UInt8, AS.Global}, 0) + + buf = unsafe_load(kernel_state().string_buffers, idx) + reinterpret(LLVMPtr{UInt8, AS.Global}, buf) +end + +function gate!(value::UInt64)::Bool + gate_ptr = reinterpret(LLVMPtr{UInt64, AS.Global}, kernel_state().gate) + old_value = atomic_cas!(gate_ptr, UInt64(0), value) + ifelse(iszero(old_value), true, value == old_value) +end + +function output_context() + ptr = convert(Ptr{OUTPUT_CONTEXT_TYPE}, kernel_state().output_context) + + x = alloc_local(:__print_hostcall, UInt64, 1) + unsafe_store!(x, reinterpret(UInt64, ptr)) + return ptr +end + +function printf_output_context() + ptr = convert( + Ptr{PRINTF_OUTPUT_CONTEXT_TYPE}, + kernel_state().printf_output_context) - # stop this wavefront + x = alloc_local(:__printf_hostcall, UInt64, 1) + unsafe_store!(x, reinterpret(UInt64, ptr)) + return ptr +end + +function malloc_hc() + ptr = convert( + Ptr{HostCall{Ptr{Cvoid}, Tuple{Csize_t}}}, + kernel_state().malloc_hc) + + # FIXME + # Hack to detect when global malloc hostcall is used. + # Create global variable and write pointer to it to prevent it + # from being optimized away. + x = alloc_local(:__malloc_hostcall, UInt64, 1) + unsafe_store!(x, reinterpret(UInt64, ptr)) + return ptr +end + +function free_hc() + ptr = convert( + Ptr{HostCall{Nothing, Tuple{Ptr{Cvoid}}}}, + kernel_state().free_hc) + + x = alloc_local(:__free_hostcall, UInt64, 1) + unsafe_store!(x, reinterpret(UInt64, ptr)) + return ptr +end + +function signal_exception() + unsafe_store!(exception_flag(), Int32(1)) + # Without endpgm we'll get hardware exception. endpgm() - trap() + return end -function device_string_to_host(ex) - # We get a ReadOnlyMemoryError on the host without making a copy because the data is pinned to the device - ex_ptr = reinterpret(LLVMPtr{UInt8,1}, ex) - ex_len = string_length(ex_ptr) - # TODO: Don't use an expensive host malloc - ex_str = reinterpret(LLVMPtr{UInt8,1}, device_malloc(Csize_t(ex_len+1))) - if reinterpret(UInt64, ex_str) == 0 - @rocprintf("Device-to-host string conversion failed\n") - return reinterpret(Cstring, 0) - end - memcpy!(ex_str, ex_ptr, ex_len) - unsafe_store!(ex_str+ex_len, UInt8(0)) - reinterpret(Cstring, ex_str) +function err_device_string_to_host(str::Ptr{Cchar}) + host_str = reinterpret(LLVMPtr{UInt8, AS.Global}, C_NULL) + @⊡ host_str = err_str_buffer!() + reinterpret(UInt64, host_str) == 0 && return reinterpret(Cstring, 0) + + str_ptr = reinterpret(LLVMPtr{UInt8, AS.Global}, str) + str_len = string_length(str_ptr) + + # Copy `ex` to allocated memory & null termination. + memcpy!(host_str, str_ptr, str_len) + unsafe_store!(host_str + str_len, UInt8(0)) + return reinterpret(Cstring, host_str) +end + +function report_oom(sz::Csize_t) + # @errprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) + return end -report_exception(ex) = device_report_exception(ex) -function device_report_exception(ex::Ptr{Cchar}) - # Add kernel ID and exception string to exception ring buffer - ring_ptr = get_global_pointer(Val(:__global_exception_ring), LLVMPtr{ExceptionEntry,AS.Global}) - ring_ptr = unsafe_load(ring_ptr) - our_signal = _completion_signal() - prev = UInt64(1) - while prev != UInt64(0) - # Try to write to this slot, and skip if we fail (because another wavefront wrote first) - prev = atomic_cas!(reinterpret(LLVMPtr{UInt64,AS.Global}, ring_ptr), UInt64(0), our_signal) - if prev == UInt64(0) - ex_str = device_string_to_host(ex) - Base.unsafe_store!(reinterpret(LLVMPtr{UInt64,AS.Global}, ring_ptr+sizeof(UInt64)), reinterpret(UInt64, ex_str)) - break - elseif prev == UInt64(1) - # Tail slot, give up - break - end - ring_ptr += sizeof(ExceptionEntry) - end +function report_exception(ex::Ptr{Cchar}) + # ex_str = err_device_string_to_host(ex) + # @errprintf(""" + # ERROR: a %s was thrown during kernel execution. + # Run Julia on debug level 2 for device stack traces. + # """, ex_str) return end -report_oom(sz) = device_report_oom(sz) -device_report_oom(sz::Csize_t) = - @rocprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) - -report_exception_name(ex) = device_report_exception_name(ex) -function device_report_exception_name(ex::Ptr{Cchar}) - device_report_exception(ex) - # Pass argument in host buffer - ex_str = device_string_to_host(ex) - @rocprintf(""" - ERROR: a %s was thrown during kernel execution. - Stacktrace: - """, ex_str) - device_free(reinterpret(Ptr{Cvoid}, ex_str)) +function report_exception_name(ex::Ptr{Cchar}) + # ex_str = err_device_string_to_host(ex) + # @errprintf(""" + # ERROR: a %s was thrown during kernel execution. + # Stacktrace: + # """, ex_str) return end -report_exception_frame(idx, func, file, line) = - device_report_exception_frame(idx, func, file, line) -function device_report_exception_frame(idx::Cint, func::Ptr{Cchar}, file::Ptr{Cchar}, line::Cint) - # Pass arguments in host buffers - func_str = device_string_to_host(func) - file_str = device_string_to_host(file) - @rocprintf(" [%i] %s at %s:%i\n", idx, func_str, file_str, line) - device_free(reinterpret(Ptr{Cvoid}, func_str)) - device_free(reinterpret(Ptr{Cvoid}, file_str)) +function report_exception_frame( + idx::Cint, func::Ptr{Cchar}, file::Ptr{Cchar}, line::Cint, +) + # func_str = err_device_string_to_host(func) + # file_str = err_device_string_to_host(file) + # @errprintf(""" + # [%i] %s + # @ %s:%i + # """, idx, func_str, file_str, line) return end diff --git a/src/device/strings.jl b/src/device/strings.jl index 48e492c62..ffd6a39bd 100644 --- a/src/device/strings.jl +++ b/src/device/strings.jl @@ -2,21 +2,21 @@ @generated function string_length(ex::Union{Ptr,LLVMPtr}) @dispose ctx=Context() begin - T_ex = convert(LLVMType, ex; ctx) + T_ex = convert(LLVMType, ex) T_ex_ptr = LLVM.PointerType(T_ex) - T_i8 = LLVM.Int8Type(ctx) + T_i8 = LLVM.Int8Type() T_i8_ptr = LLVM.PointerType(T_i8) - T_i64 = LLVM.Int64Type(ctx) + T_i64 = LLVM.Int64Type() llvm_f, _ = create_function(T_i64, [T_ex]) mod = LLVM.parent(llvm_f) - @dispose builder=IRBuilder(ctx) begin - entry = BasicBlock(llvm_f, "entry"; ctx) - check = BasicBlock(llvm_f, "check"; ctx) - done = BasicBlock(llvm_f, "done"; ctx) + @dispose builder=IRBuilder() begin + entry = BasicBlock(llvm_f, "entry") + check = BasicBlock(llvm_f, "check") + done = BasicBlock(llvm_f, "done") position!(builder, entry) - init_offset = ConstantInt(0; ctx) + init_offset = ConstantInt(0) input_ptr = if T_ex isa LLVM.PointerType parameters(llvm_f)[1] else @@ -30,12 +30,12 @@ position!(builder, check) offset = phi!(builder, T_i64) - next_offset = add!(builder, offset, ConstantInt(1; ctx)) + next_offset = add!(builder, offset, ConstantInt(1)) append!(LLVM.incoming(offset), [(init_offset, entry), (next_offset, check)]) ptr = gep!(builder, T_i8, input_ptr, [offset]) value = load!(builder, T_i8, ptr) - cond = icmp!(builder, LLVM.API.LLVMIntEQ, value, ConstantInt(0x0; ctx)) + cond = icmp!(builder, LLVM.API.LLVMIntEQ, value, ConstantInt(0x0)) br!(builder, cond, done, check) position!(builder, done) diff --git a/src/discovery_utils.jl b/src/discovery_utils.jl new file mode 100644 index 000000000..f42cb3f92 --- /dev/null +++ b/src/discovery_utils.jl @@ -0,0 +1,155 @@ +function detect_projects() + amdgpu_project = normpath(joinpath(@__DIR__, "..")) + current_project = Base.ACTIVE_PROJECT[] + julia_project = if Base.JLOptions().project != C_NULL + unsafe_string(Base.JLOptions().project) + elseif current_project !== nothing + current_project + else + amdgpu_project + end + return (;amdgpu_project, current_project, julia_project) +end + +julia_exeflags(projects = detect_projects()) = + String["--startup-file=no", "--project=$(projects.julia_project)"] + +function julia_cmd_projects(jl_str) + projects = detect_projects() + + cmd = Base.julia_cmd() + append!(cmd.exec, julia_exeflags(projects)) + + (;amdgpu_project, current_project, julia_project) = projects + if current_project !== nothing + jl_str = "push!(LOAD_PATH, \"$current_project\");" * jl_str + end + jl_str = "push!(LOAD_PATH, \"$amdgpu_project\");" * jl_str + append!(cmd.exec, ("-e", jl_str)) + return cmd +end + +function safe_exec(str) + cmd = julia_cmd_projects(str) + success = false + error_str = mktemp() do path, _ + p = run(pipeline(cmd; stdout=path, stderr=path); wait=false) + wait(p) + success = p.exitcode == 0 + String(read(path)) + end + return success, error_str +end + +function safe_import(pkg) + loaded, error_str = safe_exec("import $pkg") + loaded || return loaded, false, error_str + + @eval import $pkg + available = @eval(isdefined($pkg, :is_available)) && @eval($pkg.is_available()) + return loaded, available, error_str +end + +function find_rocm_library(lib, dirs, ext=dlext) + path = Libdl.find_library(lib) + if path != "" + return Libdl.dlpath(path) + end + for dir in dirs + files = readdir(dir) + for file in files + matched = startswith(basename(file), lib*".$ext") + if matched + return joinpath(dir, file) + end + end + end + return "" +end + +function find_roc_paths() + paths = split(get(ENV, "LD_LIBRARY_PATH", ""), ":") + paths = filter(path -> path != "", paths) + paths = map(Base.Filesystem.abspath, paths) + push!(paths, "/opt/rocm/lib") # shim for Ubuntu rocm packages... + if haskey(ENV, "ROCM_PATH") + push!(paths, joinpath(ENV["ROCM_PATH"], "lib")) + end + return filter(isdir, paths) +end + +function find_rocm_library(libs::Vector, dirs) + for lib in libs + path = find_rocm_library(lib, dirs) + if path != "" + return path + end + end + return "" +end + +function find_ld_lld() + paths = split(get(ENV, "PATH", ""), ":") + paths = filter(path -> path != "", paths) + paths = map(Base.Filesystem.abspath, paths) + basedir = get(ENV, "ROCM_PATH", "/opt/rocm") + ispath(joinpath(basedir, "llvm/bin/ld.lld")) && + push!(paths, joinpath(basedir, "llvm/bin/")) + ispath(joinpath(basedir, "hcc/bin/ld.lld")) && + push!(paths, joinpath(basedir, "/hcc/bin/")) + ispath(joinpath(basedir, "opencl/bin/x86_64/ld.lld")) && + push!(paths, joinpath(basedir, "opencl/bin/x86_64/")) + for path in paths + exp_ld_path = joinpath(path, "ld.lld") + if ispath(exp_ld_path) + try + tmpfile = mktemp() + run(pipeline(`$exp_ld_path -v`; stdout=tmpfile[1])) + vstr = read(tmpfile[1], String) + rm(tmpfile[1]) + vstr = replace(vstr, "AMD " => "") + vstr_splits = split(vstr, ' ') + if VersionNumber(vstr_splits[2]) >= v"6.0.0" + return exp_ld_path + end + catch + @debug "bindeps: Failed running ld.lld in $exp_ld_path" + end + end + end + return "" +end + +function find_device_libs() + # Might be set by tools like Spack or the user + hip_devlibs_path = get(ENV, "HIP_DEVICE_LIB_PATH", "") + hip_devlibs_path !== "" && return hip_devlibs_path + devlibs_path = get(ENV, "DEVICE_LIB_PATH", "") + devlibs_path !== "" && return devlibs_path + + # The canonical location + if isdir("/opt/rocm/amdgcn/bitcode") + return "/opt/rocm/amdgcn/bitcode" + end + + # Search relative to LD_LIBRARY_PATH entries + paths = split(get(ENV, "LD_LIBRARY_PATH", ""), ":") + paths = filter(path -> path != "", paths) + paths = map(Base.Filesystem.abspath, paths) + for path in paths + bitcode_path = joinpath(path, "../amdgcn/bitcode/") + if ispath(bitcode_path) + if isfile(joinpath(bitcode_path, "ocml.bc")) || + isfile(joinpath(bitcode_path, "ocml.amdgcn.bc")) + return bitcode_path + end + end + end + return nothing +end + +function populate_globals!(config) + for (key,val) in config + @eval const $key = $val + end +end diff --git a/src/dnn/MIOpen.jl b/src/dnn/MIOpen.jl index b619748b4..2fb5d8fb5 100644 --- a/src/dnn/MIOpen.jl +++ b/src/dnn/MIOpen.jl @@ -1,15 +1,13 @@ module MIOpen +using CEnum + using ..AMDGPU +import AMDGPU: ROCArray, LockedObject, HandleCache, HIP, library_state import AMDGPU.Runtime.Mem -import AMDGPU: ROCArray, ROCDevice, LockedObject -import AMDGPU: HandleCache, HIP, library_state import .HIP: hipStream_t -using CEnum -using GPUArrays - -if AMDGPU.use_artifacts && AMDGPU.functional(:MIOpen) +if AMDGPU.use_artifacts() && AMDGPU.functional(:MIOpen) using MIOpen_jll const libMIOpen_path = MIOpen_jll.libMIOpen_path else @@ -92,15 +90,6 @@ lib_state() = library_state( handle() = lib_state().handle stream() = lib_state().stream -mutable struct Workspace - data::Mem.Buffer - function Workspace(dev::ROCDevice, bytesize) - w = new(Mem.alloc(dev, bytesize)) - finalizer(w_ -> Mem.free(w_.data), w) - w - end -end - include("descriptors.jl") include("convolution.jl") include("pooling.jl") diff --git a/src/dnn/activations.jl b/src/dnn/activations.jl index 1db7635c5..ec0bae2b2 100644 --- a/src/dnn/activations.jl +++ b/src/dnn/activations.jl @@ -117,12 +117,10 @@ function _activation( ) where T <: MIOPENFloat y = similar(x) xdesc, ydesc = TensorDescriptor.((x, y)) - AMDGPU.wait!(x) (; handle, stream) = lib_state() miopenActivationForward( handle, desc.handle, Ref{Float32}(1f0), xdesc.handle, x, Ref{Float32}(0f0), ydesc.handle, y) |> check - AMDGPU.mark!(y, stream) y end @@ -131,12 +129,10 @@ function _∇activation( ) where T <: MIOPENFloat dx = similar(x) xdesc, ydesc, dydesc, dxdesc = TensorDescriptor.((x, y, dy, dx)) - AMDGPU.wait!((x, y, dy)) (; handle, stream) = lib_state() miopenActivationBackward( handle, desc, Ref{Float32}(1f0), ydesc.handle, y, dydesc.handle, dy, xdesc.handle, x, Ref{Float32}(0f0), dxdesc.handle, dx) |> check - AMDGPU.mark!(dx, stream) dx end diff --git a/src/dnn/batchnorm.jl b/src/dnn/batchnorm.jl index 2178beb2e..1e1c4d705 100644 --- a/src/dnn/batchnorm.jl +++ b/src/dnn/batchnorm.jl @@ -32,13 +32,11 @@ function batchnorm_training( # For backward pass. μ_saved, ν_saved = similar(x, n_features), similar(x, n_features) - AMDGPU.wait!((x, γ, β, μ, ν)) (; handle, stream) = lib_state() miopenBatchNormalizationForwardTraining( handle, mode, Ref{Float32}(1f0), Ref{Float32}(0f0), xdesc.handle, x, ydesc.handle, y, bndesc.handle, γ, β, factor, μ, ν, ϵ, μ_saved, ν_saved) |> check - AMDGPU.mark!(y, stream) y, μ_saved, ν_saved end @@ -71,13 +69,11 @@ function batchnorm_inference( xdesc, ydesc = TensorDescriptor4D.((x, y)) bndesc = derive_beta_gamma_descriptors(xdesc, mode) - AMDGPU.wait!((x, γ, β, μ, ν)) (; handle, stream) = lib_state() miopenBatchNormalizationForwardInference( handle, mode, Ref{Float32}(1f0), Ref{Float32}(0f0), xdesc.handle, x, ydesc.handle, y, bndesc.handle, γ, β, μ, ν, ϵ) |> check - AMDGPU.mark!(y, stream) y end @@ -93,7 +89,6 @@ function ∇batchnorm( xdesc, dxdesc, dydesc = TensorDescriptor4D.((x, dx, dy)) bndesc = derive_beta_gamma_descriptors(xdesc, mode) - AMDGPU.wait!((x, dy, γ, β, μ_saved, ν_saved)) (; handle, stream) = lib_state() miopenBatchNormalizationBackward( handle, mode, @@ -101,7 +96,6 @@ function ∇batchnorm( Ref{Float32}(1f0), Ref{Float32}(0f0), xdesc.handle, x, dydesc.handle, dy, dxdesc.handle, dx, bndesc.handle, γ, dγ, dβ, ϵ, μ_saved, ν_saved) |> check - AMDGPU.mark!((dx, dγ, dβ), stream) dx, dγ, dβ end diff --git a/src/dnn/convolution.jl b/src/dnn/convolution.jl index 2cb491f5c..da1224119 100644 --- a/src/dnn/convolution.jl +++ b/src/dnn/convolution.jl @@ -1,10 +1,7 @@ -# TODO free workspace once used - const CONV_ALGOS = Union{ Type{miopenConvFwdAlgorithm_t}, Type{miopenConvBwdWeightsAlgorithm_t}, - Type{miopenConvBwdDataAlgorithm_t}, -} + Type{miopenConvBwdDataAlgorithm_t}} # Struct for hashing convolution arguments. struct ConvolutionArgs @@ -40,12 +37,12 @@ get_conv_cache_type(::Type{miopenConvFwdAlgorithm_t}) = CONV_FWD_BENCHMARK_CACHE get_conv_cache_type(::Type{miopenConvBwdDataAlgorithm_t}) = CONV_BWD_DATA_BENCHMARK_CACHE get_conv_cache_type(::Type{miopenConvBwdWeightsAlgorithm_t}) = CONV_BWD_WEIGHT_BENCHMARK_CACHE -function get_benchmark_cache(conv_type::C, conv_args, dev) where C <: CONV_ALGOS +function get_benchmark_cache(conv_type::C, conv_args) where C <: CONV_ALGOS perf_results = lock(get_conv_cache_type(conv_type)) do cache get(cache, conv_args, nothing) end isnothing(perf_results) && return nothing - workspace = Workspace(dev, perf_results.memory) + workspace = ROCArray{UInt8}(undef, perf_results.memory) perf_results, workspace end @@ -85,8 +82,7 @@ function find_conv_algo( handle, a_desc.handle, a, b_desc.handle, b, conv_desc.handle, c_desc.handle, c, n_algos, perf_count_ref, perf_results_ref, - workspace.data.ptr, workspace.data.bytesize, - exhaustive_search) |> check + workspace, length(workspace), exhaustive_search) |> check perf_results_ref[] end @@ -94,15 +90,14 @@ function find_algorithm( conv_type::C, handle::miopenHandle_t, conv_args::ConvolutionArgs, a, a_desc, b, b_desc, conv_desc, c, c_desc, ) where C <: CONV_ALGOS - dev = GPUArrays.device(a) - cache = get_benchmark_cache(conv_type, conv_args, dev) + cache = get_benchmark_cache(conv_type, conv_args) isnothing(cache) || return cache - workspace = Workspace(dev, 0) + workspace = ROCArray{UInt8}(undef, 0) perf_results = find_conv_algo(conv_type; handle, workspace, a, a_desc, b, b_desc, conv_desc, c, c_desc) set_benchmark_cache!(conv_type, conv_args, perf_results) - workspace = Workspace(dev, perf_results.memory) + workspace = ROCArray{UInt8}(undef, perf_results.memory) perf_results, workspace end @@ -127,12 +122,11 @@ function convolution!( miopenConvFwdAlgorithm_t, handle, conv_args, x, xdesc, w, wdesc, cdesc, y, ydesc) - AMDGPU.wait!((x, y, w)) miopenConvolutionForward( handle, Ref{Float32}(1f0), xdesc.handle, x, wdesc.handle, w, cdesc.handle, perf_results.fwd_algo, Ref{Float32}(0f0), ydesc.handle, y, - workspace.data.ptr, perf_results.memory) |> check - AMDGPU.mark!(y, stream) + workspace, perf_results.memory) |> check + AMDGPU.unsafe_free!(workspace) y end @@ -171,12 +165,11 @@ function ∇convolution_weight!( perf_algo, workspace = find_algorithm( miopenConvBwdWeightsAlgorithm_t, handle, conv_args, dy, dydesc, x, xdesc, cdesc, ∇w, ∇wdesc) - AMDGPU.wait!((∇w, dy, x)) miopenConvolutionBackwardWeights( handle, Ref{Float32}(1f0), dydesc.handle, dy, xdesc.handle, x, cdesc.handle, perf_algo.bwd_weights_algo, Ref{Float32}(0f0), ∇wdesc.handle, ∇w, - workspace.data.ptr, perf_algo.memory) |> check - AMDGPU.mark!(∇w, stream) + workspace, perf_algo.memory) |> check + AMDGPU.unsafe_free!(workspace) ∇w end @@ -215,12 +208,11 @@ function ∇convolution_data!( perf_algo, workspace = find_algorithm( miopenConvBwdDataAlgorithm_t, handle, conv_args, dy, dydesc, w, wdesc, cdesc, ∇x, ∇xdesc) - AMDGPU.wait!((∇x, dy, w)) miopenConvolutionBackwardData( handle, Ref{Float32}(1f0), dydesc.handle, dy, wdesc.handle, w, cdesc.handle, perf_algo.bwd_data_algo, Ref{Float32}(0f0), ∇xdesc.handle, ∇x, - workspace.data.ptr, perf_algo.memory) |> check - AMDGPU.mark!(∇x, stream) + workspace, perf_algo.memory) |> check + AMDGPU.unsafe_free!(workspace) ∇x end diff --git a/src/dnn/pooling.jl b/src/dnn/pooling.jl index eeab61238..85ce46932 100644 --- a/src/dnn/pooling.jl +++ b/src/dnn/pooling.jl @@ -94,22 +94,12 @@ function pool!( x::ROCArray{T, N}, xdesc::TensorDescriptor, pdesc::PoolingDescriptor; alpha = 1f0, beta = 0f0, do_backward::Bool = true, ) where {T <: MIOPENFloat, N} - if do_backward - wsize = get_workspace_size(pdesc, ydesc) - workspace = Workspace(GPUArrays.device(y), wsize) - wptr = workspace.data.ptr - else - wsize = 0 - workspace = nothing - wptr = C_NULL - end - AMDGPU.wait!((x, y)) + wsize = do_backward ? get_workspace_size(pdesc, ydesc) : 0 + workspace = ROCArray{UInt8}(undef, wsize) (; handle, stream) = lib_state() miopenPoolingForward( handle, pdesc.handle, Ref{Float32}(alpha), xdesc.handle, x, - Ref{Float32}(beta), ydesc.handle, y, do_backward, - wptr, wsize) |> check - AMDGPU.mark!(y, stream) + Ref{Float32}(beta), ydesc.handle, y, do_backward, workspace, wsize) |> check y, workspace end @@ -120,13 +110,10 @@ function ∇pool!( x::ROCArray{T, N}, xdesc::TensorDescriptor, pdesc::PoolingDescriptor; alpha = 1f0, beta = 0f0, workspace, ) where {T <: MIOPENFloat, N} - AMDGPU.wait!((dx, dy, y, x)) (; handle, stream) = lib_state() miopenPoolingBackward( handle, pdesc.handle, Ref{Float32}(alpha), ydesc.handle, y, dydesc.handle, dy, xdesc.handle, x, - Ref{Float32}(beta), dxdesc.handle, dx, - (isnothing(workspace) ? C_NULL : workspace.data.ptr)) |> check - AMDGPU.mark!(dx, stream) + Ref{Float32}(beta), dxdesc.handle, dx, workspace) |> check dx end diff --git a/src/dnn/softmax.jl b/src/dnn/softmax.jl index b24f73ecd..eaee271ee 100644 --- a/src/dnn/softmax.jl +++ b/src/dnn/softmax.jl @@ -57,13 +57,12 @@ function _softmax!( _logsoftmax!(y, x; dims) : _softmax!(y, x; dims) end - AMDGPU.wait!((x, y)) - xdesc, ydesc = TensorDescriptor.((reshape(x, sdims), reshape(y, sdims))) + xr, yr = reshape(x, sdims), reshape(y, sdims) + xdesc, ydesc = TensorDescriptor.((xr, yr)) (; handle, stream) = lib_state() miopenSoftmaxForward_V2( - handle, Ref{Float32}(1f0), xdesc.handle, x, Ref{Float32}(0f0), - ydesc.handle, y, algo, MIOPEN_SOFTMAX_MODE_CHANNEL) |> check - AMDGPU.mark!(y, stream) + handle, Ref{Float32}(1f0), xdesc.handle, xr, Ref{Float32}(0f0), + ydesc.handle, yr, algo, MIOPEN_SOFTMAX_MODE_CHANNEL) |> check y end @@ -76,14 +75,12 @@ function _∇softmax!( _∇logsoftmax!(dx, dy, y; dims) : _∇softmax!(dx, dy, y; dims) end - AMDGPU.wait!((dx, dy, y)) ydesc, dydesc, dxdesc = TensorDescriptor.((reshape(y, sdims), reshape(dy, sdims), reshape(dx, sdims))) (; handle, stream) = lib_state() miopenSoftmaxBackward_V2( handle, Ref{Float32}(1f0), ydesc.handle, y, dydesc.handle, dy, Ref{Float32}(0f0), dxdesc.handle, dx, algo, MIOPEN_SOFTMAX_MODE_CHANNEL) |> check - AMDGPU.mark!(dx, stream) dx end diff --git a/src/exception_handler.jl b/src/exception_handler.jl new file mode 100644 index 000000000..7a75d81bd --- /dev/null +++ b/src/exception_handler.jl @@ -0,0 +1,160 @@ +""" +ExceptionHolder + +- `exception_flag::Mem.HostBuffer`: + Pinned host memory. Contains one element of `Int32` type. + If stored value is not 0, then there is an exception that occurred + during kernel execution on the respective device. +- `gate::ROCArray{UInt64}`: + Linear index for x, y & z dimensions at which the exception occurred. + This is used to filter out other threads from duplication exceptions. +- `buffers_counter::ROCArray{Int32}`: + Counts number of printf buffers `errprintf_buffers` currently used. +- `str_buffers_counter::ROCArray{Int32}`: + Error string counter. Counts number of string buffers `string_buffers` + used for exception reporting. +- `errprintf_buffers::Vector{Mem.HostBuffer}`: + Array of buffers used for writing exceptions. + These buffers are used in the same way as device-printf buffers, + except they are pre-allocated. +- `string_buffers::Vector{Mem.HostBuffer}`: + Array of string buffers. These buffers are used every time + we need to report the name of the exception, file, or line. +""" +struct ExceptionHolder + exception_flag::Mem.HostBuffer # Main buffer where printf context is written. + gate::ROCArray{UInt64} + buffers_counter::ROCArray{Int32} + str_buffers_counter::ROCArray{Int32} + + errprintf_buffers::Vector{Mem.HostBuffer} # Buffers used by `@errprintf`. + string_buffers::Vector{Mem.HostBuffer} # Buffers used for storing device strings on the host. + + errprintf_buffers_dev::ROCArray{Ptr{Cvoid}} # Pointers of `errprintf_buffers` on the device. + string_buffers_dev::ROCArray{Ptr{Cvoid}} # Pointers of `string_buffers` on the device. + + function ExceptionHolder() + buf_len = 2^11 # 2 KiB + str_len = 2^11 # 2 KiB + n_buffers = 50 + n_str_buffers = 100 + + exception_flag = Mem.HostBuffer(sizeof(Int32), HIP.hipHostAllocMapped) + gate = ROCArray(UInt64[0]) + buffers_counter = ROCArray(Int32[0]) + str_buffers_counter = ROCArray(Int32[0]) + + errprintf_buffers = [ + Mem.HostBuffer(buf_len, HIP.hipHostAllocMapped) + for _ in 1:n_buffers] + str_buffers = [ + Mem.HostBuffer(str_len, HIP.hipHostAllocMapped) + for _ in 1:n_str_buffers] + + errprintf_buffers_dev = ROCArray(Mem.device_ptr.(errprintf_buffers)) + str_buffers_dev = ROCArray(Mem.device_ptr.(str_buffers)) + + new( + exception_flag, gate, buffers_counter, str_buffers_counter, + errprintf_buffers, str_buffers, + errprintf_buffers_dev, str_buffers_dev) + end +end + +# hash(dev::HIPDevice) => ExceptionHolder +const GLOBAL_EXCEPTION_HOLDER = Dict{UInt, ExceptionHolder}() + +function exception_holder(dev::HIPDevice) + # TODO lock using RT_LOCK + get!(() -> ExceptionHolder(), GLOBAL_EXCEPTION_HOLDER, hash(dev)) +end + +function has_exception(dev::HIPDevice)::Bool + ex = exception_holder(dev) + ptr = Base.unsafe_convert(Ptr{Int}, ex.exception_flag) + unsafe_load(ptr) != 0 +end + +function reset_exception_holder!(dev::HIPDevice) + ex = exception_holder(dev) + ptr = Base.unsafe_convert(Ptr{Int}, ex.exception_flag) + unsafe_store!(ptr, 0) + + fill!(ex.buffers_counter, 0) + fill!(ex.str_buffers_counter, 0) + return +end + +function get_exception_string(dev::HIPDevice)::String + ex = exception_holder(dev) + + # Use async copy and HIP.synchronize() to avoid triggering + # error exception checking path and stack-overflowing. + n_used_buffers = eltype(ex.buffers_counter)[0] + Base.copyto!(n_used_buffers, 1, ex.buffers_counter, 1, 1; async=true) + HIP.synchronize(AMDGPU.stream()) + + n_strings = min(n_used_buffers[1], length(ex.errprintf_buffers)) + + exception_str = "" + for i in 1:n_strings + ptr = reinterpret( + LLVMPtr{Device.ROCPrintfBuffer, AS.Global}, + ex.errprintf_buffers[i].ptr) + fmt, all_args = unsafe_load(ptr) + + if isempty(all_args) + exception_str = "$(exception_str)$(fmt)\n" + else + args = map(x -> x isa Cstring ? unsafe_string(x) : x, first(all_args)) + str = Printf.format(Printf.Format(fmt), args...) + exception_str = "$(exception_str)$(str)" + end + end + return exception_str +end + +function throw_if_exception(dev::HIPDevice) + has_exception(dev) || return + + exception_str = get_exception_string(dev) + exception_str = isempty(exception_str) ? "" : "\n$exception_str" + error("GPU Kernel Exception$exception_str") +end + +function KernelState(dev::HIPDevice, global_hostcalls::Vector{Symbol}) + malloc_ptr = :malloc_hostcall in global_hostcalls ? + Compiler.create_malloc_hostcall!() : + C_NULL + free_ptr = :free_hostcall in global_hostcalls ? + Compiler.create_free_hostcall!() : + C_NULL + print_ptr = :print_hostcall in global_hostcalls ? + Compiler.create_output_context!() : + C_NULL + printf_ptr = :printf_hostcall in global_hostcalls ? + Compiler.create_printf_output_context!() : + C_NULL + + ex = exception_holder(dev) + KernelState( + # Exception reporting buffers. + Mem.device_ptr(ex.exception_flag), + pointer(ex.gate), + pointer(ex.buffers_counter), + pointer(ex.str_buffers_counter), + + pointer(ex.errprintf_buffers_dev), + pointer(ex.string_buffers_dev), + Int32(length(ex.errprintf_buffers_dev)), + Int32(length(ex.string_buffers_dev)), + + # Malloc/free hostcall pointer. + malloc_ptr, + free_ptr, + + # Print hostcalls. + print_ptr, + printf_ptr, + ) +end diff --git a/src/fft/fft.jl b/src/fft/fft.jl index 989d70d2d..747853d6d 100644 --- a/src/fft/fft.jl +++ b/src/fft/fft.jl @@ -347,35 +347,24 @@ function assert_applicable(p::ROCFFTPlan{T,K}, X::ROCArray{T}, Y::ROCArray{Ty}) end function unsafe_execute!(plan::cROCFFTPlan{T,K,true,N}, X::ROCArray{T,N}) where {T,K,N} - wait!(X) rocfft_execute(plan, [pointer(X),], C_NULL, plan.execution_info) - mark!(X, C_NULL) end function unsafe_execute!(plan::cROCFFTPlan{T,K,false,N}, X::ROCArray{T,N}, Y::ROCArray{T}) where {T,N,K} Xcopy = copy(X) # since input array can also be modified - wait!(Y) rocfft_execute(plan, [pointer(Xcopy),], [pointer(Y),], plan.execution_info) - mark!(Xcopy, C_NULL) - mark!(Y, C_NULL) end function unsafe_execute!(plan::rROCFFTPlan{T,ROCFFT_FORWARD,false,N}, X::ROCArray{T,N}, Y::ROCArray{<:rocfftComplexes,N}) where {T<:rocfftReals,N} @assert plan.xtype == rocfft_transform_type_real_forward Xcopy = copy(X) - wait!(Y) rocfft_execute(plan, [pointer(Xcopy),], [pointer(Y),], plan.execution_info) - mark!(Xcopy, C_NULL) - mark!(Y, C_NULL) end function unsafe_execute!(plan::rROCFFTPlan{T,ROCFFT_INVERSE,false,N}, X::ROCArray{T,N}, Y::ROCArray{<:rocfftReals,N}) where {T<:rocfftComplexes,N} @assert plan.xtype == rocfft_transform_type_real_inverse Xcopy = copy(X) - wait!(Y) rocfft_execute(plan, [pointer(Xcopy),], [pointer(Y),], plan.execution_info) - mark!(Xcopy, C_NULL) - mark!(Y, C_NULL) end diff --git a/src/fft/rocFFT.jl b/src/fft/rocFFT.jl index 9b05747ab..3de57145b 100644 --- a/src/fft/rocFFT.jl +++ b/src/fft/rocFFT.jl @@ -1,7 +1,7 @@ module rocFFT import ..AMDGPU -import .AMDGPU: librocfft, mark!, wait! +import .AMDGPU: librocfft import ..HIP: hipStream_t using CEnum @@ -14,6 +14,8 @@ include("librocfft.jl") include("util.jl") include("fft.jl") +# TODO use TLS library state + if AMDGPU.functional(:rocfft) const INITIALIZED = Threads.Atomic{Int64}(0) @eval function rocfft_setup_once() diff --git a/src/highlevel.jl b/src/highlevel.jl index 3fe07f56b..4a3511316 100644 --- a/src/highlevel.jl +++ b/src/highlevel.jl @@ -1,17 +1,15 @@ # High-level APIs import AMDGPU: Runtime, Compiler -import .Runtime: ROCDevice, ROCQueue, ROCExecutable, ROCKernel, ROCSignal, ROCKernelSignal, HSAError import .Runtime: ROCDim, ROCDim3 -import .Runtime: wait!, mark! -import .Compiler: rocfunction +import .Compiler: hipfunction -export @roc, rocconvert, rocfunction +export @roc, rocconvert ## Devices """ - default_device()::ROCDevice + default_device()::HIPDevice Default device which will be used by default in tasks. Meaning when a task is created, it selects this device as default. @@ -21,7 +19,7 @@ All subsequent uses rely on [`device()`](@ref) for device selection. default_device() = Runtime.get_default_device() """ - default_device!(device::ROCDevice) + default_device!(device::HIPDevice) Set default device that will be used when creating new tasks. @@ -29,10 +27,29 @@ Set default device that will be used when creating new tasks. This does not change current device being used. Refer to [`device!`](@ref) for that. """ -default_device!(device::ROCDevice) = Runtime.set_default_device!(device) +default_device!(device::HIPDevice) = Runtime.set_default_device!(device) """ - device()::ROCDevice + default_device_id() -> Int + +Returns the numeric ID of the current default device, +which is in the range of `1:length(AMDGPU.devices())`. +This number should be stable for all processes on the same node, +The [`default_device_id!`](@ref) function accepts the same +numeric ID that is produced by this function. +""" +default_device_id() = default_device().device_id + +""" + default_device_id!(idx::Integer, kind::Symbol=:gpu) + +Sets the default device to `AMDGPU.devices(kind)[idx]`. See +[`default_device_id`](@ref) for details on the numbering semantics. +""" +default_device_id!(idx::Integer) = default_device!(devices()[idx]) + +""" + device()::HIPDevice Get currently active device. This device is used when launching kernels via `@roc`. @@ -40,7 +57,7 @@ This device is used when launching kernels via `@roc`. device() = task_local_state().device """ - device!(device::ROCDevice) + device!(device::HIPDevice) Switch current device being used. This switches only for a task inside which it is called. @@ -49,51 +66,26 @@ This switches only for a task inside which it is called. To select default device that will be used when creating new tasks, refer to [`default_device!`](@ref) for that. """ -function device!(device::ROCDevice) +function device!(device::HIPDevice) task_local_state!(; device) return device end -device!(f::Base.Callable, device::ROCDevice) = task_local_state!(f; device) - -""" - devices(kind::Symbol = :gpu) - -Get list of all devices of the given `kind`. -`kind` can be `:cpu`, `:gpu` or `:dsp`, although AMDGPU.jl supports -execution only on `:gpu` devices. -""" -devices(kind::Symbol = :gpu) = - filter!(d -> device_type(d) == kind, copy(Runtime.ALL_DEVICES)) +device!(f::Base.Callable, device::HIPDevice) = task_local_state!(f; device) """ - default_device_id(kind::Symbol=:gpu) -> Int + devices() -Returns the numeric ID of the current default device, which is in the range of -`1:length(AMDGPU.devices(kind))`. This number should be stable for all -processes on the same node, so long as any device filtering is consistently -applied (such as `ROCR_VISIBLE_DEVICES`). The [`default_device_id!`](@ref) -function accepts the same numeric ID that is produced by this function. +Get list of all devices. """ -default_device_id(kind::Symbol=:gpu) = - something(findfirst(a->a==default_device(), devices(kind))) +devices() = Runtime.fetch_devices() """ - default_device_id!(idx::Integer, kind::Symbol=:gpu) - -Sets the default device to `AMDGPU.devices(kind)[idx]`. See -[`default_device_id`](@ref) for details on the numbering semantics. -""" -default_device_id!(idx::Integer, kind::Symbol=:gpu) = - default_device!(devices(kind)[idx]) - -""" - device_id(device::ROCDevice, kind::Symbol=:gpu) -> Int + device_id(device::HIPDevice) -> Int Returns the numerical device ID for `device`. See [`default_device_id`](@ref) for details on the numbering semantics. """ -device_id(device::ROCDevice, kind::Symbol=:gpu) = - something(findfirst(dev->dev === device, devices(kind))) +device_id(device::HIPDevice) = device.device_id """ device_id!(idx::Integer, kind::Symbol=:gpu) @@ -101,30 +93,7 @@ device_id(device::ROCDevice, kind::Symbol=:gpu) = Sets the current device to `AMDGPU.devices(kind)[idx]`. See [`device_id`](@ref) for details on the numbering semantics. """ -device_id!(idx::Integer, kind::Symbol=:gpu) = - device!(devices(kind)[idx]) - -""" - device_type(device::ROCDevice) -> Symbol - -Return the kind of `device` as a `Symbol`. CPU devices return `:cpu`, GPU -devices return `:gpu`, DSP devices return `:dsp`, and all others return -`:unknown`. -""" -function device_type(device::ROCDevice) - devtype = Runtime.device_type(device) - if devtype == HSA.DEVICE_TYPE_CPU - return :cpu - elseif devtype == HSA.DEVICE_TYPE_GPU - return :gpu - elseif devtype[] == HSA.DEVICE_TYPE_DSP - return :dsp - else - return :unknown - end -end - -wavefrontsize(device::ROCDevice) = Runtime.device_wavefront_size(device) +device_id!(idx::Integer) = device!(devices()[idx]) # Contexts @@ -135,40 +104,9 @@ function device(context::HIPContext) end end -device_id(device::HIPDevice) = device.device_id -HIPDevice(device::ROCDevice) = HIPDevice(device_id(device)) -HIPContext(device::ROCDevice) = HIPContext(HIPDevice(device)) - -# Queues/Streams - -""" - queue()::ROCQueue - -Get task-local default queue for the currently active device. -""" -queue() = task_local_state().queue::ROCQueue -@deprecate default_queue() queue() -function queue(device::ROCDevice) - tls = task_local_state() - q = tls.queues[device_id(device)] - isnothing(q) || return q - - tls.queues[device_id(device)] = ROCQueue(device) - return q -end -""" - queue!(f::Base.Callable, queue::ROCQueue) - -Change default queue, execute given function `f` -and revert back to the original queue. - -# Returns - -Return value of the function `f`. -""" -queue!(f::Base.Callable, queue::ROCQueue) = task_local_state!(f; queue) -device(queue::ROCQueue) = queue.device +# Streams. +default_stream() = HIP.default_stream() stream() = task_local_state().stream::HIPStream function stream!(stream::HIPStream) task_local_state!(;stream) @@ -182,7 +120,7 @@ priority() = task_local_state().priority """ priority!(priority::Symbol) -Change the priority of the default queue. +Change the priority of the default stream. Accepted values are `:normal` (the default), `:low` and `:high`. """ function priority!(priority::Symbol) @@ -193,7 +131,7 @@ end """ priority!(f::Base.Callable, priority::Symbol) -Chnage the priority of default queue, execute `f` and +Chnage the priority of default stream, execute `f` and revert to the original priority. Accepted values are `:normal` (the default), `:low` and `:high`. @@ -205,149 +143,36 @@ priority!(f::Base.Callable, priority::Symbol) = task_local_state!(f; priority) # Device ISAs -default_isa(device::ROCDevice) = Runtime.default_isa(device) -default_isa_architecture(device::ROCDevice) = Runtime.architecture(default_isa(device)) -default_isa_features(device::ROCDevice) = Runtime.features(default_isa(device)) - -## Executable creation - -function create_executable(device, entry, obj; globals=()) - # link with ld.lld - @assert lld_path != "" "ld.lld was not found; cannot link kernel" - path_exe = mktemp() do path_o, io_o - write(io_o, obj) - flush(io_o) - path_exe = path_o*".exe" - if lld_artifact - LLD_jll.lld() do lld - run(`$lld -flavor gnu -shared -o $path_exe $path_o`) - end - else - run(`$lld_path -shared -o $path_exe $path_o`) - end - path_exe - end - data = read(path_exe) - rm(path_exe) - - return ROCExecutable(device, data, entry; globals=globals) -end - -function get_kernel_queue(; - event_queue::Union{ROCQueue, Nothing}, device::Union{ROCDevice, Nothing}, -) - if !isnothing(event_queue) && !isnothing(device) - if event_queue.device != device - error( - "Specified both `device` and `queue`, " * - "but `queue` is on a different device than `device`.\n" * - "In this case, only one argument can be specified.") - else - return event_queue - end - end - isnothing(event_queue) && isnothing(device) && return queue() - isnothing(event_queue) && return queue(device) - event_queue -end - -## Event creation -function create_event(kernel::ROCKernel; - signal::Union{ROCKernelSignal, ROCSignal} = ROCSignal(), - device::Union{ROCDevice, Nothing} = nothing, - queue::Union{ROCQueue, Nothing} = nothing, - kwargs..., -) - if signal isa ROCKernelSignal - return signal - end - kernel_queue = get_kernel_queue(; event_queue=queue, device) - return ROCKernelSignal(signal, kernel_queue, kernel; kwargs...) -end - -## Kernel creation - -""" - create_kernel(kernel::HostKernel, f, args::Tuple; kwargs...) - -Constructs a `ROCKernel` object from a compiled kernel described by `kernel`. -`f` is the function being called, and `args` is the `Tuple` of arguments that -`f` is called with. - -See [`@roc`](@ref) for the list of available keyword arguments. -""" -create_kernel(kernel::Runtime.HostKernel; kwargs...) = - ROCKernel(kernel; kwargs...) - -## Kernel launch and barriers - -barrier_and!(signals::Vector) = barrier_and!(queue(), signals) -barrier_or!(signals::Vector) = barrier_or!(queue(), signals) -barrier_and!(queue::ROCQueue, signals::Vector{ROCKernelSignal}) = - barrier_and!(queue, map(x->x.signal,signals)) -barrier_or!(queue::ROCQueue, signals::Vector{ROCKernelSignal}) = - barrier_or!(queue, map(x->x.signal,signals)) -barrier_and!(queue::ROCQueue, signals::Vector{HSA.Signal}) = barrier_and!(queue, map(ROCSignal, signals)) -barrier_or!(queue::ROCQueue, signals::Vector{HSA.Signal}) = barrier_or!(queue, map(ROCSignal, signals)) -barrier_and!(queue::ROCQueue, signals::Vector{ROCSignal}) = - Runtime.launch_barrier!(HSA.BarrierAndPacket, queue, signals) -barrier_or!(queue::ROCQueue, signals::Vector{ROCSignal}) = - Runtime.launch_barrier!(HSA.BarrierOrPacket, queue, signals) - -""" - active_kernels(queue::ROCQueue = queue()) -> Vector{ROCKernelSignal} - -Returns the set of actively-executing kernels on `queue`. -""" -function active_kernels(queue::ROCQueue = queue()) - isempty(queue.active_kernels) && return NO_ACTIVE_KERNELS - return Array(queue.active_kernels) -end -const NO_ACTIVE_KERNELS = ROCKernelSignal[] - -""" - synchronize(; errors::Bool=true) - -Blocks until all kernels currently executing on the default queue and stream -have completed. See [`synchronize(::ROCQueue)`](@ref) for details on `errors`. -""" -function synchronize(; errors::Bool=true) - synchronize(queue(); errors) - synchronize(stream()) -end -""" - synchronize(queue::ROCQueue; errors::Bool=true) +default_isa(device::HIPDevice) = Runtime.default_isa(Runtime.hsa_device(device)) -Blocks until all kernels currently executing on `queue` have completed. If -`errors` is `true`, then any kernels currently on the queue which throw an -error will be re-thrown; only the first encountered error will be thrown. If -`false`, errors will not be thrown. -""" -function synchronize(queue::ROCQueue; errors::Bool=true) - isempty(queue.active_kernels) && return - - if errors - kerns = copy(queue.active_kernels) - while length(kerns) > 0 - sig = first(kerns) - wait(sig; check_exceptions=true, cleanup=false) - Runtime.next!(kerns) - end - else - sig = Runtime.maybelast(queue.active_kernels) - if sig !== nothing - wait(sig; check_exceptions=false, cleanup=false) - end - end - return -end """ - synchronize(stream::HIPStream) + synchronize(stream::HIPStream = stream()) Blocks until all kernels currently executing on `stream` have completed. """ -function synchronize(stream::HIPStream) - HIP.hipStreamSynchronize(stream.stream) |> check +# TODO +# allow non blocking sync of several HIPStreams +# and only then disable global hostcall +function synchronize(stm::HIPStream = stream(); blocking::Bool = true) + throw_if_exception(stm.device) + HIP.synchronize(stm; blocking) + throw_if_exception(stm.device) + + blocking && return + + # Stop any running global hostcall. + global_hostcall_names = ( + :malloc_hostcall, :free_hostcall, :print_hostcall, :printf_hostcall) + for gbl in global_hostcall_names + hc = AMDGPU.Device.get_named_perdevice_hostcall(stm.device, gbl) + isnothing(hc) && continue + hc[1].finish[] && continue + + # Signal HostCall to exit. + AMDGPU.Device.finish!(hc[1]) + # Remove it from global hostcalls, so that new one is created. + AMDGPU.Device.remove_perdevice_hostcall!(stm.device, gbl) + end return end @@ -365,179 +190,10 @@ register methods for the the `AMDGPU.Adaptor` type. """ rocconvert(arg) = adapt(Runtime.Adaptor(), arg) -### @roc helper functions - -# split keyword arguments to `@roc` into ones affecting the macro itself, the compiler -# and the code it generates, or the execution -function split_kwargs(kwargs) - alias_kws = Dict(:stream=>:queue) - macro_kws = [:dynamic, :launch, :wait, :mark] - compiler_kws = [:name, :global_hooks] - call_kws = [:gridsize, :groupsize, :config] - signal_kws = [:queue, :signal, :soft, :minlat, :timeout] - kernel_kws = [:localmem] - computed_kws = [:threads, :blocks] - - device_kwargs = [] - macro_kwargs = [] - compiler_kwargs = [] - call_kwargs = [] - signal_kwargs = [] - kernel_kwargs = [] - - for kwarg in kwargs - if !Meta.isexpr(kwarg, :(=)) - throw(ArgumentError("non-keyword argument like option '$kwarg'")) - end - - key, val = kwarg.args - oldkey = key - if key in keys(alias_kws) - key = alias_kws[key] - kwarg = :($key=$val) - end - - if !isa(key, Symbol) - throw(ArgumentError("non-symbolic keyword '$oldkey'")) - end - - if key == :device - push!(device_kwargs, kwarg) - elseif key in macro_kws - push!(macro_kwargs, kwarg) - elseif key in compiler_kws - push!(compiler_kwargs, kwarg) - elseif key in call_kws - push!(call_kwargs, kwarg) - elseif key in signal_kws - push!(signal_kwargs, kwarg) - elseif key in kernel_kws - push!(kernel_kwargs, kwarg) - elseif key in computed_kws - push!(call_kwargs, kwarg) - else - throw(ArgumentError("unknown keyword argument '$oldkey'")) - end - end - - return device_kwargs, macro_kwargs, compiler_kwargs, call_kwargs, signal_kwargs, kernel_kwargs -end -function simplify_call_kwargs!(call_kwargs) - call_kwargs_keys = map(x->x.args[1], call_kwargs) - has_threads = :threads in call_kwargs_keys - has_blocks = :blocks in call_kwargs_keys - has_threads || has_blocks || return - if :groupsize in call_kwargs_keys - throw(ArgumentError("cannot combine :threads/:blocks with :groupsize")) - elseif :gridsize in call_kwargs_keys - throw(ArgumentError("cannot combine :threads/:blocks with :gridsize")) - end - if has_threads - threads_idx = findfirst(x->x.args[1]==:threads, call_kwargs) - groupsize = call_kwargs[threads_idx].args[2] - deleteat!(call_kwargs, threads_idx) - else - groupsize = 1 - end - if has_blocks - blocks_idx = findfirst(x->x.args[1]==:blocks, call_kwargs) - blocks = call_kwargs[blocks_idx].args[2] - deleteat!(call_kwargs, blocks_idx) - else - blocks = 1 - end - push!(call_kwargs, :(groupsize=$groupsize)) - push!(call_kwargs, :(gridsize=$groupsize .* $blocks)) -end - -# assign arguments to variables, handle splatting -function assign_args!(code, args) - # handle splatting - splats = map(arg -> Meta.isexpr(arg, :(...)), args) - args = map(args, splats) do arg, splat - splat ? arg.args[1] : arg - end - - # assign arguments to variables - vars = Tuple(gensym() for arg in args) - map(vars, args) do var,arg - push!(code.args, :($var = $arg)) - end - - # convert the arguments, compile the function and call the kernel - # while keeping the original arguments alive - var_exprs = map(vars, args, splats) do var, arg, splat - splat ? Expr(:(...), var) : var - end - - return vars, var_exprs -end - -### @roc macro - -""" - @roc [kwargs...] func(args...) - -High-level interface for executing code on a GPU. The `@roc` macro should -prefix a call, with `func` a callable function or object that should return -nothing. It will be compiled to a GCN function via `rocfunction` upon first -use, and to a certain extent arguments will be converted and managed -automatically using `rocconvert`. Finally, a call to `roccall` is performed, -scheduling a kernel launch on the specified (or default) HSA queue. - -Several keyword arguments are supported that influence the behavior of `@roc`. - -Keyword arguments that control general `@roc` behavior: -- `dynamic::Bool = false`: Use dynamic parallelism to launch as a device-side kernel -- `launch::Bool = true`: Whether to launch the kernel -- `wait::Bool = true`: Whether to wait on all arguments' dependencies -- `mark::Bool = true`: Whether to mark this kernel as a dependency for all arguments - -Keyword arguments that affect various parts of `@roc`: -- `device::ROCDevice = AMDGPU.default_device()`: The device to compile code for, and launch the kernel on. -- `queue::ROCQueue = AMDGPU.queue(device)`: Which queue to associate the kernel (and its completion signal) with. May also be specified as `stream` for compatibility with CUDA.jl. - -Keyword arguments that control kernel compilation via [`rocfunction`](@ref) and [`dynamic_rocfunction`](@ref): -- `name::Union{String,Nothing} = nothing`: If not `nothing`, the name to use for the generated kernel. -- `global_hooks::NamedTuple = (;)`: The set of global compiler hooks to use to initialize memory accessed by the kernel. See `AMDGPU.Compiler.default_global_hooks` for an example of how to implement these. - -Keyword arguments that control signal creation via [`AMDGPU.create_event`](@ref): -- `signal::ROCSignal = ROCSignal()`: The underlying signal object to associate the high-level `ROCKernelSignal` with. -- `soft::Bool = true`: Whether to use the "soft" busy-poll waiter algorithm. If `false`, uses HSA's built-in blocking wait. -- `minlat::Float64 = 0.000001`: The minimum latency allowed on the first wait cycle. Specifically, if the kernel completes in less than this amount of time, then the observed latency from kernel launch to return from `wait` is this value, in seconds. -- `timeout::Union{Float64, Nothing} = nothing`: How long to wait for the signal to complete before throwing an `AMDGPU.Runtime.SignalTimeoutException`, in seconds. If `nothing`, then timeouts are disabled and the `wait` call may hang forever if the kernel never completes. - -Keyword arguments that control kernel creation via [`AMDGPU.create_kernel`](@ref): -- `localmem::Int = 0`: The amount of dynamic local memory to allocate for the kernel. This value is separate from the amount of static local memory required by the kernel (as reported by the compiler). - -Keyword arguments that control kernel launch via [`AMDGPU.HostKernel`](@ref) and [`AMDGPU.DeviceKernel`](@ref): -- `groupsize::Union{Tuple,Integer} = 1`: The size of the groups to execute over the grid. If an `Integer` or `Tuple{<:Integer}`, only activate the X dimension of the group. If `Tuple{<:Integer,<:Integer}`, activate the X and Y dimensions of the group. If `Tuple{<:Integer,<:Integer,<:Integer}`, activate the X, Y, and Z dimensions of the group. All sizes must be greater than 0. -- `gridsize::Union{Tuple,Integer} = 1`: The size of the grid to execute the kernel over. If an `Integer` or `Tuple{<:Integer}`, only activate the X dimension of the grid. If `Tuple{<:Integer,<:Integer}`, activate the X and Y dimensions of the grid. If `Tuple{<:Integer,<:Integer,<:Integer}`, activate the X, Y, and Z dimensions of the grid. All sizes must be greater than 0. -- `threads::Union{Tuple,Integer}` - Alias for `groupsize`, for compatibility with CUDA.jl. -- `blocks::Union{Tuple,Integer}` - How many groups to execute across the grid. Potentially a more convenient way to specify groupsize, and intended for compatibility with CUDA.jl. - -The underlying operations (argument conversion, kernel compilation, kernel call) can be -performed explicitly when more control is needed, e.g. to reflect on the resource usage of a -kernel to determine the launch configuration. A host-side kernel launch is done as follows: - - args = ... - GC.@preserve args begin - kernel_f = rocconvert(f) - kernel_args = rocconvert.(args) - kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - kernel = rocfunction(kernel_f, kernel_tt; compilation_kwargs) - kernel(kernel_args...; launch_kwargs) - end - -A device-side launch, aka. dynamic parallelism, is similar but more restricted: +const MACRO_KWARGS = [:dynamic, :launch] +const COMPILER_KWARGS = [:name] +const LAUNCH_KWARGS = [:gridsize, :groupsize, :shmem, :stream] - args = ... - # GC.@preserve is not supported - # we're on the device already, so no need to rocconvert - kernel_tt = Tuple{Core.Typeof(args[1]), ...} # this needs to be fully inferred! - kernel = dynamic_rocfunction(f, kernel_tt) # no compiler kwargs supported - kernel(args...; launch_kwargs) -""" macro roc(ex...) # destructure the `@roc` expression call = ex[end] @@ -549,29 +205,28 @@ macro roc(ex...) args = call.args[2:end] code = quote end - device_kwargs, macro_kwargs, compiler_kwargs, call_kwargs, signal_kwargs, kernel_kwargs = split_kwargs(kwargs) - simplify_call_kwargs!(call_kwargs) vars, var_exprs = assign_args!(code, args) - # handle keyword arguments that influence the macro's behavior - dynamic = false + macro_kwargs, compiler_kwargs, launch_kwargs, other_kwargs = + split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS) + if !isempty(other_kwargs) + key, val = first(other_kwargs).args + throw(ArgumentError("Unsupported keyword argument: `$key`.")) + end + + dynamic = false # TODO unsupported for now launch = true - wait = true - mark = true for kwarg in macro_kwargs - key,val = kwarg.args + key, val = kwarg.args if key == :dynamic - isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @roc should be a constant Bool")) + isa(val, Bool) || throw(ArgumentError( + "`dynamic` keyword argument to @roc should be a constant Bool")) dynamic = val::Bool + @assert false "`dynamic` kernel launch is not yet implemented" elseif key == :launch - isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @roc should be a constant Bool")) + isa(val, Bool) || throw(ArgumentError( + "`launch` keyword argument to @roc should be a constant Bool")) launch = val::Bool - elseif key == :wait - isa(val, Bool) || throw(ArgumentError("`wait` keyword argument to @roc should be a constant Bool")) - wait = val::Bool - elseif key == :mark - isa(val, Bool) || throw(ArgumentError("`mark` keyword argument to @roc should be a constant Bool")) - mark = val::Bool else throw(ArgumentError("Unsupported keyword argument '$key'")) end @@ -579,62 +234,30 @@ macro roc(ex...) # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5) # we esc() the whole thing now, necessitating gensyms... - @gensym kernel_f kernel_args kernel_tt kernel kernel_instance device queue signal - if dynamic - # FIXME: we could probably somehow support kwargs with constant values by either - # saving them in a global Dict here, or trying to pick them up from the Julia - # IR when processing the dynamic parallelism marker - isempty(compiler_kwargs) || error("@roc dynamic parallelism does not support compiler keyword arguments") - - # dynamic, device-side kernel launch - push!(code.args, - quote - # we're in kernel land already, so no need to rocconvert arguments - local $kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)} - local $kernel = $dynamic_rocfunction($f, $kernel_tt) - $kernel($(var_exprs...); $(call_kwargs...)) - end) - else - # regular, host-side kernel launch - # - # convert the function, its arguments, call the compiler and launch the kernel - # while keeping the original arguments alive - push!(code.args, - quote - GC.@preserve $(vars...) begin - local $kernel_f = $rocconvert($f) - local $kernel_args = map($rocconvert, ($(var_exprs...),)) - local $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...} - local $kernel = $rocfunction( - $kernel_f, $kernel_tt; - $(device_kwargs...), $(compiler_kwargs...)) - - if $launch - if $wait - foreach($wait!, ($(var_exprs...),)) - end - local $kernel_instance = $create_kernel($kernel; $(kernel_kwargs...)) - local $signal = $create_event( - $kernel_instance; $(device_kwargs...), $(signal_kwargs...)) - $kernel($kernel_args...; signal=$signal, $(call_kwargs...)) - if $mark - foreach(x->$mark!(x, $signal), ($(var_exprs...),)) - end - $signal - else - $kernel - end + @gensym kernel_f kernel_args kernel_tt kernel + push!(code.args, + quote + GC.@preserve $(vars...) begin + local $kernel_f = $rocconvert($f) + local $kernel_args = map($rocconvert, ($(var_exprs...),)) + local $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...} + local $kernel = $hipfunction($kernel_f, $kernel_tt; $(compiler_kwargs...)) + + if $launch + $kernel($(var_exprs...); $(launch_kwargs...)) end - end) + $kernel + end + end) + return esc(quote + let + $code end - return esc(code) + end) end -# launch config - -launch_configuration(kern::Runtime.HostKernel; kwargs...) = - launch_configuration(kern.fun) -function launch_configuration(fun::Runtime.ROCFunction; input_block_size=1, localmem=0) - occ = Compiler.calculate_occupancy(fun; input_block_size, localmem) - return (;groupsize=occ.best_block_size) +function launch_configuration( + kern::Runtime.HIPKernel; shmem::Integer = 0, max_block_size::Integer = 0, +) + HIP.launch_configuration(kern.fun; shmem, max_block_size) end diff --git a/src/hip/HIP.jl b/src/hip/HIP.jl index e80646571..978a77064 100644 --- a/src/hip/HIP.jl +++ b/src/hip/HIP.jl @@ -1,50 +1,14 @@ module HIP +using CEnum + import ..AMDGPU import ..AMDGPU.libhip -using CEnum include("libhip_common.jl") include("error.jl") include("libhip.jl") - -struct HIPDevice - device::hipDevice_t - device_id::Cint -end -function HIPDevice(device_id::Integer) - device_ref = Ref{hipDevice_t}() - hipDeviceGet(device_ref, Cint(device_id-1)) |> check - return HIPDevice(device_ref[], device_id) -end -Base.unsafe_convert(::Type{Ptr{T}}, device::HIPDevice) where T = - reinterpret(Ptr{T}, device.device) -function name(device::HIPDevice) - name_vec = zeros(Cuchar, 64) - hipDeviceGetName(pointer(name_vec), Cint(64), device.device) |> check - return String(name_vec) -end -function Base.show(io::IO, device::HIPDevice) - print(io, "HIPDevice(name=\"$(name(device))\", id=$(device.device_id))") -end - -function device() - device_id_ref = Ref{Cint}() - hipGetDevice(device_id_ref) |> check - return HIPDevice(device_id_ref[]+1) -end -device!(device::HIPDevice) = hipSetDevice(device.device_id-Int32(1)) |> check -device!(device_id::Integer) = hipSetDevice(Cint(device_id-1)) |> check -function device!(f::Base.Callable, device::HIPDevice) - old_device_id_ref = Ref{Cint}() - hipGetDevice(old_device_id_ref) |> check - device!(device) - try - f() - finally - device!(old_device_id_ref[]+1) - end -end +include("device.jl") mutable struct HIPContext context::hipContext_t @@ -84,73 +48,19 @@ function context!(f::Base.Callable, context::HIPContext) end end -mutable struct HIPStream - stream::hipStream_t - priority::Symbol - device::HIPDevice -end - -""" - HIPStream(priority::Symbol = :normal) - -# Arguments: - -- `priority::Symbol`: Priority of the stream: `:normal`, `:high` or `:low`. - -Create HIPStream with given priority. -Device is the default device that's currently in use. -""" -function HIPStream(priority::Symbol = :normal) - priority_int = symbol_to_priority(priority) - - stream_ref = Ref{hipStream_t}() - hipStreamCreateWithPriority(stream_ref, Cuint(0), priority_int) |> check - stream = HIPStream(stream_ref[], priority, device()) - finalizer(stream) do s - hipStreamSynchronize(s.stream) |> check - hipStreamDestroy(s.stream) |> check - end - return stream -end - -""" - HIPStream(stream::hipStream_t) - -Create HIPStream from `hipStream_t` handle. -Device is the default device that's currently in use. -""" -HIPStream(stream::hipStream_t) = HIPStream(stream, priority(stream), device()) +include("stream.jl") +include("event.jl") +include("pool.jl") -Base.unsafe_convert(::Type{Ptr{T}}, stream::HIPStream) where T = - reinterpret(Ptr{T}, stream.stream) -function Base.show(io::IO, stream::HIPStream) - print(io, "HIPStream(device=$(stream.device), ptr=$(repr(UInt64(stream.stream))), priority=$(stream.priority))") -end - -function priority_to_symbol(priority) - priority == 0 && return :normal - priority == -1 && return :high - priority == 1 && return :low - throw(ArgumentError(""" - Invalid HIP priority: $priority. - Valid values are: 0, -1, 1. - """)) -end +include("module.jl") -function symbol_to_priority(priority::Symbol) - priority == :normal && return Cint(0) - priority == :high && return Cint(-1) - priority == :low && return Cint(1) - throw(ArgumentError(""" - Invalid HIP priority symbol: $priority. - Valid values are: `:normal`, `:low`, `:high`. - """)) +function device_synchronize() + hipDeviceSynchronize() |> check end -function priority(stream::hipStream_t) - priority = Ref{Cint}() - hipStreamGetPriority(stream, priority) |> check - priority_to_symbol(priority[]) +function reclaim(bytes_to_keep::Integer = 0) + device_synchronize() + trim(memory_pool(device()), bytes_to_keep) end end diff --git a/src/hip/device.jl b/src/hip/device.jl new file mode 100644 index 000000000..094dbc21d --- /dev/null +++ b/src/hip/device.jl @@ -0,0 +1,99 @@ +struct HIPDevice + device::hipDevice_t + device_id::Cint +end + +function HIPDevice(device_id::Integer) + device_ref = Ref{hipDevice_t}() + hipDeviceGet(device_ref, device_id - 1) |> check + return HIPDevice(device_ref[], device_id) +end + +device_id(d::HIPDevice) = d.device_id - 1 + +function stack_size() + value = Ref{Csize_t}() + hipDeviceGetLimit(value, hipLimitStackSize) |> check + value[] +end + +function stack_size!(value::Integer) + hipDeviceSetLimit(hipLimitStackSize, value) |> check +end + +# TODO heap_size tweaking available since 5.5 + +function heap_size() + value = Ref{Csize_t}() + hipDeviceGetLimit(value, hipLimitMallocHeapSize) |> check + value[] +end + +function heap_size!(value::Integer) + hipDeviceSetLimit(hipLimitMallocHeapSize, value) |> check +end + +Base.hash(dev::HIPDevice, h::UInt) = hash(dev.device, h) + +Base.unsafe_convert(::Type{Ptr{T}}, device::HIPDevice) where T = + reinterpret(Ptr{T}, device.device) + +function name(dev::HIPDevice) + name_vec = zeros(Cuchar, 64) + hipDeviceGetName(pointer(name_vec), Cint(64), dev.device) |> check + name_vec[1] == Cuchar(0) || return strip(String(name_vec), '\0') + + # Fallback to HSA device name if HIP failed to report. + AMDGPU.Runtime.hsa_device(dev).name +end + +function properties(dev::HIPDevice) + init_arch_ref = Ref(hipDeviceArch_t()) + arch_field_id = findfirst(i -> i == :arch, fieldnames(hipDeviceProp_t)) + arch_offset = fieldoffset(hipDeviceProp_t, arch_field_id) + + props_ref = Ref{hipDeviceProp_t}() + ccall(:memset, + Cvoid, (Ptr{Cvoid}, Cint, Csize_t), + props_ref, 0, sizeof(hipDeviceProp_t)) + ccall(:memcpy, + Cvoid, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t), + Base.unsafe_convert(Ptr{Cvoid}, props_ref) + arch_offset, + init_arch_ref, sizeof(hipDeviceArch_t)) + + hipGetDeviceProperties(props_ref, device_id(dev)) |> check + props_ref[] +end + +function Base.show(io::IO, device::HIPDevice) + print(io, "HIPDevice(name=\"$(name(device))\", id=$(device.device_id))") +end + +function ndevices() + count_ref = Ref{Cint}() + hipGetDeviceCount(count_ref) |> check + count_ref[] +end + +devices() = [HIPDevice(i) for i in 1:ndevices()] + +function device() + device_id_ref = Ref{Cint}() + hipGetDevice(device_id_ref) |> check + return HIPDevice(device_id_ref[]+1) +end + +device!(device::HIPDevice) = hipSetDevice(device_id(device)) |> check + +device!(device_id::Integer) = hipSetDevice(device_id - 1) |> check + +function device!(f::Base.Callable, device::HIPDevice) + old_device_id_ref = Ref{Cint}() + hipGetDevice(old_device_id_ref) |> check + device!(device) + try + f() + finally + device!(old_device_id_ref[] + 1) + end +end diff --git a/src/hip/event.jl b/src/hip/event.jl new file mode 100644 index 000000000..b15832b5a --- /dev/null +++ b/src/hip/event.jl @@ -0,0 +1,64 @@ +mutable struct HIPEvent + handle::hipEvent_t + stream::hipStream_t +end + +Base.:(==)(a::HIPEvent, b::HIPEvent) = a.handle == b.handle + +Base.unsafe_convert(::Type{Ptr{T}}, event::HIPEvent) where T = + reinterpret(Ptr{T}, event.handle) + +function record(event::HIPEvent) + hipEventRecord(event.handle, event.stream) |> check + return event +end + +function isdone(event::HIPEvent) + query = hipEventQuery(event) + if query == hipSuccess + return true + elseif query == hipErrorNotReady + return false + else + throw(HIPError(query)) + end +end + +function non_blocking_synchronize(event::HIPEvent) + isdone(event) && return true + + # spin (initially without yielding to minimize latency) + spins = 0 + while spins < 256 + if spins < 32 + ccall(:jl_cpu_pause, Cvoid, ()) + # Temporary solution before we have gc transition support in codegen. + ccall(:jl_gc_safepoint, Cvoid, ()) + else + yield() + end + isdone(event) && return true + spins += 1 + end + return false +end + +wait(event::HIPEvent) = hipEventSynchronize(event) |> check + +function synchronize(event::HIPEvent) + non_blocking_synchronize(event) || wait(event) + return +end + +function HIPEvent(stream::hipStream_t; do_record::Bool = true) + event_ref = Ref{hipEvent_t}() + hipEventCreateWithFlags(event_ref, hipEventDisableTiming) |> check + event = HIPEvent(event_ref[], stream) + do_record && record(event) + + finalizer(event) do e + hipEventDestroy(e) |> check + end + event +end +HIPEvent(stream::HIPStream; do_record::Bool = true) = HIPEvent(stream.stream; do_record) diff --git a/src/hip/libhip.jl b/src/hip/libhip.jl index d26cabc30..7fcc9acaf 100644 --- a/src/hip/libhip.jl +++ b/src/hip/libhip.jl @@ -1,65 +1,102 @@ -function hipInit(flags::Cint) +function hipInit(flags) ccall((:hipInit, libhip), hipError_t, (Cint,), flags) end -function hipDeviceGet(device_ref::Ref{hipDevice_t}, device_id::Cint) +function hipDeviceGet(dev, device_id) ccall((:hipDeviceGet, libhip), hipError_t, - (Ptr{hipDevice_t}, Cint), device_ref, device_id) + (Ptr{hipDevice_t}, Cint), dev, device_id) end -function hipCtxCreate(ctx_ref::Ref{hipContext_t}, flags::Cuint, device::hipDevice_t) +function hipCtxCreate(ctx_ref, flags, device) ccall((:hipCtxCreate, libhip), hipError_t, - (Ptr{hipContext_t}, Cuint, hipDevice_t), - ctx_ref, flags, device) + (Ptr{hipContext_t}, Cuint, hipDevice_t), + ctx_ref, flags, device) end -function hipCtxDestroy(ctx::hipContext_t) +function hipCtxDestroy(ctx) ccall((:hipCtxDestroy, libhip), hipError_t, (hipContext_t,), ctx) end -function hipCtxSetCurrent(ctx::hipContext_t) +function hipCtxSetCurrent(ctx) ccall((:hipCtxSetCurrent, libhip), hipError_t, (hipContext_t,), ctx) end -function hipCtxGetCurrent(ctx_ref::Ref{hipContext_t}) +function hipCtxGetCurrent(ctx_ref) ccall((:hipCtxGetCurrent, libhip), hipError_t, (Ptr{hipContext_t},), ctx_ref) end -function hipGetDevice(device_id_ref::Ref{Cint}) +function hipGetDevice(device_id_ref) ccall((:hipGetDevice, libhip), hipError_t, (Ptr{Cint},), device_id_ref) end -function hipSetDevice(device_id::Cint) +function hipSetDevice(device_id) ccall((:hipSetDevice, libhip), hipError_t, (Cint,), device_id) end -function hipDeviceGetCount(count_ref::Ref{Cint}) - ccall((:hipDeviceGetCount, libhip), hipError_t, (Ptr{Cint},), count_ref) +function hipGetDeviceCount(count_ref) + ccall((:hipGetDeviceCount, libhip), hipError_t, (Ptr{Cint},), count_ref) end -function hipDeviceGetName(name::Ptr{Cuchar}, len::Cint, device::hipDevice_t) +function hipGetDeviceProperties(prop, dev_id) + ccall((:hipGetDeviceProperties, libhip), hipError_t, + (Ptr{hipDeviceProp_t}, Cint), prop, dev_id) +end + +function hipDeviceGetName(name, len, device) ccall((:hipDeviceGetName, libhip), hipError_t, (Ptr{Cuchar}, Cint, hipDevice_t), name, len, device) end -function hipStreamCreateWithPriority(stream_ref::Ref{hipStream_t}, flags::Cuint, priority::Cint) +function hipDeviceGetAttribute(val, attribute, device_id) + ccall((:hipDeviceGetAttribute, libhip), hipError_t, + (Ptr{Cint}, hipDeviceAttribute_t, Cint), val, attribute, device_id) +end + +function hipEventCreate(event_ref) + ccall((:hipEventCreate, libhip), hipError_t, (Ptr{hipEvent_t},), event_ref) +end + +function hipEventCreateWithFlags(event_ref, flags) + ccall((:hipEventCreateWithFlags, libhip), hipError_t, + (Ptr{hipEvent_t}, Cuint), event_ref, flags) +end + +function hipEventDestroy(event) + ccall((:hipEventDestroy, libhip), hipError_t, (hipEvent_t,), event) +end + +function hipEventRecord(event, stream) + ccall((:hipEventRecord, libhip), hipError_t, + (hipEvent_t, hipStream_t), event, stream) +end + +function hipEventQuery(event) + ccall((:hipEventQuery, libhip), hipError_t, (hipEvent_t,), event) +end + +function hipEventSynchronize(event) + ccall((:hipEventSynchronize, libhip), hipError_t, (hipEvent_t,), event) +end + +function hipStreamCreateWithPriority(stream_ref, flags, priority) ccall((:hipStreamCreateWithPriority, libhip), hipError_t, - (Ptr{hipStream_t}, Cuint, Cint), stream_ref, flags, priority) + (Ptr{hipStream_t}, Cuint, Cint), stream_ref, flags, priority) end -function hipStreamGetPriority(stream::hipStream_t, priority::Ref{Cint}) - ccall((:hipStreamGetPriority, libhip), hipError_t, (hipStream_t, Ptr{Cint}), stream, priority) +function hipStreamGetPriority(stream, priority) + ccall((:hipStreamGetPriority, libhip), hipError_t, + (hipStream_t, Ptr{Cint}), stream, priority) end -function hipStreamSynchronize(stream::hipStream_t) +function hipStreamSynchronize(stream) ccall((:hipStreamSynchronize, libhip), hipError_t, (hipStream_t,), stream) end -function hipStreamDestroy(stream::hipStream_t) +function hipStreamDestroy(stream) ccall((:hipStreamDestroy, libhip), hipError_t, (hipStream_t,), stream) end -function hipStreamQuery(stream::hipStream_t) +function hipStreamQuery(stream) ccall((:hipStreamQuery, libhip), hipError_t, (hipStream_t,), stream) end @@ -74,3 +111,169 @@ end function hipFree(ptr::Ptr{Cvoid}) ccall((:hipFree, libhip), hipError_t, (Ptr{Cvoid},), ptr) end + +function hipHostMalloc(ptr, sz, flags) + ccall((:hipHostMalloc, libhip), hipError_t, + (Ptr{Ptr{Cvoid}}, Csize_t, Cuint), ptr, sz, flags) +end + +function hipHostFree(ptr) + ccall((:hipHostFree, libhip), hipError_t, (Ptr{Cvoid},), ptr) +end + +function hipHostRegister(hostPtr, sizeBytes, flags) + ccall((:hipHostRegister, libhip), hipError_t, + (Ptr{Cvoid}, Csize_t, Cuint), + hostPtr, sizeBytes, flags) +end + +function hipHostUnregister(hostPtr) + ccall((:hipHostUnregister, libhip), hipError_t, (Ptr{Cvoid},), hostPtr) +end + +function hipHostGetDevicePointer(devPtr, hstPtr, flags) + ccall((:hipHostGetDevicePointer, libhip), hipError_t, + (Ptr{Ptr{Cvoid}}, Ptr{Cvoid}, Cuint), devPtr, hstPtr, flags) +end + +function hipMallocAsync(ptr, sz, stream) + ccall((:hipMallocAsync, libhip), hipError_t, + (Ptr{Ptr{Cvoid}}, Csize_t, hipStream_t), + ptr, sz, stream) +end + +function hipFreeAsync(ptr, stream) + ccall((:hipFreeAsync, libhip), hipError_t, + (Ptr{Cvoid}, hipStream_t), ptr, stream) +end + +function hipMemcpyHtoDAsync(dst, src, sz, stream) + ccall((:hipMemcpyHtoDAsync, libhip), hipError_t, + (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t, hipStream_t), + dst, src, sz, stream) +end + +function hipMemcpyDtoHAsync(dst, src, sz, stream) + ccall((:hipMemcpyDtoHAsync, libhip), hipError_t, + (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t, hipStream_t), + dst, src, sz, stream) +end + +function hipMemcpyDtoDAsync(dst, src, sz, stream) + ccall((:hipMemcpyDtoDAsync, libhip), hipError_t, + (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t, hipStream_t), + dst, src, sz, stream) +end + +function hipMemGetInfo(free, total) + ccall((:hipMemGetInfo, libhip), hipError_t, + (Ptr{Csize_t}, Ptr{Csize_t}), free, total) +end + +function hipDeviceGetDefaultMemPool(pool, device_id) + ccall((:hipDeviceGetDefaultMemPool, libhip), hipError_t, + (Ptr{hipMemPool_t}, Cint), pool, device_id) +end + +function hipDeviceGetMemPool(pool, device_id) + ccall((:hipDeviceGetMemPool, libhip), hipError_t, + (Ptr{hipMemPool_t}, Cint), pool, device_id) +end + +function hipDeviceSetMemPool(device_id, pool) + ccall((:hipDeviceSetMemPool, libhip), hipError_t, + (Cint, hipMemPool_t), device_id, pool) +end + +function hipMemPoolTrimTo(pool, min_bytes_to_hold) + ccall((:hipMemPoolTrimTo, libhip), hipError_t, + (hipMemPool_t, Csize_t), pool, min_bytes_to_hold) +end + +function hipMemPoolSetAttribute(pool, attr, value) + ccall((:hipMemPoolSetAttribute, libhip), hipError_t, + (hipMemPool_t, hipMemPoolAttr, Ptr{Cvoid}), pool, attr, value) +end + +function hipMemPoolGetAttribute(pool, attr, value) + ccall((:hipMemPoolGetAttribute, libhip), hipError_t, + (hipMemPool_t, hipMemPoolAttr, Ptr{Cvoid}), pool, attr, value) +end + +function hipMemPoolCreate(pool, props) + ccall((:hipMemPoolCreate, libhip), hipError_t, + (Ptr{hipMemPool_t}, Ptr{hipMemPoolProps}), pool, props) +end + +function hipMemPoolDestroy(pool) + ccall((:hipMemPoolDestroy, libhip), hipError_t, (hipMemPool_t,), pool) +end + +function hipDeviceGetLimit(value, limit) + ccall((:hipDeviceGetLimit, libhip), hipError_t, + (Ptr{Csize_t}, hipLimit_t), value, limit) +end + +function hipDeviceSetLimit(limit, value) + ccall((:hipDeviceSetLimit, libhip), hipError_t, + (hipLimit_t, Csize_t), limit, value) +end + +function hiprtcLinkCreate(n_options, option_ptr, option_vals_pptr, hip_link_state_ptr) + ccall((:hiprtcLinkCreate, libhip), hiprtcResult, + (Cuint, Ptr{hiprtcJIT_option}, Ptr{Ptr{Cvoid}}, Ptr{hiprtcLinkState}), + n_options, option_ptr, option_vals_pptr, hip_link_state_ptr) +end + +function hiprtcLinkAddFile( + hip_link_state, input_type, file_path, + num_options, options_ptr, option_vals_pptr, +) + ccall((:hiprtcLinkAddFile, libhip), hiprtcResult, + (hiprtcLinkState, hiprtcJITInputType, Ptr{Cchar}, Cuint, Ptr{hiprtcJIT_option}, Ptr{Ptr{Cvoid}}), + hip_link_state, input_type, file_path, num_options, options_ptr, option_vals_pptr) +end + +function hiprtcLinkComplete(hip_link_state, bin_out, size_out) + ccall((:hiprtcLinkComplete, libhip), hiprtcResult, + (hiprtcLinkState, Ptr{Ptr{Cvoid}}, Ptr{Csize_t}), + hip_link_state, bin_out, size_out) +end + +function hipModuleLoad(mod, fname) + ccall((:hipModuleLoad, libhip), hipError_t, + (Ptr{hipModule_t}, Ptr{Cchar}), mod, fname) +end + +function hipModuleLoadData(mod, img) + ccall((:hipModuleLoadData, libhip), hipError_t, + (Ptr{hipModule_t}, Ptr{Cvoid}), mod, img) +end + +function hipModuleGetFunction(func, mod, name) + ccall((:hipModuleGetFunction, libhip), hipError_t, + (Ptr{hipFunction_t}, hipModule_t, Ptr{Cchar}), func, mod, name) +end + +function hipModuleUnload(mod) + ccall((:hipModuleUnload, libhip), hipError_t, (hipModule_t,), mod) +end + +function hipModuleLaunchKernel( + func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + sharedMemBytes, stream, kernelParams, extra, +) + ccall((:hipModuleLaunchKernel, libhip), hipError_t, + (hipFunction_t, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint, + Cuint, hipStream_t, Ptr{Ptr{Cvoid}}, Ptr{Ptr{Cvoid}}), + func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, + sharedMemBytes, stream, kernelParams, extra) +end + +function hipModuleOccupancyMaxPotentialBlockSize( + gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit, +) + ccall((:hipModuleOccupancyMaxPotentialBlockSize, libhip), hipError_t, + (Ptr{Cint}, Ptr{Cint}, hipFunction_t, Csize_t, Cint), + gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit) +end diff --git a/src/hip/libhip_common.jl b/src/hip/libhip_common.jl index 58031e072..baa4a1904 100644 --- a/src/hip/libhip_common.jl +++ b/src/hip/libhip_common.jl @@ -1,13 +1,129 @@ +const HIP_LAUNCH_PARAM_BUFFER_POINTER = Ptr{Cvoid}(1) +const HIP_LAUNCH_PARAM_BUFFER_SIZE = Ptr{Cvoid}(2) +const HIP_LAUNCH_PARAM_END = Ptr{Cvoid}(3) + +@cenum hiprtcResult::UInt32 begin + HIPRTC_SUCCESS = 0 + HIPRTC_ERROR_OUT_OF_MEMORY = 1 + HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2 + HIPRTC_ERROR_INVALID_INPUT = 3 + HIPRTC_ERROR_INVALID_PROGRAM = 4 + HIPRTC_ERROR_INVALID_OPTION = 5 + HIPRTC_ERROR_COMPILATION = 6 + HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7 + HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8 + HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9 + HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10 + HIPRTC_ERROR_INTERNAL_ERROR = 11 + HIPRTC_ERROR_LINKING = 100 +end + +@cenum hiprtcJIT_option::UInt32 begin + HIPRTC_JIT_MAX_REGISTERS = 0 + HIPRTC_JIT_THREADS_PER_BLOCK + HIPRTC_JIT_WALL_TIME + HIPRTC_JIT_INFO_LOG_BUFFER + HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES + HIPRTC_JIT_ERROR_LOG_BUFFER + HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES + HIPRTC_JIT_OPTIMIZATION_LEVEL + HIPRTC_JIT_TARGET_FROM_HIPCONTEXT + HIPRTC_JIT_TARGET + HIPRTC_JIT_FALLBACK_STRATEGY + HIPRTC_JIT_GENERATE_DEBUG_INFO + HIPRTC_JIT_LOG_VERBOSE + HIPRTC_JIT_GENERATE_LINE_INFO + HIPRTC_JIT_CACHE_MODE + HIPRTC_JIT_NEW_SM3X_OPT + HIPRTC_JIT_FAST_COMPILE + HIPRTC_JIT_GLOBAL_SYMBOL_NAMES + HIPRTC_JIT_GLOBAL_SYMBOL_ADDRESS + HIPRTC_JIT_GLOBAL_SYMBOL_COUNT + HIPRTC_JIT_LTO + HIPRTC_JIT_FTZ + HIPRTC_JIT_PREC_DIV + HIPRTC_JIT_PREC_SQRT + HIPRTC_JIT_FMA + HIPRTC_JIT_NUM_OPTIONS +end + +@cenum hiprtcJITInputType::UInt32 begin + HIPRTC_JIT_INPUT_CUBIN = 0 + HIPRTC_JIT_INPUT_PTX + HIPRTC_JIT_INPUT_FATBINARY + HIPRTC_JIT_INPUT_OBJECT + HIPRTC_JIT_INPUT_LIBRARY + HIPRTC_JIT_INPUT_NVVM + HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES + HIPRTC_JIT_INPUT_LLVM_BITCODE = 100 + HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE = 101 + HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE = 102 +end + +@cenum hipLimit_t::UInt32 begin + hipLimitStackSize = 0 + hipLimitPrintfFifoSize = 1 + hipLimitMallocHeapSize = 2 +end + +@cenum hipMemAllocationHandleType::UInt32 begin + hipMemHandleTypeNone = 0 + hipMemHandleTypePosixFileDescriptor = 1 + hipMemHandleTypeWin32 = 2 + hipMemHandleTypeWin32Kmt = 4 +end + +@cenum hipMemAllocationType::UInt32 begin + hipMemAllocationTypeInvalid = 0 + hipMemAllocationTypePinned = 1 + hipMemAllocationTypeMax = 2147483647 +end + +@cenum hipMemLocationType::UInt32 begin + hipMemLocationTypeInvalid = 0 + hipMemLocationTypeDevice = 1 +end + +@cenum hipMemPoolAttr::UInt32 begin + hipMemPoolReuseFollowEventDependencies = 1 + hipMemPoolReuseAllowOpportunistic = 2 + hipMemPoolReuseAllowInternalDependencies = 3 + hipMemPoolAttrReleaseThreshold = 4 + hipMemPoolAttrReservedMemCurrent = 5 + hipMemPoolAttrReservedMemHigh = 6 + hipMemPoolAttrUsedMemCurrent = 7 + hipMemPoolAttrUsedMemHigh = 8 +end + +@cenum hipEventFlag_t::Cuint begin + hipEventDefault = 0 + hipEventDisableTiming = 2 + hipEventInterprocess = 4 +end + @cenum hipError_t::UInt32 begin hipSuccess = 0 + hipErrorInvalidValue = 1 hipErrorOutOfMemory = 2 + hipErrorMemoryAllocation = 2 hipErrorNotInitialized = 3 + hipErrorInitializationError = 3 hipErrorDeinitialized = 4 hipErrorProfilerDisabled = 5 hipErrorProfilerNotInitialized = 6 hipErrorProfilerAlreadyStarted = 7 hipErrorProfilerAlreadyStopped = 8 + hipErrorInvalidConfiguration = 9 + hipErrorInvalidPitchValue = 12 + hipErrorInvalidSymbol = 13 + hipErrorInvalidDevicePointer = 17 + hipErrorInvalidMemcpyDirection = 21 hipErrorInsufficientDriver = 35 + hipErrorMissingConfiguration = 52 + hipErrorPriorLaunchFailure = 53 + hipErrorInvalidDeviceFunction = 98 + hipErrorNoDevice = 100 + hipErrorInvalidDevice = 101 hipErrorInvalidImage = 200 hipErrorInvalidContext = 201 hipErrorContextAlreadyCurrent = 202 @@ -31,42 +147,368 @@ hipErrorSharedObjectSymbolNotFound = 302 hipErrorSharedObjectInitFailed = 303 hipErrorOperatingSystem = 304 - hipErrorSetOnActiveProcess = 305 hipErrorInvalidHandle = 400 + hipErrorIllegalState = 401 hipErrorNotFound = 500 + hipErrorNotReady = 600 hipErrorIllegalAddress = 700 - hipErrorInvalidSymbol = 701 - # Runtime Error Codes start here. - hipErrorMissingConfiguration = 1001 - hipErrorMemoryAllocation = 1002 - hipErrorInitializationError = 1003 - hipErrorLaunchFailure = 1004 - hipErrorPriorLaunchFailure = 1005 - hipErrorLaunchTimeOut = 1006 - hipErrorLaunchOutOfResources = 1007 - hipErrorInvalidDeviceFunction = 1008 - hipErrorInvalidConfiguration = 1009 - hipErrorInvalidDevice = 1010 - hipErrorInvalidValue = 1011 - hipErrorInvalidDevicePointer = 1017 - hipErrorInvalidMemcpyDirection = 1021 - hipErrorUnknown = 1030 - hipErrorInvalidResourceHandle = 1033 - hipErrorNotReady = 1034 - hipErrorNoDevice = 1038 - hipErrorPeerAccessAlreadyEnabled = 1050 - hipErrorPeerAccessNotEnabled = 1051 + hipErrorLaunchOutOfResources = 701 + hipErrorLaunchTimeOut = 702 + hipErrorPeerAccessAlreadyEnabled = 704 + hipErrorPeerAccessNotEnabled = 705 + hipErrorSetOnActiveProcess = 708 + hipErrorContextIsDestroyed = 709 + hipErrorAssert = 710 + hipErrorHostMemoryAlreadyRegistered = 712 + hipErrorHostMemoryNotRegistered = 713 + hipErrorLaunchFailure = 719 + hipErrorCooperativeLaunchTooLarge = 720 + hipErrorNotSupported = 801 + hipErrorStreamCaptureUnsupported = 900 + hipErrorStreamCaptureInvalidated = 901 + hipErrorStreamCaptureMerge = 902 + hipErrorStreamCaptureUnmatched = 903 + hipErrorStreamCaptureUnjoined = 904 + hipErrorStreamCaptureIsolation = 905 + hipErrorStreamCaptureImplicit = 906 + hipErrorCapturedEvent = 907 + hipErrorStreamCaptureWrongThread = 908 + hipErrorGraphExecUpdateFailure = 910 + hipErrorUnknown = 999 + # HSA Runtime Error Codes start here. hipErrorRuntimeMemory = 1052 hipErrorRuntimeOther = 1053 - hipErrorHostMemoryAlreadyRegistered = 1061 - hipErrorHostMemoryNotRegistered = 1062 - hipErrorMapBufferObjectFailed = 1071 - hipErrorAssert = 1081 - hipErrorNotSupported = 1082 hipErrorTbd end hipContext_t = Ptr{Cvoid} + hipDevice_t = Ptr{Cvoid} + hipStream_t = Ptr{Cvoid} + hipEvent_t = Ptr{Cvoid} + +hipMemPool_t = Ptr{Cvoid} + +hipModule_t = Ptr{Cvoid} + +hipFunction_t = Ptr{Cvoid} + +hiprtcLinkState = Ptr{Cvoid} + +struct hipMemLocation + type::hipMemLocationType + id::Cint +end + +struct hipMemPoolProps + allocType::hipMemAllocationType + handleTypes::hipMemAllocationHandleType + location::hipMemLocation + win32SecurityAttributes::Ptr{Cvoid} + reserved::NTuple{64,Cuchar} +end + +Base.@kwdef struct hipDeviceArch_t + hasGlobalInt32Atomics::Cuint = 1 + hasGlobalFloatAtomicExch::Cuint = 1 + hasSharedInt32Atomics::Cuint = 1 + hasSharedFloatAtomicExch::Cuint = 1 + hasFloatAtomicAdd::Cuint = 1 + + hasGlobalInt64Atomics::Cuint = 1 + hasSharedInt64Atomics::Cuint = 1 + + # Doubles + hasDoubles::Cuint = 1 + + # Warp cross-lane operations + hasWarpVote::Cuint = 1 + hasWarpBallot::Cuint = 1 + hasWarpShuffle::Cuint = 1 + hasFunnelShift::Cuint = 1 + + # Sync + hasThreadFenceSystem::Cuint = 1 + hasSyncThreadsExt::Cuint = 1 + + # Misc + hasSurfaceFuncs::Cuint = 1 + has3dGrid::Cuint = 1 + hasDynamicParallelism::Cuint = 1 +end + +function Base.show(io::IO, arch::hipDeviceArch_t) + print(io, + """ + struct hipDeviceArch_t + hasGlobalInt32Atomics = $(arch.hasGlobalInt32Atomics) + hasGlobalFloatAtomicExch = $(arch.hasGlobalFloatAtomicExch) + hasSharedInt32Atomics = $(arch.hasSharedInt32Atomics) + hasSharedFloatAtomicExch = $(arch.hasSharedFloatAtomicExch) + hasFloatAtomicAdd = $(arch.hasFloatAtomicAdd) + + hasGlobalInt64Atomics = $(arch.hasGlobalInt64Atomics) + hasSharedInt64Atomics = $(arch.hasSharedInt64Atomics) + + # Doubles + hasDoubles = $(arch.hasDoubles) + + # Warp cross-lane operations + hasWarpVote = $(arch.hasWarpVote) + hasWarpBallot = $(arch.hasWarpBallot) + hasWarpShuffle = $(arch.hasWarpShuffle) + hasFunnelShift = $(arch.hasFunnelShift) + + # Sync + hasThreadFenceSystem = $(arch.hasThreadFenceSystem) + hasSyncThreadsExt = $(arch.hasSyncThreadsExt) + + # Misc + hasSurfaceFuncs = $(arch.hasSurfaceFuncs) + has3dGrid = $(arch.has3dGrid) + hasDynamicParallelism = $(arch.hasDynamicParallelism) + end + """) +end + +struct hipDeviceProp_t + name::NTuple{256, Cchar} + totalGlobalMem::Csize_t + sharedMemPerBlock::Csize_t + regsPerBlock::Cint + warpSize::Cint + maxThreadsPerBlock::Cint + maxThreadsDim::NTuple{3, Cint} + maxGridSize::NTuple{3, Cint} + clockRate::Cint + memoryClockRate::Cint + memoryBusWidth::Cint + totalConstMem::Csize_t + major::Cint + minor::Cint + multiProcessorCount::Cint + l2CacheSize::Cint + maxThreadsPerMultiProcessor::Cint + computeMode::Cint + clockInstructionRate::Cint + arch::hipDeviceArch_t + concurrentKernels::Cint + pciDomainID::Cint + pciBusID::Cint + pciDeviceID::Cint + maxSharedMemoryPerMultiProcessor::Csize_t + isMultiGpuBoard::Cint + canMapHostMemory::Cint + gcnArch::Cint + gcnArchName::NTuple{256, Cchar} + integrated::Cint + cooperativeLaunch::Cint + cooperativeMultiDeviceLaunch::Cint + maxTexture1DLinear::Cint + maxTexture1D::Cint + maxTexture2D::NTuple{2, Cint} + maxTexture3D::NTuple{3, Cint} + hdpMemFlushCntl::Ptr{Cuint} + hdpRegFlushCntl::Ptr{Cuint} + memPitch::Csize_t + textureAlignment::Csize_t + texturePitchAlignment::Csize_t + kernelExecTimeoutEnabled::Cint + ECCEnabled::Cint + tccDriver::Cint + cooperativeMultiDeviceUnmatchedFunc::Cint + cooperativeMultiDeviceUnmatchedGridDim::Cint + cooperativeMultiDeviceUnmatchedBlockDim::Cint + cooperativeMultiDeviceUnmatchedSharedMem::Cint + isLargeBar::Cint + asicRevision::Cint + managedMemory::Cint + directManagedMemAccessFromHost::Cint + concurrentManagedAccess::Cint + pageableMemoryAccess::Cint + pageableMemoryAccessUsesHostPageTables::Cint +end + +function Base.show(io::IO, props::hipDeviceProp_t) + name_ptr = pointer([props.name...]) + gcn_name = pointer([props.gcnArchName...]) + print(io, + """ + struct hipDeviceProp_t + name = $(unsafe_string(name_ptr)) + totalGlobalMem = $(Base.format_bytes(props.totalGlobalMem)) + sharedMemPerBlock = $(Base.format_bytes(props.sharedMemPerBlock)) + regsPerBlock = $(props.regsPerBlock) + warpSize = $(props.warpSize) + maxThreadsPerBlock = $(props.maxThreadsPerBlock) + maxThreadsDim = $(props.maxThreadsDim) + maxGridSize = $(props.maxGridSize) + clockRate = $(props.clockRate) + memoryClockRate = $(props.memoryClockRate) + memoryBusWidth = $(props.memoryBusWidth) + totalConstMem = $(Base.format_bytes(props.totalConstMem)) + major = $(props.major) + minor = $(props.minor) + multiProcessorCount = $(props.multiProcessorCount) + l2CacheSize = $(Base.format_bytes(props.l2CacheSize)) + maxThreadsPerMultiProcessor = $(props.maxThreadsPerMultiProcessor) + computeMode = $(props.computeMode) + clockInstructionRate = $(props.clockInstructionRate) + arch = [printed separately below] + concurrentKernels = $(props.concurrentKernels) + pciBusID = $(props.pciBusID) + pciDeviceID = $(props.pciDeviceID) + maxSharedMemoryPerMultiProcessor = $(Base.format_bytes(props.maxSharedMemoryPerMultiProcessor)) + isMultiGpuBoard = $(props.isMultiGpuBoard) + canMapHostMemory = $(props.canMapHostMemory) + gcnArch = $(props.gcnArch) + gcnArchName = $(unsafe_string(gcn_name)) + ... + end + """) + println(io) + show(io, props.arch) +end + +@cenum hipDeviceAttribute_t begin + hipDeviceAttributeCudaCompatibleBegin = 0 + + hipDeviceAttributeEccEnabled = 0 + hipDeviceAttributeAccessPolicyMaxWindowSize + hipDeviceAttributeAsyncEngineCount + hipDeviceAttributeCanMapHostMemory + hipDeviceAttributeCanUseHostPointerForRegisteredMem + + hipDeviceAttributeClockRate + hipDeviceAttributeComputeMode + hipDeviceAttributeComputePreemptionSupported + hipDeviceAttributeConcurrentKernels + hipDeviceAttributeConcurrentManagedAccess + hipDeviceAttributeCooperativeLaunch + hipDeviceAttributeCooperativeMultiDeviceLaunch + hipDeviceAttributeDeviceOverlap + + hipDeviceAttributeDirectManagedMemAccessFromHost + + hipDeviceAttributeGlobalL1CacheSupported + hipDeviceAttributeHostNativeAtomicSupported + hipDeviceAttributeIntegrated + hipDeviceAttributeIsMultiGpuBoard + hipDeviceAttributeKernelExecTimeout + hipDeviceAttributeL2CacheSize + hipDeviceAttributeLocalL1CacheSupported + hipDeviceAttributeLuid + hipDeviceAttributeLuidDeviceNodeMask + hipDeviceAttributeComputeCapabilityMajor + hipDeviceAttributeManagedMemory + hipDeviceAttributeMaxBlocksPerMultiProcessor + hipDeviceAttributeMaxBlockDimX + hipDeviceAttributeMaxBlockDimY + hipDeviceAttributeMaxBlockDimZ + hipDeviceAttributeMaxGridDimX + hipDeviceAttributeMaxGridDimY + hipDeviceAttributeMaxGridDimZ + hipDeviceAttributeMaxSurface1D + hipDeviceAttributeMaxSurface1DLayered + hipDeviceAttributeMaxSurface2D + hipDeviceAttributeMaxSurface2DLayered + hipDeviceAttributeMaxSurface3D + hipDeviceAttributeMaxSurfaceCubemap + hipDeviceAttributeMaxSurfaceCubemapLayered + hipDeviceAttributeMaxTexture1DWidth + hipDeviceAttributeMaxTexture1DLayered + hipDeviceAttributeMaxTexture1DLinear + + hipDeviceAttributeMaxTexture1DMipmap + hipDeviceAttributeMaxTexture2DWidth + hipDeviceAttributeMaxTexture2DHeight + hipDeviceAttributeMaxTexture2DGather + hipDeviceAttributeMaxTexture2DLayered + hipDeviceAttributeMaxTexture2DLinear + hipDeviceAttributeMaxTexture2DMipmap + hipDeviceAttributeMaxTexture3DWidth + hipDeviceAttributeMaxTexture3DHeight + hipDeviceAttributeMaxTexture3DDepth + hipDeviceAttributeMaxTexture3DAlt + hipDeviceAttributeMaxTextureCubemap + hipDeviceAttributeMaxTextureCubemapLayered + hipDeviceAttributeMaxThreadsDim + hipDeviceAttributeMaxThreadsPerBlock + hipDeviceAttributeMaxThreadsPerMultiProcessor + hipDeviceAttributeMaxPitch + hipDeviceAttributeMemoryBusWidth + hipDeviceAttributeMemoryClockRate + hipDeviceAttributeComputeCapabilityMinor + hipDeviceAttributeMultiGpuBoardGroupID + hipDeviceAttributeMultiprocessorCount + hipDeviceAttributeName + hipDeviceAttributePageableMemoryAccess + + hipDeviceAttributePageableMemoryAccessUsesHostPageTables + hipDeviceAttributePciBusId + hipDeviceAttributePciDeviceId + hipDeviceAttributePciDomainID + hipDeviceAttributePersistingL2CacheMaxSize + hipDeviceAttributeMaxRegistersPerBlock + + hipDeviceAttributeMaxRegistersPerMultiprocessor + hipDeviceAttributeReservedSharedMemPerBlock + hipDeviceAttributeMaxSharedMemoryPerBlock + hipDeviceAttributeSharedMemPerBlockOptin + hipDeviceAttributeSharedMemPerMultiprocessor + hipDeviceAttributeSingleToDoublePrecisionPerfRatio + hipDeviceAttributeStreamPrioritiesSupported + hipDeviceAttributeSurfaceAlignment + hipDeviceAttributeTccDriver + hipDeviceAttributeTextureAlignment + hipDeviceAttributeTexturePitchAlignment + hipDeviceAttributeTotalConstantMemory + hipDeviceAttributeTotalGlobalMem + hipDeviceAttributeUnifiedAddressing + hipDeviceAttributeUuid + hipDeviceAttributeWarpSize + hipDeviceAttributeMemoryPoolsSupported + hipDeviceAttributeVirtualMemoryManagementSupported + + hipDeviceAttributeCudaCompatibleEnd = 9999 + hipDeviceAttributeAmdSpecificBegin = 10000 + + hipDeviceAttributeClockInstructionRate = 10000 + hipDeviceAttributeArch + hipDeviceAttributeMaxSharedMemoryPerMultiprocessor + hipDeviceAttributeGcnArch + hipDeviceAttributeGcnArchName + hipDeviceAttributeHdpMemFlushCntl + hipDeviceAttributeHdpRegFlushCntl + hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc + + hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim + + hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim + + hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem + + hipDeviceAttributeIsLargeBar + hipDeviceAttributeAsicRevision + hipDeviceAttributeCanUseStreamWaitValue + + hipDeviceAttributeImageSupport + hipDeviceAttributePhysicalMultiProcessorCount + + hipDeviceAttributeFineGrainSupport + hipDeviceAttributeWallClockRate + + hipDeviceAttributeAmdSpecificEnd = 19999 + hipDeviceAttributeVendorSpecificBegin = 20000 +end + +const hipHostAllocDefault = 0x00 +const hipHostAllocPortable = 0x01 +const hipHostAllocMapped = 0x02 + +const hipHostRegisterDefault = 0x0 +const hipHostRegisterPortable = 0x1 +const hipHostRegisterMapped = 0x2 + diff --git a/src/hip/module.jl b/src/hip/module.jl new file mode 100644 index 000000000..882c270ad --- /dev/null +++ b/src/hip/module.jl @@ -0,0 +1,46 @@ +mutable struct HIPModule + handle::hipModule_t + dev::HIPDevice + + function HIPModule(data) + dev = device() + device_synchronize() + + # TODO use alloc_retry? + mod_ref = Ref{hipModule_t}() + hipModuleLoadData(mod_ref, data) |> check + mod = new(mod_ref[], dev) + + finalizer(mod) do mod + hipModuleUnload(mod) |> check + end + mod + end +end + +Base.unsafe_convert(::Type{hipModule_t}, mod::HIPModule) = mod.handle + +struct HIPFunction + handle::hipFunction_t + mod::HIPModule + global_hostcalls::Vector{Symbol} + + function HIPFunction( + mod::HIPModule, name::String, global_hostcalls::Vector{Symbol}, + ) + fun_ref = Ref{hipFunction_t}() + hipModuleGetFunction(fun_ref, mod, name) |> check + new(fun_ref[], mod, global_hostcalls) + end +end + +Base.unsafe_convert(::Type{hipFunction_t}, fun::HIPFunction) = fun.handle + +function launch_configuration( + fun::HIPFunction; shmem::Integer = 0, max_block_size::Integer = 0, +) + grid_size_ref, block_size_ref = Ref{Cint}(), Ref{Cint}() + hipModuleOccupancyMaxPotentialBlockSize( + grid_size_ref, block_size_ref, fun, shmem, max_block_size) |> check + return (; gridsize=grid_size_ref[], groupsize=block_size_ref[]) +end diff --git a/src/hip/pool.jl b/src/hip/pool.jl new file mode 100644 index 000000000..f27f81636 --- /dev/null +++ b/src/hip/pool.jl @@ -0,0 +1,62 @@ +mutable struct HIPMemoryPool + handle::hipMemPool_t + + function HIPMemoryPool(dev::HIPDevice; + alloc_type::hipMemAllocationType = hipMemAllocationTypePinned, + handle_type::hipMemAllocationHandleType = hipMemHandleTypeNone, + ) + location = hipMemLocation(hipMemLocationTypeDevice, device_id(dev)) + props = Ref(hipMemPoolProps( + alloc_type, handle_type, location, + C_NULL, ntuple(i->Cuchar(0), 64))) + + handle_ref = Ref{hipMemPool_t}() + hipMemPoolCreate(handle_ref, props) |> check + pool = new(handle_ref[]) + + finalizer(pool) do pool + hipMemPoolDestroy(pool) |> check + end + return pool + end + + global function default_memory_pool(dev::HIPDevice) + handle_ref = Ref{hipMemPool_t}() + hipDeviceGetDefaultMemPool(handle_ref, device_id(dev)) |> check + new(handle_ref[]) + end + + global function memory_pool(dev::HIPDevice) + handle_ref = Ref{hipMemPool_t}() + hipDeviceGetMemPool(handle_ref, device_id(dev)) |> check + new(handle_ref[]) + end +end + +Base.unsafe_convert(::Type{hipMemPool_t}, pool::HIPMemoryPool) = pool.handle + +Base.:(==)(a::HIPMemoryPool, b::HIPMemoryPool) = a.handle == b.handle + +Base.hash(pool::HIPMemoryPool, h::UInt) = hash(pool.handle, h) + +function attribute(X::Type, pool::HIPMemoryPool, attr::hipMemPoolAttr) + value = Ref{X}() + hipMemPoolGetAttribute(pool, attr, value) |> check + value[] +end + +function attribute!(pool::HIPMemoryPool, attr::hipMemPoolAttr, value) + hipMemPoolSetAttribute(pool, attr, Ref(value)) |> check +end + +function trim(pool::HIPMemoryPool, bytes_to_keep::Integer = 0) + hipMemPoolTrimTo(pool, bytes_to_keep) |> check +end + +function memory_pool!(dev::HIPDevice, pool::HIPMemoryPool) + hipDeviceSetMemPool(device_id(dev), pool) |> check +end + +used_memory(pool::HIPMemoryPool) = attribute(UInt64, pool, hipMemPoolAttrUsedMemCurrent) + +reserved_memory(pool::HIPMemoryPool) = attribute(UInt64, pool, hipMemPoolAttrReservedMemCurrent) diff --git a/src/hip/stream.jl b/src/hip/stream.jl new file mode 100644 index 000000000..48736914a --- /dev/null +++ b/src/hip/stream.jl @@ -0,0 +1,119 @@ +mutable struct HIPStream + stream::hipStream_t + priority::Symbol + device::HIPDevice +end + +""" + HIPStream(priority::Symbol = :normal) + +# Arguments: + +- `priority::Symbol`: Priority of the stream: `:normal`, `:high` or `:low`. + +Create HIPStream with given priority. +Device is the default device that's currently in use. +""" +function HIPStream(priority::Symbol = :normal) + priority_int = symbol_to_priority(priority) + + stream_ref = Ref{hipStream_t}() + hipStreamCreateWithPriority(stream_ref, Cuint(0), priority_int) |> check + stream = HIPStream(stream_ref[], priority, device()) + finalizer(stream) do s + hipStreamDestroy(s.stream) |> check + end + return stream +end + +default_stream() = HIPStream(convert(hipStream_t, C_NULL), :normal, device()) + +""" + HIPStream(stream::hipStream_t) + +Create HIPStream from `hipStream_t` handle. +Device is the default device that's currently in use. +""" +HIPStream(stream::hipStream_t) = HIPStream(stream, priority(stream), device()) + +function isdone(stream::HIPStream) + query = hipStreamQuery(stream) + if query == hipSuccess + return true + elseif query == hipErrorNotReady + return false + else + throw(HIPError(query)) + end +end + +function _low_latency_synchronize(stream::HIPStream) + isdone(stream) && return true + + # spin (initially without yielding to minimize latency) + spins = 0 + while spins < 256 + if spins < 32 + ccall(:jl_cpu_pause, Cvoid, ()) + # Temporary solution before we have gc transition support in codegen. + ccall(:jl_gc_safepoint, Cvoid, ()) + else + yield() + end + isdone(stream) && return true + spins += 1 + end + return false +end + +function non_blocking_synchronize(stream::HIPStream) + while true + yield() + isdone(stream) && return true + end + return false +end + +wait(stream::HIPStream) = hipStreamSynchronize(stream) |> check + +function synchronize(stream::HIPStream; blocking::Bool = true) + if blocking + _low_latency_synchronize(stream) || wait(stream) + else + non_blocking_synchronize(stream) + end + return +end + +Base.unsafe_convert(::Type{Ptr{T}}, stream::HIPStream) where T = + reinterpret(Ptr{T}, stream.stream) + +function Base.show(io::IO, stream::HIPStream) + print(io, "HIPStream(device=$(stream.device), ptr=$(repr(UInt64(stream.stream))), priority=$(stream.priority))") +end + +function priority_to_symbol(priority) + priority == 0 && return :normal + priority == -1 && return :high + priority == 1 && return :low + throw(ArgumentError(""" + Invalid HIP priority: $priority. + Valid values are: 0, -1, 1. + """)) +end + +function symbol_to_priority(priority::Symbol) + priority == :normal && return Cint(0) + priority == :high && return Cint(-1) + priority == :low && return Cint(1) + throw(ArgumentError(""" + Invalid HIP priority symbol: $priority. + Valid values are: `:normal`, `:low`, `:high`. + """)) +end + +function priority(stream::hipStream_t) + priority = Ref{Cint}() + hipStreamGetPriority(stream, priority) |> check + priority_to_symbol(priority[]) +end diff --git a/src/mapreduce.jl b/src/mapreduce.jl index e3e3de6b0..4056f2bb6 100644 --- a/src/mapreduce.jl +++ b/src/mapreduce.jl @@ -5,26 +5,28 @@ # - group-stride loop to delay need for second kernel launch # Reduce a value across a group, using local memory for communication -@inline function reduce_group(op, val::T, neutral, ::Val{maxitems}) where {T, maxitems} - items = workgroupDim().x - item = workitemIdx().x +@inline function reduce_group(op, val::T, neutral) where T + items::UInt32 = workgroupDim().x + item::UInt32 = workitemIdx().x - # shared mem for a complete reduction - shared = ROCDeviceArray((2*maxitems,), Device.alloc_special(Val(:reduce_block), T, Val(AS.Local), Val(2*maxitems))) + # Shared mem for a complete reduction. + shared = @ROCDynamicLocalArray(T, items, false) @inbounds shared[item] = val - # perform a reduction - d = items>>1 - while d > 0 + # Perform a reduction. + d::UInt32 = UInt32(1) + while d < items sync_workgroup() - if item <= d - shared[item] = op(shared[item], shared[item+d]) + index::UInt32 = UInt32(2) * d * (item - UInt32(1)) + UInt32(1) + @inbounds if index ≤ items + other_val = ifelse(index + d ≤ items, shared[index + d], neutral) + shared[index] = op(shared[index], other_val) end - d >>= 1 + d *= UInt32(2) end - # load the final value on the first item - if item == 1 + # Load the final value on the first item. + if item == UInt32(1) val = @inbounds shared[item] end @@ -38,43 +40,42 @@ Base.@propagate_inbounds _map_getindex(args::Tuple{}, I) = () # Reduce an array across the grid. All elements to be processed can be addressed by the # product of the two iterators `Rreduce` and `Rother`, where the latter iterator will have # singleton entries for the dimensions that should be reduced (and vice versa). -function partial_mapreduce_device(f, op, neutral, maxitems, Rreduce, Rother, R, As...) +function partial_mapreduce_device(f, op, neutral, Rreduce, Rother, R, As...) # decompose the 1D hardware indices into separate ones for reduction (across items # and possibly groups if it doesn't fit) and other elements (remaining groups) localIdx_reduce = workitemIdx().x localDim_reduce = workgroupDim().x - groupIdx_reduce, groupIdx_other = fldmod1(workgroupIdx().x, length(Rother)) - groupDim_reduce = gridGroupDim().x ÷ length(Rother) + + n_elements_other::UInt32 = length(Rother) + groupIdx_reduce, groupIdx_other = fldmod1(workgroupIdx().x, n_elements_other) + groupDim_reduce = gridGroupDim().x ÷ n_elements_other # group-based indexing into the values outside of the reduction dimension # (that means we can safely synchronize items within this group) iother = groupIdx_other - @inbounds if iother <= length(Rother) + @inbounds if iother ≤ length(Rother) Iother = Rother[iother] # load the neutral value Iout = CartesianIndex(Tuple(Iother)..., groupIdx_reduce) - neutral = if neutral === nothing - R[Iout] - else - neutral - end + neutral = ifelse(neutral ≡ nothing, R[Iout], neutral) val = op(neutral, neutral) # reduce serially across chunks of input vector that don't fit in a group - ireduce = localIdx_reduce + (groupIdx_reduce - 1) * localDim_reduce - while ireduce <= length(Rreduce) + ireduce = localIdx_reduce + (groupIdx_reduce - UInt32(1)) * localDim_reduce + n_elements_reduce::UInt32 = length(Rreduce) + while ireduce ≤ n_elements_reduce Ireduce = Rreduce[ireduce] J = Base.max(Iother, Ireduce) val = op(val, f(_map_getindex(As, J)...)) ireduce += localDim_reduce * groupDim_reduce end - val = reduce_group(op, val, neutral, maxitems) + val = reduce_group(op, val, neutral) # write back to memory - if localIdx_reduce == 1 + if localIdx_reduce == UInt32(1) R[Iout] = val end end @@ -84,9 +85,10 @@ end ## COV_EXCL_STOP -function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyROCArray{T}, - A::Union{AbstractArray,Broadcast.Broadcasted}; - init=nothing) where {F, OP, T} +function GPUArrays.mapreducedim!( + f::F, op::OP, R::AnyROCArray{T}, + A::Union{AbstractArray,Broadcast.Broadcasted}; init=nothing, +) where {F, OP, T} Base.check_reducedims(R, A) length(A) == 0 && return R # isempty(::Broadcasted) iterates @@ -128,30 +130,14 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyROCArray{T}, # we might not be able to launch all those items to reduce each slice in one go. # that's why each items also loops across their inputs, processing multiple values # so that we can span the entire reduction dimension using a single item group. - - # group size is restricted by local memory - device = AMDGPU.device(R) - pools = filter(p -> Runtime.pool_segment(p) == HSA.AMD_SEGMENT_GROUP, - Runtime.memory_pools(device)) - max_items = if !isempty(pools) - pool = first(pools) - max_lmem_elements = Runtime.pool_size(pool) ÷ sizeof(T) - isa = first(Runtime.isas(device)) - Base.min(Runtime.isa_workgroup_max_size(isa), compute_items(max_lmem_elements ÷ 2)) - else - @warn "No group segment detected for device $device; assuming 64 elements\nThis message will not be shown again" maxlog=1 - 64 - end - # TODO: dynamic local memory to avoid two compilations - - #= TODO: let the runtime suggest a group size - args = (f, op, init, Val(max_items), Rreduce, Rother, R′, A) - kernel_args = rocconvert.(args) - kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - kernel = rocfunction(partial_mapreduce_device, kernel_tt) - reduce_items = compute_items(suggest_groupsize(kernel.fun, wanted_items).x) - =# - reduce_items = max_items + max_block_size = 256 + compute_shmem(items) = items * sizeof(T) + max_shmem = max_block_size |> compute_items |> compute_shmem + kernel = @roc launch=false partial_mapreduce_device( + f, op, init, Rreduce, Rother, R′, A) + kernel_config = launch_configuration(kernel; shmem=max_shmem, max_block_size) + reduce_items = compute_items(kernel_config.groupsize) + reduce_shmem = compute_shmem(reduce_items) # how many groups should we launch? # @@ -162,15 +148,14 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyROCArray{T}, reduce_groups = cld(length(Rreduce), reduce_items) # determine the launch configuration - items = reduce_items - groups = reduce_groups*other_groups - gridsize = items*groups + blocks = reduce_items + grid = reduce_groups * other_groups # perform the actual reduction if reduce_groups == 1 # we can cover the dimensions to reduce using a single group - wait(@roc gridsize=gridsize groupsize=items partial_mapreduce_device( - f, op, init, Val(items), Rreduce, Rother, R′, A)) + @roc gridsize=grid groupsize=blocks shmem=reduce_shmem partial_mapreduce_device( + f, op, init, Rreduce, Rother, R′, A) else # we need multiple steps to cover all values to reduce partial = similar(R, (size(R)..., reduce_groups)) @@ -178,8 +163,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyROCArray{T}, # without an explicit initializer we need to copy from the output container partial .= R end - wait(@roc gridsize=gridsize groupsize=items partial_mapreduce_device( - f, op, init, Val(items), Rreduce, Rother, partial, A)) + @roc gridsize=grid groupsize=blocks shmem=reduce_shmem partial_mapreduce_device( + f, op, init, Rreduce, Rother, partial, A) GPUArrays.mapreducedim!(identity, op, R′, partial; init=init) end diff --git a/src/rand/random.jl b/src/rand/random.jl index c27f48b73..fe75f56e2 100644 --- a/src/rand/random.jl +++ b/src/rand/random.jl @@ -1,10 +1,5 @@ # interfacing with Random standard library -using Random - -using GPUArrays - - mutable struct RNG <: Random.AbstractRNG handle::rocrand_generator typ::rocrand_rng_type @@ -24,7 +19,6 @@ end Base.unsafe_convert(::Type{rocrand_generator}, rng::RNG) = rng.handle - ## seeding function Random.seed!(rng::RNG, seed=Base.rand(UInt64), offset=0) rocrand_set_seed(rng, seed) @@ -46,9 +40,7 @@ for (f,T) in ((:rocrand_generate, :UInt32), (:rocrand_generate_char,:Cuchar), (:rocrand_generate_uniform_double, :Float64), (:rocrand_generate_uniform_half, :Float16)) @eval begin function Random.rand!(rng::RNG, A::ROCArray{$(T)}) - wait!(A) $(f)(rng, A, length(A)) - mark!(A, C_NULL) return A end end @@ -56,16 +48,13 @@ end # some functions need pow2 lengths: use a padded array and copy back to the original one function inplace_pow2(A, f) - wait!(A) len = length(A) if len > 1 && ispow2(len) f(A) - mark!(A, C_NULL) else padlen = max(2, nextpow(2, len)) B = similar(A, padlen) f(B) - mark!(B, C_NULL) copyto!(A, 1, B, 1, len) AMDGPU.unsafe_free!(B) end @@ -156,24 +145,24 @@ rand_poisson(rng::RNG, T::PoissonType, dim1::Integer, dims::Integer...; kwargs.. rand_poisson(rng, T, Dims((dim1, dims...)); kwargs...) # rand_logn! and rand_poisson! without specified rng -rand_logn!(A::rocRAND.LognormalArray; kwargs...) = rand_logn!(default_rng(), A; kwargs...) -rand_poisson!(A::rocRAND.PoissonArray; kwargs...) = rand_poisson!(default_rng(), A; kwargs...) +rand_logn!(A::rocRAND.LognormalArray; kwargs...) = rand_logn!(handle(), A; kwargs...) +rand_poisson!(A::rocRAND.PoissonArray; kwargs...) = rand_poisson!(handle(), A; kwargs...) -rand_logn(T::rocRAND.LognormalType, dims::Dims; kwargs...) = rand_logn(default_rng(), T, dims; kwargs...) -rand_poisson(T::rocRAND.PoissonType, dims::Dims; kwargs...) = rand_poisson(default_rng(), T, dims; kwargs...) +rand_logn(T::rocRAND.LognormalType, dims::Dims; kwargs...) = rand_logn(handle(), T, dims; kwargs...) +rand_poisson(T::rocRAND.PoissonType, dims::Dims; kwargs...) = rand_poisson(handle(), T, dims; kwargs...) rand_logn(T::rocRAND.LognormalType, dim1::Integer, dims::Integer...; kwargs...) = - rand_logn(default_rng(), T, Dims((dim1, dims...)); kwargs...) + rand_logn(handle(), T, Dims((dim1, dims...)); kwargs...) rand_poisson(T::rocRAND.PoissonType, dim1::Integer, dims::Integer...; kwargs...) = - rand_poisson(default_rng(), T, Dims((dim1, dims...)); kwargs...) + rand_poisson(handle(), T, Dims((dim1, dims...)); kwargs...) rand_logn(T::Type, dim1::Integer, dims::Integer...; kwargs...) = rand_logn!(ROCArray{T}(undef, dim1, dims...); kwargs...) rand_poisson(T::Type, dim1::Integer, dims::Integer...; kwargs...) = rand_poisson!(ROCArray{T}(undef, dim1, dims...); kwargs...) rand_logn(dim1::Integer, dims::Integer...; kwargs...) = - rand_logn(default_rng(), Dims((dim1, dims...)); kwargs...) + rand_logn(handle(), Dims((dim1, dims...)); kwargs...) rand_poisson(dim1::Integer, dims::Integer...; kwargs...) = - rand_poisson(default_rng(), Dims((dim1, dims...)); kwargs...) + rand_poisson(handle(), Dims((dim1, dims...)); kwargs...) rand_logn(T::Type, dims::Dims; kwargs...) = rand_logn!(ROCArray{T}(undef, dims...); kwargs...) rand_poisson(T::Type, dims::Dims; kwargs...) = rand_poisson!(ROCArray{T}(undef, dims...); kwargs...) rand_logn!(A::ROCArray; kwargs...) = diff --git a/src/rand/rocRAND.jl b/src/rand/rocRAND.jl index 48e54c878..8c4e2f184 100644 --- a/src/rand/rocRAND.jl +++ b/src/rand/rocRAND.jl @@ -1,12 +1,13 @@ module rocRAND import ..AMDGPU -import .AMDGPU: ROCArray, HandleCache, librocrand, mark!, wait! -import ..HSA +import .AMDGPU: ROCArray, HandleCache, librocrand, library_state import ..HIP import .HIP: HIPContext, HIPStream, hipStream_t using CEnum +using GPUArrays +using Random export rand_logn!, rand_poisson!, rand_logn, rand_poisson @@ -21,43 +22,17 @@ end # stdlib Random integration include("random.jl") -# Copied from CUDA.jl/lib/curand/CURAND.jl +const IDLE_RNGS = HandleCache{HIPContext, RNG}() -# cache for created, but unused handles -const idle_rngs = HandleCache{HIPContext,RNG}() +lib_state() = library_state( + :rocRAND, RNG, IDLE_RNGS, + () -> RNG(), r -> return, # RNG destroys itself in finalizer. + (nh, s) -> begin + Random.seed!(nh) + rocrand_set_stream(nh.handle, s) + end) -function default_rng() - tls = AMDGPU.task_local_state() - - # every task maintains library state per device - LibraryState = @NamedTuple{rng::RNG} - states = get!(task_local_storage(), :rocRAND) do - Dict{HIPContext,LibraryState}() - end::Dict{HIPContext,LibraryState} - - # get library state - @noinline function new_state(tls) - new_rng = pop!(idle_rngs, tls.context) do - RNG() - end - - finalizer(current_task()) do task - push!(idle_rngs, tls.context, new_rng) do - # no need to do anything, as the RNG is collected by its finalizer - end - end - - Random.seed!(new_rng) - - rocrand_set_stream(new_rng.handle, tls.stream) - - (; rng=new_rng) - end - state = get!(states, tls.context) do - new_state(tls) - end - - return state.rng -end +handle() = lib_state().handle +stream() = lib_state().stream end diff --git a/src/random.jl b/src/random.jl index f74971611..23b392683 100644 --- a/src/random.jl +++ b/src/random.jl @@ -6,8 +6,8 @@ const GPUARRAY_RNG = Ref{Union{Nothing,GPUArrays.RNG}}(nothing) function GPUArrays.default_rng(::Type{<:ROCArray}) if GPUARRAY_RNG[] == nothing - device = AMDGPU.default_device() - N = Int(Runtime.device_workgroup_max_size(device)) + device = AMDGPU.device() + N = HIP.properties(device).maxThreadsPerBlock state = ROCArray{NTuple{4, UInt32}}(undef, N) GPUARRAY_RNG[] = GPUArrays.RNG(state) Random.seed!(GPUARRAY_RNG[]) @@ -16,7 +16,7 @@ function GPUArrays.default_rng(::Type{<:ROCArray}) end gpuarrays_rng() = GPUArrays.default_rng(ROCArray) -const rocrand_rng = librocrand !== nothing ? rocRAND.default_rng : gpuarrays_rng +const rocrand_rng = librocrand !== nothing ? rocRAND.handle : gpuarrays_rng # the interface is split in two levels: # - functions that extend the Random standard library, and take an RNG as first argument, diff --git a/src/reflection.jl b/src/reflection.jl index b02a93fe2..f1dc93db3 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -12,11 +12,10 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native) @eval begin function $method( io::IO, @nospecialize(func), @nospecialize(types); - kernel::Bool=false, device=default_device(), kwargs..., + kernel::Bool=false, device=HIP.device(), kwargs..., ) source = methodinstance(typeof(func), Base.to_tuple_type(types)) - config = Compiler.compiler_config( - device; kernel, global_hooks=NamedTuple()) + config = Compiler.compiler_config(device; kernel) job = CompilerJob(source, config) GPUCompiler.$method($(args...); kwargs...) end diff --git a/src/rocm_discovery.jl b/src/rocm_discovery.jl new file mode 100644 index 000000000..1e506b738 --- /dev/null +++ b/src/rocm_discovery.jl @@ -0,0 +1,223 @@ +const rocm_ext_libs = [ + (:rocblas, :rocBLAS_jll), + (:rocsparse, :rocSPARSE_jll), + (:rocsolver, nothing), + (:rocalution, nothing), + (:rocrand, :rocRAND_jll), + (:rocfft, nothing), + (:MIOpen, :MIOpen_jll)] + +function enable_artifacts!(flag::Bool = true; show_message::Bool = true) + @set_preferences!("use_artifacts" => flag) + if show_message + @info """ + Switched `use_artifacts` to `$flag`. + Restart Julia session for the changes to take effect. + """ + end +end + +# TODO need updated ROCm artifacts to enable them again (5.4+). +use_artifacts()::Bool = @load_preference("use_artifacts", false) + +if haskey(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS") + disable_artifacts = parse(Bool, get(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS", "false")) + enable_artifacts!(!disable_artifacts; show_message=false) +end + +function find_artifact_library!(config, pkg; name::Symbol, lib::Symbol) + if pkg !== nothing + loaded, available, error_str = safe_import(pkg) + if loaded + if available + config[Symbol(:lib, name)] = getfield(@eval($pkg), lib) + config[Symbol(name, :_configured)] = true + else + config[Symbol(name, :_build_reason)] = "`$pkg` is not available on this platform." + end + else + iob = IOBuffer() + println(iob, "`import $pkg` failed:") + print(iob, error_str) + config[Symbol(name, :_build_reason)] = String(take!(iob)) + end + end +end + +function find_system_library!( + config, name::Symbol; lib, dirs = find_roc_paths(), ext = dlext, +) + lib_path = find_rocm_library(lib, dirs, ext) + if !isempty(something(lib_path, "")) + loaded, error_str = safe_exec("using Libdl; dlopen(\"$lib_path\")") + if loaded + config[Symbol(:lib, name)] = lib_path + config[Symbol(name, :_configured)] = true + else + iob = IOBuffer() + println(iob, "Loading `$lib_path` failed:") + print(iob, error_str) + config[Symbol(name, :_build_reason)] = String(take!(iob)) + end + else + config[Symbol(name, :_build_reason)] = "Could not find `$lib` libraries." + end +end + +function find_hsa_runtime!(config) + if use_artifacts() + loaded, available, error_str = safe_import(:hsa_rocr_jll) + if loaded + if available + config[:libhsaruntime_path] = hsa_rocr_jll.libhsa_runtime64 + config[:hsa_configured] = true + else + config[:hsa_build_reason] = "hsa_rocr_jll is not available on this platform." + end + else + iob = IOBuffer() + println(iob, "`import hsa_rocr_jll` failed:") + print(iob, error_str) + config[:hsa_build_reason] = String(take!(iob)) + end + else + roc_dirs = find_roc_paths() + libhsaruntime_path = find_rocm_library("libhsa-runtime64", roc_dirs, "so.1") + if !isempty(something(libhsaruntime_path, "")) + loaded, error_str = safe_exec("using Libdl; dlopen(\"$libhsaruntime_path\")") + if loaded + config[:libhsaruntime_path] = libhsaruntime_path + config[:hsa_configured] = true + else + iob = IOBuffer() + println(iob, "Loading `libhsa-runtime64` v1 failed:") + print(iob, error_str) + config[:hsa_build_reason] = String(take!(iob)) + end + else + config[:hsa_build_reason] = "Could not find `libhsa-runtime64` v1 library" + end + end +end + +function find_ld_lld!(config) + if use_artifacts() + loaded, available, error_str = safe_import(:LLD_jll) + if loaded + if available + if isdefined(LLD_jll, :lld_path) + config[:lld_path] = LLD_jll.lld_path + config[:lld_artifact] = true + config[:lld_configured] = true + else + config[:lld_build_reason] = "LLD_jll does not export `lld_path`" + end + else + config[:lld_build_reason] = "LLD_jll is not available on this platform" + end + else + iob = IOBuffer() + println(iob, "`import LLD_jll` failed:") + print(iob, error_str) + config[:lld_build_reason] = String(take!(iob)) + end + else + lld_path = find_ld_lld() + if !isempty(something(lld_path, "")) + # TODO: Validate ld.lld can compile programs + config[:lld_path] = lld_path + config[:lld_configured] = true + else + config[:lld_build_reason] = "Could not find `ld.lld` executable" + end + end +end + +function find_device_libs!(config) + if use_artifacts() + find_artifact_library!( + config, :ROCmDeviceLibs_jll; + name=:device_libs, lib=:bitcode_path) + else + device_libs_path = find_device_libs() + if !isempty(something(device_libs_path, "")) + # TODO: Validate bitcode files + config[:libdevice_libs] = device_libs_path + config[:device_libs_configured] = true + else + config[:device_libs_build_reason] = "Couldn't find bitcode files" + end + end +end + +function find_hip!(config) + if use_artifacts() + find_artifact_library!(config, :HIP_jll; name=:hip, lib=:libamdhip64) + else + find_system_library!(config, :hip; lib=["libamdhip64", "libhip_hcc"]) + end +end + +function find_hip_based_libs!(config, rocm_ext_libs) + @sync for (name, pkg) in rocm_ext_libs + @async begin + lib = Symbol(:lib, string(name)) + if use_artifacts() + find_artifact_library!(config, pkg; name, lib) + else + find_system_library!(config, name; lib=string(lib)) + end + end + end +end + +function bindeps_setup() + config = Dict{Symbol, Any}( + :configured => false, + :build_reason => "unknown", + + :lld_configured => false, + :lld_build_reason => "unknown", + :lld_artifact => false, + + :hsa_configured => false, + :hsa_build_reason => "unknown", + + :hip_configured => false, + :hip_build_reason => "unknown", + + :device_libs_configured => false, + :device_libs_build_reason => "unknown") + + for (name, _) in rocm_ext_libs + lib = Symbol(:lib, string(name)) + config[lib] = nothing + config[Symbol(name, :_configured)] = false + config[Symbol(name, :_build_reason)] = "unknown" + end + + if !Sys.islinux() + @warn "AMDGPU.jl only supports Linux OS." + config[:build_reason] = "Unsupported OS: $(repr(Sys.KERNEL))" + return config + end + if !ispath("/dev/kfd") + @warn "/dev/kfd is not available. Cannot use ROCm Runtime." + end + + find_hsa_runtime!(config) + config[:hsa_configured] || return config + + find_ld_lld!(config) + config[:lld_configured] || return config + + find_device_libs!(config) + config[:device_libs_configured] || return config + + find_hip!(config) + if config[:hip_configured] + find_hip_based_libs!(config, rocm_ext_libs) + end + config[:configured] = true + return config +end diff --git a/src/runtime/device.jl b/src/runtime/device.jl index f63a437c0..ae149522b 100644 --- a/src/runtime/device.jl +++ b/src/runtime/device.jl @@ -1,53 +1,17 @@ -# Utilities for working with HSA devices +const DEFAULT_DEVICE = Ref{HIPDevice}() +const ALL_DEVICES = Vector{HIPDevice}() +const HSA_DEVICES = Vector{ROCDevice}() -mutable struct ROCDevice - agent::HSA.Agent - - # Cached information - type::HSA.DeviceType - name::String - productname::String - uuid::String - - function ROCDevice(handle::HSA.Agent) - device = new(handle) - device.type = device_type(device) - device.name = name(device) - device.productname = product_name(device) - device.uuid = uuid(device) - - return device - end -end -ROCDevice() = AMDGPU.device() -get_handle(device::ROCDevice) = device.agent.handle - -Base.:(==)(device1::ROCDevice, device2::ROCDevice) = - device1.agent == device2.agent - -const DEFAULT_DEVICE = Ref{ROCDevice}() -const ALL_DEVICES = Vector{ROCDevice}() -const DEVICES = Dict{UInt64, ROCDevice}() # Map from device handles to ROCDevice structs - -### @cfunction callbacks ### +function fetch_devices() + isempty(ALL_DEVICES) || return copy(ALL_DEVICES) -function agent_iterate_isas_cb(isa::HSA.ISA, isas) - push!(isas, isa) - return HSA.STATUS_SUCCESS + devs = HIP.devices() + append!(ALL_DEVICES, devs) + return devs end -function iterate_agents_cb(agent::HSA.Agent, devices) - push!(devices, ROCDevice(agent)) - return HSA.STATUS_SUCCESS -end - -""" - fetch_devices() -> Vector{ROCDevice} - -Returns the list of HSA devices available on the system. -""" -function fetch_devices() - isempty(ALL_DEVICES) || return copy(ALL_DEVICES) +function fetch_hsa_devices() + isempty(HSA_DEVICES) || return copy(HSA_DEVICES) devices = Ref(Vector{ROCDevice}()) GC.@preserve devices begin @@ -57,21 +21,21 @@ function fetch_devices() _devices = devices[] end - # Update the entries in the device handle dictionary - for device in _devices - push!(ALL_DEVICES, device) - DEVICES[device.agent.handle] = device - end - + filter!(d -> device_type(d) == HSA.DEVICE_TYPE_GPU, _devices) + append!(HSA_DEVICES, _devices) return _devices end +hsa_device(device::HIPDevice) = HSA_DEVICES[device.device_id] + """ - get_default_device() -> ROCDevice + get_default_device() -> HIPDevice + +TODO update docs -Returns the default device, which is used for all kernel and array operations -when one is not explicitly specified. May be changed with -[`set_default_device!`](@ref). +# Returns the default device, which is used for all kernel and array operations +# when one is not explicitly specified. May be changed with +# [`set_default_device!`](@ref). """ function get_default_device() if !isassigned(DEFAULT_DEVICE) @@ -79,356 +43,16 @@ function get_default_device() end DEFAULT_DEVICE[] end + """ set_default_device!(device::ROCDevice) -> ROCDevice Sets the default device to `device`. See [`get_default_device`](@ref) for more details. """ -function set_default_device!(device::ROCDevice) +function set_default_device!(device::HIPDevice) DEFAULT_DEVICE[] = device end "Return all devices available to the runtime." devices() = copy(ALL_DEVICES) - -# Pretty-printing -function Base.show(io::IO, device::ROCDevice) - name = device.uuid - - name *= if device.name == device.productname || isempty(device.productname) - " [$(device.name)]" - else - " [$(device.productname) ($(device.name))]" - end - - print(io, name) -end - -### Device details - -getinfo( - agent::HSA.Agent, attribute::HSA.AgentInfo, - value::Union{Vector, Base.RefValue}, -) = HSA.agent_get_info(agent, attribute, value) - -getinfo( - agent::HSA.Agent, info::HSA.AMDAgentInfo, - value::Union{Vector, Base.RefValue}, -) = getinfo(agent, reinterpret(HSA.AgentInfo, info), value) - -getinfo(device::ROCDevice, query, value) = getinfo(device.agent, query, value) - -const AnyROCDevice = Union{ROCDevice,HSA.Agent} - -name(device::AnyROCDevice) = - getinfo(String, device, HSA.AGENT_INFO_NAME) - -product_name(device::AnyROCDevice) = - getinfo(String, device, HSA.AMD_AGENT_INFO_PRODUCT_NAME) - -uuid(device::AnyROCDevice) = - getinfo(String, device, HSA.AMD_AGENT_INFO_UUID) - -profile(device::AnyROCDevice) = - getinfo(HSA.Profile, device, HSA.AGENT_INFO_PROFILE) - -device_type(device::AnyROCDevice) = - getinfo(HSA.DeviceType, device, HSA.AGENT_INFO_DEVICE) - -device_wavefront_size(device::AnyROCDevice) = - getinfo(UInt32, device, HSA.AGENT_INFO_WAVEFRONT_SIZE) - -device_workgroup_max_size(device::AnyROCDevice) = - getinfo(UInt32, device, HSA.AGENT_INFO_WORKGROUP_MAX_SIZE) - -device_num_compute_units(device::AnyROCDevice) = - getinfo(UInt32, device, HSA.AMD_AGENT_INFO_COMPUTE_UNIT_COUNT) - -device_num_simds_per_compute_unit(device::AnyROCDevice) = - getinfo(UInt32, device, HSA.AMD_AGENT_INFO_NUM_SIMDS_PER_CU) - -function device_local_memory_size(device::AnyROCDevice) - _regions = regions(device) - for region in _regions - if region_segment(region) == HSA.REGION_SEGMENT_GROUP - return region_size(region) - end - end - error("Failed to find local memory region for $device") -end - -### ISAs - -isas(device::ROCDevice) = isas(device.agent) -function isas(agent::HSA.Agent) - isas = Ref(HSA.ISA[]) - func = @cfunction(agent_iterate_isas_cb, HSA.Status, (HSA.ISA, Ref{Vector{HSA.ISA}})) - HSA.agent_iterate_isas(agent, func, isas) |> check - isas[] -end - -# Device handle => default ISA. -const DEFAULT_ISAS = Dict{UInt64, HSA.ISA}() - -function default_isa(device::ROCDevice) - lock(RT_LOCK) do - get!( - () -> first(Runtime.isas(device)), - DEFAULT_ISAS, Runtime.get_handle(device)) - end -end - -# TODO: PCRE regexes are not thread-safe -const isa_regex = r"([a-z]*)-([a-z]*)-([a-z]*)--([a-z0-9]*)([a-zA-Z0-9+\-:]*)" -function parse_isa(isa::HSA.ISA) - len = isa_name_length(isa) - name = Vector{UInt8}(undef, len) - getinfo(isa, HSA.ISA_INFO_NAME, name) |> check - name = String(name) - m = match(isa_regex, name) - @assert m !== nothing "Failed to match ISA name pattern: $name" - m -end - -function llvm_arch_features(isa::HSA.ISA) - @memoize isa::HSA.ISA begin - m = parse_isa(isa) - arch = String(m.captures[4]) - features = join(map(x->x[1:end-1], - filter!(x->!isempty(x) && (x[end]=='+'), - split(m.captures[5], ':'))), - ",+") - if !isempty(features) - features = '+'*features - end - if Base.libllvm_version < v"12" - features = replace(features, "sramecc"=>"sram-ecc") - end - (arch, features) - end::Tuple{String, String} -end -architecture(isa::HSA.ISA) = llvm_arch_features(isa)[1] -features(isa::HSA.ISA) = llvm_arch_features(isa)[2] - -getinfo(isa::HSA.ISA, attribute::HSA.ISAInfo, - value::Union{Vector, Base.RefValue}) = - HSA.isa_get_info_alt(isa, attribute, value) - -isa_name_length(isa::HSA.ISA) = - getinfo(Cuint, isa, HSA.ISA_INFO_NAME_LENGTH) - -isa_workgroup_max_size(isa::HSA.ISA) = - getinfo(UInt32, isa, HSA.ISA_INFO_WORKGROUP_MAX_SIZE) - -### Regions - -struct ROCMemoryRegion - region::HSA.Region -end -get_handle(region::ROCMemoryRegion) = region.region.handle - -function iterate_regions_cb(region::HSA.Region, regions) - push!(regions, region) - return HSA.STATUS_SUCCESS -end - -function regions(agent::HSA.Agent) - @memoize agent::HSA.Agent begin - regions = Ref(HSA.Region[]) - func = @cfunction(iterate_regions_cb, HSA.Status, (HSA.Region, Ref{Vector{HSA.Region}})) - HSA.agent_iterate_regions(agent, func, regions) |> check - map(ROCMemoryRegion, regions[]) - end::Vector{ROCMemoryRegion} -end -regions(device::ROCDevice) = regions(device.agent) - -getinfo(region::HSA.Region, attribute::HSA.RegionInfo, - value::Union{Vector,Base.RefValue}) = - HSA.region_get_info(region, attribute, value) -getinfo(region::HSA.Region, attribute::HSA.AMDRegionInfo, - value::Union{Vector,Base.RefValue}) = - getinfo(region, reinterpret(HSA.RegionInfo, attribute), value) - -getinfo(region::ROCMemoryRegion, query, value) = getinfo(region.region, query, value) - -const AnyROCMemoryRegion = Union{ROCMemoryRegion,HSA.Region} - -region_segment(region::AnyROCMemoryRegion) = - getinfo(HSA.RegionSegment, region, HSA.REGION_INFO_SEGMENT) -region_global_flags(region::AnyROCMemoryRegion) = - getinfo(HSA.RegionGlobalFlag, region, HSA.REGION_INFO_GLOBAL_FLAGS) -region_size(region::AnyROCMemoryRegion) = - getinfo(Csize_t, region, HSA.REGION_INFO_SIZE) -region_alloc_max_size(region::AnyROCMemoryRegion) = - getinfo(Csize_t, region, HSA.REGION_INFO_ALLOC_MAX_SIZE) -region_alloc_max_private_workgroup_size(region::AnyROCMemoryRegion) = - getinfo(Csize_t, region, HSA.REGION_INFO_ALLOC_MAX_PRIVATE_WORKGROUP_SIZE) -region_runtime_alloc_allowed(region::AnyROCMemoryRegion) = - getinfo(Bool, region, HSA.REGION_INFO_RUNTIME_ALLOC_ALLOWED) -region_runtime_alloc_granule(region::AnyROCMemoryRegion) = - getinfo(Csize_t, region, HSA.REGION_INFO_RUNTIME_ALLOC_GRANULE) -region_runtime_alloc_alignment(region::AnyROCMemoryRegion) = - getinfo(Csize_t, region, HSA.REGION_INFO_RUNTIME_ALLOC_ALIGNMENT) -region_host_accessible(region::AnyROCMemoryRegion) = - getinfo(Bool, region, HSA.AMD_REGION_INFO_HOST_ACCESSIBLE) - -function get_region(device::ROCDevice, kind::Symbol) - flag = if kind == :finegrained - HSA.REGION_GLOBAL_FLAG_FINE_GRAINED - elseif kind == :coarsegrained || kind == :coarsegrained_host - HSA.REGION_GLOBAL_FLAG_COARSE_GRAINED - elseif kind == :kernarg - HSA.REGION_GLOBAL_FLAG_KERNARG - else - error("Region kind $kind not supported") - end - _regions = regions(device) - _regions = filter(r -> region_global_flags(r) & flag > 0, _regions) - kind == :coarsegrained && filter!(!region_host_accessible, _regions) - @assert !isempty(_regions) "No region of kind $kind in device $device" - return first(_regions) -end - -function Base.show(io::IO, region::ROCMemoryRegion) - segment_map = Dict(HSA.REGION_SEGMENT_GLOBAL => :global, - HSA.REGION_SEGMENT_READONLY => :readonly, - HSA.REGION_SEGMENT_PRIVATE => :private, - HSA.REGION_SEGMENT_GROUP => :group, - HSA.REGION_SEGMENT_KERNARG => :kernarg) - segment = segment_map[region_segment(region)] - - _flags = region_global_flags(region) - flags = Symbol[] - flag_map = Dict(UInt32(HSA.REGION_GLOBAL_FLAG_KERNARG) => :kernarg, - UInt32(HSA.REGION_GLOBAL_FLAG_FINE_GRAINED) => :finegrained, - UInt32(HSA.REGION_GLOBAL_FLAG_COARSE_GRAINED) => :coarsegrained) - for (flag, flag_name) in CEnum.namemap(HSA.RegionGlobalFlag) - if flag & _flags > 0 - push!(flags, flag_map[flag]) - end - end - flags = "($(join(flags, ", ")))" - - size = Base.format_bytes(region_size(region)) - - private_workgroup_size = segment == :private ? Base.format_bytes(region_alloc_max_private_workgroup_size(region)) : nothing - - runtime_alloc = region_runtime_alloc_allowed(region) - - alloc_granule = Base.format_bytes(region_runtime_alloc_granule(region)) - - alloc_align = Base.format_bytes(region_runtime_alloc_alignment(region)) - - max_size = Base.format_bytes(region_alloc_max_size(region)) - - host_access = region_host_accessible(region) - - print(io, "ROCMemoryRegion @ $(repr(region.region.handle)): Segment $segment, Flags $flags, Size $size ($max_size max allocation), ") - if segment == :private - print(io, "Workgroup Max Alloc: $private_workgroup_size, ") - end - print(io, "Runtime Alloc: "); printstyled(io, "$runtime_alloc"; color=runtime_alloc ? :green : :red); print(io, " ($alloc_granule granularity, $alloc_align alignment), ") - print(io, "Host Accessible: "); printstyled(io, "$host_access"; color=host_access ? :green : :red) -end - -### Memory Pools - -struct ROCMemoryPool - pool::HSA.AMDMemoryPool -end -get_handle(pool::ROCMemoryPool) = pool.pool.handle - -function iterate_pools_cb(pool::HSA.AMDMemoryPool, pools) - push!(pools, pool) - return HSA.STATUS_SUCCESS -end - -function memory_pools(agent::HSA.Agent) - @memoize agent::HSA.Agent begin - pools = Ref(HSA.AMDMemoryPool[]) - func = @cfunction(iterate_pools_cb, HSA.Status, (HSA.AMDMemoryPool, Ref{Vector{HSA.AMDMemoryPool}})) - HSA.amd_agent_iterate_memory_pools(agent, func, pools) |> check - map(ROCMemoryPool, pools[]) - end::Vector{ROCMemoryPool} -end -memory_pools(device::ROCDevice) = memory_pools(device.agent) - -getinfo(pool::HSA.AMDMemoryPool, attribute::HSA.AMDMemoryPoolInfo, - value::Union{Vector,Base.RefValue}) = - HSA.amd_memory_pool_get_info(pool, attribute, value) - -getinfo(pool::ROCMemoryPool, query, value) = getinfo(pool.pool, query, value) - -const AnyROCMemoryPool = Union{ROCMemoryPool, HSA.AMDMemoryPool} - -pool_segment(pool::AnyROCMemoryPool) = - getinfo(HSA.AMDSegment, pool, HSA.AMD_MEMORY_POOL_INFO_SEGMENT) -pool_global_flags(pool::AnyROCMemoryPool) = - getinfo(HSA.AMDMemoryPoolGlobalFlag, pool, HSA.AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS) -pool_size(pool::AnyROCMemoryPool) = - getinfo(Csize_t, pool, HSA.AMD_MEMORY_POOL_INFO_SIZE) -pool_alloc_max_size(pool::AnyROCMemoryPool) = - getinfo(Csize_t, pool, HSA.AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE) -pool_runtime_alloc_allowed(pool::AnyROCMemoryPool) = - getinfo(Bool, pool, HSA.AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED) -pool_runtime_alloc_granule(pool::AnyROCMemoryPool) = - getinfo(Csize_t, pool, HSA.AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE) -pool_runtime_alloc_alignment(pool::AnyROCMemoryPool) = - getinfo(Csize_t, pool, HSA.AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT) -pool_accessible_by_all(pool::AnyROCMemoryPool) = - getinfo(Bool, pool, HSA.AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL) - -function Base.show(io::IO, pool::ROCMemoryPool) - segment_map = Dict(HSA.AMD_SEGMENT_GLOBAL => :global, - HSA.AMD_SEGMENT_READONLY => :readonly, - HSA.AMD_SEGMENT_PRIVATE => :private, - HSA.AMD_SEGMENT_GROUP => :group) - segment = segment_map[pool_segment(pool)] - - _flags = pool_global_flags(pool) - flags = Symbol[] - flag_map = Dict(UInt32(HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) => :kernarg, - UInt32(HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) => :finegrained, - UInt32(HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) => :coarsegrained) - for (flag, flag_name) in CEnum.namemap(HSA.AMDMemoryPoolGlobalFlag) - if flag & _flags > 0 - push!(flags, flag_map[flag]) - end - end - flags = "($(join(flags, ", ")))" - - size = Base.format_bytes(pool_size(pool)) - - runtime_alloc = pool_runtime_alloc_allowed(pool) - - alloc_granule = Base.format_bytes(pool_runtime_alloc_granule(pool)) - - alloc_align = Base.format_bytes(pool_runtime_alloc_alignment(pool)) - - accessible_all = pool_accessible_by_all(pool) - - max_size = Base.format_bytes(pool_alloc_max_size(pool)) - - print(io, "ROCMemoryPool @ $(repr(pool.pool.handle)): Segment $segment, Flags $flags, Size $size ($max_size max allocation), ") - print(io, "Runtime Alloc: "); printstyled(io, "$runtime_alloc"; color=runtime_alloc ? :green : :red); print(io, " ($alloc_granule granularity, $alloc_align alignment), ") - print(io, "All Accessible: "); printstyled(io, "$accessible_all"; color=accessible_all ? :green : :red) -end - -function get_memory_pool(device::ROCDevice, kind::Symbol) - flag = if kind == :finegrained - HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED - elseif kind == :coarsegrained || kind == :coarsegrained_host - HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED - elseif kind == :kernarg - HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT - else - error("Region kind $kind not supported") - end - - _pools = memory_pools(device) - _pools = filter(p -> pool_global_flags(p) & flag > 0, _pools) - kind == :coarsegrained && filter!(!pool_accessible_by_all, _pools) - @assert !isempty(_pools) "No memory pool of kind $kind in device $device" - return first(_pools) -end diff --git a/src/runtime/executable.jl b/src/runtime/executable.jl deleted file mode 100644 index 54f6959ad..000000000 --- a/src/runtime/executable.jl +++ /dev/null @@ -1,139 +0,0 @@ -getinfo(exsym::HSA.ExecutableSymbol, attribute::HSA.ExecutableSymbolInfo, - value::Union{Vector, Base.RefValue}) = - HSA.executable_symbol_get_info(exsym, attribute, value) - -executable_symbol_type(sym::HSA.ExecutableSymbol) = - getinfo(HSA.SymbolKind, sym, HSA.EXECUTABLE_SYMBOL_INFO_TYPE) - -# TODO: Symbol name length - -executable_symbol_name(sym::HSA.ExecutableSymbol) = - getinfo(String, sym, HSA.EXECUTABLE_SYMBOL_INFO_NAME) - -executable_symbol_kernel_object(sym::HSA.ExecutableSymbol) = - getinfo(UInt64, sym, HSA.EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT) - -executable_symbol_kernel_kernarg_segment_size(sym::HSA.ExecutableSymbol) = - getinfo(UInt32, sym, HSA.EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE) - -executable_symbol_kernel_group_segment_size(sym::HSA.ExecutableSymbol) = - getinfo(UInt32, sym, HSA.EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE) - -executable_symbol_kernel_private_segment_size(sym::HSA.ExecutableSymbol) = - getinfo(UInt32, sym, HSA.EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE) - -### @cfunction Callbacks ### - -function iterate_exec_agent_syms_cb(exe::HSA.Executable, agent::HSA.Agent, - sym::HSA.ExecutableSymbol, - sym_ref::Ptr{HSA.ExecutableSymbol}) - if executable_symbol_type(sym) == HSA.SYMBOL_KIND_KERNEL - # FIXME: Ensure name matches - #name = executable_symbol_name(sym) - Base.unsafe_store!(sym_ref, sym) - return HSA.STATUS_INFO_BREAK - end - - return HSA.STATUS_SUCCESS -end - -mutable struct ROCExecutable - device::ROCDevice - executable::HSA.Executable - data::Vector{UInt8} - globals::Dict{Symbol, Mem.Buffer} -end - -# TODO Docstring -function ROCExecutable(device::ROCDevice, data::Vector{UInt8}, symbol::String; globals=()) - code_object_reader_ref = Ref{HSA.CodeObjectReader}(HSA.CodeObjectReader(0)) - HSA.code_object_reader_create_from_memory( - data, sizeof(data), code_object_reader_ref) |> check - code_object_reader = code_object_reader_ref[] - - executable_ref = Ref{HSA.Executable}() - HSA.executable_create_alt( - profile(device), HSA.DEFAULT_FLOAT_ROUNDING_MODE_NEAR, - C_NULL, executable_ref) |> check - executable = executable_ref[] - - _globals = Dict{Symbol,Mem.Buffer}() - for (gbl,sz) in globals - gbl_buf = Mem.alloc(device, sz; coherent=true) - HSA.executable_agent_global_variable_define( - executable, device.agent, string(gbl), gbl_buf.ptr) |> check - _globals[gbl] = gbl_buf - end - - HSA.executable_load_agent_code_object( - executable, device.agent, code_object_reader, - C_NULL, C_NULL) |> check - - HSA.executable_freeze(executable, "") |> check - - exe = ROCExecutable(device, executable, data, _globals) - - # TODO: Ensure no derived kernels are in flight during finalization - AMDGPU.hsaref!() - finalizer(exe) do e - HSA.executable_destroy(e.executable) |> check - for buf in values(e.globals) - Mem.free(buf) - end - HSA.code_object_reader_destroy(code_object_reader) |> check - AMDGPU.hsaunref!() - end - - return exe -end - -function get_global(exe::ROCExecutable, symbol::Symbol) - @assert symbol in keys(exe.globals) "No such global in executable: $symbol" - return exe.globals[symbol] -end - -has_exception(e::ROCExecutable) = haskey(e.globals, :__global_exception_flag) - -function get_exception( - exe::ROCExecutable; cleanup::Bool = true, signal_handle::UInt64, -) - has_exception(exe) || return nothing - - # Check if any wavefront for this kernel threw an exception - ex_flag = get_global(exe, :__global_exception_flag) - ex_flag_ptr = Base.unsafe_convert(Ptr{Int64}, ex_flag) - ex_flag_value = Base.unsafe_load(ex_flag_ptr) - ex_flag_value == 0 && return nothing - - ex_string = nothing - fetch_ex_strings = haskey(exe.globals, :__global_exception_ring) - - if fetch_ex_strings - ex_strings = String[] - # Check for and collect any exceptions, and clear their slots - ex_ring = get_global(exe, :__global_exception_ring) - ex_ring_ptr_ptr = Base.unsafe_convert( - Ptr{Ptr{AMDGPU.Device.ExceptionEntry}}, ex_ring) - ex_ring_ptr = unsafe_load(ex_ring_ptr_ptr) - - while (ex_ring_value = unsafe_load(ex_ring_ptr)).kern != 1 - if ex_ring_value.kern == signal_handle && reinterpret(Ptr{UInt8}, ex_ring_value.ptr) != C_NULL - ex_ring_value_str = unsafe_string( - reinterpret(Ptr{UInt8}, ex_ring_value.ptr)) - push!(ex_strings, ex_ring_value_str) - - if cleanup - # FIXME: Write rest of entry first, then CAS 0 to kern field - entry = AMDGPU.Device.ExceptionEntry( - UInt64(0), Core.LLVMPtr{UInt8,1}(0)) - unsafe_store!(ex_ring_ptr, entry) - end - end - ex_ring_ptr += sizeof(AMDGPU.Device.ExceptionEntry) - end - unique!(ex_strings) - ex_string = join(ex_strings, '\n') - end - - return KernelException(exe.device, ex_string) -end diff --git a/src/runtime/execution.jl b/src/runtime/execution.jl index 44c00764b..1260a9479 100644 --- a/src/runtime/execution.jl +++ b/src/runtime/execution.jl @@ -24,218 +24,3 @@ The following keyword arguments are supported: - `queue` (defaults to the default queue) """ AbstractKernel - -@generated function call(kernel::AbstractKernel{F,TT}, args...; call_kwargs...) where {F,TT} - sig = Tuple{F, TT.parameters...} # Base.signature_type with a function type - args = (:(kernel.f), (:( args[$i] ) for i in 1:length(args))...) - - # filter out ghost arguments that shouldn't be passed - predicate = dt -> GPUCompiler.isghosttype(dt) || Core.Compiler.isconstType(dt) - to_pass = map(!predicate, sig.parameters) - call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] - call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] - - # replace non-isbits arguments (they should be unused, or compilation would have failed) - # alternatively, allow `launch` with non-isbits arguments. - for (i,dt) in enumerate(call_t) - if !isbitstype(dt) - call_t[i] = Ptr{Any} - call_args[i] = :C_NULL - end - end - - # finalize types - call_tt = Base.to_tuple_type(call_t) - - quote - Base.@_inline_meta - - roccall(kernel, $call_tt, $(call_args...); call_kwargs...) - end -end - - -## host-side kernels - -struct HostKernel{F,TT} <: AbstractKernel{F,TT} - f::F - mod::ROCModule - fun::ROCFunction -end - -@doc (@doc AbstractKernel) HostKernel - -@inline function roccall(kernel::HostKernel, tt, args...; signal::ROCKernelSignal, groupsize=1, kwargs...) - if groupsize == :auto - config = AMDGPU.launch_configuration(kernel; signal.kernel.localmem) - roccall(signal, tt, args...; config..., kwargs...) - else - roccall(signal, tt, args...; kwargs..., groupsize) - end -end - - -## host-side API - - -# https://github.com/JuliaLang/julia/issues/14919 -(kernel::HostKernel)(args...; kwargs...) = call(kernel, args...; kwargs...) - -export roccall - -""" - roccall(signal::ROCKernelSignal, types, values...; groupsize::ROCDim, gridsize::ROCDim) - -`ccall`-like interface for launching a ROC function `f` on a GPU. - -For example: - - vadd = ROCFunction(md, "vadd") - a = rand(Float32, 10) - b = rand(Float32, 10) - ad = Mem.upload(a) - bd = Mem.upload(b) - c = zeros(Float32, 10) - cd = Mem.alloc(c) - - roccall(vadd, (Ptr{Cfloat},Ptr{Cfloat},Ptr{Cfloat}), ad, bd, cd; - gridsize=10) - Mem.download!(c, cd) - -The `groupsize` and `gridsize` arguments control the launch configuration, and should both -consist of either an integer, or a tuple of 1 to 3 integers (omitted dimensions default to -1). The `types` argument can contain both a tuple of types, and a tuple type, the latter -being slightly faster. -""" -roccall - -# we need a generated function to get a tuple of converted arguments (using unsafe_convert), -# without having to inspect the types at runtime -@generated function roccall( - signal::ROCKernelSignal, tt::Type, args::Vararg{Any,N}; - groupsize::ROCDim=1, gridsize::ROCDim=groupsize, -) where N - - # the type of `tt` is Type{Tuple{<:DataType...}} - types = tt.parameters[1].parameters - - ex = Expr(:block) - push!(ex.args, :(Base.@_inline_meta)) - - # convert the argument values to match the kernel's signature (specified by the user) - # (this mimics `lower-ccall` in julia-syntax.scm) - converted_args = Vector{Symbol}(undef, length(args)) - arg_ptrs = Vector{Symbol}(undef, length(args)) - for i in 1:length(args) - converted_args[i] = gensym() - arg_ptrs[i] = gensym() - push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i]))) - push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i])))) - end - - append!(ex.args, (quote - write_args!(signal.kernel, $(arg_ptrs...)) - #GC.@preserve $(converted_args...) begin - launch_kernel!(signal, groupsize, gridsize, ($(arg_ptrs...),)) - #end - end).args) - - return ex -end - -function write_args!(kernel::ROCKernel, args...) - # Allocate the kernel argument buffer - key = khash(args) - kernarg_address, do_write = Mem.alloc_pooled(kernel.device, key, :kernarg, - kernel.kernarg_segment_size) - - if do_write - # Fill kernel argument buffer - # FIXME: Query kernarg segment alignment - ctr = 0 - for arg in args - sz = sizeof(typeof(arg)) - if sz == 0 - continue - end - rarg = Ref(arg) - align = Base.datatype_alignment(typeof(arg)) - rem = mod(ctr, align) - if rem > 0 - ctr += align-rem - end - ccall(:memcpy, Cvoid, - (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t), - kernarg_address+ctr, rarg, sz) - ctr += sz - end - end - - # Register kernarg buffer - kernel.kernarg_address = kernarg_address - AMDGPU.hsaref!() - finalizer(kernel) do k - Mem.free_pooled(k.device, key, :kernarg, kernarg_address) - AMDGPU.hsaunref!() - end -end - -## device-side kernels - -struct DeviceKernel{F,TT} <: AbstractKernel{F,TT} - fun::Ptr{Cvoid} -end - -@doc (@doc AbstractKernel) DeviceKernel - -@inline roccall(kernel::DeviceKernel, tt, args...; kwargs...) = - dynamic_roccall(kernel.fun, tt, args...; kwargs...) - -# FIXME: duplication with roccall -@generated function dynamic_roccall(f::Ptr{Cvoid}, tt::Type, args...; - blocks=UInt32(1), threads=UInt32(1), shmem=UInt32(0), - queue=queue()) - types = tt.parameters[1].parameters # the type of `tt` is Type{Tuple{<:DataType...}} - - ex = quote - Base.@_inline_meta - end - - # convert the argument values to match the kernel's signature (specified by the user) - # (this mimics `lower-ccall` in julia-syntax.scm) - converted_args = Vector{Symbol}(undef, length(args)) - arg_ptrs = Vector{Symbol}(undef, length(args)) - for i in 1:length(args) - converted_args[i] = gensym() - arg_ptrs[i] = gensym() - push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i]))) - push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i])))) - end - - append!(ex.args, (quote - #GC.@preserve $(converted_args...) begin - launch(f, blocks, threads, shmem, stream, $(arg_ptrs...)) - #end - end).args) - - return ex -end - - -## device-side API - -""" - dynamic_rocfunction(f, tt=Tuple{}) - -Low-level interface to compile a function invocation for the currently-active GPU, returning -a callable kernel object. Device-side equivalent of [`AMDGPU.rocfunction`](@ref). - -No keyword arguments are supported. -""" -@inline function dynamic_rocfunction(f::F, tt::Type=Tuple{}) where {F <: Function} - fptr = GPUCompiler.deferred_codegen(Val(F), Val(tt)) - DeviceKernel{f,tt}(fptr) -end - -# https://github.com/JuliaLang/julia/issues/14919 -(kernel::DeviceKernel)(args...; kwargs...) = call(kernel, args...; kwargs...) diff --git a/src/runtime/hashing.jl b/src/runtime/hashing.jl deleted file mode 100644 index d15e04e5e..000000000 --- a/src/runtime/hashing.jl +++ /dev/null @@ -1,22 +0,0 @@ -# Kernel argument hashing - -## Arguments which are written to the kernarg buffer identically should have -## the same khash value. Array contents are not hashed; instead, we hash the -## array pointer. - -function khash(x::T, h::UInt=UInt(0)) where T - # Generic hashing - h = khash(T, h) - if isstructtype(T) - for name in fieldnames(T) - h = khash(getfield(x, name), h) - end - elseif isprimitivetype(T) - h = hash(x, h) - else - error("Can't hash: $T") - end - return h -end -khash(::Type{T}, h::UInt=UInt(0)) where T = hash(T, h) -khash(x::Symbol, h::UInt=UInt(0)) = hash(x, h) diff --git a/src/runtime/hip-execution.jl b/src/runtime/hip-execution.jl new file mode 100644 index 000000000..1e68dc4ba --- /dev/null +++ b/src/runtime/hip-execution.jl @@ -0,0 +1,115 @@ +struct HIPKernel{F, TT} <: AbstractKernel{F, TT} + f::F + fun::HIP.HIPFunction +end + +@inline @generated function call( + kernel::HIPKernel{F, TT}, args...; + stream::HIP.HIPStream, call_kwargs..., +) where {F, TT} + sig = Tuple{F, TT.parameters...} # Base.signature_type with a function type + args = (:(kernel.f), (:( args[$i] ) for i in 1:length(args))...) + + # filter out ghost arguments that shouldn't be passed + predicate = dt -> GPUCompiler.isghosttype(dt) || Core.Compiler.isconstType(dt) + to_pass = map(!predicate, sig.parameters) + call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] + call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] + + # replace non-isbits arguments (they should be unused, or compilation would have failed) + # alternatively, allow `launch` with non-isbits arguments. + for (i,dt) in enumerate(call_t) + if !isbitstype(dt) + call_t[i] = Ptr{Any} + call_args[i] = :C_NULL + end + end + + # add the kernel state + pushfirst!(call_t, AMDGPU.KernelState) + pushfirst!(call_args, :(AMDGPU.KernelState( + stream.device, kernel.fun.global_hostcalls))) + + # finalize types + call_tt = Base.to_tuple_type(call_t) + + quote + roccall(kernel.fun, $call_tt, $(call_args...); stream, call_kwargs...) + end +end + +function (ker::HIPKernel{F, TT})( + args...; stream::HIP.HIPStream = AMDGPU.stream(), call_kwargs..., +) where {F, TT} + # Check if previous kernels threw an exception. + AMDGPU.throw_if_exception(stream.device) + call(ker, map(AMDGPU.rocconvert, args)...; stream, call_kwargs...) +end + +@inline @generated function convert_arguments(f::Function, ::Type{tt}, args...) where tt + types = tt.parameters + + ex = quote end + + converted_args = Vector{Symbol}(undef, length(args)) + arg_ptrs = Vector{Symbol}(undef, length(args)) + for i in 1:length(args) + converted_args[i] = gensym() + arg_ptrs[i] = gensym() + push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i]))) + push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i])))) + end + + append!(ex.args, (quote + GC.@preserve $(converted_args...) begin + f($(arg_ptrs...)) + end + end).args) + return ex +end + +function roccall(fun::HIP.HIPFunction, tt::Type, args...; kwargs...) + convert_arguments(tt, args...) do pointers... + launch(fun, pointers...; kwargs...) + end +end + +@inline @generated function pack_arguments(f::Function, args...) + for arg in args + isbitstype(arg) || throw(ArgumentError( + "Arguments to kernel should be bitstype, instead `$(arg)` was given.")) + end + + ex = quote end + + arg_refs = Vector{Symbol}(undef, length(args)) + for i in 1:length(args) + arg_refs[i] = gensym() + push!(ex.args, :($(arg_refs[i]) = Base.RefValue(args[$i]))) + end + + arg_ptrs = [ + :(Base.unsafe_convert(Ptr{Cvoid}, $(arg_refs[i]))) + for i in 1:length(args)] + + append!(ex.args, (quote + GC.@preserve $(arg_refs...) begin + kernel_params = [$(arg_ptrs...)] + f(kernel_params) + end + end).args) + return ex +end + +function launch( + fun::HIP.HIPFunction, args::Vararg{Any, N}; + gridsize::ROCDim = 1, groupsize::ROCDim = 1, + shmem::Integer = 0, stream::HIP.HIPStream, +) where N + gd, bd = ROCDim3(gridsize), ROCDim3(groupsize) + pack_arguments(args...) do kernel_params + HIP.hipModuleLaunchKernel( + fun, gd.x, gd.y, gd.z, bd.x, bd.y, bd.z, + shmem, stream, kernel_params, C_NULL) |> HIP.check + end +end diff --git a/src/runtime/hsa_device.jl b/src/runtime/hsa_device.jl new file mode 100644 index 000000000..4c66b73de --- /dev/null +++ b/src/runtime/hsa_device.jl @@ -0,0 +1,170 @@ +mutable struct ROCDevice + agent::HSA.Agent + + # Cached information + type::HSA.DeviceType + name::String + productname::String + uuid::String + + function ROCDevice(handle::HSA.Agent) + device = new(handle) + device.type = device_type(device) + device.name = name(device) + device.productname = product_name(device) + device.uuid = uuid(device) + + return device + end +end +ROCDevice() = AMDGPU.device() +get_handle(device::ROCDevice) = device.agent.handle + +Base.:(==)(device1::ROCDevice, device2::ROCDevice) = + device1.agent == device2.agent + +function agent_iterate_isas_cb(isa::HSA.ISA, isas) + push!(isas, isa) + return HSA.STATUS_SUCCESS +end + +function iterate_agents_cb(agent::HSA.Agent, devices) + push!(devices, ROCDevice(agent)) + return HSA.STATUS_SUCCESS +end + +# Pretty-printing +function Base.show(io::IO, device::ROCDevice) + name = device.uuid + + name *= if device.name == device.productname || isempty(device.productname) + " [$(device.name)]" + else + " [$(device.productname) ($(device.name))]" + end + + print(io, name) +end + +### Device details + +getinfo( + agent::HSA.Agent, attribute::HSA.AgentInfo, + value::Union{Vector, Base.RefValue}, +) = HSA.agent_get_info(agent, attribute, value) + +getinfo( + agent::HSA.Agent, info::HSA.AMDAgentInfo, + value::Union{Vector, Base.RefValue}, +) = getinfo(agent, reinterpret(HSA.AgentInfo, info), value) + +getinfo(device::ROCDevice, query, value) = getinfo(device.agent, query, value) + +const AnyROCDevice = Union{ROCDevice,HSA.Agent} + +name(device::AnyROCDevice) = + getinfo(String, device, HSA.AGENT_INFO_NAME) + +product_name(device::AnyROCDevice) = + getinfo(String, device, HSA.AMD_AGENT_INFO_PRODUCT_NAME) + +uuid(device::AnyROCDevice) = + getinfo(String, device, HSA.AMD_AGENT_INFO_UUID) + +profile(device::AnyROCDevice) = + getinfo(HSA.Profile, device, HSA.AGENT_INFO_PROFILE) + +device_type(device::AnyROCDevice) = + getinfo(HSA.DeviceType, device, HSA.AGENT_INFO_DEVICE) + +device_wavefront_size(device::AnyROCDevice) = + getinfo(UInt32, device, HSA.AGENT_INFO_WAVEFRONT_SIZE) + +device_workgroup_max_size(device::AnyROCDevice) = + getinfo(UInt32, device, HSA.AGENT_INFO_WORKGROUP_MAX_SIZE) + +device_num_compute_units(device::AnyROCDevice) = + getinfo(UInt32, device, HSA.AMD_AGENT_INFO_COMPUTE_UNIT_COUNT) + +device_num_simds_per_compute_unit(device::AnyROCDevice) = + getinfo(UInt32, device, HSA.AMD_AGENT_INFO_NUM_SIMDS_PER_CU) + +function device_local_memory_size(device::AnyROCDevice) + _regions = regions(device) + for region in _regions + if region_segment(region) == HSA.REGION_SEGMENT_GROUP + return region_size(region) + end + end + error("Failed to find local memory region for $device") +end + +### ISAs + +struct HSAISA + hsa_isa::HSA.ISA + arch_features::Tuple{String, String} +end + +function HSAISA(hsa_isa::HSA.ISA) + HSAISA(hsa_isa, llvm_arch_features(hsa_isa)) +end + +isas(device::ROCDevice) = isas(device.agent) +function isas(agent::HSA.Agent) + isas = Ref(HSA.ISA[]) + func = @cfunction(agent_iterate_isas_cb, HSA.Status, (HSA.ISA, Ref{Vector{HSA.ISA}})) + HSA.agent_iterate_isas(agent, func, isas) |> check + HSAISA.(isas[]) +end + +# Device handle => default ISA. +const DEFAULT_ISAS = Dict{UInt64, HSAISA}() + +function default_isa(device::ROCDevice) + lock(RT_LOCK) do + get!( + () -> first(Runtime.isas(device)), + DEFAULT_ISAS, Runtime.get_handle(device)) + end +end + +# TODO: PCRE regexes are not thread-safe +const isa_regex = r"([a-z]*)-([a-z]*)-([a-z]*)--([a-z0-9]*)([a-zA-Z0-9+\-:]*)" +function parse_isa(isa::HSA.ISA) + len = isa_name_length(isa) + name = Vector{UInt8}(undef, len) + getinfo(isa, HSA.ISA_INFO_NAME, name) |> check + name = String(name) + m = match(isa_regex, name) + @assert m !== nothing "Failed to match ISA name pattern: $name" + m +end + +function llvm_arch_features(isa::HSA.ISA) + m = parse_isa(isa) + arch = String(m.captures[4]) + features = join(map(x->x[1:end-1], + filter!(x->!isempty(x) && (x[end]=='+'), + split(m.captures[5], ':'))), + ",+") + if !isempty(features) + features = '+'*features + end + if Base.libllvm_version < v"12" + features = replace(features, "sramecc"=>"sram-ecc") + end + (arch, features) +end + +architecture(isa::HSA.ISA) = llvm_arch_features(isa)[1] + +features(isa::HSA.ISA) = llvm_arch_features(isa)[2] + +getinfo(isa::HSA.ISA, attribute::HSA.ISAInfo, value::Union{Vector, Base.RefValue}) = + HSA.isa_get_info_alt(isa, attribute, value) + +isa_name_length(isa::HSA.ISA) = getinfo(Cuint, isa, HSA.ISA_INFO_NAME_LENGTH) + +isa_workgroup_max_size(isa::HSA.ISA) = + getinfo(UInt32, isa, HSA.ISA_INFO_WORKGROUP_MAX_SIZE) diff --git a/src/runtime/kernel-signal.jl b/src/runtime/kernel-signal.jl deleted file mode 100644 index 9582d67e1..000000000 --- a/src/runtime/kernel-signal.jl +++ /dev/null @@ -1,105 +0,0 @@ -struct KernelException <: Exception - device::ROCDevice - exstr::Union{String, Nothing} -end - -function Base.showerror(io::IO, err::KernelException) - print(io, "KernelException: exception(s) thrown during kernel execution on device $(err.device)") - if err.exstr !== nothing - println(io, ":") - print(io, err.exstr) - end -end - -mutable struct ROCKernelSignal - signal::ROCSignal - queue::ROCQueue - kernel::ROCKernel - exception::Union{Exception, Nothing} - @atomic active::Bool - - function ROCKernelSignal( - signal::ROCSignal, queue::ROCQueue, kernel::ROCKernel; - ) - kersig = new(signal, queue, kernel, nothing, true) - finalizer(kersig) do k - cleanup!(k; finished=true) - end - kersig - end -end - -function Base.wait( - kersig::ROCKernelSignal; check_exceptions::Bool = true, - cleanup::Bool = false, signal_kwargs..., -) - @log_start(:wait, (;entry=kersig.kernel.sym, signal=get_handle(kersig.signal)), nothing) - finished = try - wait(kersig.signal; queue=kersig.queue, signal_kwargs...) - true - catch err - if isa(err, SignalTimeoutException) && SIGNAL_TIMEOUT_KILL_QUEUE[] - kill_queue!(kersig.queue) - end - isnothing(kersig.exception) && (kersig.exception = err;) - false - finally - @log_finish(:wait, (;entry=kersig.kernel.sym, signal=get_handle(kersig.signal)), nothing) - end - - if cleanup - cleanup!(kersig; finished, check_exceptions) - elseif check_exceptions - # Ensure we check for and propagate errors - ex = get_exception(kersig; finished, cleanup=false) - if ex !== nothing - kersig.exception = ex - end - end - - if check_exceptions - # Report any kernel-specific exceptions - if kersig.exception !== nothing - throw(kersig.exception) - end - # Report any queue-specific exceptions - ensure_active(kersig.queue) - end - - return finished -end - -function cleanup!( - kersig::ROCKernelSignal; finished::Bool, check_exceptions::Bool = true, -) - _, succ = @atomicreplace kersig.active true => false - succ || return - - unpreserve!(kersig) - - if finished - ex = get_exception(kersig; finished, cleanup=true) - isnothing(ex) || (kersig.exception = ex;) - end - - exe = kersig.kernel.exe::ROCExecutable - lock(RT_LOCK) do - mod = EXE_TO_MODULE_MAP[exe].value::ROCModule - signal_handle = get_handle(kersig.signal)::UInt64 - delete_metadata!(mod; signal_handle) - end - - return -end -function get_exception(kersig::ROCKernelSignal; finished::Bool, cleanup::Bool) - exe = kersig.kernel.exe::ROCExecutable - signal_handle::UInt64 = get_handle(kersig.signal) - return get_exception(exe; signal_handle, cleanup) -end - -function Base.show(io::IO, kersig::ROCKernelSignal) - ex = kersig.exception - print(io, "ROCKernelSignal(signal=$(kersig.signal)$(ex !== nothing ? ", exception=$ex" : ""))") -end - -Base.notify(kersig::ROCKernelSignal) = notify(kersig.signal) diff --git a/src/runtime/kernel.jl b/src/runtime/kernel.jl deleted file mode 100644 index 7b1e9734c..000000000 --- a/src/runtime/kernel.jl +++ /dev/null @@ -1,124 +0,0 @@ -export barrier_and!, barrier_or! - -## Kernel allocations - -struct KernelMetadata - kern::UInt64 - buf::Mem.Buffer -end - -## Kernel module and function - -export ROCModule, ROCFunction - -const MAX_EXCEPTIONS = 256 -const EXE_TO_MODULE_MAP = IdDict{ROCExecutable, WeakRef}() -mutable struct ROCModule - exe::ROCExecutable - metadata::Vector{KernelMetadata} - exceptions::Mem.Buffer -end - -function delete_metadata!(m::ROCModule; signal_handle::UInt64 = UInt64(0)) - isempty(m.metadata) && return nothing - - only_handle = signal_handle != UInt64(0) - for i in length(m.metadata):-1:1 - meta = m.metadata[i] - Mem.free(meta.buf) - if only_handle && (meta.kern == signal_handle) - deleteat!(m.metadata, i) - end - end - only_handle || empty!(m.metadata) - return nothing -end - -function ROCModule(exe::ROCExecutable) - device = exe.device - metadata = KernelMetadata[] - bytesize = sizeof(AMDGPU.Device.ExceptionEntry) * MAX_EXCEPTIONS - exceptions = Mem.alloc(device, bytesize; coherent=true) - - mod = ROCModule(exe, metadata, exceptions) - EXE_TO_MODULE_MAP[exe] = WeakRef(mod) - - AMDGPU.hsaref!() - return finalizer(mod) do m - delete_metadata!(m) - Mem.free(m.exceptions) - delete!(EXE_TO_MODULE_MAP, m.exe) - AMDGPU.hsaunref!() - end -end -mutable struct ROCFunction - mod::ROCModule - entry::String - hash::UInt64 -end - -## Kernel instance - -mutable struct ROCKernel - device::ROCDevice - exe::ROCExecutable - sym::String - localmem::Int64 - kernel_object::UInt64 - kernarg_segment_size::UInt32 - group_segment_size::UInt32 - private_segment_size::UInt32 - kernarg_address::Ptr{Cvoid} -end - -function executable_symbol_any(exe::ROCExecutable, device::ROCDevice) - agent_func = @cfunction(iterate_exec_agent_syms_cb, HSA.Status, - (HSA.Executable, HSA.Agent, HSA.ExecutableSymbol, Ptr{HSA.ExecutableSymbol})) - exec_symbol_ref = Ref{HSA.ExecutableSymbol}() - ret = HSA.executable_iterate_agent_symbols( - exe.executable, device.agent, agent_func, exec_symbol_ref) - @assert ret == HSA.STATUS_SUCCESS || ret == HSA.STATUS_INFO_BREAK - if isassigned(exec_symbol_ref) - return exec_symbol_ref[] - end - return nothing -end -function executable_symbol_by_name(exe::ROCExecutable, device::ROCDevice, name::Symbol) - agent_ref = Ref(device.agent) - exec_symbol_ref = Ref{HSA.ExecutableSymbol}() - GC.@preserve agent_ref begin - HSA.executable_get_symbol_by_name( - exe.executable, symbol, agent_ref, exec_symbol_ref) |> check - end - if isassigned(exec_symbol_ref) - return exec_symbol_ref[] - end - return nothing -end - -function ROCKernel(kernel #= ::HostKernel =#; localmem::Int=0) - exe = kernel.mod.exe - device = exe.device - symbol = kernel.fun.entry - - exec_symbol = executable_symbol_any(exe, device) - # TODO: Conditionally disable once ROCR is fixed - if exec_symbol === nothing - exec_symbol = something(executable_symbol_by_name(exe, device, symbol)) - end - - kernel_object = executable_symbol_kernel_object(exec_symbol) - kernarg_segment_size = executable_symbol_kernel_kernarg_segment_size(exec_symbol) - if kernarg_segment_size == 0 - # FIXME: Hidden arguments! - # Allocate some memory anyway, #10 - kernarg_segment_size = UInt32(max(kernarg_segment_size, 8)) - end - - group_segment_size = executable_symbol_kernel_group_segment_size(exec_symbol) - group_segment_size = UInt32(group_segment_size + localmem) - private_segment_size = executable_symbol_kernel_private_segment_size(exec_symbol) - ROCKernel(device, exe, symbol, localmem, kernel_object, - kernarg_segment_size, group_segment_size, - private_segment_size, Ptr{Cvoid}(0)) -end diff --git a/src/runtime/launch.jl b/src/runtime/launch.jl deleted file mode 100644 index 560aa79b9..000000000 --- a/src/runtime/launch.jl +++ /dev/null @@ -1,201 +0,0 @@ -# Kernel and barrier launch -# modeled after: CUDAdrv/src/execution.jl - -## HSA object preservation while a kernel is active - -const SIGNAL_PRESERVED = IdDict{ROCSignal, Vector{Any}}() - -function preserve!(sig::ROCSignal, @nospecialize(x)) - set = get!(()->Any[], SIGNAL_PRESERVED, sig) - push!(set, x) -end -preserve!(sig::ROCKernelSignal, @nospecialize(x)) = preserve!(sig.signal, x) - -unpreserve!(sig::ROCSignal) = delete!(SIGNAL_PRESERVED, sig) -unpreserve!(sig::ROCKernelSignal) = unpreserve!(sig.signal) - -# we need a generated function to get an args array, -# without having to inspect the types at runtime -@generated function launch_kernel!( - signal::ROCKernelSignal, groupsize::ROCDim, - gridsize::ROCDim, args::NTuple{N,Any}, -) where N - all(isbitstype, args.parameters) || - throw(ArgumentError("Arguments to kernel should be bitstype.")) - - ex = Expr(:block) - push!(ex.args, :(Base.@_inline_meta)) - - # put arguments in Ref boxes so that we can get a pointers to them - arg_refs = Vector{Symbol}(undef, N) - for i in 1:N - arg_refs[i] = gensym() - push!(ex.args, :($(arg_refs[i]) = Base.RefValue(args[$i]))) - end - - append!(ex.args, (quote - GC.@preserve $(arg_refs...) begin - # validate launch parameters - groupsize, gridsize = normalize_launch_dimensions(groupsize, gridsize) - - # launch kernel - Base.@lock signal.queue.lock begin - push!(signal.queue.active_kernels, signal) - end - launch_kernel!(signal.queue, signal.kernel, signal.signal, groupsize, gridsize) - - # preserve kernel and arguments - $preserve!(signal, signal.kernel) - for arg in args - $preserve!(signal, arg) - end - - notify(signal.queue.running) - end - end).args) - - return ex -end - -struct ROCSignalSet{T} - signals::Vector{ROCSignal} -end -ROCSignalSet{T}() where T = ROCSignalSet{T}(ROCSignal[]) - -Base.wait(ss::ROCSignalSet{HSA.BarrierAndPacket}) = wait.(ss.signals) - -function Base.wait(ss::ROCSignalSet{HSA.BarrierOrPacket}) - #= FIXME - # We need to hack around the fact that barrier OR packets don't handle more - # than 5 dependencies. We could implement the waiting in software, and emit - # a barrier that waits on a signal tied to that waiter. - =# - @warn "Waiting on OR barriers waits on all partitioned barriers to complete" maxlog=1 - wait.(ss.signals) -end - -"Normalize and validate launch groupsize and gridsize." -function normalize_launch_dimensions(groupsize, gridsize) - groupsize = ROCDim3(groupsize) - gridsize = ROCDim3(gridsize) - - # Validate group and grid dimensions - (groupsize.x > 0 && groupsize.y > 0 && groupsize.z > 0) || - throw(ArgumentError("Group dimensions must be non-zero")) - (gridsize.x > 0 && gridsize.y > 0 && gridsize.z > 0) || - throw(ArgumentError("Grid dimensions must be non-zero")) - (groupsize.x <= AMDGPU.Device._max_group_size && - groupsize.y <= AMDGPU.Device._max_group_size && - groupsize.z <= AMDGPU.Device._max_group_size && - groupsize.x * groupsize.y * groupsize.z <= AMDGPU.Device._max_group_size) || - throw(ArgumentError("Group dimensions too large")) - - return groupsize, gridsize -end - -""" - launch_kernel!(queue::ROCQueue, kern::ROCKernel, signal::ROCSignal, - groupsize::ROCDim, gridsize::ROCDim) - -Low-level call to launch a function (encoded in `kern`) on the GPU, using -`groupsize` and `gridsize` as the grid and block configuration, respectively. -The kernel is launched on `queue` and will notify `signal` upon completion. - -Arguments to a kernel should either be bitstype, in which case they will be -copied to the internal kernel parameter buffer, or a pointer to device memory. - -This is a low-level call, preferably use [`roccall`](@ref) instead. -""" -function launch_kernel!( - queue::ROCQueue, kernel::ROCKernel, signal::ROCSignal, - groupsize::ROCDim3, gridsize::ROCDim3, -) - @log_start(:launch_kernel!, (;entry=kernel.sym, signal=get_handle(signal)), (;queue=UInt64(get_handle(queue)))) - - enqueue_packet!(HSA.KernelDispatchPacket, queue) do _packet - @set! _packet.setup = 3 << Int(HSA.KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS) - @set! _packet.workgroup_size_x = groupsize.x - @set! _packet.workgroup_size_y = groupsize.y - @set! _packet.workgroup_size_z = groupsize.z - @set! _packet.grid_size_x = gridsize.x - @set! _packet.grid_size_y = gridsize.y - @set! _packet.grid_size_z = gridsize.z - @set! _packet.completion_signal = signal.signal - @set! _packet.kernel_object = kernel.kernel_object - @set! _packet.kernarg_address = kernel.kernarg_address - @set! _packet.private_segment_size = kernel.private_segment_size - @set! _packet.group_segment_size = kernel.group_segment_size - _packet - end - - @log_finish(:launch_kernel!, (;entry=kernel.sym, signal=get_handle(signal)), (;queue=UInt64(get_handle(queue)))) -end - -function launch_barrier!(T, queue::ROCQueue, signals::Vector{ROCSignal}) - outset = ROCSignalSet{T}() - if !isempty(signals) - for signal_set in Iterators.partition(signals, 5) - comp_signal = ROCSignal() - enqueue_packet!(T, queue) do _packet - @set! _packet.dep_signal = ntuple(i->length(signal_set)>=i ? signal_set[i].signal : HSA.Signal(0), 5) - _packet - end - push!(outset.signals, comp_signal) - end - end - return outset -end - -# Atomic store using LLVM intrinsics -# Necessary for writing the AQL packet header to the queue -# prior to launching a kernel. -@eval atomic_store_n!(x::Ptr{UInt16}, v::UInt16) = - Base.llvmcall($""" - %ptr = inttoptr i$(Sys.WORD_SIZE) %0 to i16* - store atomic i16 %1, i16* %ptr release, align 64 - ret void - """, Cvoid, Tuple{Ptr{UInt16}, UInt16}, x, v) - -function enqueue_packet!(f::Base.Callable, ::Type{T}, queue::ROCQueue) where T - # Obtain the current queue write index and queue size - ensure_active(queue) - queue_ptr = queue.queue - _queue = unsafe_load(queue.queue) - queue_size = _queue.size - write_index = HSA.queue_add_write_index_scacq_screl(queue_ptr, UInt64(1)) - - # Yield until queue has space - while write_index - HSA.queue_load_read_index_scacquire(queue_ptr) >= queue_size - # TODO: Exponential backoff with initial `Libc.systemsleep` calls - yield() - end - - # TODO: Make this less ugly - dispatch_packet = Ref{T}() - ccall(:memset, Cvoid, - (Ptr{Cvoid}, Cint, Csize_t), - dispatch_packet, 0, sizeof(T)) - dispatch_packet[] = f(dispatch_packet[]) - - queueMask = UInt32(queue_size - 1) - baseaddr_ptr = Ptr{HSA.KernelDispatchPacket}(_queue.base_address) - baseaddr_ptr = baseaddr_ptr + sizeof(HSA.KernelDispatchPacket) * (write_index & queueMask) - dispatch_packet_ptr = convert(Ptr{HSA.KernelDispatchPacket}, Base.unsafe_convert(Ptr{T}, dispatch_packet)) - unsafe_copyto!(baseaddr_ptr, dispatch_packet_ptr, 1) - - # TODO: Generalize to allow barrier on kernel - packetheadertype(::Type{HSA.KernelDispatchPacket}) = HSA.PACKET_TYPE_KERNEL_DISPATCH - packetheadertype(::Type{HSA.BarrierAndPacket}) = HSA.PACKET_TYPE_BARRIER_AND - packetheadertype(::Type{HSA.BarrierOrPacket}) = HSA.PACKET_TYPE_BARRIER_OR - - # Create and atomically store the header - # TODO: Generalize to make scopes configurable - header = Ref{UInt16}(0) - header[] |= UInt16(HSA.FENCE_SCOPE_SYSTEM) << UInt16(HSA.PACKET_HEADER_SCACQUIRE_FENCE_SCOPE) - header[] |= UInt16(HSA.FENCE_SCOPE_SYSTEM) << UInt16(HSA.PACKET_HEADER_SCRELEASE_FENCE_SCOPE) - header[] |= UInt16(packetheadertype(T)) << UInt16(HSA.PACKET_HEADER_TYPE) - atomic_store_n!(Base.unsafe_convert(Ptr{UInt16}, baseaddr_ptr), header[]) - - # Ring the doorbell to dispatch the kernel - HSA.signal_store_relaxed(_queue.doorbell_signal, Int64(write_index)) -end diff --git a/src/runtime/linked-list.jl b/src/runtime/linked-list.jl deleted file mode 100644 index 54d420916..000000000 --- a/src/runtime/linked-list.jl +++ /dev/null @@ -1,121 +0,0 @@ -mutable struct LinkedListNode{T} - data::T - @atomic next::Union{LinkedListNode{T},Nothing} - LinkedListNode(data::T) where T = new{T}(data, nothing) -end - -mutable struct LinkedList{T} - @atomic head::Union{LinkedListNode{T},Nothing} - LinkedList{T}() where T = new{T}(nothing) -end - -function Base.push!(list::LinkedList{T}, data::T) where T - ours = LinkedListNode(data) - if @atomic(list.head) === nothing && @atomicreplace(list.head, nothing => ours)[2] - return data - end - node = lastnode(list.head) - while true - if @atomic(node.next) === nothing - # Attempt to swap with our node - _, succ = @atomicreplace node.next nothing => ours - if succ - # Success, done - return data - end - end - # Advance to end - node = last(node)::LinkedListNode{T} - end -end -function next!(list::LinkedList) - head = @atomic(list.head) - isnothing(head) && throw(BoundsError()) - - # N.B. We assume this is only done single-threaded - next = @atomic(head.next) - @atomic list.head = next - return list -end -function Base.empty!(list::LinkedList) - @atomic list.head = nothing - return list -end -Base.isempty(list::LinkedList) = @atomic(list.head) === nothing - -function Base.first(list::LinkedList) - head = @atomic(list.head) - isnothing(head) && throw(BoundsError()) - return head.data -end - -function Base.last(list::LinkedList) - head = @atomic(list.head) - isnothing(head) && throw(BoundsError()) - return last(head) -end - -function lastnode(node::LinkedListNode) - while true - # Test if this is the last node - if @atomic(node.next) === nothing - return node - else - # Advance to next node - node = @atomic node.next - end - end -end - -Base.last(node::LinkedListNode) = lastnode(node).data - -function maybelast(list::LinkedList) - head = @atomic(list.head) - isnothing(head) ? nothing : last(head) -end - -function Base.length(list::LinkedList) - head = @atomic(list.head) - isnothing(head) ? 0 : length(head) -end - -function Base.length(node::LinkedListNode) - ctr = 1 - while @atomic(node.next) !== nothing - node = @atomic node.next - ctr += 1 - end - return ctr -end - -function Base.show(io::IO, list::LinkedList{T}) where T - print(io, "LinkedList{$T}($(length(list)) entries)") -end - -function Base.copy(list::LinkedList{T}) where T - new_list = LinkedList{T}() - head = @atomic list.head - isnothing(head) && return new_list - - @atomic new_list.head = head - return new_list -end - -function Base.Array(list::LinkedList{T}) where T - vec = T[] - node = @atomic(list.head) - while node !== nothing - push!(vec, node.data) - node = @atomic(node.next) - end - return vec -end - -function Base.iterate(list::LinkedList) - head = @atomic list.head - isnothing(head) ? nothing : (head.data, head.next) -end - -function Base.iterate(_::LinkedList, node) - isnothing(node) ? nothing : (node.data, node.next) -end diff --git a/src/runtime/memory.jl b/src/runtime/memory.jl deleted file mode 100644 index d5b4abffb..000000000 --- a/src/runtime/memory.jl +++ /dev/null @@ -1,802 +0,0 @@ -# Raw memory management -# Copied from CUDAdrv.jl - -import AMDGPU -import AMDGPU: HSA -if AMDGPU.hip_configured - import AMDGPU: HIP -end -import AMDGPU: Runtime -import .Runtime: ROCDevice, ROCSignal, ROCMemoryRegion, ROCMemoryPool, ROCDim, ROCDim3 -import .Runtime: DEVICES, check, get_region, get_memory_pool, get_handle -using Preferences - -## buffer type - -struct Buffer - ptr::Ptr{Cvoid} - host_ptr::Ptr{Cvoid} - base_ptr::Ptr{Cvoid} - bytesize::Int - device::ROCDevice - coherent::Bool - pool_alloc::Bool - # Unique ID used for refcounting. - _id::UInt64 - - function Buffer( - ptr::Ptr{Cvoid}, host_ptr::Ptr{Cvoid}, base_ptr::Ptr{Cvoid}, - bytesize::Int, device::ROCDevice, coherent::Bool, pool_alloc::Bool, - ) - _id = _buffer_id!() - new(ptr, host_ptr, base_ptr, bytesize, device, coherent, pool_alloc, _id) - end -end - -Base.unsafe_convert(::Type{Ptr{T}}, buf::Buffer) where {T} = convert(Ptr{T}, buf.ptr) - -function view(buf::Buffer, bytes::Int) - bytes > buf.bytesize && throw(BoundsError(buf, bytes)) - return Buffer(buf.ptr+bytes, - buf.host_ptr != C_NULL ? buf.host_ptr+bytes : C_NULL, - buf.base_ptr, - buf.bytesize-bytes, buf.device, buf.coherent, buf.pool_alloc) -end - -## refcounting - -const _ID_COUNTER = Threads.Atomic{UInt64}(0) -const refcounts = Dict{UInt64, Int}() -const liveness = Dict{UInt64, Bool}() -const refcounts_lock = Threads.ReentrantLock() - -function _buffer_id!()::UInt64 - return Threads.atomic_add!(_ID_COUNTER, UInt64(1)) -end - -function refcount(buf::Buffer) - Base.lock(refcounts_lock) do - get(refcounts, buf._id, 0) - end -end - -""" - retain(buf::Buffer) - -Increase the refcount of a buffer. -""" -function retain(buf::Buffer) - Base.lock(refcounts_lock) do - live = get!(liveness, buf._id, true) - @assert live "Trying to retain dead buffer!" - count = get!(refcounts, buf._id, 0) - refcounts[buf._id] = count + 1 - end - return -end - -""" - release(buf::Buffer) - -Decrease the refcount of a buffer. Returns `true` if the refcount has dropped -to 0, and some action needs to be taken. -""" -function release(buf::Buffer) - while !Base.trylock(refcounts_lock) end - try - count = refcounts[buf._id] - @assert count >= 1 "Buffer refcount dropping below 0!" - refcounts[buf._id] = count - 1 - done = count == 1 - - live = liveness[buf._id] - - if done - if live - free(buf) - end - untrack(buf) - end - return done - finally - Base.unlock(refcounts_lock) - end -end - -""" - free_if_live(buf::Buffer) - -Frees the base pointer for `buf` if it is still live (not yet freed). Does not -update refcounts. -""" -function free_if_live(buf::Buffer) - Base.lock(refcounts_lock) do - if liveness[buf._id] - liveness[buf._id] = false - free(buf) - end - end -end - -""" - untrack(buf::Buffer) - -Removes refcount tracking information for a buffer. -""" -function untrack(buf::Buffer) - while !Base.trylock(refcounts_lock) end - try - delete!(liveness, buf._id) - delete!(refcounts, buf._id) - finally - Base.unlock(refcounts_lock) - end -end - - -## memory info - -""" - info() - -Returns a tuple of two integers, indicating respectively the free and total amount of memory -(in bytes) available for allocation on the device. -""" -function info() - free_ref = Ref{Csize_t}() - total_ref = Ref{Csize_t}() - # FIXME: I'm not sure HSA has an API for this... - return convert(Int, free_ref[]), convert(Int, total_ref[]) -end - -""" - free() - -Returns the free amount of memory (in bytes), available for allocation on the device. -""" -free() = info()[1] - -""" - total() - -Returns the total amount of memory (in bytes), available for allocation on the device. -""" -total() = info()[2] - -""" - used() - -Returns the used amount of memory (in bytes), allocated on the device. -""" -used() = total()-free() - -""" - pointerinfo(ptr::Ptr) - pointerinfo(buf::Buffer) - pointerinfo(a::Array) - -Retrieve information about the allocation referenced by the given pointer. -""" -function pointerinfo(ptr::Ptr) - ptrinfo = Ref{HSA.AMDPointerInfo}() - ptrinfo_ptr = Base.unsafe_convert(Ptr{HSA.AMDPointerInfo}, ptrinfo) - ccall(:memset, Ptr{Cvoid}, - (Ptr{HSA.AMDPointerInfo}, UInt8, Csize_t), - ptrinfo_ptr, UInt8(0), sizeof(HSA.AMDPointerInfo)) - unsafe_store!(reinterpret(Ptr{Csize_t}, ptrinfo_ptr), sizeof(HSA.AMDPointerInfo)) - HSA.amd_pointer_info(Ptr{Cvoid}(ptr), ptrinfo, C_NULL, Ptr{UInt32}(C_NULL), C_NULL) |> check - return ptrinfo[] -end -pointerinfo(buf::Buffer) = pointerinfo(buf.ptr) -pointerinfo(a::Array) = pointerinfo(pointer(a)) - -## Page-locking - -""" - lock(ptr::Ptr, bytesize::Integer, device::ROCDevice) - lock(ptr, bytesize) - lock(a::Array, device) - lock(a) - -Page-lock a host pointer allocated by the OS allocator and return a new pointer from -the given `device`. For more information, see `hsa_amd_memory_lock()`. - -See also: [`unlock`](@ref) -""" -function lock(ptr::Ptr, bytesize::Integer, device::ROCDevice) - plocked = Ref{Ptr{Cvoid}}() - ccall(:memset, Ptr{Cvoid}, - (Ptr{Ptr{Cvoid}}, UInt8, Csize_t), - Base.unsafe_convert(Ptr{Ptr{Cvoid}}, plocked), UInt8(0), sizeof(Ptr{Cvoid})) - HSA.amd_memory_lock(Ptr{Cvoid}(ptr), bytesize, Ref(device.agent), 1, plocked) |> check - return plocked[] -end -lock(ptr, bytesize) = lock(ptr, bytesize, AMDGPU.device()) -lock(a::Array, device::ROCDevice) = lock(pointer(a), sizeof(a), device) -lock(a::Array) = lock(pointer(a), sizeof(a), AMDGPU.device()) - -""" - unlock(ptr::Ptr) - unlock(a::Array) - -Unlock the host pointer previously page-locked with [`lock`](@ref). -NB: `ptr` should be the original locked host pointer and not the pointer returned by `lock`! -""" -function unlock(ptr::Ptr) - HSA.amd_memory_unlock(Ptr{Cvoid}(ptr)) |> check -end -unlock(a::Array) = unlock(pointer(a)) - -## generic interface (for documentation purposes) - -""" -Allocate linear memory on the device and return a buffer to the allocated memory. The -allocated memory is suitably aligned for any kind of variable. The memory will not be freed -automatically, use [`free(::Buffer)`](@ref) for that. -""" -function alloc end - -""" -Free device memory. -""" -function free end - -""" -Initialize device memory with a repeating value. -""" -function set! end - -""" -Upload memory from host to device. -Executed asynchronously on `queue` if `async` is true. -""" -function upload end -@doc (@doc upload) upload! - -""" -Download memory from device to host. -Executed asynchronously on `queue` if `async` is true. -""" -function download end -@doc (@doc download) download! - -""" -Transfer memory from device to device. -Executed asynchronously on `queue` if `async` is true. -""" -function transfer end -@doc (@doc transfer) transfer! - - -## pointer-based - -"Enables or disables the slow allocation fallback for non-coherent allocations." -enable_slow_allocation_fallback!(flag::Bool) = @set_preferences!("use_slow_allocation_fallback" => flag) -const USE_SLOW_ALLOCATION_FALLBACK = let - if haskey(ENV, "JULIA_AMDGPU_USE_SLOW_ALLOCATION_FALLBACK") - flag = parse(Bool, ENV["JULIA_AMDGPU_USE_SLOW_ALLOCATION_FALLBACK"]) - enable_slow_allocation_fallback!(flag) - flag - else - @load_preference("use_slow_allocation_fallback", true) - end -end - -"Enables or disables using hipMalloc/hipFree for non-coherent allocations." -enable_hip_malloc_override!(flag::Bool) = @set_preferences!("use_hip_malloc_override" => flag) -const USE_HIP_MALLOC_OVERRIDE = let - if haskey(ENV, "JULIA_AMDGPU_USE_HIP_MALLOC_OVERRIDE") - flag = parse(Bool, ENV["JULIA_AMDGPU_USE_HIP_MALLOC_OVERRIDE"]) - enable_hip_malloc_override!(flag) - flag - else - @load_preference("use_hip_malloc_override", false) - end -end - -"Sets a limit for total GPU memory allocations." -set_memory_alloc_limit!(limit::Integer) = - @set_preferences!("memory_alloc_limit" => limit) -const MEMORY_ALLOC_LIMIT = let - if haskey(ENV, "JULIA_AMDGPU_MEMORY_ALLOC_LIMIT") - limit = parse(Int, ENV["JULIA_AMDGPU_MEMORY_ALLOC_LIMIT"]) - set_memory_alloc_limit!(limit) - limit - else - @load_preference("memory_alloc_limit", typemax(Int)) - end -end - -""" - alloc(bytesize::Integer; coherent=false) -> Buffer - -Allocate `bytesize` bytes of HSA-managed memory on the default device. - - alloc(device::ROCDevice, bytesize::Integer; coherent=false) -> Buffer - -Allocate `bytesize` bytes of HSA-managed memory on `device`. - -When using the above two methods, allocations are not coherent by default, -meaning that the allocated buffer is only accessible from the given device. - -If `coherent` is set to `true`, the allocated buffer will be accessible from -all HSA devices, including the host CPU. Even though this is convenient, it can -sometimes be slower than explicit memory transfers if memory accesses are not -carefully managed. - - alloc(device::ROCDevice, pool::ROCMemoryPool, bytesize::Integer) -> Buffer - alloc(device::ROCDevice, region::ROCMemoryRegion, bytesize::Integer) -> Buffer - -Allocate `bytesize` bytes of HSA-managed memory on the region `region` or -memory pool `pool`. -""" -function alloc(device::ROCDevice, bytesize::Integer; coherent=false, slow_fallback=!coherent && USE_SLOW_ALLOCATION_FALLBACK) - alloc_id = rand(UInt64) - Runtime.@log_start(:alloc, (;alloc_id), (;device=get_handle(device), size=bytesize, coherent)) - - bytesize == 0 && return Buffer(C_NULL, C_NULL, C_NULL, 0, device, coherent, false) - - region_kind = coherent ? :finegrained : :coarsegrained - - buf = nothing - region = nothing - try - if region_kind != :coarsegrained - region = get_region(device, region_kind) - @debug "Allocating $(Base.format_bytes(bytesize)) from $region" - buf = alloc(device, region, bytesize) - else - if USE_HIP_MALLOC_OVERRIDE - @debug "Allocating $(Base.format_bytes(bytesize)) from HIP" - buf = alloc_hip(bytesize) - else - region = get_memory_pool(device, region_kind) - @debug "Allocating $(Base.format_bytes(bytesize)) from $region" - buf = alloc(device, region, bytesize) - # This is a no-op and we need to make sure that we use the right region instead - # check(HSA.memory_assign_agent(buf.ptr, device.agent, HSA.ACCESS_PERMISSION_RW)) - end - end - catch err - if slow_fallback && - !coherent && - err isa Runtime.HSAError && - (err.code == HSA.STATUS_ERROR_OUT_OF_RESOURCES || - err.code == HSA.STATUS_ERROR_INVALID_ALLOCATION) - # TODO: How to handle this with logging? - buf = alloc(device, bytesize; coherent=true) - else - rethrow(err) - end - finally - ptr = buf !== nothing ? buf.ptr : C_NULL - region = region !== nothing ? get_handle(region) : C_NULL - Runtime.@log_finish(:alloc, (;alloc_id), (;ptr, region)) - end - return buf -end -function alloc_or_retry!(f) - for phase in 1:3 - if phase == 2 - GC.gc(false) - yield() - elseif phase == 3 - GC.gc(true) - yield() - end - status = f() - @debug "Allocation phase $phase: $status" - if status == HSA.STATUS_SUCCESS - break - elseif status == HSA.STATUS_ERROR_OUT_OF_RESOURCES || status == HSA.STATUS_ERROR_INVALID_ALLOCATION - if phase == 3 - check(status) - end - else - check(status) - end - end -end - -const ALL_ALLOCS = Threads.Atomic{Int64}(0) - -_alloc(p::ROCMemoryPool, bytesize::Integer, ptr_ref) = HSA.amd_memory_pool_allocate(p.pool, bytesize, 0, ptr_ref) -_alloc(p::ROCMemoryRegion, bytesize::Integer, ptr_ref) = HSA.memory_allocate(p.region, bytesize, ptr_ref) - -_accessible(p::ROCMemoryRegion)::Bool = Runtime.region_host_accessible(p) -_accessible(p::ROCMemoryPool)::Bool = Runtime.pool_accessible_by_all(p) - -function alloc( - device::ROCDevice, space::S, bytesize::Integer, -) where S <: Union{ROCMemoryPool, ROCMemoryRegion} - ptr_ref = Ref{Ptr{Cvoid}}() - alloc_or_retry!(() -> _alloc(space, bytesize, ptr_ref)) - ptr = ptr_ref[] - AMDGPU.hsaref!() - Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize)) - Buffer(ptr, C_NULL, ptr, Int64(bytesize), device, _accessible(space), S <: ROCMemoryPool) -end - -alloc(bytesize; kwargs...) = alloc(AMDGPU.device(), bytesize; kwargs...) - -@static if AMDGPU.hip_configured - function alloc_hip(bytesize::Integer) - ptr_ref = Ref{Ptr{Cvoid}}() - # FIXME: Set HIP device - alloc_or_retry!() do - try - HIP.@check HIP.hipMalloc(ptr_ref, Csize_t(bytesize)) - HSA.STATUS_SUCCESS - catch - # FIXME: Actually check error code - HSA.STATUS_ERROR_OUT_OF_RESOURCES - end - end - AMDGPU.hsaref!() - ptr = ptr_ref[] - Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize)) - Buffer(ptr, C_NULL, ptr, Int64(bytesize), AMDGPU.device(), false, true) - end -end - -function free(buf::Buffer) - buf.ptr == C_NULL && return - - Runtime.@log_start(:free, (;ptr=buf.ptr), nothing) - if buf.host_ptr == C_NULL - # HSA-backed - if buf.pool_alloc - if USE_HIP_MALLOC_OVERRIDE - @static if AMDGPU.hip_configured - # Actually HIP-backed - HIP.@check HIP.hipFree(buf.base_ptr) - end - else - memory_check(HSA.amd_memory_pool_free(buf.base_ptr), buf.base_ptr) - end - Threads.atomic_sub!(ALL_ALLOCS, Int64(buf.bytesize)) - else - memory_check(HSA.memory_free(buf.base_ptr), buf.base_ptr) - end - AMDGPU.hsaunref!() - else - # Wrapped - unlock(buf.ptr) - end - Runtime.@log_finish(:free, (;ptr=buf.ptr), nothing) - return -end -# N.B. We try to keep this from yielding or throwing, since this usually runs -# in a finalizer -function memory_check(status::HSA.Status, ptr::Ptr{Cvoid}) - if status != HSA.STATUS_SUCCESS - err_str = Runtime.description(status) - Core.println("Error when attempting to free an HSA buffer:\n $err_str") - pinfo = pointerinfo(ptr) - Core.println(sprint(io->Base.show(io, pinfo))) - return false - end - return true -end - -struct PoolAllocation - addr::Ptr{Cvoid} - refs::Threads.Atomic{Int} -end -PoolAllocation(addr) = - PoolAllocation(addr, Threads.Atomic{Int}(1)) -Base.hash(p::PoolAllocation) = hash(p.addr, hash(PoolAllocation)) -Base.isequal(p1::P, p2::P) where P<:PoolAllocation = p1.addr == p2.addr - -const ALLOC_POOL_BINNED = IdDict{ROCDevice,Dict{Int,Vector{Ptr{Cvoid}}}}() -const ALLOC_POOL_PTR_BIN_MAP = Dict{Ptr{Cvoid},Int}() -const ALLOC_POOL_SHARED = IdDict{ROCDevice,Dict{UInt64,PoolAllocation}}() -const ALLOC_POOL_LOCK = Threads.SpinLock() -const ALLOC_POOL_MAX_SIZE = Ref{Int}(64) -const ALLOC_POOL_MAX_BINS = 8 - -function alloc_pooled(device::ROCDevice, key::UInt64, kind::Symbol, bytesize::Integer) - @assert kind == :kernarg "Pooled non-kernarg allocations not implemented" - - if bytesize == 0 - return C_NULL, false - end - - # Try to grab from pool - Base.lock(ALLOC_POOL_LOCK) do - # Try to grab a shared allocation - device_dict_shared = get!(()->Dict{UInt64,PoolAllocation}(), ALLOC_POOL_SHARED, device) - if (alloc = get(device_dict_shared, key, nothing)) !== nothing - Threads.atomic_add!(alloc.refs, 1) - return alloc.addr, false - end - # Fallback, try to grab a binned (unshared) allocation - device_dict_binned = get!(ALLOC_POOL_BINNED, device) do - d = Dict{Int,Vector{Ptr{Cvoid}}}() - for bin in 1:ALLOC_POOL_MAX_BINS - d[bin] = Vector{Ptr{Cvoid}}() - end - d - end - bin_min = ceil(Int, log2(bytesize)) - if bin_min <= ALLOC_POOL_MAX_BINS - # Find any compatible allocation - bin = findfirst(bin->bin >= bin_min && length(device_dict_binned[bin]) > 0, bin_min:ALLOC_POOL_MAX_BINS) - if bin !== nothing - ptr = pop!(device_dict_binned[bin]) - ALLOC_POOL_PTR_BIN_MAP[ptr] = bin - return ptr, true - end - end - - # No available allocations to grab, make a new one - Base.unlock(ALLOC_POOL_LOCK) - - if bin_min <= ALLOC_POOL_MAX_BINS - # Round-up bytesize to allow reuse in bins - bytesize = 2^bin_min - end - - # N.B. We use the region API because kernarg allocations don't - # show up in the memory pools API - kernarg_region = Runtime.get_region(device, :kernarg) - kernarg_address = Ref{Ptr{Nothing}}(Ptr{Nothing}(0)) - alloc_or_retry!() do - HSA.memory_allocate(kernarg_region.region, - bytesize, - kernarg_address) - end - - Base.lock(ALLOC_POOL_LOCK) - - # Try to share this allocation - if length(device_dict_shared) < ALLOC_POOL_MAX_SIZE[] - device_dict_shared[key] = PoolAllocation(kernarg_address[]) - end - - return kernarg_address[], true - end -end - -function free_pooled(device::ROCDevice, key::UInt64, kind::Symbol, ptr::Ptr{Cvoid}) - # Return to pool - Runtime.@spinlock ALLOC_POOL_LOCK begin - # Check if this pointer is a shared allocation - device_dict_shared = get!(()->Dict{UInt64,PoolAllocation}(), ALLOC_POOL_SHARED, device) - if (alloc = get(device_dict_shared, key, nothing)) !== nothing - if Threads.atomic_sub!(alloc.refs, 1) == 1 - # TODO: Don't delete unless we're out of space - delete!(device_dict_shared, key) - # TODO: Consider putting into a bin if power-of-two bytesize - check(HSA.memory_free(ptr)) - end - return - end - # Check if this pointer is a binned allocation - if !haskey(ALLOC_POOL_PTR_BIN_MAP, ptr) - # Not binned or shared - check(HSA.memory_free(ptr)) - return - end - bin = ALLOC_POOL_PTR_BIN_MAP[ptr] - allocs = ALLOC_POOL_BINNED[device][bin] - if length(allocs) < ALLOC_POOL_MAX_SIZE[] - # Save for later - push!(allocs, ptr) - else - # No free space - check(HSA.memory_free(ptr)) - end - return - end -end - -""" - set!(buf::Buffer, value::UInt32, len::Integer) - -Write `len` copies of the 32-bit `value` at the start of `buf`. -""" -function set!(buf::Buffer, value::UInt32, len::Integer) - HSA.amd_memory_fill(buf.ptr, value, len) |> check -end - -""" - upload!(dst::Buffer, src, nbytes::Integer) - -Upload `nbytes` memory from `src` at the host to `dst` on the device. -""" -function upload!(dst::Buffer, src::Ptr{T}, nbytes::Integer) where T - Runtime.@log_start(:upload!, (;dest=dst.ptr, src=reinterpret(Ptr{Cvoid}, src)), (;nbytes)) - nbytes > 0 || return - if dst.host_ptr == C_NULL - HSA.memory_copy(Ptr{T}(dst.ptr), src, nbytes) |> check - else - Base.unsafe_copyto!(reinterpret(Ptr{UInt8}, dst.host_ptr), - reinterpret(Ptr{UInt8}, src), - nbytes) - end - Runtime.@log_finish(:upload!, (;dest=dst.ptr, src=reinterpret(Ptr{Cvoid}, src)), (;nbytes)) -end - -""" - download!(dst::Ref, src::Buffer, nbytes::Integer) - -Download `nbytes` memory from `src` on the device to `dst` on the host. -""" -function download!(dst::Ptr{T}, src::Buffer, nbytes::Integer) where T - Runtime.@log_start(:download!, (;dest=reinterpret(Ptr{Cvoid}, dst), src=src.ptr), (;nbytes)) - nbytes > 0 || return - if src.host_ptr == C_NULL - HSA.memory_copy(dst, Ptr{T}(src.ptr), nbytes) |> check - else - Base.unsafe_copyto!(reinterpret(Ptr{UInt8}, dst), - reinterpret(Ptr{UInt8}, src.host_ptr), - nbytes) - end - Runtime.@log_finish(:download!, (;dest=reinterpret(Ptr{Cvoid}, dst), src=src.ptr), (;nbytes)) -end - -""" - transfer!(dst::Buffer, src::Buffer, nbytes::Integer) - -Transfer `nbytes` of device memory from `src` to `dst`. -""" -function transfer!(dst::Buffer, src::Buffer, nbytes::Integer) - Runtime.@log_start(:transfer!, (;dest=dst.ptr, src=src.ptr), (;nbytes)) - nbytes > 0 || return - if dst.host_ptr != C_NULL && src.host_ptr != C_NULL - Base.unsafe_copyto!(reinterpret(Ptr{UInt8}, dst.host_ptr), - reinterpret(Ptr{UInt8}, src.host_ptr), - nbytes) - elseif dst.host_ptr != C_NULL - download!(dst.host_ptr, src, nbytes) - elseif src.host_ptr != C_NULL - upload!(dst, src.host_ptr, nbytes) - else - HSA.memory_copy(dst.ptr, src.ptr, nbytes) |> check - end - Runtime.@log_finish(:transfer!, (;dest=dst.ptr, src=src.ptr), (;nbytes)) -end - -""" - unsafe_copy3d!(dst::Ptr{T}, src::Ptr{T}, width, height=1, depth=1; - dstPos::ROCDim=(1,1,1), dstPitch=0, dstSlice=0, - srcPos::ROCDim=(1,1,1), srcPitch=0, srcSlice=0, - async::Bool=false, signal::ROCSignal=nothing) where T - -Perform a 3D memory copy between pointers `src` and `dst` at respectively position `srcPos` and `dstPos` -(1-indexed). Both pitch and slice can be specified for both the source and destination. This call is -executed asynchronously if `async` is set, otherwise `signal` is synchronized. -""" -function unsafe_copy3d!(dst::Ptr{T}, src::Ptr{T}, width, height=1, depth=1; - dstPos::ROCDim=(1,1,1), dstPitch=0, dstSlice=0, - srcPos::ROCDim=(1,1,1), srcPitch=0, srcSlice=0, - async::Bool=false, signal::ROCSignal=nothing) where T - (T == Nothing) && error("Type of Ptr is Nothing") - - dstPtr_info = pointerinfo(dst) - srcPtr_info = pointerinfo(src) - - if dstPtr_info.type == HSA.EXT_POINTER_TYPE_UNKNOWN || srcPtr_info.type == HSA.EXT_POINTER_TYPE_UNKNOWN - error("Only device pointers or locked host pointers are supported, see unsafe_wrap and Mem.lock") - end - - if dstPtr_info.type == HSA.EXT_POINTER_TYPE_HSA && srcPtr_info.type == HSA.EXT_POINTER_TYPE_LOCKED - Runtime.device_type(dstPtr_info.agentOwner) == HSA.DEVICE_TYPE_GPU || error("dst should point to device memory") - hsaCopyDir = HSA.LibHSARuntime.hsaHostToDevice - elseif dstPtr_info.type == HSA.EXT_POINTER_TYPE_LOCKED && srcPtr_info.type == HSA.EXT_POINTER_TYPE_HSA - Runtime.device_type(srcPtr_info.agentOwner) == HSA.DEVICE_TYPE_GPU || error("src should point to device memory") - hsaCopyDir = HSA.LibHSARuntime.hsaDeviceToHost - elseif dstPtr_info.type == HSA.EXT_POINTER_TYPE_HSA && srcPtr_info.type == HSA.EXT_POINTER_TYPE_HSA - (Runtime.device_type(dstPtr_info.agentOwner) == HSA.DEVICE_TYPE_GPU && Runtime.device_type(srcPtr_info.agentOwner) == HSA.DEVICE_TYPE_GPU) || error("dst and src should point to device memory") - hsaCopyDir = HSA.LibHSARuntime.hsaDeviceToDevice - else - error("Only device to device, host to device, and device to host memory transfer is supported") - end - - dstOffset = (sizeof(T)*(dstPos[1]-1), dstPos[2]-1, dstPos[3]-1) - srcOffset = (sizeof(T)*(srcPos[1]-1), srcPos[2]-1, srcPos[3]-1) - - dstRef = Ref(HSA.PitchedPtr(dst, dstPitch, dstSlice)) - srcRef = Ref(HSA.PitchedPtr(src, srcPitch, srcSlice)) - dstOffsetRef = Ref(HSA.Dim3(dstOffset...)) - srcOffsetRef = Ref(HSA.Dim3(srcOffset...)) - rangeRef = Ref(HSA.Dim3(sizeof(T)*width, height, depth)) - - AMDGPU.HSA.amd_memory_async_copy_rect( - Base.unsafe_convert(Ptr{HSA.PitchedPtr}, dstRef), - Base.unsafe_convert(Ptr{HSA.Dim3}, dstOffsetRef), - Base.unsafe_convert(Ptr{HSA.PitchedPtr}, srcRef), - Base.unsafe_convert(Ptr{HSA.Dim3}, srcOffsetRef), - Base.unsafe_convert(Ptr{HSA.Dim3}, rangeRef), - AMDGPU.device().agent, hsaCopyDir, - UInt32(0), C_NULL, signal.signal) |> check - - async || wait(signal) - return nothing -end - - -## array based - -""" - alloc(src::AbstractArray; alloc_kwargs...) - -Allocate space to store the contents of `src`. -""" -function alloc(src::AbstractArray; alloc_kwargs...) - return alloc(sizeof(src); alloc_kwargs...) -end - -""" - upload!(dst::Buffer, src::AbstractArray) - -Upload the contents of an array `src` to `dst`. -""" -function upload!(dst::Buffer, src::AbstractArray) - GC.@preserve src upload!(dst, pointer(src), sizeof(src)) -end - -""" - upload(src::AbstractArray; alloc_kwargs...)::Buffer - -Allocates space for and uploads the contents of an array `src`, returning a Buffer. -For the allocation keywoard arguments see [`alloc`](@ref). -""" -function upload(src::AbstractArray; alloc_kwargs...) - dst = alloc(src; alloc_kwargs...) - upload!(dst, src) - return dst -end - -""" - download!(dst::AbstractArray, src::Buffer) - -Downloads memory from `src` to the array at `dst`. The amount of memory downloaded is -determined by calling `sizeof` on the array, so it needs to be properly preallocated. -""" -function download!(dst::AbstractArray, src::Buffer) - GC.@preserve dst download!(pointer(dst), src, sizeof(dst)) - return -end - -## type based - -function check_type(::Type{Buffer}, T) - if isa(T, UnionAll) || isabstracttype(T) || !isconcretetype(T) - throw(ArgumentError("cannot represent abstract or non-leaf object")) - end - Base.datatype_pointerfree(T) || throw(ArgumentError("cannot handle non-ptrfree objects")) - sizeof(T) == 0 && throw(ArgumentError("cannot represent singleton objects")) -end - -""" - alloc(T::Type, [count::Integer=1]; alloc_kwargs...) - -Allocate space for `count` objects of type `T`. -""" -function alloc(::Type{T}, count::Integer=1; alloc_kwargs...) where {T} - check_type(Buffer, T) - - return alloc(sizeof(T)*count; alloc_kwargs...) -end - -""" - download(::Type{T}, src::Buffer, [count::Integer=1])::Vector{T} - -Download `count` objects of type `T` from the device at `src`, returning a vector. -""" -function download(::Type{T}, src::Buffer, count::Integer=1) where {T} - dst = Vector{T}(undef, count) - download!(dst, src) - return dst -end - -# Pretty-printing -function Base.show(io::IO, ptrinfo::HSA.AMDPointerInfo) - println(io, "Pointer type: $(ptrinfo.type)") - println(io, "Owner: $(DEVICES[ptrinfo.agentOwner.handle])") - println(io, "Agent base address: $(ptrinfo.agentBaseAddress)") - println(io, "Host base address: $(ptrinfo.hostBaseAddress)") - print(io, "Size (bytes): $(ptrinfo.sizeInBytes)") -end diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl new file mode 100644 index 000000000..ffe9fb2f1 --- /dev/null +++ b/src/runtime/memory/hip.jl @@ -0,0 +1,180 @@ +const _POOL_STATUS = AMDGPU.LockedObject( + Dict{HIP.HIPDevice, Base.RefValue{Union{Nothing, Bool}}}()) + +function pool_status(dev::HIP.HIPDevice) + Base.lock(_POOL_STATUS) do ps + get!(ps, dev, Ref{Union{Nothing, Bool}}(nothing)) + end +end + +const __pool_cleanup = Ref{Task}() +function pool_cleanup() + idle_counters = Base.fill(0, HIP.ndevices()) + devices = HIP.devices() + while true + for (i, dev) in enumerate(devices) + status = pool_status(dev) + isnothing(status[]) && continue + + if status[]::Bool + idle_counters[i] = 0 + else + idle_counters[i] += 1 + end + status[] = false + + if idle_counters[i] == 5 + old_device = HIP.device() + old_device != dev && HIP.device!(dev) + HIP.reclaim() + old_device != dev && HIP.device!(old_device) + end + end + + try + sleep(60) + catch ex + if ex isa EOFError + # If we get EOF here, it's because Julia is shutting down, + # so we should just exit the loop. + break + else + rethrow() + end + end + end +end + +function mark_pool!(dev::HIP.HIPDevice) + status = pool_status(dev) + if isnothing(status[]) + # Default to `0` which is the default value in HIP. + limit = parse_memory_limit(@load_preference("soft_memory_limit", "0 MiB")) + HIP.attribute!( + HIP.memory_pool(dev), HIP.hipMemPoolAttrReleaseThreshold, limit) + if !isassigned(__pool_cleanup) + __pool_cleanup[] = errormonitor(Threads.@spawn pool_cleanup()) + end + end + status[] = true +end + +struct HIPBuffer <: AbstractAMDBuffer + device::HIPDevice + ptr::Ptr{Cvoid} + bytesize::Int + _id::UInt64 # Unique ID used for refcounting. +end + +# TODO pass device? +function HIPBuffer(bytesize; stream::HIP.HIPStream) + dev = stream.device + bytesize == 0 && return HIPBuffer(dev, C_NULL, 0, _buffer_id!()) + + mark_pool!(dev) + pool = HIP.memory_pool(dev) + + has_limit = HARD_MEMORY_LIMIT != typemax(UInt64) + + ptr_ref = Ref{Ptr{Cvoid}}() + alloc_or_retry!() do + try + # Try to ensure there is enough memory before even trying to allocate. + if has_limit + used = HIP.used_memory(pool) + (used + bytesize) > HARD_MEMORY_LIMIT && + throw(HIP.HIPError(HIP.hipErrorOutOfMemory)) + end + + # Try to allocate. + HIP.hipMallocAsync(ptr_ref, bytesize, stream) |> HIP.check + ptr_ref[] == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory)) + return HSA.STATUS_SUCCESS + catch err + # TODO rethrow if not out of memory error + @debug "hipMallocAsync exception. Requested $(Base.format_bytes(bytesize))." exception=(err, catch_backtrace()) + return HSA.STATUS_ERROR_OUT_OF_RESOURCES + end + end + ptr = ptr_ref[] + @assert ptr != C_NULL "hipMallocAsync resulted in C_NULL for $(Base.format_bytes(bytesize))" + + # TODO do not reclaim (ROCm 5.5+ has hard pool size limit) + if has_limit + if HIP.reserved_memory(pool) > HARD_MEMORY_LIMIT + HIP.reclaim() # TODO do not reclaim all memory + end + @assert HIP.reserved_memory(pool) ≤ HARD_MEMORY_LIMIT + end + + HIPBuffer(dev, ptr, bytesize, _buffer_id!()) +end + +HIPBuffer(ptr::Ptr{Cvoid}, bytesize::Int) = HIPBuffer( + AMDGPU.device(), ptr, bytesize, _buffer_id!()) + +Base.unsafe_convert(::Type{Ptr{T}}, buf::HIPBuffer) where T = convert(Ptr{T}, buf.ptr) + +function view(buf::HIPBuffer, bytesize::Int) + bytesize > buf.bytesize && throw(BoundsError(buf, bytesize)) + HIPBuffer(buf.device, buf.ptr + bytesize, buf.bytesize - bytesize, buf._id) +end + +function free(buf::HIPBuffer; stream::HIP.HIPStream) + buf.ptr == C_NULL && return + HIP.hipFreeAsync(buf, stream) |> HIP.check + return +end + +function upload!(dst::HIPBuffer, src::Ptr, bytesize::Int; stream::HIP.HIPStream) + bytesize == 0 && return nothing + HIP.hipMemcpyHtoDAsync(dst, src, bytesize, stream) |> HIP.check + HIP.HIPEvent(stream) +end + +function download!(dst::Ptr, src::HIPBuffer, bytesize::Int; stream::HIP.HIPStream) + bytesize == 0 && return nothing + HIP.hipMemcpyDtoHAsync(dst, src, bytesize, stream) |> HIP.check + HIP.HIPEvent(stream) +end + +function transfer!(dst::HIPBuffer, src::HIPBuffer, bytesize::Int; stream::HIP.HIPStream) + bytesize == 0 && return nothing + HIP.hipMemcpyDtoDAsync(dst, src, bytesize, stream) |> HIP.check + HIP.HIPEvent(stream) +end + +struct HostBuffer <: AbstractAMDBuffer + ptr::Ptr{Cvoid} + bytesize::Int +end + +HostBuffer() = HostBuffer(C_NULL, 0) + +Base.unsafe_convert(::Type{Ptr{T}}, buf::HostBuffer) where T = convert(Ptr{T}, buf.ptr) + +function HostBuffer(bytesize::Integer, flags = 0) + bytesize == 0 && return HostBuffer() + + ptr_ref = Ref{Ptr{Cvoid}}() + HIP.hipHostMalloc(ptr_ref, bytesize, flags) |> HIP.check + HostBuffer(ptr_ref[], bytesize) +end + +function free(buf::HostBuffer) + buf.ptr == C_NULL && return + HIP.hipHostFree(buf) |> HIP.check + return +end + +# TODO +# - introduce hipPtr. +# - use Base.convert instead of `device_ptr`. +# - define unsafe_copyto! for all buffers instead of upload!, etc. + +function device_ptr(buf::HostBuffer) + buf.ptr == C_NULL && return C_NULL + ptr_ref = Ref{Ptr{Cvoid}}() + HIP.hipHostGetDevicePointer(ptr_ref, buf.ptr, 0) |> HIP.check + ptr_ref[] +end diff --git a/src/runtime/memory/refcount.jl b/src/runtime/memory/refcount.jl new file mode 100644 index 000000000..2b8e3762d --- /dev/null +++ b/src/runtime/memory/refcount.jl @@ -0,0 +1,62 @@ +const _ID_COUNTER = Threads.Atomic{UInt64}(0) +const refcounts = Dict{UInt64, Int}() +const liveness = Dict{UInt64, Bool}() + +function _buffer_id!()::UInt64 + return Threads.atomic_add!(_ID_COUNTER, UInt64(1)) +end + +function refcount(buf::AbstractAMDBuffer) + Base.lock(refcounts_lock) do + get(refcounts, buf._id, 0) + end +end + +function retain(buf::AbstractAMDBuffer) + Base.lock(refcounts_lock) do + live = get!(liveness, buf._id, true) + @assert live "Trying to retain dead buffer!" + count = get!(refcounts, buf._id, 0) + refcounts[buf._id] = count + 1 + end + return +end + +function release(buf::HIPBuffer; stream::HIP.HIPStream) + while !Base.trylock(refcounts_lock) end + try + count = refcounts[buf._id] + @assert count >= 1 "Buffer refcount dropping below 0!" + refcounts[buf._id] = count - 1 + done = count == 1 + + live = liveness[buf._id] + + if done + live && free(buf; stream) + untrack(buf) + end + return done + finally + Base.unlock(refcounts_lock) + end +end + +function free_if_live(buf::HIPBuffer; stream::HIP.HIPStream) + Base.lock(refcounts_lock) do + if liveness[buf._id] + liveness[buf._id] = false + free(buf; stream) + end + end +end + +function untrack(buf::AbstractAMDBuffer) + while !Base.trylock(refcounts_lock) end + try + delete!(liveness, buf._id) + delete!(refcounts, buf._id) + finally + Base.unlock(refcounts_lock) + end +end diff --git a/src/runtime/memory/utils.jl b/src/runtime/memory/utils.jl new file mode 100644 index 000000000..22736c8ca --- /dev/null +++ b/src/runtime/memory/utils.jl @@ -0,0 +1,157 @@ +""" + info() + +Returns a tuple of two integers, indicating respectively the free and total amount of memory +(in bytes) available for allocation on the device. +""" +function info() + free_ref = Ref{Csize_t}() + total_ref = Ref{Csize_t}() + HIP.hipMemGetInfo(free_ref, total_ref) |> HIP.check + return convert(Int, free_ref[]), convert(Int, total_ref[]) +end + +""" + free() + +Returns the free amount of memory (in bytes), available for allocation on the device. +""" +free() = info()[1] + +""" + total() + +Returns the total amount of memory (in bytes), available for allocation on the device. +""" +total() = info()[2] + +""" + used() + +Returns the used amount of memory (in bytes), allocated on the device. +""" +used() = total() - free() + +const ALL_ALLOCS = Threads.Atomic{Int64}(0) + +function parse_memory_limit(limit_str::String) + limit_str == "none" && return typemax(UInt64) + + units = ("%", "MiB", "GiB") + + value, unit = split(limit_str) # TODO check length 2 before split + unit in units || throw(ArgumentError(""" + Memory limit must be specified in `$units` units, but `$unit` was given. + """)) + + total_memory = total() + limit = if unit == "%" + v = parse(Int, value) + 0 < v ≤ 100 || throw(ArgumentError(""" + Invalid percentage value for memory limit `$v`. + Must be in (0, 100] range or 'none'. + """)) + floor(UInt64, total_memory * (v / 100)) + else + scale = unit == "MiB" ? (1024^2) : (1024^3) + parse(UInt64, value) * scale + end + + limit > total_memory && throw(ArgumentError(""" + Memory limit `$(Base.format_bytes(limit))` is bigger than the actual memory `$(Base.format_bytes(total_memory))`. + Set to `none` to disable memory limit. + """)) + + limit +end + +""" +Set a hard limit for total GPU memory allocations. +""" +set_memory_alloc_limit!(limit::String) = + @set_preferences!("hard_memory_limit" => limit) + +const HARD_MEMORY_LIMIT = parse_memory_limit( + @load_preference("hard_memory_limit", "none")) + +function alloc_or_retry!(f) + status = f() + status == HSA.STATUS_SUCCESS && return + + stream = AMDGPU.stream() + + phase = 1 + while true + if phase == 1 + HIP.synchronize(stream) + elseif phase == 2 + HIP.device_synchronize() + elseif phase == 3 + GC.gc(false) + HIP.device_synchronize() + elseif phase == 4 + GC.gc(true) + HIP.device_synchronize() + elseif phase == 5 + HIP.trim(HIP.memory_pool(stream.device)) + else + break + end + phase += 1 + + status = f() + status == HSA.STATUS_SUCCESS && break + end + + if status != HSA.STATUS_SUCCESS + pool = HIP.memory_pool(stream.device) + @warn """ + Failed to successfully execute function and free resources for it. + Reporting current memory usage: + - HIP pool used: $(Base.format_bytes(HIP.used_memory(pool))). + - HIP pool reserved: $(Base.format_bytes(HIP.reserved_memory(pool))). + - Hard memory limit: $(Base.format_bytes(HARD_MEMORY_LIMIT)). + """ + end + + check(status) + return +end + +""" +Allocate linear memory on the device and return a buffer to the allocated memory. The +allocated memory is suitably aligned for any kind of variable. The memory will not be freed +automatically, use [`free(::Buffer)`](@ref) for that. +""" +function alloc end + +""" +Free device memory. +""" +function free end + +""" +Initialize device memory with a repeating value. +""" +function set! end + +""" +Upload memory from host to device. +Executed asynchronously on `queue` if `async` is true. +""" +function upload end +@doc (@doc upload) upload! + +""" +Download memory from device to host. +Executed asynchronously on `queue` if `async` is true. +""" +function download end +@doc (@doc download) download! + +""" +Transfer memory from device to device. +Executed asynchronously on `queue` if `async` is true. +""" +function transfer end +@doc (@doc transfer) transfer! diff --git a/src/runtime/queue.jl b/src/runtime/queue.jl deleted file mode 100644 index 80277f2f9..000000000 --- a/src/runtime/queue.jl +++ /dev/null @@ -1,331 +0,0 @@ -mutable struct ROCQueue - device::ROCDevice - queue::Ptr{HSA.Queue} - priority::Symbol - status::HSA.Status - @atomic active::Bool - active_kernels::LinkedList # TODO: Concrete type - running::Base.Event - lock::Threads.ReentrantLock -end - -""" - ROCQueue(; priority::Symbol=:normal, pooled::Bool=false) - -Create an HSA queue on the currently active device. - -!!! note - Users are encouraged to use this method, - instead of manually providing device since this one - correctly handles device changes. -""" -function ROCQueue(; priority::Symbol=:normal, pooled::Bool=false) - ROCQueue(AMDGPU.device(); priority, pooled) -end - -get_handle(queue::ROCQueue) = reinterpret(Ptr{Cvoid}, queue.queue) - -function Base.show(io::IO, queue::ROCQueue) - print(io, "ROCQueue(device=$(queue.device), ptr=$(repr(UInt(queue.queue))), priority=$(queue.priority), status=$(queue.status), active=$(queue.active), running=$(queue.running.set))") -end - -const QUEUES = Dict{Ptr{HSA.Queue}, WeakRef}() - -function queue_error_handler( - status::HSA.Status, _queue::Ptr{HSA.Queue}, queue_obj_ptr::Ptr{Cvoid}, -)::Nothing - if status != HSA.STATUS_SUCCESS - queue::ROCQueue = unsafe_pointer_to_objref(queue_obj_ptr) - queue.status = status - end - nothing -end - -struct QueueError <: Exception - queue::ROCQueue - exception::Union{Exception,Nothing} -end -function QueueError(queue::ROCQueue) - err = if queue.status != HSA.STATUS_SUCCESS - HSAError(queue.status) - else - nothing - end - return QueueError(queue, err) -end -function Base.showerror(io::IO, err::QueueError) - queue = err.queue - println(io, "QueueError(Queue $(repr(reinterpret(UInt64, queue.queue))) on $(queue.device)) due to:") - if err.exception !== nothing - Base.showerror(io, err.exception) - else - print(io, "Queue was killed") - end - println(io); print(io, "You can select a new queue with `AMDGPU.reset_dead_queue!()`") -end - -mutable struct QueuePool - pool::Dict{ROCDevice,Dict{Symbol,Vector{ROCQueue}}} - max_size::NTuple{3, Int} - idx::Int -end -QueuePool() = QueuePool( - Dict{ROCDevice,Dict{Symbol,Vector{ROCQueue}}}(), - (@load_preference("queue_pool_max_size", [12, 1, 1])...,), - 0) - -const QUEUE_POOL = LockedObject(QueuePool()) - -""" - set_queue_pool_size!(nums::NTuple{3, Int}) - -Set HSA queue pool max size for each priority. -Restart Julia session for the changes to take effect. - -# Arguments: - -- `nums::NTuple{3, Int}`: Maximum number of queues for `:normal`, - `:low` and `:high` priority. - Providing `0` for specific priority, disables pool for it. -""" -function set_queue_pool_size!(nums::NTuple{3, Int}) - @set_preferences!("queue_pool_max_size" => [nums...]) - @info """Successfully set queue pool max size to `$nums` (:normal, :low, :high). - Reset your Julia session for the changes to take effect.""" -end - -function get_pool_queue!(device::ROCDevice, priority::Symbol) - prio_idx = priority == :normal ? 1 : (priority == :low ? 2 : 3) - - lock(QUEUE_POOL) do pool - device_pool = get!(() -> Dict{Symbol, Vector{ROCQueue}}(), pool.pool, device) - queues = get!(() -> ROCQueue[], device_pool, priority) - length(queues) < pool.max_size[prio_idx] && return nothing - - # If all queues are allocated, pick next one. - idx = pool.idx % length(queues) + 1 - pool.idx += 1 - - queue = queues[idx] - queue.active && return queue - - @debug "Removing dead queue from pool" - deleteat!(queues, idx) - return nothing - end -end - -function pool_queue!(queue::ROCQueue) - prio_idx = queue.priority == :normal ? 1 : (queue.priority == :low ? 2 : 3) - QUEUE_POOL.payload.max_size[prio_idx] == 0 && return false - - lock(QUEUE_POOL) do pool - queues = pool.pool[queue.device][queue.priority] - length(queues) < pool.max_size[prio_idx] ? - (push!(queues, queue); true) : - false - end -end - -function remove_pooled_queue!(queue::ROCQueue) - lock(QUEUE_POOL) do pool - device_pool = get(pool.pool, queue.device, nothing) - isnothing(device_pool) && return - - queues = get(device_pool, queue.priority, nothing) - isnothing(queues) && return - - idx = findfirst(q -> q === queue, queues) - isnothing(idx) || deleteat!(queues, idx) - end -end - -device_queue_max_size(device::AnyROCDevice) = - getinfo(UInt32, device, HSA.AGENT_INFO_QUEUE_MAX_SIZE) - -device_queue_type(device::AnyROCDevice) = - getinfo(HSA.QueueType, device, HSA.AGENT_INFO_QUEUE_TYPE) - -""" - ROCQueue(device::ROCDevice; priority::Symbol=:normal, pooled::Bool=false) - -Create an HSA queue which will be used to -instruct GPU hardware which kernels to launch. - -Each queue, spawns an error monitoring thread that's responsible -for actually waiting on kernels and performing a cleanup after -kernel finished its execution. - -!!! note "Oversubscribed Command Queues in GPUs" - Be careful, with the number of HSA queues in use. - When the number of allocated HSA queues is greater than - the number of hardware queues, the GPU wastes significant time - rotating between all allocated queues in search of ready tasks. - -# Arguments: - -- `device::ROCDevice`: Device on which to create queue. -- `priority::Symbol`: Queue's priority. Can be `:normal`, `:low`, `:high`. -- `pooled::Bool`: Whether to use pool when creating queues. - When `true`, queues are drawn from it on creation - and returned to pool instead of destroyed. -""" -function ROCQueue(device::ROCDevice; priority::Symbol=:normal, pooled::Bool=false) - if !in(priority, (:normal, :low, :high)) - throw(ArgumentError( - "Invalid queue priority: $priority\n" * - "Options are :low, :normal, :high")) - end - - queue = pooled ? get_pool_queue!(device, priority) : nothing - isnothing(queue) || return queue - - alloc_id = rand(UInt64) - @log_start(:alloc_queue, (;alloc_id), (;device=get_handle(device), priority)) - - # Allocate a new queue from HSA. - c_queue_error_handler = @cfunction(queue_error_handler, - Cvoid, (HSA.Status, Ptr{HSA.Queue}, Ptr{Cvoid})) - - queue_size = device_queue_max_size(device) - queue_type = device_queue_type(device) - @assert queue_size > 0 - @assert queue_type == HSA.QUEUE_TYPE_MULTI - - # Create ROCQueue before HSA queue to be able to pass it to error handler. - queue = ROCQueue( - device, Ptr{HSA.Queue}(0), priority, HSA.STATUS_SUCCESS, true, - LinkedList{ROCKernelSignal}(), Base.Event(), Threads.ReentrantLock()) - - # Create HSA queue. - r_queue = Ref{Ptr{HSA.Queue}}() - HSA.queue_create( - device.agent, queue_size, queue_type, - c_queue_error_handler, pointer_from_objref(queue), - typemax(UInt32), typemax(UInt32), r_queue) |> check - - AMDGPU.hsaref!() - queue.queue = r_queue[] - - lock(RT_LOCK) do - @assert !haskey(QUEUES, queue.queue) - QUEUES[queue.queue] = WeakRef(queue) - end - - HSA.amd_queue_set_priority(queue.queue, hsa_priority(priority)) |> check - errormonitor(Threads.@spawn monitor_queue(queue)) - - finalizer(queue) do q - kill_queue!(q) - AMDGPU.hsaunref!() - end - - pooled && pool_queue!(queue) - @log_finish(:alloc_queue, (;alloc_id), (;queue=reinterpret(UInt64, queue.queue))) - return queue -end - -function hsa_priority(priority::Symbol) - if priority == :normal - HSA.AMD_QUEUE_PRIORITY_NORMAL - elseif priority == :low - HSA.AMD_QUEUE_PRIORITY_LOW - elseif priority == :high - HSA.AMD_QUEUE_PRIORITY_HIGH - end -end - -function monitor_queue(queue::ROCQueue) - kerns = queue.active_kernels::LinkedList{ROCKernelSignal} - while queue.active || length(kerns) > 0 - # Fetch oldest signal, if any - sig = lock(queue.lock) do - if length(kerns) > 0 - # Notify waiters that queue is running - notify(queue.running) - return first(kerns) - else - # Reset event - reset(queue.running) - return nothing - end - end - - # Wait for signal completion or new launches - if sig !== nothing - try - wait(sig; check_exceptions=true, cleanup=true) - catch err - @debug "Kernel exception" exception=(err,catch_backtrace()) - end - # Move to the next kernel. - Base.@lock queue.lock begin - kerns = next!(kerns) - end - else - wait(queue.running) - end - end -end - -function ensure_active(queue::ROCQueue) - @label check - if !queue.active - throw(QueueError(queue)) - elseif queue.status != HSA.STATUS_SUCCESS - # We track status updates from the queue callback - kill_queue!(queue) - @goto check - end -end - -""" -Determine if there are active kernels for the queue. -If not, we can re-use it. -""" -function has_active_kernels(q::ROCQueue) - lock(q.lock) do - return !isempty(q.active_kernels) - end -end - -""" - kill_queue!(queue::ROCQueue) - -Kill `queue` and propagate queue error to -all waiter signals in case if there is one. - -If queue is in the pool, it will be removed from it. - -!!! note - No need to manually call this function during regular use, - it will be called automatically from [`ROCQueue`](@ref) finalizer. -""" -function kill_queue!(queue::ROCQueue) - _, succ = @atomicreplace queue.active true => false - succ || return - - # TODO: Eliminate race from active=false to setting exception - - @log_start(:kill_queue!, (;queue=reinterpret(UInt64, queue.queue)), nothing) - remove_pooled_queue!(queue) - - lock(RT_LOCK) do - delete!(QUEUES, queue.queue) - end - lock(queue.lock) do - # Send exception to all waiter signals - if queue.status != HSA.STATUS_SUCCESS - err = QueueError(queue) - for kersig in queue.active_kernels::LinkedList{ROCKernelSignal} - kersig::ROCKernelSignal - kersig.exception = err - notify(kersig) - end - end - end - - HSA.queue_destroy(queue.queue) |> check - @log_finish(:kill_queue!, (;queue=reinterpret(UInt64, queue.queue)), nothing) - return -end diff --git a/src/runtime/signal.jl b/src/runtime/signal.jl deleted file mode 100644 index 7b37ead73..000000000 --- a/src/runtime/signal.jl +++ /dev/null @@ -1,131 +0,0 @@ -const DEFAULT_SIGNAL_TIMEOUT = Ref{Union{Float64, Nothing}}(nothing) - -const SIGNAL_TIMEOUT_KILL_QUEUE = Ref{Bool}(true) - -struct SignalPool - pool::Set{HSA.Signal} - max_size::Int -end -SignalPool() = SignalPool(Set{HSA.Signal}(), @load_preference("signal_pool_max_size", 128)) - -const SIGNAL_POOL = LockedObject(SignalPool()) - -function set_signal_pool_size!(num::Integer) - @set_preferences!("signal_pool_max_size" => num) - @info """Successfully set signal pool max size to `$num`. - Reset your Julia session for the changes to take effect.""" -end - -function get_pool_signal!()::Union{HSA.Signal, Nothing} - lock(SIGNAL_POOL) do pool - isempty(pool.pool) ? nothing : pop!(pool.pool) - end -end - -""" -Return `true` if destroyed a signal, otherwise `false`. -If `destroy=true` then destroy signal immediately. -""" -function free_pool_signal!(signal::HSA.Signal; destroy::Bool)::Bool - destroy && (check(HSA.signal_destroy(signal)); return true) - lock(SIGNAL_POOL) do pool - destroy = length(pool.pool) < pool.max_size - destroy ? - check(HSA.signal_destroy(signal)) : - push!(pool.pool, signal) - destroy - end -end - -mutable struct ROCSignal - signal::HSA.Signal -end -Adapt.adapt_structure(::Adaptor, sig::ROCSignal) = sig.signal - -struct SignalTimeoutException <: Exception - signal::ROCSignal -end - -""" - ROCSignal(init::Integer = 1; pooled::Bool=true, ipc::Bool=false) -> ROCSignal - -Acquires an HSA signal object which can be used to communicate values between -the host and device. - -- `pooled::Bool`: If `true`, the signal may be taken from an existing pool of - signals; if `false`, or if the pool is empty, the signal is allocated from HSA. -- `ipc::Bool`: If `true`, signal may be used for interprocess communication. - IPC signals can be read, written, and waited on from any process. - Disables signal pooling when `true`. -""" -function ROCSignal(init::Int64 = 1; pooled::Bool = true, ipc::Bool = false) - pooled = ipc ? false : pooled - raw_signal = pooled ? get_pool_signal!() : nothing - - if isnothing(raw_signal) - signal_ref = Ref{HSA.Signal}() - check(ipc ? - HSA.amd_signal_create(init, 0, C_NULL, HSA.AMD_SIGNAL_IPC, signal_ref) : - HSA.signal_create(init, 0, C_NULL, signal_ref)) - raw_signal = signal_ref[] - else - HSA.signal_store_relaxed(raw_signal, init) |> check - end - - AMDGPU.hsaref!() - signal = ROCSignal(raw_signal) - finalizer(signal) do signal - # Destroy if not using pool, otherwise return to pool. - destroyed = free_pool_signal!(signal.signal; destroy=!pooled) - destroyed && AMDGPU.hsaunref!() - end - signal -end - -get_handle(signal::ROCSignal) = signal.signal.handle - -load_acquire(signal::ROCSignal) = HSA.signal_load_scacquire(signal.signal) - -Base.isdone(signal::ROCSignal) = load_acquire(signal) < 1 - -Base.show(io::IO, signal::ROCSignal) = - print(io, "ROCSignal($(repr(get_handle(signal))))") - -function Base.wait( - signal::ROCSignal; timeout::Union{Real, Nothing} = DEFAULT_SIGNAL_TIMEOUT[], - min_latency::Int64 = 1_000, #= 1 micro-second =# - queue = nothing, -) - has_timeout = !isnothing(timeout) - has_timeout && (timeout < 0) && error( - "Timeout `$timeout` must be a positive real value or `nothing`.") - - start_time = time_ns() - finished = false - - GC.@preserve signal while !finished - finished = 0 == HSA.signal_wait_scacquire( - signal.signal, HSA.SIGNAL_CONDITION_LT, 1, - min_latency, HSA.WAIT_STATE_BLOCKED) - - if has_timeout && !finished - diff_time = (time_ns() - start_time) / 1e9 - (diff_time > timeout) && throw(SignalTimeoutException(signal)) - end - - if queue !== nothing - ensure_active(queue) - end - - # Allow another scheduled task to run. - # This is especially needed in the case - # when kernels need to perform HostCalls. - yield() - end -end - -function Base.wait(signal::HSA.Signal; timeout = DEFAULT_SIGNAL_TIMEOUT[]) - wait(ROCSignal(signal); timeout) -end - -Base.notify(signal::ROCSignal) = HSA.signal_store_screlease(signal.signal, 0) diff --git a/src/runtime/sync.jl b/src/runtime/sync.jl deleted file mode 100644 index 0f3579b88..000000000 --- a/src/runtime/sync.jl +++ /dev/null @@ -1,38 +0,0 @@ -import ..AMDGPU: hip_configured - -"Tracks HSA signals and HIP streams to sync against." -struct SyncState - signals::Vector{ROCKernelSignal} - streams::Vector{Ptr{Cvoid}} - lock::Threads.ReentrantLock -end -SyncState() = SyncState(ROCKernelSignal[], Ptr{Cvoid}[], Threads.ReentrantLock()) - -struct WaitAdaptor end -struct MarkAdaptor{S} - s::S -end - -function wait!(ss::SyncState) - lock(ss.lock) do - # FIXME: Use barrier_and on dedicated queue - foreach(wait, ss.signals) - empty!(ss.signals) - @static if hip_configured - for s in ss.streams - AMDGPU.HIP.@check AMDGPU.HIP.hipStreamSynchronize(s) - end - empty!(ss.streams) - end - end - return -end -mark!(ss::SyncState, signal::ROCKernelSignal) = - lock(()->push!(ss.signals, signal), ss.lock) -mark!(ss::SyncState, stream::Ptr{Cvoid}) = - lock(()->push!(ss.streams, stream), ss.lock) -mark!(ss::SyncState, stream::HIP.HIPStream) = - mark!(ss, stream.stream) - -wait!(x) = Adapt.adapt(WaitAdaptor(), x) -mark!(x, s) = Adapt.adapt(MarkAdaptor(s), x) # TODO constrain type of `s` diff --git a/src/runtime/thread-utils.jl b/src/thread-utils.jl similarity index 99% rename from src/runtime/thread-utils.jl rename to src/thread-utils.jl index 0517d9196..fb85eae06 100644 --- a/src/runtime/thread-utils.jl +++ b/src/thread-utils.jl @@ -1,5 +1,3 @@ -import ..LLVM - ## Lazy Initialization # Borrowed from CUDA.jl @@ -55,8 +53,6 @@ end ## Memoization # Borrowed from CUDA.jl -export @memoize - """ @memoize [key::T] [maxlen=...] begin # expensive computation diff --git a/src/tls.jl b/src/tls.jl index ef3cca0eb..abe4581bf 100644 --- a/src/tls.jl +++ b/src/tls.jl @@ -1,37 +1,22 @@ struct TaskLocalState - device::ROCDevice + device::HIPDevice context::HIPContext - queues::Vector{Union{ROCQueue,Nothing}} streams::Vector{Union{HIPStream,Nothing}} priority::Symbol end -function TaskLocalState(device::Union{ROCDevice,Nothing}, - context::Union{HIPContext,Nothing}, - queue::Union{ROCQueue,Nothing}, - stream::Union{HIPStream,Nothing}, - priority::Symbol) +function TaskLocalState( + device::Union{HIPDevice,Nothing}, context::Union{HIPContext,Nothing}, + stream::Union{HIPStream,Nothing}, priority::Symbol, +) if device === nothing - if queue === nothing - device = Runtime.get_default_device() - queue = ROCQueue(device; priority, pooled=true) - else - device = AMDGPU.device(queue) - end - else - if queue === nothing - queue = ROCQueue(device; priority, pooled=true) - else - queue.device == device || throw(ArgumentError(""" - Provided ROCQueue is on a differen device `$(queue.device)` - from the default one `$device`. - """)) - @assert queue.priority == priority - end + # TODO get from stream if provided + device = Runtime.get_default_device() end if context === nothing - context = HIPContext(device_id(device)) + context = HIPContext(device) end HIP.context!(context) # Switches HIP active device as well. + if stream === nothing stream = HIPStream(priority) else @@ -41,45 +26,27 @@ function TaskLocalState(device::Union{ROCDevice,Nothing}, """)) @assert stream.priority == priority end - queues = Union{ROCQueue,Nothing}[nothing for _ in 1:length(devices())] - streams = Union{HIPStream,Nothing}[nothing for _ in 1:length(devices())] - queues[device_id(device)] = queue + streams = Union{HIPStream, Nothing}[nothing for _ in 1:length(devices())] streams[device_id(device)] = stream - return TaskLocalState(device, context, queues, streams, priority) + return TaskLocalState(device, context, streams, priority) end -TaskLocalState() = TaskLocalState(nothing, nothing, nothing, nothing, :normal) +TaskLocalState() = TaskLocalState(nothing, nothing, nothing, :normal) function Base.getproperty(state::TaskLocalState, field::Symbol) - # Helpers to return active queue or stream - if field == :queue - return state.queues[device_id(state.device)]::ROCQueue - elseif field == :stream + # Helpers to return active stream + if field == :stream return state.streams[device_id(state.device)]::HIPStream else return getfield(state, field) end end -Base.copy(state::TaskLocalState) = - TaskLocalState(state.device, - state.context, - copy(state.queues), - copy(state.streams), - state.priority) - -function reset_dead_queue!() - state = task_local_state() - queue = state.queue - if !queue.active - queue = state.queues[device_id(state.device)] = ROCQueue(state.device; priority=state.priority) - end - return queue -end +Base.copy(state::TaskLocalState) = TaskLocalState( + state.device, state.context, copy(state.streams), state.priority) function Base.show(io::IO, state::TaskLocalState) println(io, "TaskLocalState:") println(io, " Device: $(state.device)") println(io, " HIP Context: $(state.context)") - println(io, " HSA Queue: $(state.queue)") println(io, " HIP Stream: $(state.stream)") print(io, " Priority: $(state.priority)") end @@ -88,42 +55,36 @@ end task_local_state() -> TaskLocalState Returns the task-local state in the form of a `TaskLocalState`. Automatically -picks a device, context, queue, and stream if they haven't already been selected. +picks a device, context, and stream if they haven't already been selected. """ task_local_state()::TaskLocalState = get!(()->TaskLocalState(), task_local_storage(), :AMDGPU) """ - task_local_state!(; device=nothing, context=nothing, queue=nothing, stream=nothing, priority::Symbol=:normal) + task_local_state!(; device=nothing, context=nothing, stream=nothing, priority::Symbol=:normal) -Sets the task-local device, queue (with the specified priority), and HIP stream. If -`device`, `queue`, or `stream` is `nothing` and an existing task-local state has been -configured, then those values are retrived from the existing state (unless the -`priority` has changed, in which case a new queue is selected); if no -task-local state has been configured, then defaults are used when `nothing` is -supplied. +Sets the task-local device and HIP stream. +If `device`, , or `stream` is `nothing` and an existing task-local state +has been configured, then those values are retrived from the existing state +(unless the `priority` has changed, in which case a new stream is selected); +if no task-local state has been configured, then defaults are used +when `nothing` is supplied. -Note that these are only task-local defaults; when a device, queue or stream is +Note that these are only task-local defaults; when a device or stream is manually passed to an AMDGPU operation (such as `@roc`), then the task-local value is ignored in favor of the passed argument. """ -function task_local_state!(; device=nothing, queue=nothing, stream=nothing, priority::Symbol=:normal) +function task_local_state!(; device=nothing, stream=nothing, priority::Symbol=:normal) if haskey(task_local_storage(), :AMDGPU) old_state = task_local_state() if device === nothing device = old_state.device context = old_state.context else - context = HIPContext(device_id(device)) + context = HIPContext(device) end HIP.context!(context) - if queue === nothing - if priority == old_state.priority && old_state.queues[device_id(device)] !== nothing - queue = old_state.queues[device_id(device)] - else - queue = ROCQueue(device; priority, pooled=true) - end - end + if stream === nothing if priority == old_state.priority && old_state.streams[device_id(device)] !== nothing stream = old_state.streams[device_id(device)] @@ -131,46 +92,41 @@ function task_local_state!(; device=nothing, queue=nothing, stream=nothing, prio stream = HIPStream(priority) end end - queues = copy(old_state.queues) streams = copy(old_state.streams) else # TODO Use default constructor? if device === nothing device = Runtime.get_default_device() end + context = HIPContext(device_id(device)) HIP.context!(context) - if queue === nothing - queue = ROCQueue(device; priority) - end if stream === nothing stream = HIPStream(priority) end - queues = Union{ROCQueue,Nothing}[nothing for _ in 1:length(devices())] streams = Union{HIPStream,Nothing}[nothing for _ in 1:length(devices())] end - queues[device_id(device)] = queue - streams[device_id(device)] = stream - new_state = TaskLocalState(device, context, queues, streams, priority) + streams[device_id(device)] = stream + new_state = TaskLocalState(device, context, streams, priority) task_local_storage(:AMDGPU, new_state) end task_local_state!(state::TaskLocalState) = task_local_storage(:AMDGPU, state) """ - task_local_state!(f::Base.Callable; device=nothing, queue=nothing, stream=nothing, priority::Symbol=:normal) + task_local_state!(f::Base.Callable; device=nothing, stream=nothing, priority::Symbol=:normal) Executes `f` with the given task-local state, and when finished, resets the state back to previous values and returns the result of `f()`. """ -function task_local_state!(f::Base.Callable; - device=nothing, queue=nothing, stream=nothing, - priority::Symbol=:normal) +function task_local_state!( + f::Base.Callable; device=nothing, stream=nothing, priority::Symbol=:normal, +) restore = haskey(task_local_storage(), :AMDGPU) if restore old_state = task_local_state() end - task_local_state!(; device, queue, stream, priority) + task_local_state!(; device, stream, priority) return try f() @@ -178,7 +134,7 @@ function task_local_state!(f::Base.Callable; if restore task_local_state!(old_state) else - # We want a fresh state with pooled queues and default priority + # We want a fresh state and default priority delete!(task_local_storage(), :AMDGPU) task_local_state!() end diff --git a/src/utils.jl b/src/utils.jl index a6a4b9bb1..782ee4a96 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -12,12 +12,11 @@ function hsa_version() end function versioninfo(io::IO=stdout) - println("Using ROCm provided by: ", use_artifacts ? "JLLs" : "System") + println("Using ROCm provided by: ", use_artifacts() ? "JLLs" : "System") println("HSA Runtime ($(functional(:hsa) ? "ready" : "MISSING"))") if functional(:hsa) println("- Path: $libhsaruntime_path") println("- Version: $(hsa_version())") - #println("- Initialized: $(repr(HSA_REFCOUNT[] > 0))") end println("ld.lld ($(functional(:lld) ? "ready" : "MISSING"))") if functional(:lld) @@ -25,13 +24,11 @@ function versioninfo(io::IO=stdout) end println("ROCm-Device-Libs ($(functional(:device_libs) ? "ready" : "MISSING"))") if functional(:device_libs) - println("- Path: $device_libs_path") - # TODO: println("- Version: $(device_libs_version)") + println("- Path: $libdevice_libs") end println("HIP Runtime ($(functional(:hip) ? "ready" : "MISSING"))") if functional(:hip) - println("- Path: $libhip_path") - # TODO: println("- Version: $(libhip_version)") + println("- Path: $libhip") end println("rocBLAS ($(functional(:rocblas) ? "ready" : "MISSING"))") if functional(:rocblas) @@ -63,7 +60,7 @@ function versioninfo(io::IO=stdout) end if functional(:hsa) - println("HSA Agents ($(length(Runtime.devices()))):") + println("HIP Devices ($(length(Runtime.devices()))):") for device in Runtime.devices() println("- ", repr(device)) end @@ -155,11 +152,8 @@ function functional(component::Symbol) end function has_rocm_gpu() - if !functional(:hsa) - return false - else - return length(devices(:gpu)) > 0 - end + (functional(:hsa) && functional(:hip)) || return false + return length(devices()) > 0 end function print_build_diagnostics() diff --git a/test/codegen/trap.jl b/test/codegen/trap.jl index 5d3043d97..ee75fd2d8 100644 --- a/test/codegen/trap.jl +++ b/test/codegen/trap.jl @@ -1,19 +1,18 @@ -if !IS_NAVI_2 - @testset "Trapping" begin - function trapkern() - Device.trap() - nothing - end - function debugtrapkern() - Device.debugtrap() - nothing - end - - iob = IOBuffer() - AMDGPU.code_gcn(iob, trapkern, Tuple{}; kernel=true) - @test occursin("s_trap 2", String(take!(iob))) - iob = IOBuffer() - AMDGPU.code_gcn(iob, debugtrapkern, Tuple{}; kernel=true) - @test occursin("s_trap 3", String(take!(iob))) +@testset "Trapping" begin + function trapkern() + Device.trap() + nothing + end + function debugtrapkern() + Device.debugtrap() + nothing end + + iob = IOBuffer() + AMDGPU.code_gcn(iob, trapkern, Tuple{}; kernel=true) + @test occursin("s_trap 2", String(take!(iob))) + + iob = IOBuffer() + AMDGPU.code_gcn(iob, debugtrapkern, Tuple{}; kernel=true) + @test occursin("s_trap 3", String(take!(iob))) end diff --git a/test/device/array.jl b/test/device/array.jl index ef76ee2e3..34c97deaa 100644 --- a/test/device/array.jl +++ b/test/device/array.jl @@ -19,8 +19,4 @@ @test occursin("4×4 device array at", sprint(io->show(io, RD))) @test occursin("2×2 device array view", sprint(io->show(io, RD_view))) @test occursin("4×4 device array wrapper Adjoint", sprint(io->show(io, RD_adj))) - - # Custom hash methods are defined - @test AMDGPU.Runtime.khash(RD) isa UInt # test that hashing doesn't segfault - @test AMDGPU.Runtime.khash(RD_view) isa UInt # test that SubArray hashing works end diff --git a/test/device/deps.jl b/test/device/deps.jl deleted file mode 100644 index facd626be..000000000 --- a/test/device/deps.jl +++ /dev/null @@ -1,73 +0,0 @@ -@testset "Kernel Dependencies" begin - function kernel(sig, waitval, A, val) - i = workitemIdx().x - AMDGPU.Device.hostcall_device_signal_wait(sig, waitval) - A[i] = val - return nothing - end - - @testset "Barrier AND" begin - for i in (0, 1, 5, 7) - @testset "$i inputs" begin - RA = ROCArray(zeros(Float64, 1)) - sig = AMDGPU.ROCSignal(0) - - # Disable wait and mark because: - # - We need the kernels (ret1 vs ret2) to race - # - We're accessing RA before the kernels are complete - ret1 = map(1:i) do _ - @roc wait=false mark=false kernel(sig, 3, RA, 1.0) - end - - retb = AMDGPU.barrier_and!(ret1) - - ret2 = @roc wait=false mark=false kernel(sig, 0, RA, 2.0) - - if i > 0 - sleep(0.5) - @test Array(RA)[1] == 0.0 - HSA.signal_store_screlease(sig.signal, 3) - wait.(ret1) - @test Array(RA)[1] == 1.0 - end - HSA.signal_store_screlease(sig.signal, 0) - # FIXME: wait(retb) - wait(ret2) - @test Array(RA)[1] == 2.0 - end - end - end - - #= FIXME - @testset "Barrier OR" begin - for i in (0, 1, 5, 7) - @testset "$i inputs" begin - RA = ROCArray(zeros(Float64, 1)) - sig = AMDGPU.ROCSignal(0) - - ret1 = [@roc(kernel(sig, 7, RA, 5.0)) for _ in 1:i] - pushfirst!(ret1, @roc(kernel(sig, 3, RA, 1.0))) - - retb = AMDGPU.barrier_or!(ret1) - ret2 = @roc kernel(sig, 0, RA, 2.0) - - if i > 0 - sleep(0.5) - @test Array(RA)[1] == 0.0 - HSA.signal_store_release(sig.signal, 3) - - wait(ret1[1]) - @test Array(RA)[1] == 1.0 - end - HSA.signal_store_screlease(sig.signal, 0) - sleep(0.5) - @test Array(RA)[1] == 2.0 - wait(ret2) - # FIXME: wait(retb) - # clear waiting kernels - HSA.signal_store_screlease(sig.signal, 7) - end - end - end - =# -end diff --git a/test/device/exceptions.jl b/test/device/exceptions.jl index 8fb47dbf9..10c38c49a 100644 --- a/test/device/exceptions.jl +++ b/test/device/exceptions.jl @@ -1,19 +1,18 @@ @testset "Exceptions" begin - -function oob_kernel(X) - X[0] = 1 - nothing -end - -RA = ROCArray(ones(Float32, 4)) -try - wait(@roc oob_kernel(RA)) -catch err - @test err isa Runtime.KernelException - if err isa Runtime.KernelException - @test err.exstr !== nothing - @test occursin("Out-of-bounds array access", err.exstr) + function oob_kernel(X) + X[0] = 1 + nothing end -end + RA = ROCArray(ones(Float32, 4)) + @roc oob_kernel(RA) + try + AMDGPU.synchronize() + catch err + @test err isa ErrorException + finally + AMDGPU.reset_exception_holder!(AMDGPU.device()) + end + # TODO check exception message + # TODO check specific exception type end diff --git a/test/device/execution_control.jl b/test/device/execution_control.jl index cacf53d12..27f1dc036 100644 --- a/test/device/execution_control.jl +++ b/test/device/execution_control.jl @@ -1,66 +1,51 @@ @testset "Execution Control Intrinsics" begin + @testset "sendmsg/sendmsghalt/endpgm" begin + function exec_ctl_kernel() + Device.sendmsg(5) + Device.sendmsghalt(6) + Device.endpgm() + end -@testset "Completion Signal" begin - function completion_signal_kernel(X) - X[1] = AMDGPU.Device._completion_signal() - nothing - end - - RA = ROCArray(rand(UInt64, 1)) - - ev = @roc completion_signal_kernel(RA) - wait(ev) - @test Array(RA)[1] == ev.signal.signal.handle -end - -@testset "sendmsg/sendmsghalt/endpgm" begin - function exec_ctl_kernel() - Device.sendmsg(5) - Device.sendmsghalt(6) - Device.endpgm() + iob = IOBuffer() + AMDGPU.code_native(iob, exec_ctl_kernel, Tuple{}) + str = String(take!(iob)) + @test occursin("s_sendmsg ", str) + @test occursin("s_sendmsghalt ", str) + # TODO: Can't easily count these, since they're automatically inserted + @test occursin("s_endpgm", str) end - iob = IOBuffer() - AMDGPU.code_native(iob, exec_ctl_kernel, Tuple{}) - str = String(take!(iob)) - @test occursin("s_sendmsg ", str) - @test occursin("s_sendmsghalt ", str) - # TODO: Can't easily count these, since they're automatically inserted - @test occursin("s_endpgm", str) -end - -@testset "device_sleep/memtime/memrealtime" begin - function time_kernel(X) - t1 = AMDGPU.Device.memtime() - tr1 = AMDGPU.Device.memrealtime() - AMDGPU.Device.device_sleep(Int32(1)) - t2 = AMDGPU.Device.memtime() - tr2 = AMDGPU.Device.memrealtime() - X[1] = t2 > t1 - X[2] = tr2 > tr1 - return + @testset "device_sleep/memtime/memrealtime" begin + function time_kernel(X) + t1 = AMDGPU.Device.memtime() + tr1 = AMDGPU.Device.memrealtime() + AMDGPU.Device.device_sleep(Int32(2)) + t2 = AMDGPU.Device.memtime() + tr2 = AMDGPU.Device.memrealtime() + X[1] = t2 > t1 + X[2] = tr2 > tr1 + return + end + RX = ROCArray(zeros(Bool, 2)) + @roc time_kernel(RX) + @test all(Array(RX)) end - RX = ROCArray(zeros(Bool, 2)) - wait(@roc time_kernel(RX)) - @test all(Array(RX)) -end -@testset "readfirstlane" begin - function readfirstlane_kernel(B, A) - idx = workitemIdx().x - if idx > 1 - B[idx] = AMDGPU.Device.readfirstlane(A[idx]) - else - B[idx] = A[idx] + @testset "readfirstlane" begin + function readfirstlane_kernel(B, A) + idx = workitemIdx().x + if idx > 1 + B[idx] = AMDGPU.Device.readfirstlane(A[idx]) + else + B[idx] = A[idx] + end + return end - return + RB = ROCArray(zeros(Int32, 8)) + RA = ROCArray(Int32(1):Int32(8)) + @roc groupsize=8 readfirstlane_kernel(RB, RA) + B = Array(RB) + @test B[1] == Int32(1) + @test all(B[2:8] .== Int32(2)) end - RB = ROCArray(zeros(Int32, 8)) - RA = ROCArray(Int32(1):Int32(8)) - wait(@roc groupsize=8 readfirstlane_kernel(RB, RA)) - B = Array(RB) - @test B[1] == Int32(1) - @test all(B[2:8] .== Int32(2)) -end - end diff --git a/test/device/globals.jl b/test/device/globals.jl deleted file mode 100644 index d71370075..000000000 --- a/test/device/globals.jl +++ /dev/null @@ -1,20 +0,0 @@ -@testset "Globals" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - function kernel(X) - ptr = Device.get_global_pointer(Val(:myglobal), Float32) - Base.unsafe_store!(ptr, 3f0) - nothing - end - - mygbl_ptr = Ref{Any}() - function gbl_init(gbl, mod, dev) - gbl_ptr = Base.unsafe_convert(Ptr{Float32}, gbl.ptr) - mygbl_ptr[] = gbl_ptr - - Base.unsafe_store!(gbl_ptr, 2f0) - end - - wait(@roc groupsize=1 global_hooks=(myglobal=gbl_init,) kernel(Int32(1))) - @test Base.unsafe_load(mygbl_ptr[]) == 3f0 -end diff --git a/test/device/hostcall.jl b/test/device/hostcall.jl index 806f3a75d..06ca26fd0 100644 --- a/test/device/hostcall.jl +++ b/test/device/hostcall.jl @@ -1,260 +1,214 @@ @testset "Hostcall" begin @testset "Call: No return or arguments" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - function kernel(a,b,sig) hostcall!(sig) b[1] = a[1] - nothing end - A = ones(Float32, 1) - B = zeros(Float32, 1) - RA = ROCArray(A) - RB = ROCArray(B) + RA = ROCArray(ones(Float32, 1)) + RB = ROCArray(zeros(Float32, 1)) dref = Ref{Bool}(false) - hc = HostCall(Nothing, Tuple{}) do + hc = HostCallHolder(Nothing, Tuple{}) do dref[] = true nothing end - wait(@roc kernel(RA, RB, hc)) - + @roc kernel(RA, RB, hc) + AMDGPU.synchronize(; blocking=false) @test Array(RB)[1] == 1f0 @test dref[] == true end @testset "Call: Error" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - function kernel(a,b,sig) hostcall!(sig) b[1] = a[1] - nothing end - A = ones(Float32, 1) - B = zeros(Float32, 1) - RA = ROCArray(A) - RB = ROCArray(B) - + RA = ROCArray(ones(Float32, 1)) + RB = ROCArray(zeros(Float32, 1)) dref = Ref{Bool}(false) - # This should throw an exception and the error message should be logged @test_logs (:error, "HostCall error") begin - hc, hc_task = HostCall(Nothing, Tuple{}; return_task=true) do + hc = HostCallHolder(Nothing, Tuple{}) do error("Some error") dref[] = true nothing end - @test_throws Runtime.KernelException wait(@roc kernel(RA, RB, hc)) + @roc kernel(RA, RB, hc) + @test_throws ErrorException AMDGPU.synchronize(; blocking=false) + AMDGPU.reset_exception_holder!(AMDGPU.device()) - empty!(RB.syncstate.signals) @test Array(RB)[1] == 0f0 @test dref[] == false - @test Base.istaskfailed(hc_task) + sleep(1) # Give time for the task to shut down. + @test Base.istaskfailed(hc.task) end end @testset "Call: (0 args)" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - function kernel(a,b,sig) inc = hostcall!(sig)::Float32 b[1] = a[1] + inc - nothing end - A = ones(Float32, 1) - B = zeros(Float32, 1) - RA = ROCArray(A) - RB = ROCArray(B) - - hc = HostCall(Float32, Tuple{}) do + RA = ROCArray(ones(Float32, 1)) + RB = ROCArray(zeros(Float32, 1)) + hc = HostCallHolder(Float32, Tuple{}) do 1f0 end - wait(@roc kernel(RA, RB, hc)) - + @roc kernel(RA, RB, hc) + AMDGPU.synchronize(; blocking=false) @test Array(RB)[1] == 2f0 end @testset "Call: (1 arg)" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - function kernel(a,b,sig) inc = hostcall!(sig, 42f0)::Float32 b[1] = a[1] + inc - nothing end - A = ones(Float32, 1) - B = zeros(Float32, 1) - RA = ROCArray(A) - RB = ROCArray(B) - - hc = HostCall(Float32, Tuple{Float32}) do arg1 + RA = ROCArray(ones(Float32, 1)) + RB = ROCArray(zeros(Float32, 1)) + hc = HostCallHolder(Float32, Tuple{Float32}) do arg1 arg1 + 1f0 end - wait(@roc kernel(RA, RB, hc)) - + @roc kernel(RA, RB, hc) + AMDGPU.synchronize(; blocking=false) @test Array(RB)[1] == 44f0 end @testset "Call: (2 homogeneous args)" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - function kernel(a,b,sig) inc = hostcall!(sig, 42f0, 3f0)::Float32 b[1] = a[1] + inc - nothing end - A = ones(Float32, 1) - B = zeros(Float32, 1) - RA = ROCArray(A) - RB = ROCArray(B) - - hc = HostCall(Float32, Tuple{Float32,Float32}) do arg1, arg2 + RA = ROCArray(ones(Float32, 1)) + RB = ROCArray(zeros(Float32, 1)) + hc = HostCallHolder(Float32, Tuple{Float32,Float32}) do arg1, arg2 arg1 + arg2 + 1f0 end - wait(@roc kernel(RA, RB, hc)) - + @roc kernel(RA, RB, hc) + AMDGPU.synchronize(; blocking=false) @test Array(RB)[1] == 47f0 end @testset "Call: (2 heterogeneous args)" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - function kernel(a,b,sig) inc = hostcall!(sig, 42f0, Int16(3))::Float32 b[1] = a[1] + inc - nothing end - A = ones(Float32, 1) - B = zeros(Float32, 1) - RA = ROCArray(A) - RB = ROCArray(B) - - hc = HostCall(Float32, Tuple{Float32,Int16}) do arg1, arg2 + RA = ROCArray(ones(Float32, 1)) + RB = ROCArray(zeros(Float32, 1)) + hc = HostCallHolder(Float32, Tuple{Float32,Int16}) do arg1, arg2 arg1 + Float32(arg2) + 1f0 end - wait(@roc kernel(RA, RB, hc)) - + @roc kernel(RA, RB, hc) + AMDGPU.synchronize(; blocking=false) @test Array(RB)[1] == 47f0 end @testset "Call: (2 heterogeneous args, return homogeneous tuple)" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - function kernel(a,b,sig) inc1, inc2 = hostcall!(sig, 42f0, Int16(3))::Tuple{Float32,Float32} b[1] = a[1] + inc1 + inc2 - nothing end - A = ones(Float32, 1) - B = zeros(Float32, 1) - RA = ROCArray(A) - RB = ROCArray(B) - - hc = HostCall(Tuple{Float32,Float32}, Tuple{Float32,Int16}) do arg1, arg2 + RA = ROCArray(ones(Float32, 1)) + RB = ROCArray(zeros(Float32, 1)) + hc = HostCallHolder(Tuple{Float32,Float32}, Tuple{Float32,Int16}) do arg1, arg2 (arg1 + Float32(arg2) + 1f0, 1f0) end - wait(@roc kernel(RA, RB, hc)) - + @roc kernel(RA, RB, hc) + AMDGPU.synchronize(; blocking=false) @test Array(RB)[1] == 48f0 end @testset "Call: (2 heterogeneous args, return heterogeneous tuple)" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - function kernel(a,b,sig) inc1, inc2 = hostcall!(sig, 42f0, Int16(3))::Tuple{Float32,Int64} b[1] = a[1] + inc1 + Float32(inc2) - nothing end - A = ones(Float32, 1) - B = zeros(Float32, 1) - RA = ROCArray(A) - RB = ROCArray(B) - - hc = HostCall(Tuple{Float32,Int64}, Tuple{Float32,Int16}) do arg1, arg2 + RA = ROCArray(ones(Float32, 1)) + RB = ROCArray(zeros(Float32, 1)) + hc = HostCallHolder(Tuple{Float32,Int64}, Tuple{Float32,Int16}) do arg1, arg2 (arg1 + Float32(arg2) + 1f0, 1) end - wait(@roc kernel(RA, RB, hc)) - + @roc kernel(RA, RB, hc) + AMDGPU.synchronize(; blocking=false) @test Array(RB)[1] == 48f0 end @testset "Call: (2 hostcalls, 1 kernel)" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - function kernel(a,b,sig1,sig2) inc1 = hostcall!(sig1, 3f0)::Float32 inc2 = hostcall!(sig2, 4f0)::Float32 b[1] = a[1] + inc1 + inc2 - nothing end - A = ones(Float32, 1) - B = zeros(Float32, 1) - RA = ROCArray(A) - RB = ROCArray(B) - - hc1 = HostCall(Float32, Tuple{Float32}) do arg1 + RA = ROCArray(ones(Float32, 1)) + RB = ROCArray(zeros(Float32, 1)) + hc1 = HostCallHolder(Float32, Tuple{Float32}) do arg1 arg1 + 1f0 end - hc2 = HostCall(Float32, Tuple{Float32}) do arg1 + hc2 = HostCallHolder(Float32, Tuple{Float32}) do arg1 arg1 + 2f0 end - wait(@roc kernel(RA, RB, hc1, hc2)) + @roc kernel(RA, RB, hc1, hc2) + AMDGPU.synchronize(; blocking=false) @test Array(RB)[1] == 11f0 end @testset "Call: (1 hostcall, 2 kernels)" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - function kernel(a,b,sig) inc = hostcall!(sig, 3f0)::Float32 b[1] = a[1] + inc - nothing end - A = ones(Float32, 1) - B = zeros(Float32, 1) - RA = ROCArray(A) - RB = ROCArray(B) - - hc = HostCall(Float32, Tuple{Float32}; continuous=true) do arg1 + RA = ROCArray(ones(Float32, 1)) + RB = ROCArray(zeros(Float32, 1)) + hc = HostCallHolder(Float32, Tuple{Float32}; continuous=true) do arg1 arg1 + 1f0 end - wait(@roc kernel(RA, RB, hc)) - wait(@roc kernel(RA, RB, hc)) + @roc kernel(RA, RB, hc) + AMDGPU.synchronize(; blocking=false) + + @roc kernel(RA, RB, hc) + AMDGPU.synchronize(; blocking=false) + + # Next time HC will be called from the kernel is its last time. + # So that it shutdowns correctly and does not stick to the end. + AMDGPU.Device.finish!(hc) @test Array(RB)[1] == 5f0 + + # Give HostCall task time to exit. + sleep(2) + @test istaskdone(hc) end end diff --git a/test/device/indexing.jl b/test/device/indexing.jl index 2a4b175b8..982c39fdd 100644 --- a/test/device/indexing.jl +++ b/test/device/indexing.jl @@ -13,7 +13,7 @@ A = zeros(Int64, 6) RA = ROCArray(A) - wait(@roc groupsize=(1,2,3) gridsize=(4,5,6) idx_kern(RA)) + @roc groupsize=(1,2,3) gridsize=(4,5,6) idx_kern(RA) A = Array(RA) @test all(A .> 0) @@ -35,7 +35,9 @@ A = zeros(Int64, 9) RA = ROCArray(A) - wait(@roc groupsize=(1,2,3) gridsize=(4,4,6) dim_kern(RA)) + groupsize = (1, 2, 3) + gridsize = (4, 4, 6) + @roc groupsize=groupsize gridsize=gridsize dim_kern(RA) A = Array(RA) - @test A == [1,2,3,4,4,6,4,2,2] + @test A == [groupsize..., (groupsize .* gridsize)..., gridsize...] end diff --git a/test/device/launch.jl b/test/device/launch.jl index 77543a91b..415d42cf5 100644 --- a/test/device/launch.jl +++ b/test/device/launch.jl @@ -1,9 +1,6 @@ @testset "Launch Options" begin kernel() = nothing - device = AMDGPU.default_device() - queue = AMDGPU.queue(device) - # Group/grid size selection and aliases for (groupsize,gridsize) in ( (1,1), @@ -19,53 +16,32 @@ ((1,1,1),2), (1,(1024,1,1)), ) - eval(:(wait(@roc groupsize=$groupsize $kernel()))) - eval(:(wait(@roc groupsize=$groupsize gridsize=$gridsize $kernel()))) - eval(:(wait(@roc gridsize=$gridsize $kernel()))) - - threads = groupsize - blocks = gridsize .÷ groupsize - eval(:(wait(@roc threads=$threads $kernel()))) - eval(:(wait(@roc blocks=$blocks $kernel()))) - eval(:(wait(@roc threads=$threads blocks=$blocks $kernel()))) + @roc groupsize=groupsize kernel() + @roc groupsize=groupsize gridsize=gridsize kernel() + @roc gridsize=gridsize kernel() end - # Device/queue selection and aliases - # FIXME: Test that device/queue are used! - eval(:(wait(@roc device=$device $kernel()))) - eval(:(wait(@roc device=$device queue=$queue $kernel()))) - eval(:(wait(@roc queue=$queue $kernel()))) - eval(:(wait(@roc stream=$queue $kernel()))) + stream = AMDGPU.stream() + @roc stream=stream kernel() + AMDGPU.synchronize() - # Non-default queue - queue2 = ROCQueue() - sig = @roc queue=queue2 kernel() - @test sig.queue === queue2 + # Non-default stream + stream2 = HIPStream() + @roc stream=stream2 kernel() + AMDGPU.synchronize(stream2) # Group size validity - @test_throws ArgumentError eval(:(wait(@roc groupsize=0 $kernel()))) - eval(:(wait(@roc groupsize=1024 $kernel()))) - @test_throws ArgumentError eval(:(wait(@roc groupsize=1025 $kernel()))) - @test_throws ArgumentError eval(:(wait(@roc groupsize=(1024,2) $kernel()))) - @test_throws ArgumentError eval(:(wait(@roc groupsize=(512,2,2) $kernel()))) + @test_throws AMDGPU.HIP.HIPError @roc groupsize=0 kernel() + @test_throws AMDGPU.HIP.HIPError @roc groupsize=1025 kernel() + @test_throws AMDGPU.HIP.HIPError @roc groupsize=(1024, 2) kernel() + @test_throws AMDGPU.HIP.HIPError @roc groupsize=(512, 2, 2) kernel() # No-launch - kersig = eval(:(@roc launch=true $kernel())) - @test isa(kersig, AMDGPU.ROCKernelSignal) - wait(kersig) - - host_kernel = eval(:(@roc launch=false $kernel())) - @test isa(host_kernel, Runtime.HostKernel) - + host_kernel = @roc launch=false kernel() + @test isa(host_kernel, Runtime.HIPKernel) @test_throws Exception eval(:(@roc launch=1 $kernel())) # TODO: ArgumentError end -@testset "No-argument kernel" begin - kernel() = nothing - - wait(@roc kernel()) -end - @testset "Kernel argument alignment" begin function kernel(x, y) if Int64(x) != y @@ -75,7 +51,8 @@ end end x = rand(UInt32) y = Int64(x) - wait(@roc kernel(x, y)) + @roc kernel(x, y) + AMDGPU.synchronize() end @testset "Function/Argument Conversion" begin @@ -89,81 +66,53 @@ end @roc kernel(f) end - a = [1.] - a_dev = ROCArray(a) - - outer(a_dev, 2.) - - @test Array(a_dev) == [2.] + a_dev = ROCArray([1.0]) + outer(a_dev, 2.0) + @test Array(a_dev) == [2.0] end end -@testset "Signal waiting" begin - sig = @roc identity(nothing) - wait(sig) - wait(sig.signal) - wait(sig.signal.signal) - @test sig.queue === AMDGPU.queue() -end - -@testset "Custom signal" begin - sig = ROCSignal() - sig2 = @roc signal=sig identity(nothing) - @test sig2.signal == sig - wait(sig) - wait(sig2) -end - if length(AMDGPU.devices()) > 1 @testset "Multi-GPU" begin - # HSA will throw if the compiler and launch use different devices - a1, a2 = AMDGPU.devices()[1:2] - wait(@roc device=a1 identity(nothing)) - wait(@roc device=a2 identity(nothing)) + dev = AMDGPU.device() + + AMDGPU.device!(AMDGPU.devices()[2]) + @roc identity(nothing) + + AMDGPU.device!(dev) + @roc identity(nothing) end else @warn "Only 1 GPU detected; skipping multi-GPU tests" - @test_broken "Multi-GPU" + @test_skip "Multi-GPU" end @testset "Launch Configuration" begin - kern = @roc launch=false identity(nothing) + function f(x) + x[1] = 1 + return + end + x = ROCArray([1]) + kern = @roc launch=false f(x) occ = AMDGPU.launch_configuration(kern) @test occ isa NamedTuple - @test haskey(occ, :groupsize) + @test haskey(occ, :groupsize) && haskey(occ, :gridsize) # This kernel has no occupancy constraints @test occ.groupsize == AMDGPU.Device._max_group_size - @testset "Automatic groupsize selection" begin - function groupsize_kernel(A) - A[1] = workgroupDim().x - nothing - end - A = AMDGPU.ones(Int, 1) - kern = @roc launch=false groupsize_kernel(A) - # Verify first that there are no occupancy constraints - @test AMDGPU.launch_configuration(kern).groupsize == AMDGPU.Device._max_group_size - # Then check that this value was used - wait(@roc groupsize=:auto groupsize_kernel(A)) - @test Array(A)[1] == AMDGPU.Device._max_group_size - end - - @testset "Function redefinition" begin - RX = ROCArray(rand(Float32, 1)) - function f(X) - Y = @ROCStaticLocalArray(Float32, 1) - X[1] = Y[1] - return - end - occ1 = AMDGPU.Compiler.calculate_occupancy(@roc launch=false f(RX)) - function f(X) - Y = @ROCStaticLocalArray(Float32, 1024) - X[1] = Y[1] - return - end - occ2 = AMDGPU.Compiler.calculate_occupancy(@roc launch=false f(RX)) - @test occ1 != occ2 - end + # TODO + # @testset "Automatic groupsize selection" begin + # function groupsize_kernel(A) + # A[1] = workgroupDim().x + # nothing + # end + # A = AMDGPU.ones(Int, 1) + # kern = @roc launch=false groupsize_kernel(A) + # # Verify first that there are no occupancy constraints + # @test AMDGPU.launch_configuration(kern).groupsize == AMDGPU.Device._max_group_size + # @roc groupsize=:auto groupsize_kernel(A) + # @test Array(A)[1] == AMDGPU.Device._max_group_size + # end @testset "Local memory" begin function f(X) @@ -172,16 +121,11 @@ end unsafe_store!(Y.ptr, unsafe_load(X.ptr)) return end + RX = ROCArray(rand(Float32, 1)) - @testset "Static" begin - occ = AMDGPU.Compiler.calculate_occupancy(@roc launch=false f(RX)) - @test occ.LDS_size == sizeof(Float32) * 16 - end - @testset "Dynamic" begin - # Test that localmem is properly accounted for - occ1 = AMDGPU.Compiler.calculate_occupancy(@roc launch=false f(RX)) - occ2 = AMDGPU.Compiler.calculate_occupancy(@roc launch=false f(RX); localmem=65536÷2) - @test occ1 != occ2 - end + # Test that localmem is properly accounted for + occ1 = AMDGPU.launch_configuration(@roc launch=false f(RX)) + occ2 = AMDGPU.launch_configuration(@roc launch=false f(RX); shmem=65536 ÷ 2) + @test occ1 != occ2 end end diff --git a/test/device/math.jl b/test/device/math.jl index 1961c827a..bd7962b20 100644 --- a/test/device/math.jl +++ b/test/device/math.jl @@ -1,8 +1,6 @@ using Base.FastMath @testset "Math Intrinsics" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - for T in (Float16, Float32, Float64) a = rand(T, 16) .* T(42) d_a = ROCArray(a) @@ -11,8 +9,9 @@ using Base.FastMath b = map(f, a) d_b = map(f, d_a) for out_idx in 1:length(f(a[1])) - @test all(sc->(sc[1][out_idx] ≈ sc[2][out_idx]), - zip(b, Array(d_b))) + @test all( + sc -> (sc[1][out_idx] ≈ sc[2][out_idx]), + zip(b, Array(d_b))) end end end diff --git a/test/device/memory.jl b/test/device/memory.jl index 8a23eac1c..8be356c19 100644 --- a/test/device/memory.jl +++ b/test/device/memory.jl @@ -24,7 +24,7 @@ RB = ROCArray(zeros(Float32, 8)) RC = ROCArray(ones(Float32, 8)) - wait(@roc groupsize=8 memory_static_kernel(RA, RB, RC)) + @roc groupsize=8 memory_static_kernel(RA, RB, RC) @test Array(RA) ≈ Array(RB) # Test zero-initialization @test all(iszero, Array(RC)) @@ -49,7 +49,8 @@ RA = ROCArray(A) RC = ROCArray(ones(Float32, N)) - wait(@roc localmem=N*sizeof(Float32) dynamic_localmem_kernel(RA, RC)) + shmem = N * sizeof(Float32) + @roc shmem=shmem dynamic_localmem_kernel(RA, RC) @test Array(RA) ≈ A .+ 1f0 # Test zero-initialization @@ -58,43 +59,40 @@ end end -@testset "Memory: Dynamic" begin - function malloc_kernel(X) - ptr = AMDGPU.Device.malloc(Csize_t(4)) - X[1] = reinterpret(UInt64, ptr) - AMDGPU.Device.free(ptr) - nothing - end +# TODO +# @testset "Memory: Dynamic" begin +# function malloc_kernel(X) +# ptr = AMDGPU.Device.malloc(Csize_t(4)) +# X[1] = reinterpret(UInt64, ptr) +# AMDGPU.Device.free(ptr) +# nothing +# end - RA = ROCArray(zeros(UInt64, 1)) - wait(@roc malloc_kernel(RA)) - @test Array(RA)[1] != 0 -end +# RA = ROCArray(zeros(UInt64, 1)) +# @roc malloc_kernel(RA) +# @test Array(RA)[1] != 0 +# end @testset "Memcpy/Memset" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - function memcpy_kernel(X,Y) - AMDGPU.Device.memcpy!(Y.ptr, X.ptr, sizeof(Float32)*length(X)) + function memcpy_kernel(X, Y) + AMDGPU.Device.memcpy!(Y.ptr, X.ptr, sizeof(Float32) * length(X)) nothing end A = rand(Float32, 4) B = zeros(Float32, 4) - RA, RB = ROCArray.((A,B)) - - wait(@roc memcpy_kernel(RA,RB)) + RA, RB = ROCArray.((A, B)) + @roc memcpy_kernel(RA, RB) @test A == collect(RA) == collect(RB) - function memset_kernel(X,y) - AMDGPU.Device.memset!(X.ptr, y, div(length(X),2)) + function memset_kernel(X, y) + AMDGPU.Device.memset!(X.ptr, y, length(X) ÷ 2) nothing end A = zeros(UInt8, 4) RA = ROCArray(A) - wait(@roc memset_kernel(RA,0x3)) - + @roc memset_kernel(RA, 0x3) @test all(collect(RA)[1:2] .== 0x3) @test all(collect(RA)[3:4] .== 0x0) end diff --git a/test/device/output.jl b/test/device/output.jl index bb09d1061..c353a109d 100644 --- a/test/device/output.jl +++ b/test/device/output.jl @@ -1,232 +1,237 @@ -import .Device: OutputContext - @testset "@rocprintln" begin - -@testset "Plain, no newline" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - kernel(oc) = @rocprint oc "Hello World!" - - iob = IOBuffer() - oc = OutputContext(iob) - wait(@roc kernel(oc)) - @test String(take!(iob)) == "Hello World!" -end - -@testset "Plain, with newline" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - kernel(oc) = @rocprintln oc "Hello World!" - - iob = IOBuffer() - oc = OutputContext(iob) - wait(@roc kernel(oc)) - @test String(take!(iob)) == "Hello World!\n" -end - -@testset "Plain, multiple calls" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - function kernel(oc) - @rocprint oc "Hello World!" - @rocprintln oc "Goodbye World!" + @testset "Plain, no newline" begin + kernel() = @rocprint "Hello World!" + + _, msg = @grab_output begin + @roc kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == "Hello World!" end - iob = IOBuffer() - oc = OutputContext(iob) - wait(@roc kernel(oc)) - @test String(take!(iob)) == "Hello World!Goodbye World!\n" -end - -@testset "Plain, global context" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. + @testset "Plain, with newline" begin + kernel() = @rocprintln "Hello World!" - function kernel() - @rocprint "Hello World!" - @rocprintln "Goodbye World!" + _, msg = @grab_output begin + @roc kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == "Hello World!\n" end - _, msg = @grab_output wait(@roc kernel()) - @test msg == "Hello World!Goodbye World!\n" -end - -#= TODO -@testset "Interpolated string" begin - inner_str = "to the" - function kernel(oc) - @rocprintln oc "Hello $inner_str World!" - nothing + @testset "Plain, multiple calls" begin + function kernel() + @rocprint "Hello World!" + @rocprintln "Goodbye World!" + end + + _, msg = @grab_output begin + @roc kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == "Hello World!Goodbye World!\n" end - iob = IOBuffer() - oc = OutputContext(iob) - @roc kernel(oc) - sleep(1) - @test String(take!(iob)) == "Hello to the World!\n" -end -=# - + #= TODO + @testset "Interpolated string" begin + inner_str = "to the" + function kernel(oc) + @rocprintln oc "Hello $inner_str World!" + nothing + end + + iob = IOBuffer() + oc = OutputContext(iob) + @roc kernel(oc) + sleep(1) + @test String(take!(iob)) == "Hello to the World!\n" + end + =# end @testset "@rocprintf" begin - -@testset "Plain" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - kernel() = @rocprintf "Hello World!\n" - - _, msg = @grab_output wait(@roc kernel()) - @test msg == "Hello World!\n" -end - -@testset "Integer argument" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - kernel(x) = @rocprintf "Value: %d\n" x - - _, msg = @grab_output wait(@roc kernel(42)) - @test msg == "Value: 42\n" -end - -@testset "Multiple arguments" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - function kernel(x) - y = 0.123401 - @rocprintf "Value: %d | %.4f\n" x y + @testset "Plain" begin + kernel() = @rocprintf "Hello World!\n" + + _, msg = @grab_output begin + @roc kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == "Hello World!\n" end - _, msg = @grab_output wait(@roc kernel(42)) - @test msg == "Value: 42 | 0.1234\n" -end - -@testset "Per-lane" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - kernel() = @rocprintf :lane "[%d] " workitemIdx().x - - # One group, one wavefront - exp = reduce(*, ["[$i] " for i in 1:8]) - _, msg = @grab_output wait(@roc groupsize=8 kernel()) - @test msg == exp - - # One group, multiple wavefronts - exp = reduce(*, ["[$i] " for i in 1:128]) - _, msg = @grab_output wait(@roc groupsize=128 kernel()) - @test msg == exp - - # Multiple groups, one wavefront each - exp = reduce(*, ["[$i] " for i in vcat(1:64, 1:64, 1:64, 1:64)]) - _, msg = @grab_output wait(@roc groupsize=64 gridsize=256 kernel()) - @test msg == exp + @testset "Integer argument" begin + kernel(x) = @rocprintf "Value: %d\n" x - # Multiple groups, multiple wavefronts each - exp = reduce(*, ["[$i] " for i in vcat(1:128, 1:128)]) - _, msg = @grab_output wait(@roc groupsize=128 gridsize=256 kernel()) - @test msg == exp -end - -@testset "Per-wavefront" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - kernel() = @rocprintf :wave "[%d] " workitemIdx().x - wsize::Int64 = AMDGPU.wavefrontsize(ROCDevice()) - - # One group, one wavefront - exp = "[1] " - _, msg = @grab_output wait(@roc groupsize=1 kernel()) - @test msg == exp - - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - # One group, multiple wavefronts - groupsize = 128 - exp = reduce(*, ["[$i] " for i in collect(1:wsize:groupsize)]) - _, msg = @grab_output wait(@roc groupsize=groupsize kernel()) - @test msg == exp - - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - # Multiple groups, one wavefront each - gridsize = 256 - exp = repeat("[1] ", gridsize ÷ wsize) - _, msg = @grab_output(wait(@roc groupsize=wsize gridsize=gridsize kernel())) - @test msg == exp - - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - # Multiple groups, multiple wavefronts each - groupsize = 128 - n_groups = gridsize ÷ groupsize - exp = repeat( - reduce(*, ["[$i] " for i in collect(1:wsize:groupsize)]), - n_groups) - _, msg = @grab_output(wait(@roc groupsize=128 gridsize=256 kernel())) - @test msg == exp -end - -@testset "Per-workgroup" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - kernel() = @rocprintf :group "[%d] " workitemIdx().x - - # One group, one wavefront - exp = "[1] " - _, msg = @grab_output wait(@roc groupsize=8 kernel()) - @test msg == exp - - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - # One group, multiple wavefronts - exp = "[1] " - _, msg = @grab_output wait(@roc groupsize=128 kernel()) - @test msg == exp - - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - # Multiple groups, one wavefront each - exp = reduce(*, ["[$i] " for i in [1, 1, 1, 1]]) - _, msg = @grab_output wait(@roc groupsize=64 gridsize=256 kernel()) - @test msg == exp - - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - # Multiple groups, multiple wavefronts each - exp = reduce(*, ["[$i] " for i in [1, 1]]) - _, msg = @grab_output wait(@roc groupsize=128 gridsize=256 kernel()) - @test msg == exp -end - -@testset "Per-grid" begin - kernel() = @rocprintf :grid "[%d] " workitemIdx().x - - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - # One group, one wavefront - exp = "[1] " - _, msg = @grab_output wait(@roc groupsize=8 kernel()) - @test msg == exp - - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - # One group, multiple wavefronts - exp = "[1] " - _, msg = @grab_output wait(@roc groupsize=128 kernel()) - @test msg == exp + _, msg = @grab_output begin + @roc kernel(42) + AMDGPU.synchronize(; blocking=false) + end + @test msg == "Value: 42\n" + end - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. + @testset "Multiple arguments" begin + function kernel(x) + y = 0.123401 + @rocprintf "Value: %d | %.4f\n" x y + end + + _, msg = @grab_output begin + @roc kernel(42) + AMDGPU.synchronize(; blocking=false) + end + @test msg == "Value: 42 | 0.1234\n" + end - # Multiple groups, one wavefront each - exp = "[1] " - _, msg = @grab_output wait(@roc groupsize=64 gridsize=256 kernel()) - @test msg == exp + @testset "Per-lane" begin + kernel() = @rocprintf :lane "[%d] " workitemIdx().x + + # One group, one wavefront + exp = reduce(*, ["[$i] " for i in 1:8]) + _, msg = @grab_output begin + @roc groupsize=8 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # One group, multiple wavefronts + exp = reduce(*, ["[$i] " for i in 1:128]) + _, msg = @grab_output begin + @roc groupsize=128 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # Multiple groups, one wavefront each + exp = reduce(*, ["[$i] " for i in vcat(1:64, 1:64, 1:64, 1:64)]) + _, msg = @grab_output begin + @roc groupsize=64 gridsize=4 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # Multiple groups, multiple wavefronts each + exp = reduce(*, ["[$i] " for i in vcat(1:128, 1:128)]) + _, msg = @grab_output begin + @roc groupsize=128 gridsize=2 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + end - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. + @testset "Per-wavefront" begin + kernel() = @rocprintf :wave "[%d] " workitemIdx().x + hsa_dev = AMDGPU.Runtime.hsa_device(AMDGPU.device()) + wsize::Int = AMDGPU.Runtime.device_wavefront_size(hsa_dev) + + # One group, one wavefront + exp = "[1] " + _, msg = @grab_output begin + @roc groupsize=1 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # One group, multiple wavefronts + groupsize = 128 + exp = reduce(*, ["[$i] " for i in collect(1:wsize:groupsize)]) + _, msg = @grab_output begin + @roc groupsize=groupsize kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # Multiple groups, one wavefront each + gridsize = 256 ÷ wsize + exp = repeat("[1] ", 256 ÷ wsize) + _, msg = @grab_output begin + @roc groupsize=wsize gridsize=gridsize kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # Multiple groups, multiple wavefronts each + groupsize = 128 + n_groups = 256 ÷ groupsize + exp = repeat( + reduce(*, ["[$i] " for i in collect(1:wsize:groupsize)]), + n_groups) + _, msg = @grab_output begin + @roc groupsize=128 gridsize=2 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + end - # Multiple groups, multiple wavefronts each - exp = "[1] " - _, msg = @grab_output wait(@roc groupsize=128 gridsize=256 kernel()) - @test msg == exp -end + @testset "Per-workgroup" begin + kernel() = @rocprintf :group "[%d] " workitemIdx().x + + # One group, one wavefront + exp = "[1] " + _, msg = @grab_output begin + @roc groupsize=8 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # One group, multiple wavefronts + exp = "[1] " + _, msg = @grab_output begin + @roc groupsize=128 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # Multiple groups, one wavefront each + exp = reduce(*, ["[$i] " for i in [1, 1, 1, 1]]) + _, msg = @grab_output begin + @roc groupsize=64 gridsize=4 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # Multiple groups, multiple wavefronts each + exp = reduce(*, ["[$i] " for i in [1, 1]]) + _, msg = @grab_output begin + @roc groupsize=128 gridsize=2 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + end + @testset "Per-grid" begin + kernel() = @rocprintf :grid "[%d] " workitemIdx().x + + # One group, one wavefront + exp = "[1] " + _, msg = @grab_output begin + @roc groupsize=8 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # One group, multiple wavefronts + exp = "[1] " + _, msg = @grab_output begin + @roc groupsize=128 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # Multiple groups, one wavefront each + exp = "[1] " + _, msg = @grab_output begin + @roc groupsize=64 gridsize=4 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + + # Multiple groups, multiple wavefronts each + exp = "[1] " + _, msg = @grab_output begin + @roc groupsize=128 gridsize=2 kernel() + AMDGPU.synchronize(; blocking=false) + end + @test msg == exp + end end diff --git a/test/device/queries.jl b/test/device/queries.jl deleted file mode 100644 index e73e1c92a..000000000 --- a/test/device/queries.jl +++ /dev/null @@ -1,21 +0,0 @@ -@testset "Active kernels" begin - AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout. - - function kernel(sig) - hostcall!(sig) - nothing - end - - wait_ev = Base.Event() - hc = HostCall(Nothing, Tuple{}) do - wait(wait_ev) - end - - sig = @roc kernel(hc) - @test sig in AMDGPU.active_kernels() - @test (@atomic sig.active) - - notify(wait_ev) - wait(sig) - @test !(sig in AMDGPU.active_kernels()) -end diff --git a/test/device/vadd.jl b/test/device/vadd.jl index 8a32baa75..390616a1d 100644 --- a/test/device/vadd.jl +++ b/test/device/vadd.jl @@ -16,7 +16,7 @@ d_c = similar(d_a) len = prod(dims) - wait(@roc groupsize=len vadd(d_a, d_b, d_c)) + @roc groupsize=len vadd(d_a, d_b, d_c) c = Array(d_c) @test a+b ≈ c end diff --git a/test/device/wavefront.jl b/test/device/wavefront.jl index 628f20bed..c33c041a8 100644 --- a/test/device/wavefront.jl +++ b/test/device/wavefront.jl @@ -1,5 +1,6 @@ @testset "Wavefront Operations" begin - wavefrontsize = AMDGPU.wavefrontsize(AMDGPU.default_device()) + hsa_dev = AMDGPU.Runtime.hsa_device(AMDGPU.device()) + wavefrontsize = AMDGPU.Runtime.device_wavefront_size(hsa_dev) function reduce_kernel(op,X,Y) idx = workitemIdx().x @@ -23,42 +24,46 @@ X = rand(T(1):T(100), wavefrontsize) for op in (Base.:+, max, min, Base.:&, Base.:|, Base.:⊻) RX, RY = ROCArray(X), ROCArray(zeros(T,1)) - wait(@roc groupsize=wavefrontsize reduce_kernel(op,RX,RY)) + @roc groupsize=wavefrontsize reduce_kernel(op,RX,RY) @test Array(RY)[1] == reduce(op,X) RX, RY = ROCArray(X), ROCArray(zeros(T,wavefrontsize)) - wait(@roc groupsize=wavefrontsize scan_kernel(op,RX,RY)) + @roc groupsize=wavefrontsize scan_kernel(op,RX,RY) @test Array(RY) == accumulate(op,X) end end + for T in (Float16, Float32, Float64) X = rand(T, wavefrontsize) for op in (Base.:+, max, min) RX, RY = ROCArray(X), ROCArray(zeros(T,1)) - wait(@roc groupsize=wavefrontsize reduce_kernel(op,RX,RY)) + @roc groupsize=wavefrontsize reduce_kernel(op,RX,RY) @test Array(RY)[1] ≈ reduce(op,X) RX, RY = ROCArray(X), ROCArray(zeros(T,wavefrontsize)) - wait(@roc groupsize=wavefrontsize scan_kernel(op,RX,RY)) + @roc groupsize=wavefrontsize scan_kernel(op,RX,RY) @test Array(RY) ≈ accumulate(op,X) end end - for X in (rand(Cint(0):Cint(1), wavefrontsize), - zeros(Cint, wavefrontsize), - ones(Cint, wavefrontsize), - ) + + for X in ( + rand(Cint(0):Cint(1), wavefrontsize), + zeros(Cint, wavefrontsize), + ones(Cint, wavefrontsize), + ) RX, RY = ROCArray(X), ROCArray(zeros(Bool,3)) - wait(@roc groupsize=wavefrontsize bool_kernel(RX,RY)) + @roc groupsize=wavefrontsize bool_kernel(RX,RY) Y = Array(RY) - @test_skip Y[1] == all(x->x==1,X) + + @test_skip Y[1] == all(x -> x == 1, X) @test_skip Y[2] == any(x->x==1,X) @test_skip Y[3] == (length(unique(X)) == 1) end end @testset "Wavefront Information" begin - wavefrontsize = AMDGPU.wavefrontsize(AMDGPU.default_device()) - + hsa_dev = AMDGPU.Runtime.hsa_device(AMDGPU.device()) + wavefrontsize = AMDGPU.Runtime.device_wavefront_size(hsa_dev) @test wavefrontsize == 32 || wavefrontsize == 64 function kernel(X) @@ -66,6 +71,7 @@ end nothing end RX = ROCArray(zeros(UInt32, 1)) - wait(@roc kernel(RX)) + @roc kernel(RX) + AMDGPU.synchronize() @allowscalar @test RX[1] == wavefrontsize end diff --git a/test/dnn/pool.jl b/test/dnn/pool.jl index 7265a9682..a9b2a8551 100644 --- a/test/dnn/pool.jl +++ b/test/dnn/pool.jl @@ -12,7 +12,6 @@ yd, workspace = MIOpen.maxpool(xd; pkwargs...) yd, workspace = MIOpen.maxpool!(yd, xd; pkwargs...) @test Array(yd) ≈ y - wh1 = AMDGPU.Runtime.Mem.download(UInt8, workspace.data, workspace.data.bytesize) @test Array(yd) ≈ y dy = ones(Float32, size(y)) @@ -21,13 +20,9 @@ dx = NNlib.∇maxpool(dy, y, x, pdims) dxd = MIOpen.∇maxpool(dyd, yd, xd; workspace, pkwargs...) @test Array(dxd) ≈ dx - wh2 = AMDGPU.Runtime.Mem.download(UInt8, workspace.data, workspace.data.bytesize) - @test wh1 ≈ wh2 # Check that workspace was not modified. dxd = MIOpen.∇maxpool!(dxd, dyd, yd, xd; workspace, pkwargs...) @test Array(dxd) ≈ dx - wh3 = AMDGPU.Runtime.Mem.download(UInt8, workspace.data, workspace.data.bytesize) - @test wh1 ≈ wh3 # Check that workspace was not modified. # Mean pooling. diff --git a/test/dnn/softmax.jl b/test/dnn/softmax.jl index c4cfe5817..559cbf6a3 100644 --- a/test/dnn/softmax.jl +++ b/test/dnn/softmax.jl @@ -3,6 +3,7 @@ for (sz, dims) in [ ((5,), :), ((5,), 1), ((5, 5), :), ((5, 5), 1), ((5, 5), 2), + ((5, 5, 5), (1, 2)), ((5, 5, 5), (1, 3)), ((5, 5, 5, 5), (2, 3)), ((5, 5, 5, 5), (2, 4)), ] if T == Float16 diff --git a/test/external/forwarddiff.jl b/test/external/forwarddiff.jl index c4de55d73..1be23821c 100644 --- a/test/external/forwarddiff.jl +++ b/test/external/forwarddiff.jl @@ -9,7 +9,8 @@ function test_derivative(f, x::T) where T a[] = ForwardDiff.derivative(f, x) return end - wait(@roc kernel(buf, x)) + @roc kernel(buf, x) + AMDGPU.synchronize() return AMDGPU.@allowscalar buf[] end diff --git a/test/hsa/device.jl b/test/hsa/device.jl index ce7e5ab67..b68e156ef 100644 --- a/test/hsa/device.jl +++ b/test/hsa/device.jl @@ -1,20 +1,17 @@ @testset "Devices" begin @testset "Device IDs" begin - for kind in (:cpu, :gpu) - devices = AMDGPU.devices() - for (idx,device) in enumerate(devices) - @test AMDGPU.device_id(device) == idx - end + devices = AMDGPU.devices() + for (idx,device) in enumerate(devices) + @test AMDGPU.device_id(device) == idx end end @testset "Default selection" begin device = AMDGPU.default_device() @test device !== nothing - @test AMDGPU.device_type(device) == :gpu - @test ROCDevice().agent == device.agent + @test device == AMDGPU.device() - device_name = Runtime.name(device) + device_name = HIP.name(device) @test length(device_name) > 0 @test !occursin('\0', device_name) @@ -23,9 +20,11 @@ init_device = AMDGPU.default_device() init_device_id = AMDGPU.default_device_id() @test init_device_id == 1 + AMDGPU.default_device_id!(2) @test AMDGPU.default_device_id() == 2 @test AMDGPU.default_device() != init_device + AMDGPU.default_device_id!(1) @test AMDGPU.default_device_id() == 1 @test AMDGPU.default_device() == init_device @@ -40,7 +39,7 @@ @testset "ISAs" begin device = AMDGPU.default_device() - device_isa = string(AMDGPU.default_isa_architecture(device)) + device_isa, features = AMDGPU.default_isa(device).arch_features @test length(device_isa) > 0 @test occursin("gfx", device_isa) end diff --git a/test/hsa/error.jl b/test/hsa/error.jl deleted file mode 100644 index 923866312..000000000 --- a/test/hsa/error.jl +++ /dev/null @@ -1,11 +0,0 @@ -@testset "HSA Status Error" begin - errorcode = AMDGPU.HSAError(HSA.STATUS_SUCCESS) - @test Runtime.description(errorcode) == "HSA_STATUS_SUCCESS: The function has been executed successfully." -end - -if !IS_NAVI_2 - @testset "HSA Async Queue Error" begin - kernel() = (Device.trap(); nothing) - @test_throws Runtime.QueueError wait(@roc kernel()) - end -end diff --git a/test/hsa/getinfo.jl b/test/hsa/getinfo.jl index dc1026ec9..7e5fa7c2e 100644 --- a/test/hsa/getinfo.jl +++ b/test/hsa/getinfo.jl @@ -1,39 +1,15 @@ @testset "getinfo queries" begin @testset "ROCDevice" begin device = AMDGPU.default_device() - @test AMDGPU.Runtime.name(device) isa String - @test AMDGPU.Runtime.device_type(device) isa AMDGPU.HSA.DeviceType - @test AMDGPU.Runtime.device_wavefront_size(device) isa UInt32 + hsa_dev = AMDGPU.Runtime.hsa_device(device) + @test AMDGPU.Runtime.name(hsa_dev) isa String + @test AMDGPU.Runtime.device_type(hsa_dev) isa AMDGPU.HSA.DeviceType + @test AMDGPU.Runtime.device_wavefront_size(hsa_dev) isa UInt32 end + @testset "HSA.ISA" begin device = AMDGPU.default_device() - device_isa = AMDGPU.default_isa(device) + device_isa = AMDGPU.default_isa(device).hsa_isa @test AMDGPU.Runtime.isa_workgroup_max_size(device_isa) isa UInt32 end - @testset "ROCMemoryRegion" begin - device = AMDGPU.default_device() - region = first(AMDGPU.Runtime.regions(device)) - @test AMDGPU.Runtime.region_segment(region) isa AMDGPU.HSA.RegionSegment - @test AMDGPU.Runtime.region_runtime_alloc_allowed(region) isa Bool - @test AMDGPU.Runtime.region_runtime_alloc_granule(region) isa Csize_t - end - @testset "ROCMemoryPool" begin - device = AMDGPU.default_device() - pool = first(AMDGPU.Runtime.memory_pools(device)) - @test AMDGPU.Runtime.pool_segment(pool) isa AMDGPU.HSA.AMDSegment - @test AMDGPU.Runtime.pool_size(pool) isa Csize_t - @test AMDGPU.Runtime.pool_accessible_by_all(pool) isa Bool - end - @testset "HSA.ExecutableSymbol" begin - device = AMDGPU.default_device() - kernel = @roc launch=false identity(nothing) - exe = kernel.mod.exe - sym_name = kernel.fun.entry - exec_sym = AMDGPU.Runtime.executable_symbol_any(exe, device) - if exec_sym === nothing - exec_sym = AMDGPU.Runtime.executable_symbol_by_name(exe, device, sym_name) - end - @test AMDGPU.Runtime.executable_symbol_name(exec_sym) isa String - @test AMDGPU.Runtime.executable_symbol_kernel_private_segment_size(exec_sym) isa UInt32 - end end diff --git a/test/hsa/hashing.jl b/test/hsa/hashing.jl deleted file mode 100644 index fc155a44b..000000000 --- a/test/hsa/hashing.jl +++ /dev/null @@ -1,100 +0,0 @@ -@testset "Kernel Argument Hashing" begin - khash = AMDGPU.Runtime.khash - - @testset "Primitives" begin - x = UInt8(1) - y = UInt8(2) - - @test khash(x) == khash(x) - @test khash(y) == khash(y) - @test khash(x) != khash(y) - - @test khash(x, UInt(1)) == khash(x, UInt(1)) - @test khash(y, UInt(1)) == khash(y, UInt(1)) - @test khash(x, UInt(1)) != khash(y, UInt(1)) - - @test khash(x, UInt(1)) != khash(x, UInt(2)) - @test khash(y, UInt(1)) != khash(y, UInt(2)) - @test khash(x, UInt(1)) != khash(y, UInt(2)) - - for T in [UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64] - xc = convert(T, x) - yc = convert(T, y) - @test khash(xc) == khash(xc) - @test khash(yc) == khash(yc) - @test khash(xc) != khash(yc) - - @test khash(xc, UInt(1)) == khash(xc, UInt(1)) - @test khash(yc, UInt(1)) == khash(yc, UInt(1)) - @test khash(xc, UInt(1)) != khash(yc, UInt(1)) - - @test khash(xc, UInt(1)) != khash(xc, UInt(2)) - @test khash(yc, UInt(1)) != khash(yc, UInt(2)) - @test khash(xc, UInt(1)) != khash(yc, UInt(2)) - end - end - - @testset "Tuples" begin - z1 = (UInt8(1), UInt8(2), UInt8(3)) - z2 = (UInt8(1), UInt8(2), UInt8(4)) - z3 = (UInt8(1), UInt8(2), UInt16(3)) - - @test khash(z1) == khash(z1) - @test khash(z1, UInt(1)) == khash(z1, UInt(1)) - @test khash(z1, UInt(1)) != khash(z1, UInt(2)) - - @test khash(z1) != khash(z2) - @test khash(z1, UInt(1)) != khash(z2, UInt(1)) - - @test khash(z1) != khash(z3) - @test khash(z1, UInt(1)) != khash(z3, UInt(1)) - end - - @testset "Functions" begin - @test khash(+) == khash(+) - @test khash(+, UInt(1)) == khash(+, UInt(1)) - @test khash(+, UInt(1)) != khash(+, UInt(2)) - @test khash(+) != khash(-) != khash(/) != khash(identity) - - x = 1 - f() = x - @test khash(f) == khash(f) - @test khash(f, UInt(1)) == khash(f, UInt(1)) - - g() = x - @test khash(f) != khash(g) - @test khash(f, UInt(1)) != khash(g, UInt(1)) - end - - @testset "ROCDeviceArray" begin - RA = ROCArray(rand(4)) - DA = rocconvert(RA) - - @test khash(DA) == khash(DA) - @test khash(DA, UInt(1)) == khash(DA, UInt(1)) - @test khash(DA, UInt(1)) != khash(DA, UInt(2)) - - A_hash = khash(DA) - RA .= RA .+ 1 - @test khash(DA) == A_hash - - RB = copy(RA) - DB = rocconvert(RB) - - @test khash(RA) != khash(RB) - end - - @testset "ROCDeviceArray wrappers" begin - RC = ROCArray(rand(4, 4)) - DC = rocconvert(RC) - RCv = @view RC[2:3, 2:3] - DCv = rocconvert(RCv) - - @test khash(DC) != khash(DCv) - @test khash(DC, UInt(1)) != khash(DCv, UInt(1)) - - @test khash(DCv) == khash(DCv) - @test khash(DCv, UInt(1)) == khash(DCv, UInt(1)) - @test khash(DCv, UInt(1)) != khash(DCv, UInt(2)) - end -end diff --git a/test/hsa/memory.jl b/test/hsa/memory.jl deleted file mode 100644 index d600c68e1..000000000 --- a/test/hsa/memory.jl +++ /dev/null @@ -1,278 +0,0 @@ -@testset "Memory" begin - -@testset "Pointer-based" begin - @testset "Mem transfers" begin - src = 42 - - buf1 = Mem.alloc(sizeof(src); coherent=true) - - Mem.set!(buf1, UInt32(57), 1) - x = Mem.download(UInt32, buf1) - @test x[1] == UInt32(57) - - GC.@preserve Mem.upload!(buf1, pointer_from_objref(Ref(src)), sizeof(src)) - - dst1 = Ref(0) - GC.@preserve Mem.download!(pointer_from_objref(dst1), buf1, sizeof(src)) - @test src == dst1[] - - buf2 = Mem.alloc(sizeof(src)) - - Mem.transfer!(buf2, buf1, sizeof(src)) - - dst2 = Ref(0) - GC.@preserve Mem.download!(pointer_from_objref(dst2), buf2, sizeof(src)) - @test src == dst2[] - - Mem.free(buf2) - Mem.free(buf1) - end - - @testset "Unsafe copy h2d and d2h" begin - nx,ny,nz = 7,6,5 - - P = zeros(nx, ny, nz) - P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)] - P = ROCArray(P) - P2 = AMDGPU.zeros(eltype(P),size(P)) - ranges = [1:size(P,1), 1:size(P,2), 1:size(P,3)] - - # init buffers - buf = zeros(size(P)) - dbuf = AMDGPU.zeros(eltype(P),size(P)) - - # lock host pointer and convert it to eltype(buf) - buf_Ptr = convert(Ptr{eltype(buf)}, AMDGPU.Mem.lock(pointer(buf), sizeof(buf), AMDGPU.default_device())) - - # 1. test device to host - P_Ptr = convert(Ptr{eltype(buf)}, pointer(P)) - signal1 = ROCSignal() - Mem.unsafe_copy3d!( - buf_Ptr, P_Ptr, length(ranges[1]), length(ranges[2]), length(ranges[3]); - dstPitch=sizeof(eltype(buf))*length(ranges[1]), dstSlice=sizeof(eltype(buf))*length(ranges[1])*length(ranges[2]), - srcPos=(ranges[1][1], ranges[2][1], ranges[3][1]), srcPitch=sizeof(eltype(buf))*size(P,1), srcSlice=sizeof(eltype(buf))*size(P,1)*size(P,2), - async=true, signal=signal1 - ) - wait(signal1) - @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:])) - - # 2. test host to device - P2_Ptr = convert(Ptr{eltype(buf)}, pointer(P2)) - signal2 = ROCSignal() - Mem.unsafe_copy3d!( - P2_Ptr, buf_Ptr, length(ranges[1]), length(ranges[2]), length(ranges[3]); - dstPos=(ranges[1][1], ranges[2][1], ranges[3][1]), dstPitch=sizeof(eltype(buf))*size(P,1), dstSlice=sizeof(eltype(buf))*size(P,1)*size(P,2), - srcPitch=sizeof(eltype(buf))*length(ranges[1]), srcSlice=sizeof(eltype(buf))*length(ranges[1])*length(ranges[2]), - async=true, signal=signal2 - ) - wait(signal2) - @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:])) - - # unlock host pointer - Mem.unlock(pointer(buf)) - - # 3. test device to device - dbuf_Ptr = convert(Ptr{eltype(dbuf)}, pointer(dbuf)) - signal3 = ROCSignal() - Mem.unsafe_copy3d!( - dbuf_Ptr, P_Ptr, length(ranges[1]), length(ranges[2]), length(ranges[3]); - dstPitch=sizeof(eltype(dbuf))*length(ranges[1]), dstSlice=sizeof(eltype(dbuf))*length(ranges[1])*length(ranges[2]), - srcPos=(ranges[1][1], ranges[2][1], ranges[3][1]), srcPitch=sizeof(eltype(dbuf))*size(P,1), srcSlice=sizeof(eltype(dbuf))*size(P,1)*size(P,2), - async=true, signal=signal3 - ) - wait(signal3) - @test all(Array(dbuf[:]) .== Array(P[ranges[1],ranges[2],ranges[3]][:])) - end -end - -@testset "Array-based" begin - src = [42] - - buf1 = Mem.alloc(src) - - Mem.upload!(buf1, src) - - dst1 = similar(src) - Mem.download!(dst1, buf1) - @test src == dst1 - - buf2 = Mem.upload(src) - - dst2 = similar(src) - Mem.download!(dst2, buf2) - @test src == dst2 - - Mem.free(buf1) -end - -@testset "Type-based" begin - buf = Mem.alloc(Int) - - # there's no type-based upload, duh - src = [42] - Mem.upload!(buf, src) - - dst = Mem.download(eltype(src), buf) - @test src == dst -end - -@testset "Pointer information" begin - default_device = AMDGPU.default_device() - - N = 1024 - a = rand(N) - b = Mem.alloc(default_device, N) - - ptrinfo_host = Mem.pointerinfo(a) - ptrinfo_hsa = Mem.pointerinfo(b) - - @test ptrinfo_host.type == HSA.EXT_POINTER_TYPE_UNKNOWN - @test ptrinfo_hsa.type == HSA.EXT_POINTER_TYPE_HSA - @test_skip ptrinfo_hsa.agentOwner.handle == default_device.agent.handle - - Mem.free(b) -end - -@testset "Page-locked memory (OS allocations)" begin - a = rand(1024) - plocked = Mem.lock(a) - - # NOTE - For a single device, it seems that plocked == pointer(a) - @test Mem.pointerinfo(pointer(a)).type == HSA.EXT_POINTER_TYPE_LOCKED - @test Mem.pointerinfo(plocked).type == HSA.EXT_POINTER_TYPE_LOCKED - @test Mem.pointerinfo(plocked).sizeInBytes == sizeof(a) - - Mem.unlock(a) - @test Mem.pointerinfo(pointer(a)).type == HSA.EXT_POINTER_TYPE_UNKNOWN - @test Mem.pointerinfo(plocked).type == HSA.EXT_POINTER_TYPE_UNKNOWN -end - -@testset "Memory Region Queries" begin - @testset "Region API Queries" begin - for (idx, device) in enumerate(AMDGPU.devices()) - regions = Runtime.regions(device) - regions_global = filter(r->Runtime.region_segment(r) == HSA.REGION_SEGMENT_GLOBAL, regions) - regions_global_coarse_nohost = filter(r->(Runtime.region_global_flags(r) & HSA.REGION_GLOBAL_FLAG_COARSE_GRAINED > 0) && - !Runtime.region_host_accessible(r), regions) - regions_group = filter(r->Runtime.region_segment(r) == HSA.REGION_SEGMENT_GROUP, regions) - regions_finegrained = filter(r->Runtime.region_global_flags(r) & HSA.REGION_GLOBAL_FLAG_FINE_GRAINED > 0, regions) - regions_kernarg = filter(r->Runtime.region_global_flags(r) & HSA.REGION_GLOBAL_FLAG_KERNARG > 0, regions) - - @test length(regions_global) > 0 - if idx == 1 - @test length(regions_global_coarse_nohost) >= 1 - @test length(regions_group) == 1 - else - # BUG: https://github.com/RadeonOpenCompute/ROCR-Runtime/issues/134 - @test length(regions_global_coarse_nohost) == 0 - @test length(regions_group) == 0 - end - @test length(regions_finegrained) > 0 - @test length(regions_kernarg) > 0 - - @test all(r->Runtime.region_size(r) > 0, regions) - - @test all(Runtime.region_runtime_alloc_allowed, regions_global) - @test all(r->Runtime.region_runtime_alloc_granule(r) > 0, regions_global) - @test all(r->Runtime.region_runtime_alloc_alignment(r) > 0, regions_global) - @test all(r->Runtime.region_alloc_max_size(r) > 0, regions_global) - - @test !any(Runtime.region_runtime_alloc_allowed, regions_group) - @test !any(Runtime.region_host_accessible, regions_group) - - @test all(Runtime.region_runtime_alloc_allowed, regions_finegrained) - @test all(Runtime.region_host_accessible, regions_finegrained) - - @test all(Runtime.region_runtime_alloc_allowed, regions_kernarg) - @test all(Runtime.region_host_accessible, regions_kernarg) - - for region in filter(Runtime.region_runtime_alloc_allowed, regions) - buf = Mem.alloc(device, region, 8) - @test buf.ptr != C_NULL - @test !buf.pool_alloc - Mem.free(buf) - end - end - end - @testset "Memory Pool API Queries" begin - for device in AMDGPU.devices() - pools = Runtime.memory_pools(device) - pools_global = filter(r->Runtime.pool_segment(r) == HSA.AMD_SEGMENT_GLOBAL, pools) - pools_group = filter(r->Runtime.pool_segment(r) == HSA.AMD_SEGMENT_GROUP, pools) - - @test length(pools_global) >= 1 - @test length(pools_group) == 1 - - @test all(r->Runtime.pool_size(r) > 0, pools) - @test !any(Runtime.pool_accessible_by_all, pools) - - @test all(Runtime.pool_runtime_alloc_allowed, pools_global) - @test all(r->Runtime.pool_runtime_alloc_granule(r) > 0, pools_global) - @test all(r->Runtime.pool_runtime_alloc_alignment(r) > 0, pools_global) - @test all(r->Runtime.pool_alloc_max_size(r) > 0, pools_global) - - @test !any(Runtime.pool_runtime_alloc_allowed, pools_group) - - for pool in filter(Runtime.pool_runtime_alloc_allowed, pools) - buf = Mem.alloc(device, pool, 8) - @test buf.ptr != C_NULL - @test buf.pool_alloc - Mem.free(buf) - end - end - end -end - -@testset "Exceptions" begin - @test_throws ArgumentError Mem.alloc(Function, 1) # abstract - @test_throws ArgumentError Mem.alloc(Array{Int}, 1) # UnionAll - @test_throws ArgumentError Mem.alloc(Integer, 1) # abstract - # TODO: can we test for the third case? - # !abstract && leaftype seems to imply UnionAll nowadays... - - # zero-width allocations should be permitted - null = Mem.alloc(Int, 0) - Mem.free(null) - - # double-free should throw - x = Mem.alloc(1) - Mem.free(x) - # FIXME: Segfaults... @test_throws HSAError Mem.free(x) -end - -@testset "Mutable structs" begin - @eval mutable struct MutablePtrFree - foo::Int - bar::Int - end - buf = Mem.alloc(MutablePtrFree) - Mem.upload!(buf, [MutablePtrFree(0,0)]) - Mem.free(buf) - - @eval mutable struct MutableNonPtrFree - foo::Int - bar::String - end - @test_throws ArgumentError Mem.alloc(MutableNonPtrFree) -end - -@testset "Retry" begin - device = AMDGPU.default_device() - finegrained_region = Runtime.get_region(device, :finegrained) - coarsegrained_pool = Runtime.get_memory_pool(device, :coarsegrained) - finegrained_max = Runtime.region_size(finegrained_region) - coarsegrained_max = Runtime.pool_size(coarsegrained_pool) - - if coarsegrained_max < finegrained_max - @testset "Coherent Fallback" begin - # This will still work because we fallback to coherent allocations - A = ROCVector{UInt8}(undef, Int(coarsegrained_max+8)) - @test A.buf.coherent - A = nothing - end - else - @test_skip "Coherent Fallback" - end -end - -end diff --git a/test/hsa/queue.jl b/test/hsa/queue.jl deleted file mode 100644 index 55a990a8d..000000000 --- a/test/hsa/queue.jl +++ /dev/null @@ -1,9 +0,0 @@ -@testset "Queues" begin - @testset "Priorities" begin - # Test that priorities can be set - for priority in (:low, :normal, :high) - ROCQueue(; priority) - end - @test_throws ArgumentError ROCQueue(; priority=:fake) - end -end diff --git a/test/rocarray/base.jl b/test/rocarray/base.jl index 1861e712c..d07f1f1ac 100644 --- a/test/rocarray/base.jl +++ b/test/rocarray/base.jl @@ -58,39 +58,40 @@ end end end -@testset "unsafe_wrap" begin - A = rand(4, 3) - A_orig = copy(A) - RA = Base.unsafe_wrap(ROCArray, pointer(A), size(A)) - @test RA.buf.device == AMDGPU.default_device() - @test RA isa ROCArray{Float64,2} - - # GPU pointer works - RA .+= 1.0 - - # Host pointer is updated - @test A ≈ A_orig .+ 1.0 - - # Base.show - @test (println(devnull, RA); true) - - # Mem.download! - B = zeros(4, 3) - copyto!(B, RA) - @test B ≈ Array(RA) - - # Mem.upload! - C = rand(4, 3) - copyto!(RA, C) - @test Array(RA) ≈ C - - # Mem.transfer! - D = rand(4, 3) - D_orig = copy(D) - RD = Base.unsafe_wrap(ROCArray, pointer(D), size(D)) - copyto!(RD, RA) - @test Array(RD) ≈ Array(RA) ≈ C -end +# FIXME +# @testset "unsafe_wrap" begin +# A = rand(4, 3) +# A_orig = copy(A) +# RA = Base.unsafe_wrap(ROCArray, pointer(A), size(A)) +# @test RA.buf.device == AMDGPU.default_device() +# @test RA isa ROCArray{Float64,2} + +# # GPU pointer works +# RA .+= 1.0 + +# # Host pointer is updated +# @test A ≈ A_orig .+ 1.0 + +# # Base.show +# @test (println(devnull, RA); true) + +# # Mem.download! +# B = zeros(4, 3) +# copyto!(B, RA) +# @test B ≈ Array(RA) + +# # Mem.upload! +# C = rand(4, 3) +# copyto!(RA, C) +# @test Array(RA) ≈ C + +# # Mem.transfer! +# D = rand(4, 3) +# D_orig = copy(D) +# RD = Base.unsafe_wrap(ROCArray, pointer(D), size(D)) +# copyto!(RD, RA) +# @test Array(RD) ≈ Array(RA) ≈ C +# end @testset "unsafe_free" begin A = AMDGPU.ones(4, 3) @@ -111,7 +112,7 @@ end A = AMDGPU.ones(16) @test refcount_live(A) == (1, true) B = f(A) - @test A.buf.base_ptr == B.buf.base_ptr + @test A.buf.ptr == B.buf.ptr @test refcount_live(A) == refcount_live(B) @test refcount_live(B) == (2-switch, true) finalize(B) diff --git a/test/rocarray/broadcast.jl b/test/rocarray/broadcast.jl index 2abd13471..2c6b21070 100644 --- a/test/rocarray/broadcast.jl +++ b/test/rocarray/broadcast.jl @@ -1,41 +1,41 @@ @testset "broadcast" begin - @test testf((x) -> fill!(x, 1), rand(3,3)) - @test testf((x, y) -> map(+, x, y), rand(2, 3), rand(2, 3)) - @test testf((x) -> sin.(x), rand(2, 3)) - @test testf((x) -> log.(x) .+ 1, rand(2, 3)) - @test testf((x) -> 2x, rand(2, 3)) - @test testf((x) -> x .^ 0, rand(2, 3)) - @test testf((x) -> x .^ 1, rand(2, 3)) - @test testf((x) -> x .^ 2, rand(2, 3)) - @test testf((x) -> x .^ 3, rand(2, 3)) - @test testf((x) -> x .^ 5, rand(2, 3)) - @test testf((x) -> (z = Int32(5); x .^ z), rand(2, 3)) - @test testf((x) -> (z = Float64(π); x .^ z), rand(2, 3)) - @test testf((x) -> (z = Float32(π); x .^ z), rand(Float32, 2, 3)) - @test testf((x, y) -> x .+ y, rand(2, 3), rand(1, 3)) - @test testf((z, x, y) -> z .= x .+ y, rand(2, 3), rand(2, 3), rand(2)) - @test (ROCArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == ROCArray([C_NULL]) - @test ROCArray([1,2,3]) .+ ROCArray([1.0,2.0,3.0]) == ROCArray([2,4,6]) + @test testf((x) -> fill!(x, 1), rand(3,3)) + @test testf((x, y) -> map(+, x, y), rand(2, 3), rand(2, 3)) + @test testf((x) -> sin.(x), rand(2, 3)) + @test testf((x) -> log.(x) .+ 1, rand(2, 3)) + @test testf((x) -> 2x, rand(2, 3)) + @test testf((x) -> x .^ 0, rand(2, 3)) + @test testf((x) -> x .^ 1, rand(2, 3)) + @test testf((x) -> x .^ 2, rand(2, 3)) + @test testf((x) -> x .^ 3, rand(2, 3)) + @test testf((x) -> x .^ 5, rand(2, 3)) + @test testf((x) -> (z = Int32(5); x .^ z), rand(2, 3)) + @test testf((x) -> (z = Float64(π); x .^ z), rand(2, 3)) + @test testf((x) -> (z = Float32(π); x .^ z), rand(Float32, 2, 3)) + @test testf((x, y) -> x .+ y, rand(2, 3), rand(1, 3)) + @test testf((z, x, y) -> z .= x .+ y, rand(2, 3), rand(2, 3), rand(2)) + @test (ROCArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == ROCArray([C_NULL]) + @test ROCArray([1,2,3]) .+ ROCArray([1.0,2.0,3.0]) == ROCArray([2,4,6]) - @eval struct Whatever{T} - x::Int - end - @test Array(Whatever{Int}.(ROCArray([1]))) == Whatever{Int}.([1]) + @eval struct Whatever{T} + x::Int + end + @test Array(Whatever{Int}.(ROCArray([1]))) == Whatever{Int}.([1]) end # https://github.com/JuliaGPU/CUDA.jl/issues/223 @testset "Ref Broadcast" begin - foobar(idx, A) = A[idx] - @test ROCArray([42]) == foobar.(ROCArray([1]), Base.RefValue(ROCArray([42]))) + foobar(idx, A) = A[idx] + @test ROCArray([42]) == foobar.(ROCArray([1]), Base.RefValue(ROCArray([42]))) end @testset "Broadcast Fix" begin - @test testf(x -> log.(x), rand(3,3)) - @test testf((x,xs) -> log.(x.+xs), Ref(1), rand(3,3)) + @test testf(x -> log.(x), rand(3,3)) + @test testf((x,xs) -> log.(x.+xs), Ref(1), rand(3,3)) end # https://github.com/JuliaGPU/CUDA.jl/issues/261 @testset "Broadcast Ref{<:Type}" begin - A = ROCArray{ComplexF64}(undef, (2,2)) - @test eltype(convert.(ComplexF32, A)) == ComplexF32 + A = ROCArray{ComplexF64}(undef, (2,2)) + @test eltype(convert.(ComplexF32, A)) == ComplexF32 end diff --git a/test/rocarray/nmf.jl b/test/rocarray/nmf.jl deleted file mode 100644 index a9d37a9b9..000000000 --- a/test/rocarray/nmf.jl +++ /dev/null @@ -1,31 +0,0 @@ -@testset "NMF" begin - -## A simple NMF implementation, which is useful to test mark/wait - -function step(X, W, H) - # H update - H = (H .* (W' * (X ./ (W * H))) - ./ (sum(W; dims=1))') - # W update - W = (W .* ((X ./ (W * H)) * (H')) - ./ (sum(H; dims=2)')) - # error estimate - X - W * H -end - -for scale in (1:5:50) - ncol = 2001 - nrow = 1002*scale - nfeatures = 12 - X = rand(Float32, nrow, ncol) - W = rand(Float32, nrow, nfeatures) - H = rand(Float32, nfeatures, ncol) - cpu_res = step(X, W, H) - RX = ROCArray(X) - RW = ROCArray(W) - RH = ROCArray(H) - gpu_res = step(RX, RW, RH) - @test Array(gpu_res) ≈ cpu_res -end - -end diff --git a/test/runtests.jl b/test/runtests.jl index 802f966a3..5ec43d1b6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -6,11 +6,13 @@ include("setup.jl") @testset "AMDGPU" begin # Run tests in parallel -np = Threads.nthreads() + +# FIXME +# HostCall tests hang with multiple workers. +np = 1 # Threads.nthreads() ws = Int[] ws_pids = Int[] if np == 1 - include("setup.jl") push!(ws, 1) push!(ws_pids, getpid()) else @@ -44,46 +46,38 @@ tasks = Dict{Int,String}() @info "Testing using device $(AMDGPU.default_device())" AMDGPU.versioninfo() -@info "Running tests with $(length(ws)) workers" +@info "Running tests with $(length(ws)) workers and $(Threads.nthreads()) threads." push!(tests, "HSA" => ()->begin - include("hsa/error.jl") include("hsa/utils.jl") include("hsa/getinfo.jl") include("hsa/device.jl") - include("hsa/queue.jl") - include("hsa/memory.jl") - include("hsa/hashing.jl") end) push!(tests, "Codegen" => ()->begin include("codegen/synchronization.jl") include("codegen/trap.jl") end) -if AMDGPU.Runtime.LOGGING_STATIC_ENABLED - push!(tests, "Logging" => ()->include("logging.jl")) -else - @warn """ - Logging is statically disabled, skipping logging tests. - This can be fixed by calling `AMDGPU.Runtime.enable_logging!()` and re-running tests. - """ - @test_skip "Logging" -end +# if AMDGPU.Runtime.LOGGING_STATIC_ENABLED +# push!(tests, "Logging" => ()->include("logging.jl")) +# else +# @warn """ +# Logging is statically disabled, skipping logging tests. +# This can be fixed by calling `AMDGPU.Runtime.enable_logging!()` and re-running tests. +# """ +# @test_skip "Logging" +# end push!(tests, "Device Functions" => ()->begin include("device/launch.jl") include("device/array.jl") include("device/vadd.jl") include("device/memory.jl") include("device/indexing.jl") - include("device/hostcall.jl") - include("device/output.jl") - include("device/globals.jl") include("device/math.jl") include("device/wavefront.jl") include("device/execution_control.jl") include("device/exceptions.jl") - # FIXME segfaults in a weird way (on check_ir) - # include("device/deps.jl") - include("device/queries.jl") + include("device/hostcall.jl") + include("device/output.jl") end) push!(tests, "Multitasking" => ()->include("tls.jl")) push!(tests, "ROCArray - Base" => ()->include("rocarray/base.jl")) @@ -92,7 +86,7 @@ if CI push!(tests, "ROCm libraries are functional" => ()->begin @test AMDGPU.functional(:rocblas) @test AMDGPU.functional(:rocrand) - if !AMDGPU.use_artifacts + if !AMDGPU.use_artifacts() # We don't have artifacts for these @test AMDGPU.functional(:rocfft) end @@ -112,20 +106,14 @@ push!(tests, "rocRAND" => ()->begin @test_skip "rocRAND" end end) -push!(tests, "rocFFT" => ()->begin - if AMDGPU.functional(:rocfft) - include("rocarray/fft.jl") - else - @test_skip "rocFFT" - end -end) -push!(tests, "NMF" => ()->begin - if AMDGPU.functional(:rocblas) - include("rocarray/nmf.jl") - else - @test_skip "NMF" - end -end) +# # FIXME outdated library +# push!(tests, "rocFFT" => ()->begin +# if AMDGPU.functional(:rocfft) +# include("rocarray/fft.jl") +# else +# @test_skip "rocFFT" +# end +# end) push!(tests, "MIOpen" => ()->begin if AMDGPU.functional(:MIOpen) include("dnn/miopen.jl") @@ -134,14 +122,23 @@ push!(tests, "MIOpen" => ()->begin end end) push!(tests, "External Packages" => ()->include("external/forwarddiff.jl")) -for (i, name) in enumerate(keys(TestSuite.tests)) - push!(tests, "GPUArrays TestSuite - $name" => - ()->TestSuite.tests[name](ROCArray)) +for (i, name) in enumerate(sort(collect(keys(TestSuite.tests)))) + push!(tests, "GPUArrays TestSuite - $name" => () -> begin + TestSuite.tests[name](ROCArray) + # Multidimensional indexing contains boxing, + # launching global malloc hostcall. + # Synchronize to disable it. + if name == "indexing multidimensional" + AMDGPU.synchronize(; blocking=false) + end + end) end -push!(tests, "KernelAbstractions" => ()->begin +push!(tests, "KernelAbstractions" => ()-> begin Testsuite.testsuite( ROCBackend, "ROCM", AMDGPU, ROCArray, AMDGPU.ROCDeviceArray; - skip_tests=Set(["sparse"])) + skip_tests=Set(["Printing", "sparse"])) # TODO fix KA printing + # Disable global malloc hostcall started by conversion tests. + AMDGPU.synchronize(; blocking=false) end) function run_worker(w) diff --git a/test/setup.jl b/test/setup.jl index 89e49b212..be69fbaea 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -1,8 +1,5 @@ using AMDGPU -using AMDGPU: Runtime, Mem, Device, HSA, AS -if AMDGPU.functional(:hip) - using AMDGPU: HIP -end +using AMDGPU: Runtime, Mem, Device, HIP, HSA, AS using GPUCompiler using LinearAlgebra using LLVM, LLVM.Interop @@ -33,7 +30,7 @@ if isdefined(TestSuite, :WrapArray) end import AMDGPU: allowscalar, @allowscalar -import AMDGPU.Device: HostCall, hostcall! +import AMDGPU.Device: HostCallHolder, hostcall! allowscalar(false) CI = parse(Bool, get(ENV, "CI", "false")) @@ -43,7 +40,4 @@ if CI AMDGPU.Runtime.EXIT_ON_MEMORY_FAULT[] = true end -Runtime.DEFAULT_SIGNAL_TIMEOUT[] = 5.0 Device.DEFAULT_HOSTCALL_TIMEOUT[] = 5.0 - -const IS_NAVI_2 = AMDGPU.default_device().name in ("gfx1030", "gfx1031", "gfx1032") diff --git a/test/tls.jl b/test/tls.jl index c6d0f31d1..52af31d88 100644 --- a/test/tls.jl +++ b/test/tls.jl @@ -17,16 +17,13 @@ end @testset "Basics" begin device = @inferred AMDGPU.device() - @test device isa ROCDevice + @test device isa HIPDevice @test device === AMDGPU.Runtime.get_default_device() context = @inferred AMDGPU.context() @test context isa HIPContext @test AMDGPU.device_id(AMDGPU.device(context)) == AMDGPU.device_id(device) - queue = @inferred AMDGPU.queue() - @test queue isa ROCQueue - stream = @inferred AMDGPU.stream() @test stream isa HIPStream @test AMDGPU.device_id(AMDGPU.device(context)) == AMDGPU.device_id(device) @@ -34,96 +31,78 @@ end tls = @inferred AMDGPU.task_local_state() @test tls isa AMDGPU.TaskLocalState @test device === tls.device - @test queue === tls.queue - @test queue.priority == tls.priority @test stream === tls.stream @test stream.priority == tls.priority @test context === tls.context end if length(AMDGPU.devices()) > 1 -@testset "Devices" begin - dev1 = AMDGPU.devices()[1] - tls1 = copy(AMDGPU.task_local_state()) - @assert tls1.device === dev1 - dev2 = AMDGPU.devices()[2] - AMDGPU.device!(dev2) - tls2 = copy(AMDGPU.task_local_state()) - AMDGPU.device!(dev1) - tls3 = copy(AMDGPU.task_local_state()) - - @test tls2.device === dev2 - @test tls1.device !== tls2.device - @test tls1.context !== tls2.context - @test tls2.queue isa ROCQueue - @test tls1.queue !== tls2.queue - @test tls2.stream isa HIPStream - @test tls1.stream !== tls2.stream - @test AMDGPU.device(tls2.queue) == dev2 - @test AMDGPU.device_id(AMDGPU.device(tls2.context)) == 2 - @test AMDGPU.device_id(AMDGPU.device(tls2.stream)) == 2 - - @test tls3.device === dev1 - @test tls1.device === tls3.device - @test tls1.context === tls3.context - @test tls3.queue isa ROCQueue - @test tls1.queue === tls3.queue - @test tls2.stream isa HIPStream - @test tls1.stream === tls3.stream - @test AMDGPU.device(tls3.queue) == dev1 - @test AMDGPU.device_id(AMDGPU.device(tls3.context)) == 1 - @test AMDGPU.device_id(AMDGPU.device(tls3.stream)) == 1 -end + @testset "Devices" begin + dev1 = AMDGPU.devices()[1] + tls1 = copy(AMDGPU.task_local_state()) + @assert tls1.device === dev1 + dev2 = AMDGPU.devices()[2] + AMDGPU.device!(dev2) + tls2 = copy(AMDGPU.task_local_state()) + AMDGPU.device!(dev1) + tls3 = copy(AMDGPU.task_local_state()) + + @test tls2.device === dev2 + @test tls1.device !== tls2.device + @test tls1.context !== tls2.context + @test tls2.stream isa HIPStream + @test tls1.stream !== tls2.stream + @test AMDGPU.device_id(AMDGPU.device(tls2.context)) == 2 + @test AMDGPU.device_id(AMDGPU.device(tls2.stream)) == 2 + + @test tls3.device === dev1 + @test tls1.device === tls3.device + @test tls1.context === tls3.context + @test tls2.stream isa HIPStream + @test tls1.stream === tls3.stream + @test AMDGPU.device_id(AMDGPU.device(tls3.context)) == 1 + @test AMDGPU.device_id(AMDGPU.device(tls3.stream)) == 1 + end else -@test_skip "Devices" + @test_skip "Devices" end -@testset "Queues/Streams" begin +@testset "Streams" begin tls1 = copy(AMDGPU.task_local_state()) - queue1 = AMDGPU.queue() - @test tls1.queue === queue1 === AMDGPU.queue() stream1 = AMDGPU.stream() @test tls1.stream === stream1 === AMDGPU.stream() - @test tls1.priority == queue1.priority == stream1.priority == :normal + @test tls1.priority == stream1.priority == :normal tls2 = async_tls() @test tls2.device === tls1.device @test tls2.context === tls1.context - @test tls2.queue !== tls1.queue @test tls2.stream !== tls1.stream @test tls2.priority == :normal tls3 = copy(AMDGPU.task_local_state()) - @test tls3.queue === queue1 === AMDGPU.queue() @test tls3.stream === stream1 === AMDGPU.stream() @testset "Priorities" begin AMDGPU.priority!(:high) tlsh = copy(AMDGPU.task_local_state()) @test tlsh.priority == :high - @test tlsh.queue !== tls1.queue - @test tlsh.queue.priority == :high @test tlsh.stream !== tls1.stream @test tlsh.stream.priority == :high AMDGPU.priority!(:low) tlsl = copy(AMDGPU.task_local_state()) @test tlsl.priority == :low - @test tlsl.queue !== tls1.queue - @test tlsl.queue.priority == :low @test tlsl.stream !== tls1.stream @test tlsl.stream.priority == :low AMDGPU.priority!(:normal) tlsn = copy(AMDGPU.task_local_state()) @test tlsn.priority == :normal - @test tlsn.queue.priority == :normal @test tlsn.stream.priority == :normal AMDGPU.priority!(:high) tlsn2 = async_tls() @test tlsn2.priority == :normal - @test tlsn2.queue.priority == :normal @test tlsn2.stream.priority == :normal AMDGPU.priority!(:normal) @@ -131,7 +110,6 @@ end AMDGPU.priority!(:high) end @test tlsh2.priority == :high - @test tlsh2.queue.priority == :high @test tlsh2.stream.priority == :high @test AMDGPU.task_local_state().priority == :normal end