diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index a04fd4982..4353add8a 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -18,65 +18,28 @@ steps:
     agents:
       queue: "juliagpu"
       rocm: "*"
+      rocmgpu: "gfx1031"
     if: build.message !~ /\[skip docs\]/
     timeout_in_minutes: 10
-  #  - label: "Julia 1.8 - GPUArrays 8"
-  #    plugins:
-  #      - JuliaCI/julia#v1:
-  #          version: 1.8
-  #      - JuliaCI/julia-test#v1:
-  #      - JuliaCI/julia-coverage#v1:
-  #          codecov: true
-  #    agents:
-  #      queue: "juliagpu"
-  #      rocm: "*"
-  #      rocmgpu: "*"
-  #    if: build.message !~ /\[skip tests\]/
-  #    command: "julia --project -e 'using Pkg; Pkg.update()'"
-  #    soft_fail: true
-  #    timeout_in_minutes: 180
-  #    env:
-  #      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
-  #      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
-  #
-  #  - label: "Julia 1.8 - GPUArrays 8 - No artifacts"
-  #    plugins:
-  #      - JuliaCI/julia#v1:
-  #          version: 1.8
-  #      - JuliaCI/julia-test#v1:
-  #      - JuliaCI/julia-coverage#v1:
-  #          codecov: true
-  #    agents:
-  #      queue: "juliagpu"
-  #      rocm: "*"
-  #      rocmgpu: "*"
-  #    if: build.message !~ /\[skip tests\]/
-  #    command: "julia --project -e 'using Pkg; Pkg.update()'"
-  #    soft_fail: true
-  #    timeout_in_minutes: 180
-  #    env:
-  #      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
-  #      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
-  #      JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
-  #
-  - label: "Julia 1.9 - GPUArrays 8"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.9-nightly
-      - JuliaCI/julia-test#v1:
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-    agents:
-      queue: "juliagpu"
-      julia_1.9: "*"
-      rocm: "*"
-      rocmgpu: "*"
-    if: build.message !~ /\[skip tests\]/
-    command: "julia --project -e 'using Pkg; Pkg.update()'"
-    timeout_in_minutes: 180
-    env:
-      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
-      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
+
+  # - label: "Julia 1.9 - GPUArrays 8"
+  #   plugins:
+  #     - JuliaCI/julia#v1:
+  #         version: 1.9-nightly
+  #     - JuliaCI/julia-test#v1:
+  #     - JuliaCI/julia-coverage#v1:
+  #         codecov: true
+  #   agents:
+  #     queue: "juliagpu"
+  #     julia_1.9: "*"
+  #     rocm: "*"
+  #     rocmgpu: "*"
+  #   if: build.message !~ /\[skip tests\]/
+  #   command: "julia --project -e 'using Pkg; Pkg.update()'"
+  #   timeout_in_minutes: 180
+  #   env:
+  #     JULIA_AMDGPU_CORE_MUST_LOAD: "1"
+  #     JULIA_AMDGPU_HIP_MUST_LOAD: "1"
 
   - label: "Julia 1.9 - GPUArrays 8 - No Artifacts"
     plugins:
@@ -89,74 +52,16 @@ steps:
       queue: "juliagpu"
       julia_1.9: "*"
       rocm: "*"
-      rocmgpu: "*"
+      rocmgpu: "gfx1031"
     if: build.message !~ /\[skip tests\]/
     command: "julia --project -e 'using Pkg; Pkg.update()'"
     timeout_in_minutes: 180
     env:
+      JULIA_NUM_THREADS: 8
       JULIA_AMDGPU_CORE_MUST_LOAD: "1"
       JULIA_AMDGPU_HIP_MUST_LOAD: "1"
       JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
 
-  - label: "Julia 1.9 - GPUArrays 8 - HIP Malloc"
-    plugins:
-      - JuliaCI/julia#v1:
-          version: 1.9-nightly
-      - JuliaCI/julia-test#v1:
-      - JuliaCI/julia-coverage#v1:
-          codecov: true
-    agents:
-      queue: "juliagpu"
-      julia_1.9: "*"
-      rocm: "*"
-      rocmgpu: "*"
-    if: build.message !~ /\[skip tests\]/
-    command: "julia --project -e 'using Pkg; Pkg.update()'"
-    timeout_in_minutes: 180
-    env:
-      JULIA_AMDGPU_CORE_MUST_LOAD: "1"
-      JULIA_AMDGPU_HIP_MUST_LOAD: "1"
-      JULIA_AMDGPU_USE_HIP_MALLOC_OVERRIDE: "1"
-
-  # - label: "Julia nightly - GPUArrays 8"
-  #   plugins:
-  #     - JuliaCI/julia#v1:
-  #         version: nightly
-  #     - JuliaCI/julia-test#v1:
-  #     - JuliaCI/julia-coverage#v1:
-  #         codecov: true
-  #   agents:
-  #     queue: "juliagpu"
-  #     rocm: "*"
-  #     rocmgpu: "*"
-  #   if: build.message !~ /\[skip tests\]/
-  #   command: "julia --project -e 'using Pkg; Pkg.update()'"
-  #   soft_fail: true
-  #   timeout_in_minutes: 180
-  #   env:
-  #     JULIA_AMDGPU_CORE_MUST_LOAD: "1"
-  #     JULIA_AMDGPU_HIP_MUST_LOAD: "1"
-
-  # - label: "Julia nightly - GPUArrays 8 - No Artifacts"
-  #   plugins:
-  #     - JuliaCI/julia#v1:
-  #         version: nightly
-  #     - JuliaCI/julia-test#v1:
-  #     - JuliaCI/julia-coverage#v1:
-  #         codecov: true
-  #   agents:
-  #     queue: "juliagpu"
-  #     rocm: "*"
-  #     rocmgpu: "*"
-  #   if: build.message !~ /\[skip tests\]/
-  #   command: "julia --project -e 'using Pkg; Pkg.update()'"
-  #   soft_fail: true
-  #   timeout_in_minutes: 180
-  #   env:
-  #     JULIA_AMDGPU_CORE_MUST_LOAD: "1"
-  #     JULIA_AMDGPU_HIP_MUST_LOAD: "1"
-  #     JULIA_AMDGPU_DISABLE_ARTIFACTS: "1"
-
 env:
   JULIA_AMDGPU_LOGGING_ENABLED: true
   SECRET_CODECOV_TOKEN: "lVqFGgrywYmQrILXBcP8i6TosP+q/W2oTDVLIdkWFWscd/a61oSVb8Tycq3qvngsrdmKU9EevdQo+1x+w7cu4IuTq63ahQc0RFgi4Q29hC52OgN2wFql984Cqq3T96P3jyV0ZljaRT+a+9AY0oWmmCph55amvvQ4DOMq3tfGDbp7gdueQvJmSYQGVT3/9Sjn4/esYppcKBGltQqQX2E7WrHLpnqRmsmjcSeZ/S/+PgPRb4ZnpBecAUP2d/MlPgKfP0ZUGbDlcbGu+ZDZNksxKIYuAlNrWPhpNAro7hACfEk4T5RRpNiwmJyXJZ8LUD8zNYIUKSmHjUtmqhNXgujWXA==;U2FsdGVkX1/v/P2Y7KZsvC55Au6eET37uDE6M5I6J275maix+SMD0EoJQ19cFp/lae+G8V7dvpPGfrh4hj2nOg=="
diff --git a/.github/workflows/CompatHelper.yml b/.github/workflows/CompatHelper.yml
index e7487db37..2aa3ba45b 100644
--- a/.github/workflows/CompatHelper.yml
+++ b/.github/workflows/CompatHelper.yml
@@ -19,7 +19,7 @@ jobs:
       - uses: julia-actions/setup-julia@v1
         with:
           # version: ${{ steps.julia_compat.outputs.version }}
-          version: "1.7-nightly"
+          version: "1.9"
       - name: Pkg.add("CompatHelper")
         run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
       - name: CompatHelper.main()
diff --git a/.github/workflows/ci-julia-1.7.yml b/.github/workflows/ci-julia-1.7.yml
index ebc92e08f..a51440d92 100644
--- a/.github/workflows/ci-julia-1.7.yml
+++ b/.github/workflows/ci-julia-1.7.yml
@@ -1,4 +1,4 @@
-name: CI (Julia 1.7)
+name: CI (Julia 1.9)
 on:
   push:
     branches:
@@ -9,14 +9,14 @@ defaults:
   run:
     shell: bash
 jobs:
-  CI-julia-1-7:
-    name: CI-julia-1-7
+  CI-julia-1-0:
+    name: CI-julia-1-9
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
         version:
-          - '1.7'
+          - '1.9'
         os:
           - ubuntu-latest
           - macOS-latest
diff --git a/Project.toml b/Project.toml
index 33272ff4c..e3d8998d4 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "AMDGPU"
 uuid = "21141c5a-9bdb-4563-92ae-f87d6854732e"
 authors = ["Julian P Samaroo <jpsamaroo@jpsamaroo.me>", "Valentin Churavy <v.churavy@gmail.com>", "Anton Smirnov <tonysmn97@gmail.com>"]
-version = "0.4.15"
+version = "0.5.0"
 
 [deps]
 AbstractFFTs = "621f4979-c628-5d54-868e-fcf4e3e8185c"
@@ -19,14 +19,11 @@ LLVM_jll = "86de99a1-58d6-5da7-8064-bd56ce2e322c"
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
-MsgPack = "99f44e22-a591-53d1-9472-aa23ef4bd671"
-ObjectFile = "d8793406-e978-5875-9003-1fc021f44a92"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
 ROCmDeviceLibs_jll = "873c0968-716b-5aa7-bb8d-d1e2e2aeff2d"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 TimespanLogging = "a526e669-04d3-4846-9525-c66122c55f63"
@@ -37,29 +34,26 @@ rocRAND_jll = "a6151927-a32b-54c0-bc8c-bbd7b3f1a996"
 rocSPARSE_jll = "8c6ce2ba-659c-5ec7-ba4c-37596cf1f22a"
 
 [compat]
-AbstractFFTs = "0.5, 1.0"
+AbstractFFTs = "1.0"
 Adapt = "3.0"
 BinaryProvider = "0.5"
-CEnum = "0.2, 0.3, 0.4"
+CEnum = "0.4"
 ExprTools = "0.1"
-GPUArrays = "6, 7, 8"
-GPUCompiler = "0.19"
-HIP_jll = "4, 5"
+GPUArrays = "8"
+GPUCompiler = "0.21"
+HIP_jll = "5"
 KernelAbstractions = "0.9.2"
-LLD_jll = "12, 13, 14, 15"
-LLVM = "5"
-LLVM_jll = "12, 13, 14, 15"
+LLD_jll = "14, 15"
+LLVM = "6"
+LLVM_jll = "14, 15"
 MacroTools = "0.5"
-MsgPack = "1"
-ObjectFile = "0.3"
 Preferences = "1"
-ROCmDeviceLibs_jll = "4, 5"
-Setfield = "0.5, 0.6, 0.7, 1.0"
-SpecialFunctions = "1, 2"
+ROCmDeviceLibs_jll = "5"
+SpecialFunctions = "2"
 TimespanLogging = "0.1"
 UnsafeAtomicsLLVM = "0.1"
-hsa_rocr_jll = "4, 5"
-julia = "1.7"
-rocBLAS_jll = "4, 5"
-rocRAND_jll = "4, 5"
-rocSPARSE_jll = "4, 5"
+hsa_rocr_jll = "5"
+julia = "1.9"
+rocBLAS_jll = "5"
+rocRAND_jll = "5"
+rocSPARSE_jll = "5"
diff --git a/README.md b/README.md
index 9a610bf0d..b01ef015a 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,6 @@
 [docs-dev-url]: https://amdgpu.juliagpu.org/dev
 
 
-
 ## Quick start
 
 AMDGPU.jl can be installed with the Julia package manager.
@@ -38,7 +37,7 @@ julia> import Pkg; Pkg.add("AMDGPU")
 
 ## Project Status
 
-The package is tested against, and being developed for, Julia 1.7, 1.9, and above.
+The package is tested against and being developed for Julia 1.9, and above.
 *Julia 1.8 is not supported and should not be used with AMDGPU.jl, as Julia 1.8
 cannot properly handle the code generated by AMDGPU.jl.*
 
diff --git a/deps/.gitignore b/deps/.gitignore
deleted file mode 100644
index bb75281e1..000000000
--- a/deps/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-build.log
-runtime/
-usr/
-ext.jl
-ext.jl.bak
-deps.jl
diff --git a/deps/bindeps.jl b/deps/bindeps.jl
deleted file mode 100644
index d1025101f..000000000
--- a/deps/bindeps.jl
+++ /dev/null
@@ -1,399 +0,0 @@
-# copied from CUDAdrv/deps/build.jl
-
-using Libdl
-import Base: @invokelatest
-using Preferences
-
-enable_artifacts!(flag::Bool=true) =
-    @set_preferences!("use_artifacts" => flag)
-if haskey(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS")
-    enable_artifacts!(!parse(Bool, get(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS", "false")))
-end
-const use_artifacts = @load_preference("use_artifacts", true)
-
-## library finding
-
-function find_roc_paths()
-    paths = split(get(ENV, "LD_LIBRARY_PATH", ""), ":")
-    paths = filter(path->path != "", paths)
-    paths = map(Base.Filesystem.abspath, paths)
-    push!(paths, "/opt/rocm/lib") # shim for Ubuntu rocm packages...
-    if haskey(ENV, "ROCM_PATH")
-        push!(paths, joinpath(ENV["ROCM_PATH"], "lib"))
-    end
-    return filter(isdir, paths)
-end
-
-function find_rocm_library(lib, dirs, ext=dlext)
-    path = Libdl.find_library(lib)
-    if path != ""
-        return Libdl.dlpath(path)
-    end
-    for dir in dirs
-        files = readdir(dir)
-        for file in files
-            matched = startswith(basename(file), lib*".$ext")
-            if matched
-                return joinpath(dir, file)
-            end
-        end
-    end
-    return ""
-end
-function find_rocm_library(libs::Vector, dirs)
-    for lib in libs
-        path = find_rocm_library(lib, dirs)
-        if path != ""
-            return path
-        end
-    end
-    return ""
-end
-
-function find_ld_lld()
-    paths = split(get(ENV, "PATH", ""), ":")
-    paths = filter(path->path != "", paths)
-    paths = map(Base.Filesystem.abspath, paths)
-    basedir = get(ENV, "ROCM_PATH", "/opt/rocm")
-    ispath(joinpath(basedir, "llvm/bin/ld.lld")) &&
-        push!(paths, joinpath(basedir, "llvm/bin/"))
-    ispath(joinpath(basedir, "hcc/bin/ld.lld")) &&
-        push!(paths, joinpath(basedir, "/hcc/bin/"))
-    ispath(joinpath(basedir, "opencl/bin/x86_64/ld.lld")) &&
-        push!(paths, joinpath(basedir, "opencl/bin/x86_64/"))
-    for path in paths
-        exp_ld_path = joinpath(path, "ld.lld")
-        if ispath(exp_ld_path)
-            try
-                tmpfile = mktemp()
-                run(pipeline(`$exp_ld_path -v`; stdout=tmpfile[1]))
-                vstr = read(tmpfile[1], String)
-                rm(tmpfile[1])
-                vstr = replace(vstr, "AMD " => "")
-                vstr_splits = split(vstr, ' ')
-                if VersionNumber(vstr_splits[2]) >= v"6.0.0"
-                    return exp_ld_path
-                end
-            catch
-                @debug "bindeps: Failed running ld.lld in $exp_ld_path"
-            end
-        end
-    end
-    return ""
-end
-
-function find_device_libs()
-    # Might be set by tools like Spack or the user
-    hip_devlibs_path = get(ENV, "HIP_DEVICE_LIB_PATH", "")
-    hip_devlibs_path !== "" && return hip_devlibs_path
-    devlibs_path = get(ENV, "DEVICE_LIB_PATH", "")
-    devlibs_path !== "" && return devlibs_path
-
-    # The canonical location
-    if isdir("/opt/rocm/amdgcn/bitcode")
-        return "/opt/rocm/amdgcn/bitcode"
-    end
-
-    # Search relative to LD_LIBRARY_PATH entries
-    paths = split(get(ENV, "LD_LIBRARY_PATH", ""), ":")
-    paths = filter(path->path != "", paths)
-    paths = map(Base.Filesystem.abspath, paths)
-    for path in paths
-        bitcode_path = joinpath(path, "../amdgcn/bitcode/")
-        if ispath(bitcode_path)
-            if isfile(joinpath(bitcode_path, "ocml.bc")) ||
-               isfile(joinpath(bitcode_path, "ocml.amdgcn.bc"))
-               return bitcode_path
-            end
-        end
-    end
-    return nothing
-end
-
-function detect_projects()
-    amdgpu_project = normpath(joinpath(@__DIR__, ".."))
-    current_project = Base.ACTIVE_PROJECT[]
-    julia_project = if Base.JLOptions().project != C_NULL
-        unsafe_string(Base.JLOptions().project)
-    elseif current_project !== nothing
-        current_project
-    else
-        amdgpu_project
-    end
-    return (;amdgpu_project, current_project, julia_project)
-end
-julia_exeflags(projects=detect_projects()) =
-    String["--startup-file=no",
-           "--project=$(projects.julia_project)"]
-function julia_cmd_projects(jl_str)
-    projects = detect_projects()
-
-    cmd = Base.julia_cmd()
-    append!(cmd.exec, julia_exeflags(projects))
-
-    (;amdgpu_project, current_project, julia_project) = projects
-    if current_project !== nothing
-        jl_str = "push!(LOAD_PATH, \"$current_project\");" * jl_str
-    end
-    jl_str = "push!(LOAD_PATH, \"$amdgpu_project\");" * jl_str
-    append!(cmd.exec, ("-e", jl_str))
-    return cmd
-end
-
-function populate_globals!(config)
-    for (key,val) in config
-        @eval const $key = $val
-    end
-end
-
-const rocm_ext_libs = [
-    (:rocblas, :rocBLAS_jll),
-    (:rocsparse, :rocSPARSE_jll),
-    (:rocsolver, nothing),
-    (:rocalution, nothing),
-    (:rocrand, :rocRAND_jll),
-    (:rocfft, nothing),
-    (:MIOpen, :MIOpen_jll),
-]
-
-function bindeps_setup()
-    config = Dict{Symbol,Any}(
-        :use_artifacts => use_artifacts,
-        :configured => false,
-        :build_reason => "unknown",
-        :lld_configured => false,
-        :lld_build_reason => "unknown",
-        :lld_artifact => false,
-        :hsa_configured => false,
-        :hsa_build_reason => "unknown",
-        :hip_configured => false,
-        :hip_build_reason => "unknown",
-        :device_libs_configured => false,
-        :device_libs_build_reason => "unknown",
-    )
-    for (name, _) in rocm_ext_libs
-        lib = Symbol(:lib, string(name))
-        config[lib] = nothing
-        config[Symbol(name, :_configured)] = false
-        config[Symbol(name, :_build_reason)] = "unknown"
-    end
-
-    ## discover stuff
-
-    # check that we're running Linux
-    if !Sys.islinux()
-        @debug "Not running Linux, which is the only platform currently supported by the ROCm Runtime."
-        config[:build_reason] = "Unsupported OS: $(repr(Sys.KERNEL))"
-        @goto populate
-    end
-
-    # Skip build if KFD is not available
-    if !ispath("/dev/kfd")
-        @debug "/dev/kfd not available, cannot use ROCm Runtime."
-        @goto populate
-    end
-
-    # Find some paths for library search
-    roc_dirs = find_roc_paths()
-
-    function safe_exec(str)
-        cmd = julia_cmd_projects(str)
-        success = false
-        error_str = mktemp() do path, _
-            p = run(pipeline(cmd; stdout=path, stderr=path); wait=false)
-            wait(p)
-            success = p.exitcode == 0
-            String(read(path))
-        end
-        return success, error_str
-    end
-    function safe_import(pkg)
-        loaded, error_str = safe_exec("import $pkg")
-        if !loaded
-            return loaded, false, error_str
-        end
-        @eval import $pkg
-        available = @eval(isdefined($pkg, :is_available)) && @eval($pkg.is_available())
-        return loaded, available, error_str
-    end
-
-    # Find HSA runtime v1
-    if use_artifacts
-        loaded, available, error_str = safe_import(:hsa_rocr_jll)
-        if loaded
-            if available
-                config[:libhsaruntime_path] = hsa_rocr_jll.libhsa_runtime64
-                config[:hsa_configured] = true
-            else
-                config[:hsa_build_reason] = "hsa_rocr_jll is not available on this platform"
-            end
-        else
-            iob = IOBuffer()
-            println(iob, "`import hsa_rocr_jll` failed:")
-            print(iob, error_str)
-            config[:hsa_build_reason] = String(take!(iob))
-        end
-    else
-        libhsaruntime_path = find_rocm_library("libhsa-runtime64", roc_dirs, "so.1")
-        if !isempty(something(libhsaruntime_path, ""))
-            loaded, error_str = safe_exec("using Libdl; dlopen(\"$libhsaruntime_path\")")
-            if loaded
-                config[:libhsaruntime_path] = libhsaruntime_path
-                config[:hsa_configured] = true
-            else
-                iob = IOBuffer()
-                println(iob, "Loading `libhsa-runtime64` v1 failed:")
-                print(iob, error_str)
-                config[:hsa_build_reason] = String(take!(iob))
-            end
-        else
-            config[:hsa_build_reason] = "Could not find `libhsa-runtime64` v1 library"
-        end
-    end
-    if !config[:hsa_configured]
-        @goto populate
-    end
-
-    ### Find ld.lld
-    if use_artifacts
-        loaded, available, error_str = safe_import(:LLD_jll)
-        if loaded
-            if available || (Base.libllvm_version < v"14" && @invokelatest(LLD_jll.LLVM_jll.is_available()))
-                if isdefined(LLD_jll, :lld_path)
-                    config[:lld_path] = LLD_jll.lld_path
-                    config[:lld_artifact] = true
-                    config[:lld_configured] = true
-                else
-                    config[:lld_build_reason] = "LLD_jll does not export `lld_path`"
-                end
-            else
-                config[:lld_build_reason] = "LLD_jll is not available on this platform"
-            end
-        else
-            iob = IOBuffer()
-            println(iob, "`import LLD_jll` failed:")
-            print(iob, error_str)
-            config[:lld_build_reason] = String(take!(iob))
-        end
-    else
-        lld_path = find_ld_lld()
-        if !isempty(something(lld_path, ""))
-            # TODO: Validate ld.lld can compile programs
-            config[:lld_path] = lld_path
-            config[:lld_configured] = true
-        else
-            config[:lld_build_reason] = "Could not find `ld.lld` executable"
-        end
-    end
-
-    ### Find device libraries
-    if use_artifacts
-        loaded, available, error_str = safe_import(:ROCmDeviceLibs_jll)
-        if loaded
-            if available
-                config[:device_libs_path] = ROCmDeviceLibs_jll.bitcode_path
-                config[:device_libs_configured] = true
-            else
-                config[:device_libs_build_reason] = "ROCmDeviceLibs_jll is not available on this platform"
-            end
-        else
-            iob = IOBuffer()
-            println(iob, "`import ROCmDeviceLibs_jll` failed:")
-            print(iob, error_str)
-            config[:device_libs_build_reason] = String(take!(iob))
-        end
-    else
-        device_libs_path = find_device_libs()
-        if !isempty(something(device_libs_path, ""))
-            # TODO: Validate bitcode files
-            config[:device_libs_path] = device_libs_path
-            config[:device_libs_configured] = true
-        else
-            config[:device_libs_build_reason] = "Couldn't find bitcode files"
-        end
-    end
-
-    ### Find HIP
-    if use_artifacts
-        loaded, available, error_str = safe_import(:HIP_jll)
-        if loaded
-            if available
-                config[:libhip_path] = HIP_jll.libamdhip64
-                config[:hip_configured] = true
-            else
-                config[:hip_build_reason] = "HIP_jll is not available on this platform"
-            end
-        else
-            iob = IOBuffer()
-            println(iob, "`import HIP_jll` failed:")
-            print(iob, error_str)
-            config[:hip_build_reason] = String(take!(iob))
-        end
-    else
-        libhip_path = find_rocm_library(["libamdhip64", "libhip_hcc"], roc_dirs)
-        if !isempty(something(libhip_path, ""))
-            loaded, error_str = safe_exec("using Libdl; dlopen(\"$libhip_path\")")
-            if loaded
-                config[:libhip_path] = libhip_path
-                config[:hip_configured] = true
-            else
-                iob = IOBuffer()
-                println(iob, "Loading HIP failed:")
-                print(iob, error_str)
-                config[:hip_build_reason] = String(take!(iob))
-            end
-        else
-            config[:hip_build_reason] = "Could not find `libamdhip64` or `libhip_hcc` libraries"
-        end
-    end
-    if config[:hip_configured]
-        ### Find HIP-based libraries
-        @sync for (name, pkg) in rocm_ext_libs
-            @async begin
-            lib = Symbol(:lib, string(name))
-            if use_artifacts
-                if pkg !== nothing
-                    loaded, available, error_str = safe_import(pkg)
-                    if loaded
-                        if available
-                            config[lib] = getfield(@eval($pkg), lib)
-                            config[Symbol(name, :_configured)] = true
-                        else
-                            config[Symbol(name, :_build_reason)] = "$pkg is not available on this platform"
-                        end
-                    else
-                        iob = IOBuffer()
-                        println(iob, "`import $pkg` failed:")
-                        print(iob, error_str)
-                        config[Symbol(name, :_build_reason)] = String(take!(iob))
-                    end
-                end
-            else
-                libpath = find_rocm_library(string(lib), roc_dirs)
-                if !isempty(something(libpath, ""))
-                    loaded, error_str = safe_exec("using Libdl; dlopen(\"$libpath\")")
-                    if loaded
-                        config[lib] = libpath
-                        config[Symbol(name, :_configured)] = true
-                    else
-                        iob = IOBuffer()
-                        println(iob, "Loading `$lib` failed:")
-                        print(iob, error_str)
-                        config[Symbol(name, :_build_reason)] = String(take!(iob))
-                    end
-                else
-                    config[Symbol(name, :_build_reason)] = "Could not find `$lib` library"
-                end
-            end
-            end # @async
-        end
-    end
-
-    config[:configured] = true
-
-    @label populate
-    populate_globals!(config)
-end
-
-# Load binary dependencies
-bindeps_setup()
diff --git a/docs/make.jl b/docs/make.jl
index 302e16500..03dac8127 100644
--- a/docs/make.jl
+++ b/docs/make.jl
@@ -10,18 +10,18 @@ function main()
         pages = [
             "Home" => "index.md",
             "Quick Start" => "quickstart.md",
-            "Devices/Agents" => "devices.md",
-            "Queues and Signals" => "queues_signals.md",
-            "Kernel Dependencies" => "kernel_deps.md",
+            "Devices" => "devices.md",
+            "Streams" => "streams.md",
             "Kernel Launch" => "kernel_launch.md",
-            "Global Variables" => "globals.md",
             "Exceptions" => "exceptions.md",
-            "Printing" => "printing.md",
+            "Profiling" => "profiling.md",
             "Memory" => "memory.md",
+            "Host-Call" => "hostcall.md",
             "Intrinsics" => [
                 "Execution Control" => "execution_control.md",
                 "Wavefront Operations" => "wavefront_ops.md",
             ],
+            "Printing" => "printing.md",
             "Logging" => "logging.md",
             "API Reference" => "api.md"
         ]
diff --git a/docs/src/assets/profiling-1.png b/docs/src/assets/profiling-1.png
new file mode 100644
index 000000000..f565b6a13
Binary files /dev/null and b/docs/src/assets/profiling-1.png differ
diff --git a/docs/src/assets/profiling-2.png b/docs/src/assets/profiling-2.png
new file mode 100644
index 000000000..26e6653fa
Binary files /dev/null and b/docs/src/assets/profiling-2.png differ
diff --git a/docs/src/assets/profiling-3.png b/docs/src/assets/profiling-3.png
new file mode 100644
index 000000000..50b052b61
Binary files /dev/null and b/docs/src/assets/profiling-3.png differ
diff --git a/docs/src/assets/profiling-4.png b/docs/src/assets/profiling-4.png
new file mode 100644
index 000000000..9c2f6088b
Binary files /dev/null and b/docs/src/assets/profiling-4.png differ
diff --git a/docs/src/devices.md b/docs/src/devices.md
index 5a413f9c7..0cfbe8d2e 100644
--- a/docs/src/devices.md
+++ b/docs/src/devices.md
@@ -1,40 +1,43 @@
-# Devices/Agents
+# Devices
 
-In AMDGPU, all GPU devices (also known as "agents" in HSA parlance) are
-auto-detected by the runtime, if they're supported.
-There are three classes of devices:
-- CPU
-- GPU
-- DSP
-
-In AMDGPU, we only support compilation and execution on **GPU** devices,
-so we will henceforth limit discussion to those;
-however, you may see a `kind` `Symbol` available in the APIs of many device
-access functions, which defaults to `:gpu`, but could also be `:cpu` or `:dsp`.
+In AMDGPU, all GPU devices are auto-detected by the runtime, if they're supported.
 
 AMDGPU maintains a global default device.
-The default device is relevant for all kernel and GPUArray operations;
-if one is not specified via `@roc` or an equivalent interface,
+The default device is relevant for all kernel and GPUArray operations.
+If one is not specified via `@roc` or an equivalent interface,
 then the default device is used for those operations,
 which affects compilation and kernel launch.
 
 !!! note "Task-Local Storage"
-    Since AMDGPU.jl relies on Task-Local Storage, this means that
+    AMDGPU.jl relies on Task-Local Storage, this means that
     default devices are default only within a given task.
-    Other tasks migh have different default devices if the user switched them.
+    Other tasks migh have different default devices if user switched them.
+
+The device bound to a current Julia task is accessible via [`AMDGPU.device()`](@ref).
+The list of available devices can be queried with [`AMDGPU.devices`](@ref).
+
+If you have a [`HIPDevice`](@ref) object, you can also switch
+the device with [`AMDGPU.device!`](@ref).
+This will switch it **only within the task it is called from**.
 
-The default device is accessible via [`AMDGPU.device()`](@ref).
-This function returns a [`ROCDevice`](@ref), which is a handle that references the device.
-The list of available devices can be queried with [`AMDGPU.devices`](@ref) to get
-a list of all known and potentially usable devices.
+```julia
+xd1 = AMDGPU.ones(Float32, 16) # On `AMDGPU.device()` device.
 
-If you have a [`ROCDevice`](@ref) object, you can also switch
-the default device via [`AMDGPU.device!`](@ref).
-This will switch it only within the task it is called from.
+AMDGPU.device!(AMDGPU.devices()[2]) # Switch to second device.
+xd2 = AMDPGU.ones(Float32, 16) # On second device.
+```
 
-To select default device for newly created tasks,
+To select a default device for newly created tasks,
 use [`AMDGPU.default_device!`](@ref).
 
+```julia
+AMDGPU.default_device!(AMDGPU.devices()[3]) # New tasks will use 3rd device by default.
+Threads.@spawn begin
+    x = AMDGPU.ones(Float32, 16) # On third device.
+    return
+end
+```
+
 Additionally, devices have an associated numeric ID.
 The default device ID can be queried with [`AMDGPU.default_device_id`](@ref),
 which returns an `Int`.
diff --git a/docs/src/exceptions.md b/docs/src/exceptions.md
index 436407294..6ff55beb2 100644
--- a/docs/src/exceptions.md
+++ b/docs/src/exceptions.md
@@ -1,32 +1,56 @@
-# Kernel-thrown Exceptions
+# Kernel Exceptions
 
-Just like regular CPU-executed Julia functions, GPU kernels can throw
-exceptions! For example, the following kernel will throw a `KernelException`:
+Just like regular CPU-executed Julia functions, GPU kernels can throw exceptions!
+
+For example, the following kernel will throw an out-of-bounds exception:
 
 ```julia
-function throwkernel(A)
-    A[0] = 1
-end
-RA = ROCArray(zeros(Int,1))
-wait(@roc throwkernel(RA))
+julia> using AMDGPU
+
+julia> function kerr(x)
+           x[0] = 1
+           return
+       end
+kerr (generic function with 1 method)
+
+julia> x = ROCArray([1]);
+
+julia> @roc kerr(x);
+
+julia> AMDGPU.synchronize()
+ERROR: GPU Kernel Exception
+Stacktrace:
+ [1] error(s::String)
+   @ Base ./error.jl:35
+ [2] throw_if_exception(dev::HIPDevice)
+   @ AMDGPU ~/.julia/dev/AMDGPU/src/exception_handler.jl:115
+ [3] synchronize(stm::HIPStream)
+   @ AMDGPU ~/.julia/dev/AMDGPU/src/highlevel.jl:154
+ [4] synchronize()
+   @ AMDGPU ~/.julia/dev/AMDGPU/src/highlevel.jl:154
+ [5] top-level scope
+   @ REPL[5]:1
+
+julia> @roc kerr(x) # Next kernel launch also throws.
+ERROR: GPU Kernel Exception
+Stacktrace:
+ [1] error(s::String)
+   @ Base ./error.jl:35
+ [2] throw_if_exception(dev::HIPDevice)
+   @ AMDGPU ~/.julia/dev/AMDGPU/src/exception_handler.jl:115
+ [3] #_#30
+   @ ~/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:44 [inlined]
+ [4] (::AMDGPU.Runtime.HIPKernel{typeof(kerr), Tuple{AMDGPU.Device.ROCDeviceVector{Int64, 1}}})(args::ROCVector{Int64})
+   @ AMDGPU.Runtime ~/.julia/dev/AMDGPU/src/runtime/hip-execution.jl:41
+ [5] top-level scope
+   @ ~/.julia/dev/AMDGPU/src/highlevel.jl:228
 ```
 
-Kernels that hit an exception will write some exception information into a
-pre-allocated list for the CPU to inspect. Once complete, the wavefront
-throwing the exception will stop itself, but other wavefronts will continue
-executing (possibly throwing their own exceptions, or not).
-
-Kernel-thrown exceptions are thrown on the CPU in the call to `wait(event)`,
-where `event` is the returned value of `@roc` calls. When the kernel signals
-that it's completed, the `wait` function will check if an exception flag has
-been set, and if it has, will collect all of the relevant exception information
-that the kernels set up. Unlike CPU execution, GPU kernel exceptions aren't
-very user-customizable and pretty (for now!). They don't call `Base.show`, but
-instead pass the LLVM function name of their exception handler (details in
-`GPUCompiler`, `src/irgen.jl`). Therefore, the exact error that occured might
-be a bit hard to figure out.
-
-If exception checking turns out to be too expensive for your needs, you can
-disable those checks by passing the kwarg `check_exceptions=false` to the
-`wait` call, which will skip any error checking (although it will still wait
-for the kernel to signal completion).
+Kernel-thrown exceptions are thrown during the
+host synchronization `AMDGPU.synchronize` or on the next kernel launch.
+
+Kernels that hit an exception will write its information into a pre-allocated
+host buffer.
+Once complete, the wavefront throwing the exception will lock the buffer
+to prevent other wavefronts from overwriting the exception and stop itself,
+but other wavefronts will continue executing.
diff --git a/docs/src/globals.md b/docs/src/globals.md
deleted file mode 100644
index fcb3a3ea3..000000000
--- a/docs/src/globals.md
+++ /dev/null
@@ -1,70 +0,0 @@
-# Global Variables
-
-Most programmers are familiar with the concept of a "global variable": a
-variable which is globally accessible to any function in the user's program. In
-Julia, programmers are told to avoid using global variables (also known as
-"globals") because of their tendency to introduce type instabilities. However,
-they're often useful for sharing data between functions in distinct areas of
-the user's program.
-
-In the JuliaGPU ecosystem, globals in the Julia sense are not available unless
-their value is constant and inlinable into the function referencing them, as
-all GPU kernels must be statically compileable. However, a different sort of
-global variable is available which serves a very similar purpose. This variant
-of global variable is statically typed and sized, and is accessible from: all
-kernels with the same function signature (e.g. `mykernel(a::Int32,
-b::Float64)`), the CPU host, and other devices and kernels when accessed by
-pointer.
-
-Global variables can be created within kernels with the
-[`AMDGPU.Device.get_global_pointer`](@ref) function, which both declares the
-global variable, and returns a pointer to it (specifically a
-`Core.LLVMPtr`). Once a kernel which declares a global is
-compiled for GPU execution (either by [`@roc`](@ref) or [`rocfunction`](@ref)),
-the global is allocated memory and made available to the kernel (during the
-linking stage). Globals are unique by name, and so you shouldn't attempt to
-call `get_global_pointer` with the same name but a different type; if you do,
-undefined behavior will result. Like regular pointers in Julia, you can use
-functions like `Base.unsafe_load` and `Base.unsafe_store!` to read from and
-write to the global variable, respectively.
-
-As a concrete example of global variable usage, let's define a kernel which
-creates a global and uses its value to increment the indices of an array:
-
-```julia
-function my_kernel(A)
-    idx = AMDGPU.Device.workitemIdx().x
-    ptr = AMDGPU.Device.get_global_pointer(Val(:myglobal), Float32)
-    A[idx] += Base.unsafe_load(ptr)
-    nothing
-end
-```
-
-In order to access and modify this global before the kernel is launched, we can
-specify a hook function to `@roc` which will be passed the global pointer as an
-argument:
-
-```julia
-function myglobal_hook(gbl, mod, dev)
-    gbl_ptr = Base.unsafe_convert(Ptr{Float32}, gbl.ptr)
-    Base.unsafe_store!(gbl_ptr, 42f0)
-end
-RA = ROCArray(ones(Float32, 4))
-wait(@roc groupsize=4 global_hooks=(myglobal=myglobal_hook,) my_kernel(RA))
-```
-
-In the above function, `gbl_ptr` is a pointer (specifically a `Ptr{Float32}`)
-to the memory that represents the global variable `myglobal`. We can't
-guarantee the initial value of an uninitialized global variable, so we need
-to write a value to that global variable (in this case `42::Float32`).
-
-We can then read the values of `RA` and see that it's what we expect:
-
-```julia-repl
-julia> A = Array(RA)
-4-element ROCArray{Float32,1}:
- 43.0
- 43.0
- 43.0
- 43.0
-```
diff --git a/docs/src/hostcall.md b/docs/src/hostcall.md
new file mode 100644
index 000000000..c42609f5e
--- /dev/null
+++ b/docs/src/hostcall.md
@@ -0,0 +1,70 @@
+# Hostcall
+
+Hostcalls provide a means for GPU-CPU communications within running kernels.
+
+AMDGPU.jl provides its own implementation of hostcalls, relying on HSA signals.
+Currently, hostcalls are used for device-side allocations, printing and exception reporting.
+
+Some of the hostcalls (global hostcalls), are launched automatically, if their
+usage is detected during compilation (e.g. device-side allocations, exception reporting).
+
+Hostcalls require careful usage, since they each spawn their own Tasks.
+There should be no blocking operations during this time.
+
+For example, using non-blocking synchronization instead of blocking with
+`AMDGPU.synchronize(; blocking=false)`.
+
+Non-blocking synchronization is also responsible for stopping global hostcalls,
+otherwise the performance might degarde because of constant pooling
+of HSA signals in a loop.
+
+## Example
+
+```julia
+hc = Device.HostCallHolder(Float32, Tuple{Float32}) do x
+    return x + 42f0
+end
+
+function kernel!(y, hc)
+    y[1] = Device.hostcall!(hc, y[1])
+    return
+end
+
+y = ROCArray(Float32[0f0])
+@roc kernel!(y, hc)
+AMDGPU.synchronize(; blocking=false) # Non-blocking sync to prevent hanging.
+
+@assert Array(y)[1] ≈ 42f0
+```
+
+In this example, `HostCallHolder` is used to create and launch `HostCall`.
+`HostCallHolder` contains the `HostCall` structure itself that is passed to kernel,
+a task that is spawned on creation and some additional info for controlling
+the lifetime of the task.
+
+First argument is a function we want to execute when we call the hostcall.
+In this case we add `42f0` to input argument `x` and return the result.
+
+Second and third arguments are the return type `Float32` and the tuple of types
+of input arguments `Tuple{Float32}`.
+
+`hostcall!` is used to execute the function on the host, wait on the result,
+and obtain the return values.
+At the moment, it is performed once per workgroup.
+
+## Continuous Host-Call
+
+By default, hostcalls can be used only once.
+After executing the function on the host, the task finishes and exits.
+
+However, if you need your hostcall to live indefinitely, pass `continuous=true`
+keyword argument to `HostCallHolder(...; continuous=true)`.
+
+To then stop the hostcall, call `Device.non_continuous!(hc)`
+or `Device.finish!(hc)` on the `HostCallHolder`.
+
+The difference between them is that `non_continuous!` will allow calling
+hostcall one more time before exiting, while `finish!` will exit immediately.
+
+`finish!` can be used on any `HostCallHolder` to force-exit the running
+hostcall task.
diff --git a/docs/src/kernel_deps.md b/docs/src/kernel_deps.md
deleted file mode 100644
index e32e435e0..000000000
--- a/docs/src/kernel_deps.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# Kernel Dependencies
-
-Unlike CUDA, ROCm does not have blocking queues; instead, all kernels placed on a queue will usually be processed and scheduled immediately. There is one exception: barrier packets may be placed on the queue to block the GPU's queue packet processor from proceeding until a given set of kernels has completed. These barriers come in two flavors: `barrier_and!` and `barrier_or!`. These functions can be called on a queue with a given set of kernel signals (those returned from `@roc`) to wait for all kernels or any one kernel to complete, respectively.
-
-Generally, the `barrier_and!` call should be the most useful tool for most users, since many codes require synchronization of all "threads of execution" at the end of one step before moving onto the next step. For example, the following code may look innocuous, but in fact the kernels might "race" and return unexpected results:
-
-```julia
-function kernel(A)
-    A[1] += 1.0
-    nothing
-end
-
-RA = ROCArray(zeros(Float64, 1))
-@roc kernel(RA)
-@roc kernel(RA)
-@show Array(RA)[1] # could be 1.0 or 2.0
-```
-
-To fix this example, we use a `barrier_and!` call to ensure proper ordering of execution:
-
-```julia
-RA = ROCArray(zeros(Float64, 1))
-s1 = @roc kernel(RA)
-barrier_and!([s1])
-s2 = @roc kernel(RA)
-wait(s2)
-@show Array(RA)[1] # will always be 2.0
-```
-
-While likely less useful for most, `barrier_or!` can be useful in situations where any one of many "input" kernels can satisfy a condition necessary to allow later kernels to execute properly:
-
-```julia
-function kernel1(A, i)
-    A[1] = i
-    nothing
-end
-function kernel2(A, i)
-    A[2] = i/A[1]
-end
-
-RA = ROCArray(zeros(Float64, 2))
-s1 = @roc kernel1(RA, 1.0)
-s2 = @roc kernel1(RA, 2.0)
-barrier_or!([s1,s2])
-s3 = @roc kernel2(RA, 3.0)
-wait(s3)
-@show Array(RA)[1] # will either be 3.0 or 1.5, but will never throw due to divide-by-zero
-```
-
-!!! warning
-    Because of how barrier OR packets work, you can't use queue hardware to do a wait-any on more than 5 signals at a time. If more than 5 signals are specified, then the signals are split into sets of 5, and the total barrier won't be fulfilled until, for each set, one of the signals is satisfied.
-
-    Contributions are welcome to workaround this issue, which will probably need to implemented in software either on the CPU or GPU side.
diff --git a/docs/src/kernel_launch.md b/docs/src/kernel_launch.md
index e1ec772cf..fba4d0bc3 100644
--- a/docs/src/kernel_launch.md
+++ b/docs/src/kernel_launch.md
@@ -4,11 +4,15 @@
 
 While an almost arbitrarily large number of workitems can be executed per
 kernel launch, the hardware can only support executing a limited number of
-wavefronts at one time. To alleviate this, the compiler calculates the
+wavefronts at one time.
+
+To alleviate this, the compiler calculates the
 "occupancy" of each compiled kernel (which is the number of wavefronts that can
 be simultaneously executing on the GPU), and passes this information to the
 hardware; the hardware then launches a limited number of wavefronts at once,
-based on the kernel's "occupancy" values. The rest of the wavefronts are not
+based on the kernel's "occupancy" values.
+
+The rest of the wavefronts are not
 launched until hardware resources become available, which means that a kernel
 with better occupancy will see more of its wavefronts executing simultaneously
 (which often leads to better performance). Suffice to say, it's important to
@@ -20,31 +24,15 @@ Like CUDA.jl, AMDGPU.jl has the ability to calculate kernel occupancy, with the
 ```julia
 kernel = @roc launch=false mykernel(args...)
 occupancy = AMDGPU.launch_configuration(kernel)
+@show occupancy.gridsize
 @show occupancy.groupsize
 ```
 
 Specifically, `launch_configuration` calculates the occupancy of
 `mykernel(args...)`, and then calculates an optimal groupsize based on the
-occupancy. This value can then be used to select the groupsize for the kernel:
-
-```julia
-wait(@roc groupsize=occupancy.groupsize mykernel(args...)
-```
-
-While it works, it's also pretty verbose. Conveniently, there's also a
-mechanism to do all of the above automatically within `@roc`:
+occupancy.
+This value can then be used to select the groupsize for the kernel:
 
 ```julia
-wait(@roc groupsize=:auto mykernel(args...))
-```
-
-The above is safe to do in a hot path, as the occupancy is cached on a
-per-kernel basis.
-
-There are also various other details available from the occupancy calculation,
-such as SGPR, VGPR, and LDS usage, wavefront size, etc.:
-
-```julia
-kernel = @roc launch=false mykernel(args...)
-@show AMDGPU.Compiler.calculate_occupancy(kernel.fun, AMDGPU.default_device())
+@roc groupsize=occupancy.groupsize mykernel(args...)
 ```
diff --git a/docs/src/memory.md b/docs/src/memory.md
index b345aab9c..72b6729fd 100644
--- a/docs/src/memory.md
+++ b/docs/src/memory.md
@@ -4,83 +4,88 @@
 
 GPUs contain various kinds of memory, just like CPUs:
 
-- Global: Globally accessible by all CUs on a GPU, and possibly accessible from outside of the GPU (by the CPU host, by other GPUs, by PCIe devices, etc.). Slowest form of memory.
-- Constant: Same as global memory, but signals to the hardware that it can use special instructions to access and cache this memory. Can be changed between kernel invocations.
-- Region: Also known as Global Data Store (GDS), all wavefronts on a CU can access the same memory region from the same address. Faster than Global/Constant. Automatically allocated by the compiler/runtime, not user accessible.
-- Local: Also known as Local Data Store (LDS), all wavefronts in the same workgroup can access the same memory region from the same address. Faster than GDS.
-- Private: Uses the hardware scratch space, and is private to each SIMD lane in a wavefront. Fastest form of traditional memory.
+- Global:
+    Globally accessible by all CUs on a GPU, and possibly accessible
+    from outside of the GPU (by the CPU host, by other GPUs, by PCIe devices,
+    etc.). Slowest form of memory.
+- Constant:
+    Same as global memory, but signals to the hardware that it can use
+    special instructions to access and cache this memory.
+    Can be changed between kernel invocations.
+- Region:
+    Also known as Global Data Store (GDS), all wavefronts on a CU
+    can access the same memory region from the same address.
+    Faster than Global/Constant.
+    Automatically allocated by the compiler/runtime, not user accessible.
+- Local:
+    Also known as Local Data Store (LDS), all wavefronts in the same workgroup
+    can access the same memory region from the same address.
+    Faster than GDS.
+- Private:
+    Uses the hardware scratch space, and is private to each SIMD lane
+    in a wavefront.
+    Fastest form of traditional memory.
+
+## Local Memory
+
+Local memory may be allocated within a kernel by calling either:
+
+- `@ROCStaticLocalArray(T, dims)` - if `dims` is passed as a constant value,
+    known at compile-time.
+    E.g. `@ROCStaticLocalArray(Float32, 8)`.
+
+- `@ROCDynamicLocalArray(T, dims)` - otherwise.
+    E.g. `@ROCStaticLocalArray(Float32, length(X))`.
+
+Local memory is zero-initialized by default.
+If this is unnecessary and undesired for performance reasons,
+disable this, passing `false` as a last argument:
+`@ROCStaticLocalArray(Float32, 8, false)` or
+`@ROCStaticLocalArray(Float32, length(X), false)`
 
-## Memory Queries
-
-Most useable memory can be queried via AMDGPU's APIs. There are two sets of
-APIs for querying memory: the older "Regions" API, and the newer "Memory Pools"
-API. The Regions API is able to query all kinds of memory segments for each
-device:
+Local memory does not need to be freed, as it is automatically freed by the
+hardware.
 
-```julia
-for device in AMDGPU.devices()
-    foreach(println, AMDGPU.Runtime.regions(device)::Vector{AMDGPU.Runtime.ROCMemoryRegion})
-end
-```
+If `@ROCDynamicLocalArray` is used, then local memory is dynamically allocated
+at kernel execution time.
+The `shmem` option to `@roc` must be set appropriately to ensure that
+enough local memory is allocated by the hardware.
 
-The Memory Pools API is currently only able to query Global and Group memory
-segments, but is more reliable (due to getting more development attention from
-AMD):
+It is allocated in addition to the local memory that is statically allocated by
+the kernel.
 
 ```julia
-for device in AMDGPU.devices()
-    foreach(println, AMDGPU.Runtime.memory_pools(device)::Vector{AMDGPU.Runtime.ROCMemoryPool})
-end
-```
-
-Most of the details of each memory segment are available by printing the region
-or memory pool in question; they may also be accessed programmatically with the
-`AMDGPU.Runtime.region_*` and `AMDGPU.Runtime.pool_*` APIs.
-
-These details are generally not important to the average user, and are handled
-automatically by AMDGPU when memory is allocated and freed.
+function kernel(C, A)
+    # Allocate local memory dynamically
+    Ctmp = @ROCDynamicLocalArray(Float64, length(C))
+    # Or, allocate local memory statically if the size is known ahead-of-time
+    Ctmp = @ROCStaticLocalArray(Float64, 8) # if we want 8 elements
 
-## Memory Allocation/Deallocation
+    idx = AMDGPU.workitemIdx().x
+    Ctmp[idx] = A[idx] + C[1]
+    AMDGPU.Device.sync_workgroup()
 
-Currently, we can explicitly allocate Global and Local memory from within
-kernels, and Global from outside of kernels. Global memory allocations are done
-with `AMDGPU.Mem.alloc`, like so:
+    C[idx] = Ctmp[idx]
+    return
+end
 
-```julia
-buf = AMDGPU.Mem.alloc(device, bytes)
-# Or with the extended API if a region or memory pool is already selected:
-buf = AMDGPU.Mem.alloc(device, pool, bytes)
-buf = AMDGPU.Mem.alloc(device, region, bytes)
+...
+# Note: The `shmem` option isn't necessary if `@ROCStaticLocalArray` is used
+shmem = sizeof(Float64) * length(RC)
+@roc groupsize=8 shmem=shmem kernel(RC, RA)
 ```
 
-`buf` in this example is a `Mem.Buffer`, which contains a pointer
-to the allocated memory. The buffer can be converted to a pointer by doing
-`Base.unsafe_convert(Ptr{Nothing}, buf)`, and may then be converted to the
-appropriate pointer type, and loaded from/stored to. By default, memory is
-allocated specifically on and for `device`, and is only accessible to that
-device unless transferred using the various functions in the `Mem` module. If
-memory should be globally accessible by the CPU and by all GPUs, the kwarg
-`coherent=true` may be passed, which utilizes "finegrained" memory instead.
-Memory should be freed once it's no longer in use with `Mem.free(buf)`.
-
-If allocations are done as
-`Mem.alloc(device, bytes; coherent=false, slow_fallback=true)`, and the
-allocation is larger than supported for that memory region, then coherent
-memory will be automatically used (if possible) to service the allocation. This
-can be disabled with `AMDGPU.Mem.enable_slow_allocation_fallback(false)` and
-restarting Julia.
-
 ## Device-Side Allocations
 
-Global memory allocated by a kernel is automatically freed when the kernel
-completes, which is done in the `wait` call on the host. This behavior can be
-disabled by passing `cleanup=false` to `wait`.
+Global memory may be allocated/freed dynamically from kernels by calling
+`AMDGPU.Device.malloc(::Csize_t)::Ptr{Cvoid}`
+and `AMDGPU.Device.free(::Ptr{Cvoid})`.
+
+This memory allocation/deallocation uses hostcalls to operate,
+and so is relatively slow, but is also very useful.
+See [Hostcall](@ref) section for more info about them.
 
-Global memory may also be allocated and freed dynamically from kernels by
-calling `AMDGPU.Device.malloc(::Csize_t)::Ptr{Cvoid}` and
-`AMDGPU.Device.free(::Ptr{Cvoid})`.  This memory allocation/deallocation uses
-hostcalls to operate, and so is relatively slow, but is also very useful.
-Currently, memory allocated with `AMDGPU.Device.malloc` is coherent by default.
+Memory allocated with `AMDGPU.Device.malloc` is a host-pinned memory.
 Calls to `malloc` and `free` are performed once per workgroup, so ensure that
 enough memory has been allocated to feed the lanes that will be accessing it.
 
@@ -88,68 +93,33 @@ As an example, here's how an array could be allocated on-device to store
 temporary results:
 
 ```julia
-function kernel(C, A, B)
-    # Allocate memory dynamically and get a pointer to it
-    Ctmp_ptr = AMDGPU.Device.malloc(Csize_t(sizeof(Float64)*length(C)))
-    # Turn it (a pointer to Float64 elements in Global memory) into a device-side array
+function kernel(C, A)
+    # Allocate memory dynamically and get a pointer to it.
+    Ctmp_ptr = AMDGPU.Device.malloc(Csize_t(sizeof(Float64) * length(C)))
+    # Turn a pointer into a device-side array.
     Ctmp = ROCDeviceArray(length(C), reinterpret(Core.LLVMPtr{Float64,1}, Ctmp_ptr))
+
     # Use it
     idx = AMDGPU.workitemIdx().x
-    Ctmp[idx] = A[idx] + B[idx] + C[1]
+    Ctmp[idx] = A[idx] + C[1]
     AMDGPU.Device.sync_workgroup()
+
     C[idx] = Ctmp[idx]
-    # Make sure to free it
+    # Make sure to free it.
     AMDGPU.Device.free(Ctmp_ptr)
-    nothing
+    return
 end
+
 RA = AMDGPU.rand(4)
-RB = AMDGPU.rand(4)
 RC = AMDGPU.rand(4)
 RC_elem = Array(RC)[1]
-wait(@roc groupsize=4 kernel(RC, RA, RB))
-@assert Array(RC) ≈ Array(RA) .+ Array(RB) .+ RC_elem
+@roc groupsize=4 kernel(RC, RA)
+@assert Array(RC) ≈ Array(RA) .+ RC_elem
 ```
 
-Local memory may be allocated within a kernel by calling either
-`@ROCStaticLocalArray(T, dims)` or `@ROCDynamicLocalArray(T, dims)` - use the
-former if `dims` is passed as a constant value, and otherwise use the latter.
-Local memory does not need to be freed, as it is automatically freed by the
-hardware.
-
-If `@ROCDynamicLocalArray` is used, then local memory is dynamically allocated
-at kernel execution time; the `localmem` option to `@roc` must be set
-appropriately to ensure that enough local memory is allocated by the hardware.
-It is allocated in addition to the local memory that is statically allocated by
-the kernel.
-
-```julia
-function kernel(C, A, B)
-    # Allocate local memory dynamically
-    Ctmp = @ROCDynamicLocalArray(Float64, length(C))
-    # Or, allocate local memory statically if the size is known ahead-of-time
-    # Ctmp = @ROCStaticLocalArray(Float64, 8) # if we want 8 elements
-
-    # Use it
-    idx = AMDGPU.workitemIdx().x
-    Ctmp[idx] = A[idx] + B[idx] + C[1]
-    AMDGPU.Device.sync_workgroup()
-    C[idx] = Ctmp[idx]
-    nothing
-end
-# ...
-# Note: The `localmem` option isn't necessary if `@ROCStaticLocalArray` is used
-wait(@roc groupsize=4 localmem=sizeof(Float64)*length(RC) kernel(RC, RA, RB))
-```
-
-Note that like CUDA's shared memory, AMDGPU's local memory is zero-initialized
-automatically. If this behavior is unnecessary (and undesired for performance
-reasons), zero-initialization can be disabled with `@ROCDynamicLocalArray(T,
-dims, false)` or `@ROCStaticLocalArray(T, dims, false)` (the last argument
-is `zeroinit`).
-
 ## Memory Modification Intrinsics
 
 Like C, AMDGPU.jl provides the `memset!` and `memcpy!` intrinsics, which are
 useful for setting a memory region to a value, or copying one region to
-another, respectively. Check `test/device/memory.jl` for examples of their
-usage.
+another, respectively.
+Check `test/device/memory.jl` for examples of their usage.
diff --git a/docs/src/profiling.md b/docs/src/profiling.md
new file mode 100644
index 000000000..9ea7aaf32
--- /dev/null
+++ b/docs/src/profiling.md
@@ -0,0 +1,67 @@
+## rocprof
+
+rocprof allows profiling both HSA & HIP API calls.
+
+Let's profile simple copying kernel saved in `profile.jl` file:
+```julia
+using AMDGPU
+
+function mycopy!(dst, src)
+    i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
+    if i ≤ length(dst)
+        @inbounds dst[i] = src[i]
+    end
+    return
+end
+
+function main(N)
+    src = ROCArray{Float64}(undef, N)
+    dst = ROCArray{Float64}(undef, N)
+    nthreads = 256
+    nblocks = cld(N, nthreads)
+
+    for i in 1:10
+        @roc groupsize=nthreads gridsize=nblocks mycopy!(dst, src)
+        AMDGPU.synchronize()
+    end
+
+    AMDGPU.unsafe_free!(dst)
+    AMDGPU.unsafe_free!(src)
+    AMDGPU.synchronize()
+    return
+end
+main(2^24)
+```
+
+```bash
+ENABLE_JITPROFILING=1 rocprof --hip-trace --hsa-trace julia ./profile.jl
+```
+
+This will produce `results.json` (among other files) which can be visualized
+using [Perfetto UI](https://ui.perfetto.dev/).
+
+|Zoomed out|Zoomed in|
+|:---:|:---:|
+|![image](./assets/profiling-1.png)|![image](./assets/profiling-4.png)|
+
+Here we can clearly see that host synchronization after each kernel dispatch
+causes poor device occupancy (empty spaces between kernel dispatches).
+
+We can fix this by moving synchronization outside the loop so that it happens only once.
+
+```julia
+    ...
+    for i in 1:10
+        @roc groupsize=nthreads gridsize=nblocks mycopy!(dst, src)
+    end
+    AMDGPU.synchronize()
+    ...
+```
+
+Running profiling again and visualizing results we now see that
+kernel launches are adjacent to each other and that the average
+wall duaration is lower.
+
+|Zoomed out|Zoomed in|
+|:---:|:---:|
+|![image](./assets/profiling-2.png)|![image](./assets/profiling-3.png)|
diff --git a/docs/src/queues_signals.md b/docs/src/queues_signals.md
deleted file mode 100644
index 7e277f209..000000000
--- a/docs/src/queues_signals.md
+++ /dev/null
@@ -1,98 +0,0 @@
-# Queues
-
-Similar to CUDA streams, ROCm has the concept of queues, which are
-buffers used to instruct the GPU hardware which kernels to launch.
-ROCm queues are synchronous, like CUDA streams.
-
-Each device has a default queue associated,
-which is accessible with [`AMDGPU.queue`](@ref).
-
-To specify which queue to launch a kernel on:
-
-- Using [`AMDGPU.queue!`](@ref), which will execute given function and reset
-    to the original queue after completion:
-
-```julia
-q = AMDGPU.ROCQueue()
-x = AMDGPU.queue!(() -> AMDGPU.ones(Float32, 16), q)
-```
-
-- Using `queue` argument to [`@roc`](@ref) macro:
-
-```julia
-q = AMDGPU.ROCQueue()
-@roc queue=q kernel(...)
-```
-
-Queues also have an inherent priority, which allows control of kernel
-submission latency and on-device scheduling preference with respect to kernels
-submitted on other queues.
-There are three priorities: normal (the default), low, and high priority.
-
-Priority of the default `queue` can be set with [`AMDGPU.priority!`](@ref).
-Alternatively, it can be set at queue creation time:
-
-```julia
-low_prio_queue = ROCQueue(; priority=:low)
-high_prio_queue = ROCQueue(; priority=:high)
-normal_prio_queue = ROCQueue(; priority=:normal) # or just omit "priority"
-```
-
-To get kernels which are currently executing on a given queue,
-use [`AMDGPU.active_kernels`](@ref).
-It will return a `Vector{ROCKernelSignal}`, which can be inspected to
-determine how many (and which) kernels are executing.
-
-If a kernel gets "stuck" and locks up the GPU (noticeable with 100% GPU usage in `rocm-smi`)
-you can kill it and all other kernels associated with the queue it is running on
-with [`AMDGPU.Runtime.kill_queue!(queue)`](@ref).
-This can be "safely" done to the default queue (obtained via [`AMDGPU.queue`](@ref)),
-since default queues are recreated as-needed.
-
-```@docs
-AMDGPU.queue
-AMDGPU.queue!
-AMDGPU.priority!
-AMDGPU.active_kernels
-AMDGPU.ROCQueue
-AMDGPU.Runtime.set_queue_pool_size!
-AMDGPU.Runtime.kill_queue!
-```
-
-# Signals
-
-Unlike CUDA, ROCm kernels are tracked by an associated signal, which is
-created and returned by `@roc`, and is `wait`ed on to track kernel completion.
-Signals may also be used for manual synchronization (since they work for CPUs
-and GPUs equally well). CPU usage is done with the `HSA.signal_*` functions,
-and GPU usage is done with the `device_signal_*` and `hostcall_device_signal_*`
-functions. For most signalling needs, consider using a hostcall instead.
-
-If custom signal handling is desired, signals can be manually constructed and
-passed to `@roc`:
-
-```julia
-# A kernel which waits on all signals in `sigs`
-function multi_wait(sigs)
-    for i in 1:length(sigs)
-        AMDGPU.Device.hostcall_device_signal_wait(sigs[i], 0)
-    end
-    nothing
-end
-
-# Create a set of signals
-sigs = [ROCSignal() for i in 1:10]
-# Get the device-safe signal handles
-_sigs = ROCArray(map(sig->sig.signal, sigs))
-
-# Launch multi-waiter ahead of time; this will block on the device
-final_sig = @roc multi_wait(_sigs)
-
-# Associate kernels with signals
-for sig in sigs
-    @roc signal=sig identity(nothing)
-end
-
-# Wait on the multi-waiter
-wait(final_sig)
-```
diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md
index 34fce1717..b6da54359 100644
--- a/docs/src/quickstart.md
+++ b/docs/src/quickstart.md
@@ -43,20 +43,23 @@ Pkg.test("AMDGPU")
 
 ## Running a simple kernel
 
-As a simple test, we will try to add two random vectors and make sure that the results from the CPU and the GPU are indeed the same.
+As a simple test, we will try to add two random vectors
+and make sure that the results from the CPU and the GPU are indeed the same.
 
 We can start by first performing this simple calculation on the CPU:
 
 ```julia
-N = 32
+N = 1024
 a = rand(Float64, N)
 b = rand(Float64, N)
 c_cpu = a + b
 ```
 
-To do the same computation on the GPU, we first need to copy the two input arrays `a` and `b` to the device.
+To do the same computation on the GPU, we first need to copy
+the two input arrays `a` and `b` to the device.
 Toward that end, we will use the `ROCArray` type to represent our GPU arrays.
-We can create the two arrays by passing the host data to the constructor as follows:
+We can create the two arrays by passing the host data
+to the constructor as follows:
 
 ```julia
 using AMDGPU
@@ -70,67 +73,98 @@ We need to create one additional array `c_d` to store the results:
 c_d = similar(a_d)
 ```
 
-In this example, the postfix `_d` distinguishes a device memory object from its host memory counterpart.
-This convention is completely arbitrary and you may name your device-side variables whatever you like; they are regular Julia variables.
+In this example, the postfix `_d` distinguishes a device memory object
+from its host memory counterpart.
+This convention is completely arbitrary and you may name your
+device-side variables whatever you like; they are regular Julia variables.
 
 Next, we will define the GPU kernel that does the actual computation:
 
 ```julia
 function vadd!(c, a, b)
-    i = workitemIdx().x
+    i = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
     c[i] = a[i] + b[i]
     return
 end
 ```
 
-This simple kernel starts by getting the current thread ID using [`workitemIdx`](@ref) and then performs the addition of the elements from `a` and `b`, storing the result in `c`.
+The index of a single workitem can be uniquely identified by its grid index
+(computed linearly as `(workgroupDim().x * (workgroupIdx().x - 1)) + workitemIdx().x`
+when only a single dimension is used).
+
+The grid is the domain over which the *entire* kernel executes over.
+The grid will be split into multiple workgroups by hardware automatically,
+and the kernel does not complete until all workgroups complete.
+
+Like OpenCL, AMDGPU has the concept of "workitems", "workgroups", and the "grid".
+A workitem is a single thread of execution, capable of performing arithmentic
+operations.
+Workitems are grouped into "wavefronts" ("warps" in CUDA) which
+share the same compute unit, and execute the same instructions simulatenously.
+The workgroup is a logical unit of compute supported by hardware
+which comprises multiple wavefronts, which shares resources
+(specifically local memory) and can be efficiently synchronized.
+A workgroup may be executed by one or multiple hardware compute units,
+making it often the only dimension of importance for smaller kernel launches.
+
+Notice how we explicitly specify that this function does not return a value
+by adding the `return` statement.
+This is necessary for all GPU kernels and we can enforce it by adding a `return`,
+`return nothing`, or even `nothing` at the end of the kernel.
+If this statement is omitted, Julia will attempt to return the value
+of the last evaluated expression, in this case a `Float64`,
+which will cause a compilation failure as kernels cannot return values.
+
+The easiest way to launch a GPU kernel is with the [`@roc`](@ref) macro,
+specifying `groupsize` and `gridsize` to cover full array,
+and calling it like a regular function:
 
-Like OpenCL, AMDGPU has the concept of "workitems", "workgroups", and the "grid". A workitem is a single thread of execution, capable of performing arithmentic operations. Workitems are grouped into "wavefronts" ("warps" in CUDA) which share the same compute unit, and execute the same instructions simulatenously. The workgroup is a logical unit of compute supported by hardware which comprises multiple wavefronts, which shares resources (specifically local memory) and can be efficiently synchronized. A workgroup may be executed by one or multiple hardware compute units, making it often the only dimension of importance for smaller kernel launches.
-
-The grid is the domain over which the *entire* kernel executes over. The index of a single workitem can be uniquely identified by its grid index (computed linearly as `(workgroupDim().x * (workgroupIdx().x - 1)) + workitemIdx().x` when only a single dimension is used). The grid will be split into multiple workgroups by hardware automatically, and the kernel does not complete until all workgroups complete.
-
-Notice how we explicitly specify that this function does not return a value by adding the `return` statement.
-This is necessary for all GPU kernels and we can enforce it by adding a `return`, `return nothing`, or even `nothing` at the end of the kernel.
-If this statement is omitted, Julia will attempt to return the value of the last evaluated expression, in this case a `Float64`, which will cause a compilation failure as kernels cannot return values.
+```julia
+groupsize = 128
+gridsize = cld(length(c_d), groupsize)
+@roc gridsize=gridsize groupsize=groupsize vadd!(c_d, a_d, b_d)
+```
 
-The easiest way to launch a GPU kernel is with the [`@roc`](@ref) macro, specifying that we want a single work group with `N` work items and calling it like an ordinary function:
+Keep in mind that kernel launches are asynchronous,
+meaning that you need to do some kind of synchronization before you use the result.
+For instance, you can call `AMDGPU.synchronize()`:
 
 ```julia
 @roc groupsize=N vadd!(c_d, a_d, b_d)
+AMDGPU.synchronize()
 ```
 
-Keep in mind that kernel launches are asynchronous, meaning that you need to do some kind of synchronization before you use the result.
-For instance, you can call `wait()` on the returned HSA signal value:
+Finally, we can make sure that the results match,
+by first copying the data to the host and then comparing it with the CPU results:
 
 ```julia
-wait(@roc groupsize=N vadd!(c_d, a_d, b_d))
-```
+c = Array(c_d)
 
-!!! warning "Naming conventions"
-    Throughout this example we use terms like "work group" and "work item".
-    These terms are used by the Khronos consortium and their APIs including OpenCL and Vulkan, as well as the HSA foundation.
+using Test
+@test isapprox(c, c_cpu)
+```
 
-    NVIDIA, on the other hand, uses some different terms in their CUDA API, which might be confusing to some users porting their kernels from CUDA to AMDGPU.
-    As a quick summary, here is a mapping of the most common terms:
+## Naming conventions
 
-    | AMDGPU | CUDA |
-    |:---:|:---:|
-    | [`workitemIdx`](@ref) | [`threadIdx`](@ref) |
-    | [`workgroupIdx`](@ref) | [`blockIdx`](@ref) |
-    | [`workgroupDim`](@ref) | [`blockDim`](@ref) |
-    | [`gridItemDim`](@ref) | No equivalent |
-    | [`gridGroupDim`](@ref) | `gridDim` |
-    | `groupsize` | `threads` |
-    | `gridsize` | `blocks * threads` |
-    | `queue` | `stream` |
+Throughout this example we use terms like "work group" and "work item".
+These terms are used by the Khronos consortium and their APIs
+including OpenCL and Vulkan, as well as the HSA foundation.
 
-    For compatibilty reasons, the symbols in the CUDA column (except for `gridItemDim`) are also supported by AMDGPU.
+NVIDIA, on the other hand, uses some different terms in their CUDA API,
+which might be confusing to some users porting their kernels from CUDA to AMDGPU.
 
-Finally, we can make sure that the results match, by first copying the data to the host and then comparing it with the CPU results:
+As a quick summary, here is a mapping of the most common terms:
 
-```julia
-c = Array(c_d)
+| AMDGPU | CUDA |
+|:---:|:---:|
+| [`workitemIdx`](@ref) | [`threadIdx`](@ref) |
+| [`workgroupIdx`](@ref) | [`blockIdx`](@ref) |
+| [`workgroupDim`](@ref) | [`blockDim`](@ref) |
+| [`gridItemDim`](@ref) | No equivalent |
+| [`gridGroupDim`](@ref) | `gridDim` |
+| `groupsize` | `threads` |
+| `gridsize` | `blocks * threads` |
+| `queue` | `stream` |
 
-using Test
-@test isapprox(c, c_cpu)
-```
+For compatibilty reasons, the symbols in the CUDA column
+(except for `gridItemDim`) are also supported by AMDGPU.
diff --git a/docs/src/streams.md b/docs/src/streams.md
new file mode 100644
index 000000000..88fdbaf09
--- /dev/null
+++ b/docs/src/streams.md
@@ -0,0 +1,55 @@
+# Streams
+
+Similar to CUDA streams, ROCm has HIP streams,
+which are buffers used to instruct the GPU hardware which kernels to launch.
+HIP streams are synchronous, like CUDA streams.
+
+Each device has a default stream associated,
+which is accessible with `AMDGPU.stream()`.
+
+There are several ways to specify which stream to launch a kernel on:
+
+- Using [`AMDGPU.stream!`](@ref) to change default stream to be used
+    **within the same Julia task**.
+
+```julia
+stream = AMDGPU.HIPStream()
+AMDGPU.stream!(stream) # Change default stream to be used for subsequent operations.
+AMDGPU.ones(Float32, 16) # Will be executed on `stream`.
+```
+
+- Using [`AMDGPU.stream!`](@ref) to execute given function and reset
+    to the original stream after completion:
+
+```julia
+stream = AMDGPU.HIPStream()
+x = AMDGPU.stream!(() -> AMDGPU.ones(Float32, 16), stream)
+```
+
+- Using `stream` argument to [`@roc`](@ref) macro:
+
+```julia
+stream = AMDGPU.HIPStream()
+@roc stream=stream kernel(...)
+```
+
+Streams also have an inherent priority, which allows control of kernel
+submission latency and on-device scheduling preference with respect to kernels
+submitted on other streams.
+There are three priorities: normal (the default), low, and high priority.
+
+Priority of the default `stream` can be set with [`AMDGPU.priority!`](@ref).
+Alternatively, it can be set at stream creation time:
+
+```julia
+low_prio = HIPStream(:low)
+high_prio = HIPStream(:high)
+normal_prio = HIPStream(:normal) # or just omit "priority"
+```
+
+```@docs
+AMDGPU.stream
+AMDGPU.stream!
+AMDGPU.priority!
+AMDGPU.HIPStream
+```
diff --git a/src/AMDGPU.jl b/src/AMDGPU.jl
index 03f12f217..e15c3ca6d 100644
--- a/src/AMDGPU.jl
+++ b/src/AMDGPU.jl
@@ -1,21 +1,19 @@
 module AMDGPU
 
-### Imports ###
-
+using Adapt
 using CEnum
-using Libdl
-using LLVM, LLVM.Interop
 using GPUCompiler
 using GPUArrays
-using Adapt
+using Libdl
+using LLVM, LLVM.Interop
+using Preferences
+using Printf
+
 import LinearAlgebra
 import Core: LLVMPtr
 
-### Exports ###
-
-export ROCDevice, ROCQueue, ROCExecutable, ROCKernel, ROCSignal
+export HIPDevice
 export has_rocm_gpu
-
 export ROCArray, ROCVector, ROCMatrix, ROCVecOrMat
 export roc
 
@@ -32,19 +30,39 @@ function Base.lock(f, x::LockedObject)
     end
 end
 
+struct KernelState
+    # Exception reporting buffers.
+    exception_flag::Ptr{Int32}
+    gate::Ptr{UInt64}
+    buffers_counter::Ptr{Int32}
+    str_buffers_counter::Ptr{Int32}
+    buffers::Ptr{Ptr{Cvoid}}
+    string_buffers::Ptr{Ptr{Cvoid}}
+    n_buffers::Int32
+    n_str_buffers::Int32
+
+    # Malloc/free hostcalls.
+    malloc_hc::Ptr{Cvoid}
+    free_hc::Ptr{Cvoid}
+
+    # Print hostcalls.
+    output_context::Ptr{Cvoid}
+    printf_output_context::Ptr{Cvoid}
+end
+
 # Load HSA Runtime.
 const libhsaruntime = "libhsa-runtime64.so.1"
 include(joinpath("hsa", "HSA.jl"))
-import .HSA: Agent, Queue, Executable, Status, Signal
 
-# Load binary dependencies
-include(joinpath(dirname(@__DIR__), "deps", "bindeps.jl"))
+# Load binary dependencies.
+include("discovery_utils.jl")
+include("rocm_discovery.jl")
+populate_globals!(bindeps_setup())
 
 # Utilities
 include("utils.jl")
 
 # Load HIP
-const libhip = "libamdhip64.so"
 include(joinpath("hip", "HIP.jl"))
 import .HIP: HIPContext, HIPDevice, HIPStream
 export HIPContext, HIPDevice, HIPStream
@@ -53,16 +71,18 @@ include("cache.jl")
 
 module Runtime
     using ..CEnum
-    using Setfield
-    import ..HSA
-    import ..Adapt
     using ..GPUCompiler
+
+    import ..Adapt
     import Preferences: @load_preference, @set_preferences!
     import TimespanLogging
     import TimespanLogging: timespan_start, timespan_finish
 
+    import ..HSA
+    import ..HIP
     import ..AMDGPU
-    import ..AMDGPU: getinfo, LockedObject, HIP
+    import ..AMDGPU: getinfo, LockedObject
+    import .HIP: HIPDevice
 
     struct Adaptor end
 
@@ -71,39 +91,51 @@ module Runtime
 
     include(joinpath("runtime", "logging.jl"))
     include(joinpath("runtime", "error.jl"))
-    include(joinpath("runtime", "thread-utils.jl"))
+    include(joinpath("runtime", "hsa_device.jl"))
     include(joinpath("runtime", "device.jl"))
-    include(joinpath("runtime", "linked-list.jl"))
-    include(joinpath("runtime", "queue.jl"))
-    include(joinpath("runtime", "signal.jl"))
     include(joinpath("runtime", "dims.jl"))
+
     module Mem
-        include(joinpath("runtime", "memory.jl"))
+        using Preferences
+
+        import AMDGPU
+        import AMDGPU: HIP, HSA
+        import AMDGPU: Runtime
+        import .HIP: HIPDevice
+        import .Runtime: ROCDim, ROCDim3, check
+
+        const refcounts_lock = Threads.ReentrantLock()
+
+        abstract type AbstractAMDBuffer end
+
+        include(joinpath("runtime", "memory", "utils.jl"))
+        include(joinpath("runtime", "memory", "hip.jl"))
+        include(joinpath("runtime", "memory", "refcount.jl"))
     end
-    include(joinpath("runtime", "executable.jl"))
-    include(joinpath("runtime", "hashing.jl"))
-    include(joinpath("runtime", "kernel.jl"))
-    include(joinpath("runtime", "kernel-signal.jl"))
-    include(joinpath("runtime", "launch.jl"))
+
     include(joinpath("runtime", "execution.jl"))
-    include(joinpath("runtime", "sync.jl"))
+    include(joinpath("runtime", "hip-execution.jl"))
     include(joinpath("runtime", "fault.jl"))
-end # module Runtime
+end
+
 import .Runtime: Mem
-import .Runtime: ROCDevice, ROCQueue
 
 const ci_cache = GPUCompiler.CodeCache()
 Base.Experimental.@MethodTable(method_table)
 
 module Device
-    import ..HSA
-    import ..Runtime
-    import ..Mem
-    import Core: LLVMPtr
     using ..GPUCompiler
     using ..LLVM
     using ..LLVM.Interop
+
+    import ..Adapt
+    import Core: LLVMPtr
     import ..LinearAlgebra
+
+    import ..HSA
+    import ..HIP
+    import ..Runtime
+    import ..Mem
     import ..AMDGPU
     import .AMDGPU: method_table
 
@@ -117,12 +149,11 @@ module Device
     include(joinpath("device", "runtime.jl"))
     include(joinpath("device", "quirks.jl"))
 end
-import .Device: malloc, signal_exception, report_exception, report_oom, report_exception_frame
-import .Device: ROCDeviceArray, AS, HostCall, hostcall!
+import .Device: malloc, signal_exception, report_exception, report_oom, report_exception_frame, report_exception_name
+import .Device: ROCDeviceArray, AS, HostCall, HostCallHolder, hostcall!
 import .Device: @ROCDynamicLocalArray, @ROCStaticLocalArray
 import .Device: workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim
-import .Device: threadIdx, blockIdx, blockDim
-import .Device: sync_workgroup
+import .Device: threadIdx, blockIdx, blockDim, sync_workgroup
 import .Device: @rocprint, @rocprintln, @rocprintf
 
 export ROCDeviceArray, @ROCDynamicLocalArray, @ROCStaticLocalArray
@@ -131,26 +162,27 @@ export workitemIdx, workgroupIdx, workgroupDim, gridItemDim, gridGroupDim
 export sync_workgroup
 
 module Compiler
+    import Core: LLVMPtr
+    import LLD_jll
+
     using ..GPUCompiler
     using ..LLVM
-    import ..Adapt
-    import Core: LLVMPtr
     using Printf
 
     import ..AMDGPU
     import ..AMDGPU: AS
     import ..Runtime
     import ..Device
-    import .Runtime: ROCDevice, ROCModule, ROCFunction
-    import .Runtime: Adaptor
-    import .Runtime: Mem
+    import ..HIP
+    import ..Mem
 
-    include(joinpath("compiler", "device-libs.jl"))
-    include(joinpath("compiler", "utils.jl"))
-    include(joinpath("compiler", "global-hooks.jl"))
+    include(joinpath("compiler", "zeroinit_lds.jl"))
+    include(joinpath("compiler", "device_libs.jl"))
+    include(joinpath("compiler", "exceptions.jl"))
+    include(joinpath("compiler", "output_context.jl"))
+    include(joinpath("compiler", "dynamic_memory.jl"))
     include(joinpath("compiler", "codegen.jl"))
-    include(joinpath("compiler", "occupancy.jl"))
-end # module Compiler
+end
 
 include("tls.jl")
 include("highlevel.jl")
@@ -164,11 +196,10 @@ include("array.jl")
 include("conversions.jl")
 include("broadcast.jl")
 include("mapreduce.jl")
+include("exception_handler.jl")
 
 allowscalar(x::Bool) = GPUArrays.allowscalar(x)
 
-include("deprecations.jl")
-
 ### Initialization and Shutdown ###
 
 const HSA_REFCOUNT = Threads.Atomic{UInt}(0)
@@ -186,9 +217,6 @@ end
 
 # Load ROCm external libraries
 include(joinpath("blas", "rocBLAS.jl"))
-#include(joinpath("sparse", "rocSPARSE.jl")
-#include(joinpath("solver", "rocSOLVER.jl")
-#include(joinpath("solver", "rocALUTION.jl")
 include(joinpath("rand", "rocRAND.jl"))
 include(joinpath("fft", "rocFFT.jl"))
 include(joinpath("dnn", "MIOpen.jl"))
@@ -207,7 +235,7 @@ function __init__()
 
     if haskey(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS")
         env_use_artifacts = !parse(Bool, get(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS", "false"))
-        if use_artifacts != env_use_artifacts
+        if use_artifacts() != env_use_artifacts
             enable_artifacts!(env_use_artifacts)
             @warn """
             The environment variable JULIA_AMDGPU_DISABLE_ARTIFACTS does not match the value from preferences.
@@ -252,12 +280,9 @@ function __init__()
             end
 
             # Select the default device
-            for device in Runtime.fetch_devices()
-                if !isassigned(Runtime.DEFAULT_DEVICE) && device_type(device) == :gpu
-                    Runtime.DEFAULT_DEVICE[] = device
-                    break
-                end
-            end
+            Runtime.fetch_hsa_devices()
+            devs = Runtime.fetch_devices()
+            Runtime.set_default_device!(first(devs))
 
             # Setup HSA fault handler
             Runtime.setup_fault_handler()
@@ -269,7 +294,6 @@ function __init__()
         HSA runtime is unavailable, compilation and runtime functionality will be disabled.
         Reason: $hsa_build_reason
         """
-
         if parse(Bool, get(ENV, "JULIA_AMDGPU_CORE_MUST_LOAD", "0"))
             print_build_diagnostics()
             error("Failed to load HSA runtime, but HSA must load, bailing out")
@@ -282,7 +306,6 @@ function __init__()
         LLD is unavailable, compilation functionality will be disabled.
         Reason: $lld_build_reason
         """
-
         if parse(Bool, get(ENV, "JULIA_AMDGPU_CORE_MUST_LOAD", "0"))
             print_build_diagnostics()
             error("Failed to find ld.lld, but ld.lld must exist, bailing out")
@@ -295,7 +318,6 @@ function __init__()
         Device libraries are unavailable, device intrinsics will be disabled.
         Reason: $device_libs_build_reason
         """
-
         if parse(Bool, get(ENV, "JULIA_AMDGPU_CORE_MUST_LOAD", "0"))
             print_build_diagnostics()
             error("Failed to find Device Libs, but Device Libs must exist, bailing out")
@@ -303,14 +325,11 @@ function __init__()
     end
 
     # Check whether HIP is available
-    if functional(:hip)
-        push!(Libdl.DL_LOAD_PATH, dirname(libhip_path))
-    else
+    if !functional(:hip)
         @warn """
         HIP library is unavailable, HIP integration will be disabled.
         Reason: $hip_build_reason
         """
-
         if parse(Bool, get(ENV, "JULIA_AMDGPU_HIP_MUST_LOAD", "0"))
             print_build_diagnostics()
             error("Failed to load HIP runtime, but HIP must load, bailing out")
@@ -322,14 +341,15 @@ function __init__()
         "dense BLAS", "sparse BLAS", "linear solver",
         "fancy linear solver", "RNG", "FFT", "DNN/convolution")
     for ((name, pkg), purpose) in zip(rocm_ext_libs, descriptions)
-        if use_artifacts && pkg !== nothing && !functional(name)
+        if use_artifacts() && pkg !== nothing && !functional(name)
             # These are numerous and thus noisy
             build_reason = getfield(AMDGPU, Symbol(name, :_build_reason))
             @debug """
-            $pkg is unavailable, $purpose functionality will be disabled. Reason: $build_reason
+            $pkg is unavailable, $purpose functionality will be disabled.
+            Reason: $build_reason.
             """
         end
     end
 end
 
-end # module
+end
diff --git a/src/ROCKernels.jl b/src/ROCKernels.jl
index a7041d3bd..5066682ad 100644
--- a/src/ROCKernels.jl
+++ b/src/ROCKernels.jl
@@ -37,7 +37,6 @@ end
 
 function KernelAbstractions.copyto!(::ROCBackend, A, B)
     GC.@preserve A B begin
-        # TODO: async copy
         copyto!(A, 1, B, 1, length(A))
     end
     return nothing
@@ -100,9 +99,7 @@ function (obj::Kernel{ROCBackend})(args...; ndrange=nothing, workgroupsize=nothi
     nthreads = length(workitems(iterspace))
     nblocks == 0 && return nothing
 
-    AMDGPU.@roc(
-        groupsize=nthreads, gridsize=(nblocks * nthreads),
-        obj.f(ctx, args...))
+    kernel(ctx, args...; groupsize=nthreads, gridsize=nblocks)
     return nothing
 end
 
@@ -177,11 +174,12 @@ end
     AMDGPU.Device.sync_workgroup()
 end
 
-@device_override @inline function __print(args...)
-    for arg in args
-        AMDGPU.Device.@rocprintf("%s", arg)
-    end
-end
+# TODO fix
+# @device_override @inline function __print(args...)
+#     for arg in args
+#         AMDGPU.Device.@rocprintf("%s", arg)
+#     end
+# end
 
 ###
 # GPU implementation of constant memory
diff --git a/src/array.jl b/src/array.jl
index c3b9bad26..c995f1f6f 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -8,12 +8,11 @@ struct ROCArrayBackend <: AbstractGPUBackend end
 
 struct ROCKernelContext <: AbstractKernelContext end
 
-function GPUArrays.gpu_call(::ROCArrayBackend, f, args, threads::Int, blocks::Int; name::Union{String,Nothing})
-    groupsize, gridsize = threads, blocks * threads
-    wait(@roc groupsize=groupsize gridsize=gridsize f(ROCKernelContext(), args...))
-end
-function GPUArrays.gpu_call(::ROCArrayBackend, f, args; elements::Int, name::Union{String,Nothing}=nothing)
-    wait(@roc groupsize=min(elements, 64) gridsize=elements f(ROCKernelContext(), args...))
+@inline function GPUArrays.gpu_call(
+    ::ROCArrayBackend, f, args, threads::Int, blocks::Int;
+    name::Union{String, Nothing},
+)
+    @roc gridsize=blocks groupsize=threads name=name f(ROCKernelContext(), args...)
 end
 
 ## on-device
@@ -21,11 +20,11 @@ end
 # indexing
 
 for (f, froc) in (
-        (:blockidx, :blockIdx),
-        (:blockdim, :blockDim),
-        (:threadidx, :threadIdx),
-        (:griddim, :gridGroupDim)
-    )
+    (:blockidx, :blockIdx),
+    (:blockdim, :blockDim),
+    (:threadidx, :threadIdx),
+    (:griddim, :gridGroupDim)
+)
     @eval @inline GPUArrays.$f(::ROCKernelContext) = AMDGPU.$froc().x
 end
 
@@ -50,54 +49,32 @@ end
     return
 end
 
-
 #
 # Host abstractions
 #
 
 mutable struct ROCArray{T,N} <: AbstractGPUArray{T,N}
-    buf::Mem.Buffer
+    buf::Mem.HIPBuffer
     dims::Dims{N}
     offset::Int
-    syncstate::Runtime.SyncState
 
     function ROCArray{T,N}(
-        buf::Mem.Buffer, dims::Dims{N}; offset::Integer = 0,
-        syncstate::Runtime.SyncState = Runtime.SyncState(),
+        buf::Mem.HIPBuffer, dims::Dims{N}; offset::Integer = 0,
     ) where {T,N}
         @assert isbitstype(T) "ROCArray only supports bits types"
-        xs = new{T,N}(buf, dims, offset, syncstate)
+        xs = new{T,N}(buf, dims, offset)
         Mem.retain(buf)
         finalizer(_safe_free!, xs)
         return xs
     end
 end
 
-_safe_free!(xs::ROCArray) = _safe_free!(xs.buf)
-function _safe_free!(buf::Mem.Buffer)
-    Mem.release(buf)
-    return
-end
-
-unsafe_free!(xs::ROCArray) = Mem.free_if_live(xs.buf)
+_safe_free!(xs::ROCArray) = Mem.release(xs.buf; stream=default_stream())
 
-wait!(x::ROCArray) = wait!(x.syncstate)
-mark!(x::ROCArray, s) = mark!(x.syncstate, s)
-wait!(xs::Vector{<:ROCArray}) = foreach(wait!, xs)
-mark!(xs::Vector{<:ROCArray}, s) = foreach(x->mark!(x,s), xs)
-wait!(xs::NTuple{N,<:ROCArray} where N) = foreach(wait!, xs)
-mark!(xs::NTuple{N,<:ROCArray} where N, s) = foreach(x->mark!(x,s), xs)
-function Adapt.adapt_storage(::Runtime.WaitAdaptor, x::ROCArray)
-    Runtime.wait!(x.syncstate)
-    x
-end
-function Adapt.adapt_storage(ma::Runtime.MarkAdaptor, x::ROCArray)
-    Runtime.mark!(x.syncstate, ma.s)
-    x
-end
+unsafe_free!(xs::ROCArray) = Mem.free_if_live(xs.buf; stream=stream())
 
 """
-    device(A::ROCArray) -> ROCDevice
+    device(A::ROCArray) -> HIPDevice
 
 Return the device associated with the array `A`.
 """
@@ -119,8 +96,8 @@ AnyROCVecOrMat{T} = Union{AnyROCVector{T}, AnyROCMatrix{T}}
 
 # type and dimensionality specified, accepting dims as tuples of Ints
 function ROCArray{T,N}(::UndefInitializer, dims::Dims{N}) where {T,N}
-    buf = Mem.alloc(prod(dims)*sizeof(T))
-    ROCArray{T,N}(buf, dims)
+    buf = Mem.HIPBuffer(prod(dims) * sizeof(T); stream=stream())
+    ROCArray{T, N}(buf, dims)
 end
 
 # type and dimensionality specified, accepting dims as series of Ints
@@ -134,14 +111,10 @@ ROCArray{T}(::UndefInitializer, dims::Integer...) where {T} =
 # from Base arrays
 function ROCArray{T,N}(x::Array{T,N}, dims::Dims{N}) where {T,N}
     r = ROCArray{T,N}(undef, dims)
-    Mem.upload!(r.buf, pointer(x), sizeof(x))
+    Mem.upload!(r.buf, pointer(x), sizeof(x); stream=stream())
     return r
 end
 
-# type as first argument
-# FIXME: Remove me!
-#ROCArray(::Type{T}, dims::Dims{N}) where {T,N} = ROCArray{T,N}(undef, dims)
-
 # empty vector constructor
 ROCArray{T,1}() where {T} = ROCArray{T,1}(undef, 0)
 
@@ -152,7 +125,6 @@ Base.similar(::ROCArray, ::Type{T}, dims::Base.Dims{N}) where {T,N} = ROCArray{T
 ## array interface
 
 Base.elsize(::Type{<:ROCArray{T}}) where {T} = sizeof(T)
-
 Base.size(x::ROCArray) = x.dims
 Base.sizeof(x::ROCArray) = Base.elsize(x) * length(x)
 
@@ -162,9 +134,9 @@ ROCArray{T,N}(x::AbstractArray{S,N}) where {T,N,S} =
     ROCArray{T,N}(convert(Array{T}, x), size(x))
 
 # underspecified constructors
+ROCArray(A::AbstractArray{T,N}) where {T,N} = ROCArray{T,N}(A)
 ROCArray{T}(xs::AbstractArray{S,N}) where {T,N,S} = ROCArray{T,N}(xs)
 (::Type{ROCArray{T,N} where T})(x::AbstractArray{S,N}) where {S,N} = ROCArray{S,N}(x)
-ROCArray(A::AbstractArray{T,N}) where {T,N} = ROCArray{T,N}(A)
 
 # idempotency
 ROCArray{T,N}(xs::ROCArray{T,N}) where {T,N} = xs
@@ -173,44 +145,47 @@ ROCArray{T,N}(xs::ROCArray{T,N}) where {T,N} = xs
 
 Base.convert(::Type{T}, x::T) where T <: ROCArray = x
 
-
 ## memory operations
 
-function Base.copyto!(dest::Array{T}, d_offset::Integer,
-                      source::ROCArray{T}, s_offset::Integer,
-                      amount::Integer) where T
+function Base.copyto!(
+    dest::Array{T}, d_offset::Integer,
+    source::ROCArray{T}, s_offset::Integer, amount::Integer;
+    async::Bool = false,
+) where T
     amount == 0 && return dest
     @boundscheck checkbounds(dest, d_offset+amount-1)
     @boundscheck checkbounds(source, s_offset+amount-1)
-    wait!(source)
-    Mem.download!(pointer(dest, d_offset),
-                  Mem.view(source.buf, source.offset + (s_offset-1)*sizeof(T)),
-                  amount*sizeof(T))
+    strm = stream()
+    Mem.download!(
+        pointer(dest, d_offset),
+        Mem.view(source.buf, source.offset + (s_offset - 1) * sizeof(T)),
+        amount * sizeof(T); stream=strm)
+    async || AMDGPU.synchronize(strm)
     dest
 end
-function Base.copyto!(dest::ROCArray{T}, d_offset::Integer,
-                      source::Array{T}, s_offset::Integer,
-                      amount::Integer) where T
+function Base.copyto!(
+    dest::ROCArray{T}, d_offset::Integer,
+    source::Array{T}, s_offset::Integer, amount::Integer,
+) where T
     amount == 0 && return dest
     @boundscheck checkbounds(dest, d_offset+amount-1)
     @boundscheck checkbounds(source, s_offset+amount-1)
-    wait!(dest)
-    Mem.upload!(Mem.view(dest.buf, dest.offset + (d_offset-1)*sizeof(T)),
-                pointer(source, s_offset),
-                amount*sizeof(T))
+    Mem.upload!(
+        Mem.view(dest.buf, dest.offset + (d_offset - 1) * sizeof(T)),
+        pointer(source, s_offset), amount * sizeof(T); stream=stream())
     dest
 end
-function Base.copyto!(dest::ROCArray{T}, d_offset::Integer,
-                      source::ROCArray{T}, s_offset::Integer,
-                      amount::Integer) where T
+function Base.copyto!(
+    dest::ROCArray{T}, d_offset::Integer,
+    source::ROCArray{T}, s_offset::Integer, amount::Integer,
+) where T
     amount == 0 && return dest
-    @boundscheck checkbounds(dest, d_offset+amount-1)
-    @boundscheck checkbounds(source, s_offset+amount-1)
-    wait!(dest)
-    wait!(source)
-    Mem.transfer!(Mem.view(dest.buf, dest.offset + (d_offset-1)*sizeof(T)),
-                  Mem.view(source.buf, source.offset + (s_offset-1)*sizeof(T)),
-                  amount*sizeof(T))
+    @boundscheck checkbounds(dest, d_offset + amount - 1)
+    @boundscheck checkbounds(source, s_offset + amount - 1)
+    Mem.transfer!(
+        Mem.view(dest.buf, dest.offset + (d_offset - 1) * sizeof(T)),
+        Mem.view(source.buf, source.offset + (s_offset - 1) * sizeof(T)),
+        amount * sizeof(T); stream=stream())
     dest
 end
 
@@ -221,13 +196,22 @@ function Base.copy(X::ROCArray{T}) where T
     Xnew
 end
 
-function Base.unsafe_wrap(::Type{<:ROCArray}, ptr::Ptr{T}, dims::NTuple{N,<:Integer}; device=device(), lock::Bool=true) where {T,N}
+function Base.unsafe_wrap(
+    ::Type{<:ROCArray}, ptr::Ptr{T}, dims::NTuple{N, <:Integer};
+    lock::Bool = true,
+) where {T,N}
     @assert isbitstype(T) "Cannot wrap a non-bitstype pointer as a ROCArray"
+    # TODO specialize ROCArray on a buffer type and pass HostBuffer.
     sz = prod(dims) * sizeof(T)
-    device_ptr = lock ? Mem.lock(ptr, sz, device) : ptr
-    buf = Mem.Buffer(device_ptr, Ptr{Cvoid}(ptr), device_ptr, sz, device, false, false)
-    return ROCArray{T, N}(buf, dims)
+    dptr = if lock
+        HIP.hipHostRegister(ptr, sz, HIP.hipHostRegisterMapped) |> HIP.check
+        Mem.device_ptr(Mem.HostBuffer(ptr, sz))
+    else
+        Ptr{Cvoid}(ptr)
+    end
+    return ROCArray{T, N}(Mem.HIPBuffer(dptr, sz), dims)
 end
+
 Base.unsafe_wrap(::Type{ROCArray{T}}, ptr::Ptr, dims; kwargs...) where T =
     unsafe_wrap(ROCArray, Base.unsafe_convert(Ptr{T}, ptr), dims; kwargs...)
 
@@ -274,7 +258,7 @@ end
 end
 @inline function unsafe_contiguous_view(a::ROCArray{T}, I::NTuple{N,Base.ViewIndex}, dims::NTuple{M,Integer}) where {T,N,M}
     offset = Base.compute_offset1(a, 1, I) * sizeof(T)
-    ROCArray{T,M}(a.buf, dims; offset=a.offset+offset, syncstate=a.syncstate)
+    ROCArray{T,M}(a.buf, dims; offset=a.offset + offset)
 end
 
 @inline function unsafe_view(A, I, ::NonContiguous)
@@ -295,36 +279,9 @@ function Base.reshape(a::ROCArray{T,M}, dims::NTuple{N,Int}) where {T,N,M}
     if N == M && dims == size(a)
         return a
     end
-    ROCArray{T,N}(a.buf, dims; offset=a.offset, syncstate=a.syncstate)
-end
-
-
-## fft
-
-#=
-using AbstractFFTs
-
-# defining our own plan type is the easiest way to pass around the plans in FFTW interface
-# without ambiguities
-
-struct FFTPlan{T}
-    p::T
+    ROCArray{T,N}(a.buf, dims; offset=a.offset)
 end
 
-AbstractFFTs.plan_fft(A::ROCArray; kw_args...) = FFTPlan(plan_fft(A.data; kw_args...))
-AbstractFFTs.plan_fft!(A::ROCArray; kw_args...) = FFTPlan(plan_fft!(A.data; kw_args...))
-AbstractFFTs.plan_bfft!(A::ROCArray; kw_args...) = FFTPlan(plan_bfft!(A.data; kw_args...))
-AbstractFFTs.plan_bfft(A::ROCArray; kw_args...) = FFTPlan(plan_bfft(A.data; kw_args...))
-AbstractFFTs.plan_ifft!(A::ROCArray; kw_args...) = FFTPlan(plan_ifft!(A.data; kw_args...))
-AbstractFFTs.plan_ifft(A::ROCArray; kw_args...) = FFTPlan(plan_ifft(A.data; kw_args...))
-
-function Base.:(*)(plan::FFTPlan, A::ROCArray)
-    x = plan.p * A.data
-    ROCArray(x)
-end
-=#
-
-
 ## GPUArrays interfaces
 
 GPUArrays.device(x::ROCArray) = x.buf.device
@@ -387,7 +344,7 @@ zeros(T::Type, dims...) = fill!(ROCArray{T}(undef, dims...), zero(T))
 # create a derived array (reinterpreted or reshaped) that's still a ROCArray
 # TODO: Move this to GPUArrays?
 @inline function _derived_array(::Type{T}, N::Int, a::ROCArray, osize::Dims) where {T}
-    return ROCArray{T,N}(a.buf, osize; offset=a.offset, syncstate=a.syncstate)
+    return ROCArray{T,N}(a.buf, osize; offset=a.offset)
 end
 
 ## reinterpret
@@ -514,14 +471,16 @@ Note that this operation is only supported on managed buffers, i.e., not on
 arrays that are created by `unsafe_wrap`.
 """
 function Base.resize!(A::ROCVector{T}, n::Integer) where T
-    if A.buf.host_ptr != C_NULL
-        throw(ArgumentError("Cannot resize an unowned `ROCVector`"))
-    end
+    # TODO
+    #   1. Specialize ROCArray on storage type.
+    #   2. Check that it is not HostBuffer.
+    # if A.buf.host_ptr != C_NULL
+    #     throw(ArgumentError("Cannot resize an unowned `ROCVector`"))
+    # end
 
     # TODO: add additional space to allow for quicker resizing
-    if n == length(A)
-        return A
-    end
+    n == length(A) && return A
+
     maxsize = n * sizeof(T)
     bufsize = if Base.isbitsunion(T)
         # type tag array past the data
@@ -529,15 +488,15 @@ function Base.resize!(A::ROCVector{T}, n::Integer) where T
     else
         maxsize
     end
-    new_buf = Mem.alloc(A.buf.device, bufsize)
+    new_buf = Mem.HIPBuffer(bufsize; stream=stream())
+
     copy_size = min(length(A), n) * sizeof(T)
-    wait!(A)
     if copy_size > 0
-        Mem.transfer!(new_buf, A.buf, copy_size)
+        Mem.transfer!(new_buf, A.buf, copy_size; stream=stream())
     end
 
     # Release old buffer
-    _safe_free!(A.buf)
+    _safe_free!(A)
     # N.B. Manually retain new buffer (this is normally done in ROCArray ctor)
     Mem.retain(new_buf)
 
diff --git a/src/blas/rocBLAS.jl b/src/blas/rocBLAS.jl
index 9fd6135c3..5cffd00ae 100644
--- a/src/blas/rocBLAS.jl
+++ b/src/blas/rocBLAS.jl
@@ -1,7 +1,7 @@
 module rocBLAS
 
 using ..AMDGPU
-import AMDGPU: wait!, mark!, librocblas, AnyROCArray
+import AMDGPU: librocblas, AnyROCArray
 import AMDGPU: HandleCache, HIP, library_state
 import .HIP: HIPContext, HIPStream, hipContext_t, hipStream_t, hipEvent_t
 
diff --git a/src/blas/wrappers.jl b/src/blas/wrappers.jl
index 97fdcdf1d..1413bd231 100644
--- a/src/blas/wrappers.jl
+++ b/src/blas/wrappers.jl
@@ -47,39 +47,35 @@ end
 
 # Level 1
 ## copy
-for (fname, elty) in ((:rocblas_dcopy,:Float64),
-                      (:rocblas_scopy,:Float32),
-                      (:rocblas_zcopy,:ComplexF64),
-                      (:rocblas_ccopy,:ComplexF32))
+for (fname, elty) in (
+    (:rocblas_dcopy,:Float64),
+    (:rocblas_scopy,:Float32),
+    (:rocblas_zcopy,:ComplexF64),
+    (:rocblas_ccopy,:ComplexF32),
+)
     @eval begin
-        function blascopy!(n::Integer,
-                           DX::ROCArray{$elty},
-                           incx::Integer,
-                           DY::ROCArray{$elty},
-                           incy::Integer)
-              wait!((DX,DY))
-              (; handle, stream) = lib_state()
-              $(fname)(handle, n, DX, incx, DY, incy) |> check
-              mark!((DX,DY), stream)
-              DY
+        function blascopy!(
+            n::Integer, DX::ROCArray{$elty}, incx::Integer,
+            DY::ROCArray{$elty}, incy::Integer,
+        )
+            (; handle, stream) = lib_state()
+            $(fname)(handle, n, DX, incx, DY, incy) |> check
+            DY
         end
     end
 end
 
 ## scal
-for (fname, elty) in ((:rocblas_dscal,:Float64),
-                      (:rocblas_sscal,:Float32),
-                      (:rocblas_zscal,:ComplexF64),
-                      (:rocblas_cscal,:ComplexF32))
+for (fname, elty) in (
+    (:rocblas_dscal,:Float64),
+    (:rocblas_sscal,:Float32),
+    (:rocblas_zscal,:ComplexF64),
+    (:rocblas_cscal,:ComplexF32),
+)
     @eval begin
-        function scal!(n::Integer,
-                       DA::$elty,
-                       DX::ROCArray{$elty},
-                       incx::Integer)
-            wait!(DX)
+        function scal!(n::Integer, DA::$elty, DX::ROCArray{$elty}, incx::Integer)
             (; handle, stream) = lib_state()
             $(fname)(handle, n, Ref(DA), DX, incx) |> check
-            mark!(DX, stream)
             DX
         end
     end
@@ -88,14 +84,9 @@ end
 for (fname, elty, celty) in ((:rocblas_sscal, :Float32, :ComplexF32),
                              (:rocblas_dscal, :Float64, :ComplexF64))
     @eval begin
-        function scal!(n::Integer,
-                       DA::$elty,
-                       DX::ROCArray{$celty},
-                       incx::Integer)
-            wait!(DX)
+        function scal!(n::Integer, DA::$elty, DX::ROCArray{$celty}, incx::Integer)
             (; handle, stream) = lib_state()
             $(fname)(handle, 2*n, Ref(DA), DX, incx) |> check
-            mark!(DX, stream)
             DX
         end
     end
@@ -109,13 +100,11 @@ for (jname, fname, elty) in ((:dot,:rocblas_ddot,:Float64),
                              (:dotu,:rocblas_zdotu,:ComplexF64),
                              (:dotu,:rocblas_cdotu,:ComplexF32))
     @eval begin
-        function $jname(n::Integer,
-                        DX::ROCArray{$elty},
-                        incx::Integer,
-                        DY::ROCArray{$elty},
-                        incy::Integer)
+        function $jname(
+            n::Integer, DX::ROCArray{$elty}, incx::Integer,
+            DY::ROCArray{$elty}, incy::Integer,
+        )
             result = Ref{$elty}()
-            wait!((DX,DY))
             $(fname)(handle(), n, DX, incx, DY, incy, result) |> check
             return result[]
         end
@@ -128,11 +117,8 @@ for (fname, elty, ret_type) in ((:rocblas_dnrm2,:Float64,:Float64),
                                 (:rocblas_dznrm2,:ComplexF64,:Float64),
                                 (:rocblas_scnrm2,:ComplexF32,:Float32))
     @eval begin
-        function nrm2(n::Integer,
-                      X::ROCArray{$elty},
-                      incx::Integer)
+        function nrm2(n::Integer, X::ROCArray{$elty}, incx::Integer)
             result = Ref{$ret_type}()
-            wait!(X)
             $(fname)(handle(), n, X, incx, result) |> check
             return result[]
         end
@@ -147,11 +133,8 @@ for (fname, elty, ret_type) in ((:rocblas_dasum,:Float64,:Float64),
                                 (:rocblas_dzasum,:ComplexF64,:Float64),
                                 (:rocblas_scasum,:ComplexF32,:Float32))
     @eval begin
-        function asum(n::Integer,
-                      X::ROCArray{$elty},
-                      incx::Integer)
+        function asum(n::Integer, X::ROCArray{$elty}, incx::Integer)
             result = Ref{$ret_type}()
-            wait!(X)
             $(fname)(handle(), n, X, incx, result) |> check
             return result[]
         end
@@ -165,16 +148,12 @@ for (fname, elty) in ((:rocblas_daxpy,:Float64),
                       (:rocblas_zaxpy,:ComplexF64),
                       (:rocblas_caxpy,:ComplexF32))
     @eval begin
-        function axpy!(n::Integer,
-                       alpha::($elty),
-                       dx::ROCArray{$elty},
-                       incx::Integer,
-                       dy::ROCArray{$elty},
-                       incy::Integer)
-            wait!((dx,dy))
+        function axpy!(
+            n::Integer, alpha::($elty), dx::ROCArray{$elty}, incx::Integer,
+            dy::ROCArray{$elty}, incy::Integer,
+        )
             (; handle, stream) = lib_state()
             $(fname)(handle, n, Ref(alpha), dx, incx, dy, incy) |> check
-            mark!((dx,dy), stream)
             dy
         end
     end
@@ -188,8 +167,10 @@ function axpy!(
     if minimum(rx) < 1 || maximum(rx) > length(x) || minimum(ry) < 1 || maximum(ry) > length(y)
         throw(BoundsError())
     end
-    axpy!(length(rx), convert(T, alpha), pointer(x)+(first(rx)-1)*sizeof(T),
-          step(rx), pointer(y)+(first(ry)-1)*sizeof(T), step(ry))
+    axpy!(
+        length(rx), convert(T, alpha),
+        pointer(x) + (first(rx) - 1) * sizeof(T), step(rx),
+        pointer(y) + (first(ry) - 1) * sizeof(T), step(ry))
     y
 end
 
@@ -244,12 +225,10 @@ for (fname, elty) in ((:rocblas_dgemv,:Float64),
                       (:rocblas_zgemv,:ComplexF64),
                       (:rocblas_cgemv,:ComplexF32))
     @eval begin
-        function gemv!(trans::Char,
-                       alpha::($elty),
-                       A::ROCMatrix{$elty},
-                       X::ROCVector{$elty},
-                       beta::($elty),
-                       Y::ROCVector{$elty})
+        function gemv!(
+            trans::Char, alpha::($elty), A::ROCMatrix{$elty}, X::ROCVector{$elty},
+            beta::($elty), Y::ROCVector{$elty},
+        )
             # handle trans
             roctrans = rocblasop(trans)
             m,n = size(A)
@@ -257,12 +236,9 @@ for (fname, elty) in ((:rocblas_dgemv,:Float64),
             length(X) == (trans == 'N' ? n : m) && length(Y) == (trans == 'N' ? m : n) || throw(DimensionMismatch(""))
             # compute increments
             lda = max(1,stride(A,2))
-            incx = stride(X,1)
-            incy = stride(Y,1)
-            wait!((A,X,Y))
+            incx, incy = stride(X,1), stride(Y,1)
             (; handle, stream) = lib_state()
             $(fname)(handle, roctrans, m, n, Ref(alpha), A, lda, X, incx, Ref(beta), Y, incy) |> check
-            mark!((A,X,Y), stream)
             Y
         end
         function gemv(trans::Char, alpha::($elty), A::ROCMatrix{$elty}, X::ROCVector{$elty})
@@ -280,48 +256,35 @@ for (fname, elty) in ((:rocblas_dgbmv,:Float64),
                       (:rocblas_zgbmv,:ComplexF64),
                       (:rocblas_cgbmv,:ComplexF32))
     @eval begin
-        function gbmv!(trans::Char,
-                       m::Integer,
-                       kl::Integer,
-                       ku::Integer,
-                       alpha::($elty),
-                       A::ROCMatrix{$elty},
-                       x::ROCVector{$elty},
-                       beta::($elty),
-                       y::ROCVector{$elty})
+        function gbmv!(
+            trans::Char, m::Integer, kl::Integer, ku::Integer, alpha::($elty),
+            A::ROCMatrix{$elty}, x::ROCVector{$elty}, beta::($elty), y::ROCVector{$elty},
+        )
             # handle trans
             roctrans = rocblasop(trans)
-            n = size(A,2)
+            n = size(A, 2)
             # check dimensions
             length(x) == (trans == 'N' ? n : m) && length(y) == (trans == 'N' ? m : n) || throw(DimensionMismatch(""))
             # compute increments
-            lda = max(1,stride(A,2))
-            incx = stride(x,1)
-            incy = stride(y,1)
-            wait!((A,x,y))
+            lda = max(1, stride(A, 2))
+            incx, incy = stride(x, 1), stride(y, 1)
             (; handle, stream) = lib_state()
             $(fname)(handle, roctrans, m, n, kl, ku, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check
-            mark!((A,x,y), stream)
             y
         end
-        function gbmv(trans::Char,
-                      m::Integer,
-                      kl::Integer,
-                      ku::Integer,
-                      alpha::($elty),
-                      A::ROCMatrix{$elty},
-                      x::ROCVector{$elty})
+        function gbmv(
+            trans::Char, m::Integer, kl::Integer, ku::Integer, alpha::($elty),
+            A::ROCMatrix{$elty}, x::ROCVector{$elty},
+        )
             # TODO: fix gbmv bug in julia
             n = size(A,2)
             leny = trans == 'N' ? m : n
             gbmv!(trans, m, kl, ku, alpha, A, x, zero($elty), similar(x, $elty, leny))
         end
-        function gbmv(trans::Char,
-                      m::Integer,
-                      kl::Integer,
-                      ku::Integer,
-                      A::ROCMatrix{$elty},
-                      x::ROCVector{$elty})
+        function gbmv(
+            trans::Char, m::Integer, kl::Integer, ku::Integer,
+            A::ROCMatrix{$elty}, x::ROCVector{$elty},
+        )
             gbmv(trans, m, kl, ku, one($elty), A, x)
         end
     end
@@ -334,23 +297,18 @@ for (fname, elty) in ((:rocblas_dsymv,:Float64),
                       (:rocblas_csymv,:ComplexF32))
     # Note that the complex symv are not BLAS but auiliary functions in LAPACK
     @eval begin
-        function symv!(uplo::Char,
-                       alpha::($elty),
-                       A::ROCMatrix{$elty},
-                       x::ROCVector{$elty},
-                       beta::($elty),
-                       y::ROCVector{$elty})
+        function symv!(
+            uplo::Char, alpha::($elty), A::ROCMatrix{$elty}, x::ROCVector{$elty},
+            beta::($elty), y::ROCVector{$elty},
+        )
             rocuplo = rocblasfill(uplo)
             m, n = size(A)
             if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end
             if m != length(x) || m != length(y) throw(DimensionMismatch("")) end
-            lda = max(1,stride(A,2))
-            incx = stride(x,1)
-            incy = stride(y,1)
-            wait!((A,x,y))
+            lda = max(1, stride(A, 2))
+            incx, incy = stride(x, 1), stride(y,1)
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, n, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check
-            mark!((A,x,y), stream)
             y
         end
         function symv(uplo::Char, alpha::($elty), A::ROCMatrix{$elty}, x::ROCVector{$elty})
@@ -367,24 +325,19 @@ end
 for (fname, elty) in ((:rocblas_zhemv,:ComplexF64),
                       (:rocblas_chemv,:ComplexF32))
     @eval begin
-        function hemv!(uplo::Char,
-                       alpha::$elty,
-                       A::ROCMatrix{$elty},
-                       x::ROCVector{$elty},
-                       beta::$elty,
-                       y::ROCVector{$elty})
+        function hemv!(
+            uplo::Char, alpha::$elty, A::ROCMatrix{$elty}, x::ROCVector{$elty},
+            beta::$elty, y::ROCVector{$elty},
+        )
             # TODO: fix dimension check bug in julia
             rocuplo = rocblasfill(uplo)
             m, n = size(A)
             if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end
             if m != length(x) || m != length(y) throw(DimensionMismatch("")) end
-            lda = max(1,stride(A,2))
-            incx = stride(x,1)
-            incy = stride(y,1)
-            wait!((A,x,y))
+            lda = max(1, stride(A, 2))
+            incx, incy = stride(x, 1), stride(y, 1)
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, n, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check
-            mark!((A,x,y), stream)
             y
         end
         function hemv(uplo::Char, alpha::($elty), A::ROCMatrix{$elty},
@@ -404,26 +357,20 @@ end
 for (fname, elty) in ((:rocblas_dsbmv,:Float64),
                       (:rocblas_ssbmv,:Float32))
     @eval begin
-        function sbmv!(uplo::Char,
-                       k::Integer,
-                       alpha::($elty),
-                       A::ROCMatrix{$elty},
-                       x::ROCVector{$elty},
-                       beta::($elty),
-                       y::ROCVector{$elty})
+        function sbmv!(
+            uplo::Char, k::Integer, alpha::($elty), A::ROCMatrix{$elty},
+            x::ROCVector{$elty}, beta::($elty), y::ROCVector{$elty},
+        )
             rocuplo = rocblasfill(uplo)
             m, n = size(A)
             #if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end
             if !(1<=(1+k)<=n) throw(DimensionMismatch("Incorrect number of bands")) end
             if m < 1+k throw(DimensionMismatch("Array A has fewer than 1+k rows")) end
             if n != length(x) || n != length(y) throw(DimensionMismatch("")) end
-            lda = max(1,stride(A,2))
-            incx = stride(x,1)
-            incy = stride(y,1)
-            wait!((A,x,y))
+            lda = max(1, stride(A, 2))
+            incx, incy = stride(x, 1), stride(y, 1)
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, n, k, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check
-            mark!((A,x,y), stream)
             y
         end
         function sbmv(uplo::Char, k::Integer, alpha::($elty),
@@ -442,25 +389,19 @@ end
 for (fname, elty) in ((:rocblas_zhbmv,:ComplexF64),
                       (:rocblas_chbmv,:ComplexF32))
     @eval begin
-        function hbmv!(uplo::Char,
-                       k::Integer,
-                       alpha::($elty),
-                       A::ROCMatrix{$elty},
-                       x::ROCVector{$elty},
-                       beta::($elty),
-                       y::ROCVector{$elty})
+        function hbmv!(
+            uplo::Char, k::Integer, alpha::($elty), A::ROCMatrix{$elty},
+            x::ROCVector{$elty}, beta::($elty), y::ROCVector{$elty},
+        )
             rocuplo = rocblasfill(uplo)
             m, n = size(A)
             if !(1<=(1+k)<=n) throw(DimensionMismatch("Incorrect number of bands")) end
             if m < 1+k throw(DimensionMismatch("Array A has fewer than 1+k rows")) end
             if n != length(x) || n != length(y) throw(DimensionMismatch("")) end
-            lda = max(1,stride(A,2))
-            incx = stride(x,1)
-            incy = stride(y,1)
-            wait!((A,x,y))
+            lda = max(1,stride(A, 2))
+            incx, incy = stride(x, 1), stride(y, 1)
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, n, k, Ref(alpha), A, lda, x, incx, Ref(beta), y, incy) |> check
-            mark!((A,x,y), stream)
             y
         end
         function hbmv(uplo::Char, k::Integer, alpha::($elty),
@@ -481,12 +422,10 @@ for (fname, elty) in ((:rocblas_stbmv,:Float32),
                       (:rocblas_ztbmv,:ComplexF64),
                       (:rocblas_ctbmv,:ComplexF32))
     @eval begin
-        function tbmv!(uplo::Char,
-                       trans::Char,
-                       diag::Char,
-                       k::Integer,
-                       A::ROCMatrix{$elty},
-                       x::ROCVector{$elty})
+        function tbmv!(
+            uplo::Char, trans::Char, diag::Char, k::Integer,
+            A::ROCMatrix{$elty}, x::ROCVector{$elty},
+        )
             rocuplo  = rocblasfill(uplo)
             roctrans = rocblasop(trans)
             rocdiag  = rocblasdiag(diag)
@@ -496,17 +435,14 @@ for (fname, elty) in ((:rocblas_stbmv,:Float32),
             if n != length(x) throw(DimensionMismatch("")) end
             lda = max(1,stride(A,2))
             incx = stride(x,1)
-            wait!((A,x))
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, roctrans, rocdiag, n, k, A, lda, x, incx) |> check
-            mark!((A,x), stream)
             x
         end
-        function tbmv(uplo::Char,
-                      trans::Char,
-                      diag::Char,
-                      A::ROCMatrix{$elty},
-                      x::ROCVector{$elty})
+        function tbmv(
+            uplo::Char, trans::Char, diag::Char,
+            A::ROCMatrix{$elty}, x::ROCVector{$elty},
+        )
             tbmv!(uplo, trans, diag, A, copy(x))
         end
     end
@@ -518,12 +454,10 @@ for (fname, elty) in ((:rocblas_stbsv,:Float32),
                       (:rocblas_ztbsv,:ComplexF64),
                       (:rocblas_ctbsv,:ComplexF32))
     @eval begin
-        function tbsv!(uplo::Char,
-                       trans::Char,
-                       diag::Char,
-                       k::Integer,
-                       A::ROCMatrix{$elty},
-                       x::ROCVector{$elty})
+        function tbsv!(
+            uplo::Char, trans::Char, diag::Char, k::Integer,
+            A::ROCMatrix{$elty}, x::ROCVector{$elty},
+        )
             rocuplo  = rocblasfill(uplo)
             roctrans = rocblasop(trans)
             rocdiag  = rocblasdiag(diag)
@@ -533,18 +467,14 @@ for (fname, elty) in ((:rocblas_stbsv,:Float32),
             if n != length(x) throw(DimensionMismatch("")) end
             lda = max(1,stride(A,2))
             incx = stride(x,1)
-            wait!((A,x))
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, roctrans, rocdiag, n, k, A, lda, x, incx) |> check
-            mark!((A,x), stream)
             x
         end
-        function tbsv(uplo::Char,
-                      trans::Char,
-                      diag::Char,
-                      k::Integer,
-                      A::ROCMatrix{$elty},
-                      x::ROCVector{$elty})
+        function tbsv(
+            uplo::Char, trans::Char, diag::Char, k::Integer,
+            A::ROCMatrix{$elty}, x::ROCVector{$elty},
+        )
             tbsv!(uplo, trans, diag, k, A, copy(x))
         end
     end
@@ -556,11 +486,9 @@ for (fname, elty) in ((:rocblas_dtrmv,:Float64),
                       (:rocblas_ztrmv,:ComplexF64),
                       (:rocblas_ctrmv,:ComplexF32))
     @eval begin
-        function trmv!(uplo::Char,
-                       trans::Char,
-                       diag::Char,
-                       A::ROCMatrix{$elty},
-                       x::ROCVector{$elty})
+        function trmv!(
+            uplo::Char, trans::Char, diag::Char, A::ROCMatrix{$elty}, x::ROCVector{$elty},
+        )
             m, n = size(A)
             if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end
             if n != length(x)
@@ -571,17 +499,11 @@ for (fname, elty) in ((:rocblas_dtrmv,:Float64),
             rocdiag = rocblasdiag(diag)
             lda = max(1,stride(A,2))
             incx = stride(x,1)
-            wait!((A,x))
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, roctrans, rocdiag, n, A, lda, x, incx) |> check
-            mark!((A,x), stream)
             x
         end
-        function trmv(uplo::Char,
-                      trans::Char,
-                      diag::Char,
-                      A::ROCMatrix{$elty},
-                      x::ROCVector{$elty})
+        function trmv(uplo::Char, trans::Char, diag::Char, A::ROCMatrix{$elty}, x::ROCVector{$elty})
             trmv!(uplo, trans, diag, A, copy(x))
         end
     end
@@ -593,11 +515,7 @@ for (fname, elty) in ((:rocblas_dtrsv,:Float64),
                       (:rocblas_ztrsv,:ComplexF64),
                       (:rocblas_ctrsv,:ComplexF32))
     @eval begin
-        function trsv!(uplo::Char,
-                       trans::Char,
-                       diag::Char,
-                       A::ROCMatrix{$elty},
-                       x::ROCVector{$elty})
+        function trsv!(uplo::Char, trans::Char, diag::Char, A::ROCMatrix{$elty}, x::ROCVector{$elty})
             m, n = size(A)
             if m != n throw(DimensionMismatch("Matrix A is $m by $n but must be square")) end
             if n != length(x)
@@ -608,17 +526,11 @@ for (fname, elty) in ((:rocblas_dtrsv,:Float64),
             rocdiag = rocblasdiag(diag)
             lda = max(1,stride(A,2))
             incx = stride(x,1)
-            wait!((A,x))
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, roctrans, rocdiag, n, A, lda, x, incx) |> check
-            mark!((A,x), stream)
             x
         end
-        function trsv(uplo::Char,
-                      trans::Char,
-                      diag::Char,
-                      A::ROCMatrix{$elty},
-                      x::ROCVector{$elty})
+        function trsv(uplo::Char, trans::Char, diag::Char, A::ROCMatrix{$elty}, x::ROCVector{$elty})
             trsv!(uplo, trans, diag, A, copy(x))
         end
     end
@@ -630,20 +542,15 @@ for (fname, elty) in ((:rocblas_dger,:Float64),
                       (:rocblas_zgerc,:ComplexF64),
                       (:rocblas_cgerc,:ComplexF32))
     @eval begin
-        function ger!(alpha::$elty,
-                      x::ROCVector{$elty},
-                      y::ROCVector{$elty},
-                      A::ROCMatrix{$elty})
+        function ger!(alpha::$elty, x::ROCVector{$elty}, y::ROCVector{$elty}, A::ROCMatrix{$elty})
             m, n = size(A)
             m == length(x) || throw(DimensionMismatch(""))
             n == length(y) || throw(DimensionMismatch(""))
             incx = stride(x,1)
             incy = stride(y,1)
             lda = max(1,stride(A,2))
-            wait!((x,y,A))
             (; handle, stream) = lib_state()
             $(fname)(handle, m, n, Ref(alpha), x, incx, y, incy, A, lda) |> check
-            mark!((x,y,A), stream)
             A
         end
     end
@@ -656,20 +563,15 @@ for (fname, elty) in ((:rocblas_dsyr,:Float64),
                       (:rocblas_zsyr,:ComplexF64),
                       (:rocblas_csyr,:ComplexF32))
     @eval begin
-        function syr!(uplo::Char,
-                      alpha::$elty,
-                      x::ROCVector{$elty},
-                      A::ROCMatrix{$elty})
+        function syr!(uplo::Char, alpha::$elty, x::ROCVector{$elty}, A::ROCMatrix{$elty})
             rocuplo = rocblasfill(uplo)
             m, n = size(A)
             m == n || throw(DimensionMismatch("Matrix A is $m by $n but must be square"))
             length(x) == n || throw(DimensionMismatch("Length of vector must be the same as the matrix dimensions"))
             incx = stride(x,1)
             lda = max(1,stride(A,2))
-            wait!((x,A))
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, n, Ref(alpha), x, incx, A, lda) |> check
-            mark!((x,A), stream)
             A
         end
     end
@@ -679,20 +581,15 @@ end
 for (fname, elty) in ((:rocblas_zher,:ComplexF64),
                       (:rocblas_cher,:ComplexF32))
     @eval begin
-        function her!(uplo::Char,
-                      alpha::$elty,
-                      x::ROCVector{$elty},
-                      A::ROCMatrix{$elty})
+        function her!(uplo::Char, alpha::$elty, x::ROCVector{$elty}, A::ROCMatrix{$elty})
             rocuplo = rocblasfill(uplo)
             m, n = size(A)
             m == n || throw(DimensionMismatch("Matrix A is $m by $n but must be square"))
             length(x) == n || throw(DimensionMismatch("Length of vector must be the same as the matrix dimensions"))
             incx = stride(x,1)
             lda = max(1,stride(A,2))
-            wait!((x,A))
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, n, Ref(alpha), x, incx, A, lda) |> check
-            mark!((x,A), stream)
             A
         end
     end
@@ -702,11 +599,10 @@ end
 for (fname, elty) in ((:rocblas_zher2,:ComplexF64),
                       (:rocblas_cher2,:ComplexF32))
     @eval begin
-        function her2!(uplo::Char,
-                      alpha::$elty,
-                      x::ROCVector{$elty},
-                      y::ROCVector{$elty},
-                      A::ROCMatrix{$elty})
+        function her2!(
+            uplo::Char, alpha::$elty, x::ROCVector{$elty},
+            y::ROCVector{$elty}, A::ROCMatrix{$elty},
+        )
             rocuplo = rocblasfill(uplo)
             m, n = size(A)
             m == n || throw(DimensionMismatch("Matrix A is $m by $n but must be square"))
@@ -715,10 +611,8 @@ for (fname, elty) in ((:rocblas_zher2,:ComplexF64),
             incx = stride(x,1)
             incy = stride(y,1)
             lda = max(1,stride(A,2))
-            wait!((x,y,A))
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, n, Ref(alpha), x, incx, y, incy, A, lda) |> check
-            mark!((x,y,A), stream)
             A
         end
     end
@@ -748,27 +642,21 @@ for (fname, elty) in
             lda = max(1, stride(A, 2))
             ldb = max(1, stride(B, 2))
             ldc = max(1, stride(C, 2))
-            wait!((A, B, C))
             (; handle, stream) = lib_state()
             $(fname)(
                 handle, rocblasop(transA), rocblasop(transB),
                 m, n, k, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check
-            mark!((A, B, C), stream)
             C
         end
-        function gemm(transA::Char,
-                      transB::Char,
-                      alpha::($elty),
-                      A::ROCMatrix{$elty},
-                      B::ROCMatrix{$elty})
+        function gemm(
+            transA::Char, transB::Char, alpha::($elty),
+            A::ROCMatrix{$elty}, B::ROCMatrix{$elty},
+        )
             gemm!(transA, transB, alpha, A, B, zero($elty),
                   similar(B, $elty, (size(A, transA == 'N' ? 1 : 2),
                                      size(B, transB == 'N' ? 2 : 1))))
         end
-        function gemm(transA::Char,
-                      transB::Char,
-                      A::ROCMatrix{$elty},
-                      B::ROCMatrix{$elty})
+        function gemm(transA::Char, transB::Char, A::ROCMatrix{$elty}, B::ROCMatrix{$elty})
             gemm(transA, transB, one($elty), A, B)
         end
     end
@@ -866,7 +754,6 @@ for (fname, elty) in
         )
             m, k, n, lda, ldb, ldc = _check_gemm_batched_dims(
                 transA, transB, A, B, C)
-            wait!((A, B, C))
 
             batch_count = size(C, 3)
             a_broadcast = (size(A, 3) == 1) && (batch_count > 1)
@@ -880,7 +767,6 @@ for (fname, elty) in
                 handle, rocblasop(transA), rocblasop(transB),
                 m, n, k, Ref(alpha), Ab, lda, Bb, ldb, Ref(beta),
                 Cb, ldc, batch_count) |> check
-            mark!((A, B, C), stream)
             C
         end
         function gemm_batched(
@@ -920,17 +806,14 @@ for (fname, elty) in
          (:rocblas_zgemmStridedBatched,:ComplexF64),
          (:rocblas_cgemmStridedBatched,:ComplexF32))
     @eval begin
-        function gemm_strided_batched!(transA::Char,
-                               transB::Char,
-                               alpha::($elty),
-                               A::ROCArray{$elty, 3},
-                               B::ROCArray{$elty, 3},
-                               beta::($elty),
-                               C::ROCArray{$elty, 3})
+        function gemm_strided_batched!(
+            transA::Char, transB::Char, alpha::($elty),
+            A::ROCArray{$elty, 3}, B::ROCArray{$elty, 3},
+            beta::($elty), C::ROCArray{$elty, 3},
+        )
            m = size(A, transA == 'N' ? 1 : 2)
            k = size(A, transA == 'N' ? 2 : 1)
            n = size(B, transB == 'N' ? 2 : 1)
-
            @assert size(A, 3) == size(B, 3) == size(C, 3) "Batch size mismatch"
 
            if m != size(C,1) || n != size(C,2) || k != size(B, transB == 'N' ? 1 : 2)
@@ -946,24 +829,20 @@ for (fname, elty) in
            strideB = stride(B, 3)
            strideC = stride(C, 3)
            batchCount = size(A, 3)
-           wait!((A,B,C))
            (; handle, stream) = lib_state()
            $(fname)(handle, roctransA, roctransB, m, n, k, Ref(alpha), A, lda, strideA, B, ldb, strideB, Ref(beta), C, ldc, strideC, batchCount) |> check
-           mark!((A,B,C), stream)
            C
         end
-        function gemm_strided_batched(transA::Char,
-                      transB::Char,
-                      alpha::($elty),
-                      A::ROCArray{$elty, 3},
-                      B::ROCArray{$elty, 3})
+        function gemm_strided_batched(
+            transA::Char, transB::Char, alpha::($elty),
+            A::ROCArray{$elty, 3}, B::ROCArray{$elty, 3},
+        )
             C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), size(A, 3)))
-            gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C )
+            gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C)
         end
-        function gemm_strided_batched(transA::Char,
-                      transB::Char,
-                      A::ROCArray{$elty, 3},
-                      B::ROCArray{$elty, 3})
+        function gemm_strided_batched(
+            transA::Char, transB::Char, A::ROCArray{$elty, 3}, B::ROCArray{$elty, 3},
+        )
             gemm_strided_batched(transA, transB, one($elty), A, B)
         end
     end
@@ -976,13 +855,11 @@ for (fname, elty) in ((:rocblas_dsymm,:Float64),
                       (:rocblas_csymm,:ComplexF32))
     # TODO: fix julia dimension checks in symm!
     @eval begin
-        function symm!(side::Char,
-                       uplo::Char,
-                       alpha::($elty),
-                       A::ROCMatrix{$elty},
-                       B::ROCMatrix{$elty},
-                       beta::($elty),
-                       C::ROCMatrix{$elty})
+        function symm!(
+            side::Char, uplo::Char, alpha::($elty),
+            A::ROCMatrix{$elty}, B::ROCMatrix{$elty},
+            beta::($elty), C::ROCMatrix{$elty},
+        )
             rocside = rocblasside(side)
             rocuplo = rocblasfill(uplo)
             k, nA = size(A)
@@ -995,23 +872,17 @@ for (fname, elty) in ((:rocblas_dsymm,:Float64),
             lda = max(1,stride(A,2))
             ldb = max(1,stride(B,2))
             ldc = max(1,stride(C,2))
-            wait!((A,B,C))
             (; handle, stream) = lib_state()
             $(fname)(handle, rocside, rocuplo, m, n, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check
-            mark!((A,B,C), stream)
             C
         end
-        function symm(side::Char,
-                      uplo::Char,
-                      alpha::($elty),
-                      A::ROCMatrix{$elty},
-                      B::ROCMatrix{$elty})
+        function symm(
+            side::Char, uplo::Char, alpha::($elty),
+            A::ROCMatrix{$elty}, B::ROCMatrix{$elty},
+        )
             symm!(side, uplo, alpha, A, B, zero($elty), similar(B))
         end
-        function symm(side::Char,
-                      uplo::Char,
-                      A::ROCMatrix{$elty},
-                      B::ROCMatrix{$elty})
+        function symm(side::Char, uplo::Char, A::ROCMatrix{$elty}, B::ROCMatrix{$elty})
             symm(side, uplo, one($elty), A, B)
         end
     end
@@ -1022,113 +893,93 @@ for (fname, elty) in ((:rocblas_dsyrk,:Float64),
                       (:rocblas_ssyrk,:Float32),
                       (:rocblas_zsyrk,:ComplexF64),
                       (:rocblas_csyrk,:ComplexF32))
-   @eval begin
-       function syrk!(uplo::Char,
-                      trans::Char,
-                      alpha::($elty),
-                      A::ROCVecOrMat{$elty},
-                      beta::($elty),
-                      C::ROCMatrix{$elty})
-           rocuplo = rocblasfill(uplo)
-           roctrans = rocblasop(trans)
-           mC, n = size(C)
-           if mC != n throw(DimensionMismatch("C must be square")) end
-           nn = size(A, trans == 'N' ? 1 : 2)
-           if nn != n throw(DimensionMismatch("syrk!")) end
-           k  = size(A, trans == 'N' ? 2 : 1)
-           lda = max(1,stride(A,2))
-           ldc = max(1,stride(C,2))
-           wait!((A,C))
-           (; handle, stream) = lib_state()
-           $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, Ref(beta), C, ldc) |> check
-           mark!((A,C), stream)
-           C
+    @eval begin
+        function syrk!(
+            uplo::Char, trans::Char, alpha::($elty),
+            A::ROCVecOrMat{$elty}, beta::($elty), C::ROCMatrix{$elty},
+        )
+            rocuplo = rocblasfill(uplo)
+            roctrans = rocblasop(trans)
+            mC, n = size(C)
+            if mC != n throw(DimensionMismatch("C must be square")) end
+            nn = size(A, trans == 'N' ? 1 : 2)
+            if nn != n throw(DimensionMismatch("syrk!")) end
+            k  = size(A, trans == 'N' ? 2 : 1)
+            lda = max(1,stride(A,2))
+            ldc = max(1,stride(C,2))
+            (; handle, stream) = lib_state()
+            $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, Ref(beta), C, ldc) |> check
+            C
         end
     end
 end
-function syrk(uplo::Char,
-              trans::Char,
-              alpha::Number,
-              A::ROCVecOrMat)
+function syrk(uplo::Char, trans::Char, alpha::Number, A::ROCVecOrMat)
     T = eltype(A)
     n = size(A, trans == 'N' ? 1 : 2)
     syrk!(uplo, trans, convert(T,alpha), A, zero(T), similar(A, T, (n, n)))
 end
-syrk(uplo::Char, trans::Char, A::ROCVecOrMat) = syrk(uplo, trans,
-                                                              one(eltype(A)),
-                                                              A)
+syrk(uplo::Char, trans::Char, A::ROCVecOrMat) = syrk(uplo, trans, one(eltype(A)), A)
 
 ## hemm
 for (fname, elty) in ((:rocblas_zhemm,:ComplexF64),
                       (:rocblas_chemm,:ComplexF32))
-   @eval begin
-       function hemm!(side::Char,
-                      uplo::Char,
-                      alpha::($elty),
-                      A::ROCMatrix{$elty},
-                      B::ROCMatrix{$elty},
-                      beta::($elty),
-                      C::ROCMatrix{$elty})
-           rocside = rocblasside(side)
-           rocuplo = rocblasfill(uplo)
-           mA, nA = size(A)
-           m, n = size(B)
-           mC, nC = size(C)
-           if mA != nA throw(DimensionMismatch("A must be square")) end
-           if ((m != mC) || (n != nC)) throw(DimensionMismatch("B and C must have same dimensions")) end
-           if ((side == 'L') && (mA != m)) throw(DimensionMismatch("")) end
-           if ((side == 'R') && (mA != n)) throw(DimensionMismatch("")) end
-           lda = max(1,stride(A,2))
-           ldb = max(1,stride(B,2))
-           ldc = max(1,stride(C,2))
-           wait!((A,B,C))
-           (; handle, stream) = lib_state()
-           $(fname)(handle, rocside, rocuplo, m, n, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check
-           mark!((A,B,C), stream)
-           C
-       end
-       function hemm(uplo::Char,
-                     trans::Char,
-                     alpha::($elty),
-                     A::ROCMatrix{$elty},
-                     B::ROCMatrix{$elty})
-           m,n = size(B)
-           hemm!( uplo, trans, alpha, A, B, zero($elty), similar(B, $elty, (m,n) ) )
-       end
-       hemm( uplo::Char, trans::Char, A::ROCMatrix{$elty}, B::ROCMatrix{$elty}) = hemm( uplo, trans, one($elty), A, B)
+    @eval begin
+        function hemm!(
+            side::Char, uplo::Char, alpha::($elty), A::ROCMatrix{$elty},
+            B::ROCMatrix{$elty}, beta::($elty), C::ROCMatrix{$elty},
+        )
+            rocside = rocblasside(side)
+            rocuplo = rocblasfill(uplo)
+            mA, nA = size(A)
+            m, n = size(B)
+            mC, nC = size(C)
+            if mA != nA throw(DimensionMismatch("A must be square")) end
+            if ((m != mC) || (n != nC)) throw(DimensionMismatch("B and C must have same dimensions")) end
+            if ((side == 'L') && (mA != m)) throw(DimensionMismatch("")) end
+            if ((side == 'R') && (mA != n)) throw(DimensionMismatch("")) end
+            lda = max(1,stride(A,2))
+            ldb = max(1,stride(B,2))
+            ldc = max(1,stride(C,2))
+            (; handle, stream) = lib_state()
+            $(fname)(handle, rocside, rocuplo, m, n, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check
+            C
+        end
+        function hemm(uplo::Char, trans::Char, alpha::($elty), A::ROCMatrix{$elty}, B::ROCMatrix{$elty})
+            m,n = size(B)
+            hemm!( uplo, trans, alpha, A, B, zero($elty), similar(B, $elty, (m,n) ) )
+        end
+        hemm( uplo::Char, trans::Char, A::ROCMatrix{$elty}, B::ROCMatrix{$elty}) =
+            hemm( uplo, trans, one($elty), A, B)
     end
 end
 
 ## herk
 for (fname, elty) in ((:rocblas_zherk,:ComplexF64),
                       (:rocblas_cherk,:ComplexF32))
-   @eval begin
-       function herk!(uplo::Char,
-                      trans::Char,
-                      alpha::($elty),
-                      A::ROCVecOrMat{$elty},
-                      beta::($elty),
-                      C::ROCMatrix{$elty})
-           rocuplo = rocblasfill(uplo)
-           roctrans = rocblasop(trans)
-           mC, n = size(C)
-           if mC != n throw(DimensionMismatch("C must be square")) end
-           nn = size(A, trans == 'N' ? 1 : 2)
-           if nn != n throw(DimensionMismatch("syrk!")) end
-           k  = size(A, trans == 'N' ? 2 : 1)
-           lda = max(1,stride(A,2))
-           ldc = max(1,stride(C,2))
-           wait!((A,C))
-           (; handle, stream) = lib_state()
-           $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, Ref(beta), C, ldc) |> check
-           mark!((A,C), stream)
-           C
-       end
-       function herk(uplo::Char, trans::Char, alpha::($elty), A::ROCVecOrMat{$elty})
-           n = size(A, trans == 'N' ? 1 : 2)
-           herk!(uplo, trans, alpha, A, zero($elty), similar(A, $elty, (n,n)))
-       end
-       herk(uplo::Char, trans::Char, A::ROCVecOrMat{$elty}) = herk(uplo, trans, one($elty), A)
+    @eval begin
+        function herk!(
+            uplo::Char, trans::Char, alpha::($elty), A::ROCVecOrMat{$elty},
+            beta::($elty), C::ROCMatrix{$elty},
+        )
+            rocuplo = rocblasfill(uplo)
+            roctrans = rocblasop(trans)
+            mC, n = size(C)
+            if mC != n throw(DimensionMismatch("C must be square")) end
+            nn = size(A, trans == 'N' ? 1 : 2)
+            if nn != n throw(DimensionMismatch("syrk!")) end
+            k  = size(A, trans == 'N' ? 2 : 1)
+            lda = max(1,stride(A,2))
+            ldc = max(1,stride(C,2))
+            (; handle, stream) = lib_state()
+            $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, Ref(beta), C, ldc) |> check
+            C
+        end
+        function herk(uplo::Char, trans::Char, alpha::($elty), A::ROCVecOrMat{$elty})
+            n = size(A, trans == 'N' ? 1 : 2)
+            herk!(uplo, trans, alpha, A, zero($elty), similar(A, $elty, (n,n)))
+        end
+        herk(uplo::Char, trans::Char, A::ROCVecOrMat{$elty}) =
+            herk(uplo, trans, one($elty), A)
    end
 end
 
@@ -1138,13 +989,10 @@ for (fname, elty) in ((:rocblas_dsyr2k,:Float64),
                       (:rocblas_zsyr2k,:ComplexF64),
                       (:rocblas_csyr2k,:ComplexF32))
     @eval begin
-        function syr2k!(uplo::Char,
-                        trans::Char,
-                        alpha::($elty),
-                        A::ROCVecOrMat{$elty},
-                        B::ROCVecOrMat{$elty},
-                        beta::($elty),
-                        C::ROCMatrix{$elty})
+        function syr2k!(
+            uplo::Char, trans::Char, alpha::($elty), A::ROCVecOrMat{$elty},
+            B::ROCVecOrMat{$elty}, beta::($elty), C::ROCMatrix{$elty},
+        )
             # TODO: check size of B in julia (syr2k!)
             rocuplo = rocblasfill(uplo)
             roctrans = rocblasop(trans)
@@ -1160,36 +1008,28 @@ for (fname, elty) in ((:rocblas_dsyr2k,:Float64),
             lda = max(1,stride(A,2))
             ldb = max(1,stride(B,2))
             ldc = max(1,stride(C,2))
-            wait!((A,B,C))
             (; handle, stream) = lib_state()
             $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check
-            mark!((A,B,C), stream)
             C
         end
     end
 end
-function syr2k(uplo::Char,
-               trans::Char,
-               alpha::Number,
-               A::ROCVecOrMat,
-               B::ROCVecOrMat)
+function syr2k(uplo::Char, trans::Char, alpha::Number, A::ROCVecOrMat, B::ROCVecOrMat)
     T = eltype(A)
     n = size(A, trans == 'N' ? 1 : 2)
     syr2k!(uplo, trans, convert(T,alpha), A, B, zero(T), similar(A, T, (n, n)))
 end
-syr2k(uplo::Char, trans::Char, A::ROCVecOrMat, B::ROCVecOrMat) = syr2k(uplo, trans, one(eltype(A)), A, B)
+syr2k(uplo::Char, trans::Char, A::ROCVecOrMat, B::ROCVecOrMat) =
+    syr2k(uplo, trans, one(eltype(A)), A, B)
 
 ## her2k
 for (fname, elty1, elty2) in ((:rocblas_zher2k,:ComplexF64,:Float64),
                               (:rocblas_cher2k,:ComplexF32,:Float32))
-   @eval begin
-       function her2k!(uplo::Char,
-                       trans::Char,
-                       alpha::($elty1),
-                       A::ROCVecOrMat{$elty1},
-                       B::ROCVecOrMat{$elty1},
-                       beta::($elty2),
-                       C::ROCMatrix{$elty1})
+    @eval begin
+        function her2k!(
+            uplo::Char, trans::Char, alpha::($elty1), A::ROCVecOrMat{$elty1},
+            B::ROCVecOrMat{$elty1}, beta::($elty2), C::ROCMatrix{$elty1},
+        )
            # TODO: check size of B in julia (her2k!)
            rocuplo = rocblasfill(uplo)
            roctrans = rocblasop(trans)
@@ -1206,24 +1046,19 @@ for (fname, elty1, elty2) in ((:rocblas_zher2k,:ComplexF64,:Float64),
            lda = max(1,stride(A,2))
            ldb = max(1,stride(B,2))
            ldc = max(1,stride(C,2))
-           wait!((A,B,C))
            (; handle, stream) = lib_state()
            $(fname)(handle, rocuplo, roctrans, n, k, Ref(alpha), A, lda, B, ldb, Ref(beta), C, ldc) |> check
-           mark!((A,B,C), stream)
            C
-       end
-       function her2k(uplo::Char,
-                      trans::Char,
-                      alpha::($elty1),
-                      A::ROCVecOrMat{$elty1},
-                      B::ROCVecOrMat{$elty1})
+        end
+        function her2k(
+            uplo::Char, trans::Char, alpha::($elty1),
+            A::ROCVecOrMat{$elty1}, B::ROCVecOrMat{$elty1},
+        )
            n = size(A, trans == 'N' ? 1 : 2)
            her2k!(uplo, trans, alpha, A, B, zero($elty2), similar(A, $elty1, (n,n)))
-       end
-       her2k(uplo::Char,
-             trans::Char,
-             A::ROCVecOrMat{$elty1},
-             B::ROCVecOrMat{$elty1}) = her2k(uplo, trans, one($elty1), A, B)
+        end
+        her2k(uplo::Char, trans::Char, A::ROCVecOrMat{$elty1}, B::ROCVecOrMat{$elty1}) =
+            her2k(uplo, trans, one($elty1), A, B)
    end
 end
 
@@ -1249,12 +1084,10 @@ for (mmname, smname, elty) in
             if nA != (side == 'L' ? m : n) throw(DimensionMismatch("trmm!")) end
             lda = max(1,stride(A,2))
             ldb = max(1,stride(B,2))
-            wait!((A,B))
             (; handle, stream) = lib_state()
             $(mmname)(
                 handle, rocside, rocuplo, roctransa, rocdiag, m, n, Ref(alpha),
                 A, lda, B, ldb) |> check
-            mark!((A,B), stream)
             B
         end
         function trmm(
@@ -1278,10 +1111,8 @@ for (mmname, smname, elty) in
             if nA != (side == 'L' ? m : n) throw(DimensionMismatch("trsm!")) end
             lda = max(1,stride(A,2))
             ldb = max(1,stride(B,2))
-            wait!((A,B))
             (; handle, stream) = lib_state()
             $(smname)(handle, rocside, rocuplo, roctransa, rocdiag, m, n, Ref(alpha), A, lda, B, ldb) |> check
-            mark!((A,B), stream)
             B
         end
         function trsm(
@@ -1300,13 +1131,10 @@ for (fname, elty) in
          (:rocblas_ztrsmBatched,:ComplexF64),
          (:rocblas_ctrsmBatched,:ComplexF32))
     @eval begin
-        function trsm_batched!(side::Char,
-                               uplo::Char,
-                               transa::Char,
-                               diag::Char,
-                               alpha::($elty),
-                               A::Array{ROCMatrix{$elty},1},
-                               B::Array{ROCMatrix{$elty},1})
+        function trsm_batched!(
+            side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty),
+            A::Array{ROCMatrix{$elty},1}, B::Array{ROCMatrix{$elty},1},
+        )
             rocside = rocblasside(side)
             rocuplo = rocblasfill(uplo)
             roctransa = rocblasop(transa)
@@ -1325,19 +1153,14 @@ for (fname, elty) in
             ldb = max(1,stride(B[1],2))
             Aptrs = device_batch(A)
             Bptrs = device_batch(B)
-            wait!((A,B))
             (; handle, stream) = lib_state()
             $(fname)(handle, rocside, rocuplo, roctransa, rocdiag, m, n, Ref(alpha), Aptrs, lda, Bptrs, ldb, length(A)) |> check
-            mark!((A,B), stream)
             B
         end
-        function trsm_batched(side::Char,
-                              uplo::Char,
-                              transa::Char,
-                              diag::Char,
-                              alpha::($elty),
-                              A::Array{ROCMatrix{$elty},1},
-                              B::Array{ROCMatrix{$elty},1})
+        function trsm_batched(
+            side::Char, uplo::Char, transa::Char, diag::Char, alpha::($elty),
+            A::Array{ROCMatrix{$elty},1}, B::Array{ROCMatrix{$elty},1},
+        )
             trsm_batched!(side, uplo, transa, diag, alpha, A, copy(B) )
         end
     end
@@ -1352,14 +1175,11 @@ for (fname, elty) in ((:rocblas_dgeam,:Float64),
                       (:rocblas_sgeam,:Float32),
                       (:rocblas_zgeam,:ComplexF64),
                       (:rocblas_cgeam,:ComplexF32))
-   @eval begin
-       function geam!(transa::Char,
-                      transb::Char,
-                      alpha::($elty),
-                      A::ROCMatrix{$elty},
-                      beta::($elty),
-                      B::ROCMatrix{$elty},
-                      C::ROCMatrix{$elty})
+    @eval begin
+        function geam!(
+            transa::Char, transb::Char, alpha::($elty), A::ROCMatrix{$elty},
+            beta::($elty), B::ROCMatrix{$elty}, C::ROCMatrix{$elty},
+        )
            roctransa = rocblasop(transa)
            roctransb = rocblasop(transb)
            mA, nA = size(A)
@@ -1372,18 +1192,14 @@ for (fname, elty) in ((:rocblas_dgeam,:Float64),
            lda = max(1,stride(A,2))
            ldb = max(1,stride(B,2))
            ldc = max(1,stride(C,2))
-           wait!((A,B,C))
            (; handle, stream) = lib_state()
            $(fname)(handle, roctransa, roctransb, m, n, Ref(alpha), A, lda, Ref(beta), B, ldb, C, ldc) |> check
-           mark!((A,B,C), stream)
            C
-       end
-       function geam(transa::Char,
-                     transb::Char,
-                     alpha::($elty),
-                     A::ROCMatrix{$elty},
-                     beta::($elty),
-                     B::ROCMatrix{$elty})
+        end
+        function geam(
+            transa::Char, transb::Char, alpha::($elty), A::ROCMatrix{$elty},
+            beta::($elty), B::ROCMatrix{$elty},
+        )
            m,n = size(B)
            if ((transb == 'T' || transb == 'C'))
                geam!( transa, transb, alpha, A, beta, B, similar(B, $elty, (n,m) ) )
@@ -1404,8 +1220,7 @@ for (fname, elty) in
          (:rocblas_zgetrfBatched,:ComplexF64),
          (:rocblas_cgetrfBatched,:ComplexF32))
     @eval begin
-        function getrf_batched!(A::Array{ROCMatrix{$elty},1},
-                                Pivot::Bool)
+        function getrf_batched!(A::Array{ROCMatrix{$elty},1}, Pivot::Bool)
             for As in A
                 m,n = size(As)
                 if m != n
@@ -1417,18 +1232,14 @@ for (fname, elty) in
             Aptrs = device_batch(A)
             info  = ROCArray{Cint}(undef, length(A))
             pivotArray  = Pivot ? ROCArray{Int32}(undef, (n, length(A))) : C_NULL
-            wait!(A)
             (; handle, stream) = lib_state()
             $(fname)(handle, n, Aptrs, lda, pivotArray, info, length(A)) |> check
             if( !Pivot )
                 pivotArray = ROCArray(zeros(Cint, (n, length(A))))
             end
-            mark!((A, info), stream)
-            pivotArray != C_NULL && mark!(pivotArray, stream)
             pivotArray, info, A
         end
-        function getrf_batched(A::Array{ROCMatrix{$elty},1},
-                               Pivot::Bool)
+        function getrf_batched(A::Array{ROCMatrix{$elty},1}, Pivot::Bool)
             newA = copy(A)
             pivotarray, info = getrf_batched!(newA, Pivot)
             pivotarray, info, newA
@@ -1444,8 +1255,7 @@ for (fname, elty) in
          (:rocblas_zgetriBatched,:ComplexF64),
          (:rocblas_cgetriBatched,:ComplexF32))
     @eval begin
-        function getri_batched(A::Array{ROCMatrix{$elty},1},
-                               pivotArray::ROCMatrix{Cint})
+        function getri_batched(A::Array{ROCMatrix{$elty},1}, pivotArray::ROCMatrix{Cint})
             for As in A
                 m,n = size(As)
                 if m != n
@@ -1459,11 +1269,8 @@ for (fname, elty) in
             Aptrs = device_batch(A)
             Cptrs = device_batch(C)
             info = ROCArray(zeros(Cint,length(A)))
-            wait!(A)
-            wait!(pivotArray)
             (; handle, stream) = lib_state()
             $(fname)(handle, n, Aptrs, lda, pivotArray, Cptrs, ldc, info, length(A)) |> check
-            mark!((A, pivotArray, info, C), stream)
             pivotArray, info, C
         end
     end
@@ -1494,10 +1301,8 @@ for (fname, elty) in
             Aptrs = device_batch(A)
             Cptrs = device_batch(C)
             info = ROCArray(zeros(Cint,length(A)))
-            wait!(A)
             (; handle, stream) = lib_state()
             $(fname)(handle, n, Aptrs, lda, Cptrs, ldc, info, length(A)) |> check
-            mark!((A, info, C), stream)
             info, C
         end
     end
@@ -1522,13 +1327,11 @@ for (fname, elty) in
             end
             Tauptrs = device_batch(TauArray)
             info    = zero(Cint)
-            wait!(A)
             (; handle, stream) = lib_state()
             $(fname)(handle, m, n, Aptrs, lda, Tauptrs, Ref(info), length(A)) |> check
             if( info != 0 )
                 throw(ArgumentError,string("Invalid value at ",-info))
             end
-            mark!((A, TauArray), stream)
             TauArray, A
         end
         function geqrf_batched(A::Array{ROCMatrix{$elty},1})
@@ -1545,9 +1348,9 @@ for (fname, elty) in
          (:rocblas_zgelsBatched,:ComplexF64),
          (:rocblas_cgelsBatched,:ComplexF32))
     @eval begin
-        function gels_batched!(trans::Char,
-                              A::Array{ROCMatrix{$elty},1},
-                              C::Array{ROCMatrix{$elty},1})
+        function gels_batched!(
+            trans::Char, A::Array{ROCMatrix{$elty},1}, C::Array{ROCMatrix{$elty},1},
+        )
             roctrans = rocblasop(trans)
             if( length(A) != length(C) )
                 throw(DimensionMismatch(""))
@@ -1570,21 +1373,15 @@ for (fname, elty) in
             Cptrs = device_batch(C)
             info  = zero(Cint)
             infoarray = ROCArray(zeros(Cint, length(A)))
-            wait!(A)
-            wait!(C)
             (; handle, stream) = lib_state()
             $(fname)(handle, roctrans, m, n, nrhs, Aptrs, lda, Cptrs, ldc, Ref(info), infoarray, length(A)) |> check
             if( info != 0 )
                 throw(ArgumentError,string("Invalid value at ",-info))
             end
-            mark!((A, C, infoarray), stream)
             A, C, infoarray
         end
-        function gels_batched(trans::Char,
-                             A::Array{ROCMatrix{$elty},1},
-                             C::Array{ROCMatrix{$elty},1})
+        gels_batched(trans::Char, A::Array{ROCMatrix{$elty},1}, C::Array{ROCMatrix{$elty},1}) =
             gels_batched!(trans, copy(A), copy(C))
-        end
     end
 end
 
@@ -1594,10 +1391,9 @@ for (fname, elty) in ((:rocblas_ddgmm,:Float64),
                       (:rocblas_zdgmm,:ComplexF64),
                       (:rocblas_cdgmm,:ComplexF32))
    @eval begin
-       function dgmm!(mode::Char,
-                      A::ROCMatrix{$elty},
-                      X::ROCVector{$elty},
-                      C::ROCMatrix{$elty})
+       function dgmm!(
+           mode::Char, A::ROCMatrix{$elty}, X::ROCVector{$elty}, C::ROCMatrix{$elty},
+        )
            rocside = rocblasside(mode)
            m, n = size(C)
            mA, nA = size(A)
@@ -1608,15 +1404,11 @@ for (fname, elty) in ((:rocblas_ddgmm,:Float64),
            lda = max(1,stride(A,2))
            incx = stride(X,1)
            ldc = max(1,stride(C,2))
-           wait!((A,X,C))
            (; handle, stream) = lib_state()
            $(fname)(handle, rocside, m, n, A, lda, X, incx, C, ldc) |> check
-           mark!((A,X,C), stream)
            C
        end
-       function dgmm(mode::Char,
-                     A::ROCMatrix{$elty},
-                     X::ROCVector{$elty})
+       function dgmm(mode::Char, A::ROCMatrix{$elty}, X::ROCVector{$elty})
            m,n = size(A)
            dgmm!( mode, A, X, similar(A, $elty, (m,n) ) )
        end
diff --git a/src/cache.jl b/src/cache.jl
index 82a546aca..beda5fc83 100644
--- a/src/cache.jl
+++ b/src/cache.jl
@@ -115,7 +115,7 @@ function library_state(
     state = get!(() -> new_state(tls), states, tls.context)
 
     @noinline function update_stream(tls, state)
-        set_stream(new_handle, tls.stream)
+        set_stream(state.handle, tls.stream)
         return (; state.handle, tls.stream)
     end
     if state.stream != tls.stream
diff --git a/src/compiler/codegen.jl b/src/compiler/codegen.jl
index d4751a3d1..bcfb1241e 100644
--- a/src/compiler/codegen.jl
+++ b/src/compiler/codegen.jl
@@ -1,257 +1,169 @@
-import .Device: ExceptionEntry, HostCall
+struct HIPCompilerParams <: AbstractCompilerParams end
 
-## GPUCompiler interface
+const HIPCompilerConfig = CompilerConfig{GCNCompilerTarget, HIPCompilerParams}
+const HIPCompilerJob = CompilerJob{GCNCompilerTarget, HIPCompilerParams}
 
-struct ROCCompilerParams <: AbstractCompilerParams
-    device::ROCDevice
-    global_hooks::NamedTuple
-end
-
-const ROCCompilerConfig = CompilerConfig{GCNCompilerTarget, ROCCompilerParams}
-const ROCCompilerJob = CompilerJob{GCNCompilerTarget, ROCCompilerParams}
-
-# Caches for GPUCompiler.
-const _compiler_cache = Dict{ROCDevice, Dict{UInt, Any}}()
-const _compiler_configs = Dict{UInt, ROCCompilerConfig}()
-const _kernel_instances = Dict{UInt, Any}() # HostKernel
+const _hip_compiler_cache = Dict{HIP.HIPDevice, Dict{Any, HIP.HIPFunction}}()
 
-function compiler_cache(dev::ROCDevice)
-    get!(() -> Dict{UInt, Any}(), _compiler_cache, dev)
-end
-
-function compiler_config(dev::ROCDevice; kwargs...)
-    h = hash(dev, hash(kwargs))
-    get!(() -> _compiler_config(dev; kwargs...), _compiler_configs, h)
-end
-function _compiler_config(
-    dev::ROCDevice; global_hooks, kernel::Bool = true, name=nothing,
-    always_inline=true, kwargs...,
-)
-    isa = AMDGPU.default_isa(dev)
-    dev_isa, features = Runtime.llvm_arch_features(isa)
+# hash(fun, hash(f, hash(tt))) => HIPKernel
+const _kernel_instances = Dict{UInt, Runtime.HIPKernel}()
 
-    target = GCNCompilerTarget(; dev_isa, features)
-    params = ROCCompilerParams(dev, global_hooks)
-    CompilerConfig(target, params; kernel, name, always_inline)
+function compiler_cache(dev::HIP.HIPDevice)
+    get!(() -> Dict{UInt, Any}(), _hip_compiler_cache, dev)
 end
 
-GPUCompiler.runtime_module(@nospecialize(::ROCCompilerJob)) = AMDGPU
+GPUCompiler.runtime_module(@nospecialize(::HIPCompilerJob)) = AMDGPU
 
-GPUCompiler.ci_cache(@nospecialize(::ROCCompilerJob)) = AMDGPU.ci_cache
+GPUCompiler.ci_cache(@nospecialize(::HIPCompilerJob)) = AMDGPU.ci_cache
 
-GPUCompiler.method_table(@nospecialize(::ROCCompilerJob)) = AMDGPU.method_table
+GPUCompiler.method_table(@nospecialize(::HIPCompilerJob)) = AMDGPU.method_table
 
-# filter out functions from device libs
-GPUCompiler.isintrinsic(@nospecialize(job::ROCCompilerJob), fn::String) =
-    invoke(GPUCompiler.isintrinsic,
-           Tuple{CompilerJob{GCNCompilerTarget}, typeof(fn)},
-           job, fn) ||
-    startswith(fn, "rocm")
-
-function GPUCompiler.process_module!(@nospecialize(job::ROCCompilerJob), mod::LLVM.Module)
-    invoke(GPUCompiler.process_module!,
-           Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod)},
-           job, mod)
-    # Run this early (before optimization) to ensure we link OCKL
-    emit_exception_user!(mod)
-end
-function GPUCompiler.process_entry!(@nospecialize(job::ROCCompilerJob), mod::LLVM.Module, entry::LLVM.Function)
-    invoke(GPUCompiler.process_entry!,
-           Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod), typeof(entry)},
-           job, mod, entry)
-    # Workaround for the lack of zeroinitializer support for LDS
-    zeroinit_lds!(mod, entry)
-end
-function GPUCompiler.finish_module!(@nospecialize(job::ROCCompilerJob), mod::LLVM.Module)
-    invoke(GPUCompiler.finish_module!,
-           Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod)},
-           job, mod)
-    delete_exception_user!(mod)
-end
+GPUCompiler.kernel_state_type(@nospecialize(::HIPCompilerJob)) = AMDGPU.KernelState
 
 function GPUCompiler.link_libraries!(
-    @nospecialize(job::ROCCompilerJob), mod::LLVM.Module,
+    @nospecialize(job::HIPCompilerJob), mod::LLVM.Module,
     undefined_fns::Vector{String},
 )
+    # @show undefined_fns
     invoke(GPUCompiler.link_libraries!,
-           Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod), typeof(undefined_fns)},
-           job, mod, undefined_fns)
+        Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod), typeof(undefined_fns)},
+        job, mod, undefined_fns)
+    link_device_libs!(job.config.target, mod)
+end
+
+function GPUCompiler.finish_ir!(
+    @nospecialize(job::HIPCompilerJob), mod::LLVM.Module, entry::LLVM.Function,
+)
+    # @show collect(GPUCompiler.decls(mod))
+    # TODO fixx
     link_device_libs!(job.config.target, mod)
+    return entry
 end
 
-const rocfunction_lock = ReentrantLock()
+function GPUCompiler.finish_module!(
+    @nospecialize(job::HIPCompilerJob), mod::LLVM.Module, entry::LLVM.Function,
+)
+    entry = invoke(GPUCompiler.finish_module!,
+        Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod), typeof(entry)},
+        job, mod, entry)
+
+    # Workaround for the lack of zeroinitializer support for LDS.
+    zeroinit_lds!(mod, entry)
+
+    # Force-inline exception-related functions.
+    # LLVM gets confused when not all functions are inlined,
+    # causing huge scratch memory usage.
+    # And GPUCompiler fails to inline all functions without forcing
+    # always-inline attributes on them. Add them here.
+    target_fns = (
+        "signal_exception", "report_exception", "malloc", "__throw_")
+    inline_attr = EnumAttribute("alwaysinline")
+    for fn in LLVM.functions(mod)
+        any(occursin.(target_fns, LLVM.name(fn))) || continue
+        attrs = LLVM.function_attributes(fn)
+        inline_attr ∈ collect(attrs) || push!(attrs, inline_attr)
+    end
+
+    return entry
+end
 
-"""
-    rocfunction(f, tt=Tuple{}; kwargs...)
+function compiler_config(
+    dev::HIP.HIPDevice; kernel::Bool = true,
+    name::Union{String, Nothing} = nothing, always_inline::Bool = true,
+)
+    hsa_isa = AMDGPU.default_isa(dev)
+    dev_isa, features = hsa_isa.arch_features
 
-Low-level interface to compile a function invocation for the currently-active
-GPU, returning a callable kernel object. For a higher-level interface, use
-[`@roc`](@ref).
+    target = GCNCompilerTarget(; dev_isa, features)
+    params = HIPCompilerParams()
+    CompilerConfig(target, params; kernel, name, always_inline)
+end
 
-The following keyword arguments are supported:
-- `name`: overrides the name that the kernel will have in the generated code
-- `device`: chooses which device to compile the kernel for
-- `global_hooks`: specifies maps from global variable name to initializer hook
+const hipfunction_lock = ReentrantLock()
 
-The output of this function is automatically cached, i.e. you can simply call
-`rocfunction` in a hot path without degrading performance. New code will be
-generated automatically, when function definitions change, or when different
-types or keyword arguments are provided.
-"""
-function rocfunction(
-    f::F, tt::Type = Tuple{}; device::ROCDevice = AMDGPU.device(),
-    global_hooks = NamedTuple(), kwargs...,
-) where {F <: Core.Function}
-    Base.@lock rocfunction_lock begin
-        @debug "Compiling $f($(join(tt.parameters, ", ")))"
-        Runtime.@log_start(:cached_compile, (;f=F, tt), nothing)
+function hipfunction(f::F, tt::TT = Tuple{}; kwargs...) where {F <: Core.Function, TT}
+    Base.@lock hipfunction_lock begin
+        dev = AMDGPU.device()
+        cache = compiler_cache(dev)
+        config = compiler_config(dev; kwargs...)
 
-        cache = compiler_cache(device)
-        config = compiler_config(device; global_hooks, kwargs...)
+        source = methodinstance(F, tt)
         fun = GPUCompiler.cached_compilation(
-            cache, config, F, tt, compile, link)::ROCFunction
+            cache, source, config, hipcompile, hiplink)
 
         h = hash(fun, hash(f, hash(tt)))
-        kernel = get(_kernel_instances, h, Runtime.HostKernel{F,tt}(f, fun.mod, fun))
-        Runtime.@log_finish(:cached_compile, (;f=F, tt), nothing)
-        return kernel::Runtime.HostKernel{F,tt}
+        kernel = get!(_kernel_instances, h) do
+            Runtime.HIPKernel{F, tt}(f, fun)
+        end
+        return kernel::Runtime.HIPKernel{F, tt}
     end
 end
 
-# compile to GCN
-function compile(@nospecialize(job::CompilerJob))
-    Runtime.@log_start(:compile, (;fspec=job.source.specTypes), nothing)
-    JuliaContext() do ctx
-        obj, meta = GPUCompiler.compile(:obj, job; ctx)
-        # Find undefined globals and calculate sizes.
-        globals = map(
-            gbl -> Symbol(LLVM.name(gbl)) => llvmsize(eltype(value_type(gbl))),
-            filter!(isextinit, collect(LLVM.globals(meta.ir))))
-        entry = LLVM.name(meta.entry)
+function create_executable(obj)
+    lld = if AMDGPU.lld_artifact
+        `$(LLD_jll.lld()) -flavor gnu`
+    else
+        @assert !isempty(AMDGPU.lld_path) "ld.lld was not found; cannot link kernel"
+        `$(AMDGPU.lld_path)`
+    end
 
-        Runtime.@log_finish(:compile, (;fspec=job.source.specTypes), nothing)
-        return (; obj, entry, globals)
+    path_exe = mktemp() do path_o, io_o
+        write(io_o, obj)
+        flush(io_o)
+        path_exe = path_o * ".exe"
+        run(`$lld -shared -o $path_exe $path_o`)
+        path_exe
     end
+    return read(path_exe)
 end
-function link(@nospecialize(job::CompilerJob), compiled)
-    Runtime.@log_start(:link, (;fspec=job.source.specTypes), nothing)
-    device = job.config.params.device
-    global_hooks = job.config.params.global_hooks
-    (;obj, entry, globals) = compiled
 
-    # create executable and kernel
-    obj = codeunits(obj)
-    exe = AMDGPU.create_executable(device, entry, obj; globals=globals)
-    mod = ROCModule(exe)
-    fun = ROCFunction(mod, entry, hash(job.source, UInt64(0)))
-
-    # initialize globals from hooks
-    for gname in first.(globals)
-        hook = nothing
-        if haskey(default_global_hooks, gname)
-            hook = default_global_hooks[gname]
-        elseif haskey(global_hooks, gname)
-            hook = global_hooks[gname]
-        end
-        if hook !== nothing
-            @debug "Initializing global $gname"
-            Runtime.@log_start(:global_init, (;fspec=job.source.specTypes, gname), nothing)
-            gbl = Runtime.get_global(exe, gname)
-            hook(gbl, mod, device)
-            Runtime.@log_finish(:global_init, (;fspec=job.source.specTypes, gname), nothing)
-        else
-            @debug "Uninitialized global $gname"
-            continue
-        end
+function hipcompile(@nospecialize(job::CompilerJob))
+    obj, meta = JuliaContext() do ctx
+        GPUCompiler.compile(:obj, job)
     end
 
-    Runtime.@log_finish(:link, (;fspec=job.source.specTypes), nothing)
-    return fun
-end
+    entry = LLVM.name(meta.entry)
+    globals = filter(isextinit, collect(LLVM.globals(meta.ir))) .|> LLVM.name
 
-function zeroinit_lds!(mod::LLVM.Module, entry::LLVM.Function)
-    if LLVM.callconv(entry) != LLVM.API.LLVMAMDGPUKERNELCallConv
-        return entry
+    global_hostcall_names = (
+        :malloc_hostcall, :free_hostcall, :print_hostcall, :printf_hostcall)
+    global_hostcalls = Symbol[]
+    for gbl in LLVM.globals(meta.ir), gbl_name in global_hostcall_names
+        occursin("__$gbl_name", LLVM.name(gbl)) || continue
+        push!(global_hostcalls, gbl_name)
     end
-    to_init = []
-    for gbl in LLVM.globals(mod)
-        if startswith(LLVM.name(gbl), "__zeroinit")
-            as = LLVM.addrspace(value_type(gbl))
-            if as == AMDGPU.Device.AS.Local
-                push!(to_init, gbl)
-            end
-        end
+    if !isempty(global_hostcalls)
+        @warn """Global hostcalls detected: $global_hostcalls.
+        Use `AMDGPU.synchronize(; blocking=false)` to synchronize and stop them.
+        Otherwise, performance might degrade.
+        """ maxlog=1
     end
-    if length(to_init) > 0
-        ctx = LLVM.context(mod)
-        T_void = LLVM.VoidType(ctx)
-        LLVM.@dispose builder=LLVM.IRBuilder(ctx) begin
-            # Make these the first operations we do
-            position!(builder, first(LLVM.instructions(first(LLVM.blocks(entry)))))
 
-            # Use memset to clear all values to 0
-            for gbl in to_init
-                sz = llvmsize(eltype(value_type(gbl)))
-                if sz > 0
-                    LLVM.memset!(builder, gbl, ConstantInt(UInt8(0); ctx), ConstantInt(sz; ctx), LLVM.alignment(gbl))
-                end
-            end
+    if !isempty(globals)
+        @warn """
+        HIP backend does not support setting extinit globals.
+        But kernel `$entry` has following:
+        $globals
 
-            # Synchronize the workgroup to prevent races
-            sync_ft = LLVM.FunctionType(LLVM.VoidType(ctx))
-            sync_f = LLVM.Function(mod, LLVM.Intrinsic("llvm.amdgcn.s.barrier"))
-            call!(builder, sync_ft, sync_f)
-        end
+        Compilation will likely fail.
+        """
     end
-
-    return entry
+    (; obj=create_executable(codeunits(obj)), entry, global_hostcalls)
 end
 
-## exception codegen
-# emit a global variable for storing the current exception status
-function emit_exception_user!(mod::LLVM.Module)
-    # add a fake user for __ockl_hsa_signal_store and __ockl_hsa_signal_load
-    if !haskey(LLVM.functions(mod), "__fake_global_exception_flag_user")
-        ctx = LLVM.context(mod)
-        ft = LLVM.FunctionType(LLVM.VoidType(ctx))
-        fn = LLVM.Function(mod, "__fake_global_exception_flag_user", ft)
-        IRBuilder(ctx) do builder
-            entry = BasicBlock(fn, "entry"; ctx)
-            position!(builder, entry)
-            T_nothing = LLVM.VoidType(ctx)
-            T_i32 = LLVM.Int32Type(ctx)
-            T_i64 = LLVM.Int64Type(ctx)
-
-            T_signal_store = LLVM.FunctionType(T_nothing, [T_i64, T_i64, T_i32])
-            signal_store = LLVM.Function(mod, "__ockl_hsa_signal_store", T_signal_store)
-            call!(builder, T_signal_store, signal_store,
-                [ConstantInt(0; ctx), ConstantInt(0; ctx),
-                #= __ATOMIC_RELEASE == 3 =#
-                ConstantInt(Int32(3); ctx)])
-
-            T_signal_load = LLVM.FunctionType(T_i64, [T_i64, T_i32])
-            signal_load = LLVM.Function(mod, "__ockl_hsa_signal_load", T_signal_load)
-            loaded_value = call!(builder, T_signal_load, signal_load,
-                [ConstantInt(0; ctx),
-                #= __ATOMIC_ACQUIRE == 2 =#
-                ConstantInt(Int32(2); ctx)])
+function hiplink(@nospecialize(job::CompilerJob), compiled)
+    (; obj, entry, global_hostcalls) = compiled
+    mod = HIP.HIPModule(obj)
+    HIP.HIPFunction(mod, entry, global_hostcalls)
+end
 
-            T_signal_cas = LLVM.FunctionType(T_i64, [T_i64, T_i64, T_i64, T_i32])
-            signal_cas = LLVM.Function(mod, "__ockl_hsa_signal_cas", T_signal_cas)
-            loaded_value = call!(builder, T_signal_cas, signal_cas,
-                [ConstantInt(0; ctx), ConstantInt(0; ctx), ConstantInt(0; ctx),
-                 #= __ATOMIC_ACQ_REL == 4 =#
-                 ConstantInt(Int32(4); ctx)])
+function run_and_collect(cmd)
+    stdout = Pipe()
+    proc = run(pipeline(ignorestatus(cmd); stdout, stderr=stdout), wait=false)
+    close(stdout.in)
 
-            ret!(builder)
-        end
-    end
-    @assert haskey(LLVM.functions(mod), "__fake_global_exception_flag_user")
-end
-function delete_exception_user!(mod::LLVM.Module)
-    fns = LLVM.functions(mod)
-    if haskey(fns, "__fake_global_exception_flag_user")
-        unsafe_delete!(mod, fns["__fake_global_exception_flag_user"])
-    end
-    @assert !haskey(LLVM.functions(mod), "__fake_global_exception_flag_user")
+    reader = Threads.@spawn String(read(stdout))
+    Base.wait(proc)
+    log = strip(fetch(reader))
+    return proc, log
 end
diff --git a/src/compiler/device-libs.jl b/src/compiler/device_libs.jl
similarity index 73%
rename from src/compiler/device-libs.jl
rename to src/compiler/device_libs.jl
index 9837e118a..ad89d9870 100644
--- a/src/compiler/device-libs.jl
+++ b/src/compiler/device_libs.jl
@@ -1,43 +1,37 @@
 ## ROCm device library
 
-import AMDGPU: device_libs_path
+import AMDGPU: libdevice_libs
 
 function load_and_link!(mod, path)
-    ctx = LLVM.context(mod)
-    lib = parse(LLVM.Module, read(path); ctx)
+    lib = parse(LLVM.Module, read(path))
 
     for f in LLVM.functions(lib)
         # FIXME: We should be able to inline this, that we can't means
         #        we are inserting calls to it late.
-        name = LLVM.name(f)
-        name == "__ockl_hsa_signal_store" && continue
-        name == "__ockl_hsa_signal_load" && continue
-        startswith(name, "__ockl_hsa_signal") && continue
+        startswith(LLVM.name(f), "__ockl_hsa_signal") && continue
+
         attrs = function_attributes(f)
         inline = true
-        noinline_attr = EnumAttribute("noinline"; ctx)
+        noinline_attr = EnumAttribute("noinline")
         for attr in collect(attrs)
             if kind(attr) == kind(noinline_attr)
                 inline = false
                 break
             end
         end
-        if inline
-            push!(attrs, EnumAttribute("alwaysinline"; ctx))
-        end
+        inline && push!(attrs, EnumAttribute("alwaysinline"))
     end
 
     # override triple and datalayout to avoid warnings
     triple!(lib, triple(mod))
     datalayout!(lib, datalayout(mod))
-
     LLVM.link!(mod, lib)
 end
 
 function locate_lib(file)
-    file_path = joinpath(device_libs_path, file*".bc")
+    file_path = joinpath(libdevice_libs, file*".bc")
     if !ispath(file_path)
-        file_path = joinpath(device_libs_path, file*".amdgcn.bc")
+        file_path = joinpath(libdevice_libs, file*".amdgcn.bc")
         if !ispath(file_path)
             # failed to find matching bitcode file
             return nothing
@@ -50,21 +44,13 @@ function link_device_libs!(target, mod::LLVM.Module)
     # TODO: only link if used
     # TODO: make these globally/locally configurable
 
-    device_libs_path === nothing && return
+    isnothing(libdevice_libs) && return
 
     # https://github.com/RadeonOpenCompute/ROCm-Device-Libs/blob/9420f6380990b09851edc2a5f9cbfaa88742b449/doc/OCML.md#controls
     # Note: It seems we need to load in reverse order, to avoid LLVM deleting the globals from the module, before we use them.
 
     # 1. Load other libraries
-    libs = (
-        "hc",
-        "hip",
-        "irif",
-        "ockl",
-        "opencl",
-        "ocml",
-    )
-
+    libs = ("hc", "hip", "irif", "ockl", "opencl", "ocml")
     for lib in libs
         lib_path = locate_lib(lib)
         lib_path === nothing && continue
@@ -82,10 +68,15 @@ function link_device_libs!(target, mod::LLVM.Module)
     try
         load_and_link!(mod, lib)
     catch err
-        @warn "Failed to load/link OCLC core library for ISA $(target.dev_isa)" err=err
+        @warn "Failed to load/link OCLC core library `$lib` for ISA $(target.dev_isa)." err=err
     end
 
-    # 3. Load options libraries
+    # 3. Load OCLC ABI library (required for printing).
+    lib = locate_lib("oclc_abi_version_500")
+    @assert lib !== nothing
+    load_and_link!(mod, lib)
+
+    # 4. Load options libraries
     options = Dict(
         :finite_only => false,
         :unsafe_math => false,
diff --git a/src/compiler/dynamic_memory.jl b/src/compiler/dynamic_memory.jl
new file mode 100644
index 000000000..3f42d71a3
--- /dev/null
+++ b/src/compiler/dynamic_memory.jl
@@ -0,0 +1,43 @@
+function create_malloc_hostcall!()
+    dev = AMDGPU.device()
+    _, buf = Device.named_perdevice_hostcall(dev, :malloc_hostcall) do
+        holder = Device.HostCallHolder(
+            Ptr{Cvoid}, Tuple{Csize_t}; continuous=true,
+        ) do bytesize
+            buf = Mem.HostBuffer(bytesize, HIP.hipHostAllocMapped)
+            dev_ptr = Mem.device_ptr(buf)
+            @assert buf.ptr == dev_ptr # TODO
+            return dev_ptr
+        end
+
+        # Create host pinned memory and store HostCall in it.
+        # It will be then accessed by kernels from kernel state.
+        buf = Mem.HostBuffer(sizeof(holder.hc), HIP.hipHostAllocMapped)
+        ptr = Base.unsafe_convert(
+            Ptr{Device.HostCall{Ptr{Cvoid}, Tuple{Csize_t}}}, buf)
+        Base.unsafe_store!(ptr, holder.hc)
+        return holder, buf
+    end
+    return Mem.device_ptr(buf)
+end
+
+function create_free_hostcall!()
+    dev = AMDGPU.device()
+    _, buf = Device.named_perdevice_hostcall(dev, :free_hostcall) do
+        holder = Device.HostCallHolder(
+            Nothing, Tuple{Ptr{Cvoid}}; continuous=true,
+        ) do ptr
+            ptr == C_NULL && return
+            # FIXME for some reason it hangs on free function in hostcall...
+            # HIP.hipHostFree(ptr) |> HIP.check
+            return
+        end
+
+        buf = Mem.HostBuffer(sizeof(holder.hc), HIP.hipHostAllocMapped)
+        ptr = Base.unsafe_convert(
+            Ptr{Device.HostCall{Nothing, Tuple{Ptr{Cvoid}}}}, buf)
+        Base.unsafe_store!(ptr, holder.hc)
+        return holder, buf
+    end
+    return Mem.device_ptr(buf)
+end
diff --git a/src/compiler/exceptions.jl b/src/compiler/exceptions.jl
new file mode 100644
index 000000000..7f764a1ae
--- /dev/null
+++ b/src/compiler/exceptions.jl
@@ -0,0 +1,27 @@
+struct KernelException <: Exception
+    dev::HIP.HIPDevice
+end
+
+function Base.showerror(io::IO, ex::KernelException)
+    print(io, "KernelException: exception thrown during kernel execution on `$(ex.dev)`.")
+end
+
+const _exception_flags = Dict{HIP.HIPDevice, Mem.HostBuffer}()
+
+function create_exception!(mod::HIP.HIPModule)
+    exception_flag = get!(_exception_flags, mod.dev,
+        Mem.HostBuffer(sizeof(Int), HIP.hipHostAllocMapped))
+    return Mem.device_ptr(exception_flag)
+end
+
+# Check for exceptions on every synchronization.
+function check_exceptions()
+    for (dev, buf) in _exception_flags
+        ptr = Base.unsafe_convert(Ptr{Int}, buf)
+        flag = unsafe_load(ptr)
+        if flag != 0
+            unsafe_store!(ptr, 0)
+            throw(KernelException(dev))
+        end
+    end
+end
diff --git a/src/compiler/global-hooks.jl b/src/compiler/global-hooks.jl
deleted file mode 100644
index da7d1edb4..000000000
--- a/src/compiler/global-hooks.jl
+++ /dev/null
@@ -1,94 +0,0 @@
-const default_global_hooks = Dict{Symbol,Function}()
-
-default_global_hooks[:__global_output_context] = (gbl, mod, device) -> begin
-    # initialize global output context
-    gbl_ptr = Base.unsafe_convert(Ptr{AMDGPU.Device.GLOBAL_OUTPUT_CONTEXT_TYPE}, gbl)
-    oc = Device.OutputContext(stdout; device, name=:__global_output, timeout=nothing)
-    Base.unsafe_store!(gbl_ptr, oc)
-end
-default_global_hooks[:__global_printf_context] = (gbl, mod, device) -> begin
-    # initialize global printf context
-    # Return type of Int to force synchronizing behavior
-    args_type = Tuple{LLVMPtr{UInt8, AS.Global}}
-    ret_type = Int
-    gbl_ptr = Base.unsafe_convert(Ptr{HostCall{ret_type, args_type}}, gbl)
-
-    hc = Device.named_perdevice_hostcall(device, :__global_printf) do
-        HostCall(ret_type, args_type; device, continuous=true, buf_len=2^16, timeout=nothing) do _
-            fmt, all_args = unsafe_load(reinterpret(LLVMPtr{AMDGPU.Device.ROCPrintfBuffer,AS.Global}, hc.buf_ptr))
-
-            for args in all_args
-                args = map(x -> x isa Cstring ? unsafe_string(x) : x, args)
-                @debug "@rocprintf with $fmt and $(args)"
-                try
-                    @eval @printf($fmt, $(args...))
-                catch err
-                    @error "@rocprintf error" exception=(err,catch_backtrace())
-                end
-            end
-            return 0
-        end
-    end
-    Base.unsafe_store!(gbl_ptr, hc)
-end
-default_global_hooks[:__global_exception_flag] = (gbl, mod, device) -> begin
-    # initialize global exception flag
-    gbl_ptr = Base.unsafe_convert(Ptr{Int64}, gbl)
-    Base.unsafe_store!(gbl_ptr, 0)
-end
-default_global_hooks[:__global_exception_ring] = (gbl, mod, device) -> begin
-    # initialize exception ring buffer
-    gbl_ptr = Base.unsafe_convert(Ptr{Ptr{ExceptionEntry}}, gbl)
-    ex_ptr = Base.unsafe_convert(Ptr{ExceptionEntry}, mod.exceptions)
-    unsafe_store!(gbl_ptr, ex_ptr)
-
-    # setup initial slots
-    for i in 1:Runtime.MAX_EXCEPTIONS-1
-        unsafe_store!(ex_ptr, ExceptionEntry(0, LLVMPtr{UInt8,1}(0)))
-        ex_ptr += sizeof(ExceptionEntry)
-    end
-    # setup tail slot
-    unsafe_store!(ex_ptr, ExceptionEntry(1, LLVMPtr{UInt8,1}(0)))
-end
-default_global_hooks[:__global_malloc_hostcall] = (gbl, mod, device) -> begin
-    # initialize malloc hostcall
-    args_type = Tuple{UInt64, Csize_t}
-    ret_type = Ptr{Cvoid}
-    gbl_ptr = Base.unsafe_convert(Ptr{HostCall{ret_type, args_type}}, gbl)
-
-    hc = Device.named_perdevice_hostcall(device, :__global_malloc) do
-        HostCall(ret_type, args_type; device, continuous=true, timeout=nothing) do kern, sz
-            buf = Mem.alloc(device, sz; coherent=true)
-            # FIXME: Lock
-            push!(mod.metadata, Runtime.KernelMetadata(kern, buf))
-            @debug "Allocated $(buf.ptr) ($sz bytes) for kernel $kern on device $device"
-            return buf.ptr
-        end
-    end
-    Base.unsafe_store!(gbl_ptr, hc)
-end
-default_global_hooks[:__global_free_hostcall] = (gbl, mod, device) -> begin
-    # initialize free hostcall
-    args_type = Tuple{UInt64, Ptr{Cvoid}}
-    ret_type = Nothing
-    gbl_ptr = Base.unsafe_convert(Ptr{HostCall{ret_type, args_type}}, gbl)
-
-    hc = Device.named_perdevice_hostcall(device, :__global_free) do
-        HostCall(ret_type, args_type; device, continuous=true, timeout=nothing) do kern, ptr
-            # FIXME: Lock
-            for idx in length(mod.metadata):-1:1
-                meta = mod.metadata[idx]
-                same_kern = meta.kern == kern
-                same_ptr = meta.buf.ptr == ptr
-                if same_kern && same_ptr
-                    Mem.free(meta.buf)
-                    deleteat!(mod.metadata, idx)
-                    @debug "Freed $ptr ($(meta.buf.bytesize) bytes) for kernel $kern on device $device."
-                    break
-                end
-            end
-            return nothing
-        end
-    end
-    Base.unsafe_store!(gbl_ptr, hc)
-end
diff --git a/src/compiler/occupancy.jl b/src/compiler/occupancy.jl
deleted file mode 100644
index ea7c50d11..000000000
--- a/src/compiler/occupancy.jl
+++ /dev/null
@@ -1,150 +0,0 @@
-import ObjectFile
-import ObjectFile: readmeta, Sections, section_name, section_size, section_offset
-import MsgPack
-
-# TODO use LockedObject
-const OCCUPANCY_CACHE = Dict{Tuple{AMDGPU.Runtime.ROCDevice, UInt64, Int, Int}, NamedTuple}()
-const OCCUPANCY_CACHE_LOCK = Threads.ReentrantLock()
-
-function read_metadata(fun::ROCFunction)
-    path, io = mktemp(; cleanup=false)
-    write(io, fun.mod.exe.data)
-    close(io)
-    mv(path, path*".exe") # so that readmeta knows that this is an ELF file
-    path = path * ".exe"
-    try
-        return open(path, "r") do io
-            elf = readmeta(io)
-            note_sec_idx = findfirst(sec->section_name(sec) == ".note", Sections(elf))
-            note_sec = Sections(elf)[note_sec_idx]
-            note_sec_size = section_size(note_sec)
-
-            seek(io, section_offset(note_sec))
-            off = position(io)
-            while position(io) - off < note_sec_size
-                name_size = read(io, UInt32)
-                desc_size = read(io, UInt32)
-                note_type = read(io, UInt32)
-                if note_type != 0x20 # NT_AMDGPU_METADATA
-                    # Skip this note
-                    seek(io, position(io) + name_size + 1 + desc_size)
-                    continue
-                end
-                name = readuntil(io, '\0'); read(io, UInt8)
-                desc = Vector{UInt8}(undef, desc_size)
-                readbytes!(io, desc)
-                return MsgPack.unpack(desc)
-            end
-        end
-    finally
-        rm(path)
-    end
-    return nothing
-end
-
-calculate_occupancy(kernel::Runtime.HostKernel; kwargs...) =
-    calculate_occupancy(kernel.fun; kwargs...)
-function calculate_occupancy(fun::ROCFunction; input_block_size=1, localmem=0)
-    lock(OCCUPANCY_CACHE_LOCK) do
-        get!(OCCUPANCY_CACHE, (fun.mod.exe.device, fun.hash, input_block_size, localmem)) do
-            _calculate_occupancy(fun, fun.mod.exe.device; input_block_size, localmem)
-        end
-    end
-end
-function _calculate_occupancy(fun::ROCFunction, device::ROCDevice; input_block_size, localmem)
-    # Calculate occupancy
-    # Copied from https://github.com/ROCm-Developer-Tools/hipamd/blob/3ec1ccdbbbee7090ba854eddd1dee281973a4498/src/hip_platform.cpp#L301
-    isa = first(Runtime.isas(device))
-    if input_block_size == 1
-        # We assume the user is requesting groupsize optimization
-        input_block_size = Runtime.isa_workgroup_max_size(isa)
-    end
-    arch = Runtime.architecture(isa)
-    arch_major, arch_minor, arch_stepping = if startswith(arch, "gfx8")
-        8, parse(Int, "0x"*arch[5]), parse(Int, "0x"*arch[6:end])
-    elseif startswith(arch, "gfx9")
-        9, parse(Int, "0x"*arch[5]), parse(Int, "0x"*arch[6:end])
-    elseif startswith(arch, "gfx10")
-        10, parse(Int, "0x"*arch[6]), parse(Int, "0x"*arch[7:end])
-    elseif startswith(arch, "gfx11")
-        11, parse(Int, "0x"*arch[6]), parse(Int, "0x"*arch[7:end])
-    else
-        error("Unsupported architecture: $arch")
-    end
-    meta = read_metadata(fun)
-    kernel_idx = findfirst(k->startswith(k[".symbol"], fun.entry), meta["amdhsa.kernels"])
-    kernel = meta["amdhsa.kernels"][kernel_idx]
-    SGPR_count = Int(kernel[".sgpr_count"])
-    VGPR_count = Int(kernel[".vgpr_count"])
-    LDS_size = Int(kernel[".group_segment_fixed_size"])
-    wavefront_size = Int(kernel[".wavefront_size"])
-    # TODO: Print signature
-    @debug "Calculating occupancy of $(fun.entry) for $arch ($arch_major, $arch_minor, $arch_stepping)" SGPR_count VGPR_count LDS_size
-    max_waves_per_SIMD = arch_major <= 9 ? 8 : 16
-    VGPR_waves = max_waves_per_SIMD
-    local max_VGPRs, VGPR_granularity
-    if arch_major <= 9
-        if arch == "gfx90a"
-            max_VGPRs = 512
-            VGPR_granularity = 8
-        else
-            max_VGPRs = 256
-            VGPR_granularity = 4
-        end
-    else
-        max_VGPRs = 1024
-        VGPR_granularity = 8
-    end
-
-    function align_up(x, y)
-        r = rem(x, y)
-        r > 0 && return x + y-r
-        return x
-    end
-    if VGPR_count > 0
-        VGPR_waves = max_VGPRs ÷ align_up(VGPR_count, VGPR_granularity)
-    end
-
-    GPR_waves = VGPR_waves
-    if SGPR_count > 0
-        max_SGPRs = if arch_major < 8
-            512
-        elseif arch_major < 10
-            800
-        else
-            typemax(Int64)
-        end
-        SGPR_waves = max_SGPRs ÷ align_up(SGPR_count, 16)
-        GPR_waves = min(VGPR_waves, SGPR_waves)
-    end
-
-    alu_occupancy = Runtime.device_num_simds_per_compute_unit(device) * min(max_waves_per_SIMD, GPR_waves)
-    alu_limited_threads = alu_occupancy * wavefront_size
-
-    LDS_occupancy_wgs = typemax(Int)
-    total_used_LDS = LDS_size + localmem
-    if total_used_LDS != 0
-        LDS_occupancy_wgs = Int(Runtime.device_local_memory_size(device) ÷ total_used_LDS)
-    end
-
-    # Calculate how many blocks of input_block_size we can fit per CU
-    max_blocks_per_CU = alu_limited_threads ÷ align_up(input_block_size, wavefront_size)
-    max_blocks_per_CU = min(max_blocks_per_CU, LDS_occupancy_wgs)
-    best_block_size = Int(min(alu_limited_threads, align_up(input_block_size, wavefront_size)))
-    best_block_size = min(best_block_size, AMDGPU.Device._max_group_size)
-    best_blocks_per_CU = alu_limited_threads ÷ best_block_size
-    num_blocks_per_grid = Runtime.device_num_compute_units(device) * min(best_blocks_per_CU, LDS_occupancy_wgs)
-
-    # TODO: Print signature
-    @debug "Occupancy of $(fun.entry) for $arch ($arch_major, $arch_minor, $arch_stepping)" max_blocks_per_CU best_block_size best_blocks_per_CU num_blocks_per_grid
-    return (;max_blocks_per_CU,
-             best_block_size,
-             best_blocks_per_CU,
-             num_blocks_per_grid,
-             GPR_waves,
-             alu_limited_threads,
-             SGPR_count,
-             VGPR_count,
-             LDS_size,
-             wavefront_size)
-end
diff --git a/src/compiler/output_context.jl b/src/compiler/output_context.jl
new file mode 100644
index 000000000..6122b5256
--- /dev/null
+++ b/src/compiler/output_context.jl
@@ -0,0 +1,57 @@
+function create_output_context!(#= TODO mod::HIP.HIPModule =#)
+    dev = AMDGPU.device()
+    _, buf = Device.named_perdevice_hostcall(dev, :print_hostcall) do
+        buf_len = 2^16
+        holder = Device.HostCallHolder(
+            Nothing, Tuple{LLVMPtr{Device.DeviceStaticString{buf_len}, AS.Global}};
+            continuous=true, buf_len,
+        ) do _
+            str_ptr = reinterpret(
+                LLVMPtr{Device.DeviceStaticString{buf_len}, AS.Global},
+                holder.hc.buf_ptr)
+            Core.print(unsafe_load(str_ptr))
+            return
+        end
+
+        # Pointer to HostCall to be read from device.
+        buf = Mem.HostBuffer(sizeof(holder.hc), HIP.hipHostAllocMapped)
+        ptr = Base.unsafe_convert(Ptr{Device.OUTPUT_CONTEXT_TYPE}, buf)
+        Base.unsafe_store!(ptr, holder.hc)
+        return holder, buf
+    end
+    return Mem.device_ptr(buf)
+end
+
+function create_printf_output_context!()
+    dev = AMDGPU.device()
+    _, buf = Device.named_perdevice_hostcall(dev, :printf_hostcall) do
+        holder = Device.HostCallHolder(
+            Nothing, Tuple{LLVMPtr{UInt8, AS.Global}};
+            continuous=true, buf_len=2^16,
+        ) do _
+            printf_buf_ptr = reinterpret(
+                LLVMPtr{Device.ROCPrintfBuffer, AS.Global},
+                holder.hc.buf_ptr)
+            fmt, all_args = unsafe_load(printf_buf_ptr)
+            format = Printf.Format(fmt)
+
+            for args in all_args
+                try
+                    args = map(x -> x isa Cstring ? unsafe_string(x) : x, args)
+                    formatted = Printf.format(format, args...)
+                    Core.print(formatted)
+                catch err
+                    @error "@rocprintf error" exception=(err, catch_backtrace())
+                end
+            end
+            return
+        end
+        # Pointer to HostCall to be read from device.
+        buf = Mem.HostBuffer(sizeof(holder.hc), HIP.hipHostAllocMapped)
+        ptr = Base.unsafe_convert(
+            Ptr{Device.PRINTF_OUTPUT_CONTEXT_TYPE}, buf)
+        Base.unsafe_store!(ptr, holder.hc)
+        return holder, buf
+    end
+    return Mem.device_ptr(buf)
+end
diff --git a/src/compiler/utils.jl b/src/compiler/utils.jl
deleted file mode 100644
index 94973f25a..000000000
--- a/src/compiler/utils.jl
+++ /dev/null
@@ -1,327 +0,0 @@
-# Tools for implementing device functionality
-
-# which Julia types map to a given LLVM type
-const llvmtypes = Dict{Type,Symbol}(
-    Nothing => :void,
-    Bool    => :i1,
-    Int8    => :i8,
-    Int16   => :i16,
-    Int32   => :i32,
-    Int64   => :i64,
-    UInt8   => :i8,
-    UInt16  => :i16,
-    UInt32  => :i32,
-    UInt64  => :i64,
-    Float32 => :float,
-    Float64 => :double,
-)
-
-# which LLVM types map to a given Julia type
-const jltypes = Dict{Symbol,Type}(
-    :void   => Nothing,
-    :i1     => Bool,
-    :i8     => Int8,
-    :i16    => Int16,
-    :i32    => Int32,
-    :i64    => Int64,
-    :float  => Float32,
-    :double => Float64
-)
-
-# Decode an expression of the form:
-#
-#    function(arg::arg_type, arg::arg_type, ... arg::arg_type)::return_type
-#
-# Returns a tuple containing the function name, a vector of argument, a vector of argument
-# types and the return type (all in symbolic form).
-function decode_call(e)
-    @assert e.head == :(::)
-
-    # decode the return type expression: single symbol (the LLVM type), or a tuple of 2
-    # symbols (the LLVM and corresponding Julia type)
-    retspec = e.args[2]
-    if isa(retspec, Symbol)
-        rettype = retspec
-    else
-        @assert retspec.head == :tuple
-        @assert length(retspec.args) == 2
-        rettype = (retspec.args[1], retspec.args[2])
-    end
-
-    call = e.args[1]
-    @assert call.head == :call
-
-    fn = Symbol(call.args[1])
-    args = Symbol[arg.args[1] for arg in call.args[2:end]]
-    argtypes = Symbol[arg.args[2] for arg in call.args[2:end]]
-
-    return fn, args, argtypes, rettype
-end
-
-# Generate a `llvmcall` statement calling an intrinsic specified as follows:
-#
-#     intrinsic(arg::arg_type, arg::arg_type, ... arg::arg_type)::return_type [attr]
-#
-# The argument types should be valid LLVM type identifiers (eg. i32, float, double).
-# Conversions to the corresponding Julia type are automatically generated; make sure the
-# actual arguments are of the same type to make these conversions no-ops. The optional
-# argument `attr` indicates which LLVM function attributes (such as `readnone` or `nounwind`)
-# to add to the intrinsic declaration.
-
-# For example, the following call:
-#     `@wrap __some_intrinsic(x::float, y::double)::float`
-#
-# will yield the following `llvmcall`:
-# ```
-#     Base.llvmcall(("declare float @__somme__intr(float, double)",
-#                    "%3 = call float @__somme__intr(float %0, double %1)
-#                     ret float %3"),
-#                   Float32, Tuple{Float32,Float64},
-#                   convert(Float32,x), convert(Float64,y))
-# ```
-macro wrap(call, attrs="")
-    intrinsic, args, argtypes, rettype = decode_call(call)
-
-    # decide on intrinsic return type
-    if isa(rettype, Symbol)
-        # only LLVM return type specified, match against known LLVM/Julia type combinations
-        llvm_ret_typ = rettype
-        julia_ret_typ = jltypes[rettype]
-    else
-        # both specified (for when there is a mismatch, eg. i32 -> UInt32)
-        llvm_ret_typ = rettype[1]
-        julia_ret_typ = rettype[2]
-    end
-
-    llvm_args = String["%$i" for i in 0:length(argtypes)]
-    if llvm_ret_typ == :void
-        llvm_ret_asgn = ""
-        llvm_ret = "void"
-    else
-        llvm_ret_var = "%$(length(argtypes)+1)"
-        llvm_ret_asgn = "$llvm_ret_var = "
-        llvm_ret = "$llvm_ret_typ $llvm_ret_var"
-    end
-    llvm_declargs = join(argtypes, ", ")
-    llvm_defargs = join(("$t $arg" for (t,arg) in zip(argtypes, llvm_args)), ", ")
-
-    julia_argtypes = (jltypes[t] for t in argtypes)
-    julia_args = (:(convert($argtype, $(esc(arg)))) for (arg, argtype) in zip(args, julia_argtypes))
-
-    dest = ("""declare $llvm_ret_typ @$intrinsic($llvm_declargs)""",
-            """$llvm_ret_asgn call $llvm_ret_typ @$intrinsic($llvm_defargs)
-                ret $llvm_ret""")
-
-    return quote
-        Base.llvmcall($dest, $julia_ret_typ, Tuple{$(julia_argtypes...)}, $(julia_args...))
-    end
-end
-
-# generalization of word-based primitives
-
-# extract bits from a larger value
-@inline function extract_word(val, ::Val{i}) where {i}
-    extract_value(val, UInt32, Val(32*(i-1)))
-end
-
-@generated function extract_value(val, ::Type{sub}, ::Val{offset}) where {sub, offset}
-    Context() do ctx
-        T_val = convert(LLVMType, val; ctx)
-        T_sub = convert(LLVMType, sub; ctx)
-        bytes = Core.sizeof(val)
-        T_int = LLVM.IntType(8*bytes; ctx)
-
-        # create function
-        llvm_f, _ = create_function(T_sub, [T_val])
-        mod = LLVM.parent(llvm_f)
-
-        # generate IR
-        IRBuilder(ctx) do builder
-            entry = BasicBlock(llvm_f, "entry"; ctx)
-            position!(builder, entry)
-            equiv = bitcast!(builder, parameters(llvm_f)[1], T_int)
-            shifted = lshr!(builder, equiv, LLVM.ConstantInt(T_int, offset))
-            # extracted = and!(builder, shifted, 2^32-1)
-            extracted = trunc!(builder, shifted, T_sub)
-            ret!(builder, extracted)
-        end
-    end
-
-    call_function(llvm_f, UInt32, Tuple{val}, :val)
-end
-
-# insert bits into a larger value
-@inline function insert_word(val, word::UInt32, ::Val{i}) where {i}
-    insert_value(val, word, Val(32*(i-1)))
-end
-
-@generated function insert_value(val, sub, ::Val{offset}) where {offset}
-    Context() do ctx
-        T_val = convert(LLVMType, val; ctx)
-        T_sub = convert(LLVMType, sub; ctx)
-        bytes = Core.sizeof(val)
-        T_out_int = LLVM.IntType(8*bytes; ctx)
-
-        # create function
-        llvm_f, _ = create_function(T_val, [T_val, T_sub])
-        mod = LLVM.parent(llvm_f)
-
-        # generate IR
-        IRBuilder(ctx) do builder
-            entry = BasicBlock(llvm_f, "entry"; ctx)
-            position!(builder, entry)
-            equiv = bitcast!(builder, parameters(llvm_f)[1], T_out_int)
-            ext = zext!(builder, parameters(llvm_f)[2], T_out_int)
-            shifted = shl!(builder, ext, LLVM.ConstantInt(T_out_int, offset))
-            inserted = or!(builder, equiv, shifted)
-            orig = bitcast!(builder, inserted, T_val)
-            ret!(builder, orig)
-        end
-    end
-
-    call_function(llvm_f, val, Tuple{val, sub}, :val, :sub)
-end
-
-# split the invocation of a function `op` on a value `val` with non-struct eltype
-# into multiple smaller invocations on byte-sized partial values.
-@generated function split_value_invocation(op::Function, val, args...)
-    # TODO: control of lower-limit
-
-    ex = quote
-        Base.@_inline_meta
-    end
-
-    # disassemble into words
-    words = Symbol[]
-    for i in 1:Core.sizeof(val)÷4
-        word = Symbol("word$i")
-        push!(ex.args, :( $word = extract_word(val, Val($i)) ))
-        push!(words, word)
-    end
-
-    # perform the operation
-    for word in words
-        push!(ex.args, :( $word = op($word, args...)) )
-    end
-
-    # reassemble
-    push!(ex.args, :( out = zero(val) ))
-    for (i,word) in enumerate(words)
-        push!(ex.args, :( out = insert_word(out, $word, Val($i)) ))
-    end
-
-    push!(ex.args, :( out ))
-
-    return ex
-end
-
-# split the invocation of a function `op` on a value `val`
-# by invoking the function on each of its fields
-@generated function recurse_value_invocation(op::Function, val, args...)
-    ex = quote
-        Base.@_inline_meta
-    end
-
-    fields = fieldnames(val)
-    if isempty(fields)
-        push!(ex.args, :( split_value_invocation(op, val, args...) ))
-    else
-        ctor = Expr(:new, val)
-        for field in fields
-            push!(ctor.args, :(
-                recurse_value_invocation(op, getfield(val, $(QuoteNode(field))), args...) ))
-        end
-        push!(ex.args, ctor)
-    end
-
-    return ex
-end
-
-# split the invocation of a function `op` on a pointer `ptr` with non-struct eltype
-# into multiple smaller invocations on any supported pointer as listed in `supported_ptrs`.
-@generated function split_pointer_invocation(op::Function, ptr, ::Type{supported_ptrs},
-                                             args...) where {supported_ptrs}
-    T = eltype(ptr)
-    elsize(x) = Core.sizeof(eltype(x))
-    supported_ptrs = reverse(Base.uniontypes(supported_ptrs))
-
-    ex = quote
-        Base.@_inline_meta
-    end
-
-    # disassemble
-    vals = Tuple{Symbol,Int,Type}[]
-    offset = 0
-    while offset < Core.sizeof(T)
-        val = Symbol("value.$(length(vals)+1)")
-
-        # greedy selection of next pointer type
-        remaining = Core.sizeof(T)-offset
-        valid = filter(ptr->elsize(ptr)<=remaining, supported_ptrs)
-        if isempty(valid)
-            error("Cannot partition $T into values of $supported_typs")
-        end
-        ptr = first(sort(collect(valid); by=elsize, rev=true))
-
-        push!(vals, (val, offset, ptr))
-        offset += elsize(ptr)
-    end
-
-    # perform the operation
-    for (val, offset, ptr) in vals
-        subptr = :(convert($ptr, ptr+$offset))
-        push!(ex.args, :( $val = op($subptr, args...)) )
-    end
-
-    # reassemble
-    push!(ex.args, :( out = zero($T) ))
-    for (val, offset, ptr) in vals
-        push!(ex.args, :( out = insert_value(out, $val, Val($offset)) ))
-    end
-
-    push!(ex.args, :( out ))
-
-    return ex
-end
-
-# split the invocation of a function `op` on a pointer `ptr`
-# by invoking the function on a pointer to each of its fields
-@generated function recurse_pointer_invocation(op::Function, ptr, ::Type{supported_ptrs},
-                                               args...) where {supported_ptrs}
-    T = eltype(ptr)
-    ex = quote
-        Base.@_inline_meta
-    end
-    fields = fieldnames(T)
-
-    if isempty(fields)
-        push!(ex.args, :( split_pointer_invocation(op, ptr, supported_ptrs, args...) ))
-    else
-        ctor = Expr(:new, T)
-        for (i,field) in enumerate(fields)
-            field_typ = fieldtype(T, i)
-            field_offset = fieldoffset(T, i)
-            field_ptr_typ = :($(ptr.name.wrapper){$field_typ})
-            # NOTE: this ctor is a leap of faith
-            subptr = :(convert($field_ptr_typ, ptr+$field_offset))
-            push!(ctor.args, :(
-                recurse_pointer_invocation(op, $subptr, supported_ptrs, args...) ))
-        end
-        push!(ex.args, ctor)
-    end
-
-    return ex
-end
-
-# calculate the size of an LLVM type
-llvmsize(::LLVM.LLVMHalf) = sizeof(Float16)
-llvmsize(::LLVM.LLVMFloat) = sizeof(Float32)
-llvmsize(::LLVM.LLVMDouble) = sizeof(Float64)
-llvmsize(::LLVM.IntegerType) = Context() do ctx
-    div(Int(intwidth(GenericValue(LLVM.Int128Type(ctx), -1))), 8)
-end
-llvmsize(ty::LLVM.ArrayType) = length(ty)*llvmsize(eltype(ty))
-llvmsize(ty::LLVM.StructType) = ispacked(ty) ? sum(llvmsize(elem) for elem in elements(ty)) : 8*length(elements(ty)) # FIXME: Properly determine non-packed sizing
-llvmsize(ty::LLVM.PointerType) = div(Sys.WORD_SIZE, 8)
-llvmsize(ty::LLVM.VectorType) = size(ty)
-llvmsize(ty) = error("Unknown size for type: $ty, typeof: $(typeof(ty))")
diff --git a/src/compiler/zeroinit_lds.jl b/src/compiler/zeroinit_lds.jl
new file mode 100644
index 000000000..06e4daa5d
--- /dev/null
+++ b/src/compiler/zeroinit_lds.jl
@@ -0,0 +1,55 @@
+# Calculate the size of an LLVM type.
+llvmsize(::LLVM.LLVMHalf) = sizeof(Float16)
+llvmsize(::LLVM.LLVMFloat) = sizeof(Float32)
+llvmsize(::LLVM.LLVMDouble) = sizeof(Float64)
+function llvmsize(::LLVM.IntegerType)
+    div(Int(intwidth(GenericValue(LLVM.Int128Type(), -1))), 8)
+end
+
+llvmsize(ty::LLVM.ArrayType) = length(ty) * llvmsize(eltype(ty))
+llvmsize(ty::LLVM.StructType) = ispacked(ty) ?
+    sum(llvmsize(elem) for elem in elements(ty)) :
+    8 * length(elements(ty)) # FIXME: Properly determine non-packed sizing
+llvmsize(ty::LLVM.PointerType) = div(Sys.WORD_SIZE, 8)
+llvmsize(ty::LLVM.VectorType) = size(ty)
+llvmsize(ty) = error("Unknown size for type: $ty, typeof: $(typeof(ty))")
+
+function zeroinit_lds!(mod::LLVM.Module, entry::LLVM.Function)
+    if LLVM.callconv(entry) != LLVM.API.LLVMAMDGPUKERNELCallConv
+        return entry
+    end
+
+    to_init = []
+    for gbl in LLVM.globals(mod)
+        if startswith(LLVM.name(gbl), "__zeroinit")
+            as = LLVM.addrspace(value_type(gbl))
+            if as == AMDGPU.Device.AS.Local
+                push!(to_init, gbl)
+            end
+        end
+    end
+    isempty(to_init) && return entry
+
+    @dispose builder=IRBuilder() begin
+        # Make these the first operations we do.
+        block = first(LLVM.blocks(entry))
+        instruction = first(LLVM.instructions(block))
+        position!(builder, instruction)
+
+        # Use memset to clear all values to 0.
+        for gbl in to_init
+            sz = llvmsize(eltype(value_type(gbl)))
+            sz == 0 && continue
+
+            LLVM.memset!(builder, gbl,
+                ConstantInt(UInt8(0)), ConstantInt(sz),
+                LLVM.alignment(gbl))
+        end
+
+        # Synchronize the workgroup to prevent races.
+        sync_ft = LLVM.FunctionType(LLVM.VoidType())
+        sync_f = LLVM.Function(mod, LLVM.Intrinsic("llvm.amdgcn.s.barrier"))
+        call!(builder, sync_ft, sync_f)
+    end
+    return entry
+end
diff --git a/src/deprecations.jl b/src/deprecations.jl
deleted file mode 100644
index 2a25fdb5c..000000000
--- a/src/deprecations.jl
+++ /dev/null
@@ -1,9 +0,0 @@
-@deprecate gridDim() gridItemDim()
-@deprecate gridDimWG() gridGroupDim()
-@deprecate HSAAgent ROCDevice
-@deprecate HSAQueue ROCQueue
-@deprecate HSASignal ROCSignal
-@deprecate HSAStatusSignal ROCKernelSignal
-@deprecate HSAKernelInstance ROCKernel
-@deprecate HSARegion ROCMemoryRegion
-@deprecate HSAMemoryPool ROCMemoryPool
diff --git a/src/device/gcn/array.jl b/src/device/gcn/array.jl
index 8ba74490f..305ad0e3e 100644
--- a/src/device/gcn/array.jl
+++ b/src/device/gcn/array.jl
@@ -28,17 +28,12 @@ ROCDeviceArray
 struct ROCDeviceArray{T,N,A} <: AbstractArray{T,N}
     shape::Dims{N}
     ptr::LLVMPtr{T,A}
+    len::Int
 
     # inner constructors, fully parameterized, exact types (ie. Int not <:Integer)
-    ROCDeviceArray{T,N,A}(shape::Dims{N}, ptr::LLVMPtr{T,A}) where {T,A,N} = new(shape,ptr)
-end
-
-# Define `khash` function to reduce runtime dispatches.
-function Runtime.khash(x::T, h::UInt=UInt(0)) where T <: AMDGPU.Device.ROCDeviceArray
-    for s in x.shape
-        h = hash(s, h)
+    function ROCDeviceArray{T,N,A}(shape::Dims{N}, ptr::LLVMPtr{T,A}) where {T,A,N}
+        new(shape, ptr, prod(shape))
     end
-    Runtime.khash(x.ptr, h)
 end
 
 const ROCDeviceVector = ROCDeviceArray{T,1,A} where {T,A}
@@ -67,11 +62,11 @@ ROCDeviceVector{T,A}(len::Integer,               p::LLVMPtr{T,A}) where {T,A}
 
 Base.pointer(a::ROCDeviceArray) = a.ptr
 Base.pointer(a::ROCDeviceArray, i::Integer) =
-    pointer(a) + (i - 1) * Base.elsize(a)
+    pointer(a) + (i - 1) * Base.elsize(a) # TODO use _memory_offset(a, i)
 
 Base.elsize(::Type{<:ROCDeviceArray{T}}) where {T} = sizeof(T)
 Base.size(g::ROCDeviceArray) = g.shape
-Base.length(g::ROCDeviceArray) = prod(g.shape)
+Base.length(g::ROCDeviceArray) = g.len
 
 # conversions
 
diff --git a/src/device/gcn/assertion.jl b/src/device/gcn/assertion.jl
index 8d7c223db..d5988cf57 100644
--- a/src/device/gcn/assertion.jl
+++ b/src/device/gcn/assertion.jl
@@ -39,9 +39,9 @@ assert_counter = 0
 
 @generated function rocassert_fail(::Val{msg}, ::Val{file}, ::Val{line}) where {msg, file, line}
     @dispose ctx=Context() begin
-        T_void = LLVM.VoidType(ctx)
-        T_int32 = LLVM.Int32Type(ctx)
-        T_pint8 = LLVM.PointerType(LLVM.Int8Type(ctx))
+        T_void = LLVM.VoidType()
+        T_int32 = LLVM.Int32Type()
+        T_pint8 = LLVM.PointerType(LLVM.Int8Type())
 
         # create function
         llvm_f, _ = create_function()
@@ -49,8 +49,8 @@ assert_counter = 0
 
         # generate IR
 
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
             global assert_counter
             assert_counter += 1
@@ -58,7 +58,7 @@ assert_counter = 0
             file = globalstring_ptr!(builder, String(file), "assert_file_$(assert_counter)")
             line = ConstantInt(T_int32, line)
             func = globalstring_ptr!(builder, "unknown", "assert_function_$(assert_counter)")
-            charSize = ConstantInt(Csize_t(1); ctx)
+            charSize = ConstantInt(Csize_t(1))
 
             # invoke __assertfail and return
             # TODO: mark noreturn since we don't use ptxas?
diff --git a/src/device/gcn/atomics.jl b/src/device/gcn/atomics.jl
index 53f5486ef..b1a9d2e0b 100644
--- a/src/device/gcn/atomics.jl
+++ b/src/device/gcn/atomics.jl
@@ -29,14 +29,14 @@ atomic_store!(ptr::LLVMPtr, val, order=Val{:release}()) =
 
 @generated function llvm_atomic_op(::Val{binop}, ptr::LLVMPtr{T,A}, val::T) where {binop, T, A}
     @dispose ctx=Context() begin
-        T_val = convert(LLVMType, T; ctx)
-        T_ptr = convert(LLVMType, ptr; ctx)
+        T_val = convert(LLVMType, T)
+        T_ptr = convert(LLVMType, ptr)
 
         T_typed_ptr = LLVM.PointerType(T_val, A)
         llvm_f, _ = create_function(T_val, [T_ptr, T_val])
 
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
             typed_ptr = bitcast!(builder, parameters(llvm_f)[1], T_typed_ptr)
@@ -96,14 +96,14 @@ end
 
 @generated function llvm_atomic_cas(ptr::LLVMPtr{T,A}, cmp::T, val::T) where {T, A}
     @dispose ctx=Context() begin
-        T_val = convert(LLVMType, T; ctx)
-        T_ptr = convert(LLVMType, ptr; ctx)
+        T_val = convert(LLVMType, T)
+        T_ptr = convert(LLVMType, ptr)
 
         T_typed_ptr = LLVM.PointerType(T_val, A)
         llvm_f, _ = create_function(T_val, [T_ptr, T_val, T_val])
 
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
             typed_ptr = bitcast!(builder, parameters(llvm_f)[1], T_typed_ptr)
@@ -171,9 +171,11 @@ end
 """
     atomic_cas!(ptr::LLVMPtr{T}, cmp::T, val::T)
 
-Reads the value `old` located at address `ptr` and compare with `cmp`. If `old` equals to
-`cmp`, stores `val` at the same address. Otherwise, doesn't change the value `old`. These
-operations are performed in one atomic transaction. The function returns `old`.
+Reads the value `old` located at address `ptr` and compare with `cmp`.
+If `old` equals to `cmp`, stores `val` at the same address.
+Otherwise, doesn't change the value `old`.
+These operations are performed in one atomic transaction.
+The function returns `old`.
 
 This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
 """
diff --git a/src/device/gcn/execution_control.jl b/src/device/gcn/execution_control.jl
index 52d053ec5..7648502bf 100644
--- a/src/device/gcn/execution_control.jl
+++ b/src/device/gcn/execution_control.jl
@@ -13,8 +13,8 @@ const completion_signal_base = _packet_offsets[findfirst(x->x==:completion_signa
 
 @generated function _completion_signal()
     @dispose ctx=Context() begin
-        T_int8 = LLVM.Int8Type(ctx)
-        T_int64 = LLVM.Int64Type(ctx)
+        T_int8 = LLVM.Int8Type()
+        T_int64 = LLVM.Int64Type()
         _as = convert(Int, AS.Constant)
         T_ptr_i8 = LLVM.PointerType(T_int8, _as)
         T_ptr_i64 = LLVM.PointerType(T_int64, _as)
@@ -24,8 +24,8 @@ const completion_signal_base = _packet_offsets[findfirst(x->x==:completion_signa
         mod = LLVM.parent(llvm_f)
 
         # generate IR
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
             # get the kernel dispatch pointer
@@ -34,7 +34,7 @@ const completion_signal_base = _packet_offsets[findfirst(x->x==:completion_signa
             ptr = call!(builder, intr_typ, intr)
 
             # load the index
-            signal_ptr_i8 = inbounds_gep!(builder, T_int8, ptr, [ConstantInt(completion_signal_base; ctx)])
+            signal_ptr_i8 = inbounds_gep!(builder, T_int8, ptr, [ConstantInt(completion_signal_base)])
             signal_ptr = bitcast!(builder, signal_ptr_i8, T_ptr_i64)
             signal = load!(builder, T_int64, signal_ptr)
             ret!(builder, signal)
diff --git a/src/device/gcn/helpers.jl b/src/device/gcn/helpers.jl
index 9d378df35..cc9c3efe3 100644
--- a/src/device/gcn/helpers.jl
+++ b/src/device/gcn/helpers.jl
@@ -6,18 +6,18 @@ _packet_offsets = fieldoffset.(HSA.KernelDispatchPacket, 1:length(_packet_names)
     @dispose ctx=Context() begin
         inp_exprs = [:( inp_args[$i] ) for i in 1:length(inp_args)]
         inp_types = [inp_args...]
-        out_type = convert(LLVMType, out_arg.parameters[1]; ctx)
+        out_type = convert(LLVMType, out_arg.parameters[1])
 
         # create function
         bool_types = map(x->x===Bool, inp_types)
-        T_bool = LLVM.Int1Type(ctx)
-        param_types = LLVMType[convert.(LLVMType, inp_types; ctx=ctx)...]
+        T_bool = LLVM.Int1Type()
+        param_types = LLVMType[convert.(LLVMType, inp_types)...]
         llvm_f, _ = create_function(out_type, param_types)
         mod = LLVM.parent(llvm_f)
 
         # generate IR
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
             # call the intrinsic
@@ -28,13 +28,13 @@ _packet_offsets = fieldoffset.(HSA.KernelDispatchPacket, 1:length(_packet_names)
             for idx in 1:length(param_types)
                 if bool_types[idx]
                     attrs = parameter_attributes(intr, idx)
-                    push!(attrs, EnumAttribute("zeroext", 0; ctx))
+                    push!(attrs, EnumAttribute("zeroext", 0))
                 end
             end
             params = map(x->bool_types[x[1]] ? trunc!(builder, x[2], T_bool) : x[2], enumerate(parameters(llvm_f)))
             value = call!(builder, intr_ftype, intr, [params...])
             if out_arg === Type{Bool}
-                ret!(builder, zext!(builder, value, convert(LLVMType, Bool; ctx)))
+                ret!(builder, zext!(builder, value, convert(LLVMType, Bool)))
             else
                 ret!(builder, value)
             end
diff --git a/src/device/gcn/hostcall.jl b/src/device/gcn/hostcall.jl
index d5086a966..a71b51b50 100644
--- a/src/device/gcn/hostcall.jl
+++ b/src/device/gcn/hostcall.jl
@@ -26,7 +26,7 @@ const DEVICE_ERR_SENTINEL = 5
 const HOST_ERR_SENTINEL = 6
 
 const DEFAULT_HOSTCALL_TIMEOUT = Ref{Union{Float64, Nothing}}(nothing)
-const DEFAULT_HOSTCALL_LATENCY = Ref{Float64}(0.01)
+const DEFAULT_HOSTCALL_LATENCY = 0.01
 
 include("hostcall_signal_helpers.jl")
 
@@ -35,14 +35,15 @@ include("hostcall_signal_helpers.jl")
 
 GPU-compatible struct for making hostcalls.
 """
-struct HostCall{RT,AT}
+struct HostCall{RT, AT}
     signal_handle::UInt64
-    buf_ptr::LLVMPtr{UInt8,AS.Global}
+    buf_ptr::LLVMPtr{UInt8, AS.Global}
     buf_len::UInt
 end
 
-function HostCall(RT::Type, AT::Type{<:Tuple}, signal_handle::UInt64;
-                  device=AMDGPU.device(), buf_len=nothing)
+function HostCall(
+    RT::Type, AT::Type{<:Tuple}, signal_handle::UInt64; buf_len = nothing,
+)
     if isnothing(buf_len)
         buf_len = 0
         for T in AT.parameters
@@ -52,274 +53,101 @@ function HostCall(RT::Type, AT::Type{<:Tuple}, signal_handle::UInt64;
     end
 
     buf_len = max(sizeof(UInt64), buf_len) # make room for return buffer pointer
-    buf = Mem.alloc(device, buf_len; coherent=true)
-    buf_ptr = LLVMPtr{UInt8,AS.Global}(Base.unsafe_convert(Ptr{UInt8}, buf))
+    buf = Mem.HostBuffer(buf_len, AMDGPU.HIP.hipHostAllocMapped)
+    buf_ptr = LLVMPtr{UInt8, AS.Global}(Base.unsafe_convert(Ptr{UInt8}, buf))
     host_signal_store!(HSA.Signal(signal_handle), READY_SENTINEL)
-    HostCall{RT,AT}(signal_handle, buf_ptr, buf_len)
+    HostCall{RT, AT}(signal_handle, buf_ptr, buf_len)
 end
 
-"Calls the host function stored in `hc` with arguments `args`."
-@inline function hostcall!(hc::HostCall, args...)
-    hostcall!(Val{:group}(), hc, args...)
+struct HostCallHolder
+    hc::HostCall
+    task::Task
+    finish::Ref{Bool}
+    continuous::Ref{Bool}
 end
 
-@inline function hostcall!(
-    ::Val{mode}, hc::HostCall{RT, AT}, args...,
-) where {mode, RT, AT}
-    hostcall_device_lock!(Val{mode}(), hc)
-    hostcall_device_write_args!(Val{mode}(), hc, args...)
-    return hostcall_device_trigger_and_return!(Val{mode}(), hc)
-end
-
-macro device_execution_gate(mode, exec_ex)
-    if mode isa QuoteNode
-        mode = mode.value::Symbol
-    end
-    @assert mode in (:grid, :group, :wave, :lane) "Invalid mode: $mode"
-    ex = Expr(:block)
-    if mode == :grid
-        push!(ex.args, quote
-            # Must be on first item of first group
-            if $workgroupIdx().x != 1 || $workitemIdx().x != 1
-                @goto gated_done
-            end
-        end)
-    elseif mode == :group
-        push!(ex.args, quote
-            # Must be on first item of each group
-            if $workitemIdx().x != 1
-                @goto gated_done
-            end
-        end)
-    elseif mode == :wave
-        push!(ex.args, quote
-            # Must be on first lane of each wavefront of each group
-            if Core.Intrinsics.urem_int($workitemIdx().x - UInt32(1),
-                                        $wavefrontsize()) != 0
-                @goto gated_done
-            end
-        end)
-    end
-    push!(ex.args, quote
-        $(esc(exec_ex))
-        @label gated_done
-        $sync_workgroup()
-    end)
-    return ex
-end
-
-@inline function hostcall_device_lock!(hc::HostCall)
-    hostcall_device_lock!(Val{:group}(), hc)
-end
-
-@inline @generated function hostcall_device_lock!(
-    ::Val{mode}, hc::HostCall,
-) where mode
-    return quote
-        @device_execution_gate $mode begin
-            # Acquire lock on hostcall buffer
-            hostcall_device_signal_wait_cas!(hc.signal_handle, READY_SENTINEL, DEVICE_LOCK_SENTINEL)
-        end
-    end
-end
-
-@inline function hostcall_device_write_args!(hc::HostCall, args...)
-    hostcall_device_write_args!(Val{:group}(), hc, args...)
-end
-
-@inline @generated function hostcall_device_write_args!(
-    ::Val{mode}, hc::HostCall{RT, AT}, args...,
-) where {mode, RT, AT}
-    ex = Expr(:block)
-
-    # Copy arguments into buffer
-    # Modified from CUDAnative src/device/cuda/dynamic_parallelism.jl
-    off = 1
-    for i in 1:length(args)
-        T = args[i]
-        sz = sizeof(T)
-        # FIXME: Use proper alignment
-        ptr = :(reinterpret(LLVMPtr{$T,AS.Global}, hc.buf_ptr+$off-1))
-        push!(ex.args, :(Base.unsafe_store!($ptr, args[$i])))
-        off += sz
-    end
-
-    return macroexpand(@__MODULE__, quote
-        @device_execution_gate $mode begin
-            $ex
-        end
-    end)
-end
-
-@inline function hostcall_device_trigger_and_return!(hc::HostCall)
-    hostcall_device_trigger_and_return!(Val{:group}(), hc::HostCall)
-end
-
-@inline @generated function hostcall_device_trigger_and_return!(::Val{mode}, hc::HostCall{RT, AT}) where {mode, RT, AT}
-    ex = Expr(:block)
-    @gensym shmem buf_ptr ret_ptr hostcall_return
-
-    push!(ex.args, quote
-        if $RT !== Nothing
-            # FIXME: This is not valid without the @inline
-            # $shmem = $alloc_local($hostcall_return, $RT, 1)
-            # But this is fine (if slower)
-            $shmem = $get_global_pointer($(Val{hostcall_return}()), $RT)
-        end
-
-        @device_execution_gate $mode begin
-            # Ensure arguments are written
-            $hostcall_device_signal_wait_cas!(hc.signal_handle, $DEVICE_LOCK_SENTINEL, $DEVICE_MSG_SENTINEL)
-            # Wait on host message
-            $hostcall_device_signal_wait(hc.signal_handle, $HOST_MSG_SENTINEL)
-            # Get return buffer and load first value
-            if $RT !== Nothing
-                $buf_ptr = reinterpret(LLVMPtr{LLVMPtr{$RT,AS.Global},AS.Global},hc.buf_ptr)
-                $ret_ptr = unsafe_load($buf_ptr)
-                if UInt64($ret_ptr) == 0
-                    $device_signal_store!(hc.signal_handle, $DEVICE_ERR_SENTINEL)
-                    $signal_exception()
-                    $trap()
-                end
-                unsafe_store!($shmem, unsafe_load($ret_ptr)::$RT)
-            end
-            $device_signal_store!(hc.signal_handle, $READY_SENTINEL)
-        end
-        if $RT !== Nothing
-            return unsafe_load($shmem)
-        else
-            return nothing
-        end
-    end)
-
-    return ex
-end
-
-@inline @generated function hostcall_device_args_size(args...)
-    sz = 0
-    for arg in args
-        sz += sizeof(arg)
-    end
-    return sz
-end
-
-@generated function hostcall_host_read_args(hc::HostCall{RT,AT}) where {RT,AT}
-    ex = Expr(:tuple)
-
-    # Copy arguments into buffer
-    off = 1
-    for i in 1:length(AT.parameters)
-        T = AT.parameters[i]
-        sz = sizeof(T)
-        # FIXME: Use correct alignment
-        push!(ex.args, quote
-            lref = Ref{$T}()
-            HSA.memory_copy(reinterpret(Ptr{Cvoid}, Base.unsafe_convert(Ptr{$T}, lref)),
-                            reinterpret(Ptr{Cvoid}, hc.buf_ptr+$off - 1),
-                            $sz) |> Runtime.check
-            lref[]
-        end)
-        off += sz
-    end
-
-    return ex
-end
-
-struct HostCallException <: Exception
-    reason::String
-    err::Union{Exception, Nothing}
-    bt::Union{Vector, Nothing}
-end
-
-HostCallException(reason) = HostCallException(reason, nothing, backtrace())
-
-HostCallException(reason, err) = HostCallException(reason, err, catch_backtrace())
-
-function Base.showerror(io::IO, err::HostCallException)
-    print(io, "HostCallException: $(err.reason)")
-    if err.err !== nothing || err.bt !== nothing
-        print(io, ":\n")
-    end
-    err.err !== nothing && Base.showerror(io, err.err)
-    err.bt !== nothing && Base.show_backtrace(io, err.bt)
-end
-
-const NAMED_PERDEVICE_HOSTCALLS = Dict{Runtime.ROCDevice, Dict{Symbol, HostCall}}()
-
-function named_perdevice_hostcall(func, device::Runtime.ROCDevice, name::Symbol)
-    lock(Runtime.RT_LOCK) do
-        hcs = get!(()->Dict{Symbol, HostCall}(), NAMED_PERDEVICE_HOSTCALLS, device)
-        return get!(func, hcs, name)
-    end
-end
+include("hostcall_utils.jl")
 
 """
-    HostCall(func, return_type::Type, arg_types::Type{Tuple}) -> HostCall
+    HostCallHolder(func, return_type::Type, arg_types::Type{Tuple}) -> HostCall
 
 Construct a `HostCall` that executes `func` with the arguments passed from the
-calling kernel. `func` must be passed arguments of types contained in
-`arg_types`, and must return a value of type `return_type`, or else the
-hostcall will fail with undefined behavior.
+calling kernel.
+
+`func` must be passed arguments of types contained in `arg_types`,
+and must return a value of type `return_type`,
+or else the hostcall will fail with undefined behavior.
 
 Note: This API is currently experimental and is subject to change at any time.
 """
-function HostCall(func::Base.Callable, rettype::Type, argtypes::Type{<:Tuple}; return_task::Bool = false,
-                  device=AMDGPU.device(), maxlat=DEFAULT_HOSTCALL_LATENCY[],
-                  timeout=nothing, continuous=false, buf_len=nothing)
-    # Create raw HSA signal to avoid ROCSignal finalizer
-    # being called too early in the HostCall task.
+function HostCallHolder(
+    func::Base.Callable, rettype::Type, argtypes::Type{<:Tuple};
+    timeout = nothing, continuous = false, buf_len = nothing,
+    maxlat = DEFAULT_HOSTCALL_LATENCY,
+)
     signal_ref = Ref{HSA.Signal}()
     HSA.signal_create(1, 0, C_NULL, signal_ref) |> Runtime.check
     signal = signal_ref[]
     AMDGPU.hsaref!()
 
-    hc = HostCall(rettype, argtypes, signal.handle; device, buf_len)
+    hc = HostCall(rettype, argtypes, signal.handle; buf_len)
+    finish_ref = Ref{Bool}(false)
+    continuous_ref = Ref{Bool}(continuous)
 
     tsk = Threads.@spawn begin
-        ret_buf = Ref{Mem.Buffer}()
+        ret_buf = Ref{Mem.HostBuffer}(Mem.HostBuffer())
         ret_len = 0
+
         try
             while true
-                if !hostcall_host_wait(signal; maxlat=maxlat, timeout=timeout)
-                    throw(HostCallException("Hostcall: Timeout on signal $signal"))
+                if !hostcall_host_wait(signal, finish_ref; maxlat, timeout)
+                    Runtime.RT_EXITING[] && break
+                    finish_ref[] && break
+                    throw(HostCallException("Timeout on signal $signal"))
                 end
-                if length(argtypes.parameters) > 0
-                    args = try
+
+                args = if isempty(argtypes.parameters)
+                    ()
+                else
+                    try
                         hostcall_host_read_args(hc)
                     catch err
                         throw(HostCallException("Error getting arguments", err))
                     end
-                    @debug "Hostcall: Got arguments of length $(length(args))"
-                else
-                    args = ()
                 end
+
                 ret = try
                     func(args...,)
                 catch err
                     throw(HostCallException("Error executing host function", err))
                 end
-                if typeof(ret) != rettype
-                    throw(HostCallException("Host function result of wrong type: $(typeof(ret)), expected $rettype"))
-                end
-                if !isbits(ret)
-                    throw(HostCallException("Host function result not isbits: $(typeof(ret))"))
-                end
-                @debug "Hostcall: Host function returning value of type $(typeof(ret))"
+
+                typeof(ret) == rettype || throw(HostCallException("""
+                    Host function result of wrong type:
+                        - returned: $(typeof(ret))
+                        - expected: $rettype
+                    """))
+                isbits(ret) || throw(HostCallException(
+                    "Host function result not isbits: $(typeof(ret))"))
+
                 try
-                    if isassigned(ret_buf) && (ret_len < sizeof(ret))
+                    if ret_buf[].ptr != C_NULL && ret_len < sizeof(ret)
                         Mem.free(ret_buf[])
                         ret_len = sizeof(ret)
-                        ret_buf[] = Mem.alloc(device, ret_len; coherent=true)
-                    elseif !isassigned(ret_buf)
+                        ret_buf[] = Mem.HostBuffer(ret_len, AMDGPU.HIP.hipHostAllocMapped)
+                    elseif ret_buf[].ptr == C_NULL
                         ret_len = sizeof(ret)
-                        ret_buf[] = Mem.alloc(device, ret_len; coherent=true)
+                        ret_buf[] = Mem.HostBuffer(ret_len, AMDGPU.HIP.hipHostAllocMapped)
                     end
+
                     ret_ref = Ref{rettype}(ret)
                     GC.@preserve ret_ref begin
                         ret_ptr = Base.unsafe_convert(Ptr{Cvoid}, ret_buf[])
                         if sizeof(ret) > 0
-                            src_ptr = reinterpret(Ptr{Cvoid}, Base.unsafe_convert(Ptr{rettype}, ret_ref))
-                            HSA.memory_copy(ret_ptr, src_ptr, sizeof(ret)) |> Runtime.check
+                            src_ptr = reinterpret(Ptr{Cvoid},
+                                Base.unsafe_convert(Ptr{rettype}, ret_ref))
+                            HSA.memory_copy(
+                                ret_ptr, src_ptr, sizeof(ret)) |> Runtime.check
                         end
 
                         args_buf_ptr = reinterpret(Ptr{Ptr{Cvoid}}, hc.buf_ptr)
@@ -327,10 +155,10 @@ function HostCall(func::Base.Callable, rettype::Type, argtypes::Type{<:Tuple}; r
                     end
                     host_signal_store!(signal, HOST_MSG_SENTINEL)
                 catch err
-                    throw(HostCallException("Error returning hostcall result", err))
+                    throw(HostCallException(
+                        "Error returning hostcall result", err))
                 end
-                @debug "Hostcall: Host function return completed"
-                continuous || break
+                continuous_ref[] || break
             end
         catch err
             # Gracefully terminate all waiters
@@ -342,17 +170,19 @@ function HostCall(func::Base.Callable, rettype::Type, argtypes::Type{<:Tuple}; r
                 rethrow(err)
             end
         finally
-            # We need to free the memory buffers, but first we need to ensure that
-            # the device has read from these buffers. Therefore we wait either for
-            # READY_SENTINEL or else an error signal.
+            # We need to free the memory buffers, but first we need
+            # to ensure that the device has read from these buffers.
+            # Therefore we wait either for READY_SENTINEL or else an error signal.
             while !Runtime.RT_EXITING[]
                 prev = host_signal_load(signal)
-                if prev == READY_SENTINEL || prev == HOST_ERR_SENTINEL || prev == DEVICE_ERR_SENTINEL
-                    if isassigned(ret_buf)
-                        Mem.free(ret_buf[])
-                    end
+                not_used =
+                    prev == READY_SENTINEL ||
+                    prev == HOST_ERR_SENTINEL ||
+                    prev == DEVICE_ERR_SENTINEL
+                if not_used
+                    Mem.free(ret_buf[]) # `free` checks for C_NULL.
                     buf_ptr = reinterpret(Ptr{Cvoid}, hc.buf_ptr)
-                    Mem.free(Mem.Buffer(buf_ptr, C_NULL, buf_ptr, 0, device, true, false))
+                    Mem.free(Mem.HostBuffer(buf_ptr, 0))
                     break
                 end
             end
@@ -360,29 +190,41 @@ function HostCall(func::Base.Callable, rettype::Type, argtypes::Type{<:Tuple}; r
             HSA.signal_destroy(signal) |> Runtime.check
             AMDGPU.hsaunref!()
         end
+        return
     end
-
-    if return_task
-        return hc, tsk
-    else
-        return hc
-    end
+    HostCallHolder(hc, tsk, finish_ref, continuous_ref)
 end
 
-function hostcall_host_wait(signal_handle::HSA.Signal; maxlat=DEFAULT_HOSTCALL_LATENCY[], timeout=DEFAULT_HOSTCALL_TIMEOUT[])
-    @debug "Hostcall: Waiting on signal $signal_handle"
+Adapt.adapt(to::Runtime.Adaptor, hc::HostCallHolder) = hc.hc
+
+non_continuous!(hc::HostCallHolder) = hc.continuous[] = false
+
+finish!(hc::HostCallHolder) = hc.finish[] = true
+
+Base.istaskdone(hc::HostCallHolder) = istaskdone(hc.task)
+
+function hostcall_host_wait(
+    signal_handle::HSA.Signal, finish_ref::Ref{Bool};
+    maxlat=DEFAULT_HOSTCALL_LATENCY, timeout=DEFAULT_HOSTCALL_TIMEOUT[],
+)
+    res::Bool = false
     start_time = time_ns()
+
     while !Runtime.RT_EXITING[]
+        finish_ref[] && break
         prev = host_signal_load(signal_handle)
+
+        # If device-sourced message is available,
+        # lock on host to prevent further writes from the device.
+        # If successfully locked on host, done waiting.
         if prev == DEVICE_MSG_SENTINEL
             prev = host_signal_cmpxchg!(
                 signal_handle, DEVICE_MSG_SENTINEL, HOST_LOCK_SENTINEL)
             if prev == DEVICE_MSG_SENTINEL
-                @debug "Hostcall: Device message on signal $signal_handle"
-                return true
+                res = true
+                break
             end
         elseif prev == DEVICE_ERR_SENTINEL
-            @debug "Hostcall: Device error on signal $signal_handle"
             throw(HostCallException("Device error on signal $signal_handle"))
         end
 
@@ -390,10 +232,13 @@ function hostcall_host_wait(signal_handle::HSA.Signal; maxlat=DEFAULT_HOSTCALL_L
             now_time = time_ns()
             diff_time = (now_time - start_time) / 10^9
             if diff_time > timeout
-                @debug "Hostcall: Signal wait timeout on signal $signal_handle"
-                return false
+                res = false
+                break
             end
         end
-        sleep(maxlat)
+
+        Libc.systemsleep(maxlat)
+        yield()
     end
+    return res
 end
diff --git a/src/device/gcn/hostcall_utils.jl b/src/device/gcn/hostcall_utils.jl
new file mode 100644
index 000000000..e2a95f207
--- /dev/null
+++ b/src/device/gcn/hostcall_utils.jl
@@ -0,0 +1,219 @@
+"Calls the host function stored in `hc` with arguments `args`."
+@inline function hostcall!(hc::HostCall, args...)
+    hostcall!(Val{:group}(), hc, args...)
+end
+
+@inline function hostcall!(
+    ::Val{mode}, hc::HostCall{RT, AT}, args...,
+) where {mode, RT, AT}
+    hostcall_device_lock!(Val{mode}(), hc)
+    hostcall_device_write_args!(Val{mode}(), hc, args...)
+    return hostcall_device_trigger_and_return!(Val{mode}(), hc)
+end
+
+macro device_execution_gate(mode, exec_ex)
+    if mode isa QuoteNode
+        mode = mode.value::Symbol
+    end
+    @assert mode in (:grid, :group, :wave, :lane) "Invalid mode: $mode"
+    ex = Expr(:block)
+    if mode == :grid
+        push!(ex.args, quote
+            # Must be on first item of first group
+            if $workgroupIdx().x != 1 || $workitemIdx().x != 1
+                @goto gated_done
+            end
+        end)
+    elseif mode == :group
+        push!(ex.args, quote
+            # Must be on first item of each group
+            if $workitemIdx().x != 1
+                @goto gated_done
+            end
+        end)
+    elseif mode == :wave
+        push!(ex.args, quote
+            # Must be on first lane of each wavefront of each group
+            is_not_first_lane = Core.Intrinsics.urem_int(
+                $workitemIdx().x - UInt32(1), $wavefrontsize()) != 0
+            if is_not_first_lane
+                @goto gated_done
+            end
+        end)
+    end
+    push!(ex.args, quote
+        $(esc(exec_ex))
+        @label gated_done
+        $sync_workgroup()
+    end)
+    return ex
+end
+
+@inline function hostcall_device_lock!(hc::HostCall)
+    hostcall_device_lock!(Val{:group}(), hc)
+end
+
+@inline @generated function hostcall_device_lock!(
+    ::Val{mode}, hc::HostCall,
+) where mode
+    return quote
+        @device_execution_gate $mode begin
+            # Acquire lock on hostcall buffer
+            hostcall_device_signal_wait_cas!(
+                hc.signal_handle, READY_SENTINEL, DEVICE_LOCK_SENTINEL)
+        end
+    end
+end
+
+@inline function hostcall_device_write_args!(hc::HostCall, args...)
+    hostcall_device_write_args!(Val{:group}(), hc, args...)
+end
+
+@inline @generated function hostcall_device_write_args!(
+    ::Val{mode}, hc::HostCall{RT, AT}, args...,
+) where {mode, RT, AT}
+    ex = Expr(:block)
+
+    # Copy arguments into buffer
+    # Modified from CUDAnative src/device/cuda/dynamic_parallelism.jl
+    off = 1
+    for i in 1:length(args)
+        T = args[i]
+        sz = sizeof(T)
+        # FIXME: Use proper alignment
+        ptr = :(reinterpret(LLVMPtr{$T,AS.Global}, hc.buf_ptr + $off - 1))
+        push!(ex.args, :(Base.unsafe_store!($ptr, args[$i])))
+        off += sz
+    end
+
+    return macroexpand(@__MODULE__, quote
+        @device_execution_gate $mode begin
+            $ex
+        end
+    end)
+end
+
+@inline function hostcall_device_trigger_and_return!(hc::HostCall)
+    hostcall_device_trigger_and_return!(Val{:group}(), hc::HostCall)
+end
+
+@inline @generated function hostcall_device_trigger_and_return!(
+    ::Val{mode}, hc::HostCall{RT, AT},
+) where {mode, RT, AT}
+    ex = Expr(:block)
+    @gensym shmem buf_ptr ret_ptr
+
+    push!(ex.args, quote
+        if $RT !== Nothing
+            $shmem = $alloc_local(:hostcall_return, $RT, 1)
+        end
+
+        @device_execution_gate $mode begin
+            # Ensure arguments are written
+            $hostcall_device_signal_wait_cas!(
+                hc.signal_handle, $DEVICE_LOCK_SENTINEL, $DEVICE_MSG_SENTINEL)
+            # Wait on host message
+            $hostcall_device_signal_wait(hc.signal_handle, $HOST_MSG_SENTINEL)
+            # Get return buffer and load first value
+            if $RT !== Nothing
+                $buf_ptr = reinterpret(LLVMPtr{LLVMPtr{$RT,AS.Global}, AS.Global}, hc.buf_ptr)
+                $ret_ptr = unsafe_load($buf_ptr)
+                if UInt64($ret_ptr) == 0
+                    $device_signal_store!(hc.signal_handle, $DEVICE_ERR_SENTINEL)
+                    $signal_exception()
+                    $trap()
+                end
+                unsafe_store!($shmem, unsafe_load($ret_ptr)::$RT)
+            end
+            $device_signal_store!(hc.signal_handle, $READY_SENTINEL)
+        end
+        if $RT !== Nothing
+            return unsafe_load($shmem)
+        else
+            return nothing
+        end
+    end)
+
+    return ex
+end
+
+@inline @generated function hostcall_device_args_size(args...)
+    sz = 0
+    for arg in args
+        sz += sizeof(arg)
+    end
+    return sz
+end
+
+@generated function hostcall_host_read_args(hc::HostCall{RT,AT}) where {RT,AT}
+    ex = Expr(:tuple)
+
+    # Copy arguments into buffer
+    off = 1
+    for i in 1:length(AT.parameters)
+        T = AT.parameters[i]
+        sz = sizeof(T)
+        # FIXME: Use correct alignment
+        push!(ex.args, quote
+            lref = Ref{$T}()
+            HSA.memory_copy(
+                reinterpret(Ptr{Cvoid}, Base.unsafe_convert(Ptr{$T}, lref)),
+                reinterpret(Ptr{Cvoid}, hc.buf_ptr + $off - 1), $sz) |> Runtime.check
+            lref[]
+        end)
+        off += sz
+    end
+
+    return ex
+end
+
+struct HostCallException <: Exception
+    reason::String
+    err::Union{Exception, Nothing}
+    bt::Union{Vector, Nothing}
+end
+
+HostCallException(reason) = HostCallException(reason, nothing, backtrace())
+
+HostCallException(reason, err) = HostCallException(reason, err, catch_backtrace())
+
+function Base.showerror(io::IO, err::HostCallException)
+    print(io, "HostCallException: $(err.reason)")
+    if err.err !== nothing || err.bt !== nothing
+        print(io, ":\n")
+    end
+    err.err !== nothing && Base.showerror(io, err.err)
+    err.bt !== nothing && Base.show_backtrace(io, err.bt)
+end
+
+const NAMED_PERDEVICE_HOSTCALLS = Dict{
+    HIP.HIPDevice, Dict{Symbol, Tuple{HostCallHolder, Mem.HostBuffer}}}()
+
+function named_perdevice_hostcall(func, dev::HIP.HIPDevice, name::Symbol)
+    Base.@lock Runtime.RT_LOCK begin
+        hcs = get!(
+            () -> Dict{Symbol, Tuple{HostCall, Mem.HostBuffer}}(),
+            NAMED_PERDEVICE_HOSTCALLS, dev)
+        get!(func, hcs, name)
+    end
+end
+
+# TODO rename
+function get_named_perdevice_hostcall(dev::HIP.HIPDevice, name::Symbol)
+    Base.@lock Runtime.RT_LOCK begin
+        hcs = get(
+            () -> Dict{Symbol, Tuple{HostCall, Mem.HostBuffer}}(),
+            NAMED_PERDEVICE_HOSTCALLS, dev)
+        get(hcs, name, nothing)
+    end
+end
+
+function remove_perdevice_hostcall!(dev::HIP.HIPDevice, name::Symbol)
+    Base.@lock Runtime.RT_LOCK begin
+        dev_hcs = get(NAMED_PERDEVICE_HOSTCALLS, dev, nothing)
+        isnothing(dev_hcs) && return
+
+        pop!(dev_hcs, name)
+        return
+    end
+end
diff --git a/src/device/gcn/indexing.jl b/src/device/gcn/indexing.jl
index 938557ebb..4c09e720c 100644
--- a/src/device/gcn/indexing.jl
+++ b/src/device/gcn/indexing.jl
@@ -2,15 +2,15 @@
 
 @generated function _index(::Val{fname}, ::Val{name}, ::Val{range}) where {fname, name, range}
     @dispose ctx=Context() begin
-        T_int32 = LLVM.Int32Type(ctx)
+        T_int32 = LLVM.Int32Type()
 
         # create function
         llvm_f, _ = create_function(T_int32)
         mod = LLVM.parent(llvm_f)
 
         # generate IR
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
             # call the indexing intrinsic
@@ -19,10 +19,9 @@
             idx = call!(builder, intr_typ, intr)
 
             # attach range metadata
-            range_metadata = MDNode([ConstantInt(UInt32(range.start); ctx),
-                                     ConstantInt(UInt32(range.stop); ctx)];
-                                    ctx)
-            metadata(idx)[LLVM.MD_range] = range_metadata
+            metadata(idx)[LLVM.MD_range] = MDNode([
+                ConstantInt(UInt32(range.start)),
+                ConstantInt(UInt32(range.stop))])
             ret!(builder, idx)
         end
 
@@ -32,14 +31,14 @@ end
 
 @generated function _dim(::Val{base}, ::Val{off}, ::Val{range}, ::Type{T}) where {base, off, range, T}
     @dispose ctx=Context() begin
-        T_int8 = LLVM.Int8Type(ctx)
-        T_int32 = LLVM.Int32Type(ctx)
+        T_int8 = LLVM.Int8Type()
+        T_int32 = LLVM.Int32Type()
 
         _as = convert(Int, AS.Constant)
         T_ptr_i8 = LLVM.PointerType(T_int8, _as)
         T_ptr_i32 = LLVM.PointerType(T_int32, _as)
 
-        T_T = convert(LLVMType, T; ctx)
+        T_T = convert(LLVMType, T)
         T_ptr_T = LLVM.PointerType(T_T, _as)
 
         # create function
@@ -47,8 +46,8 @@ end
         mod = LLVM.parent(llvm_f)
 
         # generate IR
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
             # get the kernel dispatch pointer
@@ -58,16 +57,15 @@ end
 
             # load the index
             offset = base + ((off - 1) * sizeof(T))
-            idx_ptr_i8 = inbounds_gep!(builder, T_int8, ptr, [ConstantInt(offset; ctx)])
+            idx_ptr_i8 = inbounds_gep!(builder, T_int8, ptr, [ConstantInt(offset)])
             idx_ptr_T = bitcast!(builder, idx_ptr_i8, T_ptr_T)
             idx_T = load!(builder, T_T, idx_ptr_T)
             idx = zext!(builder, idx_T, T_int32)
 
             # attach range metadata
-            range_metadata = MDNode([ConstantInt(T(range.start); ctx),
-                                     ConstantInt(T(range.stop); ctx)];
-                                    ctx)
-            metadata(idx_T)[LLVM.MD_range] = range_metadata
+            metadata(idx_T)[LLVM.MD_range] = MDNode([
+                ConstantInt(T(range.start)),
+                ConstantInt(T(range.stop))])
             ret!(builder, idx)
         end
 
diff --git a/src/device/gcn/memory_dynamic.jl b/src/device/gcn/memory_dynamic.jl
index 617b95e0a..a43f7e8c9 100644
--- a/src/device/gcn/memory_dynamic.jl
+++ b/src/device/gcn/memory_dynamic.jl
@@ -1,20 +1,20 @@
 export malloc, free
 
-malloc(sz) = device_malloc(sz)
-function device_malloc(sz::Csize_t)
-    malloc_gbl = get_global_pointer(Val(:__global_malloc_hostcall),
-                                    HostCall{Ptr{Cvoid},Tuple{UInt64,Csize_t}})
-    malloc_hc = Base.unsafe_load(malloc_gbl)
-    kern = _completion_signal()
-    ptr = hostcall!(malloc_hc, kern, sz)
-    return ptr
+# @device_function function dm_alloc(sz::Csize_t)
+#     ccall("extern __ockl_dm_alloc", llvmcall, Ptr{Cvoid}, (Csize_t,), sz)
+# end
+
+# @device_function function dm_free(ptr::Ptr{Cvoid})
+#     ccall("extern __ockl_dm_free", llvmcall, Nothing, (Csize_t,), ptr)
+# end
+
+function malloc(bytesize::Csize_t)::Ptr{Cvoid}
+    mhc = Base.unsafe_load(malloc_hc())
+    return hostcall!(mhc, bytesize)
 end
 
-free(ptr) = device_free(ptr)
-function device_free(ptr::Ptr{Cvoid})
-    free_gbl = get_global_pointer(Val(:__global_free_hostcall),
-                                  HostCall{Nothing,Tuple{UInt64,Ptr{Cvoid}}})
-    free_hc = Base.unsafe_load(free_gbl)
-    kern = _completion_signal()
-    hostcall!(free_hc, kern, ptr)
+function free(ptr::Ptr{Cvoid})::Nothing
+    fhc = Base.unsafe_load(free_hc())
+    hostcall!(fhc, ptr)
+    return
 end
diff --git a/src/device/gcn/memory_static.jl b/src/device/gcn/memory_static.jl
index 58afd8868..26df9d614 100644
--- a/src/device/gcn/memory_static.jl
+++ b/src/device/gcn/memory_static.jl
@@ -1,7 +1,9 @@
 "Allocates on-device memory statically from the specified address space."
-@generated function alloc_special(::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}, ::Val{zeroinit}=Val{false}()) where {id,T,as,len,zeroinit}
+@generated function alloc_special(
+    ::Val{id}, ::Type{T}, ::Val{as}, ::Val{len}, ::Val{zeroinit} = Val{false}(),
+) where {id,T,as,len,zeroinit}
     @dispose ctx=Context() begin
-        eltyp = convert(LLVMType, T; ctx)
+        eltyp = convert(LLVMType, T)
 
         # old versions of GPUArrays invoke _shmem with an integer id; make sure those are unique
         if !isa(id, String) || !isa(id, Symbol)
@@ -11,7 +13,7 @@
             id = "__zeroinit_" * id
         end
 
-        T_ptr_i8 = convert(LLVMType, LLVMPtr{T,as}; ctx)
+        T_ptr_i8 = convert(LLVMType, LLVMPtr{T,as})
 
         # create a function
         llvm_f, _ = create_function(T_ptr_i8)
@@ -30,17 +32,18 @@
             end
         end
 
-        # by requesting a larger-than-datatype alignment, we might be able to vectorize.
+        # By requesting a larger-than-datatype alignment,
+        # we might be able to vectorize.
         # TODO: Make the alignment configurable
         alignment!(gv, Base.max(32, Base.datatype_alignment(T)))
 
         # generate IR
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
             ptr_with_as = gep!(builder, gv_typ, gv,
-                [ConstantInt(0; ctx), ConstantInt(0; ctx)])
+                [ConstantInt(0), ConstantInt(0)])
             ptr = bitcast!(builder, ptr_with_as, T_ptr_i8)
             ret!(builder, ptr)
         end
@@ -49,8 +52,10 @@
     end
 end
 
-@inline alloc_local(id, T, len, zeroinit=false) = alloc_special(Val{id}(), T, Val{AS.Local}(), Val{len}(), Val{zeroinit}())
-@inline alloc_scratch(id, T, len) = alloc_special(Val{id}(), T, Val{AS.Private}(), Val{len}(), Val{false}())
+@inline alloc_local(id, T, len, zeroinit=false) =
+    alloc_special(Val{id}(), T, Val{AS.Local}(), Val{len}(), Val{zeroinit}())
+@inline alloc_scratch(id, T, len) =
+    alloc_special(Val{id}(), T, Val{AS.Private}(), Val{len}(), Val{false}())
 
 macro ROCStaticLocalArray(T, dims, zeroinit=true)
     zeroinit = zeroinit isa Expr ? zeroinit.args[1] : zeroinit
@@ -59,9 +64,12 @@ macro ROCStaticLocalArray(T, dims, zeroinit=true)
     @gensym id len
     quote
         $len = prod($(esc(dims)))
-        $ROCDeviceArray($(esc(dims)), $alloc_local($(QuoteNode(Symbol(:ROCStaticLocalArray_, id))), $(esc(T)), $len, $zeroinit))
+        $ROCDeviceArray($(esc(dims)),
+            $alloc_local($(QuoteNode(Symbol(:ROCStaticLocalArray_, id))),
+            $(esc(T)), $len, $zeroinit))
     end
 end
+
 macro ROCDynamicLocalArray(T, dims, zeroinit=true)
     if Base.libllvm_version < v"14"
         @warn "@ROCDynamicLocalArray is unsupported on LLVM <14\nUndefined behavior may result"
@@ -73,7 +81,9 @@ macro ROCDynamicLocalArray(T, dims, zeroinit=true)
     @gensym id DA
     quote
         let
-            $DA = $ROCDeviceArray($(esc(dims)), $alloc_local($(QuoteNode(Symbol(:ROCDynamicLocalArray_, id))), $(esc(T)), 0, $zeroinit))
+            $DA = $ROCDeviceArray($(esc(dims)),
+                $alloc_local($(QuoteNode(Symbol(:ROCDynamicLocalArray_, id))),
+                $(esc(T)), 0, $zeroinit))
             if $zeroinit
                 # Zeroinit doesn't work at the compiler level for dynamic LDS
                 # allocations, so zero it here
@@ -89,10 +99,10 @@ end
 
 @inline @generated function alloc_string(::Val{str}) where str
     @dispose ctx=Context() begin
-        T_pint8 = LLVM.PointerType(LLVM.Int8Type(ctx), AS.Global)
+        T_pint8 = LLVM.PointerType(LLVM.Int8Type(), AS.Global)
         llvm_f, _ = create_function(T_pint8)
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
             str_ptr = globalstring_ptr!(builder, String(str))
             ptr = addrspacecast!(builder, str_ptr, T_pint8)
@@ -105,18 +115,18 @@ end
 # TODO: Support various types of len
 @inline @generated function memcpy!(dest_ptr::LLVMPtr{UInt8,DestAS}, src_ptr::LLVMPtr{UInt8,SrcAS}, len::LT) where {DestAS,SrcAS,LT<:Union{Int64,UInt64}}
     @dispose ctx=Context() begin
-        T_nothing = LLVM.VoidType(ctx)
-        T_pint8_dest = convert(LLVMType, dest_ptr; ctx)
-        T_pint8_src = convert(LLVMType, src_ptr; ctx)
-        T_int64 = convert(LLVMType, len; ctx)
-        T_int1 = LLVM.Int1Type(ctx)
+        T_nothing = LLVM.VoidType()
+        T_pint8_dest = convert(LLVMType, dest_ptr)
+        T_pint8_src = convert(LLVMType, src_ptr)
+        T_int64 = convert(LLVMType, len)
+        T_int1 = LLVM.Int1Type()
 
         llvm_f, _ = create_function(T_nothing, [T_pint8_dest, T_pint8_src, T_int64])
         mod = LLVM.parent(llvm_f)
         T_intr = LLVM.FunctionType(T_nothing, [T_pint8_dest, T_pint8_src, T_int64, T_int1])
         intr = LLVM.Function(mod, "llvm.memcpy.p$(DestAS)i8.p$(SrcAS)i8.i64", T_intr)
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
             dest_ptr_i8 = parameters(llvm_f)[1]
@@ -131,18 +141,18 @@ memcpy!(dest_ptr::LLVMPtr{T,DestAS}, src_ptr::LLVMPtr{T,SrcAS}, len::Integer) wh
     memcpy!(reinterpret(LLVMPtr{UInt8,DestAS}, dest_ptr), reinterpret(LLVMPtr{UInt8,SrcAS}, src_ptr), UInt64(len))
 @inline @generated function memset!(dest_ptr::LLVMPtr{UInt8,DestAS}, value::UInt8, len::LT) where {DestAS,LT<:Union{Int64,UInt64}}
     @dispose ctx=Context() begin
-        T_nothing = LLVM.VoidType(ctx)
-        T_pint8_dest = convert(LLVMType, dest_ptr; ctx)
-        T_int8 = convert(LLVMType, value; ctx)
-        T_int64 = convert(LLVMType, len; ctx)
-        T_int1 = LLVM.Int1Type(ctx)
+        T_nothing = LLVM.VoidType()
+        T_pint8_dest = convert(LLVMType, dest_ptr)
+        T_int8 = convert(LLVMType, value)
+        T_int64 = convert(LLVMType, len)
+        T_int1 = LLVM.Int1Type()
 
         llvm_f, _ = create_function(T_nothing, [T_pint8_dest, T_int8, T_int64])
         mod = LLVM.parent(llvm_f)
         T_intr = LLVM.FunctionType(T_nothing, [T_pint8_dest, T_int8, T_int64, T_int1])
         intr = LLVM.Function(mod, "llvm.memset.p$(DestAS)i8.i64", T_intr)
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
             call!(builder, T_intr, intr, [parameters(llvm_f)[1], parameters(llvm_f)[2], parameters(llvm_f)[3], ConstantInt(T_int1, 0)])
diff --git a/src/device/gcn/output.jl b/src/device/gcn/output.jl
index f854b6ba0..667baae57 100644
--- a/src/device/gcn/output.jl
+++ b/src/device/gcn/output.jl
@@ -5,64 +5,11 @@ Base.unsafe_load(ptr::LLVMPtr{<:DeviceStaticString,AS.Global}) =
     unsafe_string(reinterpret(Cstring, ptr))
 Base.unsafe_store!(ptr::LLVMPtr{<:DeviceStaticString,AS.Global}, x) = nothing
 
-struct OutputContext{HC}
-    hostcall::HC
-end
-function OutputContext(io::IO=stdout; device=AMDGPU.device(), continuous=true, buf_len=2^16, name=nothing, kwargs...)
-    hc = if name !== nothing
-        named_perdevice_hostcall(device, name) do
-            create_output_context_hostcall(io; device, continuous, buf_len, kwargs...)
-        end
-    else
-        create_output_context_hostcall(io; device, continuous, buf_len, kwargs...)
-    end
-    return OutputContext(hc)
-end
-function create_output_context_hostcall(io; buf_len, kwargs...)
-    hc = HostCall(Int64, Tuple{LLVMPtr{DeviceStaticString{buf_len},AS.Global}}; buf_len, kwargs...) do bytes
-        str = unsafe_load(reinterpret(LLVMPtr{DeviceStaticString{buf_len},AS.Global}, hc.buf_ptr))
-        print(io, str)
-        return Int64(length(str))
-    end
-    return hc
-end
-
-const GLOBAL_OUTPUT_CONTEXT_TYPE = OutputContext{HostCall{Int64,Tuple{LLVMPtr{DeviceStaticString{2^16},AS.Global}}}}
-
-### macros
-
-macro rocprint(str...)
-    if first(str) isa String || Meta.isexpr(first(str), :string)
-        # No OutputContext
-        @gensym oc_ptr oc
-        ex = quote
-            $oc_ptr = $get_global_pointer($(Val(:__global_output_context)),
-                                          $GLOBAL_OUTPUT_CONTEXT_TYPE)
-            $oc = Base.unsafe_load($oc_ptr)
-        end
-        push!(ex.args, rocprint(oc, str...))
-        return esc(ex)
-    else
-        return esc(rocprint(first(str), str[2:end]...))
-    end
-end
-macro rocprintln(str...)
-    if first(str) isa String || Meta.isexpr(first(str), :string)
-        # No OutputContext
-        @gensym oc_ptr oc
-        ex = quote
-            $oc_ptr = $get_global_pointer($(Val(:__global_output_context)),
-                                                 $GLOBAL_OUTPUT_CONTEXT_TYPE)
-            $oc = Base.unsafe_load($oc_ptr)
-        end
-        push!(ex.args, rocprint(oc, str..., '\n'))
-        return esc(ex)
-    else
-        return esc(rocprint(first(str), str[2:end]..., '\n'))
-    end
-end
+const OUTPUT_CONTEXT_TYPE = HostCall{
+    Nothing, Tuple{LLVMPtr{DeviceStaticString{2^16}, AS.Global}}}
 
-### parse-time helpers
+const PRINTF_OUTPUT_CONTEXT_TYPE = HostCall{
+    Nothing, Tuple{LLVMPtr{UInt8, AS.Global}}}
 
 function rocprint(oc, str...)
     ex = Expr(:block)
@@ -74,82 +21,107 @@ function rocprint(oc, str...)
         @assert s.head == :string
         push!(strs, s)
     end
-    push!(ex.args, :($hostcall_device_lock!($oc.hostcall)))
+    push!(ex.args, :($hostcall_device_lock!($oc)))
     N = 1
+    # Write strings & null termination to hostcall buffer.
     for str in strs
         N = rocprint!(ex, N, oc, str)
     end
     rocprint!(ex, N, oc, '\0')
-    push!(ex.args, :($hostcall_device_trigger_and_return!($oc.hostcall)))
+    # Make host read args, execute function & wait for return.
+    push!(ex.args, :($hostcall_device_trigger_and_return!($oc)))
     push!(ex.args, :(nothing))
     return ex
 end
+
 function rocprint!(ex, N, oc, str::String)
     @gensym str_ptr
     push!(ex.args, :($str_ptr = $alloc_string($(Val(Symbol(str))))))
-    push!(ex.args, :($memcpy!($oc.hostcall.buf_ptr+$(N-1), $str_ptr, $(length(str)))))
-    return N+length(str)
+    push!(ex.args, :($memcpy!(
+        $oc.buf_ptr + $(N - 1), $str_ptr, $(length(str)))))
+    return N + length(str)
 end
+
 function rocprint!(ex, N, oc, char::Char)
     @assert length(codeunits(string(char))) == 1 "Multi-codeunit chars not yet implemented"
     byte = UInt8(char)
-    ptr = :(reinterpret($(LLVMPtr{UInt8,AS.Global}), $oc.hostcall.buf_ptr))
+    ptr = :(reinterpret($(LLVMPtr{UInt8, AS.Global}), $oc.buf_ptr))
     push!(ex.args, :(Base.unsafe_store!($ptr, $byte, $N)))
-    return N+1
+    return N + 1
 end
+
 function rocprint!(ex, N, oc, iex::Expr)
     for arg in iex.args
         N = rocprint!(ex, N, oc, arg)
     end
     return N
 end
-function rocprint!(ex, N, oc, sym::S) where S
+
+function rocprint!(ex, N, oc, ::S) where S
     error("Dynamic printing of $S only supported via @rocprintf")
 end
 
-## @rocprintf
-
-# Serializes execution of a function within a wavefront
-# From implementation by @jonathanvdc in CUDAnative.jl#419
-function wave_serialized(func::Function)
-    # Get the current thread's ID
-    thread_id = workitemIdx().x - 1
-
-    # Get the size of a wavefront
-    size = wavefrontsize()
+macro rocprint(str...)
+    if first(str) isa String || Meta.isexpr(first(str), :string)
+        # No OutputContext
+        @gensym oc_ptr oc
+        ex = quote
+            $oc_ptr = $output_context()
+            $oc = Base.unsafe_load($oc_ptr)
+        end
+        push!(ex.args, rocprint(oc, str...))
+        return esc(ex)
+    else
+        return esc(rocprint(first(str), str[2:end]...))
+    end
+end
 
-    local result
-    i = 0
-    while i < size
-        if thread_id % size == i
-            result = func()
+macro rocprintln(str...)
+    if first(str) isa String || Meta.isexpr(first(str), :string)
+        # No OutputContext
+        @gensym oc_ptr oc
+        ex = quote
+            $oc_ptr = $output_context()
+            $oc = Base.unsafe_load($oc_ptr)
         end
-        i += 1
+        push!(ex.args, rocprint(oc, str..., '\n'))
+        return esc(ex)
+    else
+        return esc(rocprint(first(str), str[2:end]..., '\n'))
     end
-    return result
 end
 
+# @rocprintf impementation.
+
 struct ROCPrintfBuffer end
+
 Base.sizeof(::ROCPrintfBuffer) = 0
-Base.unsafe_store!(::LLVMPtr{ROCPrintfBuffer,as} where as, x) = nothing
-function Base.unsafe_load(ptr::LLVMPtr{ROCPrintfBuffer,as} where as)
+
+Base.unsafe_store!(::LLVMPtr{ROCPrintfBuffer, AS.Global}, x) = nothing
+
+# TODO add docs about format.
+"""
+Read from the printf buffer on the host from HostCall task.
+"""
+function Base.unsafe_load(ptr::LLVMPtr{ROCPrintfBuffer, AS.Global})
     ptr = reinterpret(Ptr{UInt64}, ptr)
 
-    # Read number of argument blocks in buffer
+    # Read number of argument blocks in buffer.
     blocks = unsafe_load(ptr)
     ptr += sizeof(UInt64)
 
-    # Read pointer to format string
+    # Read pointer to format string.
     fmt_ptr = Ptr{UInt64}(unsafe_load(ptr))
     ptr += sizeof(UInt64)
-
-    # Read format string length
+    # Read format string length.
     fmt_len = unsafe_load(ptr)
     ptr += sizeof(UInt64)
 
-    # Read format string into host buffer
+    # Read format string into host buffer.
     fmt_buf = Vector{UInt8}(undef, fmt_len)
-    HSA.memory_copy(convert(Ptr{Cvoid}, pointer(fmt_buf)), convert(Ptr{Cvoid}, fmt_ptr), fmt_len) |> Runtime.check
+    HSA.memory_copy(
+        convert(Ptr{Cvoid}, pointer(fmt_buf)),
+        convert(Ptr{Cvoid}, fmt_ptr), fmt_len) |> Runtime.check
     fmt = String(fmt_buf)
 
     # Read arguments
@@ -166,8 +138,7 @@ function Base.unsafe_load(ptr::LLVMPtr{ROCPrintfBuffer,as} where as)
                 break
             end
             T = unsafe_pointer_to_objref(T_ptr)
-
-            # Read argument
+            # Read argument.
             arg = unsafe_load(reinterpret(Ptr{T}, ptr))
             push!(args, arg)
             ptr += sizeof(arg)
@@ -176,59 +147,82 @@ function Base.unsafe_load(ptr::LLVMPtr{ROCPrintfBuffer,as} where as)
         block += 1
     end
 
-    return (fmt, all_args)
+    return fmt, all_args
 end
 
-function _rocprintf_fmt(ptr, fmt_ptr, fmt_len)
+function _rocprintf_fmt(ptr::LLVMPtr{UInt64, AS.Global}, fmt_ptr, fmt_len::Int64)
     unsafe_store!(ptr, reinterpret(UInt64, fmt_ptr))
     ptr += sizeof(UInt64)
     unsafe_store!(ptr, UInt64(fmt_len))
     ptr += sizeof(UInt64)
     return ptr
 end
-@generated function pointer_from_type(::Type{T}) where T
-    ptr = pointer_from_objref(T)
-    return UInt64(ptr)
+
+function _pointer_from_type(::Type{T}) where T
+    UInt64(pointer_from_objref(T))
 end
-function _rocprintf_arg(ptr, arg::T) where T
-    T_ptr = pointer_from_type(T)
-    unsafe_store!(ptr, T_ptr)
+
+function _rocprintf_arg(ptr::LLVMPtr{UInt64, AS.Global}, arg::T) where T
+    unsafe_store!(ptr, _pointer_from_type(T))
     ptr += sizeof(UInt64)
-    unsafe_store!(reinterpret(LLVMPtr{T,1}, ptr), arg)
+
+    unsafe_store!(reinterpret(LLVMPtr{T, AS.Global}, ptr), arg)
     ptr += sizeof(arg)
-    #= FIXME
-    ref_arg = Ref{T}(arg)
-    GC.@preserve ref_arg begin
-    ptr_arg = convert(DevicePtr{UInt8,AS.Global},
-                      convert(DevicePtr{T,AS.Global},
-                      Base.unsafe_convert(Ptr{T}, ref_arg)))
-    memcpy!(ptr, ptr_arg, sizeof(arg), Val(true))
-    end
-    =#
     return ptr
 end
-#= TODO: Not really useful until we can work with device-side strings
-function _rocprintf_string(ptr, str::String)
-    @gensym T_str T_str_len str_ptr
-    quote
-        $T_str, $T_str_len = AMDGPU._rocprintf_T_str(String)
-        AMDGPU.Device.memcpy!($ptr, $T_str, $T_str_len)
-        $ptr += $T_str_len
-        unsafe_store!($ptr, UInt8(0))
-        $ptr += 1
-        $str_ptr = Base.unsafe_convert(DevicePtr{UInt8,AS.Generic}, $str_ptr)
-        $str_ptr = AMDGPU.Device.alloc_string($(Val(Symbol(str))))
-        AMDGPU.Device.memcpy!($ptr, $str_ptr, $(length(str)))
-        $ptr += $(length(str))
-        $ptr
-    end
-end
-@generated function _rocprintf_T_str(::Type{T}) where T
-    quote
-        (AMDGPU.Device.alloc_string($(Val(Symbol(repr(T))))), $(sizeof(repr(T))))
-    end
-end
-=#
+
+# macro rocprintf(args...)
+#     mode = :group
+#     @assert first(args) isa Union{QuoteNode,String} "First argument must be an inline Symbol or String"
+#     if first(args) isa QuoteNode
+#         mode = args[1].value::Symbol
+#         args = args[2:end]
+#         @assert mode isa Symbol "Execution mode must be a Symbol"
+#         @assert mode in (:grid, :group, :wave, :lane) "Invalid execution mode: $mode"
+#     end
+
+#     @assert first(args) isa String "@rocprintf format-string must be a String"
+#     fmt = args[1]
+#     args = args[2:end]
+
+#     @gensym printf_hc device_ptr device_fmt_ptr write_size
+#     ex = quote
+#         # Load printf HostCall.
+#         $printf_hc = Base.unsafe_load($printf_output_context())
+#         $device_ptr = reinterpret(
+#             $(LLVMPtr{UInt64, AS.Global}), $printf_hc.buf_ptr)
+#         # Allocate device-side format pointer.
+#         $device_fmt_ptr = $alloc_string($(Val(Symbol(fmt))))
+#         # Lock hostcall buffer.
+#         $hostcall_device_lock!($printf_hc)
+#         # Write block count.
+#         Base.unsafe_store!($device_ptr, UInt64(1)) # TODO take into account mode
+#         $device_ptr += sizeof(UInt64)
+#         # Write fmt string pointer & its bytesize.
+#         $device_ptr = $_rocprintf_fmt(
+#             $device_ptr, $device_fmt_ptr, $(sizeof(fmt)))
+#         # Calculate total write size per args block.
+#         $write_size =
+#             $hostcall_device_args_size($(map(esc, args)...)) + # Space for arguments.
+#             $(length(args)) * sizeof(UInt64) + # Space for type tags. # TODO what if args are less than uint64?
+#             sizeof(UInt64) # Space for terminator.
+#         # TODO account for offset for different modes.
+#     end
+
+#     # Write arguments & terminating null word.
+#     ex_args = Expr(:block)
+#     for arg in args
+#         push!(ex_args.args, :($device_ptr = $_rocprintf_arg(
+#             $device_ptr, $(esc(arg)))))
+#     end
+#     push!(ex_args.args, :(unsafe_store!($device_ptr, 0)))
+#     push!(ex.args, :(@device_execution_gate $mode $ex_args))
+
+#     # Submit & unlock hostcall buffer.
+#     push!(ex.args, :($hostcall_device_trigger_and_return!($printf_hc)))
+#     push!(ex.args, :(nothing))
+#     ex
+# end
 
 function unsafe_ceil(x, y)
     up = Core.Intrinsics.urem_int(x, y) > 0
@@ -237,13 +231,16 @@ end
 
 macro rocprintf(args...)
     mode = :group
-    @assert first(args) isa Union{QuoteNode,String} "First argument must be an inline Symbol or String"
+    @assert first(args) isa Union{QuoteNode,String}
+        "First argument must be an inline Symbol or String"
     if first(args) isa QuoteNode
         mode = args[1].value::Symbol
         args = args[2:end]
         @assert mode isa Symbol "Execution mode must be a Symbol"
-        @assert mode in (:grid, :group, :wave, :lane) "Invalid execution mode: $mode"
+        @assert mode in (:grid, :group, :wave, :lane)
+            "Invalid execution mode: $mode"
     end
+
     @assert first(args) isa String "Format must be a String"
     fmt = args[1]
     args = args[2:end]
@@ -264,10 +261,9 @@ macro rocprintf(args...)
     push!(ex.args, :($device_fmt_ptr = $alloc_string($(Val(Symbol(fmt))))))
 
     # Load HostCall object
-    push!(ex.args, :($printf_hc = unsafe_load($get_global_pointer(Val(:__global_printf_context),
-                                                                        HostCall{Int64,Tuple{LLVMPtr{ROCPrintfBuffer,AS.Global}}}))))
-    push!(ex.args, :($device_ptr = reinterpret($(LLVMPtr{UInt64,AS.Global}), $printf_hc.buf_ptr)))
-
+    push!(ex.args, :($printf_hc = unsafe_load($printf_output_context())))
+    push!(ex.args, :($device_ptr = reinterpret(
+        $(LLVMPtr{UInt64,AS.Global}), $printf_hc.buf_ptr)))
     # Lock hostcall buffer
     push!(ex.args, :($hostcall_device_lock!($printf_hc)))
 
@@ -278,21 +274,24 @@ macro rocprintf(args...)
     elseif mode == :group
         push!(ex.args, :(unsafe_store!($device_ptr, UInt64(1))))
     elseif mode == :wave
-        waves_per_group = :($unsafe_ceil($workgroupDim().x,
-                                         $wavefrontsize()))
-        push!(ex.args, :(unsafe_store!($device_ptr, Base.unsafe_trunc(UInt64, $waves_per_group))))
+        waves_per_group = :($unsafe_ceil($workgroupDim().x, $wavefrontsize()))
+        push!(ex.args, :(unsafe_store!(
+            $device_ptr, Base.unsafe_trunc(UInt64, $waves_per_group))))
     elseif mode == :lane
-        push!(ex.args, :(unsafe_store!($device_ptr, Base.unsafe_trunc(UInt64, $workgroupDim().x))))
+        push!(ex.args, :(unsafe_store!(
+            $device_ptr, Base.unsafe_trunc(UInt64, $workgroupDim().x))))
     end
     push!(ex.args, :($device_ptr += sizeof(UInt64)))
 
     # Write format string pointer
-    push!(ex.args, :($device_ptr = $_rocprintf_fmt($device_ptr, $device_fmt_ptr, $(sizeof(fmt)))))
+    push!(ex.args, :($device_ptr = $_rocprintf_fmt(
+        $device_ptr, $device_fmt_ptr, $(sizeof(fmt)))))
 
     # Calculate total write size per args block
-    push!(ex.args, :($write_size = $hostcall_device_args_size($(map(esc, args)...)) # Space for arguments
-                                 + $(length(args))*sizeof(UInt64) + # Space for type tags
-                                 + sizeof(UInt64))) # Space for terminator
+    push!(ex.args, :($write_size =
+        $hostcall_device_args_size($(map(esc, args)...)) # Space for arguments
+        + $(length(args))*sizeof(UInt64) + # Space for type tags
+        + sizeof(UInt64))) # Space for terminator
 
     # Calulate offset into buffer
     # FIXME: Use y and z dims
@@ -301,8 +300,8 @@ macro rocprintf(args...)
     elseif mode == :group
         :(0)
     elseif mode == :wave
-        wave_idx = :(Core.Intrinsics.udiv_int($workitemIdx().x - UInt32(1),
-                                              $wavefrontsize()))
+        wave_idx = :(Core.Intrinsics.udiv_int(
+            $workitemIdx().x - UInt32(1), $wavefrontsize()))
         :($wave_idx * $write_size)
     elseif mode == :lane
         lane_idx = :(workitemIdx().x - 1)
@@ -313,7 +312,8 @@ macro rocprintf(args...)
     # Write arguments and terminating null word
     ex_args = Expr(:block)
     for arg in args
-        push!(ex_args.args, :($device_ptr = $_rocprintf_arg($device_ptr, $(esc(arg)))))
+        push!(ex_args.args, :($device_ptr = $_rocprintf_arg(
+            $device_ptr, $(esc(arg)))))
     end
     push!(ex_args.args, :(unsafe_store!($device_ptr, 0)))
     push!(ex.args, :(@device_execution_gate $mode $ex_args))
@@ -323,3 +323,57 @@ macro rocprintf(args...)
     push!(ex.args, :(nothing))
     ex
 end
+
+@inline _to_linear(w, h, i, j, k) =
+    i + w * (j - 1 + ((k - 1) * h))
+
+macro ⊡(exec_ex)
+    @gensym x y z value
+    quote
+        $x = $workitemIdx().x + ($workgroupIdx().x - UInt32(1)) * $workgroupDim().x
+        $y = $workitemIdx().y + ($workgroupIdx().y - UInt32(1)) * $workgroupDim().y
+        $z = $workitemIdx().z + ($workgroupIdx().z - UInt32(1)) * $workgroupDim().z
+        $value = _to_linear(
+            UInt64($gridItemDim().x), UInt64($gridItemDim().y),
+            UInt64($x), UInt64($y), UInt64($z))
+        if gate!($value)
+            $(esc(exec_ex))
+        end
+    end
+end
+
+macro errprintf(args...)
+    fmt, args = args[1], args[2:end]
+    @assert fmt isa String "@errprintf format-string must be a String: $fmt."
+
+    @gensym buffer_ptr device_fmt_str write_size
+    err_ex = quote
+        $buffer_ptr = $err_buffer!()
+        reinterpret(UInt64, $buffer_ptr) == 0 && return
+        $device_fmt_str = $alloc_string($(Val(Symbol(fmt))))
+        # Write block count (compat with printf, not used).
+        Base.unsafe_store!($buffer_ptr, UInt64(1))
+        $buffer_ptr += sizeof(UInt64)
+        # Write fmt string pointer & its bytesize.
+        $buffer_ptr = $_rocprintf_fmt(
+            $buffer_ptr, $device_fmt_str, $(sizeof(fmt)))
+        # Calculate total write size per args block.
+        $write_size =
+            $hostcall_device_args_size($(map(esc, args)...)) + # Space for arguments.
+            $(length(args)) * sizeof(UInt64) + # Space for type tags.
+            sizeof(UInt64) # Space for terminator.
+    end
+
+    # Write arguments & terminating null word.
+    for arg in args
+        push!(err_ex.args,
+            :($buffer_ptr = $_rocprintf_arg($buffer_ptr, $(esc(arg)))))
+    end
+    push!(err_ex.args, :(unsafe_store!($buffer_ptr, 0)))
+
+    # Pass through ⊡ gate.
+    ex = Expr(:block)
+    push!(ex.args, :(@⊡ $err_ex))
+    push!(ex.args, :(nothing))
+    ex
+end
diff --git a/src/device/globals.jl b/src/device/globals.jl
index 5b6936460..125489965 100644
--- a/src/device/globals.jl
+++ b/src/device/globals.jl
@@ -5,8 +5,8 @@
 # space.
 @inline @generated function get_global_pointer(::Val{global_name}, ::Type{T})::LLVMPtr{T} where {global_name, T}
     @dispose ctx=Context() begin
-        T_global = convert(LLVMType, T; ctx)
-        T_result = convert(LLVMType, Ptr{T}; ctx)
+        T_global = convert(LLVMType, T)
+        T_result = convert(LLVMType, Ptr{T})
 
         # Create a thunk that computes a pointer to the global.
         llvm_f, _ = create_function(T_result)
@@ -27,8 +27,8 @@
         end
 
         # Generate IR that computes the global's address.
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
             position!(builder, entry)
 
             # Cast the global variable's type to the result type.
diff --git a/src/device/quirks.jl b/src/device/quirks.jl
index 0322ebbb3..f85dfc69e 100644
--- a/src/device/quirks.jl
+++ b/src/device/quirks.jl
@@ -1,56 +1,57 @@
 # Copied from CUDA.jl/src/device/quirks.jl
 
-macro print_and_throw(arg)
+macro print_and_throw(description)
     quote
-        str = $alloc_string($(Val(Symbol(arg))))
-        $device_report_exception(reinterpret(Ptr{Cchar}, str))
-        # FIXME: Report exception frames
-        signal_exception()
+        # FIXME
+        # all functions that take part in exception reporting are not inlined
+        # @errprintf($description)
+        throw(nothing)
     end
 end
 
 # math.jl
-@device_override @noinline Base.Math.throw_complex_domainerror(f::Symbol, x) =
-    @print_and_throw "This operation requires a complex input to return a complex result"
-@device_override @noinline Base.Math.throw_exp_domainerror(f::Symbol, x) =
-    @print_and_throw "Exponentiation yielding a complex result requires a complex argument"
+@device_override Base.Math.throw_complex_domainerror(f::Symbol, x) =
+    @print_and_throw "This operation requires a complex input to return a complex result.\n"
+@device_override Base.Math.throw_exp_domainerror(f::Symbol, x) =
+    @print_and_throw "Exponentiation yielding a complex result requires a complex argument.\n"
 
 # intfuncs.jl
-@device_override @noinline Base.throw_domerr_powbysq(::Any, p) =
-    @print_and_throw "Cannot raise an integer to a negative power"
-@device_override @noinline Base.throw_domerr_powbysq(::Integer, p) =
-    @print_and_throw "Cannot raise an integer to a negative power"
-@device_override @noinline Base.throw_domerr_powbysq(::AbstractMatrix, p) =
-    @print_and_throw "Cannot raise an integer to a negative power"
-@device_override @noinline Base.__throw_gcd_overflow(a, b) =
-    @print_and_throw "gcd overflow"
+@device_override Base.throw_domerr_powbysq(::Any, p) =
+    @print_and_throw "Cannot raise an integer to a negative power.\n"
+@device_override Base.throw_domerr_powbysq(::Integer, p) =
+    @print_and_throw "Cannot raise an integer to a negative power.\n"
+@device_override Base.throw_domerr_powbysq(::AbstractMatrix, p) =
+    @print_and_throw "Cannot raise an integer to a negative power.\n"
+@device_override Base.__throw_gcd_overflow(a, b) =
+    @print_and_throw "GCD overflow.\n"
 
 # checked.jl
-@device_override @noinline Base.Checked.throw_overflowerr_binaryop(op, x, y) =
-    @print_and_throw "Binary operation overflowed"
-@device_override @noinline Base.Checked.throw_overflowerr_negation(op, x, y) =
-    @print_and_throw "Negation overflowed"
+@device_override Base.Checked.throw_overflowerr_binaryop(op, x, y) =
+    @print_and_throw "Binary operation overflowed.\n"
+@device_override Base.Checked.throw_overflowerr_negation(op, x, y) =
+    @print_and_throw "Negation overflowed.\n"
 @device_override function Base.Checked.checked_abs(x::Base.Checked.SignedInt)
-    r = ifelse(x<0, -x, x)
-    r<0 && @print_and_throw("checked arithmetic: cannot compute |x|")
+    r = ifelse(x < 0, -x, x)
+    r < 0 && @print_and_throw("checked arithmetic: cannot compute |x|.\n")
     r
 end
 
 # boot.jl
-@device_override @noinline Core.throw_inexacterror(f::Symbol, ::Type{T}, val) where {T} =
-    @print_and_throw "Inexact conversion"
+@device_override Core.throw_inexacterror(f::Symbol, ::Type{T}, val) where {T} =
+    @print_and_throw "Inexact conversion.\n"
 
 # abstractarray.jl
-@device_override @noinline Base.throw_boundserror(A, I) =
-    @print_and_throw "Out-of-bounds array access"
+@device_override Base.throw_boundserror(A, I) =
+    @print_and_throw "Out-of-bounds array access.\n"
 
 # trig.jl
-@device_override @noinline Base.Math.sincos_domain_error(x) =
-    @print_and_throw "sincos(x) is only defined for finite x."
+@device_override Base.Math.sincos_domain_error(x) =
+    @print_and_throw "sincos(x) is only defined for finite x.\n"
 
 # multidimensional.jl
-@device_override Base.@propagate_inbounds function Base.getindex(iter::CartesianIndices{N,R},
-                                                                 I::Vararg{Int, N}) where {N,R}
+@device_override Base.@propagate_inbounds function Base.getindex(
+    iter::CartesianIndices{N,R}, I::Vararg{Int, N},
+) where {N,R}
     @boundscheck checkbounds(iter, I...)
     index = map(iter.indices, I) do r, i
         @inbounds getindex(r, i)
@@ -60,30 +61,31 @@ end
 
 # range.jl
 @eval begin
-    @device_override function Base.StepRangeLen{T,R,S,L}(ref::R, step::S, len::Integer,
-                                                         offset::Integer=1) where {T,R,S,L}
+    @device_override function Base.StepRangeLen{T,R,S,L}(
+        ref::R, step::S, len::Integer, offset::Integer=1,
+    ) where {T,R,S,L}
         if T <: Integer && !isinteger(ref + step)
-            @print_and_throw("StepRangeLen{<:Integer} cannot have non-integer step")
+            @print_and_throw("StepRangeLen{<:Integer} cannot have non-integer step.\n")
         end
         len = convert(L, len)
-        len >= zero(len) || @print_and_throw("StepRangeLen length cannot be negative")
+        len >= zero(len) || @print_and_throw("StepRangeLen length cannot be negative.\n")
         offset = convert(L, offset)
         L1 = oneunit(typeof(len))
-        L1 <= offset <= max(L1, len) || @print_and_throw("StepRangeLen: offset must be in [1,...]")
-        $(
-            Expr(:new, :(StepRangeLen{T,R,S,L}), :ref, :step, :len, :offset)
-        )
+        L1 <= offset <= max(L1, len) || @print_and_throw("StepRangeLen: offset must be in [1,...].\n")
+        $(Expr(:new, :(StepRangeLen{T,R,S,L}), :ref, :step, :len, :offset))
     end
 end
 
 # LinearAlgebra
 @static if VERSION >= v"1.8-"
-    @device_override function Base.setindex!(D::LinearAlgebra.Diagonal, v, i::Int, j::Int)
+    @device_override function Base.setindex!(
+        D::LinearAlgebra.Diagonal, v, i::Int, j::Int,
+    )
         @boundscheck checkbounds(D, i, j)
         if i == j
             @inbounds D.diag[i] = v
         elseif !iszero(v)
-            @print_and_throw("cannot set off-diagonal entry to a nonzero value")
+            @print_and_throw("Cannot set off-diagonal entry to a nonzero value.\n")
         end
         return v
     end
diff --git a/src/device/runtime.jl b/src/device/runtime.jl
index cc0a037d2..3cf521c8a 100644
--- a/src/device/runtime.jl
+++ b/src/device/runtime.jl
@@ -1,84 +1,135 @@
+using Core: LLVMPtr
 # ROCm-specific runtime libraries
 
-
 ## GPU runtime library
 
 # reset the runtime cache from global scope, so that any change triggers recompilation
 GPUCompiler.reset_runtime()
 
-signal_exception() = device_signal_exception()
-function device_signal_exception()
-    flag_ptr = get_global_pointer(Val(:__global_exception_flag), Int64)
-    unsafe_store!(flag_ptr, 1)
+@inline @generated kernel_state() = GPUCompiler.kernel_state_value(AMDGPU.KernelState)
+
+exception_flag() = kernel_state().exception_flag
+
+function err_buffer!()
+    st = kernel_state()
+    counter_ptr = reinterpret(LLVMPtr{Int32, AS.Global}, st.buffers_counter)
+    idx = atomic_add!(counter_ptr, Int32(1)) + Int32(1)
+    idx > st.n_buffers && return reinterpret(LLVMPtr{UInt64, AS.Global}, 0)
+
+    buf = unsafe_load(st.buffers, idx)
+    reinterpret(LLVMPtr{UInt64, AS.Global}, buf)
+end
+
+function err_str_buffer!()
+    st = kernel_state()
+    counter_ptr = reinterpret(LLVMPtr{Int32, AS.Global}, st.str_buffers_counter)
+    idx = atomic_add!(counter_ptr, Int32(1)) + Int32(1)
+    idx > st.n_str_buffers && return reinterpret(LLVMPtr{UInt8, AS.Global}, 0)
+
+    buf = unsafe_load(kernel_state().string_buffers, idx)
+    reinterpret(LLVMPtr{UInt8, AS.Global}, buf)
+end
+
+function gate!(value::UInt64)::Bool
+    gate_ptr = reinterpret(LLVMPtr{UInt64, AS.Global}, kernel_state().gate)
+    old_value = atomic_cas!(gate_ptr, UInt64(0), value)
+    ifelse(iszero(old_value), true, value == old_value)
+end
+
+function output_context()
+    ptr = convert(Ptr{OUTPUT_CONTEXT_TYPE}, kernel_state().output_context)
+
+    x = alloc_local(:__print_hostcall, UInt64, 1)
+    unsafe_store!(x, reinterpret(UInt64, ptr))
+    return ptr
+end
+
+function printf_output_context()
+    ptr = convert(
+        Ptr{PRINTF_OUTPUT_CONTEXT_TYPE},
+        kernel_state().printf_output_context)
 
-    # stop this wavefront
+    x = alloc_local(:__printf_hostcall, UInt64, 1)
+    unsafe_store!(x, reinterpret(UInt64, ptr))
+    return ptr
+end
+
+function malloc_hc()
+    ptr = convert(
+        Ptr{HostCall{Ptr{Cvoid}, Tuple{Csize_t}}},
+        kernel_state().malloc_hc)
+
+    # FIXME
+    # Hack to detect when global malloc hostcall is used.
+    # Create global variable and write pointer to it to prevent it
+    # from being optimized away.
+    x = alloc_local(:__malloc_hostcall, UInt64, 1)
+    unsafe_store!(x, reinterpret(UInt64, ptr))
+    return ptr
+end
+
+function free_hc()
+    ptr = convert(
+        Ptr{HostCall{Nothing, Tuple{Ptr{Cvoid}}}},
+        kernel_state().free_hc)
+
+    x = alloc_local(:__free_hostcall, UInt64, 1)
+    unsafe_store!(x, reinterpret(UInt64, ptr))
+    return ptr
+end
+
+function signal_exception()
+    unsafe_store!(exception_flag(), Int32(1))
+    # Without endpgm we'll get hardware exception.
     endpgm()
-    trap()
+    return
 end
 
-function device_string_to_host(ex)
-    # We get a ReadOnlyMemoryError on the host without making a copy because the data is pinned to the device
-    ex_ptr = reinterpret(LLVMPtr{UInt8,1}, ex)
-    ex_len = string_length(ex_ptr)
-    # TODO: Don't use an expensive host malloc
-    ex_str = reinterpret(LLVMPtr{UInt8,1}, device_malloc(Csize_t(ex_len+1)))
-    if reinterpret(UInt64, ex_str) == 0
-        @rocprintf("Device-to-host string conversion failed\n")
-        return reinterpret(Cstring, 0)
-    end
-    memcpy!(ex_str, ex_ptr, ex_len)
-    unsafe_store!(ex_str+ex_len, UInt8(0))
-    reinterpret(Cstring, ex_str)
+function err_device_string_to_host(str::Ptr{Cchar})
+    host_str = reinterpret(LLVMPtr{UInt8, AS.Global}, C_NULL)
+    @⊡ host_str = err_str_buffer!()
+    reinterpret(UInt64, host_str) == 0 && return reinterpret(Cstring, 0)
+
+    str_ptr = reinterpret(LLVMPtr{UInt8, AS.Global}, str)
+    str_len = string_length(str_ptr)
+
+    # Copy `ex` to allocated memory & null termination.
+    memcpy!(host_str, str_ptr, str_len)
+    unsafe_store!(host_str + str_len, UInt8(0))
+    return reinterpret(Cstring, host_str)
+end
+
+function report_oom(sz::Csize_t)
+    # @errprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz)
+    return
 end
 
-report_exception(ex) = device_report_exception(ex)
-function device_report_exception(ex::Ptr{Cchar})
-    # Add kernel ID and exception string to exception ring buffer
-    ring_ptr = get_global_pointer(Val(:__global_exception_ring), LLVMPtr{ExceptionEntry,AS.Global})
-    ring_ptr = unsafe_load(ring_ptr)
-    our_signal = _completion_signal()
-    prev = UInt64(1)
-    while prev != UInt64(0)
-        # Try to write to this slot, and skip if we fail (because another wavefront wrote first)
-        prev = atomic_cas!(reinterpret(LLVMPtr{UInt64,AS.Global}, ring_ptr), UInt64(0), our_signal)
-        if prev == UInt64(0)
-            ex_str = device_string_to_host(ex)
-            Base.unsafe_store!(reinterpret(LLVMPtr{UInt64,AS.Global}, ring_ptr+sizeof(UInt64)), reinterpret(UInt64, ex_str))
-            break
-        elseif prev == UInt64(1)
-            # Tail slot, give up
-            break
-        end
-        ring_ptr += sizeof(ExceptionEntry)
-    end
+function report_exception(ex::Ptr{Cchar})
+    # ex_str = err_device_string_to_host(ex)
+    # @errprintf("""
+    #     ERROR: a %s was thrown during kernel execution.
+    #            Run Julia on debug level 2 for device stack traces.
+    #     """, ex_str)
     return
 end
 
-report_oom(sz) = device_report_oom(sz)
-device_report_oom(sz::Csize_t) =
-    @rocprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz)
-
-report_exception_name(ex) = device_report_exception_name(ex)
-function device_report_exception_name(ex::Ptr{Cchar})
-    device_report_exception(ex)
-    # Pass argument in host buffer
-    ex_str = device_string_to_host(ex)
-    @rocprintf("""
-        ERROR: a %s was thrown during kernel execution.
-        Stacktrace:
-        """, ex_str)
-    device_free(reinterpret(Ptr{Cvoid}, ex_str))
+function report_exception_name(ex::Ptr{Cchar})
+    # ex_str = err_device_string_to_host(ex)
+    # @errprintf("""
+    #     ERROR: a %s was thrown during kernel execution.
+    #     Stacktrace:
+    #     """, ex_str)
     return
 end
 
-report_exception_frame(idx, func, file, line) =
-    device_report_exception_frame(idx, func, file, line)
-function device_report_exception_frame(idx::Cint, func::Ptr{Cchar}, file::Ptr{Cchar}, line::Cint)
-    # Pass arguments in host buffers
-    func_str = device_string_to_host(func)
-    file_str = device_string_to_host(file)
-    @rocprintf(" [%i] %s at %s:%i\n", idx, func_str, file_str, line)
-    device_free(reinterpret(Ptr{Cvoid}, func_str))
-    device_free(reinterpret(Ptr{Cvoid}, file_str))
+function report_exception_frame(
+    idx::Cint, func::Ptr{Cchar}, file::Ptr{Cchar}, line::Cint,
+)
+    # func_str = err_device_string_to_host(func)
+    # file_str = err_device_string_to_host(file)
+    # @errprintf("""
+    #  [%i] %s
+    #    @ %s:%i
+    # """, idx, func_str, file_str, line)
     return
 end
diff --git a/src/device/strings.jl b/src/device/strings.jl
index 48e492c62..ffd6a39bd 100644
--- a/src/device/strings.jl
+++ b/src/device/strings.jl
@@ -2,21 +2,21 @@
 
 @generated function string_length(ex::Union{Ptr,LLVMPtr})
     @dispose ctx=Context() begin
-        T_ex = convert(LLVMType, ex; ctx)
+        T_ex = convert(LLVMType, ex)
         T_ex_ptr = LLVM.PointerType(T_ex)
-        T_i8 = LLVM.Int8Type(ctx)
+        T_i8 = LLVM.Int8Type()
         T_i8_ptr = LLVM.PointerType(T_i8)
-        T_i64 = LLVM.Int64Type(ctx)
+        T_i64 = LLVM.Int64Type()
         llvm_f, _ = create_function(T_i64, [T_ex])
         mod = LLVM.parent(llvm_f)
 
-        @dispose builder=IRBuilder(ctx) begin
-            entry = BasicBlock(llvm_f, "entry"; ctx)
-            check = BasicBlock(llvm_f, "check"; ctx)
-            done = BasicBlock(llvm_f, "done"; ctx)
+        @dispose builder=IRBuilder() begin
+            entry = BasicBlock(llvm_f, "entry")
+            check = BasicBlock(llvm_f, "check")
+            done = BasicBlock(llvm_f, "done")
 
             position!(builder, entry)
-            init_offset = ConstantInt(0; ctx)
+            init_offset = ConstantInt(0)
             input_ptr = if T_ex isa LLVM.PointerType
                 parameters(llvm_f)[1]
             else
@@ -30,12 +30,12 @@
 
             position!(builder, check)
             offset = phi!(builder, T_i64)
-            next_offset = add!(builder, offset, ConstantInt(1; ctx))
+            next_offset = add!(builder, offset, ConstantInt(1))
             append!(LLVM.incoming(offset), [(init_offset, entry), (next_offset, check)])
 
             ptr = gep!(builder, T_i8, input_ptr, [offset])
             value = load!(builder, T_i8, ptr)
-            cond = icmp!(builder, LLVM.API.LLVMIntEQ, value, ConstantInt(0x0; ctx))
+            cond = icmp!(builder, LLVM.API.LLVMIntEQ, value, ConstantInt(0x0))
             br!(builder, cond, done, check)
 
             position!(builder, done)
diff --git a/src/discovery_utils.jl b/src/discovery_utils.jl
new file mode 100644
index 000000000..f42cb3f92
--- /dev/null
+++ b/src/discovery_utils.jl
@@ -0,0 +1,155 @@
+function detect_projects()
+    amdgpu_project = normpath(joinpath(@__DIR__, ".."))
+    current_project = Base.ACTIVE_PROJECT[]
+    julia_project = if Base.JLOptions().project != C_NULL
+        unsafe_string(Base.JLOptions().project)
+    elseif current_project !== nothing
+        current_project
+    else
+        amdgpu_project
+    end
+    return (;amdgpu_project, current_project, julia_project)
+end
+
+julia_exeflags(projects = detect_projects()) =
+    String["--startup-file=no", "--project=$(projects.julia_project)"]
+
+function julia_cmd_projects(jl_str)
+    projects = detect_projects()
+
+    cmd = Base.julia_cmd()
+    append!(cmd.exec, julia_exeflags(projects))
+
+    (;amdgpu_project, current_project, julia_project) = projects
+    if current_project !== nothing
+        jl_str = "push!(LOAD_PATH, \"$current_project\");" * jl_str
+    end
+    jl_str = "push!(LOAD_PATH, \"$amdgpu_project\");" * jl_str
+    append!(cmd.exec, ("-e", jl_str))
+    return cmd
+end
+
+function safe_exec(str)
+    cmd = julia_cmd_projects(str)
+    success = false
+    error_str = mktemp() do path, _
+        p = run(pipeline(cmd; stdout=path, stderr=path); wait=false)
+        wait(p)
+        success = p.exitcode == 0
+        String(read(path))
+    end
+    return success, error_str
+end
+
+function safe_import(pkg)
+    loaded, error_str = safe_exec("import $pkg")
+    loaded || return loaded, false, error_str
+
+    @eval import $pkg
+    available = @eval(isdefined($pkg, :is_available)) && @eval($pkg.is_available())
+    return loaded, available, error_str
+end
+
+function find_rocm_library(lib, dirs, ext=dlext)
+    path = Libdl.find_library(lib)
+    if path != ""
+        return Libdl.dlpath(path)
+    end
+    for dir in dirs
+        files = readdir(dir)
+        for file in files
+            matched = startswith(basename(file), lib*".$ext")
+            if matched
+                return joinpath(dir, file)
+            end
+        end
+    end
+    return ""
+end
+
+function find_roc_paths()
+    paths = split(get(ENV, "LD_LIBRARY_PATH", ""), ":")
+    paths = filter(path -> path != "", paths)
+    paths = map(Base.Filesystem.abspath, paths)
+    push!(paths, "/opt/rocm/lib") # shim for Ubuntu rocm packages...
+    if haskey(ENV, "ROCM_PATH")
+        push!(paths, joinpath(ENV["ROCM_PATH"], "lib"))
+    end
+    return filter(isdir, paths)
+end
+
+function find_rocm_library(libs::Vector, dirs)
+    for lib in libs
+        path = find_rocm_library(lib, dirs)
+        if path != ""
+            return path
+        end
+    end
+    return ""
+end
+
+function find_ld_lld()
+    paths = split(get(ENV, "PATH", ""), ":")
+    paths = filter(path -> path != "", paths)
+    paths = map(Base.Filesystem.abspath, paths)
+    basedir = get(ENV, "ROCM_PATH", "/opt/rocm")
+    ispath(joinpath(basedir, "llvm/bin/ld.lld")) &&
+        push!(paths, joinpath(basedir, "llvm/bin/"))
+    ispath(joinpath(basedir, "hcc/bin/ld.lld")) &&
+        push!(paths, joinpath(basedir, "/hcc/bin/"))
+    ispath(joinpath(basedir, "opencl/bin/x86_64/ld.lld")) &&
+        push!(paths, joinpath(basedir, "opencl/bin/x86_64/"))
+    for path in paths
+        exp_ld_path = joinpath(path, "ld.lld")
+        if ispath(exp_ld_path)
+            try
+                tmpfile = mktemp()
+                run(pipeline(`$exp_ld_path -v`; stdout=tmpfile[1]))
+                vstr = read(tmpfile[1], String)
+                rm(tmpfile[1])
+                vstr = replace(vstr, "AMD " => "")
+                vstr_splits = split(vstr, ' ')
+                if VersionNumber(vstr_splits[2]) >= v"6.0.0"
+                    return exp_ld_path
+                end
+            catch
+                @debug "bindeps: Failed running ld.lld in $exp_ld_path"
+            end
+        end
+    end
+    return ""
+end
+
+function find_device_libs()
+    # Might be set by tools like Spack or the user
+    hip_devlibs_path = get(ENV, "HIP_DEVICE_LIB_PATH", "")
+    hip_devlibs_path !== "" && return hip_devlibs_path
+    devlibs_path = get(ENV, "DEVICE_LIB_PATH", "")
+    devlibs_path !== "" && return devlibs_path
+
+    # The canonical location
+    if isdir("/opt/rocm/amdgcn/bitcode")
+        return "/opt/rocm/amdgcn/bitcode"
+    end
+
+    # Search relative to LD_LIBRARY_PATH entries
+    paths = split(get(ENV, "LD_LIBRARY_PATH", ""), ":")
+    paths = filter(path -> path != "", paths)
+    paths = map(Base.Filesystem.abspath, paths)
+    for path in paths
+        bitcode_path = joinpath(path, "../amdgcn/bitcode/")
+        if ispath(bitcode_path)
+            if isfile(joinpath(bitcode_path, "ocml.bc")) ||
+               isfile(joinpath(bitcode_path, "ocml.amdgcn.bc"))
+               return bitcode_path
+            end
+        end
+    end
+    return nothing
+end
+
+function populate_globals!(config)
+    for (key,val) in config
+        @eval const $key = $val
+    end
+end
diff --git a/src/dnn/MIOpen.jl b/src/dnn/MIOpen.jl
index b619748b4..2fb5d8fb5 100644
--- a/src/dnn/MIOpen.jl
+++ b/src/dnn/MIOpen.jl
@@ -1,15 +1,13 @@
 module MIOpen
 
+using CEnum
+
 using ..AMDGPU
+import AMDGPU: ROCArray, LockedObject, HandleCache, HIP, library_state
 import AMDGPU.Runtime.Mem
-import AMDGPU: ROCArray, ROCDevice, LockedObject
-import AMDGPU: HandleCache, HIP, library_state
 import .HIP: hipStream_t
 
-using CEnum
-using GPUArrays
-
-if AMDGPU.use_artifacts && AMDGPU.functional(:MIOpen)
+if AMDGPU.use_artifacts() && AMDGPU.functional(:MIOpen)
     using MIOpen_jll
     const libMIOpen_path = MIOpen_jll.libMIOpen_path
 else
@@ -92,15 +90,6 @@ lib_state() = library_state(
 handle() = lib_state().handle
 stream() = lib_state().stream
 
-mutable struct Workspace
-    data::Mem.Buffer
-    function Workspace(dev::ROCDevice, bytesize)
-        w = new(Mem.alloc(dev, bytesize))
-        finalizer(w_ -> Mem.free(w_.data), w)
-        w
-    end
-end
-
 include("descriptors.jl")
 include("convolution.jl")
 include("pooling.jl")
diff --git a/src/dnn/activations.jl b/src/dnn/activations.jl
index 1db7635c5..ec0bae2b2 100644
--- a/src/dnn/activations.jl
+++ b/src/dnn/activations.jl
@@ -117,12 +117,10 @@ function _activation(
 ) where T <: MIOPENFloat
     y = similar(x)
     xdesc, ydesc = TensorDescriptor.((x, y))
-    AMDGPU.wait!(x)
     (; handle, stream) = lib_state()
     miopenActivationForward(
         handle, desc.handle, Ref{Float32}(1f0), xdesc.handle, x,
         Ref{Float32}(0f0), ydesc.handle, y) |> check
-    AMDGPU.mark!(y, stream)
     y
 end
 
@@ -131,12 +129,10 @@ function _∇activation(
 ) where T <: MIOPENFloat
     dx = similar(x)
     xdesc, ydesc, dydesc, dxdesc = TensorDescriptor.((x, y, dy, dx))
-    AMDGPU.wait!((x, y, dy))
     (; handle, stream) = lib_state()
     miopenActivationBackward(
         handle, desc, Ref{Float32}(1f0), ydesc.handle, y,
         dydesc.handle, dy, xdesc.handle, x, Ref{Float32}(0f0),
         dxdesc.handle, dx) |> check
-    AMDGPU.mark!(dx, stream)
     dx
 end
diff --git a/src/dnn/batchnorm.jl b/src/dnn/batchnorm.jl
index 2178beb2e..1e1c4d705 100644
--- a/src/dnn/batchnorm.jl
+++ b/src/dnn/batchnorm.jl
@@ -32,13 +32,11 @@ function batchnorm_training(
     # For backward pass.
     μ_saved, ν_saved = similar(x, n_features), similar(x, n_features)
 
-    AMDGPU.wait!((x, γ, β, μ, ν))
     (; handle, stream) = lib_state()
     miopenBatchNormalizationForwardTraining(
         handle, mode, Ref{Float32}(1f0), Ref{Float32}(0f0),
         xdesc.handle, x, ydesc.handle, y, bndesc.handle, γ, β, factor,
         μ, ν, ϵ, μ_saved, ν_saved) |> check
-    AMDGPU.mark!(y, stream)
     y, μ_saved, ν_saved
 end
 
@@ -71,13 +69,11 @@ function batchnorm_inference(
     xdesc, ydesc = TensorDescriptor4D.((x, y))
     bndesc = derive_beta_gamma_descriptors(xdesc, mode)
 
-    AMDGPU.wait!((x, γ, β, μ, ν))
     (; handle, stream) = lib_state()
     miopenBatchNormalizationForwardInference(
         handle, mode, Ref{Float32}(1f0), Ref{Float32}(0f0),
         xdesc.handle, x, ydesc.handle, y, bndesc.handle,
         γ, β, μ, ν, ϵ) |> check
-    AMDGPU.mark!(y, stream)
     y
 end
 
@@ -93,7 +89,6 @@ function ∇batchnorm(
     xdesc, dxdesc, dydesc = TensorDescriptor4D.((x, dx, dy))
     bndesc = derive_beta_gamma_descriptors(xdesc, mode)
 
-    AMDGPU.wait!((x, dy, γ, β, μ_saved, ν_saved))
     (; handle, stream) = lib_state()
     miopenBatchNormalizationBackward(
         handle, mode,
@@ -101,7 +96,6 @@ function ∇batchnorm(
         Ref{Float32}(1f0), Ref{Float32}(0f0),
         xdesc.handle, x, dydesc.handle, dy, dxdesc.handle, dx,
         bndesc.handle, γ, dγ, dβ, ϵ, μ_saved, ν_saved) |> check
-    AMDGPU.mark!((dx, dγ, dβ), stream)
     dx, dγ, dβ
 end
 
diff --git a/src/dnn/convolution.jl b/src/dnn/convolution.jl
index 2cb491f5c..da1224119 100644
--- a/src/dnn/convolution.jl
+++ b/src/dnn/convolution.jl
@@ -1,10 +1,7 @@
-# TODO free workspace once used
-
 const CONV_ALGOS = Union{
     Type{miopenConvFwdAlgorithm_t},
     Type{miopenConvBwdWeightsAlgorithm_t},
-    Type{miopenConvBwdDataAlgorithm_t},
-}
+    Type{miopenConvBwdDataAlgorithm_t}}
 
 # Struct for hashing convolution arguments.
 struct ConvolutionArgs
@@ -40,12 +37,12 @@ get_conv_cache_type(::Type{miopenConvFwdAlgorithm_t}) = CONV_FWD_BENCHMARK_CACHE
 get_conv_cache_type(::Type{miopenConvBwdDataAlgorithm_t}) = CONV_BWD_DATA_BENCHMARK_CACHE
 get_conv_cache_type(::Type{miopenConvBwdWeightsAlgorithm_t}) = CONV_BWD_WEIGHT_BENCHMARK_CACHE
 
-function get_benchmark_cache(conv_type::C, conv_args, dev) where C <: CONV_ALGOS
+function get_benchmark_cache(conv_type::C, conv_args) where C <: CONV_ALGOS
     perf_results = lock(get_conv_cache_type(conv_type)) do cache
         get(cache, conv_args, nothing)
     end
     isnothing(perf_results) && return nothing
-    workspace = Workspace(dev, perf_results.memory)
+    workspace = ROCArray{UInt8}(undef, perf_results.memory)
     perf_results, workspace
 end
 
@@ -85,8 +82,7 @@ function find_conv_algo(
         handle, a_desc.handle, a, b_desc.handle, b,
         conv_desc.handle, c_desc.handle, c, n_algos,
         perf_count_ref, perf_results_ref,
-        workspace.data.ptr, workspace.data.bytesize,
-        exhaustive_search) |> check
+        workspace, length(workspace), exhaustive_search) |> check
     perf_results_ref[]
 end
 
@@ -94,15 +90,14 @@ function find_algorithm(
     conv_type::C, handle::miopenHandle_t, conv_args::ConvolutionArgs,
     a, a_desc, b, b_desc, conv_desc, c, c_desc,
 ) where C <: CONV_ALGOS
-    dev = GPUArrays.device(a)
-    cache = get_benchmark_cache(conv_type, conv_args, dev)
+    cache = get_benchmark_cache(conv_type, conv_args)
     isnothing(cache) || return cache
 
-    workspace = Workspace(dev, 0)
+    workspace = ROCArray{UInt8}(undef, 0)
     perf_results = find_conv_algo(conv_type;
         handle, workspace, a, a_desc, b, b_desc, conv_desc, c, c_desc)
     set_benchmark_cache!(conv_type, conv_args, perf_results)
-    workspace = Workspace(dev, perf_results.memory)
+    workspace = ROCArray{UInt8}(undef, perf_results.memory)
 
     perf_results, workspace
 end
@@ -127,12 +122,11 @@ function convolution!(
         miopenConvFwdAlgorithm_t, handle, conv_args,
         x, xdesc, w, wdesc, cdesc, y, ydesc)
 
-    AMDGPU.wait!((x, y, w))
     miopenConvolutionForward(
         handle, Ref{Float32}(1f0), xdesc.handle, x, wdesc.handle, w, cdesc.handle,
         perf_results.fwd_algo, Ref{Float32}(0f0), ydesc.handle, y,
-        workspace.data.ptr, perf_results.memory) |> check
-    AMDGPU.mark!(y, stream)
+        workspace, perf_results.memory) |> check
+    AMDGPU.unsafe_free!(workspace)
     y
 end
 
@@ -171,12 +165,11 @@ function ∇convolution_weight!(
     perf_algo, workspace = find_algorithm(
         miopenConvBwdWeightsAlgorithm_t, handle, conv_args,
         dy, dydesc, x, xdesc, cdesc, ∇w, ∇wdesc)
-    AMDGPU.wait!((∇w, dy, x))
     miopenConvolutionBackwardWeights(
         handle, Ref{Float32}(1f0), dydesc.handle, dy, xdesc.handle, x, cdesc.handle,
         perf_algo.bwd_weights_algo, Ref{Float32}(0f0), ∇wdesc.handle, ∇w,
-        workspace.data.ptr, perf_algo.memory) |> check
-    AMDGPU.mark!(∇w, stream)
+        workspace, perf_algo.memory) |> check
+    AMDGPU.unsafe_free!(workspace)
     ∇w
 end
 
@@ -215,12 +208,11 @@ function ∇convolution_data!(
     perf_algo, workspace = find_algorithm(
         miopenConvBwdDataAlgorithm_t, handle, conv_args,
         dy, dydesc, w, wdesc, cdesc, ∇x, ∇xdesc)
-    AMDGPU.wait!((∇x, dy, w))
     miopenConvolutionBackwardData(
         handle, Ref{Float32}(1f0), dydesc.handle, dy, wdesc.handle, w, cdesc.handle,
         perf_algo.bwd_data_algo, Ref{Float32}(0f0), ∇xdesc.handle, ∇x,
-        workspace.data.ptr, perf_algo.memory) |> check
-    AMDGPU.mark!(∇x, stream)
+        workspace, perf_algo.memory) |> check
+    AMDGPU.unsafe_free!(workspace)
     ∇x
 end
 
diff --git a/src/dnn/pooling.jl b/src/dnn/pooling.jl
index eeab61238..85ce46932 100644
--- a/src/dnn/pooling.jl
+++ b/src/dnn/pooling.jl
@@ -94,22 +94,12 @@ function pool!(
     x::ROCArray{T, N}, xdesc::TensorDescriptor,
     pdesc::PoolingDescriptor; alpha = 1f0, beta = 0f0, do_backward::Bool = true,
 ) where {T <: MIOPENFloat, N}
-    if do_backward
-        wsize = get_workspace_size(pdesc, ydesc)
-        workspace = Workspace(GPUArrays.device(y), wsize)
-        wptr = workspace.data.ptr
-    else
-        wsize = 0
-        workspace = nothing
-        wptr = C_NULL
-    end
-    AMDGPU.wait!((x, y))
+    wsize = do_backward ? get_workspace_size(pdesc, ydesc) : 0
+    workspace = ROCArray{UInt8}(undef, wsize)
     (; handle, stream) = lib_state()
     miopenPoolingForward(
         handle, pdesc.handle, Ref{Float32}(alpha), xdesc.handle, x,
-        Ref{Float32}(beta), ydesc.handle, y, do_backward,
-        wptr, wsize) |> check
-    AMDGPU.mark!(y, stream)
+        Ref{Float32}(beta), ydesc.handle, y, do_backward, workspace, wsize) |> check
     y, workspace
 end
 
@@ -120,13 +110,10 @@ function ∇pool!(
     x::ROCArray{T, N}, xdesc::TensorDescriptor,
     pdesc::PoolingDescriptor; alpha = 1f0, beta = 0f0, workspace,
 ) where {T <: MIOPENFloat, N}
-    AMDGPU.wait!((dx, dy, y, x))
     (; handle, stream) = lib_state()
     miopenPoolingBackward(
         handle, pdesc.handle, Ref{Float32}(alpha),
         ydesc.handle, y, dydesc.handle, dy, xdesc.handle, x,
-        Ref{Float32}(beta), dxdesc.handle, dx,
-        (isnothing(workspace) ? C_NULL : workspace.data.ptr)) |> check
-    AMDGPU.mark!(dx, stream)
+        Ref{Float32}(beta), dxdesc.handle, dx, workspace) |> check
     dx
 end
diff --git a/src/dnn/softmax.jl b/src/dnn/softmax.jl
index b24f73ecd..eaee271ee 100644
--- a/src/dnn/softmax.jl
+++ b/src/dnn/softmax.jl
@@ -57,13 +57,12 @@ function _softmax!(
             _logsoftmax!(y, x; dims) : _softmax!(y, x; dims)
     end
 
-    AMDGPU.wait!((x, y))
-    xdesc, ydesc = TensorDescriptor.((reshape(x, sdims), reshape(y, sdims)))
+    xr, yr = reshape(x, sdims), reshape(y, sdims)
+    xdesc, ydesc = TensorDescriptor.((xr, yr))
     (; handle, stream) = lib_state()
     miopenSoftmaxForward_V2(
-        handle, Ref{Float32}(1f0), xdesc.handle, x, Ref{Float32}(0f0),
-        ydesc.handle, y, algo, MIOPEN_SOFTMAX_MODE_CHANNEL) |> check
-    AMDGPU.mark!(y, stream)
+        handle, Ref{Float32}(1f0), xdesc.handle, xr, Ref{Float32}(0f0),
+        ydesc.handle, yr, algo, MIOPEN_SOFTMAX_MODE_CHANNEL) |> check
     y
 end
 
@@ -76,14 +75,12 @@ function _∇softmax!(
             _∇logsoftmax!(dx, dy, y; dims) : _∇softmax!(dx, dy, y; dims)
     end
 
-    AMDGPU.wait!((dx, dy, y))
     ydesc, dydesc, dxdesc = TensorDescriptor.((reshape(y, sdims), reshape(dy, sdims), reshape(dx, sdims)))
     (; handle, stream) = lib_state()
     miopenSoftmaxBackward_V2(
         handle, Ref{Float32}(1f0), ydesc.handle, y, dydesc.handle, dy,
         Ref{Float32}(0f0), dxdesc.handle, dx,
         algo, MIOPEN_SOFTMAX_MODE_CHANNEL) |> check
-    AMDGPU.mark!(dx, stream)
     dx
 end
 
diff --git a/src/exception_handler.jl b/src/exception_handler.jl
new file mode 100644
index 000000000..7a75d81bd
--- /dev/null
+++ b/src/exception_handler.jl
@@ -0,0 +1,160 @@
+"""
+ExceptionHolder
+
+- `exception_flag::Mem.HostBuffer`:
+    Pinned host memory. Contains one element of `Int32` type.
+    If stored value is not 0, then there is an exception that occurred
+    during kernel execution on the respective device.
+- `gate::ROCArray{UInt64}`:
+    Linear index for x, y & z dimensions at which the exception occurred.
+    This is used to filter out other threads from duplication exceptions.
+- `buffers_counter::ROCArray{Int32}`:
+    Counts number of printf buffers `errprintf_buffers` currently used.
+- `str_buffers_counter::ROCArray{Int32}`:
+    Error string counter. Counts number of string buffers `string_buffers`
+    used for exception reporting.
+- `errprintf_buffers::Vector{Mem.HostBuffer}`:
+    Array of buffers used for writing exceptions.
+    These buffers are used in the same way as device-printf buffers,
+    except they are pre-allocated.
+- `string_buffers::Vector{Mem.HostBuffer}`:
+    Array of string buffers. These buffers are used every time
+    we need to report the name of the exception, file, or line.
+"""
+struct ExceptionHolder
+    exception_flag::Mem.HostBuffer # Main buffer where printf context is written.
+    gate::ROCArray{UInt64}
+    buffers_counter::ROCArray{Int32}
+    str_buffers_counter::ROCArray{Int32}
+
+    errprintf_buffers::Vector{Mem.HostBuffer} # Buffers used by `@errprintf`.
+    string_buffers::Vector{Mem.HostBuffer} # Buffers used for storing device strings on the host.
+
+    errprintf_buffers_dev::ROCArray{Ptr{Cvoid}} # Pointers of `errprintf_buffers` on the device.
+    string_buffers_dev::ROCArray{Ptr{Cvoid}} # Pointers of `string_buffers` on the device.
+
+    function ExceptionHolder()
+        buf_len = 2^11 # 2 KiB
+        str_len = 2^11 # 2 KiB
+        n_buffers = 50
+        n_str_buffers = 100
+
+        exception_flag = Mem.HostBuffer(sizeof(Int32), HIP.hipHostAllocMapped)
+        gate = ROCArray(UInt64[0])
+        buffers_counter = ROCArray(Int32[0])
+        str_buffers_counter = ROCArray(Int32[0])
+
+        errprintf_buffers = [
+            Mem.HostBuffer(buf_len, HIP.hipHostAllocMapped)
+            for _ in 1:n_buffers]
+        str_buffers = [
+            Mem.HostBuffer(str_len, HIP.hipHostAllocMapped)
+            for _ in 1:n_str_buffers]
+
+        errprintf_buffers_dev = ROCArray(Mem.device_ptr.(errprintf_buffers))
+        str_buffers_dev = ROCArray(Mem.device_ptr.(str_buffers))
+
+        new(
+            exception_flag, gate, buffers_counter, str_buffers_counter,
+            errprintf_buffers, str_buffers,
+            errprintf_buffers_dev, str_buffers_dev)
+    end
+end
+
+# hash(dev::HIPDevice) => ExceptionHolder
+const GLOBAL_EXCEPTION_HOLDER = Dict{UInt, ExceptionHolder}()
+
+function exception_holder(dev::HIPDevice)
+    # TODO lock using RT_LOCK
+    get!(() -> ExceptionHolder(), GLOBAL_EXCEPTION_HOLDER, hash(dev))
+end
+
+function has_exception(dev::HIPDevice)::Bool
+    ex = exception_holder(dev)
+    ptr = Base.unsafe_convert(Ptr{Int}, ex.exception_flag)
+    unsafe_load(ptr) != 0
+end
+
+function reset_exception_holder!(dev::HIPDevice)
+    ex = exception_holder(dev)
+    ptr = Base.unsafe_convert(Ptr{Int}, ex.exception_flag)
+    unsafe_store!(ptr, 0)
+
+    fill!(ex.buffers_counter, 0)
+    fill!(ex.str_buffers_counter, 0)
+    return
+end
+
+function get_exception_string(dev::HIPDevice)::String
+    ex = exception_holder(dev)
+
+    # Use async copy and HIP.synchronize() to avoid triggering
+    # error exception checking path and stack-overflowing.
+    n_used_buffers = eltype(ex.buffers_counter)[0]
+    Base.copyto!(n_used_buffers, 1, ex.buffers_counter, 1, 1; async=true)
+    HIP.synchronize(AMDGPU.stream())
+
+    n_strings = min(n_used_buffers[1], length(ex.errprintf_buffers))
+
+    exception_str = ""
+    for i in 1:n_strings
+        ptr = reinterpret(
+            LLVMPtr{Device.ROCPrintfBuffer, AS.Global},
+            ex.errprintf_buffers[i].ptr)
+        fmt, all_args = unsafe_load(ptr)
+
+        if isempty(all_args)
+            exception_str = "$(exception_str)$(fmt)\n"
+        else
+            args = map(x -> x isa Cstring ? unsafe_string(x) : x, first(all_args))
+            str = Printf.format(Printf.Format(fmt), args...)
+            exception_str = "$(exception_str)$(str)"
+        end
+    end
+    return exception_str
+end
+
+function throw_if_exception(dev::HIPDevice)
+    has_exception(dev) || return
+
+    exception_str = get_exception_string(dev)
+    exception_str = isempty(exception_str) ? "" : "\n$exception_str"
+    error("GPU Kernel Exception$exception_str")
+end
+
+function KernelState(dev::HIPDevice, global_hostcalls::Vector{Symbol})
+    malloc_ptr = :malloc_hostcall in global_hostcalls ?
+        Compiler.create_malloc_hostcall!() :
+        C_NULL
+    free_ptr = :free_hostcall in global_hostcalls ?
+        Compiler.create_free_hostcall!() :
+        C_NULL
+    print_ptr = :print_hostcall in global_hostcalls ?
+        Compiler.create_output_context!() :
+        C_NULL
+    printf_ptr = :printf_hostcall in global_hostcalls ?
+        Compiler.create_printf_output_context!() :
+        C_NULL
+
+    ex = exception_holder(dev)
+    KernelState(
+        # Exception reporting buffers.
+        Mem.device_ptr(ex.exception_flag),
+        pointer(ex.gate),
+        pointer(ex.buffers_counter),
+        pointer(ex.str_buffers_counter),
+
+        pointer(ex.errprintf_buffers_dev),
+        pointer(ex.string_buffers_dev),
+        Int32(length(ex.errprintf_buffers_dev)),
+        Int32(length(ex.string_buffers_dev)),
+
+        # Malloc/free hostcall pointer.
+        malloc_ptr,
+        free_ptr,
+
+        # Print hostcalls.
+        print_ptr,
+        printf_ptr,
+    )
+end
diff --git a/src/fft/fft.jl b/src/fft/fft.jl
index 989d70d2d..747853d6d 100644
--- a/src/fft/fft.jl
+++ b/src/fft/fft.jl
@@ -347,35 +347,24 @@ function assert_applicable(p::ROCFFTPlan{T,K}, X::ROCArray{T}, Y::ROCArray{Ty})
 end
 
 function unsafe_execute!(plan::cROCFFTPlan{T,K,true,N}, X::ROCArray{T,N}) where {T,K,N}
-    wait!(X)
     rocfft_execute(plan, [pointer(X),], C_NULL, plan.execution_info)
-    mark!(X, C_NULL)
 end
 
 function unsafe_execute!(plan::cROCFFTPlan{T,K,false,N}, X::ROCArray{T,N}, Y::ROCArray{T}) where {T,N,K}
     Xcopy = copy(X) # since input array can also be modified
-    wait!(Y)
     rocfft_execute(plan, [pointer(Xcopy),], [pointer(Y),], plan.execution_info)
-    mark!(Xcopy, C_NULL)
-    mark!(Y, C_NULL)
 end
 
 function unsafe_execute!(plan::rROCFFTPlan{T,ROCFFT_FORWARD,false,N}, X::ROCArray{T,N}, Y::ROCArray{<:rocfftComplexes,N}) where {T<:rocfftReals,N}
     @assert plan.xtype == rocfft_transform_type_real_forward
     Xcopy = copy(X)
-    wait!(Y)
     rocfft_execute(plan, [pointer(Xcopy),], [pointer(Y),], plan.execution_info)
-    mark!(Xcopy, C_NULL)
-    mark!(Y, C_NULL)
 end
 
 function unsafe_execute!(plan::rROCFFTPlan{T,ROCFFT_INVERSE,false,N}, X::ROCArray{T,N}, Y::ROCArray{<:rocfftReals,N}) where {T<:rocfftComplexes,N}
     @assert plan.xtype == rocfft_transform_type_real_inverse
     Xcopy = copy(X)
-    wait!(Y)
     rocfft_execute(plan, [pointer(Xcopy),], [pointer(Y),], plan.execution_info)
-    mark!(Xcopy, C_NULL)
-    mark!(Y, C_NULL)
 end
 
 
diff --git a/src/fft/rocFFT.jl b/src/fft/rocFFT.jl
index 9b05747ab..3de57145b 100644
--- a/src/fft/rocFFT.jl
+++ b/src/fft/rocFFT.jl
@@ -1,7 +1,7 @@
 module rocFFT
 
 import ..AMDGPU
-import .AMDGPU: librocfft, mark!, wait!
+import .AMDGPU: librocfft
 import ..HIP: hipStream_t
 
 using CEnum
@@ -14,6 +14,8 @@ include("librocfft.jl")
 include("util.jl")
 include("fft.jl")
 
+# TODO use TLS library state
+
 if AMDGPU.functional(:rocfft)
     const INITIALIZED = Threads.Atomic{Int64}(0)
     @eval function rocfft_setup_once()
diff --git a/src/highlevel.jl b/src/highlevel.jl
index 3fe07f56b..4a3511316 100644
--- a/src/highlevel.jl
+++ b/src/highlevel.jl
@@ -1,17 +1,15 @@
 # High-level APIs
 
 import AMDGPU: Runtime, Compiler
-import .Runtime: ROCDevice, ROCQueue, ROCExecutable, ROCKernel, ROCSignal, ROCKernelSignal, HSAError
 import .Runtime: ROCDim, ROCDim3
-import .Runtime: wait!, mark!
-import .Compiler: rocfunction
+import .Compiler: hipfunction
 
-export @roc, rocconvert, rocfunction
+export @roc, rocconvert
 
 ## Devices
 
 """
-    default_device()::ROCDevice
+    default_device()::HIPDevice
 
 Default device which will be used by default in tasks.
 Meaning when a task is created, it selects this device as default.
@@ -21,7 +19,7 @@ All subsequent uses rely on [`device()`](@ref) for device selection.
 default_device() = Runtime.get_default_device()
 
 """
-    default_device!(device::ROCDevice)
+    default_device!(device::HIPDevice)
 
 Set default device that will be used when creating new tasks.
 
@@ -29,10 +27,29 @@ Set default device that will be used when creating new tasks.
     This does not change current device being used.
     Refer to [`device!`](@ref) for that.
 """
-default_device!(device::ROCDevice) = Runtime.set_default_device!(device)
+default_device!(device::HIPDevice) = Runtime.set_default_device!(device)
 
 """
-    device()::ROCDevice
+    default_device_id() -> Int
+
+Returns the numeric ID of the current default device,
+which is in the range of `1:length(AMDGPU.devices())`.
+This number should be stable for all processes on the same node,
+The [`default_device_id!`](@ref) function accepts the same
+numeric ID that is produced by this function.
+"""
+default_device_id() = default_device().device_id
+
+"""
+    default_device_id!(idx::Integer, kind::Symbol=:gpu)
+
+Sets the default device to `AMDGPU.devices(kind)[idx]`. See
+[`default_device_id`](@ref) for details on the numbering semantics.
+"""
+default_device_id!(idx::Integer) = default_device!(devices()[idx])
+
+"""
+    device()::HIPDevice
 
 Get currently active device.
 This device is used when launching kernels via `@roc`.
@@ -40,7 +57,7 @@ This device is used when launching kernels via `@roc`.
 device() = task_local_state().device
 
 """
-    device!(device::ROCDevice)
+    device!(device::HIPDevice)
 
 Switch current device being used.
 This switches only for a task inside which it is called.
@@ -49,51 +66,26 @@ This switches only for a task inside which it is called.
     To select default device that will be used when creating new tasks,
     refer to [`default_device!`](@ref) for that.
 """
-function device!(device::ROCDevice)
+function device!(device::HIPDevice)
     task_local_state!(; device)
     return device
 end
-device!(f::Base.Callable, device::ROCDevice) = task_local_state!(f; device)
-
-"""
-    devices(kind::Symbol = :gpu)
-
-Get list of all devices of the given `kind`.
-`kind` can be `:cpu`, `:gpu` or `:dsp`, although AMDGPU.jl supports
-execution only on `:gpu` devices.
-"""
-devices(kind::Symbol = :gpu) =
-    filter!(d -> device_type(d) == kind, copy(Runtime.ALL_DEVICES))
+device!(f::Base.Callable, device::HIPDevice) = task_local_state!(f; device)
 
 """
-    default_device_id(kind::Symbol=:gpu) -> Int
+    devices()
 
-Returns the numeric ID of the current default device, which is in the range of
-`1:length(AMDGPU.devices(kind))`. This number should be stable for all
-processes on the same node, so long as any device filtering is consistently
-applied (such as `ROCR_VISIBLE_DEVICES`). The [`default_device_id!`](@ref)
-function accepts the same numeric ID that is produced by this function.
+Get list of all devices.
 """
-default_device_id(kind::Symbol=:gpu) =
-    something(findfirst(a->a==default_device(), devices(kind)))
+devices() = Runtime.fetch_devices()
 
 """
-    default_device_id!(idx::Integer, kind::Symbol=:gpu)
-
-Sets the default device to `AMDGPU.devices(kind)[idx]`. See
-[`default_device_id`](@ref) for details on the numbering semantics.
-"""
-default_device_id!(idx::Integer, kind::Symbol=:gpu) =
-    default_device!(devices(kind)[idx])
-
-"""
-    device_id(device::ROCDevice, kind::Symbol=:gpu) -> Int
+    device_id(device::HIPDevice) -> Int
 
 Returns the numerical device ID for `device`. See [`default_device_id`](@ref)
 for details on the numbering semantics.
 """
-device_id(device::ROCDevice, kind::Symbol=:gpu) =
-    something(findfirst(dev->dev === device, devices(kind)))
+device_id(device::HIPDevice) = device.device_id
 
 """
     device_id!(idx::Integer, kind::Symbol=:gpu)
@@ -101,30 +93,7 @@ device_id(device::ROCDevice, kind::Symbol=:gpu) =
 Sets the current device to `AMDGPU.devices(kind)[idx]`. See
 [`device_id`](@ref) for details on the numbering semantics.
 """
-device_id!(idx::Integer, kind::Symbol=:gpu) =
-    device!(devices(kind)[idx])
-
-"""
-    device_type(device::ROCDevice) -> Symbol
-
-Return the kind of `device` as a `Symbol`. CPU devices return `:cpu`, GPU
-devices return `:gpu`, DSP devices return `:dsp`, and all others return
-`:unknown`.
-"""
-function device_type(device::ROCDevice)
-    devtype = Runtime.device_type(device)
-    if devtype == HSA.DEVICE_TYPE_CPU
-        return :cpu
-    elseif devtype == HSA.DEVICE_TYPE_GPU
-        return :gpu
-    elseif devtype[] == HSA.DEVICE_TYPE_DSP
-        return :dsp
-    else
-        return :unknown
-    end
-end
-
-wavefrontsize(device::ROCDevice) = Runtime.device_wavefront_size(device)
+device_id!(idx::Integer) = device!(devices()[idx])
 
 # Contexts
 
@@ -135,40 +104,9 @@ function device(context::HIPContext)
     end
 end
 
-device_id(device::HIPDevice) = device.device_id
-HIPDevice(device::ROCDevice) = HIPDevice(device_id(device))
-HIPContext(device::ROCDevice) = HIPContext(HIPDevice(device))
-
-# Queues/Streams
-
-"""
-    queue()::ROCQueue
-
-Get task-local default queue for the currently active device.
-"""
-queue() = task_local_state().queue::ROCQueue
-@deprecate default_queue() queue()
-function queue(device::ROCDevice)
-    tls = task_local_state()
-    q = tls.queues[device_id(device)]
-    isnothing(q) || return q
-
-    tls.queues[device_id(device)] = ROCQueue(device)
-    return q
-end
-"""
-    queue!(f::Base.Callable, queue::ROCQueue)
-
-Change default queue, execute given function `f`
-and revert back to the original queue.
-
-# Returns
-
-Return value of the function `f`.
-"""
-queue!(f::Base.Callable, queue::ROCQueue) = task_local_state!(f; queue)
-device(queue::ROCQueue) = queue.device
+# Streams.
 
+default_stream() = HIP.default_stream()
 stream() = task_local_state().stream::HIPStream
 function stream!(stream::HIPStream)
     task_local_state!(;stream)
@@ -182,7 +120,7 @@ priority() = task_local_state().priority
 """
     priority!(priority::Symbol)
 
-Change the priority of the default queue.
+Change the priority of the default stream.
 Accepted values are `:normal` (the default), `:low` and `:high`.
 """
 function priority!(priority::Symbol)
@@ -193,7 +131,7 @@ end
 """
     priority!(f::Base.Callable, priority::Symbol)
 
-Chnage the priority of default queue, execute `f` and
+Chnage the priority of default stream, execute `f` and
 revert to the original priority.
 Accepted values are `:normal` (the default), `:low` and `:high`.
 
@@ -205,149 +143,36 @@ priority!(f::Base.Callable, priority::Symbol) = task_local_state!(f; priority)
 
 # Device ISAs
 
-default_isa(device::ROCDevice) = Runtime.default_isa(device)
-default_isa_architecture(device::ROCDevice) = Runtime.architecture(default_isa(device))
-default_isa_features(device::ROCDevice) = Runtime.features(default_isa(device))
-
-## Executable creation
-
-function create_executable(device, entry, obj; globals=())
-    # link with ld.lld
-    @assert lld_path != "" "ld.lld was not found; cannot link kernel"
-    path_exe = mktemp() do path_o, io_o
-        write(io_o, obj)
-        flush(io_o)
-        path_exe = path_o*".exe"
-        if lld_artifact
-            LLD_jll.lld() do lld
-                run(`$lld -flavor gnu -shared -o $path_exe $path_o`)
-            end
-        else
-            run(`$lld_path -shared -o $path_exe $path_o`)
-        end
-        path_exe
-    end
-    data = read(path_exe)
-    rm(path_exe)
-
-    return ROCExecutable(device, data, entry; globals=globals)
-end
-
-function get_kernel_queue(;
-    event_queue::Union{ROCQueue, Nothing}, device::Union{ROCDevice, Nothing},
-)
-    if !isnothing(event_queue) && !isnothing(device)
-        if event_queue.device != device
-            error(
-                "Specified both `device` and `queue`, " *
-                "but `queue` is on a different device than `device`.\n" *
-                "In this case, only one argument can be specified.")
-        else
-            return event_queue
-        end
-    end
-    isnothing(event_queue) && isnothing(device) && return queue()
-    isnothing(event_queue) && return queue(device)
-    event_queue
-end
-
-## Event creation
-function create_event(kernel::ROCKernel;
-    signal::Union{ROCKernelSignal, ROCSignal} = ROCSignal(),
-    device::Union{ROCDevice, Nothing} = nothing,
-    queue::Union{ROCQueue, Nothing} = nothing,
-    kwargs...,
-)
-    if signal isa ROCKernelSignal
-        return signal
-    end
-    kernel_queue = get_kernel_queue(; event_queue=queue, device)
-    return ROCKernelSignal(signal, kernel_queue, kernel; kwargs...)
-end
-
-## Kernel creation
-
-"""
-    create_kernel(kernel::HostKernel, f, args::Tuple; kwargs...)
-
-Constructs a `ROCKernel` object from a compiled kernel described by `kernel`.
-`f` is the function being called, and `args` is the `Tuple` of arguments that
-`f` is called with.
-
-See [`@roc`](@ref) for the list of available keyword arguments.
-"""
-create_kernel(kernel::Runtime.HostKernel; kwargs...) =
-    ROCKernel(kernel; kwargs...)
-
-## Kernel launch and barriers
-
-barrier_and!(signals::Vector) = barrier_and!(queue(), signals)
-barrier_or!(signals::Vector) = barrier_or!(queue(), signals)
-barrier_and!(queue::ROCQueue, signals::Vector{ROCKernelSignal}) =
-    barrier_and!(queue, map(x->x.signal,signals))
-barrier_or!(queue::ROCQueue, signals::Vector{ROCKernelSignal}) =
-    barrier_or!(queue, map(x->x.signal,signals))
-barrier_and!(queue::ROCQueue, signals::Vector{HSA.Signal}) = barrier_and!(queue, map(ROCSignal, signals))
-barrier_or!(queue::ROCQueue, signals::Vector{HSA.Signal}) = barrier_or!(queue, map(ROCSignal, signals))
-barrier_and!(queue::ROCQueue, signals::Vector{ROCSignal}) =
-    Runtime.launch_barrier!(HSA.BarrierAndPacket, queue, signals)
-barrier_or!(queue::ROCQueue, signals::Vector{ROCSignal}) =
-    Runtime.launch_barrier!(HSA.BarrierOrPacket, queue, signals)
-
-"""
-    active_kernels(queue::ROCQueue = queue()) -> Vector{ROCKernelSignal}
-
-Returns the set of actively-executing kernels on `queue`.
-"""
-function active_kernels(queue::ROCQueue = queue())
-    isempty(queue.active_kernels) && return NO_ACTIVE_KERNELS
-    return Array(queue.active_kernels)
-end
-const NO_ACTIVE_KERNELS = ROCKernelSignal[]
-
-"""
-    synchronize(; errors::Bool=true)
-
-Blocks until all kernels currently executing on the default queue and stream
-have completed. See [`synchronize(::ROCQueue)`](@ref) for details on `errors`.
-"""
-function synchronize(; errors::Bool=true)
-    synchronize(queue(); errors)
-    synchronize(stream())
-end
-"""
-    synchronize(queue::ROCQueue; errors::Bool=true)
+default_isa(device::HIPDevice) = Runtime.default_isa(Runtime.hsa_device(device))
 
-Blocks until all kernels currently executing on `queue` have completed. If
-`errors` is `true`, then any kernels currently on the queue which throw an
-error will be re-thrown; only the first encountered error will be thrown. If
-`false`, errors will not be thrown.
-"""
-function synchronize(queue::ROCQueue; errors::Bool=true)
-    isempty(queue.active_kernels) && return
-
-    if errors
-        kerns = copy(queue.active_kernels)
-        while length(kerns) > 0
-            sig = first(kerns)
-            wait(sig; check_exceptions=true, cleanup=false)
-            Runtime.next!(kerns)
-        end
-    else
-        sig = Runtime.maybelast(queue.active_kernels)
-        if sig !== nothing
-            wait(sig; check_exceptions=false, cleanup=false)
-        end
-    end
-    return
-end
 """
-    synchronize(stream::HIPStream)
+    synchronize(stream::HIPStream = stream())
 
 Blocks until all kernels currently executing on `stream` have completed.
 """
-function synchronize(stream::HIPStream)
-    HIP.hipStreamSynchronize(stream.stream) |> check
+# TODO
+#   allow non blocking sync of several HIPStreams
+#   and only then disable global hostcall
+function synchronize(stm::HIPStream = stream(); blocking::Bool = true)
+    throw_if_exception(stm.device)
+    HIP.synchronize(stm; blocking)
+    throw_if_exception(stm.device)
+
+    blocking && return
+
+    # Stop any running global hostcall.
+    global_hostcall_names = (
+        :malloc_hostcall, :free_hostcall, :print_hostcall, :printf_hostcall)
+    for gbl in global_hostcall_names
+        hc = AMDGPU.Device.get_named_perdevice_hostcall(stm.device, gbl)
+        isnothing(hc) && continue
+        hc[1].finish[] && continue
+
+        # Signal HostCall to exit.
+        AMDGPU.Device.finish!(hc[1])
+        # Remove it from global hostcalls, so that new one is created.
+        AMDGPU.Device.remove_perdevice_hostcall!(stm.device, gbl)
+    end
     return
 end
 
@@ -365,179 +190,10 @@ register methods for the the `AMDGPU.Adaptor` type.
 """
 rocconvert(arg) = adapt(Runtime.Adaptor(), arg)
 
-### @roc helper functions
-
-# split keyword arguments to `@roc` into ones affecting the macro itself, the compiler
-# and the code it generates, or the execution
-function split_kwargs(kwargs)
-    alias_kws    = Dict(:stream=>:queue)
-    macro_kws    = [:dynamic, :launch, :wait, :mark]
-    compiler_kws = [:name, :global_hooks]
-    call_kws     = [:gridsize, :groupsize, :config]
-    signal_kws   = [:queue, :signal, :soft, :minlat, :timeout]
-    kernel_kws   = [:localmem]
-    computed_kws = [:threads, :blocks]
-
-    device_kwargs = []
-    macro_kwargs = []
-    compiler_kwargs = []
-    call_kwargs = []
-    signal_kwargs = []
-    kernel_kwargs = []
-
-    for kwarg in kwargs
-        if !Meta.isexpr(kwarg, :(=))
-            throw(ArgumentError("non-keyword argument like option '$kwarg'"))
-        end
-
-        key, val = kwarg.args
-        oldkey = key
-        if key in keys(alias_kws)
-            key = alias_kws[key]
-            kwarg = :($key=$val)
-        end
-
-        if !isa(key, Symbol)
-            throw(ArgumentError("non-symbolic keyword '$oldkey'"))
-        end
-
-        if key == :device
-            push!(device_kwargs, kwarg)
-        elseif key in macro_kws
-            push!(macro_kwargs, kwarg)
-        elseif key in compiler_kws
-            push!(compiler_kwargs, kwarg)
-        elseif key in call_kws
-            push!(call_kwargs, kwarg)
-        elseif key in signal_kws
-            push!(signal_kwargs, kwarg)
-        elseif key in kernel_kws
-            push!(kernel_kwargs, kwarg)
-        elseif key in computed_kws
-            push!(call_kwargs, kwarg)
-        else
-            throw(ArgumentError("unknown keyword argument '$oldkey'"))
-        end
-    end
-
-    return device_kwargs, macro_kwargs, compiler_kwargs, call_kwargs, signal_kwargs, kernel_kwargs
-end
-function simplify_call_kwargs!(call_kwargs)
-    call_kwargs_keys = map(x->x.args[1], call_kwargs)
-    has_threads = :threads in call_kwargs_keys
-    has_blocks = :blocks in call_kwargs_keys
-    has_threads || has_blocks || return
-    if :groupsize in call_kwargs_keys
-        throw(ArgumentError("cannot combine :threads/:blocks with :groupsize"))
-    elseif :gridsize in call_kwargs_keys
-        throw(ArgumentError("cannot combine :threads/:blocks with :gridsize"))
-    end
-    if has_threads
-        threads_idx = findfirst(x->x.args[1]==:threads, call_kwargs)
-        groupsize = call_kwargs[threads_idx].args[2]
-        deleteat!(call_kwargs, threads_idx)
-    else
-        groupsize = 1
-    end
-    if has_blocks
-        blocks_idx = findfirst(x->x.args[1]==:blocks, call_kwargs)
-        blocks = call_kwargs[blocks_idx].args[2]
-        deleteat!(call_kwargs, blocks_idx)
-    else
-        blocks = 1
-    end
-    push!(call_kwargs, :(groupsize=$groupsize))
-    push!(call_kwargs, :(gridsize=$groupsize .* $blocks))
-end
-
-# assign arguments to variables, handle splatting
-function assign_args!(code, args)
-    # handle splatting
-    splats = map(arg -> Meta.isexpr(arg, :(...)), args)
-    args = map(args, splats) do arg, splat
-        splat ? arg.args[1] : arg
-    end
-
-    # assign arguments to variables
-    vars = Tuple(gensym() for arg in args)
-    map(vars, args) do var,arg
-        push!(code.args, :($var = $arg))
-    end
-
-    # convert the arguments, compile the function and call the kernel
-    # while keeping the original arguments alive
-    var_exprs = map(vars, args, splats) do var, arg, splat
-        splat ? Expr(:(...), var) : var
-    end
-
-    return vars, var_exprs
-end
-
-### @roc macro
-
-"""
-    @roc [kwargs...] func(args...)
-
-High-level interface for executing code on a GPU. The `@roc` macro should
-prefix a call, with `func` a callable function or object that should return
-nothing. It will be compiled to a GCN function via `rocfunction` upon first
-use, and to a certain extent arguments will be converted and managed
-automatically using `rocconvert`. Finally, a call to `roccall` is performed,
-scheduling a kernel launch on the specified (or default) HSA queue.
-
-Several keyword arguments are supported that influence the behavior of `@roc`.
-
-Keyword arguments that control general `@roc` behavior:
-- `dynamic::Bool = false`: Use dynamic parallelism to launch as a device-side kernel
-- `launch::Bool = true`: Whether to launch the kernel
-- `wait::Bool = true`: Whether to wait on all arguments' dependencies
-- `mark::Bool = true`: Whether to mark this kernel as a dependency for all arguments
-
-Keyword arguments that affect various parts of `@roc`:
-- `device::ROCDevice = AMDGPU.default_device()`: The device to compile code for, and launch the kernel on.
-- `queue::ROCQueue = AMDGPU.queue(device)`: Which queue to associate the kernel (and its completion signal) with. May also be specified as `stream` for compatibility with CUDA.jl.
-
-Keyword arguments that control kernel compilation via [`rocfunction`](@ref) and [`dynamic_rocfunction`](@ref):
-- `name::Union{String,Nothing} = nothing`: If not `nothing`, the name to use for the generated kernel.
-- `global_hooks::NamedTuple = (;)`: The set of global compiler hooks to use to initialize memory accessed by the kernel. See `AMDGPU.Compiler.default_global_hooks` for an example of how to implement these.
-
-Keyword arguments that control signal creation via [`AMDGPU.create_event`](@ref):
-- `signal::ROCSignal = ROCSignal()`: The underlying signal object to associate the high-level `ROCKernelSignal` with.
-- `soft::Bool = true`: Whether to use the "soft" busy-poll waiter algorithm. If `false`, uses HSA's built-in blocking wait.
-- `minlat::Float64 = 0.000001`: The minimum latency allowed on the first wait cycle. Specifically, if the kernel completes in less than this amount of time, then the observed latency from kernel launch to return from `wait` is this value, in seconds.
-- `timeout::Union{Float64, Nothing} = nothing`: How long to wait for the signal to complete before throwing an `AMDGPU.Runtime.SignalTimeoutException`, in seconds. If `nothing`, then timeouts are disabled and the `wait` call may hang forever if the kernel never completes.
-
-Keyword arguments that control kernel creation via [`AMDGPU.create_kernel`](@ref):
-- `localmem::Int = 0`: The amount of dynamic local memory to allocate for the kernel. This value is separate from the amount of static local memory required by the kernel (as reported by the compiler).
-
-Keyword arguments that control kernel launch via [`AMDGPU.HostKernel`](@ref) and [`AMDGPU.DeviceKernel`](@ref):
-- `groupsize::Union{Tuple,Integer} = 1`: The size of the groups to execute over the grid. If an `Integer` or `Tuple{<:Integer}`, only activate the X dimension of the group. If `Tuple{<:Integer,<:Integer}`, activate the X and Y dimensions of the group. If `Tuple{<:Integer,<:Integer,<:Integer}`, activate the X, Y, and Z dimensions of the group. All sizes must be greater than 0.
-- `gridsize::Union{Tuple,Integer} = 1`: The size of the grid to execute the kernel over. If an `Integer` or `Tuple{<:Integer}`, only activate the X dimension of the grid. If `Tuple{<:Integer,<:Integer}`, activate the X and Y dimensions of the grid. If `Tuple{<:Integer,<:Integer,<:Integer}`, activate the X, Y, and Z dimensions of the grid. All sizes must be greater than 0.
-- `threads::Union{Tuple,Integer}` - Alias for `groupsize`, for compatibility with CUDA.jl.
-- `blocks::Union{Tuple,Integer}` - How many groups to execute across the grid. Potentially a more convenient way to specify groupsize, and intended for compatibility with CUDA.jl.
-
-The underlying operations (argument conversion, kernel compilation, kernel call) can be
-performed explicitly when more control is needed, e.g. to reflect on the resource usage of a
-kernel to determine the launch configuration. A host-side kernel launch is done as follows:
-
-    args = ...
-    GC.@preserve args begin
-        kernel_f = rocconvert(f)
-        kernel_args = rocconvert.(args)
-        kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
-        kernel = rocfunction(kernel_f, kernel_tt; compilation_kwargs)
-        kernel(kernel_args...; launch_kwargs)
-    end
-
-A device-side launch, aka. dynamic parallelism, is similar but more restricted:
+const MACRO_KWARGS = [:dynamic, :launch]
+const COMPILER_KWARGS = [:name]
+const LAUNCH_KWARGS = [:gridsize, :groupsize, :shmem, :stream]
 
-    args = ...
-    # GC.@preserve is not supported
-    # we're on the device already, so no need to rocconvert
-    kernel_tt = Tuple{Core.Typeof(args[1]), ...}    # this needs to be fully inferred!
-    kernel = dynamic_rocfunction(f, kernel_tt)       # no compiler kwargs supported
-    kernel(args...; launch_kwargs)
-"""
 macro roc(ex...)
     # destructure the `@roc` expression
     call = ex[end]
@@ -549,29 +205,28 @@ macro roc(ex...)
     args = call.args[2:end]
 
     code = quote end
-    device_kwargs, macro_kwargs, compiler_kwargs, call_kwargs, signal_kwargs, kernel_kwargs = split_kwargs(kwargs)
-    simplify_call_kwargs!(call_kwargs)
     vars, var_exprs = assign_args!(code, args)
 
-    # handle keyword arguments that influence the macro's behavior
-    dynamic = false
+    macro_kwargs, compiler_kwargs, launch_kwargs, other_kwargs =
+        split_kwargs(kwargs, MACRO_KWARGS, COMPILER_KWARGS, LAUNCH_KWARGS)
+    if !isempty(other_kwargs)
+        key, val = first(other_kwargs).args
+        throw(ArgumentError("Unsupported keyword argument: `$key`."))
+    end
+
+    dynamic = false # TODO unsupported for now
     launch = true
-    wait = true
-    mark = true
     for kwarg in macro_kwargs
-        key,val = kwarg.args
+        key, val = kwarg.args
         if key == :dynamic
-            isa(val, Bool) || throw(ArgumentError("`dynamic` keyword argument to @roc should be a constant Bool"))
+            isa(val, Bool) || throw(ArgumentError(
+                "`dynamic` keyword argument to @roc should be a constant Bool"))
             dynamic = val::Bool
+            @assert false "`dynamic` kernel launch is not yet implemented"
         elseif key == :launch
-            isa(val, Bool) || throw(ArgumentError("`launch` keyword argument to @roc should be a constant Bool"))
+            isa(val, Bool) || throw(ArgumentError(
+                "`launch` keyword argument to @roc should be a constant Bool"))
             launch = val::Bool
-        elseif key == :wait
-            isa(val, Bool) || throw(ArgumentError("`wait` keyword argument to @roc should be a constant Bool"))
-            wait = val::Bool
-        elseif key == :mark
-            isa(val, Bool) || throw(ArgumentError("`mark` keyword argument to @roc should be a constant Bool"))
-            mark = val::Bool
         else
             throw(ArgumentError("Unsupported keyword argument '$key'"))
         end
@@ -579,62 +234,30 @@ macro roc(ex...)
 
     # FIXME: macro hygiene wrt. escaping kwarg values (this broke with 1.5)
     #        we esc() the whole thing now, necessitating gensyms...
-    @gensym kernel_f kernel_args kernel_tt kernel kernel_instance device queue signal
-    if dynamic
-        # FIXME: we could probably somehow support kwargs with constant values by either
-        #        saving them in a global Dict here, or trying to pick them up from the Julia
-        #        IR when processing the dynamic parallelism marker
-        isempty(compiler_kwargs) || error("@roc dynamic parallelism does not support compiler keyword arguments")
-
-        # dynamic, device-side kernel launch
-        push!(code.args,
-            quote
-                # we're in kernel land already, so no need to rocconvert arguments
-                local $kernel_tt = Tuple{$((:(Core.Typeof($var)) for var in var_exprs)...)}
-                local $kernel = $dynamic_rocfunction($f, $kernel_tt)
-                $kernel($(var_exprs...); $(call_kwargs...))
-            end)
-    else
-        # regular, host-side kernel launch
-        #
-        # convert the function, its arguments, call the compiler and launch the kernel
-        # while keeping the original arguments alive
-        push!(code.args,
-            quote
-                GC.@preserve $(vars...) begin
-                    local $kernel_f = $rocconvert($f)
-                    local $kernel_args = map($rocconvert, ($(var_exprs...),))
-                    local $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
-                    local $kernel = $rocfunction(
-                        $kernel_f, $kernel_tt;
-                        $(device_kwargs...), $(compiler_kwargs...))
-
-                    if $launch
-                        if $wait
-                            foreach($wait!, ($(var_exprs...),))
-                        end
-                        local $kernel_instance = $create_kernel($kernel; $(kernel_kwargs...))
-                        local $signal = $create_event(
-                            $kernel_instance; $(device_kwargs...), $(signal_kwargs...))
-                        $kernel($kernel_args...; signal=$signal, $(call_kwargs...))
-                        if $mark
-                            foreach(x->$mark!(x, $signal), ($(var_exprs...),))
-                        end
-                        $signal
-                    else
-                        $kernel
-                    end
+    @gensym kernel_f kernel_args kernel_tt kernel
+    push!(code.args,
+        quote
+            GC.@preserve $(vars...) begin
+                local $kernel_f = $rocconvert($f)
+                local $kernel_args = map($rocconvert, ($(var_exprs...),))
+                local $kernel_tt = Tuple{map(Core.Typeof, $kernel_args)...}
+                local $kernel = $hipfunction($kernel_f, $kernel_tt; $(compiler_kwargs...))
+
+                if $launch
+                    $kernel($(var_exprs...); $(launch_kwargs...))
                 end
-            end)
+                $kernel
+            end
+        end)
+    return esc(quote
+        let
+            $code
         end
-    return esc(code)
+    end)
 end
 
-# launch config
-
-launch_configuration(kern::Runtime.HostKernel; kwargs...) =
-    launch_configuration(kern.fun)
-function launch_configuration(fun::Runtime.ROCFunction; input_block_size=1, localmem=0)
-    occ = Compiler.calculate_occupancy(fun; input_block_size, localmem)
-    return (;groupsize=occ.best_block_size)
+function launch_configuration(
+    kern::Runtime.HIPKernel; shmem::Integer = 0, max_block_size::Integer = 0,
+)
+    HIP.launch_configuration(kern.fun; shmem, max_block_size)
 end
diff --git a/src/hip/HIP.jl b/src/hip/HIP.jl
index e80646571..978a77064 100644
--- a/src/hip/HIP.jl
+++ b/src/hip/HIP.jl
@@ -1,50 +1,14 @@
 module HIP
 
+using CEnum
+
 import ..AMDGPU
 import ..AMDGPU.libhip
-using CEnum
 
 include("libhip_common.jl")
 include("error.jl")
 include("libhip.jl")
-
-struct HIPDevice
-    device::hipDevice_t
-    device_id::Cint
-end
-function HIPDevice(device_id::Integer)
-    device_ref = Ref{hipDevice_t}()
-    hipDeviceGet(device_ref, Cint(device_id-1)) |> check
-    return HIPDevice(device_ref[], device_id)
-end
-Base.unsafe_convert(::Type{Ptr{T}}, device::HIPDevice) where T =
-    reinterpret(Ptr{T}, device.device)
-function name(device::HIPDevice)
-    name_vec = zeros(Cuchar, 64)
-    hipDeviceGetName(pointer(name_vec), Cint(64), device.device) |> check
-    return String(name_vec)
-end
-function Base.show(io::IO, device::HIPDevice)
-    print(io, "HIPDevice(name=\"$(name(device))\", id=$(device.device_id))")
-end
-
-function device()
-    device_id_ref = Ref{Cint}()
-    hipGetDevice(device_id_ref) |> check
-    return HIPDevice(device_id_ref[]+1)
-end
-device!(device::HIPDevice) = hipSetDevice(device.device_id-Int32(1)) |> check
-device!(device_id::Integer) = hipSetDevice(Cint(device_id-1)) |> check
-function device!(f::Base.Callable, device::HIPDevice)
-    old_device_id_ref = Ref{Cint}()
-    hipGetDevice(old_device_id_ref) |> check
-    device!(device)
-    try
-        f()
-    finally
-        device!(old_device_id_ref[]+1)
-    end
-end
+include("device.jl")
 
 mutable struct HIPContext
     context::hipContext_t
@@ -84,73 +48,19 @@ function context!(f::Base.Callable, context::HIPContext)
     end
 end
 
-mutable struct HIPStream
-    stream::hipStream_t
-    priority::Symbol
-    device::HIPDevice
-end
-
-"""
-    HIPStream(priority::Symbol = :normal)
-
-# Arguments:
-
-- `priority::Symbol`: Priority of the stream: `:normal`, `:high` or `:low`.
-
-Create HIPStream with given priority.
-Device is the default device that's currently in use.
-"""
-function HIPStream(priority::Symbol = :normal)
-    priority_int = symbol_to_priority(priority)
-
-    stream_ref = Ref{hipStream_t}()
-    hipStreamCreateWithPriority(stream_ref, Cuint(0), priority_int) |> check
-    stream = HIPStream(stream_ref[], priority, device())
-    finalizer(stream) do s
-        hipStreamSynchronize(s.stream) |> check
-        hipStreamDestroy(s.stream) |> check
-    end
-    return stream
-end
-
-"""
-    HIPStream(stream::hipStream_t)
-
-Create HIPStream from `hipStream_t` handle.
-Device is the default device that's currently in use.
-"""
-HIPStream(stream::hipStream_t) = HIPStream(stream, priority(stream), device())
+include("stream.jl")
+include("event.jl")
+include("pool.jl")
 
-Base.unsafe_convert(::Type{Ptr{T}}, stream::HIPStream) where T =
-    reinterpret(Ptr{T}, stream.stream)
-function Base.show(io::IO, stream::HIPStream)
-    print(io, "HIPStream(device=$(stream.device), ptr=$(repr(UInt64(stream.stream))), priority=$(stream.priority))")
-end
-
-function priority_to_symbol(priority)
-    priority ==  0 && return :normal
-    priority == -1 && return :high
-    priority ==  1 && return :low
-    throw(ArgumentError("""
-    Invalid HIP priority: $priority.
-    Valid values are: 0, -1, 1.
-    """))
-end
+include("module.jl")
 
-function symbol_to_priority(priority::Symbol)
-    priority == :normal && return Cint(0)
-    priority == :high && return Cint(-1)
-    priority == :low && return Cint(1)
-    throw(ArgumentError("""
-    Invalid HIP priority symbol: $priority.
-    Valid values are: `:normal`, `:low`, `:high`.
-    """))
+function device_synchronize()
+    hipDeviceSynchronize() |> check
 end
 
-function priority(stream::hipStream_t)
-    priority = Ref{Cint}()
-    hipStreamGetPriority(stream, priority) |> check
-    priority_to_symbol(priority[])
+function reclaim(bytes_to_keep::Integer = 0)
+    device_synchronize()
+    trim(memory_pool(device()), bytes_to_keep)
 end
 
 end
diff --git a/src/hip/device.jl b/src/hip/device.jl
new file mode 100644
index 000000000..094dbc21d
--- /dev/null
+++ b/src/hip/device.jl
@@ -0,0 +1,99 @@
+struct HIPDevice
+    device::hipDevice_t
+    device_id::Cint
+end
+
+function HIPDevice(device_id::Integer)
+    device_ref = Ref{hipDevice_t}()
+    hipDeviceGet(device_ref, device_id - 1) |> check
+    return HIPDevice(device_ref[], device_id)
+end
+
+device_id(d::HIPDevice) = d.device_id - 1
+
+function stack_size()
+    value = Ref{Csize_t}()
+    hipDeviceGetLimit(value, hipLimitStackSize) |> check
+    value[]
+end
+
+function stack_size!(value::Integer)
+    hipDeviceSetLimit(hipLimitStackSize, value) |> check
+end
+
+# TODO heap_size tweaking available since 5.5
+
+function heap_size()
+    value = Ref{Csize_t}()
+    hipDeviceGetLimit(value, hipLimitMallocHeapSize) |> check
+    value[]
+end
+
+function heap_size!(value::Integer)
+    hipDeviceSetLimit(hipLimitMallocHeapSize, value) |> check
+end
+
+Base.hash(dev::HIPDevice, h::UInt) = hash(dev.device, h)
+
+Base.unsafe_convert(::Type{Ptr{T}}, device::HIPDevice) where T =
+    reinterpret(Ptr{T}, device.device)
+
+function name(dev::HIPDevice)
+    name_vec = zeros(Cuchar, 64)
+    hipDeviceGetName(pointer(name_vec), Cint(64), dev.device) |> check
+    name_vec[1] == Cuchar(0) || return strip(String(name_vec), '\0')
+
+    # Fallback to HSA device name if HIP failed to report.
+    AMDGPU.Runtime.hsa_device(dev).name
+end
+
+function properties(dev::HIPDevice)
+    init_arch_ref = Ref(hipDeviceArch_t())
+    arch_field_id = findfirst(i -> i == :arch, fieldnames(hipDeviceProp_t))
+    arch_offset = fieldoffset(hipDeviceProp_t, arch_field_id)
+
+    props_ref = Ref{hipDeviceProp_t}()
+    ccall(:memset,
+        Cvoid, (Ptr{Cvoid}, Cint, Csize_t),
+        props_ref, 0, sizeof(hipDeviceProp_t))
+    ccall(:memcpy,
+        Cvoid, (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t),
+        Base.unsafe_convert(Ptr{Cvoid}, props_ref) + arch_offset,
+        init_arch_ref, sizeof(hipDeviceArch_t))
+
+    hipGetDeviceProperties(props_ref, device_id(dev)) |> check
+    props_ref[]
+end
+
+function Base.show(io::IO, device::HIPDevice)
+    print(io, "HIPDevice(name=\"$(name(device))\", id=$(device.device_id))")
+end
+
+function ndevices()
+    count_ref = Ref{Cint}()
+    hipGetDeviceCount(count_ref) |> check
+    count_ref[]
+end
+
+devices() = [HIPDevice(i) for i in 1:ndevices()]
+
+function device()
+    device_id_ref = Ref{Cint}()
+    hipGetDevice(device_id_ref) |> check
+    return HIPDevice(device_id_ref[]+1)
+end
+
+device!(device::HIPDevice) = hipSetDevice(device_id(device)) |> check
+
+device!(device_id::Integer) = hipSetDevice(device_id - 1) |> check
+
+function device!(f::Base.Callable, device::HIPDevice)
+    old_device_id_ref = Ref{Cint}()
+    hipGetDevice(old_device_id_ref) |> check
+    device!(device)
+    try
+        f()
+    finally
+        device!(old_device_id_ref[] + 1)
+    end
+end
diff --git a/src/hip/event.jl b/src/hip/event.jl
new file mode 100644
index 000000000..b15832b5a
--- /dev/null
+++ b/src/hip/event.jl
@@ -0,0 +1,64 @@
+mutable struct HIPEvent
+    handle::hipEvent_t
+    stream::hipStream_t
+end
+
+Base.:(==)(a::HIPEvent, b::HIPEvent) = a.handle == b.handle
+
+Base.unsafe_convert(::Type{Ptr{T}}, event::HIPEvent) where T =
+    reinterpret(Ptr{T}, event.handle)
+
+function record(event::HIPEvent)
+    hipEventRecord(event.handle, event.stream) |> check
+    return event
+end
+
+function isdone(event::HIPEvent)
+    query = hipEventQuery(event)
+    if query == hipSuccess
+        return true
+    elseif query == hipErrorNotReady
+        return false
+    else
+        throw(HIPError(query))
+    end
+end
+
+function non_blocking_synchronize(event::HIPEvent)
+    isdone(event) && return true
+
+    # spin (initially without yielding to minimize latency)
+    spins = 0
+    while spins < 256
+        if spins < 32
+            ccall(:jl_cpu_pause, Cvoid, ())
+            # Temporary solution before we have gc transition support in codegen.
+            ccall(:jl_gc_safepoint, Cvoid, ())
+        else
+            yield()
+        end
+        isdone(event) && return true
+        spins += 1
+    end
+    return false
+end
+
+wait(event::HIPEvent) = hipEventSynchronize(event) |> check
+
+function synchronize(event::HIPEvent)
+    non_blocking_synchronize(event) || wait(event)
+    return
+end
+
+function HIPEvent(stream::hipStream_t; do_record::Bool = true)
+    event_ref = Ref{hipEvent_t}()
+    hipEventCreateWithFlags(event_ref, hipEventDisableTiming) |> check
+    event = HIPEvent(event_ref[], stream)
+    do_record && record(event)
+
+    finalizer(event) do e
+        hipEventDestroy(e) |> check
+    end
+    event
+end
+HIPEvent(stream::HIPStream; do_record::Bool = true) = HIPEvent(stream.stream; do_record)
diff --git a/src/hip/libhip.jl b/src/hip/libhip.jl
index d26cabc30..7fcc9acaf 100644
--- a/src/hip/libhip.jl
+++ b/src/hip/libhip.jl
@@ -1,65 +1,102 @@
-function hipInit(flags::Cint)
+function hipInit(flags)
     ccall((:hipInit, libhip), hipError_t, (Cint,), flags)
 end
 
-function hipDeviceGet(device_ref::Ref{hipDevice_t}, device_id::Cint)
+function hipDeviceGet(dev, device_id)
     ccall((:hipDeviceGet, libhip), hipError_t,
-          (Ptr{hipDevice_t}, Cint), device_ref, device_id)
+        (Ptr{hipDevice_t}, Cint), dev, device_id)
 end
 
-function hipCtxCreate(ctx_ref::Ref{hipContext_t}, flags::Cuint, device::hipDevice_t)
+function hipCtxCreate(ctx_ref, flags, device)
     ccall((:hipCtxCreate, libhip), hipError_t,
-          (Ptr{hipContext_t}, Cuint, hipDevice_t),
-          ctx_ref, flags, device)
+        (Ptr{hipContext_t}, Cuint, hipDevice_t),
+        ctx_ref, flags, device)
 end
 
-function hipCtxDestroy(ctx::hipContext_t)
+function hipCtxDestroy(ctx)
     ccall((:hipCtxDestroy, libhip), hipError_t, (hipContext_t,), ctx)
 end
 
-function hipCtxSetCurrent(ctx::hipContext_t)
+function hipCtxSetCurrent(ctx)
     ccall((:hipCtxSetCurrent, libhip), hipError_t, (hipContext_t,), ctx)
 end
 
-function hipCtxGetCurrent(ctx_ref::Ref{hipContext_t})
+function hipCtxGetCurrent(ctx_ref)
     ccall((:hipCtxGetCurrent, libhip), hipError_t, (Ptr{hipContext_t},), ctx_ref)
 end
 
-function hipGetDevice(device_id_ref::Ref{Cint})
+function hipGetDevice(device_id_ref)
     ccall((:hipGetDevice, libhip), hipError_t, (Ptr{Cint},), device_id_ref)
 end
 
-function hipSetDevice(device_id::Cint)
+function hipSetDevice(device_id)
     ccall((:hipSetDevice, libhip), hipError_t, (Cint,), device_id)
 end
 
-function hipDeviceGetCount(count_ref::Ref{Cint})
-    ccall((:hipDeviceGetCount, libhip), hipError_t, (Ptr{Cint},), count_ref)
+function hipGetDeviceCount(count_ref)
+    ccall((:hipGetDeviceCount, libhip), hipError_t, (Ptr{Cint},), count_ref)
 end
 
-function hipDeviceGetName(name::Ptr{Cuchar}, len::Cint, device::hipDevice_t)
+function hipGetDeviceProperties(prop, dev_id)
+    ccall((:hipGetDeviceProperties, libhip), hipError_t,
+        (Ptr{hipDeviceProp_t}, Cint), prop, dev_id)
+end
+
+function hipDeviceGetName(name, len, device)
     ccall((:hipDeviceGetName, libhip), hipError_t,
           (Ptr{Cuchar}, Cint, hipDevice_t), name, len, device)
 end
 
-function hipStreamCreateWithPriority(stream_ref::Ref{hipStream_t}, flags::Cuint, priority::Cint)
+function hipDeviceGetAttribute(val, attribute, device_id)
+    ccall((:hipDeviceGetAttribute, libhip), hipError_t,
+        (Ptr{Cint}, hipDeviceAttribute_t, Cint), val, attribute, device_id)
+end
+
+function hipEventCreate(event_ref)
+    ccall((:hipEventCreate, libhip), hipError_t, (Ptr{hipEvent_t},), event_ref)
+end
+
+function hipEventCreateWithFlags(event_ref, flags)
+    ccall((:hipEventCreateWithFlags, libhip), hipError_t,
+        (Ptr{hipEvent_t}, Cuint), event_ref, flags)
+end
+
+function hipEventDestroy(event)
+    ccall((:hipEventDestroy, libhip), hipError_t, (hipEvent_t,), event)
+end
+
+function hipEventRecord(event, stream)
+    ccall((:hipEventRecord, libhip), hipError_t,
+        (hipEvent_t, hipStream_t), event, stream)
+end
+
+function hipEventQuery(event)
+    ccall((:hipEventQuery, libhip), hipError_t, (hipEvent_t,), event)
+end
+
+function hipEventSynchronize(event)
+    ccall((:hipEventSynchronize, libhip), hipError_t, (hipEvent_t,), event)
+end
+
+function hipStreamCreateWithPriority(stream_ref, flags, priority)
     ccall((:hipStreamCreateWithPriority, libhip), hipError_t,
-          (Ptr{hipStream_t}, Cuint, Cint), stream_ref, flags, priority)
+        (Ptr{hipStream_t}, Cuint, Cint), stream_ref, flags, priority)
 end
 
-function hipStreamGetPriority(stream::hipStream_t, priority::Ref{Cint})
-    ccall((:hipStreamGetPriority, libhip), hipError_t, (hipStream_t, Ptr{Cint}), stream, priority)
+function hipStreamGetPriority(stream, priority)
+    ccall((:hipStreamGetPriority, libhip), hipError_t,
+        (hipStream_t, Ptr{Cint}), stream, priority)
 end
 
-function hipStreamSynchronize(stream::hipStream_t)
+function hipStreamSynchronize(stream)
     ccall((:hipStreamSynchronize, libhip), hipError_t, (hipStream_t,), stream)
 end
 
-function hipStreamDestroy(stream::hipStream_t)
+function hipStreamDestroy(stream)
     ccall((:hipStreamDestroy, libhip), hipError_t, (hipStream_t,), stream)
 end
 
-function hipStreamQuery(stream::hipStream_t)
+function hipStreamQuery(stream)
     ccall((:hipStreamQuery, libhip), hipError_t, (hipStream_t,), stream)
 end
 
@@ -74,3 +111,169 @@ end
 function hipFree(ptr::Ptr{Cvoid})
     ccall((:hipFree, libhip), hipError_t, (Ptr{Cvoid},), ptr)
 end
+
+function hipHostMalloc(ptr, sz, flags)
+    ccall((:hipHostMalloc, libhip), hipError_t,
+        (Ptr{Ptr{Cvoid}}, Csize_t, Cuint), ptr, sz, flags)
+end
+
+function hipHostFree(ptr)
+    ccall((:hipHostFree, libhip), hipError_t, (Ptr{Cvoid},), ptr)
+end
+
+function hipHostRegister(hostPtr, sizeBytes, flags)
+    ccall((:hipHostRegister, libhip), hipError_t,
+        (Ptr{Cvoid}, Csize_t, Cuint),
+        hostPtr, sizeBytes, flags)
+end
+
+function hipHostUnregister(hostPtr)
+    ccall((:hipHostUnregister, libhip), hipError_t, (Ptr{Cvoid},), hostPtr)
+end
+
+function hipHostGetDevicePointer(devPtr, hstPtr, flags)
+    ccall((:hipHostGetDevicePointer, libhip), hipError_t,
+        (Ptr{Ptr{Cvoid}}, Ptr{Cvoid}, Cuint), devPtr, hstPtr, flags)
+end
+
+function hipMallocAsync(ptr, sz, stream)
+    ccall((:hipMallocAsync, libhip), hipError_t,
+        (Ptr{Ptr{Cvoid}}, Csize_t, hipStream_t),
+        ptr, sz, stream)
+end
+
+function hipFreeAsync(ptr, stream)
+    ccall((:hipFreeAsync, libhip), hipError_t,
+        (Ptr{Cvoid}, hipStream_t), ptr, stream)
+end
+
+function hipMemcpyHtoDAsync(dst, src, sz, stream)
+    ccall((:hipMemcpyHtoDAsync, libhip), hipError_t,
+        (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t, hipStream_t),
+        dst, src, sz, stream)
+end
+
+function hipMemcpyDtoHAsync(dst, src, sz, stream)
+    ccall((:hipMemcpyDtoHAsync, libhip), hipError_t,
+        (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t, hipStream_t),
+        dst, src, sz, stream)
+end
+
+function hipMemcpyDtoDAsync(dst, src, sz, stream)
+    ccall((:hipMemcpyDtoDAsync, libhip), hipError_t,
+        (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t, hipStream_t),
+        dst, src, sz, stream)
+end
+
+function hipMemGetInfo(free, total)
+    ccall((:hipMemGetInfo, libhip), hipError_t,
+        (Ptr{Csize_t}, Ptr{Csize_t}), free, total)
+end
+
+function hipDeviceGetDefaultMemPool(pool, device_id)
+    ccall((:hipDeviceGetDefaultMemPool, libhip), hipError_t,
+        (Ptr{hipMemPool_t}, Cint), pool, device_id)
+end
+
+function hipDeviceGetMemPool(pool, device_id)
+    ccall((:hipDeviceGetMemPool, libhip), hipError_t,
+        (Ptr{hipMemPool_t}, Cint), pool, device_id)
+end
+
+function hipDeviceSetMemPool(device_id, pool)
+    ccall((:hipDeviceSetMemPool, libhip), hipError_t,
+        (Cint, hipMemPool_t), device_id, pool)
+end
+
+function hipMemPoolTrimTo(pool, min_bytes_to_hold)
+    ccall((:hipMemPoolTrimTo, libhip), hipError_t,
+        (hipMemPool_t, Csize_t), pool, min_bytes_to_hold)
+end
+
+function hipMemPoolSetAttribute(pool, attr, value)
+    ccall((:hipMemPoolSetAttribute, libhip), hipError_t,
+        (hipMemPool_t, hipMemPoolAttr, Ptr{Cvoid}), pool, attr, value)
+end
+
+function hipMemPoolGetAttribute(pool, attr, value)
+    ccall((:hipMemPoolGetAttribute, libhip), hipError_t,
+        (hipMemPool_t, hipMemPoolAttr, Ptr{Cvoid}), pool, attr, value)
+end
+
+function hipMemPoolCreate(pool, props)
+    ccall((:hipMemPoolCreate, libhip), hipError_t,
+        (Ptr{hipMemPool_t}, Ptr{hipMemPoolProps}), pool, props)
+end
+
+function hipMemPoolDestroy(pool)
+    ccall((:hipMemPoolDestroy, libhip), hipError_t, (hipMemPool_t,), pool)
+end
+
+function hipDeviceGetLimit(value, limit)
+    ccall((:hipDeviceGetLimit, libhip), hipError_t,
+        (Ptr{Csize_t}, hipLimit_t), value, limit)
+end
+
+function hipDeviceSetLimit(limit, value)
+    ccall((:hipDeviceSetLimit, libhip), hipError_t,
+        (hipLimit_t, Csize_t), limit, value)
+end
+
+function hiprtcLinkCreate(n_options, option_ptr, option_vals_pptr, hip_link_state_ptr)
+    ccall((:hiprtcLinkCreate, libhip), hiprtcResult,
+        (Cuint, Ptr{hiprtcJIT_option}, Ptr{Ptr{Cvoid}}, Ptr{hiprtcLinkState}),
+        n_options, option_ptr, option_vals_pptr, hip_link_state_ptr)
+end
+
+function hiprtcLinkAddFile(
+    hip_link_state, input_type, file_path,
+    num_options, options_ptr, option_vals_pptr,
+)
+    ccall((:hiprtcLinkAddFile, libhip), hiprtcResult,
+        (hiprtcLinkState, hiprtcJITInputType, Ptr{Cchar}, Cuint, Ptr{hiprtcJIT_option}, Ptr{Ptr{Cvoid}}),
+        hip_link_state, input_type, file_path, num_options, options_ptr, option_vals_pptr)
+end
+
+function hiprtcLinkComplete(hip_link_state, bin_out, size_out)
+    ccall((:hiprtcLinkComplete, libhip), hiprtcResult,
+        (hiprtcLinkState, Ptr{Ptr{Cvoid}}, Ptr{Csize_t}),
+        hip_link_state, bin_out, size_out)
+end
+
+function hipModuleLoad(mod, fname)
+    ccall((:hipModuleLoad, libhip), hipError_t,
+        (Ptr{hipModule_t}, Ptr{Cchar}), mod, fname)
+end
+
+function hipModuleLoadData(mod, img)
+    ccall((:hipModuleLoadData, libhip), hipError_t,
+        (Ptr{hipModule_t}, Ptr{Cvoid}), mod, img)
+end
+
+function hipModuleGetFunction(func, mod, name)
+    ccall((:hipModuleGetFunction, libhip), hipError_t,
+        (Ptr{hipFunction_t}, hipModule_t, Ptr{Cchar}), func, mod, name)
+end
+
+function hipModuleUnload(mod)
+    ccall((:hipModuleUnload, libhip), hipError_t, (hipModule_t,), mod)
+end
+
+function hipModuleLaunchKernel(
+    func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
+    sharedMemBytes, stream, kernelParams, extra,
+)
+    ccall((:hipModuleLaunchKernel, libhip), hipError_t,
+        (hipFunction_t, Cuint, Cuint, Cuint, Cuint, Cuint, Cuint,
+        Cuint, hipStream_t, Ptr{Ptr{Cvoid}}, Ptr{Ptr{Cvoid}}),
+        func, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
+        sharedMemBytes, stream, kernelParams, extra)
+end
+
+function hipModuleOccupancyMaxPotentialBlockSize(
+    gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit,
+)
+    ccall((:hipModuleOccupancyMaxPotentialBlockSize, libhip), hipError_t,
+        (Ptr{Cint}, Ptr{Cint}, hipFunction_t, Csize_t, Cint),
+        gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)
+end
diff --git a/src/hip/libhip_common.jl b/src/hip/libhip_common.jl
index 58031e072..baa4a1904 100644
--- a/src/hip/libhip_common.jl
+++ b/src/hip/libhip_common.jl
@@ -1,13 +1,129 @@
+const HIP_LAUNCH_PARAM_BUFFER_POINTER = Ptr{Cvoid}(1)
+const HIP_LAUNCH_PARAM_BUFFER_SIZE = Ptr{Cvoid}(2)
+const HIP_LAUNCH_PARAM_END = Ptr{Cvoid}(3)
+
+@cenum hiprtcResult::UInt32 begin
+    HIPRTC_SUCCESS = 0
+    HIPRTC_ERROR_OUT_OF_MEMORY = 1
+    HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
+    HIPRTC_ERROR_INVALID_INPUT = 3
+    HIPRTC_ERROR_INVALID_PROGRAM = 4
+    HIPRTC_ERROR_INVALID_OPTION = 5
+    HIPRTC_ERROR_COMPILATION = 6
+    HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
+    HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
+    HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
+    HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
+    HIPRTC_ERROR_INTERNAL_ERROR = 11
+    HIPRTC_ERROR_LINKING = 100
+end
+
+@cenum hiprtcJIT_option::UInt32 begin
+    HIPRTC_JIT_MAX_REGISTERS = 0
+    HIPRTC_JIT_THREADS_PER_BLOCK
+    HIPRTC_JIT_WALL_TIME
+    HIPRTC_JIT_INFO_LOG_BUFFER
+    HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES
+    HIPRTC_JIT_ERROR_LOG_BUFFER
+    HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+    HIPRTC_JIT_OPTIMIZATION_LEVEL
+    HIPRTC_JIT_TARGET_FROM_HIPCONTEXT
+    HIPRTC_JIT_TARGET
+    HIPRTC_JIT_FALLBACK_STRATEGY
+    HIPRTC_JIT_GENERATE_DEBUG_INFO
+    HIPRTC_JIT_LOG_VERBOSE
+    HIPRTC_JIT_GENERATE_LINE_INFO
+    HIPRTC_JIT_CACHE_MODE
+    HIPRTC_JIT_NEW_SM3X_OPT
+    HIPRTC_JIT_FAST_COMPILE
+    HIPRTC_JIT_GLOBAL_SYMBOL_NAMES
+    HIPRTC_JIT_GLOBAL_SYMBOL_ADDRESS
+    HIPRTC_JIT_GLOBAL_SYMBOL_COUNT
+    HIPRTC_JIT_LTO
+    HIPRTC_JIT_FTZ
+    HIPRTC_JIT_PREC_DIV
+    HIPRTC_JIT_PREC_SQRT
+    HIPRTC_JIT_FMA
+    HIPRTC_JIT_NUM_OPTIONS
+end
+
+@cenum hiprtcJITInputType::UInt32 begin
+    HIPRTC_JIT_INPUT_CUBIN = 0
+    HIPRTC_JIT_INPUT_PTX
+    HIPRTC_JIT_INPUT_FATBINARY
+    HIPRTC_JIT_INPUT_OBJECT
+    HIPRTC_JIT_INPUT_LIBRARY
+    HIPRTC_JIT_INPUT_NVVM
+    HIPRTC_JIT_NUM_LEGACY_INPUT_TYPES
+    HIPRTC_JIT_INPUT_LLVM_BITCODE = 100
+    HIPRTC_JIT_INPUT_LLVM_BUNDLED_BITCODE = 101
+    HIPRTC_JIT_INPUT_LLVM_ARCHIVES_OF_BUNDLED_BITCODE = 102
+end
+
+@cenum hipLimit_t::UInt32 begin
+    hipLimitStackSize = 0
+    hipLimitPrintfFifoSize = 1
+    hipLimitMallocHeapSize = 2
+end
+
+@cenum hipMemAllocationHandleType::UInt32 begin
+    hipMemHandleTypeNone = 0
+    hipMemHandleTypePosixFileDescriptor = 1
+    hipMemHandleTypeWin32 = 2
+    hipMemHandleTypeWin32Kmt = 4
+end
+
+@cenum hipMemAllocationType::UInt32 begin
+    hipMemAllocationTypeInvalid = 0
+    hipMemAllocationTypePinned = 1
+    hipMemAllocationTypeMax = 2147483647
+end
+
+@cenum hipMemLocationType::UInt32 begin
+    hipMemLocationTypeInvalid = 0
+    hipMemLocationTypeDevice = 1
+end
+
+@cenum hipMemPoolAttr::UInt32 begin
+    hipMemPoolReuseFollowEventDependencies = 1
+    hipMemPoolReuseAllowOpportunistic = 2
+    hipMemPoolReuseAllowInternalDependencies = 3
+    hipMemPoolAttrReleaseThreshold = 4
+    hipMemPoolAttrReservedMemCurrent = 5
+    hipMemPoolAttrReservedMemHigh = 6
+    hipMemPoolAttrUsedMemCurrent = 7
+    hipMemPoolAttrUsedMemHigh = 8
+end
+
+@cenum hipEventFlag_t::Cuint begin
+    hipEventDefault = 0
+    hipEventDisableTiming = 2
+    hipEventInterprocess = 4
+end
+
 @cenum hipError_t::UInt32 begin
     hipSuccess = 0
+    hipErrorInvalidValue = 1
     hipErrorOutOfMemory = 2
+    hipErrorMemoryAllocation = 2
     hipErrorNotInitialized = 3
+    hipErrorInitializationError = 3
     hipErrorDeinitialized = 4
     hipErrorProfilerDisabled = 5
     hipErrorProfilerNotInitialized = 6
     hipErrorProfilerAlreadyStarted = 7
     hipErrorProfilerAlreadyStopped = 8
+    hipErrorInvalidConfiguration = 9
+    hipErrorInvalidPitchValue = 12
+    hipErrorInvalidSymbol = 13
+    hipErrorInvalidDevicePointer = 17
+    hipErrorInvalidMemcpyDirection = 21
     hipErrorInsufficientDriver = 35
+    hipErrorMissingConfiguration = 52
+    hipErrorPriorLaunchFailure = 53
+    hipErrorInvalidDeviceFunction = 98
+    hipErrorNoDevice = 100
+    hipErrorInvalidDevice = 101
     hipErrorInvalidImage = 200
     hipErrorInvalidContext = 201
     hipErrorContextAlreadyCurrent = 202
@@ -31,42 +147,368 @@
     hipErrorSharedObjectSymbolNotFound = 302
     hipErrorSharedObjectInitFailed = 303
     hipErrorOperatingSystem = 304
-    hipErrorSetOnActiveProcess = 305
     hipErrorInvalidHandle = 400
+    hipErrorIllegalState = 401
     hipErrorNotFound = 500
+    hipErrorNotReady = 600
     hipErrorIllegalAddress = 700
-    hipErrorInvalidSymbol = 701
-    # Runtime Error Codes start here.
-    hipErrorMissingConfiguration = 1001
-    hipErrorMemoryAllocation = 1002
-    hipErrorInitializationError = 1003
-    hipErrorLaunchFailure = 1004
-    hipErrorPriorLaunchFailure = 1005
-    hipErrorLaunchTimeOut = 1006
-    hipErrorLaunchOutOfResources = 1007
-    hipErrorInvalidDeviceFunction = 1008
-    hipErrorInvalidConfiguration = 1009
-    hipErrorInvalidDevice = 1010
-    hipErrorInvalidValue = 1011
-    hipErrorInvalidDevicePointer = 1017
-    hipErrorInvalidMemcpyDirection = 1021
-    hipErrorUnknown = 1030
-    hipErrorInvalidResourceHandle = 1033
-    hipErrorNotReady = 1034
-    hipErrorNoDevice = 1038
-    hipErrorPeerAccessAlreadyEnabled = 1050
-    hipErrorPeerAccessNotEnabled = 1051
+    hipErrorLaunchOutOfResources = 701
+    hipErrorLaunchTimeOut = 702
+    hipErrorPeerAccessAlreadyEnabled = 704
+    hipErrorPeerAccessNotEnabled = 705
+    hipErrorSetOnActiveProcess = 708
+    hipErrorContextIsDestroyed = 709
+    hipErrorAssert = 710
+    hipErrorHostMemoryAlreadyRegistered = 712
+    hipErrorHostMemoryNotRegistered = 713
+    hipErrorLaunchFailure = 719
+    hipErrorCooperativeLaunchTooLarge = 720
+    hipErrorNotSupported = 801
+    hipErrorStreamCaptureUnsupported = 900
+    hipErrorStreamCaptureInvalidated = 901
+    hipErrorStreamCaptureMerge = 902
+    hipErrorStreamCaptureUnmatched = 903
+    hipErrorStreamCaptureUnjoined = 904
+    hipErrorStreamCaptureIsolation = 905
+    hipErrorStreamCaptureImplicit = 906
+    hipErrorCapturedEvent = 907
+    hipErrorStreamCaptureWrongThread = 908
+    hipErrorGraphExecUpdateFailure = 910
+    hipErrorUnknown = 999
+    # HSA Runtime Error Codes start here.
     hipErrorRuntimeMemory = 1052
     hipErrorRuntimeOther = 1053
-    hipErrorHostMemoryAlreadyRegistered = 1061
-    hipErrorHostMemoryNotRegistered = 1062
-    hipErrorMapBufferObjectFailed = 1071
-    hipErrorAssert = 1081
-    hipErrorNotSupported = 1082
     hipErrorTbd
 end
 
 hipContext_t = Ptr{Cvoid}
+
 hipDevice_t = Ptr{Cvoid}
+
 hipStream_t = Ptr{Cvoid}
+
 hipEvent_t = Ptr{Cvoid}
+
+hipMemPool_t = Ptr{Cvoid}
+
+hipModule_t = Ptr{Cvoid}
+
+hipFunction_t = Ptr{Cvoid}
+
+hiprtcLinkState = Ptr{Cvoid}
+
+struct hipMemLocation
+    type::hipMemLocationType
+    id::Cint
+end
+
+struct hipMemPoolProps
+    allocType::hipMemAllocationType
+    handleTypes::hipMemAllocationHandleType
+    location::hipMemLocation
+    win32SecurityAttributes::Ptr{Cvoid}
+    reserved::NTuple{64,Cuchar}
+end
+
+Base.@kwdef struct hipDeviceArch_t
+    hasGlobalInt32Atomics::Cuint = 1
+    hasGlobalFloatAtomicExch::Cuint = 1
+    hasSharedInt32Atomics::Cuint = 1
+    hasSharedFloatAtomicExch::Cuint = 1
+    hasFloatAtomicAdd::Cuint = 1
+
+    hasGlobalInt64Atomics::Cuint = 1
+    hasSharedInt64Atomics::Cuint = 1
+
+    # Doubles
+    hasDoubles::Cuint = 1
+
+    # Warp cross-lane operations
+    hasWarpVote::Cuint = 1
+    hasWarpBallot::Cuint = 1
+    hasWarpShuffle::Cuint = 1
+    hasFunnelShift::Cuint = 1
+
+    # Sync
+    hasThreadFenceSystem::Cuint = 1
+    hasSyncThreadsExt::Cuint = 1
+
+    # Misc
+    hasSurfaceFuncs::Cuint = 1
+    has3dGrid::Cuint = 1
+    hasDynamicParallelism::Cuint = 1
+end
+
+function Base.show(io::IO, arch::hipDeviceArch_t)
+    print(io,
+    """
+    struct hipDeviceArch_t
+        hasGlobalInt32Atomics = $(arch.hasGlobalInt32Atomics)
+        hasGlobalFloatAtomicExch = $(arch.hasGlobalFloatAtomicExch)
+        hasSharedInt32Atomics = $(arch.hasSharedInt32Atomics)
+        hasSharedFloatAtomicExch = $(arch.hasSharedFloatAtomicExch)
+        hasFloatAtomicAdd = $(arch.hasFloatAtomicAdd)
+
+        hasGlobalInt64Atomics = $(arch.hasGlobalInt64Atomics)
+        hasSharedInt64Atomics = $(arch.hasSharedInt64Atomics)
+
+        # Doubles
+        hasDoubles = $(arch.hasDoubles)
+
+        # Warp cross-lane operations
+        hasWarpVote = $(arch.hasWarpVote)
+        hasWarpBallot = $(arch.hasWarpBallot)
+        hasWarpShuffle = $(arch.hasWarpShuffle)
+        hasFunnelShift = $(arch.hasFunnelShift)
+
+        # Sync
+        hasThreadFenceSystem = $(arch.hasThreadFenceSystem)
+        hasSyncThreadsExt = $(arch.hasSyncThreadsExt)
+
+        # Misc
+        hasSurfaceFuncs = $(arch.hasSurfaceFuncs)
+        has3dGrid = $(arch.has3dGrid)
+        hasDynamicParallelism = $(arch.hasDynamicParallelism)
+    end
+    """)
+end
+
+struct hipDeviceProp_t
+    name::NTuple{256, Cchar}
+    totalGlobalMem::Csize_t
+    sharedMemPerBlock::Csize_t
+    regsPerBlock::Cint
+    warpSize::Cint
+    maxThreadsPerBlock::Cint
+    maxThreadsDim::NTuple{3, Cint}
+    maxGridSize::NTuple{3, Cint}
+    clockRate::Cint
+    memoryClockRate::Cint
+    memoryBusWidth::Cint
+    totalConstMem::Csize_t
+    major::Cint
+    minor::Cint
+    multiProcessorCount::Cint
+    l2CacheSize::Cint
+    maxThreadsPerMultiProcessor::Cint
+    computeMode::Cint
+    clockInstructionRate::Cint
+    arch::hipDeviceArch_t
+    concurrentKernels::Cint
+    pciDomainID::Cint
+    pciBusID::Cint
+    pciDeviceID::Cint
+    maxSharedMemoryPerMultiProcessor::Csize_t
+    isMultiGpuBoard::Cint
+    canMapHostMemory::Cint
+    gcnArch::Cint
+    gcnArchName::NTuple{256, Cchar}
+    integrated::Cint
+    cooperativeLaunch::Cint
+    cooperativeMultiDeviceLaunch::Cint
+    maxTexture1DLinear::Cint
+    maxTexture1D::Cint
+    maxTexture2D::NTuple{2, Cint}
+    maxTexture3D::NTuple{3, Cint}
+    hdpMemFlushCntl::Ptr{Cuint}
+    hdpRegFlushCntl::Ptr{Cuint}
+    memPitch::Csize_t
+    textureAlignment::Csize_t
+    texturePitchAlignment::Csize_t
+    kernelExecTimeoutEnabled::Cint
+    ECCEnabled::Cint
+    tccDriver::Cint
+    cooperativeMultiDeviceUnmatchedFunc::Cint
+    cooperativeMultiDeviceUnmatchedGridDim::Cint
+    cooperativeMultiDeviceUnmatchedBlockDim::Cint
+    cooperativeMultiDeviceUnmatchedSharedMem::Cint
+    isLargeBar::Cint
+    asicRevision::Cint
+    managedMemory::Cint
+    directManagedMemAccessFromHost::Cint
+    concurrentManagedAccess::Cint
+    pageableMemoryAccess::Cint
+    pageableMemoryAccessUsesHostPageTables::Cint
+end
+
+function Base.show(io::IO, props::hipDeviceProp_t)
+    name_ptr = pointer([props.name...])
+    gcn_name = pointer([props.gcnArchName...])
+    print(io,
+    """
+    struct hipDeviceProp_t
+        name = $(unsafe_string(name_ptr))
+        totalGlobalMem = $(Base.format_bytes(props.totalGlobalMem))
+        sharedMemPerBlock = $(Base.format_bytes(props.sharedMemPerBlock))
+        regsPerBlock = $(props.regsPerBlock)
+        warpSize = $(props.warpSize)
+        maxThreadsPerBlock = $(props.maxThreadsPerBlock)
+        maxThreadsDim = $(props.maxThreadsDim)
+        maxGridSize = $(props.maxGridSize)
+        clockRate = $(props.clockRate)
+        memoryClockRate = $(props.memoryClockRate)
+        memoryBusWidth = $(props.memoryBusWidth)
+        totalConstMem = $(Base.format_bytes(props.totalConstMem))
+        major = $(props.major)
+        minor = $(props.minor)
+        multiProcessorCount = $(props.multiProcessorCount)
+        l2CacheSize = $(Base.format_bytes(props.l2CacheSize))
+        maxThreadsPerMultiProcessor = $(props.maxThreadsPerMultiProcessor)
+        computeMode = $(props.computeMode)
+        clockInstructionRate = $(props.clockInstructionRate)
+        arch = [printed separately below]
+        concurrentKernels = $(props.concurrentKernels)
+        pciBusID = $(props.pciBusID)
+        pciDeviceID = $(props.pciDeviceID)
+        maxSharedMemoryPerMultiProcessor = $(Base.format_bytes(props.maxSharedMemoryPerMultiProcessor))
+        isMultiGpuBoard = $(props.isMultiGpuBoard)
+        canMapHostMemory = $(props.canMapHostMemory)
+        gcnArch = $(props.gcnArch)
+        gcnArchName = $(unsafe_string(gcn_name))
+        ...
+    end
+    """)
+    println(io)
+    show(io, props.arch)
+end
+
+@cenum hipDeviceAttribute_t begin
+    hipDeviceAttributeCudaCompatibleBegin = 0
+
+    hipDeviceAttributeEccEnabled = 0
+    hipDeviceAttributeAccessPolicyMaxWindowSize
+    hipDeviceAttributeAsyncEngineCount
+    hipDeviceAttributeCanMapHostMemory
+    hipDeviceAttributeCanUseHostPointerForRegisteredMem
+
+    hipDeviceAttributeClockRate
+    hipDeviceAttributeComputeMode
+    hipDeviceAttributeComputePreemptionSupported
+    hipDeviceAttributeConcurrentKernels
+    hipDeviceAttributeConcurrentManagedAccess
+    hipDeviceAttributeCooperativeLaunch
+    hipDeviceAttributeCooperativeMultiDeviceLaunch
+    hipDeviceAttributeDeviceOverlap
+
+    hipDeviceAttributeDirectManagedMemAccessFromHost
+
+    hipDeviceAttributeGlobalL1CacheSupported
+    hipDeviceAttributeHostNativeAtomicSupported
+    hipDeviceAttributeIntegrated
+    hipDeviceAttributeIsMultiGpuBoard
+    hipDeviceAttributeKernelExecTimeout
+    hipDeviceAttributeL2CacheSize
+    hipDeviceAttributeLocalL1CacheSupported
+    hipDeviceAttributeLuid
+    hipDeviceAttributeLuidDeviceNodeMask
+    hipDeviceAttributeComputeCapabilityMajor
+    hipDeviceAttributeManagedMemory
+    hipDeviceAttributeMaxBlocksPerMultiProcessor
+    hipDeviceAttributeMaxBlockDimX
+    hipDeviceAttributeMaxBlockDimY
+    hipDeviceAttributeMaxBlockDimZ
+    hipDeviceAttributeMaxGridDimX
+    hipDeviceAttributeMaxGridDimY
+    hipDeviceAttributeMaxGridDimZ
+    hipDeviceAttributeMaxSurface1D
+    hipDeviceAttributeMaxSurface1DLayered
+    hipDeviceAttributeMaxSurface2D
+    hipDeviceAttributeMaxSurface2DLayered
+    hipDeviceAttributeMaxSurface3D
+    hipDeviceAttributeMaxSurfaceCubemap
+    hipDeviceAttributeMaxSurfaceCubemapLayered
+    hipDeviceAttributeMaxTexture1DWidth
+    hipDeviceAttributeMaxTexture1DLayered
+    hipDeviceAttributeMaxTexture1DLinear
+
+    hipDeviceAttributeMaxTexture1DMipmap
+    hipDeviceAttributeMaxTexture2DWidth
+    hipDeviceAttributeMaxTexture2DHeight
+    hipDeviceAttributeMaxTexture2DGather
+    hipDeviceAttributeMaxTexture2DLayered
+    hipDeviceAttributeMaxTexture2DLinear
+    hipDeviceAttributeMaxTexture2DMipmap
+    hipDeviceAttributeMaxTexture3DWidth
+    hipDeviceAttributeMaxTexture3DHeight
+    hipDeviceAttributeMaxTexture3DDepth
+    hipDeviceAttributeMaxTexture3DAlt
+    hipDeviceAttributeMaxTextureCubemap
+    hipDeviceAttributeMaxTextureCubemapLayered
+    hipDeviceAttributeMaxThreadsDim
+    hipDeviceAttributeMaxThreadsPerBlock
+    hipDeviceAttributeMaxThreadsPerMultiProcessor
+    hipDeviceAttributeMaxPitch
+    hipDeviceAttributeMemoryBusWidth
+    hipDeviceAttributeMemoryClockRate
+    hipDeviceAttributeComputeCapabilityMinor
+    hipDeviceAttributeMultiGpuBoardGroupID
+    hipDeviceAttributeMultiprocessorCount
+    hipDeviceAttributeName
+    hipDeviceAttributePageableMemoryAccess
+
+    hipDeviceAttributePageableMemoryAccessUsesHostPageTables
+    hipDeviceAttributePciBusId
+    hipDeviceAttributePciDeviceId
+    hipDeviceAttributePciDomainID
+    hipDeviceAttributePersistingL2CacheMaxSize
+    hipDeviceAttributeMaxRegistersPerBlock
+
+    hipDeviceAttributeMaxRegistersPerMultiprocessor
+    hipDeviceAttributeReservedSharedMemPerBlock
+    hipDeviceAttributeMaxSharedMemoryPerBlock
+    hipDeviceAttributeSharedMemPerBlockOptin
+    hipDeviceAttributeSharedMemPerMultiprocessor
+    hipDeviceAttributeSingleToDoublePrecisionPerfRatio
+    hipDeviceAttributeStreamPrioritiesSupported
+    hipDeviceAttributeSurfaceAlignment
+    hipDeviceAttributeTccDriver
+    hipDeviceAttributeTextureAlignment
+    hipDeviceAttributeTexturePitchAlignment
+    hipDeviceAttributeTotalConstantMemory
+    hipDeviceAttributeTotalGlobalMem
+    hipDeviceAttributeUnifiedAddressing
+    hipDeviceAttributeUuid
+    hipDeviceAttributeWarpSize
+    hipDeviceAttributeMemoryPoolsSupported
+    hipDeviceAttributeVirtualMemoryManagementSupported
+
+    hipDeviceAttributeCudaCompatibleEnd = 9999
+    hipDeviceAttributeAmdSpecificBegin = 10000
+
+    hipDeviceAttributeClockInstructionRate = 10000
+    hipDeviceAttributeArch
+    hipDeviceAttributeMaxSharedMemoryPerMultiprocessor
+    hipDeviceAttributeGcnArch
+    hipDeviceAttributeGcnArchName
+    hipDeviceAttributeHdpMemFlushCntl
+    hipDeviceAttributeHdpRegFlushCntl
+    hipDeviceAttributeCooperativeMultiDeviceUnmatchedFunc
+
+    hipDeviceAttributeCooperativeMultiDeviceUnmatchedGridDim
+
+    hipDeviceAttributeCooperativeMultiDeviceUnmatchedBlockDim
+
+    hipDeviceAttributeCooperativeMultiDeviceUnmatchedSharedMem
+
+    hipDeviceAttributeIsLargeBar
+    hipDeviceAttributeAsicRevision
+    hipDeviceAttributeCanUseStreamWaitValue
+
+    hipDeviceAttributeImageSupport
+    hipDeviceAttributePhysicalMultiProcessorCount
+
+    hipDeviceAttributeFineGrainSupport
+    hipDeviceAttributeWallClockRate
+
+    hipDeviceAttributeAmdSpecificEnd = 19999
+    hipDeviceAttributeVendorSpecificBegin = 20000
+end
+
+const hipHostAllocDefault = 0x00
+const hipHostAllocPortable = 0x01
+const hipHostAllocMapped = 0x02
+
+const hipHostRegisterDefault = 0x0
+const hipHostRegisterPortable = 0x1
+const hipHostRegisterMapped = 0x2
+
diff --git a/src/hip/module.jl b/src/hip/module.jl
new file mode 100644
index 000000000..882c270ad
--- /dev/null
+++ b/src/hip/module.jl
@@ -0,0 +1,46 @@
+mutable struct HIPModule
+    handle::hipModule_t
+    dev::HIPDevice
+
+    function HIPModule(data)
+        dev = device()
+        device_synchronize()
+
+        # TODO use alloc_retry?
+        mod_ref = Ref{hipModule_t}()
+        hipModuleLoadData(mod_ref, data) |> check
+        mod = new(mod_ref[], dev)
+
+        finalizer(mod) do mod
+            hipModuleUnload(mod) |> check
+        end
+        mod
+    end
+end
+
+Base.unsafe_convert(::Type{hipModule_t}, mod::HIPModule) = mod.handle
+
+struct HIPFunction
+    handle::hipFunction_t
+    mod::HIPModule
+    global_hostcalls::Vector{Symbol}
+
+    function HIPFunction(
+        mod::HIPModule, name::String, global_hostcalls::Vector{Symbol},
+    )
+        fun_ref = Ref{hipFunction_t}()
+        hipModuleGetFunction(fun_ref, mod, name) |> check
+        new(fun_ref[], mod, global_hostcalls)
+    end
+end
+
+Base.unsafe_convert(::Type{hipFunction_t}, fun::HIPFunction) = fun.handle
+
+function launch_configuration(
+    fun::HIPFunction; shmem::Integer = 0, max_block_size::Integer = 0,
+)
+    grid_size_ref, block_size_ref = Ref{Cint}(), Ref{Cint}()
+    hipModuleOccupancyMaxPotentialBlockSize(
+        grid_size_ref, block_size_ref, fun, shmem, max_block_size) |> check
+    return (; gridsize=grid_size_ref[], groupsize=block_size_ref[])
+end
diff --git a/src/hip/pool.jl b/src/hip/pool.jl
new file mode 100644
index 000000000..f27f81636
--- /dev/null
+++ b/src/hip/pool.jl
@@ -0,0 +1,62 @@
+mutable struct HIPMemoryPool
+    handle::hipMemPool_t
+
+    function HIPMemoryPool(dev::HIPDevice;
+        alloc_type::hipMemAllocationType = hipMemAllocationTypePinned,
+        handle_type::hipMemAllocationHandleType = hipMemHandleTypeNone,
+    )
+        location = hipMemLocation(hipMemLocationTypeDevice, device_id(dev))
+        props = Ref(hipMemPoolProps(
+            alloc_type, handle_type, location,
+            C_NULL, ntuple(i->Cuchar(0), 64)))
+
+        handle_ref = Ref{hipMemPool_t}()
+        hipMemPoolCreate(handle_ref, props) |> check
+        pool = new(handle_ref[])
+
+        finalizer(pool) do pool
+            hipMemPoolDestroy(pool) |> check
+        end
+        return pool
+    end
+
+    global function default_memory_pool(dev::HIPDevice)
+        handle_ref = Ref{hipMemPool_t}()
+        hipDeviceGetDefaultMemPool(handle_ref, device_id(dev)) |> check
+        new(handle_ref[])
+    end
+
+    global function memory_pool(dev::HIPDevice)
+        handle_ref = Ref{hipMemPool_t}()
+        hipDeviceGetMemPool(handle_ref, device_id(dev)) |> check
+        new(handle_ref[])
+    end
+end
+
+Base.unsafe_convert(::Type{hipMemPool_t}, pool::HIPMemoryPool) = pool.handle
+
+Base.:(==)(a::HIPMemoryPool, b::HIPMemoryPool) = a.handle == b.handle
+
+Base.hash(pool::HIPMemoryPool, h::UInt) = hash(pool.handle, h)
+
+function attribute(X::Type, pool::HIPMemoryPool, attr::hipMemPoolAttr)
+    value = Ref{X}()
+    hipMemPoolGetAttribute(pool, attr, value) |> check
+    value[]
+end
+
+function attribute!(pool::HIPMemoryPool, attr::hipMemPoolAttr, value)
+    hipMemPoolSetAttribute(pool, attr, Ref(value)) |> check
+end
+
+function trim(pool::HIPMemoryPool, bytes_to_keep::Integer = 0)
+    hipMemPoolTrimTo(pool, bytes_to_keep) |> check
+end
+
+function memory_pool!(dev::HIPDevice, pool::HIPMemoryPool)
+    hipDeviceSetMemPool(device_id(dev), pool) |> check
+end
+
+used_memory(pool::HIPMemoryPool) = attribute(UInt64, pool, hipMemPoolAttrUsedMemCurrent)
+
+reserved_memory(pool::HIPMemoryPool) = attribute(UInt64, pool, hipMemPoolAttrReservedMemCurrent)
diff --git a/src/hip/stream.jl b/src/hip/stream.jl
new file mode 100644
index 000000000..48736914a
--- /dev/null
+++ b/src/hip/stream.jl
@@ -0,0 +1,119 @@
+mutable struct HIPStream
+    stream::hipStream_t
+    priority::Symbol
+    device::HIPDevice
+end
+
+"""
+    HIPStream(priority::Symbol = :normal)
+
+# Arguments:
+
+- `priority::Symbol`: Priority of the stream: `:normal`, `:high` or `:low`.
+
+Create HIPStream with given priority.
+Device is the default device that's currently in use.
+"""
+function HIPStream(priority::Symbol = :normal)
+    priority_int = symbol_to_priority(priority)
+
+    stream_ref = Ref{hipStream_t}()
+    hipStreamCreateWithPriority(stream_ref, Cuint(0), priority_int) |> check
+    stream = HIPStream(stream_ref[], priority, device())
+    finalizer(stream) do s
+        hipStreamDestroy(s.stream) |> check
+    end
+    return stream
+end
+
+default_stream() = HIPStream(convert(hipStream_t, C_NULL), :normal, device())
+
+"""
+    HIPStream(stream::hipStream_t)
+
+Create HIPStream from `hipStream_t` handle.
+Device is the default device that's currently in use.
+"""
+HIPStream(stream::hipStream_t) = HIPStream(stream, priority(stream), device())
+
+function isdone(stream::HIPStream)
+    query = hipStreamQuery(stream)
+    if query == hipSuccess
+        return true
+    elseif query == hipErrorNotReady
+        return false
+    else
+        throw(HIPError(query))
+    end
+end
+
+function _low_latency_synchronize(stream::HIPStream)
+    isdone(stream) && return true
+
+    # spin (initially without yielding to minimize latency)
+    spins = 0
+    while spins < 256
+        if spins < 32
+            ccall(:jl_cpu_pause, Cvoid, ())
+            # Temporary solution before we have gc transition support in codegen.
+            ccall(:jl_gc_safepoint, Cvoid, ())
+        else
+            yield()
+        end
+        isdone(stream) && return true
+        spins += 1
+    end
+    return false
+end
+
+function non_blocking_synchronize(stream::HIPStream)
+    while true
+        yield()
+        isdone(stream) && return true
+    end
+    return false
+end
+
+wait(stream::HIPStream) = hipStreamSynchronize(stream) |> check
+
+function synchronize(stream::HIPStream; blocking::Bool = true)
+    if blocking
+        _low_latency_synchronize(stream) || wait(stream)
+    else
+        non_blocking_synchronize(stream)
+    end
+    return
+end
+
+Base.unsafe_convert(::Type{Ptr{T}}, stream::HIPStream) where T =
+    reinterpret(Ptr{T}, stream.stream)
+
+function Base.show(io::IO, stream::HIPStream)
+    print(io, "HIPStream(device=$(stream.device), ptr=$(repr(UInt64(stream.stream))), priority=$(stream.priority))")
+end
+
+function priority_to_symbol(priority)
+    priority ==  0 && return :normal
+    priority == -1 && return :high
+    priority ==  1 && return :low
+    throw(ArgumentError("""
+    Invalid HIP priority: $priority.
+    Valid values are: 0, -1, 1.
+    """))
+end
+
+function symbol_to_priority(priority::Symbol)
+    priority == :normal && return Cint(0)
+    priority == :high && return Cint(-1)
+    priority == :low && return Cint(1)
+    throw(ArgumentError("""
+    Invalid HIP priority symbol: $priority.
+    Valid values are: `:normal`, `:low`, `:high`.
+    """))
+end
+
+function priority(stream::hipStream_t)
+    priority = Ref{Cint}()
+    hipStreamGetPriority(stream, priority) |> check
+    priority_to_symbol(priority[])
+end
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
index e3e3de6b0..4056f2bb6 100644
--- a/src/mapreduce.jl
+++ b/src/mapreduce.jl
@@ -5,26 +5,28 @@
 # - group-stride loop to delay need for second kernel launch
 
 # Reduce a value across a group, using local memory for communication
-@inline function reduce_group(op, val::T, neutral, ::Val{maxitems}) where {T, maxitems}
-    items = workgroupDim().x
-    item = workitemIdx().x
+@inline function reduce_group(op, val::T, neutral) where T
+    items::UInt32 = workgroupDim().x
+    item::UInt32 = workitemIdx().x
 
-    # shared mem for a complete reduction
-    shared = ROCDeviceArray((2*maxitems,), Device.alloc_special(Val(:reduce_block), T, Val(AS.Local), Val(2*maxitems)))
+    # Shared mem for a complete reduction.
+    shared = @ROCDynamicLocalArray(T, items, false)
     @inbounds shared[item] = val
 
-    # perform a reduction
-    d = items>>1
-    while d > 0
+    # Perform a reduction.
+    d::UInt32 = UInt32(1)
+    while d < items
         sync_workgroup()
-        if item <= d
-            shared[item] = op(shared[item], shared[item+d])
+        index::UInt32 = UInt32(2) * d * (item - UInt32(1)) + UInt32(1)
+        @inbounds if index ≤ items
+            other_val = ifelse(index + d ≤ items, shared[index + d], neutral)
+            shared[index] = op(shared[index], other_val)
         end
-        d >>= 1
+        d *= UInt32(2)
     end
 
-    # load the final value on the first item
-    if item == 1
+    # Load the final value on the first item.
+    if item == UInt32(1)
         val = @inbounds shared[item]
     end
 
@@ -38,43 +40,42 @@ Base.@propagate_inbounds _map_getindex(args::Tuple{}, I) = ()
 # Reduce an array across the grid. All elements to be processed can be addressed by the
 # product of the two iterators `Rreduce` and `Rother`, where the latter iterator will have
 # singleton entries for the dimensions that should be reduced (and vice versa).
-function partial_mapreduce_device(f, op, neutral, maxitems, Rreduce, Rother, R, As...)
+function partial_mapreduce_device(f, op, neutral, Rreduce, Rother, R, As...)
     # decompose the 1D hardware indices into separate ones for reduction (across items
     # and possibly groups if it doesn't fit) and other elements (remaining groups)
     localIdx_reduce = workitemIdx().x
     localDim_reduce = workgroupDim().x
-    groupIdx_reduce, groupIdx_other = fldmod1(workgroupIdx().x, length(Rother))
-    groupDim_reduce = gridGroupDim().x ÷ length(Rother)
+
+    n_elements_other::UInt32 = length(Rother)
+    groupIdx_reduce, groupIdx_other = fldmod1(workgroupIdx().x, n_elements_other)
+    groupDim_reduce = gridGroupDim().x ÷ n_elements_other
 
     # group-based indexing into the values outside of the reduction dimension
     # (that means we can safely synchronize items within this group)
     iother = groupIdx_other
-    @inbounds if iother <= length(Rother)
+    @inbounds if iother ≤ length(Rother)
         Iother = Rother[iother]
 
         # load the neutral value
         Iout = CartesianIndex(Tuple(Iother)..., groupIdx_reduce)
-        neutral = if neutral === nothing
-            R[Iout]
-        else
-            neutral
-        end
+        neutral = ifelse(neutral ≡ nothing, R[Iout], neutral)
 
         val = op(neutral, neutral)
 
         # reduce serially across chunks of input vector that don't fit in a group
-        ireduce = localIdx_reduce + (groupIdx_reduce - 1) * localDim_reduce
-        while ireduce <= length(Rreduce)
+        ireduce = localIdx_reduce + (groupIdx_reduce - UInt32(1)) * localDim_reduce
+        n_elements_reduce::UInt32 = length(Rreduce)
+        while ireduce ≤ n_elements_reduce
             Ireduce = Rreduce[ireduce]
             J = Base.max(Iother, Ireduce)
             val = op(val, f(_map_getindex(As, J)...))
             ireduce += localDim_reduce * groupDim_reduce
         end
 
-        val = reduce_group(op, val, neutral, maxitems)
+        val = reduce_group(op, val, neutral)
 
         # write back to memory
-        if localIdx_reduce == 1
+        if localIdx_reduce == UInt32(1)
             R[Iout] = val
         end
     end
@@ -84,9 +85,10 @@ end
 
 ## COV_EXCL_STOP
 
-function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyROCArray{T},
-                                 A::Union{AbstractArray,Broadcast.Broadcasted};
-                                 init=nothing) where {F, OP, T}
+function GPUArrays.mapreducedim!(
+    f::F, op::OP, R::AnyROCArray{T},
+    A::Union{AbstractArray,Broadcast.Broadcasted}; init=nothing,
+) where {F, OP, T}
     Base.check_reducedims(R, A)
     length(A) == 0 && return R # isempty(::Broadcasted) iterates
 
@@ -128,30 +130,14 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyROCArray{T},
     # we might not be able to launch all those items to reduce each slice in one go.
     # that's why each items also loops across their inputs, processing multiple values
     # so that we can span the entire reduction dimension using a single item group.
-
-    # group size is restricted by local memory
-    device = AMDGPU.device(R)
-    pools = filter(p -> Runtime.pool_segment(p) == HSA.AMD_SEGMENT_GROUP,
-        Runtime.memory_pools(device))
-    max_items = if !isempty(pools)
-        pool = first(pools)
-        max_lmem_elements = Runtime.pool_size(pool) ÷ sizeof(T)
-        isa = first(Runtime.isas(device))
-        Base.min(Runtime.isa_workgroup_max_size(isa), compute_items(max_lmem_elements ÷ 2))
-    else
-        @warn "No group segment detected for device $device; assuming 64 elements\nThis message will not be shown again" maxlog=1
-        64
-    end
-    # TODO: dynamic local memory to avoid two compilations
-
-    #= TODO: let the runtime suggest a group size
-    args = (f, op, init, Val(max_items), Rreduce, Rother, R′, A)
-    kernel_args = rocconvert.(args)
-    kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
-    kernel = rocfunction(partial_mapreduce_device, kernel_tt)
-    reduce_items = compute_items(suggest_groupsize(kernel.fun, wanted_items).x)
-    =#
-    reduce_items = max_items
+    max_block_size = 256
+    compute_shmem(items) = items * sizeof(T)
+    max_shmem = max_block_size |> compute_items |> compute_shmem
+    kernel = @roc launch=false partial_mapreduce_device(
+        f, op, init, Rreduce, Rother, R′, A)
+    kernel_config = launch_configuration(kernel; shmem=max_shmem, max_block_size)
+    reduce_items = compute_items(kernel_config.groupsize)
+    reduce_shmem = compute_shmem(reduce_items)
 
     # how many groups should we launch?
     #
@@ -162,15 +148,14 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyROCArray{T},
     reduce_groups = cld(length(Rreduce), reduce_items)
 
     # determine the launch configuration
-    items = reduce_items
-    groups = reduce_groups*other_groups
-    gridsize = items*groups
+    blocks = reduce_items
+    grid = reduce_groups * other_groups
 
     # perform the actual reduction
     if reduce_groups == 1
         # we can cover the dimensions to reduce using a single group
-        wait(@roc gridsize=gridsize groupsize=items partial_mapreduce_device(
-            f, op, init, Val(items), Rreduce, Rother, R′, A))
+        @roc gridsize=grid groupsize=blocks shmem=reduce_shmem partial_mapreduce_device(
+            f, op, init, Rreduce, Rother, R′, A)
     else
         # we need multiple steps to cover all values to reduce
         partial = similar(R, (size(R)..., reduce_groups))
@@ -178,8 +163,8 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::AnyROCArray{T},
             # without an explicit initializer we need to copy from the output container
             partial .= R
         end
-        wait(@roc gridsize=gridsize groupsize=items partial_mapreduce_device(
-            f, op, init, Val(items), Rreduce, Rother, partial, A))
+        @roc gridsize=grid groupsize=blocks shmem=reduce_shmem partial_mapreduce_device(
+            f, op, init, Rreduce, Rother, partial, A)
 
         GPUArrays.mapreducedim!(identity, op, R′, partial; init=init)
     end
diff --git a/src/rand/random.jl b/src/rand/random.jl
index c27f48b73..fe75f56e2 100644
--- a/src/rand/random.jl
+++ b/src/rand/random.jl
@@ -1,10 +1,5 @@
 # interfacing with Random standard library
 
-using Random
-
-using GPUArrays
-
-
 mutable struct RNG <: Random.AbstractRNG
     handle::rocrand_generator
     typ::rocrand_rng_type
@@ -24,7 +19,6 @@ end
 
 Base.unsafe_convert(::Type{rocrand_generator}, rng::RNG) = rng.handle
 
-
 ## seeding
 function Random.seed!(rng::RNG, seed=Base.rand(UInt64), offset=0)
     rocrand_set_seed(rng, seed)
@@ -46,9 +40,7 @@ for (f,T) in ((:rocrand_generate, :UInt32), (:rocrand_generate_char,:Cuchar),
               (:rocrand_generate_uniform_double, :Float64), (:rocrand_generate_uniform_half, :Float16))
     @eval begin
         function Random.rand!(rng::RNG, A::ROCArray{$(T)})
-            wait!(A)
             $(f)(rng, A, length(A))
-            mark!(A, C_NULL)
             return A
         end
     end
@@ -56,16 +48,13 @@ end
 
 # some functions need pow2 lengths: use a padded array and copy back to the original one
 function inplace_pow2(A, f)
-    wait!(A)
     len = length(A)
     if len > 1 && ispow2(len)
         f(A)
-        mark!(A, C_NULL)
     else
         padlen = max(2, nextpow(2, len))
         B = similar(A, padlen)
         f(B)
-        mark!(B, C_NULL)
         copyto!(A, 1, B, 1, len)
         AMDGPU.unsafe_free!(B)
     end
@@ -156,24 +145,24 @@ rand_poisson(rng::RNG, T::PoissonType, dim1::Integer, dims::Integer...; kwargs..
     rand_poisson(rng, T, Dims((dim1, dims...)); kwargs...)
 
 # rand_logn! and rand_poisson! without specified rng
-rand_logn!(A::rocRAND.LognormalArray; kwargs...) = rand_logn!(default_rng(), A; kwargs...)
-rand_poisson!(A::rocRAND.PoissonArray; kwargs...) = rand_poisson!(default_rng(), A; kwargs...)
+rand_logn!(A::rocRAND.LognormalArray; kwargs...) = rand_logn!(handle(), A; kwargs...)
+rand_poisson!(A::rocRAND.PoissonArray; kwargs...) = rand_poisson!(handle(), A; kwargs...)
 
-rand_logn(T::rocRAND.LognormalType, dims::Dims; kwargs...) = rand_logn(default_rng(), T, dims; kwargs...)
-rand_poisson(T::rocRAND.PoissonType, dims::Dims; kwargs...) = rand_poisson(default_rng(), T, dims; kwargs...)
+rand_logn(T::rocRAND.LognormalType, dims::Dims; kwargs...) = rand_logn(handle(), T, dims; kwargs...)
+rand_poisson(T::rocRAND.PoissonType, dims::Dims; kwargs...) = rand_poisson(handle(), T, dims; kwargs...)
 
 rand_logn(T::rocRAND.LognormalType, dim1::Integer, dims::Integer...; kwargs...) =
-    rand_logn(default_rng(), T, Dims((dim1, dims...)); kwargs...)
+    rand_logn(handle(), T, Dims((dim1, dims...)); kwargs...)
 rand_poisson(T::rocRAND.PoissonType, dim1::Integer, dims::Integer...; kwargs...) =
-    rand_poisson(default_rng(), T, Dims((dim1, dims...)); kwargs...)
+    rand_poisson(handle(), T, Dims((dim1, dims...)); kwargs...)
 rand_logn(T::Type, dim1::Integer, dims::Integer...; kwargs...) =
     rand_logn!(ROCArray{T}(undef, dim1, dims...); kwargs...)
 rand_poisson(T::Type, dim1::Integer, dims::Integer...; kwargs...) =
     rand_poisson!(ROCArray{T}(undef, dim1, dims...); kwargs...)
 rand_logn(dim1::Integer, dims::Integer...; kwargs...) =
-    rand_logn(default_rng(), Dims((dim1, dims...)); kwargs...)
+    rand_logn(handle(), Dims((dim1, dims...)); kwargs...)
 rand_poisson(dim1::Integer, dims::Integer...; kwargs...) =
-    rand_poisson(default_rng(), Dims((dim1, dims...)); kwargs...)
+    rand_poisson(handle(), Dims((dim1, dims...)); kwargs...)
 rand_logn(T::Type, dims::Dims; kwargs...) = rand_logn!(ROCArray{T}(undef, dims...); kwargs...)
 rand_poisson(T::Type, dims::Dims; kwargs...) = rand_poisson!(ROCArray{T}(undef, dims...); kwargs...)
 rand_logn!(A::ROCArray; kwargs...) =
diff --git a/src/rand/rocRAND.jl b/src/rand/rocRAND.jl
index 48e54c878..8c4e2f184 100644
--- a/src/rand/rocRAND.jl
+++ b/src/rand/rocRAND.jl
@@ -1,12 +1,13 @@
 module rocRAND
 
 import ..AMDGPU
-import .AMDGPU: ROCArray, HandleCache, librocrand, mark!, wait!
-import ..HSA
+import .AMDGPU: ROCArray, HandleCache, librocrand, library_state
 import ..HIP
 import .HIP: HIPContext, HIPStream, hipStream_t
 
 using CEnum
+using GPUArrays
+using Random
 
 export rand_logn!, rand_poisson!, rand_logn, rand_poisson
 
@@ -21,43 +22,17 @@ end
 # stdlib Random integration
 include("random.jl")
 
-# Copied from CUDA.jl/lib/curand/CURAND.jl
+const IDLE_RNGS = HandleCache{HIPContext, RNG}()
 
-# cache for created, but unused handles
-const idle_rngs = HandleCache{HIPContext,RNG}()
+lib_state() = library_state(
+    :rocRAND, RNG, IDLE_RNGS,
+    () -> RNG(), r -> return, # RNG destroys itself in finalizer.
+    (nh, s) -> begin
+        Random.seed!(nh)
+        rocrand_set_stream(nh.handle, s)
+    end)
 
-function default_rng()
-    tls = AMDGPU.task_local_state()
-
-    # every task maintains library state per device
-    LibraryState = @NamedTuple{rng::RNG}
-    states = get!(task_local_storage(), :rocRAND) do
-        Dict{HIPContext,LibraryState}()
-    end::Dict{HIPContext,LibraryState}
-
-    # get library state
-    @noinline function new_state(tls)
-        new_rng = pop!(idle_rngs, tls.context) do
-            RNG()
-        end
-
-        finalizer(current_task()) do task
-            push!(idle_rngs, tls.context, new_rng) do
-                # no need to do anything, as the RNG is collected by its finalizer
-            end
-        end
-
-        Random.seed!(new_rng)
-
-        rocrand_set_stream(new_rng.handle, tls.stream)
-
-        (; rng=new_rng)
-    end
-    state = get!(states, tls.context) do
-        new_state(tls)
-    end
-
-    return state.rng
-end
+handle() = lib_state().handle
+stream() = lib_state().stream
 
 end
diff --git a/src/random.jl b/src/random.jl
index f74971611..23b392683 100644
--- a/src/random.jl
+++ b/src/random.jl
@@ -6,8 +6,8 @@ const GPUARRAY_RNG = Ref{Union{Nothing,GPUArrays.RNG}}(nothing)
 
 function GPUArrays.default_rng(::Type{<:ROCArray})
     if GPUARRAY_RNG[] == nothing
-        device = AMDGPU.default_device()
-        N = Int(Runtime.device_workgroup_max_size(device))
+        device = AMDGPU.device()
+        N = HIP.properties(device).maxThreadsPerBlock
         state = ROCArray{NTuple{4, UInt32}}(undef, N)
         GPUARRAY_RNG[] = GPUArrays.RNG(state)
         Random.seed!(GPUARRAY_RNG[])
@@ -16,7 +16,7 @@ function GPUArrays.default_rng(::Type{<:ROCArray})
 end
 
 gpuarrays_rng() = GPUArrays.default_rng(ROCArray)
-const rocrand_rng = librocrand !== nothing ? rocRAND.default_rng : gpuarrays_rng
+const rocrand_rng = librocrand !== nothing ? rocRAND.handle : gpuarrays_rng
 
 # the interface is split in two levels:
 # - functions that extend the Random standard library, and take an RNG as first argument,
diff --git a/src/reflection.jl b/src/reflection.jl
index b02a93fe2..f1dc93db3 100644
--- a/src/reflection.jl
+++ b/src/reflection.jl
@@ -12,11 +12,10 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native)
     @eval begin
         function $method(
             io::IO, @nospecialize(func), @nospecialize(types);
-            kernel::Bool=false, device=default_device(), kwargs...,
+            kernel::Bool=false, device=HIP.device(), kwargs...,
         )
             source = methodinstance(typeof(func), Base.to_tuple_type(types))
-            config = Compiler.compiler_config(
-                device; kernel, global_hooks=NamedTuple())
+            config = Compiler.compiler_config(device; kernel)
             job = CompilerJob(source, config)
             GPUCompiler.$method($(args...); kwargs...)
         end
diff --git a/src/rocm_discovery.jl b/src/rocm_discovery.jl
new file mode 100644
index 000000000..1e506b738
--- /dev/null
+++ b/src/rocm_discovery.jl
@@ -0,0 +1,223 @@
+const rocm_ext_libs = [
+    (:rocblas, :rocBLAS_jll),
+    (:rocsparse, :rocSPARSE_jll),
+    (:rocsolver, nothing),
+    (:rocalution, nothing),
+    (:rocrand, :rocRAND_jll),
+    (:rocfft, nothing),
+    (:MIOpen, :MIOpen_jll)]
+
+function enable_artifacts!(flag::Bool = true; show_message::Bool = true)
+    @set_preferences!("use_artifacts" => flag)
+    if show_message
+        @info """
+        Switched `use_artifacts` to `$flag`.
+        Restart Julia session for the changes to take effect.
+        """
+    end
+end
+
+# TODO need updated ROCm artifacts to enable them again (5.4+).
+use_artifacts()::Bool = @load_preference("use_artifacts", false)
+
+if haskey(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS")
+    disable_artifacts = parse(Bool, get(ENV, "JULIA_AMDGPU_DISABLE_ARTIFACTS", "false"))
+    enable_artifacts!(!disable_artifacts; show_message=false)
+end
+
+function find_artifact_library!(config, pkg; name::Symbol, lib::Symbol)
+    if pkg !== nothing
+        loaded, available, error_str = safe_import(pkg)
+        if loaded
+            if available
+                config[Symbol(:lib, name)] = getfield(@eval($pkg), lib)
+                config[Symbol(name, :_configured)] = true
+            else
+                config[Symbol(name, :_build_reason)] = "`$pkg` is not available on this platform."
+            end
+        else
+            iob = IOBuffer()
+            println(iob, "`import $pkg` failed:")
+            print(iob, error_str)
+            config[Symbol(name, :_build_reason)] = String(take!(iob))
+        end
+    end
+end
+
+function find_system_library!(
+    config, name::Symbol; lib, dirs = find_roc_paths(), ext = dlext,
+)
+    lib_path = find_rocm_library(lib, dirs, ext)
+    if !isempty(something(lib_path, ""))
+        loaded, error_str = safe_exec("using Libdl; dlopen(\"$lib_path\")")
+        if loaded
+            config[Symbol(:lib, name)] = lib_path
+            config[Symbol(name, :_configured)] = true
+        else
+            iob = IOBuffer()
+            println(iob, "Loading `$lib_path` failed:")
+            print(iob, error_str)
+            config[Symbol(name, :_build_reason)] = String(take!(iob))
+        end
+    else
+        config[Symbol(name, :_build_reason)] = "Could not find `$lib` libraries."
+    end
+end
+
+function find_hsa_runtime!(config)
+    if use_artifacts()
+        loaded, available, error_str = safe_import(:hsa_rocr_jll)
+        if loaded
+            if available
+                config[:libhsaruntime_path] = hsa_rocr_jll.libhsa_runtime64
+                config[:hsa_configured] = true
+            else
+                config[:hsa_build_reason] = "hsa_rocr_jll is not available on this platform."
+            end
+        else
+            iob = IOBuffer()
+            println(iob, "`import hsa_rocr_jll` failed:")
+            print(iob, error_str)
+            config[:hsa_build_reason] = String(take!(iob))
+        end
+    else
+        roc_dirs = find_roc_paths()
+        libhsaruntime_path = find_rocm_library("libhsa-runtime64", roc_dirs, "so.1")
+        if !isempty(something(libhsaruntime_path, ""))
+            loaded, error_str = safe_exec("using Libdl; dlopen(\"$libhsaruntime_path\")")
+            if loaded
+                config[:libhsaruntime_path] = libhsaruntime_path
+                config[:hsa_configured] = true
+            else
+                iob = IOBuffer()
+                println(iob, "Loading `libhsa-runtime64` v1 failed:")
+                print(iob, error_str)
+                config[:hsa_build_reason] = String(take!(iob))
+            end
+        else
+            config[:hsa_build_reason] = "Could not find `libhsa-runtime64` v1 library"
+        end
+    end
+end
+
+function find_ld_lld!(config)
+    if use_artifacts()
+        loaded, available, error_str = safe_import(:LLD_jll)
+        if loaded
+            if available
+                if isdefined(LLD_jll, :lld_path)
+                    config[:lld_path] = LLD_jll.lld_path
+                    config[:lld_artifact] = true
+                    config[:lld_configured] = true
+                else
+                    config[:lld_build_reason] = "LLD_jll does not export `lld_path`"
+                end
+            else
+                config[:lld_build_reason] = "LLD_jll is not available on this platform"
+            end
+        else
+            iob = IOBuffer()
+            println(iob, "`import LLD_jll` failed:")
+            print(iob, error_str)
+            config[:lld_build_reason] = String(take!(iob))
+        end
+    else
+        lld_path = find_ld_lld()
+        if !isempty(something(lld_path, ""))
+            # TODO: Validate ld.lld can compile programs
+            config[:lld_path] = lld_path
+            config[:lld_configured] = true
+        else
+            config[:lld_build_reason] = "Could not find `ld.lld` executable"
+        end
+    end
+end
+
+function find_device_libs!(config)
+    if use_artifacts()
+        find_artifact_library!(
+            config, :ROCmDeviceLibs_jll;
+            name=:device_libs, lib=:bitcode_path)
+    else
+        device_libs_path = find_device_libs()
+        if !isempty(something(device_libs_path, ""))
+            # TODO: Validate bitcode files
+            config[:libdevice_libs] = device_libs_path
+            config[:device_libs_configured] = true
+        else
+            config[:device_libs_build_reason] = "Couldn't find bitcode files"
+        end
+    end
+end
+
+function find_hip!(config)
+    if use_artifacts()
+        find_artifact_library!(config, :HIP_jll; name=:hip, lib=:libamdhip64)
+    else
+        find_system_library!(config, :hip; lib=["libamdhip64", "libhip_hcc"])
+    end
+end
+
+function find_hip_based_libs!(config, rocm_ext_libs)
+    @sync for (name, pkg) in rocm_ext_libs
+        @async begin
+            lib = Symbol(:lib, string(name))
+            if use_artifacts()
+                find_artifact_library!(config, pkg; name, lib)
+            else
+                find_system_library!(config, name; lib=string(lib))
+            end
+        end
+    end
+end
+
+function bindeps_setup()
+    config = Dict{Symbol, Any}(
+        :configured => false,
+        :build_reason => "unknown",
+
+        :lld_configured => false,
+        :lld_build_reason => "unknown",
+        :lld_artifact => false,
+
+        :hsa_configured => false,
+        :hsa_build_reason => "unknown",
+
+        :hip_configured => false,
+        :hip_build_reason => "unknown",
+
+        :device_libs_configured => false,
+        :device_libs_build_reason => "unknown")
+
+    for (name, _) in rocm_ext_libs
+        lib = Symbol(:lib, string(name))
+        config[lib] = nothing
+        config[Symbol(name, :_configured)] = false
+        config[Symbol(name, :_build_reason)] = "unknown"
+    end
+
+    if !Sys.islinux()
+        @warn "AMDGPU.jl only supports Linux OS."
+        config[:build_reason] = "Unsupported OS: $(repr(Sys.KERNEL))"
+        return config
+    end
+    if !ispath("/dev/kfd")
+        @warn "/dev/kfd is not available. Cannot use ROCm Runtime."
+    end
+
+    find_hsa_runtime!(config)
+    config[:hsa_configured] || return config
+
+    find_ld_lld!(config)
+    config[:lld_configured] || return config
+
+    find_device_libs!(config)
+    config[:device_libs_configured] || return config
+
+    find_hip!(config)
+    if config[:hip_configured]
+        find_hip_based_libs!(config, rocm_ext_libs)
+    end
+    config[:configured] = true
+    return config
+end
diff --git a/src/runtime/device.jl b/src/runtime/device.jl
index f63a437c0..ae149522b 100644
--- a/src/runtime/device.jl
+++ b/src/runtime/device.jl
@@ -1,53 +1,17 @@
-# Utilities for working with HSA devices
+const DEFAULT_DEVICE = Ref{HIPDevice}()
+const ALL_DEVICES = Vector{HIPDevice}()
+const HSA_DEVICES = Vector{ROCDevice}()
 
-mutable struct ROCDevice
-    agent::HSA.Agent
-
-    # Cached information
-    type::HSA.DeviceType
-    name::String
-    productname::String
-    uuid::String
-
-    function ROCDevice(handle::HSA.Agent)
-        device = new(handle)
-        device.type = device_type(device)
-        device.name = name(device)
-        device.productname = product_name(device)
-        device.uuid = uuid(device)
-
-        return device
-    end
-end
-ROCDevice() = AMDGPU.device()
-get_handle(device::ROCDevice) = device.agent.handle
-
-Base.:(==)(device1::ROCDevice, device2::ROCDevice) =
-    device1.agent == device2.agent
-
-const DEFAULT_DEVICE = Ref{ROCDevice}()
-const ALL_DEVICES = Vector{ROCDevice}()
-const DEVICES = Dict{UInt64, ROCDevice}() # Map from device handles to ROCDevice structs
-
-### @cfunction callbacks ###
+function fetch_devices()
+    isempty(ALL_DEVICES) || return copy(ALL_DEVICES)
 
-function agent_iterate_isas_cb(isa::HSA.ISA, isas)
-    push!(isas, isa)
-    return HSA.STATUS_SUCCESS
+    devs = HIP.devices()
+    append!(ALL_DEVICES, devs)
+    return devs
 end
 
-function iterate_agents_cb(agent::HSA.Agent, devices)
-    push!(devices, ROCDevice(agent))
-    return HSA.STATUS_SUCCESS
-end
-
-"""
-    fetch_devices() -> Vector{ROCDevice}
-
-Returns the list of HSA devices available on the system.
-"""
-function fetch_devices()
-    isempty(ALL_DEVICES) || return copy(ALL_DEVICES)
+function fetch_hsa_devices()
+    isempty(HSA_DEVICES) || return copy(HSA_DEVICES)
 
     devices = Ref(Vector{ROCDevice}())
     GC.@preserve devices begin
@@ -57,21 +21,21 @@ function fetch_devices()
         _devices = devices[]
     end
 
-    # Update the entries in the device handle dictionary
-    for device in _devices
-        push!(ALL_DEVICES, device)
-        DEVICES[device.agent.handle] = device
-    end
-
+    filter!(d -> device_type(d) == HSA.DEVICE_TYPE_GPU, _devices)
+    append!(HSA_DEVICES, _devices)
     return _devices
 end
 
+hsa_device(device::HIPDevice) = HSA_DEVICES[device.device_id]
+
 """
-    get_default_device() -> ROCDevice
+    get_default_device() -> HIPDevice
+
+TODO update docs
 
-Returns the default device, which is used for all kernel and array operations
-when one is not explicitly specified. May be changed with
-[`set_default_device!`](@ref).
+# Returns the default device, which is used for all kernel and array operations
+# when one is not explicitly specified. May be changed with
+# [`set_default_device!`](@ref).
 """
 function get_default_device()
     if !isassigned(DEFAULT_DEVICE)
@@ -79,356 +43,16 @@ function get_default_device()
     end
     DEFAULT_DEVICE[]
 end
+
 """
     set_default_device!(device::ROCDevice) -> ROCDevice
 
 Sets the default device to `device`. See [`get_default_device`](@ref) for more
 details.
 """
-function set_default_device!(device::ROCDevice)
+function set_default_device!(device::HIPDevice)
     DEFAULT_DEVICE[] = device
 end
 
 "Return all devices available to the runtime."
 devices() = copy(ALL_DEVICES)
-
-# Pretty-printing
-function Base.show(io::IO, device::ROCDevice)
-    name = device.uuid
-
-    name *= if device.name == device.productname || isempty(device.productname)
-        " [$(device.name)]"
-    else
-        " [$(device.productname) ($(device.name))]"
-    end
-
-    print(io, name)
-end
-
-### Device details
-
-getinfo(
-    agent::HSA.Agent, attribute::HSA.AgentInfo,
-    value::Union{Vector, Base.RefValue},
-) = HSA.agent_get_info(agent, attribute, value)
-
-getinfo(
-    agent::HSA.Agent, info::HSA.AMDAgentInfo,
-    value::Union{Vector, Base.RefValue},
-) = getinfo(agent, reinterpret(HSA.AgentInfo, info), value)
-
-getinfo(device::ROCDevice, query, value) = getinfo(device.agent, query, value)
-
-const AnyROCDevice = Union{ROCDevice,HSA.Agent}
-
-name(device::AnyROCDevice) =
-    getinfo(String, device, HSA.AGENT_INFO_NAME)
-
-product_name(device::AnyROCDevice) =
-    getinfo(String, device, HSA.AMD_AGENT_INFO_PRODUCT_NAME)
-
-uuid(device::AnyROCDevice) =
-    getinfo(String, device, HSA.AMD_AGENT_INFO_UUID)
-
-profile(device::AnyROCDevice) =
-    getinfo(HSA.Profile, device, HSA.AGENT_INFO_PROFILE)
-
-device_type(device::AnyROCDevice) =
-    getinfo(HSA.DeviceType, device, HSA.AGENT_INFO_DEVICE)
-
-device_wavefront_size(device::AnyROCDevice) =
-    getinfo(UInt32, device, HSA.AGENT_INFO_WAVEFRONT_SIZE)
-
-device_workgroup_max_size(device::AnyROCDevice) =
-    getinfo(UInt32, device, HSA.AGENT_INFO_WORKGROUP_MAX_SIZE)
-
-device_num_compute_units(device::AnyROCDevice) =
-    getinfo(UInt32, device, HSA.AMD_AGENT_INFO_COMPUTE_UNIT_COUNT)
-
-device_num_simds_per_compute_unit(device::AnyROCDevice) =
-    getinfo(UInt32, device, HSA.AMD_AGENT_INFO_NUM_SIMDS_PER_CU)
-
-function device_local_memory_size(device::AnyROCDevice)
-    _regions = regions(device)
-    for region in _regions
-        if region_segment(region) == HSA.REGION_SEGMENT_GROUP
-            return region_size(region)
-        end
-    end
-    error("Failed to find local memory region for $device")
-end
-
-### ISAs
-
-isas(device::ROCDevice) = isas(device.agent)
-function isas(agent::HSA.Agent)
-    isas = Ref(HSA.ISA[])
-    func = @cfunction(agent_iterate_isas_cb, HSA.Status, (HSA.ISA, Ref{Vector{HSA.ISA}}))
-    HSA.agent_iterate_isas(agent, func, isas) |> check
-    isas[]
-end
-
-# Device handle => default ISA.
-const DEFAULT_ISAS = Dict{UInt64, HSA.ISA}()
-
-function default_isa(device::ROCDevice)
-    lock(RT_LOCK) do
-        get!(
-            () -> first(Runtime.isas(device)),
-            DEFAULT_ISAS, Runtime.get_handle(device))
-    end
-end
-
-# TODO: PCRE regexes are not thread-safe
-const isa_regex = r"([a-z]*)-([a-z]*)-([a-z]*)--([a-z0-9]*)([a-zA-Z0-9+\-:]*)"
-function parse_isa(isa::HSA.ISA)
-    len = isa_name_length(isa)
-    name = Vector{UInt8}(undef, len)
-    getinfo(isa, HSA.ISA_INFO_NAME, name) |> check
-    name = String(name)
-    m = match(isa_regex, name)
-    @assert m !== nothing "Failed to match ISA name pattern: $name"
-    m
-end
-
-function llvm_arch_features(isa::HSA.ISA)
-    @memoize isa::HSA.ISA begin
-    m = parse_isa(isa)
-    arch = String(m.captures[4])
-    features = join(map(x->x[1:end-1],
-                        filter!(x->!isempty(x) && (x[end]=='+'),
-                               split(m.captures[5], ':'))),
-                    ",+")
-    if !isempty(features)
-        features = '+'*features
-    end
-    if Base.libllvm_version < v"12"
-        features = replace(features, "sramecc"=>"sram-ecc")
-    end
-    (arch, features)
-    end::Tuple{String, String}
-end
-architecture(isa::HSA.ISA) = llvm_arch_features(isa)[1]
-features(isa::HSA.ISA) = llvm_arch_features(isa)[2]
-
-getinfo(isa::HSA.ISA, attribute::HSA.ISAInfo,
-        value::Union{Vector, Base.RefValue}) =
-    HSA.isa_get_info_alt(isa, attribute, value)
-
-isa_name_length(isa::HSA.ISA) =
-    getinfo(Cuint, isa, HSA.ISA_INFO_NAME_LENGTH)
-
-isa_workgroup_max_size(isa::HSA.ISA) =
-    getinfo(UInt32, isa, HSA.ISA_INFO_WORKGROUP_MAX_SIZE)
-
-### Regions
-
-struct ROCMemoryRegion
-    region::HSA.Region
-end
-get_handle(region::ROCMemoryRegion) = region.region.handle
-
-function iterate_regions_cb(region::HSA.Region, regions)
-    push!(regions, region)
-    return HSA.STATUS_SUCCESS
-end
-
-function regions(agent::HSA.Agent)
-    @memoize agent::HSA.Agent begin
-        regions = Ref(HSA.Region[])
-        func = @cfunction(iterate_regions_cb, HSA.Status, (HSA.Region, Ref{Vector{HSA.Region}}))
-        HSA.agent_iterate_regions(agent, func, regions) |> check
-        map(ROCMemoryRegion, regions[])
-    end::Vector{ROCMemoryRegion}
-end
-regions(device::ROCDevice) = regions(device.agent)
-
-getinfo(region::HSA.Region, attribute::HSA.RegionInfo,
-        value::Union{Vector,Base.RefValue}) =
-    HSA.region_get_info(region, attribute, value)
-getinfo(region::HSA.Region, attribute::HSA.AMDRegionInfo,
-        value::Union{Vector,Base.RefValue}) =
-            getinfo(region, reinterpret(HSA.RegionInfo, attribute), value)
-
-getinfo(region::ROCMemoryRegion, query, value) = getinfo(region.region, query, value)
-
-const AnyROCMemoryRegion = Union{ROCMemoryRegion,HSA.Region}
-
-region_segment(region::AnyROCMemoryRegion) =
-    getinfo(HSA.RegionSegment, region, HSA.REGION_INFO_SEGMENT)
-region_global_flags(region::AnyROCMemoryRegion) =
-    getinfo(HSA.RegionGlobalFlag, region, HSA.REGION_INFO_GLOBAL_FLAGS)
-region_size(region::AnyROCMemoryRegion) =
-    getinfo(Csize_t, region, HSA.REGION_INFO_SIZE)
-region_alloc_max_size(region::AnyROCMemoryRegion) =
-    getinfo(Csize_t, region, HSA.REGION_INFO_ALLOC_MAX_SIZE)
-region_alloc_max_private_workgroup_size(region::AnyROCMemoryRegion) =
-    getinfo(Csize_t, region, HSA.REGION_INFO_ALLOC_MAX_PRIVATE_WORKGROUP_SIZE)
-region_runtime_alloc_allowed(region::AnyROCMemoryRegion) =
-    getinfo(Bool, region, HSA.REGION_INFO_RUNTIME_ALLOC_ALLOWED)
-region_runtime_alloc_granule(region::AnyROCMemoryRegion) =
-    getinfo(Csize_t, region, HSA.REGION_INFO_RUNTIME_ALLOC_GRANULE)
-region_runtime_alloc_alignment(region::AnyROCMemoryRegion) =
-    getinfo(Csize_t, region, HSA.REGION_INFO_RUNTIME_ALLOC_ALIGNMENT)
-region_host_accessible(region::AnyROCMemoryRegion) =
-    getinfo(Bool, region, HSA.AMD_REGION_INFO_HOST_ACCESSIBLE)
-
-function get_region(device::ROCDevice, kind::Symbol)
-    flag = if kind == :finegrained
-        HSA.REGION_GLOBAL_FLAG_FINE_GRAINED
-    elseif kind == :coarsegrained || kind == :coarsegrained_host
-        HSA.REGION_GLOBAL_FLAG_COARSE_GRAINED
-    elseif kind == :kernarg
-        HSA.REGION_GLOBAL_FLAG_KERNARG
-    else
-        error("Region kind $kind not supported")
-    end
-    _regions = regions(device)
-    _regions = filter(r -> region_global_flags(r) & flag > 0, _regions)
-    kind == :coarsegrained && filter!(!region_host_accessible, _regions)
-    @assert !isempty(_regions) "No region of kind $kind in device $device"
-    return first(_regions)
-end
-
-function Base.show(io::IO, region::ROCMemoryRegion)
-    segment_map = Dict(HSA.REGION_SEGMENT_GLOBAL   => :global,
-                       HSA.REGION_SEGMENT_READONLY => :readonly,
-                       HSA.REGION_SEGMENT_PRIVATE  => :private,
-                       HSA.REGION_SEGMENT_GROUP    => :group,
-                       HSA.REGION_SEGMENT_KERNARG  => :kernarg)
-    segment = segment_map[region_segment(region)]
-
-    _flags = region_global_flags(region)
-    flags = Symbol[]
-    flag_map = Dict(UInt32(HSA.REGION_GLOBAL_FLAG_KERNARG)        => :kernarg,
-                    UInt32(HSA.REGION_GLOBAL_FLAG_FINE_GRAINED)   => :finegrained,
-                    UInt32(HSA.REGION_GLOBAL_FLAG_COARSE_GRAINED) => :coarsegrained)
-    for (flag, flag_name) in CEnum.namemap(HSA.RegionGlobalFlag)
-        if flag & _flags > 0
-            push!(flags, flag_map[flag])
-        end
-    end
-    flags = "($(join(flags, ", ")))"
-
-    size = Base.format_bytes(region_size(region))
-
-    private_workgroup_size = segment == :private ? Base.format_bytes(region_alloc_max_private_workgroup_size(region)) : nothing
-
-    runtime_alloc = region_runtime_alloc_allowed(region)
-
-    alloc_granule = Base.format_bytes(region_runtime_alloc_granule(region))
-
-    alloc_align = Base.format_bytes(region_runtime_alloc_alignment(region))
-
-    max_size = Base.format_bytes(region_alloc_max_size(region))
-
-    host_access = region_host_accessible(region)
-
-    print(io, "ROCMemoryRegion @ $(repr(region.region.handle)): Segment $segment, Flags $flags, Size $size ($max_size max allocation), ")
-    if segment == :private
-        print(io, "Workgroup Max Alloc: $private_workgroup_size, ")
-    end
-    print(io, "Runtime Alloc: "); printstyled(io, "$runtime_alloc"; color=runtime_alloc ? :green : :red); print(io, " ($alloc_granule granularity, $alloc_align alignment), ")
-    print(io, "Host Accessible: "); printstyled(io, "$host_access"; color=host_access ? :green : :red)
-end
-
-### Memory Pools
-
-struct ROCMemoryPool
-    pool::HSA.AMDMemoryPool
-end
-get_handle(pool::ROCMemoryPool) = pool.pool.handle
-
-function iterate_pools_cb(pool::HSA.AMDMemoryPool, pools)
-    push!(pools, pool)
-    return HSA.STATUS_SUCCESS
-end
-
-function memory_pools(agent::HSA.Agent)
-    @memoize agent::HSA.Agent begin
-        pools = Ref(HSA.AMDMemoryPool[])
-        func = @cfunction(iterate_pools_cb, HSA.Status, (HSA.AMDMemoryPool, Ref{Vector{HSA.AMDMemoryPool}}))
-        HSA.amd_agent_iterate_memory_pools(agent, func, pools) |> check
-        map(ROCMemoryPool, pools[])
-    end::Vector{ROCMemoryPool}
-end
-memory_pools(device::ROCDevice) = memory_pools(device.agent)
-
-getinfo(pool::HSA.AMDMemoryPool, attribute::HSA.AMDMemoryPoolInfo,
-        value::Union{Vector,Base.RefValue}) =
-    HSA.amd_memory_pool_get_info(pool, attribute, value)
-
-getinfo(pool::ROCMemoryPool, query, value) = getinfo(pool.pool, query, value)
-
-const AnyROCMemoryPool = Union{ROCMemoryPool, HSA.AMDMemoryPool}
-
-pool_segment(pool::AnyROCMemoryPool) =
-    getinfo(HSA.AMDSegment, pool, HSA.AMD_MEMORY_POOL_INFO_SEGMENT)
-pool_global_flags(pool::AnyROCMemoryPool) =
-    getinfo(HSA.AMDMemoryPoolGlobalFlag, pool, HSA.AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS)
-pool_size(pool::AnyROCMemoryPool) =
-    getinfo(Csize_t, pool, HSA.AMD_MEMORY_POOL_INFO_SIZE)
-pool_alloc_max_size(pool::AnyROCMemoryPool) =
-    getinfo(Csize_t, pool, HSA.AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE)
-pool_runtime_alloc_allowed(pool::AnyROCMemoryPool) =
-    getinfo(Bool, pool, HSA.AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED)
-pool_runtime_alloc_granule(pool::AnyROCMemoryPool) =
-    getinfo(Csize_t, pool, HSA.AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE)
-pool_runtime_alloc_alignment(pool::AnyROCMemoryPool) =
-    getinfo(Csize_t, pool, HSA.AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT)
-pool_accessible_by_all(pool::AnyROCMemoryPool) =
-    getinfo(Bool, pool, HSA.AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL)
-
-function Base.show(io::IO, pool::ROCMemoryPool)
-    segment_map = Dict(HSA.AMD_SEGMENT_GLOBAL   => :global,
-                       HSA.AMD_SEGMENT_READONLY => :readonly,
-                       HSA.AMD_SEGMENT_PRIVATE  => :private,
-                       HSA.AMD_SEGMENT_GROUP    => :group)
-    segment = segment_map[pool_segment(pool)]
-
-    _flags = pool_global_flags(pool)
-    flags = Symbol[]
-    flag_map = Dict(UInt32(HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)   => :kernarg,
-                    UInt32(HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)   => :finegrained,
-                    UInt32(HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) => :coarsegrained)
-    for (flag, flag_name) in CEnum.namemap(HSA.AMDMemoryPoolGlobalFlag)
-        if flag & _flags > 0
-            push!(flags, flag_map[flag])
-        end
-    end
-    flags = "($(join(flags, ", ")))"
-
-    size = Base.format_bytes(pool_size(pool))
-
-    runtime_alloc = pool_runtime_alloc_allowed(pool)
-
-    alloc_granule = Base.format_bytes(pool_runtime_alloc_granule(pool))
-
-    alloc_align = Base.format_bytes(pool_runtime_alloc_alignment(pool))
-
-    accessible_all = pool_accessible_by_all(pool)
-
-    max_size = Base.format_bytes(pool_alloc_max_size(pool))
-
-    print(io, "ROCMemoryPool @ $(repr(pool.pool.handle)): Segment $segment, Flags $flags, Size $size ($max_size max allocation), ")
-    print(io, "Runtime Alloc: "); printstyled(io, "$runtime_alloc"; color=runtime_alloc ? :green : :red); print(io, " ($alloc_granule granularity, $alloc_align alignment), ")
-    print(io, "All Accessible: "); printstyled(io, "$accessible_all"; color=accessible_all ? :green : :red)
-end
-
-function get_memory_pool(device::ROCDevice, kind::Symbol)
-    flag = if kind == :finegrained
-        HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED
-    elseif kind == :coarsegrained || kind == :coarsegrained_host
-        HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED
-    elseif kind == :kernarg
-        HSA.AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT
-    else
-        error("Region kind $kind not supported")
-    end
-
-    _pools = memory_pools(device)
-    _pools = filter(p -> pool_global_flags(p) & flag > 0, _pools)
-    kind == :coarsegrained && filter!(!pool_accessible_by_all, _pools)
-    @assert !isempty(_pools) "No memory pool of kind $kind in device $device"
-    return first(_pools)
-end
diff --git a/src/runtime/executable.jl b/src/runtime/executable.jl
deleted file mode 100644
index 54f6959ad..000000000
--- a/src/runtime/executable.jl
+++ /dev/null
@@ -1,139 +0,0 @@
-getinfo(exsym::HSA.ExecutableSymbol, attribute::HSA.ExecutableSymbolInfo,
-        value::Union{Vector, Base.RefValue}) =
-    HSA.executable_symbol_get_info(exsym, attribute, value)
-
-executable_symbol_type(sym::HSA.ExecutableSymbol) =
-    getinfo(HSA.SymbolKind, sym, HSA.EXECUTABLE_SYMBOL_INFO_TYPE)
-
-# TODO: Symbol name length
-
-executable_symbol_name(sym::HSA.ExecutableSymbol) =
-    getinfo(String, sym, HSA.EXECUTABLE_SYMBOL_INFO_NAME)
-
-executable_symbol_kernel_object(sym::HSA.ExecutableSymbol) =
-    getinfo(UInt64, sym, HSA.EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT)
-
-executable_symbol_kernel_kernarg_segment_size(sym::HSA.ExecutableSymbol) =
-    getinfo(UInt32, sym, HSA.EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE)
-
-executable_symbol_kernel_group_segment_size(sym::HSA.ExecutableSymbol) =
-    getinfo(UInt32, sym, HSA.EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE)
-
-executable_symbol_kernel_private_segment_size(sym::HSA.ExecutableSymbol) =
-    getinfo(UInt32, sym, HSA.EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE)
-
-### @cfunction Callbacks ###
-
-function iterate_exec_agent_syms_cb(exe::HSA.Executable, agent::HSA.Agent,
-                                    sym::HSA.ExecutableSymbol,
-                                    sym_ref::Ptr{HSA.ExecutableSymbol})
-    if executable_symbol_type(sym) == HSA.SYMBOL_KIND_KERNEL
-        # FIXME: Ensure name matches
-        #name = executable_symbol_name(sym)
-        Base.unsafe_store!(sym_ref, sym)
-        return HSA.STATUS_INFO_BREAK
-    end
-
-    return HSA.STATUS_SUCCESS
-end
-
-mutable struct ROCExecutable
-    device::ROCDevice
-    executable::HSA.Executable
-    data::Vector{UInt8}
-    globals::Dict{Symbol, Mem.Buffer}
-end
-
-# TODO Docstring
-function ROCExecutable(device::ROCDevice, data::Vector{UInt8}, symbol::String; globals=())
-    code_object_reader_ref = Ref{HSA.CodeObjectReader}(HSA.CodeObjectReader(0))
-    HSA.code_object_reader_create_from_memory(
-        data, sizeof(data), code_object_reader_ref) |> check
-    code_object_reader = code_object_reader_ref[]
-
-    executable_ref = Ref{HSA.Executable}()
-    HSA.executable_create_alt(
-        profile(device), HSA.DEFAULT_FLOAT_ROUNDING_MODE_NEAR,
-        C_NULL, executable_ref) |> check
-    executable = executable_ref[]
-
-    _globals = Dict{Symbol,Mem.Buffer}()
-    for (gbl,sz) in globals
-        gbl_buf = Mem.alloc(device, sz; coherent=true)
-        HSA.executable_agent_global_variable_define(
-            executable, device.agent, string(gbl), gbl_buf.ptr) |> check
-        _globals[gbl] = gbl_buf
-    end
-
-    HSA.executable_load_agent_code_object(
-        executable, device.agent, code_object_reader,
-        C_NULL, C_NULL) |> check
-
-    HSA.executable_freeze(executable, "") |> check
-
-    exe = ROCExecutable(device, executable, data, _globals)
-
-    # TODO: Ensure no derived kernels are in flight during finalization
-    AMDGPU.hsaref!()
-    finalizer(exe) do e
-        HSA.executable_destroy(e.executable) |> check
-        for buf in values(e.globals)
-            Mem.free(buf)
-        end
-        HSA.code_object_reader_destroy(code_object_reader) |> check
-        AMDGPU.hsaunref!()
-    end
-
-    return exe
-end
-
-function get_global(exe::ROCExecutable, symbol::Symbol)
-    @assert symbol in keys(exe.globals) "No such global in executable: $symbol"
-    return exe.globals[symbol]
-end
-
-has_exception(e::ROCExecutable) = haskey(e.globals, :__global_exception_flag)
-
-function get_exception(
-    exe::ROCExecutable; cleanup::Bool = true, signal_handle::UInt64,
-)
-    has_exception(exe) || return nothing
-
-    # Check if any wavefront for this kernel threw an exception
-    ex_flag = get_global(exe, :__global_exception_flag)
-    ex_flag_ptr = Base.unsafe_convert(Ptr{Int64}, ex_flag)
-    ex_flag_value = Base.unsafe_load(ex_flag_ptr)
-    ex_flag_value == 0 && return nothing
-
-    ex_string = nothing
-    fetch_ex_strings = haskey(exe.globals, :__global_exception_ring)
-
-    if fetch_ex_strings
-        ex_strings = String[]
-        # Check for and collect any exceptions, and clear their slots
-        ex_ring = get_global(exe, :__global_exception_ring)
-        ex_ring_ptr_ptr = Base.unsafe_convert(
-            Ptr{Ptr{AMDGPU.Device.ExceptionEntry}}, ex_ring)
-        ex_ring_ptr = unsafe_load(ex_ring_ptr_ptr)
-
-        while (ex_ring_value = unsafe_load(ex_ring_ptr)).kern != 1
-            if ex_ring_value.kern == signal_handle && reinterpret(Ptr{UInt8}, ex_ring_value.ptr) != C_NULL
-                ex_ring_value_str = unsafe_string(
-                    reinterpret(Ptr{UInt8}, ex_ring_value.ptr))
-                push!(ex_strings, ex_ring_value_str)
-
-                if cleanup
-                    # FIXME: Write rest of entry first, then CAS 0 to kern field
-                    entry = AMDGPU.Device.ExceptionEntry(
-                        UInt64(0), Core.LLVMPtr{UInt8,1}(0))
-                    unsafe_store!(ex_ring_ptr, entry)
-                end
-            end
-            ex_ring_ptr += sizeof(AMDGPU.Device.ExceptionEntry)
-        end
-        unique!(ex_strings)
-        ex_string = join(ex_strings, '\n')
-    end
-
-    return KernelException(exe.device, ex_string)
-end
diff --git a/src/runtime/execution.jl b/src/runtime/execution.jl
index 44c00764b..1260a9479 100644
--- a/src/runtime/execution.jl
+++ b/src/runtime/execution.jl
@@ -24,218 +24,3 @@ The following keyword arguments are supported:
 - `queue` (defaults to the default queue)
 """
 AbstractKernel
-
-@generated function call(kernel::AbstractKernel{F,TT}, args...; call_kwargs...) where {F,TT}
-    sig = Tuple{F, TT.parameters...} # Base.signature_type with a function type
-    args = (:(kernel.f), (:( args[$i] ) for i in 1:length(args))...)
-
-    # filter out ghost arguments that shouldn't be passed
-    predicate = dt -> GPUCompiler.isghosttype(dt) || Core.Compiler.isconstType(dt)
-    to_pass = map(!predicate, sig.parameters)
-    call_t =                  Type[x[1] for x in zip(sig.parameters,  to_pass) if x[2]]
-    call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass)            if x[2]]
-
-    # replace non-isbits arguments (they should be unused, or compilation would have failed)
-    # alternatively, allow `launch` with non-isbits arguments.
-    for (i,dt) in enumerate(call_t)
-        if !isbitstype(dt)
-            call_t[i] = Ptr{Any}
-            call_args[i] = :C_NULL
-        end
-    end
-
-    # finalize types
-    call_tt = Base.to_tuple_type(call_t)
-
-    quote
-        Base.@_inline_meta
-
-        roccall(kernel, $call_tt, $(call_args...); call_kwargs...)
-    end
-end
-
-
-## host-side kernels
-
-struct HostKernel{F,TT} <: AbstractKernel{F,TT}
-    f::F
-    mod::ROCModule
-    fun::ROCFunction
-end
-
-@doc (@doc AbstractKernel) HostKernel
-
-@inline function roccall(kernel::HostKernel, tt, args...; signal::ROCKernelSignal, groupsize=1, kwargs...)
-    if groupsize == :auto
-        config = AMDGPU.launch_configuration(kernel; signal.kernel.localmem)
-        roccall(signal, tt, args...; config..., kwargs...)
-    else
-        roccall(signal, tt, args...; kwargs..., groupsize)
-    end
-end
-
-
-## host-side API
-
-
-# https://github.com/JuliaLang/julia/issues/14919
-(kernel::HostKernel)(args...; kwargs...) = call(kernel, args...; kwargs...)
-
-export roccall
-
-"""
-    roccall(signal::ROCKernelSignal, types, values...; groupsize::ROCDim, gridsize::ROCDim)
-
-`ccall`-like interface for launching a ROC function `f` on a GPU.
-
-For example:
-
-    vadd = ROCFunction(md, "vadd")
-    a = rand(Float32, 10)
-    b = rand(Float32, 10)
-    ad = Mem.upload(a)
-    bd = Mem.upload(b)
-    c = zeros(Float32, 10)
-    cd = Mem.alloc(c)
-
-    roccall(vadd, (Ptr{Cfloat},Ptr{Cfloat},Ptr{Cfloat}), ad, bd, cd;
-             gridsize=10)
-    Mem.download!(c, cd)
-
-The `groupsize` and `gridsize` arguments control the launch configuration, and should both
-consist of either an integer, or a tuple of 1 to 3 integers (omitted dimensions default to
-1). The `types` argument can contain both a tuple of types, and a tuple type, the latter
-being slightly faster.
-"""
-roccall
-
-# we need a generated function to get a tuple of converted arguments (using unsafe_convert),
-# without having to inspect the types at runtime
-@generated function roccall(
-    signal::ROCKernelSignal, tt::Type, args::Vararg{Any,N};
-    groupsize::ROCDim=1, gridsize::ROCDim=groupsize,
-) where N
-
-    # the type of `tt` is Type{Tuple{<:DataType...}}
-    types = tt.parameters[1].parameters
-
-    ex = Expr(:block)
-    push!(ex.args, :(Base.@_inline_meta))
-
-    # convert the argument values to match the kernel's signature (specified by the user)
-    # (this mimics `lower-ccall` in julia-syntax.scm)
-    converted_args = Vector{Symbol}(undef, length(args))
-    arg_ptrs = Vector{Symbol}(undef, length(args))
-    for i in 1:length(args)
-        converted_args[i] = gensym()
-        arg_ptrs[i] = gensym()
-        push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i])))
-        push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i]))))
-    end
-
-    append!(ex.args, (quote
-        write_args!(signal.kernel, $(arg_ptrs...))
-        #GC.@preserve $(converted_args...) begin
-            launch_kernel!(signal, groupsize, gridsize, ($(arg_ptrs...),))
-        #end
-    end).args)
-
-    return ex
-end
-
-function write_args!(kernel::ROCKernel, args...)
-    # Allocate the kernel argument buffer
-    key = khash(args)
-    kernarg_address, do_write = Mem.alloc_pooled(kernel.device, key, :kernarg,
-                                                 kernel.kernarg_segment_size)
-
-    if do_write
-        # Fill kernel argument buffer
-        # FIXME: Query kernarg segment alignment
-        ctr = 0
-        for arg in args
-            sz = sizeof(typeof(arg))
-            if sz == 0
-                continue
-            end
-            rarg = Ref(arg)
-            align = Base.datatype_alignment(typeof(arg))
-            rem = mod(ctr, align)
-            if rem > 0
-                ctr += align-rem
-            end
-            ccall(:memcpy, Cvoid,
-                  (Ptr{Cvoid}, Ptr{Cvoid}, Csize_t),
-                  kernarg_address+ctr, rarg, sz)
-            ctr += sz
-        end
-    end
-
-    # Register kernarg buffer
-    kernel.kernarg_address = kernarg_address
-    AMDGPU.hsaref!()
-    finalizer(kernel) do k
-        Mem.free_pooled(k.device, key, :kernarg, kernarg_address)
-        AMDGPU.hsaunref!()
-    end
-end
-
-## device-side kernels
-
-struct DeviceKernel{F,TT} <: AbstractKernel{F,TT}
-    fun::Ptr{Cvoid}
-end
-
-@doc (@doc AbstractKernel) DeviceKernel
-
-@inline roccall(kernel::DeviceKernel, tt, args...; kwargs...) =
-    dynamic_roccall(kernel.fun, tt, args...; kwargs...)
-
-# FIXME: duplication with roccall
-@generated function dynamic_roccall(f::Ptr{Cvoid}, tt::Type, args...;
-                                     blocks=UInt32(1), threads=UInt32(1), shmem=UInt32(0),
-                                     queue=queue())
-    types = tt.parameters[1].parameters     # the type of `tt` is Type{Tuple{<:DataType...}}
-
-    ex = quote
-        Base.@_inline_meta
-    end
-
-    # convert the argument values to match the kernel's signature (specified by the user)
-    # (this mimics `lower-ccall` in julia-syntax.scm)
-    converted_args = Vector{Symbol}(undef, length(args))
-    arg_ptrs = Vector{Symbol}(undef, length(args))
-    for i in 1:length(args)
-        converted_args[i] = gensym()
-        arg_ptrs[i] = gensym()
-        push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i])))
-        push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i]))))
-    end
-
-    append!(ex.args, (quote
-        #GC.@preserve $(converted_args...) begin
-            launch(f, blocks, threads, shmem, stream, $(arg_ptrs...))
-        #end
-    end).args)
-
-    return ex
-end
-
-
-## device-side API
-
-"""
-    dynamic_rocfunction(f, tt=Tuple{})
-
-Low-level interface to compile a function invocation for the currently-active GPU, returning
-a callable kernel object. Device-side equivalent of [`AMDGPU.rocfunction`](@ref).
-
-No keyword arguments are supported.
-"""
-@inline function dynamic_rocfunction(f::F, tt::Type=Tuple{}) where {F <: Function}
-    fptr = GPUCompiler.deferred_codegen(Val(F), Val(tt))
-    DeviceKernel{f,tt}(fptr)
-end
-
-# https://github.com/JuliaLang/julia/issues/14919
-(kernel::DeviceKernel)(args...; kwargs...) = call(kernel, args...; kwargs...)
diff --git a/src/runtime/hashing.jl b/src/runtime/hashing.jl
deleted file mode 100644
index d15e04e5e..000000000
--- a/src/runtime/hashing.jl
+++ /dev/null
@@ -1,22 +0,0 @@
-# Kernel argument hashing
-
-## Arguments which are written to the kernarg buffer identically should have
-## the same khash value. Array contents are not hashed; instead, we hash the
-## array pointer.
-
-function khash(x::T, h::UInt=UInt(0)) where T
-    # Generic hashing
-    h = khash(T, h)
-    if isstructtype(T)
-        for name in fieldnames(T)
-            h = khash(getfield(x, name), h)
-        end
-    elseif isprimitivetype(T)
-        h = hash(x, h)
-    else
-        error("Can't hash: $T")
-    end
-    return h
-end
-khash(::Type{T}, h::UInt=UInt(0)) where T = hash(T, h)
-khash(x::Symbol, h::UInt=UInt(0)) = hash(x, h)
diff --git a/src/runtime/hip-execution.jl b/src/runtime/hip-execution.jl
new file mode 100644
index 000000000..1e68dc4ba
--- /dev/null
+++ b/src/runtime/hip-execution.jl
@@ -0,0 +1,115 @@
+struct HIPKernel{F, TT} <: AbstractKernel{F, TT}
+    f::F
+    fun::HIP.HIPFunction
+end
+
+@inline @generated function call(
+    kernel::HIPKernel{F, TT}, args...;
+    stream::HIP.HIPStream, call_kwargs...,
+) where {F, TT}
+    sig = Tuple{F, TT.parameters...} # Base.signature_type with a function type
+    args = (:(kernel.f), (:( args[$i] ) for i in 1:length(args))...)
+
+    # filter out ghost arguments that shouldn't be passed
+    predicate = dt -> GPUCompiler.isghosttype(dt) || Core.Compiler.isconstType(dt)
+    to_pass = map(!predicate, sig.parameters)
+    call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]]
+    call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]]
+
+    # replace non-isbits arguments (they should be unused, or compilation would have failed)
+    # alternatively, allow `launch` with non-isbits arguments.
+    for (i,dt) in enumerate(call_t)
+        if !isbitstype(dt)
+            call_t[i] = Ptr{Any}
+            call_args[i] = :C_NULL
+        end
+    end
+
+    # add the kernel state
+    pushfirst!(call_t, AMDGPU.KernelState)
+    pushfirst!(call_args, :(AMDGPU.KernelState(
+        stream.device, kernel.fun.global_hostcalls)))
+
+    # finalize types
+    call_tt = Base.to_tuple_type(call_t)
+
+    quote
+        roccall(kernel.fun, $call_tt, $(call_args...); stream, call_kwargs...)
+    end
+end
+
+function (ker::HIPKernel{F, TT})(
+    args...; stream::HIP.HIPStream = AMDGPU.stream(), call_kwargs...,
+) where {F, TT}
+    # Check if previous kernels threw an exception.
+    AMDGPU.throw_if_exception(stream.device)
+    call(ker, map(AMDGPU.rocconvert, args)...; stream, call_kwargs...)
+end
+
+@inline @generated function convert_arguments(f::Function, ::Type{tt}, args...) where tt
+    types = tt.parameters
+
+    ex = quote end
+
+    converted_args = Vector{Symbol}(undef, length(args))
+    arg_ptrs = Vector{Symbol}(undef, length(args))
+    for i in 1:length(args)
+        converted_args[i] = gensym()
+        arg_ptrs[i] = gensym()
+        push!(ex.args, :($(converted_args[i]) = Base.cconvert($(types[i]), args[$i])))
+        push!(ex.args, :($(arg_ptrs[i]) = Base.unsafe_convert($(types[i]), $(converted_args[i]))))
+    end
+
+    append!(ex.args, (quote
+        GC.@preserve $(converted_args...) begin
+            f($(arg_ptrs...))
+        end
+    end).args)
+    return ex
+end
+
+function roccall(fun::HIP.HIPFunction, tt::Type, args...; kwargs...)
+    convert_arguments(tt, args...) do pointers...
+        launch(fun, pointers...; kwargs...)
+    end
+end
+
+@inline @generated function pack_arguments(f::Function, args...)
+    for arg in args
+        isbitstype(arg) || throw(ArgumentError(
+            "Arguments to kernel should be bitstype, instead `$(arg)` was given."))
+    end
+
+    ex = quote end
+
+    arg_refs = Vector{Symbol}(undef, length(args))
+    for i in 1:length(args)
+        arg_refs[i] = gensym()
+        push!(ex.args, :($(arg_refs[i]) = Base.RefValue(args[$i])))
+    end
+
+    arg_ptrs = [
+        :(Base.unsafe_convert(Ptr{Cvoid}, $(arg_refs[i])))
+        for i in 1:length(args)]
+
+    append!(ex.args, (quote
+        GC.@preserve $(arg_refs...) begin
+            kernel_params = [$(arg_ptrs...)]
+            f(kernel_params)
+        end
+    end).args)
+    return ex
+end
+
+function launch(
+    fun::HIP.HIPFunction, args::Vararg{Any, N};
+    gridsize::ROCDim = 1, groupsize::ROCDim = 1,
+    shmem::Integer = 0, stream::HIP.HIPStream,
+) where N
+    gd, bd = ROCDim3(gridsize), ROCDim3(groupsize)
+    pack_arguments(args...) do kernel_params
+        HIP.hipModuleLaunchKernel(
+            fun, gd.x, gd.y, gd.z, bd.x, bd.y, bd.z,
+            shmem, stream, kernel_params, C_NULL) |> HIP.check
+    end
+end
diff --git a/src/runtime/hsa_device.jl b/src/runtime/hsa_device.jl
new file mode 100644
index 000000000..4c66b73de
--- /dev/null
+++ b/src/runtime/hsa_device.jl
@@ -0,0 +1,170 @@
+mutable struct ROCDevice
+    agent::HSA.Agent
+
+    # Cached information
+    type::HSA.DeviceType
+    name::String
+    productname::String
+    uuid::String
+
+    function ROCDevice(handle::HSA.Agent)
+        device = new(handle)
+        device.type = device_type(device)
+        device.name = name(device)
+        device.productname = product_name(device)
+        device.uuid = uuid(device)
+
+        return device
+    end
+end
+ROCDevice() = AMDGPU.device()
+get_handle(device::ROCDevice) = device.agent.handle
+
+Base.:(==)(device1::ROCDevice, device2::ROCDevice) =
+    device1.agent == device2.agent
+
+function agent_iterate_isas_cb(isa::HSA.ISA, isas)
+    push!(isas, isa)
+    return HSA.STATUS_SUCCESS
+end
+
+function iterate_agents_cb(agent::HSA.Agent, devices)
+    push!(devices, ROCDevice(agent))
+    return HSA.STATUS_SUCCESS
+end
+
+# Pretty-printing
+function Base.show(io::IO, device::ROCDevice)
+    name = device.uuid
+
+    name *= if device.name == device.productname || isempty(device.productname)
+        " [$(device.name)]"
+    else
+        " [$(device.productname) ($(device.name))]"
+    end
+
+    print(io, name)
+end
+
+### Device details
+
+getinfo(
+    agent::HSA.Agent, attribute::HSA.AgentInfo,
+    value::Union{Vector, Base.RefValue},
+) = HSA.agent_get_info(agent, attribute, value)
+
+getinfo(
+    agent::HSA.Agent, info::HSA.AMDAgentInfo,
+    value::Union{Vector, Base.RefValue},
+) = getinfo(agent, reinterpret(HSA.AgentInfo, info), value)
+
+getinfo(device::ROCDevice, query, value) = getinfo(device.agent, query, value)
+
+const AnyROCDevice = Union{ROCDevice,HSA.Agent}
+
+name(device::AnyROCDevice) =
+    getinfo(String, device, HSA.AGENT_INFO_NAME)
+
+product_name(device::AnyROCDevice) =
+    getinfo(String, device, HSA.AMD_AGENT_INFO_PRODUCT_NAME)
+
+uuid(device::AnyROCDevice) =
+    getinfo(String, device, HSA.AMD_AGENT_INFO_UUID)
+
+profile(device::AnyROCDevice) =
+    getinfo(HSA.Profile, device, HSA.AGENT_INFO_PROFILE)
+
+device_type(device::AnyROCDevice) =
+    getinfo(HSA.DeviceType, device, HSA.AGENT_INFO_DEVICE)
+
+device_wavefront_size(device::AnyROCDevice) =
+    getinfo(UInt32, device, HSA.AGENT_INFO_WAVEFRONT_SIZE)
+
+device_workgroup_max_size(device::AnyROCDevice) =
+    getinfo(UInt32, device, HSA.AGENT_INFO_WORKGROUP_MAX_SIZE)
+
+device_num_compute_units(device::AnyROCDevice) =
+    getinfo(UInt32, device, HSA.AMD_AGENT_INFO_COMPUTE_UNIT_COUNT)
+
+device_num_simds_per_compute_unit(device::AnyROCDevice) =
+    getinfo(UInt32, device, HSA.AMD_AGENT_INFO_NUM_SIMDS_PER_CU)
+
+function device_local_memory_size(device::AnyROCDevice)
+    _regions = regions(device)
+    for region in _regions
+        if region_segment(region) == HSA.REGION_SEGMENT_GROUP
+            return region_size(region)
+        end
+    end
+    error("Failed to find local memory region for $device")
+end
+
+### ISAs
+
+struct HSAISA
+    hsa_isa::HSA.ISA
+    arch_features::Tuple{String, String}
+end
+
+function HSAISA(hsa_isa::HSA.ISA)
+    HSAISA(hsa_isa, llvm_arch_features(hsa_isa))
+end
+
+isas(device::ROCDevice) = isas(device.agent)
+function isas(agent::HSA.Agent)
+    isas = Ref(HSA.ISA[])
+    func = @cfunction(agent_iterate_isas_cb, HSA.Status, (HSA.ISA, Ref{Vector{HSA.ISA}}))
+    HSA.agent_iterate_isas(agent, func, isas) |> check
+    HSAISA.(isas[])
+end
+
+# Device handle => default ISA.
+const DEFAULT_ISAS = Dict{UInt64, HSAISA}()
+
+function default_isa(device::ROCDevice)
+    lock(RT_LOCK) do
+        get!(
+            () -> first(Runtime.isas(device)),
+            DEFAULT_ISAS, Runtime.get_handle(device))
+    end
+end
+
+# TODO: PCRE regexes are not thread-safe
+const isa_regex = r"([a-z]*)-([a-z]*)-([a-z]*)--([a-z0-9]*)([a-zA-Z0-9+\-:]*)"
+function parse_isa(isa::HSA.ISA)
+    len = isa_name_length(isa)
+    name = Vector{UInt8}(undef, len)
+    getinfo(isa, HSA.ISA_INFO_NAME, name) |> check
+    name = String(name)
+    m = match(isa_regex, name)
+    @assert m !== nothing "Failed to match ISA name pattern: $name"
+    m
+end
+
+function llvm_arch_features(isa::HSA.ISA)
+    m = parse_isa(isa)
+    arch = String(m.captures[4])
+    features = join(map(x->x[1:end-1],
+                        filter!(x->!isempty(x) && (x[end]=='+'),
+                               split(m.captures[5], ':'))),
+                    ",+")
+    if !isempty(features)
+        features = '+'*features
+    end
+    if Base.libllvm_version < v"12"
+        features = replace(features, "sramecc"=>"sram-ecc")
+    end
+    (arch, features)
+end
+
+architecture(isa::HSA.ISA) = llvm_arch_features(isa)[1]
+
+features(isa::HSA.ISA) = llvm_arch_features(isa)[2]
+
+getinfo(isa::HSA.ISA, attribute::HSA.ISAInfo, value::Union{Vector, Base.RefValue}) =
+    HSA.isa_get_info_alt(isa, attribute, value)
+
+isa_name_length(isa::HSA.ISA) = getinfo(Cuint, isa, HSA.ISA_INFO_NAME_LENGTH)
+
+isa_workgroup_max_size(isa::HSA.ISA) =
+    getinfo(UInt32, isa, HSA.ISA_INFO_WORKGROUP_MAX_SIZE)
diff --git a/src/runtime/kernel-signal.jl b/src/runtime/kernel-signal.jl
deleted file mode 100644
index 9582d67e1..000000000
--- a/src/runtime/kernel-signal.jl
+++ /dev/null
@@ -1,105 +0,0 @@
-struct KernelException <: Exception
-    device::ROCDevice
-    exstr::Union{String, Nothing}
-end
-
-function Base.showerror(io::IO, err::KernelException)
-    print(io, "KernelException: exception(s) thrown during kernel execution on device $(err.device)")
-    if err.exstr !== nothing
-        println(io, ":")
-        print(io, err.exstr)
-    end
-end
-
-mutable struct ROCKernelSignal
-    signal::ROCSignal
-    queue::ROCQueue
-    kernel::ROCKernel
-    exception::Union{Exception, Nothing}
-    @atomic active::Bool
-
-    function ROCKernelSignal(
-        signal::ROCSignal, queue::ROCQueue, kernel::ROCKernel;
-    )
-        kersig = new(signal, queue, kernel, nothing, true)
-        finalizer(kersig) do k
-            cleanup!(k; finished=true)
-        end
-        kersig
-    end
-end
-
-function Base.wait(
-    kersig::ROCKernelSignal; check_exceptions::Bool = true,
-    cleanup::Bool = false, signal_kwargs...,
-)
-    @log_start(:wait, (;entry=kersig.kernel.sym, signal=get_handle(kersig.signal)), nothing)
-    finished = try
-        wait(kersig.signal; queue=kersig.queue, signal_kwargs...)
-        true
-    catch err
-        if isa(err, SignalTimeoutException) && SIGNAL_TIMEOUT_KILL_QUEUE[]
-            kill_queue!(kersig.queue)
-        end
-        isnothing(kersig.exception) && (kersig.exception = err;)
-        false
-    finally
-        @log_finish(:wait, (;entry=kersig.kernel.sym, signal=get_handle(kersig.signal)), nothing)
-    end
-
-    if cleanup
-        cleanup!(kersig; finished, check_exceptions)
-    elseif check_exceptions
-        # Ensure we check for and propagate errors
-        ex = get_exception(kersig; finished, cleanup=false)
-        if ex !== nothing
-            kersig.exception = ex
-        end
-    end
-
-    if check_exceptions
-        # Report any kernel-specific exceptions
-        if kersig.exception !== nothing
-            throw(kersig.exception)
-        end
-        # Report any queue-specific exceptions
-        ensure_active(kersig.queue)
-    end
-
-    return finished
-end
-
-function cleanup!(
-    kersig::ROCKernelSignal; finished::Bool, check_exceptions::Bool = true,
-)
-    _, succ = @atomicreplace kersig.active true => false
-    succ || return
-
-    unpreserve!(kersig)
-
-    if finished
-        ex = get_exception(kersig; finished, cleanup=true)
-        isnothing(ex) || (kersig.exception = ex;)
-    end
-
-    exe = kersig.kernel.exe::ROCExecutable
-    lock(RT_LOCK) do
-        mod = EXE_TO_MODULE_MAP[exe].value::ROCModule
-        signal_handle = get_handle(kersig.signal)::UInt64
-        delete_metadata!(mod; signal_handle)
-    end
-
-    return
-end
-function get_exception(kersig::ROCKernelSignal; finished::Bool, cleanup::Bool)
-    exe = kersig.kernel.exe::ROCExecutable
-    signal_handle::UInt64 = get_handle(kersig.signal)
-    return get_exception(exe; signal_handle, cleanup)
-end
-
-function Base.show(io::IO, kersig::ROCKernelSignal)
-    ex = kersig.exception
-    print(io, "ROCKernelSignal(signal=$(kersig.signal)$(ex !== nothing ? ", exception=$ex" : ""))")
-end
-
-Base.notify(kersig::ROCKernelSignal) = notify(kersig.signal)
diff --git a/src/runtime/kernel.jl b/src/runtime/kernel.jl
deleted file mode 100644
index 7b1e9734c..000000000
--- a/src/runtime/kernel.jl
+++ /dev/null
@@ -1,124 +0,0 @@
-export barrier_and!, barrier_or!
-
-## Kernel allocations
-
-struct KernelMetadata
-    kern::UInt64
-    buf::Mem.Buffer
-end
-
-## Kernel module and function
-
-export ROCModule, ROCFunction
-
-const MAX_EXCEPTIONS = 256
-const EXE_TO_MODULE_MAP = IdDict{ROCExecutable, WeakRef}()
-mutable struct ROCModule
-    exe::ROCExecutable
-    metadata::Vector{KernelMetadata}
-    exceptions::Mem.Buffer
-end
-
-function delete_metadata!(m::ROCModule; signal_handle::UInt64 = UInt64(0))
-    isempty(m.metadata) && return nothing
-
-    only_handle = signal_handle != UInt64(0)
-    for i in length(m.metadata):-1:1
-        meta = m.metadata[i]
-        Mem.free(meta.buf)
-        if only_handle && (meta.kern == signal_handle)
-            deleteat!(m.metadata, i)
-        end
-    end
-    only_handle || empty!(m.metadata)
-    return nothing
-end
-
-function ROCModule(exe::ROCExecutable)
-    device = exe.device
-    metadata = KernelMetadata[]
-    bytesize = sizeof(AMDGPU.Device.ExceptionEntry) * MAX_EXCEPTIONS
-    exceptions = Mem.alloc(device, bytesize; coherent=true)
-
-    mod = ROCModule(exe, metadata, exceptions)
-    EXE_TO_MODULE_MAP[exe] = WeakRef(mod)
-
-    AMDGPU.hsaref!()
-    return finalizer(mod) do m
-        delete_metadata!(m)
-        Mem.free(m.exceptions)
-        delete!(EXE_TO_MODULE_MAP, m.exe)
-        AMDGPU.hsaunref!()
-    end
-end
-mutable struct ROCFunction
-    mod::ROCModule
-    entry::String
-    hash::UInt64
-end
-
-## Kernel instance
-
-mutable struct ROCKernel
-    device::ROCDevice
-    exe::ROCExecutable
-    sym::String
-    localmem::Int64
-    kernel_object::UInt64
-    kernarg_segment_size::UInt32
-    group_segment_size::UInt32
-    private_segment_size::UInt32
-    kernarg_address::Ptr{Cvoid}
-end
-
-function executable_symbol_any(exe::ROCExecutable, device::ROCDevice)
-    agent_func = @cfunction(iterate_exec_agent_syms_cb, HSA.Status,
-                            (HSA.Executable, HSA.Agent, HSA.ExecutableSymbol, Ptr{HSA.ExecutableSymbol}))
-    exec_symbol_ref = Ref{HSA.ExecutableSymbol}()
-    ret = HSA.executable_iterate_agent_symbols(
-        exe.executable, device.agent, agent_func, exec_symbol_ref)
-    @assert ret == HSA.STATUS_SUCCESS || ret == HSA.STATUS_INFO_BREAK
-    if isassigned(exec_symbol_ref)
-        return exec_symbol_ref[]
-    end
-    return nothing
-end
-function executable_symbol_by_name(exe::ROCExecutable, device::ROCDevice, name::Symbol)
-    agent_ref = Ref(device.agent)
-    exec_symbol_ref = Ref{HSA.ExecutableSymbol}()
-    GC.@preserve agent_ref begin
-        HSA.executable_get_symbol_by_name(
-            exe.executable, symbol, agent_ref, exec_symbol_ref) |> check
-    end
-    if isassigned(exec_symbol_ref)
-        return exec_symbol_ref[]
-    end
-    return nothing
-end
-
-function ROCKernel(kernel #= ::HostKernel =#; localmem::Int=0)
-    exe = kernel.mod.exe
-    device = exe.device
-    symbol = kernel.fun.entry
-
-    exec_symbol = executable_symbol_any(exe, device)
-    # TODO: Conditionally disable once ROCR is fixed
-    if exec_symbol === nothing
-        exec_symbol = something(executable_symbol_by_name(exe, device, symbol))
-    end
-
-    kernel_object = executable_symbol_kernel_object(exec_symbol)
-    kernarg_segment_size = executable_symbol_kernel_kernarg_segment_size(exec_symbol)
-    if kernarg_segment_size == 0
-        # FIXME: Hidden arguments!
-        # Allocate some memory anyway, #10
-        kernarg_segment_size = UInt32(max(kernarg_segment_size, 8))
-    end
-
-    group_segment_size = executable_symbol_kernel_group_segment_size(exec_symbol)
-    group_segment_size = UInt32(group_segment_size + localmem)
-    private_segment_size = executable_symbol_kernel_private_segment_size(exec_symbol)
-    ROCKernel(device, exe, symbol, localmem, kernel_object,
-        kernarg_segment_size, group_segment_size,
-        private_segment_size, Ptr{Cvoid}(0))
-end
diff --git a/src/runtime/launch.jl b/src/runtime/launch.jl
deleted file mode 100644
index 560aa79b9..000000000
--- a/src/runtime/launch.jl
+++ /dev/null
@@ -1,201 +0,0 @@
-# Kernel and barrier launch
-# modeled after: CUDAdrv/src/execution.jl
-
-## HSA object preservation while a kernel is active
-
-const SIGNAL_PRESERVED = IdDict{ROCSignal, Vector{Any}}()
-
-function preserve!(sig::ROCSignal, @nospecialize(x))
-    set = get!(()->Any[], SIGNAL_PRESERVED, sig)
-    push!(set, x)
-end
-preserve!(sig::ROCKernelSignal, @nospecialize(x)) = preserve!(sig.signal, x)
-
-unpreserve!(sig::ROCSignal) = delete!(SIGNAL_PRESERVED, sig)
-unpreserve!(sig::ROCKernelSignal) = unpreserve!(sig.signal)
-
-# we need a generated function to get an args array,
-# without having to inspect the types at runtime
-@generated function launch_kernel!(
-    signal::ROCKernelSignal, groupsize::ROCDim,
-    gridsize::ROCDim, args::NTuple{N,Any},
-) where N
-    all(isbitstype, args.parameters) ||
-        throw(ArgumentError("Arguments to kernel should be bitstype."))
-
-    ex = Expr(:block)
-    push!(ex.args, :(Base.@_inline_meta))
-
-    # put arguments in Ref boxes so that we can get a pointers to them
-    arg_refs = Vector{Symbol}(undef, N)
-    for i in 1:N
-        arg_refs[i] = gensym()
-        push!(ex.args, :($(arg_refs[i]) = Base.RefValue(args[$i])))
-    end
-
-    append!(ex.args, (quote
-        GC.@preserve $(arg_refs...) begin
-            # validate launch parameters
-            groupsize, gridsize = normalize_launch_dimensions(groupsize, gridsize)
-
-            # launch kernel
-            Base.@lock signal.queue.lock begin
-                push!(signal.queue.active_kernels, signal)
-            end
-            launch_kernel!(signal.queue, signal.kernel, signal.signal, groupsize, gridsize)
-
-            # preserve kernel and arguments
-            $preserve!(signal, signal.kernel)
-            for arg in args
-                $preserve!(signal, arg)
-            end
-
-            notify(signal.queue.running)
-        end
-    end).args)
-
-    return ex
-end
-
-struct ROCSignalSet{T}
-    signals::Vector{ROCSignal}
-end
-ROCSignalSet{T}() where T = ROCSignalSet{T}(ROCSignal[])
-
-Base.wait(ss::ROCSignalSet{HSA.BarrierAndPacket}) = wait.(ss.signals)
-
-function Base.wait(ss::ROCSignalSet{HSA.BarrierOrPacket})
-    #= FIXME
-    # We need to hack around the fact that barrier OR packets don't handle more
-    # than 5 dependencies. We could implement the waiting in software, and emit
-    # a barrier that waits on a signal tied to that waiter.
-    =#
-    @warn "Waiting on OR barriers waits on all partitioned barriers to complete" maxlog=1
-    wait.(ss.signals)
-end
-
-"Normalize and validate launch groupsize and gridsize."
-function normalize_launch_dimensions(groupsize, gridsize)
-    groupsize = ROCDim3(groupsize)
-    gridsize = ROCDim3(gridsize)
-
-    # Validate group and grid dimensions
-    (groupsize.x > 0 && groupsize.y > 0 && groupsize.z > 0) ||
-        throw(ArgumentError("Group dimensions must be non-zero"))
-    (gridsize.x > 0 && gridsize.y > 0 && gridsize.z > 0) ||
-        throw(ArgumentError("Grid dimensions must be non-zero"))
-    (groupsize.x <= AMDGPU.Device._max_group_size &&
-     groupsize.y <= AMDGPU.Device._max_group_size &&
-     groupsize.z <= AMDGPU.Device._max_group_size &&
-     groupsize.x * groupsize.y * groupsize.z <= AMDGPU.Device._max_group_size) ||
-        throw(ArgumentError("Group dimensions too large"))
-
-    return groupsize, gridsize
-end
-
-"""
-    launch_kernel!(queue::ROCQueue, kern::ROCKernel, signal::ROCSignal,
-                   groupsize::ROCDim, gridsize::ROCDim)
-
-Low-level call to launch a function (encoded in `kern`) on the GPU, using
-`groupsize` and `gridsize` as the grid and block configuration, respectively.
-The kernel is launched on `queue` and will notify `signal` upon completion.
-
-Arguments to a kernel should either be bitstype, in which case they will be
-copied to the internal kernel parameter buffer, or a pointer to device memory.
-
-This is a low-level call, preferably use [`roccall`](@ref) instead.
-"""
-function launch_kernel!(
-    queue::ROCQueue, kernel::ROCKernel, signal::ROCSignal,
-    groupsize::ROCDim3, gridsize::ROCDim3,
-)
-    @log_start(:launch_kernel!, (;entry=kernel.sym, signal=get_handle(signal)), (;queue=UInt64(get_handle(queue))))
-
-    enqueue_packet!(HSA.KernelDispatchPacket, queue) do _packet
-        @set! _packet.setup = 3 << Int(HSA.KERNEL_DISPATCH_PACKET_SETUP_DIMENSIONS)
-        @set! _packet.workgroup_size_x = groupsize.x
-        @set! _packet.workgroup_size_y = groupsize.y
-        @set! _packet.workgroup_size_z = groupsize.z
-        @set! _packet.grid_size_x = gridsize.x
-        @set! _packet.grid_size_y = gridsize.y
-        @set! _packet.grid_size_z = gridsize.z
-        @set! _packet.completion_signal = signal.signal
-        @set! _packet.kernel_object = kernel.kernel_object
-        @set! _packet.kernarg_address = kernel.kernarg_address
-        @set! _packet.private_segment_size = kernel.private_segment_size
-        @set! _packet.group_segment_size = kernel.group_segment_size
-        _packet
-    end
-
-    @log_finish(:launch_kernel!, (;entry=kernel.sym, signal=get_handle(signal)), (;queue=UInt64(get_handle(queue))))
-end
-
-function launch_barrier!(T, queue::ROCQueue, signals::Vector{ROCSignal})
-    outset = ROCSignalSet{T}()
-    if !isempty(signals)
-        for signal_set in Iterators.partition(signals, 5)
-            comp_signal = ROCSignal()
-            enqueue_packet!(T, queue) do _packet
-                @set! _packet.dep_signal = ntuple(i->length(signal_set)>=i ? signal_set[i].signal : HSA.Signal(0), 5)
-                _packet
-            end
-            push!(outset.signals, comp_signal)
-        end
-    end
-    return outset
-end
-
-# Atomic store using LLVM intrinsics
-# Necessary for writing the AQL packet header to the queue
-# prior to launching a kernel.
-@eval atomic_store_n!(x::Ptr{UInt16}, v::UInt16) =
-    Base.llvmcall($"""
-    %ptr = inttoptr i$(Sys.WORD_SIZE) %0 to i16*
-    store atomic i16 %1, i16* %ptr release, align 64
-    ret void
-    """, Cvoid, Tuple{Ptr{UInt16}, UInt16}, x, v)
-
-function enqueue_packet!(f::Base.Callable, ::Type{T}, queue::ROCQueue) where T
-    # Obtain the current queue write index and queue size
-    ensure_active(queue)
-    queue_ptr = queue.queue
-    _queue = unsafe_load(queue.queue)
-    queue_size = _queue.size
-    write_index = HSA.queue_add_write_index_scacq_screl(queue_ptr, UInt64(1))
-
-    # Yield until queue has space
-    while write_index - HSA.queue_load_read_index_scacquire(queue_ptr) >= queue_size
-        # TODO: Exponential backoff with initial `Libc.systemsleep` calls
-        yield()
-    end
-
-    # TODO: Make this less ugly
-    dispatch_packet = Ref{T}()
-    ccall(:memset, Cvoid,
-          (Ptr{Cvoid}, Cint, Csize_t),
-          dispatch_packet, 0, sizeof(T))
-    dispatch_packet[] = f(dispatch_packet[])
-
-    queueMask = UInt32(queue_size - 1)
-    baseaddr_ptr = Ptr{HSA.KernelDispatchPacket}(_queue.base_address)
-    baseaddr_ptr = baseaddr_ptr + sizeof(HSA.KernelDispatchPacket) * (write_index & queueMask)
-    dispatch_packet_ptr = convert(Ptr{HSA.KernelDispatchPacket}, Base.unsafe_convert(Ptr{T}, dispatch_packet))
-    unsafe_copyto!(baseaddr_ptr, dispatch_packet_ptr, 1)
-
-    # TODO: Generalize to allow barrier on kernel
-    packetheadertype(::Type{HSA.KernelDispatchPacket}) = HSA.PACKET_TYPE_KERNEL_DISPATCH
-    packetheadertype(::Type{HSA.BarrierAndPacket}) = HSA.PACKET_TYPE_BARRIER_AND
-    packetheadertype(::Type{HSA.BarrierOrPacket}) = HSA.PACKET_TYPE_BARRIER_OR
-
-    # Create and atomically store the header
-    # TODO: Generalize to make scopes configurable
-    header = Ref{UInt16}(0)
-    header[] |= UInt16(HSA.FENCE_SCOPE_SYSTEM) << UInt16(HSA.PACKET_HEADER_SCACQUIRE_FENCE_SCOPE)
-    header[] |= UInt16(HSA.FENCE_SCOPE_SYSTEM) << UInt16(HSA.PACKET_HEADER_SCRELEASE_FENCE_SCOPE)
-    header[] |= UInt16(packetheadertype(T))    << UInt16(HSA.PACKET_HEADER_TYPE)
-    atomic_store_n!(Base.unsafe_convert(Ptr{UInt16}, baseaddr_ptr), header[])
-
-    # Ring the doorbell to dispatch the kernel
-    HSA.signal_store_relaxed(_queue.doorbell_signal, Int64(write_index))
-end
diff --git a/src/runtime/linked-list.jl b/src/runtime/linked-list.jl
deleted file mode 100644
index 54d420916..000000000
--- a/src/runtime/linked-list.jl
+++ /dev/null
@@ -1,121 +0,0 @@
-mutable struct LinkedListNode{T}
-    data::T
-    @atomic next::Union{LinkedListNode{T},Nothing}
-    LinkedListNode(data::T) where T = new{T}(data, nothing)
-end
-
-mutable struct LinkedList{T}
-    @atomic head::Union{LinkedListNode{T},Nothing}
-    LinkedList{T}() where T = new{T}(nothing)
-end
-
-function Base.push!(list::LinkedList{T}, data::T) where T
-    ours = LinkedListNode(data)
-    if @atomic(list.head) === nothing && @atomicreplace(list.head, nothing => ours)[2]
-        return data
-    end
-    node = lastnode(list.head)
-    while true
-        if @atomic(node.next) === nothing
-            # Attempt to swap with our node
-            _, succ = @atomicreplace node.next nothing => ours
-            if succ
-                # Success, done
-                return data
-            end
-        end
-        # Advance to end
-        node = last(node)::LinkedListNode{T}
-    end
-end
-function next!(list::LinkedList)
-    head = @atomic(list.head)
-    isnothing(head) && throw(BoundsError())
-
-    # N.B. We assume this is only done single-threaded
-    next = @atomic(head.next)
-    @atomic list.head = next
-    return list
-end
-function Base.empty!(list::LinkedList)
-    @atomic list.head = nothing
-    return list
-end
-Base.isempty(list::LinkedList) = @atomic(list.head) === nothing
-
-function Base.first(list::LinkedList)
-    head = @atomic(list.head)
-    isnothing(head) && throw(BoundsError())
-    return head.data
-end
-
-function Base.last(list::LinkedList)
-    head = @atomic(list.head)
-    isnothing(head) && throw(BoundsError())
-    return last(head)
-end
-
-function lastnode(node::LinkedListNode)
-    while true
-        # Test if this is the last node
-        if @atomic(node.next) === nothing
-            return node
-        else
-            # Advance to next node
-            node = @atomic node.next
-        end
-    end
-end
-
-Base.last(node::LinkedListNode) = lastnode(node).data
-
-function maybelast(list::LinkedList)
-    head = @atomic(list.head)
-    isnothing(head) ? nothing : last(head)
-end
-
-function Base.length(list::LinkedList)
-    head = @atomic(list.head)
-    isnothing(head) ? 0 : length(head)
-end
-
-function Base.length(node::LinkedListNode)
-    ctr = 1
-    while @atomic(node.next) !== nothing
-        node = @atomic node.next
-        ctr += 1
-    end
-    return ctr
-end
-
-function Base.show(io::IO, list::LinkedList{T}) where T
-    print(io, "LinkedList{$T}($(length(list)) entries)")
-end
-
-function Base.copy(list::LinkedList{T}) where T
-    new_list = LinkedList{T}()
-    head = @atomic list.head
-    isnothing(head) && return new_list
-
-    @atomic new_list.head = head
-    return new_list
-end
-
-function Base.Array(list::LinkedList{T}) where T
-    vec = T[]
-    node = @atomic(list.head)
-    while node !== nothing
-        push!(vec, node.data)
-        node = @atomic(node.next)
-    end
-    return vec
-end
-
-function Base.iterate(list::LinkedList)
-    head = @atomic list.head
-    isnothing(head) ? nothing : (head.data, head.next)
-end
-
-function Base.iterate(_::LinkedList, node)
-    isnothing(node) ? nothing : (node.data, node.next)
-end
diff --git a/src/runtime/memory.jl b/src/runtime/memory.jl
deleted file mode 100644
index d5b4abffb..000000000
--- a/src/runtime/memory.jl
+++ /dev/null
@@ -1,802 +0,0 @@
-# Raw memory management
-# Copied from CUDAdrv.jl
-
-import AMDGPU
-import AMDGPU: HSA
-if AMDGPU.hip_configured
-    import AMDGPU: HIP
-end
-import AMDGPU: Runtime
-import .Runtime: ROCDevice, ROCSignal, ROCMemoryRegion, ROCMemoryPool, ROCDim, ROCDim3
-import .Runtime: DEVICES, check, get_region, get_memory_pool, get_handle
-using Preferences
-
-## buffer type
-
-struct Buffer
-    ptr::Ptr{Cvoid}
-    host_ptr::Ptr{Cvoid}
-    base_ptr::Ptr{Cvoid}
-    bytesize::Int
-    device::ROCDevice
-    coherent::Bool
-    pool_alloc::Bool
-    # Unique ID used for refcounting.
-    _id::UInt64
-
-    function Buffer(
-        ptr::Ptr{Cvoid}, host_ptr::Ptr{Cvoid}, base_ptr::Ptr{Cvoid},
-        bytesize::Int, device::ROCDevice, coherent::Bool, pool_alloc::Bool,
-    )
-        _id = _buffer_id!()
-        new(ptr, host_ptr, base_ptr, bytesize, device, coherent, pool_alloc, _id)
-    end
-end
-
-Base.unsafe_convert(::Type{Ptr{T}}, buf::Buffer) where {T} = convert(Ptr{T}, buf.ptr)
-
-function view(buf::Buffer, bytes::Int)
-    bytes > buf.bytesize && throw(BoundsError(buf, bytes))
-    return Buffer(buf.ptr+bytes,
-                  buf.host_ptr != C_NULL ? buf.host_ptr+bytes : C_NULL,
-                  buf.base_ptr,
-                  buf.bytesize-bytes, buf.device, buf.coherent, buf.pool_alloc)
-end
-
-## refcounting
-
-const _ID_COUNTER = Threads.Atomic{UInt64}(0)
-const refcounts = Dict{UInt64, Int}()
-const liveness = Dict{UInt64, Bool}()
-const refcounts_lock = Threads.ReentrantLock()
-
-function _buffer_id!()::UInt64
-    return Threads.atomic_add!(_ID_COUNTER, UInt64(1))
-end
-
-function refcount(buf::Buffer)
-    Base.lock(refcounts_lock) do
-        get(refcounts, buf._id, 0)
-    end
-end
-
-"""
-    retain(buf::Buffer)
-
-Increase the refcount of a buffer.
-"""
-function retain(buf::Buffer)
-    Base.lock(refcounts_lock) do
-        live = get!(liveness, buf._id, true)
-        @assert live "Trying to retain dead buffer!"
-        count = get!(refcounts, buf._id, 0)
-        refcounts[buf._id] = count + 1
-    end
-    return
-end
-
-"""
-    release(buf::Buffer)
-
-Decrease the refcount of a buffer. Returns `true` if the refcount has dropped
-to 0, and some action needs to be taken.
-"""
-function release(buf::Buffer)
-    while !Base.trylock(refcounts_lock) end
-    try
-        count = refcounts[buf._id]
-        @assert count >= 1 "Buffer refcount dropping below 0!"
-        refcounts[buf._id] = count - 1
-        done = count == 1
-
-        live = liveness[buf._id]
-
-        if done
-            if live
-                free(buf)
-            end
-            untrack(buf)
-        end
-        return done
-    finally
-        Base.unlock(refcounts_lock)
-    end
-end
-
-"""
-    free_if_live(buf::Buffer)
-
-Frees the base pointer for `buf` if it is still live (not yet freed). Does not
-update refcounts.
-"""
-function free_if_live(buf::Buffer)
-    Base.lock(refcounts_lock) do
-        if liveness[buf._id]
-            liveness[buf._id] = false
-            free(buf)
-        end
-    end
-end
-
-"""
-    untrack(buf::Buffer)
-
-Removes refcount tracking information for a buffer.
-"""
-function untrack(buf::Buffer)
-    while !Base.trylock(refcounts_lock) end
-    try
-        delete!(liveness, buf._id)
-        delete!(refcounts, buf._id)
-    finally
-        Base.unlock(refcounts_lock)
-    end
-end
-
-
-## memory info
-
-"""
-    info()
-
-Returns a tuple of two integers, indicating respectively the free and total amount of memory
-(in bytes) available for allocation on the device.
-"""
-function info()
-    free_ref = Ref{Csize_t}()
-    total_ref = Ref{Csize_t}()
-    # FIXME: I'm not sure HSA has an API for this...
-    return convert(Int, free_ref[]), convert(Int, total_ref[])
-end
-
-"""
-    free()
-
-Returns the free amount of memory (in bytes), available for allocation on the device.
-"""
-free() = info()[1]
-
-"""
-    total()
-
-Returns the total amount of memory (in bytes), available for allocation on the device.
-"""
-total() = info()[2]
-
-"""
-    used()
-
-Returns the used amount of memory (in bytes), allocated on the device.
-"""
-used() = total()-free()
-
-"""
-    pointerinfo(ptr::Ptr)
-    pointerinfo(buf::Buffer)
-    pointerinfo(a::Array)
-
-Retrieve information about the allocation referenced by the given pointer.
-"""
-function pointerinfo(ptr::Ptr)
-    ptrinfo = Ref{HSA.AMDPointerInfo}()
-    ptrinfo_ptr = Base.unsafe_convert(Ptr{HSA.AMDPointerInfo}, ptrinfo)
-    ccall(:memset, Ptr{Cvoid},
-                   (Ptr{HSA.AMDPointerInfo}, UInt8, Csize_t),
-                   ptrinfo_ptr, UInt8(0), sizeof(HSA.AMDPointerInfo))
-    unsafe_store!(reinterpret(Ptr{Csize_t}, ptrinfo_ptr), sizeof(HSA.AMDPointerInfo))
-    HSA.amd_pointer_info(Ptr{Cvoid}(ptr), ptrinfo, C_NULL, Ptr{UInt32}(C_NULL), C_NULL) |> check
-    return ptrinfo[]
-end
-pointerinfo(buf::Buffer) = pointerinfo(buf.ptr)
-pointerinfo(a::Array) = pointerinfo(pointer(a))
-
-## Page-locking
-
-"""
-    lock(ptr::Ptr, bytesize::Integer, device::ROCDevice)
-    lock(ptr, bytesize)
-    lock(a::Array, device)
-    lock(a)
-
-Page-lock a host pointer allocated by the OS allocator and return a new pointer from
-the given `device`. For more information, see `hsa_amd_memory_lock()`.
-
-See also: [`unlock`](@ref)
-"""
-function lock(ptr::Ptr, bytesize::Integer, device::ROCDevice)
-    plocked = Ref{Ptr{Cvoid}}()
-    ccall(:memset, Ptr{Cvoid},
-                   (Ptr{Ptr{Cvoid}}, UInt8, Csize_t),
-                   Base.unsafe_convert(Ptr{Ptr{Cvoid}}, plocked), UInt8(0), sizeof(Ptr{Cvoid}))
-    HSA.amd_memory_lock(Ptr{Cvoid}(ptr), bytesize, Ref(device.agent), 1, plocked) |> check
-    return plocked[]
-end
-lock(ptr, bytesize) = lock(ptr, bytesize, AMDGPU.device())
-lock(a::Array, device::ROCDevice) = lock(pointer(a), sizeof(a), device)
-lock(a::Array) = lock(pointer(a), sizeof(a), AMDGPU.device())
-
-"""
-    unlock(ptr::Ptr)
-    unlock(a::Array)
-
-Unlock the host pointer previously page-locked with [`lock`](@ref).
-NB: `ptr` should be the original locked host pointer and not the pointer returned by `lock`!
-"""
-function unlock(ptr::Ptr)
-    HSA.amd_memory_unlock(Ptr{Cvoid}(ptr)) |> check
-end
-unlock(a::Array) = unlock(pointer(a))
-
-## generic interface (for documentation purposes)
-
-"""
-Allocate linear memory on the device and return a buffer to the allocated memory. The
-allocated memory is suitably aligned for any kind of variable. The memory will not be freed
-automatically, use [`free(::Buffer)`](@ref) for that.
-"""
-function alloc end
-
-"""
-Free device memory.
-"""
-function free end
-
-"""
-Initialize device memory with a repeating value.
-"""
-function set! end
-
-"""
-Upload memory from host to device.
-Executed asynchronously on `queue` if `async` is true.
-"""
-function upload end
-@doc (@doc upload) upload!
-
-"""
-Download memory from device to host.
-Executed asynchronously on `queue` if `async` is true.
-"""
-function download end
-@doc (@doc download) download!
-
-"""
-Transfer memory from device to device.
-Executed asynchronously on `queue` if `async` is true.
-"""
-function transfer end
-@doc (@doc transfer) transfer!
-
-
-## pointer-based
-
-"Enables or disables the slow allocation fallback for non-coherent allocations."
-enable_slow_allocation_fallback!(flag::Bool) = @set_preferences!("use_slow_allocation_fallback" => flag)
-const USE_SLOW_ALLOCATION_FALLBACK = let
-    if haskey(ENV, "JULIA_AMDGPU_USE_SLOW_ALLOCATION_FALLBACK")
-        flag = parse(Bool, ENV["JULIA_AMDGPU_USE_SLOW_ALLOCATION_FALLBACK"])
-        enable_slow_allocation_fallback!(flag)
-        flag
-    else
-        @load_preference("use_slow_allocation_fallback", true)
-    end
-end
-
-"Enables or disables using hipMalloc/hipFree for non-coherent allocations."
-enable_hip_malloc_override!(flag::Bool) = @set_preferences!("use_hip_malloc_override" => flag)
-const USE_HIP_MALLOC_OVERRIDE = let
-    if haskey(ENV, "JULIA_AMDGPU_USE_HIP_MALLOC_OVERRIDE")
-        flag = parse(Bool, ENV["JULIA_AMDGPU_USE_HIP_MALLOC_OVERRIDE"])
-        enable_hip_malloc_override!(flag)
-        flag
-    else
-        @load_preference("use_hip_malloc_override", false)
-    end
-end
-
-"Sets a limit for total GPU memory allocations."
-set_memory_alloc_limit!(limit::Integer) =
-    @set_preferences!("memory_alloc_limit" => limit)
-const MEMORY_ALLOC_LIMIT = let
-    if haskey(ENV, "JULIA_AMDGPU_MEMORY_ALLOC_LIMIT")
-        limit = parse(Int, ENV["JULIA_AMDGPU_MEMORY_ALLOC_LIMIT"])
-        set_memory_alloc_limit!(limit)
-        limit
-    else
-        @load_preference("memory_alloc_limit", typemax(Int))
-    end
-end
-
-"""
-    alloc(bytesize::Integer; coherent=false) -> Buffer
-
-Allocate `bytesize` bytes of HSA-managed memory on the default device.
-
-    alloc(device::ROCDevice, bytesize::Integer; coherent=false) -> Buffer
-
-Allocate `bytesize` bytes of HSA-managed memory on `device`.
-
-When using the above two methods, allocations are not coherent by default,
-meaning that the allocated buffer is only accessible from the given device.
-
-If `coherent` is set to `true`, the allocated buffer will be accessible from
-all HSA devices, including the host CPU.  Even though this is convenient, it can
-sometimes be slower than explicit memory transfers if memory accesses are not
-carefully managed.
-
-    alloc(device::ROCDevice, pool::ROCMemoryPool, bytesize::Integer) -> Buffer
-    alloc(device::ROCDevice, region::ROCMemoryRegion, bytesize::Integer) -> Buffer
-
-Allocate `bytesize` bytes of HSA-managed memory on the region `region` or
-memory pool `pool`.
-"""
-function alloc(device::ROCDevice, bytesize::Integer; coherent=false, slow_fallback=!coherent && USE_SLOW_ALLOCATION_FALLBACK)
-    alloc_id = rand(UInt64)
-    Runtime.@log_start(:alloc, (;alloc_id), (;device=get_handle(device), size=bytesize, coherent))
-
-    bytesize == 0 && return Buffer(C_NULL, C_NULL, C_NULL, 0, device, coherent, false)
-
-    region_kind = coherent ? :finegrained : :coarsegrained
-
-    buf = nothing
-    region = nothing
-    try
-        if region_kind != :coarsegrained
-            region = get_region(device, region_kind)
-            @debug "Allocating $(Base.format_bytes(bytesize)) from $region"
-            buf = alloc(device, region, bytesize)
-        else
-            if USE_HIP_MALLOC_OVERRIDE
-                @debug "Allocating $(Base.format_bytes(bytesize)) from HIP"
-                buf = alloc_hip(bytesize)
-            else
-                region = get_memory_pool(device, region_kind)
-                @debug "Allocating $(Base.format_bytes(bytesize)) from $region"
-                buf = alloc(device, region, bytesize)
-                # This is a no-op and we need to make sure that we use the right region instead
-                # check(HSA.memory_assign_agent(buf.ptr, device.agent, HSA.ACCESS_PERMISSION_RW))
-            end
-        end
-    catch err
-        if slow_fallback &&
-           !coherent &&
-           err isa Runtime.HSAError &&
-           (err.code == HSA.STATUS_ERROR_OUT_OF_RESOURCES ||
-            err.code == HSA.STATUS_ERROR_INVALID_ALLOCATION)
-            # TODO: How to handle this with logging?
-            buf = alloc(device, bytesize; coherent=true)
-        else
-            rethrow(err)
-        end
-    finally
-        ptr = buf !== nothing ? buf.ptr : C_NULL
-        region = region !== nothing ? get_handle(region) : C_NULL
-        Runtime.@log_finish(:alloc, (;alloc_id), (;ptr, region))
-    end
-    return buf
-end
-function alloc_or_retry!(f)
-    for phase in 1:3
-        if phase == 2
-            GC.gc(false)
-            yield()
-        elseif phase == 3
-            GC.gc(true)
-            yield()
-        end
-        status = f()
-        @debug "Allocation phase $phase: $status"
-        if status == HSA.STATUS_SUCCESS
-            break
-        elseif status == HSA.STATUS_ERROR_OUT_OF_RESOURCES || status == HSA.STATUS_ERROR_INVALID_ALLOCATION
-            if phase == 3
-                check(status)
-            end
-        else
-            check(status)
-        end
-    end
-end
-
-const ALL_ALLOCS = Threads.Atomic{Int64}(0)
-
-_alloc(p::ROCMemoryPool, bytesize::Integer, ptr_ref) = HSA.amd_memory_pool_allocate(p.pool, bytesize, 0, ptr_ref)
-_alloc(p::ROCMemoryRegion, bytesize::Integer, ptr_ref) = HSA.memory_allocate(p.region, bytesize, ptr_ref)
-
-_accessible(p::ROCMemoryRegion)::Bool = Runtime.region_host_accessible(p)
-_accessible(p::ROCMemoryPool)::Bool = Runtime.pool_accessible_by_all(p)
-
-function alloc(
-    device::ROCDevice, space::S, bytesize::Integer,
-) where S <: Union{ROCMemoryPool, ROCMemoryRegion}
-    ptr_ref = Ref{Ptr{Cvoid}}()
-    alloc_or_retry!(() -> _alloc(space, bytesize, ptr_ref))
-    ptr = ptr_ref[]
-    AMDGPU.hsaref!()
-    Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize))
-    Buffer(ptr, C_NULL, ptr, Int64(bytesize), device, _accessible(space), S <: ROCMemoryPool)
-end
-
-alloc(bytesize; kwargs...) = alloc(AMDGPU.device(), bytesize; kwargs...)
-
-@static if AMDGPU.hip_configured
-    function alloc_hip(bytesize::Integer)
-        ptr_ref = Ref{Ptr{Cvoid}}()
-        # FIXME: Set HIP device
-        alloc_or_retry!() do
-            try
-                HIP.@check HIP.hipMalloc(ptr_ref, Csize_t(bytesize))
-                HSA.STATUS_SUCCESS
-            catch
-                # FIXME: Actually check error code
-                HSA.STATUS_ERROR_OUT_OF_RESOURCES
-            end
-        end
-        AMDGPU.hsaref!()
-        ptr = ptr_ref[]
-        Threads.atomic_add!(ALL_ALLOCS, Int64(bytesize))
-        Buffer(ptr, C_NULL, ptr, Int64(bytesize), AMDGPU.device(), false, true)
-    end
-end
-
-function free(buf::Buffer)
-    buf.ptr == C_NULL && return
-
-    Runtime.@log_start(:free, (;ptr=buf.ptr), nothing)
-    if buf.host_ptr == C_NULL
-        # HSA-backed
-        if buf.pool_alloc
-            if USE_HIP_MALLOC_OVERRIDE
-                @static if AMDGPU.hip_configured
-                    # Actually HIP-backed
-                    HIP.@check HIP.hipFree(buf.base_ptr)
-                end
-            else
-                memory_check(HSA.amd_memory_pool_free(buf.base_ptr), buf.base_ptr)
-            end
-            Threads.atomic_sub!(ALL_ALLOCS, Int64(buf.bytesize))
-        else
-            memory_check(HSA.memory_free(buf.base_ptr), buf.base_ptr)
-        end
-        AMDGPU.hsaunref!()
-    else
-        # Wrapped
-        unlock(buf.ptr)
-    end
-    Runtime.@log_finish(:free, (;ptr=buf.ptr), nothing)
-    return
-end
-# N.B. We try to keep this from yielding or throwing, since this usually runs
-# in a finalizer
-function memory_check(status::HSA.Status, ptr::Ptr{Cvoid})
-    if status != HSA.STATUS_SUCCESS
-        err_str = Runtime.description(status)
-        Core.println("Error when attempting to free an HSA buffer:\n  $err_str")
-        pinfo = pointerinfo(ptr)
-        Core.println(sprint(io->Base.show(io, pinfo)))
-        return false
-    end
-    return true
-end
-
-struct PoolAllocation
-    addr::Ptr{Cvoid}
-    refs::Threads.Atomic{Int}
-end
-PoolAllocation(addr) =
-    PoolAllocation(addr, Threads.Atomic{Int}(1))
-Base.hash(p::PoolAllocation) = hash(p.addr, hash(PoolAllocation))
-Base.isequal(p1::P, p2::P) where P<:PoolAllocation = p1.addr == p2.addr
-
-const ALLOC_POOL_BINNED = IdDict{ROCDevice,Dict{Int,Vector{Ptr{Cvoid}}}}()
-const ALLOC_POOL_PTR_BIN_MAP = Dict{Ptr{Cvoid},Int}()
-const ALLOC_POOL_SHARED = IdDict{ROCDevice,Dict{UInt64,PoolAllocation}}()
-const ALLOC_POOL_LOCK = Threads.SpinLock()
-const ALLOC_POOL_MAX_SIZE = Ref{Int}(64)
-const ALLOC_POOL_MAX_BINS = 8
-
-function alloc_pooled(device::ROCDevice, key::UInt64, kind::Symbol, bytesize::Integer)
-    @assert kind == :kernarg "Pooled non-kernarg allocations not implemented"
-
-    if bytesize == 0
-        return C_NULL, false
-    end
-
-    # Try to grab from pool
-    Base.lock(ALLOC_POOL_LOCK) do
-        # Try to grab a shared allocation
-        device_dict_shared = get!(()->Dict{UInt64,PoolAllocation}(), ALLOC_POOL_SHARED, device)
-        if (alloc = get(device_dict_shared, key, nothing)) !== nothing
-            Threads.atomic_add!(alloc.refs, 1)
-            return alloc.addr, false
-        end
-        # Fallback, try to grab a binned (unshared) allocation
-        device_dict_binned = get!(ALLOC_POOL_BINNED, device) do
-            d = Dict{Int,Vector{Ptr{Cvoid}}}()
-            for bin in 1:ALLOC_POOL_MAX_BINS
-                d[bin] = Vector{Ptr{Cvoid}}()
-            end
-            d
-        end
-        bin_min = ceil(Int, log2(bytesize))
-        if bin_min <= ALLOC_POOL_MAX_BINS
-            # Find any compatible allocation
-            bin = findfirst(bin->bin >= bin_min && length(device_dict_binned[bin]) > 0, bin_min:ALLOC_POOL_MAX_BINS)
-            if bin !== nothing
-                ptr = pop!(device_dict_binned[bin])
-                ALLOC_POOL_PTR_BIN_MAP[ptr] = bin
-                return ptr, true
-            end
-        end
-
-        # No available allocations to grab, make a new one
-        Base.unlock(ALLOC_POOL_LOCK)
-
-        if bin_min <= ALLOC_POOL_MAX_BINS
-            # Round-up bytesize to allow reuse in bins
-            bytesize = 2^bin_min
-        end
-
-        # N.B. We use the region API because kernarg allocations don't
-        # show up in the memory pools API
-        kernarg_region = Runtime.get_region(device, :kernarg)
-        kernarg_address = Ref{Ptr{Nothing}}(Ptr{Nothing}(0))
-        alloc_or_retry!() do
-            HSA.memory_allocate(kernarg_region.region,
-                                bytesize,
-                                kernarg_address)
-        end
-
-        Base.lock(ALLOC_POOL_LOCK)
-
-        # Try to share this allocation
-        if length(device_dict_shared) < ALLOC_POOL_MAX_SIZE[]
-            device_dict_shared[key] = PoolAllocation(kernarg_address[])
-        end
-
-        return kernarg_address[], true
-    end
-end
-
-function free_pooled(device::ROCDevice, key::UInt64, kind::Symbol, ptr::Ptr{Cvoid})
-    # Return to pool
-    Runtime.@spinlock ALLOC_POOL_LOCK begin
-        # Check if this pointer is a shared allocation
-        device_dict_shared = get!(()->Dict{UInt64,PoolAllocation}(), ALLOC_POOL_SHARED, device)
-        if (alloc = get(device_dict_shared, key, nothing)) !== nothing
-            if Threads.atomic_sub!(alloc.refs, 1) == 1
-                # TODO: Don't delete unless we're out of space
-                delete!(device_dict_shared, key)
-                # TODO: Consider putting into a bin if power-of-two bytesize
-                check(HSA.memory_free(ptr))
-            end
-            return
-        end
-        # Check if this pointer is a binned allocation
-        if !haskey(ALLOC_POOL_PTR_BIN_MAP, ptr)
-            # Not binned or shared
-            check(HSA.memory_free(ptr))
-            return
-        end
-        bin = ALLOC_POOL_PTR_BIN_MAP[ptr]
-        allocs = ALLOC_POOL_BINNED[device][bin]
-        if length(allocs) < ALLOC_POOL_MAX_SIZE[]
-            # Save for later
-            push!(allocs, ptr)
-        else
-            # No free space
-            check(HSA.memory_free(ptr))
-        end
-        return
-    end
-end
-
-"""
-    set!(buf::Buffer, value::UInt32, len::Integer)
-
-Write `len` copies of the 32-bit `value` at the start of `buf`.
-"""
-function set!(buf::Buffer, value::UInt32, len::Integer)
-    HSA.amd_memory_fill(buf.ptr, value, len) |> check
-end
-
-"""
-    upload!(dst::Buffer, src, nbytes::Integer)
-
-Upload `nbytes` memory from `src` at the host to `dst` on the device.
-"""
-function upload!(dst::Buffer, src::Ptr{T}, nbytes::Integer) where T
-    Runtime.@log_start(:upload!, (;dest=dst.ptr, src=reinterpret(Ptr{Cvoid}, src)), (;nbytes))
-    nbytes > 0 || return
-    if dst.host_ptr == C_NULL
-        HSA.memory_copy(Ptr{T}(dst.ptr), src, nbytes) |> check
-    else
-        Base.unsafe_copyto!(reinterpret(Ptr{UInt8}, dst.host_ptr),
-                            reinterpret(Ptr{UInt8}, src),
-                            nbytes)
-    end
-    Runtime.@log_finish(:upload!, (;dest=dst.ptr, src=reinterpret(Ptr{Cvoid}, src)), (;nbytes))
-end
-
-"""
-    download!(dst::Ref, src::Buffer, nbytes::Integer)
-
-Download `nbytes` memory from `src` on the device to `dst` on the host.
-"""
-function download!(dst::Ptr{T}, src::Buffer, nbytes::Integer) where T
-    Runtime.@log_start(:download!, (;dest=reinterpret(Ptr{Cvoid}, dst), src=src.ptr), (;nbytes))
-    nbytes > 0 || return
-    if src.host_ptr == C_NULL
-        HSA.memory_copy(dst, Ptr{T}(src.ptr), nbytes) |> check
-    else
-        Base.unsafe_copyto!(reinterpret(Ptr{UInt8}, dst),
-                            reinterpret(Ptr{UInt8}, src.host_ptr),
-                            nbytes)
-    end
-    Runtime.@log_finish(:download!, (;dest=reinterpret(Ptr{Cvoid}, dst), src=src.ptr), (;nbytes))
-end
-
-"""
-    transfer!(dst::Buffer, src::Buffer, nbytes::Integer)
-
-Transfer `nbytes` of device memory from `src` to `dst`.
-"""
-function transfer!(dst::Buffer, src::Buffer, nbytes::Integer)
-    Runtime.@log_start(:transfer!, (;dest=dst.ptr, src=src.ptr), (;nbytes))
-    nbytes > 0 || return
-    if dst.host_ptr != C_NULL && src.host_ptr != C_NULL
-        Base.unsafe_copyto!(reinterpret(Ptr{UInt8}, dst.host_ptr),
-                            reinterpret(Ptr{UInt8}, src.host_ptr),
-                            nbytes)
-    elseif dst.host_ptr != C_NULL
-        download!(dst.host_ptr, src, nbytes)
-    elseif src.host_ptr != C_NULL
-        upload!(dst, src.host_ptr, nbytes)
-    else
-        HSA.memory_copy(dst.ptr, src.ptr, nbytes) |> check
-    end
-    Runtime.@log_finish(:transfer!, (;dest=dst.ptr, src=src.ptr), (;nbytes))
-end
-
-"""
-    unsafe_copy3d!(dst::Ptr{T}, src::Ptr{T}, width, height=1, depth=1;
-                   dstPos::ROCDim=(1,1,1), dstPitch=0, dstSlice=0,
-                   srcPos::ROCDim=(1,1,1), srcPitch=0, srcSlice=0,
-                   async::Bool=false, signal::ROCSignal=nothing) where T
-
-Perform a 3D memory copy between pointers `src` and `dst` at respectively position `srcPos` and `dstPos` 
-(1-indexed). Both pitch and slice can be specified for both the source and destination. This call is 
-executed asynchronously if `async` is set, otherwise `signal` is synchronized.
-"""
-function unsafe_copy3d!(dst::Ptr{T}, src::Ptr{T}, width, height=1, depth=1;
-                        dstPos::ROCDim=(1,1,1), dstPitch=0, dstSlice=0,
-                        srcPos::ROCDim=(1,1,1), srcPitch=0, srcSlice=0,
-                        async::Bool=false, signal::ROCSignal=nothing) where T
-    (T == Nothing) && error("Type of Ptr is Nothing")
-
-    dstPtr_info = pointerinfo(dst)
-    srcPtr_info = pointerinfo(src)
-
-    if dstPtr_info.type == HSA.EXT_POINTER_TYPE_UNKNOWN || srcPtr_info.type == HSA.EXT_POINTER_TYPE_UNKNOWN
-        error("Only device pointers or locked host pointers are supported, see unsafe_wrap and Mem.lock")
-    end
-
-    if dstPtr_info.type == HSA.EXT_POINTER_TYPE_HSA && srcPtr_info.type == HSA.EXT_POINTER_TYPE_LOCKED
-        Runtime.device_type(dstPtr_info.agentOwner) == HSA.DEVICE_TYPE_GPU || error("dst should point to device memory")
-        hsaCopyDir = HSA.LibHSARuntime.hsaHostToDevice
-    elseif dstPtr_info.type == HSA.EXT_POINTER_TYPE_LOCKED && srcPtr_info.type == HSA.EXT_POINTER_TYPE_HSA
-        Runtime.device_type(srcPtr_info.agentOwner) == HSA.DEVICE_TYPE_GPU || error("src should point to device memory")
-        hsaCopyDir = HSA.LibHSARuntime.hsaDeviceToHost
-    elseif dstPtr_info.type == HSA.EXT_POINTER_TYPE_HSA && srcPtr_info.type == HSA.EXT_POINTER_TYPE_HSA
-        (Runtime.device_type(dstPtr_info.agentOwner) == HSA.DEVICE_TYPE_GPU && Runtime.device_type(srcPtr_info.agentOwner) == HSA.DEVICE_TYPE_GPU) || error("dst and src should point to device memory")
-        hsaCopyDir = HSA.LibHSARuntime.hsaDeviceToDevice
-    else
-        error("Only device to device, host to device, and device to host memory transfer is supported")
-    end
-
-    dstOffset = (sizeof(T)*(dstPos[1]-1), dstPos[2]-1, dstPos[3]-1)
-    srcOffset = (sizeof(T)*(srcPos[1]-1), srcPos[2]-1, srcPos[3]-1)
-
-    dstRef = Ref(HSA.PitchedPtr(dst, dstPitch, dstSlice))
-    srcRef = Ref(HSA.PitchedPtr(src, srcPitch, srcSlice))
-    dstOffsetRef = Ref(HSA.Dim3(dstOffset...))
-    srcOffsetRef = Ref(HSA.Dim3(srcOffset...))
-    rangeRef = Ref(HSA.Dim3(sizeof(T)*width, height, depth))
-
-    AMDGPU.HSA.amd_memory_async_copy_rect(
-        Base.unsafe_convert(Ptr{HSA.PitchedPtr}, dstRef),
-        Base.unsafe_convert(Ptr{HSA.Dim3},       dstOffsetRef),
-        Base.unsafe_convert(Ptr{HSA.PitchedPtr}, srcRef),
-        Base.unsafe_convert(Ptr{HSA.Dim3},       srcOffsetRef),
-        Base.unsafe_convert(Ptr{HSA.Dim3},       rangeRef),
-        AMDGPU.device().agent, hsaCopyDir,
-        UInt32(0), C_NULL, signal.signal) |> check
-
-    async || wait(signal)
-    return nothing
-end
-
-
-## array based
-
-"""
-    alloc(src::AbstractArray; alloc_kwargs...)
-
-Allocate space to store the contents of `src`.
-"""
-function alloc(src::AbstractArray; alloc_kwargs...)
-    return alloc(sizeof(src); alloc_kwargs...)
-end
-
-"""
-    upload!(dst::Buffer, src::AbstractArray)
-
-Upload the contents of an array `src` to `dst`.
-"""
-function upload!(dst::Buffer, src::AbstractArray)
-    GC.@preserve src upload!(dst, pointer(src), sizeof(src))
-end
-
-"""
-    upload(src::AbstractArray; alloc_kwargs...)::Buffer
-
-Allocates space for and uploads the contents of an array `src`, returning a Buffer.
-For the allocation keywoard arguments see [`alloc`](@ref).
-"""
-function upload(src::AbstractArray; alloc_kwargs...)
-    dst = alloc(src; alloc_kwargs...)
-    upload!(dst, src)
-    return dst
-end
-
-"""
-    download!(dst::AbstractArray, src::Buffer)
-
-Downloads memory from `src` to the array at `dst`. The amount of memory downloaded is
-determined by calling `sizeof` on the array, so it needs to be properly preallocated.
-"""
-function download!(dst::AbstractArray, src::Buffer)
-    GC.@preserve dst download!(pointer(dst), src, sizeof(dst))
-    return
-end
-
-## type based
-
-function check_type(::Type{Buffer}, T)
-    if isa(T, UnionAll) || isabstracttype(T) || !isconcretetype(T)
-        throw(ArgumentError("cannot represent abstract or non-leaf object"))
-    end
-    Base.datatype_pointerfree(T) || throw(ArgumentError("cannot handle non-ptrfree objects"))
-    sizeof(T) == 0 && throw(ArgumentError("cannot represent singleton objects"))
-end
-
-"""
-    alloc(T::Type, [count::Integer=1]; alloc_kwargs...)
-
-Allocate space for `count` objects of type `T`.
-"""
-function alloc(::Type{T}, count::Integer=1; alloc_kwargs...) where {T}
-    check_type(Buffer, T)
-
-    return alloc(sizeof(T)*count; alloc_kwargs...)
-end
-
-"""
-    download(::Type{T}, src::Buffer, [count::Integer=1])::Vector{T}
-
-Download `count` objects of type `T` from the device at `src`, returning a vector.
-"""
-function download(::Type{T}, src::Buffer, count::Integer=1) where {T}
-    dst = Vector{T}(undef, count)
-    download!(dst, src)
-    return dst
-end
-
-# Pretty-printing
-function Base.show(io::IO, ptrinfo::HSA.AMDPointerInfo)
-    println(io, "Pointer type: $(ptrinfo.type)")
-    println(io, "Owner: $(DEVICES[ptrinfo.agentOwner.handle])")
-    println(io, "Agent base address: $(ptrinfo.agentBaseAddress)")
-    println(io, "Host base address: $(ptrinfo.hostBaseAddress)")
-    print(io, "Size (bytes): $(ptrinfo.sizeInBytes)")
-end
diff --git a/src/runtime/memory/hip.jl b/src/runtime/memory/hip.jl
new file mode 100644
index 000000000..ffe9fb2f1
--- /dev/null
+++ b/src/runtime/memory/hip.jl
@@ -0,0 +1,180 @@
+const _POOL_STATUS = AMDGPU.LockedObject(
+    Dict{HIP.HIPDevice, Base.RefValue{Union{Nothing, Bool}}}())
+
+function pool_status(dev::HIP.HIPDevice)
+    Base.lock(_POOL_STATUS) do ps
+        get!(ps, dev, Ref{Union{Nothing, Bool}}(nothing))
+    end
+end
+
+const __pool_cleanup = Ref{Task}()
+function pool_cleanup()
+    idle_counters = Base.fill(0, HIP.ndevices())
+    devices = HIP.devices()
+    while true
+        for (i, dev) in enumerate(devices)
+            status = pool_status(dev)
+            isnothing(status[]) && continue
+
+            if status[]::Bool
+                idle_counters[i] = 0
+            else
+                idle_counters[i] += 1
+            end
+            status[] = false
+
+            if idle_counters[i] == 5
+                old_device = HIP.device()
+                old_device != dev && HIP.device!(dev)
+                HIP.reclaim()
+                old_device != dev && HIP.device!(old_device)
+            end
+        end
+
+        try
+            sleep(60)
+        catch ex
+            if ex isa EOFError
+                # If we get EOF here, it's because Julia is shutting down,
+                # so we should just exit the loop.
+                break
+            else
+                rethrow()
+            end
+        end
+    end
+end
+
+function mark_pool!(dev::HIP.HIPDevice)
+    status = pool_status(dev)
+    if isnothing(status[])
+        # Default to `0` which is the default value in HIP.
+        limit = parse_memory_limit(@load_preference("soft_memory_limit", "0 MiB"))
+        HIP.attribute!(
+            HIP.memory_pool(dev), HIP.hipMemPoolAttrReleaseThreshold, limit)
+        if !isassigned(__pool_cleanup)
+            __pool_cleanup[] = errormonitor(Threads.@spawn pool_cleanup())
+        end
+    end
+    status[] = true
+end
+
+struct HIPBuffer <: AbstractAMDBuffer
+    device::HIPDevice
+    ptr::Ptr{Cvoid}
+    bytesize::Int
+    _id::UInt64 # Unique ID used for refcounting.
+end
+
+# TODO pass device?
+function HIPBuffer(bytesize; stream::HIP.HIPStream)
+    dev = stream.device
+    bytesize == 0 && return HIPBuffer(dev, C_NULL, 0, _buffer_id!())
+
+    mark_pool!(dev)
+    pool = HIP.memory_pool(dev)
+
+    has_limit = HARD_MEMORY_LIMIT != typemax(UInt64)
+
+    ptr_ref = Ref{Ptr{Cvoid}}()
+    alloc_or_retry!() do
+        try
+            # Try to ensure there is enough memory before even trying to allocate.
+            if has_limit
+                used = HIP.used_memory(pool)
+                (used + bytesize) > HARD_MEMORY_LIMIT &&
+                    throw(HIP.HIPError(HIP.hipErrorOutOfMemory))
+            end
+
+            # Try to allocate.
+            HIP.hipMallocAsync(ptr_ref, bytesize, stream) |> HIP.check
+            ptr_ref[] == C_NULL && throw(HIP.HIPError(HIP.hipErrorOutOfMemory))
+            return HSA.STATUS_SUCCESS
+        catch err
+            # TODO rethrow if not out of memory error
+            @debug "hipMallocAsync exception. Requested $(Base.format_bytes(bytesize))." exception=(err, catch_backtrace())
+            return HSA.STATUS_ERROR_OUT_OF_RESOURCES
+        end
+    end
+    ptr = ptr_ref[]
+    @assert ptr != C_NULL "hipMallocAsync resulted in C_NULL for $(Base.format_bytes(bytesize))"
+
+    # TODO do not reclaim (ROCm 5.5+ has hard pool size limit)
+    if has_limit
+        if HIP.reserved_memory(pool) > HARD_MEMORY_LIMIT
+            HIP.reclaim() # TODO do not reclaim all memory
+        end
+        @assert HIP.reserved_memory(pool) ≤ HARD_MEMORY_LIMIT
+    end
+
+    HIPBuffer(dev, ptr, bytesize, _buffer_id!())
+end
+
+HIPBuffer(ptr::Ptr{Cvoid}, bytesize::Int) = HIPBuffer(
+    AMDGPU.device(), ptr, bytesize, _buffer_id!())
+
+Base.unsafe_convert(::Type{Ptr{T}}, buf::HIPBuffer) where T = convert(Ptr{T}, buf.ptr)
+
+function view(buf::HIPBuffer, bytesize::Int)
+    bytesize > buf.bytesize && throw(BoundsError(buf, bytesize))
+    HIPBuffer(buf.device, buf.ptr + bytesize, buf.bytesize - bytesize, buf._id)
+end
+
+function free(buf::HIPBuffer; stream::HIP.HIPStream)
+    buf.ptr == C_NULL && return
+    HIP.hipFreeAsync(buf, stream) |> HIP.check
+    return
+end
+
+function upload!(dst::HIPBuffer, src::Ptr, bytesize::Int; stream::HIP.HIPStream)
+    bytesize == 0 && return nothing
+    HIP.hipMemcpyHtoDAsync(dst, src, bytesize, stream) |> HIP.check
+    HIP.HIPEvent(stream)
+end
+
+function download!(dst::Ptr, src::HIPBuffer, bytesize::Int; stream::HIP.HIPStream)
+    bytesize == 0 && return nothing
+    HIP.hipMemcpyDtoHAsync(dst, src, bytesize, stream) |> HIP.check
+    HIP.HIPEvent(stream)
+end
+
+function transfer!(dst::HIPBuffer, src::HIPBuffer, bytesize::Int; stream::HIP.HIPStream)
+    bytesize == 0 && return nothing
+    HIP.hipMemcpyDtoDAsync(dst, src, bytesize, stream) |> HIP.check
+    HIP.HIPEvent(stream)
+end
+
+struct HostBuffer <: AbstractAMDBuffer
+    ptr::Ptr{Cvoid}
+    bytesize::Int
+end
+
+HostBuffer() = HostBuffer(C_NULL, 0)
+
+Base.unsafe_convert(::Type{Ptr{T}}, buf::HostBuffer) where T = convert(Ptr{T}, buf.ptr)
+
+function HostBuffer(bytesize::Integer, flags = 0)
+    bytesize == 0 && return HostBuffer()
+
+    ptr_ref = Ref{Ptr{Cvoid}}()
+    HIP.hipHostMalloc(ptr_ref, bytesize, flags) |> HIP.check
+    HostBuffer(ptr_ref[], bytesize)
+end
+
+function free(buf::HostBuffer)
+    buf.ptr == C_NULL && return
+    HIP.hipHostFree(buf) |> HIP.check
+    return
+end
+
+# TODO
+# - introduce hipPtr.
+# - use Base.convert instead of `device_ptr`.
+# - define unsafe_copyto! for all buffers instead of upload!, etc.
+
+function device_ptr(buf::HostBuffer)
+    buf.ptr == C_NULL && return C_NULL
+    ptr_ref = Ref{Ptr{Cvoid}}()
+    HIP.hipHostGetDevicePointer(ptr_ref, buf.ptr, 0) |> HIP.check
+    ptr_ref[]
+end
diff --git a/src/runtime/memory/refcount.jl b/src/runtime/memory/refcount.jl
new file mode 100644
index 000000000..2b8e3762d
--- /dev/null
+++ b/src/runtime/memory/refcount.jl
@@ -0,0 +1,62 @@
+const _ID_COUNTER = Threads.Atomic{UInt64}(0)
+const refcounts = Dict{UInt64, Int}()
+const liveness = Dict{UInt64, Bool}()
+
+function _buffer_id!()::UInt64
+    return Threads.atomic_add!(_ID_COUNTER, UInt64(1))
+end
+
+function refcount(buf::AbstractAMDBuffer)
+    Base.lock(refcounts_lock) do
+        get(refcounts, buf._id, 0)
+    end
+end
+
+function retain(buf::AbstractAMDBuffer)
+    Base.lock(refcounts_lock) do
+        live = get!(liveness, buf._id, true)
+        @assert live "Trying to retain dead buffer!"
+        count = get!(refcounts, buf._id, 0)
+        refcounts[buf._id] = count + 1
+    end
+    return
+end
+
+function release(buf::HIPBuffer; stream::HIP.HIPStream)
+    while !Base.trylock(refcounts_lock) end
+    try
+        count = refcounts[buf._id]
+        @assert count >= 1 "Buffer refcount dropping below 0!"
+        refcounts[buf._id] = count - 1
+        done = count == 1
+
+        live = liveness[buf._id]
+
+        if done
+            live && free(buf; stream)
+            untrack(buf)
+        end
+        return done
+    finally
+        Base.unlock(refcounts_lock)
+    end
+end
+
+function free_if_live(buf::HIPBuffer; stream::HIP.HIPStream)
+    Base.lock(refcounts_lock) do
+        if liveness[buf._id]
+            liveness[buf._id] = false
+            free(buf; stream)
+        end
+    end
+end
+
+function untrack(buf::AbstractAMDBuffer)
+    while !Base.trylock(refcounts_lock) end
+    try
+        delete!(liveness, buf._id)
+        delete!(refcounts, buf._id)
+    finally
+        Base.unlock(refcounts_lock)
+    end
+end
diff --git a/src/runtime/memory/utils.jl b/src/runtime/memory/utils.jl
new file mode 100644
index 000000000..22736c8ca
--- /dev/null
+++ b/src/runtime/memory/utils.jl
@@ -0,0 +1,157 @@
+"""
+    info()
+
+Returns a tuple of two integers, indicating respectively the free and total amount of memory
+(in bytes) available for allocation on the device.
+"""
+function info()
+    free_ref = Ref{Csize_t}()
+    total_ref = Ref{Csize_t}()
+    HIP.hipMemGetInfo(free_ref, total_ref) |> HIP.check
+    return convert(Int, free_ref[]), convert(Int, total_ref[])
+end
+
+"""
+    free()
+
+Returns the free amount of memory (in bytes), available for allocation on the device.
+"""
+free() = info()[1]
+
+"""
+    total()
+
+Returns the total amount of memory (in bytes), available for allocation on the device.
+"""
+total() = info()[2]
+
+"""
+    used()
+
+Returns the used amount of memory (in bytes), allocated on the device.
+"""
+used() = total() - free()
+
+const ALL_ALLOCS = Threads.Atomic{Int64}(0)
+
+function parse_memory_limit(limit_str::String)
+    limit_str == "none" && return typemax(UInt64)
+
+    units = ("%", "MiB", "GiB")
+
+    value, unit = split(limit_str) # TODO check length 2 before split
+    unit in units || throw(ArgumentError("""
+    Memory limit must be specified in `$units` units, but `$unit` was given.
+    """))
+
+    total_memory = total()
+    limit = if unit == "%"
+        v = parse(Int, value)
+        0 < v ≤ 100 || throw(ArgumentError("""
+        Invalid percentage value for memory limit `$v`.
+        Must be in (0, 100] range or 'none'.
+        """))
+        floor(UInt64, total_memory * (v / 100))
+    else
+        scale = unit == "MiB" ? (1024^2) : (1024^3)
+        parse(UInt64, value) * scale
+    end
+
+    limit > total_memory && throw(ArgumentError("""
+    Memory limit `$(Base.format_bytes(limit))` is bigger than the actual memory `$(Base.format_bytes(total_memory))`.
+    Set to `none` to disable memory limit.
+    """))
+
+    limit
+end
+
+"""
+Set a hard limit for total GPU memory allocations.
+"""
+set_memory_alloc_limit!(limit::String) =
+    @set_preferences!("hard_memory_limit" => limit)
+
+const HARD_MEMORY_LIMIT = parse_memory_limit(
+    @load_preference("hard_memory_limit", "none"))
+
+function alloc_or_retry!(f)
+    status = f()
+    status == HSA.STATUS_SUCCESS && return
+
+    stream = AMDGPU.stream()
+
+    phase = 1
+    while true
+        if phase == 1
+            HIP.synchronize(stream)
+        elseif phase == 2
+            HIP.device_synchronize()
+        elseif phase == 3
+            GC.gc(false)
+            HIP.device_synchronize()
+        elseif phase == 4
+            GC.gc(true)
+            HIP.device_synchronize()
+        elseif phase == 5
+            HIP.trim(HIP.memory_pool(stream.device))
+        else
+            break
+        end
+        phase += 1
+
+        status = f()
+        status == HSA.STATUS_SUCCESS && break
+    end
+
+    if status != HSA.STATUS_SUCCESS
+        pool = HIP.memory_pool(stream.device)
+        @warn """
+        Failed to successfully execute function and free resources for it.
+        Reporting current memory usage:
+        - HIP pool used: $(Base.format_bytes(HIP.used_memory(pool))).
+        - HIP pool reserved: $(Base.format_bytes(HIP.reserved_memory(pool))).
+        - Hard memory limit: $(Base.format_bytes(HARD_MEMORY_LIMIT)).
+        """
+    end
+
+    check(status)
+    return
+end
+
+"""
+Allocate linear memory on the device and return a buffer to the allocated memory. The
+allocated memory is suitably aligned for any kind of variable. The memory will not be freed
+automatically, use [`free(::Buffer)`](@ref) for that.
+"""
+function alloc end
+
+"""
+Free device memory.
+"""
+function free end
+
+"""
+Initialize device memory with a repeating value.
+"""
+function set! end
+
+"""
+Upload memory from host to device.
+Executed asynchronously on `queue` if `async` is true.
+"""
+function upload end
+@doc (@doc upload) upload!
+
+"""
+Download memory from device to host.
+Executed asynchronously on `queue` if `async` is true.
+"""
+function download end
+@doc (@doc download) download!
+
+"""
+Transfer memory from device to device.
+Executed asynchronously on `queue` if `async` is true.
+"""
+function transfer end
+@doc (@doc transfer) transfer!
diff --git a/src/runtime/queue.jl b/src/runtime/queue.jl
deleted file mode 100644
index 80277f2f9..000000000
--- a/src/runtime/queue.jl
+++ /dev/null
@@ -1,331 +0,0 @@
-mutable struct ROCQueue
-    device::ROCDevice
-    queue::Ptr{HSA.Queue}
-    priority::Symbol
-    status::HSA.Status
-    @atomic active::Bool
-    active_kernels::LinkedList # TODO: Concrete type
-    running::Base.Event
-    lock::Threads.ReentrantLock
-end
-
-"""
-    ROCQueue(; priority::Symbol=:normal, pooled::Bool=false)
-
-Create an HSA queue on the currently active device.
-
-!!! note
-    Users are encouraged to use this method,
-    instead of manually providing device since this one
-    correctly handles device changes.
-"""
-function ROCQueue(; priority::Symbol=:normal, pooled::Bool=false)
-    ROCQueue(AMDGPU.device(); priority, pooled)
-end
-
-get_handle(queue::ROCQueue) = reinterpret(Ptr{Cvoid}, queue.queue)
-
-function Base.show(io::IO, queue::ROCQueue)
-    print(io, "ROCQueue(device=$(queue.device), ptr=$(repr(UInt(queue.queue))), priority=$(queue.priority), status=$(queue.status), active=$(queue.active), running=$(queue.running.set))")
-end
-
-const QUEUES = Dict{Ptr{HSA.Queue}, WeakRef}()
-
-function queue_error_handler(
-    status::HSA.Status, _queue::Ptr{HSA.Queue}, queue_obj_ptr::Ptr{Cvoid},
-)::Nothing
-    if status != HSA.STATUS_SUCCESS
-        queue::ROCQueue = unsafe_pointer_to_objref(queue_obj_ptr)
-        queue.status = status
-    end
-    nothing
-end
-
-struct QueueError <: Exception
-    queue::ROCQueue
-    exception::Union{Exception,Nothing}
-end
-function QueueError(queue::ROCQueue)
-    err = if queue.status != HSA.STATUS_SUCCESS
-        HSAError(queue.status)
-    else
-        nothing
-    end
-    return QueueError(queue, err)
-end
-function Base.showerror(io::IO, err::QueueError)
-    queue = err.queue
-    println(io, "QueueError(Queue $(repr(reinterpret(UInt64, queue.queue))) on $(queue.device)) due to:")
-    if err.exception !== nothing
-        Base.showerror(io, err.exception)
-    else
-        print(io, "Queue was killed")
-    end
-    println(io); print(io, "You can select a new queue with `AMDGPU.reset_dead_queue!()`")
-end
-
-mutable struct QueuePool
-    pool::Dict{ROCDevice,Dict{Symbol,Vector{ROCQueue}}}
-    max_size::NTuple{3, Int}
-    idx::Int
-end
-QueuePool() = QueuePool(
-    Dict{ROCDevice,Dict{Symbol,Vector{ROCQueue}}}(),
-    (@load_preference("queue_pool_max_size", [12, 1, 1])...,),
-    0)
-
-const QUEUE_POOL = LockedObject(QueuePool())
-
-"""
-    set_queue_pool_size!(nums::NTuple{3, Int})
-
-Set HSA queue pool max size for each priority.
-Restart Julia session for the changes to take effect.
-
-# Arguments:
-
-- `nums::NTuple{3, Int}`: Maximum number of queues for `:normal`,
-    `:low` and `:high` priority.
-    Providing `0` for specific priority, disables pool for it.
-"""
-function set_queue_pool_size!(nums::NTuple{3, Int})
-    @set_preferences!("queue_pool_max_size" => [nums...])
-    @info """Successfully set queue pool max size to `$nums` (:normal, :low, :high).
-    Reset your Julia session for the changes to take effect."""
-end
-
-function get_pool_queue!(device::ROCDevice, priority::Symbol)
-    prio_idx = priority == :normal ? 1 : (priority == :low ? 2 : 3)
-
-    lock(QUEUE_POOL) do pool
-        device_pool = get!(() -> Dict{Symbol, Vector{ROCQueue}}(), pool.pool, device)
-        queues = get!(() -> ROCQueue[], device_pool, priority)
-        length(queues) < pool.max_size[prio_idx] && return nothing
-
-        # If all queues are allocated, pick next one.
-        idx = pool.idx % length(queues) + 1
-        pool.idx += 1
-
-        queue = queues[idx]
-        queue.active && return queue
-
-        @debug "Removing dead queue from pool"
-        deleteat!(queues, idx)
-        return nothing
-    end
-end
-
-function pool_queue!(queue::ROCQueue)
-    prio_idx = queue.priority == :normal ? 1 : (queue.priority == :low ? 2 : 3)
-    QUEUE_POOL.payload.max_size[prio_idx] == 0 && return false
-
-    lock(QUEUE_POOL) do pool
-        queues = pool.pool[queue.device][queue.priority]
-        length(queues) < pool.max_size[prio_idx] ?
-            (push!(queues, queue); true) :
-            false
-    end
-end
-
-function remove_pooled_queue!(queue::ROCQueue)
-    lock(QUEUE_POOL) do pool
-        device_pool = get(pool.pool, queue.device, nothing)
-        isnothing(device_pool) && return
-
-        queues = get(device_pool, queue.priority, nothing)
-        isnothing(queues) && return
-
-        idx = findfirst(q -> q === queue, queues)
-        isnothing(idx) || deleteat!(queues, idx)
-    end
-end
-
-device_queue_max_size(device::AnyROCDevice) =
-    getinfo(UInt32, device, HSA.AGENT_INFO_QUEUE_MAX_SIZE)
-
-device_queue_type(device::AnyROCDevice) =
-    getinfo(HSA.QueueType, device, HSA.AGENT_INFO_QUEUE_TYPE)
-
-"""
-    ROCQueue(device::ROCDevice; priority::Symbol=:normal, pooled::Bool=false)
-
-Create an HSA queue which will be used to
-instruct GPU hardware which kernels to launch.
-
-Each queue, spawns an error monitoring thread that's responsible
-for actually waiting on kernels and performing a cleanup after
-kernel finished its execution.
-
-!!! note "Oversubscribed Command Queues in GPUs"
-    Be careful, with the number of HSA queues in use.
-    When the number of allocated HSA queues is greater than
-    the number of hardware queues, the GPU wastes significant time
-    rotating between all allocated queues in search of ready tasks.
-
-# Arguments:
-
-- `device::ROCDevice`: Device on which to create queue.
-- `priority::Symbol`: Queue's priority. Can be `:normal`, `:low`, `:high`.
-- `pooled::Bool`: Whether to use pool when creating queues.
-    When `true`, queues are drawn from it on creation
-    and returned to pool instead of destroyed.
-"""
-function ROCQueue(device::ROCDevice; priority::Symbol=:normal, pooled::Bool=false)
-    if !in(priority, (:normal, :low, :high))
-        throw(ArgumentError(
-            "Invalid queue priority: $priority\n" *
-            "Options are :low, :normal, :high"))
-    end
-
-    queue = pooled ? get_pool_queue!(device, priority) : nothing
-    isnothing(queue) || return queue
-
-    alloc_id = rand(UInt64)
-    @log_start(:alloc_queue, (;alloc_id), (;device=get_handle(device), priority))
-
-    # Allocate a new queue from HSA.
-    c_queue_error_handler = @cfunction(queue_error_handler,
-        Cvoid, (HSA.Status, Ptr{HSA.Queue}, Ptr{Cvoid}))
-
-    queue_size = device_queue_max_size(device)
-    queue_type = device_queue_type(device)
-    @assert queue_size > 0
-    @assert queue_type == HSA.QUEUE_TYPE_MULTI
-
-    # Create ROCQueue before HSA queue to be able to pass it to error handler.
-    queue = ROCQueue(
-        device, Ptr{HSA.Queue}(0), priority, HSA.STATUS_SUCCESS, true,
-        LinkedList{ROCKernelSignal}(), Base.Event(), Threads.ReentrantLock())
-
-    # Create HSA queue.
-    r_queue = Ref{Ptr{HSA.Queue}}()
-    HSA.queue_create(
-        device.agent, queue_size, queue_type,
-        c_queue_error_handler, pointer_from_objref(queue),
-        typemax(UInt32), typemax(UInt32), r_queue) |> check
-
-    AMDGPU.hsaref!()
-    queue.queue = r_queue[]
-
-    lock(RT_LOCK) do
-        @assert !haskey(QUEUES, queue.queue)
-        QUEUES[queue.queue] = WeakRef(queue)
-    end
-
-    HSA.amd_queue_set_priority(queue.queue, hsa_priority(priority)) |> check
-    errormonitor(Threads.@spawn monitor_queue(queue))
-
-    finalizer(queue) do q
-        kill_queue!(q)
-        AMDGPU.hsaunref!()
-    end
-
-    pooled && pool_queue!(queue)
-    @log_finish(:alloc_queue, (;alloc_id), (;queue=reinterpret(UInt64, queue.queue)))
-    return queue
-end
-
-function hsa_priority(priority::Symbol)
-    if priority == :normal
-        HSA.AMD_QUEUE_PRIORITY_NORMAL
-    elseif priority == :low
-        HSA.AMD_QUEUE_PRIORITY_LOW
-    elseif priority == :high
-        HSA.AMD_QUEUE_PRIORITY_HIGH
-    end
-end
-
-function monitor_queue(queue::ROCQueue)
-    kerns = queue.active_kernels::LinkedList{ROCKernelSignal}
-    while queue.active || length(kerns) > 0
-        # Fetch oldest signal, if any
-        sig = lock(queue.lock) do
-            if length(kerns) > 0
-                # Notify waiters that queue is running
-                notify(queue.running)
-                return first(kerns)
-            else
-                # Reset event
-                reset(queue.running)
-                return nothing
-            end
-        end
-
-        # Wait for signal completion or new launches
-        if sig !== nothing
-            try
-                wait(sig; check_exceptions=true, cleanup=true)
-            catch err
-                @debug "Kernel exception" exception=(err,catch_backtrace())
-            end
-            # Move to the next kernel.
-            Base.@lock queue.lock begin
-                kerns = next!(kerns)
-            end
-        else
-            wait(queue.running)
-        end
-    end
-end
-
-function ensure_active(queue::ROCQueue)
-    @label check
-    if !queue.active
-        throw(QueueError(queue))
-    elseif queue.status != HSA.STATUS_SUCCESS
-        # We track status updates from the queue callback
-        kill_queue!(queue)
-        @goto check
-    end
-end
-
-"""
-Determine if there are active kernels for the queue.
-If not, we can re-use it.
-"""
-function has_active_kernels(q::ROCQueue)
-    lock(q.lock) do
-        return !isempty(q.active_kernels)
-    end
-end
-
-"""
-    kill_queue!(queue::ROCQueue)
-
-Kill `queue` and propagate queue error to
-all waiter signals in case if there is one.
-
-If queue is in the pool, it will be removed from it.
-
-!!! note
-    No need to manually call this function during regular use,
-    it will be called automatically from [`ROCQueue`](@ref) finalizer.
-"""
-function kill_queue!(queue::ROCQueue)
-    _, succ = @atomicreplace queue.active true => false
-    succ || return
-
-    # TODO: Eliminate race from active=false to setting exception
-
-    @log_start(:kill_queue!, (;queue=reinterpret(UInt64, queue.queue)), nothing)
-    remove_pooled_queue!(queue)
-
-    lock(RT_LOCK) do
-        delete!(QUEUES, queue.queue)
-    end
-    lock(queue.lock) do
-        # Send exception to all waiter signals
-        if queue.status != HSA.STATUS_SUCCESS
-            err = QueueError(queue)
-            for kersig in queue.active_kernels::LinkedList{ROCKernelSignal}
-                kersig::ROCKernelSignal
-                kersig.exception = err
-                notify(kersig)
-            end
-        end
-    end
-
-    HSA.queue_destroy(queue.queue) |> check
-    @log_finish(:kill_queue!, (;queue=reinterpret(UInt64, queue.queue)), nothing)
-    return
-end
diff --git a/src/runtime/signal.jl b/src/runtime/signal.jl
deleted file mode 100644
index 7b37ead73..000000000
--- a/src/runtime/signal.jl
+++ /dev/null
@@ -1,131 +0,0 @@
-const DEFAULT_SIGNAL_TIMEOUT = Ref{Union{Float64, Nothing}}(nothing)
-
-const SIGNAL_TIMEOUT_KILL_QUEUE = Ref{Bool}(true)
-
-struct SignalPool
-    pool::Set{HSA.Signal}
-    max_size::Int
-end
-SignalPool() = SignalPool(Set{HSA.Signal}(), @load_preference("signal_pool_max_size", 128))
-
-const SIGNAL_POOL = LockedObject(SignalPool())
-
-function set_signal_pool_size!(num::Integer)
-    @set_preferences!("signal_pool_max_size" => num)
-    @info """Successfully set signal pool max size to `$num`.
-    Reset your Julia session for the changes to take effect."""
-end
-
-function get_pool_signal!()::Union{HSA.Signal, Nothing}
-    lock(SIGNAL_POOL) do pool
-        isempty(pool.pool) ? nothing : pop!(pool.pool)
-    end
-end
-
-"""
-Return `true` if destroyed a signal, otherwise `false`.
-If `destroy=true` then destroy signal immediately.
-"""
-function free_pool_signal!(signal::HSA.Signal; destroy::Bool)::Bool
-    destroy && (check(HSA.signal_destroy(signal)); return true)
-    lock(SIGNAL_POOL) do pool
-        destroy = length(pool.pool) < pool.max_size
-        destroy ?
-            check(HSA.signal_destroy(signal)) :
-            push!(pool.pool, signal)
-        destroy
-    end
-end
-
-mutable struct ROCSignal
-    signal::HSA.Signal
-end
-Adapt.adapt_structure(::Adaptor, sig::ROCSignal) = sig.signal
-
-struct SignalTimeoutException <: Exception
-    signal::ROCSignal
-end
-
-"""
-    ROCSignal(init::Integer = 1; pooled::Bool=true, ipc::Bool=false) -> ROCSignal
-
-Acquires an HSA signal object which can be used to communicate values between
-the host and device.
-
-- `pooled::Bool`: If `true`, the signal may be taken from an existing pool of
-    signals; if `false`, or if the pool is empty, the signal is allocated from HSA.
-- `ipc::Bool`: If `true`, signal may be used for interprocess communication.
-    IPC signals can be read, written, and waited on from any process.
-    Disables signal pooling when `true`.
-"""
-function ROCSignal(init::Int64 = 1; pooled::Bool = true, ipc::Bool = false)
-    pooled = ipc ? false : pooled
-    raw_signal = pooled ? get_pool_signal!() : nothing
-
-    if isnothing(raw_signal)
-        signal_ref = Ref{HSA.Signal}()
-        check(ipc ?
-            HSA.amd_signal_create(init, 0, C_NULL, HSA.AMD_SIGNAL_IPC, signal_ref) :
-            HSA.signal_create(init, 0, C_NULL, signal_ref))
-        raw_signal = signal_ref[]
-    else
-        HSA.signal_store_relaxed(raw_signal, init) |> check
-    end
-
-    AMDGPU.hsaref!()
-    signal = ROCSignal(raw_signal)
-    finalizer(signal) do signal
-        # Destroy if not using pool, otherwise return to pool.
-        destroyed = free_pool_signal!(signal.signal; destroy=!pooled)
-        destroyed && AMDGPU.hsaunref!()
-    end
-    signal
-end
-
-get_handle(signal::ROCSignal) = signal.signal.handle
-
-load_acquire(signal::ROCSignal) = HSA.signal_load_scacquire(signal.signal)
-
-Base.isdone(signal::ROCSignal) = load_acquire(signal) < 1
-
-Base.show(io::IO, signal::ROCSignal) =
-    print(io, "ROCSignal($(repr(get_handle(signal))))")
-
-function Base.wait(
-    signal::ROCSignal; timeout::Union{Real, Nothing} = DEFAULT_SIGNAL_TIMEOUT[],
-    min_latency::Int64 = 1_000, #= 1 micro-second =#
-    queue = nothing,
-)
-    has_timeout = !isnothing(timeout)
-    has_timeout && (timeout < 0) && error(
-        "Timeout `$timeout` must be a positive real value or `nothing`.")
-
-    start_time = time_ns()
-    finished = false
-
-    GC.@preserve signal while !finished
-        finished = 0 == HSA.signal_wait_scacquire(
-            signal.signal, HSA.SIGNAL_CONDITION_LT, 1,
-            min_latency, HSA.WAIT_STATE_BLOCKED)
-
-        if has_timeout && !finished
-            diff_time = (time_ns() - start_time) / 1e9
-            (diff_time > timeout) && throw(SignalTimeoutException(signal))
-        end
-
-        if queue !== nothing
-            ensure_active(queue)
-        end
-
-        # Allow another scheduled task to run.
-        # This is especially needed in the case
-        # when kernels need to perform HostCalls.
-        yield()
-    end
-end
-
-function Base.wait(signal::HSA.Signal; timeout = DEFAULT_SIGNAL_TIMEOUT[])
-    wait(ROCSignal(signal); timeout)
-end
-
-Base.notify(signal::ROCSignal) = HSA.signal_store_screlease(signal.signal, 0)
diff --git a/src/runtime/sync.jl b/src/runtime/sync.jl
deleted file mode 100644
index 0f3579b88..000000000
--- a/src/runtime/sync.jl
+++ /dev/null
@@ -1,38 +0,0 @@
-import ..AMDGPU: hip_configured
-
-"Tracks HSA signals and HIP streams to sync against."
-struct SyncState
-    signals::Vector{ROCKernelSignal}
-    streams::Vector{Ptr{Cvoid}}
-    lock::Threads.ReentrantLock
-end
-SyncState() = SyncState(ROCKernelSignal[], Ptr{Cvoid}[], Threads.ReentrantLock())
-
-struct WaitAdaptor end
-struct MarkAdaptor{S}
-    s::S
-end
-
-function wait!(ss::SyncState)
-    lock(ss.lock) do
-        # FIXME: Use barrier_and on dedicated queue
-        foreach(wait, ss.signals)
-        empty!(ss.signals)
-        @static if hip_configured
-            for s in ss.streams
-                AMDGPU.HIP.@check AMDGPU.HIP.hipStreamSynchronize(s)
-            end
-            empty!(ss.streams)
-        end
-    end
-    return
-end
-mark!(ss::SyncState, signal::ROCKernelSignal) =
-    lock(()->push!(ss.signals, signal), ss.lock)
-mark!(ss::SyncState, stream::Ptr{Cvoid}) =
-    lock(()->push!(ss.streams, stream), ss.lock)
-mark!(ss::SyncState, stream::HIP.HIPStream) =
-    mark!(ss, stream.stream)
-
-wait!(x) = Adapt.adapt(WaitAdaptor(), x)
-mark!(x, s) = Adapt.adapt(MarkAdaptor(s), x) # TODO constrain type of `s`
diff --git a/src/runtime/thread-utils.jl b/src/thread-utils.jl
similarity index 99%
rename from src/runtime/thread-utils.jl
rename to src/thread-utils.jl
index 0517d9196..fb85eae06 100644
--- a/src/runtime/thread-utils.jl
+++ b/src/thread-utils.jl
@@ -1,5 +1,3 @@
-import ..LLVM
-
 ## Lazy Initialization
 # Borrowed from CUDA.jl
 
@@ -55,8 +53,6 @@ end
 ## Memoization
 # Borrowed from CUDA.jl
 
-export @memoize
-
 """
     @memoize [key::T] [maxlen=...] begin
         # expensive computation
diff --git a/src/tls.jl b/src/tls.jl
index ef3cca0eb..abe4581bf 100644
--- a/src/tls.jl
+++ b/src/tls.jl
@@ -1,37 +1,22 @@
 struct TaskLocalState
-    device::ROCDevice
+    device::HIPDevice
     context::HIPContext
-    queues::Vector{Union{ROCQueue,Nothing}}
     streams::Vector{Union{HIPStream,Nothing}}
     priority::Symbol
 end
-function TaskLocalState(device::Union{ROCDevice,Nothing},
-                        context::Union{HIPContext,Nothing},
-                        queue::Union{ROCQueue,Nothing},
-                        stream::Union{HIPStream,Nothing},
-                        priority::Symbol)
+function TaskLocalState(
+    device::Union{HIPDevice,Nothing}, context::Union{HIPContext,Nothing},
+    stream::Union{HIPStream,Nothing}, priority::Symbol,
+)
     if device === nothing
-        if queue === nothing
-            device = Runtime.get_default_device()
-            queue = ROCQueue(device; priority, pooled=true)
-        else
-            device = AMDGPU.device(queue)
-        end
-    else
-        if queue === nothing
-            queue = ROCQueue(device; priority, pooled=true)
-        else
-            queue.device == device || throw(ArgumentError("""
-            Provided ROCQueue is on a differen device `$(queue.device)`
-            from the default one `$device`.
-            """))
-            @assert queue.priority == priority
-        end
+        # TODO get from stream if provided
+        device = Runtime.get_default_device()
     end
     if context === nothing
-        context = HIPContext(device_id(device))
+        context = HIPContext(device)
     end
     HIP.context!(context) # Switches HIP active device as well.
+
     if stream === nothing
         stream = HIPStream(priority)
     else
@@ -41,45 +26,27 @@ function TaskLocalState(device::Union{ROCDevice,Nothing},
         """))
         @assert stream.priority == priority
     end
-    queues = Union{ROCQueue,Nothing}[nothing for _ in 1:length(devices())]
-    streams = Union{HIPStream,Nothing}[nothing for _ in 1:length(devices())]
-    queues[device_id(device)] = queue
+    streams = Union{HIPStream, Nothing}[nothing for _ in 1:length(devices())]
     streams[device_id(device)] = stream
-    return TaskLocalState(device, context, queues, streams, priority)
+    return TaskLocalState(device, context, streams, priority)
 end
-TaskLocalState() = TaskLocalState(nothing, nothing, nothing, nothing, :normal)
+TaskLocalState() = TaskLocalState(nothing, nothing, nothing, :normal)
 
 function Base.getproperty(state::TaskLocalState, field::Symbol)
-    # Helpers to return active queue or stream
-    if field == :queue
-        return state.queues[device_id(state.device)]::ROCQueue
-    elseif field == :stream
+    # Helpers to return active stream
+    if field == :stream
         return state.streams[device_id(state.device)]::HIPStream
     else
         return getfield(state, field)
     end
 end
-Base.copy(state::TaskLocalState) =
-    TaskLocalState(state.device,
-                   state.context,
-                   copy(state.queues),
-                   copy(state.streams),
-                   state.priority)
-
-function reset_dead_queue!()
-    state = task_local_state()
-    queue = state.queue
-    if !queue.active
-        queue = state.queues[device_id(state.device)] = ROCQueue(state.device; priority=state.priority)
-    end
-    return queue
-end
+Base.copy(state::TaskLocalState) = TaskLocalState(
+    state.device, state.context, copy(state.streams), state.priority)
 
 function Base.show(io::IO, state::TaskLocalState)
     println(io, "TaskLocalState:")
     println(io, "  Device: $(state.device)")
     println(io, "  HIP Context: $(state.context)")
-    println(io, "  HSA Queue: $(state.queue)")
     println(io, "  HIP Stream: $(state.stream)")
     print(io, "  Priority: $(state.priority)")
 end
@@ -88,42 +55,36 @@ end
     task_local_state() -> TaskLocalState
 
 Returns the task-local state in the form of a `TaskLocalState`. Automatically
-picks a device, context, queue, and stream if they haven't already been selected.
+picks a device, context, and stream if they haven't already been selected.
 """
 task_local_state()::TaskLocalState =
     get!(()->TaskLocalState(), task_local_storage(), :AMDGPU)
 
 """
-    task_local_state!(; device=nothing, context=nothing, queue=nothing, stream=nothing, priority::Symbol=:normal)
+    task_local_state!(; device=nothing, context=nothing, stream=nothing, priority::Symbol=:normal)
 
-Sets the task-local device, queue (with the specified priority), and HIP stream. If
-`device`, `queue`, or `stream` is `nothing` and an existing task-local state has been
-configured, then those values are retrived from the existing state (unless the
-`priority` has changed, in which case a new queue is selected); if no
-task-local state has been configured, then defaults are used when `nothing` is
-supplied.
+Sets the task-local device and HIP stream.
+If `device`, , or `stream` is `nothing` and an existing task-local state
+has been configured, then those values are retrived from the existing state
+(unless the `priority` has changed, in which case a new stream is selected);
+if no task-local state has been configured, then defaults are used
+when `nothing` is supplied.
 
-Note that these are only task-local defaults; when a device, queue or stream is
+Note that these are only task-local defaults; when a device or stream is
 manually passed to an AMDGPU operation (such as `@roc`), then the task-local
 value is ignored in favor of the passed argument.
 """
-function task_local_state!(; device=nothing, queue=nothing, stream=nothing, priority::Symbol=:normal)
+function task_local_state!(; device=nothing, stream=nothing, priority::Symbol=:normal)
     if haskey(task_local_storage(), :AMDGPU)
         old_state = task_local_state()
         if device === nothing
             device = old_state.device
             context = old_state.context
         else
-            context = HIPContext(device_id(device))
+            context = HIPContext(device)
         end
         HIP.context!(context)
-        if queue === nothing
-            if priority == old_state.priority && old_state.queues[device_id(device)] !== nothing
-                queue = old_state.queues[device_id(device)]
-            else
-                queue = ROCQueue(device; priority, pooled=true)
-            end
-        end
+
         if stream === nothing
             if priority == old_state.priority && old_state.streams[device_id(device)] !== nothing
                 stream = old_state.streams[device_id(device)]
@@ -131,46 +92,41 @@ function task_local_state!(; device=nothing, queue=nothing, stream=nothing, prio
                 stream = HIPStream(priority)
             end
         end
-        queues = copy(old_state.queues)
         streams = copy(old_state.streams)
     else # TODO Use default constructor?
         if device === nothing
             device = Runtime.get_default_device()
         end
+
         context = HIPContext(device_id(device))
         HIP.context!(context)
-        if queue === nothing
-            queue = ROCQueue(device; priority)
-        end
         if stream === nothing
             stream = HIPStream(priority)
         end
-        queues = Union{ROCQueue,Nothing}[nothing for _ in 1:length(devices())]
         streams = Union{HIPStream,Nothing}[nothing for _ in 1:length(devices())]
     end
-    queues[device_id(device)] = queue
-    streams[device_id(device)] = stream
-    new_state = TaskLocalState(device, context, queues, streams, priority)
 
+    streams[device_id(device)] = stream
+    new_state = TaskLocalState(device, context, streams, priority)
     task_local_storage(:AMDGPU, new_state)
 end
 
 task_local_state!(state::TaskLocalState) = task_local_storage(:AMDGPU, state)
 
 """
-    task_local_state!(f::Base.Callable; device=nothing, queue=nothing, stream=nothing, priority::Symbol=:normal)
+    task_local_state!(f::Base.Callable; device=nothing, stream=nothing, priority::Symbol=:normal)
 
 Executes `f` with the given task-local state, and when finished, resets the
 state back to previous values and returns the result of `f()`.
 """
-function task_local_state!(f::Base.Callable;
-                           device=nothing, queue=nothing, stream=nothing,
-                           priority::Symbol=:normal)
+function task_local_state!(
+    f::Base.Callable; device=nothing, stream=nothing, priority::Symbol=:normal,
+)
     restore = haskey(task_local_storage(), :AMDGPU)
     if restore
         old_state = task_local_state()
     end
-    task_local_state!(; device, queue, stream, priority)
+    task_local_state!(; device, stream, priority)
 
     return try
         f()
@@ -178,7 +134,7 @@ function task_local_state!(f::Base.Callable;
         if restore
             task_local_state!(old_state)
         else
-            # We want a fresh state with pooled queues and default priority
+            # We want a fresh state and default priority
             delete!(task_local_storage(), :AMDGPU)
             task_local_state!()
         end
diff --git a/src/utils.jl b/src/utils.jl
index a6a4b9bb1..782ee4a96 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -12,12 +12,11 @@ function hsa_version()
 end
 
 function versioninfo(io::IO=stdout)
-    println("Using ROCm provided by: ", use_artifacts ? "JLLs" : "System")
+    println("Using ROCm provided by: ", use_artifacts() ? "JLLs" : "System")
     println("HSA Runtime ($(functional(:hsa) ? "ready" : "MISSING"))")
     if functional(:hsa)
         println("- Path: $libhsaruntime_path")
         println("- Version: $(hsa_version())")
-        #println("- Initialized: $(repr(HSA_REFCOUNT[] > 0))")
     end
     println("ld.lld ($(functional(:lld) ? "ready" : "MISSING"))")
     if functional(:lld)
@@ -25,13 +24,11 @@ function versioninfo(io::IO=stdout)
     end
     println("ROCm-Device-Libs ($(functional(:device_libs) ? "ready" : "MISSING"))")
     if functional(:device_libs)
-        println("- Path: $device_libs_path")
-        # TODO: println("- Version: $(device_libs_version)")
+        println("- Path: $libdevice_libs")
     end
     println("HIP Runtime ($(functional(:hip) ? "ready" : "MISSING"))")
     if functional(:hip)
-        println("- Path: $libhip_path")
-        # TODO: println("- Version: $(libhip_version)")
+        println("- Path: $libhip")
     end
     println("rocBLAS ($(functional(:rocblas) ? "ready" : "MISSING"))")
     if functional(:rocblas)
@@ -63,7 +60,7 @@ function versioninfo(io::IO=stdout)
     end
 
     if functional(:hsa)
-        println("HSA Agents ($(length(Runtime.devices()))):")
+        println("HIP Devices ($(length(Runtime.devices()))):")
         for device in Runtime.devices()
             println("- ", repr(device))
         end
@@ -155,11 +152,8 @@ function functional(component::Symbol)
 end
 
 function has_rocm_gpu()
-    if !functional(:hsa)
-        return false
-    else
-        return length(devices(:gpu)) > 0
-    end
+    (functional(:hsa) && functional(:hip)) || return false
+    return length(devices()) > 0
 end
 
 function print_build_diagnostics()
diff --git a/test/codegen/trap.jl b/test/codegen/trap.jl
index 5d3043d97..ee75fd2d8 100644
--- a/test/codegen/trap.jl
+++ b/test/codegen/trap.jl
@@ -1,19 +1,18 @@
-if !IS_NAVI_2
-    @testset "Trapping" begin
-        function trapkern()
-            Device.trap()
-            nothing
-        end
-        function debugtrapkern()
-            Device.debugtrap()
-            nothing
-        end
-
-        iob = IOBuffer()
-        AMDGPU.code_gcn(iob, trapkern, Tuple{}; kernel=true)
-        @test occursin("s_trap 2", String(take!(iob)))
-        iob = IOBuffer()
-        AMDGPU.code_gcn(iob, debugtrapkern, Tuple{}; kernel=true)
-        @test occursin("s_trap 3", String(take!(iob)))
+@testset "Trapping" begin
+    function trapkern()
+        Device.trap()
+        nothing
+    end
+    function debugtrapkern()
+        Device.debugtrap()
+        nothing
     end
+
+    iob = IOBuffer()
+    AMDGPU.code_gcn(iob, trapkern, Tuple{}; kernel=true)
+    @test occursin("s_trap 2", String(take!(iob)))
+
+    iob = IOBuffer()
+    AMDGPU.code_gcn(iob, debugtrapkern, Tuple{}; kernel=true)
+    @test occursin("s_trap 3", String(take!(iob)))
 end
diff --git a/test/device/array.jl b/test/device/array.jl
index ef76ee2e3..34c97deaa 100644
--- a/test/device/array.jl
+++ b/test/device/array.jl
@@ -19,8 +19,4 @@
     @test occursin("4×4 device array at", sprint(io->show(io, RD)))
     @test occursin("2×2 device array view", sprint(io->show(io, RD_view)))
     @test occursin("4×4 device array wrapper Adjoint", sprint(io->show(io, RD_adj)))
-
-    # Custom hash methods are defined
-    @test AMDGPU.Runtime.khash(RD) isa UInt # test that hashing doesn't segfault
-    @test AMDGPU.Runtime.khash(RD_view) isa UInt # test that SubArray hashing works
 end
diff --git a/test/device/deps.jl b/test/device/deps.jl
deleted file mode 100644
index facd626be..000000000
--- a/test/device/deps.jl
+++ /dev/null
@@ -1,73 +0,0 @@
-@testset "Kernel Dependencies" begin
-    function kernel(sig, waitval, A, val)
-        i = workitemIdx().x
-        AMDGPU.Device.hostcall_device_signal_wait(sig, waitval)
-        A[i] = val
-        return nothing
-    end
-
-    @testset "Barrier AND" begin
-        for i in (0, 1, 5, 7)
-            @testset "$i inputs" begin
-                RA = ROCArray(zeros(Float64, 1))
-                sig = AMDGPU.ROCSignal(0)
-
-                # Disable wait and mark because:
-                # - We need the kernels (ret1 vs ret2) to race
-                # - We're accessing RA before the kernels are complete
-                ret1 = map(1:i) do _
-                    @roc wait=false mark=false kernel(sig, 3, RA, 1.0)
-                end
-
-                retb = AMDGPU.barrier_and!(ret1)
-
-                ret2 = @roc wait=false mark=false kernel(sig, 0, RA, 2.0)
-
-                if i > 0
-                    sleep(0.5)
-                    @test Array(RA)[1] == 0.0
-                    HSA.signal_store_screlease(sig.signal, 3)
-                    wait.(ret1)
-                    @test Array(RA)[1] == 1.0
-                end
-                HSA.signal_store_screlease(sig.signal, 0)
-                # FIXME: wait(retb)
-                wait(ret2)
-                @test Array(RA)[1] == 2.0
-            end
-        end
-    end
-
-    #= FIXME
-    @testset "Barrier OR" begin
-        for i in (0, 1, 5, 7)
-            @testset "$i inputs" begin
-                RA = ROCArray(zeros(Float64, 1))
-                sig = AMDGPU.ROCSignal(0)
-
-                ret1 = [@roc(kernel(sig, 7, RA, 5.0)) for _ in 1:i]
-                pushfirst!(ret1, @roc(kernel(sig, 3, RA, 1.0)))
-
-                retb = AMDGPU.barrier_or!(ret1)
-                ret2 = @roc kernel(sig, 0, RA, 2.0)
-
-                if i > 0
-                    sleep(0.5)
-                    @test Array(RA)[1] == 0.0
-                    HSA.signal_store_release(sig.signal, 3)
-
-                    wait(ret1[1])
-                    @test Array(RA)[1] == 1.0
-                end
-                HSA.signal_store_screlease(sig.signal, 0)
-                sleep(0.5)
-                @test Array(RA)[1] == 2.0
-                wait(ret2)
-                # FIXME: wait(retb)
-                # clear waiting kernels
-                HSA.signal_store_screlease(sig.signal, 7)
-            end
-        end
-    end
-    =#
-end
diff --git a/test/device/exceptions.jl b/test/device/exceptions.jl
index 8fb47dbf9..10c38c49a 100644
--- a/test/device/exceptions.jl
+++ b/test/device/exceptions.jl
@@ -1,19 +1,18 @@
 @testset "Exceptions" begin
-
-function oob_kernel(X)
-    X[0] = 1
-    nothing
-end
-
-RA = ROCArray(ones(Float32, 4))
-try
-    wait(@roc oob_kernel(RA))
-catch err
-    @test err isa Runtime.KernelException
-    if err isa Runtime.KernelException
-        @test err.exstr !== nothing
-        @test occursin("Out-of-bounds array access", err.exstr)
+    function oob_kernel(X)
+        X[0] = 1
+        nothing
     end
-end
 
+    RA = ROCArray(ones(Float32, 4))
+    @roc oob_kernel(RA)
+    try
+        AMDGPU.synchronize()
+    catch err
+        @test err isa ErrorException
+    finally
+        AMDGPU.reset_exception_holder!(AMDGPU.device())
+    end
+    # TODO check exception message
+    # TODO check specific exception type
 end
diff --git a/test/device/execution_control.jl b/test/device/execution_control.jl
index cacf53d12..27f1dc036 100644
--- a/test/device/execution_control.jl
+++ b/test/device/execution_control.jl
@@ -1,66 +1,51 @@
 @testset "Execution Control Intrinsics" begin
+    @testset "sendmsg/sendmsghalt/endpgm" begin
+        function exec_ctl_kernel()
+            Device.sendmsg(5)
+            Device.sendmsghalt(6)
+            Device.endpgm()
+        end
 
-@testset "Completion Signal" begin
-    function completion_signal_kernel(X)
-        X[1] = AMDGPU.Device._completion_signal()
-        nothing
-    end
-
-    RA = ROCArray(rand(UInt64, 1))
-
-    ev = @roc completion_signal_kernel(RA)
-    wait(ev)
-    @test Array(RA)[1] == ev.signal.signal.handle
-end
-
-@testset "sendmsg/sendmsghalt/endpgm" begin
-    function exec_ctl_kernel()
-        Device.sendmsg(5)
-        Device.sendmsghalt(6)
-        Device.endpgm()
+        iob = IOBuffer()
+        AMDGPU.code_native(iob, exec_ctl_kernel, Tuple{})
+        str = String(take!(iob))
+        @test occursin("s_sendmsg ", str)
+        @test occursin("s_sendmsghalt ", str)
+        # TODO: Can't easily count these, since they're automatically inserted
+        @test occursin("s_endpgm", str)
     end
 
-    iob = IOBuffer()
-    AMDGPU.code_native(iob, exec_ctl_kernel, Tuple{})
-    str = String(take!(iob))
-    @test occursin("s_sendmsg ", str)
-    @test occursin("s_sendmsghalt ", str)
-    # TODO: Can't easily count these, since they're automatically inserted
-    @test occursin("s_endpgm", str)
-end
-
-@testset "device_sleep/memtime/memrealtime" begin
-    function time_kernel(X)
-        t1 = AMDGPU.Device.memtime()
-        tr1 = AMDGPU.Device.memrealtime()
-        AMDGPU.Device.device_sleep(Int32(1))
-        t2 = AMDGPU.Device.memtime()
-        tr2 = AMDGPU.Device.memrealtime()
-        X[1] = t2 > t1
-        X[2] = tr2 > tr1
-        return
+    @testset "device_sleep/memtime/memrealtime" begin
+        function time_kernel(X)
+            t1 = AMDGPU.Device.memtime()
+            tr1 = AMDGPU.Device.memrealtime()
+            AMDGPU.Device.device_sleep(Int32(2))
+            t2 = AMDGPU.Device.memtime()
+            tr2 = AMDGPU.Device.memrealtime()
+            X[1] = t2 > t1
+            X[2] = tr2 > tr1
+            return
+        end
+        RX = ROCArray(zeros(Bool, 2))
+        @roc time_kernel(RX)
+        @test all(Array(RX))
     end
-    RX = ROCArray(zeros(Bool, 2))
-    wait(@roc time_kernel(RX))
-    @test all(Array(RX))
-end
 
-@testset "readfirstlane" begin
-    function readfirstlane_kernel(B, A)
-        idx = workitemIdx().x
-        if idx > 1
-            B[idx] = AMDGPU.Device.readfirstlane(A[idx])
-        else
-            B[idx] = A[idx]
+    @testset "readfirstlane" begin
+        function readfirstlane_kernel(B, A)
+            idx = workitemIdx().x
+            if idx > 1
+                B[idx] = AMDGPU.Device.readfirstlane(A[idx])
+            else
+                B[idx] = A[idx]
+            end
+            return
         end
-        return
+        RB = ROCArray(zeros(Int32, 8))
+        RA = ROCArray(Int32(1):Int32(8))
+        @roc groupsize=8 readfirstlane_kernel(RB, RA)
+        B = Array(RB)
+        @test B[1] == Int32(1)
+        @test all(B[2:8] .== Int32(2))
     end
-    RB = ROCArray(zeros(Int32, 8))
-    RA = ROCArray(Int32(1):Int32(8))
-    wait(@roc groupsize=8 readfirstlane_kernel(RB, RA))
-    B = Array(RB)
-    @test B[1] == Int32(1)
-    @test all(B[2:8] .== Int32(2))
-end
-
 end
diff --git a/test/device/globals.jl b/test/device/globals.jl
deleted file mode 100644
index d71370075..000000000
--- a/test/device/globals.jl
+++ /dev/null
@@ -1,20 +0,0 @@
-@testset "Globals" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    function kernel(X)
-        ptr = Device.get_global_pointer(Val(:myglobal), Float32)
-        Base.unsafe_store!(ptr, 3f0)
-        nothing
-    end
-
-    mygbl_ptr = Ref{Any}()
-    function gbl_init(gbl, mod, dev)
-        gbl_ptr = Base.unsafe_convert(Ptr{Float32}, gbl.ptr)
-        mygbl_ptr[] = gbl_ptr
-
-        Base.unsafe_store!(gbl_ptr, 2f0)
-    end
-
-    wait(@roc groupsize=1 global_hooks=(myglobal=gbl_init,) kernel(Int32(1)))
-    @test Base.unsafe_load(mygbl_ptr[]) == 3f0
-end
diff --git a/test/device/hostcall.jl b/test/device/hostcall.jl
index 806f3a75d..06ca26fd0 100644
--- a/test/device/hostcall.jl
+++ b/test/device/hostcall.jl
@@ -1,260 +1,214 @@
 @testset "Hostcall" begin
 
 @testset "Call: No return or arguments" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     function kernel(a,b,sig)
         hostcall!(sig)
         b[1] = a[1]
-
         nothing
     end
 
-    A = ones(Float32, 1)
-    B = zeros(Float32, 1)
-    RA = ROCArray(A)
-    RB = ROCArray(B)
+    RA = ROCArray(ones(Float32, 1))
+    RB = ROCArray(zeros(Float32, 1))
 
     dref = Ref{Bool}(false)
-    hc = HostCall(Nothing, Tuple{}) do
+    hc = HostCallHolder(Nothing, Tuple{}) do
         dref[] = true
         nothing
     end
 
-    wait(@roc kernel(RA, RB, hc))
-
+    @roc kernel(RA, RB, hc)
+    AMDGPU.synchronize(; blocking=false)
     @test Array(RB)[1] == 1f0
     @test dref[] == true
 end
 
 @testset "Call: Error" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     function kernel(a,b,sig)
         hostcall!(sig)
         b[1] = a[1]
-
         nothing
     end
 
-    A = ones(Float32, 1)
-    B = zeros(Float32, 1)
-    RA = ROCArray(A)
-    RB = ROCArray(B)
-
+    RA = ROCArray(ones(Float32, 1))
+    RB = ROCArray(zeros(Float32, 1))
     dref = Ref{Bool}(false)
 
-    # This should throw an exception and the error message should be logged
     @test_logs (:error, "HostCall error") begin
-        hc, hc_task = HostCall(Nothing, Tuple{}; return_task=true) do
+        hc = HostCallHolder(Nothing, Tuple{}) do
             error("Some error")
             dref[] = true
             nothing
         end
 
-        @test_throws Runtime.KernelException wait(@roc kernel(RA, RB, hc))
+        @roc kernel(RA, RB, hc)
+        @test_throws ErrorException AMDGPU.synchronize(; blocking=false)
+        AMDGPU.reset_exception_holder!(AMDGPU.device())
 
-        empty!(RB.syncstate.signals)
         @test Array(RB)[1] == 0f0
         @test dref[] == false
-        @test Base.istaskfailed(hc_task)
+        sleep(1) # Give time for the task to shut down.
+        @test Base.istaskfailed(hc.task)
     end
 end
 
 @testset "Call: (0 args)" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     function kernel(a,b,sig)
         inc = hostcall!(sig)::Float32
         b[1] = a[1] + inc
-
         nothing
     end
 
-    A = ones(Float32, 1)
-    B = zeros(Float32, 1)
-    RA = ROCArray(A)
-    RB = ROCArray(B)
-
-    hc = HostCall(Float32, Tuple{}) do
+    RA = ROCArray(ones(Float32, 1))
+    RB = ROCArray(zeros(Float32, 1))
+    hc = HostCallHolder(Float32, Tuple{}) do
         1f0
     end
 
-    wait(@roc kernel(RA, RB, hc))
-
+    @roc kernel(RA, RB, hc)
+    AMDGPU.synchronize(; blocking=false)
     @test Array(RB)[1] == 2f0
 end
 
 @testset "Call: (1 arg)" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     function kernel(a,b,sig)
         inc = hostcall!(sig, 42f0)::Float32
         b[1] = a[1] + inc
-
         nothing
     end
 
-    A = ones(Float32, 1)
-    B = zeros(Float32, 1)
-    RA = ROCArray(A)
-    RB = ROCArray(B)
-
-    hc = HostCall(Float32, Tuple{Float32}) do arg1
+    RA = ROCArray(ones(Float32, 1))
+    RB = ROCArray(zeros(Float32, 1))
+    hc = HostCallHolder(Float32, Tuple{Float32}) do arg1
         arg1 + 1f0
     end
 
-    wait(@roc kernel(RA, RB, hc))
-
+    @roc kernel(RA, RB, hc)
+    AMDGPU.synchronize(; blocking=false)
     @test Array(RB)[1] == 44f0
 end
 
 @testset "Call: (2 homogeneous args)" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     function kernel(a,b,sig)
         inc = hostcall!(sig, 42f0, 3f0)::Float32
         b[1] = a[1] + inc
-
         nothing
     end
 
-    A = ones(Float32, 1)
-    B = zeros(Float32, 1)
-    RA = ROCArray(A)
-    RB = ROCArray(B)
-
-    hc = HostCall(Float32, Tuple{Float32,Float32}) do arg1, arg2
+    RA = ROCArray(ones(Float32, 1))
+    RB = ROCArray(zeros(Float32, 1))
+    hc = HostCallHolder(Float32, Tuple{Float32,Float32}) do arg1, arg2
         arg1 + arg2 + 1f0
     end
 
-    wait(@roc kernel(RA, RB, hc))
-
+    @roc kernel(RA, RB, hc)
+    AMDGPU.synchronize(; blocking=false)
     @test Array(RB)[1] == 47f0
 end
 
 @testset "Call: (2 heterogeneous args)" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     function kernel(a,b,sig)
         inc = hostcall!(sig, 42f0, Int16(3))::Float32
         b[1] = a[1] + inc
-
         nothing
     end
 
-    A = ones(Float32, 1)
-    B = zeros(Float32, 1)
-    RA = ROCArray(A)
-    RB = ROCArray(B)
-
-    hc = HostCall(Float32, Tuple{Float32,Int16}) do arg1, arg2
+    RA = ROCArray(ones(Float32, 1))
+    RB = ROCArray(zeros(Float32, 1))
+    hc = HostCallHolder(Float32, Tuple{Float32,Int16}) do arg1, arg2
         arg1 + Float32(arg2) + 1f0
     end
 
-    wait(@roc kernel(RA, RB, hc))
-
+    @roc kernel(RA, RB, hc)
+    AMDGPU.synchronize(; blocking=false)
     @test Array(RB)[1] == 47f0
 end
 
 @testset "Call: (2 heterogeneous args, return homogeneous tuple)" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     function kernel(a,b,sig)
         inc1, inc2 = hostcall!(sig, 42f0, Int16(3))::Tuple{Float32,Float32}
         b[1] = a[1] + inc1 + inc2
-
         nothing
     end
 
-    A = ones(Float32, 1)
-    B = zeros(Float32, 1)
-    RA = ROCArray(A)
-    RB = ROCArray(B)
-
-    hc = HostCall(Tuple{Float32,Float32}, Tuple{Float32,Int16}) do arg1, arg2
+    RA = ROCArray(ones(Float32, 1))
+    RB = ROCArray(zeros(Float32, 1))
+    hc = HostCallHolder(Tuple{Float32,Float32}, Tuple{Float32,Int16}) do arg1, arg2
         (arg1 + Float32(arg2) + 1f0, 1f0)
     end
 
-    wait(@roc kernel(RA, RB, hc))
-
+    @roc kernel(RA, RB, hc)
+    AMDGPU.synchronize(; blocking=false)
     @test Array(RB)[1] == 48f0
 end
 
 @testset "Call: (2 heterogeneous args, return heterogeneous tuple)" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     function kernel(a,b,sig)
         inc1, inc2 = hostcall!(sig, 42f0, Int16(3))::Tuple{Float32,Int64}
         b[1] = a[1] + inc1 + Float32(inc2)
-
         nothing
     end
 
-    A = ones(Float32, 1)
-    B = zeros(Float32, 1)
-    RA = ROCArray(A)
-    RB = ROCArray(B)
-
-    hc = HostCall(Tuple{Float32,Int64}, Tuple{Float32,Int16}) do arg1, arg2
+    RA = ROCArray(ones(Float32, 1))
+    RB = ROCArray(zeros(Float32, 1))
+    hc = HostCallHolder(Tuple{Float32,Int64}, Tuple{Float32,Int16}) do arg1, arg2
         (arg1 + Float32(arg2) + 1f0, 1)
     end
 
-    wait(@roc kernel(RA, RB, hc))
-
+    @roc kernel(RA, RB, hc)
+    AMDGPU.synchronize(; blocking=false)
     @test Array(RB)[1] == 48f0
 end
 
 @testset "Call: (2 hostcalls, 1 kernel)" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     function kernel(a,b,sig1,sig2)
         inc1 = hostcall!(sig1, 3f0)::Float32
         inc2 = hostcall!(sig2, 4f0)::Float32
         b[1] = a[1] + inc1 + inc2
-
         nothing
     end
 
-    A = ones(Float32, 1)
-    B = zeros(Float32, 1)
-    RA = ROCArray(A)
-    RB = ROCArray(B)
-
-    hc1 = HostCall(Float32, Tuple{Float32}) do arg1
+    RA = ROCArray(ones(Float32, 1))
+    RB = ROCArray(zeros(Float32, 1))
+    hc1 = HostCallHolder(Float32, Tuple{Float32}) do arg1
         arg1 + 1f0
     end
-    hc2 = HostCall(Float32, Tuple{Float32}) do arg1
+    hc2 = HostCallHolder(Float32, Tuple{Float32}) do arg1
         arg1 + 2f0
     end
 
-    wait(@roc kernel(RA, RB, hc1, hc2))
+    @roc kernel(RA, RB, hc1, hc2)
+    AMDGPU.synchronize(; blocking=false)
     @test Array(RB)[1] == 11f0
 end
 
 @testset "Call: (1 hostcall, 2 kernels)" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     function kernel(a,b,sig)
         inc = hostcall!(sig, 3f0)::Float32
         b[1] = a[1] + inc
-
         nothing
     end
 
-    A = ones(Float32, 1)
-    B = zeros(Float32, 1)
-    RA = ROCArray(A)
-    RB = ROCArray(B)
-
-    hc = HostCall(Float32, Tuple{Float32}; continuous=true) do arg1
+    RA = ROCArray(ones(Float32, 1))
+    RB = ROCArray(zeros(Float32, 1))
+    hc = HostCallHolder(Float32, Tuple{Float32}; continuous=true) do arg1
         arg1 + 1f0
     end
 
-    wait(@roc kernel(RA, RB, hc))
-    wait(@roc kernel(RA, RB, hc))
+    @roc kernel(RA, RB, hc)
+    AMDGPU.synchronize(; blocking=false)
+
+    @roc kernel(RA, RB, hc)
+    AMDGPU.synchronize(; blocking=false)
+
+    # Next time HC will be called from the kernel is its last time.
+    # So that it shutdowns correctly and does not stick to the end.
+    AMDGPU.Device.finish!(hc)
 
     @test Array(RB)[1] == 5f0
+
+    # Give HostCall task time to exit.
+    sleep(2)
+    @test istaskdone(hc)
 end
 
 end
diff --git a/test/device/indexing.jl b/test/device/indexing.jl
index 2a4b175b8..982c39fdd 100644
--- a/test/device/indexing.jl
+++ b/test/device/indexing.jl
@@ -13,7 +13,7 @@
 
     A = zeros(Int64, 6)
     RA = ROCArray(A)
-    wait(@roc groupsize=(1,2,3) gridsize=(4,5,6) idx_kern(RA))
+    @roc groupsize=(1,2,3) gridsize=(4,5,6) idx_kern(RA)
     A = Array(RA)
     @test all(A .> 0)
 
@@ -35,7 +35,9 @@
 
     A = zeros(Int64, 9)
     RA = ROCArray(A)
-    wait(@roc groupsize=(1,2,3) gridsize=(4,4,6) dim_kern(RA))
+    groupsize = (1, 2, 3)
+    gridsize = (4, 4, 6)
+    @roc groupsize=groupsize gridsize=gridsize dim_kern(RA)
     A = Array(RA)
-    @test A == [1,2,3,4,4,6,4,2,2]
+    @test A == [groupsize..., (groupsize .* gridsize)..., gridsize...]
 end
diff --git a/test/device/launch.jl b/test/device/launch.jl
index 77543a91b..415d42cf5 100644
--- a/test/device/launch.jl
+++ b/test/device/launch.jl
@@ -1,9 +1,6 @@
 @testset "Launch Options" begin
     kernel() = nothing
 
-    device = AMDGPU.default_device()
-    queue = AMDGPU.queue(device)
-
     # Group/grid size selection and aliases
     for (groupsize,gridsize) in (
         (1,1),
@@ -19,53 +16,32 @@
         ((1,1,1),2),
         (1,(1024,1,1)),
     )
-        eval(:(wait(@roc groupsize=$groupsize $kernel())))
-        eval(:(wait(@roc groupsize=$groupsize gridsize=$gridsize $kernel())))
-        eval(:(wait(@roc gridsize=$gridsize $kernel())))
-
-        threads = groupsize
-        blocks = gridsize .÷ groupsize
-        eval(:(wait(@roc threads=$threads $kernel())))
-        eval(:(wait(@roc blocks=$blocks $kernel())))
-        eval(:(wait(@roc threads=$threads blocks=$blocks $kernel())))
+        @roc groupsize=groupsize kernel()
+        @roc groupsize=groupsize gridsize=gridsize kernel()
+        @roc gridsize=gridsize kernel()
     end
 
-    # Device/queue selection and aliases
-    # FIXME: Test that device/queue are used!
-    eval(:(wait(@roc device=$device $kernel())))
-    eval(:(wait(@roc device=$device queue=$queue $kernel())))
-    eval(:(wait(@roc queue=$queue $kernel())))
-    eval(:(wait(@roc stream=$queue $kernel())))
+    stream = AMDGPU.stream()
+    @roc stream=stream kernel()
+    AMDGPU.synchronize()
 
-    # Non-default queue
-    queue2 = ROCQueue()
-    sig = @roc queue=queue2 kernel()
-    @test sig.queue === queue2
+    # Non-default stream
+    stream2 = HIPStream()
+    @roc stream=stream2 kernel()
+    AMDGPU.synchronize(stream2)
 
     # Group size validity
-    @test_throws ArgumentError eval(:(wait(@roc groupsize=0 $kernel())))
-    eval(:(wait(@roc groupsize=1024 $kernel())))
-    @test_throws ArgumentError eval(:(wait(@roc groupsize=1025 $kernel())))
-    @test_throws ArgumentError eval(:(wait(@roc groupsize=(1024,2) $kernel())))
-    @test_throws ArgumentError eval(:(wait(@roc groupsize=(512,2,2) $kernel())))
+    @test_throws AMDGPU.HIP.HIPError @roc groupsize=0 kernel()
+    @test_throws AMDGPU.HIP.HIPError @roc groupsize=1025 kernel()
+    @test_throws AMDGPU.HIP.HIPError @roc groupsize=(1024, 2) kernel()
+    @test_throws AMDGPU.HIP.HIPError @roc groupsize=(512, 2, 2) kernel()
 
     # No-launch
-    kersig = eval(:(@roc launch=true $kernel()))
-    @test isa(kersig, AMDGPU.ROCKernelSignal)
-    wait(kersig)
-
-    host_kernel = eval(:(@roc launch=false $kernel()))
-    @test isa(host_kernel, Runtime.HostKernel)
-
+    host_kernel = @roc launch=false kernel()
+    @test isa(host_kernel, Runtime.HIPKernel)
     @test_throws Exception eval(:(@roc launch=1 $kernel())) # TODO: ArgumentError
 end
 
-@testset "No-argument kernel" begin
-    kernel() = nothing
-
-    wait(@roc kernel())
-end
-
 @testset "Kernel argument alignment" begin
     function kernel(x, y)
         if Int64(x) != y
@@ -75,7 +51,8 @@ end
     end
     x = rand(UInt32)
     y = Int64(x)
-    wait(@roc kernel(x, y))
+    @roc kernel(x, y)
+    AMDGPU.synchronize()
 end
 
 @testset "Function/Argument Conversion" begin
@@ -89,81 +66,53 @@ end
             @roc kernel(f)
         end
 
-        a = [1.]
-        a_dev = ROCArray(a)
-
-        outer(a_dev, 2.)
-
-        @test Array(a_dev) == [2.]
+        a_dev = ROCArray([1.0])
+        outer(a_dev, 2.0)
+        @test Array(a_dev) == [2.0]
     end
 end
 
-@testset "Signal waiting" begin
-    sig = @roc identity(nothing)
-    wait(sig)
-    wait(sig.signal)
-    wait(sig.signal.signal)
-    @test sig.queue === AMDGPU.queue()
-end
-
-@testset "Custom signal" begin
-    sig = ROCSignal()
-    sig2 = @roc signal=sig identity(nothing)
-    @test sig2.signal == sig
-    wait(sig)
-    wait(sig2)
-end
-
 if length(AMDGPU.devices()) > 1
     @testset "Multi-GPU" begin
-        # HSA will throw if the compiler and launch use different devices
-        a1, a2 = AMDGPU.devices()[1:2]
-        wait(@roc device=a1 identity(nothing))
-        wait(@roc device=a2 identity(nothing))
+        dev = AMDGPU.device()
+
+        AMDGPU.device!(AMDGPU.devices()[2])
+        @roc identity(nothing)
+
+        AMDGPU.device!(dev)
+        @roc identity(nothing)
     end
 else
     @warn "Only 1 GPU detected; skipping multi-GPU tests"
-    @test_broken "Multi-GPU"
+    @test_skip "Multi-GPU"
 end
 
 @testset "Launch Configuration" begin
-    kern = @roc launch=false identity(nothing)
+    function f(x)
+        x[1] = 1
+        return
+    end
+    x = ROCArray([1])
+    kern = @roc launch=false f(x)
     occ = AMDGPU.launch_configuration(kern)
     @test occ isa NamedTuple
-    @test haskey(occ, :groupsize)
+    @test haskey(occ, :groupsize) && haskey(occ, :gridsize)
     # This kernel has no occupancy constraints
     @test occ.groupsize == AMDGPU.Device._max_group_size
 
-    @testset "Automatic groupsize selection" begin
-        function groupsize_kernel(A)
-            A[1] = workgroupDim().x
-            nothing
-        end
-        A = AMDGPU.ones(Int, 1)
-        kern = @roc launch=false groupsize_kernel(A)
-        # Verify first that there are no occupancy constraints
-        @test AMDGPU.launch_configuration(kern).groupsize == AMDGPU.Device._max_group_size
-        # Then check that this value was used
-        wait(@roc groupsize=:auto groupsize_kernel(A))
-        @test Array(A)[1] == AMDGPU.Device._max_group_size
-    end
-
-    @testset "Function redefinition" begin
-        RX = ROCArray(rand(Float32, 1))
-        function f(X)
-            Y = @ROCStaticLocalArray(Float32, 1)
-            X[1] = Y[1]
-            return
-        end
-        occ1 = AMDGPU.Compiler.calculate_occupancy(@roc launch=false f(RX))
-        function f(X)
-            Y = @ROCStaticLocalArray(Float32, 1024)
-            X[1] = Y[1]
-            return
-        end
-        occ2 = AMDGPU.Compiler.calculate_occupancy(@roc launch=false f(RX))
-        @test occ1 != occ2
-    end
+    # TODO
+    # @testset "Automatic groupsize selection" begin
+    #     function groupsize_kernel(A)
+    #         A[1] = workgroupDim().x
+    #         nothing
+    #     end
+    #     A = AMDGPU.ones(Int, 1)
+    #     kern = @roc launch=false groupsize_kernel(A)
+    #     # Verify first that there are no occupancy constraints
+    #     @test AMDGPU.launch_configuration(kern).groupsize == AMDGPU.Device._max_group_size
+    #     @roc groupsize=:auto groupsize_kernel(A)
+    #     @test Array(A)[1] == AMDGPU.Device._max_group_size
+    # end
 
     @testset "Local memory" begin
         function f(X)
@@ -172,16 +121,11 @@ end
             unsafe_store!(Y.ptr, unsafe_load(X.ptr))
             return
         end
+
         RX = ROCArray(rand(Float32, 1))
-        @testset "Static" begin
-            occ = AMDGPU.Compiler.calculate_occupancy(@roc launch=false f(RX))
-            @test occ.LDS_size == sizeof(Float32) * 16
-        end
-        @testset "Dynamic" begin
-            # Test that localmem is properly accounted for
-            occ1 = AMDGPU.Compiler.calculate_occupancy(@roc launch=false f(RX))
-            occ2 = AMDGPU.Compiler.calculate_occupancy(@roc launch=false f(RX); localmem=65536÷2)
-            @test occ1 != occ2
-        end
+        # Test that localmem is properly accounted for
+        occ1 = AMDGPU.launch_configuration(@roc launch=false f(RX))
+        occ2 = AMDGPU.launch_configuration(@roc launch=false f(RX); shmem=65536 ÷ 2)
+        @test occ1 != occ2
     end
 end
diff --git a/test/device/math.jl b/test/device/math.jl
index 1961c827a..bd7962b20 100644
--- a/test/device/math.jl
+++ b/test/device/math.jl
@@ -1,8 +1,6 @@
 using Base.FastMath
 
 @testset "Math Intrinsics" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
     for T in (Float16, Float32, Float64)
         a = rand(T, 16) .* T(42)
         d_a = ROCArray(a)
@@ -11,8 +9,9 @@ using Base.FastMath
             b = map(f, a)
             d_b = map(f, d_a)
             for out_idx in 1:length(f(a[1]))
-                @test all(sc->(sc[1][out_idx] ≈ sc[2][out_idx]),
-                          zip(b, Array(d_b)))
+                @test all(
+                    sc -> (sc[1][out_idx] ≈ sc[2][out_idx]),
+                    zip(b, Array(d_b)))
             end
         end
     end
diff --git a/test/device/memory.jl b/test/device/memory.jl
index 8a23eac1c..8be356c19 100644
--- a/test/device/memory.jl
+++ b/test/device/memory.jl
@@ -24,7 +24,7 @@
         RB = ROCArray(zeros(Float32, 8))
         RC = ROCArray(ones(Float32, 8))
 
-        wait(@roc groupsize=8 memory_static_kernel(RA, RB, RC))
+        @roc groupsize=8 memory_static_kernel(RA, RB, RC)
         @test Array(RA) ≈ Array(RB)
         # Test zero-initialization
         @test all(iszero, Array(RC))
@@ -49,7 +49,8 @@
             RA = ROCArray(A)
             RC = ROCArray(ones(Float32, N))
 
-            wait(@roc localmem=N*sizeof(Float32) dynamic_localmem_kernel(RA, RC))
+            shmem = N * sizeof(Float32)
+            @roc shmem=shmem dynamic_localmem_kernel(RA, RC)
 
             @test Array(RA) ≈ A .+ 1f0
             # Test zero-initialization
@@ -58,43 +59,40 @@
     end
 end
 
-@testset "Memory: Dynamic" begin
-    function malloc_kernel(X)
-        ptr = AMDGPU.Device.malloc(Csize_t(4))
-        X[1] = reinterpret(UInt64, ptr)
-        AMDGPU.Device.free(ptr)
-        nothing
-    end
+# TODO
+# @testset "Memory: Dynamic" begin
+#     function malloc_kernel(X)
+#         ptr = AMDGPU.Device.malloc(Csize_t(4))
+#         X[1] = reinterpret(UInt64, ptr)
+#         AMDGPU.Device.free(ptr)
+#         nothing
+#     end
 
-    RA = ROCArray(zeros(UInt64, 1))
-    wait(@roc malloc_kernel(RA))
-    @test Array(RA)[1] != 0
-end
+#     RA = ROCArray(zeros(UInt64, 1))
+#     @roc malloc_kernel(RA)
+#     @test Array(RA)[1] != 0
+# end
 
 @testset "Memcpy/Memset" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    function memcpy_kernel(X,Y)
-        AMDGPU.Device.memcpy!(Y.ptr, X.ptr, sizeof(Float32)*length(X))
+    function memcpy_kernel(X, Y)
+        AMDGPU.Device.memcpy!(Y.ptr, X.ptr, sizeof(Float32) * length(X))
         nothing
     end
 
     A = rand(Float32, 4)
     B = zeros(Float32, 4)
-    RA, RB = ROCArray.((A,B))
-
-    wait(@roc memcpy_kernel(RA,RB))
+    RA, RB = ROCArray.((A, B))
+    @roc memcpy_kernel(RA, RB)
     @test A == collect(RA) == collect(RB)
 
-    function memset_kernel(X,y)
-        AMDGPU.Device.memset!(X.ptr, y, div(length(X),2))
+    function memset_kernel(X, y)
+        AMDGPU.Device.memset!(X.ptr, y, length(X) ÷ 2)
         nothing
     end
 
     A = zeros(UInt8, 4)
     RA = ROCArray(A)
-    wait(@roc memset_kernel(RA,0x3))
-
+    @roc memset_kernel(RA, 0x3)
     @test all(collect(RA)[1:2] .== 0x3)
     @test all(collect(RA)[3:4] .== 0x0)
 end
diff --git a/test/device/output.jl b/test/device/output.jl
index bb09d1061..c353a109d 100644
--- a/test/device/output.jl
+++ b/test/device/output.jl
@@ -1,232 +1,237 @@
-import .Device: OutputContext
-
 @testset "@rocprintln" begin
-
-@testset "Plain, no newline" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    kernel(oc) = @rocprint oc "Hello World!"
-
-    iob = IOBuffer()
-    oc = OutputContext(iob)
-    wait(@roc kernel(oc))
-    @test String(take!(iob)) == "Hello World!"
-end
-
-@testset "Plain, with newline" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    kernel(oc) = @rocprintln oc "Hello World!"
-
-    iob = IOBuffer()
-    oc = OutputContext(iob)
-    wait(@roc kernel(oc))
-    @test String(take!(iob)) == "Hello World!\n"
-end
-
-@testset "Plain, multiple calls" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    function kernel(oc)
-        @rocprint oc "Hello World!"
-        @rocprintln oc "Goodbye World!"
+    @testset "Plain, no newline" begin
+        kernel() = @rocprint "Hello World!"
+
+        _, msg = @grab_output begin
+            @roc kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == "Hello World!"
     end
 
-    iob = IOBuffer()
-    oc = OutputContext(iob)
-    wait(@roc kernel(oc))
-    @test String(take!(iob)) == "Hello World!Goodbye World!\n"
-end
-
-@testset "Plain, global context" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
+    @testset "Plain, with newline" begin
+        kernel() = @rocprintln "Hello World!"
 
-    function kernel()
-        @rocprint "Hello World!"
-        @rocprintln "Goodbye World!"
+        _, msg = @grab_output begin
+            @roc kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == "Hello World!\n"
     end
 
-    _, msg = @grab_output wait(@roc kernel())
-    @test msg == "Hello World!Goodbye World!\n"
-end
-
-#= TODO
-@testset "Interpolated string" begin
-    inner_str = "to the"
-    function kernel(oc)
-        @rocprintln oc "Hello $inner_str World!"
-        nothing
+    @testset "Plain, multiple calls" begin
+        function kernel()
+            @rocprint "Hello World!"
+            @rocprintln "Goodbye World!"
+        end
+
+        _, msg = @grab_output begin
+            @roc kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == "Hello World!Goodbye World!\n"
     end
 
-    iob = IOBuffer()
-    oc = OutputContext(iob)
-    @roc kernel(oc)
-    sleep(1)
-    @test String(take!(iob)) == "Hello to the World!\n"
-end
-=#
-
+    #= TODO
+    @testset "Interpolated string" begin
+        inner_str = "to the"
+        function kernel(oc)
+            @rocprintln oc "Hello $inner_str World!"
+            nothing
+        end
+
+        iob = IOBuffer()
+        oc = OutputContext(iob)
+        @roc kernel(oc)
+        sleep(1)
+        @test String(take!(iob)) == "Hello to the World!\n"
+    end
+    =#
 end
 
 @testset "@rocprintf" begin
-
-@testset "Plain" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    kernel() = @rocprintf "Hello World!\n"
-
-    _, msg = @grab_output wait(@roc kernel())
-    @test msg == "Hello World!\n"
-end
-
-@testset "Integer argument" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    kernel(x) = @rocprintf "Value: %d\n" x
-
-    _, msg = @grab_output wait(@roc kernel(42))
-    @test msg == "Value: 42\n"
-end
-
-@testset "Multiple arguments" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    function kernel(x)
-        y = 0.123401
-        @rocprintf "Value: %d | %.4f\n" x y
+    @testset "Plain" begin
+        kernel() = @rocprintf "Hello World!\n"
+
+        _, msg = @grab_output begin
+            @roc kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == "Hello World!\n"
     end
 
-    _, msg = @grab_output wait(@roc kernel(42))
-    @test msg == "Value: 42 | 0.1234\n"
-end
-
-@testset "Per-lane" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    kernel() = @rocprintf :lane "[%d] " workitemIdx().x
-
-    # One group, one wavefront
-    exp = reduce(*, ["[$i] " for i in 1:8])
-    _, msg = @grab_output wait(@roc groupsize=8 kernel())
-    @test msg == exp
-
-    # One group, multiple wavefronts
-    exp = reduce(*, ["[$i] " for i in 1:128])
-    _, msg = @grab_output wait(@roc groupsize=128 kernel())
-    @test msg == exp
-
-    # Multiple groups, one wavefront each
-    exp = reduce(*, ["[$i] " for i in vcat(1:64, 1:64, 1:64, 1:64)])
-    _, msg = @grab_output wait(@roc groupsize=64 gridsize=256 kernel())
-    @test msg == exp
+    @testset "Integer argument" begin
+        kernel(x) = @rocprintf "Value: %d\n" x
 
-    # Multiple groups, multiple wavefronts each
-    exp = reduce(*, ["[$i] " for i in vcat(1:128, 1:128)])
-    _, msg = @grab_output wait(@roc groupsize=128 gridsize=256 kernel())
-    @test msg == exp
-end
-
-@testset "Per-wavefront" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    kernel() = @rocprintf :wave "[%d] " workitemIdx().x
-    wsize::Int64 = AMDGPU.wavefrontsize(ROCDevice())
-
-    # One group, one wavefront
-    exp = "[1] "
-    _, msg = @grab_output wait(@roc groupsize=1 kernel())
-    @test msg == exp
-
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    # One group, multiple wavefronts
-    groupsize = 128
-    exp = reduce(*, ["[$i] " for i in collect(1:wsize:groupsize)])
-    _, msg = @grab_output wait(@roc groupsize=groupsize kernel())
-    @test msg == exp
-
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    # Multiple groups, one wavefront each
-    gridsize = 256
-    exp = repeat("[1] ", gridsize ÷ wsize)
-    _, msg = @grab_output(wait(@roc groupsize=wsize gridsize=gridsize kernel()))
-    @test msg == exp
-
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    # Multiple groups, multiple wavefronts each
-    groupsize = 128
-    n_groups = gridsize ÷ groupsize
-    exp = repeat(
-        reduce(*, ["[$i] " for i in collect(1:wsize:groupsize)]),
-        n_groups)
-    _, msg = @grab_output(wait(@roc groupsize=128 gridsize=256 kernel()))
-    @test msg == exp
-end
-
-@testset "Per-workgroup" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    kernel() = @rocprintf :group "[%d] " workitemIdx().x
-
-    # One group, one wavefront
-    exp = "[1] "
-    _, msg = @grab_output wait(@roc groupsize=8 kernel())
-    @test msg == exp
-
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    # One group, multiple wavefronts
-    exp = "[1] "
-    _, msg = @grab_output wait(@roc groupsize=128 kernel())
-    @test msg == exp
-
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    # Multiple groups, one wavefront each
-    exp = reduce(*, ["[$i] " for i in [1, 1, 1, 1]])
-    _, msg = @grab_output wait(@roc groupsize=64 gridsize=256 kernel())
-    @test msg == exp
-
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    # Multiple groups, multiple wavefronts each
-    exp = reduce(*, ["[$i] " for i in [1, 1]])
-    _, msg = @grab_output wait(@roc groupsize=128 gridsize=256 kernel())
-    @test msg == exp
-end
-
-@testset "Per-grid" begin
-    kernel() = @rocprintf :grid "[%d] " workitemIdx().x
-
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    # One group, one wavefront
-    exp = "[1] "
-    _, msg = @grab_output wait(@roc groupsize=8 kernel())
-    @test msg == exp
-
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    # One group, multiple wavefronts
-    exp = "[1] "
-    _, msg = @grab_output wait(@roc groupsize=128 kernel())
-    @test msg == exp
+        _, msg = @grab_output begin
+            @roc kernel(42)
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == "Value: 42\n"
+    end
 
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
+    @testset "Multiple arguments" begin
+        function kernel(x)
+            y = 0.123401
+            @rocprintf "Value: %d | %.4f\n" x y
+        end
+
+        _, msg = @grab_output begin
+            @roc kernel(42)
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == "Value: 42 | 0.1234\n"
+    end
 
-    # Multiple groups, one wavefront each
-    exp = "[1] "
-    _, msg = @grab_output wait(@roc groupsize=64 gridsize=256 kernel())
-    @test msg == exp
+    @testset "Per-lane" begin
+        kernel() = @rocprintf :lane "[%d] " workitemIdx().x
+
+        # One group, one wavefront
+        exp = reduce(*, ["[$i] " for i in 1:8])
+        _, msg = @grab_output begin
+            @roc groupsize=8 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # One group, multiple wavefronts
+        exp = reduce(*, ["[$i] " for i in 1:128])
+        _, msg = @grab_output begin
+            @roc groupsize=128 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # Multiple groups, one wavefront each
+        exp = reduce(*, ["[$i] " for i in vcat(1:64, 1:64, 1:64, 1:64)])
+        _, msg = @grab_output begin
+            @roc groupsize=64 gridsize=4 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # Multiple groups, multiple wavefronts each
+        exp = reduce(*, ["[$i] " for i in vcat(1:128, 1:128)])
+        _, msg = @grab_output begin
+            @roc groupsize=128 gridsize=2 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+    end
 
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
+    @testset "Per-wavefront" begin
+        kernel() = @rocprintf :wave "[%d] " workitemIdx().x
+        hsa_dev = AMDGPU.Runtime.hsa_device(AMDGPU.device())
+        wsize::Int = AMDGPU.Runtime.device_wavefront_size(hsa_dev)
+
+        # One group, one wavefront
+        exp = "[1] "
+        _, msg = @grab_output begin
+            @roc groupsize=1 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # One group, multiple wavefronts
+        groupsize = 128
+        exp = reduce(*, ["[$i] " for i in collect(1:wsize:groupsize)])
+        _, msg = @grab_output begin
+            @roc groupsize=groupsize kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # Multiple groups, one wavefront each
+        gridsize = 256 ÷ wsize
+        exp = repeat("[1] ", 256 ÷ wsize)
+        _, msg = @grab_output begin
+            @roc groupsize=wsize gridsize=gridsize kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # Multiple groups, multiple wavefronts each
+        groupsize = 128
+        n_groups = 256 ÷ groupsize
+        exp = repeat(
+            reduce(*, ["[$i] " for i in collect(1:wsize:groupsize)]),
+            n_groups)
+        _, msg = @grab_output begin
+            @roc groupsize=128 gridsize=2 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+    end
 
-    # Multiple groups, multiple wavefronts each
-    exp = "[1] "
-    _, msg = @grab_output wait(@roc groupsize=128 gridsize=256 kernel())
-    @test msg == exp
-end
+    @testset "Per-workgroup" begin
+        kernel() = @rocprintf :group "[%d] " workitemIdx().x
+
+        # One group, one wavefront
+        exp = "[1] "
+        _, msg = @grab_output begin
+            @roc groupsize=8 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # One group, multiple wavefronts
+        exp = "[1] "
+        _, msg = @grab_output begin
+            @roc groupsize=128 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # Multiple groups, one wavefront each
+        exp = reduce(*, ["[$i] " for i in [1, 1, 1, 1]])
+        _, msg = @grab_output begin
+            @roc groupsize=64 gridsize=4 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # Multiple groups, multiple wavefronts each
+        exp = reduce(*, ["[$i] " for i in [1, 1]])
+        _, msg = @grab_output begin
+            @roc groupsize=128 gridsize=2 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+    end
 
+    @testset "Per-grid" begin
+        kernel() = @rocprintf :grid "[%d] " workitemIdx().x
+
+        # One group, one wavefront
+        exp = "[1] "
+        _, msg = @grab_output begin
+            @roc groupsize=8 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # One group, multiple wavefronts
+        exp = "[1] "
+        _, msg = @grab_output begin
+            @roc groupsize=128 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # Multiple groups, one wavefront each
+        exp = "[1] "
+        _, msg = @grab_output begin
+            @roc groupsize=64 gridsize=4 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+
+        # Multiple groups, multiple wavefronts each
+        exp = "[1] "
+        _, msg = @grab_output begin
+            @roc groupsize=128 gridsize=2 kernel()
+            AMDGPU.synchronize(; blocking=false)
+        end
+        @test msg == exp
+    end
 end
diff --git a/test/device/queries.jl b/test/device/queries.jl
deleted file mode 100644
index e73e1c92a..000000000
--- a/test/device/queries.jl
+++ /dev/null
@@ -1,21 +0,0 @@
-@testset "Active kernels" begin
-    AMDGPU.reset_dead_queue!() # Reset queue in case of signal timeout.
-
-    function kernel(sig)
-        hostcall!(sig)
-        nothing
-    end
-
-    wait_ev = Base.Event()
-    hc = HostCall(Nothing, Tuple{}) do
-        wait(wait_ev)
-    end
-
-    sig = @roc kernel(hc)
-    @test sig in AMDGPU.active_kernels()
-    @test (@atomic sig.active)
-
-    notify(wait_ev)
-    wait(sig)
-    @test !(sig in AMDGPU.active_kernels())
-end
diff --git a/test/device/vadd.jl b/test/device/vadd.jl
index 8a32baa75..390616a1d 100644
--- a/test/device/vadd.jl
+++ b/test/device/vadd.jl
@@ -16,7 +16,7 @@
     d_c = similar(d_a)
     len = prod(dims)
 
-    wait(@roc groupsize=len vadd(d_a, d_b, d_c))
+    @roc groupsize=len vadd(d_a, d_b, d_c)
     c = Array(d_c)
     @test a+b ≈ c
 end
diff --git a/test/device/wavefront.jl b/test/device/wavefront.jl
index 628f20bed..c33c041a8 100644
--- a/test/device/wavefront.jl
+++ b/test/device/wavefront.jl
@@ -1,5 +1,6 @@
 @testset "Wavefront Operations" begin
-    wavefrontsize = AMDGPU.wavefrontsize(AMDGPU.default_device())
+    hsa_dev = AMDGPU.Runtime.hsa_device(AMDGPU.device())
+    wavefrontsize = AMDGPU.Runtime.device_wavefront_size(hsa_dev)
 
     function reduce_kernel(op,X,Y)
         idx = workitemIdx().x
@@ -23,42 +24,46 @@
         X = rand(T(1):T(100), wavefrontsize)
         for op in (Base.:+, max, min, Base.:&, Base.:|, Base.:⊻)
             RX, RY = ROCArray(X), ROCArray(zeros(T,1))
-            wait(@roc groupsize=wavefrontsize reduce_kernel(op,RX,RY))
+            @roc groupsize=wavefrontsize reduce_kernel(op,RX,RY)
             @test Array(RY)[1] == reduce(op,X)
 
             RX, RY = ROCArray(X), ROCArray(zeros(T,wavefrontsize))
-            wait(@roc groupsize=wavefrontsize scan_kernel(op,RX,RY))
+            @roc groupsize=wavefrontsize scan_kernel(op,RX,RY)
             @test Array(RY) == accumulate(op,X)
         end
     end
+
     for T in (Float16, Float32, Float64)
         X = rand(T, wavefrontsize)
         for op in (Base.:+, max, min)
             RX, RY = ROCArray(X), ROCArray(zeros(T,1))
-            wait(@roc groupsize=wavefrontsize reduce_kernel(op,RX,RY))
+            @roc groupsize=wavefrontsize reduce_kernel(op,RX,RY)
             @test Array(RY)[1] ≈ reduce(op,X)
 
             RX, RY = ROCArray(X), ROCArray(zeros(T,wavefrontsize))
-            wait(@roc groupsize=wavefrontsize scan_kernel(op,RX,RY))
+            @roc groupsize=wavefrontsize scan_kernel(op,RX,RY)
             @test Array(RY) ≈ accumulate(op,X)
         end
     end
-    for X in (rand(Cint(0):Cint(1), wavefrontsize),
-              zeros(Cint, wavefrontsize),
-              ones(Cint, wavefrontsize),
-              )
+
+    for X in (
+        rand(Cint(0):Cint(1), wavefrontsize),
+        zeros(Cint, wavefrontsize),
+        ones(Cint, wavefrontsize),
+    )
         RX, RY = ROCArray(X), ROCArray(zeros(Bool,3))
-        wait(@roc groupsize=wavefrontsize bool_kernel(RX,RY))
+        @roc groupsize=wavefrontsize bool_kernel(RX,RY)
         Y = Array(RY)
-        @test_skip Y[1] == all(x->x==1,X)
+
+        @test_skip Y[1] == all(x -> x == 1, X)
         @test_skip Y[2] == any(x->x==1,X)
         @test_skip Y[3] == (length(unique(X)) == 1)
     end
 end
 
 @testset "Wavefront Information" begin
-    wavefrontsize = AMDGPU.wavefrontsize(AMDGPU.default_device())
-
+    hsa_dev = AMDGPU.Runtime.hsa_device(AMDGPU.device())
+    wavefrontsize = AMDGPU.Runtime.device_wavefront_size(hsa_dev)
     @test wavefrontsize == 32 || wavefrontsize == 64
 
     function kernel(X)
@@ -66,6 +71,7 @@ end
         nothing
     end
     RX = ROCArray(zeros(UInt32, 1))
-    wait(@roc kernel(RX))
+    @roc kernel(RX)
+    AMDGPU.synchronize()
     @allowscalar @test RX[1] == wavefrontsize
 end
diff --git a/test/dnn/pool.jl b/test/dnn/pool.jl
index 7265a9682..a9b2a8551 100644
--- a/test/dnn/pool.jl
+++ b/test/dnn/pool.jl
@@ -12,7 +12,6 @@
         yd, workspace = MIOpen.maxpool(xd; pkwargs...)
         yd, workspace = MIOpen.maxpool!(yd, xd; pkwargs...)
         @test Array(yd) ≈ y
-        wh1 = AMDGPU.Runtime.Mem.download(UInt8, workspace.data, workspace.data.bytesize)
         @test Array(yd) ≈ y
 
         dy = ones(Float32, size(y))
@@ -21,13 +20,9 @@
         dx = NNlib.∇maxpool(dy, y, x, pdims)
         dxd = MIOpen.∇maxpool(dyd, yd, xd; workspace, pkwargs...)
         @test Array(dxd) ≈ dx
-        wh2 = AMDGPU.Runtime.Mem.download(UInt8, workspace.data, workspace.data.bytesize)
-        @test wh1 ≈ wh2 # Check that workspace was not modified.
 
         dxd = MIOpen.∇maxpool!(dxd, dyd, yd, xd; workspace, pkwargs...)
         @test Array(dxd) ≈ dx
-        wh3 = AMDGPU.Runtime.Mem.download(UInt8, workspace.data, workspace.data.bytesize)
-        @test wh1 ≈ wh3 # Check that workspace was not modified.
 
         # Mean pooling.
 
diff --git a/test/dnn/softmax.jl b/test/dnn/softmax.jl
index c4cfe5817..559cbf6a3 100644
--- a/test/dnn/softmax.jl
+++ b/test/dnn/softmax.jl
@@ -3,6 +3,7 @@
         for (sz, dims) in [
             ((5,), :), ((5,), 1),
             ((5, 5), :), ((5, 5), 1), ((5, 5), 2),
+            ((5, 5, 5), (1, 2)), ((5, 5, 5), (1, 3)),
             ((5, 5, 5, 5), (2, 3)), ((5, 5, 5, 5), (2, 4)),
         ]
             if T == Float16
diff --git a/test/external/forwarddiff.jl b/test/external/forwarddiff.jl
index c4de55d73..1be23821c 100644
--- a/test/external/forwarddiff.jl
+++ b/test/external/forwarddiff.jl
@@ -9,7 +9,8 @@ function test_derivative(f, x::T) where T
         a[] = ForwardDiff.derivative(f, x)
         return
     end
-    wait(@roc kernel(buf, x))
+    @roc kernel(buf, x)
+    AMDGPU.synchronize()
     return AMDGPU.@allowscalar buf[]
 end
 
diff --git a/test/hsa/device.jl b/test/hsa/device.jl
index ce7e5ab67..b68e156ef 100644
--- a/test/hsa/device.jl
+++ b/test/hsa/device.jl
@@ -1,20 +1,17 @@
 @testset "Devices" begin
     @testset "Device IDs" begin
-        for kind in (:cpu, :gpu)
-            devices = AMDGPU.devices()
-            for (idx,device) in enumerate(devices)
-                @test AMDGPU.device_id(device) == idx
-            end
+        devices = AMDGPU.devices()
+        for (idx,device) in enumerate(devices)
+            @test AMDGPU.device_id(device) == idx
         end
     end
 
     @testset "Default selection" begin
         device = AMDGPU.default_device()
         @test device !== nothing
-        @test AMDGPU.device_type(device) == :gpu
-        @test ROCDevice().agent == device.agent
+        @test device == AMDGPU.device()
 
-        device_name = Runtime.name(device)
+        device_name = HIP.name(device)
         @test length(device_name) > 0
         @test !occursin('\0', device_name)
 
@@ -23,9 +20,11 @@
                 init_device = AMDGPU.default_device()
                 init_device_id = AMDGPU.default_device_id()
                 @test init_device_id == 1
+
                 AMDGPU.default_device_id!(2)
                 @test AMDGPU.default_device_id() == 2
                 @test AMDGPU.default_device() != init_device
+
                 AMDGPU.default_device_id!(1)
                 @test AMDGPU.default_device_id() == 1
                 @test AMDGPU.default_device() == init_device
@@ -40,7 +39,7 @@
 
     @testset "ISAs" begin
         device = AMDGPU.default_device()
-        device_isa = string(AMDGPU.default_isa_architecture(device))
+        device_isa, features = AMDGPU.default_isa(device).arch_features
         @test length(device_isa) > 0
         @test occursin("gfx", device_isa)
     end
diff --git a/test/hsa/error.jl b/test/hsa/error.jl
deleted file mode 100644
index 923866312..000000000
--- a/test/hsa/error.jl
+++ /dev/null
@@ -1,11 +0,0 @@
-@testset "HSA Status Error" begin
-    errorcode = AMDGPU.HSAError(HSA.STATUS_SUCCESS)
-    @test Runtime.description(errorcode) == "HSA_STATUS_SUCCESS: The function has been executed successfully."
-end
-
-if !IS_NAVI_2
-    @testset "HSA Async Queue Error" begin
-        kernel() = (Device.trap(); nothing)
-        @test_throws Runtime.QueueError wait(@roc kernel())
-    end
-end
diff --git a/test/hsa/getinfo.jl b/test/hsa/getinfo.jl
index dc1026ec9..7e5fa7c2e 100644
--- a/test/hsa/getinfo.jl
+++ b/test/hsa/getinfo.jl
@@ -1,39 +1,15 @@
 @testset "getinfo queries" begin
     @testset "ROCDevice" begin
         device = AMDGPU.default_device()
-        @test AMDGPU.Runtime.name(device) isa String
-        @test AMDGPU.Runtime.device_type(device) isa AMDGPU.HSA.DeviceType
-        @test AMDGPU.Runtime.device_wavefront_size(device) isa UInt32
+        hsa_dev = AMDGPU.Runtime.hsa_device(device)
+        @test AMDGPU.Runtime.name(hsa_dev) isa String
+        @test AMDGPU.Runtime.device_type(hsa_dev) isa AMDGPU.HSA.DeviceType
+        @test AMDGPU.Runtime.device_wavefront_size(hsa_dev) isa UInt32
     end
+
     @testset "HSA.ISA" begin
         device = AMDGPU.default_device()
-        device_isa = AMDGPU.default_isa(device)
+        device_isa = AMDGPU.default_isa(device).hsa_isa
         @test AMDGPU.Runtime.isa_workgroup_max_size(device_isa) isa UInt32
     end
-    @testset "ROCMemoryRegion" begin
-        device = AMDGPU.default_device()
-        region = first(AMDGPU.Runtime.regions(device))
-        @test AMDGPU.Runtime.region_segment(region) isa AMDGPU.HSA.RegionSegment
-        @test AMDGPU.Runtime.region_runtime_alloc_allowed(region) isa Bool
-        @test AMDGPU.Runtime.region_runtime_alloc_granule(region) isa Csize_t
-    end
-    @testset "ROCMemoryPool" begin
-        device = AMDGPU.default_device()
-        pool = first(AMDGPU.Runtime.memory_pools(device))
-        @test AMDGPU.Runtime.pool_segment(pool) isa AMDGPU.HSA.AMDSegment
-        @test AMDGPU.Runtime.pool_size(pool) isa Csize_t
-        @test AMDGPU.Runtime.pool_accessible_by_all(pool) isa Bool
-    end
-    @testset "HSA.ExecutableSymbol" begin
-        device = AMDGPU.default_device()
-        kernel = @roc launch=false identity(nothing)
-        exe = kernel.mod.exe
-        sym_name = kernel.fun.entry
-        exec_sym = AMDGPU.Runtime.executable_symbol_any(exe, device)
-        if exec_sym === nothing
-            exec_sym = AMDGPU.Runtime.executable_symbol_by_name(exe, device, sym_name)
-        end
-        @test AMDGPU.Runtime.executable_symbol_name(exec_sym) isa String
-        @test AMDGPU.Runtime.executable_symbol_kernel_private_segment_size(exec_sym) isa UInt32
-    end
 end
diff --git a/test/hsa/hashing.jl b/test/hsa/hashing.jl
deleted file mode 100644
index fc155a44b..000000000
--- a/test/hsa/hashing.jl
+++ /dev/null
@@ -1,100 +0,0 @@
-@testset "Kernel Argument Hashing" begin
-    khash = AMDGPU.Runtime.khash
-
-    @testset "Primitives" begin
-        x = UInt8(1)
-        y = UInt8(2)
-
-        @test khash(x) == khash(x)
-        @test khash(y) == khash(y)
-        @test khash(x) != khash(y)
-
-        @test khash(x, UInt(1)) == khash(x, UInt(1))
-        @test khash(y, UInt(1)) == khash(y, UInt(1))
-        @test khash(x, UInt(1)) != khash(y, UInt(1))
-
-        @test khash(x, UInt(1)) != khash(x, UInt(2))
-        @test khash(y, UInt(1)) != khash(y, UInt(2))
-        @test khash(x, UInt(1)) != khash(y, UInt(2))
-
-        for T in [UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64]
-            xc = convert(T, x)
-            yc = convert(T, y)
-            @test khash(xc) == khash(xc)
-            @test khash(yc) == khash(yc)
-            @test khash(xc) != khash(yc)
-
-            @test khash(xc, UInt(1)) == khash(xc, UInt(1))
-            @test khash(yc, UInt(1)) == khash(yc, UInt(1))
-            @test khash(xc, UInt(1)) != khash(yc, UInt(1))
-
-            @test khash(xc, UInt(1)) != khash(xc, UInt(2))
-            @test khash(yc, UInt(1)) != khash(yc, UInt(2))
-            @test khash(xc, UInt(1)) != khash(yc, UInt(2))
-        end
-    end
-
-    @testset "Tuples" begin
-        z1 = (UInt8(1), UInt8(2), UInt8(3))
-        z2 = (UInt8(1), UInt8(2), UInt8(4))
-        z3 = (UInt8(1), UInt8(2), UInt16(3))
-
-        @test khash(z1) == khash(z1)
-        @test khash(z1, UInt(1)) == khash(z1, UInt(1))
-        @test khash(z1, UInt(1)) != khash(z1, UInt(2))
-
-        @test khash(z1) != khash(z2)
-        @test khash(z1, UInt(1)) != khash(z2, UInt(1))
-
-        @test khash(z1) != khash(z3)
-        @test khash(z1, UInt(1)) != khash(z3, UInt(1))
-    end
-
-    @testset "Functions" begin
-        @test khash(+) == khash(+)
-        @test khash(+, UInt(1)) == khash(+, UInt(1))
-        @test khash(+, UInt(1)) != khash(+, UInt(2))
-        @test khash(+) != khash(-) != khash(/) != khash(identity)
-
-        x = 1
-        f() = x
-        @test khash(f) == khash(f)
-        @test khash(f, UInt(1)) == khash(f, UInt(1))
-
-        g() = x
-        @test khash(f) != khash(g)
-        @test khash(f, UInt(1)) != khash(g, UInt(1))
-    end
-
-    @testset "ROCDeviceArray" begin
-        RA = ROCArray(rand(4))
-        DA = rocconvert(RA)
-
-        @test khash(DA) == khash(DA)
-        @test khash(DA, UInt(1)) == khash(DA, UInt(1))
-        @test khash(DA, UInt(1)) != khash(DA, UInt(2))
-
-        A_hash = khash(DA)
-        RA .= RA .+ 1
-        @test khash(DA) == A_hash
-
-        RB = copy(RA)
-        DB = rocconvert(RB)
-
-        @test khash(RA) != khash(RB)
-    end
-
-    @testset "ROCDeviceArray wrappers" begin
-        RC = ROCArray(rand(4, 4))
-        DC = rocconvert(RC)
-        RCv = @view RC[2:3, 2:3]
-        DCv = rocconvert(RCv)
-
-        @test khash(DC) != khash(DCv)
-        @test khash(DC, UInt(1)) != khash(DCv, UInt(1))
-
-        @test khash(DCv) == khash(DCv)
-        @test khash(DCv, UInt(1)) == khash(DCv, UInt(1))
-        @test khash(DCv, UInt(1)) != khash(DCv, UInt(2))
-    end
-end
diff --git a/test/hsa/memory.jl b/test/hsa/memory.jl
deleted file mode 100644
index d600c68e1..000000000
--- a/test/hsa/memory.jl
+++ /dev/null
@@ -1,278 +0,0 @@
-@testset "Memory" begin
-
-@testset "Pointer-based" begin
-    @testset "Mem transfers" begin
-        src = 42
-
-        buf1 = Mem.alloc(sizeof(src); coherent=true)
-
-        Mem.set!(buf1, UInt32(57), 1)
-        x = Mem.download(UInt32, buf1)
-        @test x[1] == UInt32(57)
-
-        GC.@preserve Mem.upload!(buf1, pointer_from_objref(Ref(src)), sizeof(src))
-
-        dst1 = Ref(0)
-        GC.@preserve Mem.download!(pointer_from_objref(dst1), buf1, sizeof(src))
-        @test src == dst1[]
-
-        buf2 = Mem.alloc(sizeof(src))
-
-        Mem.transfer!(buf2, buf1, sizeof(src))
-
-        dst2 = Ref(0)
-        GC.@preserve Mem.download!(pointer_from_objref(dst2), buf2, sizeof(src))
-        @test src == dst2[]
-
-        Mem.free(buf2)
-        Mem.free(buf1)
-    end
-
-    @testset "Unsafe copy h2d and d2h" begin
-        nx,ny,nz = 7,6,5
-
-        P = zeros(nx, ny, nz)
-        P .= [iz*1e2 + iy*1e1 + ix for ix=1:size(P,1), iy=1:size(P,2), iz=1:size(P,3)]
-        P = ROCArray(P)
-        P2 = AMDGPU.zeros(eltype(P),size(P))
-        ranges = [1:size(P,1), 1:size(P,2), 1:size(P,3)]
-
-        # init buffers
-        buf = zeros(size(P))
-        dbuf = AMDGPU.zeros(eltype(P),size(P))
-
-        # lock host pointer and convert it to eltype(buf)
-        buf_Ptr = convert(Ptr{eltype(buf)}, AMDGPU.Mem.lock(pointer(buf), sizeof(buf), AMDGPU.default_device()))
-
-        # 1. test device to host
-        P_Ptr = convert(Ptr{eltype(buf)}, pointer(P))
-        signal1 = ROCSignal()
-        Mem.unsafe_copy3d!(
-            buf_Ptr, P_Ptr, length(ranges[1]), length(ranges[2]), length(ranges[3]);
-            dstPitch=sizeof(eltype(buf))*length(ranges[1]), dstSlice=sizeof(eltype(buf))*length(ranges[1])*length(ranges[2]),
-            srcPos=(ranges[1][1], ranges[2][1], ranges[3][1]), srcPitch=sizeof(eltype(buf))*size(P,1), srcSlice=sizeof(eltype(buf))*size(P,1)*size(P,2),
-            async=true, signal=signal1
-        )
-        wait(signal1)
-        @test all(buf[:] .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-
-        # 2. test host to device
-        P2_Ptr  = convert(Ptr{eltype(buf)}, pointer(P2))
-        signal2 = ROCSignal()
-        Mem.unsafe_copy3d!(
-            P2_Ptr, buf_Ptr, length(ranges[1]), length(ranges[2]), length(ranges[3]);
-            dstPos=(ranges[1][1], ranges[2][1], ranges[3][1]), dstPitch=sizeof(eltype(buf))*size(P,1), dstSlice=sizeof(eltype(buf))*size(P,1)*size(P,2),
-            srcPitch=sizeof(eltype(buf))*length(ranges[1]), srcSlice=sizeof(eltype(buf))*length(ranges[1])*length(ranges[2]),
-            async=true, signal=signal2
-        )
-        wait(signal2)
-        @test all(buf[:] .== Array(P2[ranges[1],ranges[2],ranges[3]][:]))
-
-        # unlock host pointer
-        Mem.unlock(pointer(buf))
-
-        # 3. test device to device
-        dbuf_Ptr = convert(Ptr{eltype(dbuf)}, pointer(dbuf))
-        signal3 = ROCSignal()
-        Mem.unsafe_copy3d!(
-            dbuf_Ptr, P_Ptr, length(ranges[1]), length(ranges[2]), length(ranges[3]);
-            dstPitch=sizeof(eltype(dbuf))*length(ranges[1]), dstSlice=sizeof(eltype(dbuf))*length(ranges[1])*length(ranges[2]),
-            srcPos=(ranges[1][1], ranges[2][1], ranges[3][1]), srcPitch=sizeof(eltype(dbuf))*size(P,1), srcSlice=sizeof(eltype(dbuf))*size(P,1)*size(P,2),
-            async=true, signal=signal3
-        )
-        wait(signal3)
-        @test all(Array(dbuf[:]) .== Array(P[ranges[1],ranges[2],ranges[3]][:]))
-    end
-end
-
-@testset "Array-based" begin
-    src = [42]
-
-    buf1 = Mem.alloc(src)
-
-    Mem.upload!(buf1, src)
-
-    dst1 = similar(src)
-    Mem.download!(dst1, buf1)
-    @test src == dst1
-
-    buf2 = Mem.upload(src)
-
-    dst2 = similar(src)
-    Mem.download!(dst2, buf2)
-    @test src == dst2
-
-    Mem.free(buf1)
-end
-
-@testset "Type-based" begin
-    buf = Mem.alloc(Int)
-
-    # there's no type-based upload, duh
-    src = [42]
-    Mem.upload!(buf, src)
-
-    dst = Mem.download(eltype(src), buf)
-    @test src == dst
-end
-
-@testset "Pointer information" begin
-    default_device = AMDGPU.default_device()
-
-    N = 1024
-    a = rand(N)
-    b = Mem.alloc(default_device, N)
-
-    ptrinfo_host = Mem.pointerinfo(a)
-    ptrinfo_hsa = Mem.pointerinfo(b)
-
-    @test ptrinfo_host.type == HSA.EXT_POINTER_TYPE_UNKNOWN
-    @test ptrinfo_hsa.type == HSA.EXT_POINTER_TYPE_HSA
-    @test_skip ptrinfo_hsa.agentOwner.handle == default_device.agent.handle
-
-    Mem.free(b)
-end
-
-@testset "Page-locked memory (OS allocations)" begin
-    a = rand(1024)
-    plocked = Mem.lock(a)
-
-    # NOTE - For a single device, it seems that plocked == pointer(a)
-    @test Mem.pointerinfo(pointer(a)).type == HSA.EXT_POINTER_TYPE_LOCKED
-    @test Mem.pointerinfo(plocked).type == HSA.EXT_POINTER_TYPE_LOCKED
-    @test Mem.pointerinfo(plocked).sizeInBytes == sizeof(a)
-
-    Mem.unlock(a)
-    @test Mem.pointerinfo(pointer(a)).type == HSA.EXT_POINTER_TYPE_UNKNOWN
-    @test Mem.pointerinfo(plocked).type == HSA.EXT_POINTER_TYPE_UNKNOWN
-end
-
-@testset "Memory Region Queries" begin
-    @testset "Region API Queries" begin
-        for (idx, device) in enumerate(AMDGPU.devices())
-            regions = Runtime.regions(device)
-            regions_global = filter(r->Runtime.region_segment(r) == HSA.REGION_SEGMENT_GLOBAL, regions)
-            regions_global_coarse_nohost = filter(r->(Runtime.region_global_flags(r) & HSA.REGION_GLOBAL_FLAG_COARSE_GRAINED > 0) &&
-                                                     !Runtime.region_host_accessible(r), regions)
-            regions_group = filter(r->Runtime.region_segment(r) == HSA.REGION_SEGMENT_GROUP, regions)
-            regions_finegrained = filter(r->Runtime.region_global_flags(r) & HSA.REGION_GLOBAL_FLAG_FINE_GRAINED > 0, regions)
-            regions_kernarg = filter(r->Runtime.region_global_flags(r) & HSA.REGION_GLOBAL_FLAG_KERNARG > 0, regions)
-
-            @test length(regions_global) > 0
-            if idx == 1
-                @test length(regions_global_coarse_nohost) >= 1
-                @test length(regions_group) == 1
-            else
-                # BUG: https://github.com/RadeonOpenCompute/ROCR-Runtime/issues/134
-                @test length(regions_global_coarse_nohost) == 0
-                @test length(regions_group) == 0
-            end
-            @test length(regions_finegrained) > 0
-            @test length(regions_kernarg) > 0
-
-            @test all(r->Runtime.region_size(r) > 0, regions)
-
-            @test all(Runtime.region_runtime_alloc_allowed, regions_global)
-            @test all(r->Runtime.region_runtime_alloc_granule(r) > 0, regions_global)
-            @test all(r->Runtime.region_runtime_alloc_alignment(r) > 0, regions_global)
-            @test all(r->Runtime.region_alloc_max_size(r) > 0, regions_global)
-
-            @test !any(Runtime.region_runtime_alloc_allowed, regions_group)
-            @test !any(Runtime.region_host_accessible, regions_group)
-
-            @test all(Runtime.region_runtime_alloc_allowed, regions_finegrained)
-            @test all(Runtime.region_host_accessible, regions_finegrained)
-
-            @test all(Runtime.region_runtime_alloc_allowed, regions_kernarg)
-            @test all(Runtime.region_host_accessible, regions_kernarg)
-
-            for region in filter(Runtime.region_runtime_alloc_allowed, regions)
-                buf = Mem.alloc(device, region, 8)
-                @test buf.ptr != C_NULL
-                @test !buf.pool_alloc
-                Mem.free(buf)
-            end
-        end
-    end
-    @testset "Memory Pool API Queries" begin
-        for device in AMDGPU.devices()
-            pools = Runtime.memory_pools(device)
-            pools_global = filter(r->Runtime.pool_segment(r) == HSA.AMD_SEGMENT_GLOBAL, pools)
-            pools_group = filter(r->Runtime.pool_segment(r) == HSA.AMD_SEGMENT_GROUP, pools)
-
-            @test length(pools_global) >= 1
-            @test length(pools_group) == 1
-
-            @test all(r->Runtime.pool_size(r) > 0, pools)
-            @test !any(Runtime.pool_accessible_by_all, pools)
-
-            @test all(Runtime.pool_runtime_alloc_allowed, pools_global)
-            @test all(r->Runtime.pool_runtime_alloc_granule(r) > 0, pools_global)
-            @test all(r->Runtime.pool_runtime_alloc_alignment(r) > 0, pools_global)
-            @test all(r->Runtime.pool_alloc_max_size(r) > 0, pools_global)
-
-            @test !any(Runtime.pool_runtime_alloc_allowed, pools_group)
-
-            for pool in filter(Runtime.pool_runtime_alloc_allowed, pools)
-                buf = Mem.alloc(device, pool, 8)
-                @test buf.ptr != C_NULL
-                @test buf.pool_alloc
-                Mem.free(buf)
-            end
-        end
-    end
-end
-
-@testset "Exceptions" begin
-    @test_throws ArgumentError Mem.alloc(Function, 1)   # abstract
-    @test_throws ArgumentError Mem.alloc(Array{Int}, 1) # UnionAll
-    @test_throws ArgumentError Mem.alloc(Integer, 1)    # abstract
-    # TODO: can we test for the third case?
-    #       !abstract && leaftype seems to imply UnionAll nowadays...
-
-    # zero-width allocations should be permitted
-    null = Mem.alloc(Int, 0)
-    Mem.free(null)
-
-    # double-free should throw
-    x = Mem.alloc(1)
-    Mem.free(x)
-    # FIXME: Segfaults... @test_throws HSAError Mem.free(x)
-end
-
-@testset "Mutable structs" begin
-    @eval mutable struct MutablePtrFree
-        foo::Int
-        bar::Int
-    end
-    buf = Mem.alloc(MutablePtrFree)
-    Mem.upload!(buf, [MutablePtrFree(0,0)])
-    Mem.free(buf)
-
-    @eval mutable struct MutableNonPtrFree
-        foo::Int
-        bar::String
-    end
-    @test_throws ArgumentError Mem.alloc(MutableNonPtrFree)
-end
-
-@testset "Retry" begin
-    device = AMDGPU.default_device()
-    finegrained_region = Runtime.get_region(device, :finegrained)
-    coarsegrained_pool = Runtime.get_memory_pool(device, :coarsegrained)
-    finegrained_max = Runtime.region_size(finegrained_region)
-    coarsegrained_max = Runtime.pool_size(coarsegrained_pool)
-
-    if coarsegrained_max < finegrained_max
-        @testset "Coherent Fallback" begin
-            # This will still work because we fallback to coherent allocations
-            A = ROCVector{UInt8}(undef, Int(coarsegrained_max+8))
-            @test A.buf.coherent
-            A = nothing
-        end
-    else
-        @test_skip "Coherent Fallback"
-    end
-end
-
-end
diff --git a/test/hsa/queue.jl b/test/hsa/queue.jl
deleted file mode 100644
index 55a990a8d..000000000
--- a/test/hsa/queue.jl
+++ /dev/null
@@ -1,9 +0,0 @@
-@testset "Queues" begin
-    @testset "Priorities" begin
-        # Test that priorities can be set
-        for priority in (:low, :normal, :high)
-            ROCQueue(; priority)
-        end
-        @test_throws ArgumentError ROCQueue(; priority=:fake)
-    end
-end
diff --git a/test/rocarray/base.jl b/test/rocarray/base.jl
index 1861e712c..d07f1f1ac 100644
--- a/test/rocarray/base.jl
+++ b/test/rocarray/base.jl
@@ -58,39 +58,40 @@ end
     end
 end
 
-@testset "unsafe_wrap" begin
-    A = rand(4, 3)
-    A_orig = copy(A)
-    RA = Base.unsafe_wrap(ROCArray, pointer(A), size(A))
-    @test RA.buf.device == AMDGPU.default_device()
-    @test RA isa ROCArray{Float64,2}
-
-    # GPU pointer works
-    RA .+= 1.0
-
-    # Host pointer is updated
-    @test A ≈ A_orig .+ 1.0
-
-    # Base.show
-    @test (println(devnull, RA); true)
-
-    # Mem.download!
-    B = zeros(4, 3)
-    copyto!(B, RA)
-    @test B ≈ Array(RA)
-
-    # Mem.upload!
-    C = rand(4, 3)
-    copyto!(RA, C)
-    @test Array(RA) ≈ C
-
-    # Mem.transfer!
-    D = rand(4, 3)
-    D_orig = copy(D)
-    RD = Base.unsafe_wrap(ROCArray, pointer(D), size(D))
-    copyto!(RD, RA)
-    @test Array(RD) ≈ Array(RA) ≈ C
-end
+# FIXME
+# @testset "unsafe_wrap" begin
+#     A = rand(4, 3)
+#     A_orig = copy(A)
+#     RA = Base.unsafe_wrap(ROCArray, pointer(A), size(A))
+#     @test RA.buf.device == AMDGPU.default_device()
+#     @test RA isa ROCArray{Float64,2}
+
+#     # GPU pointer works
+#     RA .+= 1.0
+
+#     # Host pointer is updated
+#     @test A ≈ A_orig .+ 1.0
+
+#     # Base.show
+#     @test (println(devnull, RA); true)
+
+#     # Mem.download!
+#     B = zeros(4, 3)
+#     copyto!(B, RA)
+#     @test B ≈ Array(RA)
+
+#     # Mem.upload!
+#     C = rand(4, 3)
+#     copyto!(RA, C)
+#     @test Array(RA) ≈ C
+
+#     # Mem.transfer!
+#     D = rand(4, 3)
+#     D_orig = copy(D)
+#     RD = Base.unsafe_wrap(ROCArray, pointer(D), size(D))
+#     copyto!(RD, RA)
+#     @test Array(RD) ≈ Array(RA) ≈ C
+# end
 
 @testset "unsafe_free" begin
     A = AMDGPU.ones(4, 3)
@@ -111,7 +112,7 @@ end
         A = AMDGPU.ones(16)
         @test refcount_live(A) == (1, true)
         B = f(A)
-        @test A.buf.base_ptr == B.buf.base_ptr
+        @test A.buf.ptr == B.buf.ptr
         @test refcount_live(A) == refcount_live(B)
         @test refcount_live(B) == (2-switch, true)
         finalize(B)
diff --git a/test/rocarray/broadcast.jl b/test/rocarray/broadcast.jl
index 2abd13471..2c6b21070 100644
--- a/test/rocarray/broadcast.jl
+++ b/test/rocarray/broadcast.jl
@@ -1,41 +1,41 @@
 @testset "broadcast" begin
-  @test testf((x)       -> fill!(x, 1),  rand(3,3))
-  @test testf((x, y)    -> map(+, x, y), rand(2, 3), rand(2, 3))
-  @test testf((x)       -> sin.(x),      rand(2, 3))
-  @test testf((x)       -> log.(x) .+ 1, rand(2, 3))
-  @test testf((x)       -> 2x,           rand(2, 3))
-  @test testf((x)       -> x .^ 0,      rand(2, 3))
-  @test testf((x)       -> x .^ 1,      rand(2, 3))
-  @test testf((x)       -> x .^ 2,      rand(2, 3))
-  @test testf((x)       -> x .^ 3,      rand(2, 3))
-  @test testf((x)       -> x .^ 5,      rand(2, 3))
-  @test testf((x)       -> (z = Int32(5); x .^ z),      rand(2, 3))
-  @test testf((x)       -> (z = Float64(π); x .^ z),      rand(2, 3))
-  @test testf((x)       -> (z = Float32(π); x .^ z),      rand(Float32, 2, 3))
-  @test testf((x, y)    -> x .+ y,       rand(2, 3), rand(1, 3))
-  @test testf((z, x, y) -> z .= x .+ y,  rand(2, 3), rand(2, 3), rand(2))
-  @test (ROCArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == ROCArray([C_NULL])
-  @test ROCArray([1,2,3]) .+ ROCArray([1.0,2.0,3.0]) == ROCArray([2,4,6])
+    @test testf((x)       -> fill!(x, 1),  rand(3,3))
+    @test testf((x, y)    -> map(+, x, y), rand(2, 3), rand(2, 3))
+    @test testf((x)       -> sin.(x),      rand(2, 3))
+    @test testf((x)       -> log.(x) .+ 1, rand(2, 3))
+    @test testf((x)       -> 2x,           rand(2, 3))
+    @test testf((x)       -> x .^ 0,      rand(2, 3))
+    @test testf((x)       -> x .^ 1,      rand(2, 3))
+    @test testf((x)       -> x .^ 2,      rand(2, 3))
+    @test testf((x)       -> x .^ 3,      rand(2, 3))
+    @test testf((x)       -> x .^ 5,      rand(2, 3))
+    @test testf((x)       -> (z = Int32(5); x .^ z),      rand(2, 3))
+    @test testf((x)       -> (z = Float64(π); x .^ z),      rand(2, 3))
+    @test testf((x)       -> (z = Float32(π); x .^ z),      rand(Float32, 2, 3))
+    @test testf((x, y)    -> x .+ y,       rand(2, 3), rand(1, 3))
+    @test testf((z, x, y) -> z .= x .+ y,  rand(2, 3), rand(2, 3), rand(2))
+    @test (ROCArray{Ptr{Cvoid}}(undef, 1) .= C_NULL) == ROCArray([C_NULL])
+    @test ROCArray([1,2,3]) .+ ROCArray([1.0,2.0,3.0]) == ROCArray([2,4,6])
 
-  @eval struct Whatever{T}
-      x::Int
-  end
-  @test Array(Whatever{Int}.(ROCArray([1]))) == Whatever{Int}.([1])
+    @eval struct Whatever{T}
+        x::Int
+    end
+    @test Array(Whatever{Int}.(ROCArray([1]))) == Whatever{Int}.([1])
 end
 
 # https://github.com/JuliaGPU/CUDA.jl/issues/223
 @testset "Ref Broadcast" begin
-  foobar(idx, A) = A[idx]
-  @test ROCArray([42]) == foobar.(ROCArray([1]), Base.RefValue(ROCArray([42])))
+    foobar(idx, A) = A[idx]
+    @test ROCArray([42]) == foobar.(ROCArray([1]), Base.RefValue(ROCArray([42])))
 end
 
 @testset "Broadcast Fix" begin
-  @test testf(x -> log.(x), rand(3,3))
-  @test testf((x,xs) -> log.(x.+xs), Ref(1), rand(3,3))
+    @test testf(x -> log.(x), rand(3,3))
+    @test testf((x,xs) -> log.(x.+xs), Ref(1), rand(3,3))
 end
 
 # https://github.com/JuliaGPU/CUDA.jl/issues/261
 @testset "Broadcast Ref{<:Type}" begin
-  A = ROCArray{ComplexF64}(undef, (2,2))
-  @test eltype(convert.(ComplexF32, A)) == ComplexF32
+    A = ROCArray{ComplexF64}(undef, (2,2))
+    @test eltype(convert.(ComplexF32, A)) == ComplexF32
 end
diff --git a/test/rocarray/nmf.jl b/test/rocarray/nmf.jl
deleted file mode 100644
index a9d37a9b9..000000000
--- a/test/rocarray/nmf.jl
+++ /dev/null
@@ -1,31 +0,0 @@
-@testset "NMF" begin
-
-## A simple NMF implementation, which is useful to test mark/wait
-
-function step(X, W, H)
-    # H update
-    H = (H .* (W' * (X ./ (W * H)))
-         ./ (sum(W; dims=1))')
-    # W update
-    W = (W .* ((X ./ (W * H)) * (H'))
-         ./ (sum(H; dims=2)'))
-    # error estimate
-    X - W * H
-end
-
-for scale in (1:5:50)
-    ncol = 2001
-    nrow = 1002*scale
-    nfeatures = 12
-    X = rand(Float32, nrow, ncol)
-    W = rand(Float32, nrow, nfeatures)
-    H = rand(Float32, nfeatures, ncol)
-    cpu_res = step(X, W, H)
-    RX = ROCArray(X)
-    RW = ROCArray(W)
-    RH = ROCArray(H)
-    gpu_res = step(RX, RW, RH)
-    @test Array(gpu_res) ≈ cpu_res
-end
-
-end
diff --git a/test/runtests.jl b/test/runtests.jl
index 802f966a3..5ec43d1b6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -6,11 +6,13 @@ include("setup.jl")
 @testset "AMDGPU" begin
 
 # Run tests in parallel
-np = Threads.nthreads()
+
+# FIXME
+# HostCall tests hang with multiple workers.
+np = 1 # Threads.nthreads()
 ws = Int[]
 ws_pids = Int[]
 if np == 1
-    include("setup.jl")
     push!(ws, 1)
     push!(ws_pids, getpid())
 else
@@ -44,46 +46,38 @@ tasks = Dict{Int,String}()
 @info "Testing using device $(AMDGPU.default_device())"
 AMDGPU.versioninfo()
 
-@info "Running tests with $(length(ws)) workers"
+@info "Running tests with $(length(ws)) workers and $(Threads.nthreads()) threads."
 
 push!(tests, "HSA" => ()->begin
-    include("hsa/error.jl")
     include("hsa/utils.jl")
     include("hsa/getinfo.jl")
     include("hsa/device.jl")
-    include("hsa/queue.jl")
-    include("hsa/memory.jl")
-    include("hsa/hashing.jl")
 end)
 push!(tests, "Codegen" => ()->begin
     include("codegen/synchronization.jl")
     include("codegen/trap.jl")
 end)
-if AMDGPU.Runtime.LOGGING_STATIC_ENABLED
-    push!(tests, "Logging" => ()->include("logging.jl"))
-else
-    @warn """
-    Logging is statically disabled, skipping logging tests.
-    This can be fixed by calling `AMDGPU.Runtime.enable_logging!()` and re-running tests.
-    """
-    @test_skip "Logging"
-end
+# if AMDGPU.Runtime.LOGGING_STATIC_ENABLED
+#     push!(tests, "Logging" => ()->include("logging.jl"))
+# else
+#     @warn """
+#     Logging is statically disabled, skipping logging tests.
+#     This can be fixed by calling `AMDGPU.Runtime.enable_logging!()` and re-running tests.
+#     """
+#     @test_skip "Logging"
+# end
 push!(tests, "Device Functions" => ()->begin
     include("device/launch.jl")
     include("device/array.jl")
     include("device/vadd.jl")
     include("device/memory.jl")
     include("device/indexing.jl")
-    include("device/hostcall.jl")
-    include("device/output.jl")
-    include("device/globals.jl")
     include("device/math.jl")
     include("device/wavefront.jl")
     include("device/execution_control.jl")
     include("device/exceptions.jl")
-    # FIXME segfaults in a weird way (on check_ir)
-    # include("device/deps.jl")
-    include("device/queries.jl")
+    include("device/hostcall.jl")
+    include("device/output.jl")
 end)
 push!(tests, "Multitasking" => ()->include("tls.jl"))
 push!(tests, "ROCArray - Base" => ()->include("rocarray/base.jl"))
@@ -92,7 +86,7 @@ if CI
     push!(tests, "ROCm libraries are functional" => ()->begin
         @test AMDGPU.functional(:rocblas)
         @test AMDGPU.functional(:rocrand)
-        if !AMDGPU.use_artifacts
+        if !AMDGPU.use_artifacts()
             # We don't have artifacts for these
             @test AMDGPU.functional(:rocfft)
         end
@@ -112,20 +106,14 @@ push!(tests, "rocRAND" => ()->begin
         @test_skip "rocRAND"
     end
 end)
-push!(tests, "rocFFT" => ()->begin
-    if AMDGPU.functional(:rocfft)
-        include("rocarray/fft.jl")
-    else
-        @test_skip "rocFFT"
-    end
-end)
-push!(tests, "NMF" => ()->begin
-    if AMDGPU.functional(:rocblas)
-        include("rocarray/nmf.jl")
-    else
-        @test_skip "NMF"
-    end
-end)
+# # FIXME outdated library
+# push!(tests, "rocFFT" => ()->begin
+#     if AMDGPU.functional(:rocfft)
+#         include("rocarray/fft.jl")
+#     else
+#         @test_skip "rocFFT"
+#     end
+# end)
 push!(tests, "MIOpen" => ()->begin
     if AMDGPU.functional(:MIOpen)
         include("dnn/miopen.jl")
@@ -134,14 +122,23 @@ push!(tests, "MIOpen" => ()->begin
     end
 end)
 push!(tests, "External Packages" => ()->include("external/forwarddiff.jl"))
-for (i, name) in enumerate(keys(TestSuite.tests))
-    push!(tests, "GPUArrays TestSuite - $name" =>
-        ()->TestSuite.tests[name](ROCArray))
+for (i, name) in enumerate(sort(collect(keys(TestSuite.tests))))
+    push!(tests, "GPUArrays TestSuite - $name" => () -> begin
+        TestSuite.tests[name](ROCArray)
+        # Multidimensional indexing contains boxing,
+        # launching global malloc hostcall.
+        # Synchronize to disable it.
+        if name == "indexing multidimensional"
+            AMDGPU.synchronize(; blocking=false)
+        end
+    end)
 end
-push!(tests, "KernelAbstractions" => ()->begin
+push!(tests, "KernelAbstractions" => ()-> begin
     Testsuite.testsuite(
         ROCBackend, "ROCM", AMDGPU, ROCArray, AMDGPU.ROCDeviceArray;
-        skip_tests=Set(["sparse"]))
+        skip_tests=Set(["Printing", "sparse"])) # TODO fix KA printing
+    # Disable global malloc hostcall started by conversion tests.
+    AMDGPU.synchronize(; blocking=false)
 end)
 
 function run_worker(w)
diff --git a/test/setup.jl b/test/setup.jl
index 89e49b212..be69fbaea 100644
--- a/test/setup.jl
+++ b/test/setup.jl
@@ -1,8 +1,5 @@
 using AMDGPU
-using AMDGPU: Runtime, Mem, Device, HSA, AS
-if AMDGPU.functional(:hip)
-    using AMDGPU: HIP
-end
+using AMDGPU: Runtime, Mem, Device, HIP, HSA, AS
 using GPUCompiler
 using LinearAlgebra
 using LLVM, LLVM.Interop
@@ -33,7 +30,7 @@ if isdefined(TestSuite, :WrapArray)
 end
 
 import AMDGPU: allowscalar, @allowscalar
-import AMDGPU.Device: HostCall, hostcall!
+import AMDGPU.Device: HostCallHolder, hostcall!
 allowscalar(false)
 
 CI = parse(Bool, get(ENV, "CI", "false"))
@@ -43,7 +40,4 @@ if CI
     AMDGPU.Runtime.EXIT_ON_MEMORY_FAULT[] = true
 end
 
-Runtime.DEFAULT_SIGNAL_TIMEOUT[] = 5.0
 Device.DEFAULT_HOSTCALL_TIMEOUT[] = 5.0
-
-const IS_NAVI_2 = AMDGPU.default_device().name in ("gfx1030", "gfx1031", "gfx1032")
diff --git a/test/tls.jl b/test/tls.jl
index c6d0f31d1..52af31d88 100644
--- a/test/tls.jl
+++ b/test/tls.jl
@@ -17,16 +17,13 @@ end
 
 @testset "Basics" begin
     device = @inferred AMDGPU.device()
-    @test device isa ROCDevice
+    @test device isa HIPDevice
     @test device === AMDGPU.Runtime.get_default_device()
 
     context = @inferred AMDGPU.context()
     @test context isa HIPContext
     @test AMDGPU.device_id(AMDGPU.device(context)) == AMDGPU.device_id(device)
 
-    queue = @inferred AMDGPU.queue()
-    @test queue isa ROCQueue
-
     stream = @inferred AMDGPU.stream()
     @test stream isa HIPStream
     @test AMDGPU.device_id(AMDGPU.device(context)) == AMDGPU.device_id(device)
@@ -34,96 +31,78 @@ end
     tls = @inferred AMDGPU.task_local_state()
     @test tls isa AMDGPU.TaskLocalState
     @test device === tls.device
-    @test queue === tls.queue
-    @test queue.priority == tls.priority
     @test stream === tls.stream
     @test stream.priority == tls.priority
     @test context === tls.context
 end
 
 if length(AMDGPU.devices()) > 1
-@testset "Devices" begin
-    dev1 = AMDGPU.devices()[1]
-    tls1 = copy(AMDGPU.task_local_state())
-    @assert tls1.device === dev1
-    dev2 = AMDGPU.devices()[2]
-    AMDGPU.device!(dev2)
-    tls2 = copy(AMDGPU.task_local_state())
-    AMDGPU.device!(dev1)
-    tls3 = copy(AMDGPU.task_local_state())
-
-    @test tls2.device === dev2
-    @test tls1.device !== tls2.device
-    @test tls1.context !== tls2.context
-    @test tls2.queue isa ROCQueue
-    @test tls1.queue !== tls2.queue
-    @test tls2.stream isa HIPStream
-    @test tls1.stream !== tls2.stream
-    @test AMDGPU.device(tls2.queue) == dev2
-    @test AMDGPU.device_id(AMDGPU.device(tls2.context)) == 2
-    @test AMDGPU.device_id(AMDGPU.device(tls2.stream)) == 2
-
-    @test tls3.device === dev1
-    @test tls1.device === tls3.device
-    @test tls1.context === tls3.context
-    @test tls3.queue isa ROCQueue
-    @test tls1.queue === tls3.queue
-    @test tls2.stream isa HIPStream
-    @test tls1.stream === tls3.stream
-    @test AMDGPU.device(tls3.queue) == dev1
-    @test AMDGPU.device_id(AMDGPU.device(tls3.context)) == 1
-    @test AMDGPU.device_id(AMDGPU.device(tls3.stream)) == 1
-end
+    @testset "Devices" begin
+        dev1 = AMDGPU.devices()[1]
+        tls1 = copy(AMDGPU.task_local_state())
+        @assert tls1.device === dev1
+        dev2 = AMDGPU.devices()[2]
+        AMDGPU.device!(dev2)
+        tls2 = copy(AMDGPU.task_local_state())
+        AMDGPU.device!(dev1)
+        tls3 = copy(AMDGPU.task_local_state())
+
+        @test tls2.device === dev2
+        @test tls1.device !== tls2.device
+        @test tls1.context !== tls2.context
+        @test tls2.stream isa HIPStream
+        @test tls1.stream !== tls2.stream
+        @test AMDGPU.device_id(AMDGPU.device(tls2.context)) == 2
+        @test AMDGPU.device_id(AMDGPU.device(tls2.stream)) == 2
+
+        @test tls3.device === dev1
+        @test tls1.device === tls3.device
+        @test tls1.context === tls3.context
+        @test tls2.stream isa HIPStream
+        @test tls1.stream === tls3.stream
+        @test AMDGPU.device_id(AMDGPU.device(tls3.context)) == 1
+        @test AMDGPU.device_id(AMDGPU.device(tls3.stream)) == 1
+    end
 else
-@test_skip "Devices"
+    @test_skip "Devices"
 end
 
-@testset "Queues/Streams" begin
+@testset "Streams" begin
     tls1 = copy(AMDGPU.task_local_state())
-    queue1 = AMDGPU.queue()
-    @test tls1.queue === queue1 === AMDGPU.queue()
     stream1 = AMDGPU.stream()
     @test tls1.stream === stream1 === AMDGPU.stream()
-    @test tls1.priority == queue1.priority == stream1.priority == :normal
+    @test tls1.priority == stream1.priority == :normal
 
     tls2 = async_tls()
     @test tls2.device === tls1.device
     @test tls2.context === tls1.context
-    @test tls2.queue !== tls1.queue
     @test tls2.stream !== tls1.stream
     @test tls2.priority == :normal
 
     tls3 = copy(AMDGPU.task_local_state())
-    @test tls3.queue === queue1 === AMDGPU.queue()
     @test tls3.stream === stream1 === AMDGPU.stream()
 
     @testset "Priorities" begin
         AMDGPU.priority!(:high)
         tlsh = copy(AMDGPU.task_local_state())
         @test tlsh.priority == :high
-        @test tlsh.queue !== tls1.queue
-        @test tlsh.queue.priority == :high
         @test tlsh.stream !== tls1.stream
         @test tlsh.stream.priority == :high
 
         AMDGPU.priority!(:low)
         tlsl = copy(AMDGPU.task_local_state())
         @test tlsl.priority == :low
-        @test tlsl.queue !== tls1.queue
-        @test tlsl.queue.priority == :low
         @test tlsl.stream !== tls1.stream
         @test tlsl.stream.priority == :low
 
         AMDGPU.priority!(:normal)
         tlsn = copy(AMDGPU.task_local_state())
         @test tlsn.priority == :normal
-        @test tlsn.queue.priority == :normal
         @test tlsn.stream.priority == :normal
 
         AMDGPU.priority!(:high)
         tlsn2 = async_tls()
         @test tlsn2.priority == :normal
-        @test tlsn2.queue.priority == :normal
         @test tlsn2.stream.priority == :normal
 
         AMDGPU.priority!(:normal)
@@ -131,7 +110,6 @@ end
             AMDGPU.priority!(:high)
         end
         @test tlsh2.priority == :high
-        @test tlsh2.queue.priority == :high
         @test tlsh2.stream.priority == :high
         @test AMDGPU.task_local_state().priority == :normal
     end