JuliaGPU · maleadt · Oct 17, 2024 · Apr 10, 2024
diff --git a/Project.toml b/Project.toml
@@ -30,7 +30,7 @@ oneAPI_Support_jll = "b049733a-a71d-5ed3-8eba-7d323ac00b36"
 Adapt = "4"
 CEnum = "0.4, 0.5"
 ExprTools = "0.1"
-GPUArrays = "10"
+GPUArrays = "11"
 GPUCompiler = "0.23, 0.24, 0.25, 0.26, 0.27, 1"
 KernelAbstractions = "0.9.1"
 LLVM = "6, 7, 8, 9"

diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl
@@ -1,70 +1,5 @@
 # GPUArrays.jl interface
 
-
-#
-# Device functionality
-#
-
-
-## execution
-
-struct oneArrayBackend <: AbstractGPUBackend end
-
-struct oneKernelContext <: AbstractKernelContext end
-
-@inline function GPUArrays.launch_heuristic(::oneArrayBackend, f::F, args::Vararg{Any,N};
-                                             elements::Int, elements_per_thread::Int) where {F,N}
-    kernel = @oneapi launch=false f(oneKernelContext(), args...)
-
-    items = launch_configuration(kernel)
-    # XXX: how many groups is a good number? the API doesn't tell us.
-    #      measured on a low-end IGP, 32 blocks seems like a good sweet spot.
-    #      note that this only matters for grid-stride kernels, like broadcast.
-    return (threads=items, blocks=32)
-end
-
-function GPUArrays.gpu_call(::oneArrayBackend, f, args, threads::Int, blocks::Int;
-                            name::Union{String,Nothing})
-    @oneapi items=threads groups=blocks name=name f(oneKernelContext(), args...)
-end
-
-
-## on-device
-
-# indexing
-
-GPUArrays.blockidx(ctx::oneKernelContext) = oneAPI.get_group_id(0)
-GPUArrays.blockdim(ctx::oneKernelContext) = oneAPI.get_local_size(0)
-GPUArrays.threadidx(ctx::oneKernelContext) = oneAPI.get_local_id(0)
-GPUArrays.griddim(ctx::oneKernelContext) = oneAPI.get_num_groups(0)
-
-# math
-
-@inline GPUArrays.cos(ctx::oneKernelContext, x) = oneAPI.cos(x)
-@inline GPUArrays.sin(ctx::oneKernelContext, x) = oneAPI.sin(x)
-@inline GPUArrays.sqrt(ctx::oneKernelContext, x) = oneAPI.sqrt(x)
-@inline GPUArrays.log(ctx::oneKernelContext, x) = oneAPI.log(x)
-
-# memory
-
-@inline function GPUArrays.LocalMemory(::oneKernelContext, ::Type{T}, ::Val{dims}, ::Val{id}
-                                      ) where {T, dims, id}
-    ptr = oneAPI.emit_localmemory(Val(id), T, Val(prod(dims)))
-    oneDeviceArray(dims, LLVMPtr{T, onePI.AS.Local}(ptr))
-end
-
-# synchronization
-
-@inline GPUArrays.synchronize_threads(::oneKernelContext) = oneAPI.barrier()
-
-
-
-#
-# Host abstractions
-#
-
-GPUArrays.backend(::Type{<:oneArray}) = oneArrayBackend()
-
 const GLOBAL_RNGs = Dict{ZeDevice,GPUArrays.RNG}()
 function GPUArrays.default_rng(::Type{<:oneArray})
     dev = device()