From 558e72e3d82702959031eb0de7e557c775baccdf Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jul 2022 21:23:11 -0500 Subject: [PATCH 01/11] abstract functionality of pyop2 types to be overloaded by backends --- pyop2/compilation.py | 2 +- pyop2/global_kernel.py | 60 ++++++++-------------------------------- pyop2/parloop.py | 63 ++++++++++++++---------------------------- pyop2/types/dat.py | 59 ++++++++++++++++++++++++--------------- pyop2/types/dataset.py | 6 ++++ pyop2/types/glob.py | 27 ++++++++++-------- pyop2/types/map.py | 3 +- pyop2/types/set.py | 20 ++++++++------ 8 files changed, 107 insertions(+), 133 deletions(-) diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 8fd7bf023..cee148c42 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -575,7 +575,7 @@ def load(jitmodule, extension, fn_name, cppargs=(), ldargs=(), :kwarg comm: Optional communicator to compile the code on (only rank 0 compiles code) (defaults to pyop2.mpi.COMM_WORLD). """ - from pyop2.global_kernel import GlobalKernel + from pyop2.global_kernel import AbstractGlobalKernel as GlobalKernel if isinstance(jitmodule, str): class StrCode(object): diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 0277ef773..fd2ad1409 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -1,20 +1,18 @@ import collections.abc import ctypes +import abc from dataclasses import dataclass import itertools -import os from typing import Optional, Tuple -import loopy as lp -from petsc4py import PETSc import numpy as np -from pyop2 import compilation, mpi +from pyop2 import mpi from pyop2.caching import Cached from pyop2.configuration import configuration from pyop2.datatypes import IntType, as_ctypes from pyop2.types import IterationRegion -from pyop2.utils import cached_property, get_petsc_dir +from pyop2.utils import cached_property # We set eq=False to force identity-based hashing. This is required for when @@ -226,7 +224,7 @@ def pack(self): return MatPack -class GlobalKernel(Cached): +class AbstractGlobalKernel(Cached, abc.ABC): """Class representing the generated code for the global computation. :param local_kernel: :class:`pyop2.LocalKernel` instance representing the @@ -348,48 +346,6 @@ def builder(self): builder.add_argument(arg) return builder - @cached_property - def code_to_compile(self): - """Return the C/C++ source code as a string.""" - from pyop2.codegen.rep2loopy import generate - - wrapper = generate(self.builder) - code = lp.generate_code_v2(wrapper) - - if self.local_kernel.cpp: - from loopy.codegen.result import process_preambles - preamble = "".join(process_preambles(getattr(code, "device_preambles", []))) - device_code = "\n\n".join(str(dp.ast) for dp in code.device_programs) - return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n" - return code.device_code() - - @PETSc.Log.EventDecorator() - @mpi.collective - def compile(self, comm): - """Compile the kernel. - - :arg comm: The communicator the compilation is collective over. - :returns: A ctypes function pointer for the compiled function. 
- """ - extension = "cpp" if self.local_kernel.cpp else "c" - cppargs = ( - tuple("-I%s/include" % d for d in get_petsc_dir()) - + tuple("-I%s" % d for d in self.local_kernel.include_dirs) - + ("-I%s" % os.path.abspath(os.path.dirname(__file__)),) - ) - ldargs = ( - tuple("-L%s/lib" % d for d in get_petsc_dir()) - + tuple("-Wl,-rpath,%s/lib" % d for d in get_petsc_dir()) - + ("-lpetsc", "-lm") - + tuple(self.local_kernel.ldargs) - ) - - return compilation.load(self, extension, self.name, - cppargs=cppargs, - ldargs=ldargs, - restype=ctypes.c_int, - comm=comm) - @cached_property def argtypes(self): """Return the ctypes datatypes of the compiled function.""" @@ -410,3 +366,11 @@ def num_flops(self, iterset): elif region not in {IterationRegion.TOP, IterationRegion.BOTTOM}: size = layers - 1 return size * self.local_kernel.num_flops + + @abc.abstractproperty + def code_to_compile(self): + """Return the C/C++ source code as a string.""" + + @abc.abstractmethod + def compile(self): + pass diff --git a/pyop2/parloop.py b/pyop2/parloop.py index ac78e6bda..df422f54c 100644 --- a/pyop2/parloop.py +++ b/pyop2/parloop.py @@ -13,7 +13,7 @@ from pyop2.datatypes import as_numpy_dtype from pyop2.exceptions import KernelTypeError, MapValueError, SetTypeError from pyop2.global_kernel import (GlobalKernelArg, DatKernelArg, MixedDatKernelArg, - MatKernelArg, MixedMatKernelArg, GlobalKernel) + MatKernelArg, MixedMatKernelArg) from pyop2.local_kernel import LocalKernel, CStringLocalKernel, CoffeeLocalKernel, LoopyLocalKernel from pyop2.types import (Access, Global, AbstractDat, Dat, DatView, MixedDat, Mat, Set, MixedSet, ExtrudedSet, Subset, Map, ComposedMap, MixedMap) @@ -125,7 +125,7 @@ def map_kernel_args(self): return tuple(itertools.chain(*itertools.product(rmap._kernel_args_, cmap._kernel_args_))) -class Parloop: +class AbstractParloop: """A parallel loop invocation. :arg global_knl: The :class:`GlobalKernel` to be executed. @@ -226,7 +226,7 @@ def __call__(self): def zero_global_increments(self): """Zero any global increments every time the loop is executed.""" for g in self.reduced_globals.keys(): - g._data[...] = 0 + g.data[...] 
= 0 def replace_lgmaps(self): """Swap out any lgmaps for any :class:`MatParloopArg` instances @@ -350,38 +350,13 @@ def _l2g_idxs(self): seen.add(pl_arg.data) return tuple(indices) - @PETSc.Log.EventDecorator("ParLoopRednBegin") - @mpi.collective + @abc.abstractmethod def reduction_begin(self): """Begin reductions.""" - requests = [] - for idx in self._reduction_idxs: - glob = self.arguments[idx].data - mpi_op = {Access.INC: mpi.MPI.SUM, - Access.MIN: mpi.MPI.MIN, - Access.MAX: mpi.MPI.MAX}.get(self.accesses[idx]) - - if mpi.MPI.VERSION >= 3: - requests.append(self.comm.Iallreduce(glob._data, glob._buf, op=mpi_op)) - else: - self.comm.Allreduce(glob._data, glob._buf, op=mpi_op) - return tuple(requests) - - @PETSc.Log.EventDecorator("ParLoopRednEnd") - @mpi.collective + + @abc.abstractmethod def reduction_end(self, requests): """Finish reductions.""" - if mpi.MPI.VERSION >= 3: - for idx, req in zip(self._reduction_idxs, requests): - req.Wait() - glob = self.arguments[idx].data - glob._data[:] = glob._buf - else: - assert len(requests) == 0 - - for idx in self._reduction_idxs: - glob = self.arguments[idx].data - glob._data[:] = glob._buf @cached_property def _reduction_idxs(self): @@ -393,7 +368,7 @@ def _reduction_idxs(self): def finalize_global_increments(self): """Finalise global increments.""" for tmp, glob in self.reduced_globals.items(): - glob.data._data += tmp._data + glob.data.data[:] += tmp.data_ro @mpi.collective def update_arg_data_state(self): @@ -463,11 +438,12 @@ def prepare_reduced_globals(self, arguments, global_knl): :class:`Global` in parallel produces the right result. The same is not needed for MAX and MIN because they commute with the reduction. """ + from pyop2.op2 import compute_backend arguments = list(arguments) reduced_globals = {} for i, (lk_arg, gk_arg, pl_arg) in enumerate(self.zip_arguments(global_knl, arguments)): if isinstance(gk_arg, GlobalKernelArg) and lk_arg.access == Access.INC: - tmp = Global(gk_arg.dim, data=np.zeros_like(pl_arg.data.data_ro), dtype=lk_arg.dtype, comm=self.comm) + tmp = compute_backend.Global(gk_arg.dim, data=np.zeros_like(pl_arg.data.data_ro), dtype=lk_arg.dtype, comm=self.comm) reduced_globals[tmp] = pl_arg arguments[i] = GlobalParloopArg(tmp) @@ -623,6 +599,7 @@ def LegacyParloop(local_knl, iterset, *args, **kwargs): raise SetTypeError("Iteration set is of the wrong type") # finish building the local kernel + from pyop2.op2 import compute_backend local_knl.accesses = tuple(a.access for a in args) if isinstance(local_knl, (CStringLocalKernel, CoffeeLocalKernel)): local_knl.dtypes = tuple(a.data.dtype for a in args) @@ -632,15 +609,15 @@ def LegacyParloop(local_knl, iterset, *args, **kwargs): extruded_periodic = iterset._extruded_periodic constant_layers = extruded and iterset.constant_layers subset = isinstance(iterset, Subset) - global_knl = GlobalKernel(local_knl, global_knl_args, - extruded=extruded, - extruded_periodic=extruded_periodic, - constant_layers=constant_layers, - subset=subset, - **kwargs) + global_knl = compute_backend.GlobalKernel(local_knl, global_knl_args, + extruded=extruded, + extruded_periodic=extruded_periodic, + constant_layers=constant_layers, + subset=subset, + **kwargs) parloop_args = tuple(a.parloop_arg for a in args) - return Parloop(global_knl, iterset, parloop_args) + return compute_backend.Parloop(global_knl, iterset, parloop_args) def par_loop(*args, **kwargs): @@ -654,8 +631,10 @@ def parloop(knl, *args, **kwargs): For a description of the possible arguments to this function see :class:`Parloop` and 
:func:`LegacyParloop`. """ - if isinstance(knl, GlobalKernel): - Parloop(knl, *args, **kwargs)() + from pyop2.global_kernel import AbstractGlobalKernel + if isinstance(knl, AbstractGlobalKernel): + from pyop2.op2 import compute_backend + compute_backend.Parloop(knl, *args, **kwargs)() elif isinstance(knl, LocalKernel): LegacyParloop(knl, *args, **kwargs)() else: diff --git a/pyop2/types/dat.py b/pyop2/types/dat.py index 9819c3452..e17a1f38c 100644 --- a/pyop2/types/dat.py +++ b/pyop2/types/dat.py @@ -1,4 +1,3 @@ -import abc import contextlib import ctypes import itertools @@ -15,13 +14,14 @@ mpi, utils ) +from pyop2.offload_utils import OffloadMixin from pyop2.types.access import Access -from pyop2.types.dataset import DataSet, GlobalDataSet, MixedDataSet +from pyop2.types.dataset import DataSet, GlobalDataSet from pyop2.types.data_carrier import DataCarrier, EmptyDataMixin, VecAccessMixin from pyop2.types.set import ExtrudedSet, GlobalSet, Set -class AbstractDat(DataCarrier, EmptyDataMixin, abc.ABC): +class AbstractDat(DataCarrier, EmptyDataMixin): """OP2 vector data. A :class:`Dat` holds values on every element of a :class:`DataSet`.o @@ -349,12 +349,11 @@ def _op_kernel(self, op, globalp, dtype): return self._op_kernel_cache.setdefault(key, Kernel(knl, name)) def _op(self, other, op): - from pyop2.types.glob import Global + from pyop2.op2 import compute_backend from pyop2.parloop import parloop - - ret = Dat(self.dataset, None, self.dtype) + ret = compute_backend.Dat(self.dataset, None, self.dtype) if np.isscalar(other): - other = Global(1, data=other, comm=self.comm) + other = compute_backend.Global(1, data=other, comm=self.comm) globalp = True else: self._check_shape(other) @@ -398,12 +397,12 @@ def _iop_kernel(self, op, globalp, other_is_self, dtype): return self._iop_kernel_cache.setdefault(key, Kernel(knl, name)) def _iop(self, other, op): + from pyop2.op2 import compute_backend from pyop2.parloop import parloop - from pyop2.types.glob import Global globalp = False if np.isscalar(other): - other = Global(1, data=other, comm=self.comm) + other = compute_backend.Global(1, data=other, comm=self.comm) globalp = True elif other is not self: self._check_shape(other) @@ -447,10 +446,10 @@ def inner(self, other): """ from pyop2.parloop import parloop - from pyop2.types.glob import Global + from pyop2.op2 import compute_backend self._check_shape(other) - ret = Global(1, data=0, dtype=self.dtype, comm=self.comm) + ret = compute_backend.Global(1, data=0, dtype=self.dtype, comm=self.comm) parloop(self._inner_kernel(other.dtype), self.dataset.set, self(Access.READ), other(Access.READ), ret(Access.INC)) return ret.data_ro[0] @@ -466,7 +465,8 @@ def norm(self): return sqrt(self.inner(self).real) def __pos__(self): - pos = Dat(self) + from pyop2.op2 import compute_backend + pos = compute_backend.Dat(self) return pos def __add__(self, other): @@ -499,8 +499,9 @@ def _neg_kernel(self): def __neg__(self): from pyop2.parloop import parloop + from pyop2.op2 import compute_backend - neg = Dat(self.dataset, dtype=self.dtype) + neg = compute_backend.Dat(self.dataset, dtype=self.dtype) parloop(self._neg_kernel, self.dataset.set, neg(Access.WRITE), self(Access.READ)) return neg @@ -710,7 +711,11 @@ def data_ro_with_halos(self): return full[idx] -class Dat(AbstractDat, VecAccessMixin): +class Dat(AbstractDat, VecAccessMixin, OffloadMixin): + + @utils.cached_property + def can_be_represented_as_petscvec(self): + return ((self.dtype == PETSc.ScalarType) and self.cdim > 0) def __init__(self, *args, **kwargs): 
AbstractDat.__init__(self, *args, **kwargs) @@ -721,14 +726,17 @@ def __init__(self, *args, **kwargs): @utils.cached_property def _vec(self): assert self.dtype == PETSc.ScalarType, \ - "Can't create Vec with type %s, must be %s" % (self.dtype, PETSc.ScalarType) + "Can't create Vec with type %s, must be %s" % (self.dtype, + PETSc.ScalarType) # Can't duplicate layout_vec of dataset, because we then # carry around extra unnecessary data. # But use getSizes to save an Allreduce in computing the # global size. size = self.dataset.layout_vec.getSizes() data = self._data[:size[0]] - return PETSc.Vec().createWithArray(data, size=size, bsize=self.cdim, comm=self.comm) + vec = PETSc.Vec().createWithArray(data, size=size, + bsize=self.cdim, comm=self.comm) + return vec @contextlib.contextmanager def vec_context(self, access): @@ -757,12 +765,13 @@ class MixedDat(AbstractDat, VecAccessMixin): def __init__(self, mdset_or_dats): from pyop2.types.glob import Global + from pyop2.op2 import compute_backend def what(x): if isinstance(x, (Global, GlobalDataSet, GlobalSet)): - return Global + return compute_backend.Global elif isinstance(x, (Dat, DataSet, Set)): - return Dat + return compute_backend.Dat else: raise ex.DataSetTypeError("Huh?!") if isinstance(mdset_or_dats, MixedDat): @@ -811,7 +820,8 @@ def split(self): @utils.cached_property def dataset(self): r""":class:`MixedDataSet`\s this :class:`MixedDat` is defined on.""" - return MixedDataSet(tuple(s.dataset for s in self._dats)) + from pyop2.op2 import compute_backend + return compute_backend.MixedDataSet(tuple(s.dataset for s in self._dats)) @utils.cached_property def _data(self): @@ -959,6 +969,7 @@ def inner(self, other): return ret def _op(self, other, op): + from pyop2.op2 import compute_backend ret = [] if np.isscalar(other): for s in self: @@ -967,7 +978,7 @@ def _op(self, other, op): self._check_shape(other) for s, o in zip(self, other): ret.append(op(s, o)) - return MixedDat(ret) + return compute_backend.MixedDat(ret) def _iop(self, other, op): if np.isscalar(other): @@ -980,16 +991,20 @@ def _iop(self, other, op): return self def __pos__(self): + from pyop2.op2 import compute_backend + ret = [] for s in self: ret.append(s.__pos__()) - return MixedDat(ret) + return compute_backend.MixedDat(ret) def __neg__(self): + from pyop2.op2 import compute_backend + ret = [] for s in self: ret.append(s.__neg__()) - return MixedDat(ret) + return compute_backend.MixedDat(ret) def __add__(self, other): """Pointwise addition of fields.""" diff --git a/pyop2/types/dataset.py b/pyop2/types/dataset.py index 4e114032a..b4d9adb85 100644 --- a/pyop2/types/dataset.py +++ b/pyop2/types/dataset.py @@ -188,9 +188,11 @@ def local_ises(self): @utils.cached_property def layout_vec(self): """A PETSc Vec compatible with the dof layout of this DataSet.""" + from pyop2.op2 import compute_backend vec = PETSc.Vec().create(comm=self.comm) size = (self.size * self.cdim, None) vec.setSizes(size, bsize=self.cdim) + vec.setType(compute_backend.PETScVecType) vec.setUp() return vec @@ -315,9 +317,11 @@ def local_ises(self): @utils.cached_property def layout_vec(self): """A PETSc Vec compatible with the dof layout of this DataSet.""" + from pyop2.op2 import compute_backend vec = PETSc.Vec().create(comm=self.comm) size = (self.size * self.cdim, None) vec.setSizes(size, bsize=self.cdim) + vec.setType(compute_backend.PETScVecType) vec.setUp() return vec @@ -463,10 +467,12 @@ def __repr__(self): @utils.cached_property def layout_vec(self): """A PETSc Vec compatible with the dof layout of 
this MixedDataSet.""" + from pyop2.op2 import compute_backend vec = PETSc.Vec().create(comm=self.comm) # Compute local and global size from sizes of layout vecs lsize, gsize = map(sum, zip(*(d.layout_vec.sizes for d in self))) vec.setSizes((lsize, gsize), bsize=1) + vec.setType(compute_backend.PETScVecType) vec.setUp() return vec diff --git a/pyop2/types/glob.py b/pyop2/types/glob.py index 4bc67fed8..04954a0c3 100644 --- a/pyop2/types/glob.py +++ b/pyop2/types/glob.py @@ -10,12 +10,12 @@ mpi, utils ) +from pyop2.offload_utils import OffloadMixin from pyop2.types.access import Access -from pyop2.types.dataset import GlobalDataSet from pyop2.types.data_carrier import DataCarrier, EmptyDataMixin, VecAccessMixin -class Global(DataCarrier, EmptyDataMixin, VecAccessMixin): +class Global(DataCarrier, EmptyDataMixin, VecAccessMixin, OffloadMixin): """OP2 global value. @@ -97,15 +97,16 @@ def __getitem__(self, idx): def __str__(self): return "OP2 Global Argument: %s with dim %s and value %s" \ - % (self._name, self._dim, self._data) + % (self._name, self._dim, self.data_ro) def __repr__(self): - return "Global(%r, %r, %r, %r)" % (self._dim, self._data, - self._data.dtype, self._name) + return "Global(%r, %r, %r, %r)" % (self._dim, self.data_ro, + self.data.dtype, self._name) @utils.cached_property def dataset(self): - return GlobalDataSet(self) + from pyop2.op2 import compute_backend + return compute_backend.GlobalDataSet(self) @property def shape(self): @@ -303,7 +304,8 @@ def inner(self, other): @utils.cached_property def _vec(self): assert self.dtype == PETSc.ScalarType, \ - "Can't create Vec with type %s, must be %s" % (self.dtype, PETSc.ScalarType) + "Can't create Vec with type %s, must be %s" % (self.dtype, + PETSc.ScalarType) # Can't duplicate layout_vec of dataset, because we then # carry around extra unnecessary data. # But use getSizes to save an Allreduce in computing the @@ -325,7 +327,10 @@ def vec_context(self, access): """A context manager for a :class:`PETSc.Vec` from a :class:`Global`. :param access: Access descriptor: READ, WRITE, or RW.""" - yield self._vec - if access is not Access.READ: - data = self._data - self.comm.Bcast(data, 0) + raise NotImplementedError() + + def ensure_availability_on_host(self): + raise NotImplementedError() + + def ensure_availability_on_device(self): + raise NotImplementedError() diff --git a/pyop2/types/map.py b/pyop2/types/map.py index 91224d52a..fa199dd02 100644 --- a/pyop2/types/map.py +++ b/pyop2/types/map.py @@ -11,10 +11,11 @@ utils ) from pyop2 import mpi +from pyop2.offload_utils import OffloadMixin from pyop2.types.set import GlobalSet, MixedSet, Set -class Map: +class Map(OffloadMixin): """OP2 map, a relation between two :class:`Set` objects. 
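The hunks above and below repeat one dispatch pattern: instead of instantiating
a concrete class directly, the core types ask the currently active backend for
the right subclass. A minimal sketch of the idea (the ``compute_backend``
attribute is introduced in pyop2/op2.py later in this series;
``derive_dataset`` is purely illustrative):

    # Backend-dispatched construction, as Set.__pow__ does in the set.py
    # hunk below. compute_backend defaults to the CPU backend and is
    # swapped out via set_offloading_backend().
    from pyop2.op2 import compute_backend

    def derive_dataset(s, dim):
        # resolves to cpu_backend.DataSet, cuda_backend.DataSet, ...
        return compute_backend.DataSet(s, dim=dim)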
diff --git a/pyop2/types/set.py b/pyop2/types/set.py index 1f6ea30c8..6c4fc682e 100644 --- a/pyop2/types/set.py +++ b/pyop2/types/set.py @@ -11,6 +11,7 @@ mpi, utils ) +from pyop2.offload_utils import OffloadMixin class Set: @@ -163,7 +164,9 @@ def __call__(self, *indices): indices = indices[0] if np.isscalar(indices): indices = [indices] - return Subset(self, indices) + + from pyop2.op2 import compute_backend + return compute_backend.Subset(self, indices) def __contains__(self, dset): """Indicate whether a given DataSet is compatible with this Set.""" @@ -175,8 +178,8 @@ def __contains__(self, dset): def __pow__(self, e): """Derive a :class:`DataSet` with dimension ``e``""" - from pyop2.types import DataSet - return DataSet(self, dim=e) + from pyop2.op2 import compute_backend + return compute_backend.DataSet(self, dim=e) @utils.cached_property def layers(self): @@ -289,7 +292,7 @@ def __hash__(self): return hash(type(self)) -class ExtrudedSet(Set): +class ExtrudedSet(Set, OffloadMixin): """OP2 ExtrudedSet. @@ -378,7 +381,7 @@ def layers_array(self): return self._layers -class Subset(ExtrudedSet): +class Subset(ExtrudedSet, OffloadMixin): """OP2 subset. @@ -452,7 +455,8 @@ def __call__(self, *indices): indices = indices[0] if np.isscalar(indices): indices = [indices] - return Subset(self, indices) + from pyop2.op2 import compute_backend + return compute_backend.Subset(self, indices) @utils.cached_property def superset(self): @@ -632,8 +636,8 @@ def __len__(self): def __pow__(self, e): """Derive a :class:`MixedDataSet` with dimensions ``e``""" - from pyop2.types import MixedDataSet - return MixedDataSet(self._sets, e) + from pyop2.op2 import compute_backend + return compute_backend.MixedDataSet(self._sets, e) def __str__(self): return "OP2 MixedSet composed of Sets: %s" % (self._sets,) From 36b8a3c56b7cc1faabfda3353a674ecbd3223727 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jul 2022 21:25:16 -0500 Subject: [PATCH 02/11] generalize loopy codegen to allow OpenCL/CUDA targets --- pyop2/codegen/rep2loopy.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index 7085c324d..3510b1d38 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -400,7 +400,7 @@ def bounds(exprs): return dict((op, (names[op], dep)) for op, dep in deps.items()) -def generate(builder, wrapper_name=None): +def generate(builder, wrapper_name=None, include_math=True, include_petsc=True, include_complex=True): if builder.layer_index is not None: outer_inames = frozenset([builder._loop_index.name, builder.layer_index.name]) @@ -547,7 +547,16 @@ def renamer(expr): # register kernel kernel = builder.kernel headers = set(kernel.headers) - headers = headers | set(["#include ", "#include ", "#include "]) + + if include_math: + headers.add("#include ") + + if include_petsc: + headers.add("#include ") + + if include_complex: + headers.add("#include ") + if PETSc.Log.isActive(): headers = headers | set(["#include "]) preamble = "\n".join(sorted(headers)) @@ -622,15 +631,22 @@ def statement_assign(expr, context): if isinstance(lvalue, Indexed): context.index_ordering.append(tuple(i.name for i in lvalue.index_ordering())) lvalue, rvalue = tuple(expression(c, context.parameters) for c in expr.children) - within_inames = context.within_inames[expr] + if isinstance(expr.label, (PreUnpackInst, UnpackInst)): + tag = "scatter" + elif isinstance(expr.label, PackInst): + tag = "gather" + else: + raise 
NotImplementedError()
+
+    within_inames = context.within_inames[expr]
     id, depends_on = context.instruction_dependencies[expr]
     predicates = frozenset(context.conditions)
     return loopy.Assignment(lvalue, rvalue,
                             within_inames=within_inames,
                             within_inames_is_final=True,
                             predicates=predicates, id=id,
-                            depends_on=depends_on, depends_on_is_final=True)
+                            depends_on=depends_on, depends_on_is_final=True,
+                            tags=frozenset([tag]))


 @statement.register(FunctionCall)

From 54d33e5ac6d9ac581b7323b6f3573d6a35ca1ce6 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Mon, 11 Jul 2022 21:26:17 -0500
Subject: [PATCH 03/11] defines an AbstractComputeBackend type

---
 pyop2/backends/__init__.py | 43 ++++++++++++++++++++++++++++++++++++++
 setup.py                   |  2 +-
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 pyop2/backends/__init__.py

diff --git a/pyop2/backends/__init__.py b/pyop2/backends/__init__.py
new file mode 100644
index 000000000..680ad9d3a
--- /dev/null
+++ b/pyop2/backends/__init__.py
@@ -0,0 +1,43 @@
+class _not_implemented:  # noqa
+    """Not Implemented"""
+
+
+class AbstractComputeBackend:
+    """
+    Abstract class to record all the backend specific implementation of
+    :mod:`pyop2`'s data structures.
+    """
+    GlobalKernel = _not_implemented
+    Parloop = _not_implemented
+    Set = _not_implemented
+    ExtrudedSet = _not_implemented
+    MixedSet = _not_implemented
+    Subset = _not_implemented
+    DataSet = _not_implemented
+    MixedDataSet = _not_implemented
+    Map = _not_implemented
+    MixedMap = _not_implemented
+    Dat = _not_implemented
+    MixedDat = _not_implemented
+    DatView = _not_implemented
+    Mat = _not_implemented
+    Global = _not_implemented
+    GlobalDataSet = _not_implemented
+    PETScVecType = _not_implemented
+
+    def __getattribute__(self, key):
+        val = super().__getattribute__(key)
+        if val is _not_implemented:
+            raise NotImplementedError(f"'{key}' is not implemented for backend"
+                                      f" '{type(self).__name__}'.")
+        return val
+
+    def turn_on_offloading(self):
+        raise NotImplementedError()
+
+    def turn_off_offloading(self):
+        raise NotImplementedError()
+
+    @property
+    def cache_key(self):
+        raise NotImplementedError()
diff --git a/setup.py b/setup.py
index a271e2415..c2830e7d0 100644
--- a/setup.py
+++ b/setup.py
@@ -142,7 +142,7 @@ def run(self):
     ],
     install_requires=install_requires + test_requires,
     dependency_links=dep_links,
-    packages=['pyop2', 'pyop2.codegen', 'pyop2.types'],
+    packages=['pyop2', 'pyop2.backends', 'pyop2.codegen', 'pyop2.types'],
     package_data={
         'pyop2': ['assets/*', '*.h', '*.pxd', '*.pyx', 'codegen/c/*.c']},
     scripts=glob('scripts/*'),

From 09502a3a66ae244a0b73b4dceb43424934ed37ce Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Mon, 11 Jul 2022 21:28:10 -0500
Subject: [PATCH 04/11] implements CPU backend

---
 pyop2/backends/cpu.py | 219 ++++++++++++++++++++++++++++++++++++++++++
 pyop2/op2.py          |  13 ++-
 2 files changed, 229 insertions(+), 3 deletions(-)
 create mode 100644 pyop2/backends/cpu.py

diff --git a/pyop2/backends/cpu.py b/pyop2/backends/cpu.py
new file mode 100644
index 000000000..ce79c5262
--- /dev/null
+++ b/pyop2/backends/cpu.py
@@ -0,0 +1,219 @@
+from pyop2.types.dat import Dat as BaseDat, MixedDat, DatView
+from pyop2.types.set import Set, ExtrudedSet, Subset, MixedSet
+from pyop2.types.dataset import DataSet, GlobalDataSet, MixedDataSet
+from pyop2.types.map import Map, MixedMap
+from pyop2.parloop import AbstractParloop
+from pyop2.global_kernel import AbstractGlobalKernel
+from pyop2.types.access import READ, INC, MIN, MAX
+from pyop2.types.mat import Mat
+from
pyop2.types.glob import Global as BaseGlobal +from pyop2.backends import AbstractComputeBackend +from petsc4py import PETSc +from pyop2 import ( + compilation, + mpi, + utils +) + +import ctypes +import os +import loopy as lp +from contextlib import contextmanager +import numpy as np + + +class Dat(BaseDat): + @utils.cached_property + def _vec(self): + assert self.dtype == PETSc.ScalarType, \ + "Can't create Vec with type %s, must be %s" % (self.dtype, + PETSc.ScalarType) + # Can't duplicate layout_vec of dataset, because we then + # carry around extra unnecessary data. + # But use getSizes to save an Allreduce in computing the + # global size. + size = self.dataset.layout_vec.getSizes() + data = self._data[:size[0]] + vec = PETSc.Vec().createWithArray(data, size=size, + bsize=self.cdim, comm=self.comm) + return vec + + @contextmanager + def vec_context(self, access): + # PETSc Vecs have a state counter and cache norm computations + # to return immediately if the state counter is unchanged. + # Since we've updated the data behind their back, we need to + # change that state counter. + self._vec.stateIncrease() + yield self._vec + if access is not READ: + self.halo_valid = False + + def ensure_availability_on_device(self): + from pyop2.op2 import compute_backend + assert compute_backend is cpu_backend + # data transfer is noop for CPU backend + + def ensure_availability_on_host(self): + from pyop2.op2 import compute_backend + assert compute_backend is cpu_backend + # data transfer is noop for CPU backend + + +class Global(BaseGlobal): + @utils.cached_property + def _vec(self): + assert self.dtype == PETSc.ScalarType, \ + "Can't create Vec with type %s, must be %s" % (self.dtype, + PETSc.ScalarType) + # Can't duplicate layout_vec of dataset, because we then + # carry around extra unnecessary data. + # But use getSizes to save an Allreduce in computing the + # global size. + data = self._data + size = self.dataset.layout_vec.getSizes() + if self.comm.rank == 0: + return PETSc.Vec().createWithArray(data, size=size, + bsize=self.cdim, + comm=self.comm) + else: + return PETSc.Vec().createWithArray(np.empty(0, dtype=self.dtype), + size=size, + bsize=self.cdim, + comm=self.comm) + + @contextmanager + def vec_context(self, access): + """A context manager for a :class:`PETSc.Vec` from a :class:`Global`. + + :param access: Access descriptor: READ, WRITE, or RW.""" + yield self._vec + if access is not READ: + data = self._data + self.comm.Bcast(data, 0) + + def ensure_availability_on_device(self): + from pyop2.op2 import compute_backend + assert compute_backend is cpu_backend + # data transfer is noop for CPU backend + + def ensure_availability_on_host(self): + from pyop2.op2 import compute_backend + assert compute_backend is cpu_backend + # data transfer is noop for CPU backend + + +class GlobalKernel(AbstractGlobalKernel): + + @utils.cached_property + def code_to_compile(self): + """Return the C/C++ source code as a string.""" + from pyop2.codegen.rep2loopy import generate + + wrapper = generate(self.builder) + code = lp.generate_code_v2(wrapper) + + if self.local_kernel.cpp: + from loopy.codegen.result import process_preambles + preamble = "".join( + process_preambles(getattr(code, "device_preambles", []))) + device_code = "\n\n".join(str(dp.ast) for dp in code.device_programs) + return preamble + '\nextern "C" {\n' + device_code + "\n}\n" + return code.device_code() + + @PETSc.Log.EventDecorator() + @mpi.collective + def compile(self, comm): + """Compile the kernel. 
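+
+        The resulting shared object is cached on disk by the compilation
+        layer, keyed on this kernel's cache key, so repeated runs avoid
+        recompilation.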
+ + :arg comm: The communicator the compilation is collective over. + :returns: A ctypes function pointer for the compiled function. + """ + extension = "cpp" if self.local_kernel.cpp else "c" + cppargs = ( + tuple("-I%s/include" % d for d in utils.get_petsc_dir()) + + tuple("-I%s" % d for d in self.local_kernel.include_dirs) + + ("-I%s" % os.path.abspath(os.path.dirname(__file__)),) + ) + ldargs = ( + tuple("-L%s/lib" % d for d in utils.get_petsc_dir()) + + tuple("-Wl,-rpath,%s/lib" % d for d in utils.get_petsc_dir()) + + ("-lpetsc", "-lm") + + tuple(self.local_kernel.ldargs) + ) + + return compilation.load(self, extension, self.name, + cppargs=cppargs, + ldargs=ldargs, + restype=ctypes.c_int, + comm=comm) + + +class Parloop(AbstractParloop): + @PETSc.Log.EventDecorator("ParLoopRednBegin") + @mpi.collective + def reduction_begin(self): + """Begin reductions.""" + requests = [] + for idx in self._reduction_idxs: + glob = self.arguments[idx].data + mpi_op = {INC: mpi.MPI.SUM, + MIN: mpi.MPI.MIN, + MAX: mpi.MPI.MAX}.get(self.accesses[idx]) + + if mpi.MPI.VERSION >= 3: + requests.append(self.comm.Iallreduce(glob._data, + glob._buf, + op=mpi_op)) + else: + self.comm.Allreduce(glob._data, glob._buf, op=mpi_op) + return tuple(requests) + + @PETSc.Log.EventDecorator("ParLoopRednEnd") + @mpi.collective + def reduction_end(self, requests): + """Finish reductions.""" + if mpi.MPI.VERSION >= 3: + mpi.MPI.Request.Waitall(requests) + for idx in self._reduction_idxs: + glob = self.arguments[idx].data + glob._data[:] = glob._buf + else: + assert len(requests) == 0 + + for idx in self._reduction_idxs: + glob = self.arguments[idx].data + glob._data[:] = glob._buf + + +class CPUBackend(AbstractComputeBackend): + GlobalKernel = GlobalKernel + Parloop = Parloop + Set = Set + ExtrudedSet = ExtrudedSet + MixedSet = MixedSet + Subset = Subset + DataSet = DataSet + MixedDataSet = MixedDataSet + Map = Map + MixedMap = MixedMap + Dat = Dat + MixedDat = MixedDat + DatView = DatView + Mat = Mat + Global = Global + GlobalDataSet = GlobalDataSet + PETScVecType = PETSc.Vec.Type.STANDARD + + def turn_on_offloading(self): + pass + + def turn_off_offloading(self): + pass + + @property + def cache_key(self): + return (type(self),) + + +cpu_backend = CPUBackend() diff --git a/pyop2/op2.py b/pyop2/op2.py index 726168e79..d0f1857c2 100644 --- a/pyop2/op2.py +++ b/pyop2/op2.py @@ -50,12 +50,14 @@ from pyop2.local_kernel import CStringLocalKernel, LoopyLocalKernel, CoffeeLocalKernel, Kernel # noqa: F401 from pyop2.global_kernel import (GlobalKernelArg, DatKernelArg, MixedDatKernelArg, # noqa: F401 - MatKernelArg, MixedMatKernelArg, MapKernelArg, GlobalKernel) + MatKernelArg, MixedMatKernelArg, MapKernelArg) from pyop2.parloop import (GlobalParloopArg, DatParloopArg, MixedDatParloopArg, # noqa: F401 - MatParloopArg, MixedMatParloopArg, Parloop, parloop, par_loop) + MatParloopArg, MixedMatParloopArg, AbstractParloop, + parloop, par_loop) from pyop2.parloop import (GlobalLegacyArg, DatLegacyArg, MixedDatLegacyArg, # noqa: F401 MatLegacyArg, MixedMatLegacyArg, LegacyParloop, ParLoop) +from pyop2.backends.cpu import cpu_backend __all__ = ['configuration', 'READ', 'WRITE', 'RW', 'INC', 'MIN', 'MAX', 'ON_BOTTOM', 'ON_TOP', 'ON_INTERIOR_FACETS', 'ALL', @@ -63,7 +65,7 @@ 'set_log_level', 'MPI', 'init', 'exit', 'Kernel', 'Set', 'ExtrudedSet', 'MixedSet', 'Subset', 'DataSet', 'GlobalDataSet', 'MixedDataSet', 'Halo', 'Dat', 'MixedDat', 'Mat', 'Global', 'Map', 'MixedMap', - 'Sparsity', 'parloop', 'Parloop', 'ParLoop', 'par_loop', + 
'Sparsity', 'parloop', 'AbstractParloop', 'ParLoop', 'par_loop',
            'DatView', 'PermutedMap', 'ComposedMap']


 _initialised = False
@@ -119,3 +121,8 @@ def exit():
     configuration.reset()
     global _initialised
     _initialised = False
+
+
+compute_backend = cpu_backend
+
+GlobalKernel = cpu_backend.GlobalKernel

From 3238129ac4708258dd6f7d76ec4d232c82320e87 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Mon, 11 Jul 2022 21:27:16 -0500
Subject: [PATCH 05/11] defines offloading helper types

---
 pyop2/offload_utils.py | 71 ++++++++++++++++++++++++++++++++++++++++++
 pyop2/op2.py           |  4 ++-
 2 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 pyop2/offload_utils.py

diff --git a/pyop2/offload_utils.py b/pyop2/offload_utils.py
new file mode 100644
index 000000000..a60ee5c47
--- /dev/null
+++ b/pyop2/offload_utils.py
@@ -0,0 +1,71 @@
+from enum import IntEnum
+from contextlib import contextmanager
+
+
+class DataAvailability(IntEnum):
+    """
+    Indicates whether the device or host contains valid data.
+    """
+    AVAILABLE_ON_HOST_ONLY = 1
+    AVAILABLE_ON_DEVICE_ONLY = 2
+    AVAILABLE_ON_BOTH = 3
+
+
+class OffloadMixin:
+    def get_availability(self):
+        raise NotImplementedError
+
+    def ensure_availability_on_host(self):
+        raise NotImplementedError
+
+    def ensure_availability_on_device(self):
+        raise NotImplementedError
+
+    def is_available_on_host(self):
+        return bool(self.get_availability() & AVAILABLE_ON_HOST_ONLY)
+
+    def is_available_on_device(self):
+        return bool(self.get_availability() & AVAILABLE_ON_DEVICE_ONLY)
+
+
+AVAILABLE_ON_HOST_ONLY = DataAvailability.AVAILABLE_ON_HOST_ONLY
+AVAILABLE_ON_DEVICE_ONLY = DataAvailability.AVAILABLE_ON_DEVICE_ONLY
+AVAILABLE_ON_BOTH = DataAvailability.AVAILABLE_ON_BOTH
+
+
+def set_offloading_backend(backend):
+    """
+    Sets the backend to which operations are offloaded.
+
+    By default, operations are executed on the host; to mark operations for
+    offloading, wrap them in a :func:`~pyop2.offload_utils.offloading`
+    context.
+
+    :arg backend: An instance of :class:`pyop2.backends.AbstractComputeBackend`.
+
+    .. warning::
+
+        * Must be called before any instance of a PyOP2 type is allocated.
+        * Calling :func:`set_offloading_backend` with different values of
+          *backend* over the course of a program is undefined behaviour and
+          should be avoided.
+    """
+    from pyop2 import op2
+    from pyop2.backends import AbstractComputeBackend
+    assert isinstance(backend, AbstractComputeBackend)
+    op2.compute_backend = backend
+
+
+@contextmanager
+def offloading():
+    """
+    Operations (such as manipulating a :class:`~pyop2.types.Dat`, calling a
+    :class:`~pyop2.global_kernel.GlobalKernel`, etc.) within the offloading
+    region are executed on the backend selected via
+    :func:`set_offloading_backend`.
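+
+    A minimal usage sketch (``cuda_backend`` refers to the instance provided
+    by the CUDA backend later in this series; the parloop call is purely
+    illustrative)::
+
+        from pyop2 import op2
+        from pyop2.backends.cuda import cuda_backend
+
+        op2.set_offloading_backend(cuda_backend)
+        with op2.offloading():
+            # executed on the device; data is migrated on demand
+            op2.parloop(knl, iterset, dat(op2.INC))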
+    """
+    from pyop2 import op2
+    op2.compute_backend.turn_on_offloading()
+    try:
+        yield
+    finally:
+        op2.compute_backend.turn_off_offloading()
diff --git a/pyop2/op2.py b/pyop2/op2.py
index d0f1857c2..e827dfcfd 100644
--- a/pyop2/op2.py
+++ b/pyop2/op2.py
@@ -58,6 +58,7 @@
                           MatLegacyArg, MixedMatLegacyArg, LegacyParloop, ParLoop)

 from pyop2.backends.cpu import cpu_backend
+from pyop2.offload_utils import set_offloading_backend, offloading

 __all__ = ['configuration', 'READ', 'WRITE', 'RW', 'INC', 'MIN', 'MAX',
            'ON_BOTTOM', 'ON_TOP', 'ON_INTERIOR_FACETS', 'ALL',
@@ -66,7 +67,8 @@
            'MixedSet', 'Subset', 'DataSet', 'GlobalDataSet', 'MixedDataSet',
            'Halo', 'Dat', 'MixedDat', 'Mat', 'Global', 'Map', 'MixedMap',
            'Sparsity', 'parloop', 'AbstractParloop', 'ParLoop', 'par_loop',
-           'DatView', 'PermutedMap', 'ComposedMap']
+           'DatView', 'PermutedMap', 'ComposedMap', 'set_offloading_backend',
+           'offloading']

From 0c07d2ac4c0ccff024ae24cdd9f8b3abc17a5ace Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Mon, 11 Jul 2022 21:27:41 -0500
Subject: [PATCH 06/11] implement GPU codegen helpers

---
 pyop2/compilation.py          | 116 ++++++++++++++++++++++++++++++----
 pyop2/configuration.py        |  10 ++-
 pyop2/transforms/__init__.py  |   0
 pyop2/transforms/gpu_utils.py |  94 +++++++++++++++++++++++++++
 pyop2/transforms/snpt.py      |  50 +++++++++++++++
 setup.py                      |   3 +-
 6 files changed, 258 insertions(+), 15 deletions(-)
 create mode 100644 pyop2/transforms/__init__.py
 create mode 100644 pyop2/transforms/gpu_utils.py
 create mode 100644 pyop2/transforms/snpt.py

diff --git a/pyop2/compilation.py b/pyop2/compilation.py
index cee148c42..9dbb91dfa 100644
--- a/pyop2/compilation.py
+++ b/pyop2/compilation.py
@@ -147,6 +147,29 @@ def sniff_compiler(exe):
     return compiler


+def _check_src_hashes(comm, global_kernel):
+    hsh = md5(str(global_kernel.cache_key[1:]).encode())
+    basename = hsh.hexdigest()
+    dirpart, basename = basename[:2], basename[2:]
+    cachedir = configuration["cache_dir"]
+    cachedir = os.path.join(cachedir, dirpart)
+
+    if configuration["check_src_hashes"] or configuration["debug"]:
+        matching = comm.allreduce(basename, op=_check_op)
+        if matching != basename:
+            # Dump all src code to disk for debugging
+            output = os.path.join(cachedir, "mismatching-kernels")
+            srcfile = os.path.join(output, "src-rank%d.c" % comm.rank)
+            if comm.rank == 0:
+                os.makedirs(output, exist_ok=True)
+            comm.barrier()
+            with open(srcfile, "w") as f:
+                f.write(global_kernel.code_to_compile)
+            comm.barrier()
+            raise CompilationError("Generated code differs across ranks"
+                                   f" (see output in {output})")
+
+
 class Compiler(ABC):
     """A compiler for shared libraries.

@@ -317,19 +340,8 @@ def get_so(self, jitmodule, extension):
         # atomically (avoiding races).
         tmpname = os.path.join(cachedir, "%s_p%d.so.tmp" % (basename, pid))

-        if configuration['check_src_hashes'] or configuration['debug']:
-            matching = self.comm.allreduce(basename, op=_check_op)
-            if matching != basename:
-                # Dump all src code to disk for debugging
-                output = os.path.join(configuration["cache_dir"], "mismatching-kernels")
-                srcfile = os.path.join(output, "src-rank%d.c" % self.comm.rank)
-                if self.comm.rank == 0:
-                    os.makedirs(output, exist_ok=True)
-                self.comm.barrier()
-                with open(srcfile, "w") as f:
-                    f.write(jitmodule.code_to_compile)
-                self.comm.barrier()
-                raise CompilationError("Generated code differs across ranks (see output in %s)" % output)
+        _check_src_hashes(self.comm, jitmodule)
+
         try:
             # Are we in the cache?
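+            # (a dlopen failure here is a cache miss: control falls
+            # through to the build-and-load path below)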
             return ctypes.CDLL(soname)
@@ -662,3 +674,81 @@ def clear_cache(prompt=False):
             shutil.rmtree(cachedir)
     else:
         print("Not removing cached libraries")
+
+
+def _get_code_to_compile(comm, global_kernel):
+    # Determine cache key
+    hsh = md5(str(global_kernel.cache_key[1:]).encode())
+    basename = hsh.hexdigest()
+    cachedir = configuration["cache_dir"]
+    dirpart, basename = basename[:2], basename[2:]
+    cachedir = os.path.join(cachedir, dirpart)
+    cname = os.path.join(cachedir, f"{basename}_code.cu")
+
+    _check_src_hashes(comm, global_kernel)
+
+    if os.path.isfile(cname):
+        # Are we in the cache?
+        with open(cname, "r") as f:
+            code_to_compile = f.read()
+    else:
+        # No, let's go ahead and build
+        if comm.rank == 0:
+            # No need to do this on all ranks
+            os.makedirs(cachedir, exist_ok=True)
+            with progress(INFO, "Compiling wrapper"):
+                # make sure that it compiles successfully before writing to file
+                code_to_compile = global_kernel.code_to_compile
+                with open(cname, "w") as f:
+                    f.write(code_to_compile)
+        comm.barrier()
+        # non-root ranks pick up the code rank 0 just wrote to the cache
+        with open(cname, "r") as f:
+            code_to_compile = f.read()
+
+    return code_to_compile
+
+
+@mpi.collective
+def get_prepared_cuda_function(comm, global_kernel):
+    from pycuda.compiler import SourceModule
+
+    # Determine cache key
+    hsh = md5(str(global_kernel.cache_key[1:]).encode())
+    basename = hsh.hexdigest()
+    cachedir = configuration["cache_dir"]
+    dirpart, basename = basename[:2], basename[2:]
+    cachedir = os.path.join(cachedir, dirpart)
+
+    nvcc_opts = ["-use_fast_math", "-w"]
+
+    code_to_compile = _get_code_to_compile(comm, global_kernel)
+    source_module = SourceModule(code_to_compile, options=nvcc_opts,
+                                 cache_dir=cachedir)
+
+    cu_func = source_module.get_function(global_kernel.name)
+
+    type_map = {ctypes.c_void_p: "P", ctypes.c_int: "i"}
+    argtypes = "".join(type_map[t] for t in global_kernel.argtypes)
+    cu_func.prepare(argtypes)
+
+    return cu_func
+
+
+@mpi.collective
+def get_opencl_kernel(comm, global_kernel):
+    import pyopencl as cl
+    from pyop2.backends.opencl import opencl_backend
+    cl_ctx = opencl_backend.context
+
+    # Determine cache key
+    hsh = md5(str(global_kernel.cache_key[1:]).encode())
+    basename = hsh.hexdigest()
+    cachedir = configuration["cache_dir"]
+    dirpart, basename = basename[:2], basename[2:]
+    cachedir = os.path.join(cachedir, dirpart)
+
+    code_to_compile = _get_code_to_compile(comm, global_kernel)
+
+    prg = cl.Program(cl_ctx, code_to_compile).build(options=[],
+                                                    cache_dir=cachedir)
+
+    cl_knl = cl.Kernel(prg, global_kernel.name)
+    return cl_knl
diff --git a/pyop2/configuration.py b/pyop2/configuration.py
index 29717718c..a98cad124 100644
--- a/pyop2/configuration.py
+++ b/pyop2/configuration.py
@@ -74,6 +74,12 @@ class Configuration(dict):
         cdim > 1 be built as block sparsities, or dof sparsities.  The
         former saves memory but changes which preconditioners are
         available for the resulting matrices.  (Default yes)
+    :param gpu_strategy: A :class:`str` indicating the transformation strategy
+        that must be applied to a :class:`pyop2.global_kernel.GlobalKernel`
+        when offloading to a GPGPU. Can be one of:
+
+        - ``"snpt"``: Single-"N" Per Thread. In this strategy, the work of
+          each element of the iteration set over which a global kernel
+          operates is assigned to a work-item (i.e.
a CUDA thread) """ # name, env variable, type, default, write once cache_dir = os.path.join(gettempdir(), "pyop2-cache-uid%s" % os.getuid()) @@ -113,7 +119,9 @@ class Configuration(dict): "matnest": ("PYOP2_MATNEST", bool, True), "block_sparsity": - ("PYOP2_BLOCK_SPARSITY", bool, True) + ("PYOP2_BLOCK_SPARSITY", bool, True), + "gpu_strategy": + ("PYOP2_GPU_STRATEGY", str, "snpt"), } """Default values for PyOP2 configuration parameters""" diff --git a/pyop2/transforms/__init__.py b/pyop2/transforms/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pyop2/transforms/gpu_utils.py b/pyop2/transforms/gpu_utils.py new file mode 100644 index 000000000..28cd558e1 --- /dev/null +++ b/pyop2/transforms/gpu_utils.py @@ -0,0 +1,94 @@ +import loopy as lp +from pyop2.configuration import configuration + + +def get_loopy_target(target): + if target == "opencl": + return lp.PyOpenCLTarget() + elif target == "cuda": + return lp.CudaTarget() + else: + raise NotImplementedError() + + +def preprocess_t_unit_for_gpu(t_unit): + + # {{{ inline all kernels in t_unit + + kernels_to_inline = { + name for name, clbl in t_unit.callables_table.items() + if isinstance(clbl, lp.CallableKernel)} + + for knl_name in kernels_to_inline: + t_unit = lp.inline_callable_kernel(t_unit, knl_name) + + # }}} + + kernel = t_unit.default_entrypoint + + # changing the address space of temps + def _change_aspace_tvs(tv): + if tv.read_only: + assert tv.initializer is not None + return tv.copy(address_space=lp.AddressSpace.GLOBAL) + else: + return tv.copy(address_space=lp.AddressSpace.PRIVATE) + + new_tvs = {tv_name: _change_aspace_tvs(tv) for tv_name, tv in + kernel.temporary_variables.items()} + kernel = kernel.copy(temporary_variables=new_tvs) + + def insn_needs_atomic(insn): + # updates to global variables are atomic + import pymbolic + if isinstance(insn, lp.Assignment): + if isinstance(insn.assignee, pymbolic.primitives.Subscript): + assignee_name = insn.assignee.aggregate.name + else: + assert isinstance(insn.assignee, pymbolic.primitives.Variable) + assignee_name = insn.assignee.name + + if assignee_name in kernel.arg_dict: + return assignee_name in insn.read_dependency_names() + return False + + new_insns = [] + args_marked_for_atomic = set() + for insn in kernel.instructions: + if insn_needs_atomic(insn): + atomicity = (lp.AtomicUpdate(insn.assignee.aggregate.name), ) + insn = insn.copy(atomicity=atomicity) + args_marked_for_atomic |= set([insn.assignee.aggregate.name]) + + new_insns.append(insn) + + # label args as atomic + new_args = [] + for arg in kernel.args: + if arg.name in args_marked_for_atomic: + new_args.append(arg.copy(for_atomic=True)) + else: + new_args.append(arg) + + kernel = kernel.copy(instructions=new_insns, args=new_args) + + return t_unit.with_kernel(kernel) + + +def apply_gpu_transforms(t_unit, target): + t_unit = t_unit.copy(target=get_loopy_target(target)) + t_unit = preprocess_t_unit_for_gpu(t_unit) + kernel = t_unit.default_entrypoint + transform_strategy = configuration["gpu_strategy"] + + kernel = lp.assume(kernel, "end > start") + + if transform_strategy == "snpt": + from pyop2.transforms.snpt import split_n_across_workgroups + kernel, args_to_make_global = split_n_across_workgroups(kernel, 32) + else: + raise NotImplementedError(f"'{transform_strategy}' transform strategy.") + + t_unit = t_unit.with_kernel(kernel) + + return t_unit, args_to_make_global diff --git a/pyop2/transforms/snpt.py b/pyop2/transforms/snpt.py new file mode 100644 index 000000000..23c145f1c --- /dev/null 
+++ b/pyop2/transforms/snpt.py @@ -0,0 +1,50 @@ +import loopy as lp + + +def _make_tv_array_arg(tv): + assert tv.address_space != lp.AddressSpace.PRIVATE + arg = lp.ArrayArg(name=tv.name, + dtype=tv.dtype, + shape=tv.shape, + dim_tags=tv.dim_tags, + offset=tv.offset, + dim_names=tv.dim_names, + order=tv.order, + alignment=tv.alignment, + address_space=tv.address_space, + is_output=not tv.read_only, + is_input=tv.read_only) + return arg + + +def split_n_across_workgroups(kernel, workgroup_size): + """ + Returns a transformed version of *kernel* with the workload in the loop + with induction variable 'n' distributed across work-groups of size + *workgroup_size* and each work-item in the work-group performing the work + of a single iteration of 'n'. + """ + + kernel = lp.assume(kernel, "start < end") + kernel = lp.split_iname(kernel, "n", workgroup_size, + outer_tag="g.0", inner_tag="l.0") + + # {{{ making consts as globals: necessary to make the strategy emit valid + # kernels for all forms + + old_temps = kernel.temporary_variables.copy() + args_to_make_global = [tv.initializer.flatten() + for tv in old_temps.values() + if tv.initializer is not None] + + new_temps = {tv.name: tv + for tv in old_temps.values() + if tv.initializer is None} + kernel = kernel.copy(args=kernel.args+[_make_tv_array_arg(tv) + for tv in old_temps.values() + if tv.initializer is not None], + temporary_variables=new_temps) + + # }}} + + return kernel, args_to_make_global diff --git a/setup.py b/setup.py index c2830e7d0..5ed54b7fe 100644 --- a/setup.py +++ b/setup.py @@ -142,7 +142,8 @@ def run(self): ], install_requires=install_requires + test_requires, dependency_links=dep_links, - packages=['pyop2', 'pyop2.backends', 'pyop2.codegen', 'pyop2.types'], + packages=['pyop2', 'pyop2.backends', 'pyop2.codegen', 'pyop2.types', + 'pyop2.transforms'], package_data={ 'pyop2': ['assets/*', '*.h', '*.pxd', '*.pyx', 'codegen/c/*.c']}, scripts=glob('scripts/*'), From 8929a8f42ca235b96e17f665ef3bce483f86488a Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jul 2022 21:30:39 -0500 Subject: [PATCH 07/11] Implements CUDA backend --- pyop2/backends/cuda.py | 663 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 663 insertions(+) create mode 100644 pyop2/backends/cuda.py diff --git a/pyop2/backends/cuda.py b/pyop2/backends/cuda.py new file mode 100644 index 000000000..2d9d67122 --- /dev/null +++ b/pyop2/backends/cuda.py @@ -0,0 +1,663 @@ +"""OP2 CUDA backend.""" + +import os + +from hashlib import md5 +from contextlib import contextmanager + +from pyop2 import compilation, mpi, utils +from pyop2.offload_utils import (AVAILABLE_ON_HOST_ONLY, + AVAILABLE_ON_DEVICE_ONLY, + AVAILABLE_ON_BOTH, + DataAvailability) +from pyop2.configuration import configuration +from pyop2.types.set import (MixedSet, Subset as BaseSubset, + ExtrudedSet as BaseExtrudedSet, + Set) +from pyop2.types.map import Map as BaseMap, MixedMap +from pyop2.types.dat import Dat as BaseDat, MixedDat, DatView +from pyop2.types.dataset import DataSet, GlobalDataSet, MixedDataSet +from pyop2.types.mat import Mat +from pyop2.types.glob import Global as BaseGlobal +from pyop2.types.access import RW, READ, MIN, MAX, INC +from pyop2.parloop import AbstractParloop +from pyop2.global_kernel import AbstractGlobalKernel +from pyop2.backends import AbstractComputeBackend, cpu as cpu_backend +from petsc4py import PETSc + +import numpy +import pycuda.driver as cuda +import pycuda.gpuarray as cuda_np +import loopy as lp +from pytools import memoize_method +from 
dataclasses import dataclass +from typing import Tuple +import ctypes + + +class Map(BaseMap): + + def __init__(self, *args, **kwargs): + super(Map, self).__init__(*args, **kwargs) + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + @utils.cached_property + def _cuda_values(self): + self._availability_flag = AVAILABLE_ON_BOTH + return cuda_np.to_gpu(self._values) + + def get_availability(self): + return self._availability_flag + + def ensure_availability_on_device(self): + self._cuda_values + + def ensure_availability_on_host(self): + # Map once initialized is not over-written so always available + # on host. + pass + + @property + def _kernel_args_(self): + if cuda_backend.offloading: + if not self.is_available_on_device(): + self.ensure_availability_on_device() + + return (self._cuda_values.gpudata,) + else: + return super(Map, self)._kernel_args_ + + +class ExtrudedSet(BaseExtrudedSet): + """ + ExtrudedSet for CUDA. + """ + + def __init__(self, *args, **kwargs): + super(ExtrudedSet, self).__init__(*args, **kwargs) + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + @utils.cached_property + def cuda_layers_array(self): + self._availability_flag = AVAILABLE_ON_BOTH + return cuda_np.to_gpu(self.layers_array) + + def get_availability(self): + return self._availability_flag + + def ensure_availability_on_device(self): + self.cuda_layers_array + + def ensure_availability_on_host(self): + # ExtrudedSet once initialized is not over-written so always available + # on host. + pass + + @property + def _kernel_args_(self): + if cuda_backend.offloading: + if not self.is_available_on_device(): + self.ensure_availability_on_device() + + return (self.cuda_layers_array.gpudata,) + else: + return super(ExtrudedSet, self)._kernel_args_ + + +class Subset(BaseSubset): + """ + Subset for CUDA. + """ + def __init__(self, *args, **kwargs): + super(Subset, self).__init__(*args, **kwargs) + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + def get_availability(self): + return self._availability_flag + + @utils.cached_property + def _cuda_indices(self): + self._availability_flag = AVAILABLE_ON_BOTH + return cuda_np.to_gpu(self._indices) + + def ensure_availability_on_device(self): + self._cuda_indices + + def ensure_availability_on_host(self): + # Subset once initialized is not over-written so always available + # on host. + pass + + @property + def _kernel_args_(self): + if cuda_backend.offloading: + if not self.is_available_on_device(): + self.ensure_availability_on_device() + + return (self._cuda_indices.gpudata,) + else: + return super(Subset, self)._kernel_args_ + + +class Dat(BaseDat): + """ + Dat for CUDA. + """ + def __init__(self, *args, **kwargs): + super(Dat, self).__init__(*args, **kwargs) + # _availability_flag: only used when Dat cannot be represented as a + # petscvec; when Dat can be represented as a petscvec the availability + # flag is directly read from the petsc vec. + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + @utils.cached_property + def _cuda_data(self): + """ + Only used when the Dat's data cannot be represented as a petsc Vec. 
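+        (When it can, this array instead wraps the Vec's device buffer via
+        ``getCUDAHandle`` and PETSc's offload mask tracks which copy is
+        valid.)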
+ """ + self._availability_flag = AVAILABLE_ON_BOTH + if self.can_be_represented_as_petscvec: + with self.vec as petscvec: + return cuda_np.GPUArray(shape=self._data.shape, + dtype=self._data.dtype, + gpudata=petscvec.getCUDAHandle("r"), + strides=self._data.strides) + else: + return cuda_np.to_gpu(self._data) + + def zero(self, subset=None): + if subset is None: + self.data[:] = 0*self.data + self.halo_valid = True + else: + raise NotImplementedError + + def copy(self, other, subset=None): + raise NotImplementedError + + @utils.cached_property + def _vec(self): + assert self.dtype == PETSc.ScalarType, \ + "Can't create Vec with type %s, must be %s" % (self.dtype, + PETSc.ScalarType) + # Can't duplicate layout_vec of dataset, because we then + # carry around extra unnecessary data. + # But use getSizes to save an Allreduce in computing the + # global size. + size = self.dataset.layout_vec.getSizes() + data = self._data[:size[0]] + return PETSc.Vec().createCUDAWithArrays(data, size=size, + bsize=self.cdim, + comm=self.comm) + + def get_availability(self): + if self.can_be_represented_as_petscvec: + return DataAvailability(self._vec.getOffloadMask()) + else: + return self._availability_flag + + def ensure_availability_on_device(self): + if self.can_be_represented_as_petscvec: + if not cuda_backend.offloading: + raise NotImplementedError("PETSc limitation: can ensure availability" + " on GPU only within an offloading" + " context.") + # perform a host->device transfer if needed + self._vec.getCUDAHandle("r") + else: + if not self.is_available_on_device(): + self._cuda_data.set(self._data) + self._availability_flag = AVAILABLE_ON_BOTH + + def ensure_availability_on_host(self): + if self.can_be_represented_as_petscvec: + # perform a device->host transfer if needed + self._vec.getArray(readonly=True) + else: + if not self.is_available_on_host(): + self._cuda_data.get(self._data) + self._availability_flag = AVAILABLE_ON_BOTH + + @contextmanager + def vec_context(self, access): + r"""A context manager for a :class:`PETSc.Vec` from a :class:`Dat`. + + :param access: Access descriptor: READ, WRITE, or RW.""" + # PETSc Vecs have a state counter and cache norm computations + # to return immediately if the state counter is unchanged. + # Since we've updated the data behind their back, we need to + # change that state counter. 
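+        # (Same dance as the CPU backend's vec_context.)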
+ self._vec.stateIncrease() + + if cuda_backend.offloading: + self.ensure_availability_on_device() + self._vec.bindToCPU(False) + else: + self.ensure_availability_on_host() + self._vec.bindToCPU(True) + + yield self._vec + + if access is not READ: + self.halo_valid = False + + @property + def _kernel_args_(self): + if self.can_be_represented_as_petscvec: + if cuda_backend.offloading: + self.ensure_availability_on_device() + # tell petsc that we have updated the data in the CL buffer + with self.vec as v: + v.restoreCUDAHandle(v.getCUDAHandle()) + return (self._cuda_data.gpudata,) + else: + self.ensure_availability_on_host() + # tell petsc that we have updated the data on the host + with self.vec as v: + v.stateIncrease() + return (self._data.ctypes.data, ) + else: + if cuda_backend.offloading: + self.ensure_availability_on_device() + + self._availability_flag = AVAILABLE_ON_DEVICE_ONLY + return (self._cuda_data.gpudata, ) + else: + self.ensure_availability_on_host() + + self._availability_flag = AVAILABLE_ON_HOST_ONLY + return (self._data.ctypes.data, ) + + @mpi.collective + @property + def data(self): + if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0: + raise RuntimeError("Illegal access: no data associated with this Dat!") + self.halo_valid = False + + if cuda_backend.offloading: + + self.ensure_availability_on_device() + + # {{{ marking data on the host as invalid + + if self.can_be_represented_as_petscvec: + # report to petsc that we are updating values on the device + with self.vec as petscvec: + petscvec.restoreCUDAHandle(petscvec.getCUDAHandle("w")) + else: + self._availability_flag = AVAILABLE_ON_DEVICE_ONLY + + # }}} + v = self._cuda_data[:self.dataset.size] + else: + self.ensure_availability_on_host() + v = self._data[:self.dataset.size].view() + v.setflags(write=True) + + # {{{ marking data on the device as invalid + + if self.can_be_represented_as_petscvec: + # report to petsc that we are altering data on the CPU + with self.vec as petscvec: + petscvec.stateIncrease() + else: + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + # }}} + + return v + + @property + @mpi.collective + def data_with_halos(self): + self.global_to_local_begin(RW) + self.global_to_local_end(RW) + return self.data + + @property + @mpi.collective + def data_ro(self): + + if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0: + raise RuntimeError("Illegal access: no data associated with this Dat!") + + if cuda_backend.offloading: + self.ensure_availability_on_device() + v = self._cuda_data[:self.dataset.size].view() + # FIXME: PyCUDA doesn't support 'read-only' arrays yet + else: + self.ensure_availability_on_host() + v = self._data[:self.dataset.size].view() + v.setflags(write=False) + + return v + + @property + @mpi.collective + def data_ro_with_halos(self): + self.global_to_local_begin(READ) + self.global_to_local_end(READ) + + if cuda_backend.offloading: + self.ensure_availability_on_device() + v = self._cuda_data.view() + # FIXME: PyCUDA doesn't support 'read-only' arrays yet + else: + self.ensure_availability_on_host() + v = self._data.view() + v.setflags(write=False) + + return v + + +class Global(BaseGlobal): + """ + Global for CUDA. 
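+
+    Mirrors the host values in a PyCUDA ``GPUArray`` and records which copy
+    is current in ``_availability_flag``.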
+ """ + + def __init__(self, *args, **kwargs): + super(Global, self).__init__(*args, **kwargs) + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + @utils.cached_property + def _cuda_data(self): + self._availability_flag = AVAILABLE_ON_BOTH + return cuda_np.to_gpu(self._data) + + def get_availability(self): + return self._availability_flag + + def ensure_availability_on_device(self): + if not self.is_available_on_device(): + self._cuda_data.set(self._data) + self._availability_flag = AVAILABLE_ON_BOTH + + def ensure_availability_on_host(self): + if not self.is_available_on_host(): + self._cuda_data.get(ary=self._data) + self._availability_flag = AVAILABLE_ON_BOTH + + @property + def _kernel_args_(self): + if cuda_backend.offloading: + self.ensure_availability_on_device() + self._availability_flag = AVAILABLE_ON_DEVICE_ONLY + return (self._cuda_data.gpudata,) + else: + self.ensure_availability_on_host() + self._availability_flag = AVAILABLE_ON_HOST_ONLY + return super(Global, self)._kernel_args_ + + @mpi.collective + @property + def data(self): + + if len(self._data) == 0: + raise RuntimeError("Illegal access: No data associated with" + " this Global!") + + if cuda_backend.offloading: + self.ensure_availability_on_device() + self._availability_flag = AVAILABLE_ON_DEVICE_ONLY + return self._cuda_data + else: + self.ensure_availability_on_host() + self._availability_flag = AVAILABLE_ON_HOST_ONLY + return self._data + + @property + def data_ro(self): + if len(self._data) == 0: + raise RuntimeError("Illegal access: No data associated with" + " this Global!") + + if cuda_backend.offloading: + self.ensure_availability_on_device() + view = self._cuda_data.view() + # FIXME: PyCUDA doesn't support read-only arrays yet + return view + else: + self.ensure_availability_on_host() + view = self._data.view() + view.setflags(write=False) + return view + + @utils.cached_property + def _vec(self): + raise NotImplementedError() + + @contextmanager + def vec_context(self, access): + raise NotImplementedError() + + +@dataclass(frozen=True) +class CUFunctionWithExtraArgs: + """ + A partial :class:`pycuda.driver.Function` with bound arguments *args_to_append* + that are appended to the arguments passed in :meth:`__call__`. + """ + cu_func: cuda.Function + args_to_append: Tuple[cuda_np.GPUArray, ...] + + def __call__(self, grid, block, *args): + if all(grid + block): + self.cu_func.prepared_call(grid, block, + *args, + *[arg_to_append.gpudata + for arg_to_append in self.args_to_append]) + + +class GlobalKernel(AbstractGlobalKernel): + @classmethod + def _cache_key(cls, *args, **kwargs): + key = super()._cache_key(*args, **kwargs) + key = key + (configuration["gpu_strategy"], "cuda") + return key + + @utils.cached_property + def encoded_cache_key(self): + return md5(str(self.cache_key[1:]).encode()).hexdigest() + + @utils.cached_property + def computational_grid_expr_cache_file_path(self): + cachedir = configuration["cache_dir"] + return os.path.join(cachedir, f"{self.encoded_cache_key}_grid_params.py") + + @utils.cached_property + def extra_args_cache_file_path(self): + cachedir = configuration["cache_dir"] + return os.path.join(cachedir, f"{self.encoded_cache_key}_extra_args.npz") + + @memoize_method + def get_grid_size(self, start, end): + """ + Returns a :class:`tuple` of the form ``(grid, block)``, where *grid* is + the 2-:class:`tuple` corresponding to the CUDA computational grid size + and ``block`` is the thread block size. Refer to + `CUDA docs `_ + for a detailed explanation of these terms. 
+        """  # noqa: E501
+        fpath = self.computational_grid_expr_cache_file_path
+
+        with open(fpath, "r") as f:
+            globals_dict = {}
+            exec(f.read(), globals_dict)
+            get_grid_sizes = globals_dict["get_grid_sizes"]
+
+        grid, block = get_grid_sizes(start=start, end=end)
+
+        # TODO: PyCUDA doesn't allow numpy ints for grid dimensions. Remove
+        # these casts after PyCUDA has been patched.
+        return (tuple(int(dim) for dim in grid),
+                tuple(int(dim) for dim in block))
+
+    @memoize_method
+    def get_extra_args(self):
+        """
+        Returns device buffers corresponding to array literals baked into
+        :attr:`local_kernel`.
+        """
+        fpath = self.extra_args_cache_file_path
+        npzfile = numpy.load(fpath)
+        assert npzfile["ids"].ndim == 1
+        assert len(npzfile.files) == len(npzfile["ids"]) + 1
+        extra_args_np = [npzfile[arg_id]
+                         for arg_id in npzfile["ids"]]
+
+        return tuple(cuda_np.to_gpu(arg_np) for arg_np in extra_args_np)
+
+    @utils.cached_property
+    def argtypes(self):
+        result = super().argtypes
+        return result + (ctypes.c_voidp,) * len(self.get_extra_args())
+
+    @utils.cached_property
+    def code_to_compile(self):
+        from pyop2.codegen.rep2loopy import generate
+        from pyop2.transforms.gpu_utils import apply_gpu_transforms
+        from pymbolic.interop.ast import to_evaluatable_python_function
+
+        t_unit = generate(self.builder,
+                          include_math=False,
+                          include_petsc=False,
+                          include_complex=False)
+
+        # Turn temporary variables with initializers (array literals baked
+        # into the kernel) into kernel arguments; the initializer values come
+        # back as *extra_args* and are re-attached at launch time.
+        t_unit, extra_args = apply_gpu_transforms(t_unit, "cuda")
+
+        ary_ids = [f"_op2_arg_{i}"
+                   for i in range(len(extra_args))]
+
+        numpy.savez(self.extra_args_cache_file_path,
+                    ids=numpy.array(ary_ids),
+                    **{ary_id: extra_arg
+                       for ary_id, extra_arg in zip(ary_ids, extra_args)})
+
+        # {{{ save python code to get grid sizes
+
+        with open(self.computational_grid_expr_cache_file_path, "w") as f:
+            glens, llens = (t_unit
+                            .default_entrypoint
+                            .get_grid_size_upper_bounds_as_exprs(t_unit
+                                                                 .callables_table))
+            glens = glens + (1,) * (2 - len(glens))  # cuda expects 2d grid shape
+            llens = llens + (1,) * (3 - len(llens))  # cuda expects 3d block shape
+            f.write(to_evaluatable_python_function((glens, llens), "get_grid_sizes"))
+
+        # }}}
+
+        code = lp.generate_code_v2(t_unit).device_code()
+
+        return code
+
+    @mpi.collective
+    def __call__(self, comm, *args):
+        key = id(comm)
+        try:
+            func = self._func_cache[key]
+        except KeyError:
+            func = self.compile(comm)
+            self._func_cache[key] = func
+
+        grid, block = self.get_grid_size(args[0], args[1])
+        func(grid, block, *args)
+
+    @mpi.collective
+    def compile(self, comm):
+        cu_func = compilation.get_prepared_cuda_function(comm,
+                                                         self)
+        return CUFunctionWithExtraArgs(cu_func, self.get_extra_args())
+
+
+class Parloop(AbstractParloop):
+    @PETSc.Log.EventDecorator("ParLoopRednBegin")
+    @mpi.collective
+    def reduction_begin(self):
+        """Begin reductions."""
+        requests = []
+        for idx in self._reduction_idxs:
+            glob = self.arguments[idx].data
+            mpi_op = {INC: mpi.MPI.SUM,
+                      MIN: mpi.MPI.MIN,
+                      MAX: mpi.MPI.MAX}.get(self.accesses[idx])
+
+            if mpi.MPI.VERSION >= 3:
+                # FIXME: Get rid of this D2H for CUDA-Aware MPI.
+                glob.ensure_availability_on_host()
+                requests.append(self.comm.Iallreduce(glob._data,
+                                                     glob._buf,
+                                                     op=mpi_op))
+            else:
+                # FIXME: Get rid of this D2H for CUDA-Aware MPI.
+ glob.ensure_availability_on_host() + self.comm.Allreduce(glob._data, glob._buf, op=mpi_op) + return tuple(requests) + + @PETSc.Log.EventDecorator("ParLoopRednEnd") + @mpi.collective + def reduction_end(self, requests): + """Finish reductions.""" + if mpi.MPI.VERSION >= 3: + mpi.MPI.Request.Waitall(requests) + for idx in self._reduction_idxs: + glob = self.arguments[idx].data + glob._data[:] = glob._buf + glob._availability_flag = AVAILABLE_ON_HOST_ONLY + # FIXME: Get rid of this H2D for CUDA-Aware MPI. + glob.ensure_availability_on_device() + else: + assert len(requests) == 0 + + for idx in self._reduction_idxs: + glob = self.arguments[idx].data + glob._data[:] = glob._buf + glob._availability_flag = AVAILABLE_ON_HOST_ONLY + # FIXME: Get rid of this H2D for CUDA-Aware MPI. + glob.ensure_availability_on_device() + + +class CUDABackend(AbstractComputeBackend): + Parloop_offloading = Parloop + Parloop_no_offloading = cpu_backend.Parloop + GlobalKernel_offloading = GlobalKernel + GlobalKernel_no_offloading = cpu_backend.GlobalKernel + + Parloop = cpu_backend.Parloop + GlobalKernel = cpu_backend.GlobalKernel + Set = Set + ExtrudedSet = ExtrudedSet + MixedSet = MixedSet + Subset = Subset + DataSet = DataSet + MixedDataSet = MixedDataSet + Map = Map + MixedMap = MixedMap + Dat = Dat + MixedDat = MixedDat + DatView = DatView + Mat = Mat + Global = Global + GlobalDataSet = GlobalDataSet + PETScVecType = PETSc.Vec.Type.CUDA + + def __init__(self): + self.offloading = False + + def turn_on_offloading(self): + self.offloading = True + self.Parloop = self.Parloop_offloading + self.GlobalKernel = self.GlobalKernel_offloading + + def turn_off_offloading(self): + self.offloading = False + self.Parloop = self.Parloop_no_offloading + self.GlobalKernel = self.GlobalKernel_no_offloading + + @property + def cache_key(self): + return (type(self), self.offloading) + + +cuda_backend = CUDABackend() From f0be1f4e2217f03d7984637bf09e38b4dc903dd1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 11 Jul 2022 21:30:48 -0500 Subject: [PATCH 08/11] Implements OpenCL backend --- pyop2/backends/opencl.py | 681 +++++++++++++++++++++++++++++++++++++++ pyop2/offload_utils.py | 2 + 2 files changed, 683 insertions(+) create mode 100644 pyop2/backends/opencl.py diff --git a/pyop2/backends/opencl.py b/pyop2/backends/opencl.py new file mode 100644 index 000000000..a041860cb --- /dev/null +++ b/pyop2/backends/opencl.py @@ -0,0 +1,681 @@ +"""OP2 OpenCL backend.""" + +import os +from hashlib import md5 +from contextlib import contextmanager + +from pyop2 import compilation, mpi, utils +from pyop2.offload_utils import (AVAILABLE_ON_HOST_ONLY, + AVAILABLE_ON_DEVICE_ONLY, + AVAILABLE_ON_BOTH, + DataAvailability) +from pyop2.configuration import configuration +from pyop2.types.set import (MixedSet, Subset as BaseSubset, + ExtrudedSet as BaseExtrudedSet, + Set) +from pyop2.types.map import Map as BaseMap, MixedMap +from pyop2.types.dat import Dat as BaseDat, MixedDat, DatView +from pyop2.types.dataset import DataSet, GlobalDataSet, MixedDataSet +from pyop2.types.mat import Mat +from pyop2.types.glob import Global as BaseGlobal +from pyop2.types.access import RW, READ, MIN, MAX, INC +from pyop2.parloop import AbstractParloop +from pyop2.global_kernel import AbstractGlobalKernel +from pyop2.backends import AbstractComputeBackend, cpu as cpu_backend +from petsc4py import PETSc + +import numpy +import loopy as lp +from pytools import memoize_method +from dataclasses import dataclass +from typing import Tuple +import ctypes + 
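+# A minimal usage sketch (assumptions: this mirrors the flow exercised in
+# test/unit/test_opencl_offloading.py; petsc4py must have been initialised
+# with -viennacl_backend opencl):
+#
+#     from pyop2 import op2
+#     from pyop2.backends.opencl import opencl_backend
+#
+#     op2.set_offloading_backend(opencl_backend)
+#     with op2.offloading():
+#         ...  # parloops and Dat/Global accesses here take the device path
+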
+import pyopencl as cl +import pyopencl.array as cla + + +def read_only_clarray_setitem(self, *args, **kwargs): + # emulates np.ndarray.setitem for numpy arrays with read-only flags + raise ValueError("assignment destination is read-only") + + +class Map(BaseMap): + + def __init__(self, *args, **kwargs): + super(Map, self).__init__(*args, **kwargs) + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + @utils.cached_property + def _opencl_values(self): + self._availability_flag = AVAILABLE_ON_BOTH + return cla.to_device(opencl_backend.queue, self._values) + + def get_availability(self): + return self._availability_flag + + def ensure_availability_on_device(self): + self._opencl_values + + def ensure_availability_on_host(self): + # Map once initialized is not over-written so always available + # on host. + pass + + @property + def _kernel_args_(self): + if opencl_backend.offloading: + if not self.is_available_on_device(): + self.ensure_availability_on_device() + + return (self._opencl_values.data,) + else: + return super(Map, self)._kernel_args_ + + +class ExtrudedSet(BaseExtrudedSet): + """ + ExtrudedSet for OpenCL. + """ + + def __init__(self, *args, **kwargs): + super(ExtrudedSet, self).__init__(*args, **kwargs) + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + @utils.cached_property + def opencl_layers_array(self): + self._availability_flag = AVAILABLE_ON_BOTH + return cla.to_device(opencl_backend.queue, self.layers_array) + + def get_availability(self): + return self._availability_flag + + def ensure_availability_on_device(self): + self.opencl_layers_array + + def ensure_availability_on_host(self): + # ExtrudedSet once initialized is not over-written so always available + # on host. + pass + + @property + def _kernel_args_(self): + if opencl_backend.offloading: + if not self.is_available_on_device(): + self.ensure_availability_on_device() + + return (self.opencl_layers_array.data,) + else: + return super(ExtrudedSet, self)._kernel_args_ + + +class Subset(BaseSubset): + """ + Subset for OpenCL. + """ + def __init__(self, *args, **kwargs): + super(Subset, self).__init__(*args, **kwargs) + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + def get_availability(self): + return self._availability_flag + + @utils.cached_property + def _opencl_indices(self): + self._availability_flag = AVAILABLE_ON_BOTH + return cla.to_device(opencl_backend.queue, self._indices) + + def ensure_availability_on_device(self): + self._opencl_indices + + def ensure_availability_on_host(self): + # Subset once initialized is not over-written so always available + # on host. + pass + + @property + def _kernel_args_(self): + if opencl_backend.offloading: + if not self.is_available_on_device(): + self.ensure_availability_on_device() + + return (self._opencl_indices.data,) + else: + return super(Subset, self)._kernel_args_ + + +class Dat(BaseDat): + """ + Dat for OpenCL. + """ + def __init__(self, *args, **kwargs): + super(Dat, self).__init__(*args, **kwargs) + # _availability_flag: only used when Dat cannot be represented as a + # petscvec; when Dat can be represented as a petscvec the availability + # flag is directly read from the petsc vec. + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + @utils.cached_property + def _opencl_data(self): + """ + Only used when the Dat's data cannot be represented as a petsc Vec. 
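+        (When it can be, the array returned here wraps the Vec's OpenCL
+        buffer obtained via getCLMemHandle rather than owning a separate
+        copy.)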
+ """ + self._availability_flag = AVAILABLE_ON_BOTH + if self.can_be_represented_as_petscvec: + with self.vec as petscvec: + return cla.Array(cq=opencl_backend.queue, + shape=self._data.shape, + dtype=self._data.dtype, + data=cl.Buffer.from_int_ptr( + petscvec.getCLMemHandle("r"), + retain=False), + strides=self._data.strides) + else: + return cla.to_device(opencl_backend.queue, self._data) + + def zero(self, subset=None): + if subset is None: + self.data[:] = 0 + self.halo_valid = True + else: + raise NotImplementedError + + def copy(self, other, subset=None): + raise NotImplementedError + + @utils.cached_property + def _vec(self): + assert self.dtype == PETSc.ScalarType, \ + "Can't create Vec with type %s, must be %s" % (self.dtype, + PETSc.ScalarType) + # Can't duplicate layout_vec of dataset, because we then + # carry around extra unnecessary data. + # But use getSizes to save an Allreduce in computing the + # global size. + size = self.dataset.layout_vec.getSizes() + data = self._data[:size[0]] + return PETSc.Vec().createViennaCLWithArrays(data, size=size, + bsize=self.cdim, + comm=self.comm) + + def get_availability(self): + if self.can_be_represented_as_petscvec: + return DataAvailability(self._vec.getOffloadMask()) + else: + return self._availability_flag + + def ensure_availability_on_device(self): + if self.can_be_represented_as_petscvec: + if not opencl_backend.offloading: + raise NotImplementedError("PETSc limitation: can ensure availability" + " on GPU only within an offloading" + " context.") + # perform a host->device transfer if needed + self._vec.getCLMemHandle("r") + else: + if not self.is_available_on_device(): + self._opencl_data.set(self._data) + self._availability_flag = AVAILABLE_ON_BOTH + + def ensure_availability_on_host(self): + if self.can_be_represented_as_petscvec: + # perform a device->host transfer if needed + self._vec.getArray(readonly=True) + else: + if not self.is_available_on_host(): + self._opencl_data.get(opencl_backend.queue, self._data) + self._availability_flag = AVAILABLE_ON_BOTH + + @contextmanager + def vec_context(self, access): + r"""A context manager for a :class:`PETSc.Vec` from a :class:`Dat`. + + :param access: Access descriptor: READ, WRITE, or RW.""" + # PETSc Vecs have a state counter and cache norm computations + # to return immediately if the state counter is unchanged. + # Since we've updated the data behind their back, we need to + # change that state counter. 
+        self._vec.stateIncrease()
+
+        if opencl_backend.offloading:
+            self.ensure_availability_on_device()
+            self._vec.bindToCPU(False)
+        else:
+            self.ensure_availability_on_host()
+            self._vec.bindToCPU(True)
+
+        yield self._vec
+
+        if access is not READ:
+            self.halo_valid = False
+
+    @property
+    def _kernel_args_(self):
+        if self.can_be_represented_as_petscvec:
+            if opencl_backend.offloading:
+                self.ensure_availability_on_device()
+                # tell petsc that we have updated the data in the CL buffer
+                with self.vec as v:
+                    v.getCLMemHandle()
+                    v.restoreCLMemHandle()
+
+                return (self._opencl_data.data,)
+            else:
+                self.ensure_availability_on_host()
+                # tell petsc that we have updated the data on the host
+                with self.vec as v:
+                    v.stateIncrease()
+                return (self._data.ctypes.data, )
+        else:
+            if opencl_backend.offloading:
+                self.ensure_availability_on_device()
+
+                self._availability_flag = AVAILABLE_ON_DEVICE_ONLY
+                return (self._opencl_data.data, )
+            else:
+                self.ensure_availability_on_host()
+
+                self._availability_flag = AVAILABLE_ON_HOST_ONLY
+                return (self._data.ctypes.data, )
+
+    @mpi.collective
+    @property
+    def data(self):
+        if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0:
+            raise RuntimeError("Illegal access: no data associated with this Dat!")
+
+        self.halo_valid = False
+
+        if opencl_backend.offloading:
+            self.ensure_availability_on_device()
+
+            # {{{ marking data on the host as invalid
+
+            if self.can_be_represented_as_petscvec:
+                # report to petsc that we are updating values on the device
+                with self.vec as petscvec:
+                    petscvec.getCLMemHandle("w")
+                    petscvec.restoreCLMemHandle()
+            else:
+                self._availability_flag = AVAILABLE_ON_DEVICE_ONLY
+
+            # }}}
+            v = self._opencl_data[:self.dataset.size]
+        else:
+            self.ensure_availability_on_host()
+            v = self._data[:self.dataset.size].view()
+            v.setflags(write=True)
+
+            # {{{ marking data on the device as invalid
+
+            if self.can_be_represented_as_petscvec:
+                # report to petsc that we are altering data on the CPU
+                with self.vec as petscvec:
+                    petscvec.stateIncrease()
+            else:
+                self._availability_flag = AVAILABLE_ON_HOST_ONLY
+
+            # }}}
+
+        return v
+
+    @property
+    @mpi.collective
+    def data_with_halos(self):
+        self.global_to_local_begin(RW)
+        self.global_to_local_end(RW)
+        return self.data
+
+    @property
+    @mpi.collective
+    def data_ro(self):
+        if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0:
+            raise RuntimeError("Illegal access: no data associated with this Dat!")
+
+        # N.B. read-only access leaves the halo validity untouched.
+        if opencl_backend.offloading:
+            self.ensure_availability_on_device()
+            v = self._opencl_data[:self.dataset.size].view()
+            v.setitem = read_only_clarray_setitem
+        else:
+            self.ensure_availability_on_host()
+            v = self._data[:self.dataset.size].view()
+            v.setflags(write=False)
+
+        return v
+
+    @property
+    @mpi.collective
+    def data_ro_with_halos(self):
+        self.global_to_local_begin(READ)
+        self.global_to_local_end(READ)
+
+        if opencl_backend.offloading:
+            self.ensure_availability_on_device()
+            v = self._opencl_data.view()
+            v.setitem = read_only_clarray_setitem
+        else:
+            self.ensure_availability_on_host()
+            v = self._data.view()
+            v.setflags(write=False)
+
+        return v
+
+
+class Global(BaseGlobal):
+    """
+    Global for OpenCL.
+ """ + + def __init__(self, *args, **kwargs): + super(Global, self).__init__(*args, **kwargs) + self._availability_flag = AVAILABLE_ON_HOST_ONLY + + @utils.cached_property + def _opencl_data(self): + self._availability_flag = AVAILABLE_ON_BOTH + return cla.to_device(opencl_backend.queue, self._data) + + def get_availability(self): + return self._availability_flag + + def ensure_availability_on_device(self): + if not self.is_available_on_device(): + self._opencl_data.set(ary=self._data, + queue=opencl_backend.queue) + self._availability_flag = AVAILABLE_ON_BOTH + + def ensure_availability_on_host(self): + if not self.is_available_on_host(): + self._opencl_data.get(ary=self._data, + queue=opencl_backend.queue) + self._availability_flag = AVAILABLE_ON_BOTH + + @property + def _kernel_args_(self): + if opencl_backend.offloading: + self.ensure_availability_on_device() + self._availability_flag = AVAILABLE_ON_DEVICE_ONLY + return (self._opencl_data.data,) + else: + self.ensure_availability_on_host() + self._availability_flag = AVAILABLE_ON_HOST_ONLY + return super(Global, self)._kernel_args_ + + @mpi.collective + @property + def data(self): + if len(self._data) == 0: + raise RuntimeError("Illegal access: No data associated with" + " this Global!") + + if opencl_backend.offloading: + self.ensure_availability_on_device() + self._availability_flag = AVAILABLE_ON_DEVICE_ONLY + return self._opencl_data + else: + self.ensure_availability_on_host() + self._availability_flag = AVAILABLE_ON_HOST_ONLY + return self._data + + @property + def data_ro(self): + if len(self._data) == 0: + raise RuntimeError("Illegal access: No data associated with" + " this Global!") + + if opencl_backend.offloading: + self.ensure_availability_on_device() + view = self._opencl_data.view() + view.setitem = read_only_clarray_setitem + return view + else: + self.ensure_availability_on_host() + view = self._data.view() + view.setflags(write=False) + return view + + @utils.cached_property + def _vec(self): + raise NotImplementedError() + + @contextmanager + def vec_context(self, access): + raise NotImplementedError() + + +@dataclass(frozen=True) +class CLKernelWithExtraArgs: + cl_kernel: cl.Kernel + args_to_append: Tuple[cla.Array, ...] 
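+    # `cl_kernel` is the compiled pyopencl.Kernel; `args_to_append` holds the
+    # device buffers for the array literals hoisted out of the local kernel
+    # (see GlobalKernel.get_extra_args below), appended to every launch.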
+
+    def __call__(self, grid, block, start, end, *args):
+        if all(grid + block):
+            self.cl_kernel(opencl_backend.queue,
+                           grid, block,
+                           numpy.int32(start),
+                           numpy.int32(end),
+                           *args,
+                           *[arg_to_append.data
+                             for arg_to_append in self.args_to_append],
+                           g_times_l=True)
+
+
+class GlobalKernel(AbstractGlobalKernel):
+    @classmethod
+    def _cache_key(cls, *args, **kwargs):
+        key = super()._cache_key(*args, **kwargs)
+        key = key + (configuration["gpu_strategy"], "opencl")
+        return key
+
+    @utils.cached_property
+    def encoded_cache_key(self):
+        return md5(str(self.cache_key[1:]).encode()).hexdigest()
+
+    @utils.cached_property
+    def computational_grid_expr_cache_file_path(self):
+        cachedir = configuration["cache_dir"]
+        return os.path.join(cachedir, f"{self.encoded_cache_key}_grid_params.py")
+
+    @utils.cached_property
+    def extra_args_cache_file_path(self):
+        cachedir = configuration["cache_dir"]
+        return os.path.join(cachedir, f"{self.encoded_cache_key}_extra_args.npz")
+
+    @memoize_method
+    def get_grid_size(self, start, end):
+        """
+        Returns a :class:`tuple` of the form ``(grid, block)``, where *grid*
+        is the number of OpenCL work-groups and *block* is the work-group
+        size used to launch the kernel over ``[start, end)``.
+        """
+        fpath = self.computational_grid_expr_cache_file_path
+
+        with open(fpath, "r") as f:
+            globals_dict = {}
+            exec(f.read(), globals_dict)
+            get_grid_sizes = globals_dict["get_grid_sizes"]
+
+        return get_grid_sizes(start=start, end=end)
+
+    @memoize_method
+    def get_extra_args(self):
+        """
+        Returns device buffers corresponding to array literals baked into
+        :attr:`local_kernel`.
+        """
+        fpath = self.extra_args_cache_file_path
+        npzfile = numpy.load(fpath)
+        assert npzfile["ids"].ndim == 1
+        assert len(npzfile.files) == len(npzfile["ids"]) + 1
+        extra_args_np = [npzfile[arg_id]
+                         for arg_id in npzfile["ids"]]
+        return tuple(cla.to_device(opencl_backend.queue, arg)
+                     for arg in extra_args_np)
+
+    @utils.cached_property
+    def argtypes(self):
+        result = super().argtypes
+        return result + (ctypes.c_voidp,) * len(self.get_extra_args())
+
+    @utils.cached_property
+    def code_to_compile(self):
+        from pyop2.codegen.rep2loopy import generate
+        from pyop2.transforms.gpu_utils import apply_gpu_transforms
+        from pymbolic.interop.ast import to_evaluatable_python_function
+
+        t_unit = generate(self.builder,
+                          include_math=False,
+                          include_petsc=False,
+                          include_complex=False)
+
+        # Turn temporary variables with initializers (array literals baked
+        # into the kernel) into kernel arguments; the initializer values come
+        # back as *extra_args* and are re-attached at launch time.
+ t_unit, extra_args = apply_gpu_transforms(t_unit, "opencl") + + ary_ids = [f"_op2_arg_{i}" + for i in range(len(extra_args))] + + numpy.savez(self.extra_args_cache_file_path, + ids=numpy.array(ary_ids), + **{ary_id: extra_arg + for ary_id, extra_arg in zip(ary_ids, extra_args)}) + + # {{{ save python code to get grid sizes + + with open(self.computational_grid_expr_cache_file_path, "w") as f: + glens, llens = (t_unit + .default_entrypoint + .get_grid_size_upper_bounds_as_exprs(t_unit + .callables_table)) + f.write(to_evaluatable_python_function((glens, llens), "get_grid_sizes")) + + code = lp.generate_code_v2(t_unit).device_code() + + # }}} + + return code + + @mpi.collective + def __call__(self, comm, *args): + key = id(comm) + try: + func = self._func_cache[key] + except KeyError: + func = self.compile(comm) + self._func_cache[key] = func + + grid, block = self.get_grid_size(args[0], args[1]) + func(grid, block, *args) + + @mpi.collective + def compile(self, comm): + cl_knl = compilation.get_opencl_kernel(comm, + self) + return CLKernelWithExtraArgs(cl_knl, self.get_extra_args()) + + +class Parloop(AbstractParloop): + @PETSc.Log.EventDecorator("ParLoopRednBegin") + @mpi.collective + def reduction_begin(self): + """Begin reductions.""" + requests = [] + for idx in self._reduction_idxs: + glob = self.arguments[idx].data + mpi_op = {INC: mpi.MPI.SUM, + MIN: mpi.MPI.MIN, + MAX: mpi.MPI.MAX}.get(self.accesses[idx]) + + if mpi.MPI.VERSION >= 3: + glob.ensure_availability_on_host() + requests.append(self.comm.Iallreduce(glob._data, + glob._buf, + op=mpi_op)) + else: + glob.ensure_availability_on_host() + self.comm.Allreduce(glob._data, glob._buf, op=mpi_op) + return tuple(requests) + + @PETSc.Log.EventDecorator("ParLoopRednEnd") + @mpi.collective + def reduction_end(self, requests): + """Finish reductions.""" + if mpi.MPI.VERSION >= 3: + mpi.MPI.Request.Waitall(requests) + for idx in self._reduction_idxs: + glob = self.arguments[idx].data + glob._data[:] = glob._buf + glob._availability_flag = AVAILABLE_ON_HOST_ONLY + glob.ensure_availability_on_device() + else: + assert len(requests) == 0 + + for idx in self._reduction_idxs: + glob = self.arguments[idx].data + glob._data[:] = glob._buf + glob._availability_flag = AVAILABLE_ON_HOST_ONLY + glob.ensure_availability_on_device() + + +class OpenCLBackend(AbstractComputeBackend): + Parloop_offloading = Parloop + Parloop_no_offloading = cpu_backend.Parloop + GlobalKernel_offloading = GlobalKernel + GlobalKernel_no_offloading = cpu_backend.GlobalKernel + + Parloop = cpu_backend.Parloop + GlobalKernel = cpu_backend.GlobalKernel + Set = Set + ExtrudedSet = ExtrudedSet + MixedSet = MixedSet + Subset = Subset + DataSet = DataSet + MixedDataSet = MixedDataSet + Map = Map + MixedMap = MixedMap + Dat = Dat + MixedDat = MixedDat + DatView = DatView + Mat = Mat + Global = Global + GlobalDataSet = GlobalDataSet + PETScVecType = PETSc.Vec.Type.VIENNACL + + def __init__(self): + self.offloading = False + + @utils.cached_property + def context(self): + # create a dummy vector and extract its underlying context + x = PETSc.Vec().create(PETSc.COMM_WORLD) + x.setType("viennacl") + x.setSizes(size=1) + ctx_ptr = x.getCLContextHandle() + return cl.Context.from_int_ptr(ctx_ptr, retain=False) + + @utils.cached_property + def queue(self): + # TODO: Instruct the user to pass + # -viennacl_backend opencl + # -viennacl_opencl_device_type gpu + # create a dummy vector and extract its associated command queue + x = PETSc.Vec().create(PETSc.COMM_WORLD) + x.setType("viennacl") 
+ x.setSizes(size=1) + queue_ptr = x.getCLQueueHandle() + return cl.CommandQueue.from_int_ptr(queue_ptr, retain=False) + + def turn_on_offloading(self): + self.offloading = True + self.Parloop = self.Parloop_offloading + self.GlobalKernel = self.GlobalKernel_offloading + + def turn_off_offloading(self): + self.offloading = False + self.Parloop = self.Parloop_no_offloading + self.GlobalKernel = self.GlobalKernel_no_offloading + + @property + def cache_key(self): + return (type(self), self.offloading) + + +opencl_backend = OpenCLBackend() diff --git a/pyop2/offload_utils.py b/pyop2/offload_utils.py index a60ee5c47..7ad1f1c16 100644 --- a/pyop2/offload_utils.py +++ b/pyop2/offload_utils.py @@ -22,9 +22,11 @@ def ensure_availaibility_on_device(self): raise NotImplementedError def is_available_on_host(self): + # bitwise op to detect both AVAILABLE_ON_HOST and AVAILABLE_ON_BOTH return bool(self.get_availability() & AVAILABLE_ON_HOST_ONLY) def is_available_on_device(self): + # bitwise op to detect both AVAILABLE_ON_DEVICE and AVAILABLE_ON_BOTH return bool(self.get_availability() & AVAILABLE_ON_DEVICE_ONLY) From 1857250e200801681995cbb54b1e3a409e69175e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 13 Jul 2022 13:45:44 -0500 Subject: [PATCH 09/11] adds a CI job: test_opencl --- .github/workflows/ci.yml | 66 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index bf9246734..f917f0312 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -104,3 +104,69 @@ jobs: jekyll: false env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + + test_opencl: + runs-on: ubuntu-latest + env: + CC: mpicc + PETSC_DIR: ${{ github.workspace }}/petsc + PETSC_ARCH: default + PETSC_CONFIGURE_OPTIONS: --with-debugging=1 --with-shared-libraries=1 --with-c2html=0 --with-fortran-bindings=0 --with-opencl=1 --download-viennacl --with-viennacl=1 + + steps: + - name: Install system dependencies + shell: bash + run: | + sudo apt update + sudo apt install build-essential mpich libmpich-dev \ + libblas-dev liblapack-dev gfortran + sudo apt install ocl-icd-opencl-dev pocl-opencl-icd + + - name: Set correct Python version + uses: actions/setup-python@v2 + with: + python-version: '3.10' + + - name: Clone PETSc + uses: actions/checkout@v2 + with: + repository: firedrakeproject/petsc + path: ${{ env.PETSC_DIR }} + + - name: Build and install PETSc + shell: bash + working-directory: ${{ env.PETSC_DIR }} + run: | + ./configure ${PETSC_CONFIGURE_OPTIONS} + make + + - name: Build and install petsc4py + shell: bash + working-directory: ${{ env.PETSC_DIR }}/src/binding/petsc4py + run: | + python -m pip install --upgrade cython numpy + python -m pip install --no-deps . + + - name: Checkout PyOP2 + uses: actions/checkout@v2 + with: + path: PyOP2 + + - name: Install PyOP2 + shell: bash + working-directory: PyOP2 + run: | + python -m pip install pip==20.2 # pip 20.2 needed for loopy install to work. + + # xargs is used to force installation of requirements in the order we specified. + xargs -l1 python -m pip install < requirements-ext.txt + xargs -l1 python -m pip install < requirements-git.txt + python -m pip install pulp + python -m pip install -U flake8 + python -m pip install pyopencl + python -m pip install . 
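+          # pyopencl installed above talks to the pocl CPU driver from the
+          # system-dependencies step, so this job runs without a GPU.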
+ + - name: Run tests + shell: bash + working-directory: PyOP2 + run: pytest test -v --tb=native From c57d6ca12ab8df70c79265bc7d27c2ae1fca1325 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Wed, 13 Jul 2022 14:08:36 -0500 Subject: [PATCH 10/11] adds test_opencl_offloading --- test/unit/test_opencl_offloading.py | 119 ++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 test/unit/test_opencl_offloading.py diff --git a/test/unit/test_opencl_offloading.py b/test/unit/test_opencl_offloading.py new file mode 100644 index 000000000..b39f11a1c --- /dev/null +++ b/test/unit/test_opencl_offloading.py @@ -0,0 +1,119 @@ +import pytest +pytest.importorskip("pyopencl") + +import sys +import petsc4py +petsc4py.init(sys.argv + + "-viennacl_backend opencl".split() + + "-viennacl_opencl_device_type cpu".split()) +from pyop2 import op2 +import pyopencl.array as cla +import numpy as np +import loopy as lp +lp.set_caching_enabled(False) + + +def allclose(a, b, rtol=1e-05, atol=1e-08): + """ + Prefer this routine over np.allclose(...) to allow pycuda/pyopencl arrays + """ + return bool(abs(a - b) < (atol + rtol * abs(b))) + + +def pytest_generate_tests(metafunc): + if "backend" in metafunc.fixturenames: + from pyop2.backends.opencl import opencl_backend + metafunc.parametrize("backend", [opencl_backend]) + + +def test_new_backend_raises_not_implemented_error(): + from pyop2.backends import AbstractComputeBackend + unimplemented_backend = AbstractComputeBackend() + + attrs = ["GlobalKernel", "Parloop", "Set", "ExtrudedSet", "MixedSet", + "Subset", "DataSet", "MixedDataSet", "Map", "MixedMap", "Dat", + "MixedDat", "DatView", "Mat", "Global", "GlobalDataSet", + "PETScVecType"] + + for attr in attrs: + with pytest.raises(NotImplementedError): + getattr(unimplemented_backend, attr) + + +def test_dat_with_petscvec_representation(backend): + op2.set_offloading_backend(backend) + + nelems = 9 + data = np.random.rand(nelems) + set_ = op2.compute_backend.Set(nelems) + dset = op2.compute_backend.DataSet(set_, 1) + dat = op2.compute_backend.Dat(dset, data.copy()) + + assert isinstance(dat.data_ro, np.ndarray) + dat.data[:] *= 3 + + with op2.offloading(): + assert isinstance(dat.data_ro, cla.Array) + dat.data[:] *= 2 + + assert isinstance(dat.data_ro, np.ndarray) + np.testing.assert_allclose(dat.data_ro, 6*data) + + +def test_dat_not_as_petscvec(backend): + op2.set_offloading_backend(backend) + + nelems = 9 + data = np.random.randint(low=-10, high=10, + size=nelems, + dtype=np.int64) + set_ = op2.compute_backend.Set(nelems) + dset = op2.compute_backend.DataSet(set_, 1) + dat = op2.compute_backend.Dat(dset, data.copy()) + + assert isinstance(dat.data_ro, np.ndarray) + dat.data[:] *= 3 + + with op2.offloading(): + assert isinstance(dat.data_ro, cla.Array) + dat.data[:] *= 2 + + assert isinstance(dat.data_ro, np.ndarray) + np.testing.assert_allclose(dat.data_ro, 6*data) + + +def test_global_reductions(backend): + op2.set_offloading_backend(backend) + + sum_knl = lp.make_function( + "{ : }", + """ + g[0] = g[0] + x[0] + """, + [lp.GlobalArg("g,x", + dtype="float64", + shape=lp.auto, + )], + name="sum_knl", + target=lp.CWithGNULibcTarget(), + lang_version=(2018, 2)) + + rng = np.random.default_rng() + nelems = 4_000 + data_to_sum = rng.random(nelems) + + with op2.offloading(): + + set_ = op2.compute_backend.Set(4_000) + dset = op2.compute_backend.DataSet(set_, 1) + + dat = op2.compute_backend.Dat(dset, data_to_sum.copy()) + glob = op2.compute_backend.Global(1, 0, np.float64, "g") + + 
op2.parloop(op2.Kernel(sum_knl, "sum_knl"), + set_, + glob(op2.INC), + dat(op2.READ)) + + assert isinstance(glob.data_ro, cla.Array) + assert allclose(glob.data_ro[0], data_to_sum.sum()) From e869afc89a624b5c2afe7413f7f9ced4d2a87574 Mon Sep 17 00:00:00 2001 From: Connor Ward Date: Thu, 2 Mar 2023 12:15:54 +0100 Subject: [PATCH 11/11] Introduce MirroredArray abstraction --- pyop2/array.py | 274 ++++++++++++++++++ pyop2/backends/__init__.py | 43 --- pyop2/backends/cpu.py | 40 +-- pyop2/backends/opencl.py | 313 +-------------------- pyop2/codegen/rep2loopy.py | 1 + pyop2/compilation.py | 2 +- pyop2/configuration.py | 2 + pyop2/global_kernel.py | 179 +++++++++++- pyop2/offload_utils.py | 55 ++-- pyop2/op2.py | 12 +- pyop2/parloop.py | 77 +++-- pyop2/types/dat.py | 434 +++++++++++++++++++---------- pyop2/types/data_carrier.py | 88 +----- pyop2/types/dataset.py | 12 +- pyop2/types/glob.py | 191 +++++++++---- pyop2/types/map.py | 42 +-- pyop2/types/mat.py | 7 +- pyop2/types/set.py | 119 +++++--- test/unit/test_api.py | 13 +- test/unit/test_global_reduction.py | 2 +- test/unit/test_globals.py | 2 +- test/unit/test_indirect_loop.py | 14 +- test/unit/test_linalg.py | 40 +-- test/unit/test_linalg_complex.py | 64 ++--- test/unit/test_matrices.py | 7 - test/unit/test_petsc.py | 4 +- test/unit/test_subset.py | 20 +- 27 files changed, 1159 insertions(+), 898 deletions(-) create mode 100644 pyop2/array.py diff --git a/pyop2/array.py b/pyop2/array.py new file mode 100644 index 000000000..2126cf436 --- /dev/null +++ b/pyop2/array.py @@ -0,0 +1,274 @@ +import abc +import collections +import enum + +import numpy as np +from petsc4py import PETSc + +from pyop2.configuration import configuration +from pyop2.datatypes import ScalarType +from pyop2.exceptions import DataTypeError +from pyop2.offload_utils import _offloading as offloading, _backend as backend, OffloadingBackend +from pyop2.types.access import READ, RW, WRITE, INC, MIN, MAX +from pyop2.utils import verify_reshape + +try: + import pyopencl +except ImportError: + pass + +try: + import pycuda +except ImportError: + pass + + +class MirroredArray(abc.ABC): + """An array that is available on both host and device, copying values where necessary.""" + + # TODO: Use __new__ instead? 
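+    # Factory dispatch: `new` hands back the concrete subclass for the
+    # currently selected offloading backend (CPUOnlyArray, OpenCLArray or
+    # CUDAArray below), e.g. MirroredArray.new(np.zeros(10)) or
+    # MirroredArray.new(((10,), np.dtype("float64"))).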
+ @classmethod + def new(cls, *args, **kwargs): + if backend == OffloadingBackend.CPU: + return CPUOnlyArray(*args, **kwargs) + elif backend == OffloadingBackend.OPENCL: + return OpenCLArray(*args, **kwargs) + elif backend == OffloadingBackend.CUDA: + return CUDAArray(*args, **kwargs) + else: + raise ValueError + + def __init__(self, data): + # option 1: passed shape and dtype + if (len(data) == 2 and isinstance(data[0], collections.abc.Collection) + and isinstance(data[1], np.dtype)): + self.shape = data[0] + self.dtype = data[1] + self._lazy_host_data = None + # option 2: passed an array + elif isinstance(data, np.ndarray): + self.shape = data.shape + self.dtype = data.dtype + self._lazy_host_data = data + else: + raise ValueError("Unexpected arguments encountered") + + # for now assume we are only ever passed numpy arrays (not device arrays) + self._lazy_device_data = None + self.is_available_on_host = True + self.is_available_on_device = False + + # counter used to keep track of modifications + self.state = 0 + + @property + def data(self): + return self.device_data if offloading else self.host_data + + @property + def data_ro(self): + return self.device_data_ro if offloading else self.host_data_ro + + @property + def host_data(self): + return self.host_data_rw + + @property + def host_data_rw(self): + self.state += 1 + + if not self.is_available_on_host: + self.device_to_host_copy() + self.is_available_on_host = True + + # modifying on host invalidates the device + self.is_available_on_device = False + + data = self._host_data.view() + data.setflags(write=True) + return data + + @property + def host_data_ro(self): + if not self.is_available_on_host: + self.device_to_host_copy() + self.is_available_on_host = True + + data = self._host_data.view() + data.setflags(write=False) + return data + + @property + def host_data_wo(self): + self.state += 1 # TODO make a decorator + + # if we are writing then we don't need to copy from the device + pass + + # modifying on host invalidates the device + self.is_available_on_host = True + self.is_available_on_device = False + + data = self._host_data.view() + data.setflags(write=True) + return data + + @property + def device_data(self): + return self.device_data_rw + + @property + @abc.abstractmethod + def device_data_rw(self): + pass + + @property + @abc.abstractmethod + def device_data_ro(self): + pass + + @property + @abc.abstractmethod + def device_data_wo(self): + pass + + @property + def ptr_rw(self): + return self.device_ptr_rw if offloading else self.host_ptr_rw + + @property + def ptr_ro(self): + return self.device_ptr_ro if offloading else self.host_ptr_ro + + @property + def ptr_wo(self): + return self.device_ptr_wo if offloading else self.host_ptr_wo + + @property + def host_ptr_rw(self): + return self.host_data_rw.ctypes.data + + @property + def host_ptr_ro(self): + return self.host_data_ro.ctypes.data + + @property + def host_ptr_wo(self): + return self.host_data_wo.ctypes.data + + @property + @abc.abstractmethod + def device_ptr_rw(self): + pass + + @property + @abc.abstractmethod + def device_ptr_ro(self): + pass + + @property + @abc.abstractmethod + def device_ptr_wo(self): + pass + + @abc.abstractmethod + def host_to_device_copy(self): + pass + + @abc.abstractmethod + def device_to_host_copy(self): + pass + + @abc.abstractmethod + def alloc_device_data(self, shape, dtype): + pass + + @property + def size(self): + return self.data_ro.size + + @property + def _host_data(self): + if self._lazy_host_data is None: + self._lazy_host_data 
= np.zeros(self.shape, self.dtype) + return self._lazy_host_data + + @property + def _device_data(self): + if self._lazy_device_data is None: + self._lazy_device_data = self.alloc_device_data(self.shape, self.dtype) + return self._lazy_device_data + + +class CPUOnlyArray(MirroredArray): + """TODO""" + + @property + def device_data_rw(self): + return self.host_data_rw + + @property + def device_data_ro(self): + return self.host_data_ro + + @property + def device_data_wo(self): + return self.host_data_wo + + @property + def device_ptr_rw(self): + return self.host_ptr_rw + + @property + def device_ptr_ro(self): + return self.host_ptr_ro + + @property + def device_ptr_wo(self): + return self.host_ptr_wo + + def host_to_device_copy(self): + pass + + def device_to_host_copy(self): + pass + + def alloc_device_data(self, shape, dtype): + raise AssertionError("Should not hit this") + + +class OpenCLArray(MirroredArray): + + @property + def device_data_rw(self): + raise NotImplementedError + + @property + def device_data_ro(self): + raise NotImplementedError + + @property + def device_data_wo(self): + raise NotImplementedError + + @property + def device_ptr_rw(self): + raise NotImplementedError + + @property + def device_ptr_ro(self): + raise NotImplementedError + + @property + def device_ptr_wo(self): + raise NotImplementedError + + def host_to_device_copy(self): + self._device_data.set(self._host_data) + + def device_to_host_copy(self): + # FIXME: self.queue not a thing + self._device_data.get(self.queue, self._host_data) + + def alloc_device_data(self, shape, dtype): + raise NotImplementedError("TODO") diff --git a/pyop2/backends/__init__.py b/pyop2/backends/__init__.py index 680ad9d3a..e69de29bb 100644 --- a/pyop2/backends/__init__.py +++ b/pyop2/backends/__init__.py @@ -1,43 +0,0 @@ -class _not_implemented: # noqa - """Not Implemented""" - - -class AbstractComputeBackend: - """ - Abstract class to record all the backend specific implementation of - :mod:`pyop2`'s data structures. 
- """ - GlobalKernel = _not_implemented - Parloop = _not_implemented - Set = _not_implemented - ExtrudedSet = _not_implemented - MixedSet = _not_implemented - Subset = _not_implemented - DataSet = _not_implemented - MixedDataSet = _not_implemented - Map = _not_implemented - MixedMap = _not_implemented - Dat = _not_implemented - MixedDat = _not_implemented - DatView = _not_implemented - Mat = _not_implemented - Global = _not_implemented - GlobalDataSet = _not_implemented - PETScVecType = _not_implemented - - def __getattribute__(self, key): - val = super().__getattribute__(key) - if val is _not_implemented: - raise NotImplementedError(f"'{val}' is not implemented for backend" - f" '{type(self).__name__}'.") - return val - - def turn_on_offloading(self): - raise NotImplementedError() - - def turn_off_offloading(self): - raise NotImplementedError() - - @property - def cache_key(self): - raise NotImplementedError() diff --git a/pyop2/backends/cpu.py b/pyop2/backends/cpu.py index ce79c5262..b8581c6f4 100644 --- a/pyop2/backends/cpu.py +++ b/pyop2/backends/cpu.py @@ -110,6 +110,7 @@ def code_to_compile(self): """Return the C/C++ source code as a string.""" from pyop2.codegen.rep2loopy import generate + # import pdb; pdb.set_trace() wrapper = generate(self.builder) code = lp.generate_code_v2(wrapper) @@ -162,11 +163,11 @@ def reduction_begin(self): MAX: mpi.MPI.MAX}.get(self.accesses[idx]) if mpi.MPI.VERSION >= 3: - requests.append(self.comm.Iallreduce(glob._data, + requests.append(self.comm.Iallreduce(glob.data, glob._buf, op=mpi_op)) else: - self.comm.Allreduce(glob._data, glob._buf, op=mpi_op) + self.comm.Allreduce(glob.data, glob._buf, op=mpi_op) return tuple(requests) @PETSc.Log.EventDecorator("ParLoopRednEnd") @@ -177,43 +178,10 @@ def reduction_end(self, requests): mpi.MPI.Request.Waitall(requests) for idx in self._reduction_idxs: glob = self.arguments[idx].data - glob._data[:] = glob._buf + glob._array.data[:] = glob._buf else: assert len(requests) == 0 for idx in self._reduction_idxs: glob = self.arguments[idx].data glob._data[:] = glob._buf - - -class CPUBackend(AbstractComputeBackend): - GlobalKernel = GlobalKernel - Parloop = Parloop - Set = Set - ExtrudedSet = ExtrudedSet - MixedSet = MixedSet - Subset = Subset - DataSet = DataSet - MixedDataSet = MixedDataSet - Map = Map - MixedMap = MixedMap - Dat = Dat - MixedDat = MixedDat - DatView = DatView - Mat = Mat - Global = Global - GlobalDataSet = GlobalDataSet - PETScVecType = PETSc.Vec.Type.STANDARD - - def turn_on_offloading(self): - pass - - def turn_off_offloading(self): - pass - - @property - def cache_key(self): - return (type(self),) - - -cpu_backend = CPUBackend() diff --git a/pyop2/backends/opencl.py b/pyop2/backends/opencl.py index a041860cb..2df3073e2 100644 --- a/pyop2/backends/opencl.py +++ b/pyop2/backends/opencl.py @@ -10,16 +10,16 @@ AVAILABLE_ON_BOTH, DataAvailability) from pyop2.configuration import configuration -from pyop2.types.set import (MixedSet, Subset as BaseSubset, - ExtrudedSet as BaseExtrudedSet, +from pyop2.types.set import (MixedSet, Subset, + ExtrudedSet, Set) -from pyop2.types.map import Map as BaseMap, MixedMap +from pyop2.types.map import Map, MixedMap from pyop2.types.dat import Dat as BaseDat, MixedDat, DatView from pyop2.types.dataset import DataSet, GlobalDataSet, MixedDataSet from pyop2.types.mat import Mat from pyop2.types.glob import Global as BaseGlobal from pyop2.types.access import RW, READ, MIN, MAX, INC -from pyop2.parloop import AbstractParloop +from pyop2.parloop import Parloop from 
pyop2.global_kernel import AbstractGlobalKernel from pyop2.backends import AbstractComputeBackend, cpu as cpu_backend from petsc4py import PETSc @@ -40,110 +40,6 @@ def read_only_clarray_setitem(self, *args, **kwargs): raise ValueError("assignment destination is read-only") -class Map(BaseMap): - - def __init__(self, *args, **kwargs): - super(Map, self).__init__(*args, **kwargs) - self._availability_flag = AVAILABLE_ON_HOST_ONLY - - @utils.cached_property - def _opencl_values(self): - self._availability_flag = AVAILABLE_ON_BOTH - return cla.to_device(opencl_backend.queue, self._values) - - def get_availability(self): - return self._availability_flag - - def ensure_availability_on_device(self): - self._opencl_values - - def ensure_availability_on_host(self): - # Map once initialized is not over-written so always available - # on host. - pass - - @property - def _kernel_args_(self): - if opencl_backend.offloading: - if not self.is_available_on_device(): - self.ensure_availability_on_device() - - return (self._opencl_values.data,) - else: - return super(Map, self)._kernel_args_ - - -class ExtrudedSet(BaseExtrudedSet): - """ - ExtrudedSet for OpenCL. - """ - - def __init__(self, *args, **kwargs): - super(ExtrudedSet, self).__init__(*args, **kwargs) - self._availability_flag = AVAILABLE_ON_HOST_ONLY - - @utils.cached_property - def opencl_layers_array(self): - self._availability_flag = AVAILABLE_ON_BOTH - return cla.to_device(opencl_backend.queue, self.layers_array) - - def get_availability(self): - return self._availability_flag - - def ensure_availability_on_device(self): - self.opencl_layers_array - - def ensure_availability_on_host(self): - # ExtrudedSet once initialized is not over-written so always available - # on host. - pass - - @property - def _kernel_args_(self): - if opencl_backend.offloading: - if not self.is_available_on_device(): - self.ensure_availability_on_device() - - return (self.opencl_layers_array.data,) - else: - return super(ExtrudedSet, self)._kernel_args_ - - -class Subset(BaseSubset): - """ - Subset for OpenCL. - """ - def __init__(self, *args, **kwargs): - super(Subset, self).__init__(*args, **kwargs) - self._availability_flag = AVAILABLE_ON_HOST_ONLY - - def get_availability(self): - return self._availability_flag - - @utils.cached_property - def _opencl_indices(self): - self._availability_flag = AVAILABLE_ON_BOTH - return cla.to_device(opencl_backend.queue, self._indices) - - def ensure_availability_on_device(self): - self._opencl_indices - - def ensure_availability_on_host(self): - # Subset once initialized is not over-written so always available - # on host. - pass - - @property - def _kernel_args_(self): - if opencl_backend.offloading: - if not self.is_available_on_device(): - self.ensure_availability_on_device() - - return (self._opencl_indices.data,) - else: - return super(Subset, self)._kernel_args_ - - class Dat(BaseDat): """ Dat for OpenCL. 
@@ -173,16 +69,6 @@ def _opencl_data(self): else: return cla.to_device(opencl_backend.queue, self._data) - def zero(self, subset=None): - if subset is None: - self.data[:] = 0 - self.halo_valid = True - else: - raise NotImplementedError - - def copy(self, other, subset=None): - raise NotImplementedError - @utils.cached_property def _vec(self): assert self.dtype == PETSc.ScalarType, \ @@ -249,121 +135,6 @@ def vec_context(self, access): if access is not READ: self.halo_valid = False - @property - def _kernel_args_(self): - if self.can_be_represented_as_petscvec: - if opencl_backend.offloading: - self.ensure_availability_on_device() - # tell petsc that we have updated the data in the CL buffer - with self.vec as v: - v.getCLMemHandle() - v.restoreCLMemHandle() - - return (self._opencl_data.data,) - else: - self.ensure_availability_on_host() - # tell petsc that we have updated the data on the host - with self.vec as v: - v.stateIncrease() - return (self._data.ctypes.data, ) - else: - if opencl_backend.offloading: - self.ensure_availability_on_device() - - self._availability_flag = AVAILABLE_ON_DEVICE_ONLY - return (self._opencl_data.data, ) - else: - self.ensure_availability_on_host() - - self._availability_flag = AVAILABLE_ON_HOST_ONLY - return (self._data.ctypes.data, ) - - @mpi.collective - @property - def data(self): - if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0: - raise RuntimeError("Illegal access: no data associated with this Dat!") - - self.halo_valid = False - - if opencl_backend.offloading: - - self.ensure_availability_on_device() - - # {{{ marking data on the host as invalid - - if self.can_be_represented_as_petscvec: - # report to petsc that we are updating values on the device - with self.vec as petscvec: - petscvec.getCLMemHandle("w") - petscvec.restoreCLMemHandle() - else: - self._availability_flag = AVAILABLE_ON_DEVICE_ONLY - - # }}} - v = self._opencl_data[:self.dataset.size] - else: - self.ensure_availability_on_host() - v = self._data[:self.dataset.size].view() - v.setflags(write=True) - - # {{{ marking data on the device as invalid - - if self.can_be_represented_as_petscvec: - # report to petsc that we are altering data on the CPU - with self.vec as petscvec: - petscvec.stateIncrease() - else: - self._availability_flag = AVAILABLE_ON_HOST_ONLY - - # }}} - - return v - - @property - @mpi.collective - def data_with_halos(self): - self.global_to_local_begin(RW) - self.global_to_local_end(RW) - return self.data - - @property - @mpi.collective - def data_ro(self): - - if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0: - raise RuntimeError("Illegal access: no data associated with this Dat!") - - self.halo_valid = False - - if opencl_backend.offloading: - self.ensure_availability_on_device() - v = self._opencl_data[:self.dataset.size].view() - v.setitem = read_only_clarray_setitem - else: - self.ensure_availability_on_host() - v = self._data[:self.dataset.size].view() - v.setflags(write=False) - - return v - - @property - @mpi.collective - def data_ro_with_halos(self): - self.global_to_local_begin(READ) - self.global_to_local_end(READ) - - if opencl_backend.offloading: - self.ensure_availability_on_device() - v = self._opencl_data.view() - v.setitem = read_only_clarray_setitem - else: - self.ensure_availability_on_host() - v = self._data.view() - v.setflags(write=False) - - return v - class Global(BaseGlobal): """ @@ -405,39 +176,6 @@ def _kernel_args_(self): self._availability_flag = AVAILABLE_ON_HOST_ONLY return 
super(Global, self)._kernel_args_ - @mpi.collective - @property - def data(self): - if len(self._data) == 0: - raise RuntimeError("Illegal access: No data associated with" - " this Global!") - - if opencl_backend.offloading: - self.ensure_availability_on_device() - self._availability_flag = AVAILABLE_ON_DEVICE_ONLY - return self._opencl_data - else: - self.ensure_availability_on_host() - self._availability_flag = AVAILABLE_ON_HOST_ONLY - return self._data - - @property - def data_ro(self): - if len(self._data) == 0: - raise RuntimeError("Illegal access: No data associated with" - " this Global!") - - if opencl_backend.offloading: - self.ensure_availability_on_device() - view = self._opencl_data.view() - view.setitem = read_only_clarray_setitem - return view - else: - self.ensure_availability_on_host() - view = self._data.view() - view.setflags(write=False) - return view - @utils.cached_property def _vec(self): raise NotImplementedError() @@ -572,49 +310,6 @@ def compile(self, comm): return CLKernelWithExtraArgs(cl_knl, self.get_extra_args()) -class Parloop(AbstractParloop): - @PETSc.Log.EventDecorator("ParLoopRednBegin") - @mpi.collective - def reduction_begin(self): - """Begin reductions.""" - requests = [] - for idx in self._reduction_idxs: - glob = self.arguments[idx].data - mpi_op = {INC: mpi.MPI.SUM, - MIN: mpi.MPI.MIN, - MAX: mpi.MPI.MAX}.get(self.accesses[idx]) - - if mpi.MPI.VERSION >= 3: - glob.ensure_availability_on_host() - requests.append(self.comm.Iallreduce(glob._data, - glob._buf, - op=mpi_op)) - else: - glob.ensure_availability_on_host() - self.comm.Allreduce(glob._data, glob._buf, op=mpi_op) - return tuple(requests) - - @PETSc.Log.EventDecorator("ParLoopRednEnd") - @mpi.collective - def reduction_end(self, requests): - """Finish reductions.""" - if mpi.MPI.VERSION >= 3: - mpi.MPI.Request.Waitall(requests) - for idx in self._reduction_idxs: - glob = self.arguments[idx].data - glob._data[:] = glob._buf - glob._availability_flag = AVAILABLE_ON_HOST_ONLY - glob.ensure_availability_on_device() - else: - assert len(requests) == 0 - - for idx in self._reduction_idxs: - glob = self.arguments[idx].data - glob._data[:] = glob._buf - glob._availability_flag = AVAILABLE_ON_HOST_ONLY - glob.ensure_availability_on_device() - - class OpenCLBackend(AbstractComputeBackend): Parloop_offloading = Parloop Parloop_no_offloading = cpu_backend.Parloop diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index 3510b1d38..1fdb04fda 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -529,6 +529,7 @@ def renamer(expr): assumptions = assumptions & pwaffd[parameters.layer_start].le_set(pwaffd[parameters.layer_end]) assumptions = reduce(operator.and_, assumptions.get_basic_sets()) + # import pdb; pdb.set_trace() wrapper = loopy.make_kernel(domains, statements, kernel_data=parameters.kernel_data, diff --git a/pyop2/compilation.py b/pyop2/compilation.py index 9dbb91dfa..3449f9df1 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -587,7 +587,7 @@ def load(jitmodule, extension, fn_name, cppargs=(), ldargs=(), :kwarg comm: Optional communicator to compile the code on (only rank 0 compiles code) (defaults to pyop2.mpi.COMM_WORLD). 
""" - from pyop2.global_kernel import AbstractGlobalKernel as GlobalKernel + from pyop2.global_kernel import GlobalKernel if isinstance(jitmodule, str): class StrCode(object): diff --git a/pyop2/configuration.py b/pyop2/configuration.py index a98cad124..07681cc1c 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -84,6 +84,8 @@ class Configuration(dict): # name, env variable, type, default, write once cache_dir = os.path.join(gettempdir(), "pyop2-cache-uid%s" % os.getuid()) DEFAULTS = { + "backend": + ("PYOP2_BACKEND", str, "CPU_ONLY"), "cc": ("PYOP2_CC", str, ""), "cxx": diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index fd2ad1409..f0e39c8e5 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -1,17 +1,21 @@ -import collections.abc -import ctypes import abc +import collections +import ctypes +import os from dataclasses import dataclass import itertools from typing import Optional, Tuple +import loopy as lp import numpy as np +from petsc4py import PETSc -from pyop2 import mpi +from pyop2 import compilation, mpi, utils from pyop2.caching import Cached from pyop2.configuration import configuration from pyop2.datatypes import IntType, as_ctypes from pyop2.types import IterationRegion +from pyop2.offload_utils import OffloadingBackend, _backend as backend, _offloading as offloading from pyop2.utils import cached_property @@ -224,7 +228,7 @@ def pack(self): return MatPack -class AbstractGlobalKernel(Cached, abc.ABC): +class GlobalKernel(Cached, abc.ABC): """Class representing the generated code for the global computation. :param local_kernel: :class:`pyop2.LocalKernel` instance representing the @@ -264,6 +268,16 @@ def _cache_key(cls, local_knl, arguments, **kwargs): return tuple(key) + @classmethod + def new(cls, *args, **kwargs): + if offloading: + if backend == OffloadingBackend.CPU: + return CPUGlobalKernel(*args, **kwargs) + else: + raise NotImplementedError + else: + return CPUGlobalKernel(*args, **kwargs) + def __init__(self, local_kernel, arguments, *, extruded=False, extruded_periodic=False, @@ -374,3 +388,160 @@ def code_to_compile(self): @abc.abstractmethod def compile(self): pass + + +class CPUGlobalKernel(GlobalKernel): + + @cached_property + def code_to_compile(self): + """Return the C/C++ source code as a string.""" + from pyop2.codegen.rep2loopy import generate + + wrapper = generate(self.builder) + code = lp.generate_code_v2(wrapper) + + if self.local_kernel.cpp: + from loopy.codegen.result import process_preambles + preamble = "".join( + process_preambles(getattr(code, "device_preambles", []))) + device_code = "\n\n".join(str(dp.ast) for dp in code.device_programs) + return preamble + '\nextern "C" {\n' + device_code + "\n}\n" + return code.device_code() + + @PETSc.Log.EventDecorator() + @mpi.collective + def compile(self, comm): + """Compile the kernel. + + :arg comm: The communicator the compilation is collective over. + :returns: A ctypes function pointer for the compiled function. 
+        """
+        extension = "cpp" if self.local_kernel.cpp else "c"
+        cppargs = (
+            tuple("-I%s/include" % d for d in utils.get_petsc_dir())
+            + tuple("-I%s" % d for d in self.local_kernel.include_dirs)
+            + ("-I%s" % os.path.abspath(os.path.dirname(__file__)),)
+        )
+        ldargs = (
+            tuple("-L%s/lib" % d for d in utils.get_petsc_dir())
+            + tuple("-Wl,-rpath,%s/lib" % d for d in utils.get_petsc_dir())
+            + ("-lpetsc", "-lm")
+            + tuple(self.local_kernel.ldargs)
+        )
+
+        return compilation.load(self, extension, self.name,
+                                cppargs=cppargs,
+                                ldargs=ldargs,
+                                restype=ctypes.c_int,
+                                comm=comm)
+
+
+class OpenCLGlobalKernel(GlobalKernel):
+    @classmethod
+    def _cache_key(cls, *args, **kwargs):
+        key = super()._cache_key(*args, **kwargs)
+        key = key + (configuration["gpu_strategy"], "opencl")
+        return key
+
+    @utils.cached_property
+    def encoded_cache_key(self):
+        return md5(str(self.cache_key[1:]).encode()).hexdigest()
+
+    @utils.cached_property
+    def computational_grid_expr_cache_file_path(self):
+        cachedir = configuration["cache_dir"]
+        return os.path.join(cachedir, f"{self.encoded_cache_key}_grid_params.py")
+
+    @utils.cached_property
+    def extra_args_cache_file_path(self):
+        cachedir = configuration["cache_dir"]
+        return os.path.join(cachedir, f"{self.encoded_cache_key}_extra_args.npz")
+
+    # @memoize_method
+    def get_grid_size(self, start, end):
+        fpath = self.computational_grid_expr_cache_file_path
+
+        with open(fpath, "r") as f:
+            globals_dict = {}
+            exec(f.read(), globals_dict)
+            get_grid_sizes = globals_dict["get_grid_sizes"]
+
+        return get_grid_sizes(start=start, end=end)
+
+    # @memoize_method
+    def get_extra_args(self):
+        """
+        Returns device buffers corresponding to array literals baked into
+        :attr:`local_kernel`.
+        """
+        fpath = self.extra_args_cache_file_path
+        npzfile = np.load(fpath)
+        assert npzfile["ids"].ndim == 1
+        assert len(npzfile.files) == len(npzfile["ids"]) + 1
+        extra_args_np = [npzfile[arg_id]
+                         for arg_id in npzfile["ids"]]
+        return tuple(cla.to_device(opencl_backend.queue, arg)
+                     for arg in extra_args_np)
+
+    @utils.cached_property
+    def argtypes(self):
+        result = super().argtypes
+        return result + (ctypes.c_voidp,) * len(self.get_extra_args())
+
+    @utils.cached_property
+    def code_to_compile(self):
+        from pyop2.codegen.rep2loopy import generate
+        from pyop2.transforms.gpu_utils import apply_gpu_transforms
+        from pymbolic.interop.ast import to_evaluatable_python_function
+
+        t_unit = generate(self.builder,
+                          include_math=False,
+                          include_petsc=False,
+                          include_complex=False)
+
+        # Turn temporary variables with initialisers into kernel arguments.
+        t_unit, extra_args = apply_gpu_transforms(t_unit, "opencl")
+
+        ary_ids = [f"_op2_arg_{i}"
+                   for i in range(len(extra_args))]
+
+        np.savez(self.extra_args_cache_file_path,
+                 ids=np.array(ary_ids),
+                 **{ary_id: extra_arg
+                    for ary_id, extra_arg in zip(ary_ids, extra_args)})
+
+        # {{{ save python code to get grid sizes
+
+        with open(self.computational_grid_expr_cache_file_path, "w") as f:
+            glens, llens = (t_unit
+                            .default_entrypoint
+                            .get_grid_size_upper_bounds_as_exprs(t_unit
+                                                                 .callables_table))
+            f.write(to_evaluatable_python_function((glens, llens), "get_grid_sizes"))
+
+        code = lp.generate_code_v2(t_unit).device_code()
+
+        # }}}
+
+        return code
+
+    @mpi.collective
+    def __call__(self, comm, *args):
+        key = id(comm)
+        try:
+            func = self._func_cache[key]
+        except KeyError:
+            func = self.compile(comm)
+            self._func_cache[key] = func
+
+        grid, block = self.get_grid_size(args[0], args[1])
+        func(grid, block, *args)
+
+    @mpi.collective
+    def compile(self, comm):
+        cl_knl = compilation.get_opencl_kernel(comm,
+                                               self)
+        return CLKernelWithExtraArgs(cl_knl, self.get_extra_args())
+
+
+class CUDAGlobalKernel(GlobalKernel):
+    ...
diff --git a/pyop2/offload_utils.py b/pyop2/offload_utils.py
index 7ad1f1c16..d3bd91239 100644
--- a/pyop2/offload_utils.py
+++ b/pyop2/offload_utils.py
@@ -1,38 +1,19 @@
-from enum import IntEnum
 from contextlib import contextmanager
+import enum
 
 
-class DataAvailability(IntEnum):
-    """
-    Indicates whether the device or host contains valid data.
-    """
-    AVAILABLE_ON_HOST_ONLY = 1
-    AVAILABLE_ON_DEVICE_ONLY = 2
-    AVAILABLE_ON_BOTH = 3
-
-
-class OffloadMixin:
-    def get_availability(self):
-        raise NotImplementedError
-
-    def ensure_availability_on_host(self):
-        raise NotImplementedError
+class OffloadingBackend(enum.Enum):
+    """The compute backends that parallel loops may be offloaded to."""
+    CPU = enum.auto()
+    OPENCL = enum.auto()
+    CUDA = enum.auto()
 
-    def ensure_availaibility_on_device(self):
-        raise NotImplementedError
 
-    def is_available_on_host(self):
-        # bitwise op to detect both AVAILABLE_ON_HOST and AVAILABLE_ON_BOTH
-        return bool(self.get_availability() & AVAILABLE_ON_HOST_ONLY)
+_offloading = False
+"""Global state indicating whether we are currently offloading to a device or running on the host."""
 
-    def is_available_on_device(self):
-        # bitwise op to detect both AVAILABLE_ON_DEVICE and AVAILABLE_ON_BOTH
-        return bool(self.get_availability() & AVAILABLE_ON_DEVICE_ONLY)
-
-
-AVAILABLE_ON_HOST_ONLY = DataAvailability.AVAILABLE_ON_HOST_ONLY
-AVAILABLE_ON_DEVICE_ONLY = DataAvailability.AVAILABLE_ON_DEVICE_ONLY
-AVAILABLE_ON_BOTH = DataAvailability.AVAILABLE_ON_BOTH
+_backend = OffloadingBackend.CPU
+"""Global state recording the currently selected offloading backend."""
 
 
 def set_offloading_backend(backend):
@@ -52,10 +33,11 @@ def set_offloading_backend(backend):
     over the course of the program is an undefined behavior. (i.e.
     preferably avoided)
     """
-    from pyop2 import op2
-    from pyop2.backends import AbstractComputeBackend
-    assert isinstance(backend, AbstractComputeBackend)
-    op2.compute_backend = backend
+    global _backend
+
+    if not isinstance(backend, OffloadingBackend):
+        raise TypeError("backend must be an OffloadingBackend member")
+    _backend = backend
 
 
 @contextmanager
@@ -66,8 +48,9 @@ def offloading():
    region will be executed on the backend selected via
    :func:`set_offloading_backend`.
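+
+    A sketch of the intended use (shown with the CPU backend; the OPENCL
+    and CUDA members are wired up the same way)::
+
+        set_offloading_backend(OffloadingBackend.CPU)
+        with offloading():
+            parloop(knl, iterset, *args)  # runs via the selected backend
+        # on exit, execution falls back to the host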
""" - from pyop2 import op2 - op2.compute_backend.turn_on_offloading() + global _offloading + + _offloading = True yield - op2.compute_backend.turn_off_offloading() + _offloading = False return diff --git a/pyop2/op2.py b/pyop2/op2.py index e827dfcfd..10a4cfb81 100644 --- a/pyop2/op2.py +++ b/pyop2/op2.py @@ -49,15 +49,14 @@ ON_BOTTOM, ON_TOP, ON_INTERIOR_FACETS, ALL) from pyop2.local_kernel import CStringLocalKernel, LoopyLocalKernel, CoffeeLocalKernel, Kernel # noqa: F401 -from pyop2.global_kernel import (GlobalKernelArg, DatKernelArg, MixedDatKernelArg, # noqa: F401 +from pyop2.global_kernel import (GlobalKernel, GlobalKernelArg, DatKernelArg, MixedDatKernelArg, # noqa: F401 MatKernelArg, MixedMatKernelArg, MapKernelArg) from pyop2.parloop import (GlobalParloopArg, DatParloopArg, MixedDatParloopArg, # noqa: F401 - MatParloopArg, MixedMatParloopArg, AbstractParloop, + MatParloopArg, MixedMatParloopArg, Parloop, parloop, par_loop) from pyop2.parloop import (GlobalLegacyArg, DatLegacyArg, MixedDatLegacyArg, # noqa: F401 MatLegacyArg, MixedMatLegacyArg, LegacyParloop, ParLoop) -from pyop2.backends.cpu import cpu_backend from pyop2.offload_utils import set_offloading_backend, offloading __all__ = ['configuration', 'READ', 'WRITE', 'RW', 'INC', 'MIN', 'MAX', @@ -66,7 +65,7 @@ 'set_log_level', 'MPI', 'init', 'exit', 'Kernel', 'Set', 'ExtrudedSet', 'MixedSet', 'Subset', 'DataSet', 'GlobalDataSet', 'MixedDataSet', 'Halo', 'Dat', 'MixedDat', 'Mat', 'Global', 'Map', 'MixedMap', - 'Sparsity', 'parloop', 'AbstractParloop', 'ParLoop', 'par_loop', + 'Sparsity', 'parloop', 'Parloop', 'ParLoop', 'par_loop', 'DatView', 'PermutedMap', 'ComposedMap', 'set_offloading_backend', 'offloading'] @@ -123,8 +122,3 @@ def exit(): configuration.reset() global _initialised _initialised = False - - -compute_backend = cpu_backend - -GlobalKernel = cpu_backend.GlobalKernel diff --git a/pyop2/parloop.py b/pyop2/parloop.py index df422f54c..2c849c583 100644 --- a/pyop2/parloop.py +++ b/pyop2/parloop.py @@ -12,11 +12,12 @@ from pyop2.configuration import configuration from pyop2.datatypes import as_numpy_dtype from pyop2.exceptions import KernelTypeError, MapValueError, SetTypeError -from pyop2.global_kernel import (GlobalKernelArg, DatKernelArg, MixedDatKernelArg, +from pyop2.global_kernel import (GlobalKernel, GlobalKernelArg, DatKernelArg, MixedDatKernelArg, MatKernelArg, MixedMatKernelArg) from pyop2.local_kernel import LocalKernel, CStringLocalKernel, CoffeeLocalKernel, LoopyLocalKernel from pyop2.types import (Access, Global, AbstractDat, Dat, DatView, MixedDat, Mat, Set, MixedSet, ExtrudedSet, Subset, Map, ComposedMap, MixedMap) +from pyop2.types.access import READ, WRITE, RW, INC, MIN, MAX from pyop2.utils import cached_property @@ -60,7 +61,7 @@ def __post_init__(self): @property def map_kernel_args(self): - return self.map_._kernel_args_ if self.map_ else () + return self.map_.kernel_args if self.map_ else () @property def maps(self): @@ -82,7 +83,7 @@ def __post_init__(self): @property def map_kernel_args(self): - return self.map_._kernel_args_ if self.map_ else () + return self.map_.kernel_args if self.map_ else () @property def maps(self): @@ -104,7 +105,7 @@ def __post_init__(self): @property def map_kernel_args(self): rmap, cmap = self.maps - return tuple(itertools.chain(*itertools.product(rmap._kernel_args_, cmap._kernel_args_))) + return tuple(itertools.chain(*itertools.product(rmap.kernel_args, cmap.kernel_args))) @dataclass @@ -122,10 +123,10 @@ def __post_init__(self): @property def 
map_kernel_args(self):
         rmap, cmap = self.maps
-        return tuple(itertools.chain(*itertools.product(rmap._kernel_args_, cmap._kernel_args_)))
+        return tuple(itertools.chain(*itertools.product(rmap.kernel_args, cmap.kernel_args)))
 
 
-class AbstractParloop:
+class Parloop:
     """A parallel loop invocation.
 
     :arg global_knl: The :class:`GlobalKernel` to be executed.
@@ -168,9 +169,18 @@ def accesses(self):
     @property
     def arglist(self):
         """Prepare the argument list for calling generated code."""
-        arglist = self.iterset._kernel_args_
-        for d in self.arguments:
-            arglist += d.data._kernel_args_
+        arglist = self.iterset.kernel_args_ro
+        for larg, _, plarg in self.zipped_arguments:
+            if larg.access == READ:
+                kernel_args = plarg.data.kernel_args_ro
+            elif larg.access in {WRITE, MIN, MAX}:
+                kernel_args = plarg.data.kernel_args_wo
+            elif larg.access in {RW, INC}:
+                # TODO: Is this definitely right for INC?
+                kernel_args = plarg.data.kernel_args_rw
+            else:
+                raise NotImplementedError
+            arglist += kernel_args
 
         # Collect an ordered set of maps (ignore duplicates)
         maps = {m: None for d in self.arguments for m in d.map_kernel_args}
@@ -350,13 +360,44 @@ def _l2g_idxs(self):
                 seen.add(pl_arg.data)
         return tuple(indices)
 
-    @abc.abstractmethod
+    @PETSc.Log.EventDecorator("ParLoopRednBegin")
+    @mpi.collective
     def reduction_begin(self):
         """Begin reductions."""
-
-    @abc.abstractmethod
+        requests = []
+        for idx in self._reduction_idxs:
+            glob = self.arguments[idx].data
+
+            # FIXME also do for buf
+            # if not glob.data.is_available_on_host:
+            #     glob.data.device_to_host_copy()
+
+            mpi_op = {INC: mpi.MPI.SUM,
+                      MIN: mpi.MPI.MIN,
+                      MAX: mpi.MPI.MAX}.get(self.accesses[idx])
+            if mpi.MPI.VERSION >= 3:
+                requests.append(self.comm.Iallreduce(glob.data,
+                                                     glob._stashed_data.data,
+                                                     op=mpi_op))
+            else:
+                self.comm.Allreduce(glob.data, glob._stashed_data.data, op=mpi_op)
+        return tuple(requests)
+
+    @PETSc.Log.EventDecorator("ParLoopRednEnd")
+    @mpi.collective
     def reduction_end(self, requests):
         """Finish reductions."""
+        if mpi.MPI.VERSION >= 3:
+            mpi.MPI.Request.Waitall(requests)
+            for idx in self._reduction_idxs:
+                glob = self.arguments[idx].data
+                glob._data.data[...] = glob._stashed_data.data
+        else:
+            assert len(requests) == 0
+
+            for idx in self._reduction_idxs:
+                glob = self.arguments[idx].data
+                glob._data.data[...] = glob._stashed_data.data
 
     @cached_property
     def _reduction_idxs(self):
@@ -438,7 +479,6 @@ def prepare_reduced_globals(self, arguments, global_knl):
        :class:`Global` in parallel produces the right result. The same is not
        needed for MAX and MIN because they commute with the reduction.
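+
+        For example, if a Global holding 7 is incremented by 1 on one rank
+        and by 2 on another, summing the ranks' working copies directly
+        would give (7 + 1) + (7 + 2) = 17, double-counting the initial
+        value; with zeroed copies the reduction gives 1 + 2 = 3, and the
+        initial 7 is added back exactly once.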
""" - from pyop2.op2 import compute_backend arguments = list(arguments) reduced_globals = {} for i, (lk_arg, gk_arg, pl_arg) in enumerate(self.zip_arguments(global_knl, arguments)): @@ -599,7 +639,6 @@ def LegacyParloop(local_knl, iterset, *args, **kwargs): raise SetTypeError("Iteration set is of the wrong type") # finish building the local kernel - from pyop2.op2 import compute_backend local_knl.accesses = tuple(a.access for a in args) if isinstance(local_knl, (CStringLocalKernel, CoffeeLocalKernel)): local_knl.dtypes = tuple(a.data.dtype for a in args) @@ -609,7 +648,7 @@ def LegacyParloop(local_knl, iterset, *args, **kwargs): extruded_periodic = iterset._extruded_periodic constant_layers = extruded and iterset.constant_layers subset = isinstance(iterset, Subset) - global_knl = compute_backend.GlobalKernel(local_knl, global_knl_args, + global_knl = GlobalKernel.new(local_knl, global_knl_args, extruded=extruded, extruded_periodic=extruded_periodic, constant_layers=constant_layers, @@ -617,7 +656,7 @@ def LegacyParloop(local_knl, iterset, *args, **kwargs): **kwargs) parloop_args = tuple(a.parloop_arg for a in args) - return compute_backend.Parloop(global_knl, iterset, parloop_args) + return Parloop(global_knl, iterset, parloop_args) def par_loop(*args, **kwargs): @@ -631,10 +670,8 @@ def parloop(knl, *args, **kwargs): For a description of the possible arguments to this function see :class:`Parloop` and :func:`LegacyParloop`. """ - from pyop2.global_kernel import AbstractGlobalKernel - if isinstance(knl, AbstractGlobalKernel): - from pyop2.op2 import compute_backend - compute_backend.Parloop(knl, *args, **kwargs)() + if isinstance(knl, GlobalKernel): + Parloop(knl, *args, **kwargs)() elif isinstance(knl, LocalKernel): LegacyParloop(knl, *args, **kwargs)() else: diff --git a/pyop2/types/dat.py b/pyop2/types/dat.py index e17a1f38c..78a632ecc 100644 --- a/pyop2/types/dat.py +++ b/pyop2/types/dat.py @@ -1,3 +1,4 @@ +import collections import contextlib import ctypes import itertools @@ -6,6 +7,7 @@ import loopy as lp import numpy as np from petsc4py import PETSc +import pytools from pyop2 import ( configuration as conf, @@ -14,14 +16,15 @@ mpi, utils ) -from pyop2.offload_utils import OffloadMixin -from pyop2.types.access import Access -from pyop2.types.dataset import DataSet, GlobalDataSet -from pyop2.types.data_carrier import DataCarrier, EmptyDataMixin, VecAccessMixin +from pyop2.types.access import READ, WRITE, RW, INC, MIN, MAX +from pyop2.types.dataset import DataSet, GlobalDataSet, MixedDataSet +from pyop2.types.data_carrier import DataCarrier, VecAccessMixin from pyop2.types.set import ExtrudedSet, GlobalSet, Set +from pyop2.array import MirroredArray +from pyop2.offload_utils import _offloading as offloading -class AbstractDat(DataCarrier, EmptyDataMixin): +class AbstractDat(DataCarrier): """OP2 vector data. 
A :class:`Dat` holds values on every element of a :class:`DataSet`.o @@ -61,28 +64,35 @@ class AbstractDat(DataCarrier, EmptyDataMixin): _zero_kernels = {} """Class-level cache for zero kernels.""" - _modes = [Access.READ, Access.WRITE, Access.RW, Access.INC, Access.MIN, Access.MAX] + _modes = [READ, WRITE, RW, INC, MIN, MAX] @utils.validate_type(('dataset', (DataCarrier, DataSet, Set), ex.DataSetTypeError), ('name', str, ex.NameTypeError)) @utils.validate_dtype(('dtype', None, ex.DataTypeError)) def __init__(self, dataset, data=None, dtype=None, name=None): - if isinstance(dataset, Dat): self.__init__(dataset.dataset, None, dtype=dataset.dtype, name="copy_of_%s" % dataset.name) dataset.copy(self) return + + # If a Set, rather than a dataset is passed in, default to + # a dataset dimension of 1. if type(dataset) is Set or type(dataset) is ExtrudedSet: - # If a Set, rather than a dataset is passed in, default to - # a dataset dimension of 1. dataset = dataset ** 1 - self._shape = (dataset.total_size,) + (() if dataset.cdim == 1 else dataset.dim) - EmptyDataMixin.__init__(self, data, dtype, self._shape) - self._dataset = dataset + # handle the interplay between dataset, data and dtype + shape = (dataset.total_size,) + (() if dataset.cdim == 1 else dataset.dim) + if data is not None: + data = utils.verify_reshape(data, dtype, shape) + else: + dtype = np.dtype(dtype or DataCarrier.DEFAULT_DTYPE) + data = (shape, dtype) + self.comm = mpi.internal_comm(dataset.comm) self.halo_valid = True + self._dataset = dataset + self._data = MirroredArray.new(data) self._name = name or "dat_#x%x" % id(self) self._halo_frozen = False @@ -92,9 +102,21 @@ def __del__(self): if hasattr(self, "comm"): mpi.decref(self.comm) - @utils.cached_property - def _kernel_args_(self): - return (self._data.ctypes.data, ) + @property + def kernel_args_rw(self): + return (self._data.ptr_rw,) + + @property + def kernel_args_ro(self): + return (self._data.ptr_ro,) + + @property + def kernel_args_wo(self): + return (self._data.ptr_wo,) + + @property + def dat_version(self): + return self._data.state @utils.cached_property def _argtypes_(self): @@ -152,14 +174,10 @@ def data(self): :meth:`data_with_halos`. 
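+
+        For instance (illustrative only)::
+
+            dat.data[:] = 1.0          # writable view of the owned entries;
+                                       # the halo is marked dirty
+            total = dat.data_ro.sum()  # read-only view, halo left untouched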
""" - # Increment dat_version since this accessor assumes data modification - self.increment_dat_version() if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0: raise RuntimeError("Illegal access: no data associated with this Dat!") self.halo_valid = False - v = self._data[:self.dataset.size].view() - v.setflags(write=True) - return v + return self._data.data[:self.dataset.size] @property @mpi.collective @@ -172,12 +190,10 @@ def data_with_halos(self): With this accessor, you get to see up to date halo values, but you should not try and modify them, because they will be overwritten by the next halo exchange.""" - self.global_to_local_begin(Access.RW) - self.global_to_local_end(Access.RW) + self.global_to_local_begin(RW) + self.global_to_local_end(RW) self.halo_valid = False - v = self._data.view() - v.setflags(write=True) - return v + return self._data.data @property @mpi.collective @@ -193,9 +209,7 @@ def data_ro(self): """ if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0: raise RuntimeError("Illegal access: no data associated with this Dat!") - v = self._data[:self.dataset.size].view() - v.setflags(write=False) - return v + return self._data.data_ro[:self.dataset.size] @property @mpi.collective @@ -211,11 +225,9 @@ def data_ro_with_halos(self): overwritten by the next halo exchange. """ - self.global_to_local_begin(Access.READ) - self.global_to_local_end(Access.READ) - v = self._data.view() - v.setflags(write=False) - return v + self.global_to_local_begin(READ) + self.global_to_local_end(READ) + return self._data.data_ro def save(self, filename): """Write the data array to file ``filename`` in NumPy format.""" @@ -238,13 +250,13 @@ def load(self, filename): else: self.data[:] = np.load(filename) - @utils.cached_property + @property def shape(self): - return self._shape + return self._data.shape - @utils.cached_property + @property def dtype(self): - return self._dtype + return self._data.dtype @utils.cached_property def nbytes(self): @@ -265,10 +277,9 @@ def zero(self, subset=None): :arg subset: A :class:`Subset` of entries to zero (optional).""" # Data modification - self.increment_dat_version() # If there is no subset we can safely zero the halo values. if subset is None: - self._data[:] = 0 + self.data_with_halos[...] = 0 self.halo_valid = True elif subset.superset != self.dataset.set: raise ex.MapValueError("The subset and dataset are incompatible") @@ -286,10 +297,11 @@ def copy(self, other, subset=None): if subset is None: # If the current halo is valid we can also copy these values across. if self.halo_valid: - other._data[:] = self._data + other.data_with_halos[...] = self.data_ro_with_halos other.halo_valid = True else: - other.data[:] = self.data_ro + other.data[...] 
= self.data_ro
+            other.halo_valid = False
         elif subset.superset != self.dataset.set:
             raise ex.MapValueError("The subset and dataset are incompatible")
         else:
@@ -349,9 +361,10 @@ def _op_kernel(self, op, globalp, dtype):
         return self._op_kernel_cache.setdefault(key, Kernel(knl, name))
 
     def _op(self, other, op):
-        from pyop2.op2 import compute_backend
+        from pyop2.types.glob import Global
         from pyop2.parloop import parloop
-        ret = compute_backend.Dat(self.dataset, None, self.dtype)
+
+        ret = Dat(self.dataset, None, self.dtype)
         if np.isscalar(other):
-            other = compute_backend.Global(1, data=other, comm=self.comm)
+            other = Global(1, data=other, comm=self.comm)
             globalp = True
         else:
             self._check_shape(other)
             globalp = False
         parloop(self._op_kernel(op, globalp, other.dtype),
-                self.dataset.set, self(Access.READ), other(Access.READ), ret(Access.WRITE))
+                self.dataset.set, self(READ), other(READ), ret(WRITE))
         return ret
 
     def _iop_kernel(self, op, globalp, other_is_self, dtype):
@@ -397,7 +410,7 @@ def _iop_kernel(self, op, globalp, other_is_self, dtype):
         return self._iop_kernel_cache.setdefault(key, Kernel(knl, name))
 
     def _iop(self, other, op):
-        from pyop2.op2 import compute_backend
+        from pyop2.types.glob import Global
         from pyop2.parloop import parloop
 
         globalp = False
@@ -406,9 +419,9 @@
             globalp = True
         elif other is not self:
             self._check_shape(other)
-        args = [self(Access.INC)]
+        args = [self(INC)]
         if other is not self:
-            args.append(other(Access.READ))
+            args.append(other(READ))
         parloop(self._iop_kernel(op, globalp, other is self, other.dtype),
                 self.dataset.set, *args)
         return self
@@ -446,12 +459,12 @@ def inner(self, other):
         """
         from pyop2.parloop import parloop
-        from pyop2.op2 import compute_backend
+        from pyop2.types.glob import Global
 
         self._check_shape(other)
-        ret = compute_backend.Global(1, data=0, dtype=self.dtype, comm=self.comm)
+        ret = Global(1, data=0, dtype=self.dtype, comm=self.comm)
         parloop(self._inner_kernel(other.dtype), self.dataset.set,
-                self(Access.READ), other(Access.READ), ret(Access.INC))
+                self(READ), other(READ), ret(INC))
         return ret.data_ro[0]
 
     @property
@@ -465,8 +478,7 @@
     def norm(self):
         return sqrt(self.inner(self).real)
 
     def __pos__(self):
-        from pyop2.op2 import compute_backend
-        pos = compute_backend.Dat(self)
+        pos = Dat(self)
         return pos
 
     def __add__(self, other):
@@ -499,10 +511,9 @@ def _neg_kernel(self):
 
     def __neg__(self):
         from pyop2.parloop import parloop
-        from pyop2.op2 import compute_backend
 
-        neg = compute_backend.Dat(self.dataset, dtype=self.dtype)
-        parloop(self._neg_kernel, self.dataset.set, neg(Access.WRITE), self(Access.READ))
+        neg = Dat(self.dataset, dtype=self.dtype)
+        parloop(self._neg_kernel, self.dataset.set, neg(WRITE), self(READ))
         return neg
 
     def __sub__(self, other):
@@ -556,11 +567,11 @@ def global_to_local_begin(self, access_mode):
         halo = self.dataset.halo
         if halo is None or self._halo_frozen:
             return
-        if not self.halo_valid and access_mode in {Access.READ, Access.RW}:
-            halo.global_to_local_begin(self, Access.WRITE)
-        elif access_mode in {Access.INC, Access.MIN, Access.MAX}:
+        if not self.halo_valid and access_mode in {READ, RW}:
+            halo.global_to_local_begin(self, WRITE)
+        elif access_mode in {INC, MIN, MAX}:
             min_, max_ = dtypes.dtype_limits(self.dtype)
-            val = {Access.MAX: min_, Access.MIN: max_, Access.INC: 0}[access_mode]
-            self._data[self.dataset.size:] = val
+            val = {MAX: min_, MIN: max_, INC: 0}[access_mode]
+            self._data.data[self.dataset.size:] = val
         else:
             # WRITE
@@ -575,10 +586,10 @@ def global_to_local_end(self, access_mode):
         halo = self.dataset.halo
         if halo is None or self._halo_frozen:
             return
-        if not self.halo_valid and 
access_mode in {Access.READ, Access.RW}: - halo.global_to_local_end(self, Access.WRITE) + if not self.halo_valid and access_mode in {READ, RW}: + halo.global_to_local_end(self, WRITE) self.halo_valid = True - elif access_mode in {Access.INC, Access.MIN, Access.MAX}: + elif access_mode in {INC, MIN, MAX}: self.halo_valid = False else: # WRITE @@ -662,9 +673,17 @@ def __init__(self, dat, index): name="view[%s](%s)" % (index, dat.name)) self._parent = dat - @utils.cached_property - def _kernel_args_(self): - return self._parent._kernel_args_ + @property + def kernel_args_rw(self): + return self._parent.kernel_args_rw + + @property + def kernel_args_ro(self): + return self._parent.kernel_args_ro + + @property + def kernel_args_wo(self): + return self._parent.kernel_args_wo @utils.cached_property def _argtypes_(self): @@ -711,41 +730,86 @@ def data_ro_with_halos(self): return full[idx] -class Dat(AbstractDat, VecAccessMixin, OffloadMixin): +class Dat(AbstractDat, VecAccessMixin): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._lazy_petsc_vec = None - @utils.cached_property - def can_be_represented_as_petscvec(self): - return ((self.dtype == PETSc.ScalarType) and self.cdim > 0) + @property + def petsc_vec(self): + if self.dtype != dtypes.ScalarType: + raise ex.DataTypeError( + "Only arrays with dtype matching PETSc's scalar type can be " + "represented as a PETSc Vec") - def __init__(self, *args, **kwargs): - AbstractDat.__init__(self, *args, **kwargs) - # Determine if we can rely on PETSc state counter - petsc_counter = (self.dtype == PETSc.ScalarType) - VecAccessMixin.__init__(self, petsc_counter=petsc_counter) + if self._lazy_petsc_vec is None: + assert self._data._lazy_host_data is not None - @utils.cached_property - def _vec(self): - assert self.dtype == PETSc.ScalarType, \ - "Can't create Vec with type %s, must be %s" % (self.dtype, - PETSc.ScalarType) - # Can't duplicate layout_vec of dataset, because we then - # carry around extra unnecessary data. - # But use getSizes to save an Allreduce in computing the - # global size. - size = self.dataset.layout_vec.getSizes() - data = self._data[:size[0]] - vec = PETSc.Vec().createWithArray(data, size=size, - bsize=self.cdim, comm=self.comm) - return vec + bsize = np.prod(self._data.shape[1:], dtype=int) + self._lazy_petsc_vec = PETSc.Vec().createWithArray( + self._data._lazy_host_data, size=self._data.size, bsize=bsize, comm=self.comm + ) + + return self._lazy_petsc_vec + @property @contextlib.contextmanager - def vec_context(self, access): - r"""A context manager for a :class:`PETSc.Vec` from a :class:`Dat`. 
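+    # The three context managers below (vec, vec_ro, vec_wo) share one
+    # protocol: make the data valid wherever the Vec will be used (device
+    # when offloading, host otherwise), bind the Vec to the right side with
+    # bindToCPU, push our state counter into the Vec, yield it, then read
+    # the counter back so dat_version stays in sync.  The writable variants
+    # additionally invalidate the halo and the copy on the other side of
+    # the host/device boundary.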
+ def vec(self): + """TODO""" + if offloading: + if not self._data.is_available_on_device: + self._data.host_to_device_copy() + self.petsc_vec.bindToCPU(False) + else: + if not self._data.is_available_on_host: + self._data.device_to_host_copy() + self.petsc_vec.bindToCPU(True) - :param access: Access descriptor: READ, WRITE, or RW.""" - yield self._vec - if access is not Access.READ: - self.halo_valid = False + self.petsc_vec.stateSet(self._data.state) + yield self.petsc_vec + self._data.state = self.petsc_vec.stateGet() + + self.halo_valid = False + if offloading: + self._data.is_available_on_host = False + else: + self._data.is_available_on_device = False + + @property + @contextlib.contextmanager + def vec_ro(self): + """TODO""" + if offloading: + if not self._data.is_available_on_device: + self._data.host_to_device_copy() + self.petsc_vec.bindToCPU(False) + else: + if not self._data.is_available_on_host: + self._data.device_to_host_copy() + self.petsc_vec.bindToCPU(True) + + self.petsc_vec.stateSet(self._data.state) + yield self.petsc_vec + self._data.state = self.petsc_vec.stateGet() + + @property + @contextlib.contextmanager + def vec_wo(self): + """TODO""" + if offloading: + self.petsc_vec.bindToCPU(False) + else: + self.petsc_vec.bindToCPU(True) + + self.petsc_vec.stateSet(self._data.state) + yield self.petsc_vec + self._data.state = self.petsc_vec.stateGet() + + self.halo_valid = False + if offloading: + self._data.is_available_on_host = False + else: + self._data.is_available_on_device = False class MixedDat(AbstractDat, VecAccessMixin): @@ -765,13 +829,12 @@ class MixedDat(AbstractDat, VecAccessMixin): def __init__(self, mdset_or_dats): from pyop2.types.glob import Global - from pyop2.op2 import compute_backend def what(x): if isinstance(x, (Global, GlobalDataSet, GlobalSet)): - return compute_backend.Global + return Global elif isinstance(x, (Dat, DataSet, Set)): - return compute_backend.Dat + return Dat else: raise ex.DataSetTypeError("Huh?!") if isinstance(mdset_or_dats, MixedDat): @@ -783,6 +846,8 @@ def what(x): # TODO: Think about different communicators on dats (c.f. 
MixedSet) self.comm = mpi.internal_comm(self._dats[0].comm) + self._lazy_petsc_vec = None + @property def dat_version(self): return sum(d.dat_version for d in self._dats) @@ -791,9 +856,17 @@ def __call__(self, access, path=None): from pyop2.parloop import MixedDatLegacyArg return MixedDatLegacyArg(self, path, access) - @utils.cached_property - def _kernel_args_(self): - return tuple(itertools.chain(*(d._kernel_args_ for d in self))) + @property + def kernel_args_rw(self): + return tuple(itertools.chain(*(d.kernel_args_rw for d in self))) + + @property + def kernel_args_ro(self): + return tuple(itertools.chain(*(d.kernel_args_ro for d in self))) + + @property + def kernel_args_wo(self): + return tuple(itertools.chain(*(d.kernel_args_wo for d in self))) @utils.cached_property def _argtypes_(self): @@ -820,14 +893,7 @@ def split(self): @utils.cached_property def dataset(self): r""":class:`MixedDataSet`\s this :class:`MixedDat` is defined on.""" - from pyop2.op2 import compute_backend - return compute_backend.MixedDataSet(tuple(s.dataset for s in self._dats)) - - @utils.cached_property - def _data(self): - """Return the user-provided data buffer, or a zeroed buffer of - the correct size if none was provided.""" - return tuple(d._data for d in self) + return MixedDataSet(tuple(s.dataset for s in self._dats)) @property @mpi.collective @@ -969,7 +1035,6 @@ def inner(self, other): return ret def _op(self, other, op): - from pyop2.op2 import compute_backend ret = [] if np.isscalar(other): for s in self: @@ -978,7 +1043,7 @@ def _op(self, other, op): self._check_shape(other) for s, o in zip(self, other): ret.append(op(s, o)) - return compute_backend.MixedDat(ret) + return MixedDat(ret) def _iop(self, other, op): if np.isscalar(other): @@ -991,20 +1056,16 @@ def _iop(self, other, op): return self def __pos__(self): - from pyop2.op2 import compute_backend - ret = [] for s in self: ret.append(s.__pos__()) - return compute_backend.MixedDat(ret) + return MixedDat(ret) def __neg__(self): - from pyop2.op2 import compute_backend - ret = [] for s in self: ret.append(s.__neg__()) - return compute_backend.MixedDat(ret) + return MixedDat(ret) def __add__(self, other): """Pointwise addition of fields.""" @@ -1064,42 +1125,121 @@ def _vec(self): # because we're not placing an array. return self.dataset.layout_vec.duplicate() + @property + def petsc_vec(self): + if self._lazy_petsc_vec is None: + subvecs = [dat.petsc_vec for dat in self._dats] + self._lazy_petsc_vec = PETSc.Vec().createNest(subvecs, comm=self.comm) + + return self._lazy_petsc_vec + + @property @contextlib.contextmanager - def vec_context(self, access): - r"""A context manager scattering the arrays of all components of this - :class:`MixedDat` into a contiguous :class:`PETSc.Vec` and reverse - scattering to the original arrays when exiting the context. + def vec(self): + """TODO""" + for subdat in self._dats: + if offloading: + if not subdat._data.is_available_on_device: + subdat._data.host_to_device_copy() + subdat.petsc_vec.bindToCPU(False) + else: + if not subdat._data.is_available_on_host: + subdat._data.device_to_host_copy() + subdat.petsc_vec.bindToCPU(True) - :param access: Access descriptor: READ, WRITE, or RW. + subdat.petsc_vec.stateSet(subdat._data.state) - .. note:: + yield self.petsc_vec - The :class:`~PETSc.Vec` obtained from this context is in - the correct order to be left multiplied by a compatible - :class:`MixedMat`. 
In parallel it is *not* just a - concatenation of the underlying :class:`Dat`\s.""" - # Do the actual forward scatter to fill the full vector with - # values - if access is not Access.WRITE: - offset = 0 - with self._vec as array: - for d in self: - with d.vec_ro as v: - size = v.local_size - array[offset:offset+size] = v.array_r[:] - offset += size - - yield self._vec - if access is not Access.READ: - # Reverse scatter to get the values back to their original locations - offset = 0 - array = self._vec.array_r - for d in self: - with d.vec_wo as v: - size = v.local_size - v.array[:] = array[offset:offset+size] - offset += size - self.halo_valid = False + for subdat in self._dats: + subdat._data.state = subdat.petsc_vec.stateGet() + subdat.halo_valid = False + if offloading: + subdat._data.is_available_on_host = False + else: + subdat._data.is_available_on_device = False + + @property + @contextlib.contextmanager + def vec_ro(self): + """TODO""" + for subdat in self._dats: + if offloading: + if not subdat._data.is_available_on_device: + subdat._data.host_to_device_copy() + subdat.petsc_vec.bindToCPU(False) + else: + if not subdat._data.is_available_on_host: + subdat._data.device_to_host_copy() + subdat.petsc_vec.bindToCPU(True) + + subdat.petsc_vec.stateSet(subdat._data.state) + + yield self.petsc_vec + + for subdat in self._dats: + subdat._data.state = subdat.petsc_vec.stateGet() + + @property + @contextlib.contextmanager + def vec_wo(self): + """TODO""" + for subdat in self._dats: + if offloading: + subdat.petsc_vec.bindToCPU(False) + else: + subdat.petsc_vec.bindToCPU(True) + + subdat.petsc_vec.stateSet(subdat._data.state) + + yield self.petsc_vec + + for subdat in self._dats: + subdat._data.state = subdat.petsc_vec.stateGet() + + subdat.halo_valid = False + if offloading: + subdat._data.is_available_on_host = False + else: + subdat._data.is_available_on_device = False + + # TODO VecNest? + # @contextlib.contextmanager + # def vec_context(self, access): + # r"""A context manager scattering the arrays of all components of this + # :class:`MixedDat` into a contiguous :class:`PETSc.Vec` and reverse + # scattering to the original arrays when exiting the context. + # + # :param access: Access descriptor: READ, WRITE, or RW. + # + # .. note:: + # + # The :class:`~PETSc.Vec` obtained from this context is in + # the correct order to be left multiplied by a compatible + # :class:`MixedMat`. 
In parallel it is *not* just a + # concatenation of the underlying :class:`Dat`\s.""" + # # Do the actual forward scatter to fill the full vector with + # # values + # if access in {READ, RW}: + # offset = 0 + # with self.petsc_vec as array: + # for subdat in self: + # with subdat.vec_ro as vec: + # size = vec.local_size + # array[offset:offset+size] = vec.array_r + # offset += size + # + # yield self.petsc_vec + # if access is not Access.READ: + # # Reverse scatter to get the values back to their original locations + # offset = 0 + # array = self._vec.array_r + # for d in self: + # with d.vec_wo as v: + # size = v.local_size + # v.array[:] = array[offset:offset+size] + # offset += size + # self.halo_valid = False class frozen_halo: diff --git a/pyop2/types/data_carrier.py b/pyop2/types/data_carrier.py index 73d3974c2..9e0259b85 100644 --- a/pyop2/types/data_carrier.py +++ b/pyop2/types/data_carrier.py @@ -8,6 +8,7 @@ utils ) from pyop2.types.access import Access +from pyop2.array import MirroredArray class DataCarrier(abc.ABC): @@ -18,6 +19,8 @@ class DataCarrier(abc.ABC): (:class:`Global`), rank 1 (:class:`Dat`), or rank 2 (:class:`Mat`)""" + DEFAULT_DTYPE = dtypes.ScalarType + @utils.cached_property def dtype(self): """The Python type of the data.""" @@ -44,90 +47,27 @@ def cdim(self): the product of the dim tuple.""" return self._cdim - def increment_dat_version(self): - pass - - -class EmptyDataMixin(abc.ABC): - """A mixin for :class:`Dat` and :class:`Global` objects that takes - care of allocating data on demand if the user has passed nothing - in. - - Accessing the :attr:`_data` property allocates a zeroed data array - if it does not already exist. - """ - def __init__(self, data, dtype, shape): - if data is None: - self._dtype = np.dtype(dtype if dtype is not None else dtypes.ScalarType) - else: - self._numpy_data = utils.verify_reshape(data, dtype, shape, allow_none=True) - self._dtype = self._data.dtype - - @utils.cached_property - def _data(self): - """Return the user-provided data buffer, or a zeroed buffer of - the correct size if none was provided.""" - if not self._is_allocated: - self._numpy_data = np.zeros(self.shape, dtype=self._dtype) - return self._numpy_data - - @property - def _is_allocated(self): - """Return True if the data buffer has been allocated.""" - return hasattr(self, '_numpy_data') - class VecAccessMixin(abc.ABC): - - def __init__(self, petsc_counter=None): - if petsc_counter: - # Use lambda since `_vec` allocates the data buffer - # -> Dat/Global should not allocate storage until accessed - self._dat_version = lambda: self._vec.stateGet() - self.increment_dat_version = lambda: self._vec.stateIncrease() - else: - # No associated PETSc Vec if incompatible type: - # -> Equip Dat/Global with their own counter. - self._version = 0 - self._dat_version = lambda: self._version - - def _inc(): - self._version += 1 - self.increment_dat_version = _inc + """Mixin class providing access to arrays as PETSc Vecs.""" @property - def dat_version(self): - return self._dat_version() - + @mpi.collective @abc.abstractmethod - def vec_context(self, access): - pass - - @abc.abstractproperty - def _vec(self): + def vec(self): + """Return a read-write context manager for wrapping data with a PETSc Vec.""" pass @property @mpi.collective - def vec(self): - """Context manager for a PETSc Vec appropriate for this Dat. 
- - You're allowed to modify the data you get back from this view.""" - return self.vec_context(access=Access.RW) + @abc.abstractmethod + def vec_ro(self): + """Return a read-only context manager for wrapping data with a PETSc Vec.""" + pass @property @mpi.collective + @abc.abstractmethod def vec_wo(self): - """Context manager for a PETSc Vec appropriate for this Dat. - - You're allowed to modify the data you get back from this view, - but you cannot read from it.""" - return self.vec_context(access=Access.WRITE) - - @property - @mpi.collective - def vec_ro(self): - """Context manager for a PETSc Vec appropriate for this Dat. - - You're not allowed to modify the data you get back from this view.""" - return self.vec_context(access=Access.READ) + """Return a write-only context manager for wrapping data with a PETSc Vec.""" + pass diff --git a/pyop2/types/dataset.py b/pyop2/types/dataset.py index b4d9adb85..848aad633 100644 --- a/pyop2/types/dataset.py +++ b/pyop2/types/dataset.py @@ -10,6 +10,7 @@ mpi, utils ) +from pyop2.offload_utils import _backend as backend, OffloadingBackend from pyop2.types.set import ExtrudedSet, GlobalSet, MixedSet, Set, Subset @@ -188,11 +189,14 @@ def local_ises(self): @utils.cached_property def layout_vec(self): """A PETSc Vec compatible with the dof layout of this DataSet.""" - from pyop2.op2 import compute_backend + if backend == OffloadingBackend.CPU: + vec_type = PETSc.Vec.Type.STANDARD + else: + raise NotImplementedError + vec = PETSc.Vec().create(comm=self.comm) - size = (self.size * self.cdim, None) - vec.setSizes(size, bsize=self.cdim) - vec.setType(compute_backend.PETScVecType) + vec.setSizes(self.size*self.cdim, bsize=self.cdim) + vec.setType(vec_type) vec.setUp() return vec diff --git a/pyop2/types/glob.py b/pyop2/types/glob.py index 04954a0c3..10c48cb5a 100644 --- a/pyop2/types/glob.py +++ b/pyop2/types/glob.py @@ -1,21 +1,25 @@ +import collections import contextlib import ctypes import operator +import numbers import numpy as np from petsc4py import PETSc from pyop2 import ( + datatypes, exceptions as ex, mpi, utils ) -from pyop2.offload_utils import OffloadMixin from pyop2.types.access import Access -from pyop2.types.data_carrier import DataCarrier, EmptyDataMixin, VecAccessMixin +from pyop2.types.data_carrier import DataCarrier, VecAccessMixin +from pyop2.array import MirroredArray +from pyop2.offload_utils import _offloading as offloading -class Global(DataCarrier, EmptyDataMixin, VecAccessMixin, OffloadMixin): +class Global(DataCarrier, VecAccessMixin): """OP2 global value. 
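For reference, the ``pyop2.array.MirroredArray`` interface that the new Dat,
Global, Map and Set code relies on is not part of this diff. A minimal,
host-only model of the behaviour assumed at the call sites (``new``, ``data``,
``data_ro``, the ``ptr_*`` accessors and the ``state`` counter) could look
like the following sketch; attribute names such as ``_lazy_host_data`` in the
real class may differ:

    import numpy as np


    class MirroredArray:
        """Host/device-mirrored array (host-only sketch, not the real class)."""

        def __init__(self, data):
            if isinstance(data, np.ndarray):
                self._host_data = data
            else:
                # call sites pass (shape, dtype) to defer allocation
                self._shape, self._dtype = data
                self._host_data = None
            self.state = 0                    # counter mirrored into Vec state
            self.is_available_on_host = True
            self.is_available_on_device = False

        @classmethod
        def new(cls, data):
            return cls(data)

        @property
        def _array(self):
            if self._host_data is None:       # lazy, zero-initialised
                self._host_data = np.zeros(self._shape, dtype=self._dtype)
            return self._host_data

        @property
        def data(self):
            # writable access: bump the version, dirty the device copy
            self.state += 1
            self.is_available_on_device = False
            return self._array

        @property
        def data_ro(self):
            view = self._array.view()
            view.setflags(write=False)        # read-only: no state bump
            return view

        @property
        def shape(self):
            return self._array.shape

        @property
        def dtype(self):
            return self._array.dtype

        @property
        def size(self):
            return self._array.size

        # raw pointers handed to the generated wrapper code
        @property
        def ptr_rw(self):
            return self.data.ctypes.data

        @property
        def ptr_ro(self):
            return self.data_ro.ctypes.data

        ptr_wo = ptr_rw                       # a write also dirties the device copy

        def host_to_device_copy(self):
            self.is_available_on_device = True   # no-op in a host-only model

        def device_to_host_copy(self):
            self.is_available_on_host = True     # no-op in a host-only model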
@@ -44,27 +48,43 @@ def __init__(self, dim, data=None, dtype=None, name=None, comm=None): self.__init__(dim._dim, None, dtype=dim.dtype, name="copy_of_%s" % dim.name, comm=dim.comm) dim.copy(self) + return + + dim = utils.as_tuple(dim, int) + + # TODO make this a function so can be shared by Dat and Global + # handle the interplay between dataset, data and dtype + if data is not None: + data = utils.verify_reshape(data, dtype, dim) + self._stashed_data = MirroredArray.new((data.shape, data.dtype)) else: - self._dim = utils.as_tuple(dim, int) - self._cdim = np.prod(self._dim).item() - EmptyDataMixin.__init__(self, data, dtype, self._dim) - self._buf = np.empty(self.shape, dtype=self.dtype) - self._name = name or "global_#x%x" % id(self) - if comm is None: - import warnings - warnings.warn("PyOP2.Global has no comm, this is likely to break in parallel!") - self.comm = mpi.internal_comm(comm) - # Object versioning setup - petsc_counter = (comm and self.dtype == PETSc.ScalarType) - VecAccessMixin.__init__(self, petsc_counter=petsc_counter) + dtype = np.dtype(dtype or DataCarrier.DEFAULT_DTYPE) + data = (dim, dtype) + self._stashed_data = MirroredArray.new(data) + self._data = MirroredArray.new(data) + + # TODO shouldn't need to set these + self._dim = dim + self._cdim = np.prod(dim) + + self._name = name or "global_#x%x" % id(self) + self.comm = mpi.internal_comm(comm) def __del__(self): if hasattr(self, "comm"): mpi.decref(self.comm) - @utils.cached_property - def _kernel_args_(self): - return (self._data.ctypes.data, ) + @property + def kernel_args_rw(self): + return (self._data.ptr_rw,) + + @property + def kernel_args_ro(self): + return (self._data.ptr_ro,) + + @property + def kernel_args_wo(self): + return (self._data.ptr_wo,) @utils.cached_property def _argtypes_(self): @@ -100,7 +120,7 @@ def __str__(self): % (self._name, self._dim, self.data_ro) def __repr__(self): - return "Global(%r, %r, %r, %r)" % (self._dim, self.data_ro, + return "Global(%r, %r, %r, %r)" % (self.shape, self.data_ro, self.data.dtype, self._name) @utils.cached_property @@ -110,31 +130,21 @@ def dataset(self): @property def shape(self): - return self._dim + return self._data.shape @property def data(self): """Data array.""" - self.increment_dat_version() - if len(self._data) == 0: - raise RuntimeError("Illegal access: No data associated with this Global!") - return self._data + return self._data.data @property def dtype(self): - return self._dtype + return self._data.dtype @property def data_ro(self): """Data array.""" - view = self._data.view() - view.setflags(write=False) - return view - - @data.setter - def data(self, value): - self.increment_dat_version() - self._data[:] = utils.verify_reshape(value, self.dtype, self.dim) + return self._data.data_ro @property def data_with_halos(self): @@ -178,8 +188,7 @@ def copy(self, other, subset=None): @mpi.collective def zero(self, subset=None): assert subset is None - self.increment_dat_version() - self._data[...] = 0 + self._data.data[...] = 0 @mpi.collective def global_to_local_begin(self, access_mode): @@ -301,36 +310,96 @@ def inner(self, other): assert isinstance(other, Global) return np.dot(self.data_ro, np.conj(other.data_ro)) - @utils.cached_property - def _vec(self): - assert self.dtype == PETSc.ScalarType, \ - "Can't create Vec with type %s, must be %s" % (self.dtype, - PETSc.ScalarType) - # Can't duplicate layout_vec of dataset, because we then - # carry around extra unnecessary data. - # But use getSizes to save an Allreduce in computing the - # global size. 
- data = self._data - size = self.dataset.layout_vec.getSizes() - if self.comm.rank == 0: - return PETSc.Vec().createWithArray(data, size=size, - bsize=self.cdim, - comm=self.comm) + @property + def dat_version(self): + return self._data.state + + @property + def petsc_vec(self): + if self._lazy_petsc_vec is None: + if self.dtype != datatypes.ScalarType: + raise ex.DataTypeError( + "Only arrays with dtype matching PETSc's scalar type can be " + "represented as a PETSc Vec") + + assert self.comm is not None + assert self._data._lazy_host_data is not None + + self._lazy_petsc_vec = PETSc.Vec().createWithArray( + self._data._lazy_host_data, size=self._data.size, + bsize=self.cdim, comm=self.comm) + + return self._lazy_petsc_vec + + @property + @contextlib.contextmanager + def vec(self): + """TODO""" + if offloading: + raise NotImplementedError("TODO") + # if offloading: + # if not self._data.is_available_on_device: + # self._data.host_to_device_copy() + # self.petsc_vec.bindToCPU(False) + # else: + # if not self._data.is_available_on_host: + # self._data.device_to_host_copy() + # self.petsc_vec.bindToCPU(True) + + self.petsc_vec.stateSet(self._data.state) + yield self.petsc_vec + self._data.state = self.petsc_vec.stateGet() + + if offloading: + self._data.is_available_on_host = False else: - return PETSc.Vec().createWithArray(np.empty(0, dtype=self.dtype), - size=size, - bsize=self.cdim, - comm=self.comm) + self._data.is_available_on_device = False + # not sure this is right + self.comm.Bcast(self._data._lazy_host_data, 0) @contextlib.contextmanager def vec_context(self, access): - """A context manager for a :class:`PETSc.Vec` from a :class:`Global`. + """A context manager for a :class:`PETSc.Vec` from a :class:`Global`.""" - :param access: Access descriptor: READ, WRITE, or RW.""" - raise NotImplementedError() + @property + @contextlib.contextmanager + def vec_ro(self): + """TODO""" + if offloading: + raise NotImplementedError("TODO") + if offloading: + if not self._data.is_available_on_device: + self._data.host_to_device_copy() + self.petsc_vec.bindToCPU(False) + else: + if not self._data.is_available_on_host: + self._data.device_to_host_copy() + self.petsc_vec.bindToCPU(True) - def ensure_availability_on_host(self): - raise NotImplementedError() + self.petsc_vec.stateSet(self._data.state) + yield self.petsc_vec + self._data.state = self.petsc_vec.stateGet() + + @property + @contextlib.contextmanager + def vec_wo(self): + """TODO""" + if offloading: + raise NotImplementedError("TODO") + if offloading: + self.petsc_vec.bindToCPU(False) + else: + self.petsc_vec.bindToCPU(True) + + self.petsc_vec.stateSet(self._data.state) + yield self.petsc_vec + self._data.state = self.petsc_vec.stateGet() + + self.halo_valid = False + if offloading: + self._data.is_available_on_host = False + else: + self._data.is_available_on_device = False - def ensure_availability_on_device(self): - raise NotImplementedError() + # not sure this is right + self.comm.Bcast(self._data._lazy_host_data, 0) diff --git a/pyop2/types/map.py b/pyop2/types/map.py index fa199dd02..65ffbac79 100644 --- a/pyop2/types/map.py +++ b/pyop2/types/map.py @@ -11,11 +11,11 @@ utils ) from pyop2 import mpi -from pyop2.offload_utils import OffloadMixin from pyop2.types.set import GlobalSet, MixedSet, Set +from pyop2.array import MirroredArray -class Map(OffloadMixin): +class Map: """OP2 map, a relation between two :class:`Set` objects. 
@@ -34,23 +34,24 @@ class Map(OffloadMixin): @utils.validate_type(('iterset', Set, ex.SetTypeError), ('toset', Set, ex.SetTypeError), ('arity', numbers.Integral, ex.ArityTypeError), ('name', str, ex.NameTypeError)) - def __init__(self, iterset, toset, arity, values=None, name=None, offset=None, offset_quotient=None): + def __init__(self, iterset, toset, arity, values, name=None, offset=None, offset_quotient=None): + shape = (iterset.total_size, arity) + values = utils.verify_reshape(values, self.dtype, shape) + self._iterset = iterset self._toset = toset self.comm = mpi.internal_comm(toset.comm) self._arity = arity - self._values = utils.verify_reshape(values, dtypes.IntType, - (iterset.total_size, arity), allow_none=True) - self.shape = (iterset.total_size, arity) + self._values = MirroredArray.new(values) self._name = name or "map_#x%x" % id(self) if offset is None or len(offset) == 0: self._offset = None else: - self._offset = utils.verify_reshape(offset, dtypes.IntType, (arity, )) + self._offset = utils.verify_reshape(offset, self.dtype, (arity, )) if offset_quotient is None or len(offset_quotient) == 0: self._offset_quotient = None else: - self._offset_quotient = utils.verify_reshape(offset_quotient, dtypes.IntType, (arity, )) + self._offset_quotient = utils.verify_reshape(offset_quotient, self.dtype, (arity, )) # A cache for objects built on top of this map self._cache = {} @@ -58,9 +59,10 @@ def __del__(self): if hasattr(self, "comm"): mpi.decref(self.comm) - @utils.cached_property - def _kernel_args_(self): - return (self._values.ctypes.data, ) + @property + def kernel_args(self): + # we only ever read from maps + return (self._values.ptr_ro,) @utils.cached_property def _wrapper_cache_key_(self): @@ -121,22 +123,22 @@ def arange(self): """Tuple of arity offsets for each constituent :class:`Map`.""" return (0, self._arity) - @utils.cached_property + @property def values(self): """Mapping array. This only returns the map values for local points, to see the halo points too, use :meth:`values_with_halo`.""" - return self._values[:self.iterset.size] + return self._values.data[:self.iterset.size] - @utils.cached_property + @property def values_with_halo(self): """Mapping array. 
This returns all map values (including halo points), see :meth:`values` if you only need to look at the local points.""" - return self._values + return self._values.data @utils.cached_property def name(self): @@ -264,8 +266,8 @@ def __init__(self, *maps_, name=None): self.maps_ = tuple(maps_) @utils.cached_property - def _kernel_args_(self): - return tuple(itertools.chain(*[m._kernel_args_ for m in self.maps_])) + def kernel_args(self): + return tuple(itertools.chain(*[m.kernel_args for m in self.maps_])) @utils.cached_property def _wrapper_cache_key_(self): @@ -329,9 +331,9 @@ def _process_args(cls, *args, **kwargs): def _cache_key(cls, maps): return maps - @utils.cached_property - def _kernel_args_(self): - return tuple(itertools.chain(*(m._kernel_args_ for m in self if m is not None))) + @property + def kernel_args(self): + return tuple(itertools.chain(*(m.kernel_args for m in self if m is not None))) @utils.cached_property def _argtypes_(self): diff --git a/pyop2/types/mat.py b/pyop2/types/mat.py index aefd77de1..b1c402f6d 100644 --- a/pyop2/types/mat.py +++ b/pyop2/types/mat.py @@ -641,10 +641,13 @@ def __init__(self, *args, **kwargs): # Firedrake relies on this to distinguish between MatBlock and not for boundary conditions local_to_global_maps = (None, None) - @utils.cached_property - def _kernel_args_(self): + @property + def kernel_args_rw(self): return tuple(a.handle.handle for a in self) + kernel_args_ro = kernel_args_rw + kernel_args_wo = kernel_args_rw + @mpi.collective def _init(self): if not self.dtype == PETSc.ScalarType: diff --git a/pyop2/types/set.py b/pyop2/types/set.py index 6c4fc682e..28dfc5e83 100644 --- a/pyop2/types/set.py +++ b/pyop2/types/set.py @@ -11,7 +11,7 @@ mpi, utils ) -from pyop2.offload_utils import OffloadMixin +from pyop2.array import MirroredArray class Set: @@ -56,7 +56,9 @@ class Set: _extruded = False _extruded_periodic = False - _kernel_args_ = () + kernel_args_rw = () + kernel_args_ro = () + kernel_args_wo = () _argtypes_ = () @utils.cached_property @@ -165,8 +167,7 @@ def __call__(self, *indices): if np.isscalar(indices): indices = [indices] - from pyop2.op2 import compute_backend - return compute_backend.Subset(self, indices) + return Subset(self, indices) def __contains__(self, dset): """Indicate whether a given DataSet is compatible with this Set.""" @@ -178,8 +179,8 @@ def __contains__(self, dset): def __pow__(self, e): """Derive a :class:`DataSet` with dimension ``e``""" - from pyop2.op2 import compute_backend - return compute_backend.DataSet(self, dim=e) + from pyop2.types import DataSet + return DataSet(self, dim=e) @utils.cached_property def layers(self): @@ -209,7 +210,7 @@ def difference(self, other): if other is self: return Subset(self, []) else: - return type(other)(self, np.setdiff1d(np.asarray(range(self.total_size), dtype=dtypes.IntType), other._indices)) + return type(other)(self, np.setdiff1d(np.asarray(range(self.total_size), dtype=dtypes.IntType), other.indices)) def symmetric_difference(self, other): self._check_operands(other) @@ -224,7 +225,9 @@ class GlobalSet(Set): """A proxy set allowing a :class:`Global` to be used in place of a :class:`Dat` where appropriate.""" - _kernel_args_ = () + kernel_args_rw = () + kernel_args_ro = () + kernel_args_wo = () _argtypes_ = () def __init__(self, comm=None): @@ -292,7 +295,7 @@ def __hash__(self): return hash(type(self)) -class ExtrudedSet(Set, OffloadMixin): +class ExtrudedSet(Set): """OP2 ExtrudedSet. 
@@ -332,13 +335,35 @@ def __init__(self, parent, layers, extruded_periodic=False): layers = np.asarray([[0, layers]], dtype=dtypes.IntType) self.constant_layers = True - self._layers = layers + self._layers_array = MirroredArray.new(layers) self._extruded = True self._extruded_periodic = extruded_periodic + self._sizes = parent._sizes + self._cache = parent._cache - @utils.cached_property - def _kernel_args_(self): - return (self.layers_array.ctypes.data, ) + @property + def comm(self): + return self._parent.comm + + @property + def _name(self): + return self._parent._name + + @property + def _layers(self): + return self._layers_array.data + + @property + def kernel_args_rw(self): + return (self._layers_array.ptr_rw,) + + @property + def kernel_args_ro(self): + return (self._layers_array.ptr_ro,) + + @property + def kernel_args_wo(self): + return (self._layers_array.ptr_wo,) @utils.cached_property def _argtypes_(self): @@ -381,7 +406,7 @@ def layers_array(self): return self._layers -class Subset(ExtrudedSet, OffloadMixin): +class Subset(ExtrudedSet): """OP2 subset. @@ -405,23 +430,42 @@ def __init__(self, superset, indices): assert type(superset) is Set or type(superset) is ExtrudedSet, \ 'Subset construction failed, should not happen' - self._superset = superset - self._indices = utils.verify_reshape(indices, dtypes.IntType, (len(indices),)) + indices = utils.verify_reshape(indices, dtypes.IntType, (len(indices),)) - if len(self._indices) > 0 and (self._indices[0] < 0 or self._indices[-1] >= self._superset.total_size): + if len(indices) > 0 and (indices[0] < 0 or indices[-1] >= superset.total_size): raise ex.SubsetIndexOutOfBounds( 'Out of bounds indices in Subset construction: [%d, %d) not [0, %d)' % - (self._indices[0], self._indices[-1], self._superset.total_size)) + (indices[0], indices[-1], superset.total_size)) - self._sizes = ((self._indices < superset.core_size).sum(), - (self._indices < superset.size).sum(), - len(self._indices)) + self._superset = superset + self._indices = MirroredArray.new(indices) + self._sizes = ((indices < superset.core_size).sum(), + (indices < superset.size).sum(), + len(indices)) self._extruded = superset._extruded self._extruded_periodic = superset._extruded_periodic + self.constant_layers = superset.constant_layers if hasattr(superset, "constant_layers") else None + self._cache = superset._cache - @utils.cached_property - def _kernel_args_(self): - return self._superset._kernel_args_ + (self._indices.ctypes.data, ) + @property + def name(self): + return self._superset.name + + @property + def comm(self): + return self._superset.comm + + @property + def kernel_args_rw(self): + return self._superset.kernel_args_rw + (self._indices.ptr_rw,) + + @property + def kernel_args_ro(self): + return self._superset.kernel_args_ro + (self._indices.ptr_ro,) + + @property + def kernel_args_wo(self): + return self._superset.kernel_args_wo + (self._indices.ptr_wo,) @utils.cached_property def _argtypes_(self): @@ -442,7 +486,7 @@ def __str__(self): (self._name, self._sizes) def __repr__(self): - return "Subset(%r, %r)" % (self._superset, self._indices) + return "Subset(%r, %r)" % (self._superset, self.indices) def __call__(self, *indices): """Build a :class:`Subset` from this :class:`Subset` @@ -455,27 +499,26 @@ def __call__(self, *indices): indices = indices[0] if np.isscalar(indices): indices = [indices] - from pyop2.op2 import compute_backend - return compute_backend.Subset(self, indices) + return Subset(self, indices) - @utils.cached_property + @property def 
superset(self): """Returns the superset Set""" return self._superset - @utils.cached_property + @property def indices(self): """Returns the indices pointing in the superset.""" - return self._indices + return self._indices.data - @utils.cached_property + @property def owned_indices(self): """Return the indices that correspond to the owned entities of the superset. """ return self.indices[self.indices < self.superset.size] - @utils.cached_property + @property def layers_array(self): if self._superset.constant_layers: return self._superset.layers_array @@ -497,28 +540,28 @@ def intersection(self, other): if other is self._superset: return self else: - return type(self)(self._superset, np.intersect1d(self._indices, other._indices)) + return type(self)(self._superset, np.intersect1d(self.indices, other.indices)) def union(self, other): self._check_operands(other) if other is self._superset: return other else: - return type(self)(self._superset, np.union1d(self._indices, other._indices)) + return type(self)(self._superset, np.union1d(self.indices, other.indices)) def difference(self, other): self._check_operands(other) if other is self._superset: return Subset(other, []) else: - return type(self)(self._superset, np.setdiff1d(self._indices, other._indices)) + return type(self)(self._superset, np.setdiff1d(self.indices, other.indices)) def symmetric_difference(self, other): self._check_operands(other) if other is self._superset: return other.symmetric_difference(self) else: - return type(self)(self._superset, np.setxor1d(self._indices, other._indices)) + return type(self)(self._superset, np.setxor1d(self.indices, other.indices)) class SetPartition: @@ -636,8 +679,8 @@ def __len__(self): def __pow__(self, e): """Derive a :class:`MixedDataSet` with dimensions ``e``""" - from pyop2.op2 import compute_backend - return compute_backend.MixedDataSet(self._sets, e) + from pyop2.types import MixedDataSet + return MixedDataSet(self._sets, e) def __str__(self): return "OP2 MixedSet composed of Sets: %s" % (self._sets,) diff --git a/test/unit/test_api.py b/test/unit/test_api.py index 8ec4cf2ab..fbb308736 100644 --- a/test/unit/test_api.py +++ b/test/unit/test_api.py @@ -825,7 +825,8 @@ def test_dat_ro_write_accessor(self, dat): def test_dat_lazy_allocation(self, dset): "Temporary Dats should not allocate storage until accessed." d = op2.Dat(dset) - assert not d._is_allocated + assert not d._data._lazy_host_data + assert not d._data._lazy_device_data def test_dat_zero_cdim(self, set): "A Dat built on a DataSet with zero dim should be allowed." @@ -1284,16 +1285,6 @@ def test_global_properties(self): assert g.dim == (2, 2) and g.dtype == np.float64 and g.name == 'bar' \ and g.data.sum() == 4 - def test_global_setter(self, g): - "Setter attribute on data should correct set data value." - g.data = 2 - assert g.data.sum() == 2 - - def test_global_setter_malformed_data(self, g): - "Setter attribute should reject malformed data." - with pytest.raises(exceptions.DataValueError): - g.data = [1, 2] - def test_global_iter(self, g): "Global should be iterable and yield self." for g_ in g: diff --git a/test/unit/test_global_reduction.py b/test/unit/test_global_reduction.py index fa2258924..2f281b538 100644 --- a/test/unit/test_global_reduction.py +++ b/test/unit/test_global_reduction.py @@ -400,7 +400,7 @@ def test_1d_multi_inc_same_global_reset(self, k1_inc_to_global, set, d1): d1(op2.READ)) assert g.data == d1.data.sum() - g.data = 10 + g.data[...] 
diff --git a/test/unit/test_globals.py b/test/unit/test_globals.py
index b7adf57c6..2a0a99cb9 100644
--- a/test/unit/test_globals.py
+++ b/test/unit/test_globals.py
@@ -72,7 +72,7 @@ def test_global_dat_version():
     assert g2.dat_version == 1
 
     # Access data setter
-    g2.data = d1
+    g2.data[...] = d1
 
     assert g1.dat_version == 2
     assert g2.dat_version == 2
diff --git a/test/unit/test_indirect_loop.py b/test/unit/test_indirect_loop.py
index ab77d182b..fed56b0b8 100644
--- a/test/unit/test_indirect_loop.py
+++ b/test/unit/test_indirect_loop.py
@@ -112,22 +112,16 @@ def test_mismatching_iterset(self, iterset, indset, x):
         the par_loop's should raise an exception."""
         with pytest.raises(MapValueError):
             op2.par_loop(op2.Kernel("", "dummy"), iterset,
-                         x(op2.WRITE, op2.Map(op2.Set(nelems), indset, 1)))
+                         x(op2.WRITE, op2.Map(op2.Set(nelems), indset, 1,
+                                              np.zeros(nelems))))
 
     def test_mismatching_indset(self, iterset, x):
         """Accessing a par_loop argument via a Map with toset not matching
         the Dat's should raise an exception."""
         with pytest.raises(MapValueError):
             op2.par_loop(op2.Kernel("", "dummy"), iterset,
-                         x(op2.WRITE, op2.Map(iterset, op2.Set(nelems), 1)))
-
-    def test_uninitialized_map(self, iterset, indset, x):
-        """Accessing a par_loop argument via an uninitialized Map should raise
-        an exception."""
-        kernel_wo = "static void wo(unsigned int* x) { *x = 42; }\n"
-        with pytest.raises(MapValueError):
-            op2.par_loop(op2.Kernel(kernel_wo, "wo"), iterset,
-                         x(op2.WRITE, op2.Map(iterset, indset, 1)))
+                         x(op2.WRITE, op2.Map(iterset, op2.Set(nelems), 1,
+                                              np.zeros(nelems))))
 
     def test_onecolor_wo(self, iterset, x, iterset2indset):
         """Set a Dat to a scalar value with op2.WRITE."""
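[Editor's note] With test_uninitialized_map deleted, every Map in the suite is constructed with its values supplied up front, as in the two updated calls above; there is no longer a valid "allocated but unset" Map state to exercise. A sketch of the updated construction (the sizes and index values here are placeholders):

    import numpy as np
    from pyop2 import op2

    iterset = op2.Set(4)
    indset = op2.Set(4)
    # arity-1 map with explicit index values; omitting the values
    # argument is what the deleted test used to exercise
    m = op2.Map(iterset, indset, 1, np.zeros(4, dtype=np.int32))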
diff --git a/test/unit/test_linalg.py b/test/unit/test_linalg.py
index 5fce55d0e..b7587a055 100644
--- a/test/unit/test_linalg.py
+++ b/test/unit/test_linalg.py
@@ -84,19 +84,19 @@ class TestLinAlgOp:
     """
 
     def test_add(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all((x + y).data == 3 * y.data)
 
     def test_sub(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all((x - y).data == y.data)
 
     def test_mul(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all((x * y).data == 2 * y.data * y.data)
 
     def test_div(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all((x / y).data == 2.0)
 
     def test_add_shape_mismatch(self, x2, y2):
@@ -116,11 +116,11 @@ def test_div_shape_mismatch(self, x2, y2):
             x2 / y2
 
     def test_add_scalar(self, x, y):
-        x._data = y.data + 1.0
+        x.data[...] = y.data + 1.0
         assert all(x.data == (y + 1.0).data)
 
     def test_radd_scalar(self, x, y):
-        x._data = y.data + 1.0
+        x.data[...] = y.data + 1.0
         assert all(x.data == (1.0 + y).data)
 
     def test_pos_copies(self, y):
@@ -134,23 +134,23 @@ def test_neg_copies(self, y):
         assert z is not y
 
     def test_sub_scalar(self, x, y):
-        x._data = y.data - 1.0
+        x.data[...] = y.data - 1.0
         assert all(x.data == (y - 1.0).data)
 
     def test_rsub_scalar(self, x, y):
-        x._data = 1.0 - y.data
+        x.data[...] = 1.0 - y.data
         assert all(x.data == (1.0 - y).data)
 
     def test_mul_scalar(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all(x.data == (y * 2.0).data)
 
     def test_rmul_scalar(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all(x.data == (2.0 * y).data)
 
     def test_div_scalar(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all((x / 2.0).data == y.data)
 
     def test_add_ftype(self, y, yi):
@@ -187,7 +187,7 @@ def test_div_itype(self, y, yi):
 
     def test_linalg_and_parloop(self, x, y):
         """Linear algebra operators should force computation"""
-        x._data = np.zeros(x.dataset.total_size, dtype=np.float64)
+        x.data[...] = np.zeros(x.dataset.total_size, dtype=np.float64)
         k = op2.Kernel('static void k(double *x) { *x = 1.0; }', 'k')
         op2.par_loop(k, x.dataset.set, x(op2.WRITE))
         z = x + y
@@ -201,22 +201,22 @@ class TestLinAlgIop:
     """
 
     def test_iadd(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x += y
         assert all(x.data == 3 * y.data)
 
     def test_isub(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x -= y
         assert all(x.data == y.data)
 
     def test_imul(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x *= y
         assert all(x.data == 2 * y.data * y.data)
 
     def test_idiv(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x /= y
         assert all(x.data == 2.0)
 
@@ -237,22 +237,22 @@ def test_idiv_shape_mismatch(self, x2, y2):
             x2 /= y2
 
     def test_iadd_scalar(self, x, y):
-        x._data = y.data + 1.0
+        x.data[...] = y.data + 1.0
         y += 1.0
         assert all(x.data == y.data)
 
     def test_isub_scalar(self, x, y):
-        x._data = y.data - 1.0
+        x.data[...] = y.data - 1.0
         y -= 1.0
         assert all(x.data == y.data)
 
     def test_imul_scalar(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         y *= 2.0
         assert all(x.data == y.data)
 
     def test_idiv_scalar(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x /= 2.0
         assert all(x.data == y.data)
diff --git a/test/unit/test_linalg_complex.py b/test/unit/test_linalg_complex.py
index f7ee2f4cd..eed8cc1bd 100644
--- a/test/unit/test_linalg_complex.py
+++ b/test/unit/test_linalg_complex.py
@@ -94,30 +94,30 @@ class TestLinAlgOp:
     """
 
     def test_add(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all((x + y).data == 3 * y.data)
 
     def test_sub(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all((x - y).data == y.data)
 
     def test_mul_complex(self, x, y):
-        x._data = (2+2j) * y.data
+        x.data[...] = (2+2j) * y.data
         assert all((x * y).data == (2+2j) * y.data * y.data)
 
     def test_div_complex(self, x, y):
-        x._data = (2+2j) * y.data
+        x.data[...] = (2+2j) * y.data
         # Note complex division does not have the same stability as
         # floating point when vectorised
         assert all(x.data / y.data == 2.0+2.j)
         assert np.allclose((x / y).data, 2.0+2.j)
 
     def test_mul(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all((x * y).data == 2 * y.data * y.data)
 
     def test_div(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x.data / y.data
         # Note complex division does not have the same stability as
         # floating point when vectorised
@@ -141,19 +141,19 @@ def test_div_shape_mismatch(self, x2, y2):
             x2 / y2
 
     def test_add_scalar(self, x, y):
-        x._data = y.data + 1.0
+        x.data[...] = y.data + 1.0
         assert all(x.data == (y + 1.0).data)
 
     def test_radd_scalar(self, x, y):
-        x._data = y.data + 1.0
+        x.data[...] = y.data + 1.0
         assert all(x.data == (1.0 + y).data)
 
     def test_add_complex_scalar(self, x, y):
-        x._data = y.data + (1.0+1.j)
+        x.data[...] = y.data + (1.0+1.j)
         assert all(x.data == (y + (1.0+1.j)).data)
 
     def test_radd_complex_scalar(self, x, y):
-        x._data = y.data + (1.0+1.j)
+        x.data[...] = y.data + (1.0+1.j)
         assert all(x.data == ((1.0+1.j) + y).data)
 
     def test_pos_copies(self, y):
@@ -167,39 +167,39 @@ def test_neg_copies(self, y):
         assert z is not y
 
     def test_sub_scalar(self, x, y):
-        x._data = y.data - 1.0
+        x.data[...] = y.data - 1.0
         assert all(x.data == (y - 1.0).data)
 
     def test_rsub_scalar(self, x, y):
-        x._data = 1.0 - y.data
+        x.data[...] = 1.0 - y.data
         assert all(x.data == (1.0 - y).data)
 
     def test_mul_scalar(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all(x.data == (y * 2.0).data)
 
     def test_rmul_scalar(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all(x.data == (2.0 * y).data)
 
     def test_sub_complex_scalar(self, x, y):
-        x._data = y.data - (1.0+1.j)
+        x.data[...] = y.data - (1.0+1.j)
         assert all(x.data == (y - (1.0+1.j)).data)
 
     def test_rsub_complex_scalar(self, x, y):
-        x._data = (1.0+1.j) - y.data
+        x.data[...] = (1.0+1.j) - y.data
         assert all(x.data == ((1.0+1.j) - y).data)
 
     def test_mul_complex_scalar(self, x, y):
-        x._data = (2+2j) * y.data
+        x.data[...] = (2+2j) * y.data
         assert all(x.data == (y * (2.0+2.j)).data)
 
     def test_rmul_complex_scalar(self, x, y):
-        x._data = (2+2j) * y.data
+        x.data[...] = (2+2j) * y.data
         assert all(x.data == ((2.0+2.j) * y).data)
 
     def test_div_scalar(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         assert all((x / 2.0).data == y.data)
 
     def test_add_ftype(self, y, yf):
@@ -252,7 +252,7 @@ def test_div_itype(self, y, yi):
 
     def test_linalg_and_parloop(self, x, y):
         """Linear algebra operators should force computation"""
-        x._data = np.zeros(x.dataset.total_size, dtype=np.complex128)
+        x.data[...] = np.zeros(x.dataset.total_size, dtype=np.complex128)
         k = op2.Kernel('static void k(complex double *x) { *x = 1.0+1.0*I; }', 'k')
         op2.par_loop(k, x.dataset.set, x(op2.WRITE))
         z = x + y
@@ -266,22 +266,22 @@ class TestLinAlgIop:
     """
 
     def test_iadd(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x += y
         assert all(x.data == 3 * y.data)
 
     def test_isub(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x -= y
         assert all(x.data == y.data)
 
     def test_imul(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x *= y
         assert all(x.data == 2 * y.data * y.data)
 
     def test_idiv(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x /= y
         # Note complex division does not have the same stability as
         # floating point when vectorised
@@ -304,42 +304,42 @@ def test_idiv_shape_mismatch(self, x2, y2):
             x2 /= y2
 
     def test_iadd_scalar(self, x, y):
-        x._data = y.data + 1.0
+        x.data[...] = y.data + 1.0
         y += 1.0
         assert all(x.data == y.data)
 
     def test_isub_scalar(self, x, y):
-        x._data = y.data - 1.0
+        x.data[...] = y.data - 1.0
         y -= 1.0
         assert all(x.data == y.data)
 
     def test_imul_scalar(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         y *= 2.0
         assert all(x.data == y.data)
 
     def test_idiv_scalar(self, x, y):
-        x._data = 2 * y.data
+        x.data[...] = 2 * y.data
         x /= 2.0
         assert all(x.data == y.data)
 
     def test_iadd_complex_scalar(self, x, y):
-        x._data = y.data + (1.0+1.j)
+        x.data[...] = y.data + (1.0+1.j)
         y += (1.0+1.j)
         assert all(x.data == y.data)
 
     def test_isub_complex_scalar(self, x, y):
-        x._data = y.data - (1.0+1.j)
+        x.data[...] = y.data - (1.0+1.j)
         y -= (1.0+1.j)
         assert all(x.data == y.data)
 
     def test_imul_complex_scalar(self, x, y):
-        x._data = (2+2j) * y.data
+        x.data[...] = (2+2j) * y.data
         y *= (2.0+2.j)
         assert all(x.data == y.data)
 
     def test_idiv_complex_scalar(self, x, y):
-        x._data = (2+2j) * y.data
+        x.data[...] = (2+2j) * y.data
         x /= (2.0+2j)
         assert all(x.data == y.data)
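[Editor's note] The np.allclose assertions retained above exist because, as the in-file comments note, vectorised complex division need not round identically to the scalar quotient. A quick illustration of the tolerant check the tests rely on (values illustrative):

    import numpy as np

    y = np.full(8, 0.1 + 0.3j)
    x = (2 + 2j) * y
    # SIMD complex divides may differ from the exact quotient in the last
    # few ulps, so compare with a tolerance rather than ==
    assert np.allclose(x / y, 2 + 2j)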
diff --git a/test/unit/test_matrices.py b/test/unit/test_matrices.py
index 34b467e21..1e6b39a84 100644
--- a/test/unit/test_matrices.py
+++ b/test/unit/test_matrices.py
@@ -567,13 +567,6 @@ class TestSparsity:
     Sparsity tests
     """
 
-    def test_sparsity_null_maps(self):
-        """Building sparsity from a pair of non-initialized maps should fail."""
-        s = op2.Set(5)
-        with pytest.raises(MapValueError):
-            m = op2.Map(s, s, 1)
-            op2.Sparsity((s, s), (m, m))
-
     def test_sparsity_has_diagonal_space(self):
         # A sparsity should have space for diagonal entries if rmap==cmap
         s = op2.Set(1)
diff --git a/test/unit/test_petsc.py b/test/unit/test_petsc.py
index 57068a7aa..0d5dbe7ca 100644
--- a/test/unit/test_petsc.py
+++ b/test/unit/test_petsc.py
@@ -74,10 +74,10 @@ def test_mixed_vec_access(self):
             assert np.allclose(v.array_r, [1.0, 2.0])
 
         d.data[0][:] = 0.0
-        d.data[0][:] = 0.0
+        d.data[1][:] = 0.0
 
         with d.vec_wo as v:
-            assert np.allclose(v.array_r, [1.0, 2.0])
+            assert np.allclose(v.array_r, [0.0, 0.0])
             v.array[:] = 1
 
         assert d.data[0][0] == 1
diff --git a/test/unit/test_subset.py b/test/unit/test_subset.py
index a2c2f9f9b..603c852bf 100644
--- a/test/unit/test_subset.py
+++ b/test/unit/test_subset.py
@@ -270,8 +270,8 @@ def test_set_set_operations(self):
         s = a.symmetric_difference(a)
         assert u is a
         assert i is a
-        assert d._indices.size == 0
-        assert s._indices.size == 0
+        assert d.indices.size == 0
+        assert s.indices.size == 0
 
     def test_set_subset_operations(self):
         """Test standard set operations between a set and a subset"""
@@ -283,8 +283,8 @@ def test_set_subset_operations(self):
         s = a.symmetric_difference(b)
         assert u is a
         assert i is b
-        assert (d._indices == [0, 1, 4, 6, 8, 9]).all()
-        assert (s._indices == d._indices).all()
+        assert (d.indices == [0, 1, 4, 6, 8, 9]).all()
+        assert (s.indices == d.indices).all()
 
     def test_subset_set_operations(self):
         """Test standard set operations between a subset and a set"""
@@ -296,8 +296,8 @@ def test_subset_set_operations(self):
         s = b.symmetric_difference(a)
         assert u is a
         assert i is b
-        assert d._indices.size == 0
-        assert (s._indices == [0, 1, 4, 6, 8, 9]).all()
+        assert d.indices.size == 0
+        assert (s.indices == [0, 1, 4, 6, 8, 9]).all()
 
     def test_subset_subset_operations(self):
         """Test standard set operations between two subsets"""
@@ -308,7 +308,7 @@ def test_subset_subset_operations(self):
         i = b.intersection(c)
         d = b.difference(c)
         s = b.symmetric_difference(c)
-        assert (u._indices == [2, 3, 4, 5, 6, 7, 8]).all()
-        assert (i._indices == [2, ]).all()
-        assert (d._indices == [3, 5, 7]).all()
-        assert (s._indices == [3, 4, 5, 6, 7, 8]).all()
+        assert (u.indices == [2, 3, 4, 5, 6, 7, 8]).all()
+        assert (i.indices == [2, ]).all()
+        assert (d.indices == [3, 5, 7]).all()
+        assert (s.indices == [3, 4, 5, 6, 7, 8]).all()
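[Editor's note] The assertions above now read Subset.indices, the public property that (per the set.py hunk earlier) returns the host view of the mirrored index array; the set algebra itself is unchanged and still defers to numpy's sorted-set routines. For example, with index arrays consistent with the expected values in test_subset_subset_operations (the fixtures b and c are assumed to carry these indices):

    import numpy as np

    b = np.array([2, 3, 5, 7])
    c = np.array([2, 4, 6, 8])

    assert (np.union1d(b, c) == [2, 3, 4, 5, 6, 7, 8]).all()
    assert (np.intersect1d(b, c) == [2]).all()
    assert (np.setdiff1d(b, c) == [3, 5, 7]).all()
    assert (np.setxor1d(b, c) == [3, 4, 5, 6, 7, 8]).all()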